diff --git a/.github/workflows/linux_cuda_ci.yml b/.github/workflows/linux_cuda_ci.yml index 4f059d8811962..e6d66df93cde9 100644 --- a/.github/workflows/linux_cuda_ci.yml +++ b/.github/workflows/linux_cuda_ci.yml @@ -28,7 +28,7 @@ jobs: dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1' docker_image_repo: onnxruntimecuda12manylinuxbuild - extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.2 --cuda_home=/usr/local/cuda-12.2 --cudnn_home=/usr/local/cuda-12.2 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=80 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' + extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.2 --cuda_home=/usr/local/cuda-12.2 --cudnn_home=/usr/local/cuda-12.2 --enable_cuda_profiling --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=90 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' run_tests: false # <<< Do not run tests in this job upload_build_output: true # <<< Upload the build/Release directory @@ -41,7 +41,7 @@ jobs: needs: build-linux-cuda-x64-release runs-on: - self-hosted - - "1ES.Pool=Onnxruntime-github-Linux-GPU-A100-WUS3" + - "1ES.Pool=Onnxruntime-github-Linux-GPU-H100" permissions: contents: read packages: read @@ -98,5 +98,5 @@ jobs: build_config: Release mode: 'test' # Set mode to test execution_providers: 'cuda' - extra_build_flags: '--use_binskim_compliant_compile_flags --cuda_version=12.2 --cuda_home=/usr/local/cuda-12.2 --cudnn_home=/usr/local/cuda-12.2 --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=80 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' + extra_build_flags: '--use_binskim_compliant_compile_flags --cuda_version=12.2 --cuda_home=/usr/local/cuda-12.2 --cudnn_home=/usr/local/cuda-12.2 --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=90 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' diff --git a/.github/workflows/linux_tensorrt_ci.yml b/.github/workflows/linux_tensorrt_ci.yml index 009697917e257..fa404842b79e2 100644 --- a/.github/workflows/linux_tensorrt_ci.yml +++ b/.github/workflows/linux_tensorrt_ci.yml @@ -28,7 +28,7 @@ jobs: dockerfile_path: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda docker_build_args: '--build-arg BASEIMAGE=onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20250124.1 --build-arg TRT_VERSION=10.9.0.34-1.cuda12.8 --network=host' docker_image_repo: onnxruntimetensorrt86gpubuild - extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.2 --cuda_home=/usr/local/cuda-12.2 --cudnn_home=/usr/local/cuda-12.2 --use_tensorrt --tensorrt_home /usr --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=80 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' + extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.2 --cuda_home=/usr/local/cuda-12.2 --cudnn_home=/usr/local/cuda-12.2 --use_tensorrt --tensorrt_home /usr --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=90 
onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' run_tests: false # <<< Do not run tests in this job upload_build_output: true # <<< Upload the build/Release directory @@ -41,7 +41,7 @@ jobs: needs: build-linux-TensorRT-x64-release runs-on: - self-hosted - - "1ES.Pool=Onnxruntime-github-Linux-GPU-A100-WUS3" + - "1ES.Pool=Onnxruntime-github-Linux-GPU-H100" permissions: contents: read packages: read @@ -100,5 +100,5 @@ jobs: build_config: Release mode: 'test' # Set mode to test execution_providers: 'cuda tensorrt' - extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.2 --cuda_home=/usr/local/cuda-12.2 --cudnn_home=/usr/local/cuda-12.2 --use_tensorrt --tensorrt_home /usr --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=80 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' + extra_build_flags: '--use_binskim_compliant_compile_flags --build_wheel --cuda_version=12.2 --cuda_home=/usr/local/cuda-12.2 --cudnn_home=/usr/local/cuda-12.2 --use_tensorrt --tensorrt_home /usr --build_java --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=90 onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON' python_path_prefix: 'PATH=/opt/python/cp310-cp310/bin:$PATH' diff --git a/VERSION_NUMBER b/VERSION_NUMBER index 6245beecd39cd..6fee2fedb0a46 100644 --- a/VERSION_NUMBER +++ b/VERSION_NUMBER @@ -1 +1 @@ -1.22.1 +1.22.2 diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json index f29857a231eb9..bf889e9fb61a8 100644 --- a/cgmanifests/cgmanifest.json +++ b/cgmanifests/cgmanifest.json @@ -36,7 +36,7 @@ "component": { "type": "git", "git": { - "commitHash": "bee4d1dd8dc1ee4a1fd8fa6a96476c2f8b7492a3", + "commitHash": "5c210da409e7f1e51ddf445134a4376fdbd70d7d", "repositoryUrl": "https://github.com/dmlc/dlpack.git" } } @@ -316,16 +316,6 @@ "comments": "gtest-ios-framework" } }, - { - "component": { - "type": "git", - "git": { - "commitHash": "277508879878e0a5b5b43599b1bea11f66eb3c6c", - "repositoryUrl": "https://github.com/dmlc/dlpack.git" - }, - "comments": "dlpack" - } - }, { "component": { "Type": "other", diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 5e689908f4fcc..adf0fc0261ab3 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -150,6 +150,7 @@ option(onnxruntime_DISABLE_SPARSE_TENSORS "Disable sparse tensors data types" OF option(onnxruntime_DISABLE_OPTIONAL_TYPE "Disable optional type" OFF) option(onnxruntime_DISABLE_FLOAT8_TYPES "Disable float 8 types" OFF) option(onnxruntime_MINIMAL_BUILD "Exclude as much as possible from the build. Support ORT format models. No support for ONNX format models." OFF) +option(onnxruntime_CLIENT_PACKAGE_BUILD "Enables default settings that are more appropriate for client/on-device workloads." OFF) cmake_dependent_option(onnxruntime_DISABLE_RTTI "Disable RTTI" ON "NOT onnxruntime_ENABLE_PYTHON;NOT onnxruntime_USE_CUDA" OFF) # For now onnxruntime_DISABLE_EXCEPTIONS will only work with onnxruntime_MINIMAL_BUILD, more changes (ONNX, non-CPU EP, ...) are required to run this standalone cmake_dependent_option(onnxruntime_DISABLE_EXCEPTIONS "Disable exception handling. Requires onnxruntime_MINIMAL_BUILD currently." 
ON "onnxruntime_MINIMAL_BUILD;NOT onnxruntime_ENABLE_PYTHON" OFF) diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake index 8f5ef15c53ef2..78e0bf67991b5 100644 --- a/cmake/adjust_global_compile_flags.cmake +++ b/cmake/adjust_global_compile_flags.cmake @@ -98,6 +98,11 @@ if (onnxruntime_MINIMAL_BUILD) endif() endif() +# ORT build with default settings more appropriate for client/on-device workloads. +if (onnxruntime_CLIENT_PACKAGE_BUILD) + add_compile_definitions(ORT_CLIENT_PACKAGE_BUILD) +endif() + if (onnxruntime_ENABLE_LTO) include(CheckIPOSupported) check_ipo_supported(RESULT ipo_enabled OUTPUT ipo_output) diff --git a/cmake/deps.txt b/cmake/deps.txt index eacec6f17eb04..9f81a674a9c5e 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -16,7 +16,7 @@ abseil_cpp;https://github.com/abseil/abseil-cpp/archive/refs/tags/20240722.0.zip coremltools;https://github.com/apple/coremltools/archive/refs/tags/7.1.zip;f1bab0f30966f2e217d8e01207d518f230a1641a cxxopts;https://github.com/jarro2783/cxxopts/archive/3c73d91c0b04e2b59462f0a741be8c07024c1bc0.zip;6c6ca7f8480b26c8d00476e0e24b7184717fe4f0 date;https://github.com/HowardHinnant/date/archive/refs/tags/v3.0.1.zip;2dac0c81dc54ebdd8f8d073a75c053b04b56e159 -dlpack;https://github.com/dmlc/dlpack/archive/refs/tags/v0.6.zip;4d565dd2e5b31321e5549591d78aa7f377173445 +dlpack;https://github.com/dmlc/dlpack/archive/5c210da409e7f1e51ddf445134a4376fdbd70d7d.zip;e499c86e4e5c5268a87661d7ea39c27fae10907c # This Eigen commit id matches the eigen archive being consumed from https://gitlab.com/libeigen/eigen/-/archive/3.4/eigen-3.4.zip # prior to the 3.4.1 RC changing the bits and invalidating the hash. # it contains changes on top of 3.4.0 which are required to fix build issues. diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index 8decca10937ba..698192aee1552 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -31,6 +31,7 @@ onnxruntime_add_static_library(onnxruntime_mlas ${MLAS_SRC_DIR}/eltwise.cpp ${MLAS_SRC_DIR}/erf.cpp ${MLAS_SRC_DIR}/compute.cpp + ${MLAS_SRC_DIR}/dequantize.cpp ${MLAS_SRC_DIR}/quantize.cpp ${MLAS_SRC_DIR}/qgemm_kernel_default.cpp ${MLAS_SRC_DIR}/qladd.cpp diff --git a/cmake/onnxruntime_providers_qnn.cmake b/cmake/onnxruntime_providers_qnn.cmake index 60b3aaf38cd85..9fe4d9fadc44e 100644 --- a/cmake/onnxruntime_providers_qnn.cmake +++ b/cmake/onnxruntime_providers_qnn.cmake @@ -66,10 +66,10 @@ COMMAND ${CMAKE_COMMAND} -E copy ${QNN_LIB_FILES} $ ) endif() - if (EXISTS "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf") + if (EXISTS "${onnxruntime_QNN_HOME}/LICENSE.pdf") add_custom_command( TARGET ${onnxruntime_providers_qnn_target} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf" $ + COMMAND ${CMAKE_COMMAND} -E copy "${onnxruntime_QNN_HOME}/LICENSE.pdf" $/Qualcomm_LICENSE.pdf ) endif() else() @@ -154,10 +154,10 @@ COMMAND ${CMAKE_COMMAND} -E copy ${QNN_LIB_FILES} $ ) endif() - if (EXISTS "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf") + if (EXISTS "${onnxruntime_QNN_HOME}/LICENSE.pdf") add_custom_command( TARGET ${onnxruntime_providers_qnn_target} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf" $ + COMMAND ${CMAKE_COMMAND} -E copy "${onnxruntime_QNN_HOME}/LICENSE.pdf" $/Qualcomm_LICENSE.pdf ) endif() endif() diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index 
c57a2a962303d..67c80bfb4955c 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -1050,18 +1050,10 @@ if (onnxruntime_USE_QNN) ${QNN_LIB_FILES} $/onnxruntime/capi/ ) - add_custom_command( - TARGET onnxruntime_pybind11_state POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - $ - $/onnxruntime/capi/ - ) - if (EXISTS "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf") + if (EXISTS "${onnxruntime_QNN_HOME}/LICENSE.pdf") add_custom_command( TARGET onnxruntime_pybind11_state POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf" - $/onnxruntime/ + COMMAND ${CMAKE_COMMAND} -E copy "${onnxruntime_QNN_HOME}/LICENSE.pdf" $/onnxruntime/Qualcomm_LICENSE.pdf ) endif() endif() diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 9797d8019f2d3..5ec174b43e864 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -724,6 +724,7 @@ endif() # or reduced op builds. if(onnxruntime_USE_QNN AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_REDUCED_OPS_BUILD) list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/qnn/*) + list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/qnn/qnn_node_group/*) list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_qnn) list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_qnn) if(NOT onnxruntime_BUILD_QNN_EP_STATIC_LIB) diff --git a/cmake/vcpkg-ports/dlpack/portfile.cmake b/cmake/vcpkg-ports/dlpack/portfile.cmake new file mode 100644 index 0000000000000..fdf328836d4dd --- /dev/null +++ b/cmake/vcpkg-ports/dlpack/portfile.cmake @@ -0,0 +1,25 @@ +set(VCPKG_BUILD_TYPE release) # header-only port + +vcpkg_from_github( + OUT_SOURCE_PATH SOURCE_PATH + REPO dmlc/dlpack + REF 5c210da409e7f1e51ddf445134a4376fdbd70d7d + SHA512 4bc5f5fd36b20ef2943989d5c06fe9cd34f942cdfd4b4866a4405649f7faac47fcdcf3a1fa60eb7b96b643222e5e4b036cbca7d49835dc5f8b659708620a2e8f + HEAD_REF main +) + +vcpkg_cmake_configure( + SOURCE_PATH "${SOURCE_PATH}" + OPTIONS + -DBUILD_MOCK=FALSE +) + +vcpkg_cmake_install() + +vcpkg_cmake_config_fixup(CONFIG_PATH "lib/cmake/dlpack") + +file(REMOVE_RECURSE "${CURRENT_PACKAGES_DIR}/lib") + +vcpkg_install_copyright(FILE_LIST "${SOURCE_PATH}/LICENSE") + +file(COPY "${CMAKE_CURRENT_LIST_DIR}/usage" DESTINATION "${CURRENT_PACKAGES_DIR}/share/${PORT}") diff --git a/cmake/vcpkg-ports/dlpack/usage b/cmake/vcpkg-ports/dlpack/usage new file mode 100644 index 0000000000000..771ec78517174 --- /dev/null +++ b/cmake/vcpkg-ports/dlpack/usage @@ -0,0 +1,4 @@ +dlpack provides CMake targets: + + find_package(dlpack CONFIG REQUIRED) + target_link_libraries(main PRIVATE dlpack::dlpack) diff --git a/cmake/vcpkg-ports/dlpack/vcpkg.json b/cmake/vcpkg-ports/dlpack/vcpkg.json new file mode 100644 index 0000000000000..48f2f22a0a058 --- /dev/null +++ b/cmake/vcpkg-ports/dlpack/vcpkg.json @@ -0,0 +1,17 @@ +{ + "name": "dlpack", + "version-semver": "1.1.1", + "description": "DLPack is an open in-memory tensor structure for sharing tensors among frameworks", + "homepage": "https://github.com/dmlc/dlpack", + "license": "Apache-2.0", + "dependencies": [ + { + "name": "vcpkg-cmake", + "host": true + }, + { + "name": "vcpkg-cmake-config", + "host": true + } + ] +} diff --git a/docs/python/README.rst b/docs/python/README.rst index 2a25791b1574a..af4e57cbaeeda 100644 --- a/docs/python/README.rst +++ b/docs/python/README.rst @@ -8,6 +8,11 @@ For more 
information on ONNX Runtime, please see `aka.ms/onnxruntime `_. """ -__version__ = "1.22.1" +__version__ = "1.22.2" __author__ = "Microsoft" # we need to do device version validation (for example to check Cuda version for an onnxruntime-training package). diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index 266370997fd46..217881a89aa6e 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -1223,6 +1223,21 @@ MlasQuantizeLinearS4( int8_t ZeroPoint ); +// +// Linear dequantization routines. +// + +template +void +MLASCALL +MlasDequantizeLinear( + const InputType* Input, + float* Output, + size_t N, + float Scale, + InputType ZeroPoint + ); + /** * @brief Requantize a block of the intermediate buffer to the output buffer, * optionally adding the supplied bias diff --git a/onnxruntime/core/mlas/lib/dequantize.cpp b/onnxruntime/core/mlas/lib/dequantize.cpp new file mode 100644 index 0000000000000..175d3f668ac39 --- /dev/null +++ b/onnxruntime/core/mlas/lib/dequantize.cpp @@ -0,0 +1,395 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + dequantize.cpp + +Abstract: + + This module implements routines to dequantize buffers. + + The dequantization formula as specified in the ONNX operator documentation is: + + Output = (Input - ZeroPoint) * Scale + +--*/ + +#include "mlasi.h" + +// +// DequantizeLinear reference implementation using the C++ runtime. +// + +template +static +MLAS_FORCEINLINE +void +MlasDequantizeLinearRefImpl( + const InputType* Input, + float* Output, + size_t N, + float Scale, + InputType ZeroPoint + ) +/*++ + +Routine Description: + + This routine quantizes the input buffer using the supplied quantization + parameters. + +Arguments: + + Input - Supplies the input buffer with quantized data. + + Output - Supplies the output buffer. + + N - Supplies the number of elements to process. + + Scale - Supplies the quantization scale. + + ZeroPoint - Supplies the quantization zero point value. + +Return Value: + + None. + +--*/ +{ + int32_t ZeroPointS32 = static_cast(ZeroPoint); + + for (size_t n = 0; n < N; n++) { + Output[n] = static_cast(static_cast(Input[n]) - ZeroPointS32) * Scale; + } +} + +#if defined(MLAS_SSE2_INTRINSICS) +// Implementation for Intel SSE 2. Refer to the Intel Intrisics Guide: +// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html + +void +MLASCALL +MlasDequantizeLinearS8Kernel( + const int8_t* Input, + float* Output, + size_t N, + float Scale, + int8_t ZeroPoint + ) +{ + const __m128 ScaleVector = MlasBroadcastFloat32x4(Scale); + const __m128i ZeroPointS16Vector = _mm_set1_epi16(static_cast(ZeroPoint)); // Broadcast zp to 8 int16s + const __m128i Zeros = _mm_setzero_si128(); + + while (N >= 16) { + // Load a vector of 16 int8s: [0 ... 15] + __m128i VectorS8 = _mm_loadu_si128(reinterpret_cast(Input)); + + // Sign-extend into 2 vectors of 8 int16s + __m128i SignMaskS8 = _mm_cmpgt_epi8(Zeros, VectorS8); // 0xFF for every negative byte in VectorS8 + __m128i VectorS16_0 = _mm_unpacklo_epi8(VectorS8, SignMaskS8); // [0 ... 7] + __m128i VectorS16_1 = _mm_unpackhi_epi8(VectorS8, SignMaskS8); // [8 ... 15] + + // Subtract the zero-points in int16 domain. 
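+ // (Both the int8 inputs and the int8 zero point lie in [-128, 127], so each
+ // difference lies in [-255, 255] and cannot overflow int16.)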
+ VectorS16_0 = _mm_sub_epi16(VectorS16_0, ZeroPointS16Vector); + VectorS16_1 = _mm_sub_epi16(VectorS16_1, ZeroPointS16Vector); + + // Sign-extend into 4 vectors of 4 int32s + __m128i SignMaskS16_0 = _mm_cmpgt_epi16(Zeros, VectorS16_0); + __m128i VectorS32_0 = _mm_unpacklo_epi16(VectorS16_0, SignMaskS16_0); // [0 ... 3] + __m128i VectorS32_1 = _mm_unpackhi_epi16(VectorS16_0, SignMaskS16_0); // [4 ... 7] + + __m128i SignMaskS16_1 = _mm_cmpgt_epi16(Zeros, VectorS16_1); + __m128i VectorS32_2 = _mm_unpacklo_epi16(VectorS16_1, SignMaskS16_1); // [8 ... 11] + __m128i VectorS32_3 = _mm_unpackhi_epi16(VectorS16_1, SignMaskS16_1); // [12 ... 15] + + // Cast each int32x4 to float and multiply by the scale vector. + __m128 VectorF32_0 = _mm_mul_ps(_mm_cvtepi32_ps(VectorS32_0), ScaleVector); + __m128 VectorF32_1 = _mm_mul_ps(_mm_cvtepi32_ps(VectorS32_1), ScaleVector); + __m128 VectorF32_2 = _mm_mul_ps(_mm_cvtepi32_ps(VectorS32_2), ScaleVector); + __m128 VectorF32_3 = _mm_mul_ps(_mm_cvtepi32_ps(VectorS32_3), ScaleVector); + + // Store each int32x4 into the output. + _mm_storeu_ps(Output + 0, VectorF32_0); + _mm_storeu_ps(Output + 4, VectorF32_1); + _mm_storeu_ps(Output + 8, VectorF32_2); + _mm_storeu_ps(Output + 12, VectorF32_3); + + Input += 16; + Output += 16; + N -= 16; + } + + // Handle leftover elements (< 16) with the scalar reference implementation. + MlasDequantizeLinearRefImpl(Input, Output, N, Scale, ZeroPoint); +} + +void +MLASCALL +MlasDequantizeLinearU8Kernel( + const uint8_t* Input, + float* Output, + size_t N, + float Scale, + uint8_t ZeroPoint + ) +{ + const __m128 ScaleVector = MlasBroadcastFloat32x4(Scale); + const __m128i ZeroPointS16Vector = _mm_set1_epi16(static_cast(ZeroPoint)); // Broadcast zp to 8 int16s + const __m128i Zeros = _mm_setzero_si128(); + + while (N >= 16) { + // Load a vector of 16 uint8s: [0 ... 15] + __m128i VectorU8 = _mm_loadu_si128(reinterpret_cast(Input)); + + // Zero-extend into 2 vectors of 8 uint16s + __m128i VectorU16_0 = _mm_unpacklo_epi8(VectorU8, Zeros); // [0 ... 7] + __m128i VectorU16_1 = _mm_unpackhi_epi8(VectorU8, Zeros); // [8 ... 15] + + // Subtract the zero-points as uint16s. Due to two's compliment, negative results can be reinterpreted as int16 + __m128i VectorS16_0 = _mm_sub_epi16(VectorU16_0, ZeroPointS16Vector); + __m128i VectorS16_1 = _mm_sub_epi16(VectorU16_1, ZeroPointS16Vector); + + // Sign-extend into 4 vectors of 4 int32s + __m128i SignMaskS16_0 = _mm_cmpgt_epi16(Zeros, VectorS16_0); + __m128i VectorS32_0 = _mm_unpacklo_epi16(VectorS16_0, SignMaskS16_0); // [0 ... 3] + __m128i VectorS32_1 = _mm_unpackhi_epi16(VectorS16_0, SignMaskS16_0); // [4 ... 7] + + __m128i SignMaskS16_1 = _mm_cmpgt_epi16(Zeros, VectorS16_1); + __m128i VectorS32_2 = _mm_unpacklo_epi16(VectorS16_1, SignMaskS16_1); // [8 ... 11] + __m128i VectorS32_3 = _mm_unpackhi_epi16(VectorS16_1, SignMaskS16_1); // [12 ... 15] + + // Cast each int32x4 to float and multiply by the scale vector. + __m128 VectorF32_0 = _mm_mul_ps(_mm_cvtepi32_ps(VectorS32_0), ScaleVector); + __m128 VectorF32_1 = _mm_mul_ps(_mm_cvtepi32_ps(VectorS32_1), ScaleVector); + __m128 VectorF32_2 = _mm_mul_ps(_mm_cvtepi32_ps(VectorS32_2), ScaleVector); + __m128 VectorF32_3 = _mm_mul_ps(_mm_cvtepi32_ps(VectorS32_3), ScaleVector); + + // Store each int32x4 into the output. 
+ _mm_storeu_ps(Output + 0, VectorF32_0); + _mm_storeu_ps(Output + 4, VectorF32_1); + _mm_storeu_ps(Output + 8, VectorF32_2); + _mm_storeu_ps(Output + 12, VectorF32_3); + + Input += 16; + Output += 16; + N -= 16; + } + + // Handle leftover elements (< 16) with the scalar reference implementation. + MlasDequantizeLinearRefImpl(Input, Output, N, Scale, ZeroPoint); +} + +template<> +void +MLASCALL +MlasDequantizeLinear( + const int8_t* Input, + float* Output, + size_t N, + float Scale, + int8_t ZeroPoint + ) +{ +#if defined(MLAS_TARGET_AMD64) + GetMlasPlatform().DequantizeLinearS8Kernel( +#else + MlasDequantizeLinearS8Kernel( +#endif + Input, Output, N, Scale, ZeroPoint); +} + +template<> +void +MLASCALL +MlasDequantizeLinear( + const uint8_t* Input, + float* Output, + size_t N, + float Scale, + uint8_t ZeroPoint + ) +{ +#if defined(MLAS_TARGET_AMD64) + GetMlasPlatform().DequantizeLinearU8Kernel( +#else + MlasDequantizeLinearU8Kernel( +#endif + Input, Output, N, Scale, ZeroPoint); +} +#elif defined(MLAS_NEON64_INTRINSICS) +// Implementation for ARM64 NEON. Refer to the ARM instrinsics guide: +// https://developer.arm.com/architectures/instruction-sets/intrinsics/ + +void +MLASCALL +MlasDequantizeLinearS8Kernel( + const int8_t* Input, + float* Output, + size_t N, + float Scale, + int8_t ZeroPoint + ) +{ + const float32x4_t ScaleVector = MlasBroadcastFloat32x4(Scale); + const int16x8_t ZeroPointVector = vdupq_n_s16(ZeroPoint); // Broadcast ZeroPoint (sign-extended to 16bits) + + while (N >= 16) { + // Load a vector of 16 int8s: [0 ... 15] + int8x16_t VectorS8 = vld1q_s8(Input); + + // Sign-extend into 2 vectors of 8 int16s + int16x8_t VectorS16_0 = vmovl_s8(vget_low_s8(VectorS8)); // [0 ... 7] + int16x8_t VectorS16_1 = vmovl_s8(vget_high_s8(VectorS8)); // [8 ... 15] + + // Subtract the zero-points in int16 domain. + VectorS16_0 = vsubq_s16(VectorS16_0, ZeroPointVector); + VectorS16_1 = vsubq_s16(VectorS16_1, ZeroPointVector); + + // Sign-extend into 4 vectors of 4 int32s + int32x4_t VectorS32_0 = vmovl_s16(vget_low_s16(VectorS16_0)); // [0 ... 3] + int32x4_t VectorS32_1 = vmovl_s16(vget_high_s16(VectorS16_0)); // [4 ... 7] + int32x4_t VectorS32_2 = vmovl_s16(vget_low_s16(VectorS16_1)); // [8 ... 11] + int32x4_t VectorS32_3 = vmovl_s16(vget_high_s16(VectorS16_1)); // [12 ... 15] + + // Cast each int32x4 to float and multiply by the scale vector. + float32x4_t VectorF32_0 = vmulq_f32(vcvtq_f32_s32(VectorS32_0), ScaleVector); + float32x4_t VectorF32_1 = vmulq_f32(vcvtq_f32_s32(VectorS32_1), ScaleVector); + float32x4_t VectorF32_2 = vmulq_f32(vcvtq_f32_s32(VectorS32_2), ScaleVector); + float32x4_t VectorF32_3 = vmulq_f32(vcvtq_f32_s32(VectorS32_3), ScaleVector); + + // Store each int32x4 into the output. + vst1q_f32(Output + 0, VectorF32_0); + vst1q_f32(Output + 4, VectorF32_1); + vst1q_f32(Output + 8, VectorF32_2); + vst1q_f32(Output + 12, VectorF32_3); + + N -= 16; + Input += 16; + Output += 16; + } + + // Handle leftover elements (< 16) with the scalar reference implementation. + MlasDequantizeLinearRefImpl(Input, Output, N, Scale, ZeroPoint); +} + +void +MLASCALL +MlasDequantizeLinearU8Kernel( + const uint8_t* Input, + float* Output, + size_t N, + float Scale, + uint8_t ZeroPoint + ) +{ + const float32x4_t ScaleVector = MlasBroadcastFloat32x4(Scale); + const uint8x8_t ZeroPointVector = vdup_n_u8(ZeroPoint); // Broadcast ZeroPoint to 8 uint8s + + while (N >= 16) { + // Load a vector of 16 uint8s: [0 ... 15] + uint8x16_t VectorU8 = vld1q_u8(Input); + + // Subtract zero-point. 
The vsubl_u8 instruction zero-extends its arguments to uint16 first. + // The reinterpret from uint16x8 to int16x8 is actually a NOP. + int16x8_t VectorS16_0 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(VectorU8), ZeroPointVector)); // [0 ... 7] + int16x8_t VectorS16_1 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(VectorU8), ZeroPointVector)); // [8 ... 15] + + // Sign-extend into 4 vectors of 4 int32s + int32x4_t VectorS32_0 = vmovl_s16(vget_low_s16(VectorS16_0)); // [0 ... 3] + int32x4_t VectorS32_1 = vmovl_s16(vget_high_s16(VectorS16_0)); // [4 ... 7] + int32x4_t VectorS32_2 = vmovl_s16(vget_low_s16(VectorS16_1)); // [8 ... 11] + int32x4_t VectorS32_3 = vmovl_s16(vget_high_s16(VectorS16_1)); // [12 ... 15] + + // Cast each int32x4 to float and multiply by the scale vector. + float32x4_t VectorF32_0 = vmulq_f32(vcvtq_f32_s32(VectorS32_0), ScaleVector); + float32x4_t VectorF32_1 = vmulq_f32(vcvtq_f32_s32(VectorS32_1), ScaleVector); + float32x4_t VectorF32_2 = vmulq_f32(vcvtq_f32_s32(VectorS32_2), ScaleVector); + float32x4_t VectorF32_3 = vmulq_f32(vcvtq_f32_s32(VectorS32_3), ScaleVector); + + // Store each int32x4 into the output. + vst1q_f32(Output + 0, VectorF32_0); + vst1q_f32(Output + 4, VectorF32_1); + vst1q_f32(Output + 8, VectorF32_2); + vst1q_f32(Output + 12, VectorF32_3); + + N -= 16; + Input += 16; + Output += 16; + } + + // Handle leftover elements (< 16) with the scalar reference implementation. + MlasDequantizeLinearRefImpl(Input, Output, N, Scale, ZeroPoint); +} + +template<> +void +MLASCALL +MlasDequantizeLinear( + const int8_t* Input, + float* Output, + size_t N, + float Scale, + int8_t ZeroPoint + ) +{ + MlasDequantizeLinearS8Kernel(Input, Output, N, Scale, ZeroPoint); +} + +template<> +void +MLASCALL +MlasDequantizeLinear( + const uint8_t* Input, + float* Output, + size_t N, + float Scale, + uint8_t ZeroPoint + ) +{ + MlasDequantizeLinearU8Kernel(Input, Output, N, Scale, ZeroPoint); +} +#else +// Implementation that uses the scalar reference implementation. 
+ +template +void +MLASCALL +MlasDequantizeLinear( + const InputType* Input, + float* Output, + size_t N, + float Scale, + InputType ZeroPoint + ) +{ + MlasDequantizeLinearRefImpl(Input, Output, N, Scale, ZeroPoint); +} + +template +void +MLASCALL +MlasDequantizeLinear( + const int8_t* Input, + float* Output, + size_t N, + float Scale, + int8_t ZeroPoint + ); + +template +void +MLASCALL +MlasDequantizeLinear( + const uint8_t* Input, + float* Output, + size_t N, + float Scale, + uint8_t ZeroPoint + ); + +#endif diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index f402309016bf8..793a8abceba46 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -730,6 +730,24 @@ void float Scale, int8_t ZeroPoint); +typedef +void +(MLASCALL MLAS_DEQUANTIZE_LINEAR_U8_KERNEL)( + const uint8_t* Input, + float* Output, + size_t N, + float Scale, + uint8_t ZeroPoint); + +typedef +void +(MLASCALL MLAS_DEQUANTIZE_LINEAR_S8_KERNEL)( + const int8_t* Input, + float* Output, + size_t N, + float Scale, + int8_t ZeroPoint); + template struct MLAS_QUANT_KERNEL { @@ -886,6 +904,8 @@ extern "C" { MLAS_QUANTIZE_LINEAR_S4_KERNEL MlasQuantizeLinearS4Kernel; MLAS_QUANTIZE_LINEAR_U4_KERNEL MlasQuantizeLinearU4Kernel; #if defined(MLAS_TARGET_AMD64) + MLAS_DEQUANTIZE_LINEAR_S8_KERNEL MlasDequantizeLinearS8Kernel; + MLAS_DEQUANTIZE_LINEAR_U8_KERNEL MlasDequantizeLinearU8Kernel; MLAS_COMPUTE_UNARY_FLOAT_KERNEL MlasErfKernelFma3; MLAS_COMPUTE_UNARY_FLOAT_KERNEL MlasComputeExpF32KernelFma3; MLAS_COMPUTE_UNARY_FLOAT_KERNEL MlasComputeExpF32KernelAvx512F; @@ -1229,6 +1249,8 @@ struct MLAS_PLATFORM { MLAS_QUANTIZE_LINEAR_U16_KERNEL* QuantizeLinearU16Kernel; MLAS_QUANTIZE_LINEAR_S4_KERNEL* QuantizeLinearS4Kernel; MLAS_QUANTIZE_LINEAR_U4_KERNEL* QuantizeLinearU4Kernel; + MLAS_DEQUANTIZE_LINEAR_S8_KERNEL* DequantizeLinearS8Kernel; + MLAS_DEQUANTIZE_LINEAR_U8_KERNEL* DequantizeLinearU8Kernel; uint32_t NchwcBlockSize; uint32_t PreferredBufferAlignment; int32_t MaximumThreadCount; diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index 7724259e7c228..7cb8a90bc86cd 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -285,6 +285,8 @@ Return Value: this->QuantizeLinearU16Kernel = MlasQuantizeLinearU16Kernel; this->QuantizeLinearS4Kernel = MlasQuantizeLinearS4Kernel; this->QuantizeLinearU4Kernel = MlasQuantizeLinearU4Kernel; + this->DequantizeLinearS8Kernel = MlasDequantizeLinearS8Kernel; + this->DequantizeLinearU8Kernel = MlasDequantizeLinearU8Kernel; #ifndef __APPLE__ #ifndef FORCE_GENERIC_ALGORITHMS this->CastF16ToF32Kernel = &MlasCastF16ToF32KernelSse; diff --git a/onnxruntime/core/optimizer/bias_softmax_fusion.cc b/onnxruntime/core/optimizer/bias_softmax_fusion.cc index bcbb70ba8fac5..2bbc70db16cde 100755 --- a/onnxruntime/core/optimizer/bias_softmax_fusion.cc +++ b/onnxruntime/core/optimizer/bias_softmax_fusion.cc @@ -135,7 +135,7 @@ bool TrySelectInputAndBiasWithAlignment(Node& add_node, Node& softmax_node, Node new_axis = (int)HandleNegativeAxis(axis, rank); // The axis attribute for Softmax in OpSet-11 and OpSet-13 are different. - // Details in function documentatin. + // Details in function documentation. 
if (is_since_opset_13 && new_axis != rank - 1) return false; int singlebatch_rank = rank - new_axis; diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc index 05627dd25857f..a5a7425453cc3 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc @@ -733,6 +733,24 @@ bool TopKNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& n return IsQDQPairSupported(q_node, dq_node, get_const_initializer, graph_viewer.ModelPath()); } +bool CumSumNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& node, const Node* redundant_clip_node, + const std::vector& dq_nodes, + const std::vector& q_nodes) const { + // Only the first input has DQ node + if (!CheckQDQNodes(graph_viewer, node, redundant_clip_node, dq_nodes, q_nodes, 1)) { + return false; + } + + int32_t dt_input = dq_nodes[0]->InputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); + int32_t dt_output = q_nodes[0]->OutputDefs()[0]->TypeAsProto()->tensor_type().elem_type(); + + if (dt_input != dt_output) { + return false; + } + + return true; +} + } // namespace QDQ } // namespace onnxruntime diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h index 36e04146040db..a4ac65b7c47ce 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h @@ -269,6 +269,14 @@ class TopKNodeGroupSelector : public NodeGroupSelector { const std::vector& q_nodes) const override; }; +// one DQ node for first input -> node -> Q +class CumSumNodeGroupSelector : public NodeGroupSelector { + bool Check(const GraphViewer& graph_viewer, + const Node& node, const Node* redundant_clip_node, + const std::vector& dq_nodes, + const std::vector& q_nodes) const override; +}; + /* * NodeSelector instances for use in the QDQ::SelectorActionTransformer. 
*/ diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc index e531d19d4c643..ccad361dc2491 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc @@ -143,6 +143,9 @@ static const OpVersionsAndSelector::OpVersionsMap GetPadOpVersionsMap() { static const OpVersionsAndSelector::OpVersionsMap GetTopKOpVersionsMap() { return {{"TopK", {}}}; } +static const OpVersionsAndSelector::OpVersionsMap GetCumSumOpVersionsMap() { + return {{"CumSum", {}}}; +} /* Selector rules registration related */ void RegisterMiscSelectors(Selectors& qdq_selectors) { @@ -258,6 +261,13 @@ void RegisterTopKSelector(Selectors& qdq_selectors) { std::move(selector)); } +void RegisterCumSumSelector(Selectors& qdq_selectors) { + /* register selector for cumsum op */ + std::unique_ptr selector = std::make_unique(); + qdq_selectors.RegisterSelector(GetCumSumOpVersionsMap(), + std::move(selector)); +} + void SelectorManager::CreateSelectors() { RegisterMiscSelectors(qdq_selectors_); RegisterDropDQSelectors(qdq_selectors_); @@ -275,6 +285,7 @@ void SelectorManager::CreateSelectors() { RegisterWhereSelectors(qdq_selectors_); RegisterPadSelectors(qdq_selectors_); RegisterTopKSelector(qdq_selectors_); + RegisterCumSumSelector(qdq_selectors_); } void SelectorManager::InitializeSelectorsMap() { diff --git a/onnxruntime/core/providers/cpu/quantization/quantize_linear.cc b/onnxruntime/core/providers/cpu/quantization/quantize_linear.cc index 3d3e831a12d13..ab7499d6f8317 100644 --- a/onnxruntime/core/providers/cpu/quantization/quantize_linear.cc +++ b/onnxruntime/core/providers/cpu/quantization/quantize_linear.cc @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include #include #include "core/framework/element_type_lists.h" #include "core/framework/float8.h" @@ -285,14 +286,31 @@ struct DequantizeLinearApply { * @param[in] zero_point same shape as scale */ void op(size_t M, size_t K, size_t N, const T* input, - const OutT* scale, OutT* output, const T* zero_point) { + const OutT* scale, OutT* output, const T* zero_point, concurrency::ThreadPool* thread_pool) { for (size_t m = 0; m < M; m++) { for (size_t k = 0; k < K; k++) { +#if defined(ORT_CLIENT_PACKAGE_BUILD) + // TODO: Only using multithreaded/SIMD DQ when ORT is built for client/on-device workloads. + // Make this the default behavior after more testing. + if constexpr (std::is_same_v || std::is_same_v) { + ParDequantizeLinearStd(input, output, N, scale[k], zero_point ? zero_point[k] : 0, thread_pool); + input += N; + output += N; + } else { + auto zp = zero_point ? static_cast(zero_point[k]) : 0; + auto sc = static_cast(scale[k]); + for (size_t n = 0; n < N; n++) { + *output++ = static_cast(static_cast(static_cast(*input++) - zp) * sc); + } + } +#else + ORT_UNUSED_PARAMETER(thread_pool); auto zp = zero_point ? 
static_cast(zero_point[k]) : 0; auto sc = static_cast(scale[k]); for (size_t n = 0; n < N; n++) { *output++ = static_cast(static_cast(static_cast(*input++) - zp) * sc); } +#endif // defined(ORT_CLIENT_PACKAGE_BUILD) } } } @@ -311,7 +329,8 @@ struct DequantizeLinearApply { * @param[in] zero_point same shape as scale */ void op(size_t M, size_t K, size_t N, size_t quant_block_size, - const T* input, const OutT* scale, OutT* output, const T* zero_point) { + const T* input, const OutT* scale, OutT* output, const T* zero_point, concurrency::ThreadPool* thread_pool) { + ORT_UNUSED_PARAMETER(thread_pool); if (zero_point) { for (size_t m = 0; m < M; m++) { for (size_t bd = 0; bd < K; bd += quant_block_size) { @@ -352,7 +371,8 @@ template struct DequantizeLinearApply { // per-tensor/layer or per-axis quantization void op(size_t M, size_t K, size_t N, - const T* input, const OutT* scale, OutT* output, const T* zero_point) { + const T* input, const OutT* scale, OutT* output, const T* zero_point, concurrency::ThreadPool* thread_pool) { + ORT_UNUSED_PARAMETER(thread_pool); size_t input_index = 0; for (size_t m = 0; m < M; m++) { @@ -378,7 +398,8 @@ struct DequantizeLinearApply { // Blocked quantization // TODO(fajin) : add mlas kernel to utilize multithreading, refer MlasDequantizeBlockwise. void op(size_t M, size_t K, size_t N, size_t quant_block_size, - const T* input, const OutT* scale, OutT* output, const T* zero_point) { + const T* input, const OutT* scale, OutT* output, const T* zero_point, concurrency::ThreadPool* thread_pool) { + ORT_UNUSED_PARAMETER(thread_pool); size_t input_index = 0; if (zero_point) { @@ -424,36 +445,36 @@ struct DequantizeLinearApply { #if !defined(DISABLE_FLOAT8_TYPES) -#define DEQUANTIZE_LINEAR_APPLY_FLOAT8(T) \ - template \ - struct DequantizeLinearApply { \ - /* Per-tensor/layer or per-axis quantization */ \ - void op(size_t M, size_t K, size_t N, \ - const T* input, const OutT* scale, OutT* output, const T*) { \ - for (size_t m = 0; m < M; m++) { \ - for (size_t bd = 0; bd < K; bd++) { \ - auto sc = scale[bd]; \ - for (size_t bs = 0; bs < N; bs++, input++) { \ - *output++ = static_cast(input->ToFloat() * sc); \ - } \ - } \ - } \ - } \ - /* Blocked quantization */ \ - void op(size_t M, size_t K, size_t N, size_t quant_block_size, \ - const T* input, const OutT* scale, OutT* output, const T*) { \ - for (size_t m = 0; m < M; m++) { \ - for (size_t bd = 0; bd < K; bd += quant_block_size) { \ - for (size_t qb = 0, qb_end = std::min(quant_block_size, K - bd); qb < qb_end; ++qb) { \ - for (size_t bs = 0; bs < N; bs++, input++) { \ - auto sc = static_cast(scale[bs]); \ - *output++ = static_cast(input->ToFloat() * sc); \ - } \ - } \ - scale += N; \ - } \ - } \ - } \ +#define DEQUANTIZE_LINEAR_APPLY_FLOAT8(T) \ + template \ + struct DequantizeLinearApply { \ + /* Per-tensor/layer or per-axis quantization */ \ + void op(size_t M, size_t K, size_t N, \ + const T* input, const OutT* scale, OutT* output, const T*, concurrency::ThreadPool*) { \ + for (size_t m = 0; m < M; m++) { \ + for (size_t bd = 0; bd < K; bd++) { \ + auto sc = scale[bd]; \ + for (size_t bs = 0; bs < N; bs++, input++) { \ + *output++ = static_cast(input->ToFloat() * sc); \ + } \ + } \ + } \ + } \ + /* Blocked quantization */ \ + void op(size_t M, size_t K, size_t N, size_t quant_block_size, \ + const T* input, const OutT* scale, OutT* output, const T*, concurrency::ThreadPool*) { \ + for (size_t m = 0; m < M; m++) { \ + for (size_t bd = 0; bd < K; bd += quant_block_size) { \ + for (size_t qb = 0, qb_end = 
std::min(quant_block_size, K - bd); qb < qb_end; ++qb) { \ + for (size_t bs = 0; bs < N; bs++, input++) { \ + auto sc = static_cast(scale[bs]); \ + *output++ = static_cast(input->ToFloat() * sc); \ + } \ + } \ + scale += N; \ + } \ + } \ + } \ }; DEQUANTIZE_LINEAR_APPLY_FLOAT8(Float8E4M3FN) @@ -497,6 +518,7 @@ Status DequantizeLinear::Compute(OpKernelContext* ctx) const { const auto to = x_scale.GetElementType(); const T* input = x.Data(); constexpr bool is_4bit = boost::mp11::mp_contains, T>::value; + concurrency::ThreadPool* thread_pool = ctx->GetOperatorThreadPool(); if (to == ONNX_NAMESPACE::TensorProto::FLOAT) { const float* scale = x_scale.Data(); @@ -506,12 +528,12 @@ Status DequantizeLinear::Compute(OpKernelContext* ctx) const { static_cast(broadcast_dim), static_cast(process_block_size), static_cast(block_size_), - input, scale, output, zero_point); + input, scale, output, zero_point, thread_pool); } else { DequantizeLinearApply().op(static_cast(process_block_count), static_cast(broadcast_dim), static_cast(process_block_size), - input, scale, output, zero_point); + input, scale, output, zero_point, thread_pool); } } else if (to == ONNX_NAMESPACE::TensorProto::FLOAT16) { const MLFloat16* scale = x_scale.Data(); @@ -521,12 +543,12 @@ Status DequantizeLinear::Compute(OpKernelContext* ctx) const { static_cast(broadcast_dim), static_cast(process_block_size), static_cast(block_size_), - input, scale, output, zero_point); + input, scale, output, zero_point, thread_pool); } else { DequantizeLinearApply().op(static_cast(process_block_count), static_cast(broadcast_dim), static_cast(process_block_size), - input, scale, output, zero_point); + input, scale, output, zero_point, thread_pool); } } else if (to == ONNX_NAMESPACE::TensorProto::BFLOAT16) { ORT_THROW("DequantizeLinear into BFLOAT16 is not implemented yet."); diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc index 247a0585423f8..53fef09aec0fa 100644 --- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc +++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc @@ -134,6 +134,10 @@ OpBuilderRegistrations::OpBuilderRegistrations() { CreateResizeOpBuilder("Resize", *this); } + { + CreateUpsampleOpBuilder("Upsample", *this); + } + { CreateTopKOpBuilder("TopK", *this); } @@ -170,9 +174,21 @@ OpBuilderRegistrations::OpBuilderRegistrations() { CreateExpandOpBuilder("Expand", *this); } + { + CreateEinsumOpBuilder("Einsum", *this); + } + { CreateMatMulOpBuilder("MatMul", *this); } + + { + CreateLSTMOpBuilder("LSTM", *this); + } + + { + CreateCumSumOpBuilder("CumSum", *this); + } } const IOpBuilder* GetOpBuilder(const std::string& onnx_op_type) { diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.h b/onnxruntime/core/providers/qnn/builder/op_builder_factory.h index e11eae84341fe..1cc8e12068cca 100644 --- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.h +++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.h @@ -75,6 +75,8 @@ void CreateSplitOpBuilder(const std::string& op_type, OpBuilderRegistrations& op void CreateResizeOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); +void CreateUpsampleOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); + void CreateTopKOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateTileOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); @@ 
-98,5 +100,12 @@ void CreateExpandOpBuilder(const std::string& op_type, OpBuilderRegistrations& o void CreateHardSigmoidOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateMatMulOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); + +void CreateEinsumOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); + +void CreateLSTMOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); + +void CreateCumSumOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); + } // namespace qnn } // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc index 02d2bf22b8144..6d580447a7978 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc @@ -138,6 +138,10 @@ Status BaseOpBuilder::ProcessInt64Tensors(QnnModelWrapper& qnn_model_wrapper, return Status::OK(); } for (size_t i = 0; i < input_names.size(); i++) { + if (input_names[i].size() == 0) { + // For optional inputs, the input_name is empty + continue; + } auto& input_tensorwrapper = qnn_model_wrapper.GetQnnTensorWrapper(input_names[i]); // Insert cast to int32 if input dtype is int64 if (input_tensorwrapper.GetTensorDataType() == QNN_DATATYPE_INT_64) { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h index 5d818ed3f7f6c..a83e8e064c7d0 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h @@ -107,6 +107,35 @@ class BaseOpBuilder : public IOpBuilder { const logging::Logger& logger, std::vector& input_names) const ORT_MUST_USE_RESULT; + template + Status AddQnnScalar(QnnModelWrapper& qnn_model_wrapper, + const NodeIndex& node_index, + const std::string& node_name, + const T& scalar, + const std::string& qnn_scalar_param_name, + std::vector& param_names) const { + Qnn_Scalar_t qnn_scalar = QNN_SCALAR_INIT; + if (std::is_same::value) { + qnn_scalar.dataType = QNN_DATATYPE_FLOAT_32; + qnn_scalar.floatValue = static_cast(scalar); + } else if (std::is_same::value) { + qnn_scalar.dataType = QNN_DATATYPE_UINT_32; + qnn_scalar.uint32Value = static_cast(scalar); + } else if (std::is_same::value) { + qnn_scalar.dataType = QNN_DATATYPE_INT_32; + qnn_scalar.int32Value = static_cast(scalar); + } else if (std::is_same::value) { + qnn_scalar.dataType = QNN_DATATYPE_BOOL_8; + qnn_scalar.bool8Value = static_cast(scalar); + } else { + ORT_RETURN_IF(true, "QNN EP: Unsupported scalar dtype"); + } + QnnParamWrapper qnn_param_wrapper(node_index, node_name, qnn_scalar_param_name, qnn_scalar); + param_names.push_back(qnn_param_wrapper.GetParamTensorName()); + qnn_model_wrapper.AddParamWrapper(std::move(qnn_param_wrapper)); + return Status::OK(); + } + Status SetOutputQParamEqualToInputIfNearlyEqual(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, const logging::Logger& logger, @@ -140,6 +169,7 @@ class BaseOpBuilder : public IOpBuilder { {"Less", QNN_OP_ELEMENT_WISE_LESS}, {"LessOrEqual", QNN_OP_ELEMENT_WISE_LESS_EQUAL}, {"Log", QNN_OP_ELEMENT_WISE_LOG}, + {"LSTM", QNN_OP_LSTM}, {"Max", QNN_OP_ELEMENT_WISE_MAXIMUM}, {"Min", QNN_OP_ELEMENT_WISE_MINIMUM}, {"Neg", QNN_OP_ELEMENT_WISE_NEG}, @@ -193,12 +223,14 @@ class 
BaseOpBuilder : public IOpBuilder { {"Reshape", QNN_OP_RESHAPE}, {"Resize", QNN_OP_RESIZE}, + {"Upsample", QNN_OP_RESIZE}, {"Flatten", QNN_OP_RESHAPE}, {"Squeeze", QNN_OP_RESHAPE}, {"Unsqueeze", QNN_OP_RESHAPE}, {"LogSoftmax", QNN_OP_LOG_SOFTMAX}, {"Concat", QNN_OP_CONCAT}, + {"CumSum", QNN_OP_CUMULATIVE_SUM}, {"Gemm", QNN_OP_FULLY_CONNECTED}, diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc index 193b507083360..a1a658d5d963c 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/clip_op_builder.cc @@ -94,13 +94,13 @@ Status ClipOpBuilder::ExplictOpCheck(QnnModelWrapper& qnn_model_wrapper, const N if (node_unit.Inputs().size() > 1) { const auto& min_input_name = node_unit.Inputs()[1].node_arg.Name(); if (!min_input_name.empty() && !qnn_model_wrapper.IsConstantInput(min_input_name)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN desn't support dynamic min/max."); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN doesn't support dynamic min/max."); } } if (node_unit.Inputs().size() > 2) { const auto& max_input_name = node_unit.Inputs()[2].node_arg.Name(); if (!max_input_name.empty() && !qnn_model_wrapper.IsConstantInput(max_input_name)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN desn't support dynamic min/max."); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN doesn't support dynamic min/max."); } } return Status::OK(); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/cumsum_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/cumsum_op_builder.cc new file mode 100644 index 0000000000000..68d2808a91e3e --- /dev/null +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/cumsum_op_builder.cc @@ -0,0 +1,148 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
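+// Overview: this builder maps ONNX CumSum onto QNN CumulativeSum. The constant axis
+// input becomes a UINT32 "axis" parameter (normalized to be non-negative), and the
+// exclusive/reverse attributes become BOOL_8 parameters; only their default value of 0
+// is accepted (see IsOpSupported below).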
+ +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" +#include "core/providers/qnn/builder/qnn_model_wrapper.h" +#include "core/providers/qnn/builder/qnn_utils.h" +#include "core/providers/qnn/builder/op_builder_factory.h" + +namespace onnxruntime { +namespace qnn { +namespace { + +Status GetOnnxAxis(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, uint32_t& onnx_axis) { + const auto& inputs = node_unit.Inputs(); + TensorInfo axis_input_info = {}; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[1], axis_input_info)); + ORT_RETURN_IF_NOT(axis_input_info.is_initializer, "axis must be initializers"); + std::vector axis_unpacked_tensor; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*axis_input_info.initializer_tensor, axis_unpacked_tensor)); + ORT_RETURN_IF_NOT(1 == static_cast(axis_unpacked_tensor.size() / sizeof(axis_input_info.qnn_data_type)), + "axis should be a single element"); + + int32_t axis = 0; + if (axis_input_info.qnn_data_type == QNN_DATATYPE_INT_64) { + axis = static_cast(*reinterpret_cast(axis_unpacked_tensor.data())); + } else { + axis = static_cast(*reinterpret_cast(axis_unpacked_tensor.data())); + } + + std::vector input_shape; + ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(inputs[0].node_arg, input_shape), "Cannot get shape"); + + auto rank = static_cast(input_shape.size()); + if (axis < 0) { + axis += rank; + } + + ORT_RETURN_IF_NOT((axis >= 0 && axis < static_cast(input_shape.size())), "QNN requires axis range [0, rank-1]."); + + onnx_axis = static_cast(axis); + + return Status::OK(); +} + +} // namespace + +class CumSumOpBuilder : public BaseOpBuilder { + public: + CumSumOpBuilder() : BaseOpBuilder("CumSumOpBuilder") {} + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CumSumOpBuilder); + + Status IsOpSupported(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger) const override ORT_MUST_USE_RESULT; + + Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const override ORT_MUST_USE_RESULT; + + Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + std::vector&& input_names, + const logging::Logger& logger, + bool do_op_validation) const override ORT_MUST_USE_RESULT; +}; + +Status CumSumOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger) const { + const auto& inputs = node_unit.Inputs(); + ORT_RETURN_IF_NOT(qnn_model_wrapper.IsConstantInput(inputs[1].node_arg.Name()), + "QNN CumSum needs axis as a param, hence input[1] must be a constant."); + + NodeAttrHelper node_helper(node_unit); + int64_t exclusive = node_helper.Get("exclusive", static_cast(0)); + int64_t reverse = node_helper.Get("reverse", static_cast(0)); + + // QNN HTP op validation passes for non-default values of attributes but fails in finalize. + // Hence adding the checks here. 
+ ORT_RETURN_IF_NOT(exclusive == 0, "QNN only supports default value 0 for exclusive attribute"); + ORT_RETURN_IF_NOT(reverse == 0, "QNN only supports default value 0 for reverse attribute"); + + return AddToModelBuilder(qnn_model_wrapper, node_unit, logger, true); +} + +Status CumSumOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const { + ORT_UNUSED_PARAMETER(do_op_validation); + const auto& inputs = node_unit.Inputs(); + ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[0], logger, input_names)); + return Status::OK(); +} + +Status CumSumOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + std::vector&& input_names, + const logging::Logger& logger, + bool do_op_validation) const { + ORT_UNUSED_PARAMETER(do_op_validation); + + std::vector param_tensor_names; + + // Add axis param + Qnn_Scalar_t axis_qnn_scalar = QNN_SCALAR_INIT; + uint32_t onnx_axis = 0; + ORT_RETURN_IF_ERROR(GetOnnxAxis(qnn_model_wrapper, node_unit, onnx_axis)); + axis_qnn_scalar.dataType = QNN_DATATYPE_UINT_32; + axis_qnn_scalar.uint32Value = onnx_axis; + QnnParamWrapper axis_param(node_unit.Index(), node_unit.Name(), QNN_OP_CUMULATIVE_SUM_PARAM_AXIS, axis_qnn_scalar); + param_tensor_names.push_back(axis_param.GetParamTensorName()); + qnn_model_wrapper.AddParamWrapper(std::move(axis_param)); + + // Add exclusive param + NodeAttrHelper node_helper(node_unit); + int64_t exclusive = node_helper.Get("exclusive", static_cast(0)); + Qnn_Scalar_t exclusive_qnn_scalar = QNN_SCALAR_INIT; + exclusive_qnn_scalar.dataType = QNN_DATATYPE_BOOL_8; + exclusive_qnn_scalar.bool8Value = static_cast(exclusive == 0 ? 0 : 1); + QnnParamWrapper exclusive_param(node_unit.Index(), node_unit.Name(), QNN_OP_CUMULATIVE_SUM_PARAM_EXCLUSIVE, exclusive_qnn_scalar); + param_tensor_names.push_back(exclusive_param.GetParamTensorName()); + qnn_model_wrapper.AddParamWrapper(std::move(exclusive_param)); + + // Add reverse param + int64_t reverse = node_helper.Get("reverse", static_cast(0)); + Qnn_Scalar_t reverse_qnn_scalar = QNN_SCALAR_INIT; + reverse_qnn_scalar.dataType = QNN_DATATYPE_BOOL_8; + reverse_qnn_scalar.bool8Value = static_cast(reverse == 0 ? 0 : 1); + QnnParamWrapper reverse_param(node_unit.Index(), node_unit.Name(), QNN_OP_CUMULATIVE_SUM_PARAM_REVERSE, reverse_qnn_scalar); + param_tensor_names.push_back(reverse_param.GetParamTensorName()); + qnn_model_wrapper.AddParamWrapper(std::move(reverse_param)); + + return ProcessOutputs(qnn_model_wrapper, node_unit, + std::move(input_names), + std::move(param_tensor_names), + logger, do_op_validation, GetQnnOpType(node_unit.OpType())); +} + +void CreateCumSumOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.AddOpBuilder(op_type, std::make_unique()); +} + +} // namespace qnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/einsum_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/einsum_op_builder.cc new file mode 100644 index 0000000000000..9db0b5202dcd4 --- /dev/null +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/einsum_op_builder.cc @@ -0,0 +1,396 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
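+// Overview: this builder handles binary Einsum equations of the form "term_1,term_2->result"
+// over lowercase subscripts and lowers three patterns to QNN MatMul: a plain matrix multiply
+// (e.g. "ij,jk->ik"), a multiply with the second input transposed (e.g. "id,jd->ij"), and a
+// 4-D multiply whose inputs and output are transposed on axes 1 and 2 (e.g. "bchq,bkhc->bkhq").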
+ +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" +#include "core/providers/qnn/builder/qnn_utils.h" +#include "core/providers/qnn/builder/qnn_model_wrapper.h" +#include "core/providers/qnn/builder/op_builder_factory.h" +#include "core/providers/cpu/tensor/slice_helper.h" + +namespace { + +// Represented as a tuple of 3 strings . +// The equation string is expected to follow the format "term_1,term_2->result" +using Equation = std::tuple; + +/** + * @brief Parses an equation string into its components if it adheres to the expected format. + * + * @param equation_string The input equation string to parse. + * @return A std::optional containing a tuple of 3 strings (term_1, term_2, result) if the parsing is successful. + * Returns std::nullopt if the input string is invalid or does not conform to the expected format. + */ +std::optional ParseEquation(std::string_view equation_string) { + std::string equation(equation_string); + equation.erase(std::remove(equation.begin(), equation.end(), ' '), + equation.end()); + if (equation.empty()) { + return std::nullopt; + } + auto index_arrow = equation.find("->"); + if (index_arrow == std::string::npos) { + return std::nullopt; + } + const std::string lhs = equation.substr(0, index_arrow); + const std::string result = equation.substr(index_arrow + 2); + if (lhs.empty() || result.empty()) { + return std::nullopt; + } + auto index_comma = lhs.find(","); + if (index_comma == std::string::npos) { + return std::nullopt; + } + const std::string term_1 = lhs.substr(0, index_comma); + const std::string term_2 = lhs.substr(index_comma + 1); + if (term_1.empty() || term_2.empty()) { + return std::nullopt; + } + if (term_1.size() < 2) { + return std::nullopt; + } + if (term_1.size() != term_2.size()) { + return std::nullopt; + } + if (term_1.size() != result.size()) { + return std::nullopt; + } + if (!std::all_of(term_1.begin(), term_1.end(), [](unsigned char c) { return std::islower(c); })) { + return std::nullopt; + } + if (!std::all_of(term_2.begin(), term_2.end(), [](unsigned char c) { return std::islower(c); })) { + return std::nullopt; + } + if (!std::all_of(result.begin(), result.end(), [](unsigned char c) { return std::islower(c); })) { + return std::nullopt; + } + return std::make_tuple(term_1, term_2, result); +} + +bool IsEquationMatMul(const Equation& equation) { + // MatMul: e.g., "ij,jk->ik" + const auto& [term_1, term_2, result] = equation; + const size_t num_dims = term_1.size(); + for (size_t i = 0; i < num_dims; ++i) { + if (i >= num_dims - 2) { + continue; + } + if (!(term_1[i] == term_2[i] && term_1[i] == result[i])) { + return false; + } + } + char term_1_m = term_1[num_dims - 2]; + char term_2_k = term_2[num_dims - 2]; + char result_m = result[num_dims - 2]; + char term_1_k = term_1[num_dims - 1]; + char term_2_n = term_2[num_dims - 1]; + char result_n = result[num_dims - 1]; + if (term_1_m != result_m) { + return false; + } + if (term_1_k != term_2_k) { + return false; + } + if (term_2_n != result_n) { + return false; + } + return true; +} + +bool IsEquationMatMulTransposeY(const Equation& equation) { + // MatMul with 2nd input transposed: e.g., "id,jd->ij" + const auto& [term_1, term_2, result] = equation; + const size_t num_dims = term_1.size(); + for (size_t i = 0; i < num_dims; ++i) { + if (i >= num_dims - 2) { + continue; + } + if (!(term_1[i] == term_2[i] && term_1[i] == result[i])) { + return false; + } + } + char term_1_m = term_1[num_dims - 2]; + char term_2_k = term_2[num_dims - 2]; + char result_m = 
result[num_dims - 2]; + char term_1_k = term_1[num_dims - 1]; + char term_2_n = term_2[num_dims - 1]; + char result_n = result[num_dims - 1]; + if (term_1_m != result_m) { + return false; + } + if (term_1_k != term_2_n) { + return false; + } + if (term_2_k != result_n) { + return false; + } + return true; +} + +bool IsEquationMatMulTransposeAll(const Equation& equation) { + // MatMul transpose both inputs and output, e.g., "bchq,bkhc->bkhq", "bkhq,bchk->bchq" + const auto& [term_1, term_2, result] = equation; + const size_t num_dims = term_1.size(); + if (num_dims != 4) { + return false; + } + if (term_1[0] != term_2[0] || term_1[0] != result[0]) { + return false; + } + char term_1_m = term_1[num_dims - 1]; + char term_1_k = term_1[num_dims - 3]; + char term_2_k = term_2[num_dims - 1]; + char term_2_n = term_2[num_dims - 3]; + char result_m = result[num_dims - 1]; + char result_n = result[num_dims - 3]; + if (term_1_m != result_m) { + return false; + } + if (term_1_k != term_2_k) { + return false; + } + if (term_2_n != result_n) { + return false; + } + return true; +} + +/** + * @brief Sets the parameter tensor names for a MatMul op. + * + * @param qnn_model_wrapper Pointer to the QnnModelWrapper instance that manages the QNN model. + * @param node_unit Reference to the NodeUnit representing the ONNX node for which the parameters are being set. + * @param transpose_in0 Boolean flag indicating whether the 1st input tensor should be transposed (default: false). + * @param transpose_in1 Boolean flag indicating whether the 2nd input tensor should be transposed (default: false). + * @return A vector of strings containing the names of the parameter tensors added to the QNN model. + */ +std::vector SetMatMulParamTensorNames( + onnxruntime::qnn::QnnModelWrapper* qnn_model_wrapper, + const onnxruntime::NodeUnit& node_unit, + bool transpose_in0 = false, + bool transpose_in1 = false) { + std::vector param_tensor_names; + Qnn_Scalar_t scalar_params[2] = {QNN_SCALAR_INIT, QNN_SCALAR_INIT}; + scalar_params[0].dataType = QNN_DATATYPE_BOOL_8; + scalar_params[1].dataType = QNN_DATATYPE_BOOL_8; + scalar_params[0].bool8Value = static_cast(transpose_in0); + scalar_params[1].bool8Value = static_cast(transpose_in1); + onnxruntime::qnn::QnnParamWrapper transpose_in0_param( + node_unit.Index(), node_unit.Name(), QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN0, scalar_params[0]); + onnxruntime::qnn::QnnParamWrapper transpose_in1_param( + node_unit.Index(), node_unit.Name(), QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, scalar_params[1]); + param_tensor_names.push_back(transpose_in0_param.GetParamTensorName()); + param_tensor_names.push_back(transpose_in1_param.GetParamTensorName()); + qnn_model_wrapper->AddParamWrapper(std::move(transpose_in0_param)); + qnn_model_wrapper->AddParamWrapper(std::move(transpose_in1_param)); + return param_tensor_names; +} + +/** + * @brief Creates a MatMul operation with transposed inputs and output in a QNN model. + * + * @param qnn_model_wrapper Pointer to the QnnModelWrapper instance used to manage the QNN model. + * @param node_unit The NodeUnit representing the ONNX node to be converted. + * @param do_op_validation A boolean flag indicating whether to perform operation validation. + * @return Status indicating success or failure of the operation. 
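+ * @param input_names QNN tensor names of the two ONNX inputs of the Einsum node.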
+ */ +Status CreateMatMulTransposeAll( + onnxruntime::qnn::QnnModelWrapper* qnn_model_wrapper, + const onnxruntime::NodeUnit& node_unit, + std::vector&& input_names, + bool do_op_validation) { + onnxruntime::qnn::TensorInfo input_info0{}, input_info1{}; + ORT_RETURN_IF_ERROR(qnn_model_wrapper->GetTensorInfo(node_unit.Inputs()[0], input_info0)); + ORT_RETURN_IF_ERROR(qnn_model_wrapper->GetTensorInfo(node_unit.Inputs()[1], input_info1)); + std::vector input_shape0(input_info0.shape); + std::vector input_shape1(input_info1.shape); + std::swap(input_shape0[1], input_shape0[2]); + std::swap(input_shape1[1], input_shape1[2]); + const std::string input_transpos0 = input_names[0] + "_t0"; + const std::string input_transpos1 = input_names[1] + "_t1"; + const std::vector transpose_perm{0, 2, 1, 3}; + ORT_RETURN_IF_ERROR(qnn_model_wrapper->AddTransposeNode( + /*node_index=*/node_unit.Index(), + /*input_name=*/input_names[0], + /*output_name=*/input_transpos0, + /*input_shape=*/input_info0.shape, + /*transpose_perm=*/transpose_perm, + /*output_shape=*/input_shape0, + /*qnn_data_type=*/input_info0.qnn_data_type, + /*quantize_param=*/input_info0.quant_param.Copy(), + /*do_op_validation=*/do_op_validation, + /*is_for_input=*/qnn_model_wrapper->IsGraphInput(input_names[0]))); + ORT_RETURN_IF_ERROR(qnn_model_wrapper->AddTransposeNode( + /*node_index=*/node_unit.Index(), + /*input_name=*/input_names[1], + /*output_name=*/input_transpos1, + /*input_shape=*/input_info1.shape, + /*transpose_perm=*/transpose_perm, + /*output_shape=*/input_shape1, + /*qnn_data_type=*/input_info1.qnn_data_type, + /*quantize_param=*/input_info1.quant_param.Copy(), + /*do_op_validation=*/do_op_validation, + /*is_for_input=*/qnn_model_wrapper->IsGraphInput(input_names[1]))); + onnxruntime::qnn::TensorInfo matmul_output_info{}; + const auto& output = node_unit.Outputs()[0]; + ORT_RETURN_IF_ERROR(qnn_model_wrapper->GetTensorInfo(output, matmul_output_info)); + const std::string matmul_output_name = onnxruntime::qnn::utils::GetNodeName(node_unit) + "_matmul"; + std::vector matmul_output_shape(matmul_output_info.shape); + std::swap(matmul_output_shape[1], matmul_output_shape[2]); + onnxruntime::qnn::QnnTensorWrapper matmul_output_wrapper( + matmul_output_name, QNN_TENSOR_TYPE_NATIVE, matmul_output_info.qnn_data_type, + matmul_output_info.quant_param.Copy(), std::vector(matmul_output_shape)); + ORT_RETURN_IF_NOT(qnn_model_wrapper->AddTensorWrapper(std::move(matmul_output_wrapper)), + node_unit.OpType() + " failed to add tensor."); + std::vector param_tensor_names = SetMatMulParamTensorNames( + qnn_model_wrapper, node_unit, /*transpose_in0=*/false, /*transpose_in1=*/false); + ORT_RETURN_IF_NOT(qnn_model_wrapper->CreateQnnNode(/*qnn_node_name=*/onnxruntime::qnn::utils::GetNodeName(node_unit), + /*package_name=*/QNN_OP_PACKAGE_NAME_QTI_AISW, + /*qnn_node_type=*/QNN_OP_MAT_MUL, + /*input_names=*/{input_transpos1, input_transpos0}, + /*output_names=*/{matmul_output_name}, + /*param_tensor_names=*/std::move(param_tensor_names), + /*do_op_validation=*/do_op_validation), + node_unit.OpType() + " failed to add node."); + std::vector transpose_output_shape(matmul_output_info.shape); + ORT_RETURN_IF_ERROR(qnn_model_wrapper->AddTransposeNode( + /*node_index=*/node_unit.Index(), + /*input_name=*/matmul_output_name, + /*output_name=*/output.node_arg.Name(), + /*input_shape=*/std::move(matmul_output_shape), + /*transpose_perm=*/transpose_perm, + /*output_shape=*/matmul_output_info.shape, + /*tensor_data_type=*/matmul_output_info.qnn_data_type, + 
/*quantize_param=*/matmul_output_info.quant_param.Copy(), + /*do_op_validation=*/do_op_validation, + /*is_for_input=*/qnn_model_wrapper->IsGraphInput(output.node_arg.Name()), + /*is_for_output=*/qnn_model_wrapper->IsGraphOutput(output.node_arg.Name()))); + return Status::OK(); +} + +} // namespace + +namespace onnxruntime { +namespace qnn { + +class EinsumOpBuilder : public BaseOpBuilder { + public: + EinsumOpBuilder() : BaseOpBuilder("EinsumOpBuilder") {} + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(EinsumOpBuilder); + + Status IsOpSupported(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger) const override ORT_MUST_USE_RESULT; + + protected: + Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const override ORT_MUST_USE_RESULT; + + Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + std::vector&& input_names, + const logging::Logger& logger, + bool do_op_validation) const override ORT_MUST_USE_RESULT; + + Status OverrideOutputQuantParam(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + const std::vector& input_names, + size_t output_index, + Qnn_DataType_t qnn_data_type, + QnnQuantParamsWrapper& quant_param) const override ORT_MUST_USE_RESULT; +}; + +Status EinsumOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger) const { + if (node_unit.Inputs().size() < 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, node_unit.OpType() + " requires at least 2 inputs."); + } + NodeAttrHelper node_helper{node_unit}; + const std::string equation = node_helper.Get("equation", std::string("")); + std::optional parsed_equation = ParseEquation(equation); + if (!parsed_equation.has_value()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, node_unit.OpType() + " unsupported equation: " + equation); + } + if (!IsEquationMatMul(parsed_equation.value()) && + !IsEquationMatMulTransposeY(parsed_equation.value()) && + !IsEquationMatMulTransposeAll(parsed_equation.value())) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, node_unit.OpType() + " unsupported equation: " + equation); + } + return AddToModelBuilder(qnn_model_wrapper, node_unit, logger, true); +} + +Status EinsumOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const { + ORT_UNUSED_PARAMETER(do_op_validation); + const auto& inputs = node_unit.Inputs(); + ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[0], logger, input_names)); + ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[1], logger, input_names)); + return Status::OK(); +} + +Status EinsumOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + std::vector&& input_names, + const logging::Logger& logger, + bool do_op_validation) const { + NodeAttrHelper node_helper(node_unit); + const std::string equation = node_helper.Get("equation", std::string("")); + std::optional parsed_equation = ParseEquation(equation); + if (IsEquationMatMul(parsed_equation.value())) { + std::vector param_tensor_names = SetMatMulParamTensorNames( + &qnn_model_wrapper, node_unit, /*transpose_in0=*/false, /*transpose_in1=*/false); + ORT_RETURN_IF_ERROR(ProcessOutputs(/*qnn_model_wrapper=*/qnn_model_wrapper, + 
/*node_unit=*/node_unit, + /*input_names=*/std::move(input_names), + /*param_tensor_names=*/std::move(param_tensor_names), + /*logger=*/logger, + /*do_op_validation=*/do_op_validation, + /*qnn_op_type=*/QNN_OP_MAT_MUL)); + } else if (IsEquationMatMulTransposeY(parsed_equation.value())) { + std::vector param_tensor_names = SetMatMulParamTensorNames( + &qnn_model_wrapper, node_unit, /*transpose_in0=*/false, /*transpose_in1=*/true); + ORT_RETURN_IF_ERROR(ProcessOutputs(/*qnn_model_wrapper=*/qnn_model_wrapper, + /*node_unit=*/node_unit, + /*input_names=*/std::move(input_names), + /*param_tensor_names=*/std::move(param_tensor_names), + /*logger=*/logger, + /*do_op_validation=*/do_op_validation, + /*qnn_op_type=*/QNN_OP_MAT_MUL)); + } else if (IsEquationMatMulTransposeAll(parsed_equation.value())) { + ORT_RETURN_IF_ERROR(CreateMatMulTransposeAll(&qnn_model_wrapper, node_unit, std::move(input_names), do_op_validation)); + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, node_unit.OpType() + " unsupported equation: " + equation); + } + return Status::OK(); +} + +Status EinsumOpBuilder::OverrideOutputQuantParam(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + const std::vector& input_names, + size_t output_index, + Qnn_DataType_t qnn_data_type, + QnnQuantParamsWrapper& quant_param) const { + if (!quant_param.IsPerTensor()) { + return Status::OK(); + } + + // Force the operator output to use the same quantization parameters as the input if nearly equal. + // This helps the HTP backend employ certain optimizations. + return SetOutputQParamEqualToInputIfNearlyEqual(qnn_model_wrapper, node_unit, logger, input_names, + 0 /*input_index*/, output_index, qnn_data_type, quant_param); +} + +void CreateEinsumOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.AddOpBuilder(op_type, std::make_unique()); +} + +} // namespace qnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/lstm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/lstm_op_builder.cc new file mode 100644 index 0000000000000..f131d58277038 --- /dev/null +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/lstm_op_builder.cc @@ -0,0 +1,807 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" +#include "core/providers/qnn/builder/qnn_model_wrapper.h" +#include "core/providers/qnn/builder/op_builder_factory.h" +#include "core/providers/qnn/builder/qnn_utils.h" + +namespace onnxruntime { +namespace qnn { + +class LSTMOpBuilder : public BaseOpBuilder { + public: + LSTMOpBuilder() : BaseOpBuilder("LSTMOpBuilder") {} + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(LSTMOpBuilder); + + protected: + /* + ONNX LSTM inputs: + in[0]: X [seq_length, batch_size, input_size], the input sequences packed + in[1]: W [num_directions, 4*hidden_size, input_size], the weight tensor for the gates. Concatenation of W[iofc] and WB[iofc] + in[2]: R [num_directions, 4*hidden_size, hidden_size], the recurrence weight tensor. Concatenation of R[iofc] and RB[iofc] + + ONNX LSTM optional inputs: + in[3]: B [num_directions, 8*hidden_size], the bias tensor for input gate. Concatenation of [Wb[iofc], Rb[iofc]], and [WBb[iofc], RBb[iofc]] (if bidirectional) + in[4]: sequence_lens + in[5]: initial_h [num_directions, batch_size, hidden_size]. + in[6]: initial_c [num_directions, batch_size, hidden_size]. 
+ in[7]: P [num_directions, 3*hidden_size], the weight tensor for peepholes. Concatenation of P[iof] and PB[iof] + + ONNX LSTM Parameters: + - activation_alpha ---> Not supported by QNN. + - activation_beta ---> Not supported by QNN. + - activations ---> Not supported by QNN. + - clip ---> Not supported by QNN since the clip in ONNX is applied to iofc while QNN only applies it to c. Refer to + https://github.com/microsoft/onnxruntime/blob/v1.21.0/onnxruntime/core/providers/cpu/rnn/uni_directional_lstm.cc + - direction + - hidden_size + - input_forget ---> Not supported by QNN + - layout: The shape format of inputs X, initial_h, initial_c and outputs Y, Y_h, Y_c. + If 0, the following shapes are expected: + X.shape = [seq_length, batch_size, input_size], + Y.shape = [seq_length, num_directions, batch_size, hidden_size], + initial_h.shape = Y_h.shape = initial_c.shape = Y_c.shape = [num_directions, batch_size, hidden_size]. + If 1, the following shapes are expected: + X.shape = [batch_size, seq_length, input_size], + Y.shape = [batch_size, seq_length, num_directions, hidden_size], + initial_h.shape = Y_h.shape = initial_c.shape = Y_c.shape = [batch_size, num_directions, hidden_size]. + + ONNX LSTM optional outputs: + out[0]: Y [seq_length, num_directions, batch_size, hidden_size] = stack of out[0] from QNN_LSTM for each direction + out[1]: Y_h [num_directions, batch_size, hidden_size] = stack of out[2] from QNN_LSTM for each direction + out[2]: Y_c [num_directions, batch_size, hidden_size] = stack of out[1] from QNN_LSTM for each direction + + QNN LSTM inputs: + in[0]: x_t: 2D of shape [batch_size, input_size] or + 3D of shape [time_steps, batch_size, input_size] if time_major + [batch_size, time_steps, input_size] else + in[1]: W_xf: input-to-forget weights [num_units, input_size] = ONNX in[1][direction, 2*hidden_size:3*hidden_size, :] + in[2]: W_xc: input-to-cell weights [num_units, input_size] = ONNX in[1][direction, 3*hidden_size:4*hidden_size, :] + in[3]: W_xo: input-to-output weights [num_units, input_size] = ONNX in[1][direction, 1*hidden_size:2*hidden_size, :] + in[4]: W_hf: recurrent-to-forget weights [num_units, output_size] = ONNX in[2][direction, 2*hidden_size:3*hidden_size, :] + in[5]: W_hc: recurrent-to-cell weights [num_units, output_size] = ONNX in[2][direction, 3*hidden_size:4*hidden_size, :] + in[6]: W_ho: recurrent-to-output weights [num_units, output_size] = ONNX in[2][direction, 1*hidden_size:2*hidden_size, :] + in[7]: b_f: forget gate bias [num_units] = ONNX in[3][direction, 2*hidden_size:3*hidden_size] + in[3][direction, 6*hidden_size:7*hidden_size] + in[8]: b_c: cell bias [num_units] = ONNX in[3][direction, 3*hidden_size:4*hidden_size] + in[3][direction, 7*hidden_size:8*hidden_size] + in[9]: b_o: output gate bias [num_units] = ONNX in[3][direction, 1*hidden_size:2*hidden_size] + in[3][direction, 5*hidden_size:6*hidden_size] + + # optional inputs + in[10]: h_t_init: hidden state init [batch_size, output_size] = ONNX in[5][direction] + in[11]: c_t_init: cell state init [batch_size, num_units] = ONNX in[6][direction] + in[12]: The input layer normalization weights ---> not supported on fp16 yet. + in[13]: The forget layer normalization weights ---> not supported on fp16 yet. + in[14]: The cell layer normalization weights ---> not supported on fp16 yet. + in[15]: The output layer normalization weights ---> not supported on fp16 yet. 
+ in[16]: W_xi: input-to-input weights [num_units, input_size] = ONNX in[1][direction, 0*hidden_size:1*hidden_size, :] + in[17]: W_hi: recurrent-to-input weights [num_units, output_size] = ONNX in[2][direction, 0*hidden_size:1*hidden_size, :] + in[18]: W_ci: cell-to-input weights [num_units] = ONNX in[7][direction, 0*hidden_size:1*hidden_size] + in[19]: W_cf: cell-to-forget weights [num_units] = ONNX in[7][direction, 2*hidden_size:3*hidden_size] + in[20]: W_co: cell-to-output weights [num_units] = ONNX in[7][direction, 1*hidden_size:2*hidden_size] + in[21]: b_i: input gate bias [num_units] = ONNX in[3][direction, 0*hidden_size:1*hidden_size] + in[3][direction, 4*hidden_size:5*hidden_size] + in[22]: W_proj: projection weights [output_size, num_units] ---> not used + in[23]: b_proj: projection bias [output_size] ---> not used + in[24]: reset: Determines if the internal state should be reset ---> not used + + QNN LSTM Parameters: + - direction + - cell_clip_threshold ---> not used + - output_clip_threshold ---> not used + - time_major + - input_gate_qscale ---> not used since we fallback to fp16. + - forget_gate_qscale ---> not used since we fallback to fp16. + - cell_gate_qscale ---> not used since we fallback to fp16. + - output_gate_qscale ---> not used since we fallback to fp16. + - hidden_state_offset ---> not used since we fallback to fp16. + - hidden_state_qscale ---> not used since we fallback to fp16. + + QNN LSTM outputs: + out[0]: h_t 2D of shape [batch_size, output_size] or + 3D of shape [time_steps, batch_size, output_size] if time_major + [batch_size, time_steps, output_size] else + out[1]: c_t [batch_size, num_unit] + out[2]: o_t [batch_size, output_size] + + QNN LSTM optional outputs: + out[3]: input_gate [batch_size, num_unit] ---> not used + out[4]: forget_gate [batch_size, num_unit] ---> not used + out[5]: cell_gate [batch_size, num_unit] ---> not used + out[6]: output_gate [batch_size, num_unit] ---> not used + out[7]: hidden_state [batch_size, output_size] ---> not used + */ + + Status IsOpSupported(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger) const override ORT_MUST_USE_RESULT; + + Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const override ORT_MUST_USE_RESULT; + + Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + std::vector&& input_names, + const logging::Logger& logger, + bool do_op_validation) const override ORT_MUST_USE_RESULT; + + private: + Status AddUnidirectionLSTM(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const std::string& direction, + const std::vector& input_names, + const logging::Logger& logger, + const bool& do_op_validation, + const bool& is_bidirection, + std::vector& uni_lstm_output_names) const; + Status AddStridedSliceOrReshape(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const std::string& input_name, + const std::string& output_name, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector>& ranges, + const uint32_t& begin_mask, + const uint32_t& end_mask, + const uint32_t& shrink_axes, + const uint32_t& new_axes_mask, + const Qnn_DataType_t& tensor_data_type, + const QnnQuantParamsWrapper& quantize_param, + bool do_op_validation, + bool is_for_input, + bool is_for_output) const; +}; + +Status 
LSTMOpBuilder::AddStridedSliceOrReshape(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const std::string& input_name, + const std::string& output_name, + const std::vector& input_shape, + const std::vector& output_shape, + const std::vector>& ranges, + const uint32_t& begin_mask, + const uint32_t& end_mask, + const uint32_t& shrink_axes, + const uint32_t& new_axes_mask, + const Qnn_DataType_t& tensor_data_type, + const QnnQuantParamsWrapper& quantize_param, + bool do_op_validation, + bool is_for_input, + bool is_for_output) const { + if (qnn_model_wrapper.IsQnnTensorWrapperExist(output_name)) { + return Status::OK(); + } + // add strided_slice or reshape + // this is not general condition, only limited to caller in this builder + size_t minSize = std::min(input_shape.size(), output_shape.size()); + if (input_shape[0] == 1 && std::equal(output_shape.rbegin(), output_shape.rbegin() + minSize, input_shape.rbegin())) { + // add Reshape + ORT_RETURN_IF_ERROR(qnn_model_wrapper.AddReshapeNode(input_name, + output_name, + input_shape, + output_shape, + tensor_data_type, + quantize_param.Copy(), + quantize_param.Copy(), + do_op_validation, + is_for_input, + is_for_output)); + } else { + // add StridedSlice + // inputs + QnnTensorWrapper input_tensorwrapper(input_name, is_for_input ? QNN_TENSOR_TYPE_APP_WRITE : QNN_TENSOR_TYPE_NATIVE, + tensor_data_type, quantize_param.Copy(), + std::vector(input_shape)); + ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(input_tensorwrapper)), + "Failed to add input tensor for inserted StridedSlice or Reshape."); + + // params + const std::string& node_name = output_name; + + // ranges + std::vector ranges_data; + for (size_t i = 0; i < ranges.size(); i++) { + for (size_t j = 0; j < 3; j++) { + ranges_data.emplace_back(SafeInt(ranges[i][j])); + } + } + QnnParamWrapper ranges_param_wrapper(node_unit.Index(), node_name, QNN_OP_STRIDED_SLICE_PARAM_RANGES, {static_cast(ranges.size()), 3}, std::move(ranges_data), true); + std::vector param_names = { + ranges_param_wrapper.GetParamTensorName(), + }; + qnn_model_wrapper.AddParamWrapper(std::move(ranges_param_wrapper)); + + // begin_mask + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_name, begin_mask, QNN_OP_STRIDED_SLICE_PARAM_BEGIN_MASK, param_names)); + + // end_mask + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_name, end_mask, QNN_OP_STRIDED_SLICE_PARAM_END_MASK, param_names)); + + // shrink_axes + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_name, shrink_axes, QNN_OP_STRIDED_SLICE_PARAM_SHRINK_AXES, param_names)); + + // new_axes_mask + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_name, new_axes_mask, QNN_OP_STRIDED_SLICE_PARAM_NEW_AXES_MASK, param_names)); + + // outputs + QnnTensorWrapper output_tensorwrapper(output_name, + is_for_output ? 
QNN_TENSOR_TYPE_APP_READ : QNN_TENSOR_TYPE_NATIVE, + tensor_data_type, + quantize_param.Copy(), + std::vector(output_shape)); + ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(output_tensorwrapper)), + "Failed to add output tensor for inserted StridedSlice."); + // addNode + ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(node_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_STRIDED_SLICE, {input_name}, + {output_name}, std::move(param_names), do_op_validation), + "Failed to create manually inserted Qnn StridedSlice node."); + } + + return Status::OK(); +} + +Status LSTMOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger) const { + ORT_UNUSED_PARAMETER(qnn_model_wrapper); + ORT_UNUSED_PARAMETER(node_unit); + ORT_UNUSED_PARAMETER(logger); + if (node_unit.Inputs().size() > 4 && node_unit.Inputs()[4].node_arg.Exists()) { + TensorInfo tensor_info = {}; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(node_unit.Inputs()[4], tensor_info)); + + ORT_RETURN_IF_NOT(tensor_info.is_initializer, "QNN EP: dynamic sequence_length is not supported."); + + std::vector sequence_lens_bytes; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.UnpackInitializerData(*tensor_info.initializer_tensor, sequence_lens_bytes)); + const size_t num_elems = sequence_lens_bytes.size() / sizeof(int32_t); + gsl::span sequence_lens{reinterpret_cast(sequence_lens_bytes.data()), num_elems}; + ORT_RETURN_IF(std::any_of(sequence_lens.begin(), + sequence_lens.end(), + [sequence_lens](int i) { return i != sequence_lens[0]; }), + "QNN EP: Only support LSTM with same sequence length."); + } + + NodeAttrHelper node_helper(node_unit); + const float clip = node_helper.Get("clip", (float)0.0); + ORT_RETURN_IF(clip != 0, + "QNN EP doesn't support non-default clip for LSTM."); + const std::vector activations = node_helper.Get("activations", std::vector{}); + ORT_RETURN_IF((activations.size() >= 3 && (activations[0] != "sigmoid" || activations[1] != "tanh" || activations[2] != "tanh")) || + (activations.size() == 6 && (activations[3] != "sigmoid" || activations[4] != "tanh" || activations[5] != "tanh")), + "QNN EP doesn't support non-default activations for LSTM."); + // TODO: Add support for layout==1 + const int64_t layout = node_helper.Get("layout", static_cast(0)); + ORT_RETURN_IF_NOT(layout == 0, + "QNN EP: Unsupported layout mode ", layout, " for ", node_unit.Name().c_str(), "."); + return Status::OK(); +} + +Status LSTMOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const { + ORT_UNUSED_PARAMETER(do_op_validation); + const auto& onnx_inputs = node_unit.Inputs(); + for (size_t i = 0; i < onnx_inputs.size(); i++) { + if (onnx_inputs[i].node_arg.Exists()) { + ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, onnx_inputs[i], logger, input_names)); + } else { + input_names.emplace_back(""); + } + } + return Status::OK(); +} + +Status LSTMOpBuilder::AddUnidirectionLSTM(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const std::string& direction, + const std::vector& input_names, + const logging::Logger& logger, + const bool& do_op_validation, + const bool& is_bidirection, + std::vector& uni_lstm_output_names) const { + ORT_UNUSED_PARAMETER(logger); + + const auto& onnx_inputs = node_unit.Inputs(); + const auto& onnx_outputs = node_unit.Outputs(); + const std::string& node_name = node_unit.Name(); + std::vector 
input_tensor_infos(onnx_inputs.size()); + for (size_t i = 0; i < onnx_inputs.size(); i++) { + if (onnx_inputs[i].node_arg.Exists()) { + ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(onnx_inputs[i], input_tensor_infos[i])); + } + } + // becuase QNN LSTM three outputs are mandatory, we should provide them tensor info + std::vector output_tensor_infos(3); + for (size_t i = 0; i < 3; i++) { + if (onnx_outputs.size() > i && onnx_outputs[i].node_arg.Exists()) { + ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(onnx_outputs[i], output_tensor_infos[i])); + } else { + output_tensor_infos[i].qnn_data_type = input_tensor_infos[0].qnn_data_type; + } + } + + NodeAttrHelper node_helper(node_unit); + const uint32_t hidden_size = node_helper.Get("hidden_size", 0); + const int32_t hidden_size_sign = SafeInt(hidden_size); + ORT_RETURN_IF_NOT(hidden_size > 0, "hidden size is not set for LSTM"); + const int64_t layout = node_helper.Get("layout", static_cast(0)); + + const uint32_t input_size = input_tensor_infos[0].shape[2]; + const uint32_t batch_size = layout == 0 ? input_tensor_infos[0].shape[1] : input_tensor_infos[0].shape[0]; + const uint32_t seq_length = layout == 0 ? input_tensor_infos[0].shape[0] : input_tensor_infos[0].shape[1]; + const int32_t direction_idx = input_tensor_infos[1].shape[0] < 2 || direction == "forward" ? 0 : 1; + + // params + std::vector param_names; + + // direction + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_unit.Name(), direction == "forward" ? QNN_OP_LSTM_DIRECTION_FORWARD : QNN_OP_LSTM_DIRECTION_REVERSE, QNN_OP_LSTM_PARAM_DIRECTION, param_names)); + + // cell_clip_threshold + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_unit.Name(), 0.0, QNN_OP_LSTM_PARAM_CELL_CLIP_THRESHOLD, param_names)); + + // output_clip_threshold + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_unit.Name(), 0.0, QNN_OP_LSTM_PARAM_OUTPUT_CLIP_THRESHOLD, param_names)); + + // time_major + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_unit.Name(), false, QNN_OP_LSTM_PARAM_TIME_MAJOR, param_names)); + + // // input_gate_qscale + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_unit.Name(), 0.0, QNN_OP_LSTM_PARAM_INPUT_GATE_QSCALE, param_names)); + + // // forget_gate_qscale + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_unit.Name(), 0.0, QNN_OP_LSTM_PARAM_FORGET_GATE_QSCALE, param_names)); + + // // cell_gate_qscale + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_unit.Name(), 0.0, QNN_OP_LSTM_PARAM_CELL_GATE_QSCALE, param_names)); + + // // output_gate_qscale + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_unit.Name(), 0.0, QNN_OP_LSTM_PARAM_OUTPUT_GATE_QSCALE, param_names)); + + // // hidden_state_offset + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_unit.Name(), 0.0, QNN_OP_LSTM_PARAM_HIDDEN_STATE_OFFSET, param_names)); + + // // hidden_state_qscale + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_unit.Name(), 0.0, QNN_OP_LSTM_PARAM_HIDDEN_STATE_QSCALE, param_names)); + + // Common LSTM cell inputs + const std::string null_tensor_name = "null_tensor"; + QnnTensorWrapper null_tensor_wrapper(null_tensor_name, QNN_TENSOR_TYPE_NULL, QNN_DATATYPE_UNDEFINED, + QnnQuantParamsWrapper(), std::vector{0}); + + qnn_model_wrapper.AddTensorWrapper(std::move(null_tensor_wrapper)); + std::vector 
qnn_lstm_input_names(24, null_tensor_name); + + // input W + { + // QNN in[1] = ONNX in[1][direction, 2*hidden_size:3*hidden_size, :] + // QNN in[2] = ONNX in[1][direction, 3*hidden_size:4*hidden_size, :] + // QNN in[3] = ONNX in[1][direction, 1*hidden_size:2*hidden_size, :] + // QNN in[16] = ONNX in[1][direction, 0*hidden_size:1*hidden_size, :] + uint32_t begin_mask = 0b000U; + uint32_t end_mask = 0b000U; + uint32_t shrink_axes = 0b001U; + uint32_t new_axes_mask = 0b000U; + std::vector qnn_input_indices = {1, 2, 3, 16}; + std::vector begins = {2, 3, 1, 0}; + std::vector qnn_lstm_weight_name = { + input_names[1] + "_input_to_forget_gate_weight_" + direction, + input_names[1] + "_input_to_cell_gate_weight_" + direction, + input_names[1] + "_input_to_output_gate_weight_" + direction, + input_names[1] + "_input_to_input_gate_weight_" + direction, + }; + for (size_t i = 0; i < 4; i++) { + std::vector> ranges = {{direction_idx, direction_idx + 1, 1}, + {begins[i] * hidden_size_sign, (begins[i] + 1) * hidden_size_sign, 1}, + {0, SafeInt(input_size), 1}}; + std::vector output_shape = {hidden_size, input_size}; + ORT_RETURN_IF_ERROR(AddStridedSliceOrReshape(/*qnn_model_wrapper=*/qnn_model_wrapper, + /*node_unit=*/node_unit, + /*input_name=*/input_names[1], + /*output_name=*/qnn_lstm_weight_name[i], + /*input_shape=*/input_tensor_infos[1].shape, + /*output_shape=*/output_shape, + /*ranges=*/ranges, + /*begin_mask=*/begin_mask, + /*end_mask=*/end_mask, + /*shrink_axes=*/shrink_axes, + /*new_axes_mask=*/new_axes_mask, + /*tensor_data_type=*/input_tensor_infos[1].qnn_data_type, + /*QnnQuantParamsWrapper=*/input_tensor_infos[1].quant_param, + /*do_op_validation=*/do_op_validation, + /*is_for_input=*/false, + /*is_for_output=*/false)); + qnn_lstm_input_names[qnn_input_indices[i]] = qnn_lstm_weight_name[i]; + } + } + + // input R + { + // QNN in[4] = ONNX in[2][direction, 2*hidden_size:3*hidden_size, :] + // QNN in[5] = ONNX in[2][direction, 3*hidden_size:4*hidden_size, :] + // QNN in[6] = ONNX in[2][direction, 1*hidden_size:2*hidden_size, :] + // QNN in[17] = ONNX in[2][direction, 0*hidden_size:1*hidden_size, :] + uint32_t begin_mask = 0b000U; + uint32_t end_mask = 0b000U; + uint32_t shrink_axes = 0b001U; + uint32_t new_axes_mask = 0b000U; + std::vector qnn_input_indices = {4, 5, 6, 17}; + std::vector begins = {2, 3, 1, 0}; + std::vector qnn_lstm_weight_name = { + input_names[2] + "_recurrent_to_forget_gate_weight_" + direction, + input_names[2] + "_recurrent_to_cell_gate_weight_" + direction, + input_names[2] + "_recurrent_to_output_gate_weight_" + direction, + input_names[2] + "_recurrent_to_input_gate_weight_" + direction}; + for (size_t i = 0; i < 4; i++) { + std::vector> ranges = {{direction_idx, direction_idx + 1, 1}, + {begins[i] * hidden_size_sign, (begins[i] + 1) * hidden_size_sign, 1}, + {0, hidden_size_sign, 1}}; + std::vector output_shape = {hidden_size, hidden_size}; + ORT_RETURN_IF_ERROR(AddStridedSliceOrReshape(/*qnn_model_wrapper=*/qnn_model_wrapper, + /*node_unit=*/node_unit, + /*input_name=*/input_names[2], + /*output_name=*/qnn_lstm_weight_name[i], + /*input_shape=*/input_tensor_infos[2].shape, + /*output_shape=*/output_shape, + /*ranges=*/ranges, + /*begin_mask=*/begin_mask, + /*end_mask=*/end_mask, + /*shrink_axes=*/shrink_axes, + /*new_axes_mask=*/new_axes_mask, + /*tensor_data_type=*/input_tensor_infos[2].qnn_data_type, + /*QnnQuantParamsWrapper=*/input_tensor_infos[2].quant_param, + /*do_op_validation=*/do_op_validation, + /*is_for_input=*/false, + /*is_for_output=*/false)); + 
qnn_lstm_input_names[qnn_input_indices[i]] = qnn_lstm_weight_name[i]; + } + } + + // input B + { + // QNN in[7] = ONNX in[3][direction, 2*hidden_size:3*hidden_size] + ONNX in[3][direction, 6*hidden_size:7*hidden_size] + // QNN in[8] = ONNX in[3][direction, 3*hidden_size:4*hidden_size] + ONNX in[3][direction, 7*hidden_size:8*hidden_size] + // QNN in[9] = ONNX in[3][direction, 1*hidden_size:2*hidden_size] + ONNX in[3][direction, 5*hidden_size:6*hidden_size] + // QNN in[21] = ONNX in[3][direction, 0*hidden_size:1*hidden_size] + ONNX in[3][direction, 4*hidden_size:5*hidden_size] + uint32_t begin_mask = 0b00U; + uint32_t end_mask = 0b00U; + uint32_t shrink_axes = 0b01U; + uint32_t new_axes_mask = 0b00U; + std::vector output_shape = {hidden_size}; + std::vector qnn_lstm_bias_name = { + node_name + "_forget_gate_bias_" + direction, + node_name + "_cell_gate_bias_" + direction, + node_name + "_output_gate_bias_" + direction, + node_name + "_input_gate_bias_" + direction}; + std::vector qnn_input_indices = {7, 8, 9, 21}; + if (onnx_inputs.size() > 3 && onnx_inputs[3].node_arg.Exists()) { + std::vector begins = {2, 3, 1, 0, 6, 7, 5, 4}; + std::vector onnx_lstm_bias_name = { + input_names[3] + "_input_to_forget_gate_bias_" + direction, + input_names[3] + "_input_to_cell_gate_bias_" + direction, + input_names[3] + "_input_to_output_gate_bias_" + direction, + input_names[3] + "_input_to_input_gate_bias_" + direction, + input_names[3] + "_recurrent_to_forget_gate_bias_" + direction, + input_names[3] + "_recurrent_to_cell_gate_bias_" + direction, + input_names[3] + "_recurrent_to_output_gate_bias_" + direction, + input_names[3] + "_recurrent_to_input_gate_bias_" + direction}; + for (size_t i = 0; i < 8; i++) { + std::vector> ranges = {{direction_idx, direction_idx + 1, 1}, + {begins[i] * hidden_size_sign, (begins[i] + 1) * hidden_size_sign, 1}}; + ORT_RETURN_IF_ERROR(AddStridedSliceOrReshape(/*qnn_model_wrapper=*/qnn_model_wrapper, + /*node_unit=*/node_unit, + /*input_name=*/input_names[3], + /*output_name=*/onnx_lstm_bias_name[i], + /*input_shape=*/input_tensor_infos[3].shape, + /*output_shape=*/output_shape, + /*ranges=*/ranges, + /*begin_mask=*/begin_mask, + /*end_mask=*/end_mask, + /*shrink_axes=*/shrink_axes, + /*new_axes_mask=*/new_axes_mask, + /*tensor_data_type=*/input_tensor_infos[3].qnn_data_type, + /*QnnQuantParamsWrapper=*/input_tensor_infos[3].quant_param, + /*do_op_validation=*/do_op_validation, + /*is_for_input=*/false, + /*is_for_output=*/false)); + } + for (size_t i = 0; i < 4; i++) { + std::vector add_input_names = {onnx_lstm_bias_name[i], onnx_lstm_bias_name[i + 4]}; + // TODO: The quantize_param should not be used directly, we should calculate an approximate quant_param here. 
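+ // ElementWiseAdd folds the ONNX Wb and Rb slices into the single per-gate bias tensor expected by
+ // QNN LSTM; the ONNX bias quant_param is reused for the sum (see TODO above).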
+ QnnTensorWrapper add_output_tensorwrapper(qnn_lstm_bias_name[i], QNN_TENSOR_TYPE_NATIVE, input_tensor_infos[3].qnn_data_type, + input_tensor_infos[3].quant_param.Copy(), std::vector(output_shape)); + ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(add_output_tensorwrapper)), + "QNN EP: Failed to add output tensor for inserted ElementWiseAdd node."); + ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(node_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_ELEMENT_WISE_ADD, + std::move(add_input_names), {qnn_lstm_bias_name[i]}, {}, do_op_validation), + "Failed to create manually inserted ElementWiseAdd node."); + qnn_lstm_input_names[qnn_input_indices[i]] = qnn_lstm_bias_name[i]; + } + } else { + // prepare zero bias + std::string zero_bias_name = node_name + "_zero_bias"; + QnnTensorWrapper zero_bias_tensor_wrapper(zero_bias_name, + QNN_TENSOR_TYPE_STATIC, + input_tensor_infos[0].qnn_data_type, + QnnQuantParamsWrapper(), + std::vector(output_shape), + std::vector(utils::GetElementSizeByType(input_tensor_infos[0].qnn_data_type) * hidden_size, 0)); + ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(zero_bias_tensor_wrapper)), + "Failed to add additional zero bias for QNN LSTM node."); + for (size_t i = 0; i < 4; i++) { + qnn_lstm_input_names[qnn_input_indices[i]] = zero_bias_name; + } + } + } + + // input P + if (onnx_inputs.size() > 7 && onnx_inputs[7].node_arg.Exists()) { + // QNN in[18] = ONNX in[7][direction, 0*hidden_size:1*hidden_size] + // QNN in[19] = ONNX in[7][direction, 2*hidden_size:1*hidden_size] + // QNN in[20] = ONNX in[7][direction, 1*hidden_size:1*hidden_size] + uint32_t begin_mask = 0b00U; + uint32_t end_mask = 0b00U; + uint32_t shrink_axes = 0b01U; + uint32_t new_axes_mask = 0b00U; + std::vector output_shape = {hidden_size}; + std::vector qnn_input_indices = {18, 19, 20}; + std::vector begins = {0, 2, 1}; + std::vector qnn_lstm_weight_name = { + input_names[7] + "_cell_to_input_gate_weight_" + direction, + input_names[7] + "_cell_to_forget_gate_weight_" + direction, + input_names[7] + "_cell_to_output_gate_weight_" + direction}; + for (size_t i = 0; i < 3; i++) { + std::vector> ranges = { + {direction_idx, direction_idx + 1, 1}, + {begins[i] * hidden_size_sign, (begins[i] + 1) * hidden_size_sign, 1}, + }; + ORT_RETURN_IF_ERROR(AddStridedSliceOrReshape(/*qnn_model_wrapper=*/qnn_model_wrapper, + /*node_unit=*/node_unit, + /*input_name=*/input_names[7], + /*output_name=*/qnn_lstm_weight_name[i], + /*input_shape=*/input_tensor_infos[7].shape, + /*output_shape=*/output_shape, + /*ranges=*/ranges, + /*begin_mask=*/begin_mask, + /*end_mask=*/end_mask, + /*shrink_axes=*/shrink_axes, + /*new_axes_mask=*/new_axes_mask, + /*tensor_data_type=*/input_tensor_infos[7].qnn_data_type, + /*QnnQuantParamsWrapper=*/input_tensor_infos[7].quant_param, + /*do_op_validation=*/do_op_validation, + /*is_for_input=*/false, + /*is_for_output=*/false)); + qnn_lstm_input_names[qnn_input_indices[i]] = qnn_lstm_weight_name[i]; + } + } + + // input initial h, c + { + // QNN in[10] = ONNX in[5][direction_idx, :, :] + // QNN in[11] = ONNX in[6][direction_idx, :, :] + uint32_t begin_mask = 0b000U; + uint32_t end_mask = 0b000U; + uint32_t shrink_axes = 0b001U; + uint32_t new_axes_mask = 0b000U; + std::vector> ranges = {{direction_idx, direction_idx + 1, 1}, + {0, SafeInt(batch_size), 1}, + {0, hidden_size_sign, 1}}; + std::vector src_indices = {5, 6}; + std::vector qnn_input_indices = {10, 11}; + std::vector output_shape = {batch_size, hidden_size}; + for (size_t i = 0; i < 2; i++) { + 
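+ // Slice out this direction's initial_h / initial_c when the corresponding ONNX input exists;
+ // otherwise a zero-filled static tensor is created below as the initial state.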
if (onnx_inputs.size() > src_indices[i] && onnx_inputs[src_indices[i]].node_arg.Exists()) { + std::string qnn_lstm_input_name = input_names[src_indices[i]] + "_" + direction; + ORT_RETURN_IF_ERROR(AddStridedSliceOrReshape(/*qnn_model_wrapper=*/qnn_model_wrapper, + /*node_unit=*/node_unit, + /*input_name=*/input_names[src_indices[i]], + /*output_name=*/qnn_lstm_input_name, + /*input_shape=*/input_tensor_infos[src_indices[i]].shape, + /*output_shape=*/output_shape, + /*ranges=*/ranges, + /*begin_mask=*/begin_mask, + /*end_mask=*/end_mask, + /*shrink_axes=*/shrink_axes, + /*new_axes_mask=*/new_axes_mask, + /*tensor_data_type=*/input_tensor_infos[src_indices[i]].qnn_data_type, + /*QnnQuantParamsWrapper=*/input_tensor_infos[src_indices[i]].quant_param, + /*do_op_validation=*/do_op_validation, + /*is_for_input=*/false, + /*is_for_output=*/false)); + qnn_lstm_input_names[qnn_input_indices[i]] = qnn_lstm_input_name; + } else { + // prepare zero initial values + std::string zero_initial_values_name = node_name + "_LSTM_initial_values_" + (i == 0 ? "h" : "c"); + QnnTensorWrapper zero_bias_tensor_wrapper(zero_initial_values_name, + QNN_TENSOR_TYPE_STATIC, + input_tensor_infos[0].qnn_data_type, + QnnQuantParamsWrapper(), + std::vector(output_shape), + std::vector(utils::GetElementSizeByType(input_tensor_infos[0].qnn_data_type) * batch_size * hidden_size, 0)); + ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(zero_bias_tensor_wrapper)), + "Failed to add additional initial values for QNN LSTM node."); + qnn_lstm_input_names[qnn_input_indices[i]] = zero_initial_values_name; + } + } + } + + // add QNN LSTM + // since HTP doesn't not support 3d yet, add #sequence_length LSTM node + std::vector qnn_all_hidden_state_names; + qnn_all_hidden_state_names.resize(seq_length); + for (uint32_t i = 0; i < seq_length; i++) { + uint32_t sequence_idx = direction == "forward" ? 
i : seq_length - i - 1; + // Add LSTM inputs + std::vector qnn_lstm_input_names_i = qnn_lstm_input_names; + + // input X + { + // QNN in[0] = ONNX in[0][sequence_idx, :, :] + uint32_t begin_mask = 0b000U; + uint32_t end_mask = 0b000U; + uint32_t shrink_axes = 0b001U; + uint32_t new_axes_mask = 0b000U; + std::vector> ranges = {{SafeInt(sequence_idx), SafeInt(sequence_idx + 1), 1}, + {0, SafeInt(batch_size), 1}, + {0, SafeInt(input_size), 1}}; + std::string qnn_lstm_input_name = input_names[0] + "_cell_" + std::to_string(sequence_idx) + "_input"; + std::vector output_shape = {batch_size, input_size}; + ORT_RETURN_IF_ERROR(AddStridedSliceOrReshape(/*qnn_model_wrapper=*/qnn_model_wrapper, + /*node_unit=*/node_unit, + /*input_name=*/input_names[0], + /*output_name=*/qnn_lstm_input_name, + /*input_shape=*/input_tensor_infos[0].shape, + /*output_shape=*/output_shape, + /*ranges=*/ranges, + /*begin_mask=*/begin_mask, + /*end_mask=*/end_mask, + /*shrink_axes=*/shrink_axes, + /*new_axes_mask=*/new_axes_mask, + /*tensor_data_type=*/input_tensor_infos[0].qnn_data_type, + /*QnnQuantParamsWrapper=*/input_tensor_infos[0].quant_param, + /*do_op_validation=*/do_op_validation, + /*is_for_input=*/false, + /*is_for_output=*/false)); + qnn_lstm_input_names_i[0] = qnn_lstm_input_name; + } + + // outputs + std::vector qnn_lstm_output_shape = {batch_size, hidden_size}; + + std::vector qnn_lstm_output_names = { + node_name + "_QNN_LSTM_output_all_hidden_state_" + std::to_string(sequence_idx) + "_" + direction, + node_name + "_QNN_LSTM_output_cell_state_" + std::to_string(sequence_idx) + "_" + direction, + node_name + "_QNN_LSTM_output_hidden_state_" + std::to_string(sequence_idx) + "_" + direction}; + qnn_lstm_input_names[10] = qnn_lstm_output_names[2]; // update initial_h + qnn_lstm_input_names[11] = qnn_lstm_output_names[1]; // update initial_c + qnn_all_hidden_state_names[sequence_idx] = qnn_lstm_output_names[2]; + + for (size_t j = 0; j < 3; j++) { + QnnTensorWrapper output_tensorwrapper(qnn_lstm_output_names[j], + QNN_TENSOR_TYPE_NATIVE, + output_tensor_infos[j].qnn_data_type, + output_tensor_infos[j].quant_param.Copy(), + std::vector(qnn_lstm_output_shape)); + ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(output_tensorwrapper)), + "QNN EP: Failed to add %ldth output tensor for QNN LSTM.", j); + } + std::string lstm_node_name = node_name + "_cell_" + std::to_string(sequence_idx) + "_" + direction; + ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(lstm_node_name, QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_LSTM, + std::move(qnn_lstm_input_names_i), std::move(qnn_lstm_output_names), + std::vector(param_names), do_op_validation), + "QNN EP: Failed to create Qnn LSTM node."); + } + + // pack all timestamp outputs together for onnx output[0] + std::string qnn_pack_output_name = node_name + "_QNN_LSTM_output_hidden_state_all_" + direction; + + // add pack for output[0] + std::vector pack_param_names; + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), qnn_pack_output_name, 0, QNN_OP_PACK_PARAM_AXIS, pack_param_names)); + + QnnTensorWrapper pack_output_tensorwrapper(qnn_pack_output_name, + QNN_TENSOR_TYPE_NATIVE, + output_tensor_infos[0].qnn_data_type, + output_tensor_infos[0].quant_param.Copy(), + {seq_length, batch_size, hidden_size}); + ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(pack_output_tensorwrapper)), + "QNN EP: Failed to add output tensor for QNN Pack."); + ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(qnn_pack_output_name, 
QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_PACK, + std::move(qnn_all_hidden_state_names), {qnn_pack_output_name}, + std::move(pack_param_names), do_op_validation), + "QNN EP: Failed to create Qnn Pack node."); + + // add reshape for all outputs to align onnx output shape for unidirection + std::vector qnn_reshape_input_names = { + qnn_pack_output_name, + qnn_lstm_input_names[10], + qnn_lstm_input_names[11]}; + std::vector> qnn_lstm_output_shapes = { + {seq_length, batch_size, hidden_size}, + {batch_size, hidden_size}, + {batch_size, hidden_size}}; + // in the output shapes below, the value of 1 indicates unidirectional + std::vector> onnx_lstm_output_shapes = { + {seq_length, 1, batch_size, hidden_size}, + {1, batch_size, hidden_size}, + {1, batch_size, hidden_size}}; + for (size_t i = 0; i < 3; i++) { + if (onnx_outputs.size() > i && onnx_outputs[i].node_arg.Exists()) { + const std::string reshape_output_name = is_bidirection ? qnn_reshape_input_names[i] + "_unsqueeze_" + direction : onnx_outputs[i].node_arg.Name(); + ORT_RETURN_IF_ERROR(qnn_model_wrapper.AddReshapeNode(/*input_name=*/qnn_reshape_input_names[i], + /*output_name=*/reshape_output_name, + /*input_shape=*/qnn_lstm_output_shapes[i], + /*output_shape=*/onnx_lstm_output_shapes[i], + /*tensor_data_type=*/output_tensor_infos[i].qnn_data_type, + /*quantize_param=*/output_tensor_infos[i].quant_param, + /*do_op_validation=*/do_op_validation, + /*is_for_input=*/false, + /*is_for_output=*/qnn_model_wrapper.IsGraphOutput(reshape_output_name))); + uni_lstm_output_names.emplace_back(reshape_output_name); + } else { + uni_lstm_output_names.emplace_back(""); + } + } + return Status::OK(); +} + +Status LSTMOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + std::vector&& input_names, + const logging::Logger& logger, + bool do_op_validation) const { + ORT_UNUSED_PARAMETER(do_op_validation); + const auto& inputs = node_unit.Inputs(); + + NodeAttrHelper node_helper(node_unit); + std::string direction = node_helper.Get("direction", "forward"); + ORT_RETURN_IF_NOT(inputs.size() >= 3 && inputs.size() <= 8, "LSTM should receive inputs ranging from 3 to 8!"); + + if (direction == "bidirectional") { + std::vector uni_lstm_output_names_forward, uni_lstm_output_names_reverse; + ORT_RETURN_IF_ERROR(AddUnidirectionLSTM(qnn_model_wrapper, node_unit, "forward", input_names, logger, do_op_validation, true, uni_lstm_output_names_forward)); + ORT_RETURN_IF_ERROR(AddUnidirectionLSTM(qnn_model_wrapper, node_unit, "reverse", input_names, logger, do_op_validation, true, uni_lstm_output_names_reverse)); + + // Concat forward and reverse output + for (size_t i = 0; i < 3; i++) { + TensorInfo output_info = {}; + if (node_unit.Outputs().size() > i && node_unit.Outputs()[i].node_arg.Exists()) { + ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(node_unit.Outputs()[i], output_info)); + std::string onnx_output_name = node_unit.Outputs()[i].node_arg.Name(); + + // param + std::vector concat_param_names; + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), onnx_output_name, static_cast(output_info.shape.size() - 3), QNN_OP_CONCAT_PARAM_AXIS, concat_param_names)); + + // create tensor and add op + Qnn_TensorType_t output_tensor_type = qnn_model_wrapper.IsGraphOutput(onnx_output_name) ? 
QNN_TENSOR_TYPE_APP_READ : QNN_TENSOR_TYPE_NATIVE; + QnnTensorWrapper concat_output_tensorwrapper(onnx_output_name, + output_tensor_type, + output_info.qnn_data_type, + output_info.quant_param.Copy(), + std::vector(output_info.shape)); + ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(concat_output_tensorwrapper)), + "QNN EP: Failed to add output tensor for QNN Concat."); + ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(node_unit.Name(), QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_CONCAT, + {uni_lstm_output_names_forward[i], uni_lstm_output_names_reverse[i]}, {onnx_output_name}, + std::move(concat_param_names), do_op_validation), + "QNN EP: Failed to create Qnn Concat node."); + } + } + } else { + std::vector uni_lstm_output_names; + ORT_RETURN_IF_ERROR(AddUnidirectionLSTM(qnn_model_wrapper, node_unit, direction, input_names, logger, do_op_validation, false, uni_lstm_output_names)); + } + return Status::OK(); +} + +void CreateLSTMOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.AddOpBuilder(op_type, std::make_unique()); +} + +} // namespace qnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc index 19e5ee298f5fb..bcf4df8186dd2 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/slice_op_builder.cc @@ -46,7 +46,7 @@ Status SliceOpBuilder::ExplictOpCheck(QnnModelWrapper& qnn_model_wrapper, const for (size_t i = 1; i < input_count; i++) { const auto& next_input = node_unit.Inputs()[i].node_arg.Name(); if (!qnn_model_wrapper.IsConstantInput(next_input)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN desn't support dynamic slice."); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN doesn't support dynamic slice."); } } } diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc index 555992ef00bfe..cba1faaa4fa2d 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/tile_op_builder.cc @@ -42,7 +42,7 @@ Status TileOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, std::vector& input_names, bool do_op_validation) const { const auto& inputs = node_unit.Inputs(); - // QNN Tile only support 1 input, the 2nd input need to be initialier and set as Qnn node parameter + // QNN Tile only support 1 input, the 2nd input need to be initializer and set as Qnn node parameter if (do_op_validation) { auto& repeats_input_name = inputs[1].node_arg.Name(); ORT_RETURN_IF_NOT(qnn_model_wrapper.IsConstantInput(repeats_input_name), @@ -60,7 +60,7 @@ Status TileOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wra const logging::Logger& logger, bool do_op_validation) const { std::vector param_tensor_names; - // Already confirmed repeats input is initailizer in ProcessInputs() + // Already confirmed repeats input is initializer in ProcessInputs() const auto& repeats_input_name = node_unit.Inputs()[1].node_arg.Name(); std::vector unpacked_tensor; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/upsample_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/upsample_op_builder.cc new file mode 100644 index 0000000000000..cba0eb350992f --- /dev/null +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/upsample_op_builder.cc 
@@ -0,0 +1,219 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include +#include +#include +#include + +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" +#include "core/providers/qnn/builder/qnn_utils.h" +#include "core/providers/qnn/builder/qnn_model_wrapper.h" +#include "core/providers/qnn/builder/op_builder_factory.h" + +namespace onnxruntime { +namespace qnn { + +class UpsampleOpBuilder : public BaseOpBuilder { + public: + UpsampleOpBuilder() : BaseOpBuilder("UpsampleOpBuilder") {} + ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(UpsampleOpBuilder); + + Status IsOpSupported(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger) const final ORT_MUST_USE_RESULT; + + protected: + Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const override ORT_MUST_USE_RESULT; + + Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + std::vector&& input_names, + const logging::Logger& logger, + bool do_op_validation) const override ORT_MUST_USE_RESULT; + + Status OverrideOutputQuantParam(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + const std::vector& input_names, + size_t output_index, + Qnn_DataType_t qnn_data_type, + QnnQuantParamsWrapper& quant_param) const override ORT_MUST_USE_RESULT; + + private: + const std::unordered_map supported_modes = { + {"nearest", QNN_OP_RESIZE_INTERPOLATION_MODE_NEAREST}, + {"linear", QNN_OP_RESIZE_INTERPOLATION_MODE_LINEAR}, + {"cubic", QNN_OP_RESIZE_INTERPOLATION_MODE_CUBIC}}; + + // Info for Onnx Upsample attribute {, } + const OnnxAttrInfo onnx_mode_attr = {"mode", "nearest"}; +}; + +Status UpsampleOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger) const { + // Resize ops are sensitive with data layout, no special validation so far + // The nodes from 1st call of GetCapability do not get layout transformer applied, it's still NCHW + // The nodes from 2nd call of GetCapability get layout transformer applied, it's NHWC + // Need to do op validation in 1st call of GetCapability + if (node_unit.Domain() == kMSInternalNHWCDomain) { + return AddToModelBuilder(qnn_model_wrapper, node_unit, logger, true); + } + + const bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType()); + NodeAttrHelper node_helper(node_unit); + + // Check mode + const std::string interp_mode = GetOnnxAttr(node_helper, onnx_mode_attr); + ORT_RETURN_IF_NOT(supported_modes.find(interp_mode) != supported_modes.end(), + "QNN EP: Resize does not support mode ", interp_mode.c_str()); + + const auto& input_0 = node_unit.Inputs()[0]; + std::vector input_shape; + ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(input_0.node_arg, input_shape), + "QNN EP: Cannot get input shape for Onnx Upsample ", input_0.node_arg.Name().c_str()); + const size_t input_rank = input_shape.size(); + + ORT_RETURN_IF(is_npu_backend && (input_rank < 3 || input_rank > 5), + "QNN EP: The input rank for Resize must be at least 3 and no greater than 5 on the HTP."); + + const auto& output_0 = node_unit.Outputs()[0]; + std::vector output_shape; + ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(output_0.node_arg, output_shape), + "QNN EP: Cannot get output shape for Onnx Upsample ", 
output_0.node_arg.Name().c_str(), + ". Dynamic scales input is not supported in QNN EP."); + + // Check that only the spatial dimensions (width, height) are resized. The batch_size (N) and channels (C) should + // be untouched. This code runs before layout transformation, so we know that the current layout is "channel first" + // (e.g., N, C, S1, S2, ..., SN). + ORT_RETURN_IF_NOT(input_shape[0] == output_shape[0] && input_shape[1] == output_shape[1], + "QNN EP: Resize may only change the spatial dimensions."); + + if (!is_npu_backend) { + ONNX_NAMESPACE::DataType input_data_type = input_0.node_arg.Type(); + ORT_RETURN_IF(input_data_type != ONNX_NAMESPACE::Utils::DataTypeUtils::ToType("float"), + "QNN EP: Data type ", input_data_type->c_str(), + " is not supported for Resize operator in CPU backend."); + } + + return Status::OK(); +} + +Status UpsampleOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const { + const int opset_version = node_unit.SinceVersion(); + const auto& inputs = node_unit.Inputs(); + + if (opset_version > 7 && do_op_validation) { + const std::string& scales_input_name = inputs[1].node_arg.Name(); + ORT_RETURN_IF_NOT(qnn_model_wrapper.IsConstantInput(scales_input_name), + "QNN doesn't support dynamic scales input for ONNX Upsample op ", node_unit.Name().c_str()); + } + + // Only need to consider the first input of Onnx upsample. + ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[0], logger, input_names)); + + return Status::OK(); +} + +Status UpsampleOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + std::vector&& input_names, + const logging::Logger& logger, + bool do_op_validation) const { + std::vector param_tensor_names; + NodeAttrHelper node_helper(node_unit); + const std::string interp_mode = GetOnnxAttr(node_helper, onnx_mode_attr); + + const auto& input_0 = node_unit.Inputs()[0]; + std::vector input_shape; + ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(input_0.node_arg, input_shape), + "QNN EP: Cannot get input shape for Onnx Upsample ", input_0.node_arg.Name().c_str()); + + const size_t input_rank = input_shape.size(); + const bool is_npu_backend = IsNpuBackend(qnn_model_wrapper.GetQnnBackendType()); + std::string qnn_op_type = GetQnnOpType(node_unit.OpType()); + + if (is_npu_backend && input_rank == 4 && interp_mode != "cubic") { + // Translate QNN's Resize to QNN's ResizeNearestNeighbor/ResizeBilinear to achieve better performance on + // the HTP backend. QNN's ResizeNearestNeighbor and ResizeBilinear are only supported when input rank is 4. + qnn_op_type = (interp_mode == "nearest") ? QNN_OP_RESIZE_NEAREST_NEIGHBOR : QNN_OP_RESIZE_BILINEAR; + + // Parameter 'align_corners' + const std::string align_corners_param_name = (qnn_op_type == QNN_OP_RESIZE_BILINEAR) + ? QNN_OP_RESIZE_BILINEAR_PARAM_ALIGN_CORNERS + : QNN_OP_RESIZE_NEAREST_NEIGHBOR_PARAM_ALIGN_CORNERS; + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_unit.Name(), false, align_corners_param_name, param_tensor_names)); + + // Parameter 'half_pixel_centers' + const std::string half_pixel_centers_param_name = (qnn_op_type == QNN_OP_RESIZE_BILINEAR) + ? 
QNN_OP_RESIZE_BILINEAR_PARAM_HALF_PIXEL_CENTERS + : QNN_OP_RESIZE_NEAREST_NEIGHBOR_PARAM_HALF_PIXEL_CENTERS; + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_unit.Name(), false, half_pixel_centers_param_name, param_tensor_names)); + + if (qnn_op_type == QNN_OP_RESIZE_BILINEAR) { + // Parameter 'antialias' + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_unit.Name(), false, QNN_OP_RESIZE_BILINEAR_PARAM_ANTIALIAS, param_tensor_names)); + } + } else { + // Remain as QNN's Resize. + // Parameter 'exclude_outside' + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_unit.Name(), false, QNN_OP_RESIZE_PARAM_EXCLUDE_OUTSIDE, param_tensor_names)); + + // Parameter 'transformation_mode' + uint32_t transformation_mode = (supported_modes.at(interp_mode) == QNN_OP_RESIZE_INTERPOLATION_MODE_NEAREST) + ? static_cast(QNN_OP_RESIZE_TRANSFORMATION_MODE_HALF_PIXEL) + : static_cast(QNN_OP_RESIZE_TRANSFORMATION_MODE_ASYMMETRIC); + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_unit.Name(), transformation_mode, QNN_OP_RESIZE_PARAM_TRANSFORMATION_MODE, param_tensor_names)); + + // Parameter 'interpolation_mode' + uint32_t qnn_interp_mode = static_cast(supported_modes.at(interp_mode)); + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_unit.Name(), qnn_interp_mode, QNN_OP_RESIZE_PARAM_INTERPOLATION_MODE, param_tensor_names)); + + // Parameter 'nearest_mode'. Process only when 'interpolation_mode' is NEAREST. + if (qnn_interp_mode == QNN_OP_RESIZE_INTERPOLATION_MODE_NEAREST) { + uint32_t qnn_nearest_mode = static_cast(QNN_OP_RESIZE_NEAREST_MODE_ROUND_PREFER_FLOOR); + ORT_RETURN_IF_ERROR(AddQnnScalar(qnn_model_wrapper, node_unit.Index(), node_unit.Name(), qnn_nearest_mode, QNN_OP_RESIZE_PARAM_NEAREST_MODE, param_tensor_names)); + } + } + + ORT_RETURN_IF_ERROR(ProcessOutputs(qnn_model_wrapper, node_unit, + std::move(input_names), + std::move(param_tensor_names), + logger, do_op_validation, qnn_op_type)); + + return Status::OK(); +} + +Status UpsampleOpBuilder::OverrideOutputQuantParam(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + const std::vector& input_names, + size_t output_index, + Qnn_DataType_t qnn_data_type, + QnnQuantParamsWrapper& quant_param) const { + if (!quant_param.IsPerTensor()) { + return Status::OK(); + } + + // Force Resize op's output to use the same quantization parameters as the input if nearly equal. + // This helps the HTP backend employ certain optimizations. 
+ return SetOutputQParamEqualToInputIfNearlyEqual(qnn_model_wrapper, node_unit, logger, input_names, + 0 /*input_index*/, output_index, qnn_data_type, quant_param); +} + +void CreateUpsampleOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.AddOpBuilder(op_type, std::make_unique()); +} + +} // namespace qnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index aea354d0550b7..522226ae9e438 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -1145,13 +1145,33 @@ Status QnnBackendManager::SetHtpPowerConfig(uint32_t htp_power_config_client_id, return Status::OK(); } -Status QnnBackendManager::SetRpcControlLatency(uint32_t htp_power_config_client_id, - uint32_t rpc_control_latency) { +Status QnnBackendManager::SetRpcPowerConfigs(uint32_t htp_power_config_client_id, + uint32_t rpc_control_latency, + uint32_t rpc_polling_time) { // This function is called in QNN EP's OnRunStart() even if QNN backend setup failed and the model is assigned // to a different EP. Therefore, we have to check that backend setup actually completed before trying to // set RPC control latency. Otherwise, this causes a segfault because the QNN backend library is unloaded. ORT_RETURN_IF_NOT(backend_setup_completed_, "Cannot set HTP RPC control latency if backend setup is not complete."); + + constexpr int kNumRpcPollingPowerConfigs = 2; + std::vector rpc_power_configs; + rpc_power_configs.reserve(kNumRpcPollingPowerConfigs); + + // Set rpc control latency here if (rpc_control_latency != 0) { + auto& rpc_control_latency_cfg = rpc_power_configs.emplace_back(); + rpc_control_latency_cfg.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; + rpc_control_latency_cfg.rpcControlLatencyConfig = rpc_control_latency; + } + + // Note: v68 does not support rpc polling mode + if (rpc_polling_time != 0) { + auto& rpc_polling_time_cfg = rpc_power_configs.emplace_back(); + rpc_polling_time_cfg.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + rpc_polling_time_cfg.rpcPollingTimeConfig = rpc_polling_time; + } + + if (rpc_power_configs.size() > 0) { QnnDevice_Infrastructure_t qnn_device_infra = nullptr; auto status = qnn_interface_.deviceGetInfrastructure(&qnn_device_infra); ORT_RETURN_IF(QNN_SUCCESS != status, "backendGetPerfInfrastructure failed."); @@ -1161,15 +1181,6 @@ Status QnnBackendManager::SetRpcControlLatency(uint32_t htp_power_config_client_ "HTP infra type = ", htp_infra->infraType, ", which is not perf infra type."); QnnHtpDevice_PerfInfrastructure_t& htp_perf_infra = htp_infra->perfInfra; - // Set rpc control latency here, but note that v68 doesn't support rpc polling mode. - constexpr int kNumRpcPollingPowerConfigs = 2; - std::vector rpc_power_configs(kNumRpcPollingPowerConfigs); - QnnHtpPerfInfrastructure_PowerConfig_t& rpc_control_latency_cfg = rpc_power_configs[0]; - // v68 doesn't support this. 
- QnnHtpPerfInfrastructure_PowerConfig_t& rpc_polling_time = rpc_power_configs[1]; - rpc_control_latency_cfg.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY; - rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; - rpc_control_latency_cfg.rpcControlLatencyConfig = rpc_control_latency; std::vector perf_power_configs_ptr = ObtainNullTermPtrVector(rpc_power_configs); status = htp_perf_infra.setPowerConfig(htp_power_config_client_id, perf_power_configs_ptr.data()); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index 137b3856d431d..1a65d6039695f 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -90,8 +90,9 @@ class QnnBackendManager : public std::enable_shared_from_this Status SetHtpPowerConfig(uint32_t htp_power_config_client_id, HtpPerformanceMode htp_performance_mode); - Status SetRpcControlLatency(uint32_t htp_power_config_client_id, - uint32_t rpc_control_latency); + Status SetRpcPowerConfigs(uint32_t htp_power_config_client_id, + uint32_t rpc_control_latency, + uint32_t rpc_polling_time); const QNN_INTERFACE_VER_TYPE& GetQnnInterface() { return qnn_interface_; } diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.cc b/onnxruntime/core/providers/qnn/builder/qnn_model.cc index 3f2faea698259..0b2412b021675 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model.cc @@ -8,10 +8,10 @@ #include #include "QnnOpDef.h" -#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/op_builder_factory.h" -#include "core/providers/qnn/builder/qnn_node_group.h" +#include "core/providers/qnn/builder/qnn_node_group/qnn_node_group.h" #include "core/providers/qnn/builder/qnn_utils.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/qnn_allocator.h" #include "core/providers/qnn/shared_context.h" @@ -180,14 +180,16 @@ Status QnnModel::SetupQnnInputOutput(const logging::Logger& logger) { auto result = SetupTensors(qnn_input_infos_, graph_info_->InputTensors()); if (Status::OK() != result) { - LOGS(logger, ERROR) << "Failed to setup QNN input output tensors for graph: " << graph_info_->Name(); - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to setup QNN input tensors!"); + const std::string message = "Failed to setup QNN input tensors for graph: " + graph_info_->Name(); + LOGS(logger, ERROR) << message; + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, message); } result = SetupTensors(qnn_output_infos_, graph_info_->OutputTensors(), false); if (Status::OK() != result) { - LOGS(logger, ERROR) << "Failed to setup QNN input output tensors for graph: " << graph_info_->Name(); - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to setup QNN output tensors!"); + const std::string message = "Failed to setup QNN output tensors for graph: " + graph_info_->Name(); + LOGS(logger, ERROR) << message; + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, message); } return Status::OK(); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h index d3d552bc172ec..cbc052cbebe25 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h @@ -7,8 +7,8 @@ #include #include +#include 
"core/providers/qnn/builder/qnn_node_group/qnn_node_group.h" #include "core/providers/qnn/ort_api.h" -#include "core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h index 0a1b16d24ffcd..51243b9ffa79b 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h @@ -7,8 +7,8 @@ #include #include +#include "core/providers/qnn/builder/qnn_node_group/qnn_node_group.h" #include "core/providers/qnn/ort_api.h" -#include "core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc index 85969b9e2dc05..dd2834c49e8f9 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.cc @@ -1,8 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/qnn/builder/qnn_node_group.h" - #include #include #include @@ -10,13 +8,16 @@ #include #include #include -#include "core/providers/qnn/ort_api.h" -#include "core/providers/qnn/builder/qnn_utils.h" -#include "core/providers/qnn/builder/qnn_model_wrapper.h" + #include "core/providers/qnn/builder/op_builder_factory.h" +#include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/qnn_node_group/dq_q_fusion.h" #include "core/providers/qnn/builder/qnn_node_group/hardsigmoid_mul_fusion.h" +#include "core/providers/qnn/builder/qnn_node_group/qnn_node_group.h" #include "core/providers/qnn/builder/qnn_node_group/reshape_gemm_fusion.h" +#include "core/providers/qnn/builder/qnn_node_group/scale_softmax_fusion.h" +#include "core/providers/qnn/builder/qnn_utils.h" +#include "core/providers/qnn/ort_api.h" namespace onnxruntime { namespace qnn { @@ -90,6 +91,7 @@ static std::unique_ptr TryQnnFusions( {"DequantizeLinear", DQQFusion::TryFusion}, {"HardSigmoid", HardSigmoidMulFusion::TryFusion}, {"Gemm", ReshapeGemmFusion::TryFusion}, + {"Mul", ScaleSoftmaxFusion::TryFusion}, }; // For now, all fusions involve standalone node units (i.e., no wrapping DQ/Q nodes). 
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.h similarity index 100% rename from onnxruntime/core/providers/qnn/builder/qnn_node_group.h rename to onnxruntime/core/providers/qnn/builder/qnn_node_group/qnn_node_group.h diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/reshape_gemm_fusion.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/reshape_gemm_fusion.h index 6c953e6cf72c5..7e3f4b962a15c 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/reshape_gemm_fusion.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/reshape_gemm_fusion.h @@ -9,8 +9,8 @@ #include #include +#include "core/providers/qnn/builder/qnn_node_group/qnn_node_group.h" #include "core/providers/qnn/ort_api.h" -#include "core/providers/qnn/builder/qnn_node_group.h" namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/scale_softmax_fusion.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/scale_softmax_fusion.cc new file mode 100644 index 0000000000000..5c7091b3be3cc --- /dev/null +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/scale_softmax_fusion.cc @@ -0,0 +1,226 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/qnn/builder/qnn_node_group/scale_softmax_fusion.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "core/providers/qnn/builder/qnn_utils.h" +#include "core/providers/qnn/builder/op_builder_factory.h" +#include "core/providers/qnn/builder/qnn_node_group/utils.h" +#include "core/providers/qnn/builder/qnn_model_wrapper.h" +#include "core/providers/qnn/builder/opbuilder/base_op_builder.h" + +namespace onnxruntime { +namespace qnn { +namespace { + +constexpr char kOpMul[] = "Mul"; +constexpr char kOpSoftmax[] = "Softmax"; + +/// @brief Get the index of the scalar input in the mul node +/// @param mul Multiply node unit +/// @return The index of the scalar input (0 or 1) if found, otherwise std::nullopt +std::optional GetMulScalarInputIndex(const NodeUnit* mul) { + const NodeArg* mul_y = mul->GetNode().InputDefs()[1]; + const NodeArg* mul_x = mul->GetNode().InputDefs()[0]; + auto y_shape_proto = mul_y->Shape(); + auto x_shape_proto = mul_x->Shape(); + bool is_y_scalar = false; + if (y_shape_proto != nullptr) { + auto y_shape = utils::GetTensorProtoShape(*y_shape_proto); + is_y_scalar = y_shape.NumDimensions() == 0; + } + bool is_x_scalar = false; + if (x_shape_proto != nullptr) { + auto x_shape = utils::GetTensorProtoShape(*x_shape_proto); + is_x_scalar = x_shape.NumDimensions() == 0; + } + if (is_y_scalar) { + return 1U; + } else if (is_x_scalar) { + return 0U; + } + return std::nullopt; +} + +/// @brief Get the axis for softmax +/// @param mul Multiply node unit +/// @param softmax Softmax node unit +/// @return The axis for softmax +std::optional GetPositiveSoftmaxAxis(const NodeUnit* mul, const NodeUnit* softmax) { + NodeAttrHelper softmax_attr_helper(softmax->GetNode()); + std::optional param_axis = softmax_attr_helper.GetInt64(QNN_OP_SOFTMAX_PARAM_AXIS); + if (!param_axis.has_value()) { + return std::nullopt; + } + int64_t axis_value = param_axis.value(); + if (axis_value < 0) { + size_t input_scale_index = GetMulScalarInputIndex(mul).value(); + size_t input_other_index = 1U - input_scale_index; + int rank = mul->GetNode().InputDefs()[input_other_index]->Shape()->dim_size(); 
+ axis_value += static_cast(rank); + } + return static_cast(axis_value); +} + +/// @brief Identify scalar input from mul node if present +/// @param mul Multiply node unit +/// @return The scalar input float value if found, otherwise std::nullopt +std::optional ExtractScalarValueFromMul(const GraphViewer& graph_viewer, const NodeUnit* mul) { + std::optional input_scale_index = GetMulScalarInputIndex(mul); + if (!input_scale_index.has_value()) { + return std::nullopt; + } + const NodeArg* scalar_arg = mul->GetNode().InputDefs()[input_scale_index.value()]; + if (!graph_viewer.IsConstantInitializer(scalar_arg->Name(), true)) { + return std::nullopt; + } + const auto* scalar_tensor = graph_viewer.GetConstantInitializer(scalar_arg->Name()); + if (!scalar_tensor) { + return std::nullopt; + } + if (scalar_tensor->data_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT) { + return std::nullopt; + } + const auto& raw_data = scalar_tensor->raw_data(); + if (raw_data.size() != sizeof(float) || reinterpret_cast(raw_data.data()) % alignof(float) != 0) { + return std::nullopt; + } + return *reinterpret_cast(raw_data.data()); +} + +/// @brief Create or validate the QNN node +/// @param qnn_model_wrapper QNN model wrapper +/// @param node_units The node units containing the softmax and mul nodes +/// @param validate Whether to validate the QNN node +/// @return Status +Status CreateOrValidateOnQnn( + QnnModelWrapper* qnn_model_wrapper, + gsl::span node_units, + bool validate) { + const NodeUnit* mul = node_units[0]; + const NodeUnit* softmax = node_units[1]; + ORT_RETURN_IF_NOT(mul->OpType() == kOpMul, + "Expected scale node to be of type Mul, got ", mul->OpType()); + ORT_RETURN_IF_NOT(softmax->OpType() == kOpSoftmax, + "Expected softmax node to be of type Softmax, got ", softmax->OpType()); + size_t input_scale_index = GetMulScalarInputIndex(mul).value(); + size_t input_other_index = 1U - input_scale_index; + const NodeUnitIODef& mul_input_other = mul->Inputs()[input_other_index]; + const NodeUnitIODef& softmax_output = softmax->Outputs()[0]; + + std::vector param_tensor_names; + { // axis + std::optional axis = GetPositiveSoftmaxAxis(mul, softmax); + if (axis.has_value()) { + Qnn_Scalar_t axis_scalar = QNN_SCALAR_INIT; + axis_scalar.dataType = QNN_DATATYPE_UINT_32; + axis_scalar.uint32Value = axis.value(); + QnnParamWrapper param_wrapper(softmax->Index(), + softmax->Name(), + QNN_OP_SOFTMAX_PARAM_AXIS, + axis_scalar); + ORT_RETURN_IF_NOT(qnn_model_wrapper->AddParamWrapper(std::move(param_wrapper)), "Failed to add param"); + param_tensor_names.push_back(param_wrapper.GetParamTensorName()); + } + } + { // beta + NodeAttrHelper softmax_attr_helper(softmax->GetNode()); + std::optional beta = softmax_attr_helper.GetFloat(QNN_OP_SOFTMAX_PARAM_BETA); + float scale = ExtractScalarValueFromMul(qnn_model_wrapper->GetGraphViewer(), mul).value_or(1.0f); + Qnn_Scalar_t beta_scalar = QNN_SCALAR_INIT; + beta_scalar.dataType = QNN_DATATYPE_FLOAT_32; + beta_scalar.floatValue = scale * beta.value_or(1.0f); + QnnParamWrapper param_wrapper(softmax->Index(), + softmax->Name(), + QNN_OP_SOFTMAX_PARAM_BETA, + beta_scalar); + ORT_RETURN_IF_NOT(qnn_model_wrapper->AddParamWrapper(std::move(param_wrapper)), "Failed to add param"); + param_tensor_names.push_back(param_wrapper.GetParamTensorName()); + } + + QnnTensorWrapper fused_softmax_input; + QnnTensorWrapper fused_softmax_output; + ORT_RETURN_IF_ERROR(qnn_model_wrapper->MakeTensorWrapper(mul_input_other, fused_softmax_input)); + 
ORT_RETURN_IF_ERROR(qnn_model_wrapper->MakeTensorWrapper(softmax_output, fused_softmax_output)); + + if (validate) { + ORT_RETURN_IF_ERROR(qnn_model_wrapper->ValidateQnnNode(softmax->Name(), + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_SOFTMAX, + {fused_softmax_input.GetQnnTensor()}, + {fused_softmax_output.GetQnnTensor()}, + {})); + } else { + ORT_RETURN_IF_NOT(qnn_model_wrapper->AddTensorWrapper(std::move(fused_softmax_input)), "Failed to add input"); + ORT_RETURN_IF_NOT(qnn_model_wrapper->AddTensorWrapper(std::move(fused_softmax_output)), "Failed to add output"); + ORT_RETURN_IF_NOT(qnn_model_wrapper->CreateQnnNode(softmax->Name(), + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_SOFTMAX, + {mul_input_other.node_arg.Name()}, + {softmax_output.node_arg.Name()}, + std::move(param_tensor_names), + validate), + "Failed to add fused " + std::string(kOpSoftmax) + " node."); + } + return Status::OK(); +} + +} // namespace + +std::unique_ptr ScaleSoftmaxFusion::TryFusion( + QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& mul_node_unit, + const std::unordered_map& node_to_node_unit, + const std::unordered_map& node_unit_to_qnn_node_group, + [[maybe_unused]] const logging::Logger& logger) { + if (mul_node_unit.OpType() != kOpMul || mul_node_unit.UnitType() != NodeUnit::Type::SingleNode) { + return nullptr; + } + // Check if the mul node has a scalar input that can fold into the softmax's beta + const GraphViewer& graph_viewer = qnn_model_wrapper.GetGraphViewer(); + std::optional scalar = ExtractScalarValueFromMul(graph_viewer, &mul_node_unit); + if (!scalar.has_value()) { + return nullptr; + } + + // Mul node must have a single Softmax node as child + const std::array child_op_types{kOpSoftmax}; + const NodeUnit* softmax = GetOnlyChildOfType(graph_viewer, mul_node_unit, child_op_types, + node_to_node_unit, node_unit_to_qnn_node_group); + if (softmax == nullptr) { + return nullptr; + } + + std::array node_unit_array{&mul_node_unit, softmax}; + auto node_units = gsl::make_span(node_unit_array.data(), 2); + if (CreateOrValidateOnQnn(&qnn_model_wrapper, node_units, /*validate=*/true) != Status::OK()) { + return nullptr; + } + return std::make_unique(node_units); +} + +gsl::span ScaleSoftmaxFusion::GetNodeUnits() const { + return gsl::span{node_units_.data(), node_units_.size()}; +} + +Status ScaleSoftmaxFusion::IsSupported( + QnnModelWrapper& qnn_model_wrapper, [[maybe_unused]] const logging::Logger& logger) const { + return CreateOrValidateOnQnn(&qnn_model_wrapper, GetNodeUnits(), /*validate=*/true); +} + +Status ScaleSoftmaxFusion::AddToModelBuilder( + QnnModelWrapper& qnn_model_wrapper, [[maybe_unused]] const logging::Logger& logger) const { + return CreateOrValidateOnQnn(&qnn_model_wrapper, GetNodeUnits(), /*validate=*/false); +} + +} // namespace qnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/scale_softmax_fusion.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/scale_softmax_fusion.h new file mode 100644 index 0000000000000..66eb892e7a884 --- /dev/null +++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/scale_softmax_fusion.h @@ -0,0 +1,54 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include "core/providers/qnn/builder/qnn_node_group/qnn_node_group.h"
+#include "core/providers/qnn/ort_api.h"
+
+namespace onnxruntime {
+namespace qnn {
+
+class QnnModelWrapper;
+
+/// <summary>
+/// Represents a fusion of pattern: Softmax(Mul(x, scalar_scale)) => QnnSoftmax(x, beta=scalar_scale)
+/// </summary>
+class ScaleSoftmaxFusion : public IQnnNodeGroup {
+ public:
+  explicit ScaleSoftmaxFusion(gsl::span<const NodeUnit* const> node_units) {
+    ORT_ENFORCE(node_units.size() == 2, "Pattern expects exactly 2 NodeUnits.");
+    node_units_[0] = node_units[0];
+    node_units_[1] = node_units[1];
+  }
+  ORT_DISALLOW_COPY_AND_ASSIGNMENT(ScaleSoftmaxFusion);
+
+  Status IsSupported(QnnModelWrapper& qnn_model_wrapper, const logging::Logger& logger) const override;
+  Status AddToModelBuilder(QnnModelWrapper& qnn_model_wrapper, const logging::Logger& logger) const override;
+  gsl::span<const NodeUnit* const> GetNodeUnits() const override;
+  const NodeUnit* GetTargetNodeUnit() const override { return node_units_[1]; }
+  std::string_view Type() const override { return "ScaleSoftmaxFusion"; }
+
+  /// <summary>
+  /// Traverses the graph to check if the given starting NodeUnit is part of a valid Mul -> Softmax sequence.
+  /// If so, returns an IQnnNodeGroup that contains the Mul and Softmax NodeUnits.
+  /// </summary>
+  static std::unique_ptr<IQnnNodeGroup> TryFusion(
+      QnnModelWrapper& qnn_model_wrapper,
+      const NodeUnit& mul_node_unit,
+      const std::unordered_map<const Node*, const NodeUnit*>& node_to_node_unit,
+      const std::unordered_map<const NodeUnit*, const IQnnNodeGroup*>& node_unit_to_qnn_node_group,
+      const logging::Logger& logger);
+
+ private:
+  std::array<const NodeUnit*, 2> node_units_;
+};
+
+}  // namespace qnn
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc
index 93b2fca296389..bd74f3d43b325 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.cc
@@ -4,8 +4,8 @@
 #include
 #include
+#include "core/providers/qnn/builder/qnn_node_group/qnn_node_group.h"
 #include "core/providers/qnn/ort_api.h"
-#include "core/providers/qnn/builder/qnn_node_group.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h
index c4cf4e8a20a92..f0b2afb67006e 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_node_group/utils.h
@@ -7,8 +7,8 @@
 #include
 #include
+#include "core/providers/qnn/builder/qnn_node_group/qnn_node_group.h"
 #include "core/providers/qnn/ort_api.h"
-#include "core/providers/qnn/builder/qnn_node_group.h"
 
 namespace onnxruntime {
 namespace qnn {
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
index 4fe223d821f1c..cafd727c6a057 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc
@@ -40,7 +40,7 @@ size_t GetElementSizeByType(const Qnn_DataType_t& data_type) {
       {QNN_DATATYPE_UFIXED_POINT_8, 1},
       {QNN_DATATYPE_UFIXED_POINT_16, 2},
       {QNN_DATATYPE_UFIXED_POINT_32, 4},
-  };
+      {QNN_DATATYPE_UNDEFINED, 1}};
 
   auto pos = data_type_to_size.find(data_type);
   ORT_ENFORCE(pos != data_type_to_size.end(), "Unknown QNN data type", data_type);
@@ -228,6 +228,9 @@ std::ostream& operator<<(std::ostream& out, const Qnn_DataType_t& data_type) {
     case QNN_DATATYPE_UFIXED_POINT_4:
      out <<
"QNN_DATATYPE_UFIXED_POINT_4"; break; + case QNN_DATATYPE_UNDEFINED: + out << "QNN_DATATYPE_UNDEFINED"; + break; default: ORT_THROW("Unknown Qnn Data type"); } diff --git a/onnxruntime/core/providers/qnn/ort_api.cc b/onnxruntime/core/providers/qnn/ort_api.cc index 809593b409dad..aec09d043d2bc 100644 --- a/onnxruntime/core/providers/qnn/ort_api.cc +++ b/onnxruntime/core/providers/qnn/ort_api.cc @@ -102,6 +102,18 @@ const std::string& NodeAttrHelper::Get(const std::string& key, const std::string return def_val; } +std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { + if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { + std::vector res; + for (int i = 0; i < NODE_ATTR_ITER_VAL(entry).strings_size(); i++) { + res.emplace_back(NODE_ATTR_ITER_VAL(entry).strings(i)); + } + return res; + } + + return def_val; +} + std::vector NodeAttrHelper::Get(const std::string& key, const std::vector& def_val) const { if (auto entry = node_attributes_.find(key); entry != node_attributes_.end()) { const auto& values = NODE_ATTR_ITER_VAL(entry).ints(); diff --git a/onnxruntime/core/providers/qnn/ort_api.h b/onnxruntime/core/providers/qnn/ort_api.h index d25269be075de..2cb4d5c2003bc 100644 --- a/onnxruntime/core/providers/qnn/ort_api.h +++ b/onnxruntime/core/providers/qnn/ort_api.h @@ -151,6 +151,7 @@ class NodeAttrHelper { std::vector Get(const std::string& key, const std::vector& def_val) const; const std::string& Get(const std::string& key, const std::string& def_val) const; + std::vector Get(const std::string& key, const std::vector& def_val) const; // Convert the i() or ints() of the attribute from int64_t to int32_t int32_t Get(const std::string& key, int32_t def_val) const; diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index ed5fd60fc71d8..269e7ddd5631c 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -8,13 +8,13 @@ #include #include -#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/builder/onnx_ctx_model_helper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/providers/qnn/builder/qnn_def.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" -#include "core/providers/qnn/builder/qnn_node_group.h" +#include "core/providers/qnn/builder/qnn_node_group/qnn_node_group.h" #include "core/providers/qnn/builder/qnn_utils.h" +#include "core/providers/qnn/ort_api.h" #include "core/providers/qnn/qnn_allocator.h" #include "core/providers/qnn/qnn_telemetry.h" #include "core/providers/qnn/rpcmem_library.h" @@ -1141,7 +1141,8 @@ QNNExecutionProvider::PerThreadContext::PerThreadContext(qnn::QnnBackendManager* uint32_t device_id, uint32_t core_id, qnn::HtpPerformanceMode default_htp_performance_mode, - uint32_t default_rpc_control_latency) + uint32_t default_rpc_control_latency, + uint32_t default_rpc_polling_time) : qnn_backend_manager_(qnn_backend_manager) { Status rt = qnn_backend_manager_->CreateHtpPowerCfgId(device_id, core_id, htp_power_config_id_); is_htp_power_config_id_valid_ = rt.IsOK(); @@ -1152,9 +1153,10 @@ QNNExecutionProvider::PerThreadContext::PerThreadContext(qnn::QnnBackendManager* ORT_IGNORE_RETURN_VALUE(qnn_backend_manager_->SetHtpPowerConfig(htp_power_config_id_, default_htp_performance_mode)); } - if (default_rpc_control_latency > 0) { - 
ORT_IGNORE_RETURN_VALUE(qnn_backend_manager_->SetRpcControlLatency(htp_power_config_id_, - default_rpc_control_latency)); + if (default_rpc_control_latency > 0 || default_rpc_polling_time > 0) { + ORT_IGNORE_RETURN_VALUE(qnn_backend_manager_->SetRpcPowerConfigs(htp_power_config_id_, + default_rpc_control_latency, + default_rpc_polling_time)); } } } @@ -1185,7 +1187,8 @@ QNNExecutionProvider::PerThreadContext& QNNExecutionProvider::GetPerThreadContex if (context_state_.retired_context_pool.empty()) { uint32_t core_id = 0; context = std::make_shared(qnn_backend_manager_.get(), device_id_, core_id, - default_htp_performance_mode_, default_rpc_control_latency_); + default_htp_performance_mode_, default_rpc_control_latency_, + default_rpc_polling_time_); } else { context = context_state_.retired_context_pool.back(); context_state_.retired_context_pool.pop_back(); @@ -1253,15 +1256,21 @@ Status QNNExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_optio LOGS_DEFAULT(VERBOSE) << "rpc_control_latency: " << rpc_control_latency; } + uint32_t rpc_polling_time = 0; + if (qnn::HtpPerformanceMode::kHtpBurst != htp_performance_mode) { + rpc_polling_time = 9999; + } + if (GetPerThreadContext().IsHtpPowerConfigIdValid()) { if (qnn::HtpPerformanceMode::kHtpDefault != htp_performance_mode) { ORT_RETURN_IF_ERROR(qnn_backend_manager_->SetHtpPowerConfig(GetPerThreadContext().GetHtpPowerConfigId(), htp_performance_mode)); } - if (rpc_control_latency > 0) { - ORT_RETURN_IF_ERROR(qnn_backend_manager_->SetRpcControlLatency(GetPerThreadContext().GetHtpPowerConfigId(), - rpc_control_latency)); + if (rpc_control_latency > 0 || rpc_polling_time > 0) { + ORT_RETURN_IF_ERROR(qnn_backend_manager_->SetRpcPowerConfigs(GetPerThreadContext().GetHtpPowerConfigId(), + rpc_control_latency, + rpc_polling_time)); } } diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index d7a5d04d22692..923be142e1f47 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -89,6 +89,7 @@ class QNNExecutionProvider : public IExecutionProvider { uint32_t device_id_ = 0; qnn::HtpPerformanceMode default_htp_performance_mode_ = qnn::HtpPerformanceMode::kHtpDefault; uint32_t default_rpc_control_latency_ = 0; + uint32_t default_rpc_polling_time_ = 0; bool enable_HTP_FP16_precision_ = true; bool share_ep_contexts_ = false; bool stop_share_ep_contexts_ = false; @@ -109,7 +110,8 @@ class QNNExecutionProvider : public IExecutionProvider { PerThreadContext(qnn::QnnBackendManager* qnn_backend_manager, uint32_t device_id, uint32_t core_id, qnn::HtpPerformanceMode default_htp_performance_mode, - uint32_t default_rpc_control_latency); + uint32_t default_rpc_control_latency, + uint32_t default_rpc_polling_time); ~PerThreadContext(); ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PerThreadContext); diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 6e9ef06aa22aa..e1802c8a8286d 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -423,7 +423,13 @@ void InferenceSession::ConstructorCommon(const SessionOptions& session_options, { if (!external_intra_op_thread_pool_) { bool allow_intra_op_spinning = +#if !defined(ORT_CLIENT_PACKAGE_BUILD) session_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsConfigAllowIntraOpSpinning, "1") == "1"; +#else + // default 
kOrtSessionOptionsConfigAllowIntraOpSpinning to "0" for ORT builds targeting client/on-device workloads,
+          // to reduce CPU utilization and improve power efficiency.
+          session_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsConfigAllowIntraOpSpinning, "0") == "1";
+#endif
      OrtThreadPoolParams to = session_options_.intra_op_param;
      std::basic_stringstream<ORTCHAR_T> ss;
      if (to.name) {
@@ -461,7 +467,13 @@ void InferenceSession::ConstructorCommon(const SessionOptions& session_options,
   if (session_options_.execution_mode == ExecutionMode::ORT_PARALLEL) {
     if (!external_inter_op_thread_pool_) {
       bool allow_inter_op_spinning =
+#if !defined(ORT_CLIENT_PACKAGE_BUILD)
           session_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsConfigAllowInterOpSpinning, "1") == "1";
+#else
+          // default kOrtSessionOptionsConfigAllowInterOpSpinning to "0" for ORT builds targeting client/on-device workloads,
+          // to reduce CPU utilization and improve power efficiency.
+          session_options_.config_options.GetConfigOrDefault(kOrtSessionOptionsConfigAllowInterOpSpinning, "0") == "1";
+#endif
       OrtThreadPoolParams to = session_options_.inter_op_param;
       to.auto_set_affinity = to.thread_pool_size == 0 && session_options_.execution_mode == ExecutionMode::ORT_SEQUENTIAL;
       std::basic_stringstream<ORTCHAR_T> ss;
diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc
index 02696524042e7..b60d97e38fbad 100644
--- a/onnxruntime/core/session/onnxruntime_c_api.cc
+++ b/onnxruntime/core/session/onnxruntime_c_api.cc
@@ -3066,7 +3066,7 @@ static_assert(offsetof(OrtApi, SetEpDynamicOptions) / sizeof(void*) == 284, "Siz
 static_assert(offsetof(OrtApi, GetEpApi) / sizeof(void*) == 317, "Size of version 22 API cannot change");
 
 // So that nobody forgets to finish an API version, this check will serve as a reminder:
-static_assert(std::string_view(ORT_VERSION) == "1.22.1",
+static_assert(std::string_view(ORT_VERSION) == "1.22.2",
               "ORT_Version change detected, please follow below steps to ensure OrtApi is updated properly");
// 1. Update the hardcoded version string in above static_assert to silence it
// 2. If there were any APIs added to ort_api_1_to_22 above:
diff --git a/onnxruntime/core/util/qmath.h b/onnxruntime/core/util/qmath.h
index 0172902bdf4e2..f7d5cdb98aa1d 100644
--- a/onnxruntime/core/util/qmath.h
+++ b/onnxruntime/core/util/qmath.h
@@ -1001,4 +1001,53 @@ struct BlockedQuantizeLinear {
 #endif
+
+/**
+ * @brief Run MlasDequantizeLinear in parallel, with provided thread pool
+ */
+
+template <typename InputQuantType>
+void ParDequantizeLinearStd(const InputQuantType* input,
+                            float* output,
+                            size_t num_elems,
+                            float scale,
+                            InputQuantType zero_point,
+                            concurrency::ThreadPool* thread_pool) {
+  constexpr std::ptrdiff_t block_size = 128;
+  const std::ptrdiff_t num_blocks = (num_elems + block_size - 1) / block_size;
+  const TensorOpCost unit_cost{static_cast<double>(block_size * sizeof(InputQuantType)),
+                               static_cast<double>(block_size * sizeof(float)),
+                               static_cast<double>(block_size) * 2.0};
+  concurrency::ThreadPool::TryParallelFor(thread_pool, num_blocks, unit_cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) {
+    auto begin_idx = begin * block_size;
+    auto end_idx = std::min(static_cast<std::ptrdiff_t>(num_elems), end * block_size);
+    MlasDequantizeLinear(&(input[begin_idx]), &(output[begin_idx]), end_idx - begin_idx, scale, zero_point);
+  });
+}
+
+// Note: this doesn't use MLAS kernel. There are currently no MLAS kernels for fp16 QuantizeLinear or DequantizeLinear.
+template +void ParDequantizeLinearStd(const InputQuantType* input, + MLFloat16* output, + size_t num_elems, + MLFloat16 scale, + InputQuantType zero_point, + concurrency::ThreadPool* thread_pool) { + constexpr std::ptrdiff_t block_size = 128; + const std::ptrdiff_t num_blocks = (num_elems + block_size - 1) / block_size; + const TensorOpCost unit_cost{static_cast(block_size * sizeof(InputQuantType)), + static_cast(block_size * sizeof(MLFloat16)), + static_cast(block_size) * 2.0}; + + const int32_t zp_s32 = static_cast(zero_point); + const float sc_f32 = scale.ToFloat(); + + concurrency::ThreadPool::TryParallelFor(thread_pool, num_blocks, unit_cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { + auto begin_idx = begin * block_size; + auto end_idx = std::min(static_cast(num_elems), end * block_size); + for (; begin_idx != end_idx; ++begin_idx) { + output[begin_idx] = MLFloat16(static_cast(static_cast(input[begin_idx]) - zp_s32) * sc_f32); + } + }); +} + } // namespace onnxruntime diff --git a/onnxruntime/core/util/thread_utils.h b/onnxruntime/core/util/thread_utils.h index d63d620dbc321..0b99723b2c75b 100644 --- a/onnxruntime/core/util/thread_utils.h +++ b/onnxruntime/core/util/thread_utils.h @@ -19,7 +19,13 @@ struct OrtThreadPoolParams { bool auto_set_affinity = false; // If it is true, the thread pool will spin a while after the queue became empty. +#if !defined(ORT_CLIENT_PACKAGE_BUILD) bool allow_spinning = true; +#else + // default allow_spinning to false for ORT builds targeting client/on-device workloads, + // to reduce CPU utilization and improve power efficiency. + bool allow_spinning = false; +#endif // It it is non-negative, thread pool will split a task by a decreasing block size // of remaining_of_total_iterations / (num_of_threads * dynamic_block_base_) diff --git a/onnxruntime/test/mlas/unittest/test_dequantizelinear.cpp b/onnxruntime/test/mlas/unittest/test_dequantizelinear.cpp new file mode 100644 index 0000000000000..b994981364947 --- /dev/null +++ b/onnxruntime/test/mlas/unittest/test_dequantizelinear.cpp @@ -0,0 +1,75 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "test_util.h" + +template +class MlasDequantizeLinearTest : public MlasTestBase { + private: + MatrixGuardBuffer BufferInput; + MatrixGuardBuffer BufferOutput; + MatrixGuardBuffer BufferOutputReference; + + void GenerateReference(const QuantInt* Input, float* OutputReference, size_t N, float Scale, QuantInt ZeroPoint) { + int32_t ZeroPointS32 = static_cast(ZeroPoint); + + for (size_t n = 0; n < N; n++) { + OutputReference[n] = static_cast(static_cast(Input[n]) - ZeroPointS32) * Scale; + } + } + + void Test(size_t N) { + QuantInt* Input = BufferInput.GetBuffer(N); + float* Output = BufferOutput.GetBuffer(N); + float* OutputReference = BufferOutputReference.GetBuffer(N); + + std::default_random_engine generator(static_cast(N)); + + std::uniform_real_distribution min_gen(-10.f, -10e-3f); + float MinimumValue = min_gen(generator); + + std::uniform_real_distribution max_gen(10e-3f, 10.f); + float MaximumValue = max_gen(generator); + + float Scale = (MaximumValue - MinimumValue) / 512.f; + + std::uniform_int_distribution zp_distribution(std::numeric_limits::min(), + std::numeric_limits::max()); + QuantInt ZeroPoint = static_cast(zp_distribution(generator)); + + for (size_t n = 0; n < N; n++) { + Input[n] = static_cast(zp_distribution(generator)); + } + + GenerateReference(Input, OutputReference, N, Scale, ZeroPoint); + MlasDequantizeLinear(Input, Output, N, Scale, ZeroPoint); + + for (size_t n = 0; n < N; n++) { + ASSERT_EQ(Output[n], OutputReference[n]) << ", size=" << N << ", index=" << n; + } + } + + public: + static const char* GetTestSuiteName() { + if constexpr (std::is_same_v) { + return "DequantizeLinearS8"; + } else { + return "DequantizeLinearU8"; + } + } + + void ExecuteShort(void) override { + for (size_t n = 1; n <= 512; n++) { + Test(n); + } + } +}; + +static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { + size_t count = 0; + if (is_short_execute) { + count += MlasDirectShortExecuteTests>::RegisterShortExecute(); + count += MlasDirectShortExecuteTests>::RegisterShortExecute(); + } + return count; +}); diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc index bc8b672512d8d..3945dbf567cbe 100644 --- a/onnxruntime/test/onnx/TestCase.cc +++ b/onnxruntime/test/onnx/TestCase.cc @@ -1393,72 +1393,25 @@ std::unique_ptr> GetBrokenTests(const std::string& provider } if (provider_name == "qnn") { - broken_tests->insert({"gemm_default_no_bias", "result differs"}); broken_tests->insert({"resize_downsample_scales_linear", "result differs"}); - broken_tests->insert({"resize_downsample_scales_linear_antialias", "result differs"}); - broken_tests->insert({"resize_downsample_sizes_linear_antialias", "result differs"}); - broken_tests->insert({"sce_NCd1_mean_weight_negative_ii", "result differs"}); - broken_tests->insert({"sce_NCd1_mean_weight_negative_ii_expanded", "result differs"}); - broken_tests->insert({"sce_NCd1_mean_weight_negative_ii_log_prob", "result differs"}); - broken_tests->insert({"sce_NCd1_mean_weight_negative_ii_log_prob_expanded", "result differs"}); - broken_tests->insert({"sce_mean", "result differs"}); - broken_tests->insert({"sce_mean_3d", "result differs"}); - broken_tests->insert({"sce_mean_3d_expanded", "result differs"}); - broken_tests->insert({"sce_mean_3d_log_prob", "result differs"}); - broken_tests->insert({"sce_mean_3d_log_prob_expanded", "result differs"}); - broken_tests->insert({"sce_mean_expanded", "result differs"}); - broken_tests->insert({"sce_mean_log_prob", "result differs"}); - 
broken_tests->insert({"sce_mean_log_prob_expanded", "result differs"}); - broken_tests->insert({"sce_mean_no_weight_ii", "result differs"}); - broken_tests->insert({"sce_mean_no_weight_ii_3d", "result differs"}); - broken_tests->insert({"sce_mean_no_weight_ii_3d_expanded", "result differs"}); - broken_tests->insert({"sce_mean_no_weight_ii_3d_log_prob", "result differs"}); - broken_tests->insert({"sce_mean_no_weight_ii_3d_log_prob_expanded", "result differs"}); - broken_tests->insert({"sce_mean_no_weight_ii_4d", "result differs"}); - broken_tests->insert({"sce_mean_no_weight_ii_4d_expanded", "result differs"}); - broken_tests->insert({"sce_mean_no_weight_ii_4d_log_prob", "result differs"}); - broken_tests->insert({"sce_mean_no_weight_ii_4d_log_prob_expanded", "result differs"}); - broken_tests->insert({"sce_mean_no_weight_ii_expanded", "result differs"}); - broken_tests->insert({"sce_mean_no_weight_ii_log_prob", "result differs"}); - broken_tests->insert({"sce_mean_no_weight_ii_log_prob_expanded", "result differs"}); - broken_tests->insert({"sce_mean_weight", "result differs"}); - broken_tests->insert({"sce_mean_weight_expanded", "result differs"}); - broken_tests->insert({"sce_mean_weight_ii", "result differs"}); - broken_tests->insert({"sce_mean_weight_ii_3d", "result differs"}); - broken_tests->insert({"sce_mean_weight_ii_3d_expanded", "result differs"}); - broken_tests->insert({"sce_mean_weight_ii_3d_log_prob", "result differs"}); - broken_tests->insert({"sce_mean_weight_ii_3d_log_prob_expanded", "result differs"}); - broken_tests->insert({"sce_mean_weight_ii_4d", "result differs"}); - broken_tests->insert({"sce_mean_weight_ii_4d_expanded", "result differs"}); - broken_tests->insert({"sce_mean_weight_ii_4d_log_prob", "result differs"}); - broken_tests->insert({"sce_mean_weight_ii_4d_log_prob_expanded", "result differs"}); - broken_tests->insert({"sce_mean_weight_ii_expanded", "result differs"}); - broken_tests->insert({"sce_mean_weight_ii_log_prob", "result differs"}); - broken_tests->insert({"sce_mean_weight_ii_log_prob_expanded", "result differs"}); - broken_tests->insert({"sce_mean_weight_log_prob", "result differs"}); - broken_tests->insert({"sce_mean_weight_log_prob_expanded", "result differs"}); - broken_tests->insert({"sce_none", "result differs"}); - broken_tests->insert({"sce_none_expanded", "result differs"}); - broken_tests->insert({"sce_none_log_prob", "result differs"}); - broken_tests->insert({"sce_none_log_prob_expanded", "result differs"}); - broken_tests->insert({"sce_sum", "result differs"}); - broken_tests->insert({"sce_sum_expanded", "result differs"}); - broken_tests->insert({"sce_sum_log_prob", "result differs"}); - broken_tests->insert({"sce_sum_log_prob_expanded", "result differs"}); - broken_tests->insert({"gridsample_reflection_padding", "result differs"}); broken_tests->insert({"gridsample_volumetric_nearest_align_corners_0", "unknown version"}); broken_tests->insert({"gridsample_volumetric_nearest_align_corners_1", "unknown version"}); - broken_tests->insert({"spacetodepth", "result differs"}); - broken_tests->insert({"reduce_sum_square_empty_set_expanded", "unknown version"}); - // Fails with QNN SDK 2.17.0: + broken_tests->insert({"rotary_embedding", "unknown version"}); + broken_tests->insert({"rotary_embedding_no_position_ids", "unknown version"}); + broken_tests->insert({"rotary_embedding_interleaved", "unknown version"}); + broken_tests->insert({"rotary_embedding_no_position_ids_expanded", "unknown version"}); + 
broken_tests->insert({"rotary_embedding_no_position_ids_interleaved", "unknown version"}); + broken_tests->insert({"rotary_embedding_no_position_ids_interleaved_expanded", "unknown version"}); + // Fails since QNN SDK 2.17.0: // expected 7.70947 (40f6b3f3), got 7.84096 (40fae920), diff: 0.131491, tol=0.00870947 idx=419. 100 of 1715 differ broken_tests->insert({"facedetection_op8_qdq", "result differs"}); + // Fails with QNN SDK 2.34.0: + // expected 2.18661 (400bf164), got 1.48898 (3fbe96ce), diff: 0.697631, tol=0.00318661 idx=0. 8 of 8 differ + broken_tests->insert({"gemm_default_vector_bias", "result differs with 2.34"}); + // expected 0.0505495 (3d4f0d00), got 0.0506369 (3d4f68ae), diff: 8.74326e-05, tol=6.05495e-05 idx=448 + broken_tests->insert({"mobilenetv2-1.0", "result differs with 2.34"}); + broken_tests->insert({"facedetection_op8", "segfault with CPU backend, will be fixed by QNN 2.36"}); -#if defined(_WIN32) && defined(_M_AMD64) - // Fails with QNN SDK 2.17.0 on Windows x64: - // expected 13.5 (41580000), got 0 (0), diff: 13.5, tol=0.0145 idx=3. 3 of 4 differ - broken_tests->insert({"averagepool_2d_ceil", "result differs"}); -#endif // These next 3 Resize tests fail on CPU backend with QNN SDK 2.22.0 due to inaccuracy. // output=Y:expected 1 (3f800000), got 3 (40400000), diff: 2, tol=0.002 idx=24. 8 of 56 differ broken_tests->insert({"resize_upsample_sizes_nearest", "result differs"}); @@ -1470,12 +1423,6 @@ std::unique_ptr> GetBrokenTests(const std::string& provider broken_tests->insert({"convtranspose_group_2_image_3", "Segmentation fault (core dumped). CPU test passed."}); // Fails with QNN 2.31 on Windows x64 for CPU broken_tests->insert({"gelu_tanh_2", "y:expected -0.0131778 (bc57e7d5), got -0.0136333 (bc5f5e38), diff: 0.000455472, tol=2.31778e-05."}); - broken_tests->insert({"convtranspose_pad", "Access violation 0xc000005 from call graphAddNode."}); - broken_tests->insert({"convtranspose_pads", "Access violation 0xc000005 from call graphAddNode."}); - broken_tests->insert({"convtranspose_output_shape", "Access violation 0xc000005 from call graphAddNode."}); - broken_tests->insert({"convtranspose_kernel_shape", "Access violation 0xc000005 from call graphAddNode."}); - broken_tests->insert({"convtranspose_1d", "Access violation 0xc000005 from call graphAddNode."}); - broken_tests->insert({"convtranspose", "Access violation 0xc000005 from call graphAddNode."}); broken_tests->insert({"averagepool_2d_ceil", "result differs. 
expected 13.5 (41580000), got 0 (0)"}); // Fails with QNN 2.32 broken_tests->insert({"resize_upsample_scales_linear", "expected 1 (3f800000), got 0.25 (3e800000)"}); diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.h b/onnxruntime/test/optimizer/graph_transform_test_builder.h index 4e50881ad4f90..26df588eab73f 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_builder.h +++ b/onnxruntime/test/optimizer/graph_transform_test_builder.h @@ -147,6 +147,14 @@ class ModelTestBuilder { } } + // Make optional tensor + NodeArg* MakeOptionalTensor() { + ONNX_NAMESPACE::TypeProto type_proto; + type_proto.mutable_tensor_type()->set_elem_type(utils::ToTensorProtoElementType()); + std::string name; + return &graph_.GetOrCreateNodeArg(name, &type_proto); + } + template NodeArg* MakeSymbolicInput(const std::vector>& shape) { ONNX_NAMESPACE::TypeProto type_proto; diff --git a/onnxruntime/test/providers/cpu/math/gemm_test.cc b/onnxruntime/test/providers/cpu/math/gemm_test.cc index d0069a0069646..e3d319be84999 100644 --- a/onnxruntime/test/providers/cpu/math/gemm_test.cc +++ b/onnxruntime/test/providers/cpu/math/gemm_test.cc @@ -430,6 +430,7 @@ TYPED_TEST(GemmOpTypedTests, TestGemm2DBroadcast_2) { {static_cast(11.0f), static_cast(12.0f), static_cast(13.0f), static_cast(-9.0f), static_cast(-8.0f), static_cast(-7.0f)}); test.Config(run_with_tunable_op) + .ConfigExcludeEps({kQnnExecutionProvider}) // Accuracy issues with QNN CPU backend since QNN 2.34 .RunWithConfig(); } @@ -476,10 +477,8 @@ TYPED_TEST(GemmOpTypedTests, TestGemmBroadcast) { excluded_providers.insert(kOpenVINOExecutionProvider); // OpenVINO: Temporarily disabled due to accuracy issues #endif - if (b_is_initializer && !c_is_initializer) { - // Accuracy issues on QNN's CPU backend with QNN SDK version 2.17 - excluded_providers.insert(kQnnExecutionProvider); - } + // Accuracy issues with QNN CPU backend since QNN 2.34 + excluded_providers.insert(kQnnExecutionProvider); test.ConfigExcludeEps(excluded_providers) .Config(run_with_tunable_op) @@ -511,10 +510,16 @@ TYPED_TEST(GemmOpTypedTests, TestGemmTrans) { test.AddOutput("Y", {2, 3}, {static_cast(11.0f), static_cast(11.0f), static_cast(11.0f), static_cast(-9.0f), static_cast(-9.0f), static_cast(-9.0f)}); + + std::unordered_set excluded_providers; #if defined(OPENVINO_CONFIG_GPU) - test.ConfigExcludeEps({kOpenVINOExecutionProvider}); // OpenVINO: Temporarily disabled due to accuracy issues + excluded_providers.insert(kOpenVINOExecutionProvider); // OpenVINO: Temporarily disabled due to accuracy issues #endif - test.Config(run_with_tunable_op) + // Accuracy issues with QNN CPU backend since QNN 2.34 + excluded_providers.insert(kQnnExecutionProvider); + + test.ConfigExcludeEps(excluded_providers) + .Config(run_with_tunable_op) .RunWithConfig(); } @@ -537,10 +542,15 @@ TYPED_TEST(GemmOpTypedTests, TestGemmTransB) { test.AddOutput("Y", {2, 3}, {static_cast(11.0f), static_cast(11.0f), static_cast(11.0f), static_cast(-9.0f), static_cast(-9.0f), static_cast(-9.0f)}); + + std::unordered_set excluded_providers; #if defined(OPENVINO_CONFIG_GPU) - test.ConfigExcludeEps({kOpenVINOExecutionProvider}); // OpenVINO: Temporarily disabled due to accuracy issues + excluded_providers.insert(kOpenVINOExecutionProvider); // OpenVINO: Temporarily disabled due to accuracy issues #endif - test.Config(run_with_tunable_op) + excluded_providers.insert(kQnnExecutionProvider); // Accuracy issues with QNN CPU backend since QNN 2.34 + + test.ConfigExcludeEps(excluded_providers) + 
.Config(run_with_tunable_op) .RunWithConfig(); }; run_test(false, false); diff --git a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc index 4e7a6356a5129..8fdbf0060eaa0 100644 --- a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc @@ -33,6 +33,32 @@ TEST(DequantizeLinearOpTest, Int8) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } +// scalar zero & scale with uint8 (large enough input to execute MLAS vectorized loop) +TEST(DequantizeLinearOpTest, Uint8_Large) { + OpTester test("DequantizeLinear", 10); + std::vector dims{1, 1039}; // not evenly divisible by 16 (loop unroll amount) to test handling of leftover inputs + test.AddInput("x", dims, std::vector(1039, 1)); + test.AddInput("x_scale", {}, {1.0f}); + test.AddInput("x_zero_point", {}, {1}); + test.AddOutput("y", dims, std::vector(1039, 0.0f)); + // Disable Tensorrt EP due to error:node1_quantize_scale_node: out of bounds channel axis 1. Number of input dimensions is 1. + // Disable WebGPU EP because it requires dims.Size() to be multiple of 4. Fails with error: needs at least component size 4. + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kWebGpuExecutionProvider}); +} + +// scalar zero & scale with int8 (large enough input to execute MLAS vectorized loop) +TEST(DequantizeLinearOpTest, Int8_Large) { + OpTester test("DequantizeLinear", 10); + std::vector dims{1, 1039}; // not evenly divisible by 16 (loop unroll amount) to test handling of leftover inputs + test.AddInput("x", dims, std::vector(1039, 1)); + test.AddInput("x_scale", {}, {1.0f}); + test.AddInput("x_zero_point", {}, {1}); + test.AddOutput("y", dims, std::vector(1039, 0.0f)); + // Disable Tensorrt EP due to error:node1_quantize_scale_node: out of bounds channel axis 1. Number of input dimensions is 1. + // Disable WebGPU EP because it requires dims.Size() to be multiple of 4. Fails with error: needs at least component size 4. + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kWebGpuExecutionProvider}); +} + // scalar zero & scale with int4 TEST(DequantizeLinearOpTest, Int4) { OpTester test("DequantizeLinear", 21); diff --git a/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h b/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h index 1aea58c8d7a10..a49f662ca1adb 100644 --- a/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h +++ b/onnxruntime/test/providers/cuda/nhwc/nhwc_cuda_helper.h @@ -46,7 +46,7 @@ } else if (std::is_same::value) { \ MAKE_PROVIDERS_EPS_EXT(2e-4, pad_to_nc1d) \ } else { \ - MAKE_PROVIDERS_EPS_EXT(2e-3, pad_to_nc1d) \ + MAKE_PROVIDERS_EPS_EXT(4e-3, pad_to_nc1d) \ } #define MAKE_PROVIDERS_EPS_TYPE(T) \ diff --git a/onnxruntime/test/providers/qnn/average_pool_test.cc b/onnxruntime/test/providers/qnn/average_pool_test.cc index 7969f4472629a..aace6256702ec 100644 --- a/onnxruntime/test/providers/qnn/average_pool_test.cc +++ b/onnxruntime/test/providers/qnn/average_pool_test.cc @@ -142,9 +142,7 @@ TEST_F(QnnHTPBackendTests, AveragePool_CountIncludePad_HTP_u8) { {utils::MakeAttribute("kernel_shape", std::vector{1, 1}), utils::MakeAttribute("count_include_pad", static_cast(1))}, ExpectedEPNodeAssignment::All, - 18, - // Need tolerance of 0.414% of output range after QNN SDK 2.17 - QDQTolerance(0.00414f)); + 18); } // QDQ AveragePool that use auto_pad 'SAME_UPPER'. 
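The AveragePool hunks above (and the Conv hunks that follow) drop per-test QDQTolerance overrides so the harness default applies again with current QNN SDKs. For context, here is a small standalone sketch of how a relative QDQ tolerance is typically converted into an absolute error bound; the scaling by the expected output's range and the 0.4% default value are assumptions for illustration, not taken from this patch.

// qdq_tolerance_sketch.cc -- illustrative only.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical float32 reference output (e.g., from the CPU EP).
  const std::vector<float> expected{0.0f, 2.5f, 7.75f, 10.0f};
  const float qdq_tolerance = 0.004f;  // assumed default: 0.4% of the output range

  const auto [min_it, max_it] = std::minmax_element(expected.begin(), expected.end());
  const float output_range = *max_it - *min_it;
  const float allowed_abs_err = qdq_tolerance * output_range;

  std::printf("output range = %.2f, allowed absolute error = %.3f\n", output_range, allowed_abs_err);
  return 0;
}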
@@ -157,9 +155,7 @@ TEST_F(QnnHTPBackendTests, AveragePool_AutopadSameUpper_HTP_u8) { {utils::MakeAttribute("kernel_shape", std::vector{1, 1}), utils::MakeAttribute("auto_pad", "SAME_UPPER")}, ExpectedEPNodeAssignment::All, - 18, - // Need to use tolerance of 0.414% of output range after QNN SDK 2.17 - QDQTolerance(0.00414f)); + 18); } // QDQ AveragePool that use auto_pad 'SAME_LOWER'. @@ -172,9 +168,7 @@ TEST_F(QnnHTPBackendTests, AveragePool_AutopadSameLower_HTP_u8) { {utils::MakeAttribute("kernel_shape", std::vector{1, 1}), utils::MakeAttribute("auto_pad", "SAME_LOWER")}, ExpectedEPNodeAssignment::All, - 18, - // Need to use tolerance of 0.414% of output range after QNN SDK 2.17 - QDQTolerance(0.00414f)); + 18); } #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) diff --git a/onnxruntime/test/providers/qnn/cast_test.cc b/onnxruntime/test/providers/qnn/cast_test.cc index fa26c764c1b7a..2a63d98ebb37e 100644 --- a/onnxruntime/test/providers/qnn/cast_test.cc +++ b/onnxruntime/test/providers/qnn/cast_test.cc @@ -127,7 +127,9 @@ TEST_F(QnnHTPBackendTests, TestCastInt32ToFloatHTP) { } // Cast uint8_t to float on HTP -TEST_F(QnnHTPBackendTests, TestCastUInt8ToFloatHTP) { +// Fails with QNN SDK 2.35.0: +// value pair (13, 1.00000012) at index #0 don't match, which is -12 from 13 +TEST_F(QnnHTPBackendTests, DISABLED_TestCastUInt8ToFloatHTP) { RunCastOpTest({3, 3}, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT, ExpectedEPNodeAssignment::All, true, false); } diff --git a/onnxruntime/test/providers/qnn/clip_op_test.cc b/onnxruntime/test/providers/qnn/clip_op_test.cc index 512403bc5a10b..83296d342e62b 100644 --- a/onnxruntime/test/providers/qnn/clip_op_test.cc +++ b/onnxruntime/test/providers/qnn/clip_op_test.cc @@ -76,7 +76,9 @@ TEST_F(QnnCPUBackendTests, Clip_5D_f32) { // // Test Clip with float32 on HTP -TEST_F(QnnHTPBackendTests, Clip_f32) { +// Fails with QNN SDK 2.35.0: +// value pair (-4.54545403, -4.54687548) at index #3 don't match, which is -0.00142145 from -4.54545 +TEST_F(QnnHTPBackendTests, DISABLED_Clip_f32) { bool on_cpu_backend = false; RunClipTest(TestInputDef({1, 1, 3, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 12)), {TestInputDef({}, true, {-5.0f}), diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc index b15042a808c37..c99c51380a51e 100644 --- a/onnxruntime/test/providers/qnn/conv_test.cc +++ b/onnxruntime/test/providers/qnn/conv_test.cc @@ -654,7 +654,9 @@ TEST_F(QnnCPUBackendTests, ConvTranspose1Df32_DynamicWeights_DefaultBias) { // It has to be QDQ model, because the DQ node with initializer on Conv gets processed first // and DQ node requires its node unit to be processed // So, Conv gets processed before Mul node -TEST_F(QnnHTPBackendTests, Test_QDQConvWithDynamicWeightsFromMul) { +// +// Since at least QAIRT 2.33 value pair (3.549, 3.588) at index #12709 don't match, which is 0.039 from 3.549 +TEST_F(QnnHTPBackendTests, DISABLED_Test_QDQConvWithDynamicWeightsFromMul) { ProviderOptions provider_options; provider_options["backend_type"] = "htp"; provider_options["offload_graph_io_quantization"] = "0"; @@ -706,9 +708,7 @@ TEST_F(QnnHTPBackendTests, Test_QDQConvWithDynamicWeightsFromMul) { RunQnnModelTest(BuildConvMulGraph, provider_options, 13, - ExpectedEPNodeAssignment::All, - 4e-4f); // Accuracy decreased slightly in QNN SDK 2.17. - // Expected: 9.94500065, Actual: 9.94537735 + ExpectedEPNodeAssignment::All); } // Check that QNN compiles DQ -> Conv -> Q as a single unit. 
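Several QNN HTP tests in the hunks above are turned off by renaming them with googletest's DISABLED_ prefix rather than deleting them, so they keep compiling and can be re-enabled once the QNN SDK regressions are fixed. A short sketch of that convention follows; the suite name, test names, and binary name are illustrative only.

// disabled_test_sketch.cc -- illustrative only.
#include "gtest/gtest.h"

TEST(QnnSketchTests, RunsByDefault) {
  EXPECT_EQ(2 + 2, 4);
}

// Compiled but skipped by default. It can still be executed on demand, e.g.:
//   ./your_test_binary --gtest_also_run_disabled_tests --gtest_filter=QnnSketchTests.DISABLED_TracksKnownRegression
TEST(QnnSketchTests, DISABLED_TracksKnownRegression) {
  EXPECT_EQ(2 + 2, 5);  // intentionally failing expectation kept for tracking
}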
@@ -725,9 +725,7 @@ TEST_F(QnnHTPBackendTests, ConvU8U8S32_bias_dynamic_input) { "NOTSET", ExpectedEPNodeAssignment::All, false, // use_qdq_contrib_ops - 13, // opset - // Need tolerance of 0.413% of output range after QNN SDK 2.17 - QDQTolerance(0.00413f)); + 13); // opset RunHTPConvOpTest("Conv", TestInputDef({1, 1, 5, 5, 5}, false, 0.0f, 10.0f), // Random dynamic input @@ -740,9 +738,7 @@ TEST_F(QnnHTPBackendTests, ConvU8U8S32_bias_dynamic_input) { "NOTSET", ExpectedEPNodeAssignment::All, false, // use_qdq_contrib_ops - 13, // opset - // Need tolerance of 0.413% of output range after QNN SDK 2.17 - QDQTolerance(0.00413f)); + 13); // opset } // Test per-channel QDQ Conv. in0: u8, in1 (weight): s8, in2 (bias): s32, out: u8 @@ -1851,9 +1847,7 @@ TEST_F(QnnHTPBackendTests, ConvU8U8S32_bias_initializer) { "NOTSET", ExpectedEPNodeAssignment::All, false, // use_qdq_contrib_ops - 13, // opset - // Need tolerance of 0.413% of output range after QNN SDK 2.17 - QDQTolerance(0.00413f)); + 13); // opset RunHTPConvOpTest("Conv", TestInputDef({1, 1, 5, 5, 5}, false, 0.0f, 10.0f), // Random dynamic input @@ -1866,9 +1860,7 @@ TEST_F(QnnHTPBackendTests, ConvU8U8S32_bias_initializer) { "NOTSET", ExpectedEPNodeAssignment::All, false, // use_qdq_contrib_ops - 13, // opset - // Need tolerance of 0.413% of output range after QNN SDK 2.17 - QDQTolerance(0.00413f)); + 13); // opset } // Tests 1D Conv with bias as an initializer. @@ -2056,7 +2048,9 @@ TEST_F(QnnHTPBackendTests, ConvTranspose1DU8U8S32_AutoPadLower) { 13); } -TEST_F(QnnHTPBackendTests, ConvU8U8S32_large_input1_padding_bias_initializer) { +// Fails with QNN SDK 2.35.0: +// value pair (-4.54545403, -4.54687548) at index #3 don't match, which is -0.00142145 from -4.54545 +TEST_F(QnnHTPBackendTests, DISABLED_ConvU8U8S32_large_input1_padding_bias_initializer) { RunHTPConvOpTest("Conv", TestInputDef({1, 3, 60, 452}, false, 0.f, 10.f), // Dynamic input TestInputDef({16, 3, 3, 3}, true, -1.f, 1.f), // Static weights @@ -2074,12 +2068,6 @@ TEST_F(QnnHTPBackendTests, ConvU8U8S32_large_input1_padding_bias_initializer) { } TEST_F(QnnHTPBackendTests, ConvU8U8S32_large_input2_bias_initializer) { -#ifdef __linux__ - // On Linux QNN SDK 2.17: Need a tolerance of 0.785% of output range to pass. - QDQTolerance tolerance = QDQTolerance(0.00785f); -#else - QDQTolerance tolerance = QDQTolerance(); -#endif RunHTPConvOpTest("Conv", TestInputDef({1, 128, 8, 56}, false, 0.f, 10.f), // Dynamic input TestInputDef({32, 128, 1, 1}, true, -1.f, 1.f), // Random static weights @@ -2091,8 +2079,7 @@ TEST_F(QnnHTPBackendTests, ConvU8U8S32_large_input2_bias_initializer) { "NOTSET", ExpectedEPNodeAssignment::All, false, - 13, - tolerance); + 13); } TEST_F(QnnHTPBackendTests, ConvU8U8S32_LargeInput_Dilations_Pads) { diff --git a/onnxruntime/test/providers/qnn/cumsum_op_htp_test.cc b/onnxruntime/test/providers/qnn/cumsum_op_htp_test.cc new file mode 100644 index 0000000000000..cfe6523639e96 --- /dev/null +++ b/onnxruntime/test/providers/qnn/cumsum_op_htp_test.cc @@ -0,0 +1,138 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(ORT_MINIMAL_BUILD) + +#include +#include "core/graph/graph.h" +#include "core/graph/node_attr_utils.h" + +#include "test/providers/qnn/qnn_test_utils.h" + +#include "gtest/gtest.h" + +namespace onnxruntime { +namespace test { +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + +// Runs a non-QDQ model on HTP and compares output to CPU EP. 
+template +static void RunCumSumOpTest(const std::string& op_type, + const TestInputDef& input_def_1, + const TestInputDef& input_def_2, + const std::vector& attrs, + int opset_version, + ExpectedEPNodeAssignment expected_ep_assignment, + float fp32_abs_err = 2e-3f) { + ProviderOptions provider_options; + provider_options["backend_type"] = "htp"; + provider_options["offload_graph_io_quantization"] = "0"; + + // Runs model with a Q/DQ binary op and compares the outputs of the CPU and QNN EPs. + RunQnnModelTest(BuildOpTestCase(op_type, {input_def_1}, {input_def_2}, attrs), + provider_options, + opset_version, + expected_ep_assignment, + fp32_abs_err); +} + +// Non-QDQ model, CumSum with float input and axis input as initializer with axis 0 +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_CumSum_float_int32_e0_r0_axis_0) { + RunCumSumOpTest("CumSum", + TestInputDef({3, 2}, false, {1.3f, 7.2f, 0.4f, 3.4f, 5.7f, 0.8f}), + TestInputDef({}, true, {0}), + {utils::MakeAttribute("exclusive", static_cast(0)), + utils::MakeAttribute("reverse", static_cast(0))}, + 17, + ExpectedEPNodeAssignment::All); +} + +// Non-QDQ model, CumSum with float input and axis input as initializer with axis -1 +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_CumSum_float_int32_e0_r0_axis_neg1) { + RunCumSumOpTest("CumSum", + TestInputDef({3, 2}, false, {1.3f, 7.2f, 0.4f, 3.4f, 5.7f, 0.8f}), + TestInputDef({}, true, {-1}), + {utils::MakeAttribute("exclusive", static_cast(0)), + utils::MakeAttribute("reverse", static_cast(0))}, + 17, + ExpectedEPNodeAssignment::All); +} + +// Returns a function that creates a graph with a QDQ CumSum operator. +template +GetTestQDQModelFn BuildQDQCumSumTestCase(const TestInputDef& input_def, + const TestInputDef& axis_def, + const std::vector& attrs, + bool use_contrib_qdq = false) { + return [input_def, axis_def, attrs, use_contrib_qdq](ModelTestBuilder& builder, + std::vector>& output_qparams) { + // input -> Q -> DQ -> + NodeArg* input = MakeTestInput(builder, input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def); + NodeArg* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point, + use_contrib_qdq); + + // axis input + NodeArg* axis_input = MakeTestInput(builder, axis_def); + + // CumSum op + NodeArg* op_output = builder.MakeIntermediate(); + Node& cumsum_node = builder.AddNode("CumSum", {input_qdq, axis_input}, {op_output}); + + for (const auto& attr : attrs) { + cumsum_node.AddAttributeProto(attr); + } + + // op_output -> Q -> DQ -> output + AddQDQNodePairWithOutputAsGraphOutput(builder, op_output, output_qparams[0].scale, + output_qparams[0].zero_point, use_contrib_qdq); + }; +} + +// Test the accuracy of a QDQ CumSum model on QNN EP. Checks if the QDQ model on QNN EP is as accurate as the QDQ model on CPU EP +// (compared to float32 model). 
+template +static void RunQDQCumSumOpTest(const TestInputDef& input_def, + const TestInputDef& axis_def, + const std::vector& attrs, + int opset, + ExpectedEPNodeAssignment expected_ep_assignment, + bool use_contrib_qdq = false) { + ProviderOptions provider_options; + provider_options["backend_type"] = "htp"; + provider_options["offload_graph_io_quantization"] = "0"; + + auto f32_model_builder = BuildOpTestCase("CumSum", {input_def}, {axis_def}, attrs); + auto qdq_model_builder = BuildQDQCumSumTestCase(input_def, axis_def, attrs, + use_contrib_qdq); + + TestQDQModelAccuracy(f32_model_builder, + qdq_model_builder, + provider_options, + opset, + expected_ep_assignment); +} + +// Test creates a DQ -> CumSum -> Q -> DQ graph, and checks that all +// nodes are supported by the QNN EP, and that the inference results are as accurate as CPU EP. +// +// QDQ model, CumSum with uint8 input and axis input as initializer +TEST_F(QnnHTPBackendTests, CumSum_uint8_int32_e0_r0) { + RunQDQCumSumOpTest(TestInputDef({3, 2}, false, {1.3f, 7.2f, 0.4f, 3.4f, 5.7f, 0.8f}), + TestInputDef({}, true, {0}), + {utils::MakeAttribute("exclusive", static_cast(0)), + utils::MakeAttribute("reverse", static_cast(0))}, + 17, + ExpectedEPNodeAssignment::All); +} + +#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + +} // namespace test +} // namespace onnxruntime + +#endif diff --git a/onnxruntime/test/providers/qnn/einsum_op_test.cc b/onnxruntime/test/providers/qnn/einsum_op_test.cc new file mode 100644 index 0000000000000..55412a7b15d98 --- /dev/null +++ b/onnxruntime/test/providers/qnn/einsum_op_test.cc @@ -0,0 +1,341 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(ORT_MINIMAL_BUILD) + +#include +#include + +#include "test/providers/qnn/qnn_test_utils.h" +#include "core/graph/node_attr_utils.h" +#include "test/util/include/test_utils.h" + +#include "core/graph/onnx_protobuf.h" +#include "gtest/gtest.h" + +namespace { + +using onnxruntime::Node; +using onnxruntime::NodeArg; +using onnxruntime::ProviderOptions; +using onnxruntime::test::AddQDQNodePair; +using onnxruntime::test::AddQDQNodePairWithOutputAsGraphOutput; +using onnxruntime::test::BuildOpTestCase; +using onnxruntime::test::ExpectedEPNodeAssignment; +using onnxruntime::test::GetTestInputQuantParams; +using onnxruntime::test::GetTestQDQModelFn; +using onnxruntime::test::MakeTestInput; +using onnxruntime::test::ModelTestBuilder; +using onnxruntime::test::QDQTolerance; +using onnxruntime::test::QuantParams; +using onnxruntime::test::RunQnnModelTest; +using onnxruntime::test::TestInputDef; +using onnxruntime::test::TestQDQModelAccuracy; +using onnxruntime::utils::MakeAttribute; + +constexpr char kEinsumOp[] = "Einsum"; +constexpr char kEinsumEquation[] = "equation"; +constexpr char kQnnBackendType[] = "backend_type"; +constexpr char kQnnBackendTypeCpu[] = "cpu"; +constexpr char kQnnBackendTypeHtp[] = "htp"; +constexpr char kOffloadGraphIoQuantization[] = "offload_graph_io_quantization"; +constexpr char kOffloadGraphIoQuantizationDisable[] = "0"; + +template +static void RunQnnEinsum( + const std::string& backend, + const TestInputDef& in0, + const TestInputDef& in1, + const std::string& equation, + const float tolerance) { + ProviderOptions provider_options; + provider_options[kQnnBackendType] = backend; + provider_options[kOffloadGraphIoQuantization] = kOffloadGraphIoQuantizationDisable; + RunQnnModelTest( + /*build_test_case=*/BuildOpTestCase( + /*op_type=*/kEinsumOp, + 
/*input_defs_1=*/{in0, in1}, + /*input_defs_2=*/{}, + /*attrs=*/{MakeAttribute(kEinsumEquation, equation)}), + /*provider_options=*/provider_options, + /*opset_version=*/12, + /*expected_ep_assignment=*/ExpectedEPNodeAssignment::All, + /*tolerance=*/tolerance); +} + +template +GetTestQDQModelFn BuildTestCaseQdq(const std::vector>& input_defs, + const std::vector& attrs, + bool use_contrib_qdq = false) { + return [input_defs, attrs, use_contrib_qdq](ModelTestBuilder& builder, + std::vector>& output_qparams) { + const size_t num_inputs = input_defs.size(); + + std::vector op_inputs; + op_inputs.reserve(num_inputs); + + // Process input 0 + NodeArg* input0 = MakeTestInput(builder, input_defs[0]); + QuantParams input0_qparams = GetTestInputQuantParams(input_defs[0]); + NodeArg* input0_after_qdq = AddQDQNodePair(builder, input0, input0_qparams.scale, + input0_qparams.zero_point, use_contrib_qdq); + op_inputs.push_back(input0_after_qdq); + + // Process input 1 + NodeArg* input1 = MakeTestInput(builder, input_defs[1]); + QuantParams input1_qparams = GetTestInputQuantParams(input_defs[1]); + NodeArg* input1_after_qdq = AddQDQNodePair(builder, input1, input1_qparams.scale, + input1_qparams.zero_point, use_contrib_qdq); + op_inputs.push_back(input1_after_qdq); + + // Op -> op_output + auto* output = builder.MakeIntermediate(); + Node& node = builder.AddNode(kEinsumOp, op_inputs, {output}); + for (const auto& attr : attrs) { + node.AddAttributeProto(attr); + } + + // op_output -> Q -> DQ -> output + AddQDQNodePairWithOutputAsGraphOutput(builder, output, output_qparams[0].scale, + output_qparams[0].zero_point, use_contrib_qdq); + }; +} + +template +static void RunQnnHtpQdqEinsum(const TestInputDef& in0, + const TestInputDef& in1, + const std::string& equation, + QDQTolerance tolerance) { + ProviderOptions provider_options; + provider_options[kQnnBackendType] = kQnnBackendTypeHtp; + provider_options[kOffloadGraphIoQuantization] = kOffloadGraphIoQuantizationDisable; + std::vector attrs{MakeAttribute(kEinsumEquation, equation)}; + auto f32_model_builder = BuildOpTestCase( + /*op_type=*/kEinsumOp, + /*input_defs_1=*/{in0, in1}, + /*input_defs_2=*/{}, + /*attrs=*/attrs); + auto qdq_model_builder = BuildTestCaseQdq( + /*input_defs=*/{in0, in1}, /*attrs=*/attrs, /*use_contrib_qdq=*/false); + TestQDQModelAccuracy(/*f32_model_fn=*/f32_model_builder, + /*qdq_model_fn=*/qdq_model_builder, + /*qnn_options=*/provider_options, + /*opset_version=*/12, + /*expected_ep_assignment=*/ExpectedEPNodeAssignment::All, + /*tolerance=*/tolerance); +} + +} // namespace + +namespace onnxruntime { +namespace test { + +// +// QNN CPU +// + +TEST_F(QnnCPUBackendTests, EinsumRank2) { + const std::vector shape0{2, 3}; + const std::vector shape1{3, 4}; + const std::vector data0 = GetSequentialFloatData(shape0, /*start=*/-0.1f, /*step=*/0.05f); + const std::vector data1 = GetSequentialFloatData(shape1, /*start=*/-0.1f, /*step=*/0.05f); + RunQnnEinsum( + /*backend=*/kQnnBackendTypeCpu, + /*in0=*/TestInputDef(shape0, /*is_initializer=*/false, std::move(data0)), + /*in1=*/TestInputDef(shape1, /*is_initializer=*/false, std::move(data1)), + /*equation=*/"ab,bc->ac", + /*tolerance=*/1e-4f); +} + +TEST_F(QnnCPUBackendTests, EinsumRank4MatMul) { + const std::vector shape0{3, 4, 5, 6}; + const std::vector shape1{3, 4, 6, 5}; + const std::vector data0 = GetSequentialFloatData(shape0, /*start=*/-0.1f, /*step=*/0.05f); + const std::vector data1 = GetSequentialFloatData(shape1, /*start=*/-0.1f, /*step=*/0.05f); + RunQnnEinsum( + 
/*backend=*/kQnnBackendTypeCpu, + /*in0=*/TestInputDef(shape0, /*is_initializer=*/false, std::move(data0)), + /*in1=*/TestInputDef(shape1, /*is_initializer=*/false, std::move(data1)), + /*equation=*/"bhij,bhjd->bhid", + /*tolerance=*/1e-4f); +} + +TEST_F(QnnCPUBackendTests, EinsumRank4MatMulTransposeY) { + const std::vector shape0{2, 3, 4, 6}; + const std::vector shape1{2, 3, 5, 6}; + const std::vector data0 = GetSequentialFloatData(shape0, /*start=*/-0.1f, /*step=*/0.05f); + const std::vector data1 = GetSequentialFloatData(shape1, /*start=*/-0.1f, /*step=*/0.05f); + RunQnnEinsum( + /*backend=*/kQnnBackendTypeCpu, + /*in0=*/TestInputDef(shape0, /*is_initializer=*/false, std::move(data0)), + /*in1=*/TestInputDef(shape1, /*is_initializer=*/false, std::move(data1)), + /*equation=*/"bhid,bhjd->bhij", + /*tolerance=*/1e-4f); +} + +TEST_F(QnnCPUBackendTests, EinsumRank4MatMulTransposeAll1) { + const std::vector shape0{1, 9, 1, 7}; + const std::vector shape1{1, 7, 1, 9}; + const std::vector data0 = GetSequentialFloatData(shape0, /*start=*/-0.1f, /*step=*/0.05f); + const std::vector data1 = GetSequentialFloatData(shape1, /*start=*/-0.1f, /*step=*/0.05f); + RunQnnEinsum( + /*backend=*/kQnnBackendTypeCpu, + /*in0=*/TestInputDef(shape0, /*is_initializer=*/false, std::move(data0)), + /*in1=*/TestInputDef(shape1, /*is_initializer=*/false, std::move(data1)), + /*equation=*/"bchq,bkhc->bkhq", + /*tolerance=*/1e-4f); +} + +TEST_F(QnnCPUBackendTests, EinsumRank4MatMulTransposeAll2) { + const std::vector shape0{1, 7, 1, 7}; + const std::vector shape1{1, 9, 1, 7}; + const std::vector data0 = GetSequentialFloatData(shape0, /*start=*/-0.1f, /*step=*/0.05f); + const std::vector data1 = GetSequentialFloatData(shape1, /*start=*/-0.1f, /*step=*/0.05f); + RunQnnEinsum( + /*backend=*/kQnnBackendTypeCpu, + /*in0=*/TestInputDef(shape0, /*is_initializer=*/false, std::move(data0)), + /*in1=*/TestInputDef(shape1, /*is_initializer=*/false, std::move(data1)), + /*equation=*/"bkhq,bchk->bchq", + /*tolerance=*/1e-4f); +} + +// +// QNN HTP F16 +// + +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + +TEST_F(QnnHTPBackendTests, EinsumF16Rank2MatMul) { + const std::vector shape0{2, 3}; + const std::vector shape1{3, 4}; + const std::vector data0 = GetSequentialFloatData(shape0, /*start=*/-0.1f, /*step=*/0.05f); + const std::vector data1 = GetSequentialFloatData(shape1, /*start=*/-0.1f, /*step=*/0.05f); + RunQnnEinsum( + /*backend=*/kQnnBackendTypeHtp, + /*in0=*/TestInputDef(shape0, /*is_initializer=*/false, std::move(data0)), + /*in1=*/TestInputDef(shape1, /*is_initializer=*/false, std::move(data1)), + /*equation=*/"ij,jk->ik", + /*tolerance=*/1e-2f); +} + +TEST_F(QnnHTPBackendTests, EinsumF16Rank4MatMul) { + const std::vector shape0{3, 1, 5, 2}; + const std::vector shape1{3, 1, 2, 5}; + const std::vector data0 = GetSequentialFloatData(shape0, /*start=*/-0.1f, /*step=*/0.05f); + const std::vector data1 = GetSequentialFloatData(shape1, /*start=*/-0.1f, /*step=*/0.05f); + RunQnnEinsum( + /*backend=*/kQnnBackendTypeHtp, + /*in0=*/TestInputDef(shape0, /*is_initializer=*/false, std::move(data0)), + /*in1=*/TestInputDef(shape1, /*is_initializer=*/false, std::move(data1)), + /*equation=*/"bhij,bhjd->bhid", + /*tolerance=*/1e-2f); +} + +TEST_F(QnnHTPBackendTests, EinsumF16Rank4MatMulTransposeY) { + const std::vector shape0{2, 3, 4, 2}; + const std::vector shape1{2, 3, 5, 2}; + const std::vector data0 = GetSequentialFloatData(shape0, /*start=*/-0.1f, /*step=*/0.05f); + const std::vector data1 = 
GetSequentialFloatData(shape1, /*start=*/-0.1f, /*step=*/0.05f); + RunQnnEinsum( + /*backend=*/kQnnBackendTypeHtp, + /*in0=*/TestInputDef(shape0, /*is_initializer=*/false, std::move(data0)), + /*in1=*/TestInputDef(shape1, /*is_initializer=*/false, std::move(data1)), + /*equation=*/"bhid,bhjd->bhij", + /*tolerance=*/1e-2f); +} + +TEST_F(QnnHTPBackendTests, EinsumF16Rank4MatMulTransposeAll1) { + const std::vector shape0{1, 3, 1, 7}; + const std::vector shape1{1, 7, 1, 3}; + const std::vector data0 = GetSequentialFloatData(shape0, /*start=*/-0.1f, /*step=*/0.05f); + const std::vector data1 = GetSequentialFloatData(shape1, /*start=*/-0.1f, /*step=*/0.05f); + RunQnnEinsum( + /*backend=*/kQnnBackendTypeHtp, + /*in0=*/TestInputDef(shape0, /*is_initializer=*/false, std::move(data0)), + /*in1=*/TestInputDef(shape1, /*is_initializer=*/false, std::move(data1)), + /*equation=*/"bchq,bkhc->bkhq", + /*tolerance=*/1e-2f); +} + +TEST_F(QnnHTPBackendTests, EinsumF16Rank4MatMulTransposeAll2) { + const std::vector shape0{1, 4, 1, 4}; + const std::vector shape1{1, 9, 1, 4}; + const std::vector data0 = GetSequentialFloatData(shape0, /*start=*/-0.1f, /*step=*/0.05f); + const std::vector data1 = GetSequentialFloatData(shape1, /*start=*/-0.1f, /*step=*/0.05f); + RunQnnEinsum( + /*backend=*/kQnnBackendTypeHtp, + /*in0=*/TestInputDef(shape0, /*is_initializer=*/false, std::move(data0)), + /*in1=*/TestInputDef(shape1, /*is_initializer=*/false, std::move(data1)), + /*equation=*/"bkhq,bchk->bchq", + /*tolerance=*/1e-2f); +} + +// +// QNN HTP QDQ +// + +TEST_F(QnnHTPBackendTests, EinsumQdqRank2MatMul) { + const std::vector shape0{2, 3}; + const std::vector shape1{3, 4}; + const std::vector data0 = GetSequentialFloatData(shape0, /*start=*/-0.1f, /*step=*/0.05f); + const std::vector data1 = GetSequentialFloatData(shape1, /*start=*/-0.1f, /*step=*/0.05f); + RunQnnHtpQdqEinsum( + /*in0=*/TestInputDef(shape0, /*is_initializer=*/false, std::move(data0)), + /*in1=*/TestInputDef(shape1, /*is_initializer=*/false, std::move(data1)), + /*equation=*/"ij,jk->ik", + /*tolerance=*/QDQTolerance()); +} + +TEST_F(QnnHTPBackendTests, EinsumQdqRank4MatMul) { + const std::vector shape0{3, 1, 5, 2}; + const std::vector shape1{3, 1, 2, 5}; + const std::vector data0 = GetSequentialFloatData(shape0, /*start=*/-0.1f, /*step=*/0.05f); + const std::vector data1 = GetSequentialFloatData(shape1, /*start=*/-0.1f, /*step=*/0.05f); + RunQnnHtpQdqEinsum( + /*in0=*/TestInputDef(shape0, /*is_initializer=*/false, std::move(data0)), + /*in1=*/TestInputDef(shape1, /*is_initializer=*/false, std::move(data1)), + /*equation=*/"bhij,bhjd->bhid", + /*tolerance=*/QDQTolerance()); +} + +TEST_F(QnnHTPBackendTests, EinsumQdqRank4MatMulTransposeY) { + const std::vector shape0{2, 3, 4, 2}; + const std::vector shape1{2, 3, 5, 2}; + const std::vector data0 = GetSequentialFloatData(shape0, /*start=*/-0.1f, /*step=*/0.05f); + const std::vector data1 = GetSequentialFloatData(shape1, /*start=*/-0.1f, /*step=*/0.05f); + RunQnnHtpQdqEinsum( + /*in0=*/TestInputDef(shape0, /*is_initializer=*/false, std::move(data0)), + /*in1=*/TestInputDef(shape1, /*is_initializer=*/false, std::move(data1)), + /*equation=*/"bhid,bhjd->bhij", + /*tolerance=*/QDQTolerance()); +} + +TEST_F(QnnHTPBackendTests, EinsumQdqRank4MatMulTransposeAll1) { + const std::vector shape0{1, 3, 1, 7}; + const std::vector shape1{1, 7, 1, 3}; + const std::vector data0 = GetSequentialFloatData(shape0, /*start=*/-0.1f, /*step=*/0.05f); + const std::vector data1 = GetSequentialFloatData(shape1, /*start=*/-0.1f, 
/*step=*/0.05f); + RunQnnHtpQdqEinsum( + /*in0=*/TestInputDef(shape0, /*is_initializer=*/false, std::move(data0)), + /*in1=*/TestInputDef(shape1, /*is_initializer=*/false, std::move(data1)), + /*equation=*/"bchq,bkhc->bkhq", + /*tolerance=*/QDQTolerance()); +} + +TEST_F(QnnHTPBackendTests, EinsumQdqRank4MatMulTransposeAll2) { + const std::vector shape0{1, 4, 1, 4}; + const std::vector shape1{1, 9, 1, 4}; + const std::vector data0 = GetSequentialFloatData(shape0, /*start=*/-0.1f, /*step=*/0.05f); + const std::vector data1 = GetSequentialFloatData(shape1, /*start=*/-0.1f, /*step=*/0.05f); + RunQnnHtpQdqEinsum( + /*in0=*/TestInputDef(shape0, /*is_initializer=*/false, std::move(data0)), + /*in1=*/TestInputDef(shape1, /*is_initializer=*/false, std::move(data1)), + /*equation=*/"bkhq,bchk->bchq", + /*tolerance=*/QDQTolerance()); +} + +#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + +} // namespace test +} // namespace onnxruntime +#endif // !defined(ORT_MINIMAL_BUILD) diff --git a/onnxruntime/test/providers/qnn/gather_op_htp_test.cc b/onnxruntime/test/providers/qnn/gather_op_htp_test.cc index 326354dffa8ae..22459bb4f6941 100644 --- a/onnxruntime/test/providers/qnn/gather_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/gather_op_htp_test.cc @@ -178,7 +178,9 @@ static void RunOpTest(const std::string& op_type, } // Non-QDQ model, Gather with static input and dynamic int64 indices -TEST_F(QnnHTPBackendTests, GatherOp_IndicesStaticInt64) { +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_GatherOp_IndicesStaticInt64) { RunOpTest("Gather", TestInputDef({3, 2}, true, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.7f}), TestInputDef({2, 2}, false, {0, 1, 1, 2}), diff --git a/onnxruntime/test/providers/qnn/gemm_op_test.cc b/onnxruntime/test/providers/qnn/gemm_op_test.cc index a7c86806bf426..ddc2a09553df9 100644 --- a/onnxruntime/test/providers/qnn/gemm_op_test.cc +++ b/onnxruntime/test/providers/qnn/gemm_op_test.cc @@ -73,8 +73,9 @@ TEST_F(QnnCPUBackendTests, Gemm_2D_Bias_Unsupported) { ExpectedEPNodeAssignment::All); // Assigned to QNN EP. } +// since Qnn v2.34 value pair (120.73912, 121.73912) at index #0 don't match, which is 1 from 120.739 // Test Gemm with dynamic (i.e., not initializer) inputs (A, B, Bias). -TEST_F(QnnCPUBackendTests, Gemm_Dynamic_A_B_Bias) { +TEST_F(QnnCPUBackendTests, DISABLED_Gemm_Dynamic_A_B_Bias) { std::vector input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6); std::vector input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 24); std::vector input_c_data = GetFloatDataInRange(-1.0f, 1.0f, 4); @@ -110,8 +111,9 @@ TEST_F(QnnCPUBackendTests, Gemm_TransAB_Static_B_And_Bias) { ExpectedEPNodeAssignment::All); } +// Since Qnn 2.34 value pair (29.4347763, 30.4347763) at index #0 don't match, which is 1 from 29.4348 // Test Gemm with transposed A/B and dynamic (i.e., not initializer) B and Bias inputs. 
-TEST_F(QnnCPUBackendTests, Gemm_TransAB_Dynamic_B_And_Bias) { +TEST_F(QnnCPUBackendTests, DISABLED_Gemm_TransAB_Dynamic_B_And_Bias) { std::vector input_a_data = GetFloatDataInRange(-10.0f, 10.0f, 6); std::vector input_b_data = GetFloatDataInRange(-5.0f, 5.0f, 24); std::vector input_c_data = GetFloatDataInRange(-1.0f, 1.0f, 4); @@ -123,7 +125,8 @@ TEST_F(QnnCPUBackendTests, Gemm_TransAB_Dynamic_B_And_Bias) { ExpectedEPNodeAssignment::All); } -TEST_F(QnnCPUBackendTests, Gemm_Broadcast_Bias_DynamicInputs) { +// Since Qnn 2.34 value pair (11, 10) at index #0 don't match, which is -1 from 11 +TEST_F(QnnCPUBackendTests, DISABLED_Gemm_Broadcast_Bias_DynamicInputs) { std::vector input_a_data = {1.0f, 2.0f, 3.0f, 4.0f, -1.0f, -2.0f, -3.0f, -4.0f}; std::vector input_b_data(12, 1.0f); std::vector input_c_data = {1.0f, 2.0f, 3.0f}; @@ -317,8 +320,7 @@ TEST_F(QnnHTPBackendTests, Gemm_Broadcast_Bias_DynamicInputs) { ExpectedEPNodeAssignment::All, 13, false, - // Require tolerance of 0.74% on Windows ARM64. - QDQTolerance(0.0074f)); + QDQTolerance(0.00410f)); } TEST_F(QnnHTPBackendTests, Gemm_Broadcast_Bias_DynamicA_StaticB_DynamicC) { @@ -337,8 +339,7 @@ TEST_F(QnnHTPBackendTests, Gemm_Broadcast_Bias_DynamicA_StaticB_DynamicC) { ExpectedEPNodeAssignment::All, 13, false, - // Require tolerance of 0.74% on Windows ARM64. - QDQTolerance(0.0074f)); + QDQTolerance(0.00410f)); } TEST_F(QnnHTPBackendTests, Gemm_Broadcast_Bias_DynamicA_StaticB_StaticC) { @@ -357,8 +358,7 @@ TEST_F(QnnHTPBackendTests, Gemm_Broadcast_Bias_DynamicA_StaticB_StaticC) { ExpectedEPNodeAssignment::All, 13, false, - // Require tolerance of 0.74% on Windows ARM64. - QDQTolerance(0.0074f)); + QDQTolerance(0.00410f)); } // Test 16-bit QDQ Gemm with dynamic inputs A and Bias. The B input is an initializer. diff --git a/onnxruntime/test/providers/qnn/layer_norm_test.cc b/onnxruntime/test/providers/qnn/layer_norm_test.cc index 182877ddf200c..7aa3f030d9f43 100644 --- a/onnxruntime/test/providers/qnn/layer_norm_test.cc +++ b/onnxruntime/test/providers/qnn/layer_norm_test.cc @@ -32,13 +32,7 @@ static void RunLayerNormCpuTest(const TestInputDef& input_def, expected_ep_assignment); } -#ifdef __linux__ -// This CPU test fails on Linux, QNN SDK 2.17 -// the value pair (-1.75661933, 0) at index #1 don't match, which is 1.75662 from -1.75662 -TEST_F(QnnCPUBackendTests, DISABLED_LayerNorm) { -#else TEST_F(QnnCPUBackendTests, LayerNorm) { -#endif RunLayerNormCpuTest(TestInputDef({2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), TestInputDef({2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), {utils::MakeAttribute("axis", static_cast(0))}, @@ -210,7 +204,7 @@ TEST_F(QnnHTPBackendTests, LayerNorm1D_LastAxis_StaticScale_AU16_WU8) { // Test accuracy of 8-bit QDQ LayerNorm with a dynamic scale input. // -// TODO(adrianlizarraga): Fails to finalize with QNN SDK 2.22. Still fails on QNN SDK 2.28.2. +// TODO(adrianlizarraga): Fails to finalize with QNN SDK 2.22. Still fails on QNN SDK 2.35.0. 
 // Verbose logs:
 // Starting stage: Graph Transformations and Optimizations
 // C:\...\QNN\HTP\HTP\src\hexagon\prepare\graph_prepare.cc:203:ERROR:could not create op: q::flat_to_vtcm
diff --git a/onnxruntime/test/providers/qnn/lrn_op_test.cc b/onnxruntime/test/providers/qnn/lrn_op_test.cc
index bb3a40a47a750..35ec2cb450691 100644
--- a/onnxruntime/test/providers/qnn/lrn_op_test.cc
+++ b/onnxruntime/test/providers/qnn/lrn_op_test.cc
@@ -149,20 +149,13 @@ TEST_F(QnnHTPBackendTests, LRNSize5) {
 }
 
 TEST_F(QnnHTPBackendTests, LRN_size_larger_than_channel) {
-#ifdef __linux__
-  // On Linux QNN SDK 2.17: Need a tolerance of 0.407% of output range to pass.
-  QDQTolerance tolerance = QDQTolerance(0.00407f);
-#else
-  QDQTolerance tolerance = QDQTolerance();
-#endif
   RunQDQLRNOpTest(TestInputDef<float>({1, 128, 4, 5}, false, -10.0f, 10.0f),
                   255,  // Size
                   ExpectedEPNodeAssignment::All,
                   0.0001f,  // alpha
                   0.75f,    // beta
                   1.0f,     // bias
-                  13,  // opset
-                  tolerance);
+                  13);
 }
 
 #endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
diff --git a/onnxruntime/test/providers/qnn/lstm_test.cc b/onnxruntime/test/providers/qnn/lstm_test.cc
new file mode 100644
index 0000000000000..5d20806d3ea4d
--- /dev/null
+++ b/onnxruntime/test/providers/qnn/lstm_test.cc
@@ -0,0 +1,1217 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#if !defined(ORT_MINIMAL_BUILD)
+
+#include
+#include
+
+#include "test/optimizer/qdq_test_utils.h"
+#include "test/providers/qnn/qnn_test_utils.h"
+#include "test/providers/tester_types.h"
+
+#include "core/graph/onnx_protobuf.h"
+
+#include "gtest/gtest.h"
+
+namespace onnxruntime {
+namespace test {
+
+/*
+  ONNX LSTM inputs:
+    in[0]: X [seq_length, batch_size, input_size]
+    in[1]: W [num_directions, 4*hidden_size, input_size]
+    in[2]: R [num_directions, 4*hidden_size, hidden_size]
+
+  ONNX LSTM optional inputs:
+    in[3]: B [num_directions, 8*hidden_size]
+    in[4]:
+    in[5]: initial_h [num_directions, batch_size, hidden_size].
+    in[6]: initial_c [num_directions, batch_size, hidden_size].
+    in[7]: P [num_directions, 3*hidden_size]
+
+  ONNX LSTM Parameters:
+    - activation_alpha ---> Not supported by QNN.
+    - activation_beta ---> Not supported by QNN.
+    - activations ---> Not supported by QNN.
+    - clip ---> Not supported by QNN, since the clip in ONNX is applied to iofc while QNN applies it only to c. Refer to
+      https://github.com/microsoft/onnxruntime/blob/v1.21.0/onnxruntime/core/providers/cpu/rnn/uni_directional_lstm.cc
+    - direction
+    - hidden_size
+    - input_forget ---> Not supported by QNN
+    - layout: The shape format of inputs X, initial_h, initial_c and outputs Y, Y_h, Y_c.
+      If 0, the following shapes are expected:
+        X.shape = [seq_length, batch_size, input_size],
+        Y.shape = [seq_length, num_directions, batch_size, hidden_size],
+        initial_h.shape = Y_h.shape = initial_c.shape = Y_c.shape = [num_directions, batch_size, hidden_size].
+      If 1, the following shapes are expected:
+        X.shape = [batch_size, seq_length, input_size],
+        Y.shape = [batch_size, seq_length, num_directions, hidden_size],
+        initial_h.shape = Y_h.shape = initial_c.shape = Y_c.shape = [batch_size, num_directions, hidden_size].
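+
+  Shape example (illustrative; these are the sizes the sanity tests in this file use): with layout = 0,
+  direction = "forward" (num_directions = 1), seq_length = 6, batch_size = 3, input_size = 5, hidden_size = 4:
+    X:          [6, 3, 5]
+    W:          [1, 16, 5]    (4 * hidden_size = 16)
+    R:          [1, 16, 4]
+    B:          [1, 32]       (8 * hidden_size = 32)
+    initial_h:  [1, 3, 4]
+    initial_c:  [1, 3, 4]
+    P:          [1, 12]       (3 * hidden_size = 12)
+    Y:          [6, 1, 3, 4]
+    Y_h, Y_c:   [1, 3, 4]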
+ + ONNX LSTM optional outputs: + out[0]: Y [seq_length, num_directions, batch_size, hidden_size] + out[1]: Y_h [num_directions, batch_size, hidden_size] + out[2]: Y_c [num_directions, batch_size, hidden_size] + +*/ + +template +void _BuildLSTMTestCase(ModelTestBuilder& builder, + const TestInputDef& X_def, + const TestInputDef& W_def, + const TestInputDef& R_def, + const std::optional>> B_def, + const std::optional>> H_def, + const std::optional>> C_def, + const std::optional>> P_def, + const bool has_Y, + const bool has_Y_h, + const bool has_Y_c, + const std::string direction, + const int64_t hidden_size, + const int64_t layout, + const std::vector>& output_qparams) { + auto convert_input = [](ModelTestBuilder& builder, const TestInputDef& def) { + if (std::is_same::value) { + TestInputDef Fp16_def = ConvertToFP16InputDef(def); + return MakeTestInput(builder, Fp16_def); + } else if (std::is_same::value) { + NodeArg* input = MakeTestInput(builder, def); + QuantParams qparams = GetTestInputQuantParams(def); + return AddQDQNodePair(builder, input, qparams.scale, qparams.zero_point); + } else { + return MakeTestInput(builder, def); + } + }; + + NodeArg* inputX = convert_input(builder, X_def); + NodeArg* inputW = convert_input(builder, W_def); + NodeArg* inputR = convert_input(builder, R_def); + std::vector input_args = {inputX, inputW, inputR}; + + // optional inputs + // B + if (B_def) { + input_args.push_back(convert_input(builder, B_def->get())); + } else { + input_args.push_back(builder.MakeOptionalTensor()); + } + + // sequence length + input_args.push_back(builder.MakeOptionalTensor()); + + // H + if (H_def) { + input_args.push_back(convert_input(builder, H_def->get())); + } else { + input_args.push_back(builder.MakeOptionalTensor()); + } + + // C + if (C_def) { + input_args.push_back(convert_input(builder, C_def->get())); + } else { + input_args.push_back(builder.MakeOptionalTensor()); + } + + // P + if (P_def) { + input_args.push_back(convert_input(builder, P_def->get())); + } else { + input_args.push_back(builder.MakeOptionalTensor()); + } + + NodeArg *lstm_output_Y, *lstm_output_Y_h, *lstm_output_Y_c; + if (has_Y) { + if (std::is_same::value || std::is_same::value) { + lstm_output_Y = builder.MakeOutput(); + } else { + lstm_output_Y = builder.MakeIntermediate(); + } + } else { + lstm_output_Y = builder.MakeOptionalTensor(); + } + + if (has_Y_h) { + if (std::is_same::value || std::is_same::value) { + lstm_output_Y_h = builder.MakeOutput(); + } else { + lstm_output_Y_h = builder.MakeIntermediate(); + } + } else { + lstm_output_Y_h = builder.MakeOptionalTensor(); + } + if (has_Y_c) { + if (std::is_same::value || std::is_same::value) { + lstm_output_Y_c = builder.MakeOutput(); + } else { + lstm_output_Y_c = builder.MakeIntermediate(); + } + } else { + lstm_output_Y_c = builder.MakeOptionalTensor(); + } + + Node& lstm_node = builder.AddNode("LSTM", + input_args, + {lstm_output_Y, lstm_output_Y_h, lstm_output_Y_c}); + lstm_node.AddAttribute("direction", direction); + lstm_node.AddAttribute("hidden_size", hidden_size); + lstm_node.AddAttribute("layout", layout); + ORT_UNUSED_PARAMETER(output_qparams); + if (std::is_same::value) { + size_t i = 0; + if (has_Y) { + AddQDQNodePairWithOutputAsGraphOutput(builder, lstm_output_Y, output_qparams[i].scale, + output_qparams[i].zero_point); + i++; + } + if (has_Y_h) { + AddQDQNodePairWithOutputAsGraphOutput(builder, lstm_output_Y_h, output_qparams[i].scale, + output_qparams[i].zero_point); + i++; + } + if (has_Y_c) { + 
AddQDQNodePairWithOutputAsGraphOutput(builder, lstm_output_Y_c, output_qparams[i].scale, + output_qparams[i].zero_point); + i++; + } + } +} + +template +static GetTestModelFn BuildLSTMTestCase(const TestInputDef& X_def, + const TestInputDef& W_def, + const TestInputDef& R_def, + const std::optional>> B_def, + const std::optional>> H_def, + const std::optional>> C_def, + const std::optional>> P_def, + const bool has_Y, + const bool has_Y_h, + const bool has_Y_c, + const std::string direction, + const int64_t hidden_size, + const int64_t layout) { + return [X_def, W_def, R_def, B_def, + H_def, C_def, P_def, + has_Y, has_Y_h, has_Y_c, + direction, hidden_size, layout](ModelTestBuilder& builder) { + _BuildLSTMTestCase(builder, X_def, W_def, R_def, B_def, H_def, C_def, P_def, has_Y, has_Y_h, has_Y_c, direction, hidden_size, layout, {}); + }; +} + +template +static GetTestQDQModelFn BuildQDQLSTMTestCase(const TestInputDef& X_def, + const TestInputDef& W_def, + const TestInputDef& R_def, + const std::optional>> B_def, + const std::optional>> H_def, + const std::optional>> C_def, + const std::optional>> P_def, + const bool has_Y, + const bool has_Y_h, + const bool has_Y_c, + const std::string direction, + const int64_t hidden_size, + const int64_t layout) { + return [X_def, W_def, R_def, B_def, + H_def, C_def, P_def, + has_Y, has_Y_h, has_Y_c, + direction, hidden_size, layout](ModelTestBuilder& builder, + std::vector>& output_qparams) { + _BuildLSTMTestCase(builder, X_def, W_def, R_def, B_def, H_def, C_def, P_def, has_Y, has_Y_h, has_Y_c, direction, hidden_size, layout, output_qparams); + }; +} + +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + +// Runs an LSTM model on the QNN HTP backend. Checks the graph node assignment, and that inference +// outputs for QNN EP and CPU EP match. 
+// Note: There are accuracy on HTP in fixed point, to avoid the issue, we don't register QDQ selector for LSTM and it +// is running on HTP fp16 +template +static void RunHtpQDQLSTMOpTest(const TestInputDef& X_def, + const TestInputDef& W_def, + const TestInputDef& R_def, + const std::optional>> B_def, + const std::optional>> H_def, + const std::optional>> C_def, + const std::optional>> P_def, + const bool has_Y, + const bool has_Y_h, + const bool has_Y_c, + const std::string direction, + const int64_t hidden_size, + const int64_t layout, + ExpectedEPNodeAssignment expected_ep_assignment, + int opset = 22, + QDQTolerance tolerance = QDQTolerance()) { + ProviderOptions provider_options; + provider_options["backend_type"] = "htp"; + provider_options["offload_graph_io_quantization"] = "0"; + + TestQDQModelAccuracy(BuildLSTMTestCase(X_def, W_def, R_def, B_def, H_def, C_def, P_def, has_Y, has_Y_h, has_Y_c, direction, hidden_size, layout), + BuildQDQLSTMTestCase(X_def, W_def, R_def, B_def, H_def, C_def, P_def, has_Y, has_Y_h, has_Y_c, direction, hidden_size, layout), + provider_options, + opset, + expected_ep_assignment, + tolerance); +} + +static void RunHtpFp16LSTMOpTest(const TestInputDef& X_def, + const TestInputDef& W_def, + const TestInputDef& R_def, + const std::optional>> B_def, + const std::optional>> H_def, + const std::optional>> C_def, + const std::optional>> P_def, + const bool has_Y, + const bool has_Y_h, + const bool has_Y_c, + const std::string direction, + const int64_t hidden_size, + const int64_t layout, + ExpectedEPNodeAssignment expected_ep_assignment, + int opset = 22, + float tolerance = 0.004f) { + ProviderOptions provider_options; + provider_options["backend_type"] = "htp"; + + TestFp16ModelAccuracy(BuildLSTMTestCase(X_def, W_def, R_def, B_def, H_def, C_def, P_def, has_Y, has_Y_h, has_Y_c, direction, hidden_size, layout), + BuildLSTMTestCase(X_def, W_def, R_def, B_def, H_def, C_def, P_def, has_Y, has_Y_h, has_Y_c, direction, hidden_size, layout), + provider_options, + opset, + expected_ep_assignment, + tolerance); +} + +static void RunCpuFP32LSTMOpTest(const TestInputDef& X_def, + const TestInputDef& W_def, + const TestInputDef& R_def, + const std::optional>> B_def, + const std::optional>> H_def, + const std::optional>> C_def, + const std::optional>> P_def, + const bool has_Y, + const bool has_Y_h, + const bool has_Y_c, + const std::string direction, + const int64_t hidden_size, + const int64_t layout, + ExpectedEPNodeAssignment expected_ep_assignment, + int opset = 22, + float tolerance = 0.004f) { + ProviderOptions provider_options; + provider_options["backend_type"] = "cpu"; + + RunQnnModelTest(BuildLSTMTestCase(X_def, W_def, R_def, B_def, H_def, C_def, P_def, has_Y, has_Y_h, has_Y_c, direction, hidden_size, layout), + provider_options, + opset, + expected_ep_assignment, + tolerance); +} + +// QNN failed to finalize when P is provided +// TODO: Add P to unit test below once finalize issue is resolved + +// HTP QDQ +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. 
Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_QDQ_sanity_forward) { + std::string direction = "forward"; + uint32_t num_direction = 1; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunHtpQDQLSTMOpTest(TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::nullopt, // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_QDQ_sanity_reverse) { + std::string direction = "reverse"; + uint32_t num_direction = 1; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunHtpQDQLSTMOpTest(TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::nullopt, // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_QDQ_sanity_bidirectional) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunHtpQDQLSTMOpTest(TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::nullopt, // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. 
Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_QDQ_sanity_bidirectional_wo_B) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunHtpQDQLSTMOpTest(TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::nullopt, // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::nullopt, // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_QDQ_sanity_bidirectional_wo_H) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunHtpQDQLSTMOpTest(TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::nullopt, // initial_h + std::ref(C_def), // initial_c + std::nullopt, // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_QDQ_sanity_bidirectional_wo_C) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunHtpQDQLSTMOpTest(TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::nullopt, // initial_c + std::nullopt, // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. 
Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_QDQ_sanity_bidirectional_all_initializer) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, true, -0.5f, 0.5f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, true, -0.5f, 0.5f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, true, -0.5f, 0.5f); + RunHtpQDQLSTMOpTest(TestInputDef({seq_len, batch_size, input_size}, false, -0.5f, 0.5f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, true, -0.5f, 0.5f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, true, -0.5f, 0.5f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::nullopt, // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All, + 22, + QDQTolerance(0.008f)); +} + +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_QDQ_sanity_bidirectional_Y_only) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunHtpQDQLSTMOpTest(TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::nullopt, // P + true, // has_Y + false, // has_Y_h + false, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_QDQ_sanity_bidirectional_Y_h_only) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunHtpQDQLSTMOpTest(TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::nullopt, // P + false, // has_Y + true, // has_Y_h + false, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. 
Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_QDQ_sanity_bidirectional_Y_c_only) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunHtpQDQLSTMOpTest(TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::nullopt, // P + false, // has_Y + false, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// HTP Fp16 +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_Fp16_sanity_forward) { + std::string direction = "forward"; + uint32_t num_direction = 1; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunHtpFp16LSTMOpTest(TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::nullopt, // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_Fp16_sanity_reverse) { + std::string direction = "reverse"; + uint32_t num_direction = 1; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunHtpFp16LSTMOpTest(TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::nullopt, // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. 
Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_Fp16_sanity_bidirectional) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunHtpFp16LSTMOpTest( + TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::nullopt, // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_Fp16_sanity_bidirectional_wo_B) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunHtpFp16LSTMOpTest( + TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::nullopt, // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::nullopt, // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_Fp16_sanity_bidirectional_wo_H) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunHtpFp16LSTMOpTest( + TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::nullopt, // initial_h + std::ref(C_def), // initial_c + std::nullopt, // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. 
Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_Fp16_sanity_bidirectional_wo_C) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunHtpFp16LSTMOpTest( + TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::nullopt, // initial_c + std::nullopt, // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_Fp16_sanity_bidirectional_all_initializer) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, true, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, true, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, true, -1.0f, 1.0f); + RunHtpFp16LSTMOpTest( + TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, true, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, true, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::nullopt, // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_Fp16_sanity_bidirectional_Y_only) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunHtpFp16LSTMOpTest( + TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::nullopt, // P + true, // has_Y + false, // has_Y_h + false, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. 
Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_Fp16_sanity_bidirectional_Y_h_only) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunHtpFp16LSTMOpTest( + TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::nullopt, // P + false, // has_Y + true, // has_Y_h + false, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// Fails with QNN SDK 2.35.0: +// Failed to finalize QNN graph. Error code: 1002 +TEST_F(QnnHTPBackendTests, DISABLED_LSTM_Fp16_sanity_bidirectional_Y_c_only) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunHtpFp16LSTMOpTest( + TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::nullopt, // P + false, // has_Y + false, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +// CPU FP32 +TEST_F(QnnCPUBackendTests, LSTM_FP32_sanity_forward) { + std::string direction = "forward"; + uint32_t num_direction = 1; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto P_def = TestInputDef({num_direction, 3 * hidden_size}, false, -1.0f, 1.0f); + RunCpuFP32LSTMOpTest(TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::ref(P_def), // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +TEST_F(QnnCPUBackendTests, LSTM_FP32_sanity_reverse) { + std::string direction = "reverse"; + uint32_t num_direction = 1; + uint32_t batch_size = 3; + uint32_t 
hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto P_def = TestInputDef({num_direction, 3 * hidden_size}, false, -1.0f, 1.0f); + RunCpuFP32LSTMOpTest(TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::ref(P_def), // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +TEST_F(QnnCPUBackendTests, LSTM_FP32_sanity_bidirectional) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto P_def = TestInputDef({num_direction, 3 * hidden_size}, false, -1.0f, 1.0f); + RunCpuFP32LSTMOpTest( + TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::ref(P_def), // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +TEST_F(QnnCPUBackendTests, LSTM_FP32_sanity_bidirectional_wo_B) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto P_def = TestInputDef({num_direction, 3 * hidden_size}, false, -1.0f, 1.0f); + RunCpuFP32LSTMOpTest( + TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::nullopt, // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::ref(P_def), // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +TEST_F(QnnCPUBackendTests, LSTM_FP32_sanity_bidirectional_wo_H) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, 
hidden_size}, false, -1.0f, 1.0f); + auto P_def = TestInputDef({num_direction, 3 * hidden_size}, false, -1.0f, 1.0f); + RunCpuFP32LSTMOpTest( + TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::nullopt, // initial_h + std::ref(C_def), // initial_c + std::ref(P_def), // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +TEST_F(QnnCPUBackendTests, LSTM_FP32_sanity_bidirectional_wo_C) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto P_def = TestInputDef({num_direction, 3 * hidden_size}, false, -1.0f, 1.0f); + RunCpuFP32LSTMOpTest( + TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::nullopt, // initial_c + std::ref(P_def), // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +TEST_F(QnnCPUBackendTests, LSTM_FP32_sanity_bidirectional_wo_HC) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto P_def = TestInputDef({num_direction, 3 * hidden_size}, false, -1.0f, 1.0f); + RunCpuFP32LSTMOpTest( + TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::nullopt, // initial_h + std::nullopt, // initial_c + std::ref(P_def), // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +TEST_F(QnnCPUBackendTests, LSTM_FP32_sanity_bidirectional_wo_P) { + std::string direction = "forward"; + uint32_t num_direction = 1; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + RunCpuFP32LSTMOpTest(TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + 
std::nullopt, // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +TEST_F(QnnCPUBackendTests, LSTM_FP32_sanity_bidirectional_all_initializer) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, true, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, true, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, true, -1.0f, 1.0f); + auto P_def = TestInputDef({num_direction, 3 * hidden_size}, true, -1.0f, 1.0f); + RunCpuFP32LSTMOpTest( + TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, true, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, true, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::ref(P_def), // P + true, // has_Y + true, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +TEST_F(QnnCPUBackendTests, LSTM_FP32_sanity_bidirectional_Y_only) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto P_def = TestInputDef({num_direction, 3 * hidden_size}, false, -1.0f, 1.0f); + RunCpuFP32LSTMOpTest( + TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::ref(P_def), // P + true, // has_Y + false, // has_Y_h + false, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +TEST_F(QnnCPUBackendTests, LSTM_FP32_sanity_bidirectional_Y_h_only) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto P_def = TestInputDef({num_direction, 3 * hidden_size}, false, -1.0f, 1.0f); + RunCpuFP32LSTMOpTest( + TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::ref(P_def), // P + false, // has_Y + true, // has_Y_h + false, // has_Y_c + direction, // direction + 
hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +TEST_F(QnnCPUBackendTests, LSTM_FP32_sanity_bidirectional_Y_c_only) { + std::string direction = "bidirectional"; + uint32_t num_direction = 2; + uint32_t batch_size = 3; + uint32_t hidden_size = 4; + uint32_t input_size = 5; + uint32_t seq_len = 6; + auto B_def = TestInputDef({num_direction, 8 * hidden_size}, false, -1.0f, 1.0f); + auto H_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto C_def = TestInputDef({num_direction, batch_size, hidden_size}, false, -1.0f, 1.0f); + auto P_def = TestInputDef({num_direction, 3 * hidden_size}, false, -1.0f, 1.0f); + RunCpuFP32LSTMOpTest( + TestInputDef({seq_len, batch_size, input_size}, false, -1.0f, 1.0f), // X + TestInputDef({num_direction, 4 * hidden_size, input_size}, false, -1.0f, 1.0f), // W + TestInputDef({num_direction, 4 * hidden_size, hidden_size}, false, -1.0f, 1.0f), // R + std::ref(B_def), // B + std::ref(H_def), // initial_h + std::ref(C_def), // initial_c + std::ref(P_def), // P + false, // has_Y + false, // has_Y_h + true, // has_Y_c + direction, // direction + hidden_size, // hidden_size + 0, // layout + ExpectedEPNodeAssignment::All); +} + +#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + +} // namespace test +} // namespace onnxruntime + +#endif // !defined(ORT_MINIMAL_BUILD) diff --git a/onnxruntime/test/providers/qnn/matmul_test.cpp b/onnxruntime/test/providers/qnn/matmul_test.cpp index 09ead72889bca..e0ea04b7d163b 100644 --- a/onnxruntime/test/providers/qnn/matmul_test.cpp +++ b/onnxruntime/test/providers/qnn/matmul_test.cpp @@ -194,13 +194,7 @@ TEST_F(QnnCPUBackendTests, MatMulOp) { RunMatMulOpTest(false, {2, 3, 3, 3}, {3, 2}, false, true); RunMatMulOpTest(false, {2, 3, 3, 3}, {2, 3, 3, 2}, false, true); -#if defined(__linux__) - // TODO: This fails on Linux (HTP emulation). Works on Windows ARM64. - // Expected: contains 24 values, where each value and its corresponding value in 16-byte object <18-00 00-00 00-00 00-00 00-29 4E-53 A8-55 00-00> are an almost-equal pair - // Actual: 16-byte object <18-00 00-00 00-00 00-00 80-28 3E-53 A8-55 00-00>, where the value pair (0.0285999943, 0) at index #12 don't match, which is -0.0286 from 0.0286 -#else RunMatMulOpTest(false, {2, 1, 2, 3}, {3, 3, 2}, false, false); -#endif RunMatMulOpTest(false, {3}, {3}, false, false); RunMatMulOpTest(false, {3}, {3}, false, true); RunMatMulOpTest(false, {3}, {3}, true, false); @@ -285,7 +279,7 @@ TEST_F(QnnHTPBackendTests, MatMulOp_QDQ) { // UINT16, per-channel INT8 weight RunQDQPerChannelMatMulOpTest({2, 3}, {3, 2}, 1, QDQTolerance(), ExpectedEPNodeAssignment::All, 21, false, false); - RunQDQPerChannelMatMulOpTest({2, 3, 3}, {3}, -1, QDQTolerance(0.005f)); + RunQDQPerChannelMatMulOpTest({2, 3, 3}, {3}, -1, QDQTolerance(0.0041f)); } // Tests MatMul with two uint16 (quantized) inputs that are both dynamic. 
diff --git a/onnxruntime/test/providers/qnn/pool_op_test.cpp b/onnxruntime/test/providers/qnn/pool_op_test.cpp index ae194bd2ef920..c6d25e6addc42 100644 --- a/onnxruntime/test/providers/qnn/pool_op_test.cpp +++ b/onnxruntime/test/providers/qnn/pool_op_test.cpp @@ -182,10 +182,8 @@ TEST_F(QnnHTPBackendTests, MaxPool_Large_Input_HTP_u8) { utils::MakeAttribute("storage_order", static_cast(0)), utils::MakeAttribute("auto_pad", "NOTSET")}, ExpectedEPNodeAssignment::All, - 18, // opset - false, // use_contrib_qdq_ops - // Need a tolerance of 0.417% of output range after QNN SDK 2.17 - QDQTolerance(0.00417f)); + 18, // opset + false); // use_contrib_qdq_ops } TEST_F(QnnHTPBackendTests, MaxPool_Ceil_HTP_u8) { diff --git a/onnxruntime/test/providers/qnn/qnn_node_group/scale_softmax_fusion_test.cc b/onnxruntime/test/providers/qnn/qnn_node_group/scale_softmax_fusion_test.cc new file mode 100644 index 0000000000000..eda04b954f590 --- /dev/null +++ b/onnxruntime/test/providers/qnn/qnn_node_group/scale_softmax_fusion_test.cc @@ -0,0 +1,147 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(ORT_MINIMAL_BUILD) + +#include "core/graph/graph.h" +#include "core/graph/node_attr_utils.h" + +#include "test/optimizer/qdq_test_utils.h" +#include "test/providers/qnn/qnn_test_utils.h" +#include "gtest/gtest.h" + +namespace onnxruntime { +namespace test { + +namespace { + +GetTestModelFn BuildTestCaseScalar( + const TestInputDef& input_def, + float scale_value, + bool use_constant, + bool reverse_input_order, + std::optional softmax_axis = std::nullopt) { + return [&](ModelTestBuilder& builder) -> void { + NodeArg* input = MakeTestInput(builder, input_def); + NodeArg* scale{nullptr}; + if (use_constant) { + onnx::TensorProto scale_value_proto; + scale_value_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + utils::SetRawDataInTensorProto(scale_value_proto, reinterpret_cast(&scale_value), sizeof(float)); + scale = builder.MakeIntermediate(); + builder.AddNode("Constant", {}, {scale}).AddAttribute("value", scale_value_proto); + } else { + scale = builder.MakeScalarInitializer(scale_value); + } + NodeArg* intermediate = builder.MakeIntermediate(); + auto mul_inputs = reverse_input_order ? 
std::vector{scale, input} : std::vector{input, scale}; + builder.AddNode("Mul", mul_inputs, {intermediate}); + Node& softmax = builder.AddNode("Softmax", {intermediate}, {builder.MakeOutput()}); + if (softmax_axis.has_value()) { + softmax.AddAttribute("axis", softmax_axis.value()); + } + }; +} + +GetTestModelFn BuildTestCaseNoScalar(const TestInputDef& input_def1, const TestInputDef& input_def2) { + return [&input_def1, input_def2](ModelTestBuilder& builder) -> void { + NodeArg* input = MakeTestInput(builder, input_def1); + NodeArg* scale = MakeTestInput(builder, input_def2); + NodeArg* intermediate = builder.MakeIntermediate(); + builder.AddNode("Mul", {input, scale}, {intermediate}); + builder.AddNode("Softmax", {intermediate}, {builder.MakeOutput()}); + }; +} + +ProviderOptions GetProviderOptions() { + ProviderOptions provider_options; + provider_options["backend_type"] = "htp"; + provider_options["offload_graph_io_quantization"] = "0"; + return provider_options; +} + +} // namespace + +#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + +TEST_F(QnnHTPBackendTests, DISABLED_ScaleSoftmaxFusionScalarInitializer) { + ProviderOptions provider_options = GetProviderOptions(); + + auto input_def = TestInputDef({1, 3, 5, 5}, false, -0.5f, 0.5f); + RunQnnModelTest(BuildTestCaseScalar(input_def, 0.125f, /*use_constant=*/false, /*reverse_input_order=*/false), + provider_options, + /*opset_version=*/13, + /*expected_ep_assignment=*/ExpectedEPNodeAssignment::All, + /*fp32_abs_err=*/1e-2f); +} + +TEST_F(QnnHTPBackendTests, DISABLED_ScaleSoftmaxFusionScalarConstant) { + ProviderOptions provider_options = GetProviderOptions(); + + auto input_def = TestInputDef({1, 3, 5, 5}, false, -0.5f, 0.5f); + RunQnnModelTest(BuildTestCaseScalar(input_def, 0.375f, /*use_constant=*/true, /*reverse_input_order=*/false), + provider_options, + /*opset_version=*/14, + /*expected_ep_assignment=*/ExpectedEPNodeAssignment::All, + /*fp32_abs_err=*/1e-2f); +} + +TEST_F(QnnHTPBackendTests, DISABLED_ScaleSoftmaxFusionScalarInitializerReversed) { + ProviderOptions provider_options = GetProviderOptions(); + auto input_def = TestInputDef({1, 3, 5, 5}, false, -0.5f, 0.5f); + RunQnnModelTest(BuildTestCaseScalar(input_def, 0.375f, /*use_constant=*/false, /*reverse_input_order=*/true), + provider_options, + /*opset_version=*/15, + /*expected_ep_assignment=*/ExpectedEPNodeAssignment::All, + /*fp32_abs_err=*/1e-2f); +} + +TEST_F(QnnHTPBackendTests, DISABLED_ScaleSoftmaxFusionScalarConstantReversed) { + ProviderOptions provider_options = GetProviderOptions(); + auto input_def = TestInputDef({1, 3, 5, 5}, false, -0.5f, 0.5f); + RunQnnModelTest(BuildTestCaseScalar(input_def, 0.125f, /*use_constant=*/true, /*reverse_input_order=*/true), + provider_options, + /*opset_version=*/16, + /*expected_ep_assignment=*/ExpectedEPNodeAssignment::All, + /*fp32_abs_err=*/1e-2f); +} + +TEST_F(QnnHTPBackendTests, DISABLED_ScaleSoftmaxFusionSoftmaxNegativeAxis) { + ProviderOptions provider_options = GetProviderOptions(); + auto input_def = TestInputDef({1, 3, 5, 5}, false, -0.5f, 0.5f); + RunQnnModelTest(BuildTestCaseScalar(input_def, 0.125f, + /*use_constant=*/true, /*reverse_input_order=*/true, /*softmax_axis=*/-1), + provider_options, + /*opset_version=*/22, + /*expected_ep_assignment=*/ExpectedEPNodeAssignment::All, + /*fp32_abs_err=*/1e-2f); +} + +TEST_F(QnnHTPBackendTests, ScaleSoftmaxFusionSkipNoScalar4d) { + ProviderOptions provider_options = GetProviderOptions(); + auto input_def1 = TestInputDef({1, 3, 5, 5}, false, -0.5f, 0.5f);
+ auto input_def2 = TestInputDef({1, 3, 5, 5}, false, -0.5f, 0.5f); + RunQnnModelTest(BuildTestCaseNoScalar(input_def1, input_def2), + provider_options, + /*opset_version=*/13, + /*expected_ep_assignment=*/ExpectedEPNodeAssignment::All, + /*fp32_abs_err=*/1e-2f); +} + +TEST_F(QnnHTPBackendTests, ScaleSoftmaxFusionSkipNoScalar1d) { + ProviderOptions provider_options = GetProviderOptions(); + auto input_def1 = TestInputDef({1, 3, 5, 5}, false, -0.5f, 0.5f); + auto input_def2 = TestInputDef({1}, false, -0.5f, 0.5f); + RunQnnModelTest(BuildTestCaseNoScalar(input_def1, input_def2), + provider_options, + /*opset_version=*/13, + /*expected_ep_assignment=*/ExpectedEPNodeAssignment::All, + /*fp32_abs_err=*/1e-2f); +} + +#endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) + +} // namespace test +} // namespace onnxruntime + +#endif // !defined(ORT_MINIMAL_BUILD) diff --git a/onnxruntime/test/providers/qnn/resize_test.cc b/onnxruntime/test/providers/qnn/resize_test.cc index fbd729fa998d9..4ab9c6fbd8961 100644 --- a/onnxruntime/test/providers/qnn/resize_test.cc +++ b/onnxruntime/test/providers/qnn/resize_test.cc @@ -336,9 +336,7 @@ TEST_F(QnnHTPBackendTests, Resize_DownSample_Linear_HalfPixel) { RunQDQResizeOpTest(TestInputDef({1, 1, 2, 4}, false, input_data), {1, 1, 1, 2}, "linear", "half_pixel", "", ExpectedEPNodeAssignment::All, - 19, - // Need tolerance of 0.539% of output range after QNN SDK 2.17 - QDQTolerance(0.00539f)); + 19); } // Test 2x QDQ Resize mode: "linear", coordinate_transformation_mode: "pytorch_half_pixel" @@ -348,9 +346,7 @@ TEST_F(QnnHTPBackendTests, ResizeU8_2xLinearPytorchHalfPixel) { RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, input_data), {1, 3, 8, 8}, "linear", "pytorch_half_pixel", "", ExpectedEPNodeAssignment::All, - 19, - // Need tolerance of 0.609% of output range after QNN SDK 2.17 - QDQTolerance(0.00609f)); + 19); } // Test 2x QDQ Resize mode: "linear", coordinate_transformation_mode: "half_pixel" @@ -360,9 +356,7 @@ TEST_F(QnnHTPBackendTests, ResizeU8_2xLinearHalfPixel) { RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, input_data), {1, 3, 8, 8}, "linear", "half_pixel", "", ExpectedEPNodeAssignment::All, - 19, - // Need tolerance of 0.609% of output range after QNN SDK 2.17 - QDQTolerance(0.00609f)); + 19); } // Test 2x QDQ Resize mode: "linear", coordinate_transformation_mode: "align_corners" @@ -372,9 +366,7 @@ TEST_F(QnnHTPBackendTests, ResizeU8_2xLinearAlignCorners) { RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, input_data), {1, 3, 8, 8}, "linear", "align_corners", "", ExpectedEPNodeAssignment::All, - 19, - // Need tolerance of 0.533% of output range after QNN SDK 2.17 - QDQTolerance(0.00533f)); + 19); } // Test 2x QDQ Resize mode: "linear", coordinate_transformation_mode: "asymmetric" @@ -384,9 +376,7 @@ TEST_F(QnnHTPBackendTests, ResizeU8_2xLinearAsymmetric) { RunQDQResizeOpTest(TestInputDef({1, 3, 4, 4}, false, input_data), {1, 3, 8, 8}, "linear", "asymmetric", "", ExpectedEPNodeAssignment::All, - 19, - // Need tolerance of 0.619% of output range after QNN SDK 2.17 - QDQTolerance(0.00619f)); + 19); } // Test 2x QDQ Resize mode: "nearest", coordinate_transformation_mode: "half_pixel", nearest_mode: "round_prefer_floor" diff --git a/onnxruntime/test/providers/qnn/transpose_htp_test.cc b/onnxruntime/test/providers/qnn/transpose_htp_test.cc index f206e517408bf..83ff6440c8399 100644 --- a/onnxruntime/test/providers/qnn/transpose_htp_test.cc +++ b/onnxruntime/test/providers/qnn/transpose_htp_test.cc @@ -120,7 +120,9 @@ 
TEST_F(QnnHTPBackendTests, TransposeInt32OnHTP) { } // Check that QNN supports Transpose with float32 data input on HTP -TEST_F(QnnHTPBackendTests, TransposeFloatOnHTP) { +// Fails with QNN SDK 2.35.0: +// value pair (0.183528364, 0.183471695) at index #0 don't match, which is -5.66691e-05 from 0.183528 +TEST_F(QnnHTPBackendTests, DISABLED_TransposeFloatOnHTP) { RunTransposeNonQDQOnHTP(TestInputDef({1, 3, 224, 128}, false, 0, 10.0f), {utils::MakeAttribute("perm", std::vector{0, 2, 3, 1})}, ExpectedEPNodeAssignment::All, false); diff --git a/onnxruntime/test/providers/qnn/upsample_op_test.cc b/onnxruntime/test/providers/qnn/upsample_op_test.cc new file mode 100644 index 0000000000000..3371bbef44e1b --- /dev/null +++ b/onnxruntime/test/providers/qnn/upsample_op_test.cc @@ -0,0 +1,114 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#if !defined(ORT_MINIMAL_BUILD) + +#include +#include + +#include "test/providers/qnn/qnn_test_utils.h" +#include "core/graph/node_attr_utils.h" + +#include "core/graph/onnx_protobuf.h" +#include "gtest/gtest.h" + +namespace onnxruntime { +namespace test { + +// Runs a model with an Upsample operator on the QNN CPU backend. Checks the graph node assignment +// and that inference outputs for QNN EP and CPU EP match. +template +static void RunUpsampleTestOnCPU(const TestInputDef& input_def, + const TestInputDef& scales_def, + std::vector&& attrs, + ExpectedEPNodeAssignment expected_ep_assignment, + int opset = 9) { + ProviderOptions provider_options; + provider_options["backend_type"] = "cpu"; + provider_options["offload_graph_io_quantization"] = "0"; + + if (opset <= 7) { + const std::vector& scales = scales_def.GetRawData(); + attrs.push_back(utils::MakeAttribute("scales", scales)); + + RunQnnModelTest(BuildOpTestCase("Upsample", {input_def}, {}, attrs), + provider_options, + opset, + expected_ep_assignment); + } else { + RunQnnModelTest(BuildOpTestCase("Upsample", {input_def}, {scales_def}, attrs), + provider_options, + opset, + expected_ep_assignment); + } +} + +// +// CPU tests: +// + +// Test that Upsample with a dynamic scales input is not supported by QNN EP. +TEST_F(QnnCPUBackendTests, Upsample_DynamicScales_Unsupported) { + RunUpsampleTestOnCPU(TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f), + TestInputDef({4}, false /* is_initializer */, {1.0f, 1.0f, 1.5f, 1.5f}), + {utils::MakeAttribute("mode", "nearest")}, // Attributes + ExpectedEPNodeAssignment::None, // Should not be assigned to QNN EP.
+ 9); // Opset +} + +// Test Upsample with opset-9, mode `nearest` +TEST_F(QnnCPUBackendTests, Upsample_4D_Nearest_opset9) { + RunUpsampleTestOnCPU(TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f), + TestInputDef({4}, true, {1.0f, 1.0f, 1.5f, 1.5f}), + {utils::MakeAttribute("mode", "nearest")}, // Attributes + ExpectedEPNodeAssignment::All, + 9); // Opset +} + +// Test Upsample with opset-9, mode `linear` +TEST_F(QnnCPUBackendTests, Upsample_4D_Linear_opset9) { + RunUpsampleTestOnCPU(TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f), + TestInputDef({4}, true, {1.0f, 1.0f, 1.5f, 1.5f}), + {utils::MakeAttribute("mode", "linear")}, // Attributes + ExpectedEPNodeAssignment::All, + 9); // Opset +} + +// Test Upsample with opset-7, mode `nearest` +TEST_F(QnnCPUBackendTests, Upsample_4D_Nearest_opset7) { + RunUpsampleTestOnCPU(TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f), + TestInputDef({4}, true, {1.0f, 1.0f, 1.5f, 1.5f}), + {utils::MakeAttribute("mode", "nearest")}, // Attributes + ExpectedEPNodeAssignment::All, + 7); // Opset +} + +// Test Upsample with opset-7, mode `linear` +TEST_F(QnnCPUBackendTests, Upsample_4D_Linear_opset7) { + RunUpsampleTestOnCPU(TestInputDef({1, 3, 4, 4}, false, -10.0f, 10.0f), + TestInputDef({4}, true, {1.0f, 1.0f, 1.5f, 1.5f}), + {utils::MakeAttribute("mode", "linear")}, // Attributes + ExpectedEPNodeAssignment::All, + 7); // Opset +} + +// Test Upsample 5D +TEST_F(QnnCPUBackendTests, Upsample_5D) { + RunUpsampleTestOnCPU(TestInputDef({1, 3, 4, 4, 4}, false, -10.0f, 10.0f), + TestInputDef({5}, true, {1.0f, 1.0f, 1.5f, 1.5f, 1.5f}), + {utils::MakeAttribute("mode", "nearest")}, // Attributes + ExpectedEPNodeAssignment::All, + 9); // Opset +} + +/* +QNN HTP backend tests for the QDQ Upsample model are bypassed and cannot be enabled. + +ONNX Upsample is deprecated in domain version 10. However, ONNX QuantizeLinear and DequantizeLinear are enabled in +domain version 10. Their conditions are mutually exclusive, so it is not possible for these ops to coexist in the +same domain version.
+*/ + +} // namespace test +} // namespace onnxruntime +#endif // !defined(ORT_MINIMAL_BUILD) diff --git a/setup.py b/setup.py index 1e426ea8e060b..3e0a96db39390 100644 --- a/setup.py +++ b/setup.py @@ -371,7 +371,6 @@ def finalize_options(self): "libQnnSaver.so", "libQnnSystem.so", "libHtpPrepare.so", - "ep_weight_sharing_ctx_gen", ] dl_libs.extend(qnn_deps) if nightly_build: @@ -474,7 +473,7 @@ def finalize_options(self): examples = [path.join("datasets", x) for x in examples_names] # Extra files such as EULA and ThirdPartyNotices (and Qualcomm License, only for QNN release packages) -extra = ["LICENSE", "ThirdPartyNotices.txt", "Privacy.md", "Qualcomm AI Hub Proprietary License.pdf"] +extra = ["LICENSE", "ThirdPartyNotices.txt", "Privacy.md", "Qualcomm_LICENSE.pdf"] # Description readme_file = "docs/python/ReadMeOV.rst" if is_openvino else "docs/python/README.rst" diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 2a06916a8208a..ef1954efbb9a2 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -461,6 +461,7 @@ def generate_build_tree( else "OFF" ), "-Donnxruntime_REDUCED_OPS_BUILD=" + ("ON" if is_reduced_ops_build(args) else "OFF"), + "-Donnxruntime_CLIENT_PACKAGE_BUILD=" + ("ON" if args.client_package_build else "OFF"), "-Donnxruntime_BUILD_MS_EXPERIMENTAL_OPS=" + ("ON" if args.ms_experimental else "OFF"), "-Donnxruntime_ENABLE_LTO=" + ("ON" if args.enable_lto else "OFF"), "-Donnxruntime_USE_ACL=" + ("ON" if args.use_acl else "OFF"), diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py index 215ad77335083..edd04ed77cc17 100644 --- a/tools/ci_build/build_args.py +++ b/tools/ci_build/build_args.py @@ -527,6 +527,15 @@ def add_size_reduction_args(parser: argparse.ArgumentParser) -> None: ) + +def add_client_package_args(parser: argparse.ArgumentParser) -> None: + """Adds arguments for the client package build.""" + parser.add_argument( + "--client_package_build", + action="store_true", + help="Create ORT package with default settings more appropriate for client/on-device workloads.", + ) + + def add_python_binding_args(parser: argparse.ArgumentParser) -> None: + """Adds arguments for Python bindings.""" + parser.add_argument("--enable_pybind", action="store_true", help="Enable Python bindings.") @@ -835,6 +844,7 @@ def convert_arg_line_to_args(self, arg_line: str) -> list[str]: # Use list[str] add_dependency_args(parser) add_extension_args(parser) add_size_reduction_args(parser) + add_client_package_args(parser) # Language Bindings add_python_binding_args(parser) diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml index ba6a33b07e765..91f35d2b54033 100644 --- a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.33.2.250410 + default: 2.36.1.250708 jobs: - job: Build_QNN_EP @@ -52,7 +52,7 @@ jobs: - script: sudo chmod go+rw /dev/kvm displayName: Update permissions to KVM - - template: templates/jobs/download_linux_qnn_sdk.yml + - template: templates/jobs/init_linux_qnn_sdk_x64.yml parameters: QnnSDKVersion: ${{ parameters.QnnSdk }} diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 093ce0a49aa9e..69ccd95ee6eb4 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -60,7 +60,7 @@ parameters: - name: QnnSdk displayName: QNN SDK Version type: string - default: 2.33.0.250327 + default: 2.36.1.250708 resources: repositories: @@ -189,8 +189,8 @@ extends: DoEsrp: ${{ parameters.DoEsrp }} NuPackScript: | msbuild $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj /p:Configuration=RelWithDebInfo /t:CreatePackage /p:OrtPackageId=Microsoft.ML.OnnxRuntime.DirectML /p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} /p:CurrentData=$(BuildDate) /p:CurrentTime=$(BuildTime) - copy $(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo\Microsoft.ML.OnnxRuntime.DirectML.1.22.1.nupkg $(Build.ArtifactStagingDirectory) - copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\Microsoft.ML.OnnxRuntime.DirectML.1.22.1.nupkg $(Build.ArtifactStagingDirectory) + copy $(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo\Microsoft.ML.OnnxRuntime.DirectML.1.22.2.nupkg $(Build.ArtifactStagingDirectory) + copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\Microsoft.ML.OnnxRuntime.DirectML.1.22.2.nupkg $(Build.ArtifactStagingDirectory) mkdir $(Build.ArtifactStagingDirectory)\testdata copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\custom_op_library.* $(Build.ArtifactStagingDirectory)\testdata @@ -210,7 +210,7 @@ extends: NuPackScript: | msbuild $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj /p:Configuration=RelWithDebInfo /p:TargetArchitecture=x86 /t:CreatePackage /p:OrtPackageId=Microsoft.ML.OnnxRuntime.DirectML /p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} cd $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\ - ren Microsoft.ML.OnnxRuntime.DirectML.1.22.1.nupkg win-dml-x86.zip + ren Microsoft.ML.OnnxRuntime.DirectML.1.22.2.nupkg win-dml-x86.zip copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\win-dml-x86.zip $(Build.ArtifactStagingDirectory) mkdir $(Build.ArtifactStagingDirectory)\testdata copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\custom_op_library.* $(Build.ArtifactStagingDirectory)\testdata @@ -231,7 +231,7 @@ extends: NuPackScript: | msbuild $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj /p:Configuration=RelWithDebInfo /p:TargetArchitecture=arm64 /t:CreatePackage /p:OrtPackageId=Microsoft.ML.OnnxRuntime.DirectML /p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} cd $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\ - ren Microsoft.ML.OnnxRuntime.DirectML.1.22.1.nupkg win-dml-arm64.zip + ren Microsoft.ML.OnnxRuntime.DirectML.1.22.2.nupkg win-dml-arm64.zip copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\win-dml-arm64.zip $(Build.ArtifactStagingDirectory) mkdir $(Build.ArtifactStagingDirectory)\testdata copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\custom_op_library.* $(Build.ArtifactStagingDirectory)\testdata diff --git a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml index b1a7c92dc3529..5fafd1ee15485 100644 --- a/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/custom-nuget-packaging-pipeline.yml @@ -6,7 +6,7 @@ parameters: - name: 
QnnSdk displayName: QNN SDK Version type: string - default: 2.33.2.250410 + default: 2.36.1.250708 - name: IsReleaseBuild displayName: Is a release build? Set it to true if you are doing an Onnx Runtime release. diff --git a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml index f08fd70d6d6cf..526ed71df2006 100644 --- a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml @@ -33,7 +33,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.33.2.250410 + default: 2.36.1.250708 jobs: - job: Build_QNN_EP diff --git a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml index d19f9bde7ad75..b99246625cb77 100644 --- a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml @@ -59,7 +59,7 @@ parameters: - name: qnn_sdk_version type: string displayName: 'QNN SDK version. Only for QNN packages.' - default: 2.33.2.250410 + default: 2.36.1.250708 trigger: none diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml index 722a3162cfed8..626a638121858 100644 --- a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml @@ -2,7 +2,7 @@ parameters: - name: QnnSdk displayName: QNN SDK Version type: string - default: 2.33.2.250410 + default: 2.36.1.250708 - name: build_config displayName: Build Configuration diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml index 4c18fb73cd779..6a1f0ef464df0 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml @@ -59,7 +59,7 @@ parameters: - name: qnn_sdk_version type: string displayName: 'QNN SDK version. Only for QNN packages.' 
- default: 2.33.2.250410 + default: 2.36.1.250708 stages: - ${{ if eq(parameters.enable_windows_cpu, true) }}: @@ -287,7 +287,7 @@ stages: - template: ../templates/py-linux.yml parameters: arch: 'x86_64' - machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU-Large' + machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU' extra_build_arg: ${{ parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} is1ES: true diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml index eea9b672eef3d..45fc78a4f6e03 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml @@ -62,7 +62,7 @@ stages: - template: py-linux-gpu-stage.yml parameters: arch: 'x86_64' - machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU-Large' + machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU' extra_build_arg: ${{ parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} cuda_version: ${{ parameters.cuda_version }} diff --git a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml index d1fa72d7e4413..74f7f782fe1b2 100644 --- a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml +++ b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar-test.yml @@ -19,7 +19,7 @@ parameters: - name: QnnSDKVersion displayName: QNN SDK Version type: string - default: '2.33.0.250327' + default: '2.36.1.250708' - name: enableWebGpu displayName: Enable WebGPU test diff --git a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar.yml b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar.yml index 4474a6b45ef58..bbb84642320fb 100644 --- a/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar.yml +++ b/tools/ci_build/github/azure-pipelines/templates/android-java-api-aar.yml @@ -53,7 +53,7 @@ parameters: - name: QnnSDKVersion displayName: QNN SDK Version type: string - default: '2.33.0.250327' + default: '2.36.1.250708' - name: is1ES displayName: Is 1ES pipeline @@ -103,7 +103,7 @@ jobs: - template: use-android-ndk.yml - ${{ if contains(parameters.packageName, 'qnn') }}: - - template: jobs/download_linux_qnn_sdk.yml + - template: jobs/init_linux_qnn_sdk_x64.yml parameters: QnnSDKVersion: '${{parameters.QnnSDKVersion}}' diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 9f65fc8891e94..cac46e26fef1c 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -47,7 +47,7 @@ parameters: - name: QnnSDKVersion displayName: QNN SDK Version type: string - default: 2.33.0.250327 + default: 2.36.1.250708 - name: is1ES displayName: Is 1ES pipeline diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml index e00e40b80b723..57703239fc594 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml @@ -1,7 +1,7 @@ parameters: - name: QnnSDKVersion type: string - default: '2.33.2.250410' + default: '2.36.1.250708' steps: - script: | 
@@ -39,10 +39,6 @@ steps: fi displayName: "Sanity Check: QnnSDKVersion vs sdk.yaml version" - - script: | - azcopy cp --recursive 'https://lotusscus.blob.core.windows.net/models/qnnsdk/Qualcomm AI Hub Proprietary License.pdf' $(QnnSDKRootDir) - displayName: 'Download Qualcomm AI Hub license' - - script: | ls -al $(QnnSDKRootDir) displayName: 'Print contents of QNN SDK' diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml index 3b27060b3fcec..d2e401f3f6ab4 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml @@ -1,7 +1,7 @@ parameters: - name: QnnSDKVersion type: string - default: '2.33.2.250410' + default: '2.36.1.250708' steps: - powershell: | @@ -18,10 +18,6 @@ steps: echo $(QnnSDKRootDir) displayName: 'Print QnnSDKRootDir after downloading QNN SDK' - - powershell: | - azcopy.exe cp --recursive 'https://lotusscus.blob.core.windows.net/models/qnnsdk/Qualcomm AI Hub Proprietary License.pdf' $(QnnSDKRootDir) - displayName: 'Download Qualcomm AI Hub license' - - task: CmdLine@2 displayName: 'Print contents of QNN SDK' inputs: diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/init_linux_qnn_sdk_x64.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/init_linux_qnn_sdk_x64.yml new file mode 100644 index 0000000000000..b7fb8a51f28be --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/init_linux_qnn_sdk_x64.yml @@ -0,0 +1,42 @@ +parameters: + - name: QnnSDKVersion + type: string + default: '2.36.1.250708' + +steps: + - bash: | + echo "##vso[task.setvariable variable=QnnSDKRootDir]/data/qnnsdk/qnn-v${{ parameters.QnnSDKVersion }}" + displayName: Set QnnSDKRootDir + + - script: | + echo $(QnnSDKRootDir) + displayName: 'Print QnnSDKRootDir after downloading QNN SDK' + + - script: | + set -x + sdk_file="$(QnnSDKRootDir)/sdk.yaml" + # Parse the sdk.yaml file to get the QNN SDK version downloaded + downloaded_qnn_sdk_version=$(grep '^version:' "$sdk_file" | head -n 1 | cut -d':' -f2 | xargs | cut -d'.' -f1-3 | tr -d '\r') + + # Extract major.minor.patch part from QnnSDKVersion passed as parameter + expected_qnn_sdk_version=$(echo ${{ parameters.QnnSDKVersion }} | cut -d'.' -f1-3) + + if [[ -z "$downloaded_qnn_sdk_version" ]]; then + echo "QNN version not found in sdk.yaml." + exit 1 + fi + + # Compare provided version with version from sdk.yaml + if [[ "$downloaded_qnn_sdk_version" == "$expected_qnn_sdk_version" ]]; then + echo "Success: QnnSDKVersion matches sdk.yaml version ($downloaded_qnn_sdk_version)." 
+ else + echo "Error: QnnSDKVersion ($expected_qnn_sdk_version) does not match sdk.yaml version ($downloaded_qnn_sdk_version) in the QNN SDK directory" + exit 1 + fi + displayName: "Sanity Check: QnnSDKVersion vs sdk.yaml version" + + + + - script: | + ls -al $(QnnSDKRootDir) + displayName: 'Print contents of QNN SDK' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-linux-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-linux-qnn.yml index c361fe678699e..a7cbf196c10fd 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-linux-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-linux-qnn.yml @@ -26,7 +26,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.33.2.250410 + default: 2.36.1.250708 - name: is1ES displayName: 'Whether the pipeline is running in 1ES' @@ -60,7 +60,7 @@ jobs: clean: true submodules: none - - template: jobs/download_linux_qnn_sdk.yml + - template: jobs/init_linux_qnn_sdk_x64.yml parameters: QnnSDKVersion: ${{ parameters.QnnSdk }} diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml index c1f47de63c38c..185f41822a7e5 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml @@ -7,7 +7,7 @@ parameters: - name: QNN_SDK displayName: QNN SDK Version type: string - default: 2.33.2.250410 + default: 2.36.1.250708 - name: ENV_SETUP_SCRIPT type: string diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml index 6df46bfc8e1b0..9a1e7e5e251c9 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml @@ -7,7 +7,7 @@ parameters: - name: QNN_SDK displayName: QNN SDK Version type: string - default: 2.33.2.250410 + default: 2.36.1.250708 - name: ENV_SETUP_SCRIPT type: string @@ -91,7 +91,7 @@ jobs: --use_qnn --qnn_home $(QnnSDKRootDir) --enable_pybind - --parallel --update --arm64ec + --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --update --arm64ec $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} workingDirectory: '$(Build.BinariesDirectory)' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml index 72c8323d032ed..5affc152a0a4a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml @@ -7,7 +7,7 @@ parameters: - name: QNN_SDK displayName: QNN SDK Version type: string - default: 2.33.2.250410 + default: 2.36.1.250708 - name: ENV_SETUP_SCRIPT type: string diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index 05b4485e98ebd..29ebb8c4e4e61 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -1,5 +1,5 @@ parameters: - QnnSdk: '2.33.2.250410' + QnnSdk: '2.36.1.250708' build_config: 'RelWithDebInfo' IsReleaseBuild: false DoEsrp: false @@ -20,7 +20,7 @@ stages: name: ${{ parameters.qnn_ep_build_pool_name }} variables: OrtPackageId: ${{ parameters.OrtNugetPackageId }} - commonBuildArgs: '--compile_no_warning_as_error 
--skip_submodule_sync --build_shared_lib --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --parallel --use_binskim_compliant_compile_flags ' + commonBuildArgs: '--compile_no_warning_as_error --skip_submodule_sync --build_shared_lib --client_package_build --cmake_generator "Visual Studio 17 2022" --config ${{ parameters.build_config }} --parallel --use_vcpkg --use_vcpkg_ms_internal_asset_cache --use_binskim_compliant_compile_flags ' steps: - template: set-version-number-variables-step.yml @@ -125,4 +125,4 @@ stages: displayName: 'Publish Pipeline Qnn NuGet Artifact' inputs: artifactName: 'drop-signed-nuget-qnn' - targetPath: '$(Build.ArtifactStagingDirectory)' \ No newline at end of file + targetPath: '$(Build.ArtifactStagingDirectory)' diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml index 93a9909e529f8..7ebf5394e4530 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml @@ -33,7 +33,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.33.2.250410 + default: 2.36.1.250708 jobs: - job: 'BUILD_QNN_EP' @@ -50,7 +50,7 @@ jobs: matrix: SHARED_LIB: QnnLibKind: 'shared_lib' - ExtraQnnBuildArgs: '' + ExtraQnnBuildArgs: '--client_package_build' STATIC_LIB: QnnLibKind: 'static_lib' ExtraQnnBuildArgs: '' diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml index b83621d285f9a..ffeb577547f69 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml @@ -33,7 +33,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.33.2.250410 + default: 2.36.1.250708 jobs: - job: 'BUILD_QNN_EP' diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index 419fdd47458f7..f5fa612aab9a5 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -1080,8 +1080,8 @@ def generate_files(line_list, args): files_list.append( "' + + os.path.join(args.native_build_path, "Qualcomm_LICENSE.pdf") + + '" target="Qualcomm_LICENSE.pdf" />' ) files_list.append("")