From 748e7af8dd6e9b4683a6402a0ca6598fe23a9c1e Mon Sep 17 00:00:00 2001
From: Krish Gupta <krishom70@gmail.com>
Date: Tue, 9 Dec 2025 20:40:21 +0530
Subject: [PATCH 01/85] [flang][OpenMP] Fix firstprivate not working with
 lastprivate in DO SIMD (#170163)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes a bug where firstprivate was ignored when the same variable
had both firstprivate and lastprivate clauses in a do simd construct.

What was broken:
```
integer :: a
a = 10
!$omp do simd firstprivate(a) lastprivate(a)
do i = 1, 1
   print *, a  ! Should print 10, but printed garbage/0
   a = 20
end do
!$omp end do simd
print *, a  ! Correctly prints 20
```

Inside the loop, [a] wasn't being initialized from the firstprivate
clause—it just had whatever uninitialized value was there.

The fix:

In genCompositeDoSimd(), we were using simdItemDSP to handle
privatization for the whole loop nest. This only looked at SIMD clauses
and missed the firstprivate from the DO part. Changed it to use
wsloopItemDSP instead, which handles both DO clauses (firstprivate,
lastprivate) correctly.

One line change in OpenMP.cpp

Tests added:

Lowering test to check MLIR generation
Runtime test to verify the actual values are correct
<img width="740" height="440" alt="image"
src="https://github.com/user-attachments/assets/fa911ea8-2024-4edf-b710-52c10659742e"
/>


Fixes #168306

---------

Co-authored-by: Krish Gupta <krishgupta@Krishs-MacBook-Air.local>
---
 flang/lib/Lower/OpenMP/OpenMP.cpp             | 20 ++---
 ...-simd-firstprivate-lastprivate-runtime.f90 | 48 ++++++++++
 .../do-simd-firstprivate-lastprivate.f90      | 89 +++++++++++++++++++
 flang/test/Lower/OpenMP/order-clause.f90      |  8 +-
 flang/test/Lower/OpenMP/wsloop-simd.f90       |  9 +-
 5 files changed, 151 insertions(+), 23 deletions(-)
 create mode 100644 flang/test/Integration/OpenMP/do-simd-firstprivate-lastprivate-runtime.f90
 create mode 100644 flang/test/Lower/OpenMP/do-simd-firstprivate-lastprivate.f90
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 582e684442dfc..9c25c1955cb78 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -3314,17 +3314,12 @@ static mlir::omp::WsloopOp genCompositeDoSimd(
   genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps,
                  simdReductionSyms);
 
-  DataSharingProcessor wsloopItemDSP(
-      converter, semaCtx, doItem->clauses, eval,
-      /*shouldCollectPreDeterminedSymbols=*/false,
-      /*useDelayedPrivatization=*/true, symTable);
+  DataSharingProcessor wsloopItemDSP(converter, semaCtx, doItem->clauses, eval,
+                                     /*shouldCollectPreDeterminedSymbols=*/true,
+                                     /*useDelayedPrivatization=*/true,
+                                     symTable);
   wsloopItemDSP.processStep1(&wsloopClauseOps);
 
-  DataSharingProcessor simdItemDSP(converter, semaCtx, simdItem->clauses, eval,
-                                   /*shouldCollectPreDeterminedSymbols=*/true,
-                                   /*useDelayedPrivatization=*/true, symTable);
-  simdItemDSP.processStep1(&simdClauseOps, simdItem->id);
-
   // Pass the innermost leaf construct's clauses because that's where COLLAPSE
   // is placed by construct decomposition.
   mlir::omp::LoopNestOperands loopNestClauseOps;
@@ -3343,8 +3338,9 @@ static mlir::omp::WsloopOp genCompositeDoSimd(
   wsloopOp.setComposite(/*val=*/true);
 
   EntryBlockArgs simdArgs;
-  simdArgs.priv.syms = simdItemDSP.getDelayedPrivSymbols();
-  simdArgs.priv.vars = simdClauseOps.privateVars;
+  // For composite 'do simd', privatization is handled by the wsloop.
+  // The simd does not create separate private storage for variables already
+  // privatized by the worksharing construct.
   simdArgs.reduction.syms = simdReductionSyms;
   simdArgs.reduction.vars = simdClauseOps.reductionVars;
   auto simdOp =
@@ -3354,7 +3350,7 @@ static mlir::omp::WsloopOp genCompositeDoSimd(
   genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, simdItem,
                 loopNestClauseOps, iv,
                 {{wsloopOp, wsloopArgs}, {simdOp, simdArgs}},
-                llvm::omp::Directive::OMPD_do_simd, simdItemDSP);
+                llvm::omp::Directive::OMPD_do_simd, wsloopItemDSP);
   return wsloopOp;
 }
 
diff --git a/flang/test/Integration/OpenMP/do-simd-firstprivate-lastprivate-runtime.f90 b/flang/test/Integration/OpenMP/do-simd-firstprivate-lastprivate-runtime.f90
new file mode 100644
index 0000000000000..4fef69188e0ee
--- /dev/null
+++ b/flang/test/Integration/OpenMP/do-simd-firstprivate-lastprivate-runtime.f90
@@ -0,0 +1,48 @@
+! Test runtime behavior of DO SIMD with firstprivate and lastprivate on same variable
+! This is the reproducer from issue #168306
+
+! REQUIRES: openmp-runtime
+
+! RUN: %flang_fc1 -fopenmp -emit-llvm %s -o - | FileCheck %s --check-prefix=LLVM
+! RUN: %flang -fopenmp %s -o %t && %t | FileCheck %s
+
+! LLVM-LABEL: define {{.*}} @_QQmain
+program main
+  integer :: a
+  integer :: i
+  
+  a = 10
+  !$omp do simd lastprivate(a) firstprivate(a)
+  do i = 1, 1
+     ! Inside loop: a should be 10 (from firstprivate initialization)
+     ! CHECK: main1 : a = 10
+     print *, "main1 : a = ", a
+     a = 20
+  end do
+  !$omp end do simd
+  ! After loop: a should be 20 (from lastprivate copy-out)
+  ! CHECK: main2 : a = 20
+  print *, "main2 : a = ", a
+  
+  call sub
+  ! CHECK: pass
+  print *, 'pass'
+end program main
+
+subroutine sub
+  integer :: a
+  integer :: i
+  
+  a = 10
+  !$omp do simd lastprivate(a) firstprivate(a)
+  do i = 1, 1
+     ! Inside loop: a should be 10 (from firstprivate initialization)
+     ! CHECK: sub1  : a = 10
+     print *, "sub1  : a = ", a
+     a = 20
+  end do
+  !$omp end do simd
+  ! After loop: a should be 20 (from lastprivate copy-out)
+  ! CHECK: sub2  : a = 20
+  print *, "sub2  : a = ", a
+end subroutine sub
diff --git a/flang/test/Lower/OpenMP/do-simd-firstprivate-lastprivate.f90 b/flang/test/Lower/OpenMP/do-simd-firstprivate-lastprivate.f90
new file mode 100644
index 0000000000000..429409926d47b
--- /dev/null
+++ b/flang/test/Lower/OpenMP/do-simd-firstprivate-lastprivate.f90
@@ -0,0 +1,89 @@
+! Test for DO SIMD with the same variable in both firstprivate and lastprivate clauses
+! This tests the fix for issue #168306
+
+! RUN: %flang_fc1 -fopenmp -mmlir --enable-delayed-privatization-staging=true -emit-hlfir %s -o - | FileCheck %s
+
+! Test case 1: Basic test with firstprivate + lastprivate on same variable
+! CHECK-LABEL: func.func @_QPdo_simd_first_last_same_var
+subroutine do_simd_first_last_same_var()
+  integer :: a
+  integer :: i
+  a = 10
+
+  ! CHECK:      omp.wsloop
+  ! CHECK-SAME: private(@{{.*}}firstprivate{{.*}} %{{.*}} -> %[[FIRSTPRIV_A:.*]], @{{.*}}private{{.*}} %{{.*}} -> %[[PRIV_I:.*]] : !fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK-NEXT: omp.simd
+  ! CHECK-NOT: private
+  ! CHECK-NEXT: omp.loop_nest (%[[IV:.*]]) : i32
+  !$omp do simd firstprivate(a) lastprivate(a)
+  do i = 1, 1
+    ! CHECK: %[[FIRSTPRIV_A_DECL:.*]]:2 = hlfir.declare %[[FIRSTPRIV_A]]
+    ! CHECK: %[[PRIV_I_DECL:.*]]:2 = hlfir.declare %[[PRIV_I]]
+    ! The private copy should be initialized from firstprivate (value 10)
+    ! and then modified to 20
+    a = 20
+  end do
+  !$omp end do simd
+  ! After the loop, 'a' should be 20 due to lastprivate
+end subroutine do_simd_first_last_same_var
+
+! Test case 2: Test with lastprivate and firstprivate in reverse order
+! CHECK-LABEL: func.func @_QPdo_simd_last_first_reverse
+subroutine do_simd_last_first_reverse()
+  integer :: a
+  integer :: i
+  a = 10
+
+  ! CHECK:      omp.wsloop
+  ! CHECK-SAME: private(@{{.*}}firstprivate{{.*}} %{{.*}} -> %[[FIRSTPRIV_A:.*]], @{{.*}}private{{.*}} %{{.*}} -> %[[PRIV_I:.*]] : !fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK-NEXT: omp.simd
+  ! CHECK-NOT: private
+  !$omp do simd lastprivate(a) firstprivate(a)
+  do i = 1, 1
+    a = 20
+  end do
+  !$omp end do simd
+end subroutine do_simd_last_first_reverse
+
+! Test case 3: Multiple variables with mixed privatization
+! CHECK-LABEL: func.func @_QPdo_simd_multiple_vars
+subroutine do_simd_multiple_vars()
+  integer :: a, b, c
+  integer :: i
+  a = 10
+  b = 20
+  c = 30
+
+  ! CHECK:      omp.wsloop
+  ! CHECK-SAME: private(@{{.*}}firstprivate{{.*}} %{{.*}} -> %{{.*}}, @{{.*}}firstprivate{{.*}} %{{.*}} -> %{{.*}}, @{{.*}}private{{.*}} %{{.*}} -> %{{.*}} : !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK-NEXT: omp.simd
+  ! CHECK-NOT: private
+  !$omp do simd firstprivate(a, b) lastprivate(a) private(c)
+  do i = 1, 5
+    a = a + 1
+    b = b + 1
+    c = i
+  end do
+  !$omp end do simd
+end subroutine do_simd_multiple_vars
+
+! Test case 4: Reproducer from issue #168306
+! CHECK-LABEL: func.func @_QPissue_168306_reproducer
+subroutine issue_168306_reproducer()
+  integer :: a
+  integer :: i
+  a = 10
+
+  ! CHECK:      omp.wsloop
+  ! CHECK-SAME: private(@{{.*}}firstprivate{{.*}} %{{.*}} -> %[[FIRSTPRIV_A:.*]], @{{.*}}private{{.*}} %{{.*}} -> %[[PRIV_I:.*]] : !fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK-NEXT: omp.simd
+  ! CHECK-NOT: private
+  !$omp do simd lastprivate(a) firstprivate(a)
+  do i = 1, 1
+    ! Inside the loop, 'a' should start at 10 (from firstprivate)
+    ! This is the key behavior that was broken
+    a = 20
+  end do
+  !$omp end do simd
+  ! After the loop, 'a' should be 20 (from lastprivate)
+end subroutine issue_168306_reproducer
diff --git a/flang/test/Lower/OpenMP/order-clause.f90 b/flang/test/Lower/OpenMP/order-clause.f90
index d5799079b3759..9da7d905ceeed 100644
--- a/flang/test/Lower/OpenMP/order-clause.f90
+++ b/flang/test/Lower/OpenMP/order-clause.f90
@@ -36,15 +36,15 @@ end subroutine do_order
 
 !CHECK-LABEL:   func.func @_QPdo_simd_order() {
 subroutine do_simd_order
-   !CHECK: omp.wsloop order(reproducible:concurrent) {
+   !CHECK: omp.wsloop order(reproducible:concurrent)
    !$omp do simd order(concurrent)
    do i = 1, 10
    end do
-   !CHECK: omp.wsloop order(reproducible:concurrent) {
+   !CHECK: omp.wsloop order(reproducible:concurrent)
    !$omp do simd order(reproducible:concurrent)
    do i = 1, 10
    end do
-   !CHECK: omp.wsloop order(unconstrained:concurrent) {
+   !CHECK: omp.wsloop order(unconstrained:concurrent)
    !$omp do simd order(unconstrained:concurrent)
    do i = 1, 10
    end do
@@ -53,7 +53,7 @@ end subroutine do_simd_order
 !CHECK-LABEL:   func.func @_QPdo_simd_order_parallel() {
 subroutine do_simd_order_parallel
    !CHECK: omp.parallel {
-   !CHECK: omp.wsloop order(reproducible:concurrent) {
+   !CHECK: omp.wsloop order(reproducible:concurrent)
    !$omp parallel do simd order(reproducible:concurrent)
    do i = 1, 10
    end do
diff --git a/flang/test/Lower/OpenMP/wsloop-simd.f90 b/flang/test/Lower/OpenMP/wsloop-simd.f90
index 03e35de04cace..b18bc29efb230 100644
--- a/flang/test/Lower/OpenMP/wsloop-simd.f90
+++ b/flang/test/Lower/OpenMP/wsloop-simd.f90
@@ -71,16 +71,13 @@ end subroutine do_simd_reduction
 subroutine do_simd_private()
   integer, allocatable :: tmp
   ! CHECK:      omp.wsloop
+  ! CHECK-SAME: private(@[[PRIV_IVAR_SYM:.*]] %{{.*}} -> %[[PRIV_IVAR:.*]] : !fir.ref<i32>)
   ! CHECK-NEXT: omp.simd
-  ! CHECK-SAME: private(@[[PRIV_BOX_SYM:.*]] %{{.*}} -> %[[PRIV_BOX:.*]], @[[PRIV_IVAR_SYM:.*]] %{{.*}} -> %[[PRIV_IVAR:.*]] : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<i32>)
   ! CHECK-NEXT: omp.loop_nest (%[[IVAR:.*]]) : i32
   !$omp do simd private(tmp)
   do i=1, 10
-  ! CHECK:      %[[PRIV_BOX_DECL:.*]]:2 = hlfir.declare %[[PRIV_BOX]]
   ! CHECK:      %[[PRIV_IVAR_DECL:.*]]:2 = hlfir.declare %[[PRIV_IVAR]]
   ! CHECK:      hlfir.assign %[[IVAR]] to %[[PRIV_IVAR_DECL]]#0
-  ! CHECK:      %[[PRIV_BOX_LOAD:.*]] = fir.load %[[PRIV_BOX_DECL]]
-  ! CHECK:      hlfir.assign %{{.*}} to %[[PRIV_BOX_DECL]]#0
   ! CHECK:      omp.yield
     tmp = tmp + 1
   end do
@@ -90,13 +87,11 @@ end subroutine do_simd_private
 subroutine do_simd_lastprivate_firstprivate()
   integer :: a
   ! CHECK:      omp.wsloop
-  ! CHECK-SAME: private(@[[FIRSTPRIVATE_A_SYM:.*]] %{{.*}} -> %[[FIRSTPRIVATE_A:.*]] : !fir.ref<i32>)
+  ! CHECK-SAME: private(@[[FIRSTPRIVATE_A_SYM:.*]] %{{.*}} -> %[[FIRSTPRIVATE_A:.*]], @[[PRIVATE_I_SYM:.*]] %{{.*}} -> %[[PRIVATE_I:.*]] : !fir.ref<i32>, !fir.ref<i32>)
   ! CHECK-NEXT: omp.simd
-  ! CHECK-SAME: private(@[[PRIVATE_A_SYM:.*]] %{{.*}} -> %[[PRIVATE_A:.*]], @[[PRIVATE_I_SYM:.*]] %{{.*}} -> %[[PRIVATE_I:.*]] : !fir.ref<i32>, !fir.ref<i32>)
   !$omp do simd lastprivate(a) firstprivate(a)
   do i = 1, 10
     ! CHECK: %[[FIRSTPRIVATE_A_DECL:.*]]:2 = hlfir.declare %[[FIRSTPRIVATE_A]]
-    ! CHECK: %[[PRIVATE_A_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_A]]
     ! CHECK: %[[PRIVATE_I_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_I]]
     a = a + 1
   end do

From b0bd8bdbd89701173db0d446757aad1ad166f08d Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 9 Dec 2025 14:56:02 +0100
Subject: [PATCH 02/85] [AtomicExpand] Use getSigned() for negative value

---
 llvm/lib/CodeGen/AtomicExpandPass.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index d9bc042d6807e..d19862ad7c188 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -853,8 +853,8 @@ static PartwordMaskValues createMaskInstrs(IRBuilderBase &Builder,
   if (AddrAlign < MinWordSize) {
     PMV.AlignedAddr = Builder.CreateIntrinsic(
         Intrinsic::ptrmask, {PtrTy, IntTy},
-        {Addr, ConstantInt::get(IntTy, ~(uint64_t)(MinWordSize - 1))}, nullptr,
-        "AlignedAddr");
+        {Addr, ConstantInt::getSigned(IntTy, ~(uint64_t)(MinWordSize - 1))},
+        nullptr, "AlignedAddr");
 
     Value *AddrInt = Builder.CreatePtrToInt(Addr, IntTy);
     PtrLSB = Builder.CreateAnd(AddrInt, MinWordSize - 1, "PtrLSB");

From 80fc9bc17715cb4e68ec233ae1db668ac321e777 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 9 Dec 2025 15:20:08 +0100
Subject: [PATCH 03/85] [Hexagon] Use getSigned() for signed value

---
 llvm/lib/Target/Hexagon/HexagonISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index bae9d705f5a7a..025e5b087abed 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -2527,7 +2527,7 @@ HexagonTargetLowering::getBuildVectorConstInts(ArrayRef<SDValue> Values,
     // Make sure to always cast to IntTy.
     if (auto *CN = dyn_cast<ConstantSDNode>(V.getNode())) {
       const ConstantInt *CI = CN->getConstantIntValue();
-      Consts[i] = ConstantInt::get(IntTy, CI->getValue().getSExtValue());
+      Consts[i] = ConstantInt::getSigned(IntTy, CI->getValue().getSExtValue());
     } else if (auto *CN = dyn_cast<ConstantFPSDNode>(V.getNode())) {
       const ConstantFP *CF = CN->getConstantFPValue();
       APInt A = CF->getValueAPF().bitcastToAPInt();

From cf9ba401e3fb2a9c3c728f1a0f49e75db372e704 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 9 Dec 2025 14:38:28 +0100
Subject: [PATCH 04/85] [BypassSlowDivision] Explicitly create bit mask

Explicitly create the high bit mask using getBitsSetFrom() instead
of inverting an integer. This avoids relying on implicit
truncation.
---
 llvm/lib/Transforms/Utils/BypassSlowDivision.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
index 9f6d89e97180f..66d8fea251cbd 100644
--- a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -335,10 +335,10 @@ Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
   else
     OrV = Op1 ? Op1 : Op2;
 
-  // BitMask is inverted to check if the operands are
-  // larger than the bypass type
-  uint64_t BitMask = ~BypassType->getBitMask();
-  Value *AndV = Builder.CreateAnd(OrV, BitMask);
+  // Check whether the operands are larger than the bypass type.
+  Value *AndV = Builder.CreateAnd(
+      OrV, APInt::getBitsSetFrom(OrV->getType()->getIntegerBitWidth(),
+                                 BypassType->getBitWidth()));
 
   // Compare operand values
   Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);

From 005ef5cda00d4a5a03c9b7ebd2113b5f2314da89 Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan <yifanzhu@rochester.edu>
Date: Tue, 9 Dec 2025 10:14:32 -0500
Subject: [PATCH 05/85] [libc][CI] update macOS version in workflow
 configuration (#171228)

upgrade macOS version to latest stable version in github action. We run
into a problem that timed `os_sync` API only becomes available in 14.4+.
---
 .github/workflows/libc-overlay-tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/libc-overlay-tests.yml b/.github/workflows/libc-overlay-tests.yml
index 807377564fa13..6bb01d502050e 100644
--- a/.github/workflows/libc-overlay-tests.yml
+++ b/.github/workflows/libc-overlay-tests.yml
@@ -16,7 +16,7 @@ jobs:
       # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations.
       fail-fast: false
       matrix:
-        os: [ubuntu-24.04, ubuntu-24.04-arm, windows-2022, windows-2025, macos-14]
+        os: [ubuntu-24.04, ubuntu-24.04-arm, windows-2022, windows-2025, macos-15]
         include:
           # TODO: add linux gcc when it is fixed
           - os: ubuntu-24.04
@@ -35,7 +35,7 @@ jobs:
             compiler:
               c_compiler: clang-cl
               cpp_compiler: clang-cl
-          - os: macos-14
+          - os: macos-15
             compiler:
               c_compiler: clang
               cpp_compiler: clang++

From 6960b633ee7633d0ed7e9853baea296dbe201ab2 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 9 Dec 2025 12:37:42 +0100
Subject: [PATCH 06/85] [LSR] Use getSigned() for negated immediate

---
 llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 63b228efe3b11..68cffd4c18688 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -5805,7 +5805,7 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
       // negated immediate.
       if (!ICmpScaledV)
         ICmpScaledV =
-            ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue());
+            ConstantInt::getSigned(IntTy, -(uint64_t)Offset.getFixedValue());
       else {
         Ops.push_back(SE.getUnknown(ICmpScaledV));
         ICmpScaledV = ConstantInt::get(IntTy, Offset.getFixedValue());

From 4afc92e43a1462df613e26997f64cb368d552648 Mon Sep 17 00:00:00 2001
From: Jessica Clarke <jrtc27@jrtc27.com>
Date: Tue, 9 Dec 2025 15:26:09 +0000
Subject: [PATCH 07/85] [NFC][ELF] Remove pointless NEEDS_TLSGD_TO_IE (#171046)

NEEDS_TLSGD_TO_IE is only ever set when the symbol is preeptible, in
which case addTpOffsetGotEntry will just add the symbol to the GOT and
emit a symbolic tlsGotRel anyway, so there is no need to give it its own
special case.

As well as simplifying the code upstream, this is useful downstream for
Morello, which doesn't really have a proper GD/IE-to-LE relaxation, and
so for GD-to-IE can benefit from being able to use the optimisations
addTpOffsetGotEntry has for non-preemptible symbols, rather than having
to reimplement them here.
---
 lld/ELF/Relocations.cpp | 9 ++-------
 lld/ELF/Symbols.h       | 4 ++--
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index ef19a2af0c4d2..59aa43036ce01 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -1295,7 +1295,7 @@ unsigned RelocScan::handleTlsRelocation(RelExpr expr, RelType type,
     // label, so TLSDESC=>IE will be categorized as R_RELAX_TLS_GD_TO_LE. We fix
     // the categorization in RISCV::relocateAllosec->
     if (sym.isPreemptible) {
-      sym.setFlags(NEEDS_TLSGD_TO_IE);
+      sym.setFlags(NEEDS_TLSIE);
       sec->addReloc({ctx.target->adjustTlsExpr(type, R_RELAX_TLS_GD_TO_IE),
                      type, offset, addend, &sym});
     } else {
@@ -1635,18 +1635,13 @@ void elf::postScanRelocations(Ctx &ctx) {
       else
         got->addConstant({R_ABS, ctx.target->tlsOffsetRel, offsetOff, 0, &sym});
     }
-    if (flags & NEEDS_TLSGD_TO_IE) {
-      got->addEntry(sym);
-      ctx.mainPart->relaDyn->addSymbolReloc(ctx.target->tlsGotRel, *got,
-                                            sym.getGotOffset(ctx), sym);
-    }
     if (flags & NEEDS_GOT_DTPREL) {
       got->addEntry(sym);
       got->addConstant(
           {R_ABS, ctx.target->tlsOffsetRel, sym.getGotOffset(ctx), 0, &sym});
     }
 
-    if ((flags & NEEDS_TLSIE) && !(flags & NEEDS_TLSGD_TO_IE))
+    if (flags & NEEDS_TLSIE)
       addTpOffsetGotEntry(ctx, sym);
   };
 
diff --git a/lld/ELF/Symbols.h b/lld/ELF/Symbols.h
index a7d61f48ed3d5..034c8734addb8 100644
--- a/lld/ELF/Symbols.h
+++ b/lld/ELF/Symbols.h
@@ -48,7 +48,7 @@ enum {
   NEEDS_COPY = 1 << 3,
   NEEDS_TLSDESC = 1 << 4,
   NEEDS_TLSGD = 1 << 5,
-  NEEDS_TLSGD_TO_IE = 1 << 6,
+  // 1 << 6 unused
   NEEDS_GOT_DTPREL = 1 << 7,
   NEEDS_TLSIE = 1 << 8,
   NEEDS_GOT_AUTH = 1 << 9,
@@ -352,7 +352,7 @@ class Symbol {
   bool needsDynReloc() const {
     return flags.load(std::memory_order_relaxed) &
            (NEEDS_COPY | NEEDS_GOT | NEEDS_PLT | NEEDS_TLSDESC | NEEDS_TLSGD |
-            NEEDS_TLSGD_TO_IE | NEEDS_GOT_DTPREL | NEEDS_TLSIE);
+            NEEDS_GOT_DTPREL | NEEDS_TLSIE);
   }
   void allocateAux(Ctx &ctx) {
     assert(auxIdx == 0);

From 51d928f0cff426c70d368afd09afd14cf9a67dfe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gergely=20B=C3=A1lint?= <gergely.balint@arm.com>
Date: Tue, 9 Dec 2025 16:36:13 +0100
Subject: [PATCH 08/85] [BOLT] Fix pacret-synchronous-unwind.cpp test (#171395)

The test case build a binary from C++, and checks for the number of
functions the PointerAuthCFIFixup pass runs on.
This can change based on the platform. To account for this, the patch
changes the number to a regex.

The test failed when running on RHEL 9.
---
 .../runtime/AArch64/pacret-synchronous-unwind.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
index 1bfeeaed3715a..0f5e9a38da2ba 100644
--- a/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
+++ b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
@@ -11,12 +11,15 @@
 // RUN: -fno-asynchronous-unwind-tables \
 // RUN: %s -o %t.exe -Wl,-q
 // RUN: llvm-bolt %t.exe -o %t.bolt | FileCheck %s --check-prefix=CHECK
-//
-// CHECK: PointerAuthCFIAnalyzer ran on 3 functions. Ignored
-// CHECK-NOT: 0 functions (0.00%) because of CFI inconsistencies
-// CHECK-SAME: 1 functions (33.33%) because of CFI inconsistencies
-// CHECK-NEXT: BOLT-WARNING: PointerAuthCFIAnalyzer only supports asynchronous
-// CHECK-SAME: unwind tables. For C compilers, see -fasynchronous-unwind-tables.
+
+// Number of functions with .cfi-negate-ra-state in the binary is
+// platform-dependent.
+// CHECK: BOLT-INFO: PointerAuthCFIAnalyzer ran on {{[0-9]+}} functions.
+// CHECK-SAME: Ignored {{[0-9]}} functions ({{[0-9.]+}}%) because of CFI
+// CHECK-SAME: inconsistencies
+// CHECK-NEXT: BOLT-WARNING: PointerAuthCFIAnalyzer only supports
+// CHECK-SAME: asynchronous unwind tables. For C compilers, see
+// CHECK-SAME: -fasynchronous-unwind-tables.
 
 #include <cstdio>
 #include <stdexcept>

From b2ddb909cfd410ed22fee79511bd09e1ba3d9829 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Tue, 9 Dec 2025 16:37:32 +0100
Subject: [PATCH 09/85] [libc++] Don't try to be compatible with libstdc++ in
 __libcpp_refstring on iOS (#170816)

iOS doesn't provide a libstdc++ dylib anymore, so we can remove the
compatiblity check whether we can load the dylib.
---
 libcxx/src/include/refstring.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/src/include/refstring.h b/libcxx/src/include/refstring.h
index 3e0ec7a97c7be..1c73c60f9ced1 100644
--- a/libcxx/src/include/refstring.h
+++ b/libcxx/src/include/refstring.h
@@ -15,7 +15,7 @@
 #include <cstring>
 #include <stdexcept>
 
-// MacOS and iOS used to ship with libstdc++, and still support old applications
+// MacOS used to ship with libstdc++, and still support old applications
 // linking against libstdc++. The libc++ and libstdc++ exceptions are supposed
 // to be ABI compatible, such that they can be thrown from one library and caught
 // in the other.
@@ -25,7 +25,7 @@
 // string singleton before manipulating the reference count. This is done so that
 // if an exception is created with a zero-length string in libstdc++, libc++abi
 // won't try to delete the memory.
-#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) || defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__)
+#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__)
 #  define _LIBCPP_CHECK_FOR_GCC_EMPTY_STRING_STORAGE
 #  include <dlfcn.h>
 #  include <mach-o/dyld.h>

From 6b58449b2c813588f9c251d4f0963762d6746881 Mon Sep 17 00:00:00 2001
From: Aaron Ballman <aaron@aaronballman.com>
Date: Tue, 9 Dec 2025 10:43:47 -0500
Subject: [PATCH 10/85] Update the NATVIS file

ElaboratedType is no longer a thing.
---
 clang/utils/ClangVisualizers/clang.natvis | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/clang/utils/ClangVisualizers/clang.natvis b/clang/utils/ClangVisualizers/clang.natvis
index 3ecd93902d1bb..0755f0ffcbf56 100644
--- a/clang/utils/ClangVisualizers/clang.natvis
+++ b/clang/utils/ClangVisualizers/clang.natvis
@@ -44,9 +44,6 @@ For later versions of Visual Studio, no setup is required-->
     <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::Decayed" IncludeView="poly">{(clang::DecayedType *)this,na}</DisplayString>
     <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::Decayed" IncludeView="left">{(clang::DecayedType *)this,view(left)na}</DisplayString>
     <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::Decayed" IncludeView="right">{(clang::DecayedType *)this,view(right)na}</DisplayString>
-    <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::Elaborated" IncludeView="poly">{(clang::ElaboratedType *)this,na}</DisplayString>
-    <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::Elaborated" IncludeView="left">{(clang::ElaboratedType *)this,view(left)na}</DisplayString>
-    <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::Elaborated" IncludeView="right">{(clang::ElaboratedType *)this,view(right)na}</DisplayString>
     <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::TemplateTypeParm" IncludeView="poly">{*(clang::TemplateTypeParmType *)this}</DisplayString>
     <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::TemplateTypeParm" IncludeView="cpp">{*(clang::TemplateTypeParmType *)this,view(cpp)}</DisplayString>
     <DisplayString Condition="TypeBits.TC==clang::Type::TypeClass::SubstTemplateTypeParm" IncludeView="poly">{*(clang::SubstTemplateTypeParmType *)this}</DisplayString>
@@ -94,7 +91,6 @@ For later versions of Visual Studio, no setup is required-->
       <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::IncompleteArray">(clang::IncompleteArrayType *)this</ExpandedItem>
       <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::Attributed">*(clang::AttributedType *)this</ExpandedItem>
       <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::Decayed">(clang::DecayedType *)this</ExpandedItem>
-      <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::Elaborated">(clang::ElaboratedType *)this</ExpandedItem>
       <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::TemplateTypeParm">(clang::TemplateTypeParmType *)this</ExpandedItem>
       <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::SubstTemplateTypeParm">(clang::SubstTemplateTypeParmType *)this</ExpandedItem>
       <ExpandedItem ExcludeView="cmn" Condition="TypeBits.TC==clang::Type::TypeClass::Record">(clang::RecordType *)this</ExpandedItem>
@@ -428,16 +424,6 @@ For later versions of Visual Studio, no setup is required-->
       <ExpandedItem>(clang::AdjustedType *)this</ExpandedItem>
     </Expand>
   </Type>
-  <Type Name="clang::ElaboratedType">
-    <DisplayString IncludeView="left">{NamedType,view(left)}</DisplayString>
-    <DisplayString IncludeView="right">{NamedType,view(right)}</DisplayString>
-    <DisplayString>{NamedType}</DisplayString>
-    <Expand>
-      <Item Name="[Keyword]">(clang::ElaboratedTypeKeyword)TypeWithKeywordBits.Keyword</Item>
-      <Item Name="[Nested Name Specifier]">NNS</Item>
-      <Item Name="[Underlying Type]">NamedType,view(cmn)</Item>
-    </Expand>
-  </Type>
   <Type Name="clang::TemplateTypeParmType">
     <DisplayString IncludeView="cpp" Condition="((clang::TemplateTypeParmType *)((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)CanonicalType.Value.Value.Data) &amp; ~(uintptr_t)((1U &lt;&lt; clang::TypeAlignmentInBits) - 1U)))-&gt;BaseType) != this">{TTPDecl->Name,view(cpp)}</DisplayString>
     <DisplayString Condition="((clang::TemplateTypeParmType *)((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)CanonicalType.Value.Value.Data) &amp; ~(uintptr_t)((1U &lt;&lt; clang::TypeAlignmentInBits) - 1U)))-&gt;BaseType) != this">Non-canonical: {*TTPDecl}</DisplayString>

From b3a5870c64c94c361b4144333066382aac0b6dc9 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Tue, 9 Dec 2025 15:58:06 +0000
Subject: [PATCH 11/85] [llvm][docs] Add a release note for LLDB "version -v"

Added by #170772.
---
 llvm/docs/ReleaseNotes.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 22d5b4183fac0..8ec46c661974b 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -253,6 +253,10 @@ Changes to LLDB
   LLVM's PDB and CodeView support. You can switch back to the DIA reader with
   `settings set plugin.symbol-file.pdb.reader dia`. Note that support for the
   DIA reader will be removed in a future version of LLDB.
+* A `--verbose` option was added to the `version` command. When `--verbose` is used,
+  LLDB's build configuration is included in the command's output. This includes
+  all the supported targets, along with the presence of (or lack of) optional
+  features like XML parsing.
 
 Changes to BOLT
 ---------------------------------

From c66eb25459279ce7663bcd51eabaeeffafae0366 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 9 Dec 2025 16:59:24 +0100
Subject: [PATCH 12/85] [OCaml] Fix build

Fix a mistake introduced in https://github.com/llvm/llvm-project/pull/163979:

We should stick with the deprecated LLVMGetGlobalContext() API
in this file, as getGlobalContextForCAPI() is a C++ API that is
not available here.
---
 llvm/bindings/ocaml/llvm/llvm_ocaml.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/bindings/ocaml/llvm/llvm_ocaml.c b/llvm/bindings/ocaml/llvm/llvm_ocaml.c
index 53158c88be19c..a2d948033724c 100644
--- a/llvm/bindings/ocaml/llvm/llvm_ocaml.c
+++ b/llvm/bindings/ocaml/llvm/llvm_ocaml.c
@@ -240,7 +240,7 @@ value llvm_dispose_context(value C) {
 
 /* unit -> llcontext */
 value llvm_global_context(value Unit) {
-  return to_val(getGlobalContextForCAPI());
+  return to_val(LLVMGetGlobalContext());
 }
 
 /* llcontext -> string -> int */

From 7f2bbba60dce67c1eb8cc8b0633d04ca0adfff62 Mon Sep 17 00:00:00 2001
From: valadaptive <79560998+valadaptive@users.noreply.github.com>
Date: Tue, 9 Dec 2025 11:11:26 -0500
Subject: [PATCH 13/85] [AArch64][ARM] Optimize more `tbl`/`tbx` calls into
 `shufflevector` (#169748)

Resolves #169701.

This PR extends the existing InstCombine operation which folds `tbl1`
intrinsics to `shufflevector` if the mask operand is constant. Before
this change, it only handled 64-bit `tbl1` intrinsics with no
out-of-bounds indices. I've extended it to support both 64-bit and
128-bit vectors, and it now handles the full range of `tbl1`-`tbl4` and
`tbx1`-`tbx4`, as long as at most two of the input operands are actually
indexed into.

For the purposes of `tbl`, we need a dummy vector of zeroes if there are
any out-of-bounds indices, and for the purposes of `tbx`, we use the
"fallback" operand. Both of those take up an operand for the purposes of
`shufflevector`.

This works a lot like https://github.com/llvm/llvm-project/pull/169110,
with some added complexity because we need to handle multiple operands.
I raised a couple questions in that PR that still need to be answered:
- Is it correct to check `IsA<UndefValue>` for each mask index, and set
the output mask index to -1 if so? This is later folded to a poison
value, and I'm not sure about the subtle differences between poison and
undef and when you can substitute one for the other. As I mentioned in
#169110, the existing x86 pass (`simplifyX86vpermilvar`) already behaves
this way when it comes to undef.
- How can I write an Alive2 proof for this? It's very hard to find good
documentation or tutorials about Alive2.

As with #169110, most of the regression test cases were generated using
Claude. Everything else was written by me.
---
 .../InstCombine/InstCombineCalls.cpp          | 136 +++++++--
 .../Transforms/InstCombine/AArch64/tbl.ll     | 261 ++++++++++++++++++
 .../Transforms/InstCombine/AArch64/tbl1.ll    |  65 -----
 llvm/test/Transforms/InstCombine/ARM/tbl.ll   | 215 +++++++++++++++
 llvm/test/Transforms/InstCombine/ARM/tbl1.ll  |  35 ---
 5 files changed, 589 insertions(+), 123 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/AArch64/tbl.ll
 delete mode 100644 llvm/test/Transforms/InstCombine/AArch64/tbl1.ll
 create mode 100644 llvm/test/Transforms/InstCombine/ARM/tbl.ll
 delete mode 100644 llvm/test/Transforms/InstCombine/ARM/tbl1.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 8e7282a4ffefe..85602a5a7575a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -737,42 +737,119 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) {
   return nullptr;
 }
 
-/// Convert a table lookup to shufflevector if the mask is constant.
-/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
-/// which case we could lower the shufflevector with rev64 instructions
-/// as it's actually a byte reverse.
-static Value *simplifyNeonTbl1(const IntrinsicInst &II,
-                               InstCombiner::BuilderTy &Builder) {
+/// Convert `tbl`/`tbx` intrinsics to shufflevector if the mask is constant, and
+/// at most two source operands are actually referenced.
+static Instruction *simplifyNeonTbl(IntrinsicInst &II, InstCombiner &IC,
+                                    bool IsExtension) {
   // Bail out if the mask is not a constant.
-  auto *C = dyn_cast<Constant>(II.getArgOperand(1));
+  auto *C = dyn_cast<Constant>(II.getArgOperand(II.arg_size() - 1));
   if (!C)
     return nullptr;
 
-  auto *VecTy = cast<FixedVectorType>(II.getType());
-  unsigned NumElts = VecTy->getNumElements();
+  auto *RetTy = cast<FixedVectorType>(II.getType());
+  unsigned NumIndexes = RetTy->getNumElements();
 
-  // Only perform this transformation for <8 x i8> vector types.
-  if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
+  // Only perform this transformation for <8 x i8> and <16 x i8> vector types.
+  if (!RetTy->getElementType()->isIntegerTy(8) ||
+      (NumIndexes != 8 && NumIndexes != 16))
     return nullptr;
 
-  int Indexes[8];
+  // For tbx instructions, the first argument is the "fallback" vector, which
+  // has the same length as the mask and return type.
+  unsigned int StartIndex = (unsigned)IsExtension;
+  auto *SourceTy =
+      cast<FixedVectorType>(II.getArgOperand(StartIndex)->getType());
+  // Note that the element count of each source vector does *not* need to be the
+  // same as the element count of the return type and mask! All source vectors
+  // must have the same element count as each other, though.
+  unsigned NumElementsPerSource = SourceTy->getNumElements();
+
+  // There are no tbl/tbx intrinsics for which the destination size exceeds the
+  // source size. However, our definitions of the intrinsics, at least in
+  // IntrinsicsAArch64.td, allow for arbitrary destination vector sizes, so it
+  // *could* technically happen.
+  if (NumIndexes > NumElementsPerSource)
+    return nullptr;
+
+  // The tbl/tbx intrinsics take several source operands followed by a mask
+  // operand.
+  unsigned int NumSourceOperands = II.arg_size() - 1 - (unsigned)IsExtension;
 
-  for (unsigned I = 0; I < NumElts; ++I) {
+  // Map input operands to shuffle indices. This also helpfully deduplicates the
+  // input arguments, in case the same value is passed as an argument multiple
+  // times.
+  SmallDenseMap<Value *, unsigned, 2> ValueToShuffleSlot;
+  Value *ShuffleOperands[2] = {PoisonValue::get(SourceTy),
+                               PoisonValue::get(SourceTy)};
+
+  int Indexes[16];
+  for (unsigned I = 0; I < NumIndexes; ++I) {
     Constant *COp = C->getAggregateElement(I);
 
-    if (!COp || !isa<ConstantInt>(COp))
+    if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
       return nullptr;
 
-    Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
+    if (isa<UndefValue>(COp)) {
+      Indexes[I] = -1;
+      continue;
+    }
+
+    uint64_t Index = cast<ConstantInt>(COp)->getZExtValue();
+    // The index of the input argument that this index references (0 = first
+    // source argument, etc).
+    unsigned SourceOperandIndex = Index / NumElementsPerSource;
+    // The index of the element at that source operand.
+    unsigned SourceOperandElementIndex = Index % NumElementsPerSource;
+
+    Value *SourceOperand;
+    if (SourceOperandIndex >= NumSourceOperands) {
+      // This index is out of bounds. Map it to index into either the fallback
+      // vector (tbx) or vector of zeroes (tbl).
+      SourceOperandIndex = NumSourceOperands;
+      if (IsExtension) {
+        // For out-of-bounds indices in tbx, choose the `I`th element of the
+        // fallback.
+        SourceOperand = II.getArgOperand(0);
+        SourceOperandElementIndex = I;
+      } else {
+        // Otherwise, choose some element from the dummy vector of zeroes (we'll
+        // always choose the first).
+        SourceOperand = Constant::getNullValue(SourceTy);
+        SourceOperandElementIndex = 0;
+      }
+    } else {
+      SourceOperand = II.getArgOperand(SourceOperandIndex + StartIndex);
+    }
+
+    // The source operand may be the fallback vector, which may not have the
+    // same number of elements as the source vector. In that case, we *could*
+    // choose to extend its length with another shufflevector, but it's simpler
+    // to just bail instead.
+    if (cast<FixedVectorType>(SourceOperand->getType())->getNumElements() !=
+        NumElementsPerSource)
+      return nullptr;
 
-    // Make sure the mask indices are in range.
-    if ((unsigned)Indexes[I] >= NumElts)
+    // We now know the source operand referenced by this index. Make it a
+    // shufflevector operand, if it isn't already.
+    unsigned NumSlots = ValueToShuffleSlot.size();
+    // This shuffle references more than two sources, and hence cannot be
+    // represented as a shufflevector.
+    if (NumSlots == 2 && !ValueToShuffleSlot.contains(SourceOperand))
       return nullptr;
+
+    auto [It, Inserted] =
+        ValueToShuffleSlot.try_emplace(SourceOperand, NumSlots);
+    if (Inserted)
+      ShuffleOperands[It->getSecond()] = SourceOperand;
+
+    unsigned RemappedIndex =
+        (It->getSecond() * NumElementsPerSource) + SourceOperandElementIndex;
+    Indexes[I] = RemappedIndex;
   }
 
-  auto *V1 = II.getArgOperand(0);
-  auto *V2 = Constant::getNullValue(V1->getType());
-  return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes));
+  Value *Shuf = IC.Builder.CreateShuffleVector(
+      ShuffleOperands[0], ShuffleOperands[1], ArrayRef(Indexes, NumIndexes));
+  return IC.replaceInstUsesWith(II, Shuf);
 }
 
 // Returns true iff the 2 intrinsics have the same operands, limiting the
@@ -3167,10 +3244,23 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     return CallInst::Create(NewFn, CallArgs);
   }
   case Intrinsic::arm_neon_vtbl1:
+  case Intrinsic::arm_neon_vtbl2:
+  case Intrinsic::arm_neon_vtbl3:
+  case Intrinsic::arm_neon_vtbl4:
   case Intrinsic::aarch64_neon_tbl1:
-    if (Value *V = simplifyNeonTbl1(*II, Builder))
-      return replaceInstUsesWith(*II, V);
-    break;
+  case Intrinsic::aarch64_neon_tbl2:
+  case Intrinsic::aarch64_neon_tbl3:
+  case Intrinsic::aarch64_neon_tbl4:
+    return simplifyNeonTbl(*II, *this, /*IsExtension=*/false);
+  case Intrinsic::arm_neon_vtbx1:
+  case Intrinsic::arm_neon_vtbx2:
+  case Intrinsic::arm_neon_vtbx3:
+  case Intrinsic::arm_neon_vtbx4:
+  case Intrinsic::aarch64_neon_tbx1:
+  case Intrinsic::aarch64_neon_tbx2:
+  case Intrinsic::aarch64_neon_tbx3:
+  case Intrinsic::aarch64_neon_tbx4:
+    return simplifyNeonTbl(*II, *this, /*IsExtension=*/true);
 
   case Intrinsic::arm_neon_vmulls:
   case Intrinsic::arm_neon_vmullu:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/tbl.ll b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll
new file mode 100644
index 0000000000000..8a9ca6ce635a3
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll
@@ -0,0 +1,261 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+; We can turn a tbl/tbx intrinsic into a shufflevector instruction if the mask
+; is constant and references 2 or fewer operands.
+
+; Basic tbl1 with all in-bounds indices should optimize to shufflevector.
+define <16 x i8> @tbl1_basic(<16 x i8> %a) {
+; CHECK-LABEL: @tbl1_basic(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <16 x i8> %tbl
+}
+
+; tbl2 with both operands the same should optimize (1 unique source).
+define <16 x i8> @tbl2_duplicate_operands(<16 x i8> %a) {
+; CHECK-LABEL: @tbl2_duplicate_operands(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
+  ret <16 x i8> %tbl
+}
+
+; tbl3 referencing 2 unique operands should optimize.
+define <16 x i8> @tbl3_two_sources(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @tbl3_two_sources(
+; CHECK-NEXT:    [[TBL:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 0, i32 1, i32 2, i32 3, i32 0, i32 0, i32 0, i32 0>
+; CHECK-NEXT:    ret <16 x i8> [[TBL]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 0, i8 0, i8 0, i8 0>)
+  ret <16 x i8> %tbl
+}
+
+; tbl4 with alternating duplicate operands should optimize (2 unique sources).
+define <16 x i8> @tbl4_duplicate_operands(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @tbl4_duplicate_operands(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
+  ret <16 x i8> %tbl
+}
+
+; tbl4 where mask only references first two operands should optimize.
+define <16 x i8> @tbl4_unused_operands(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: @tbl4_unused_operands(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
+  ret <16 x i8> %tbl
+}
+
+; tbl4 where mask only references one operand should optimize.
+define <16 x i8> @tbl4_single_operand_used(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: @tbl4_single_operand_used(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <16 x i8> %tbl
+}
+
+; tbl1 with some OOB indices should optimize (1 source + zero vector = 2 sources).
+define <16 x i8> @tbl1_with_oob(<16 x i8> %a) {
+; CHECK-LABEL: @tbl1_with_oob(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)
+  ret <16 x i8> %tbl
+}
+
+; tbl2 with duplicate operands and OOB should optimize (1 unique source + zero vector = 2 sources).
+define <16 x i8> @tbl2_duplicate_with_oob(<16 x i8> %a) {
+; CHECK-LABEL: @tbl2_duplicate_with_oob(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+  ret <16 x i8> %tbl
+}
+
+; tbl2 with OOB indices should NOT optimize (2 sources + zero vector = 3 sources).
+define <16 x i8> @tbl2_with_oob_bail(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @tbl2_with_oob_bail(
+; CHECK-NEXT:    [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    ret <16 x i8> [[TBL]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+  ret <16 x i8> %tbl
+}
+
+; tbl1 with all OOB indices should optimize to zero vector.
+define <16 x i8> @tbl1_all_oob(<16 x i8> %a) {
+; CHECK-LABEL: @tbl1_all_oob(
+; CHECK-NEXT:    ret <16 x i8> zeroinitializer
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+  ret <16 x i8> %tbl
+}
+
+; tbl3 referencing all 3 operands should NOT optimize.
+define <16 x i8> @tbl3_three_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-LABEL: @tbl3_three_sources_bail(
+; CHECK-NEXT:    [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 0, i8 0, i8 0, i8 0>)
+; CHECK-NEXT:    ret <16 x i8> [[TBL]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 0, i8 0, i8 0, i8 0>)
+  ret <16 x i8> %tbl
+}
+
+; tbl4 referencing 3 unique operands should NOT optimize.
+define <16 x i8> @tbl4_three_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) {
+; CHECK-LABEL: @tbl4_three_sources_bail(
+; CHECK-NEXT:    [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[A]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
+; CHECK-NEXT:    ret <16 x i8> [[TBL]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %a, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
+  ret <16 x i8> %tbl
+}
+
+; tbl4 referencing all 4 unique operands should NOT optimize.
+define <16 x i8> @tbl4_four_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: @tbl4_four_sources_bail(
+; CHECK-NEXT:    [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
+; CHECK-NEXT:    ret <16 x i8> [[TBL]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 32, i8 33, i8 34, i8 35, i8 48, i8 49, i8 50, i8 51>)
+  ret <16 x i8> %tbl
+}
+
+; tbx1 with no OOB should optimize.
+define <16 x i8> @tbx1_no_oob(<16 x i8> %fallback, <16 x i8> %a) {
+; CHECK-LABEL: @tbx1_no_oob(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <16 x i8> %tbx
+}
+
+; tbx2 where fallback == second source operand should optimize (deduplicated).
+define <16 x i8> @tbx2_fallback_equals_second_source(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @tbx2_fallback_equals_second_source(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbx = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+  ret <16 x i8> %tbx
+}
+
+; tbx1 with OOB where fallback == source should optimize (deduplicated).
+define <16 x i8> @tbx1_oob_fallback_same_as_source(<16 x i8> %a) {
+; CHECK-LABEL: @tbx1_oob_fallback_same_as_source(
+; CHECK-NEXT:    [[A:%.*]] = shufflevector <16 x i8> [[A1:%.*]], <16 x i8> poison, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <16 x i8> [[A]]
+;
+  %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+  ret <16 x i8> %tbx
+}
+
+; tbx2 with OOB should NOT optimize (2 sources + fallback = 3 sources).
+define <16 x i8> @tbx2_with_oob_bail(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @tbx2_with_oob_bail(
+; CHECK-NEXT:    [[TBX:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    ret <16 x i8> [[TBX]]
+;
+  %tbx = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 16, i8 17, i8 18, i8 19, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+  ret <16 x i8> %tbx
+}
+
+; tbx1 with all OOB indices should optimize to fallback.
+define <16 x i8> @tbx1_all_oob(<16 x i8> %fallback, <16 x i8> %a) {
+; CHECK-LABEL: @tbx1_all_oob(
+; CHECK-NEXT:    ret <16 x i8> [[FALLBACK:%.*]]
+;
+  %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> <i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+  ret <16 x i8> %tbx
+}
+
+; tbx1 with OOB and mismatched fallback/source sizes should NOT optimize.
+define <8 x i8> @tbx1_fallback_size_mismatch(<8 x i8> %fallback, <16 x i8> %a) {
+; CHECK-LABEL: @tbx1_fallback_size_mismatch(
+; CHECK-NEXT:    [[TBX:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    ret <8 x i8> [[TBX]]
+;
+  %tbx = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %a, <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 99, i8 99, i8 99, i8 99>)
+  ret <8 x i8> %tbx
+}
+
+; tbx1 with no OOB and mismatched fallback/source sizes should optimize.
+define <8 x i8> @tbx1_fallback_size_mismatch_no_oob(<8 x i8> %fallback, <16 x i8> %a) {
+; CHECK-LABEL: @tbx1_fallback_size_mismatch_no_oob(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbx = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %a, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <8 x i8> %tbx
+}
+
+; tbl1 with non-i8 element type should NOT optimize.
+define <8 x i16> @tbl1_8x16(<16 x i8> %vec) {
+; CHECK-LABEL: @tbl1_8x16(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TBL1:%.*]] = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> [[VEC:%.*]], <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+; CHECK-NEXT:    ret <8 x i16> [[TBL1]]
+;
+entry:
+  ; `tbl1.v8i16` is not really a thing, but it's good to check.
+  %tbl1 = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> %vec, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
+  ret <8 x i16> %tbl1
+}
+
+; tbl1 with non-8/16 element count should NOT optimize.
+define <12 x i8> @tbl1_16x8(<16 x i8> %vec) {
+; CHECK-LABEL: @tbl1_16x8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TBL1:%.*]] = call <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8> [[VEC:%.*]], <12 x i8> <i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+; CHECK-NEXT:    ret <12 x i8> [[TBL1]]
+;
+entry:
+  ; `tbl1.v12i8` is not really a thing, but it's good to check.
+  %tbl1 = call <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8> %vec, <12 x i8> <i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <12 x i8> %tbl1
+}
+
+; Non-constant mask should NOT optimize.
+define <16 x i8> @tbl1_non_constant_mask(<16 x i8> %a, <16 x i8> %mask) {
+; CHECK-LABEL: @tbl1_non_constant_mask(
+; CHECK-NEXT:    [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[MASK:%.*]])
+; CHECK-NEXT:    ret <16 x i8> [[TBL]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %mask)
+  ret <16 x i8> %tbl
+}
+
+; Mask with some poison elements should optimize, with poison propagating to output.
+define <16 x i8> @tbl1_poison_mask_elements(<16 x i8> %a) {
+; CHECK-LABEL: @tbl1_poison_mask_elements(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> <i32 0, i32 poison, i32 2, i32 poison, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> <i8 0, i8 poison, i8 2, i8 poison, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
+  ret <16 x i8> %tbl
+}
+
+; Mask with all poison elements should optimize to poison.
+define <16 x i8> @tbl1_all_poison_mask(<16 x i8> %a) {
+; CHECK-LABEL: @tbl1_all_poison_mask(
+; CHECK-NEXT:    ret <16 x i8> poison
+;
+  %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> poison)
+  ret <16 x i8> %tbl
+}
diff --git a/llvm/test/Transforms/InstCombine/AArch64/tbl1.ll b/llvm/test/Transforms/InstCombine/AArch64/tbl1.ll
deleted file mode 100644
index 362cc0f6c4493..0000000000000
--- a/llvm/test/Transforms/InstCombine/AArch64/tbl1.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64"
-
-; Turning a table lookup intrinsic into a shuffle vector instruction
-; can be beneficial. If the mask used for the lookup is the constant
-; vector {7,6,5,4,3,2,1,0}, then the back-end generates rev64
-; instructions instead.
-
-define <8 x i8> @tbl1_8x8(<16 x i8> %vec) {
-; CHECK-LABEL: @tbl1_8x8(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[VEC:%.*]], <16 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    ret <8 x i8> [[TMP0]]
-;
-entry:
-  %tbl1 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vec, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-  ret <8 x i8> %tbl1
-}
-
-; Bail the optimization if a mask index is out of range.
-define <8 x i8> @tbl1_8x8_out_of_range(<16 x i8> %vec) {
-; CHECK-LABEL: @tbl1_8x8_out_of_range(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TBL1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VEC:%.*]], <8 x i8> <i8 8, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; CHECK-NEXT:    ret <8 x i8> [[TBL1]]
-;
-entry:
-  %tbl1 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vec, <8 x i8> <i8 8, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-  ret <8 x i8> %tbl1
-}
-
-; Bail the optimization if the size of the return vector is not 8 elements.
-define <16 x i8> @tbl1_16x8(<16 x i8> %vec) {
-; CHECK-LABEL: @tbl1_16x8(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TBL1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[VEC:%.*]], <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; CHECK-NEXT:    ret <16 x i8> [[TBL1]]
-;
-entry:
-  %tbl1 = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %vec, <16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-  ret <16 x i8> %tbl1
-}
-
-; Bail the optimization if the elements of the return vector are not of type i8.
-define <8 x i16> @tbl1_8x16(<16 x i8> %vec) {
-; CHECK-LABEL: @tbl1_8x16(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TBL1:%.*]] = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> [[VEC:%.*]], <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-; CHECK-NEXT:    ret <8 x i16> [[TBL1]]
-;
-entry:
-  %tbl1 = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> %vec, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
-  ret <8 x i16> %tbl1
-}
-
-; The type <8 x i16> is not a valid return type for this intrinsic,
-; but we want to test that the optimization won't trigger for vector
-; elements of type different than i8.
-declare <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8>, <8 x i16>)
-
-declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>)
-declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>)
diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl.ll b/llvm/test/Transforms/InstCombine/ARM/tbl.ll
new file mode 100644
index 0000000000000..d4d5ec284d0b7
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/ARM/tbl.ll
@@ -0,0 +1,215 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv8-arm-none-eabi"
+
+; We can turn a vtbl/vtbx intrinsic into a shufflevector instruction if the mask
+; is constant and references 2 or fewer operands.
+
+; Basic vtbl1 with all in-bounds indices should optimize to shufflevector.
+define <8 x i8> @vtbl1_basic(<8 x i8> %a) {
+; CHECK-LABEL: @vtbl1_basic(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl2 with both operands the same should be optimized (1 unique source).
+define <8 x i8> @vtbl2_duplicate_operands(<8 x i8> %a) {
+; CHECK-LABEL: @vtbl2_duplicate_operands(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %a, <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl3 referencing 2 unique operands should optimize.
+define <8 x i8> @vtbl3_two_sources(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: @vtbl3_two_sources(
+; CHECK-NEXT:    [[TBL:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 0, i32 1, i32 0, i32 0>
+; CHECK-NEXT:    ret <8 x i8> [[TBL]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %a, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 16, i8 17, i8 0, i8 0>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl4 with alternating duplicate operands should optimize (2 unique sources).
+define <8 x i8> @vtbl4_duplicate_operands(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: @vtbl4_duplicate_operands(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 0, i32 1, i32 8, i32 9>
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 16, i8 17, i8 24, i8 25>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl4 where mask only references first two operands should optimize.
+define <8 x i8> @vtbl4_unused_operands(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
+; CHECK-LABEL: @vtbl4_unused_operands(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl4 where mask only references one operand should optimize.
+define <8 x i8> @vtbl4_single_operand_used(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
+; CHECK-LABEL: @vtbl4_single_operand_used(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl1 with some OOB indices should optimize (1 source + zero vector = 2 sources).
+define <8 x i8> @vtbl1_with_oob(<8 x i8> %a) {
+; CHECK-LABEL: @vtbl1_with_oob(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> <i8 0, i8 1, i8 2, i8 3, i8 8, i8 9, i8 10, i8 11>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl2 with duplicate operands and OOB should optimize (1 unique source + zero vector = 2 sources).
+define <8 x i8> @vtbl2_duplicate_with_oob(<8 x i8> %a) {
+; CHECK-LABEL: @vtbl2_duplicate_with_oob(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> <i8 0, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 8, i32 8, i32 8, i32 8>
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %a, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 99, i8 99, i8 99, i8 99>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl2 with OOB indices should NOT optimize (2 sources + zero vector = 3 sources).
+define <8 x i8> @vtbl2_with_oob_bail(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: @vtbl2_with_oob_bail(
+; CHECK-NEXT:    [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    ret <8 x i8> [[TBL]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 99, i8 99, i8 99, i8 99>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl1 with all OOB indices should optimize to zero vector.
+define <8 x i8> @vtbl1_all_oob(<8 x i8> %a) {
+; CHECK-LABEL: @vtbl1_all_oob(
+; CHECK-NEXT:    ret <8 x i8> zeroinitializer
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> <i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl3 referencing all 3 operands should NOT optimize.
+define <8 x i8> @vtbl3_three_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK-LABEL: @vtbl3_three_sources_bail(
+; CHECK-NEXT:    [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 16, i8 17, i8 0, i8 0>)
+; CHECK-NEXT:    ret <8 x i8> [[TBL]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 16, i8 17, i8 0, i8 0>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl4 referencing 3 unique operands should NOT optimize.
+define <8 x i8> @vtbl4_three_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) {
+; CHECK-LABEL: @vtbl4_three_sources_bail(
+; CHECK-NEXT:    [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[A]], <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 16, i8 17, i8 24, i8 25>)
+; CHECK-NEXT:    ret <8 x i8> [[TBL]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %a, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 16, i8 17, i8 24, i8 25>)
+  ret <8 x i8> %tbl
+}
+
+; vtbl4 referencing all 4 unique operands should NOT optimize.
+define <8 x i8> @vtbl4_four_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
+; CHECK-LABEL: @vtbl4_four_sources_bail(
+; CHECK-NEXT:    [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 16, i8 17, i8 24, i8 25>)
+; CHECK-NEXT:    ret <8 x i8> [[TBL]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 16, i8 17, i8 24, i8 25>)
+  ret <8 x i8> %tbl
+}
+
+; vtbx1 with no OOB should optimize.
+define <8 x i8> @vtbx1_no_oob(<8 x i8> %fallback, <8 x i8> %a) {
+; CHECK-LABEL: @vtbx1_no_oob(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
+  ret <8 x i8> %tbx
+}
+
+; vtbx2 where fallback == second source operand should optimize (deduplicated).
+define <8 x i8> @vtbx2_fallback_equals_second_source(<8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: @vtbx2_fallback_equals_second_source(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbx = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %b, <8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 99, i8 99, i8 99, i8 99>)
+  ret <8 x i8> %tbx
+}
+
+; vtbx1 with OOB where fallback == source should optimize (deduplicated).
+define <8 x i8> @vtbx1_oob_fallback_same_as_source(<8 x i8> %a) {
+; CHECK-LABEL: @vtbx1_oob_fallback_same_as_source(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %a, <8 x i8> <i8 3, i8 2, i8 1, i8 0, i8 99, i8 99, i8 99, i8 99>)
+  ret <8 x i8> %tbx
+}
+
+; vtbx2 with OOB should NOT optimize (2 sources + fallback = 3 sources).
+define <8 x i8> @vtbx2_with_oob_bail(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> %b) {
+; CHECK-LABEL: @vtbx2_with_oob_bail(
+; CHECK-NEXT:    [[TBX:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[FALLBACK:%.*]], <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 99, i8 99, i8 99, i8 99>)
+; CHECK-NEXT:    ret <8 x i8> [[TBX]]
+;
+  %tbx = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 0, i8 1, i8 8, i8 9, i8 99, i8 99, i8 99, i8 99>)
+  ret <8 x i8> %tbx
+}
+
+; vtbx1 with all OOB indices should optimize to fallback.
+define <8 x i8> @vtbx1_all_oob(<8 x i8> %fallback, <8 x i8> %a) {
+; CHECK-LABEL: @vtbx1_all_oob(
+; CHECK-NEXT:    ret <8 x i8> [[FALLBACK:%.*]]
+;
+  %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> <i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99, i8 99>)
+  ret <8 x i8> %tbx
+}
+
+; Non-constant mask should NOT optimize.
+define <8 x i8> @vtbl1_non_constant_mask(<8 x i8> %a, <8 x i8> %mask) {
+; CHECK-LABEL: @vtbl1_non_constant_mask(
+; CHECK-NEXT:    [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> [[MASK:%.*]])
+; CHECK-NEXT:    ret <8 x i8> [[TBL]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %mask)
+  ret <8 x i8> %tbl
+}
+
+; Mask with some poison elements should optimize, with poison propagating to output.
+define <8 x i8> @vtbl1_poison_mask_elements(<8 x i8> %a) {
+; CHECK-LABEL: @vtbl1_poison_mask_elements(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> <i32 0, i32 poison, i32 2, i32 poison, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> <i8 0, i8 poison, i8 2, i8 poison, i8 4, i8 5, i8 6, i8 7>)
+  ret <8 x i8> %tbl
+}
+
+; Mask with all poison elements should optimize to poison.
+define <8 x i8> @vtbl1_all_poison_mask(<8 x i8> %a) {
+; CHECK-LABEL: @vtbl1_all_poison_mask(
+; CHECK-NEXT:    ret <8 x i8> poison
+;
+  %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> poison)
+  ret <8 x i8> %tbl
+}
diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl1.ll b/llvm/test/Transforms/InstCombine/ARM/tbl1.ll
deleted file mode 100644
index fbec1a2bb7a07..0000000000000
--- a/llvm/test/Transforms/InstCombine/ARM/tbl1.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
-
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
-target triple = "armv8-arm-none-eabi"
-
-; Turning a table lookup intrinsic into a shuffle vector instruction
-; can be beneficial. If the mask used for the lookup is the constant
-; vector {7,6,5,4,3,2,1,0}, then the back-end generates rev64
-; instructions instead.
-
-define <8 x i8> @tbl1_8x8(<8 x i8> %vec) {
-; CHECK-LABEL: @tbl1_8x8(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC:%.*]], <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    ret <8 x i8> [[TMP0]]
-;
-entry:
-  %vtbl1 = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %vec, <8 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-  ret <8 x i8> %vtbl1
-}
-
-; Bail the optimization if a mask index is out of range.
-define <8 x i8> @tbl1_8x8_out_of_range(<8 x i8> %vec) {
-; CHECK-LABEL: @tbl1_8x8_out_of_range(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VTBL1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[VEC:%.*]], <8 x i8> <i8 8, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-; CHECK-NEXT:    ret <8 x i8> [[VTBL1]]
-;
-entry:
-  %vtbl1 = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %vec, <8 x i8> <i8 8, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
-  ret <8 x i8> %vtbl1
-}
-
-declare <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8>, <8 x i8>)

From 1a66474ca0f995868367fd4c22aa1259dcc6cf96 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas <alexandros.lamprineas@arm.com>
Date: Tue, 9 Dec 2025 16:15:07 +0000
Subject: [PATCH 14/85] [clang][FMV][AArch64] Remove O3 from failing test
 (#171457)

This fixes the buildbot failures from
https://github.com/llvm/llvm-project/pull/150267.

I could not reproduce them locally but my intuition suggests that the
-O3 option on the RUN line behaves incosistently on different hosts
judging from the error logs.

My intention was to run an integration test which will use llvm's
globalopt pass, but there's no need actually. We have unittests in place
for it.
---
 .../CodeGen/AArch64/fmv-explicit-priority.c   | 208 +++++++-----------
 1 file changed, 75 insertions(+), 133 deletions(-)

diff --git a/clang/test/CodeGen/AArch64/fmv-explicit-priority.c b/clang/test/CodeGen/AArch64/fmv-explicit-priority.c
index 1abf330ffee49..b3c544124afa0 100644
--- a/clang/test/CodeGen/AArch64/fmv-explicit-priority.c
+++ b/clang/test/CodeGen/AArch64/fmv-explicit-priority.c
@@ -1,204 +1,146 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -O3 -fno-inline -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s
 
 __attribute__((target_version("lse;priority=30"))) int foo(void) { return 1; }
-__attribute__((target_version("sve2;priority=20"))) int foo(void) { return 2; }
+__attribute__((target_version("aes;priority=20"))) int foo(void) { return 2; }
 __attribute__((target_version("sve;priority=10"))) int foo(void) { return 3; }
 __attribute__((target_version("default"))) int foo(void) { return 0; }
 
-__attribute__((target_clones("lse+sve2;priority=3", "lse;priority=2", "sve;priority=1", "default")))
-int fmv_caller(void) { return foo(); }
-
-
-__attribute__((target_version("aes"))) int bar(void) { return 1; }
-__attribute__((target_version("sm4;priority=5"))) int bar(void) { return 2; }
-__attribute__((target_version("default"))) int bar(void) { return 0; }
-
-__attribute__((target("aes"))) int regular_caller_aes() { return bar(); }
-__attribute__((target("sm4"))) int regular_caller_sm4() { return bar(); }
+__attribute__((target_clones("sme;priority=3", "bti;priority=2", "mops;priority=1", "default"))) int bar(void) { return 0; }
 //.
-// CHECK: @__aarch64_cpu_features = external dso_local local_unnamed_addr global { i64 }
+// CHECK: @__aarch64_cpu_features = external dso_local global { i64 }
 // CHECK: @foo = weak_odr ifunc i32 (), ptr @foo.resolver
-// CHECK: @fmv_caller = weak_odr ifunc i32 (), ptr @fmv_caller.resolver
 // CHECK: @bar = weak_odr ifunc i32 (), ptr @bar.resolver
 //.
-// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none)
+// CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@foo._Mlse
 // CHECK-SAME: () #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 1
 //
 //
-// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16)
-// CHECK-LABEL: define {{[^@]+}}@foo._Msve2
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@foo._Maes
 // CHECK-SAME: () #[[ATTR1:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 2
 //
 //
-// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16)
+// CHECK: Function Attrs: noinline nounwind optnone vscale_range(1,16)
 // CHECK-LABEL: define {{[^@]+}}@foo._Msve
 // CHECK-SAME: () #[[ATTR2:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 3
 //
 //
-// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none)
+// CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@foo.default
 // CHECK-SAME: () #[[ATTR3:[0-9]+]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
 //
-// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16)
-// CHECK-LABEL: define {{[^@]+}}@fmv_caller._MlseMsve2
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@bar._Msme
 // CHECK-SAME: () #[[ATTR4:[0-9]+]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @foo._Mlse()
-// CHECK-NEXT:    ret i32 [[CALL]]
+// CHECK-NEXT:    ret i32 0
 //
 //
-// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16)
-// CHECK-LABEL: define {{[^@]+}}@fmv_caller._Mlse
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@bar._Mbti
 // CHECK-SAME: () #[[ATTR5:[0-9]+]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @foo._Mlse()
-// CHECK-NEXT:    ret i32 [[CALL]]
+// CHECK-NEXT:    ret i32 0
 //
 //
-// CHECK: Function Attrs: noinline nounwind vscale_range(1,16)
-// CHECK-LABEL: define {{[^@]+}}@fmv_caller._Msve
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@bar._Mmops
 // CHECK-SAME: () #[[ATTR6:[0-9]+]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @foo() #[[ATTR13:[0-9]+]]
-// CHECK-NEXT:    ret i32 [[CALL]]
-//
-//
-// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16)
-// CHECK-LABEL: define {{[^@]+}}@fmv_caller.default
-// CHECK-SAME: () #[[ATTR7:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @foo.default()
-// CHECK-NEXT:    ret i32 [[CALL]]
-//
-//
-// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none)
-// CHECK-LABEL: define {{[^@]+}}@bar._Maes
-// CHECK-SAME: () #[[ATTR8:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 1
-//
-//
-// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none)
-// CHECK-LABEL: define {{[^@]+}}@bar._Msm4
-// CHECK-SAME: () #[[ATTR9:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    ret i32 2
+// CHECK-NEXT:    ret i32 0
 //
 //
-// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none)
+// CHECK: Function Attrs: noinline nounwind optnone
 // CHECK-LABEL: define {{[^@]+}}@bar.default
 // CHECK-SAME: () #[[ATTR3]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    ret i32 0
 //
 //
-// CHECK: Function Attrs: noinline nounwind
-// CHECK-LABEL: define {{[^@]+}}@regular_caller_aes
-// CHECK-SAME: () local_unnamed_addr #[[ATTR10:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @bar() #[[ATTR13]]
-// CHECK-NEXT:    ret i32 [[CALL]]
-//
-//
-// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none)
-// CHECK-LABEL: define {{[^@]+}}@regular_caller_sm4
-// CHECK-SAME: () local_unnamed_addr #[[ATTR11:[0-9]+]] {
-// CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[CALL:%.*]] = tail call i32 @bar._Msm4()
-// CHECK-NEXT:    ret i32 [[CALL]]
-//
-//
 // CHECK: Function Attrs: disable_sanitizer_instrumentation
 // CHECK-LABEL: define {{[^@]+}}@foo.resolver
-// CHECK-SAME: () #[[ATTR12:[0-9]+]] comdat {
+// CHECK-SAME: () #[[ATTR7:[0-9]+]] comdat {
 // CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    tail call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
 // CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 128
-// CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i64 [[TMP1]], 0
-// CHECK-NEXT:    br i1 [[DOTNOT]], label [[RESOLVER_ELSE:%.*]], label [[COMMON_RET:%.*]]
-// CHECK:       common.ret:
-// CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = phi ptr [ @foo._Mlse, [[RESOLVER_ENTRY:%.*]] ], [ @foo._Msve2, [[RESOLVER_ELSE]] ], [ [[FOO__MSVE_FOO_DEFAULT:%.*]], [[RESOLVER_ELSE2:%.*]] ]
-// CHECK-NEXT:    ret ptr [[COMMON_RET_OP]]
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 128
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @foo._Mlse
 // CHECK:       resolver_else:
-// CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[TMP0]], 69793284352
-// CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 69793284352
-// CHECK-NEXT:    br i1 [[TMP3]], label [[COMMON_RET]], label [[RESOLVER_ELSE2]]
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 33536
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 33536
+// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK:       resolver_return1:
+// CHECK-NEXT:    ret ptr @foo._Maes
 // CHECK:       resolver_else2:
-// CHECK-NEXT:    [[TMP4:%.*]] = and i64 [[TMP0]], 1073807616
-// CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 1073807616
-// CHECK-NEXT:    [[FOO__MSVE_FOO_DEFAULT]] = select i1 [[TMP5]], ptr @foo._Msve, ptr @foo.default
-// CHECK-NEXT:    br label [[COMMON_RET]]
+// CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = and i64 [[TMP8]], 1073807616
+// CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 1073807616
+// CHECK-NEXT:    [[TMP11:%.*]] = and i1 true, [[TMP10]]
+// CHECK-NEXT:    br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]]
+// CHECK:       resolver_return3:
+// CHECK-NEXT:    ret ptr @foo._Msve
+// CHECK:       resolver_else4:
+// CHECK-NEXT:    ret ptr @foo.default
 //
 //
 // CHECK: Function Attrs: disable_sanitizer_instrumentation
-// CHECK-LABEL: define {{[^@]+}}@fmv_caller.resolver
-// CHECK-SAME: () #[[ATTR12]] comdat {
+// CHECK-LABEL: define {{[^@]+}}@bar.resolver
+// CHECK-SAME: () #[[ATTR7]] comdat {
 // CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    tail call void @__init_cpu_features_resolver()
+// CHECK-NEXT:    call void @__init_cpu_features_resolver()
 // CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 69793284480
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 69793284480
-// CHECK-NEXT:    br i1 [[TMP2]], label [[COMMON_RET:%.*]], label [[RESOLVER_ELSE:%.*]]
-// CHECK:       common.ret:
-// CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = phi ptr [ @fmv_caller._MlseMsve2, [[RESOLVER_ENTRY:%.*]] ], [ @fmv_caller._Mlse, [[RESOLVER_ELSE]] ], [ [[FMV_CALLER__MSVE_FMV_CALLER_DEFAULT:%.*]], [[RESOLVER_ELSE2:%.*]] ]
-// CHECK-NEXT:    ret ptr [[COMMON_RET_OP]]
+// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 4398180795136
+// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 4398180795136
+// CHECK-NEXT:    [[TMP3:%.*]] = and i1 true, [[TMP2]]
+// CHECK-NEXT:    br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]]
+// CHECK:       resolver_return:
+// CHECK-NEXT:    ret ptr @bar._Msme
 // CHECK:       resolver_else:
-// CHECK-NEXT:    [[TMP3:%.*]] = and i64 [[TMP0]], 128
-// CHECK-NEXT:    [[DOTNOT:%.*]] = icmp eq i64 [[TMP3]], 0
-// CHECK-NEXT:    br i1 [[DOTNOT]], label [[RESOLVER_ELSE2]], label [[COMMON_RET]]
+// CHECK-NEXT:    [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = and i64 [[TMP4]], 1125899906842624
+// CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 1125899906842624
+// CHECK-NEXT:    [[TMP7:%.*]] = and i1 true, [[TMP6]]
+// CHECK-NEXT:    br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]]
+// CHECK:       resolver_return1:
+// CHECK-NEXT:    ret ptr @bar._Mbti
 // CHECK:       resolver_else2:
-// CHECK-NEXT:    [[TMP4:%.*]] = and i64 [[TMP0]], 1073807616
-// CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 1073807616
-// CHECK-NEXT:    [[FMV_CALLER__MSVE_FMV_CALLER_DEFAULT]] = select i1 [[TMP5]], ptr @fmv_caller._Msve, ptr @fmv_caller.default
-// CHECK-NEXT:    br label [[COMMON_RET]]
-//
-//
-// CHECK: Function Attrs: disable_sanitizer_instrumentation
-// CHECK-LABEL: define {{[^@]+}}@bar.resolver
-// CHECK-SAME: () #[[ATTR12]] comdat {
-// CHECK-NEXT:  resolver_entry:
-// CHECK-NEXT:    tail call void @__init_cpu_features_resolver()
-// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
-// CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 800
-// CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 800
-// CHECK-NEXT:    [[TMP3:%.*]] = and i64 [[TMP0]], 33536
-// CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[TMP3]], 33536
-// CHECK-NEXT:    [[BAR__MAES_BAR_DEFAULT:%.*]] = select i1 [[TMP4]], ptr @bar._Maes, ptr @bar.default
-// CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = select i1 [[TMP2]], ptr @bar._Msm4, ptr [[BAR__MAES_BAR_DEFAULT]]
-// CHECK-NEXT:    ret ptr [[COMMON_RET_OP]]
+// CHECK-NEXT:    [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8
+// CHECK-NEXT:    [[TMP9:%.*]] = and i64 [[TMP8]], 576460752303423488
+// CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 576460752303423488
+// CHECK-NEXT:    [[TMP11:%.*]] = and i1 true, [[TMP10]]
+// CHECK-NEXT:    br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]]
+// CHECK:       resolver_return3:
+// CHECK-NEXT:    ret ptr @bar._Mmops
+// CHECK:       resolver_else4:
+// CHECK-NEXT:    ret ptr @bar.default
 //
 //.
-// CHECK: attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) "fmv-features"="P1,P2,P3,P4,lse" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse" }
-// CHECK: attributes #[[ATTR1]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16) "fmv-features"="P2,P4,sve2" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve,+sve2" }
-// CHECK: attributes #[[ATTR2]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16) "fmv-features"="P1,P3,sve" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" }
-// CHECK: attributes #[[ATTR3]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) "fmv-features" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CHECK: attributes #[[ATTR4]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16) "fmv-features"="P0,P1,lse,sve2" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+lse,+sve,+sve2" }
-// CHECK: attributes #[[ATTR5]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16) "fmv-features"="P1,lse" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse" }
-// CHECK: attributes #[[ATTR6]] = { noinline nounwind vscale_range(1,16) "fmv-features"="P0,sve" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" }
-// CHECK: attributes #[[ATTR7]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16) "fmv-features" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CHECK: attributes #[[ATTR8]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) "fmv-features"="aes" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+fp-armv8,+neon" }
-// CHECK: attributes #[[ATTR9]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) "fmv-features"="P0,P2,sm4" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+sm4" }
-// CHECK: attributes #[[ATTR10]] = { noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+fp-armv8,+neon" }
-// CHECK: attributes #[[ATTR11]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+sm4" }
-// CHECK: attributes #[[ATTR12]] = { disable_sanitizer_instrumentation }
-// CHECK: attributes #[[ATTR13]] = { nounwind }
+// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "fmv-features"="P1,P2,P3,P4,lse" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse" }
+// CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "fmv-features"="P2,P4,aes" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+fp-armv8,+neon" }
+// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone vscale_range(1,16) "fmv-features"="P1,P3,sve" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" }
+// CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "fmv-features" "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "fmv-features"="P0,P1,sme" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme" }
+// CHECK: attributes #[[ATTR5]] = { noinline nounwind optnone "fmv-features"="P1,bti" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti" }
+// CHECK: attributes #[[ATTR6]] = { noinline nounwind optnone "fmv-features"="P0,mops" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops" }
+// CHECK: attributes #[[ATTR7]] = { disable_sanitizer_instrumentation }
 //.
 // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
 // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
-// CHECK: [[META2:![0-9]+]] = !{[[META3:![0-9]+]], [[META3]], i64 0}
-// CHECK: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0}
-// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0}
-// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"}
 //.

From c61a481a231025a90ef12e4f9a04bedb04eeeb99 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 9 Dec 2025 16:19:13 +0000
Subject: [PATCH 15/85] [VPlan] Use SCEV to prove non-aliasing for stores at
 different offsets. (#170347)

Extend the logic add in https://github.com/llvm/llvm-project/pull/168771
to also allow sinking stores past stores in the same noalias set by
checking if we can prove no-alias via the distance between accesses,
checked via SCEV.

PR: https://github.com/llvm/llvm-project/pull/170347
---
 .../Transforms/Vectorize/VPlanTransforms.cpp  | 96 ++++++++++++++++---
 ...predicated-loads-with-predicated-stores.ll | 48 ++++------
 2 files changed, 99 insertions(+), 45 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index eb078c783d5f7..852196e589c59 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -44,6 +44,7 @@
 
 using namespace llvm;
 using namespace VPlanPatternMatch;
+using namespace SCEVPatternMatch;
 
 bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
     VPlan &Plan,
@@ -139,14 +140,77 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
   return true;
 }
 
-// Check if a memory operation doesn't alias with memory operations in blocks
-// between FirstBB and LastBB using scoped noalias metadata.
-// For load hoisting, we only check writes in one direction.
-// For store sinking, we check both reads and writes bidirectionally.
-static bool canHoistOrSinkWithNoAliasCheck(
-    const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
-    bool CheckReads,
-    const SmallPtrSetImpl<VPRecipeBase *> *ExcludeRecipes = nullptr) {
+/// Helper for extra no-alias checks via known-safe recipe and SCEV.
+class SinkStoreInfo {
+  const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
+  VPReplicateRecipe &GroupLeader;
+  ScalarEvolution &SE;
+  const Loop &L;
+  VPTypeAnalysis &TypeInfo;
+
+  // Return true if \p A and \p B are known to not alias for all VFs in the
+  // plan, checked via the distance between the accesses
+  bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
+    if (A->getOpcode() != Instruction::Store ||
+        B->getOpcode() != Instruction::Store)
+      return false;
+
+    VPValue *AddrA = A->getOperand(1);
+    const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, SE, &L);
+    VPValue *AddrB = B->getOperand(1);
+    const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, SE, &L);
+    if (isa<SCEVCouldNotCompute>(SCEVA) || isa<SCEVCouldNotCompute>(SCEVB))
+      return false;
+
+    const APInt *Distance;
+    if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
+      return false;
+
+    const DataLayout &DL = SE.getDataLayout();
+    Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
+    uint64_t SizeA = DL.getTypeStoreSize(TyA);
+    Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
+    uint64_t SizeB = DL.getTypeStoreSize(TyB);
+
+    // Use the maximum store size to ensure no overlap from either direction.
+    // Currently only handles fixed sizes, as it is only used for
+    // replicating VPReplicateRecipes.
+    uint64_t MaxStoreSize = std::max(SizeA, SizeB);
+
+    auto VFs = B->getParent()->getPlan()->vectorFactors();
+    ElementCount MaxVF = *max_element(VFs, ElementCount::isKnownLT);
+    return Distance->abs().uge(
+        MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
+  }
+
+public:
+  SinkStoreInfo(const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes,
+                VPReplicateRecipe &GroupLeader, ScalarEvolution &SE,
+                const Loop &L, VPTypeAnalysis &TypeInfo)
+      : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), SE(SE), L(L),
+        TypeInfo(TypeInfo) {}
+
+  /// Return true if \p R should be skipped during alias checking, either
+  /// because it's in the exclude set or because no-alias can be proven via
+  /// SCEV.
+  bool shouldSkip(VPRecipeBase &R) const {
+    auto *Store = dyn_cast<VPReplicateRecipe>(&R);
+    return ExcludeRecipes.contains(&R) ||
+           (Store && isNoAliasViaDistance(Store, &GroupLeader));
+  }
+};
+
+/// Check if a memory operation doesn't alias with memory operations in blocks
+/// between \p FirstBB and \p LastBB using scoped noalias metadata. If
+/// \p SinkInfo is std::nullopt, only recipes that may write to memory are
+/// checked (for load hoisting). Otherwise recipes that both read and write
+/// memory are checked, and SCEV is used to prove no-alias between the group
+/// leader and other replicate recipes (for store sinking).
+static bool
+canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc,
+                               VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
+                               std::optional<SinkStoreInfo> SinkInfo = {}) {
+  bool CheckReads = SinkInfo.has_value();
   if (!MemLoc.AATags.Scope)
     return false;
 
@@ -158,7 +222,7 @@ static bool canHoistOrSinkWithNoAliasCheck(
            "Expected at most one successor in block chain");
     auto *VPBB = cast<VPBasicBlock>(Block);
     for (VPRecipeBase &R : *VPBB) {
-      if (ExcludeRecipes && ExcludeRecipes->contains(&R))
+      if (SinkInfo && SinkInfo->shouldSkip(R))
         continue;
 
       // Skip recipes that don't need checking.
@@ -4273,8 +4337,7 @@ void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE,
 
     // Check that the load doesn't alias with stores between first and last.
     auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
-    if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB,
-                                                    /*CheckReads=*/false))
+    if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
       continue;
 
     // Collect common metadata from all loads in the group.
@@ -4301,7 +4364,9 @@ void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE,
 }
 
 static bool
-canSinkStoreWithNoAliasCheck(ArrayRef<VPReplicateRecipe *> StoresToSink) {
+canSinkStoreWithNoAliasCheck(ArrayRef<VPReplicateRecipe *> StoresToSink,
+                             ScalarEvolution &SE, const Loop &L,
+                             VPTypeAnalysis &TypeInfo) {
   auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
   if (!StoreLoc || !StoreLoc->AATags.Scope)
     return false;
@@ -4313,8 +4378,8 @@ canSinkStoreWithNoAliasCheck(ArrayRef<VPReplicateRecipe *> StoresToSink) {
 
   VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
   VPBasicBlock *LastBB = StoresToSink.back()->getParent();
-  return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB,
-                                        /*CheckReads=*/true, &StoresToSinkSet);
+  SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], SE, L, TypeInfo);
+  return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
 }
 
 void VPlanTransforms::sinkPredicatedStores(VPlan &Plan, ScalarEvolution &SE,
@@ -4325,13 +4390,14 @@ void VPlanTransforms::sinkPredicatedStores(VPlan &Plan, ScalarEvolution &SE,
     return;
 
   VPDominatorTree VPDT(Plan);
+  VPTypeAnalysis TypeInfo(Plan);
 
   for (auto &Group : Groups) {
     sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
       return VPDT.properlyDominates(A, B);
     });
 
-    if (!canSinkStoreWithNoAliasCheck(Group))
+    if (!canSinkStoreWithNoAliasCheck(Group, SE, *L, TypeInfo))
       continue;
 
     // Use the last (most dominated) store's location for the unconditional
diff --git a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll
index cdbe9bb555834..7450fcccbb484 100644
--- a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll
+++ b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll
@@ -764,7 +764,7 @@ define void @sink_multiple_store_groups_noalias_via_scev(ptr %dst, ptr %src) {
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ]
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE3:.*]] ]
 ; CHECK-NEXT:    [[INDEX:%.*]] = mul i64 [[INDEX1]], 16
 ; CHECK-NEXT:    [[IV:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP17:%.*]] = add i64 [[INDEX]], 16
@@ -781,42 +781,30 @@ define void @sink_multiple_store_groups_noalias_via_scev(ptr %dst, ptr %src) {
 ; CHECK-NEXT:    [[TMP14:%.*]] = load double, ptr [[TMP22]], align 8, !alias.scope [[META78]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x double> poison, double [[TMP13]], i32 0
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = insertelement <2 x double> [[TMP15]], double [[TMP14]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = xor <2 x i1> [[TMP10]], splat (i1 true)
 ; CHECK-NEXT:    [[TMP34:%.*]] = fadd <2 x double> [[WIDE_LOAD]], splat (double 8.000000e+00)
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <2 x i1> [[TMP16]], i32 0
-; CHECK-NEXT:    br i1 [[TMP24]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
-; CHECK:       [[PRED_STORE_IF]]:
 ; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr double, ptr [[DST]], i64 [[IV]]
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x double> [[TMP34]], i32 0
-; CHECK-NEXT:    store double [[TMP19]], ptr [[TMP18]], align 8, !alias.scope [[META81:![0-9]+]], !noalias [[META78]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP17]]
+; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP18]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x ptr> [[TMP31]], ptr [[TMP21]], i32 1
+; CHECK-NEXT:    [[TMP20:%.*]] = select <2 x i1> [[TMP10]], <2 x double> [[WIDE_LOAD]], <2 x double> [[TMP34]]
+; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <2 x double> [[TMP20]], i32 0
+; CHECK-NEXT:    store double [[TMP32]], ptr [[TMP18]], align 8, !alias.scope [[META81:![0-9]+]], !noalias [[META78]]
+; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <2 x double> [[TMP20]], i32 1
+; CHECK-NEXT:    store double [[TMP33]], ptr [[TMP21]], align 8, !alias.scope [[META81]], !noalias [[META78]]
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
+; CHECK-NEXT:    br i1 [[TMP23]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; CHECK:       [[PRED_STORE_IF]]:
+; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[TMP18]], i64 16
+; CHECK-NEXT:    store double 1.000000e+01, ptr [[TMP24]], align 8, !alias.scope [[META81]], !noalias [[META78]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE]]
 ; CHECK:       [[PRED_STORE_CONTINUE]]:
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x i1> [[TMP16]], i32 1
-; CHECK-NEXT:    br i1 [[TMP20]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3:.*]]
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
+; CHECK-NEXT:    br i1 [[TMP25]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3]]
 ; CHECK:       [[PRED_STORE_IF2]]:
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <2 x double> [[TMP34]], i32 1
-; CHECK-NEXT:    store double [[TMP33]], ptr [[TMP21]], align 8, !alias.scope [[META81]], !noalias [[META78]]
+; CHECK-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr [[TMP21]], i64 16
+; CHECK-NEXT:    store double 1.000000e+01, ptr [[TMP35]], align 8, !alias.scope [[META81]], !noalias [[META78]]
 ; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE3]]
 ; CHECK:       [[PRED_STORE_CONTINUE3]]:
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
-; CHECK-NEXT:    br i1 [[TMP23]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]]
-; CHECK:       [[PRED_STORE_IF4]]:
-; CHECK-NEXT:    [[TMP31:%.*]] = getelementptr double, ptr [[DST]], i64 [[IV]]
-; CHECK-NEXT:    store double [[TMP13]], ptr [[TMP31]], align 8, !alias.scope [[META81]], !noalias [[META78]]
-; CHECK-NEXT:    [[TMP37:%.*]] = getelementptr i8, ptr [[TMP31]], i64 16
-; CHECK-NEXT:    store double 1.000000e+01, ptr [[TMP37]], align 8, !alias.scope [[META81]], !noalias [[META78]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE5]]
-; CHECK:       [[PRED_STORE_CONTINUE5]]:
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
-; CHECK-NEXT:    br i1 [[TMP25]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]]
-; CHECK:       [[PRED_STORE_IF6]]:
-; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP17]]
-; CHECK-NEXT:    store double [[TMP14]], ptr [[TMP32]], align 8, !alias.scope [[META81]], !noalias [[META78]]
-; CHECK-NEXT:    [[TMP47:%.*]] = getelementptr i8, ptr [[TMP32]], i64 16
-; CHECK-NEXT:    store double 1.000000e+01, ptr [[TMP47]], align 8, !alias.scope [[META81]], !noalias [[META78]]
-; CHECK-NEXT:    br label %[[PRED_STORE_CONTINUE7]]
-; CHECK:       [[PRED_STORE_CONTINUE7]]:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
 ; CHECK-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; CHECK-NEXT:    br i1 [[TMP52]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP83:![0-9]+]]

From 76cffd310ac45c43c95c879d1bfdc1580a3d591e Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Tue, 9 Dec 2025 16:33:29 +0000
Subject: [PATCH 16/85] [CI] Tweak wording for builds with passing tests and
 build errors (#171436)

"All tests passed" is too easily interpreted as every possible test was
run and was fine. A lot of the time it means all the tests that didn't
fail to build ran and were fine.

Maybe the wording is still too subtle but at least it hints to the idea
that the tests run might be fewer than if the build had no compilation
errors.
---
 .ci/generate_test_report_lib.py      | 4 ++--
 .ci/generate_test_report_lib_test.py | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.ci/generate_test_report_lib.py b/.ci/generate_test_report_lib.py
index 9a4fc6030d5ec..5edde254eb73d 100644
--- a/.ci/generate_test_report_lib.py
+++ b/.ci/generate_test_report_lib.py
@@ -267,7 +267,7 @@ def plural(num_tests):
             report.extend(
                 [
                     "",
-                    "All tests passed but another part of the build **failed**. "
+                    "All executed tests passed, but another part of the build **failed**. "
                     "Information about the build failure could not be automatically "
                     "obtained.",
                     "",
@@ -278,7 +278,7 @@ def plural(num_tests):
             report.extend(
                 [
                     "",
-                    "All tests passed but another part of the build **failed**. Click on "
+                    "All executed tests passed, but another part of the build **failed**. Click on "
                     "a failure below to see the details.",
                     "",
                 ]
diff --git a/.ci/generate_test_report_lib_test.py b/.ci/generate_test_report_lib_test.py
index b9e992e0f798b..06279d672f3c3 100644
--- a/.ci/generate_test_report_lib_test.py
+++ b/.ci/generate_test_report_lib_test.py
@@ -343,7 +343,7 @@ def test_no_failures_build_failed(self):
 
               * 1 test passed
 
-              All tests passed but another part of the build **failed**. Information about the build failure could not be automatically obtained.
+              All executed tests passed, but another part of the build **failed**. Information about the build failure could not be automatically obtained.
 
               Download the build's log file to see the details.
               
@@ -390,7 +390,7 @@ def test_no_failures_build_failed_ninja_log(self):
 
                     * 1 test passed
 
-                    All tests passed but another part of the build **failed**. Click on a failure below to see the details.
+                    All executed tests passed, but another part of the build **failed**. Click on a failure below to see the details.
 
                     <details>
                     <summary>test/4.stamp</summary>
@@ -476,7 +476,7 @@ def test_no_failures_multiple_build_failed_ninja_log(self):
 
                     * 1 test passed
 
-                    All tests passed but another part of the build **failed**. Click on a failure below to see the details.
+                    All executed tests passed, but another part of the build **failed**. Click on a failure below to see the details.
 
                     <details>
                     <summary>touch test/2.stamp</summary>
@@ -978,7 +978,7 @@ def test_generate_report_end_to_end(self):
 
                     * 1 test passed
 
-                    All tests passed but another part of the build **failed**. Click on a failure below to see the details.
+                    All executed tests passed, but another part of the build **failed**. Click on a failure below to see the details.
 
                     <details>
                     <summary>test/4.stamp</summary>

From a03318360649c8b6f30b28e1ee5a76af1f559c51 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Tue, 9 Dec 2025 08:37:48 -0800
Subject: [PATCH 17/85] [compiler-rt] Try bumping soft_rss_limit again
 (#171469)

This is still failing on some of the bots. Try bumping the limit again
to see if this fixes things.
---
 .../TestCases/Linux/soft_rss_limit_mb_test.cpp              | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp
index 958fe40d5db64..0c7257849dc31 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp
@@ -2,12 +2,12 @@
 // RUN: %clangxx -O2 %s -o %t
 //
 // Run with limit should fail:
-// RUN: %env_tool_opts=soft_rss_limit_mb=250:quarantine_size=1:allocator_may_return_null=1     %run %t 2>&1 | FileCheck %s -check-prefix=CHECK_MAY_RETURN_1
-// RUN: %env_tool_opts=soft_rss_limit_mb=250:quarantine_size=1:allocator_may_return_null=0 not %run %t 2>&1 | FileCheck %s -check-prefix=CHECK_MAY_RETURN_0 --implicit-check-not="returned null"
+// RUN: %env_tool_opts=soft_rss_limit_mb=384:quarantine_size=1:allocator_may_return_null=1     %run %t 2>&1 | FileCheck %s -check-prefix=CHECK_MAY_RETURN_1
+// RUN: %env_tool_opts=soft_rss_limit_mb=384:quarantine_size=1:allocator_may_return_null=0 not %run %t 2>&1 | FileCheck %s -check-prefix=CHECK_MAY_RETURN_0 --implicit-check-not="returned null"
 
 // This run uses getrusage. We can only test getrusage when allocator_may_return_null=0
 // because getrusage gives us max-rss, not current-rss.
-// RUN: %env_tool_opts=soft_rss_limit_mb=250:quarantine_size=1:allocator_may_return_null=0:can_use_proc_maps_statm=0 not %run %t 2>&1 | FileCheck %s -check-prefix=CHECK_MAY_RETURN_0 --implicit-check-not="returned null"
+// RUN: %env_tool_opts=soft_rss_limit_mb=384:quarantine_size=1:allocator_may_return_null=0:can_use_proc_maps_statm=0 not %run %t 2>&1 | FileCheck %s -check-prefix=CHECK_MAY_RETURN_0 --implicit-check-not="returned null"
 // REQUIRES: stable-runtime
 
 // Ubsan does not intercept pthread_create.

From 94ebcfd16dac67486bae624f74e1c5c789448bae Mon Sep 17 00:00:00 2001
From: Men-cotton <mencotton0410@gmail.com>
Date: Wed, 10 Dec 2025 01:38:02 +0900
Subject: [PATCH 18/85] [mlir][vector] Fix crash in ReorderCastOpsOnBroadcast
 with non-vector result (#170985)

Fixes a crash in `ReorderCastOpsOnBroadcast` by ensuring the cast result
is a `VectorType` before applying the pattern.
A regression test has been added to
mlir/test/Dialect/Vector/vector-sink.mlir.

Fixes: #126371
---
 .../Vector/Transforms/VectorTransforms.cpp        |  2 ++
 mlir/test/Dialect/Vector/vector-sink.mlir         | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
index 726da1e9a3d14..ad16b80a732b3 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
@@ -453,6 +453,8 @@ struct ReorderCastOpsOnBroadcast
                                 PatternRewriter &rewriter) const override {
     if (op->getNumOperands() != 1)
       return failure();
+    if (!isa<VectorType>(op->getResult(0).getType()))
+      return failure();
     auto bcastOp = op->getOperand(0).getDefiningOp<vector::BroadcastOp>();
     if (!bcastOp)
       return failure();
diff --git a/mlir/test/Dialect/Vector/vector-sink.mlir b/mlir/test/Dialect/Vector/vector-sink.mlir
index beaba52af1841..69fba88a14048 100644
--- a/mlir/test/Dialect/Vector/vector-sink.mlir
+++ b/mlir/test/Dialect/Vector/vector-sink.mlir
@@ -382,6 +382,21 @@ func.func @broadcast_scalar_extsi_scalable(%a : i8) -> vector<2x[4]xi32> {
   return %r : vector<2x[4]xi32>
 }
 
+// -----
+
+// CHECK-LABEL: func.func @negative_broadcast_cast_non_vector_result
+// CHECK-SAME: (%[[ARG:.*]]: i64)
+// CHECK: %[[BCAST:.*]] = vector.broadcast %[[ARG]] : i64 to vector<26x7xi64>
+// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[BCAST]] : vector<26x7xi64> to !llvm.array<26 x vector<7xi64>>
+// CHECK: return %[[CAST]] : !llvm.array<26 x vector<7xi64>>
+/// This test ensures that the `ReorderCastOpsOnBroadcast` pattern does not
+/// attempt to reorder a cast operation that produces a non-vector result type.
+func.func @negative_broadcast_cast_non_vector_result(%arg0: i64) -> !llvm.array<26 x vector<7xi64>> {
+  %0 = vector.broadcast %arg0 : i64 to vector<26x7xi64>
+  %1 = builtin.unrealized_conversion_cast %0 : vector<26x7xi64> to !llvm.array<26 x vector<7xi64>>
+  return %1 : !llvm.array<26 x vector<7xi64>>
+}
+
 //===----------------------------------------------------------------------===//
 // [Pattern: ReorderElementwiseOpsOnTranspose]
 //===----------------------------------------------------------------------===//

From 5236af88e5ed0a3449b2292ef02be28b8722b172 Mon Sep 17 00:00:00 2001
From: Jianhui Li <jian.hui.li@intel.com>
Date: Tue, 9 Dec 2025 08:48:27 -0800
Subject: [PATCH 19/85] [MLIR][XeGPU] Extend propagation and sg_to_lane
 distribution pass support broadcast with low rank and scalar source input
 (#170409)

This PR extends XeGPU layout propagation and distribution for
vector.broadcast operation.
It relaxes the restriction of layout propagation to allow low-rank and
scalar source input, and adds a pattern in sg-to-wi distribution to
support the lowering.
---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  34 +++-
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td |   2 +-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 143 +++++++++++++++
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp |  42 +++--
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 163 +++++++++++++++++-
 mlir/test/Dialect/XeGPU/propagate-layout.mlir |  58 +++++++
 .../XeGPU/subgroup-distribute-unit.mlir       |  65 +++++++
 .../Dialect/XeGPU/subgroup-distribute.mlir    |  61 +++++++
 8 files changed, 550 insertions(+), 18 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 93c5187b00756..eae0bd4e68a84 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -223,6 +223,14 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
     InterfaceMethod<"Derive a new layout by dropping InstData",
                     "xegpu::DistributeLayoutAttr",
                     "dropInstData">,
+    InterfaceMethod<"Derive a new layout with sg_data, inst_data and lane_data set to 1 for the specified unit dims",
+                    "xegpu::DistributeLayoutAttr",
+                    "setUnitDimData",
+                    /*args=*/(ins "const llvm::SetVector<int64_t>": $unitDims)>,
+    InterfaceMethod<"Derive a new layout with sg_lane and lane_layout set to 1 for the specified unit dims",
+                    "xegpu::DistributeLayoutAttr",
+                    "setUnitDimLayout",
+                    /*args=*/(ins "const llvm::SetVector<int64_t>": $unitDims)>,
     InterfaceMethod<[{Delinearizes a linear ID into its multidimensional
                       indices based on the effective layout level.}],
                     "FailureOr<SmallVector<Value>>",
@@ -283,9 +291,14 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
                       }
                       return true;
                     }]>,
-    InterfaceMethod</*desc=*/[{Check if this layout is a slice of some other layout.}],
+    InterfaceMethod</*desc=*/[{Check if this layout is a slice of another layout.}],
                     /*retTy=*/"bool",
                     /*methodName=*/"isSliceOf",
+                    /*args=*/(ins "const xegpu::DistributeLayoutAttr&": $other)>,
+
+    InterfaceMethod</*desc=*/[{Check if this layout is identical to another layout.}],
+                    /*retTy=*/"bool",
+                    /*methodName=*/"isEqualTo",
                     /*args=*/(ins "const xegpu::DistributeLayoutAttr&": $other)>
   ];
 }
@@ -487,6 +500,12 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
       return {};
     }
 
+    //set the layout for the sepcified unit dims: sg_data, inst_data and lane_data to 1
+    DistributeLayoutAttr setUnitDimData(SetVector<int64_t> unitDims);
+
+    //set the layout for the sepcified unit dims: sg_lane and lane_layout to 1
+    DistributeLayoutAttr setUnitDimLayout(SetVector<int64_t> unitDims);
+
     /// Delinearizes a linear ID into its multidimensional indices
     /// based on the effective level of the layout.
     FailureOr<SmallVector<Value>>
@@ -501,6 +520,9 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
 
     /// Check if this is slice of some other layout.
     bool isSliceOf(const xegpu::DistributeLayoutAttr &other) { return false; }
+    
+    /// Check if this is identical to some other layout.
+    bool isEqualTo(const xegpu::DistributeLayoutAttr &other); 
 
   }];
 
@@ -649,6 +671,12 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> {
       return SliceAttr::get(getContext(), parent, attr.getDims());
     }
 
+    //set the layout for the sepcified unit dims: sg_data, inst_data and lane_data to 1
+    DistributeLayoutAttr setUnitDimData(SetVector<int64_t> unitDims);
+
+    //set the layout for the sepcified unit dims: sg_lane and lane_layout to 1
+    DistributeLayoutAttr setUnitDimLayout(SetVector<int64_t> unitDims);
+
     /// flatten a nested SliceAttr, e.g., for 2-level nested SliceAttr
     /// #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 12]>, dims = [0]>, dims = [0]>
     /// it will coalese two slice operations and return a simplified SliceAttr
@@ -670,7 +698,9 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> {
 
     /// Check if this is slice of some other layout.
     bool isSliceOf(const xegpu::DistributeLayoutAttr &other);
-
+    
+    /// Check if this is identical to some other layout.
+    bool isEqualTo(const xegpu::DistributeLayoutAttr &other); 
   }];
 
   let assemblyFormat = "`<` qualified($parent) `,` `dims` `=` $dims `>`";
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 4fe1087d18879..b54d620c3c0c3 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -405,7 +405,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
                        OptionalAttr<DenseI64ArrayAttr>: $transpose,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
                        OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
-                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint, 
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint,
                        OptionalAttr<DistributeLayoutAttr>:$layout);
 
   let results = (outs XeGPU_ValueType: $value);
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 7ab2e612ed890..1a19ab5fd970b 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -390,6 +390,86 @@ LayoutAttr::computeDistributedCoords(OpBuilder &builder, Location loc,
   return genCoordinates(builder, loc, ids, layout, subShape, shape);
 }
 
+bool LayoutAttr::isEqualTo(const xegpu::DistributeLayoutAttr &other) {
+  if (dyn_cast<xegpu::SliceAttr>(other))
+    return false;
+
+  return *this == dyn_cast<xegpu::LayoutAttr>(other);
+}
+
+// set the layout for unit dims: sg_data, inst_data and lane_data to 1
+DistributeLayoutAttr LayoutAttr::setUnitDimData(SetVector<int64_t> unitDims) {
+  auto sgDataOpt = getSgData();
+  auto instDataOpt = getInstData();
+  auto laneDataOpt = getLaneData();
+
+  SmallVector<int32_t> sgData;
+  SmallVector<int32_t> instData;
+  SmallVector<int32_t> laneData;
+
+  if (sgDataOpt) {
+    sgData = llvm::to_vector(sgDataOpt.asArrayRef());
+  }
+  if (instDataOpt) {
+    instData = llvm::to_vector(instDataOpt.asArrayRef());
+  }
+  if (laneDataOpt) {
+    laneData = llvm::to_vector(laneDataOpt.asArrayRef());
+  }
+
+  for (auto dim : unitDims) {
+    if (dim < static_cast<int64_t>(sgData.size()))
+      sgData[dim] = 1;
+    if (dim < static_cast<int64_t>(instData.size()))
+      instData[dim] = 1;
+    if (dim < static_cast<int64_t>(laneData.size()))
+      laneData[dim] = 1;
+  }
+
+  return LayoutAttr::get(
+      getContext(), getSgLayout(),
+      sgData.empty() ? DenseI32ArrayAttr()
+                     : DenseI32ArrayAttr::get(getContext(), sgData),
+      instData.empty() ? DenseI32ArrayAttr()
+                       : DenseI32ArrayAttr::get(getContext(), instData),
+      getLaneLayout(),
+      laneData.empty() ? DenseI32ArrayAttr()
+                       : DenseI32ArrayAttr::get(getContext(), laneData),
+      getOrder());
+}
+
+// set the layout for the sepcified unit dims: sg_lane and lane_layout to 1
+DistributeLayoutAttr LayoutAttr::setUnitDimLayout(SetVector<int64_t> unitDims) {
+  auto sgLayoutOpt = getSgLayout();
+  auto laneLayoutOpt = getLaneLayout();
+
+  SmallVector<int32_t> sgLayout;
+  SmallVector<int32_t> laneLayout;
+
+  if (sgLayoutOpt) {
+    sgLayout = llvm::to_vector(sgLayoutOpt.asArrayRef());
+  }
+  if (laneLayoutOpt) {
+    laneLayout = llvm::to_vector(laneLayoutOpt.asArrayRef());
+  }
+
+  for (auto dim : unitDims) {
+    if (dim < static_cast<int64_t>(sgLayout.size()))
+      sgLayout[dim] = 1;
+    if (dim < static_cast<int64_t>(laneLayout.size()))
+      laneLayout[dim] = 1;
+  }
+
+  return LayoutAttr::get(
+      getContext(),
+      sgLayout.empty() ? DenseI32ArrayAttr()
+                       : DenseI32ArrayAttr::get(getContext(), sgLayout),
+      getSgData(), getInstData(),
+      laneLayout.empty() ? DenseI32ArrayAttr()
+                         : DenseI32ArrayAttr::get(getContext(), laneLayout),
+      getLaneData(), getOrder());
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_SliceAttr
 //===----------------------------------------------------------------------===//
@@ -510,6 +590,69 @@ bool SliceAttr::isSliceOf(const xegpu::DistributeLayoutAttr &other) {
                       [&](int64_t dim) { return thisDims.contains(dim); });
 }
 
+bool SliceAttr::isEqualTo(const xegpu::DistributeLayoutAttr &other) {
+  if (dyn_cast<xegpu::LayoutAttr>(other))
+    return false;
+
+  auto flattenedThis = flatten();
+  auto flattenedOther = dyn_cast<xegpu::SliceAttr>(other).flatten();
+
+  return ((flattenedThis.getParent() == flattenedOther.getParent()) &&
+          (flattenedThis.getDims() == flattenedOther.getDims()));
+}
+
+// Helper function to adjust unit dimensions from sliced space to parent space
+static SetVector<int64_t>
+adjustUnitDimsWithSliceDims(const SetVector<int64_t> &unitDims,
+                            ArrayRef<int64_t> sliceDims) {
+  // Reconstruct parent's non-sliced dimensions
+
+  int64_t parentRank = sliceDims.size() + unitDims.size();
+  llvm::SmallDenseSet<int64_t> slicedDimsSet(sliceDims.begin(),
+                                             sliceDims.end());
+  SmallVector<int64_t> nonSlicedDims;
+  for (int64_t i = 0; i < parentRank; ++i) {
+    if (!slicedDimsSet.contains(i))
+      nonSlicedDims.push_back(i);
+  }
+
+  // Map unit dims from sliced space to parent space
+  SetVector<int64_t> adjustUnitDims;
+  for (auto dim : unitDims) {
+    if (dim < static_cast<int64_t>(nonSlicedDims.size())) {
+      adjustUnitDims.insert(nonSlicedDims[dim]);
+    }
+  }
+
+  return adjustUnitDims;
+}
+
+// set the layout for unit dims: sg_data, inst_data and lane_data to 1
+DistributeLayoutAttr SliceAttr::setUnitDimData(SetVector<int64_t> unitDims) {
+  SliceAttr attr = flatten();
+  ArrayRef<int64_t> sliceDims = attr.getDims().asArrayRef();
+  auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+
+  SetVector<int64_t> adjustUnitDims =
+      adjustUnitDimsWithSliceDims(unitDims, sliceDims);
+
+  return SliceAttr::get(getContext(), parent.setUnitDimData(adjustUnitDims),
+                        attr.getDims());
+}
+
+// set the layout for the sepcified unit dims: sg_lane and lane_layout to 1
+DistributeLayoutAttr SliceAttr::setUnitDimLayout(SetVector<int64_t> unitDims) {
+  SliceAttr attr = flatten();
+  ArrayRef<int64_t> sliceDims = attr.getDims().asArrayRef();
+  auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+
+  SetVector<int64_t> adjustUnitDims =
+      adjustUnitDimsWithSliceDims(unitDims, sliceDims);
+
+  return SliceAttr::get(getContext(), parent.setUnitDimLayout(adjustUnitDims),
+                        attr.getDims());
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_RangeAttr
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 59a1ad9dbe189..dc9eb96c169b4 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -580,23 +580,39 @@ void LayoutInfoPropagation::visitVectorBroadCastOp(
   // Only consider vector to vector broadcasts for now.
   VectorType resultTy = broadcast.getResultVectorType();
   VectorType sourceTy = dyn_cast<VectorType>(broadcast.getSourceType());
-  if (!sourceTy) {
-    broadcast.emitWarning("Expecting source type to be a vector type.");
+  // skip layout propagation for non-vector source operand.
+  if (!sourceTy)
     return;
-  }
 
-  // Only consider nD -> nD broadcast.
+  // Hanlding broadcast from low-rank to high-rank (e.g., 1D to 2D) case.
   if (sourceTy.getRank() != resultTy.getRank()) {
-    broadcast.emitWarning("Expecting source and result to have same rank.");
+    auto sourceDims = sourceTy.getShape();
+    auto resultDims = resultTy.getShape();
+    SmallVector<int64_t> bcastDims;
+    auto dimDiff = resultTy.getRank() - sourceTy.getRank();
+    // adding the missing leading dims
+    for (int i = 0; i < dimDiff; i++)
+      bcastDims.push_back(i);
+
+    // for the rest dims in the resultTy, if sourceTy dim is 1, then it's
+    // broadcasted dim
+    for (size_t i = 0; i < sourceDims.size(); i++)
+      if ((sourceDims[i] == 1) && (resultDims[i + dimDiff] != 1))
+        bcastDims.push_back(i + dimDiff);
+
+    // create a slice layout for the source
+    xegpu::SliceAttr sliceLayout = xegpu::SliceAttr::get(
+        broadcast->getContext(),
+        cast<xegpu::DistributeLayoutAttr>(resultLayout.get()),
+        DenseI64ArrayAttr::get(broadcast->getContext(), bcastDims));
+
+    propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(sliceLayout)));
     return;
   }
+
   SetVector<int64_t> broadcastUnitDims = broadcast.computeBroadcastedUnitDims();
-  if (broadcastUnitDims.size() != 1) {
-    broadcast.emitWarning("Expecting source type to be nD vector only with "
-                          "one broadcasted dimension.");
-    return;
-  }
-  // Propagate the result layout to the source operand.
+  resultLayout = cast<xegpu::DistributeLayoutAttr>(resultLayout.get())
+                     .setUnitDimData(broadcastUnitDims);
   propagateIfChanged(operands[0], operands[0]->meet(resultLayout));
 }
 
@@ -917,7 +933,7 @@ void LayoutInfoPropagation::visitLoadGatherOp(
   } else {
 
     // The layout is strictly determined by the payload type.
-    auto payloadTy = dyn_cast<VectorType>(load.getValueType());
+    VectorType payloadTy = load.getValueType();
     if (!payloadTy) {
       load.emitWarning("Not propagating, non-vector payload supplied.");
       return;
@@ -987,7 +1003,7 @@ void LayoutInfoPropagation::visitStoreScatterOp(
     // Currently, for 2D StoreScatterOp we expect that the height dimension of
     // the tensor descriptor is equal to the subgroup size. This is ensured by
     // the op verifier.
-    auto payloadTy = dyn_cast<VectorType>(storeScatter.getValueType());
+    VectorType payloadTy = storeScatter.getValueType();
     if (!payloadTy) {
       storeScatter.emitWarning("Not propagating, non-vector payload supplied.");
       return;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 0d1c5eeeff711..ca81c3cd7be42 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -99,7 +99,6 @@ getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
   for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
     if (i < distributionStart)
       continue;
-
     // Check if the dimension can be distributed evenly.
     if (dim % effectiveLaneLayout[i - distributionStart] != 0)
       return failure();
@@ -1424,6 +1423,166 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
   }
 };
 
+/// This pattern distributes the `vector.broadcast` operation across lanes in a
+/// warp. The pattern supports three use cases:
+///
+/// 1) Broadcast a low-rank vector to high-rank vector: The low-rank input
+/// vector
+///    must have a slice layout of the result. If the distributed source and
+///    target vector types are identical, this lowers to a no-op; otherwise, it
+///    remains a broadcast but operates on distributed vectors.
+///
+/// 2) Broadcast a same-rank vector with identical layouts for source and
+/// target:
+///    The source vector must have unit dimensions, and lane_data must be unit
+///    size for those unit dims. This always lowers to a no-op.
+///
+/// 3) Broadcast a scalar with no layout: This always lowers to a broadcast from
+///    scalar to distributed result type.
+///
+/// Example 1 (lowering to a broadcast with distributed types):
+/// ```
+/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) {
+///   %0 = "some_def"() {layout_result_0 =
+///     #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
+///     dims = [0]> } : () -> (vector<32xf32>)
+///   %2 = vector.broadcast %0 {layout_result_0 =
+///     #xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>}
+///     : vector<32xf32> to vector<8x32xf32>
+///     gpu.yield %1 : vector<8x32xf32>
+/// }
+/// ```
+/// is lowered to:
+/// ```
+/// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
+///   %0 = "some_def"() {layout_result_0 =
+///     #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
+///     dims = [0]> } : () -> (vector<32xf32>)
+///   gpu.yield %0 : vector<32xf32>
+/// }
+/// %2 = vector.broadcast %r#0 : vector<1xf32> to vector<8x1xf32>
+///
+/// Example 2 (no-op):
+/// ```
+/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x32xf32>) {
+///   %0 = "some_def"() {layout_result_0 =
+///     #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
+///     dims = [1]> } : () -> (vector<8xf32>)
+///   %1 = vector.shape_cast %0
+///     {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
+///      1]>}: vector<8xf32> to vector<8x1xf32>
+///   %2 = vector.broadcast %1
+///     {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
+///     1]>}: vector<8x1xf32> to vector<8x32xf32>
+///   gpu.yield %1 : vector<8x32xf32>
+/// }
+/// ```
+/// is lowered to:
+/// ```
+/// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) {
+///   %0 = "some_def"() {layout_result_0 =
+///     #xegpu.slice<#xegpu.layout<lane_layout = [1, 32], lane_data = [1, 1]>,
+///     dims = [1]> } : () -> (vector<8xf32>)
+///   %1 = vector.shape_cast %0
+///     {layout_result_0 = #xegpu.layout<lane_layout = [1, 32], lane_data = [1,
+///     1]>}: vector<8xf32> to vector<8x1xf32>
+///   gpu.yield %1 : vector<8x1xf32>
+/// }
+/// // The broadcast is implicit through layout transformation (no-op)
+///  "some_use"(%r#0)
+/// ```
+struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+                                PatternRewriter &rewriter) const override {
+    OpOperand *yieldOperand =
+        getWarpResult(warpOp, llvm::IsaPred<vector::BroadcastOp>);
+    if (!yieldOperand)
+      return failure();
+    auto broadcastOp =
+        cast<vector::BroadcastOp>(yieldOperand->get().getDefiningOp());
+    unsigned operandIdx = yieldOperand->getOperandNumber();
+
+    VectorType sourceType = dyn_cast<VectorType>(broadcastOp.getSourceType());
+    VectorType destType =
+        dyn_cast<VectorType>(broadcastOp.getResult().getType());
+
+    xegpu::DistributeLayoutAttr sourceLayout =
+        xegpu::getDistributeLayoutAttr(broadcastOp->getOpOperand(0));
+    xegpu::DistributeLayoutAttr resultLayout =
+        xegpu::getDistributeLayoutAttr(broadcastOp.getResult());
+
+    FailureOr<VectorType> sourceDistType;
+    Type sourceElemOrDistType;
+    if (sourceType) {
+
+      // Case 1 and 2: source is a vector type.
+      int64_t rankDiff = destType.getRank() - sourceType.getRank();
+      if (rankDiff > 0) {
+        // Case 1: source is lower-rank than result.
+        bool isSliceOf = sourceLayout.isSliceOf(resultLayout);
+        if (!isSliceOf)
+          return rewriter.notifyMatchFailure(
+              warpOp,
+              "Broadcast input layout must be a slice of result layout.");
+      }
+      // case 2: source and result have same rank
+      if (rankDiff == 0) {
+        SetVector<int64_t> broadcastUnitDims =
+            broadcastOp.computeBroadcastedUnitDims();
+        resultLayout = resultLayout.setUnitDimData(broadcastUnitDims);
+        bool isEqualTo = sourceLayout.isEqualTo(resultLayout);
+        if (!isEqualTo)
+          return rewriter.notifyMatchFailure(
+              warpOp, "For same-rank broadcast, source must be identical to "
+                      "adjusted result layouts with unit dims.");
+        sourceLayout = sourceLayout.setUnitDimLayout(broadcastUnitDims);
+      }
+
+      sourceDistType =
+          getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
+      if (failed(sourceDistType)) {
+        return rewriter.notifyMatchFailure(
+            warpOp, "Failed to distribute the source vector type.");
+      }
+      sourceElemOrDistType = sourceDistType.value();
+
+    } else {
+      // Case 3: source is a scalar type.
+      if (sourceLayout) {
+        return rewriter.notifyMatchFailure(
+            warpOp, "Broadcast from scalar must not have a layout attribute.");
+      }
+      sourceElemOrDistType = broadcastOp.getSourceType();
+    }
+    FailureOr<VectorType> destDistType =
+        getDistVecTypeBasedOnLaneLayout(resultLayout, destType);
+    if (failed(destDistType)) {
+      return rewriter.notifyMatchFailure(
+          warpOp, "Failed to distribute the dest vector type.");
+    }
+
+    SmallVector<size_t> newRetIndices;
+    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, warpOp, {broadcastOp.getSource()}, sourceElemOrDistType,
+        newRetIndices);
+
+    Value distributedSource = newWarpOp.getResult(newRetIndices[0]);
+
+    Value newBroadcast = distributedSource;
+
+    if (sourceElemOrDistType != destDistType.value()) {
+      rewriter.setInsertionPointAfter(newWarpOp);
+      newBroadcast =
+          vector::BroadcastOp::create(rewriter, newWarpOp.getLoc(),
+                                      destDistType.value(), distributedSource);
+    }
+
+    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newBroadcast);
+    return success();
+  }
+};
+
 /// Distribute a `vector.shape_cast` op feeding into yield op of an enclosing
 /// `gpu.warp_execute_on_lane_0` region.
 struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {
@@ -1865,7 +2024,7 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
   // patterns. Therefore, assign higher benefit.
   patterns
       .add<VectorShapeCastDistribution, VectorExtractStridedSliceDistribution,
-           VectorInsertStridedSliceDistribution>(
+           VectorInsertStridedSliceDistribution, VectorBroadcastDistribution>(
           patterns.getContext(),
           /*pattern benefit=*/highPatternBenefit);
 }
diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
index f8b59b87a122b..48e77d867508b 100644
--- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir
+++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir
@@ -640,3 +640,61 @@ func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(%arg0: !xegpu.tensor_desc
   return
 }
 }
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_broadcast_1d_to_2d_broadcast_along_row(
+// CHECK-SAME:    %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>,
+// CHECK-SAME:    %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
+// CHECK:         %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}>  {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
+// CHECK-SAME:      !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+// CHECK-NEXT:    %[[REDUCE:.*]] = vector.multi_reduction <add>, %[[LOAD]], %{{[0-9a-zA-Z]+}}
+// CHECK-SAME:       {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
+// CHECK-NEXT:    %[[BROADCAST:.*]] = vector.broadcast %[[REDUCE]]
+// CHECK-SAME:       {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x16xf16>
+func.func @vector_broadcast_1d_to_2d_broadcast_along_row(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant dense<0.0000> : vector<16xf16>
+  %3 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %4 = vector.multi_reduction <add>, %3, %cst [0] : vector<16x16xf16> to vector<16xf16>
+  %5 = vector.broadcast %4 : vector<16xf16> to vector<16x16xf16>
+  xegpu.store_nd %5, %arg1  : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  return
+}
+}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_broadcast_2d_to_2d_along_column(
+// CHECK:            %[[REDUCE:.*]] = vector.multi_reduction <add>
+// CHECK-SAME:       {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>} [1] : vector<16x16xf16> to vector<16xf16>
+// CHECK-NEXT:    %[[SHAPECAST:.*]] = vector.shape_cast %[[REDUCE]]
+// CHECK-SAME:       {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x1xf16>
+// CHECK-NEXT:    vector.broadcast %[[SHAPECAST]]
+// CHECK-SAME:       {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16> to vector<16x16xf16>
+
+func.func @vector_broadcast_2d_to_2d_along_column(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant dense<0.0000> : vector<16xf16>
+  %3 = xegpu.load_nd %arg0  : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16>
+  %4 = vector.multi_reduction <add>, %3, %cst [1] : vector<16x16xf16> to vector<16xf16>
+  %5 = vector.shape_cast %4 : vector<16xf16> to vector<16x1xf16>
+  %6 = vector.broadcast %5 : vector<16x1xf16> to vector<16x16xf16>
+  xegpu.store_nd %6, %arg1  : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  return
+}
+}
+
+// -----
+gpu.module @test {
+// CHECK-LABEL: func.func @vector_broadcast_scalar_to_vector(
+// CHECK:         %[[CST:.*]] = arith.constant 0.{{.*}} : f16
+// CHECK-NEXT:    %[[BROADCAST:.*]] = vector.broadcast %[[CST]]
+// CHECK-SAME:       {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : f16 to vector<16x16xf16>
+
+func.func @vector_broadcast_scalar_to_vector(%arg0: !xegpu.tensor_desc<16x16xf16>) {
+  %cst = arith.constant 0.0000 : f16
+  %6 = vector.broadcast %cst : f16 to vector<16x16xf16>
+  xegpu.store_nd %6, %arg0  : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  return
+}
+}
\ No newline at end of file
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index 44ec21359593f..216f3d19cff94 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -920,4 +920,69 @@ gpu.func @vector_insert_strided_slice_unsupported_offset(%laneid: index) {
   gpu.return
 }
 
+// CHECK-LABEL: gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane
+// CHECK-SAME: (%[[ARG0:.*]]: index) {
+// CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, vector<1xf16>)
+// CHECK: %[[DEF:.*]] = "some_def"()
+// CHECK: %[[BCAST_INNER:.*]] = vector.broadcast %[[DEF]]
+// CHECK: gpu.yield %[[BCAST_INNER]], %[[DEF]]
+// CHECK: %[[BCAST:.*]] = vector.broadcast %[[R]]#1 : vector<1xf16> to vector<16x1xf16>
+// CHECK: "some_use"(%[[BCAST]])
+gpu.func  @vector_broadcast_1d_to_2d_broadcast_within_lane(%laneid: index) {
+
+  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) {
+
+    %1 = "some_def"() : () -> vector<16xf16>
+
+    %2 = vector.broadcast %1 {
+      layout_operand_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    } : vector<16xf16> to vector<16x16xf16>
+
+    gpu.yield %2 : vector<16x16xf16>
+  }
+  "some_use"(%r) : (vector<16x1xf16>) -> ()
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case
+// CHECK-SAME: (%[[ARG0:.*]]: index)
+// CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, vector<16x1xf16>)
+// CHECK:   %[[DEF:.*]] = "some_def"() : () -> vector<16x1xf16>
+// CHECK:   %[[BCAST:.*]] = vector.broadcast %[[DEF]]
+// CHECK-SAME: : vector<16x1xf16> to vector<16x16xf16>
+// CHECK:   gpu.yield %[[BCAST]], %[[DEF]] : vector<16x16xf16>, vector<16x1xf16>
+// CHECK: "some_use"(%[[R]]#1) : (vector<16x1xf16>) -> ()
+gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case(%arg0: index) {
+  %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<16x1xf16>) {
+    %1 = "some_def"() : () -> vector<16x1xf16>
+    %2 = vector.broadcast %1 {
+      layout_operand_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
+      layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>
+    } : vector<16x1xf16> to vector<16x16xf16>
+    gpu.yield %2: vector<16x16xf16>
+  }
+  "some_use"(%0) : (vector<16x1xf16>) -> ()
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_shape_cast_scalar_to_vector
+// CHECK-SAME: (%[[ARG0:.*]]: index)
+// CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, f16)
+// CHECK: %[[DEF:.*]] = "some_def"()
+// CHECK: %[[BCAST:.*]] = vector.broadcast %[[DEF]] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : f16 to vector<16x16xf16>
+// CHECK: gpu.yield %[[BCAST]], %[[DEF]] : vector<16x16xf16>, f16
+// CHECK: %[[RESULT:.*]] = vector.broadcast %[[R]]#1 : f16 to vector<16x1xf16>
+// CHECK: "some_use"(%[[RESULT]])
+gpu.func
+@vector_shape_cast_scalar_to_vector(%arg0: index) {
+  %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<16x1xf16>) {
+    %1 = "some_def"() : () -> f16
+    %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : f16 to vector<16x16xf16>
+    gpu.yield %2 : vector<16x16xf16>
+  }
+  "some_use"(%0) : (vector<16x1xf16>) -> ()
+  gpu.return
+}
+
 }
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index 22177f8f6a15f..e5e3d2a1c1ad5 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -330,3 +330,64 @@ gpu.module @xevm_module{
     gpu.return
   }
 }
+
+// -----
+// CHECK-LABEL: gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane({{.*}}) {
+gpu.module @xevm_module{
+   gpu.func  @vector_broadcast_1d_to_2d_broadcast_within_lane(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) {
+    %c0 = arith.constant 0 : index
+    %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} dense<0.000000e+00> : vector<16xf16>
+    %tdesc0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16>
+      -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
+      -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    %0 = xegpu.load_nd %tdesc0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16>
+    %1 = vector.multi_reduction <add>, %0, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
+    // CHECK: %[[BCAST:.*]] = vector.broadcast %{{.*}} : f16 to vector<16xf16>
+    %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x16xf16>
+    xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+  }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case({{.*}}) {
+gpu.module @xevm_module{
+   gpu.func  @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case(%arg0: memref<16xf16>, %arg1: memref<16x16xf16>) {
+    %c0 = arith.constant 0 : index
+    %mask = vector.constant_mask [16] {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}: vector<16xi1>
+    %1 = xegpu.load %arg0[%c0], %mask {layout_result_0 = #xegpu.slice<#xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>, dims = [1]>}: memref<16xf16>, index, vector<16xi1> -> vector<16xf16>
+    
+    %11 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16xf16> to vector<16x1xf16>
+    %2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x1xf16> to vector<16x16xf16>
+    // CHECK-NOT: vector.broadcast
+    // CHECK-NOT: vector.shape_cast
+ 
+    %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
+      -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    // CHECK: xegpu.store_nd {{.*}}, {{.*}}[{{.*}}, {{.*}}]
+    // CHECK-SAME: : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
+
+    xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+  }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_shape_cast_scalar_to_vector({{.*}}) {
+gpu.module @xevm_module{
+   gpu.func  @vector_shape_cast_scalar_to_vector(%arg0: memref<16xf16>, %arg1: memref<16x16xf16>) {
+    %c0 = arith.constant 0 : index
+    %9 = gpu.block_id  x
+    %10 = arith.index_cast %9 : index to i16
+    %11 = arith.bitcast %10 : i16 to f16
+    // CHECK: vector.broadcast {{.*}} : f16 to vector<16xf16>
+    %2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : f16 to vector<16x16xf16>
+    %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
+      -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
+    gpu.return
+  }
+}
+
+

From 0c0ed398ecef2d3840b4ebc11b41d638339d299b Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Tue, 9 Dec 2025 08:50:37 -0800
Subject: [PATCH 20/85] [lldb] Don't read firstSubclass and nextSiblingClass
 from class_rw_t (#171213)

We're considering modifying the ObjC runtime's class_rw_t structure to
remove the firstSubclass and nextSiblingClass fields in some cases. LLDB
is currently reading those but not actually using them. Stop doing that
to avoid issues if they are removed by the runtime.

rdar://166084122
---
 .../ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp     | 5 +----
 .../ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h       | 3 ---
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp
index 954f269f8860b..ebde8892d8f62 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp
@@ -111,7 +111,6 @@ bool ClassDescriptorV2::class_rw_t::Read(Process *process, lldb::addr_t addr) {
                           process->GetAddressByteSize());
 
   lldb::offset_t cursor = 0;
-
   m_flags = extractor.GetU32_unchecked(&cursor);
   m_version = extractor.GetU32_unchecked(&cursor);
   m_ro_ptr = extractor.GetAddress_unchecked(&cursor);
@@ -119,18 +118,16 @@ bool ClassDescriptorV2::class_rw_t::Read(Process *process, lldb::addr_t addr) {
     m_ro_ptr = abi_sp->FixCodeAddress(m_ro_ptr);
   m_method_list_ptr = extractor.GetAddress_unchecked(&cursor);
   m_properties_ptr = extractor.GetAddress_unchecked(&cursor);
-  m_firstSubclass = extractor.GetAddress_unchecked(&cursor);
-  m_nextSiblingClass = extractor.GetAddress_unchecked(&cursor);
 
   if (m_ro_ptr & 1) {
     DataBufferHeap buffer(ptr_size, '\0');
     process->ReadMemory(m_ro_ptr ^ 1, buffer.GetBytes(), ptr_size, error);
     if (error.Fail())
       return false;
-    cursor = 0;
     DataExtractor extractor(buffer.GetBytes(), ptr_size,
                             process->GetByteOrder(),
                             process->GetAddressByteSize());
+    lldb::offset_t cursor = 0;
     m_ro_ptr = extractor.GetAddress_unchecked(&cursor);
     if (ABISP abi_sp = process->GetABI())
       m_ro_ptr = abi_sp->FixCodeAddress(m_ro_ptr);
diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h
index 0fff9af438367..8d19b00f1551f 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h
@@ -133,9 +133,6 @@ class ClassDescriptorV2 : public ObjCLanguageRuntime::ClassDescriptor {
     lldb::addr_t m_properties_ptr;
     lldb::addr_t m_protocols_ptr;
 
-    ObjCLanguageRuntime::ObjCISA m_firstSubclass;
-    ObjCLanguageRuntime::ObjCISA m_nextSiblingClass;
-
     bool Read(Process *process, lldb::addr_t addr);
   };
 

From 2e16f24957eea220d09bdc869d369c80904eecaf Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Tue, 9 Dec 2025 08:51:43 -0800
Subject: [PATCH 21/85] [RISCV] Add VMNoV0 register class with only the
 VMaskVTs. (#171231)

I plan to use this for inline assembly "vd" contraints with mask types
in a follow up patch. Due to the test changes I wanted to post this
separately.
---
 llvm/lib/Target/RISCV/RISCVRegisterInfo.td    |  1 +
 .../instruction-select/rvv/select.mir         | 20 +++++++++----------
 .../RISCV/rvv/pass-fast-math-flags-sdnode.ll  |  2 +-
 .../RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir  |  4 ++--
 .../rvv/strided-vpload-vpstore-output.ll      |  2 +-
 .../RISCV/rvv/vleff-vlseg2ff-output.ll        |  4 ++--
 .../CodeGen/RISCV/rvv/vmerge-peephole.mir     |  4 ++--
 .../CodeGen/RISCV/rvv/vmv.v.v-peephole.mir    |  6 +++---
 .../RISCV/rvv/vsetvli-insert-crossbb.mir      |  4 ++--
 9 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index 11b7a0a3c691a..f354793eb0eac 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -813,6 +813,7 @@ def VMV0 : VReg<VMaskVTs, (add V0), 1>;
 
 // The register class is added for inline assembly for vector mask types.
 def VM : VReg<VMaskVTs, (add VR), 1>;
+def VMNoV0 : VReg<VMaskVTs, (sub VR, V0), 1>;
 
 defvar VTupM1N2VTs = [riscv_nxv8i8x2, riscv_nxv4i8x2, riscv_nxv2i8x2, riscv_nxv1i8x2];
 defvar VTupM1N3VTs = [riscv_nxv8i8x3, riscv_nxv4i8x3, riscv_nxv2i8x3, riscv_nxv1i8x3];
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir
index ada76a43639d7..b7cb295648b4e 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir
@@ -11,7 +11,7 @@ body:             |
   bb.0.entry:
     ; RV32I-LABEL: name: select_nxv1i8
     ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
-    ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
+    ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */
     ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]]
@@ -19,7 +19,7 @@ body:             |
     ;
     ; RV64I-LABEL: name: select_nxv1i8
     ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
-    ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
+    ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */
     ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]]
@@ -40,7 +40,7 @@ body:             |
   bb.0.entry:
     ; RV32I-LABEL: name: select_nxv4i8
     ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
-    ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
+    ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */
     ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]]
@@ -48,7 +48,7 @@ body:             |
     ;
     ; RV64I-LABEL: name: select_nxv4i8
     ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
-    ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
+    ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */
     ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]]
@@ -98,7 +98,7 @@ body:             |
   bb.0.entry:
     ; RV32I-LABEL: name: select_nxv64i8
     ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
-    ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
+    ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */
     ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]]
@@ -106,7 +106,7 @@ body:             |
     ;
     ; RV64I-LABEL: name: select_nxv64i8
     ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
-    ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
+    ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */
     ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]]
@@ -127,7 +127,7 @@ body:             |
   bb.0.entry:
     ; RV32I-LABEL: name: select_nxv2i16
     ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
-    ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
+    ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */
     ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]]
@@ -135,7 +135,7 @@ body:             |
     ;
     ; RV64I-LABEL: name: select_nxv2i16
     ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
-    ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
+    ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */
     ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]]
@@ -185,7 +185,7 @@ body:             |
   bb.0.entry:
     ; RV32I-LABEL: name: select_nxv32i16
     ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
-    ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
+    ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
     ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF2_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */
     ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF2_]]
@@ -193,7 +193,7 @@ body:             |
     ;
     ; RV64I-LABEL: name: select_nxv32i16
     ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF
-    ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
+    ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF
     ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF2_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */
     ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF2_]]
diff --git a/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll
index 0654fe8bd8d66..3225d649f066e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll
@@ -13,7 +13,7 @@ define <vscale x 1 x double> @foo(<vscale x 1 x double> %x, <vscale x 1 x double
   ; CHECK-NEXT:   [[SLLI:%[0-9]+]]:gpr = SLLI [[COPY]], 32
   ; CHECK-NEXT:   [[SRLI:%[0-9]+]]:gprnox0 = SRLI killed [[SLLI]], 32
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vmv0 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[PseudoVFMUL_VV_M1_E64_MASK:%[0-9]+]]:vrnov0 = nnan ninf nsz arcp contract afn reassoc nofpexcept PseudoVFMUL_VV_M1_E64_MASK $noreg, [[COPY3]], [[COPY2]], [[COPY4]], 7, killed [[SRLI]], 6 /* e64 */, 1 /* ta, mu */, implicit $frm
+  ; CHECK-NEXT:   [[PseudoVFMUL_VV_M1_E64_MASK:%[0-9]+]]:vmnov0 = nnan ninf nsz arcp contract afn reassoc nofpexcept PseudoVFMUL_VV_M1_E64_MASK $noreg, [[COPY3]], [[COPY2]], [[COPY4]], 7, killed [[SRLI]], 6 /* e64 */, 1 /* ta, mu */, implicit $frm
   ; CHECK-NEXT:   $v8 = COPY [[PseudoVFMUL_VV_M1_E64_MASK]]
   ; CHECK-NEXT:   PseudoRET implicit $v8
   %1 = call fast <vscale x 1 x double> @llvm.vp.fmul.nxv1f64(<vscale x 1 x double> %x, <vscale x 1 x double> %y, <vscale x 1 x i1> %m, i32 %vl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
index c73c2004834db..ece457a09dbdf 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
@@ -11,7 +11,7 @@ body: |
     ; CHECK: liveins: $x1, $v8, $v9
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %false:vrnov0 = COPY $v8
-    ; CHECK-NEXT: %true:vrnov0 = COPY $v9
+    ; CHECK-NEXT: %true:vmnov0 = COPY $v9
     ; CHECK-NEXT: %avl:gprnox0 = COPY $x1
     ; CHECK-NEXT: %mask:vmv0 = PseudoVMSET_M_B8 %avl, 0 /* e8 */
     ; CHECK-NEXT: $v0 = COPY %mask
@@ -135,7 +135,7 @@ body: |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %false:vrnov0 = COPY $v8
     ; CHECK-NEXT: %mask:vmv0 = COPY $v0
-    ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 1 /* ta, mu */
+    ; CHECK-NEXT: %true:vmnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 1 /* ta, mu */
     %false:vrnov0 = COPY $v8
     %mask:vmv0 = COPY $v0
     %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll
index f087efcc5f57b..d3649ef4b6664 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll
@@ -15,7 +15,7 @@ define <vscale x 1 x i8> @strided_vpload_nxv1i8_i8(ptr %ptr, i8 signext %stride,
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x11
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gpr = COPY $x10
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vmv0 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[PseudoVLSE8_V_MF8_MASK:%[0-9]+]]:vrnov0 = PseudoVLSE8_V_MF8_MASK $noreg, [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 1 /* ta, mu */ :: (load unknown-size, align 1)
+  ; CHECK-NEXT:   [[PseudoVLSE8_V_MF8_MASK:%[0-9]+]]:vmnov0 = PseudoVLSE8_V_MF8_MASK $noreg, [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 1 /* ta, mu */ :: (load unknown-size, align 1)
   ; CHECK-NEXT:   $v8 = COPY [[PseudoVLSE8_V_MF8_MASK]]
   ; CHECK-NEXT:   PseudoRET implicit $v8
   %load = call <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr %ptr, i8 %stride, <vscale x 1 x i1> %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
index 6b6276b838fba..7cbaceae858b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
@@ -42,9 +42,9 @@ define i64 @test_vleff_nxv8i8_mask(<vscale x 8 x i8> %maskedoff, ptr %p, <vscale
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprnox0 = COPY $x11
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vr = COPY $v0
   ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vrnov0 = COPY $v8
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vmnov0 = COPY $v8
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:vmv0 = COPY [[COPY1]]
-  ; CHECK-NEXT:   [[PseudoVLE8FF_V_M1_MASK:%[0-9]+]]:vrnov0, [[PseudoVLE8FF_V_M1_MASK1:%[0-9]+]]:gpr = PseudoVLE8FF_V_M1_MASK [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 0 /* tu, mu */ :: (load unknown-size from %ir.p, align 1)
+  ; CHECK-NEXT:   [[PseudoVLE8FF_V_M1_MASK:%[0-9]+]]:vmnov0, [[PseudoVLE8FF_V_M1_MASK1:%[0-9]+]]:gpr = PseudoVLE8FF_V_M1_MASK [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 0 /* tu, mu */ :: (load unknown-size from %ir.p, align 1)
   ; CHECK-NEXT:   $x10 = COPY [[PseudoVLE8FF_V_M1_MASK1]]
   ; CHECK-NEXT:   PseudoRET implicit $x10
 entry:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmerge-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/vmerge-peephole.mir
index 81a271bd975e3..bc78a7732c15a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmerge-peephole.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vmerge-peephole.mir
@@ -148,8 +148,8 @@ body: |
     ; CHECK-NEXT: %y:vr = COPY $v9
     ; CHECK-NEXT: %mask:vmv0 = COPY $v0
     ; CHECK-NEXT: %add0:vr = PseudoVADD_VV_M1 $noreg, %x, %y, -1, 5 /* e32 */, 3 /* ta, ma */
-    ; CHECK-NEXT: %add1:vrnov0 = COPY %add:vrnov0
-    ; CHECK-NEXT: %merge:vrnov0 = PseudoVOR_VV_M1_MASK %add:vrnov0, %add1, %y, %mask, -1, 5 /* e32 */, 1 /* ta, mu */
+    ; CHECK-NEXT: %add1:vmnov0 = COPY %add:vmnov0
+    ; CHECK-NEXT: %merge:vrnov0 = PseudoVOR_VV_M1_MASK %add:vmnov0, %add1, %y, %mask, -1, 5 /* e32 */, 1 /* ta, mu */
     %x:vr = COPY $v8
     %y:vr = COPY $v9
     %mask:vmv0 = COPY $v0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir
index 68e74ff6ba05b..3f551ba91b3a7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vmv.v.v-peephole.mir
@@ -112,7 +112,7 @@ body: |
     ; CHECK-LABEL: name: diff_regclass
     ; CHECK: liveins: $v8
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[PseudoVMV_V_I_MF2_:%[0-9]+]]:vrnov0 = PseudoVMV_V_I_MF2 $noreg, 0, 0, 5 /* e32 */, 1 /* ta, mu */
+    ; CHECK-NEXT: [[PseudoVMV_V_I_MF2_:%[0-9]+]]:vmnov0 = PseudoVMV_V_I_MF2 $noreg, 0, 0, 5 /* e32 */, 1 /* ta, mu */
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vmv0 = COPY $v8
     ; CHECK-NEXT: [[PseudoVADD_VV_M1_MASK:%[0-9]+]]:vrnov0 = PseudoVADD_VV_M1_MASK [[PseudoVMV_V_I_MF2_]], $noreg, $noreg, [[COPY]], 0, 5 /* e32 */, 0 /* tu, mu */
     %0:vr = PseudoVMV_V_I_MF2 $noreg, 0, -1, 5 /* e32 */, 0 /* tu, mu */
@@ -128,7 +128,7 @@ body: |
     ; CHECK-LABEL: name: diff_regclass_passthru
     ; CHECK: liveins: $v8
     ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[PseudoVMV_V_I_MF2_:%[0-9]+]]:vrnov0 = PseudoVMV_V_I_MF2 $noreg, 0, 0, 5 /* e32 */, 1 /* ta, mu */
+    ; CHECK-NEXT: [[PseudoVMV_V_I_MF2_:%[0-9]+]]:vmnov0 = PseudoVMV_V_I_MF2 $noreg, 0, 0, 5 /* e32 */, 1 /* ta, mu */
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:vmv0 = COPY $v8
     ; CHECK-NEXT: [[PseudoVLSE32_V_MF2_MASK:%[0-9]+]]:vrnov0 = PseudoVLSE32_V_MF2_MASK [[PseudoVMV_V_I_MF2_]], $noreg, $noreg, [[COPY]], 0, 5 /* e32 */, 0 /* tu, mu */ :: (load unknown-size, align 4)
     %2:vr = PseudoVMV_V_I_MF2 $noreg, 0, -1, 5 /* e32 */, 0 /* tu, mu */
@@ -162,7 +162,7 @@ body: |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: %passthru:vrnov0 = COPY $v8
     ; CHECK-NEXT: %mask:vmv0 = COPY $v0
-    ; CHECK-NEXT: %x:vrnov0 = PseudoVMERGE_VVM_M1 %passthru, %passthru, $noreg, %mask, 4, 5 /* e32 */
+    ; CHECK-NEXT: %x:vmnov0 = PseudoVMERGE_VVM_M1 %passthru, %passthru, $noreg, %mask, 4, 5 /* e32 */
     %passthru:vrnov0 = COPY $v8
     %mask:vmv0 = COPY $v0
     %x:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %passthru, $noreg, %mask, 4, 5 /* e32 */
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
index a35100654432c..0b242abeb035c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.mir
@@ -793,7 +793,7 @@ body:             |
   ; CHECK-NEXT:   %idxs:vr = COPY $v0
   ; CHECK-NEXT:   %t1:vr = COPY $v1
   ; CHECK-NEXT:   %t3:vr = COPY $v2
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vrnov0 = COPY $v3
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vmnov0 = COPY $v3
   ; CHECK-NEXT:   %t5:vrnov0 = COPY $v1
   ; CHECK-NEXT:   dead [[PseudoVSETVLIX0_:%[0-9]+]]:gprnox0 = PseudoVSETVLIX0 killed $x0, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
   ; CHECK-NEXT:   %t6:vr = PseudoVMSEQ_VI_M1 %t1, 0, -1, 6 /* e64 */, implicit $vl, implicit $vtype
@@ -811,7 +811,7 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   $v0 = COPY %mask
   ; CHECK-NEXT:   dead $x0 = PseudoVSETVLIX0X0 killed $x0, 69 /* e8, mf8, ta, mu */, implicit-def $vl, implicit-def $vtype, implicit $vl
-  ; CHECK-NEXT:   early-clobber [[COPY]]:vrnov0 = PseudoVLUXEI64_V_M1_MF8_MASK %t5, %inaddr, %idxs, $v0, -1, 3 /* e8 */, 1 /* ta, mu */, implicit $vl, implicit $vtype
+  ; CHECK-NEXT:   early-clobber [[COPY]]:vmnov0 = PseudoVLUXEI64_V_M1_MF8_MASK %t5, %inaddr, %idxs, $v0, -1, 3 /* e8 */, 1 /* ta, mu */, implicit $vl, implicit $vtype
   ; CHECK-NEXT:   PseudoBR %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:

From 632cbee244f31854c3d7320033471772a7a2f453 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Tue, 9 Dec 2025 08:52:55 -0800
Subject: [PATCH 22/85] [RISCV] Use VM and VMNoV0 for "vr" and "vd" inline asm
 constraints with mask type. (#171235)

The inline assembly handling in SelectionDAG uses the first type
for the register class as the type at the input/output of the
inlineassembly. If this isn't the type for the surrounding DAG,
it needs to be converted.

nxv8i8 is the first type for the VR and VRNoV0 register classes.
So we currently generate insert/extract_subvector and bitcasts to
convert to/from nxv8i8.

I believe some of the special casing we have for this in
splitValueIntoRegisterParts and joinRegisterPartsIntoValue is causing
us to also generate incorrect code for arguments with nxv16i4 types
that should be any extended to nxv16i8. Instead we widen them to nxv32i4
and bitcast to nxv16i8.

This patch uses VM and VMNoV0 for masks which has nxv64i1 as their
first type. This means we will only emit an insert/extract_subvector
without any bitcasts. This will allow me to fix
splitValueIntoRegisterParts and joinRegisterPartsIntoValue to fix the
nxv16i4 argument issue without breaking inline assembly.

I may need to add more register classes to cover fractional LMULs,
but I'm not sure yet.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 32 +++++++++++----------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index b74dba12e8959..7cbb9c0da4874 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -24323,14 +24323,15 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       break;
     }
   } else if (Constraint == "vr") {
+    // Check VM first so that mask types will use that instead of VR.
     for (const auto *RC :
-         {&RISCV::VRRegClass, &RISCV::VRM2RegClass, &RISCV::VRM4RegClass,
-          &RISCV::VRM8RegClass, &RISCV::VRN2M1RegClass, &RISCV::VRN3M1RegClass,
-          &RISCV::VRN4M1RegClass, &RISCV::VRN5M1RegClass,
-          &RISCV::VRN6M1RegClass, &RISCV::VRN7M1RegClass,
-          &RISCV::VRN8M1RegClass, &RISCV::VRN2M2RegClass,
-          &RISCV::VRN3M2RegClass, &RISCV::VRN4M2RegClass,
-          &RISCV::VRN2M4RegClass}) {
+         {&RISCV::VMRegClass, &RISCV::VRRegClass, &RISCV::VRM2RegClass,
+          &RISCV::VRM4RegClass, &RISCV::VRM8RegClass, &RISCV::VRN2M1RegClass,
+          &RISCV::VRN3M1RegClass, &RISCV::VRN4M1RegClass,
+          &RISCV::VRN5M1RegClass, &RISCV::VRN6M1RegClass,
+          &RISCV::VRN7M1RegClass, &RISCV::VRN8M1RegClass,
+          &RISCV::VRN2M2RegClass, &RISCV::VRN3M2RegClass,
+          &RISCV::VRN4M2RegClass, &RISCV::VRN2M4RegClass}) {
       if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy))
         return std::make_pair(0U, RC);
 
@@ -24341,15 +24342,16 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
       }
     }
   } else if (Constraint == "vd") {
+    // Check VMNoV0 first so that mask types will use that instead of VRNoV0.
     for (const auto *RC :
-         {&RISCV::VRNoV0RegClass, &RISCV::VRM2NoV0RegClass,
-          &RISCV::VRM4NoV0RegClass, &RISCV::VRM8NoV0RegClass,
-          &RISCV::VRN2M1NoV0RegClass, &RISCV::VRN3M1NoV0RegClass,
-          &RISCV::VRN4M1NoV0RegClass, &RISCV::VRN5M1NoV0RegClass,
-          &RISCV::VRN6M1NoV0RegClass, &RISCV::VRN7M1NoV0RegClass,
-          &RISCV::VRN8M1NoV0RegClass, &RISCV::VRN2M2NoV0RegClass,
-          &RISCV::VRN3M2NoV0RegClass, &RISCV::VRN4M2NoV0RegClass,
-          &RISCV::VRN2M4NoV0RegClass}) {
+         {&RISCV::VMNoV0RegClass, &RISCV::VRNoV0RegClass,
+          &RISCV::VRM2NoV0RegClass, &RISCV::VRM4NoV0RegClass,
+          &RISCV::VRM8NoV0RegClass, &RISCV::VRN2M1NoV0RegClass,
+          &RISCV::VRN3M1NoV0RegClass, &RISCV::VRN4M1NoV0RegClass,
+          &RISCV::VRN5M1NoV0RegClass, &RISCV::VRN6M1NoV0RegClass,
+          &RISCV::VRN7M1NoV0RegClass, &RISCV::VRN8M1NoV0RegClass,
+          &RISCV::VRN2M2NoV0RegClass, &RISCV::VRN3M2NoV0RegClass,
+          &RISCV::VRN4M2NoV0RegClass, &RISCV::VRN2M4NoV0RegClass}) {
       if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy))
         return std::make_pair(0U, RC);
 

From b3b033bf7350e6a7cd581caa21f18471aad65c0f Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Tue, 9 Dec 2025 09:00:57 -0800
Subject: [PATCH 23/85] [CIR][NFC] Fix bad switch fallthroughs in emitStmt
 (#171224)

This moves a couple of statement emitters that were incorrectly
implemented in the middle of a switch statement where all cases in the
final group are intended to fall through to a handler that emits an NYI
error message. The placement of these implementations was causing some
statement types that should have emitted the NYI error to instead go to
a handler for a different statement type.
---
 clang/lib/CIR/CodeGen/CIRGenStmt.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
index da7ab0691cb63..f13e7cb32c71e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
@@ -159,6 +159,10 @@ mlir::LogicalResult CIRGenFunction::emitStmt(const Stmt *s,
     return emitCXXTryStmt(cast<CXXTryStmt>(*s));
   case Stmt::CXXForRangeStmtClass:
     return emitCXXForRangeStmt(cast<CXXForRangeStmt>(*s), attr);
+  case Stmt::CoroutineBodyStmtClass:
+    return emitCoroutineBody(cast<CoroutineBodyStmt>(*s));
+  case Stmt::IndirectGotoStmtClass:
+    return emitIndirectGotoStmt(cast<IndirectGotoStmt>(*s));
   case Stmt::OpenACCComputeConstructClass:
     return emitOpenACCComputeConstruct(cast<OpenACCComputeConstruct>(*s));
   case Stmt::OpenACCLoopConstructClass:
@@ -199,11 +203,7 @@ mlir::LogicalResult CIRGenFunction::emitStmt(const Stmt *s,
   case Stmt::CaseStmtClass:
   case Stmt::SEHLeaveStmtClass:
   case Stmt::SYCLKernelCallStmtClass:
-  case Stmt::CoroutineBodyStmtClass:
-    return emitCoroutineBody(cast<CoroutineBodyStmt>(*s));
   case Stmt::CoreturnStmtClass:
-  case Stmt::IndirectGotoStmtClass:
-    return emitIndirectGotoStmt(cast<IndirectGotoStmt>(*s));
   case Stmt::OMPParallelDirectiveClass:
   case Stmt::OMPTaskwaitDirectiveClass:
   case Stmt::OMPTaskyieldDirectiveClass:

From e6145e870977babfc599d8339675a2b8c56da730 Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Tue, 9 Dec 2025 09:01:16 -0800
Subject: [PATCH 24/85] [CIR][NFC] Add stubs for missing visitors in
 ScalarExprEmitter (#171222)

This adds stubs that issue NYI errors for any visitor that is present in
the ClangIR incubator but missing in the upstream implementation. This
will make it easier to find to correct locations to implement missing
functionality.
---
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 170 ++++++++++++++++++++-
 1 file changed, 169 insertions(+), 1 deletion(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 9043ecab42f15..6820e2a403288 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -139,6 +139,11 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
     return {};
   }
 
+  mlir::Value VisitConstantExpr(ConstantExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: constant expr");
+    return {};
+  }
+
   mlir::Value VisitPackIndexingExpr(PackIndexingExpr *e) {
     return Visit(e->getSelectedExpr());
   }
@@ -159,6 +164,14 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
   mlir::Value VisitCoawaitExpr(CoawaitExpr *s) {
     return cgf.emitCoawaitExpr(*s).getValue();
   }
+  mlir::Value VisitCoyieldExpr(CoyieldExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: coyield");
+    return {};
+  }
+  mlir::Value VisitUnaryCoawait(const UnaryOperator *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: unary coawait");
+    return {};
+  }
 
   mlir::Value emitLoadOfLValue(LValue lv, SourceLocation loc) {
     return cgf.emitLoadOfLValue(lv, loc).getValue();
@@ -198,6 +211,12 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
                                    cir::IntAttr::get(type, e->getValue()));
   }
 
+  mlir::Value VisitFixedPointLiteral(const FixedPointLiteral *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(),
+                     "ScalarExprEmitter: fixed point literal");
+    return {};
+  }
+
   mlir::Value VisitFloatingLiteral(const FloatingLiteral *e) {
     mlir::Type type = cgf.convertType(e->getType());
     assert(mlir::isa<cir::FPTypeInterface>(type) &&
@@ -229,6 +248,23 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
 
   mlir::Value VisitOffsetOfExpr(OffsetOfExpr *e);
 
+  mlir::Value VisitSizeOfPackExpr(SizeOfPackExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: size of pack");
+    return {};
+  }
+  mlir::Value VisitPseudoObjectExpr(PseudoObjectExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: pseudo object");
+    return {};
+  }
+  mlir::Value VisitSYCLUniqueStableNameExpr(SYCLUniqueStableNameExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(),
+                     "ScalarExprEmitter: sycl unique stable name");
+    return {};
+  }
+  mlir::Value VisitEmbedExpr(EmbedExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: embed");
+    return {};
+  }
   mlir::Value VisitOpaqueValueExpr(OpaqueValueExpr *e) {
     if (e->isGLValue())
       return emitLoadOfLValue(cgf.getOrCreateOpaqueLValueMapping(e),
@@ -238,6 +274,38 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
     return cgf.getOrCreateOpaqueRValueMapping(e).getValue();
   }
 
+  mlir::Value VisitObjCSelectorExpr(ObjCSelectorExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc selector");
+    return {};
+  }
+  mlir::Value VisitObjCProtocolExpr(ObjCProtocolExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc protocol");
+    return {};
+  }
+  mlir::Value VisitObjCIVarRefExpr(ObjCIvarRefExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc ivar ref");
+    return {};
+  }
+  mlir::Value VisitObjCMessageExpr(ObjCMessageExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc message");
+    return {};
+  }
+  mlir::Value VisitObjCIsaExpr(ObjCIsaExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc isa");
+    return {};
+  }
+  mlir::Value VisitObjCAvailabilityCheckExpr(ObjCAvailabilityCheckExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(),
+                     "ScalarExprEmitter: objc availability check");
+    return {};
+  }
+
+  mlir::Value VisitMatrixSubscriptExpr(MatrixSubscriptExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(),
+                     "ScalarExprEmitter: matrix subscript");
+    return {};
+  }
+
   mlir::Value VisitCastExpr(CastExpr *e);
   mlir::Value VisitCallExpr(const CallExpr *e);
 
@@ -319,6 +387,18 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
 
   mlir::Value VisitInitListExpr(InitListExpr *e);
 
+  mlir::Value VisitArrayInitIndexExpr(ArrayInitIndexExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(),
+                     "ScalarExprEmitter: array init index");
+    return {};
+  }
+
+  mlir::Value VisitImplicitValueInitExpr(const ImplicitValueInitExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(),
+                     "ScalarExprEmitter: implicit value init");
+    return {};
+  }
+
   mlir::Value VisitExplicitCastExpr(ExplicitCastExpr *e) {
     return VisitCastExpr(e);
   }
@@ -726,6 +806,16 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
     return Visit(e->getSubExpr());
   }
 
+  // C++
+  mlir::Value VisitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(),
+                     "ScalarExprEmitter: materialize temporary");
+    return {};
+  }
+  mlir::Value VisitSourceLocExpr(SourceLocExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: source loc");
+    return {};
+  }
   mlir::Value VisitCXXDefaultArgExpr(CXXDefaultArgExpr *dae) {
     CIRGenFunction::CXXDefaultArgExprScope scope(cgf, dae);
     return Visit(dae->getExpr());
@@ -745,11 +835,43 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
     cgf.emitCXXDeleteExpr(e);
     return {};
   }
-
+  mlir::Value VisitTypeTraitExpr(const TypeTraitExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: type trait");
+    return {};
+  }
+  mlir::Value
+  VisitConceptSpecializationExpr(const ConceptSpecializationExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(),
+                     "ScalarExprEmitter: concept specialization");
+    return {};
+  }
+  mlir::Value VisitRequiresExpr(const RequiresExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: requires");
+    return {};
+  }
+  mlir::Value VisitArrayTypeTraitExpr(const ArrayTypeTraitExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(),
+                     "ScalarExprEmitter: array type trait");
+    return {};
+  }
+  mlir::Value VisitExpressionTraitExpr(const ExpressionTraitExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(),
+                     "ScalarExprEmitter: expression trait");
+    return {};
+  }
+  mlir::Value VisitCXXPseudoDestructorExpr(const CXXPseudoDestructorExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(),
+                     "ScalarExprEmitter: cxx pseudo destructor");
+    return {};
+  }
   mlir::Value VisitCXXThrowExpr(const CXXThrowExpr *e) {
     cgf.emitCXXThrowExpr(e);
     return {};
   }
+  mlir::Value VisitCXXNoexceptExpr(CXXNoexceptExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: cxx noexcept");
+    return {};
+  }
 
   /// Emit a conversion from the specified type to the specified destination
   /// type, both of which are CIR scalar types.
@@ -1213,6 +1335,52 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
     return maybePromoteBoolResult(resOp.getResult(), resTy);
   }
 
+  mlir::Value VisitBinPtrMemD(const BinaryOperator *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: ptr mem d");
+    return {};
+  }
+
+  mlir::Value VisitBinPtrMemI(const BinaryOperator *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: ptr mem i");
+    return {};
+  }
+
+  // Other Operators.
+  mlir::Value VisitBlockExpr(const BlockExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: block");
+    return {};
+  }
+
+  mlir::Value VisitChooseExpr(ChooseExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: choose");
+    return {};
+  }
+
+  mlir::Value VisitObjCStringLiteral(const ObjCStringLiteral *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(),
+                     "ScalarExprEmitter: objc string literal");
+    return {};
+  }
+  mlir::Value VisitObjCBoxedExpr(ObjCBoxedExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc boxed");
+    return {};
+  }
+  mlir::Value VisitObjCArrayLiteral(ObjCArrayLiteral *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(),
+                     "ScalarExprEmitter: objc array literal");
+    return {};
+  }
+  mlir::Value VisitObjCDictionaryLiteral(ObjCDictionaryLiteral *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(),
+                     "ScalarExprEmitter: objc dictionary literal");
+    return {};
+  }
+
+  mlir::Value VisitAsTypeExpr(AsTypeExpr *e) {
+    cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: as type");
+    return {};
+  }
+
   mlir::Value VisitAtomicExpr(AtomicExpr *e) {
     return cgf.emitAtomicExpr(e).getValue();
   }

From 9b12f8fcaeb4e9f8a03de6e982e94e525a9a4dc6 Mon Sep 17 00:00:00 2001
From: nerix <nerixdev@outlook.de>
Date: Tue, 9 Dec 2025 18:06:26 +0100
Subject: [PATCH 25/85] [LLDB] Run MSVC STL smart pointer tests with PDB
 (#166946)

Runs the `std::shared/unique_ptr` tests with PDB with two changes:

- PDB uses the "full" name, so `std::string` is `std::basic_string<char,
std::char_traits<char>, std::allocator<char>>`
- The type of the pointer inside the shared/unique_ptr isn't the
`element_type` typedef
---
 .../shared_ptr/TestDataFormatterStdSharedPtr.py    | 14 ++++++++++++--
 .../unique_ptr/TestDataFormatterStdUniquePtr.py    | 12 +++++++++++-
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/shared_ptr/TestDataFormatterStdSharedPtr.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/shared_ptr/TestDataFormatterStdSharedPtr.py
index d71fbf8d5f81a..fa03fc14dfb83 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/shared_ptr/TestDataFormatterStdSharedPtr.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/shared_ptr/TestDataFormatterStdSharedPtr.py
@@ -9,6 +9,8 @@
 
 
 class TestCase(TestBase):
+    TEST_WITH_PDB_DEBUG_INFO = True
+
     def do_test(self):
         """Test `frame variable` output for `std::shared_ptr` types."""
         (_, process, _, bkpt) = lldbutil.run_to_source_breakpoint(
@@ -62,7 +64,7 @@ def do_test(self):
         valobj = self.expect_var_path("sp_user", type="std::shared_ptr<User>")
         self.assertRegex(
             valobj.summary,
-            "element_type @ 0x0*[1-9a-f][0-9a-f]+( strong=1)? weak=0",
+            f"{'User' if self.getDebugInfo() == 'pdb' else 'element_type'} @ 0x0*[1-9a-f][0-9a-f]+( strong=1)? weak=0",
         )
         self.assertNotEqual(valobj.child[0].unsigned, 0)
 
@@ -77,7 +79,15 @@ def do_test(self):
         self.assertEqual(str(valobj), '(User) *pointer = (id = 30, name = "steph")')
 
         self.expect_var_path("sp_user->id", type="int", value="30")
-        self.expect_var_path("sp_user->name", type="std::string", summary='"steph"')
+        self.expect_var_path(
+            "sp_user->name",
+            type=(
+                "std::basic_string<char, std::char_traits<char>, std::allocator<char>>"
+                if self.getDebugInfo() == "pdb"
+                else "std::string"
+            ),
+            summary='"steph"',
+        )
 
         valobj = self.expect_var_path(
             "si", type="std::shared_ptr<int>", summary="47 strong=2 weak=0"
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unique_ptr/TestDataFormatterStdUniquePtr.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unique_ptr/TestDataFormatterStdUniquePtr.py
index 0b68b1b532bb0..1516db698798d 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unique_ptr/TestDataFormatterStdUniquePtr.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unique_ptr/TestDataFormatterStdUniquePtr.py
@@ -9,6 +9,8 @@
 
 
 class TestCase(TestBase):
+    TEST_WITH_PDB_DEBUG_INFO = True
+
     def do_test(self):
         """Test `frame variable` output for `std::unique_ptr` types."""
 
@@ -84,7 +86,15 @@ def do_test(self):
         self.assertNotEqual(valobj.child[0].unsigned, 0)
 
         self.expect_var_path("up_user->id", type="int", value="30")
-        self.expect_var_path("up_user->name", type="std::string", summary='"steph"')
+        self.expect_var_path(
+            "up_user->name",
+            type=(
+                "std::basic_string<char, std::char_traits<char>, std::allocator<char>>"
+                if self.getDebugInfo() == "pdb"
+                else "std::string"
+            ),
+            summary='"steph"',
+        )
 
         self.runCmd("settings set target.experimental.use-DIL true")
         self.expect_var_path("ptr_node->value", value="1")

From 1bada0af22d878b2ec0cfefd655c09b801b44918 Mon Sep 17 00:00:00 2001
From: Rajat Bajpai <rbajpai@nvidia.com>
Date: Tue, 9 Dec 2025 22:39:11 +0530
Subject: [PATCH 26/85] [NVPTX] Add IR pass for FMA transformation in the llc
 pipeline (#154735)

This change introduces a new IR pass in the llc pipeline for NVPTX that
transforms sequences of FMUL followed by FADD or FSUB into a single FMA
instruction.

Currently, all FMA folding for NVPTX occurs at the DAGCombine stage,
which is too late for any IR-level passes that might want to optimize or
analyze FMAs. By moving this transformation earlier into the IR phase,
we enable more opportunities for FMA folding, including across basic
blocks.

Additionally, this new pass relies on the contract instruction level
fast-math flag to perform these transformations, rather than depending
on the -fp-contract=fast or -enable-unsafe-fp-math options passed to
llc.
---
 llvm/lib/Target/NVPTX/CMakeLists.txt         |   1 +
 llvm/lib/Target/NVPTX/NVPTX.h                |   6 +
 llvm/lib/Target/NVPTX/NVPTXIRPeephole.cpp    | 167 +++++++++++++
 llvm/lib/Target/NVPTX/NVPTXPassRegistry.def  |   1 +
 llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp |  10 +
 llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll    | 247 +++++++++++++++++++
 6 files changed, 432 insertions(+)
 create mode 100644 llvm/lib/Target/NVPTX/NVPTXIRPeephole.cpp
 create mode 100644 llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll

diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt
index f9c24750c4836..6fe58c25c757d 100644
--- a/llvm/lib/Target/NVPTX/CMakeLists.txt
+++ b/llvm/lib/Target/NVPTX/CMakeLists.txt
@@ -18,6 +18,7 @@ set(NVPTXCodeGen_sources
   NVPTXAssignValidGlobalNames.cpp
   NVPTXAtomicLower.cpp
   NVPTXCtorDtorLowering.cpp
+  NVPTXIRPeephole.cpp
   NVPTXForwardParams.cpp
   NVPTXFrameLowering.cpp
   NVPTXGenericToNVVM.cpp
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 95fd05f2a926f..210624fbb235c 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -52,6 +52,7 @@ FunctionPass *createNVPTXLowerAllocaPass();
 FunctionPass *createNVPTXLowerUnreachablePass(bool TrapUnreachable,
                                               bool NoTrapAfterNoreturn);
 FunctionPass *createNVPTXTagInvariantLoadsPass();
+FunctionPass *createNVPTXIRPeepholePass();
 MachineFunctionPass *createNVPTXPeephole();
 MachineFunctionPass *createNVPTXProxyRegErasurePass();
 MachineFunctionPass *createNVPTXForwardParamsPass();
@@ -75,12 +76,17 @@ void initializeNVPTXAAWrapperPassPass(PassRegistry &);
 void initializeNVPTXExternalAAWrapperPass(PassRegistry &);
 void initializeNVPTXPeepholePass(PassRegistry &);
 void initializeNVPTXTagInvariantLoadLegacyPassPass(PassRegistry &);
+void initializeNVPTXIRPeepholePass(PassRegistry &);
 void initializeNVPTXPrologEpilogPassPass(PassRegistry &);
 
 struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 
+struct NVPTXIRPeepholePass : PassInfoMixin<NVPTXIRPeepholePass> {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
 struct NVVMReflectPass : PassInfoMixin<NVVMReflectPass> {
   NVVMReflectPass() : SmVersion(0) {}
   NVVMReflectPass(unsigned SmVersion) : SmVersion(SmVersion) {}
diff --git a/llvm/lib/Target/NVPTX/NVPTXIRPeephole.cpp b/llvm/lib/Target/NVPTX/NVPTXIRPeephole.cpp
new file mode 100644
index 0000000000000..bd16c7213b1e7
--- /dev/null
+++ b/llvm/lib/Target/NVPTX/NVPTXIRPeephole.cpp
@@ -0,0 +1,167 @@
+//===------ NVPTXIRPeephole.cpp - NVPTX IR Peephole --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements IR-level peephole optimizations. These transformations
+// run late in the NVPTX IR pass pipeline just before the instruction selection.
+//
+// Currently, it implements the following transformation(s):
+// 1. FMA folding (float/double types):
+//    Transforms FMUL+FADD/FSUB sequences into FMA intrinsics when the
+//    'contract' fast-math flag is present. Supported patterns:
+//    - fadd(fmul(a, b), c) => fma(a, b, c)
+//    - fadd(c, fmul(a, b)) => fma(a, b, c)
+//    - fadd(fmul(a, b), fmul(c, d)) => fma(a, b, fmul(c, d))
+//    - fsub(fmul(a, b), c) => fma(a, b, fneg(c))
+//    - fsub(a, fmul(b, c)) => fma(fneg(b), c, a)
+//    - fsub(fmul(a, b), fmul(c, d)) => fma(a, b, fneg(fmul(c, d)))
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTXUtilities.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+
+#define DEBUG_TYPE "nvptx-ir-peephole"
+
+using namespace llvm;
+
+static bool tryFoldBinaryFMul(BinaryOperator *BI) {
+  Value *Op0 = BI->getOperand(0);
+  Value *Op1 = BI->getOperand(1);
+
+  auto *FMul0 = dyn_cast<BinaryOperator>(Op0);
+  auto *FMul1 = dyn_cast<BinaryOperator>(Op1);
+
+  BinaryOperator *FMul = nullptr;
+  Value *OtherOperand = nullptr;
+  bool IsFirstOperand = false;
+
+  // Either Op0 or Op1 should be a valid FMul
+  if (FMul0 && FMul0->getOpcode() == Instruction::FMul && FMul0->hasOneUse() &&
+      FMul0->hasAllowContract()) {
+    FMul = FMul0;
+    OtherOperand = Op1;
+    IsFirstOperand = true;
+  } else if (FMul1 && FMul1->getOpcode() == Instruction::FMul &&
+             FMul1->hasOneUse() && FMul1->hasAllowContract()) {
+    FMul = FMul1;
+    OtherOperand = Op0;
+    IsFirstOperand = false;
+  } else {
+    return false;
+  }
+
+  bool IsFSub = BI->getOpcode() == Instruction::FSub;
+  LLVM_DEBUG({
+    const char *OpName = IsFSub ? "FSub" : "FAdd";
+    dbgs() << "Found " << OpName << " with FMul (single use) as "
+           << (IsFirstOperand ? "first" : "second") << " operand: " << *BI
+           << "\n";
+  });
+
+  Value *MulOp0 = FMul->getOperand(0);
+  Value *MulOp1 = FMul->getOperand(1);
+  IRBuilder<> Builder(BI);
+  Value *FMA = nullptr;
+
+  if (!IsFSub) {
+    // fadd(fmul(a, b), c) => fma(a, b, c)
+    // fadd(c, fmul(a, b)) => fma(a, b, c)
+    FMA = Builder.CreateIntrinsic(Intrinsic::fma, {BI->getType()},
+                                  {MulOp0, MulOp1, OtherOperand});
+  } else {
+    if (IsFirstOperand) {
+      // fsub(fmul(a, b), c) => fma(a, b, fneg(c))
+      Value *NegOtherOp =
+          Builder.CreateFNegFMF(OtherOperand, BI->getFastMathFlags());
+      FMA = Builder.CreateIntrinsic(Intrinsic::fma, {BI->getType()},
+                                    {MulOp0, MulOp1, NegOtherOp});
+    } else {
+      // fsub(a, fmul(b, c)) => fma(fneg(b), c, a)
+      Value *NegMulOp0 =
+          Builder.CreateFNegFMF(MulOp0, FMul->getFastMathFlags());
+      FMA = Builder.CreateIntrinsic(Intrinsic::fma, {BI->getType()},
+                                    {NegMulOp0, MulOp1, OtherOperand});
+    }
+  }
+
+  // Combine fast-math flags from the original instructions
+  auto *FMAInst = cast<Instruction>(FMA);
+  FastMathFlags BinaryFMF = BI->getFastMathFlags();
+  FastMathFlags FMulFMF = FMul->getFastMathFlags();
+  FastMathFlags NewFMF = FastMathFlags::intersectRewrite(BinaryFMF, FMulFMF) |
+                         FastMathFlags::unionValue(BinaryFMF, FMulFMF);
+  FMAInst->setFastMathFlags(NewFMF);
+
+  LLVM_DEBUG({
+    const char *OpName = IsFSub ? "FSub" : "FAdd";
+    dbgs() << "Replacing " << OpName << " with FMA: " << *FMA << "\n";
+  });
+  BI->replaceAllUsesWith(FMA);
+  BI->eraseFromParent();
+  FMul->eraseFromParent();
+  return true;
+}
+
+static bool foldFMA(Function &F) {
+  bool Changed = false;
+
+  // Iterate and process float/double FAdd/FSub instructions with allow-contract
+  for (auto &I : llvm::make_early_inc_range(instructions(F))) {
+    if (auto *BI = dyn_cast<BinaryOperator>(&I)) {
+      // Only FAdd and FSub are supported.
+      if (BI->getOpcode() != Instruction::FAdd &&
+          BI->getOpcode() != Instruction::FSub)
+        continue;
+
+      // At minimum, the instruction should have allow-contract.
+      if (!BI->hasAllowContract())
+        continue;
+
+      // Only float and double are supported.
+      if (!BI->getType()->isFloatTy() && !BI->getType()->isDoubleTy())
+        continue;
+
+      if (tryFoldBinaryFMul(BI))
+        Changed = true;
+    }
+  }
+  return Changed;
+}
+
+namespace {
+
+struct NVPTXIRPeephole : public FunctionPass {
+  static char ID;
+  NVPTXIRPeephole() : FunctionPass(ID) {}
+  bool runOnFunction(Function &F) override;
+};
+
+} // namespace
+
+char NVPTXIRPeephole::ID = 0;
+INITIALIZE_PASS(NVPTXIRPeephole, "nvptx-ir-peephole", "NVPTX IR Peephole",
+                false, false)
+
+bool NVPTXIRPeephole::runOnFunction(Function &F) { return foldFMA(F); }
+
+FunctionPass *llvm::createNVPTXIRPeepholePass() {
+  return new NVPTXIRPeephole();
+}
+
+PreservedAnalyses NVPTXIRPeepholePass::run(Function &F,
+                                           FunctionAnalysisManager &) {
+  if (!foldFMA(F))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
index ee37c9826012c..7d645bff7110f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
+++ b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
@@ -40,4 +40,5 @@ FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass())
 FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass())
 FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this))
 FUNCTION_PASS("nvptx-tag-invariant-loads", NVPTXTagInvariantLoadsPass())
+FUNCTION_PASS("nvptx-ir-peephole", NVPTXIRPeepholePass())
 #undef FUNCTION_PASS
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index a6837a482608c..74bae28044e66 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -51,6 +51,13 @@ static cl::opt<bool>
                                cl::desc("Disable load/store vectorizer"),
                                cl::init(false), cl::Hidden);
 
+// NVPTX IR Peephole is a new pass; this option will lets us turn it off in case
+// we encounter some issues.
+static cl::opt<bool>
+    DisableNVPTXIRPeephole("disable-nvptx-ir-peephole",
+                           cl::desc("Disable NVPTX IR Peephole"),
+                           cl::init(false), cl::Hidden);
+
 // TODO: Remove this flag when we are confident with no regressions.
 static cl::opt<bool> DisableRequireStructuredCFG(
     "disable-nvptx-require-structured-cfg",
@@ -115,6 +122,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() {
   initializeNVPTXExternalAAWrapperPass(PR);
   initializeNVPTXPeepholePass(PR);
   initializeNVPTXTagInvariantLoadLegacyPassPass(PR);
+  initializeNVPTXIRPeepholePass(PR);
   initializeNVPTXPrologEpilogPassPass(PR);
 }
 
@@ -379,6 +387,8 @@ void NVPTXPassConfig::addIRPasses() {
       addPass(createLoadStoreVectorizerPass());
     addPass(createSROAPass());
     addPass(createNVPTXTagInvariantLoadsPass());
+    if (!DisableNVPTXIRPeephole)
+      addPass(createNVPTXIRPeepholePass());
   }
 
   if (ST.hasPTXASUnreachableBug()) {
diff --git a/llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll b/llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll
new file mode 100644
index 0000000000000..6d9ad8d3ad436
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll
@@ -0,0 +1,247 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=nvptx-ir-peephole -S | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+; fsub(fmul(a, b), c) => fma(a, b, fneg(c))
+define float @test_fsub_fmul_c(float %a, float %b, float %c) {
+; CHECK-LABEL: define float @test_fsub_fmul_c(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg contract float [[C]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[TMP1]])
+; CHECK-NEXT:    ret float [[TMP2]]
+;
+  %mul = fmul contract float %a, %b
+  %sub = fsub contract float %mul, %c
+  ret float %sub
+}
+
+
+; fsub(c, fmul(a, b)) => fma(-a, b, c)
+define float @test_fsub_c_fmul(float %a, float %b, float %c) {
+; CHECK-LABEL: define float @test_fsub_c_fmul(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg contract float [[A]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[TMP1]], float [[B]], float [[C]])
+; CHECK-NEXT:    ret float [[TMP2]]
+;
+  %mul = fmul contract float %a, %b
+  %sub = fsub contract float %c, %mul
+  ret float %sub
+}
+
+
+; fsub(fmul(a, b), fmul(c, d)) => fma(a, b, fneg(fmul(c, d)))
+define float @test_fsub_fmul_fmul(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: define float @test_fsub_fmul_fmul(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]]) {
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul contract float [[C]], [[D]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg contract float [[MUL2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[TMP1]])
+; CHECK-NEXT:    ret float [[TMP2]]
+;
+  %mul1 = fmul contract float %a, %b
+  %mul2 = fmul contract float %c, %d
+  %sub = fsub contract float %mul1, %mul2
+  ret float %sub
+}
+
+
+; fsub(fmul(a, b), fmul(c, d)) => fma(fneg(c), d, fmul(a, b)))
+; fmul(a, b) has multiple uses.
+define float @test_fsub_fmul_fmul_multiple_use(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: define float @test_fsub_fmul_fmul_multiple_use(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]]) {
+; CHECK-NEXT:    [[MUL1:%.*]] = fmul contract float [[A]], [[B]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg contract float [[C]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[TMP1]], float [[D]], float [[MUL1]])
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP2]], [[MUL1]]
+; CHECK-NEXT:    ret float [[ADD]]
+;
+  %mul1 = fmul contract float %a, %b
+  %mul2 = fmul contract float %c, %d
+  %sub = fsub contract float %mul1, %mul2
+  %add = fadd float %sub, %mul1
+  ret float %add
+}
+
+
+; fsub(fmul(a, b), c) => fma(a, b, fneg(c)) where fsub and fmul are in different BBs
+define float @test_fsub_fmul_different_BB(float %a, float %b, float %c, i32 %n) {
+; CHECK-LABEL: define float @test_fsub_fmul_different_BB(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[INIT:.*]]:
+; CHECK-NEXT:    [[CMP_ITER:%.*]] = icmp sgt i32 [[N]], 10
+; CHECK-NEXT:    br i1 [[CMP_ITER]], label %[[ITERATION:.*]], label %[[EXIT:.*]]
+; CHECK:       [[ITERATION]]:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, %[[INIT]] ], [ [[I_NEXT:%.*]], %[[ITERATION]] ]
+; CHECK-NEXT:    [[ACC:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT:%.*]], %[[ITERATION]] ]
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[ACC_NEXT]] = fadd contract float [[ACC]], 1.000000e+00
+; CHECK-NEXT:    [[CMP_LOOP:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_LOOP]], label %[[ITERATION]], label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[C_PHI:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT]], %[[ITERATION]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = fneg contract float [[C_PHI]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[TMP0]])
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+init:
+  %mul = fmul contract float %a, %b
+  %cmp_iter = icmp sgt i32 %n, 10
+  br i1 %cmp_iter, label %iteration, label %exit
+
+iteration:
+  %i = phi i32 [ 0, %init ], [ %i_next, %iteration ]
+  %acc = phi float [ %c, %init ], [ %acc_next, %iteration ]
+  %i_next = add i32 %i, 1
+  %acc_next = fadd contract float %acc, 1.0
+  %cmp_loop = icmp slt i32 %i_next, %n
+  br i1 %cmp_loop, label %iteration, label %exit
+
+exit:
+  %c_phi = phi float [ %c, %init ], [ %acc_next, %iteration ]
+  %sub = fsub contract float %mul, %c_phi
+  ret float %sub
+}
+
+
+; fadd(fmul(a, b), c) => fma(a, b, c)
+define float @test_fadd_fmul_c(float %a, float %b, float %c) {
+; CHECK-LABEL: define float @test_fadd_fmul_c(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+  %mul = fmul contract float %a, %b
+  %add = fadd contract float %mul, %c
+  ret float %add
+}
+
+
+; fadd(c, fmul(a, b)) => fma(a, b, c)
+define float @test_fadd_c_fmul(float %a, float %b, float %c) {
+; CHECK-LABEL: define float @test_fadd_c_fmul(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[C]])
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+  %mul = fmul contract float %a, %b
+  %add = fadd contract float %c, %mul
+  ret float %add
+}
+
+
+; fadd(fmul(a, b), fmul(c, d)) => fma(a, b, fmul(c, d))
+define float @test_fadd_fmul_fmul(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: define float @test_fadd_fmul_fmul(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]]) {
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul contract float [[C]], [[D]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[MUL2]])
+; CHECK-NEXT:    ret float [[TMP1]]
+;
+  %mul1 = fmul contract float %a, %b
+  %mul2 = fmul contract float %c, %d
+  %add = fadd contract float %mul1, %mul2
+  ret float %add
+}
+
+
+; fadd(fmul(a, b), c) => fma(a, b, c) where fadd and fmul are in different BBs
+define float @test_fadd_fmul_different_BB(float %a, float %b, float %c, i32 %n) {
+; CHECK-LABEL: define float @test_fadd_fmul_different_BB(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[INIT:.*]]:
+; CHECK-NEXT:    [[CMP_ITER:%.*]] = icmp sgt i32 [[N]], 10
+; CHECK-NEXT:    br i1 [[CMP_ITER]], label %[[ITERATION:.*]], label %[[EXIT:.*]]
+; CHECK:       [[ITERATION]]:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, %[[INIT]] ], [ [[I_NEXT:%.*]], %[[ITERATION]] ]
+; CHECK-NEXT:    [[ACC:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT:%.*]], %[[ITERATION]] ]
+; CHECK-NEXT:    [[I_NEXT]] = add i32 [[I]], 1
+; CHECK-NEXT:    [[ACC_NEXT]] = fadd contract float [[ACC]], 1.000000e+00
+; CHECK-NEXT:    [[CMP_LOOP:%.*]] = icmp slt i32 [[I_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP_LOOP]], label %[[ITERATION]], label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[C_PHI:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT]], %[[ITERATION]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[C_PHI]])
+; CHECK-NEXT:    ret float [[TMP0]]
+;
+init:
+  %mul = fmul contract float %a, %b
+  %cmp_iter = icmp sgt i32 %n, 10
+  br i1 %cmp_iter, label %iteration, label %exit
+
+iteration:
+  %i = phi i32 [ 0, %init ], [ %i_next, %iteration ]
+  %acc = phi float [ %c, %init ], [ %acc_next, %iteration ]
+  %i_next = add i32 %i, 1
+  %acc_next = fadd contract float %acc, 1.0
+  %cmp_loop = icmp slt i32 %i_next, %n
+  br i1 %cmp_loop, label %iteration, label %exit
+
+exit:
+  %c_phi = phi float [ %c, %init ], [ %acc_next, %iteration ]
+  %add = fadd contract float %mul, %c_phi
+  ret float %add
+}
+
+
+; These scenarios shouldn't work.
+; fadd(fpext(fmul(a, b)), c) => fma(fpext(a), fpext(b), c)
+define double @test_fadd_fpext_fmul_c(float %a, float %b, double %c) {
+; CHECK-LABEL: define double @test_fadd_fpext_fmul_c(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], double [[C:%.*]]) {
+; CHECK-NEXT:    [[MUL:%.*]] = fmul contract float [[A]], [[B]]
+; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[MUL]] to double
+; CHECK-NEXT:    [[ADD:%.*]] = fadd contract double [[EXT]], [[C]]
+; CHECK-NEXT:    ret double [[ADD]]
+;
+  %mul = fmul contract float %a, %b
+  %ext = fpext float %mul to double
+  %add = fadd contract double %ext, %c
+  ret double %add
+}
+
+
+; fadd(c, fpext(fmul(a, b))) => fma(fpext(a), fpext(b), c)
+define double @test_fadd_c_fpext_fmul(float %a, float %b, double %c) {
+; CHECK-LABEL: define double @test_fadd_c_fpext_fmul(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], double [[C:%.*]]) {
+; CHECK-NEXT:    [[MUL:%.*]] = fmul contract float [[A]], [[B]]
+; CHECK-NEXT:    [[EXT:%.*]] = fpext float [[MUL]] to double
+; CHECK-NEXT:    [[ADD:%.*]] = fadd contract double [[C]], [[EXT]]
+; CHECK-NEXT:    ret double [[ADD]]
+;
+  %mul = fmul contract float %a, %b
+  %ext = fpext float %mul to double
+  %add = fadd contract double %c, %ext
+  ret double %add
+}
+
+
+; Double precision tests
+; fsub(fmul(a, b), c) => fma(a, b, fneg(c))
+define double @test_fsub_fmul_c_double(double %a, double %b, double %c) {
+; CHECK-LABEL: define double @test_fsub_fmul_c_double(
+; CHECK-SAME: double [[A:%.*]], double [[B:%.*]], double [[C:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg contract double [[C]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call contract double @llvm.fma.f64(double [[A]], double [[B]], double [[TMP1]])
+; CHECK-NEXT:    ret double [[TMP2]]
+;
+  %mul = fmul contract double %a, %b
+  %sub = fsub contract double %mul, %c
+  ret double %sub
+}
+
+
+; fadd(fmul(a, b), c) => fma(a, b, c)
+define double @test_fadd_fmul_c_double(double %a, double %b, double %c) {
+; CHECK-LABEL: define double @test_fadd_fmul_c_double(
+; CHECK-SAME: double [[A:%.*]], double [[B:%.*]], double [[C:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call contract double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
+; CHECK-NEXT:    ret double [[TMP1]]
+;
+  %mul = fmul contract double %a, %b
+  %add = fadd contract double %mul, %c
+  ret double %add
+}

From 4f79552d25c25606f5487076c109b2fe2a76a7e2 Mon Sep 17 00:00:00 2001
From: BaiXilin <x53bai@uwaterloo.ca>
Date: Tue, 9 Dec 2025 12:10:20 -0500
Subject: [PATCH 27/85] [x86][AVX-VNNI] Fix VPDPWXXD Argument Types (#169456)

Fixed the argument types of the following intrinsics to match with the
ISA:
 - vpdpwssd_128, vpdpwssd_256, vpdpwssd_512,
 - vpdpwssds_128, vpdpwssds_256, vpdpwssds_512
 - vpdpwsud_128, vpdpwsud_256, vpdowsud_512
 - vpdpwsuds_128, vpdpwsuds_256, vpdpwsuds_512
 - vpdpwusd_128, vpdpwusd_256, vpdpwusd_512
 - vpdpwusds_128, vpdpwusds_256, vpdpwusds_512
 - vpdpwuud_128, vpdpwuud_256, vpdpwuud_512
 - vpdpwuuds_128, vpdpwuuds_256, vpdpwuuds_512

Fixes #97271. Note that this is the last PR for the issue.
---
 clang/include/clang/Basic/BuiltinsX86.td      |  48 +--
 clang/lib/Headers/avx10_2_512niintrin.h       |  24 +-
 clang/lib/Headers/avx512vlvnniintrin.h        |  17 +-
 clang/lib/Headers/avx512vnniintrin.h          |   8 +-
 clang/lib/Headers/avxvnniint16intrin.h        | 217 +++++-----
 clang/lib/Headers/avxvnniintrin.h             |  12 +-
 .../test/CodeGen/X86/avx10_2_512ni-builtins.c |  36 +-
 clang/test/CodeGen/X86/avx10_2ni-builtins.c   |  48 +--
 .../test/CodeGen/X86/avx512vlvnni-builtins.c  |  24 +-
 clang/test/CodeGen/X86/avx512vnni-builtins.c  |  12 +-
 clang/test/CodeGen/X86/avxvnni-builtins.c     |  16 +-
 .../test/CodeGen/X86/avxvnniint16-builtins.c  |  24 +-
 llvm/include/llvm/IR/IntrinsicsX86.td         |  60 +--
 llvm/lib/IR/AutoUpgrade.cpp                   | 183 +++++++--
 .../Instrumentation/MemorySanitizer.cpp       | 102 ++++-
 .../CodeGen/X86/avx10.2-intrinsic-upgrade.ll  |  96 +++++
 .../CodeGen/X86/avx10_2_512ni-intrinsics.ll   |  18 +-
 llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll |  24 +-
 .../CodeGen/X86/avx512vl_vnni-intrinsics.ll   |  58 +--
 .../X86/avx512vnni-intrinsics-upgrade.ll      |  60 ++-
 .../test/CodeGen/X86/avx512vnni-intrinsics.ll |  28 +-
 .../X86/avx_vnni-intrinsics-upgrade.ll        |  44 ++
 llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll  |  24 +-
 .../X86/avxvnniint16-intrinsics-upgrade.ll    | 185 +++++++++
 .../CodeGen/X86/avxvnniint16-intrinsics.ll    |  72 ++--
 .../CodeGen/X86/stack-folding-int-avxvnni.ll  |  40 +-
 .../X86/stack-folding-int-avxvnniint16.ll     |  24 +-
 .../X86/avx10_2_512ni-intrinsics.ll           | 290 +++++++++-----
 .../X86/avx10_2ni-intrinsics.ll               | 360 ++++++++++++-----
 .../X86/avx512vl_vnni-intrinsics-upgrade.ll   |  72 ++--
 .../X86/avx512vl_vnni-intrinsics.ll           |  72 ++--
 .../X86/avx512vnni-intrinsics-upgrade.ll      |  36 +-
 .../X86/avx512vnni-intrinsics.ll              |  36 +-
 .../X86/avx_vnni-intrinsics.ll                |  24 +-
 .../X86/avxvnniint16-intrinsics.ll            | 376 ++++++++++++------
 35 files changed, 1870 insertions(+), 900 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index feb3e9a4afe3b..71aee5038d518 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -1088,27 +1088,27 @@ let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<5
 }
 
 let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">;
 }
 
 let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">;
 }
 
 let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
-  def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">;
 }
 
 let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">;
 }
 
 let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">;
 }
 
 let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
-  def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">;
 }
 
 let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
@@ -4222,12 +4222,12 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>
 }
 
 let Features = "avx10.2", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
-  def vpdpwsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-  def vpdpwsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-  def vpdpwusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-  def vpdpwusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-  def vpdpwuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-  def vpdpwuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, unsigned short>)">;
+  def vpdpwsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, unsigned short>)">;
+  def vpdpwusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, short>)">;
+  def vpdpwusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, short>)">;
+  def vpdpwuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, unsigned short>)">;
+  def vpdpwuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, unsigned short>)">;
 }
 
 let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
@@ -4235,51 +4235,51 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
-  def vpdpwsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
-  def vpdpwsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
-  def vpdpwsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
-  def vpdpwsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
-  def vpdpwusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
-  def vpdpwusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
-  def vpdpwusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
-  def vpdpwusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
-  def vpdpwuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
-  def vpdpwuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
-  def vpdpwuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
-  def vpdpwuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
 }
 
 let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h
index fdb57c7c9e27b..b2215b72c57bc 100644
--- a/clang/lib/Headers/avx10_2_512niintrin.h
+++ b/clang/lib/Headers/avx10_2_512niintrin.h
@@ -185,8 +185,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuuds_epi32(
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsud_epi32(__m512i __A,
                                                                  __m512i __B,
                                                                  __m512i __C) {
-  return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v16si)__B,
-                                             (__v16si)__C);
+  return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v32hi)__B,
+                                             (__v32hu)__C);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -206,8 +206,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsud_epi32(
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsuds_epi32(__m512i __A,
                                                                   __m512i __B,
                                                                   __m512i __C) {
-  return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v16si)__B,
-                                              (__v16si)__C);
+  return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v32hi)__B,
+                                              (__v32hu)__C);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwsuds_epi32(
@@ -227,8 +227,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsuds_epi32(
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusd_epi32(__m512i __A,
                                                                  __m512i __B,
                                                                  __m512i __C) {
-  return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v16si)__B,
-                                             (__v16si)__C);
+  return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v32hu)__B,
+                                             (__v32hi)__C);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -248,8 +248,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusd_epi32(
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusds_epi32(__m512i __A,
                                                                   __m512i __B,
                                                                   __m512i __C) {
-  return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v16si)__B,
-                                              (__v16si)__C);
+  return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v32hu)__B,
+                                              (__v32hi)__C);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwusds_epi32(
@@ -269,8 +269,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusds_epi32(
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuud_epi32(__m512i __A,
                                                                  __m512i __B,
                                                                  __m512i __C) {
-  return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v16si)__B,
-                                             (__v16si)__C);
+  return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v32hu)__B,
+                                             (__v32hu)__C);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -290,8 +290,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuud_epi32(
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuuds_epi32(__m512i __A,
                                                                   __m512i __B,
                                                                   __m512i __C) {
-  return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v16si)__B,
-                                              (__v16si)__C);
+  return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v32hu)__B,
+                                              (__v32hu)__C);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwuuds_epi32(
diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h
index a1a0338a69e0d..4b8a199af32e5 100644
--- a/clang/lib/Headers/avx512vlvnniintrin.h
+++ b/clang/lib/Headers/avx512vlvnniintrin.h
@@ -80,8 +80,8 @@
 ///    ENDFOR
 ///    DST[MAX:256] := 0
 /// \endcode
-#define _mm256_dpwssd_epi32(S, A, B) \
-  ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
+#define _mm256_dpwssd_epi32(S, A, B)                                           \
+  ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v16hi)(A), (__v16hi)(B)))
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@@ -98,8 +98,9 @@
 ///    ENDFOR
 ///    DST[MAX:256] := 0
 /// \endcode
-#define _mm256_dpwssds_epi32(S, A, B) \
-  ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
+#define _mm256_dpwssds_epi32(S, A, B)                                          \
+  ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v16hi)(A),             \
+                                        (__v16hi)(B)))
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
 /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
@@ -157,8 +158,8 @@
 ///    ENDFOR
 ///    DST[MAX:128] := 0
 /// \endcode
-#define _mm_dpwssd_epi32(S, A, B) \
-  ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
+#define _mm_dpwssd_epi32(S, A, B)                                              \
+  ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v8hi)(A), (__v8hi)(B)))
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
 /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
@@ -175,8 +176,8 @@
 ///    ENDFOR
 ///    DST[MAX:128] := 0
 /// \endcode
-#define _mm_dpwssds_epi32(S, A, B) \
-  ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
+#define _mm_dpwssds_epi32(S, A, B)                                             \
+  ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v8hi)(A), (__v8hi)(B)))
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
diff --git a/clang/lib/Headers/avx512vnniintrin.h b/clang/lib/Headers/avx512vnniintrin.h
index c386923360de6..2ce88efe4a04f 100644
--- a/clang/lib/Headers/avx512vnniintrin.h
+++ b/clang/lib/Headers/avx512vnniintrin.h
@@ -68,8 +68,8 @@ _mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A,
-                                             (__v16si)__B);
+  return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v32hi)__A,
+                                             (__v32hi)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -91,8 +91,8 @@ _mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
 static __inline__ __m512i __DEFAULT_FN_ATTRS
 _mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
 {
-  return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A,
-                                              (__v16si)__B);
+  return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v32hi)__A,
+                                              (__v32hi)__B);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS
diff --git a/clang/lib/Headers/avxvnniint16intrin.h b/clang/lib/Headers/avxvnniint16intrin.h
index 805d249911c17..98d94ee3fcf3a 100644
--- a/clang/lib/Headers/avxvnniint16intrin.h
+++ b/clang/lib/Headers/avxvnniint16intrin.h
@@ -16,9 +16,10 @@
 #define __AVXVNNIINT16INTRIN_H
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
-///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///    corresponding unsigned 16-bit integers in \a __B, producing 2
+///    intermediate signed 16-bit results. Sum these 2 results with the
+///    corresponding 32-bit integer in \a __W, and store the packed 32-bit
+///    results in \a dst.
 ///
 /// \headerfile <immintrin.h>
 ///
@@ -40,19 +41,21 @@
 /// \code{.operation}
 /// FOR j := 0 to 3
 /// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
-/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// 	tmp2.dword :=
+///		  SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
 #define _mm_dpwsud_epi32(__W, __A, __B)                                        \
-  ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A),           \
-                                       (__v4si)(__B)))
+  ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v8hi)(__A),           \
+                                       (__v8hu)(__B)))
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
-///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///    corresponding unsigned 16-bit integers in \a __B, producing 2
+///    intermediate signed 16-bit results. Sum these 2 results with the
+///    corresponding 32-bit integer in \a __W, and store the packed 32-bit
+///    results in \a dst.
 ///
 /// \headerfile <immintrin.h>
 ///
@@ -74,20 +77,21 @@
 /// \code{.operation}
 /// FOR j := 0 to 7
 /// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
-/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
-/// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
+/// 	tmp2.dword :=
+///		  SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+///		dst.dword[j] := __W.dword[j] + tmp1 + tmp2
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
 #define _mm256_dpwsud_epi32(__W, __A, __B)                                     \
-  ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A),           \
-                                       (__v8si)(__B)))
+  ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v16hi)(__A),          \
+                                       (__v16hu)(__B)))
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
-///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W with signed saturation, and store the packed
-///    32-bit results in \a dst.
+///    corresponding unsigned 16-bit integers in \a __B, producing 2
+///    intermediate signed 16-bit results. Sum these 2 results with the
+///    corresponding 32-bit integer in \a __W with signed saturation, and store
+///    the packed 32-bit results in \a dst.
 ///
 /// \headerfile <immintrin.h>
 ///
@@ -109,20 +113,22 @@
 /// \code{.operation}
 /// FOR j := 0 to 3
 /// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
-/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// 	tmp2.dword :=
+///		  SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
 #define _mm_dpwsuds_epi32(__W, __A, __B)                                       \
-  ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A),          \
-                                        (__v4si)(__B)))
+  ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v8hi)(__A),          \
+                                        (__v8hu)(__B)))
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
-///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W with signed saturation, and store the packed
-///    32-bit results in \a dst.
+///    corresponding unsigned 16-bit integers in \a __B, producing 2
+///    intermediate signed 16-bit results. Sum these 2 results with the
+///    corresponding 32-bit integer in \a __W with signed saturation, and store
+///    the packed 32-bit results in \a dst.
 ///
 /// \headerfile <immintrin.h>
 ///
@@ -144,19 +150,21 @@
 /// \code{.operation}
 /// FOR j := 0 to 7
 /// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
-/// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// 	tmp2.dword :=
+///		  SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
 #define _mm256_dpwsuds_epi32(__W, __A, __B)                                    \
-  ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A),          \
-                                        (__v8si)(__B)))
+  ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v16hi)(__A),         \
+                                        (__v16hu)(__B)))
 
-/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
-///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
+///    with corresponding signed 16-bit integers in \a __B, producing 2
+///    intermediate signed 16-bit results. Sum these 2 results with the
+///    corresponding 32-bit integer in \a __W, and store the packed 32-bit
+///    results in \a dst.
 ///
 /// \headerfile <immintrin.h>
 ///
@@ -178,19 +186,21 @@
 /// \code{.operation}
 /// FOR j := 0 to 3
 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
-/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// 	tmp2.dword :=
+/// 	  ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
 #define _mm_dpwusd_epi32(__W, __A, __B)                                        \
-  ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A),           \
-                                       (__v4si)(__B)))
+  ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v8hu)(__A),           \
+                                       (__v8hi)(__B)))
 
-/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
-///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
+///    with corresponding signed 16-bit integers in \a __B, producing 2
+///    intermediate signed 16-bit results. Sum these 2 results with the
+///    corresponding 32-bit integer in \a __W, and store the packed 32-bit
+///    results in \a dst.
 ///
 /// \headerfile <immintrin.h>
 ///
@@ -212,20 +222,21 @@
 /// \code{.operation}
 /// FOR j := 0 to 7
 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
-/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// 	tmp2.dword :=
+/// 	  ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
 #define _mm256_dpwusd_epi32(__W, __A, __B)                                     \
-  ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A),           \
-                                       (__v8si)(__B)))
+  ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v16hu)(__A),          \
+                                       (__v16hi)(__B)))
 
-/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
-///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W with signed saturation, and store the packed
-///    32-bit results in \a dst.
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
+///    with corresponding signed 16-bit integers in \a __B, producing 2
+///    intermediate signed 16-bit results. Sum these 2 results with the
+///    corresponding 32-bit integer in \a __W with signed saturation, and
+///    store the packed 32-bit results in \a dst.
 ///
 /// \headerfile <immintrin.h>
 ///
@@ -233,7 +244,7 @@
 /// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
+/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
 ///
 /// \param __W
 ///    A 128-bit vector of [4 x int].
@@ -247,20 +258,21 @@
 /// \code{.operation}
 /// FOR j := 0 to 3
 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
-/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// 	tmp2.dword :=
+/// 	  ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
 #define _mm_dpwusds_epi32(__W, __A, __B)                                       \
-  ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v4si)(__A),          \
-                                        (__v4si)(__B)))
+  ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v8hu)(__A),          \
+                                        (__v8hi)(__B)))
 
-/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
-///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W with signed saturation, and store the packed
-///    32-bit results in \a dst.
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
+///    with corresponding signed 16-bit integers in \a __B, producing 2
+///    intermediate signed 16-bit results. Sum these 2 results with the
+///    corresponding 32-bit integer in \a __W with signed saturation, and
+///    store the packed 32-bit results in \a dst.
 ///
 /// \headerfile <immintrin.h>
 ///
@@ -268,7 +280,7 @@
 /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
+/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
 ///
 /// \param __W
 ///    A 256-bit vector of [8 x int].
@@ -282,19 +294,21 @@
 /// \code{.operation}
 /// FOR j := 0 to 7
 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
-/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// 	tmp2.dword :=
+/// 	  ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
 #define _mm256_dpwusds_epi32(__W, __A, __B)                                    \
-  ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v8si)(__A),          \
-                                        (__v8si)(__B)))
+  ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v16hu)(__A),         \
+                                        (__v16hi)(__B)))
 
-/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
-///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
+///    with corresponding unsigned 16-bit integers in \a __B, producing 2
+///    intermediate signed 16-bit results. Sum these 2 results with the
+///    corresponding 32-bit integer in \a __W, and store the packed 32-bit
+///    results in \a dst.
 ///
 /// \headerfile <immintrin.h>
 ///
@@ -305,30 +319,32 @@
 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
 ///
 /// \param __W
-///    A 128-bit vector of [4 x unsigned int].
+///    A 128-bit vector of [4 x int].
 /// \param __A
 ///    A 128-bit vector of [8 x unsigned short].
 /// \param __B
 ///    A 128-bit vector of [8 x unsigned short].
 /// \returns
-///    A 128-bit vector of [4 x unsigned int].
+///    A 128-bit vector of [4 x int].
 ///
 /// \code{.operation}
 /// FOR j := 0 to 3
 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
-/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// 	tmp2.dword :=
+/// 	  ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
 #define _mm_dpwuud_epi32(__W, __A, __B)                                        \
-  ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v4si)(__A),           \
-                                       (__v4si)(__B)))
+  ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v8hu)(__A),           \
+                                       (__v8hu)(__B)))
 
-/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
-///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
+///    with corresponding unsigned 16-bit integers in \a __B, producing 2
+///    intermediate signed 16-bit results. Sum these 2 results with the
+///    corresponding 32-bit integer in \a __W, and store the packed 32-bit
+///    results in \a dst.
 ///
 /// \headerfile <immintrin.h>
 ///
@@ -339,31 +355,32 @@
 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
 ///
 /// \param __W
-///    A 256-bit vector of [8 x unsigned int].
+///    A 256-bit vector of [8 x int].
 /// \param __A
 ///    A 256-bit vector of [16 x unsigned short].
 /// \param __B
 ///    A 256-bit vector of [16 x unsigned short].
 /// \returns
-///    A 256-bit vector of [8 x unsigned int].
+///    A 256-bit vector of [8 x int].
 ///
 /// \code{.operation}
 /// FOR j := 0 to 7
 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
-/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// 	tmp2.dword :=
+/// 	  ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
 #define _mm256_dpwuud_epi32(__W, __A, __B)                                     \
-  ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v8si)(__A),           \
-                                       (__v8si)(__B)))
+  ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v16hu)(__A),          \
+                                       (__v16hu)(__B)))
 
-/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
-///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W with signed saturation, and store the packed
-///    32-bit results in \a dst.
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
+///    with corresponding unsigned 16-bit integers in \a __B, producing 2
+///    intermediate signed 16-bit results. Sum these 2 results with the
+///    corresponding 32-bit integer in \a __W with signed saturation, and store
+///    the packed 32-bit results in \a dst.
 ///
 /// \headerfile <immintrin.h>
 ///
@@ -371,34 +388,35 @@
 /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
+/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
 ///
 /// \param __W
-///    A 128-bit vector of [4 x unsigned int].
+///    A 128-bit vector of [4 x int].
 /// \param __A
 ///    A 128-bit vector of [8 x unsigned short].
 /// \param __B
 ///    A 128-bit vector of [8 x unsigned short].
 /// \returns
-///    A 128-bit vector of [4 x unsigned int].
+///    A 128-bit vector of [4 x int].
 ///
 /// \code{.operation}
 /// FOR j := 0 to 3
 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
-/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// 	tmp2.dword :=
+/// 	  ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 /// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
 /// ENDFOR
 /// dst[MAX:128] := 0
 /// \endcode
 #define _mm_dpwuuds_epi32(__W, __A, __B)                                       \
-  ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v4si)(__A),          \
-                                        (__v4si)(__B)))
+  ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v8hu)(__A),          \
+                                        (__v8hu)(__B)))
 
-/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
-///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
-///    signed 16-bit results. Sum these 2 results with the corresponding
-///    32-bit integer in \a __W with signed saturation, and store the packed
-///    32-bit results in \a dst.
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
+///    with corresponding unsigned 16-bit integers in \a __B, producing 2
+///    intermediate signed 16-bit results. Sum these 2 results with the
+///    corresponding 32-bit integer in \a __W with signed saturation, and store
+///    the packed 32-bit results in \a dst.
 ///
 /// \headerfile <immintrin.h>
 ///
@@ -406,27 +424,28 @@
 /// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
+/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
 ///
 /// \param __W
-///    A 256-bit vector of [8 x unsigned int].
+///    A 256-bit vector of [8 x int].
 /// \param __A
 ///    A 256-bit vector of [16 x unsigned short].
 /// \param __B
 ///    A 256-bit vector of [16 x unsigned short].
 /// \returns
-///    A 256-bit vector of [8 x unsigned int].
+///    A 256-bit vector of [8 x int].
 ///
 /// \code{.operation}
 /// FOR j := 0 to 7
 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
-/// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
+/// 	tmp2.dword :=
+/// 	  ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
 /// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
 /// ENDFOR
 /// dst[MAX:256] := 0
 /// \endcode
 #define _mm256_dpwuuds_epi32(__W, __A, __B)                                    \
-  ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v8si)(__A),          \
-                                        (__v8si)(__B)))
+  ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v16hu)(__A),         \
+                                        (__v16hu)(__B)))
 
 #endif // __AVXVNNIINT16INTRIN_H
diff --git a/clang/lib/Headers/avxvnniintrin.h b/clang/lib/Headers/avxvnniintrin.h
index 3c4c44a930fe2..1d2e8c906effc 100644
--- a/clang/lib/Headers/avxvnniintrin.h
+++ b/clang/lib/Headers/avxvnniintrin.h
@@ -109,7 +109,8 @@ _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+  return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v16hi)__A,
+                                             (__v16hi)__B);
 }
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
@@ -130,7 +131,8 @@ _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
 _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
 {
-  return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+  return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v16hi)__A,
+                                              (__v16hi)__B);
 }
 
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
@@ -199,7 +201,8 @@ _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+  return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v8hi)__A,
+                                             (__v8hi)__B);
 }
 
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
@@ -220,7 +223,8 @@ _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
 _mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+  return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v8hi)__A,
+                                              (__v8hi)__B);
 }
 
 #undef __DEFAULT_FN_ATTRS128
diff --git a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
index 728c9f5652dd9..1ba6e87653a74 100644
--- a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
+++ b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
@@ -176,20 +176,20 @@ __m512i test_mm512_maskz_dpbuuds_epi32(__mmask16 __U, __m512i __W, __m512i __A,
 /* VNNI INT16 */
 __m512i test_mm512_dpwsud_epi32(__m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_dpwsud_epi32(
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_dpwsud_epi32(__A, __B, __C);
 }
 
 __m512i test_mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
 // CHECK-LABEL: @test_mm512_mask_dpwsud_epi32(
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_dpwsud_epi32(__A, __B, __C, __D);
 }
 
 __m512i test_mm512_maskz_dpwsud_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_maskz_dpwsud_epi32(
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
 // CHECK: zeroinitializer
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_dpwsud_epi32(__U, __A, __B, __C);
@@ -197,20 +197,20 @@ __m512i test_mm512_maskz_dpwsud_epi32(__mmask16 __U, __m512i __A, __m512i __B, _
 
 __m512i test_mm512_dpwsuds_epi32(__m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_dpwsuds_epi32(
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_dpwsuds_epi32(__A, __B, __C);
 }
 
 __m512i test_mm512_mask_dpwsuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
 // CHECK-LABEL: @test_mm512_mask_dpwsuds_epi32(
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_dpwsuds_epi32(__A, __B, __C, __D);
 }
 
 __m512i test_mm512_maskz_dpwsuds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_maskz_dpwsuds_epi32(
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
 // CHECK: zeroinitializer
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_dpwsuds_epi32(__U, __A, __B, __C);
@@ -218,20 +218,20 @@ __m512i test_mm512_maskz_dpwsuds_epi32(__mmask16 __U, __m512i __A, __m512i __B,
 
 __m512i test_mm512_dpwusd_epi32(__m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_dpwusd_epi32(
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_dpwusd_epi32(__A, __B, __C);
 }
 
 __m512i test_mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
 // CHECK-LABEL: @test_mm512_mask_dpwusd_epi32(
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_dpwusd_epi32(__A, __B, __C, __D);
 }
 
 __m512i test_mm512_maskz_dpwusd_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_maskz_dpwusd_epi32(
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
 // CHECK: zeroinitializer
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_dpwusd_epi32(__U, __A, __B, __C);
@@ -239,20 +239,20 @@ __m512i test_mm512_maskz_dpwusd_epi32(__mmask16 __U, __m512i __A, __m512i __B, _
 
 __m512i test_mm512_dpwusds_epi32(__m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_dpwusds_epi32(
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_dpwusds_epi32(__A, __B, __C);
 }
 
 __m512i test_mm512_mask_dpwusds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
 // CHECK-LABEL: @test_mm512_mask_dpwusds_epi32(
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_dpwusds_epi32(__A, __B, __C, __D);
 }
 
 __m512i test_mm512_maskz_dpwusds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_maskz_dpwusds_epi32(
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
 // CHECK: zeroinitializer
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_dpwusds_epi32(__U, __A, __B, __C);
@@ -260,20 +260,20 @@ __m512i test_mm512_maskz_dpwusds_epi32(__mmask16 __U, __m512i __A, __m512i __B,
 
 __m512i test_mm512_dpwuud_epi32(__m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_dpwuud_epi32(
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_dpwuud_epi32(__A, __B, __C);
 }
 
 __m512i test_mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
 // CHECK-LABEL: @test_mm512_mask_dpwuud_epi32(
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_dpwuud_epi32(__A, __B, __C, __D);
 }
 
 __m512i test_mm512_maskz_dpwuud_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_maskz_dpwuud_epi32(
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
 // CHECK: zeroinitializer
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_dpwuud_epi32(__U, __A, __B, __C);
@@ -281,20 +281,20 @@ __m512i test_mm512_maskz_dpwuud_epi32(__mmask16 __U, __m512i __A, __m512i __B, _
 
 __m512i test_mm512_dpwuuds_epi32(__m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_dpwuuds_epi32(
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_dpwuuds_epi32(__A, __B, __C);
 }
 
 __m512i test_mm512_mask_dpwuuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
 // CHECK-LABEL: @test_mm512_mask_dpwuuds_epi32(
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_dpwuuds_epi32(__A, __B, __C, __D);
 }
 
 __m512i test_mm512_maskz_dpwuuds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_maskz_dpwuuds_epi32(
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
 // CHECK: zeroinitializer
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_dpwuuds_epi32(__U, __A, __B, __C);
diff --git a/clang/test/CodeGen/X86/avx10_2ni-builtins.c b/clang/test/CodeGen/X86/avx10_2ni-builtins.c
index a250d91ae5989..be2719c33c52f 100644
--- a/clang/test/CodeGen/X86/avx10_2ni-builtins.c
+++ b/clang/test/CodeGen/X86/avx10_2ni-builtins.c
@@ -259,168 +259,168 @@ __m256i test_mm256_maskz_dpbuuds_epi32(__mmask8 __U, __m256i __W, __m256i __A, _
 // VNNI INT16
 __m128i test_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
 // CHECK-LABEL: @test_mm_mask_dpwsud_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwsud_epi32(__A, __B, __C, __D);
 }
 
 __m128i test_mm_maskz_dpwsud_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwsud_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwsud_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
 // CHECK-LABEL: @test_mm256_mask_dpwsud_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwsud_epi32(__A, __B, __C, __D);
 }
 
 __m256i test_mm256_maskz_dpwsud_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwsud_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwsud_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
 // CHECK-LABEL: @test_mm_mask_dpwsuds_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwsuds_epi32(__A, __B, __C, __D);
 }
 
 __m128i test_mm_maskz_dpwsuds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwsuds_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwsuds_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
 // CHECK-LABEL: @test_mm256_mask_dpwsuds_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwsuds_epi32(__A, __B, __C, __D);
 }
 
 __m256i test_mm256_maskz_dpwsuds_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwsuds_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwsuds_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
 // CHECK-LABEL: @test_mm_mask_dpwusd_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwusd_epi32(__A, __B, __C, __D);
 }
 
 __m128i test_mm_maskz_dpwusd_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwusd_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwusd_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
 // CHECK-LABEL: @test_mm256_mask_dpwusd_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwusd_epi32(__A, __B, __C, __D);
 }
 
 __m256i test_mm256_maskz_dpwusd_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwusd_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwusd_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
 // CHECK-LABEL: @test_mm_mask_dpwusds_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwusds_epi32(__A, __B, __C, __D);
 }
 
 __m128i test_mm_maskz_dpwusds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwusds_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwusds_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
 // CHECK-LABEL: @test_mm256_mask_dpwusds_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwusds_epi32(__A, __B, __C, __D);
 }
 
 __m256i test_mm256_maskz_dpwusds_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwusds_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwusds_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
 // CHECK-LABEL: @test_mm_mask_dpwuud_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwuud_epi32(__A, __B, __C, __D);
 }
 
 __m128i test_mm_maskz_dpwuud_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwuud_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwuud_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
 // CHECK-LABEL: @test_mm256_mask_dpwuud_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwuud_epi32(__A, __B, __C, __D);
 }
 
 __m256i test_mm256_maskz_dpwuud_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwuud_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwuud_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
 // CHECK-LABEL: @test_mm_mask_dpwuuds_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwuuds_epi32(__A, __B, __C, __D);
 }
 
 __m128i test_mm_maskz_dpwuuds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwuuds_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwuuds_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
 // CHECK-LABEL: @test_mm256_mask_dpwuuds_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwuuds_epi32(__A, __B, __C, __D);
 }
 
 __m256i test_mm256_maskz_dpwuuds_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwuuds_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwuuds_epi32(__U, __A, __B, __C);
 }
diff --git a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
index f63b5c6e73917..11dbd717a9f77 100644
--- a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
@@ -47,41 +47,41 @@ __m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) {
 
 __m256i test_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_dpwssd_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwssd_epi32(__S, __U, __A, __B);
 }
 
 __m256i test_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_dpwssd_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwssd_epi32(__U, __S, __A, __B);
 }
 
 __m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssd_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssd_epi32(__S, __A, __B);
 }
 
 __m256i test_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_dpwssds_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwssds_epi32(__S, __U, __A, __B);
 }
 
 __m256i test_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_dpwssds_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwssds_epi32(__U, __S, __A, __B);
 }
 
 __m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssds_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssds_epi32(__S, __A, __B);
 }
 
@@ -127,41 +127,41 @@ __m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) {
 
 __m128i test_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_dpwssd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwssd_epi32(__S, __U, __A, __B);
 }
 
 __m128i test_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_dpwssd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwssd_epi32(__U, __S, __A, __B);
 }
 
 __m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssd_epi32(__S, __A, __B);
 }
 
 __m128i test_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_dpwssds_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwssds_epi32(__S, __U, __A, __B);
 }
 
 __m128i test_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_dpwssds_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwssds_epi32(__U, __S, __A, __B);
 }
 
 __m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssds_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssds_epi32(__S, __A, __B);
 }
 
diff --git a/clang/test/CodeGen/X86/avx512vnni-builtins.c b/clang/test/CodeGen/X86/avx512vnni-builtins.c
index afe80458e37cc..6b8465206eedb 100644
--- a/clang/test/CodeGen/X86/avx512vnni-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vnni-builtins.c
@@ -47,41 +47,41 @@ __m512i test_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) {
 
 __m512i test_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_dpwssd_epi32
-  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_dpwssd_epi32(__S, __U, __A, __B);
 }
 
 __m512i test_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_dpwssd_epi32
-  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_dpwssd_epi32(__U, __S, __A, __B);
 }
 
 __m512i test_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_dpwssd_epi32
-  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_dpwssd_epi32(__S, __A, __B);
 }
 
 __m512i test_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_dpwssds_epi32
-  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_dpwssds_epi32(__S, __U, __A, __B);
 }
 
 __m512i test_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_dpwssds_epi32
-  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_dpwssds_epi32(__U, __S, __A, __B);
 }
 
 __m512i test_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_dpwssds_epi32
-  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_dpwssds_epi32(__S, __A, __B);
 }
 
diff --git a/clang/test/CodeGen/X86/avxvnni-builtins.c b/clang/test/CodeGen/X86/avxvnni-builtins.c
index 7948e0d57d9bf..6557a26807eb2 100644
--- a/clang/test/CodeGen/X86/avxvnni-builtins.c
+++ b/clang/test/CodeGen/X86/avxvnni-builtins.c
@@ -19,13 +19,13 @@ __m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) {
 
 __m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssd_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssd_epi32(__S, __A, __B);
 }
 
 __m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssds_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssds_epi32(__S, __A, __B);
 }
 
@@ -43,13 +43,13 @@ __m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) {
 
 __m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssd_epi32(__S, __A, __B);
 }
 
 __m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssds_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssds_epi32(__S, __A, __B);
 }
 
@@ -67,13 +67,13 @@ __m256i test_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
 
 __m256i test_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssd_avx_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssd_avx_epi32(__S, __A, __B);
 }
 
 __m256i test_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssds_avx_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssds_avx_epi32(__S, __A, __B);
 }
 
@@ -91,12 +91,12 @@ __m128i test_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
 
 __m128i test_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssd_avx_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssd_avx_epi32(__S, __A, __B);
 }
 
 __m128i test_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssds_avx_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssds_avx_epi32(__S, __A, __B);
 }
diff --git a/clang/test/CodeGen/X86/avxvnniint16-builtins.c b/clang/test/CodeGen/X86/avxvnniint16-builtins.c
index 941da9aa223b5..f28fff2c0cfec 100644
--- a/clang/test/CodeGen/X86/avxvnniint16-builtins.c
+++ b/clang/test/CodeGen/X86/avxvnniint16-builtins.c
@@ -11,72 +11,72 @@
 
 __m128i test_mm_dpwsud_epi32(__m128i __A, __m128i __B, __m128i __C) {
   // CHECK-LABEL: test_mm_dpwsud_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwsud_epi32(__A, __B, __C);
 }
 
 __m256i test_mm256_dpwsud_epi32(__m256i __A, __m256i __B, __m256i __C) {
   // CHECK-LABEL: test_mm256_dpwsud_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwsud_epi32(__A, __B, __C);
 }
 
 __m128i test_mm_dpwsuds_epi32(__m128i __A, __m128i __B, __m128i __C) {
   // CHECK-LABEL: test_mm_dpwsuds_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwsuds_epi32(__A, __B, __C);
 }
 
 __m256i test_mm256_dpwsuds_epi32(__m256i __A, __m256i __B, __m256i __C) {
   // CHECK-LABEL: test_mm256_dpwsuds_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwsuds_epi32(__A, __B, __C);
 }
 
 __m128i test_mm_dpwusd_epi32(__m128i __A, __m128i __B, __m128i __C) {
   // CHECK-LABEL: test_mm_dpwusd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwusd_epi32(__A, __B, __C);
 }
 
 __m256i test_mm256_dpwusd_epi32(__m256i __A, __m256i __B, __m256i __C) {
   // CHECK-LABEL: test_mm256_dpwusd_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwusd_epi32(__A, __B, __C);
 }
 
 __m128i test_mm_dpwusds_epi32(__m128i __A, __m128i __B, __m128i __C) {
   // CHECK-LABEL: test_mm_dpwusds_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwusds_epi32(__A, __B, __C);
 }
 
 __m256i test_mm256_dpwusds_epi32(__m256i __A, __m256i __B, __m256i __C) {
   // CHECK-LABEL: test_mm256_dpwusds_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwusds_epi32(__A, __B, __C);
 }
 
 __m128i test_mm_dpwuud_epi32(__m128i __A, __m128i __B, __m128i __C) {
   // CHECK-LABEL: test_mm_dpwuud_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwuud_epi32(__A, __B, __C);
 }
 
 __m256i test_mm256_dpwuud_epi32(__m256i __A, __m256i __B, __m256i __C) {
   // CHECK-LABEL: test_mm256_dpwuud_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwuud_epi32(__A, __B, __C);
 }
 
 __m128i test_mm_dpwuuds_epi32(__m128i __A, __m128i __B, __m128i __C) {
   // CHECK-LABEL: test_mm_dpwuuds_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwuuds_epi32(__A, __B, __C);
 }
 
 __m256i test_mm256_dpwuuds_epi32(__m256i __A, __m256i __B, __m256i __C) {
   // CHECK-LABEL: test_mm256_dpwuuds_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwuuds_epi32(__A, __B, __C);
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 1dd23f60c7e1e..ec80ba3e1ee81 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -1893,29 +1893,29 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
   def int_x86_avx512_vpdpwssd_128 :
       ClangBuiltin<"__builtin_ia32_vpdpwssd128">,
-      DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
-                             llvm_v4i32_ty], [IntrNoMem]>;
+      DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v8i16_ty,
+                             llvm_v8i16_ty], [IntrNoMem]>;
   def int_x86_avx512_vpdpwssd_256 :
       ClangBuiltin<"__builtin_ia32_vpdpwssd256">,
-      DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
-                             llvm_v8i32_ty], [IntrNoMem]>;
+      DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v16i16_ty,
+                             llvm_v16i16_ty], [IntrNoMem]>;
   def int_x86_avx512_vpdpwssd_512 :
       ClangBuiltin<"__builtin_ia32_vpdpwssd512">,
-      DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
-                             llvm_v16i32_ty], [IntrNoMem]>;
+      DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v32i16_ty,
+                             llvm_v32i16_ty], [IntrNoMem]>;
 
   def int_x86_avx512_vpdpwssds_128 :
       ClangBuiltin<"__builtin_ia32_vpdpwssds128">,
-      DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
-                             llvm_v4i32_ty], [IntrNoMem]>;
+      DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v8i16_ty,
+                             llvm_v8i16_ty], [IntrNoMem]>;
   def int_x86_avx512_vpdpwssds_256 :
       ClangBuiltin<"__builtin_ia32_vpdpwssds256">,
-      DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
-                             llvm_v8i32_ty], [IntrNoMem]>;
+      DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v16i16_ty,
+                             llvm_v16i16_ty], [IntrNoMem]>;
   def int_x86_avx512_vpdpwssds_512 :
       ClangBuiltin<"__builtin_ia32_vpdpwssds512">,
-      DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
-                             llvm_v16i32_ty], [IntrNoMem]>;
+      DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v32i16_ty,
+                             llvm_v32i16_ty], [IntrNoMem]>;
   def int_x86_avx2_vpdpbssd_128
       : ClangBuiltin<"__builtin_ia32_vpdpbssd128">,
         DefaultAttrsIntrinsic<[llvm_v4i32_ty],
@@ -1980,62 +1980,62 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx2_vpdpwsud_128
       : ClangBuiltin<"__builtin_ia32_vpdpwsud128">,
         DefaultAttrsIntrinsic<[llvm_v4i32_ty],
-                              [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                              [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty],
                               [IntrNoMem]>;
   def int_x86_avx2_vpdpwsud_256
       : ClangBuiltin<"__builtin_ia32_vpdpwsud256">,
         DefaultAttrsIntrinsic<[llvm_v8i32_ty],
-                              [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                              [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty],
                               [IntrNoMem]>;
   def int_x86_avx2_vpdpwsuds_128
       : ClangBuiltin<"__builtin_ia32_vpdpwsuds128">,
         DefaultAttrsIntrinsic<[llvm_v4i32_ty],
-                              [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                              [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty],
                               [IntrNoMem]>;
   def int_x86_avx2_vpdpwsuds_256
       : ClangBuiltin<"__builtin_ia32_vpdpwsuds256">,
         DefaultAttrsIntrinsic<[llvm_v8i32_ty],
-                              [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                              [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty],
                               [IntrNoMem]>;
   def int_x86_avx2_vpdpwusd_128
       : ClangBuiltin<"__builtin_ia32_vpdpwusd128">,
         DefaultAttrsIntrinsic<[llvm_v4i32_ty],
-                              [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                              [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty],
                               [IntrNoMem]>;
   def int_x86_avx2_vpdpwusd_256
       : ClangBuiltin<"__builtin_ia32_vpdpwusd256">,
         DefaultAttrsIntrinsic<[llvm_v8i32_ty],
-                              [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                              [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty],
                               [IntrNoMem]>;
   def int_x86_avx2_vpdpwusds_128
       : ClangBuiltin<"__builtin_ia32_vpdpwusds128">,
         DefaultAttrsIntrinsic<[llvm_v4i32_ty],
-                              [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                              [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty],
                               [IntrNoMem]>;
   def int_x86_avx2_vpdpwusds_256
       : ClangBuiltin<"__builtin_ia32_vpdpwusds256">,
         DefaultAttrsIntrinsic<[llvm_v8i32_ty],
-                              [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                              [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty],
                               [IntrNoMem]>;
   def int_x86_avx2_vpdpwuud_128
       : ClangBuiltin<"__builtin_ia32_vpdpwuud128">,
         DefaultAttrsIntrinsic<[llvm_v4i32_ty],
-                              [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                              [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty],
                               [IntrNoMem]>;
   def int_x86_avx2_vpdpwuud_256
       : ClangBuiltin<"__builtin_ia32_vpdpwuud256">,
         DefaultAttrsIntrinsic<[llvm_v8i32_ty],
-                              [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                              [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty],
                               [IntrNoMem]>;
   def int_x86_avx2_vpdpwuuds_128
       : ClangBuiltin<"__builtin_ia32_vpdpwuuds128">,
         DefaultAttrsIntrinsic<[llvm_v4i32_ty],
-                              [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                              [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty],
                               [IntrNoMem]>;
   def int_x86_avx2_vpdpwuuds_256
       : ClangBuiltin<"__builtin_ia32_vpdpwuuds256">,
         DefaultAttrsIntrinsic<[llvm_v8i32_ty],
-                              [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                              [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty],
                               [IntrNoMem]>;
 }
 
@@ -5031,32 +5031,32 @@ let TargetPrefix = "x86" in {
   def int_x86_avx10_vpdpwsud_512 :
       ClangBuiltin<"__builtin_ia32_vpdpwsud512">,
       DefaultAttrsIntrinsic<[llvm_v16i32_ty],
-                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx10_vpdpwsuds_512 :
       ClangBuiltin<"__builtin_ia32_vpdpwsuds512">,
       DefaultAttrsIntrinsic<[llvm_v16i32_ty],
-                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx10_vpdpwusd_512 :
       ClangBuiltin<"__builtin_ia32_vpdpwusd512">,
       DefaultAttrsIntrinsic<[llvm_v16i32_ty],
-                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx10_vpdpwusds_512 :
       ClangBuiltin<"__builtin_ia32_vpdpwusds512">,
       DefaultAttrsIntrinsic<[llvm_v16i32_ty],
-                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx10_vpdpwuud_512 :
       ClangBuiltin<"__builtin_ia32_vpdpwuud512">,
       DefaultAttrsIntrinsic<[llvm_v16i32_ty],
-                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx10_vpdpwuuds_512 :
       ClangBuiltin<"__builtin_ia32_vpdpwuuds512">,
       DefaultAttrsIntrinsic<[llvm_v16i32_ty],
-                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty],
                             [IntrNoMem]>;
 
   // VMPSADBW
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 487db134b0df3..e67f1ecd96bb1 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -125,6 +125,24 @@ static bool upgradeX86MultiplyAddBytes(Function *F, Intrinsic::ID IID,
   return true;
 }
 
+// Upgrade the declaration of multipy and add words intrinsics whose input
+// arguments' types have changed to vectors of i32 to vectors of i16
+static bool upgradeX86MultiplyAddWords(Function *F, Intrinsic::ID IID,
+                                       Function *&NewFn) {
+  // check if input argument type is a vector of i16
+  Type *Arg1Type = F->getFunctionType()->getParamType(1);
+  Type *Arg2Type = F->getFunctionType()->getParamType(2);
+  if (Arg1Type->isVectorTy() &&
+      cast<VectorType>(Arg1Type)->getElementType()->isIntegerTy(16) &&
+      Arg2Type->isVectorTy() &&
+      cast<VectorType>(Arg2Type)->getElementType()->isIntegerTy(16))
+    return false;
+
+  rename(F);
+  NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID);
+  return true;
+}
+
 static bool upgradeX86BF16Intrinsic(Function *F, Intrinsic::ID IID,
                                     Function *&NewFn) {
   if (F->getReturnType()->getScalarType()->isBFloatTy())
@@ -590,43 +608,89 @@ static bool upgradeX86IntrinsicFunction(Function *F, StringRef Name,
                .Default(Intrinsic::not_intrinsic);
       if (ID != Intrinsic::not_intrinsic)
         return upgradeX86MultiplyAddBytes(F, ID, NewFn);
+    } else if (Name.starts_with("vpdpwssd.") ||
+               Name.starts_with("vpdpwssds.")) {
+      // Added in 21.1
+      ID = StringSwitch<Intrinsic::ID>(Name)
+               .Case("vpdpwssd.128", Intrinsic::x86_avx512_vpdpwssd_128)
+               .Case("vpdpwssd.256", Intrinsic::x86_avx512_vpdpwssd_256)
+               .Case("vpdpwssd.512", Intrinsic::x86_avx512_vpdpwssd_512)
+               .Case("vpdpwssds.128", Intrinsic::x86_avx512_vpdpwssds_128)
+               .Case("vpdpwssds.256", Intrinsic::x86_avx512_vpdpwssds_256)
+               .Case("vpdpwssds.512", Intrinsic::x86_avx512_vpdpwssds_512)
+               .Default(Intrinsic::not_intrinsic);
+      if (ID != Intrinsic::not_intrinsic)
+        return upgradeX86MultiplyAddWords(F, ID, NewFn);
     }
     return false; // No other 'x86.avx512.*'.
   }
 
-  if (Name.consume_front("avx2.vpdpb")) {
-    // Added in 21.1
-    ID = StringSwitch<Intrinsic::ID>(Name)
-             .Case("ssd.128", Intrinsic::x86_avx2_vpdpbssd_128)
-             .Case("ssd.256", Intrinsic::x86_avx2_vpdpbssd_256)
-             .Case("ssds.128", Intrinsic::x86_avx2_vpdpbssds_128)
-             .Case("ssds.256", Intrinsic::x86_avx2_vpdpbssds_256)
-             .Case("sud.128", Intrinsic::x86_avx2_vpdpbsud_128)
-             .Case("sud.256", Intrinsic::x86_avx2_vpdpbsud_256)
-             .Case("suds.128", Intrinsic::x86_avx2_vpdpbsuds_128)
-             .Case("suds.256", Intrinsic::x86_avx2_vpdpbsuds_256)
-             .Case("uud.128", Intrinsic::x86_avx2_vpdpbuud_128)
-             .Case("uud.256", Intrinsic::x86_avx2_vpdpbuud_256)
-             .Case("uuds.128", Intrinsic::x86_avx2_vpdpbuuds_128)
-             .Case("uuds.256", Intrinsic::x86_avx2_vpdpbuuds_256)
-             .Default(Intrinsic::not_intrinsic);
-    if (ID != Intrinsic::not_intrinsic)
-      return upgradeX86MultiplyAddBytes(F, ID, NewFn);
+  if (Name.consume_front("avx2.")) {
+    if (Name.consume_front("vpdpb")) {
+      // Added in 21.1
+      ID = StringSwitch<Intrinsic::ID>(Name)
+               .Case("ssd.128", Intrinsic::x86_avx2_vpdpbssd_128)
+               .Case("ssd.256", Intrinsic::x86_avx2_vpdpbssd_256)
+               .Case("ssds.128", Intrinsic::x86_avx2_vpdpbssds_128)
+               .Case("ssds.256", Intrinsic::x86_avx2_vpdpbssds_256)
+               .Case("sud.128", Intrinsic::x86_avx2_vpdpbsud_128)
+               .Case("sud.256", Intrinsic::x86_avx2_vpdpbsud_256)
+               .Case("suds.128", Intrinsic::x86_avx2_vpdpbsuds_128)
+               .Case("suds.256", Intrinsic::x86_avx2_vpdpbsuds_256)
+               .Case("uud.128", Intrinsic::x86_avx2_vpdpbuud_128)
+               .Case("uud.256", Intrinsic::x86_avx2_vpdpbuud_256)
+               .Case("uuds.128", Intrinsic::x86_avx2_vpdpbuuds_128)
+               .Case("uuds.256", Intrinsic::x86_avx2_vpdpbuuds_256)
+               .Default(Intrinsic::not_intrinsic);
+      if (ID != Intrinsic::not_intrinsic)
+        return upgradeX86MultiplyAddBytes(F, ID, NewFn);
+    } else if (Name.consume_front("vpdpw")) {
+      // Added in 21.1
+      ID = StringSwitch<Intrinsic::ID>(Name)
+               .Case("sud.128", Intrinsic::x86_avx2_vpdpwsud_128)
+               .Case("sud.256", Intrinsic::x86_avx2_vpdpwsud_256)
+               .Case("suds.128", Intrinsic::x86_avx2_vpdpwsuds_128)
+               .Case("suds.256", Intrinsic::x86_avx2_vpdpwsuds_256)
+               .Case("usd.128", Intrinsic::x86_avx2_vpdpwusd_128)
+               .Case("usd.256", Intrinsic::x86_avx2_vpdpwusd_256)
+               .Case("usds.128", Intrinsic::x86_avx2_vpdpwusds_128)
+               .Case("usds.256", Intrinsic::x86_avx2_vpdpwusds_256)
+               .Case("uud.128", Intrinsic::x86_avx2_vpdpwuud_128)
+               .Case("uud.256", Intrinsic::x86_avx2_vpdpwuud_256)
+               .Case("uuds.128", Intrinsic::x86_avx2_vpdpwuuds_128)
+               .Case("uuds.256", Intrinsic::x86_avx2_vpdpwuuds_256)
+               .Default(Intrinsic::not_intrinsic);
+      if (ID != Intrinsic::not_intrinsic)
+        return upgradeX86MultiplyAddWords(F, ID, NewFn);
+    }
     return false; // No other 'x86.avx2.*'
   }
 
-  if (Name.consume_front("avx10.vpdpb")) {
-    // Added in 21.1
-    ID = StringSwitch<Intrinsic::ID>(Name)
-             .Case("ssd.512", Intrinsic::x86_avx10_vpdpbssd_512)
-             .Case("ssds.512", Intrinsic::x86_avx10_vpdpbssds_512)
-             .Case("sud.512", Intrinsic::x86_avx10_vpdpbsud_512)
-             .Case("suds.512", Intrinsic::x86_avx10_vpdpbsuds_512)
-             .Case("uud.512", Intrinsic::x86_avx10_vpdpbuud_512)
-             .Case("uuds.512", Intrinsic::x86_avx10_vpdpbuuds_512)
-             .Default(Intrinsic::not_intrinsic);
-    if (ID != Intrinsic::not_intrinsic)
-      return upgradeX86MultiplyAddBytes(F, ID, NewFn);
+  if (Name.consume_front("avx10.")) {
+    if (Name.consume_front("vpdpb")) {
+      // Added in 21.1
+      ID = StringSwitch<Intrinsic::ID>(Name)
+               .Case("ssd.512", Intrinsic::x86_avx10_vpdpbssd_512)
+               .Case("ssds.512", Intrinsic::x86_avx10_vpdpbssds_512)
+               .Case("sud.512", Intrinsic::x86_avx10_vpdpbsud_512)
+               .Case("suds.512", Intrinsic::x86_avx10_vpdpbsuds_512)
+               .Case("uud.512", Intrinsic::x86_avx10_vpdpbuud_512)
+               .Case("uuds.512", Intrinsic::x86_avx10_vpdpbuuds_512)
+               .Default(Intrinsic::not_intrinsic);
+      if (ID != Intrinsic::not_intrinsic)
+        return upgradeX86MultiplyAddBytes(F, ID, NewFn);
+    } else if (Name.consume_front("vpdpw")) {
+      ID = StringSwitch<Intrinsic::ID>(Name)
+               .Case("sud.512", Intrinsic::x86_avx10_vpdpwsud_512)
+               .Case("suds.512", Intrinsic::x86_avx10_vpdpwsuds_512)
+               .Case("usd.512", Intrinsic::x86_avx10_vpdpwusd_512)
+               .Case("usds.512", Intrinsic::x86_avx10_vpdpwusds_512)
+               .Case("uud.512", Intrinsic::x86_avx10_vpdpwuud_512)
+               .Case("uuds.512", Intrinsic::x86_avx10_vpdpwuuds_512)
+               .Default(Intrinsic::not_intrinsic);
+      if (ID != Intrinsic::not_intrinsic)
+        return upgradeX86MultiplyAddWords(F, ID, NewFn);
+    }
     return false; // No other 'x86.avx10.*'
   }
 
@@ -4315,6 +4379,32 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
 
     Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
                      CI->getArgOperand(2)};
+
+    // Input arguments types were incorrectly set to vectors of i32 before but
+    // they should be vectors of i16. Insert bit cast when encountering the old
+    // types
+    if (Args[1]->getType()->isVectorTy() &&
+        cast<VectorType>(Args[1]->getType())
+            ->getElementType()
+            ->isIntegerTy(32) &&
+        Args[2]->getType()->isVectorTy() &&
+        cast<VectorType>(Args[2]->getType())
+            ->getElementType()
+            ->isIntegerTy(32)) {
+      Type *NewArgType = nullptr;
+      if (VecWidth == 128)
+        NewArgType = VectorType::get(Builder.getInt16Ty(), 8, false);
+      else if (VecWidth == 256)
+        NewArgType = VectorType::get(Builder.getInt16Ty(), 16, false);
+      else if (VecWidth == 512)
+        NewArgType = VectorType::get(Builder.getInt16Ty(), 32, false);
+      else
+        llvm_unreachable("Unexpected vector bit width");
+
+      Args[1] = Builder.CreateBitCast(Args[1], NewArgType);
+      Args[2] = Builder.CreateBitCast(Args[2], NewArgType);
+    }
+
     Rep = Builder.CreateIntrinsic(IID, Args);
     Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
                                : CI->getArgOperand(0);
@@ -5390,6 +5480,39 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
     NewCall = Builder.CreateCall(NewFn, Args);
     break;
   }
+  case Intrinsic::x86_avx512_vpdpwssd_128:
+  case Intrinsic::x86_avx512_vpdpwssd_256:
+  case Intrinsic::x86_avx512_vpdpwssd_512:
+  case Intrinsic::x86_avx512_vpdpwssds_128:
+  case Intrinsic::x86_avx512_vpdpwssds_256:
+  case Intrinsic::x86_avx512_vpdpwssds_512:
+  case Intrinsic::x86_avx2_vpdpwsud_128:
+  case Intrinsic::x86_avx2_vpdpwsud_256:
+  case Intrinsic::x86_avx10_vpdpwsud_512:
+  case Intrinsic::x86_avx2_vpdpwsuds_128:
+  case Intrinsic::x86_avx2_vpdpwsuds_256:
+  case Intrinsic::x86_avx10_vpdpwsuds_512:
+  case Intrinsic::x86_avx2_vpdpwusd_128:
+  case Intrinsic::x86_avx2_vpdpwusd_256:
+  case Intrinsic::x86_avx10_vpdpwusd_512:
+  case Intrinsic::x86_avx2_vpdpwusds_128:
+  case Intrinsic::x86_avx2_vpdpwusds_256:
+  case Intrinsic::x86_avx10_vpdpwusds_512:
+  case Intrinsic::x86_avx2_vpdpwuud_128:
+  case Intrinsic::x86_avx2_vpdpwuud_256:
+  case Intrinsic::x86_avx10_vpdpwuud_512:
+  case Intrinsic::x86_avx2_vpdpwuuds_128:
+  case Intrinsic::x86_avx2_vpdpwuuds_256:
+  case Intrinsic::x86_avx10_vpdpwuuds_512:
+    unsigned NumElts = CI->getType()->getPrimitiveSizeInBits() / 16;
+    Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                     CI->getArgOperand(2)};
+    Type *NewArgType = VectorType::get(Builder.getInt16Ty(), NumElts, false);
+    Args[1] = Builder.CreateBitCast(Args[1], NewArgType);
+    Args[2] = Builder.CreateBitCast(Args[2], NewArgType);
+
+    NewCall = Builder.CreateCall(NewFn, Args);
+    break;
   }
   assert(NewCall && "Should have either set this variable or returned through "
                     "the default case");
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index b38e305df0ce4..32ee16c89b4fe 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -5896,52 +5896,118 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     //
     // Multiply and Add Signed Word Integers
     //   < 4 x i32> @llvm.x86.avx512.vpdpwssd.128
-    //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+    //                  (< 4 x i32>, < 8 x i16>, < 8 x i16>)
     //   < 8 x i32> @llvm.x86.avx512.vpdpwssd.256
-    //                  (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+    //                  (< 8 x i32>, <16 x i16>, <16 x i16>)
     //   <16 x i32> @llvm.x86.avx512.vpdpwssd.512
-    //                  (<16 x i32>, <16 x i32>, <16 x i32>)
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>)
     //
     // Multiply and Add Signed Word Integers With Saturation
     //   < 4 x i32> @llvm.x86.avx512.vpdpwssds.128
-    //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+    //                  (< 4 x i32>, < 8 x i16>, < 8 x i16>)
     //   < 8 x i32> @llvm.x86.avx512.vpdpwssds.256
-    //                  (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+    //                  (< 8 x i32>, <16 x i16>, <16 x i16>)
     //   <16 x i32> @llvm.x86.avx512.vpdpwssds.512
-    //                  (<16 x i32>, <16 x i32>, <16 x i32>)
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>)
+    //
+    // Multiply and Add Signed and Unsigned Word Integers
+    //   < 4 x i32> @llvm.x86.avx2.vpdpwsud.128
+    //                  (< 4 x i32>, < 8 x i16>, < 8 x i16>)
+    //   < 8 x i32> @llvm.x86.avx2.vpdpwsud.256
+    //                  (< 8 x i32>, <16 x i16>, <16 x i16>)
+    //   <16 x i32> @llvm.x86.avx10.vpdpwsud.512
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>)
+    //
+    // Multiply and Add Signed and Unsigned Word Integers With Saturation
+    //   < 4 x i32> @llvm.x86.avx2.vpdpwsuds.128
+    //                  (< 4 x i32>, < 8 x i16>, < 8 x i16>)
+    //   < 8 x i32> @llvm.x86.avx2.vpdpwsuds.256
+    //                  (< 8 x i32>, <16 x i16>, <16 x i16>)
+    //   <16 x i32> @llvm.x86.avx10.vpdpwsuds.512
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>)
+    //
+    // Multiply and Add Unsigned and Signed Word Integers
+    //   < 4 x i32> @llvm.x86.avx2.vpdpwusd.128
+    //                  (< 4 x i32>, < 8 x i16>, < 8 x i16>)
+    //   < 8 x i32> @llvm.x86.avx2.vpdpwusd.256
+    //                  (< 8 x i32>, <16 x i16>, <16 x i16>)
+    //   <16 x i32> @llvm.x86.avx10.vpdpwusd.512
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>)
+    //
+    // Multiply and Add Unsigned and Signed Word Integers With Saturation
+    //   < 4 x i32> @llvm.x86.avx2.vpdpwusds.128
+    //                  (< 4 x i32>, < 8 x i16>, < 8 x i16>)
+    //   < 8 x i32> @llvm.x86.avx2.vpdpwusds.256
+    //                  (< 8 x i32>, <16 x i16>, <16 x i16>)
+    //   <16 x i32> @llvm.x86.avx10.vpdpwusds.512
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>)
+    //
+    // Multiply and Add Unsigned and Unsigned Word Integers
+    //   < 4 x i32> @llvm.x86.avx2.vpdpwuud.128
+    //                  (< 4 x i32>, < 8 x i16>, < 8 x i16>)
+    //   < 8 x i32> @llvm.x86.avx2.vpdpwuud.256
+    //                  (< 8 x i32>, <16 x i16>, <16 x i16>)
+    //   <16 x i32> @llvm.x86.avx10.vpdpwuud.512
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>)
+    //
+    // Multiply and Add Unsigned and Unsigned Word Integers With Saturation
+    //   < 4 x i32> @llvm.x86.avx2.vpdpwuuds.128
+    //                  (< 4 x i32>, < 8 x i16>, < 8 x i16>)
+    //   < 8 x i32> @llvm.x86.avx2.vpdpwuuds.256
+    //                  (< 8 x i32>, <16 x i16>, <16 x i16>)
+    //   <16 x i32> @llvm.x86.avx10.vpdpwuuds.512
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>)
     //
     // These intrinsics are auto-upgraded into non-masked forms:
     //   <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128
-    //                 (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //                 (<4 x i32>, <8 x i16>, <8 x i16>, i8)
     //   <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128
-    //                 (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //                 (<4 x i32>, <8 x i16>, <8 x i16>, i8)
     //   <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256
-    //                 (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //                 (<8 x i32>, <16 x i16>, <16 x i16>, i8)
     //   <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256
-    //                 (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //                 (<8 x i32>, <16 x i16>, <16 x i16>, i8)
     //   <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512
-    //                 (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //                 (<16 x i32>, <32 x i16>, <32 x i16>, i16)
     //   <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512
-    //                 (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //                 (<16 x i32>, <32 x i16>, <32 x i16>, i16)
     //
     //   <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128
-    //                 (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //                 (<4 x i32>, <8 x i16>, <8 x i16>, i8)
     //   <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128
-    //                 (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //                 (<4 x i32>, <8 x i16>, <8 x i16>, i8)
     //   <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256
-    //                 (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //                 (<8 x i32>, <16 x i16>, <16 x i16>, i8)
     //   <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256
-    //                 (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //                 (<8 x i32>, <16 x i16>, <16 x i16>, i8)
     //   <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512
-    //                 (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //                 (<16 x i32>, <32 x i16>, <32 x i16>, i16)
     //   <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512
-    //                 (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //                 (<16 x i32>, <32 x i16>, <32 x i16>, i16)
     case Intrinsic::x86_avx512_vpdpwssd_128:
     case Intrinsic::x86_avx512_vpdpwssd_256:
     case Intrinsic::x86_avx512_vpdpwssd_512:
     case Intrinsic::x86_avx512_vpdpwssds_128:
     case Intrinsic::x86_avx512_vpdpwssds_256:
     case Intrinsic::x86_avx512_vpdpwssds_512:
+    case Intrinsic::x86_avx2_vpdpwsud_128:
+    case Intrinsic::x86_avx2_vpdpwsud_256:
+    case Intrinsic::x86_avx10_vpdpwsud_512:
+    case Intrinsic::x86_avx2_vpdpwsuds_128:
+    case Intrinsic::x86_avx2_vpdpwsuds_256:
+    case Intrinsic::x86_avx10_vpdpwsuds_512:
+    case Intrinsic::x86_avx2_vpdpwusd_128:
+    case Intrinsic::x86_avx2_vpdpwusd_256:
+    case Intrinsic::x86_avx10_vpdpwusd_512:
+    case Intrinsic::x86_avx2_vpdpwusds_128:
+    case Intrinsic::x86_avx2_vpdpwusds_256:
+    case Intrinsic::x86_avx10_vpdpwusds_512:
+    case Intrinsic::x86_avx2_vpdpwuud_128:
+    case Intrinsic::x86_avx2_vpdpwuud_256:
+    case Intrinsic::x86_avx10_vpdpwuud_512:
+    case Intrinsic::x86_avx2_vpdpwuuds_128:
+    case Intrinsic::x86_avx2_vpdpwuuds_256:
+    case Intrinsic::x86_avx10_vpdpwuuds_512:
       handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2,
                                  /*ZeroPurifies=*/true, /*EltSizeInBits=*/16);
       break;
diff --git a/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll b/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll
index 76d84c1159ee4..860d60ff0d4e1 100644
--- a/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll
@@ -97,3 +97,99 @@ define <16 x i32>@test_int_x86_avx10_vpdpbuuds_512(<16 x i32> %x0, <16 x i32> %x
   %res = call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
   ret <16 x i32> %res
 }
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwsud_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwsud_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpdpwsud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwsud_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpdpwsud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwsuds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwsuds_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpdpwsuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd3,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwsuds_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpdpwsuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd3,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwusd_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpdpwusd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd2,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwusd_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpdpwusd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd2,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwusds_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpdpwusds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd3,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwusds_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpdpwusds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd3,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwuud_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwuud_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpdpwuud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd2,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwuud_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpdpwuud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd2,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwuuds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwuuds_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpdpwuuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd3,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwuuds_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpdpwuuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd3,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  ret <16 x i32> %res
+}
diff --git a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
index a2aad604f19bc..e9c6cb6a19ba4 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
@@ -220,7 +220,7 @@ declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <64 x i8>, <64 x i8
 
 ; VNNI INT16
 
-define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
+define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <32 x i16> %__A, ptr %pB) {
 ; X86-LABEL: test_mm512_dpwsud_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
@@ -231,12 +231,12 @@ define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr
 ; X64:       # %bb.0:
 ; X64-NEXT:    vpdpwsud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0x07]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %__B = load <16 x i32>, ptr %pB
-  %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %__B = load <32 x i16>, ptr %pB
+  %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) {
+define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <32 x i16> %__A, <32 x i16> %__B) {
 ; X86-LABEL: test_mm512_mask_dpwsuds_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
@@ -248,13 +248,13 @@ define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %_
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpdpwsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0xd3,0xc2]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
   %bst = bitcast i16 %__U to <16 x i1>
   %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) {
+define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) {
 ; X86-LABEL: test_mm512_maskz_dpwsud_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
@@ -266,14 +266,14 @@ define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %_
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpdpwsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0xd2,0xc2]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
   %bst = bitcast i16 %__U to <16 x i1>
   %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 
-declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>)
-declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <32 x i16>, <32 x i16>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <32 x i16>, <32 x i16>)
 
 define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
 ; X86-LABEL: test_mm512_dpwusd_epi32:
diff --git a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
index 1f270d539cdb4..bf7f9375570f9 100644
--- a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
@@ -334,7 +334,7 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>)
 
 ; VNNI INT16
 
-define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
+define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <8 x i16> %__A, <8 x i16> %__B) {
 ; X86-LABEL: test_mm_mask_dpwsud_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -346,13 +346,13 @@ define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpdpwsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0xd2,0xc2]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B)
   %bst = bitcast i4 %__U to <4 x i1>
   %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) {
 ; X86-LABEL: test_mm_maskz_dpwsuds_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -364,13 +364,13 @@ define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpdpwsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0xd3,0xc2]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B)
   %bst = bitcast i4 %__U to <4 x i1>
   %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 
-define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <16 x i16> %__A, <16 x i16> %__B) {
 ; X86-LABEL: test_mm256_maskz_dpwsuds_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -382,13 +382,13 @@ define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpdpwsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0xd3,0xc2]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B)
   %bst = bitcast i8 %__U to <8 x i1>
   %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) {
 ; X86-LABEL: test_mm256_mask_dpwsud_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -400,16 +400,16 @@ define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W,
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpdpwsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0xd2,0xc2]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B)
   %bst = bitcast i8 %__U to <8 x i1>
   %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 
-declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <16 x i16>, <16 x i16>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <16 x i16>, <16 x i16>)
 
 define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
 ; X86-LABEL: test_mm_mask_dpwusd_epi32:
diff --git a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
index b8ebe2a4890a1..ddf0050dbd74a 100644
--- a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
@@ -178,18 +178,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
   ret { <4 x i32>, <4 x i32> } %res2
 }
 
-declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>)
 
-define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x52,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
   ret <8 x i32> %1
 }
 
-define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, ptr %x2p, <16 x i16> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
@@ -209,11 +209,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; X64-NEXT:    vpdpwssd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x52,0xda]
 ; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %x2 = load <8 x i32>, ptr %x2p
-  %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  %x2 = load <16 x i16>, ptr %x2p
+  %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
   %2 = bitcast i8 %x3 to <8 x i1>
   %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0
-  %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+  %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x4)
   %5 = bitcast i8 %x3 to <8 x i1>
   %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
   %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
@@ -221,18 +221,18 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
   ret { <8 x i32>, <8 x i32> } %res2
 }
 
-declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>)
 
-define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x52,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
   ret <4 x i32> %1
 }
 
-define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p, <8 x i16> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
@@ -252,12 +252,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; X64-NEXT:    vpdpwssd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x52,0xda]
 ; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %x2 = load <4 x i32>, ptr %x2p
-  %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  %x2 = load <8 x i16>, ptr %x2p
+  %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
   %2 = bitcast i8 %x3 to <8 x i1>
   %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0
-  %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+  %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x4)
   %5 = bitcast i8 %x3 to <8 x i1>
   %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
@@ -266,18 +266,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
   ret { <4 x i32>, <4 x i32> } %res2
 }
 
-declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>)
 
-define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpwssds_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x53,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
   ret <8 x i32> %1
 }
 
-define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, ptr %x2p, <16 x i16> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_256:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
@@ -297,11 +297,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ; X64-NEXT:    vpdpwssds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x53,0xda]
 ; X64-NEXT:    vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %x2 = load <8 x i32>, ptr %x2p
-  %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  %x2 = load <16 x i16>, ptr %x2p
+  %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
   %2 = bitcast i8 %x3 to <8 x i1>
   %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0
-  %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+  %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x4)
   %5 = bitcast i8 %x3 to <8 x i1>
   %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
   %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
@@ -309,9 +309,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
   ret { <8 x i32>, <8 x i32> } %res2
 }
 
-declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>)
 
-define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p) {
+define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p) {
 ; X86-LABEL: test_int_x86_avx512_vpdpwssds_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
@@ -322,12 +322,12 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1,
 ; X64:       # %bb.0:
 ; X64-NEXT:    vpdpwssds (%rdi), %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x53,0x07]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %x2 = load <4 x i32>, ptr %x2p
-  %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  %x2 = load <8 x i16>, ptr %x2p
+  %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
   ret <4 x i32> %1
 }
 
-define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p, <8 x i16> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
@@ -347,12 +347,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
 ; X64-NEXT:    vpdpwssds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x53,0xda]
 ; X64-NEXT:    vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %x2 = load <4 x i32>, ptr %x2p
-  %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  %x2 = load <8 x i16>, ptr %x2p
+  %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
   %2 = bitcast i8 %x3 to <8 x i1>
   %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0
-  %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+  %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x4)
   %5 = bitcast i8 %x3 to <8 x i1>
   %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll
index 63ff88a7fa4ae..2aabfab1c8666 100644
--- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll
@@ -102,21 +102,39 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpbusds_512(<16 x
   ret { <16 x i32>, <16 x i32> } %res3
 }
 
-declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 
-define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
-; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_512:
+define <16 x i32>@test_int_x86_avx512_vpdpwssd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_vpdpwssd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x52,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+  %res = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
   ret <16 x i32> %res
 }
 
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+  ret <16 x i32> %res
+}
+
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+; X86-LABEL: test_int_x86_avx512_maskz_vpdpwssd_512:
+; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
@@ -125,7 +143,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
-; X64-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
+; X64-LABEL: test_int_x86_avx512_maskz_vpdpwssd_512:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
@@ -141,21 +159,39 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
   ret { <16 x i32>, <16 x i32> } %res3
 }
 
-declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 
 define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpwssds_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x53,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+  %res = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
   ret <16 x i32> %res
 }
 
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
 ; X86:       # %bb.0:
+; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpdpwssds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT:    vpdpwssds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+  ret <16 x i32> %res
+}
+
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+; X86-LABEL: test_int_x86_avx512_maskz_vpdpwssds_512:
+; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
@@ -164,7 +200,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
 ; X86-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT:    retl # encoding: [0xc3]
 ;
-; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
+; X64-LABEL: test_int_x86_avx512_maskz_vpdpwssds_512:
 ; X64:       # %bb.0:
 ; X64-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X64-NEXT:    kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
index 60d0298e057f3..e97b8a5c5503f 100644
--- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
@@ -86,18 +86,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
   ret { <16 x i32>, <16 x i32> } %res2
 }
 
-declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <32 x i16>, <32 x i16>)
 
-define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x52,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2)
   ret <16 x i32> %1
 }
 
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <32 x i16> %x1, ptr %x2p, <32 x i16> %x4, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
@@ -116,11 +116,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ; X64-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x52,0xda]
 ; X64-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %x2 = load <16 x i32>, ptr %x2p
-  %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  %x2 = load <32 x i16>, ptr %x2p
+  %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2)
   %2 = bitcast i16 %x3 to <16 x i1>
   %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
-  %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
+  %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x4)
   %5 = bitcast i16 %x3 to <16 x i1>
   %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
   %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0
@@ -128,18 +128,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
   ret { <16 x i32>, <16 x i32> } %res2
 }
 
-declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <32 x i16>, <32 x i16>)
 
-define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_ask_vpdpwssds_512:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x53,0xc2]
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2)
   ret <16 x i32> %1
 }
 
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <32 x i16> %x1, ptr %x2p, <32 x i16> %x4, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
@@ -158,11 +158,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
 ; X64-NEXT:    vpdpwssds %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x53,0xda]
 ; X64-NEXT:    vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %x2 = load <16 x i32>, ptr %x2p
-  %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  %x2 = load <32 x i16>, ptr %x2p
+  %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2)
   %2 = bitcast i16 %x3 to <16 x i1>
   %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
-  %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
+  %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x4)
   %5 = bitcast i16 %x3 to <16 x i1>
   %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
   %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0
diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll
index 0f4a4f27b9715..f359ecef8ceb3 100644
--- a/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll
@@ -45,3 +45,47 @@ define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8
     %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
     ret <8 x i32> %res
 }
+
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpwssd_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+    %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+    ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpwssd_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+    %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+    ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpwssds_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+    %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+    ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpwssds_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+    %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+    ret <8 x i32> %res
+}
diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
index de8b2a41bf8c8..5748a426c76c3 100644
--- a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
@@ -68,9 +68,9 @@ define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, <1
   ret <4 x i32> %res
 }
 
-declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>)
 
-define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) {
 ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_256:
 ; AVXVNNI:       # %bb.0:
 ; AVXVNNI-NEXT:    {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2]
@@ -80,13 +80,13 @@ define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8
 ; AVX512VNNI:       # %bb.0:
 ; AVX512VNNI-NEXT:    {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x52,0xc2]
 ; AVX512VNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
   ret <8 x i32> %res
 }
 
-declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>)
 
-define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) {
 ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_128:
 ; AVXVNNI:       # %bb.0:
 ; AVXVNNI-NEXT:    {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2]
@@ -96,13 +96,13 @@ define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4
 ; AVX512VNNI:       # %bb.0:
 ; AVX512VNNI-NEXT:    {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x52,0xc2]
 ; AVX512VNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
   ret <4 x i32> %res
 }
 
-declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>)
 
-define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) {
 ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_256:
 ; AVXVNNI:       # %bb.0:
 ; AVXVNNI-NEXT:    {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2]
@@ -112,13 +112,13 @@ define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8
 ; AVX512VNNI:       # %bb.0:
 ; AVX512VNNI-NEXT:    {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x53,0xc2]
 ; AVX512VNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+  %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
   ret <8 x i32> %res
 }
 
-declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>)
 
-define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) {
 ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_128:
 ; AVXVNNI:       # %bb.0:
 ; AVXVNNI-NEXT:    {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2]
@@ -128,6 +128,6 @@ define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4
 ; AVX512VNNI:       # %bb.0:
 ; AVX512VNNI-NEXT:    {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x53,0xc2]
 ; AVX512VNNI-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+  %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
   ret <4 x i32> %res
 }
diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll
new file mode 100644
index 0000000000000..abdc296ae1e1c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll
@@ -0,0 +1,185 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10
+
+define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd2,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsud_128:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd2,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd2,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsud_256:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd2,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd3,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsuds_128:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd3,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd3,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsuds_256:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd3,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd2,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusd_128:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd2,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd2,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusd_256:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd2,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd3,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusds_128:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd3,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd3,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusds_256:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd3,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd2,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuud_128:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd2,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd2,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuud_256:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd2,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd3,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuuds_128:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd3,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd3,0xc2]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuuds_256:
+; AVX10:       # %bb.0:
+; AVX10-NEXT:    vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd3,0xc2]
+; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
index abdc296ae1e1c..7576b12645bd0 100644
--- a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
@@ -4,7 +4,7 @@
 ; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10
 ; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10
 
-define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd2,0xc2]
@@ -14,12 +14,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4
 ; AVX10:       # %bb.0:
 ; AVX10-NEXT:    vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd2,0xc2]
 ; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
   ret <4 x i32> %ret
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
 
-define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd2,0xc2]
@@ -29,12 +29,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8
 ; AVX10:       # %bb.0:
 ; AVX10-NEXT:    vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd2,0xc2]
 ; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
   ret <8 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
 
-define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd3,0xc2]
@@ -44,12 +44,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4
 ; AVX10:       # %bb.0:
 ; AVX10-NEXT:    vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd3,0xc2]
 ; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
   ret <4 x i32> %ret
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
 
-define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd3,0xc2]
@@ -59,12 +59,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8
 ; AVX10:       # %bb.0:
 ; AVX10-NEXT:    vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd3,0xc2]
 ; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
   ret <8 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
 
-define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd2,0xc2]
@@ -74,12 +74,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4
 ; AVX10:       # %bb.0:
 ; AVX10-NEXT:    vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd2,0xc2]
 ; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
   ret <4 x i32> %ret
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
 
-define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd2,0xc2]
@@ -89,12 +89,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8
 ; AVX10:       # %bb.0:
 ; AVX10-NEXT:    vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd2,0xc2]
 ; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
   ret <8 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
 
-define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd3,0xc2]
@@ -104,12 +104,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4
 ; AVX10:       # %bb.0:
 ; AVX10-NEXT:    vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd3,0xc2]
 ; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
   ret <4 x i32> %ret
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
 
-define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd3,0xc2]
@@ -119,12 +119,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8
 ; AVX10:       # %bb.0:
 ; AVX10-NEXT:    vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd3,0xc2]
 ; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
   ret <8 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
 
-define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd2,0xc2]
@@ -134,12 +134,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4
 ; AVX10:       # %bb.0:
 ; AVX10-NEXT:    vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd2,0xc2]
 ; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
   ret <4 x i32> %ret
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
 
-define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd2,0xc2]
@@ -149,12 +149,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8
 ; AVX10:       # %bb.0:
 ; AVX10-NEXT:    vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd2,0xc2]
 ; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
   ret <8 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
 
-define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd3,0xc2]
@@ -164,12 +164,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4
 ; AVX10:       # %bb.0:
 ; AVX10-NEXT:    vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd3,0xc2]
 ; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
   ret <4 x i32> %ret
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
 
-define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd3,0xc2]
@@ -179,7 +179,7 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8
 ; AVX10:       # %bb.0:
 ; AVX10-NEXT:    vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd3,0xc2]
 ; AVX10-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
   ret <8 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll
index cd576b19f8766..345fa0efa42df 100644
--- a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll
@@ -4,16 +4,16 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-unknown"
 
-declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>)
 declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>)
 declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>)
 declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>)
 declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>)
 
-define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
 ; CHECK-LABEL: stack_fold_vpdpwssd:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -23,11 +23,11 @@ define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a
 ; CHECK-NEXT:    {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+  %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2)
   ret <4 x i32> %2
 }
 
-define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
 ; CHECK-LABEL: stack_fold_vpdpwssd_commuted:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -37,11 +37,11 @@ define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4
 ; CHECK-NEXT:    {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+  %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <8 x i16> %a2, <8 x i16> %a1)
   ret <4 x i32> %2
 }
 
-define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
 ; CHECK-LABEL: stack_fold_vpdpwssd_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -51,11 +51,11 @@ define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32
 ; CHECK-NEXT:    {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+  %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2)
   ret <8 x i32> %2
 }
 
-define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
 ; CHECK-LABEL: stack_fold_vpdpwssd_256_commuted:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -65,11 +65,11 @@ define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1,
 ; CHECK-NEXT:    {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+  %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <16 x i16> %a2, <16 x i16> %a1)
   ret <8 x i32> %2
 }
 
-define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
 ; CHECK-LABEL: stack_fold_vpdpwssds:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -79,11 +79,11 @@ define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %
 ; CHECK-NEXT:    {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+  %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2)
   ret <4 x i32> %2
 }
 
-define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) {
 ; CHECK-LABEL: stack_fold_vpdpwssds_commuted:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -93,11 +93,11 @@ define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4
 ; CHECK-NEXT:    {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
+  %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <8 x i16> %a2, <8 x i16> %a1)
   ret <4 x i32> %2
 }
 
-define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
 ; CHECK-LABEL: stack_fold_vpdpwssds_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -107,11 +107,11 @@ define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3
 ; CHECK-NEXT:    {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
+  %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2)
   ret <8 x i32> %2
 }
 
-define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
+define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) {
 ; CHECK-LABEL: stack_fold_vpdpwssds_256_commuted:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -121,7 +121,7 @@ define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1
 ; CHECK-NEXT:    {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
 ; CHECK-NEXT:    retq
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
+  %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <16 x i16> %a2, <16 x i16> %a1)
   ret <8 x i32> %2
 }
 
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll
index 534352f322001..47537c8c4fb6e 100644
--- a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -O3 -disable-peephole -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
 
-declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
-declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
-declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
-declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
 declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
 declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
 declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
@@ -14,7 +14,7 @@ declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i
 declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
 declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
 
-define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -26,11 +26,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4
 ; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x72,0xd2,0x44,0x24,0xe8]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
   ret <4 x i32> %ret
 }
 
-define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -42,11 +42,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8
 ; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x76,0xd2,0x44,0x24,0xd8]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
   ret <8 x i32> %ret
 }
 
-define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -58,11 +58,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4
 ; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x72,0xd3,0x44,0x24,0xe8]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
   ret <4 x i32> %ret
 }
 
-define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -74,7 +74,7 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8
 ; CHECK-NEXT:    # encoding: [0xc4,0xe2,0x76,0xd3,0x44,0x24,0xd8]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
   ret <8 x i32> %ret
 }
 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll
index 8900085af030d..9d5cabca5fef8 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll
@@ -497,12 +497,12 @@ declare <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32>, <64 x i8>, <64 x i8>
 declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <64 x i8>, <64 x i8>)
 
 
-define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory {
+define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <32 x i16> %__A, ptr %pB) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_dpwsud_epi32(
-; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
@@ -510,87 +510,123 @@ define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       [[BB5]]:
-; CHECK-NEXT:    [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64
+; CHECK-NEXT:    [[__B:%.*]] = load <32 x i16>, ptr [[PB]], align 64
 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT:    [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
-; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP8]], align 64
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <32 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = and <32 x i1> [[TMP9]], [[TMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <32 x i1> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <32 x i16> [[TMP18]] to <16 x i32>
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32>
+; CHECK-NEXT:    [[TMP22:%.*]] = or <16 x i32> [[TMP21]], [[TMP4]]
+; CHECK-NEXT:    [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]])
+; CHECK-NEXT:    store <16 x i32> [[TMP22]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[RES]]
 ;
-  %__B = load <16 x i32>, ptr %pB
-  %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %__B = load <32 x i16>, ptr %pB
+  %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
+define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpwsuds_epi32(
-; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = and <32 x i1> [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <32 x i1> [[TMP21]], [[TMP20]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <32 x i1> [[TMP19]], [[TMP22]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <32 x i1> [[TMP23]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32>
+; CHECK-NEXT:    [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT:    [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1>
-; CHECK-NEXT:    [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[TMP18]], <16 x i32> [[TMP1]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor <16 x i32> [[DPI]], [[__W]]
-; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]]
 ; CHECK-NEXT:    [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> [[__W]]
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[RES]]
 ;
-  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
   %bst = bitcast i16 %__U to <16 x i1>
   %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
+define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpwsud_epi32(
-; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
+; CHECK-NEXT:    [[TMP19:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = and <32 x i1> [[TMP20]], [[TMP21]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <32 x i1> [[TMP22]], [[TMP21]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <32 x i1> [[TMP20]], [[TMP23]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <32 x i1> [[TMP24]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32>
+; CHECK-NEXT:    [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP19]]
+; CHECK-NEXT:    [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1>
-; CHECK-NEXT:    [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[TMP18]], <16 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor <16 x i32> [[DPI]], zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]]
+; CHECK-NEXT:    [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]]
 ; CHECK-NEXT:    [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> zeroinitializer
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[RES]]
 ;
-  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
   %bst = bitcast i16 %__U to <16 x i1>
   %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 
-declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>)
-declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <32 x i16>, <32 x i16>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <32 x i16>, <32 x i16>)
 
-define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory {
+define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <32 x i16> %__A, ptr %pB) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_dpwusd_epi32(
-; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
@@ -598,33 +634,57 @@ define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       [[BB5]]:
-; CHECK-NEXT:    [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64
+; CHECK-NEXT:    [[__B:%.*]] = load <32 x i16>, ptr [[PB]], align 64
 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT:    [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP8]], align 64
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <32 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = and <32 x i1> [[TMP9]], [[TMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <32 x i1> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <32 x i16> [[TMP18]] to <16 x i32>
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP21]], [[TMP2]]
+; CHECK-NEXT:    [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[RES]]
 ;
-  %__B = load <16 x i32>, ptr %pB
-  %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %__B = load <32 x i16>, ptr %pB
+  %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_mm512_mask_dpwusds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
+define <16 x i32> @test_mm512_mask_dpwusds_epi32(<16 x i32> %__W, i16 zeroext %__U, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpwusds_epi32(
-; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT:    [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]]
@@ -636,23 +696,35 @@ define <16 x i32> @test_mm512_mask_dpwusds_epi32(<16 x i32> %__W, i16 zeroext %_
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[RES]]
 ;
-  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
   %bst = bitcast i16 %__U to <16 x i1>
   %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_mm512_maskz_dpwusd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
+define <16 x i32> @test_mm512_maskz_dpwusd_epi32(i16 zeroext %__U, <16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpwusd_epi32(
-; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT:    [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
@@ -664,21 +736,21 @@ define <16 x i32> @test_mm512_maskz_dpwusd_epi32(i16 zeroext %__U, <16 x i32> %_
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[RES]]
 ;
-  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
   %bst = bitcast i16 %__U to <16 x i1>
   %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 
-declare <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32>, <16 x i32>, <16 x i32>)
-declare <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32>, <32 x i16>, <32 x i16>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <32 x i16>, <32 x i16>)
 
-define <16 x i32> @test_mm512_dpwuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory {
+define <16 x i32> @test_mm512_dpwuud_epi32(<16 x i32> %__W, <32 x i16> %__A, ptr %pB) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_dpwuud_epi32(
-; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
 ; CHECK-NEXT:    [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
 ; CHECK-NEXT:    br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]]
@@ -686,33 +758,57 @@ define <16 x i32> @test_mm512_dpwuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr
 ; CHECK-NEXT:    call void @__msan_warning_noreturn() #[[ATTR5]]
 ; CHECK-NEXT:    unreachable
 ; CHECK:       [[BB5]]:
-; CHECK-NEXT:    [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64
+; CHECK-NEXT:    [[__B:%.*]] = load <32 x i16>, ptr [[PB]], align 64
 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT:    [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
+; CHECK-NEXT:    [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP8]], align 64
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP13:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP14:%.*]] = and <32 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP15:%.*]] = and <32 x i1> [[TMP9]], [[TMP12]]
+; CHECK-NEXT:    [[TMP16:%.*]] = or <32 x i1> [[TMP13]], [[TMP14]]
+; CHECK-NEXT:    [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]]
+; CHECK-NEXT:    [[TMP18:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i16>
+; CHECK-NEXT:    [[TMP19:%.*]] = bitcast <32 x i16> [[TMP18]] to <16 x i32>
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP21]], [[TMP2]]
+; CHECK-NEXT:    [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[RES]]
 ;
-  %__B = load <16 x i32>, ptr %pB
-  %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %__B = load <32 x i16>, ptr %pB
+  %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_mm512_mask_dpwuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
+define <16 x i32> @test_mm512_mask_dpwuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpwuuds_epi32(
-; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT:    [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]]
@@ -724,23 +820,35 @@ define <16 x i32> @test_mm512_mask_dpwuuds_epi32(<16 x i32> %__W, i16 zeroext %_
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[RES]]
 ;
-  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
   %bst = bitcast i16 %__U to <16 x i1>
   %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_mm512_maskz_dpwuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory {
+define <16 x i32> @test_mm512_maskz_dpwuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory {
 ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpwuud_epi32(
-; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT:    [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
@@ -752,14 +860,14 @@ define <16 x i32> @test_mm512_maskz_dpwuud_epi32(i16 zeroext %__U, <16 x i32> %_
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[RES]]
 ;
-  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
   %bst = bitcast i16 %__U to <16 x i1>
   %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 
-declare <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32>, <16 x i32>, <16 x i32>)
-declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32>, <32 x i16>, <32 x i16>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <32 x i16>, <32 x i16>)
 
 
 define { <32 x i16>, <32 x i16>, <32 x i16> } @test_mm512_mask_mpsadbw(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) sanitize_memory {
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll
index def7ba3f10770..6a53a1595271e 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll
@@ -739,17 +739,29 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <32 x i8>, <32 x i8>)
 declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>)
 
 
-define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory {
+define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpwsud_epi32(
-; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT:    [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]]
@@ -761,23 +773,35 @@ define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
-  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B)
   %bst = bitcast i4 %__U to <4 x i1>
   %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory {
+define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpwsuds_epi32(
-; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT:    [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer
@@ -789,23 +813,35 @@ define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
-  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B)
   %bst = bitcast i4 %__U to <4 x i1>
   %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 
-define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory {
+define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(
-; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT:    [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]]
@@ -817,23 +853,35 @@ define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
 ;
-  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B)
   %bst = bitcast i8 %__U to <8 x i1>
   %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory {
+define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpwsud_epi32(
-; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT:    [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer
@@ -845,28 +893,40 @@ define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W,
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
 ;
-  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B)
   %bst = bitcast i8 %__U to <8 x i1>
   %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 
-declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <16 x i16>, <16 x i16>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <16 x i16>, <16 x i16>)
 
-define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory {
+define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpwusd_epi32(
-; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT:    [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]]
@@ -878,23 +938,35 @@ define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
-  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B)
   %bst = bitcast i4 %__U to <4 x i1>
   %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_mm_maskz_dpwusds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory {
+define <4 x i32> @test_mm_maskz_dpwusds_epi32(i4 zeroext %__U, <4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpwusds_epi32(
-; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT:    [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer
@@ -906,23 +978,35 @@ define <4 x i32> @test_mm_maskz_dpwusds_epi32(i4 zeroext %__U, <4 x i32> %__W, <
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
-  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B)
   %bst = bitcast i4 %__U to <4 x i1>
   %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 
-define <8 x i32> @test_mm256_maskz_dpwusds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory {
+define <8 x i32> @test_mm256_maskz_dpwusds_epi32(<8 x i32> %__W, i8 zeroext %__U, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpwusds_epi32(
-; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT:    [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]]
@@ -934,23 +1018,35 @@ define <8 x i32> @test_mm256_maskz_dpwusds_epi32(<8 x i32> %__W, i8 zeroext %__U
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
 ;
-  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B)
   %bst = bitcast i8 %__U to <8 x i1>
   %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_mm256_mask_dpwusd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory {
+define <8 x i32> @test_mm256_mask_dpwusd_epi32(i8 zeroext %__U, <8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpwusd_epi32(
-; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT:    [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer
@@ -962,28 +1058,40 @@ define <8 x i32> @test_mm256_mask_dpwusd_epi32(i8 zeroext %__U, <8 x i32> %__W,
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
 ;
-  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B)
   %bst = bitcast i8 %__U to <8 x i1>
   %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 
-declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32>, <16 x i16>, <16 x i16>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32>, <16 x i16>, <16 x i16>)
 
-define <4 x i32> @test_mm_mask_dpwuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory {
+define <4 x i32> @test_mm_mask_dpwuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpwuud_epi32(
-; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT:    [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]]
@@ -995,23 +1103,35 @@ define <4 x i32> @test_mm_mask_dpwuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
-  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B)
   %bst = bitcast i4 %__U to <4 x i1>
   %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_mm_maskz_dpwuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory {
+define <4 x i32> @test_mm_maskz_dpwuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpwuuds_epi32(
-; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT:    [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer
@@ -1023,23 +1143,35 @@ define <4 x i32> @test_mm_maskz_dpwuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
-  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B)
   %bst = bitcast i4 %__U to <4 x i1>
   %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 
-define <8 x i32> @test_mm256_maskz_dpwuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory {
+define <8 x i32> @test_mm256_maskz_dpwuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpwuuds_epi32(
-; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT:    [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]]
@@ -1051,23 +1183,35 @@ define <8 x i32> @test_mm256_maskz_dpwuuds_epi32(<8 x i32> %__W, i8 zeroext %__U
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
 ;
-  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B)
   %bst = bitcast i8 %__U to <8 x i1>
   %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_mm256_mask_dpwuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory {
+define <8 x i32> @test_mm256_mask_dpwuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpwuud_epi32(
-; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]])
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer
+; CHECK-NEXT:    [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]]
+; CHECK-NEXT:    [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT:    [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]]
+; CHECK-NEXT:    [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]])
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer
@@ -1079,16 +1223,16 @@ define <8 x i32> @test_mm256_mask_dpwuud_epi32(i8 zeroext %__U, <8 x i32> %__W,
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
 ;
-  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B)
   %bst = bitcast i8 %__U to <8 x i1>
   %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 
-declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32>, <16 x i16>, <16 x i16>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32>, <16 x i16>, <16 x i16>)
 
 
 define { <8 x i16>, <8 x i16>, <8 x i16> } @test_mask_mpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) sanitize_memory {
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll
index 5e937485ff282..d0daea5e68fea 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll
@@ -528,10 +528,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1,
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer
@@ -546,7 +546,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1,
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
 ;
@@ -574,10 +574,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer
@@ -592,7 +592,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer
 ; CHECK-NEXT:    [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]]
@@ -601,10 +601,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer
 ; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer
@@ -619,7 +619,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer
 ; CHECK-NEXT:    [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32>
 ; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer
@@ -653,10 +653,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1,
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP22]], zeroinitializer
@@ -671,7 +671,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1,
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP22]], <8 x i16> [[TMP5]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
 ;
@@ -699,10 +699,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer
@@ -717,7 +717,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer
 ; CHECK-NEXT:    [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32>
 ; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -728,10 +728,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer
 ; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer
@@ -746,7 +746,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer
 ; CHECK-NEXT:    [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32>
 ; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -783,10 +783,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1,
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer
@@ -801,7 +801,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1,
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
 ;
@@ -829,10 +829,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer
@@ -847,7 +847,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer
 ; CHECK-NEXT:    [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]]
@@ -856,10 +856,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer
 ; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer
@@ -874,7 +874,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer
 ; CHECK-NEXT:    [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32>
 ; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer
@@ -908,10 +908,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1,
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP22]], zeroinitializer
@@ -926,7 +926,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1,
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP22]], <8 x i16> [[TMP5]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
 ;
@@ -954,10 +954,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer
@@ -972,7 +972,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
 ; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer
 ; CHECK-NEXT:    [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32>
 ; CHECK-NEXT:    [[_MSPROP2:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -983,10 +983,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer
 ; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer
@@ -1001,7 +1001,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
 ; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer
 ; CHECK-NEXT:    [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32>
 ; CHECK-NEXT:    [[_MSPROP5:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll
index 1d3046804b74f..f2b1e16e3d5c2 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll
@@ -495,10 +495,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1,
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer
@@ -513,7 +513,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1,
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
 ;
@@ -541,10 +541,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer
@@ -559,7 +559,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer
 ; CHECK-NEXT:    [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]]
@@ -568,10 +568,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer
 ; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer
@@ -586,7 +586,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer
 ; CHECK-NEXT:    [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32>
 ; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer
@@ -623,10 +623,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1,
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP22]], zeroinitializer
@@ -641,7 +641,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1,
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP22]], <8 x i16> [[TMP5]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMP4]]
 ;
@@ -669,10 +669,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer
@@ -687,7 +687,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer
 ; CHECK-NEXT:    [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -698,10 +698,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer
 ; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer
@@ -716,7 +716,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer
 ; CHECK-NEXT:    [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32>
 ; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -757,10 +757,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1,
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer
@@ -775,7 +775,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1,
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[TMP4]]
 ;
@@ -803,10 +803,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer
@@ -821,7 +821,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer
 ; CHECK-NEXT:    [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]]
@@ -830,10 +830,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]]
-; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer
 ; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer
@@ -848,7 +848,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer
 ; CHECK-NEXT:    [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32>
 ; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer
@@ -896,10 +896,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1,
 ; CHECK-NEXT:    [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
 ; CHECK-NEXT:    [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16
-; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP26:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16>
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp ne <8 x i16> [[TMP11]], zeroinitializer
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne <8 x i16> [[TMP12]], zeroinitializer
 ; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i16> [[TMP26]], zeroinitializer
@@ -914,7 +914,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1,
 ; CHECK-NEXT:    [[TMP24:%.*]] = icmp ne <4 x i32> [[TMP23]], zeroinitializer
 ; CHECK-NEXT:    [[TMP25:%.*]] = sext <4 x i1> [[TMP24]] to <4 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP25]], [[TMP4]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP9:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP26]], <8 x i16> [[TMP10]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[TMP9]]
 ;
@@ -943,10 +943,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer
@@ -961,7 +961,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
 ; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer
 ; CHECK-NEXT:    [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -972,10 +972,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]]
-; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer
 ; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer
@@ -990,7 +990,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
 ; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer
 ; CHECK-NEXT:    [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32>
 ; CHECK-NEXT:    [[_MSPROP4:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1>
 ; CHECK-NEXT:    [[_MSPROP5:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll
index 5c99f8a3a1fb6..4e7598b92abcf 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll
@@ -270,10 +270,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer
@@ -288,7 +288,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
 ;
@@ -316,10 +316,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer
@@ -334,7 +334,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer
 ; CHECK-NEXT:    [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]]
@@ -343,10 +343,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer
 ; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer
@@ -361,7 +361,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer
 ; CHECK-NEXT:    [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32>
 ; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer
@@ -395,10 +395,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer
@@ -413,7 +413,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
 ;
@@ -441,10 +441,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer
@@ -459,7 +459,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
 ; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer
 ; CHECK-NEXT:    [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]]
@@ -468,10 +468,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer
 ; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer
@@ -486,7 +486,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
 ; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer
 ; CHECK-NEXT:    [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32>
 ; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll
index 236ff45c6cd08..0fd60149cc15e 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll
@@ -251,10 +251,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer
@@ -269,7 +269,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
 ;
@@ -297,10 +297,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer
@@ -315,7 +315,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer
 ; CHECK-NEXT:    [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]]
@@ -324,10 +324,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer
 ; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer
@@ -342,7 +342,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer
 ; CHECK-NEXT:    [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32>
 ; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer
@@ -379,10 +379,10 @@ define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i3
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16>
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer
@@ -397,7 +397,7 @@ define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i3
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]])
 ; CHECK-NEXT:    store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i32> [[TMP4]]
 ;
@@ -425,10 +425,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080
 ; CHECK-NEXT:    [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr
 ; CHECK-NEXT:    [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64
-; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16>
+; CHECK-NEXT:    [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer
 ; CHECK-NEXT:    [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer
 ; CHECK-NEXT:    [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer
@@ -443,7 +443,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
 ; CHECK-NEXT:    [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer
 ; CHECK-NEXT:    [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]]
-; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]])
+; CHECK-NEXT:    [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]])
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]]
@@ -452,10 +452,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
 ; CHECK-NEXT:    [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]]
 ; CHECK-NEXT:    [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]]
-; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
-; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16>
+; CHECK-NEXT:    [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16>
 ; CHECK-NEXT:    [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer
 ; CHECK-NEXT:    [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer
 ; CHECK-NEXT:    [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer
@@ -470,7 +470,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
 ; CHECK-NEXT:    [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer
 ; CHECK-NEXT:    [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32>
 ; CHECK-NEXT:    [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]]
-; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]])
+; CHECK-NEXT:    [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]])
 ; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1>
 ; CHECK-NEXT:    [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll
index 0344fbd5ee2a9..0a6c5f7b089ba 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll
@@ -143,10 +143,10 @@ define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer
@@ -161,7 +161,7 @@ define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP4]], <16 x i16> [[TMP5]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
 ;
@@ -178,10 +178,10 @@ define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
@@ -196,7 +196,7 @@ define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
@@ -213,10 +213,10 @@ define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer
@@ -231,7 +231,7 @@ define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]])
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP4]], <16 x i16> [[TMP5]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
 ;
@@ -248,10 +248,10 @@ define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
@@ -266,7 +266,7 @@ define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4
 ; CHECK-NEXT:    [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer
 ; CHECK-NEXT:    [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32>
 ; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]])
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RES]]
 ;
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll
index 707b46bb8686e..fd9e0b953d002 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll
@@ -22,218 +22,362 @@
 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory {
+define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(
-; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]])
-; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP18]]
+; CHECK-NEXT:    [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]])
+; CHECK-NEXT:    store <4 x i32> [[TMP17]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RET]]
 ;
-  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
   ret <4 x i32> %ret
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
 
-define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory {
+define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(
-; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]])
-; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP18]]
+; CHECK-NEXT:    [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]])
+; CHECK-NEXT:    store <8 x i32> [[TMP17]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RET]]
 ;
-  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
   ret <8 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
 
-define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory {
+define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(
-; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]])
-; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP18]]
+; CHECK-NEXT:    [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]])
+; CHECK-NEXT:    store <4 x i32> [[TMP17]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RET]]
 ;
-  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
   ret <4 x i32> %ret
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
 
-define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory {
+define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(
-; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
+; CHECK-NEXT:    [[TMP18:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]])
-; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP18]]
+; CHECK-NEXT:    [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]])
+; CHECK-NEXT:    store <8 x i32> [[TMP17]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RET]]
 ;
-  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
   ret <8 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
 
-define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory {
+define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(
-; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP16]], [[TMP1]]
+; CHECK-NEXT:    [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RET]]
 ;
-  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
   ret <4 x i32> %ret
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
 
-define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory {
+define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(
-; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP16]], [[TMP1]]
+; CHECK-NEXT:    [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RET]]
 ;
-  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
   ret <8 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
 
-define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory {
+define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(
-; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP16]], [[TMP1]]
+; CHECK-NEXT:    [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RET]]
 ;
-  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
   ret <4 x i32> %ret
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
 
-define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory {
+define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(
-; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP16]], [[TMP1]]
+; CHECK-NEXT:    [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RET]]
 ;
-  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
   ret <8 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
 
-define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory {
+define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(
-; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP16]], [[TMP1]]
+; CHECK-NEXT:    [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RET]]
 ;
-  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
   ret <4 x i32> %ret
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
 
-define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory {
+define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(
-; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP16]], [[TMP1]]
+; CHECK-NEXT:    [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RET]]
 ;
-  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
   ret <8 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
 
-define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory {
+define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory {
 ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(
-; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <4 x i32> [[TMP16]], [[TMP1]]
+; CHECK-NEXT:    [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]])
 ; CHECK-NEXT:    store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <4 x i32> [[RET]]
 ;
-  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+  %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
   ret <4 x i32> %ret
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
 
-define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory {
+define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory {
 ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(
-; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT:    [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]])
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT:    [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16>
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT:    [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32>
+; CHECK-NEXT:    [[_MSPROP1:%.*]] = or <8 x i32> [[TMP16]], [[TMP1]]
+; CHECK-NEXT:    [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RET]]
 ;
-  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+  %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
   ret <8 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)

From ef47462ce9f1ade501da011a18869ea2a653d2cb Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu@amd.com>
Date: Tue, 9 Dec 2025 17:11:28 +0000
Subject: [PATCH 28/85] [SPIRV] Start adding support for `int128` (#170798)

LLVM has pretty thorough support for `int128`, and it has started seeing
some use. Even thouth we already have support for the
`SPV_ALTERA_arbitrary_precision_integers` extension, the BE was oddly
capping integer width to 64-bits. This patch adds partial support for
lowering 128-bit integers to `OpTypeInt 128`. Some work remains to be
done around legalisation support and validating constant uses (e.g.
cases that get lowered to `OpSpecConstantOp`).
---
 .../SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp   |  8 +++
 llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp    | 19 +++++-
 llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 20 +++---
 llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp  |  3 +-
 llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 15 +++++
 llvm/lib/Target/SPIRV/SPIRVUtils.cpp          |  7 ++
 .../i128-addsub.ll                            | 67 +++++++++++++++++++
 .../i128-arith.ll                             | 27 ++++++++
 .../i128-switch-lower.ll                      | 27 ++++++++
 .../enable-all-extensions-but-one.ll          |  1 +
 10 files changed, 181 insertions(+), 13 deletions(-)
 create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-addsub.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-arith.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-switch-lower.ll

diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
index 62f5e47c5ea3b..42de884840dc9 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
@@ -50,6 +50,14 @@ void SPIRVInstPrinter::printOpConstantVarOps(const MCInst *MI,
   unsigned IsBitwidth16 = MI->getFlags() & SPIRV::INST_PRINTER_WIDTH16;
   const unsigned NumVarOps = MI->getNumOperands() - StartIndex;
 
+  if (MI->getOpcode() == SPIRV::OpConstantI && NumVarOps > 2) {
+    // SPV_ALTERA_arbitrary_precision_integers allows for integer widths greater
+    // than 64, which will be encoded via multiple operands.
+    for (unsigned I = StartIndex; I != MI->getNumOperands(); ++I)
+      O << ' ' << MI->getOperand(I).getImm();
+    return;
+  }
+
   assert((NumVarOps == 1 || NumVarOps == 2) &&
          "Unsupported number of bits for literal variable");
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
index d2a8fddc5d8e4..42edad255ce82 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp
@@ -13,9 +13,14 @@
 
 #include "SPIRVCommandLine.h"
 #include "MCTargetDesc/SPIRVBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/TargetParser/Triple.h"
-#include <algorithm>
+
+#include <functional>
 #include <map>
+#include <string>
+#include <utility>
+#include <vector>
 
 #define DEBUG_TYPE "spirv-commandline"
 
@@ -176,7 +181,17 @@ bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName,
                                   std::set<SPIRV::Extension::Extension> &Vals) {
   SmallVector<StringRef, 10> Tokens;
   ArgValue.split(Tokens, ",", -1, false);
-  std::sort(Tokens.begin(), Tokens.end());
+  llvm::sort(Tokens, [](auto &&LHS, auto &&RHS) {
+    // We want to ensure that we handle "all" first, to ensure that any
+    // subsequent disablement actually behaves as expected i.e. given
+    // --spv-ext=all,-foo, we first enable all and then disable foo; this should
+    // be revisited and simplified.
+    if (LHS == "all")
+      return true;
+    if (RHS == "all")
+      return false;
+    return !(RHS < LHS);
+  });
 
   std::set<SPIRV::Extension::Extension> EnabledExtensions;
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 0fb44052527f0..5d96a67500dff 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -151,22 +151,22 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeBool(MachineIRBuilder &MIRBuilder) {
 }
 
 unsigned SPIRVGlobalRegistry::adjustOpTypeIntWidth(unsigned Width) const {
-  if (Width > 64)
-    report_fatal_error("Unsupported integer width!");
   const SPIRVSubtarget &ST = cast<SPIRVSubtarget>(CurMF->getSubtarget());
   if (ST.canUseExtension(
           SPIRV::Extension::SPV_ALTERA_arbitrary_precision_integers) ||
-      ST.canUseExtension(SPIRV::Extension::SPV_INTEL_int4))
+      (Width == 4 && ST.canUseExtension(SPIRV::Extension::SPV_INTEL_int4)))
     return Width;
   if (Width <= 8)
-    Width = 8;
+    return 8;
   else if (Width <= 16)
-    Width = 16;
+    return 16;
   else if (Width <= 32)
-    Width = 32;
-  else
-    Width = 64;
-  return Width;
+    return 32;
+  else if (Width <= 64)
+    return 64;
+  else if (Width <= 128)
+    return 128;
+  reportFatalUsageError("Unsupported Integer width!");
 }
 
 SPIRVType *SPIRVGlobalRegistry::getOpTypeInt(unsigned Width,
@@ -413,7 +413,7 @@ Register SPIRVGlobalRegistry::createConstInt(const ConstantInt *CI,
           MIB = MIRBuilder.buildInstr(SPIRV::OpConstantI)
                     .addDef(Res)
                     .addUse(getSPIRVTypeID(SpvType));
-          addNumImm(APInt(BitWidth, CI->getZExtValue()), MIB);
+          addNumImm(CI->getValue(), MIB);
         } else {
           MIB = MIRBuilder.buildInstr(SPIRV::OpConstantNull)
                     .addDef(Res)
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index 71df4cced434e..2078bfee2e767 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -48,6 +48,7 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
   const LLT s16 = LLT::scalar(16);
   const LLT s32 = LLT::scalar(32);
   const LLT s64 = LLT::scalar(64);
+  const LLT s128 = LLT::scalar(128);
 
   const LLT v16s64 = LLT::fixed_vector(16, 64);
   const LLT v16s32 = LLT::fixed_vector(16, 32);
@@ -307,7 +308,7 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
           typeInSet(1, allPtrsScalarsAndVectors)));
 
   getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
-      .legalFor({s1})
+      .legalFor({s1, s128})
       .legalFor(allFloatAndIntScalarsAndPtrs)
       .legalFor(allowedVectorTypes)
       .moreElementsToNextPow2(0)
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 73432279c3306..7717f18d14a34 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -1436,6 +1436,21 @@ void addInstrRequirements(const MachineInstr &MI,
       Reqs.addCapability(SPIRV::Capability::Int16);
     else if (BitWidth == 8)
       Reqs.addCapability(SPIRV::Capability::Int8);
+    else if (BitWidth == 4 &&
+             ST.canUseExtension(SPIRV::Extension::SPV_INTEL_int4)) {
+      Reqs.addExtension(SPIRV::Extension::SPV_INTEL_int4);
+      Reqs.addCapability(SPIRV::Capability::Int4TypeINTEL);
+    } else if (BitWidth != 32) {
+      if (!ST.canUseExtension(
+              SPIRV::Extension::SPV_ALTERA_arbitrary_precision_integers))
+        reportFatalUsageError(
+            "OpTypeInt type with a width other than 8, 16, 32 or 64 bits "
+            "requires the following SPIR-V extension: "
+            "SPV_ALTERA_arbitrary_precision_integers");
+      Reqs.addExtension(
+          SPIRV::Extension::SPV_ALTERA_arbitrary_precision_integers);
+      Reqs.addCapability(SPIRV::Capability::ArbitraryPrecisionIntegersALTERA);
+    }
     break;
   }
   case SPIRV::OpDot: {
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
index d4dd897647cfc..c32ecfb3ef7ac 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -171,6 +171,13 @@ void addNumImm(const APInt &Imm, MachineInstrBuilder &MIB) {
     // Asm Printer needs this info to print 64-bit operands correctly
     MIB.getInstr()->setAsmPrinterFlag(SPIRV::ASM_PRINTER_WIDTH64);
     return;
+  } else if (Bitwidth <= 128) {
+    uint32_t LowBits = Imm.getRawData()[0] & 0xffffffff;
+    uint32_t MidBits0 = (Imm.getRawData()[0] >> 32) & 0xffffffff;
+    uint32_t MidBits1 = Imm.getRawData()[1] & 0xffffffff;
+    uint32_t HighBits = (Imm.getRawData()[1] >> 32) & 0xffffffff;
+    MIB.addImm(LowBits).addImm(MidBits0).addImm(MidBits1).addImm(HighBits);
+    return;
   }
   report_fatal_error("Unsupported constant bitwidth");
 }
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-addsub.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-addsub.ll
new file mode 100644
index 0000000000000..c90ffdd17996c
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-addsub.ll
@@ -0,0 +1,67 @@
+; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_arbitrary_precision_integers %s -o - | FileCheck %s
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_arbitrary_precision_integers %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-ERROR: LLVM ERROR: OpTypeInt type with a width other than 8, 16, 32 or 64 bits requires the following SPIR-V extension: SPV_ALTERA_arbitrary_precision_integers
+
+; CHECK: OpCapability ArbitraryPrecisionIntegersALTERA
+; CHECK: OpExtension "SPV_ALTERA_arbitrary_precision_integers"
+; CHECK: OpName %[[#TestAdd:]] "test_add"
+; CHECK: OpName %[[#TestSub:]] "test_sub"
+; CHECK: %[[#Int128Ty:]] = OpTypeInt 128 0
+; CHECK: %[[#Const64Int128:]] = OpConstant %[[#Int128Ty]] 64 0 0 0
+
+; CHECK: %[[#TestAdd]] = OpFunction
+define spir_func void @test_add(i64 %AL, i64 %AH, i64 %BL, i64 %BH, ptr %RL, ptr %RH) {
+entry:
+; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]]
+; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]]
+; CHECK: {{.*}} = OpShiftLeftLogical %[[#Int128Ty]] {{%[0-9]+}} %[[#Const64Int128]]
+; CHECK: {{.*}} = OpBitwiseOr %[[#Int128Ty]]
+; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]]
+; CHECK: {{.*}} = OpIAdd %[[#Int128Ty]]
+	%tmp1 = zext i64 %AL to i128
+	%tmp23 = zext i64 %AH to i128
+	%tmp4 = shl i128 %tmp23, 64
+	%tmp5 = or i128 %tmp4, %tmp1
+	%tmp67 = zext i64 %BL to i128
+	%tmp89 = zext i64 %BH to i128
+	%tmp11 = shl i128 %tmp89, 64
+	%tmp12 = or i128 %tmp11, %tmp67
+	%tmp15 = add i128 %tmp12, %tmp5
+	%tmp1617 = trunc i128 %tmp15 to i64
+	store i64 %tmp1617, ptr %RL
+	%tmp21 = lshr i128 %tmp15, 64
+	%tmp2122 = trunc i128 %tmp21 to i64
+	store i64 %tmp2122, ptr %RH
+	ret void
+; CHECK: OpFunctionEnd
+}
+
+; CHECK: %[[#TestSub]] = OpFunction
+define spir_func void @test_sub(i64 %AL, i64 %AH, i64 %BL, i64 %BH, ptr %RL, ptr %RH) {
+entry:
+; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]]
+; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]]
+; CHECK: {{.*}} = OpShiftLeftLogical %[[#Int128Ty]] {{%[0-9]+}} %[[#Const64Int128]]
+; CHECK: {{.*}} = OpBitwiseOr %[[#Int128Ty]]
+; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]]
+; CHECK: {{.*}} = OpISub %[[#Int128Ty]]
+	%tmp1 = zext i64 %AL to i128
+	%tmp23 = zext i64 %AH to i128
+	%tmp4 = shl i128 %tmp23, 64
+	%tmp5 = or i128 %tmp4, %tmp1
+	%tmp67 = zext i64 %BL to i128
+	%tmp89 = zext i64 %BH to i128
+	%tmp11 = shl i128 %tmp89, 64
+	%tmp12 = or i128 %tmp11, %tmp67
+	%tmp15 = sub i128 %tmp5, %tmp12
+	%tmp1617 = trunc i128 %tmp15 to i64
+	store i64 %tmp1617, ptr %RL
+	%tmp21 = lshr i128 %tmp15, 64
+	%tmp2122 = trunc i128 %tmp21 to i64
+	store i64 %tmp2122, ptr %RH
+	ret void
+; CHECK: OpFunctionEnd
+}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-arith.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-arith.ll
new file mode 100644
index 0000000000000..d1de5df499566
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-arith.ll
@@ -0,0 +1,27 @@
+; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_arbitrary_precision_integers %s -o - | FileCheck %s
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_arbitrary_precision_integers %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-ERROR: LLVM ERROR: OpTypeInt type with a width other than 8, 16, 32 or 64 bits requires the following SPIR-V extension: SPV_ALTERA_arbitrary_precision_integers
+
+; CHECK: OpCapability ArbitraryPrecisionIntegersALTERA
+; CHECK: OpExtension "SPV_ALTERA_arbitrary_precision_integers"
+; CHECK: OpName %[[#Foo:]] "foo"
+; CHECK: %[[#Int128Ty:]] = OpTypeInt 128 0
+
+; CHECK: %[[#Foo]] = OpFunction
+define i64 @foo(i64 %x, i64 %y, i32 %amt) {
+; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]]
+; CHECK: {{.*}} = OpSConvert %[[#Int128Ty]]
+; CHECK: {{.*}} = OpBitwiseOr %[[#Int128Ty]]
+; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]]
+; CHECK: {{.*}} = OpShiftRightLogical %[[#Int128Ty]]
+  %tmp0 = zext i64 %x to i128
+  %tmp1 = sext i64 %y to i128
+  %tmp2 = or i128 %tmp0, %tmp1
+  %tmp7 = zext i32 13 to i128
+  %tmp3 = lshr i128 %tmp2, %tmp7
+  %tmp4 = trunc i128 %tmp3 to i64
+  ret i64 %tmp4
+}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-switch-lower.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-switch-lower.ll
new file mode 100644
index 0000000000000..669e9362605a5
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-switch-lower.ll
@@ -0,0 +1,27 @@
+; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_arbitrary_precision_integers %s -o - | FileCheck %s
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_arbitrary_precision_integers %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-ERROR: LLVM ERROR: OpTypeInt type with a width other than 8, 16, 32 or 64 bits requires the following SPIR-V extension: SPV_ALTERA_arbitrary_precision_integers
+
+; CHECK: OpCapability ArbitraryPrecisionIntegersALTERA
+; CHECK: OpExtension "SPV_ALTERA_arbitrary_precision_integers"
+; CHECK: OpName %[[#Test:]] "test"
+; CHECK: OpName %[[#Exit:]] "exit"
+; CHECK: %[[#Int128Ty:]] = OpTypeInt 128 0
+; CHECK: %[[#UndefInt128:]] = OpUndef %[[#Int128Ty]]
+
+; CHECK: %[[#Test]] = OpFunction
+define void @test() {
+entry:
+; CHECK: OpSwitch %[[#UndefInt128]] %[[#Exit]] 0 0 3 0 %[[#Exit]] 0 0 5 0 %[[#Exit]] 0 0 4 0 %[[#Exit]] 0 0 8 0 %[[#Exit]]
+  switch i128 poison, label %exit [
+    i128 55340232221128654848, label %exit
+    i128 92233720368547758080, label %exit
+    i128 73786976294838206464, label %exit
+    i128 147573952589676412928, label %exit
+  ]
+exit:
+  unreachable
+}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-but-one.ll b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-but-one.ll
index 5ddfc85702540..ecf4807a1d5fc 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-but-one.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-but-one.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=all,-SPV_ALTERA_arbitrary_precision_integers %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=-SPV_ALTERA_arbitrary_precision_integers,all %s -o - | FileCheck %s
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=KHR %s -o - | FileCheck %s
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=khr %s -o - | FileCheck %s
 

From 04a5ee6065fe6f2955fece5e8db3ae95cfbcce1c Mon Sep 17 00:00:00 2001
From: Rana Pratap Reddy <109514914+ranapratap55@users.noreply.github.com>
Date: Tue, 9 Dec 2025 22:46:33 +0530
Subject: [PATCH 29/85] [AMDGPU] Modifies builtin def to take _Float16('x') for
 both HIP/C++ and for OpenCL (#167652)

For extended imges insts amdgcn_image_sample_*_/gather4_* builtins,
using 'x' in the builtin def so that it will take _Float16 for both
HIP/C++ and OpenCL.
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def | 34 ++++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 0b78b460c0d6a..5b3074a493d4b 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -972,45 +972,45 @@ TARGET_BUILTIN(__builtin_amdgcn_image_sample_3d_v4f16_f32, "V4xifffQtV4ibii", "n
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_cube_v4f32_f32, "V4fifffQtV4ibii", "nc", "image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_cube_v4f16_f32, "V4xifffQtV4ibii", "nc", "image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1d_v4f32_f32, "V4fifQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1d_v4f16_f32, "V4eifQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1d_v4f16_f32, "V4xifQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1darray_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1darray_v4f16_f32, "V4eiffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1darray_v4f16_f32, "V4xiffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2d_f32_f32, "fiffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2d_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2d_v4f16_f32, "V4eiffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2d_v4f16_f32, "V4xiffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2darray_f32_f32, "fifffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2darray_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2darray_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2darray_v4f16_f32, "V4xifffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_3d_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_3d_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_3d_v4f16_f32, "V4xifffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_cube_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_cube_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_cube_v4f16_f32, "V4xifffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1d_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1d_v4f16_f32, "V4eiffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1d_v4f16_f32, "V4xiffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1darray_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1darray_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1darray_v4f16_f32, "V4xifffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2d_f32_f32, "fifffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2d_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2d_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2d_v4f16_f32, "V4xifffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2darray_f32_f32, "fiffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2darray_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2darray_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2darray_v4f16_f32, "V4xiffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_3d_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_3d_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_3d_v4f16_f32, "V4xiffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_cube_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_cube_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_cube_v4f16_f32, "V4xiffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1d_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1d_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1d_v4f16_f32, "V4xifffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1darray_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1darray_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1darray_v4f16_f32, "V4xiffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2d_f32_f32, "fiffffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2d_v4f32_f32, "V4fiffffffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2d_v4f16_f32, "V4eiffffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2d_v4f16_f32, "V4xiffffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2darray_f32_f32, "fifffffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2darray_v4f32_f32, "V4fifffffffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2darray_v4f16_f32, "V4eifffffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2darray_v4f16_f32, "V4xifffffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_3d_v4f32_f32, "V4fifffffffffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_3d_v4f16_f32, "V4eifffffffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_3d_v4f16_f32, "V4xifffffffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts")
 
 #undef BUILTIN

From fa607658a2bdbb5b47e6243b3871d4d6aab09335 Mon Sep 17 00:00:00 2001
From: cs25resch11005-bhuvan <cs25resch11005@iith.ac.in>
Date: Tue, 9 Dec 2025 22:47:17 +0530
Subject: [PATCH 30/85] [CIR][CIRGen][Builtin][X86] Masked compress Intrinsics
 (#169582)

Added masked compress builtin in CIR.
Note: This is my first PR to llvm. Looking forward to corrections

---------

Co-authored-by: bhuvan1527 <balabhuvanvarma@gmail.com>
---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    |  31 +++-
 .../CodeGenBuiltins/X86/avx512vl-builtins.c   |  33 ++++
 .../X86/avx512vlvbmi2-builtins.c              | 171 ++++++++++++++++++
 3 files changed, 229 insertions(+), 6 deletions(-)
 create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/avx512vlvbmi2-builtins.c

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index fb17e31bf36d6..855134ba2b249 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -151,6 +151,17 @@ computeFullLaneShuffleMask(CIRGenFunction &cgf, const mlir::Value vec,
 
   outIndices.resize(numElts);
 }
+static mlir::Value emitX86CompressExpand(CIRGenBuilderTy &builder,
+                                         mlir::Location loc, mlir::Value source,
+                                         mlir::Value mask,
+                                         mlir::Value inputVector,
+                                         const std::string &id) {
+  auto resultTy = cast<cir::VectorType>(mask.getType());
+  mlir::Value maskValue = getMaskVecValue(
+      builder, loc, inputVector, cast<cir::VectorType>(resultTy).getSize());
+  return emitIntrinsicCallOp(builder, loc, id, resultTy,
+                             mlir::ValueRange{source, mask, maskValue});
+}
 
 static mlir::Value emitX86MaskAddLogic(CIRGenBuilderTy &builder,
                                        mlir::Location loc,
@@ -712,6 +723,10 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_compressstoreqi128_mask:
   case X86::BI__builtin_ia32_compressstoreqi256_mask:
   case X86::BI__builtin_ia32_compressstoreqi512_mask:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_expanddf128_mask:
   case X86::BI__builtin_ia32_expanddf256_mask:
   case X86::BI__builtin_ia32_expanddf512_mask:
@@ -729,7 +744,11 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_expandhi512_mask:
   case X86::BI__builtin_ia32_expandqi128_mask:
   case X86::BI__builtin_ia32_expandqi256_mask:
-  case X86::BI__builtin_ia32_expandqi512_mask:
+  case X86::BI__builtin_ia32_expandqi512_mask: {
+    mlir::Location loc = getLoc(expr->getExprLoc());
+    return emitX86CompressExpand(builder, loc, ops[0], ops[1], ops[2],
+                                 "x86.avx512.mask.expand");
+  }
   case X86::BI__builtin_ia32_compressdf128_mask:
   case X86::BI__builtin_ia32_compressdf256_mask:
   case X86::BI__builtin_ia32_compressdf512_mask:
@@ -747,11 +766,11 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_compresshi512_mask:
   case X86::BI__builtin_ia32_compressqi128_mask:
   case X86::BI__builtin_ia32_compressqi256_mask:
-  case X86::BI__builtin_ia32_compressqi512_mask:
-    cgm.errorNYI(expr->getSourceRange(),
-                 std::string("unimplemented X86 builtin call: ") +
-                     getContext().BuiltinInfo.getName(builtinID));
-    return {};
+  case X86::BI__builtin_ia32_compressqi512_mask: {
+    mlir::Location loc = getLoc(expr->getExprLoc());
+    return emitX86CompressExpand(builder, loc, ops[0], ops[1], ops[2],
+                                 "x86.avx512.mask.compress");
+  }
   case X86::BI__builtin_ia32_gather3div2df:
   case X86::BI__builtin_ia32_gather3div2di:
   case X86::BI__builtin_ia32_gather3div4df:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c
index accf1f60d7c32..9ba3e19d41566 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c
@@ -199,3 +199,36 @@ __m256i test_mm256_mask_i32gather_epi32(__m256i __v1_old, __mmask8 __mask, __m25
   // OGCG: @llvm.x86.avx512.mask.gather3siv8.si
   return _mm256_mmask_i32gather_epi32(__v1_old, __mask, __index, __addr, 2); 
 }
+
+__m128d test_mm_mask_expand_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  // CIR-LABEL: _mm_mask_expand_pd
+  // CIR: %[[MASK:.*]] = cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: %[[SHUF:.*]] = cir.vec.shuffle(%[[MASK]], %[[MASK]] : !cir.vector<8 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x !cir.int<u, 1>>
+
+  // LLVM-LABEL: test_mm_mask_expand_pd
+  // LLVM: %[[BC:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: %[[SHUF:.*]] = shufflevector <8 x i1> %[[BC]], <8 x i1> %[[BC]], <2 x i32> <i32 0, i32 1>
+
+  // OGCG-LABEL: test_mm_mask_expand_pd
+  // OGCG: %[[BC:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: %[[SHUF:.*]] = shufflevector <8 x i1> %[[BC]], <8 x i1> %[[BC]], <2 x i32> <i32 0, i32 1>
+
+  return _mm_mask_expand_pd(__W,__U,__A);
+}
+
+__m128d test_mm_maskz_expand_pd(__mmask8 __U, __m128d __A) {
+  // CIR-LABEL: _mm_maskz_expand_pd
+  // CIR: %[[MASK:.*]] = cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+  // CIR: %[[SHUF:.*]] = cir.vec.shuffle(%[[MASK]], %[[MASK]] : !cir.vector<8 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x !cir.int<u, 1>>
+
+  // LLVM-LABEL: test_mm_maskz_expand_pd
+  // LLVM: %[[BC:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // LLVM: %[[SHUF:.*]] = shufflevector <8 x i1> %[[BC]], <8 x i1> %[[BC]], <2 x i32> <i32 0, i32 1>
+
+  // OGCG-LABEL: test_mm_maskz_expand_pd
+  // OGCG: %[[BC:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // OGCG: %[[SHUF:.*]] = shufflevector <8 x i1> %[[BC]], <8 x i1> %[[BC]], <2 x i32> <i32 0, i32 1>
+
+  return _mm_maskz_expand_pd(__U,__A);
+}
+
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vlvbmi2-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512vlvbmi2-builtins.c
new file mode 100644
index 0000000000000..964971d71eb6c
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512vlvbmi2-builtins.c
@@ -0,0 +1,171 @@
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vlvbmi2 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vlvbmi2 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vlvbmi2 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vlvbmi2 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+#include <immintrin.h>
+
+
+__m128i test_mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D) {
+  // CIR-LABEL: test_mm_mask_compress_epi16
+  // %[[MASK8:.+]] = cir.cast bitcast %{{.+}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+  // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.compress" %{{.+}}, %{{.+}}, %[[MASK8]]: (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int<u, 1>>) -> !cir.vector<8 x !s16i>
+  // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<8 x !s16i> -> !cir.vector<2 x !s64i>
+
+  // LLVM-LABEL: test_mm_mask_compress_epi16
+  // %[[MASK8:.+]] = bitcast i8 %{{.+}} to <8 x i1>
+  // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.compress.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK8]])
+  // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64>
+
+  // OGCG-LABEL: test_mm_mask_compress_epi16
+  // %[[MASK8:.+]] = bitcast i8 %{{.+}} to <8 x i1>
+  // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.compress.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK8]])
+  // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64>
+
+  return _mm_mask_compress_epi16(__S, __U, __D);
+}
+
+__m128i test_mm_maskz_compress_epi16(__mmask8 __U, __m128i __D) {
+  // CIR-LABEL: test_mm_maskz_compress_epi16
+  // %[[MASK8:.+]] = cir.cast bitcast %{{.+}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+  // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.compress" %{{.+}}, %{{.+}}, %[[MASK8]]: (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int<u, 1>>) -> !cir.vector<8 x !s16i>
+  // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<8 x !s16i> -> !cir.vector<2 x !s64i>
+
+  // LLVM-LABEL: test_mm_maskz_compress_epi16
+  // %[[MASK8:.+]] = bitcast i8 %{{.+}} to <8 x i1>
+  // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.compress.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK8]])
+  // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64>
+
+  // OGCG-LABEL: test_mm_maskz_compress_epi16
+  // %[[MASK8:.+]] = bitcast i8 %{{.+}} to <8 x i1>
+  // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.compress.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK8]])
+  // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64>
+
+  return _mm_maskz_compress_epi16(__U, __D);
+}
+
+__m128i test_mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D) {
+  // CIR-LABEL: test_mm_mask_compress_epi8
+  // %[[MASK16:.+]] = cir.cast bitcast %{{.+}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.compress" %{{.+}}, %{{.+}}, %[[MASK16]]: (!cir.vector<16 x !s8i>, !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int<u, 1>>) -> !cir.vector<16 x !s8i>
+  // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<16 x !s8i> -> !cir.vector<2 x !s64i>
+
+  // LLVM-LABEL: test_mm_mask_compress_epi8
+  // %[[MASK16:.+]] = bitcast i16 %{{.+}} to <16 x i1>
+  // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.compress.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK16]])
+  // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64>
+
+  // OGCG-LABEL: test_mm_mask_compress_epi8
+  // %[[MASK16:.+]] = bitcast i16 %{{.+}} to <16 x i1>
+  // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.compress.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK16]])
+  // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64>
+
+  return _mm_mask_compress_epi8(__S, __U, __D);
+}
+
+__m128i test_mm_maskz_compress_epi8(__mmask16 __U, __m128i __D) {
+  // CIR-LABEL: test_mm_maskz_compress_epi8
+  // %[[ZERO:.+]] = cir.call @_mm_setzero_si128() : () -> !cir.vector<2 x !s64i>
+  // %[[CAST1:.+]] = cir.cast bitcast %[[ZERO]] : !cir.vector<2 x !s64i> -> !cir.vector<16 x !s8i>
+  // %[[MASK16:.+]] = cir.cast bitcast %{{.+}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.compress" %{{.+}}, %[[CAST1]], %[[MASK16]]: (!cir.vector<16 x !s8i>, !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int<u, 1>>) -> !cir.vector<16 x !s8i>
+  // %[[CAST2:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<16 x !s8i> -> !cir.vector<2 x !s64i>
+
+  // LLVM-LABEL: test_mm_maskz_compress_epi8
+  // store <2 x i64> zeroinitializer, ptr %{{.+}}, align 16
+  // %[[CAST1:.+]] = bitcast <2 x i64> %{{.+}} to <16 x i8>
+  // %[[MASK16:.+]] = bitcast i16 %{{.+}} to <16 x i1>
+  // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.compress.v16i8(<16 x i8> %{{.+}}, <16 x i8> %[[CAST1]], <16 x i1> %[[MASK16]])
+  // %[[CAST2:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64>
+
+  // OGCG-LABEL: test_mm_maskz_compress_epi8
+  // store <2 x i64> zeroinitializer, ptr %{{.+}}, align 16
+  // %[[CAST1:.+]] = bitcast <2 x i64> %{{.+}} to <16 x i8>
+  // %[[MASK16:.+]] = bitcast i16 %{{.+}} to <16 x i1>
+  // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.compress.v16i8(<16 x i8> %{{.+}}, <16 x i8> %[[CAST1]], <16 x i1> %[[MASK16]])
+  // %[[CAST2:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64>
+
+  return _mm_maskz_compress_epi8(__U, __D);
+}
+
+__m128i test_mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D) {
+  // CIR-LABEL: test_mm_mask_expand_epi16
+  // %[[MASK16:.+]] = cir.cast bitcast %{{.+}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+  // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.expand" %{{.+}}, %{{.+}}, %[[MASK16]]: (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int<u, 1>>) -> !cir.vector<8 x !s16i>
+  // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<8 x !s16i> -> !cir.vector<2 x !s64i>
+
+  // LLVM-LABEL: test_mm_mask_expand_epi16
+  // %[[MASK16:.+]] = bitcast i8 %{{.+}} to <8 x i1>
+  // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.expand.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK16]])
+  // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64>
+
+  // OGCG-LABEL: test_mm_mask_expand_epi16
+  // %[[MASK16:.+]] = bitcast i8 %{{.+}} to <8 x i1>
+  // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.expand.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK16]])
+  // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64>
+
+  return _mm_mask_expand_epi16(__S, __U, __D);
+}
+
+__m128i test_mm_maskz_expand_epi16(__mmask8 __U, __m128i __D) {
+  // CIR-LABEL: test_mm_maskz_expand_epi16
+  // %[[MASK:.+]] = cir.cast bitcast %{{.+}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>>
+  // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.expand" %{{.+}}, %{{.+}}, %[[MASK]]: (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int<u, 1>>) -> !cir.vector<8 x !s16i>
+  // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<8 x !s16i> -> !cir.vector<2 x !s64i>
+
+  // LLVM-LABEL: test_mm_maskz_expand_epi16
+  // %[[MASK:.+]] = bitcast i8 %{{.+}} to <8 x i1>
+  // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.expand.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK]])
+  // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64>
+
+  // OGCG-LABEL: test_mm_maskz_expand_epi16
+  // %[[MASK:.+]] = bitcast i8 %{{.+}} to <8 x i1>
+  // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.expand.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK]])
+  // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64>
+
+  return _mm_maskz_expand_epi16(__U, __D);
+}
+
+__m128i test_mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D) {
+  // CIR-LABEL: test_mm_mask_expand_epi8
+  // %[[MASK:.+]] = cir.cast bitcast %{{.+}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.expand" %{{.+}}, %{{.+}}, %[[MASK]]: (!cir.vector<16 x !s8i>, !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int<u, 1>>) -> !cir.vector<16 x !s8i>
+  // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<16 x !s8i> -> !cir.vector<2 x !s64i>
+
+  // LLVM-LABEL: test_mm_mask_expand_epi8
+  // %[[MASK:.+]] = bitcast i16 %{{.+}} to <16 x i1>
+  // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.expand.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK]])
+  // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64>
+
+  // OGCG-LABEL: test_mm_mask_expand_epi8
+  // %[[MASK:.+]] = bitcast i16 %{{.+}} to <16 x i1>
+  // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.expand.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK]])
+  // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64>
+
+  return _mm_mask_expand_epi8(__S, __U, __D);
+}
+
+__m128i test_mm_maskz_expand_epi8(__mmask16 __U, __m128i __D) {
+  // CIR-LABEL: test_mm_maskz_expand_epi8
+  // %[[MASK:.+]] = cir.cast bitcast %{{.+}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>>
+  // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.expand" %{{.+}}, %{{.+}}, %[[MASK]]: (!cir.vector<16 x !s8i>, !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int<u, 1>>) -> !cir.vector<16 x !s8i>
+  // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<16 x !s8i> -> !cir.vector<2 x !s64i>
+
+  // LLVM-LABEL: test_mm_maskz_expand_epi8
+  // %[[MASK:.+]] = bitcast i16 %{{.+}} to <16 x i1>
+  // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.expand.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK]])
+  // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64>
+
+  // OGCG-LABEL: test_mm_maskz_expand_epi8
+  // %[[MASK:.+]] = bitcast i16 %{{.+}} to <16 x i1>
+  // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.expand.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK]])
+  // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64>
+
+  return _mm_maskz_expand_epi8(__U, __D);
+}

From ab8208f4f13a256e923342e54439d17022befa01 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Tue, 9 Dec 2025 17:25:55 +0000
Subject: [PATCH 31/85] [lldb][docs] Fix Visual Studio link in build doc

Fixes warning:
build.rst:107: WARNING: 'any' reference target not found: https://visualstudio.microsoft.com
---
 lldb/docs/resources/build.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst
index 2eb167709dbda..9f76b3a6719c6 100644
--- a/lldb/docs/resources/build.rst
+++ b/lldb/docs/resources/build.rst
@@ -104,7 +104,7 @@ Build Requirements
 
 Please follow the steps below if you only want to **build** lldb.
 
-1. Install `Visual Studio <https://visualstudio.microsoft.com>` with the
+1. Install `Visual Studio <https://visualstudio.microsoft.com>`_ with the
    "Desktop Development with C++" workload. Make sure that the latest Windows
    SDK and the Active Template Library (ATL) are installed.
 2. Install `Git Bash <https://git-scm.com/install/windows>`_ and add

From 0aa8b8292314be29b02c8d93335c7b04d0fc2221 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Tue, 9 Dec 2025 17:27:29 +0000
Subject: [PATCH 32/85] [lldb][docs] Fix plaintext markers in command map

Single backticks RST tries to resolve to a reference.
Double means plaintext.

Fixes these warnings:
map.rst:803: WARNING: 'any' reference target not found: target.prefer-dynamic-value
map.rst:814: WARNING: 'any' reference target not found: expr
---
 lldb/docs/use/map.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lldb/docs/use/map.rst b/lldb/docs/use/map.rst
index da566e7afe058..1221546db45da 100644
--- a/lldb/docs/use/map.rst
+++ b/lldb/docs/use/map.rst
@@ -802,7 +802,7 @@ Print the dynamic type of the result of an expression
 
 LLDB does this automatically if determining the dynamic type does not require
 running the target (in C++, running the target is never needed). This default is
-controlled by the `target.prefer-dynamic-value` setting. If that is disabled, it
+controlled by the ``target.prefer-dynamic-value`` setting. If that is disabled, it
 can be re-enabled on a per-command basis:
 
 .. code-block:: shell
@@ -812,7 +812,7 @@ can be re-enabled on a per-command basis:
   (lldb) expr -d no-run-target -- someCPPObjectPtr
 
 Note that printing of the dynamic type of references is not possible with the
-`expr` command. The workaround is to take the address of the reference and
+``expr`` command. The workaround is to take the address of the reference and
 instruct lldb to print the children of the resulting pointer.
 
 .. code-block:: shell

From 1dacdbe6669a5df92d7e2a4dcf11110eaaaa41ea Mon Sep 17 00:00:00 2001
From: Tomohiro Kashiwada <kikairoya@gmail.com>
Date: Wed, 10 Dec 2025 02:29:23 +0900
Subject: [PATCH 33/85] [Clang] Export inline move constructors in dllexport-ed
 template instantiations on non-MSVC targets (#168170)

Previously, even when MSVC compatibility was not requested, inline move
constructors in dllexport-ed templates were not exported, which was
seemingly unintended.
On non-MSVC targets (MinGW, Cygwin, and PS), such move constructors
should be exported consistently with copy constructors and with the
behavior of modern MSVC.
---
 clang/include/clang/Basic/LangOptions.h         |  2 ++
 clang/lib/Sema/SemaDeclCXX.cpp                  |  1 +
 clang/test/CodeGenCXX/dllexport.cpp             |  1 +
 clang/test/CodeGenCXX/dllimport.cpp             | 10 ++++++----
 .../CodeGenCXX/mingw-template-dllexport.cpp     | 17 +++++++++++++++++
 5 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index 3f042f8ddb5a1..61ee0275283fc 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -623,6 +623,8 @@ class LangOptions : public LangOptionsBase {
            !ObjCSubscriptingLegacyRuntime;
   }
 
+  bool isCompatibleWithMSVC() const { return MSCompatibilityVersion > 0; }
+
   bool isCompatibleWithMSVC(MSVCMajorVersion MajorVersion) const {
     return MSCompatibilityVersion >= MajorVersion * 100000U;
   }
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 3bc748969065a..1cadaf54b7bdd 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -6627,6 +6627,7 @@ void Sema::checkClassLevelDLLAttribute(CXXRecordDecl *Class) {
         auto *Ctor = dyn_cast<CXXConstructorDecl>(MD);
         if ((MD->isMoveAssignmentOperator() ||
              (Ctor && Ctor->isMoveConstructor())) &&
+            getLangOpts().isCompatibleWithMSVC() &&
             !getLangOpts().isCompatibleWithMSVC(LangOptions::MSVC2015))
           continue;
 
diff --git a/clang/test/CodeGenCXX/dllexport.cpp b/clang/test/CodeGenCXX/dllexport.cpp
index dfbb2762ac85c..2c9e7d36d2cbe 100644
--- a/clang/test/CodeGenCXX/dllexport.cpp
+++ b/clang/test/CodeGenCXX/dllexport.cpp
@@ -1130,5 +1130,6 @@ class __declspec(dllexport) ACE_Shared_Object {
 class __declspec(dllexport) ACE_Service_Object : public ACE_Shared_Object {};
 // Implicit move constructor declaration.
 // MSVC2015-DAG: define weak_odr dso_local dllexport {{.+}}ACE_Service_Object@@Q{{.+}}@$$Q
+// PS-DAG: define weak_odr dllexport void @_ZN18ACE_Service_ObjectC1EOS_
 // The declarations should not be exported.
 // MSVC2013-NOT: define weak_odr dso_local dllexport {{.+}}ACE_Service_Object@@Q{{.+}}@$$Q
diff --git a/clang/test/CodeGenCXX/dllimport.cpp b/clang/test/CodeGenCXX/dllimport.cpp
index 363f97a8d58ee..ed1c72c5185d3 100644
--- a/clang/test/CodeGenCXX/dllimport.cpp
+++ b/clang/test/CodeGenCXX/dllimport.cpp
@@ -35,7 +35,7 @@ struct ExplicitSpec_NotImported {};
 #define USEMEMFUNC(class, func) void (class::*UNIQ(use)())() { return &class::func; }
 #define USESTATICMEMFUNC(class, func) void (*UNIQ(use)())() { return &class::func; }
 #define USECLASS(class) void UNIQ(USE)() { class x; }
-#define USECOPYASSIGN(class) class& (class::*UNIQ(use)())(class&) { return &class::operator=; }
+#define USECOPYASSIGN(class) class& (class::*UNIQ(use)())(const class&) { return &class::operator=; }
 #define USEMOVEASSIGN(class) class& (class::*UNIQ(use)())(class&&) { return &class::operator=; }
 
 //===----------------------------------------------------------------------===//
@@ -649,13 +649,15 @@ struct __declspec(dllimport) T {
   static int b;
   // MO1-DAG: @"?b@T@@2HA" = external dllimport global i32
 
-  T& operator=(T&) = default;
-  // MO1-DAG: define available_externally dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4T@@QAEAAU0@AAU0@@Z"
+  T& operator=(const T&) = default;
+  // MO1-DAG: define available_externally dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4T@@QAEAAU0@ABU0@@Z"
+  // PS-DAG: declare dllimport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN1TaSERKS_
 
   T& operator=(T&&) = default;
-  // Note: Don't mark inline move operators dllimport because current MSVC versions don't export them.
+  // Note: Don't mark inline move operators dllimport because MSVC versions before 2015 don't export them.
   // M18-DAG: define linkonce_odr dso_local x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4T@@QAEAAU0@$$QAU0@@Z"
   // M19-DAG: define available_externally dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4T@@QAEAAU0@$$QAU0@@Z"
+  // PS-DAG: declare dllimport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN1TaSEOS_
 };
 USEMEMFUNC(T, a)
 USESTATICMEMFUNC(T, StaticMethod)
diff --git a/clang/test/CodeGenCXX/mingw-template-dllexport.cpp b/clang/test/CodeGenCXX/mingw-template-dllexport.cpp
index de112d6da53db..9f116c46853b6 100644
--- a/clang/test/CodeGenCXX/mingw-template-dllexport.cpp
+++ b/clang/test/CodeGenCXX/mingw-template-dllexport.cpp
@@ -10,11 +10,16 @@
 
 template <class T>
 class c {
+public:
+  c(const c &) {}
+  c(c &&) noexcept {}
   void f() {}
 };
 
 template class __declspec(dllexport) c<int>;
 
+// CHECK: define {{.*}} dllexport {{.*}} @_ZN1cIiEC1ERKS0_
+// CHECK: define {{.*}} dllexport {{.*}} @_ZN1cIiEC1EOS0_
 // CHECK: define {{.*}} dllexport {{.*}} @_ZN1cIiE1fEv
 
 extern template class __declspec(dllexport) c<char>;
@@ -27,6 +32,18 @@ template class __declspec(dllexport) c<double>;
 
 // CHECK-NOT: define {{.*}} dllexport {{.*}} @_ZN1cIdE1fEv
 
+extern template class __declspec(dllimport) c<short>;
+
+// CHECK: declare dllimport {{.*}} @_ZN1cIsEC1ERKS0_
+// CHECK: declare dllimport {{.*}} @_ZN1cIsEC1EOS0_
+// CHECK: declare dllimport {{.*}} @_ZN1cIsE1fEv
+
+void use_ctors(c<short> &&x) {
+  c<short> y{x};
+  c<short> z{static_cast<c<short> &&>(x)};
+  z.f();
+}
+
 template <class T>
 struct outer {
   void f();

From 0a39d1ff9e4f8e8fb5138bdd21d88ed163aede3b Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Tue, 9 Dec 2025 17:31:58 +0000
Subject: [PATCH 34/85] [gn build] Port 1bada0af22d8

---
 llvm/utils/gn/secondary/llvm/lib/Target/NVPTX/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/NVPTX/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/NVPTX/BUILD.gn
index 7b0db8863c6f0..a6590e86e13b5 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/NVPTX/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/NVPTX/BUILD.gn
@@ -44,6 +44,7 @@ static_library("LLVMNVPTXCodeGen") {
     "NVPTXForwardParams.cpp",
     "NVPTXFrameLowering.cpp",
     "NVPTXGenericToNVVM.cpp",
+    "NVPTXIRPeephole.cpp",
     "NVPTXISelDAGToDAG.cpp",
     "NVPTXISelLowering.cpp",
     "NVPTXImageOptimizer.cpp",

From c0eac77f3cc94a1671297b0ef09e1875f7f74dda Mon Sep 17 00:00:00 2001
From: Anatoly Trosinenko <atrosinenko@accesssoftek.com>
Date: Tue, 9 Dec 2025 21:00:27 +0300
Subject: [PATCH 35/85] [ADT] BitVector: give `subsetOf(RHS)` name to
 `!test(RHS)` (NFC) (#170875)

Define `LHS.subsetOf(RHS)` as a more descriptive name for `!LHS.test(RHS)`
and update the existing callers to use that name.

Co-authored-by: Jakub Kuderski <jakub@nod-labs.com>
---
 bolt/lib/Passes/PAuthGadgetScanner.cpp        |  2 +-
 llvm/include/llvm/ADT/BitVector.h             |  5 +-
 llvm/include/llvm/ADT/SmallBitVector.h        |  6 +-
 llvm/lib/Analysis/StackLifetime.cpp           |  4 +-
 .../CodeGen/AsmPrinter/DwarfExpression.cpp    |  2 +-
 llvm/lib/CodeGen/StackColoring.cpp            |  4 +-
 .../lib/Target/Hexagon/HexagonBitSimplify.cpp |  3 +-
 llvm/lib/Target/Hexagon/HexagonGenInsert.cpp  |  3 +-
 llvm/unittests/ADT/BitVectorTest.cpp          | 90 +++++++++++++++++++
 9 files changed, 107 insertions(+), 12 deletions(-)

diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp
index 01b350b2f11fe..d38a7fadb0767 100644
--- a/bolt/lib/Passes/PAuthGadgetScanner.cpp
+++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp
@@ -547,7 +547,7 @@ class SrcSafetyAnalysis {
 
     // Being trusted is a strictly stronger property than being
     // safe-to-dereference.
-    assert(!Next.TrustedRegs.test(Next.SafeToDerefRegs) &&
+    assert(Next.TrustedRegs.subsetOf(Next.SafeToDerefRegs) &&
            "SafeToDerefRegs should contain all TrustedRegs");
 
     return Next;
diff --git a/llvm/include/llvm/ADT/BitVector.h b/llvm/include/llvm/ADT/BitVector.h
index cc3f3a9226395..f4645c18a93f0 100644
--- a/llvm/include/llvm/ADT/BitVector.h
+++ b/llvm/include/llvm/ADT/BitVector.h
@@ -550,7 +550,7 @@ class BitVector {
     return *this;
   }
 
-  /// test - Check if (This - RHS) is zero.
+  /// test - Check if (This - RHS) is non-zero.
   /// This is the same as reset(RHS) and any().
   bool test(const BitVector &RHS) const {
     unsigned ThisWords = Bits.size();
@@ -567,6 +567,9 @@ class BitVector {
     return false;
   }
 
+  /// subsetOf - Check if This is a subset of RHS.
+  bool subsetOf(const BitVector &RHS) const { return !test(RHS); }
+
   template <class F, class... ArgTys>
   static BitVector &apply(F &&f, BitVector &Out, BitVector const &Arg,
                           ArgTys const &...Args) {
diff --git a/llvm/include/llvm/ADT/SmallBitVector.h b/llvm/include/llvm/ADT/SmallBitVector.h
index 5b2a5221b791f..978dc3f073031 100644
--- a/llvm/include/llvm/ADT/SmallBitVector.h
+++ b/llvm/include/llvm/ADT/SmallBitVector.h
@@ -552,7 +552,8 @@ class SmallBitVector {
     return *this;
   }
 
-  /// Check if (This - RHS) is zero. This is the same as reset(RHS) and any().
+  /// Check if (This - RHS) is non-zero.
+  /// This is the same as reset(RHS) and any().
   bool test(const SmallBitVector &RHS) const {
     if (isSmall() && RHS.isSmall())
       return (getSmallBits() & ~RHS.getSmallBits()) != 0;
@@ -571,6 +572,9 @@ class SmallBitVector {
     return false;
   }
 
+  /// Check if This is a subset of RHS.
+  bool subsetOf(const SmallBitVector &RHS) const { return !test(RHS); }
+
   SmallBitVector &operator|=(const SmallBitVector &RHS) {
     resize(std::max(size(), RHS.size()));
     if (isSmall() && RHS.isSmall())
diff --git a/llvm/lib/Analysis/StackLifetime.cpp b/llvm/lib/Analysis/StackLifetime.cpp
index 1e20fca965ace..30e0316b882cc 100644
--- a/llvm/lib/Analysis/StackLifetime.cpp
+++ b/llvm/lib/Analysis/StackLifetime.cpp
@@ -173,7 +173,7 @@ void StackLifetime::calculateLocalLiveness() {
         BitsIn.resize(NumAllocas, true);
 
       // Update block LiveIn set, noting whether it has changed.
-      if (BitsIn.test(BlockInfo.LiveIn)) {
+      if (!BitsIn.subsetOf(BlockInfo.LiveIn)) {
         BlockInfo.LiveIn |= BitsIn;
       }
 
@@ -198,7 +198,7 @@ void StackLifetime::calculateLocalLiveness() {
       }
 
       // Update block LiveOut set, noting whether it has changed.
-      if (BitsIn.test(BlockInfo.LiveOut)) {
+      if (!BitsIn.subsetOf(BlockInfo.LiveOut)) {
         Changed = true;
         BlockInfo.LiveOut |= BitsIn;
       }
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
index c7d45897c403b..d6b06b83207c7 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -164,7 +164,7 @@ bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI,
 
     // If this sub-register has a DWARF number and we haven't covered
     // its range, and its range covers the value, emit a DWARF piece for it.
-    if (Offset < MaxSize && CurSubReg.test(Coverage)) {
+    if (Offset < MaxSize && !CurSubReg.subsetOf(Coverage)) {
       // Emit a piece for any gap in the coverage.
       if (Offset > CurPos)
         DwarfRegs.push_back(Register::createSubRegister(
diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index f7862641d94b9..4ec8b8b3646c1 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -815,13 +815,13 @@ void StackColoring::calculateLocalLiveness() {
       LocalLiveOut |= BlockInfo.Begin;
 
       // Update block LiveIn set, noting whether it has changed.
-      if (LocalLiveIn.test(BlockInfo.LiveIn)) {
+      if (!LocalLiveIn.subsetOf(BlockInfo.LiveIn)) {
         changed = true;
         BlockInfo.LiveIn |= LocalLiveIn;
       }
 
       // Update block LiveOut set, noting whether it has changed.
-      if (LocalLiveOut.test(BlockInfo.LiveOut)) {
+      if (!LocalLiveOut.subsetOf(BlockInfo.LiveOut)) {
         changed = true;
         BlockInfo.LiveOut |= LocalLiveOut;
       }
diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 557a0a3f27819..848337457c997 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -137,8 +137,7 @@ namespace {
       return !Bits.any();
     }
     bool includes(const RegisterSet &Rs) const {
-      // A.test(B)  <=>  A-B != {}
-      return !Rs.Bits.test(Bits);
+      return Rs.Bits.subsetOf(Bits);
     }
     bool intersects(const RegisterSet &Rs) const {
       return Bits.anyCommon(Rs.Bits);
diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
index ff876f6595350..18fcd6a4873fb 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -153,8 +153,7 @@ namespace {
       return !BitVector::any();
     }
     bool includes(const RegisterSet &Rs) const {
-      // A.BitVector::test(B)  <=>  A-B != {}
-      return !Rs.BitVector::test(*this);
+      return Rs.BitVector::subsetOf(*this);
     }
     bool intersects(const RegisterSet &Rs) const {
       return BitVector::anyCommon(Rs);
diff --git a/llvm/unittests/ADT/BitVectorTest.cpp b/llvm/unittests/ADT/BitVectorTest.cpp
index e13523b8e10c3..d3200b7722ee3 100644
--- a/llvm/unittests/ADT/BitVectorTest.cpp
+++ b/llvm/unittests/ADT/BitVectorTest.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "gtest/gtest.h"
+#include <initializer_list>
 
 using namespace llvm;
 
@@ -835,19 +836,27 @@ TYPED_TEST(BitVectorTest, BinOps) {
   A.resize(65);
   EXPECT_FALSE(A.anyCommon(B));
   EXPECT_FALSE(B.anyCommon(B));
+  EXPECT_TRUE(A.subsetOf(B));
+  EXPECT_TRUE(B.subsetOf(A));
 
   B.resize(64);
   A.set(64);
   EXPECT_FALSE(A.anyCommon(B));
   EXPECT_FALSE(B.anyCommon(A));
+  EXPECT_FALSE(A.subsetOf(B));
+  EXPECT_TRUE(B.subsetOf(A));
 
   B.set(63);
   EXPECT_FALSE(A.anyCommon(B));
   EXPECT_FALSE(B.anyCommon(A));
+  EXPECT_FALSE(A.subsetOf(B));
+  EXPECT_FALSE(B.subsetOf(A));
 
   A.set(63);
   EXPECT_TRUE(A.anyCommon(B));
   EXPECT_TRUE(B.anyCommon(A));
+  EXPECT_FALSE(A.subsetOf(B));
+  EXPECT_TRUE(B.subsetOf(A));
 
   B.resize(70);
   B.set(64);
@@ -855,6 +864,87 @@ TYPED_TEST(BitVectorTest, BinOps) {
   A.resize(64);
   EXPECT_FALSE(A.anyCommon(B));
   EXPECT_FALSE(B.anyCommon(A));
+  EXPECT_FALSE(A.subsetOf(B));
+  EXPECT_FALSE(B.subsetOf(A));
+
+  B.set(63);
+  B.reset(64);
+  EXPECT_TRUE(A.anyCommon(B));
+  EXPECT_TRUE(B.anyCommon(A));
+  EXPECT_TRUE(A.subsetOf(B));
+  EXPECT_TRUE(B.subsetOf(A));
+}
+
+template <typename VecType>
+static inline VecType
+createBitVectorFromBits(uint32_t Size, std::initializer_list<int> SetBits) {
+  VecType V;
+  V.resize(Size);
+  for (int BitIndex : SetBits)
+    V.set(BitIndex);
+  return V;
+}
+
+TYPED_TEST(BitVectorTest, BinOpsLiteral) {
+  // More tests of binary operations with more focus on the semantics and
+  // less focus on mutability.
+
+  auto AnyCommon = [](uint32_t SizeLHS, std::initializer_list<int> SetBitsLHS,
+                      uint32_t SizeRHS, std::initializer_list<int> SetBitsRHS) {
+    auto LHS = createBitVectorFromBits<TypeParam>(SizeLHS, SetBitsLHS);
+    auto RHS = createBitVectorFromBits<TypeParam>(SizeRHS, SetBitsRHS);
+    return LHS.anyCommon(RHS);
+  };
+  auto SubsetOf = [](uint32_t SizeLHS, std::initializer_list<int> SetBitsLHS,
+                     uint32_t SizeRHS, std::initializer_list<int> SetBitsRHS) {
+    auto LHS = createBitVectorFromBits<TypeParam>(SizeLHS, SetBitsLHS);
+    auto RHS = createBitVectorFromBits<TypeParam>(SizeRHS, SetBitsRHS);
+    return LHS.subsetOf(RHS);
+  };
+
+  // clang-format off
+
+  // Test small-sized vectors.
+  EXPECT_TRUE (AnyCommon(10, {1, 2, 3}, 10, {3, 4, 5}));
+  EXPECT_FALSE(AnyCommon(10, {1, 2, 3}, 10, {4, 5}));
+
+  EXPECT_FALSE(SubsetOf(10, {1, 2, 3}, 10, {2, 3, 4}));
+  EXPECT_TRUE (SubsetOf(10, {2, 3},    10, {2, 3, 4}));
+  EXPECT_FALSE(SubsetOf(10, {1, 2, 3}, 10, {2, 3}));
+  EXPECT_TRUE (SubsetOf(10, {1, 2, 3}, 10, {1, 2, 3}));
+
+  // Test representations of empty sets of various sizes.
+  EXPECT_FALSE(AnyCommon(10,  {}, 10,  {}));
+  EXPECT_FALSE(AnyCommon(10,  {}, 123, {}));
+  EXPECT_FALSE(AnyCommon(123, {}, 10,  {}));
+  EXPECT_FALSE(AnyCommon(123, {}, 123, {}));
+  EXPECT_TRUE(SubsetOf(10,  {}, 10,  {}));
+  EXPECT_TRUE(SubsetOf(10,  {}, 123, {}));
+  EXPECT_TRUE(SubsetOf(123, {}, 10,  {}));
+  EXPECT_TRUE(SubsetOf(123, {}, 123, {}));
+
+  // Test handling of the remainder words.
+  EXPECT_FALSE(AnyCommon(10,  {1, 2},  123, {5, 70}));
+  EXPECT_TRUE (AnyCommon(10,  {1, 2},  123, {1, 70}));
+  EXPECT_FALSE(AnyCommon(123, {5, 70}, 10,  {1, 2}));
+  EXPECT_TRUE (AnyCommon(123, {1, 70}, 10,  {1, 2}));
+
+  EXPECT_FALSE(AnyCommon(10,  {1, 2}, 123, {5}));
+  EXPECT_TRUE (AnyCommon(10,  {1, 2}, 123, {1}));
+  EXPECT_FALSE(AnyCommon(123, {5},    10,  {1, 2}));
+  EXPECT_TRUE (AnyCommon(123, {1},    10,  {1, 2}));
+
+  EXPECT_FALSE(SubsetOf(10,  {1, 2},     123, {2, 70}));
+  EXPECT_TRUE (SubsetOf(10,  {1, 2},     123, {1, 2, 70}));
+  EXPECT_FALSE(SubsetOf(123, {2, 70},    10,  {1, 2}));
+  EXPECT_FALSE(SubsetOf(123, {1, 2, 70}, 10,  {1, 2}));
+
+  EXPECT_FALSE(SubsetOf(10,  {1, 2}, 123, {2}));
+  EXPECT_TRUE (SubsetOf(10,  {1, 2}, 123, {1, 2}));
+  EXPECT_TRUE (SubsetOf(123, {2},    10,  {1, 2}));
+  EXPECT_TRUE (SubsetOf(123, {1, 2}, 10,  {1, 2}));
+
+  // clang-format on
 }
 
 using RangeList = std::vector<std::pair<int, int>>;

From cd805a73737a951049a106de0f61b50e194d7241 Mon Sep 17 00:00:00 2001
From: nerix <nerixdev@outlook.de>
Date: Tue, 9 Dec 2025 19:09:33 +0100
Subject: [PATCH 36/85] [LLDB] Run MSVC STL (forward-)list test with PDB
 (#166953)

Since PDB doesn't have template information, we need to get the element
type from somewhere else. I'm using the type of `_Myval` in a list node,
which holds the element type.
---
 .../Plugins/Language/CPlusPlus/GenericList.cpp | 18 ++++++++++++++++++
 .../TestDataFormatterGenericForwardList.py     |  2 ++
 .../list/TestDataFormatterGenericList.py       |  2 ++
 .../loop/TestDataFormatterGenericListLoop.py   |  1 +
 4 files changed, 23 insertions(+)

diff --git a/lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp b/lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp
index 5289027fbd8af..8c5ac31aef3f3 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp
@@ -203,6 +203,16 @@ class MsvcStlListFrontEnd : public AbstractListFrontEnd<StlType::MsvcStl> {
   ValueObject *m_tail = nullptr;
 };
 
+/// Gets the (forward-)list element type from the head node instead of the
+/// template arguments. This is needed with PDB as it doesn't have info about
+/// the template arguments.
+CompilerType GetMsvcStlElementTypeFromHead(ValueObject &head) {
+  auto val_sp = head.GetChildMemberWithName("_Myval");
+  if (val_sp)
+    return val_sp->GetCompilerType();
+  return CompilerType();
+}
+
 } // end anonymous namespace
 
 template <StlType Stl>
@@ -530,6 +540,10 @@ lldb::ChildCacheState MsvcStlForwardListFrontEnd::Update() {
           m_backend.GetChildAtNamePath({"_Mypair", "_Myval2", "_Myhead"}))
     m_head = head_sp.get();
 
+  // With PDB, we can't get the element type from the template arguments
+  if (!m_element_type && m_head)
+    m_element_type = GetMsvcStlElementTypeFromHead(*m_head);
+
   return ChildCacheState::eRefetch;
 }
 
@@ -606,6 +620,10 @@ lldb::ChildCacheState MsvcStlListFrontEnd::Update() {
   m_head = first.get();
   m_tail = last.get();
 
+  // With PDB, we can't get the element type from the template arguments
+  if (!m_element_type && m_head)
+    m_element_type = GetMsvcStlElementTypeFromHead(*m_head);
+
   return lldb::ChildCacheState::eRefetch;
 }
 
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py
index 45695c43b42a9..1db0c489bc7f9 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py
@@ -9,6 +9,8 @@
 
 
 class TestDataFormatterGenericForwardList(TestBase):
+    TEST_WITH_PDB_DEBUG_INFO = True
+
     def setUp(self):
         TestBase.setUp(self)
         self.line = line_number("main.cpp", "// break here")
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py
index c0207e6ab5911..fbd021190214b 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py
@@ -10,6 +10,8 @@
 
 
 class GenericListDataFormatterTestCase(TestBase):
+    TEST_WITH_PDB_DEBUG_INFO = True
+
     def setUp(self):
         # Call super's setUp().
         TestBase.setUp(self)
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py
index f6174dd786380..9c5daf760b31f 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py
@@ -11,6 +11,7 @@
 
 
 class GenericListDataFormatterTestCase(TestBase):
+    TEST_WITH_PDB_DEBUG_INFO = True
     NO_DEBUG_INFO_TESTCASE = True
 
     def do_test_with_run_command(self):

From d796d73631a9c9cc8a0c6386dba20938b22d34c5 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Thu, 21 Aug 2025 11:35:12 -0700
Subject: [PATCH 37/85] [MLIR] Apply clang-tidy fixes for
 readability-identifier-naming in Inliner.cpp (NFC)

---
 mlir/lib/Transforms/Utils/Inliner.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Transforms/Utils/Inliner.cpp b/mlir/lib/Transforms/Utils/Inliner.cpp
index 26c965cfc0237..40950312d566f 100644
--- a/mlir/lib/Transforms/Utils/Inliner.cpp
+++ b/mlir/lib/Transforms/Utils/Inliner.cpp
@@ -613,8 +613,8 @@ Inliner::Impl::inlineCallsInSCC(InlinerInterfaceImpl &inlinerIface,
 
   LLVM_DEBUG({
     LDBG() << "* Inliner: Initial calls in SCC are: {";
-    for (unsigned i = 0, e = calls.size(); i < e; ++i)
-      LDBG() << "  " << i << ". " << calls[i].call << ",";
+    for (unsigned I = 0, E = calls.size(); I < E; ++I)
+      LDBG() << "  " << I << ". " << calls[I].call << ",";
     LDBG() << "}";
   });
 

From 00bccfca7cc4aacbbef6127a411c22c0e08bc466 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 9 Dec 2025 18:16:03 +0000
Subject: [PATCH 38/85] [X86] bitcnt-big-integer.ll - add additional test
 coverage where the source values are bitcast from vectors (#171481)

---
 llvm/test/CodeGen/X86/bitcnt-big-integer.ll | 1656 ++++++++++++++++++-
 1 file changed, 1634 insertions(+), 22 deletions(-)

diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
index 0fd555991ae29..749b3ddc96d0d 100644
--- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
@@ -52,6 +52,63 @@ define i32 @load_ctpop_i128(ptr %p0) nounwind {
   ret i32 %res
 }
 
+define i32 @vector_ctpop_i128(<4 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctpop_i128:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE-NEXT:    popcntq %rcx, %rcx
+; SSE-NEXT:    popcntq %rax, %rax
+; SSE-NEXT:    addl %ecx, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: vector_ctpop_i128:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vmovq %xmm0, %rcx
+; AVX2-NEXT:    popcntq %rax, %rdx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    popcntq %rcx, %rax
+; AVX2-NEXT:    addl %edx, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: vector_ctpop_i128:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vmovq %xmm0, %rcx
+; AVX512F-NEXT:    popcntq %rax, %rdx
+; AVX512F-NEXT:    popcntq %rcx, %rax
+; AVX512F-NEXT:    addl %edx, %eax
+; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: vector_ctpop_i128:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512VL-NEXT:    popcntq %rcx, %rcx
+; AVX512VL-NEXT:    popcntq %rax, %rax
+; AVX512VL-NEXT:    addl %ecx, %eax
+; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT:    retq
+;
+; AVX512POPCNT-LABEL: vector_ctpop_i128:
+; AVX512POPCNT:       # %bb.0:
+; AVX512POPCNT-NEXT:    vmovq %xmm0, %rax
+; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512POPCNT-NEXT:    popcntq %rcx, %rcx
+; AVX512POPCNT-NEXT:    popcntq %rax, %rax
+; AVX512POPCNT-NEXT:    addl %ecx, %eax
+; AVX512POPCNT-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT:    retq
+  %a0 = bitcast <4 x i32> %v0 to i128
+  %cnt = call i128 @llvm.ctpop.i128(i128 %a0)
+  %res = trunc i128 %cnt to i32
+  ret i32 %res
+}
+
 define i32 @test_ctpop_i256(i256 %a0) nounwind {
 ; CHECK-LABEL: test_ctpop_i256:
 ; CHECK:       # %bb.0:
@@ -183,6 +240,107 @@ define i32 @load_ctpop_i256(ptr %p0) nounwind {
   ret i32 %res
 }
 
+define i32 @vector_ctpop_i256(<8 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctpop_i256:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pextrq $1, %xmm0, %rax
+; SSE-NEXT:    movq %xmm0, %rcx
+; SSE-NEXT:    movq %xmm1, %rdx
+; SSE-NEXT:    pextrq $1, %xmm1, %rsi
+; SSE-NEXT:    popcntq %rsi, %rsi
+; SSE-NEXT:    popcntq %rdx, %rdx
+; SSE-NEXT:    addl %esi, %edx
+; SSE-NEXT:    xorl %esi, %esi
+; SSE-NEXT:    popcntq %rax, %rsi
+; SSE-NEXT:    xorl %eax, %eax
+; SSE-NEXT:    popcntq %rcx, %rax
+; SSE-NEXT:    addl %esi, %eax
+; SSE-NEXT:    addl %edx, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: vector_ctpop_i256:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vmovq %xmm0, %rcx
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT:    vmovq %xmm0, %rsi
+; AVX2-NEXT:    popcntq %rdx, %rdx
+; AVX2-NEXT:    popcntq %rsi, %rsi
+; AVX2-NEXT:    addl %edx, %esi
+; AVX2-NEXT:    xorl %edx, %edx
+; AVX2-NEXT:    popcntq %rax, %rdx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    popcntq %rcx, %rax
+; AVX2-NEXT:    addl %edx, %eax
+; AVX2-NEXT:    addl %esi, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: vector_ctpop_i256:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    vmovq %xmm0, %rcx
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT:    vmovq %xmm0, %rsi
+; AVX512F-NEXT:    popcntq %rdx, %rdx
+; AVX512F-NEXT:    popcntq %rsi, %rsi
+; AVX512F-NEXT:    addl %edx, %esi
+; AVX512F-NEXT:    popcntq %rax, %rdx
+; AVX512F-NEXT:    popcntq %rcx, %rax
+; AVX512F-NEXT:    addl %edx, %eax
+; AVX512F-NEXT:    addl %esi, %eax
+; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: vector_ctpop_i256:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    vmovq %xmm0, %rcx
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT:    vmovq %xmm0, %rdx
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512VL-NEXT:    popcntq %rsi, %rsi
+; AVX512VL-NEXT:    popcntq %rdx, %rdx
+; AVX512VL-NEXT:    addl %esi, %edx
+; AVX512VL-NEXT:    xorl %esi, %esi
+; AVX512VL-NEXT:    popcntq %rax, %rsi
+; AVX512VL-NEXT:    xorl %eax, %eax
+; AVX512VL-NEXT:    popcntq %rcx, %rax
+; AVX512VL-NEXT:    addl %esi, %eax
+; AVX512VL-NEXT:    addl %edx, %eax
+; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512POPCNT-LABEL: vector_ctpop_i256:
+; AVX512POPCNT:       # %bb.0:
+; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512POPCNT-NEXT:    vmovq %xmm0, %rcx
+; AVX512POPCNT-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512POPCNT-NEXT:    vmovq %xmm0, %rdx
+; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512POPCNT-NEXT:    popcntq %rsi, %rsi
+; AVX512POPCNT-NEXT:    popcntq %rdx, %rdx
+; AVX512POPCNT-NEXT:    addl %esi, %edx
+; AVX512POPCNT-NEXT:    xorl %esi, %esi
+; AVX512POPCNT-NEXT:    popcntq %rax, %rsi
+; AVX512POPCNT-NEXT:    xorl %eax, %eax
+; AVX512POPCNT-NEXT:    popcntq %rcx, %rax
+; AVX512POPCNT-NEXT:    addl %esi, %eax
+; AVX512POPCNT-NEXT:    addl %edx, %eax
+; AVX512POPCNT-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT:    vzeroupper
+; AVX512POPCNT-NEXT:    retq
+  %a0 = bitcast <8 x i32> %v0 to i256
+  %cnt = call i256 @llvm.ctpop.i256(i256 %a0)
+  %res = trunc i256 %cnt to i32
+  ret i32 %res
+}
+
 define i32 @test_ctpop_i512(i512 %a0) nounwind {
 ; CHECK-LABEL: test_ctpop_i512:
 ; CHECK:       # %bb.0:
@@ -404,6 +562,166 @@ define i32 @load_ctpop_i512(ptr %p0) nounwind {
   ret i32 %res
 }
 
+define i32 @vector_ctpop_i512(<16 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctpop_i512:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE-NEXT:    movq %xmm1, %rdx
+; SSE-NEXT:    pextrq $1, %xmm1, %rsi
+; SSE-NEXT:    pextrq $1, %xmm2, %rdi
+; SSE-NEXT:    movq %xmm2, %r8
+; SSE-NEXT:    movq %xmm3, %r9
+; SSE-NEXT:    pextrq $1, %xmm3, %r10
+; SSE-NEXT:    popcntq %r10, %r10
+; SSE-NEXT:    popcntq %r9, %r9
+; SSE-NEXT:    addl %r10d, %r9d
+; SSE-NEXT:    popcntq %rdi, %rdi
+; SSE-NEXT:    popcntq %r8, %r8
+; SSE-NEXT:    addl %edi, %r8d
+; SSE-NEXT:    addl %r9d, %r8d
+; SSE-NEXT:    popcntq %rsi, %rsi
+; SSE-NEXT:    popcntq %rdx, %rdx
+; SSE-NEXT:    addl %esi, %edx
+; SSE-NEXT:    popcntq %rcx, %rcx
+; SSE-NEXT:    popcntq %rax, %rax
+; SSE-NEXT:    addl %ecx, %eax
+; SSE-NEXT:    addl %edx, %eax
+; SSE-NEXT:    addl %r8d, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: vector_ctpop_i512:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rdx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rdi
+; AVX2-NEXT:    vmovq %xmm1, %r8
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX2-NEXT:    vmovq %xmm0, %r10
+; AVX2-NEXT:    popcntq %r9, %r9
+; AVX2-NEXT:    popcntq %r10, %r10
+; AVX2-NEXT:    addl %r9d, %r10d
+; AVX2-NEXT:    popcntq %rdi, %rdi
+; AVX2-NEXT:    popcntq %r8, %r8
+; AVX2-NEXT:    addl %edi, %r8d
+; AVX2-NEXT:    addl %r10d, %r8d
+; AVX2-NEXT:    popcntq %rsi, %rsi
+; AVX2-NEXT:    popcntq %rdx, %rdx
+; AVX2-NEXT:    addl %esi, %edx
+; AVX2-NEXT:    popcntq %rcx, %rcx
+; AVX2-NEXT:    popcntq %rax, %rax
+; AVX2-NEXT:    addl %ecx, %eax
+; AVX2-NEXT:    addl %edx, %eax
+; AVX2-NEXT:    addl %r8d, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: vector_ctpop_i512:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vmovq %xmm1, %rax
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT:    vmovq %xmm0, %rsi
+; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rdi
+; AVX512F-NEXT:    vmovq %xmm1, %r8
+; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX512F-NEXT:    vmovq %xmm0, %r10
+; AVX512F-NEXT:    popcntq %r9, %r9
+; AVX512F-NEXT:    popcntq %r10, %r10
+; AVX512F-NEXT:    addl %r9d, %r10d
+; AVX512F-NEXT:    popcntq %rdi, %rdi
+; AVX512F-NEXT:    popcntq %r8, %r8
+; AVX512F-NEXT:    addl %edi, %r8d
+; AVX512F-NEXT:    addl %r10d, %r8d
+; AVX512F-NEXT:    popcntq %rdx, %rdx
+; AVX512F-NEXT:    popcntq %rsi, %rsi
+; AVX512F-NEXT:    addl %edx, %esi
+; AVX512F-NEXT:    popcntq %rcx, %rcx
+; AVX512F-NEXT:    popcntq %rax, %rax
+; AVX512F-NEXT:    addl %ecx, %eax
+; AVX512F-NEXT:    addl %esi, %eax
+; AVX512F-NEXT:    addl %r8d, %eax
+; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: vector_ctpop_i512:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vmovq %xmm1, %rax
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512VL-NEXT:    vmovq %xmm0, %rsi
+; AVX512VL-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; AVX512VL-NEXT:    vmovq %xmm1, %rdi
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX512VL-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
+; AVX512VL-NEXT:    vmovq %xmm0, %r9
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %r10
+; AVX512VL-NEXT:    popcntq %r10, %r10
+; AVX512VL-NEXT:    popcntq %r9, %r9
+; AVX512VL-NEXT:    addl %r10d, %r9d
+; AVX512VL-NEXT:    popcntq %r8, %r8
+; AVX512VL-NEXT:    popcntq %rdi, %rdi
+; AVX512VL-NEXT:    addl %r8d, %edi
+; AVX512VL-NEXT:    addl %r9d, %edi
+; AVX512VL-NEXT:    popcntq %rdx, %rdx
+; AVX512VL-NEXT:    popcntq %rsi, %rsi
+; AVX512VL-NEXT:    addl %edx, %esi
+; AVX512VL-NEXT:    popcntq %rcx, %rcx
+; AVX512VL-NEXT:    popcntq %rax, %rax
+; AVX512VL-NEXT:    addl %ecx, %eax
+; AVX512VL-NEXT:    addl %esi, %eax
+; AVX512VL-NEXT:    addl %edi, %eax
+; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512POPCNT-LABEL: vector_ctpop_i512:
+; AVX512POPCNT:       # %bb.0:
+; AVX512POPCNT-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512POPCNT-NEXT:    vmovq %xmm1, %rax
+; AVX512POPCNT-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512POPCNT-NEXT:    vmovq %xmm0, %rsi
+; AVX512POPCNT-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; AVX512POPCNT-NEXT:    vmovq %xmm1, %rdi
+; AVX512POPCNT-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX512POPCNT-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
+; AVX512POPCNT-NEXT:    vmovq %xmm0, %r9
+; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %r10
+; AVX512POPCNT-NEXT:    popcntq %r10, %r10
+; AVX512POPCNT-NEXT:    popcntq %r9, %r9
+; AVX512POPCNT-NEXT:    addl %r10d, %r9d
+; AVX512POPCNT-NEXT:    popcntq %r8, %r8
+; AVX512POPCNT-NEXT:    popcntq %rdi, %rdi
+; AVX512POPCNT-NEXT:    addl %r8d, %edi
+; AVX512POPCNT-NEXT:    addl %r9d, %edi
+; AVX512POPCNT-NEXT:    popcntq %rdx, %rdx
+; AVX512POPCNT-NEXT:    popcntq %rsi, %rsi
+; AVX512POPCNT-NEXT:    addl %edx, %esi
+; AVX512POPCNT-NEXT:    popcntq %rcx, %rcx
+; AVX512POPCNT-NEXT:    popcntq %rax, %rax
+; AVX512POPCNT-NEXT:    addl %ecx, %eax
+; AVX512POPCNT-NEXT:    addl %esi, %eax
+; AVX512POPCNT-NEXT:    addl %edi, %eax
+; AVX512POPCNT-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT:    vzeroupper
+; AVX512POPCNT-NEXT:    retq
+  %a0 = bitcast <16 x i32> %v0 to i512
+  %cnt = call i512 @llvm.ctpop.i512(i512 %a0)
+  %res = trunc i512 %cnt to i32
+  ret i32 %res
+}
+
 define i32 @test_ctpop_i1024(i1024 %a0) nounwind {
 ; SSE-LABEL: test_ctpop_i1024:
 ; SSE:       # %bb.0:
@@ -969,6 +1287,75 @@ define i32 @load_ctlz_i128(ptr %p0) nounwind {
   ret i32 %res
 }
 
+define i32 @vector_ctlz_i128(<4 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_i128:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %xmm0, %rcx
+; SSE-NEXT:    pextrq $1, %xmm0, %rdx
+; SSE-NEXT:    bsrq %rdx, %rsi
+; SSE-NEXT:    xorl $63, %esi
+; SSE-NEXT:    movl $127, %eax
+; SSE-NEXT:    bsrq %rcx, %rax
+; SSE-NEXT:    xorl $63, %eax
+; SSE-NEXT:    addl $64, %eax
+; SSE-NEXT:    testq %rdx, %rdx
+; SSE-NEXT:    cmovnel %esi, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: vector_ctlz_i128:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT:    lzcntq %rcx, %rdx
+; AVX2-NEXT:    lzcntq %rax, %rax
+; AVX2-NEXT:    addl $64, %eax
+; AVX2-NEXT:    testq %rcx, %rcx
+; AVX2-NEXT:    cmovnel %edx, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: vector_ctlz_i128:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512F-NEXT:    lzcntq %rcx, %rdx
+; AVX512F-NEXT:    lzcntq %rax, %rax
+; AVX512F-NEXT:    addl $64, %eax
+; AVX512F-NEXT:    testq %rcx, %rcx
+; AVX512F-NEXT:    cmovnel %edx, %eax
+; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: vector_ctlz_i128:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    lzcntq %rcx, %rdx
+; AVX512VL-NEXT:    lzcntq %rax, %rax
+; AVX512VL-NEXT:    addl $64, %eax
+; AVX512VL-NEXT:    testq %rcx, %rcx
+; AVX512VL-NEXT:    cmovnel %edx, %eax
+; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT:    retq
+;
+; AVX512POPCNT-LABEL: vector_ctlz_i128:
+; AVX512POPCNT:       # %bb.0:
+; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512POPCNT-NEXT:    vmovq %xmm0, %rax
+; AVX512POPCNT-NEXT:    lzcntq %rcx, %rdx
+; AVX512POPCNT-NEXT:    lzcntq %rax, %rax
+; AVX512POPCNT-NEXT:    addl $64, %eax
+; AVX512POPCNT-NEXT:    testq %rcx, %rcx
+; AVX512POPCNT-NEXT:    cmovnel %edx, %eax
+; AVX512POPCNT-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT:    retq
+  %a0 = bitcast <4 x i32> %v0 to i128
+  %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 0)
+  %res = trunc i128 %cnt to i32
+  ret i32 %res
+}
+
 define i32 @test_ctlz_i256(i256 %a0) nounwind {
 ; SSE-LABEL: test_ctlz_i256:
 ; SSE:       # %bb.0:
@@ -1125,6 +1512,135 @@ define i32 @load_ctlz_i256(ptr %p0) nounwind {
   ret i32 %res
 }
 
+define i32 @vector_ctlz_i256(<8 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_i256:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %xmm0, %rcx
+; SSE-NEXT:    pextrq $1, %xmm0, %rdx
+; SSE-NEXT:    movq %xmm1, %rax
+; SSE-NEXT:    pextrq $1, %xmm1, %rsi
+; SSE-NEXT:    bsrq %rsi, %rdi
+; SSE-NEXT:    xorl $63, %edi
+; SSE-NEXT:    bsrq %rax, %r8
+; SSE-NEXT:    xorl $63, %r8d
+; SSE-NEXT:    orl $64, %r8d
+; SSE-NEXT:    testq %rsi, %rsi
+; SSE-NEXT:    cmovnel %edi, %r8d
+; SSE-NEXT:    bsrq %rdx, %rsi
+; SSE-NEXT:    xorl $63, %esi
+; SSE-NEXT:    movl $127, %eax
+; SSE-NEXT:    bsrq %rcx, %rax
+; SSE-NEXT:    xorl $63, %eax
+; SSE-NEXT:    addl $64, %eax
+; SSE-NEXT:    testq %rdx, %rdx
+; SSE-NEXT:    cmovnel %esi, %eax
+; SSE-NEXT:    subl $-128, %eax
+; SSE-NEXT:    ptest %xmm1, %xmm1
+; SSE-NEXT:    cmovnel %r8d, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: vector_ctlz_i256:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rdx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT:    lzcntq %rsi, %rdi
+; AVX2-NEXT:    lzcntq %rdx, %r8
+; AVX2-NEXT:    addl $64, %r8d
+; AVX2-NEXT:    testq %rsi, %rsi
+; AVX2-NEXT:    cmovnel %edi, %r8d
+; AVX2-NEXT:    xorl %edi, %edi
+; AVX2-NEXT:    lzcntq %rcx, %rdi
+; AVX2-NEXT:    lzcntq %rax, %rax
+; AVX2-NEXT:    addl $64, %eax
+; AVX2-NEXT:    testq %rcx, %rcx
+; AVX2-NEXT:    cmovnel %edi, %eax
+; AVX2-NEXT:    subl $-128, %eax
+; AVX2-NEXT:    orq %rsi, %rdx
+; AVX2-NEXT:    cmovnel %r8d, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: vector_ctlz_i256:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, %rdx
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512F-NEXT:    lzcntq %rsi, %rdi
+; AVX512F-NEXT:    lzcntq %rdx, %r8
+; AVX512F-NEXT:    addl $64, %r8d
+; AVX512F-NEXT:    testq %rsi, %rsi
+; AVX512F-NEXT:    cmovnel %edi, %r8d
+; AVX512F-NEXT:    lzcntq %rcx, %rdi
+; AVX512F-NEXT:    lzcntq %rax, %rax
+; AVX512F-NEXT:    addl $64, %eax
+; AVX512F-NEXT:    testq %rcx, %rcx
+; AVX512F-NEXT:    cmovnel %edi, %eax
+; AVX512F-NEXT:    subl $-128, %eax
+; AVX512F-NEXT:    orq %rsi, %rdx
+; AVX512F-NEXT:    cmovnel %r8d, %eax
+; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: vector_ctlz_i256:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT:    vmovq %xmm0, %rdx
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512VL-NEXT:    lzcntq %rsi, %rdi
+; AVX512VL-NEXT:    lzcntq %rdx, %r8
+; AVX512VL-NEXT:    addl $64, %r8d
+; AVX512VL-NEXT:    testq %rsi, %rsi
+; AVX512VL-NEXT:    cmovnel %edi, %r8d
+; AVX512VL-NEXT:    lzcntq %rcx, %rdi
+; AVX512VL-NEXT:    lzcntq %rax, %rax
+; AVX512VL-NEXT:    addl $64, %eax
+; AVX512VL-NEXT:    testq %rcx, %rcx
+; AVX512VL-NEXT:    cmovnel %edi, %eax
+; AVX512VL-NEXT:    subl $-128, %eax
+; AVX512VL-NEXT:    orq %rsi, %rdx
+; AVX512VL-NEXT:    cmovnel %r8d, %eax
+; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512POPCNT-LABEL: vector_ctlz_i256:
+; AVX512POPCNT:       # %bb.0:
+; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512POPCNT-NEXT:    vmovq %xmm0, %rax
+; AVX512POPCNT-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512POPCNT-NEXT:    vmovq %xmm0, %rdx
+; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512POPCNT-NEXT:    lzcntq %rsi, %rdi
+; AVX512POPCNT-NEXT:    lzcntq %rdx, %r8
+; AVX512POPCNT-NEXT:    addl $64, %r8d
+; AVX512POPCNT-NEXT:    testq %rsi, %rsi
+; AVX512POPCNT-NEXT:    cmovnel %edi, %r8d
+; AVX512POPCNT-NEXT:    lzcntq %rcx, %rdi
+; AVX512POPCNT-NEXT:    lzcntq %rax, %rax
+; AVX512POPCNT-NEXT:    addl $64, %eax
+; AVX512POPCNT-NEXT:    testq %rcx, %rcx
+; AVX512POPCNT-NEXT:    cmovnel %edi, %eax
+; AVX512POPCNT-NEXT:    subl $-128, %eax
+; AVX512POPCNT-NEXT:    orq %rsi, %rdx
+; AVX512POPCNT-NEXT:    cmovnel %r8d, %eax
+; AVX512POPCNT-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT:    vzeroupper
+; AVX512POPCNT-NEXT:    retq
+  %a0 = bitcast <8 x i32> %v0 to i256
+  %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 0)
+  %res = trunc i256 %cnt to i32
+  ret i32 %res
+}
+
 define i32 @test_ctlz_i512(i512 %a0) nounwind {
 ; SSE-LABEL: test_ctlz_i512:
 ; SSE:       # %bb.0:
@@ -1423,10 +1939,155 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind {
 ; AVX2-NEXT:    popq %r15
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: load_ctlz_i512:
+; AVX512F-LABEL: load_ctlz_i512:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT:    vpermq (%rdi), %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vplzcntq %zmm0, %zmm0
+; AVX512F-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovd %xmm1, %eax
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: load_ctlz_i512:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT:    vpermq (%rdi), %zmm0, %zmm0
+; AVX512VL-NEXT:    vplzcntq %zmm0, %zmm1
+; AVX512VL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512POPCNT-LABEL: load_ctlz_i512:
+; AVX512POPCNT:       # %bb.0:
+; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT:    vpermq (%rdi), %zmm0, %zmm0
+; AVX512POPCNT-NEXT:    vplzcntq %zmm0, %zmm1
+; AVX512POPCNT-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512POPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512POPCNT-NEXT:    vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT:    vzeroupper
+; AVX512POPCNT-NEXT:    retq
+  %a0 = load i512, ptr %p0
+  %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0)
+  %res = trunc i512 %cnt to i32
+  ret i32 %res
+}
+
+define i32 @vector_ctlz_i512(<16 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_i512:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %xmm0, %rdx
+; SSE-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE-NEXT:    pextrq $1, %xmm1, %rax
+; SSE-NEXT:    pextrq $1, %xmm2, %rdi
+; SSE-NEXT:    movq %xmm2, %rsi
+; SSE-NEXT:    movq %xmm3, %r8
+; SSE-NEXT:    pextrq $1, %xmm3, %r9
+; SSE-NEXT:    bsrq %r9, %r10
+; SSE-NEXT:    xorl $63, %r10d
+; SSE-NEXT:    bsrq %r8, %r8
+; SSE-NEXT:    xorl $63, %r8d
+; SSE-NEXT:    orl $64, %r8d
+; SSE-NEXT:    testq %r9, %r9
+; SSE-NEXT:    cmovnel %r10d, %r8d
+; SSE-NEXT:    bsrq %rdi, %r9
+; SSE-NEXT:    xorl $63, %r9d
+; SSE-NEXT:    bsrq %rsi, %rsi
+; SSE-NEXT:    xorl $63, %esi
+; SSE-NEXT:    orl $64, %esi
+; SSE-NEXT:    testq %rdi, %rdi
+; SSE-NEXT:    cmovnel %r9d, %esi
+; SSE-NEXT:    movq %xmm1, %rdi
+; SSE-NEXT:    subl $-128, %esi
+; SSE-NEXT:    ptest %xmm3, %xmm3
+; SSE-NEXT:    cmovnel %r8d, %esi
+; SSE-NEXT:    bsrq %rax, %r8
+; SSE-NEXT:    xorl $63, %r8d
+; SSE-NEXT:    bsrq %rdi, %rdi
+; SSE-NEXT:    xorl $63, %edi
+; SSE-NEXT:    orl $64, %edi
+; SSE-NEXT:    testq %rax, %rax
+; SSE-NEXT:    cmovnel %r8d, %edi
+; SSE-NEXT:    bsrq %rcx, %r8
+; SSE-NEXT:    xorl $63, %r8d
+; SSE-NEXT:    movl $127, %eax
+; SSE-NEXT:    bsrq %rdx, %rax
+; SSE-NEXT:    xorl $63, %eax
+; SSE-NEXT:    addl $64, %eax
+; SSE-NEXT:    testq %rcx, %rcx
+; SSE-NEXT:    cmovnel %r8d, %eax
+; SSE-NEXT:    subl $-128, %eax
+; SSE-NEXT:    ptest %xmm1, %xmm1
+; SSE-NEXT:    cmovnel %edi, %eax
+; SSE-NEXT:    addl $256, %eax # imm = 0x100
+; SSE-NEXT:    por %xmm3, %xmm2
+; SSE-NEXT:    ptest %xmm2, %xmm2
+; SSE-NEXT:    cmovnel %esi, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: vector_ctlz_i512:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpextrq $1, %xmm2, %rcx
+; AVX2-NEXT:    vmovq %xmm2, %rdx
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX2-NEXT:    vmovq %xmm2, %r8
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT:    vmovq %xmm1, %rdi
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r9
+; AVX2-NEXT:    lzcntq %rax, %r10
+; AVX2-NEXT:    lzcntq %r8, %r11
+; AVX2-NEXT:    addl $64, %r11d
+; AVX2-NEXT:    testq %rax, %rax
+; AVX2-NEXT:    cmovnel %r10d, %r11d
+; AVX2-NEXT:    xorl %r10d, %r10d
+; AVX2-NEXT:    lzcntq %r9, %r10
+; AVX2-NEXT:    lzcntq %rdi, %rdi
+; AVX2-NEXT:    addl $64, %edi
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %r10d, %edi
+; AVX2-NEXT:    subl $-128, %edi
+; AVX2-NEXT:    orq %rax, %r8
+; AVX2-NEXT:    cmovnel %r11d, %edi
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    lzcntq %rcx, %rax
+; AVX2-NEXT:    xorl %r8d, %r8d
+; AVX2-NEXT:    lzcntq %rdx, %r8
+; AVX2-NEXT:    addl $64, %r8d
+; AVX2-NEXT:    testq %rcx, %rcx
+; AVX2-NEXT:    cmovnel %eax, %r8d
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    xorl %r9d, %r9d
+; AVX2-NEXT:    lzcntq %rsi, %r9
+; AVX2-NEXT:    lzcntq %rax, %rax
+; AVX2-NEXT:    addl $64, %eax
+; AVX2-NEXT:    testq %rsi, %rsi
+; AVX2-NEXT:    cmovnel %r9d, %eax
+; AVX2-NEXT:    subl $-128, %eax
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    cmovnel %r8d, %eax
+; AVX2-NEXT:    addl $256, %eax # imm = 0x100
+; AVX2-NEXT:    vptest %ymm1, %ymm1
+; AVX2-NEXT:    cmovnel %edi, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: vector_ctlz_i512:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512F-NEXT:    vpermq (%rdi), %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    vplzcntq %zmm0, %zmm0
 ; AVX512F-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
@@ -1435,10 +2096,10 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind {
 ; AVX512F-NEXT:    vmovd %xmm1, %eax
 ; AVX512F-NEXT:    retq
 ;
-; AVX512VL-LABEL: load_ctlz_i512:
+; AVX512VL-LABEL: vector_ctlz_i512:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512VL-NEXT:    vpermq (%rdi), %zmm0, %zmm0
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT:    vpermq %zmm0, %zmm1, %zmm0
 ; AVX512VL-NEXT:    vplzcntq %zmm0, %zmm1
 ; AVX512VL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
 ; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
@@ -1448,10 +2109,10 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
-; AVX512POPCNT-LABEL: load_ctlz_i512:
+; AVX512POPCNT-LABEL: vector_ctlz_i512:
 ; AVX512POPCNT:       # %bb.0:
-; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512POPCNT-NEXT:    vpermq (%rdi), %zmm0, %zmm0
+; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT:    vpermq %zmm0, %zmm1, %zmm0
 ; AVX512POPCNT-NEXT:    vplzcntq %zmm0, %zmm1
 ; AVX512POPCNT-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
 ; AVX512POPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
@@ -1460,7 +2121,7 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind {
 ; AVX512POPCNT-NEXT:    vmovd %xmm0, %eax
 ; AVX512POPCNT-NEXT:    vzeroupper
 ; AVX512POPCNT-NEXT:    retq
-  %a0 = load i512, ptr %p0
+  %a0 = bitcast <16 x i32> %v0 to i512
   %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0)
   %res = trunc i512 %cnt to i32
   ret i32 %res
@@ -2312,6 +2973,74 @@ define i32 @load_ctlz_undef_i128(ptr %p0) nounwind {
   ret i32 %res
 }
 
+define i32 @vector_ctlz_undef_i128(<4 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_undef_i128:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE-NEXT:    bsrq %rcx, %rdx
+; SSE-NEXT:    xorl $63, %edx
+; SSE-NEXT:    bsrq %rax, %rax
+; SSE-NEXT:    xorl $63, %eax
+; SSE-NEXT:    orl $64, %eax
+; SSE-NEXT:    testq %rcx, %rcx
+; SSE-NEXT:    cmovnel %edx, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: vector_ctlz_undef_i128:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT:    lzcntq %rcx, %rdx
+; AVX2-NEXT:    lzcntq %rax, %rax
+; AVX2-NEXT:    addl $64, %eax
+; AVX2-NEXT:    testq %rcx, %rcx
+; AVX2-NEXT:    cmovnel %edx, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: vector_ctlz_undef_i128:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512F-NEXT:    lzcntq %rcx, %rdx
+; AVX512F-NEXT:    lzcntq %rax, %rax
+; AVX512F-NEXT:    addl $64, %eax
+; AVX512F-NEXT:    testq %rcx, %rcx
+; AVX512F-NEXT:    cmovnel %edx, %eax
+; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: vector_ctlz_undef_i128:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    lzcntq %rcx, %rdx
+; AVX512VL-NEXT:    lzcntq %rax, %rax
+; AVX512VL-NEXT:    addl $64, %eax
+; AVX512VL-NEXT:    testq %rcx, %rcx
+; AVX512VL-NEXT:    cmovnel %edx, %eax
+; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT:    retq
+;
+; AVX512POPCNT-LABEL: vector_ctlz_undef_i128:
+; AVX512POPCNT:       # %bb.0:
+; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512POPCNT-NEXT:    vmovq %xmm0, %rax
+; AVX512POPCNT-NEXT:    lzcntq %rcx, %rdx
+; AVX512POPCNT-NEXT:    lzcntq %rax, %rax
+; AVX512POPCNT-NEXT:    addl $64, %eax
+; AVX512POPCNT-NEXT:    testq %rcx, %rcx
+; AVX512POPCNT-NEXT:    cmovnel %edx, %eax
+; AVX512POPCNT-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT:    retq
+  %a0 = bitcast <4 x i32> %v0 to i128
+  %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 -1)
+  %res = trunc i128 %cnt to i32
+  ret i32 %res
+}
+
 define i32 @test_ctlz_undef_i256(i256 %a0) nounwind {
 ; SSE-LABEL: test_ctlz_undef_i256:
 ; SSE:       # %bb.0:
@@ -2463,6 +3192,134 @@ define i32 @load_ctlz_undef_i256(ptr %p0) nounwind {
   ret i32 %res
 }
 
+define i32 @vector_ctlz_undef_i256(<8 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_undef_i256:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE-NEXT:    movq %xmm0, %rax
+; SSE-NEXT:    movq %xmm1, %rdx
+; SSE-NEXT:    pextrq $1, %xmm1, %rsi
+; SSE-NEXT:    bsrq %rsi, %rdi
+; SSE-NEXT:    xorl $63, %edi
+; SSE-NEXT:    bsrq %rdx, %rdx
+; SSE-NEXT:    xorl $63, %edx
+; SSE-NEXT:    orl $64, %edx
+; SSE-NEXT:    testq %rsi, %rsi
+; SSE-NEXT:    cmovnel %edi, %edx
+; SSE-NEXT:    bsrq %rcx, %rsi
+; SSE-NEXT:    xorl $63, %esi
+; SSE-NEXT:    bsrq %rax, %rax
+; SSE-NEXT:    xorl $63, %eax
+; SSE-NEXT:    orl $64, %eax
+; SSE-NEXT:    testq %rcx, %rcx
+; SSE-NEXT:    cmovnel %esi, %eax
+; SSE-NEXT:    subl $-128, %eax
+; SSE-NEXT:    ptest %xmm1, %xmm1
+; SSE-NEXT:    cmovnel %edx, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: vector_ctlz_undef_i256:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rdx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT:    lzcntq %rsi, %rdi
+; AVX2-NEXT:    lzcntq %rdx, %r8
+; AVX2-NEXT:    addl $64, %r8d
+; AVX2-NEXT:    testq %rsi, %rsi
+; AVX2-NEXT:    cmovnel %edi, %r8d
+; AVX2-NEXT:    xorl %edi, %edi
+; AVX2-NEXT:    lzcntq %rcx, %rdi
+; AVX2-NEXT:    lzcntq %rax, %rax
+; AVX2-NEXT:    addl $64, %eax
+; AVX2-NEXT:    testq %rcx, %rcx
+; AVX2-NEXT:    cmovnel %edi, %eax
+; AVX2-NEXT:    subl $-128, %eax
+; AVX2-NEXT:    orq %rsi, %rdx
+; AVX2-NEXT:    cmovnel %r8d, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: vector_ctlz_undef_i256:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, %rdx
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512F-NEXT:    lzcntq %rsi, %rdi
+; AVX512F-NEXT:    lzcntq %rdx, %r8
+; AVX512F-NEXT:    addl $64, %r8d
+; AVX512F-NEXT:    testq %rsi, %rsi
+; AVX512F-NEXT:    cmovnel %edi, %r8d
+; AVX512F-NEXT:    lzcntq %rcx, %rdi
+; AVX512F-NEXT:    lzcntq %rax, %rax
+; AVX512F-NEXT:    addl $64, %eax
+; AVX512F-NEXT:    testq %rcx, %rcx
+; AVX512F-NEXT:    cmovnel %edi, %eax
+; AVX512F-NEXT:    subl $-128, %eax
+; AVX512F-NEXT:    orq %rsi, %rdx
+; AVX512F-NEXT:    cmovnel %r8d, %eax
+; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: vector_ctlz_undef_i256:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT:    vmovq %xmm0, %rdx
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512VL-NEXT:    lzcntq %rsi, %rdi
+; AVX512VL-NEXT:    lzcntq %rdx, %r8
+; AVX512VL-NEXT:    addl $64, %r8d
+; AVX512VL-NEXT:    testq %rsi, %rsi
+; AVX512VL-NEXT:    cmovnel %edi, %r8d
+; AVX512VL-NEXT:    lzcntq %rcx, %rdi
+; AVX512VL-NEXT:    lzcntq %rax, %rax
+; AVX512VL-NEXT:    addl $64, %eax
+; AVX512VL-NEXT:    testq %rcx, %rcx
+; AVX512VL-NEXT:    cmovnel %edi, %eax
+; AVX512VL-NEXT:    subl $-128, %eax
+; AVX512VL-NEXT:    orq %rsi, %rdx
+; AVX512VL-NEXT:    cmovnel %r8d, %eax
+; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512POPCNT-LABEL: vector_ctlz_undef_i256:
+; AVX512POPCNT:       # %bb.0:
+; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX512POPCNT-NEXT:    vmovq %xmm0, %rax
+; AVX512POPCNT-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512POPCNT-NEXT:    vmovq %xmm0, %rdx
+; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512POPCNT-NEXT:    lzcntq %rsi, %rdi
+; AVX512POPCNT-NEXT:    lzcntq %rdx, %r8
+; AVX512POPCNT-NEXT:    addl $64, %r8d
+; AVX512POPCNT-NEXT:    testq %rsi, %rsi
+; AVX512POPCNT-NEXT:    cmovnel %edi, %r8d
+; AVX512POPCNT-NEXT:    lzcntq %rcx, %rdi
+; AVX512POPCNT-NEXT:    lzcntq %rax, %rax
+; AVX512POPCNT-NEXT:    addl $64, %eax
+; AVX512POPCNT-NEXT:    testq %rcx, %rcx
+; AVX512POPCNT-NEXT:    cmovnel %edi, %eax
+; AVX512POPCNT-NEXT:    subl $-128, %eax
+; AVX512POPCNT-NEXT:    orq %rsi, %rdx
+; AVX512POPCNT-NEXT:    cmovnel %r8d, %eax
+; AVX512POPCNT-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT:    vzeroupper
+; AVX512POPCNT-NEXT:    retq
+  %a0 = bitcast <8 x i32> %v0 to i256
+  %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1)
+  %res = trunc i256 %cnt to i32
+  ret i32 %res
+}
+
 define i32 @test_ctlz_undef_i512(i512 %a0) nounwind {
 ; SSE-LABEL: test_ctlz_undef_i512:
 ; SSE:       # %bb.0:
@@ -2796,6 +3653,147 @@ define i32 @load_ctlz_undef_i512(ptr %p0) nounwind {
   ret i32 %res
 }
 
+define i32 @vector_ctlz_undef_i512(<16 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_undef_i512:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE-NEXT:    pextrq $1, %xmm1, %rax
+; SSE-NEXT:    pextrq $1, %xmm2, %rsi
+; SSE-NEXT:    movq %xmm2, %rdx
+; SSE-NEXT:    movq %xmm3, %rdi
+; SSE-NEXT:    pextrq $1, %xmm3, %r8
+; SSE-NEXT:    bsrq %r8, %r9
+; SSE-NEXT:    xorl $63, %r9d
+; SSE-NEXT:    bsrq %rdi, %rdi
+; SSE-NEXT:    xorl $63, %edi
+; SSE-NEXT:    orl $64, %edi
+; SSE-NEXT:    testq %r8, %r8
+; SSE-NEXT:    cmovnel %r9d, %edi
+; SSE-NEXT:    bsrq %rsi, %r8
+; SSE-NEXT:    xorl $63, %r8d
+; SSE-NEXT:    bsrq %rdx, %rdx
+; SSE-NEXT:    xorl $63, %edx
+; SSE-NEXT:    orl $64, %edx
+; SSE-NEXT:    testq %rsi, %rsi
+; SSE-NEXT:    cmovnel %r8d, %edx
+; SSE-NEXT:    movq %xmm0, %rsi
+; SSE-NEXT:    subl $-128, %edx
+; SSE-NEXT:    ptest %xmm3, %xmm3
+; SSE-NEXT:    movq %xmm1, %r8
+; SSE-NEXT:    cmovnel %edi, %edx
+; SSE-NEXT:    bsrq %rax, %rdi
+; SSE-NEXT:    xorl $63, %edi
+; SSE-NEXT:    bsrq %r8, %r8
+; SSE-NEXT:    xorl $63, %r8d
+; SSE-NEXT:    orl $64, %r8d
+; SSE-NEXT:    testq %rax, %rax
+; SSE-NEXT:    cmovnel %edi, %r8d
+; SSE-NEXT:    bsrq %rcx, %rdi
+; SSE-NEXT:    xorl $63, %edi
+; SSE-NEXT:    bsrq %rsi, %rax
+; SSE-NEXT:    xorl $63, %eax
+; SSE-NEXT:    orl $64, %eax
+; SSE-NEXT:    testq %rcx, %rcx
+; SSE-NEXT:    cmovnel %edi, %eax
+; SSE-NEXT:    subl $-128, %eax
+; SSE-NEXT:    ptest %xmm1, %xmm1
+; SSE-NEXT:    cmovnel %r8d, %eax
+; SSE-NEXT:    addl $256, %eax # imm = 0x100
+; SSE-NEXT:    por %xmm3, %xmm2
+; SSE-NEXT:    ptest %xmm2, %xmm2
+; SSE-NEXT:    cmovnel %edx, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: vector_ctlz_undef_i512:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT:    vpextrq $1, %xmm2, %rcx
+; AVX2-NEXT:    vmovq %xmm2, %rdx
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX2-NEXT:    vmovq %xmm2, %r8
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT:    vmovq %xmm1, %rdi
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r9
+; AVX2-NEXT:    lzcntq %rax, %r10
+; AVX2-NEXT:    lzcntq %r8, %r11
+; AVX2-NEXT:    addl $64, %r11d
+; AVX2-NEXT:    testq %rax, %rax
+; AVX2-NEXT:    cmovnel %r10d, %r11d
+; AVX2-NEXT:    xorl %r10d, %r10d
+; AVX2-NEXT:    lzcntq %r9, %r10
+; AVX2-NEXT:    lzcntq %rdi, %rdi
+; AVX2-NEXT:    addl $64, %edi
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %r10d, %edi
+; AVX2-NEXT:    subl $-128, %edi
+; AVX2-NEXT:    orq %rax, %r8
+; AVX2-NEXT:    cmovnel %r11d, %edi
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    lzcntq %rcx, %rax
+; AVX2-NEXT:    xorl %r8d, %r8d
+; AVX2-NEXT:    lzcntq %rdx, %r8
+; AVX2-NEXT:    addl $64, %r8d
+; AVX2-NEXT:    testq %rcx, %rcx
+; AVX2-NEXT:    cmovnel %eax, %r8d
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    xorl %r9d, %r9d
+; AVX2-NEXT:    lzcntq %rsi, %r9
+; AVX2-NEXT:    lzcntq %rax, %rax
+; AVX2-NEXT:    addl $64, %eax
+; AVX2-NEXT:    testq %rsi, %rsi
+; AVX2-NEXT:    cmovnel %r9d, %eax
+; AVX2-NEXT:    subl $-128, %eax
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    cmovnel %r8d, %eax
+; AVX2-NEXT:    addl $256, %eax # imm = 0x100
+; AVX2-NEXT:    vptest %ymm1, %ymm1
+; AVX2-NEXT:    cmovnel %edi, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: vector_ctlz_undef_i512:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vplzcntq %zmm0, %zmm0
+; AVX512F-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: vector_ctlz_undef_i512:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT:    vplzcntq %zmm0, %zmm0
+; AVX512VL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512VL-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512POPCNT-LABEL: vector_ctlz_undef_i512:
+; AVX512POPCNT:       # %bb.0:
+; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512POPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT:    vplzcntq %zmm0, %zmm0
+; AVX512POPCNT-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512POPCNT-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512POPCNT-NEXT:    vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT:    vzeroupper
+; AVX512POPCNT-NEXT:    retq
+  %a0 = bitcast <16 x i32> %v0 to i512
+  %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 -1)
+  %res = trunc i512 %cnt to i32
+  ret i32 %res
+}
+
 define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind {
 ; SSE-LABEL: test_ctlz_undef_i1024:
 ; SSE:       # %bb.0:
@@ -3636,6 +4634,49 @@ define i32 @load_cttz_i128(ptr %p0) nounwind {
   ret i32 %res
 }
 
+define i32 @vector_cttz_i128(<4 x i32> %v0) nounwind {
+; SSE-LABEL: vector_cttz_i128:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE-NEXT:    movq %xmm0, %rdx
+; SSE-NEXT:    rep bsfq %rdx, %rsi
+; SSE-NEXT:    movl $64, %eax
+; SSE-NEXT:    rep bsfq %rcx, %rax
+; SSE-NEXT:    addl $64, %eax
+; SSE-NEXT:    testq %rdx, %rdx
+; SSE-NEXT:    cmovnel %esi, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: vector_cttz_i128:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vmovq %xmm0, %rcx
+; AVX2-NEXT:    tzcntq %rcx, %rdx
+; AVX2-NEXT:    tzcntq %rax, %rax
+; AVX2-NEXT:    addl $64, %eax
+; AVX2-NEXT:    testq %rcx, %rcx
+; AVX2-NEXT:    cmovnel %edx, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: vector_cttz_i128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512-NEXT:    vmovq %xmm0, %rcx
+; AVX512-NEXT:    tzcntq %rcx, %rdx
+; AVX512-NEXT:    tzcntq %rax, %rax
+; AVX512-NEXT:    addl $64, %eax
+; AVX512-NEXT:    testq %rcx, %rcx
+; AVX512-NEXT:    cmovnel %edx, %eax
+; AVX512-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT:    retq
+  %a0 = bitcast <4 x i32> %v0 to i128
+  %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 0)
+  %res = trunc i128 %cnt to i32
+  ret i32 %res
+}
+
 define i32 @test_cttz_i256(i256 %a0) nounwind {
 ; SSE-LABEL: test_cttz_i256:
 ; SSE:       # %bb.0:
@@ -3775,21 +4816,146 @@ define i32 @load_cttz_i256(ptr %p0) nounwind {
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
-; AVX512POPCNT-LABEL: load_cttz_i256:
+; AVX512POPCNT-LABEL: load_cttz_i256:
+; AVX512POPCNT:       # %bb.0:
+; AVX512POPCNT-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX512POPCNT-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX512POPCNT-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
+; AVX512POPCNT-NEXT:    vpandn %ymm1, %ymm0, %ymm1
+; AVX512POPCNT-NEXT:    vpopcntq %ymm1, %ymm1
+; AVX512POPCNT-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512POPCNT-NEXT:    vptestmq %ymm0, %ymm0, %k1
+; AVX512POPCNT-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
+; AVX512POPCNT-NEXT:    vpcompressq %ymm1, %ymm0 {%k1}
+; AVX512POPCNT-NEXT:    vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT:    vzeroupper
+; AVX512POPCNT-NEXT:    retq
+  %a0 = load i256, ptr %p0
+  %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0)
+  %res = trunc i256 %cnt to i32
+  ret i32 %res
+}
+
+define i32 @vector_cttz_i256(<8 x i32> %v0) nounwind {
+; SSE-LABEL: vector_cttz_i256:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pextrq $1, %xmm1, %rcx
+; SSE-NEXT:    pextrq $1, %xmm0, %rax
+; SSE-NEXT:    movq %xmm0, %rdx
+; SSE-NEXT:    rep bsfq %rdx, %rsi
+; SSE-NEXT:    rep bsfq %rax, %rdi
+; SSE-NEXT:    addl $64, %edi
+; SSE-NEXT:    testq %rdx, %rdx
+; SSE-NEXT:    cmovnel %esi, %edi
+; SSE-NEXT:    movq %xmm1, %rdx
+; SSE-NEXT:    rep bsfq %rdx, %rsi
+; SSE-NEXT:    movl $64, %eax
+; SSE-NEXT:    rep bsfq %rcx, %rax
+; SSE-NEXT:    addl $64, %eax
+; SSE-NEXT:    testq %rdx, %rdx
+; SSE-NEXT:    cmovnel %esi, %eax
+; SSE-NEXT:    subl $-128, %eax
+; SSE-NEXT:    ptest %xmm0, %xmm0
+; SSE-NEXT:    cmovnel %edi, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: vector_cttz_i256:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX2-NEXT:    vmovq %xmm1, %rcx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT:    vmovq %xmm0, %rsi
+; AVX2-NEXT:    tzcntq %rsi, %rdi
+; AVX2-NEXT:    tzcntq %rdx, %r8
+; AVX2-NEXT:    addl $64, %r8d
+; AVX2-NEXT:    testq %rsi, %rsi
+; AVX2-NEXT:    cmovnel %edi, %r8d
+; AVX2-NEXT:    xorl %edi, %edi
+; AVX2-NEXT:    tzcntq %rcx, %rdi
+; AVX2-NEXT:    tzcntq %rax, %rax
+; AVX2-NEXT:    addl $64, %eax
+; AVX2-NEXT:    testq %rcx, %rcx
+; AVX2-NEXT:    cmovnel %edi, %eax
+; AVX2-NEXT:    subl $-128, %eax
+; AVX2-NEXT:    orq %rdx, %rsi
+; AVX2-NEXT:    cmovnel %r8d, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: vector_cttz_i256:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT:    vmovq %xmm1, %rcx
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT:    vmovq %xmm0, %rsi
+; AVX512F-NEXT:    tzcntq %rsi, %rdi
+; AVX512F-NEXT:    tzcntq %rdx, %r8
+; AVX512F-NEXT:    addl $64, %r8d
+; AVX512F-NEXT:    testq %rsi, %rsi
+; AVX512F-NEXT:    cmovnel %edi, %r8d
+; AVX512F-NEXT:    tzcntq %rcx, %rdi
+; AVX512F-NEXT:    tzcntq %rax, %rax
+; AVX512F-NEXT:    addl $64, %eax
+; AVX512F-NEXT:    testq %rcx, %rcx
+; AVX512F-NEXT:    cmovnel %edi, %eax
+; AVX512F-NEXT:    subl $-128, %eax
+; AVX512F-NEXT:    orq %rdx, %rsi
+; AVX512F-NEXT:    cmovnel %r8d, %eax
+; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: vector_cttz_i256:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT:    vmovq %xmm1, %rcx
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512VL-NEXT:    vmovq %xmm0, %rsi
+; AVX512VL-NEXT:    tzcntq %rsi, %rdi
+; AVX512VL-NEXT:    tzcntq %rdx, %r8
+; AVX512VL-NEXT:    addl $64, %r8d
+; AVX512VL-NEXT:    testq %rsi, %rsi
+; AVX512VL-NEXT:    cmovnel %edi, %r8d
+; AVX512VL-NEXT:    tzcntq %rcx, %rdi
+; AVX512VL-NEXT:    tzcntq %rax, %rax
+; AVX512VL-NEXT:    addl $64, %eax
+; AVX512VL-NEXT:    testq %rcx, %rcx
+; AVX512VL-NEXT:    cmovnel %edi, %eax
+; AVX512VL-NEXT:    subl $-128, %eax
+; AVX512VL-NEXT:    orq %rdx, %rsi
+; AVX512VL-NEXT:    cmovnel %r8d, %eax
+; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512POPCNT-LABEL: vector_cttz_i256:
 ; AVX512POPCNT:       # %bb.0:
-; AVX512POPCNT-NEXT:    vmovdqu (%rdi), %ymm0
-; AVX512POPCNT-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512POPCNT-NEXT:    vpaddq %ymm1, %ymm0, %ymm1
-; AVX512POPCNT-NEXT:    vpandn %ymm1, %ymm0, %ymm1
-; AVX512POPCNT-NEXT:    vpopcntq %ymm1, %ymm1
-; AVX512POPCNT-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512POPCNT-NEXT:    vptestmq %ymm0, %ymm0, %k1
-; AVX512POPCNT-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256]
-; AVX512POPCNT-NEXT:    vpcompressq %ymm1, %ymm0 {%k1}
-; AVX512POPCNT-NEXT:    vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512POPCNT-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512POPCNT-NEXT:    vmovq %xmm1, %rcx
+; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512POPCNT-NEXT:    vmovq %xmm0, %rsi
+; AVX512POPCNT-NEXT:    tzcntq %rsi, %rdi
+; AVX512POPCNT-NEXT:    tzcntq %rdx, %r8
+; AVX512POPCNT-NEXT:    addl $64, %r8d
+; AVX512POPCNT-NEXT:    testq %rsi, %rsi
+; AVX512POPCNT-NEXT:    cmovnel %edi, %r8d
+; AVX512POPCNT-NEXT:    tzcntq %rcx, %rdi
+; AVX512POPCNT-NEXT:    tzcntq %rax, %rax
+; AVX512POPCNT-NEXT:    addl $64, %eax
+; AVX512POPCNT-NEXT:    testq %rcx, %rcx
+; AVX512POPCNT-NEXT:    cmovnel %edi, %eax
+; AVX512POPCNT-NEXT:    subl $-128, %eax
+; AVX512POPCNT-NEXT:    orq %rdx, %rsi
+; AVX512POPCNT-NEXT:    cmovnel %r8d, %eax
+; AVX512POPCNT-NEXT:    # kill: def $eax killed $eax killed $rax
 ; AVX512POPCNT-NEXT:    vzeroupper
 ; AVX512POPCNT-NEXT:    retq
-  %a0 = load i256, ptr %p0
+  %a0 = bitcast <8 x i32> %v0 to i256
   %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0)
   %res = trunc i256 %cnt to i32
   ret i32 %res
@@ -4128,6 +5294,148 @@ define i32 @load_cttz_i512(ptr %p0) nounwind {
   ret i32 %res
 }
 
+define i32 @vector_cttz_i512(<16 x i32> %v0) nounwind {
+; SSE-LABEL: vector_cttz_i512:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pextrq $1, %xmm3, %rdx
+; SSE-NEXT:    movq %xmm3, %rcx
+; SSE-NEXT:    pextrq $1, %xmm2, %rax
+; SSE-NEXT:    pextrq $1, %xmm1, %rsi
+; SSE-NEXT:    movq %xmm1, %rdi
+; SSE-NEXT:    pextrq $1, %xmm0, %r8
+; SSE-NEXT:    movq %xmm0, %r9
+; SSE-NEXT:    rep bsfq %r9, %r10
+; SSE-NEXT:    rep bsfq %r8, %r8
+; SSE-NEXT:    addl $64, %r8d
+; SSE-NEXT:    testq %r9, %r9
+; SSE-NEXT:    cmovnel %r10d, %r8d
+; SSE-NEXT:    rep bsfq %rdi, %r9
+; SSE-NEXT:    rep bsfq %rsi, %rsi
+; SSE-NEXT:    addl $64, %esi
+; SSE-NEXT:    testq %rdi, %rdi
+; SSE-NEXT:    cmovnel %r9d, %esi
+; SSE-NEXT:    movq %xmm2, %rdi
+; SSE-NEXT:    subl $-128, %esi
+; SSE-NEXT:    ptest %xmm0, %xmm0
+; SSE-NEXT:    cmovnel %r8d, %esi
+; SSE-NEXT:    rep bsfq %rdi, %r8
+; SSE-NEXT:    rep bsfq %rax, %r9
+; SSE-NEXT:    addl $64, %r9d
+; SSE-NEXT:    testq %rdi, %rdi
+; SSE-NEXT:    cmovnel %r8d, %r9d
+; SSE-NEXT:    rep bsfq %rcx, %rdi
+; SSE-NEXT:    movl $64, %eax
+; SSE-NEXT:    rep bsfq %rdx, %rax
+; SSE-NEXT:    addl $64, %eax
+; SSE-NEXT:    testq %rcx, %rcx
+; SSE-NEXT:    cmovnel %edi, %eax
+; SSE-NEXT:    subl $-128, %eax
+; SSE-NEXT:    ptest %xmm2, %xmm2
+; SSE-NEXT:    cmovnel %r9d, %eax
+; SSE-NEXT:    addl $256, %eax # imm = 0x100
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    ptest %xmm0, %xmm0
+; SSE-NEXT:    cmovnel %esi, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: vector_cttz_i512:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT:    vmovq %xmm1, %rdx
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX2-NEXT:    vmovq %xmm1, %r8
+; AVX2-NEXT:    vmovq %xmm0, %r9
+; AVX2-NEXT:    tzcntq %r9, %r10
+; AVX2-NEXT:    tzcntq %rdi, %r11
+; AVX2-NEXT:    addl $64, %r11d
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %r10d, %r11d
+; AVX2-NEXT:    xorl %r10d, %r10d
+; AVX2-NEXT:    tzcntq %r8, %r10
+; AVX2-NEXT:    tzcntq %rsi, %rsi
+; AVX2-NEXT:    addl $64, %esi
+; AVX2-NEXT:    testq %r8, %r8
+; AVX2-NEXT:    cmovnel %r10d, %esi
+; AVX2-NEXT:    subl $-128, %esi
+; AVX2-NEXT:    orq %rdi, %r9
+; AVX2-NEXT:    cmovnel %r11d, %esi
+; AVX2-NEXT:    xorl %edi, %edi
+; AVX2-NEXT:    tzcntq %rdx, %rdi
+; AVX2-NEXT:    xorl %r8d, %r8d
+; AVX2-NEXT:    tzcntq %rcx, %r8
+; AVX2-NEXT:    addl $64, %r8d
+; AVX2-NEXT:    testq %rdx, %rdx
+; AVX2-NEXT:    cmovnel %edi, %r8d
+; AVX2-NEXT:    vmovq %xmm2, %rdi
+; AVX2-NEXT:    xorl %r9d, %r9d
+; AVX2-NEXT:    tzcntq %rdi, %r9
+; AVX2-NEXT:    tzcntq %rax, %rax
+; AVX2-NEXT:    addl $64, %eax
+; AVX2-NEXT:    testq %rdi, %rdi
+; AVX2-NEXT:    cmovnel %r9d, %eax
+; AVX2-NEXT:    subl $-128, %eax
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    cmovnel %r8d, %eax
+; AVX2-NEXT:    addl $256, %eax # imm = 0x100
+; AVX2-NEXT:    vptest %ymm0, %ymm0
+; AVX2-NEXT:    cmovnel %esi, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: vector_cttz_i512:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vpandnq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512F-NEXT:    vpsubq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: vector_cttz_i512:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT:    vpandnq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT:    vplzcntq %zmm1, %zmm1
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512VL-NEXT:    vpsubq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512POPCNT-LABEL: vector_cttz_i512:
+; AVX512POPCNT:       # %bb.0:
+; AVX512POPCNT-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512POPCNT-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT:    vpandnq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT:    vpopcntq %zmm1, %zmm1
+; AVX512POPCNT-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512POPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
+; AVX512POPCNT-NEXT:    vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT:    vzeroupper
+; AVX512POPCNT-NEXT:    retq
+  %a0 = bitcast <16 x i32> %v0 to i512
+  %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0)
+  %res = trunc i512 %cnt to i32
+  ret i32 %res
+}
+
 define i32 @test_cttz_i1024(i1024 %a0) nounwind {
 ; SSE-LABEL: test_cttz_i1024:
 ; SSE:       # %bb.0:
@@ -4930,6 +6238,48 @@ define i32 @load_cttz_undef_i128(ptr %p0) nounwind {
   ret i32 %res
 }
 
+define i32 @vector_cttz_undef_i128(<4 x i32> %v0) nounwind {
+; SSE-LABEL: vector_cttz_undef_i128:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pextrq $1, %xmm0, %rax
+; SSE-NEXT:    movq %xmm0, %rcx
+; SSE-NEXT:    rep bsfq %rcx, %rdx
+; SSE-NEXT:    rep bsfq %rax, %rax
+; SSE-NEXT:    addl $64, %eax
+; SSE-NEXT:    testq %rcx, %rcx
+; SSE-NEXT:    cmovnel %edx, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: vector_cttz_undef_i128:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    vmovq %xmm0, %rcx
+; AVX2-NEXT:    tzcntq %rcx, %rdx
+; AVX2-NEXT:    tzcntq %rax, %rax
+; AVX2-NEXT:    addl $64, %eax
+; AVX2-NEXT:    testq %rcx, %rcx
+; AVX2-NEXT:    cmovnel %edx, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: vector_cttz_undef_i128:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512-NEXT:    vmovq %xmm0, %rcx
+; AVX512-NEXT:    tzcntq %rcx, %rdx
+; AVX512-NEXT:    tzcntq %rax, %rax
+; AVX512-NEXT:    addl $64, %eax
+; AVX512-NEXT:    testq %rcx, %rcx
+; AVX512-NEXT:    cmovnel %edx, %eax
+; AVX512-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512-NEXT:    retq
+  %a0 = bitcast <4 x i32> %v0 to i128
+  %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 -1)
+  %res = trunc i128 %cnt to i32
+  ret i32 %res
+}
+
 define i32 @test_cttz_undef_i256(i256 %a0) nounwind {
 ; SSE-LABEL: test_cttz_undef_i256:
 ; SSE:       # %bb.0:
@@ -5084,6 +6434,130 @@ define i32 @load_cttz_undef_i256(ptr %p0) nounwind {
   ret i32 %res
 }
 
+define i32 @vector_cttz_undef_i256(<8 x i32> %v0) nounwind {
+; SSE-LABEL: vector_cttz_undef_i256:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pextrq $1, %xmm1, %rax
+; SSE-NEXT:    movq %xmm1, %rcx
+; SSE-NEXT:    pextrq $1, %xmm0, %rdx
+; SSE-NEXT:    movq %xmm0, %rsi
+; SSE-NEXT:    rep bsfq %rsi, %rdi
+; SSE-NEXT:    rep bsfq %rdx, %rdx
+; SSE-NEXT:    addl $64, %edx
+; SSE-NEXT:    testq %rsi, %rsi
+; SSE-NEXT:    cmovnel %edi, %edx
+; SSE-NEXT:    rep bsfq %rcx, %rsi
+; SSE-NEXT:    rep bsfq %rax, %rax
+; SSE-NEXT:    addl $64, %eax
+; SSE-NEXT:    testq %rcx, %rcx
+; SSE-NEXT:    cmovnel %esi, %eax
+; SSE-NEXT:    subl $-128, %eax
+; SSE-NEXT:    ptest %xmm0, %xmm0
+; SSE-NEXT:    cmovnel %edx, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: vector_cttz_undef_i256:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX2-NEXT:    vmovq %xmm1, %rcx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT:    vmovq %xmm0, %rsi
+; AVX2-NEXT:    tzcntq %rsi, %rdi
+; AVX2-NEXT:    tzcntq %rdx, %r8
+; AVX2-NEXT:    addl $64, %r8d
+; AVX2-NEXT:    testq %rsi, %rsi
+; AVX2-NEXT:    cmovnel %edi, %r8d
+; AVX2-NEXT:    xorl %edi, %edi
+; AVX2-NEXT:    tzcntq %rcx, %rdi
+; AVX2-NEXT:    tzcntq %rax, %rax
+; AVX2-NEXT:    addl $64, %eax
+; AVX2-NEXT:    testq %rcx, %rcx
+; AVX2-NEXT:    cmovnel %edi, %eax
+; AVX2-NEXT:    subl $-128, %eax
+; AVX2-NEXT:    orq %rdx, %rsi
+; AVX2-NEXT:    cmovnel %r8d, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: vector_cttz_undef_i256:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512F-NEXT:    vmovq %xmm1, %rcx
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT:    vmovq %xmm0, %rsi
+; AVX512F-NEXT:    tzcntq %rsi, %rdi
+; AVX512F-NEXT:    tzcntq %rdx, %r8
+; AVX512F-NEXT:    addl $64, %r8d
+; AVX512F-NEXT:    testq %rsi, %rsi
+; AVX512F-NEXT:    cmovnel %edi, %r8d
+; AVX512F-NEXT:    tzcntq %rcx, %rdi
+; AVX512F-NEXT:    tzcntq %rax, %rax
+; AVX512F-NEXT:    addl $64, %eax
+; AVX512F-NEXT:    testq %rcx, %rcx
+; AVX512F-NEXT:    cmovnel %edi, %eax
+; AVX512F-NEXT:    subl $-128, %eax
+; AVX512F-NEXT:    orq %rdx, %rsi
+; AVX512F-NEXT:    cmovnel %r8d, %eax
+; AVX512F-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: vector_cttz_undef_i256:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512VL-NEXT:    vmovq %xmm1, %rcx
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512VL-NEXT:    vmovq %xmm0, %rsi
+; AVX512VL-NEXT:    tzcntq %rsi, %rdi
+; AVX512VL-NEXT:    tzcntq %rdx, %r8
+; AVX512VL-NEXT:    addl $64, %r8d
+; AVX512VL-NEXT:    testq %rsi, %rsi
+; AVX512VL-NEXT:    cmovnel %edi, %r8d
+; AVX512VL-NEXT:    tzcntq %rcx, %rdi
+; AVX512VL-NEXT:    tzcntq %rax, %rax
+; AVX512VL-NEXT:    addl $64, %eax
+; AVX512VL-NEXT:    testq %rcx, %rcx
+; AVX512VL-NEXT:    cmovnel %edi, %eax
+; AVX512VL-NEXT:    subl $-128, %eax
+; AVX512VL-NEXT:    orq %rdx, %rsi
+; AVX512VL-NEXT:    cmovnel %r8d, %eax
+; AVX512VL-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512POPCNT-LABEL: vector_cttz_undef_i256:
+; AVX512POPCNT:       # %bb.0:
+; AVX512POPCNT-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512POPCNT-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512POPCNT-NEXT:    vmovq %xmm1, %rcx
+; AVX512POPCNT-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512POPCNT-NEXT:    vmovq %xmm0, %rsi
+; AVX512POPCNT-NEXT:    tzcntq %rsi, %rdi
+; AVX512POPCNT-NEXT:    tzcntq %rdx, %r8
+; AVX512POPCNT-NEXT:    addl $64, %r8d
+; AVX512POPCNT-NEXT:    testq %rsi, %rsi
+; AVX512POPCNT-NEXT:    cmovnel %edi, %r8d
+; AVX512POPCNT-NEXT:    tzcntq %rcx, %rdi
+; AVX512POPCNT-NEXT:    tzcntq %rax, %rax
+; AVX512POPCNT-NEXT:    addl $64, %eax
+; AVX512POPCNT-NEXT:    testq %rcx, %rcx
+; AVX512POPCNT-NEXT:    cmovnel %edi, %eax
+; AVX512POPCNT-NEXT:    subl $-128, %eax
+; AVX512POPCNT-NEXT:    orq %rdx, %rsi
+; AVX512POPCNT-NEXT:    cmovnel %r8d, %eax
+; AVX512POPCNT-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT:    vzeroupper
+; AVX512POPCNT-NEXT:    retq
+  %a0 = bitcast <8 x i32> %v0 to i256
+  %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 -1)
+  %res = trunc i256 %cnt to i32
+  ret i32 %res
+}
+
 define i32 @test_cttz_undef_i512(i512 %a0) nounwind {
 ; SSE-LABEL: test_cttz_undef_i512:
 ; SSE:       # %bb.0:
@@ -5409,6 +6883,144 @@ define i32 @load_cttz_undef_i512(ptr %p0) nounwind {
   ret i32 %res
 }
 
+define i32 @vector_cttz_undef_i512(<16 x i32> %v0) nounwind {
+; SSE-LABEL: vector_cttz_undef_i512:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pextrq $1, %xmm3, %rax
+; SSE-NEXT:    pextrq $1, %xmm2, %rdx
+; SSE-NEXT:    pextrq $1, %xmm1, %rcx
+; SSE-NEXT:    movq %xmm1, %rsi
+; SSE-NEXT:    pextrq $1, %xmm0, %rdi
+; SSE-NEXT:    movq %xmm0, %r8
+; SSE-NEXT:    rep bsfq %r8, %r9
+; SSE-NEXT:    rep bsfq %rdi, %rdi
+; SSE-NEXT:    addl $64, %edi
+; SSE-NEXT:    testq %r8, %r8
+; SSE-NEXT:    cmovnel %r9d, %edi
+; SSE-NEXT:    rep bsfq %rsi, %r8
+; SSE-NEXT:    rep bsfq %rcx, %rcx
+; SSE-NEXT:    addl $64, %ecx
+; SSE-NEXT:    testq %rsi, %rsi
+; SSE-NEXT:    cmovnel %r8d, %ecx
+; SSE-NEXT:    movq %xmm2, %rsi
+; SSE-NEXT:    subl $-128, %ecx
+; SSE-NEXT:    ptest %xmm0, %xmm0
+; SSE-NEXT:    cmovnel %edi, %ecx
+; SSE-NEXT:    rep bsfq %rsi, %rdi
+; SSE-NEXT:    rep bsfq %rdx, %rdx
+; SSE-NEXT:    addl $64, %edx
+; SSE-NEXT:    testq %rsi, %rsi
+; SSE-NEXT:    cmovnel %edi, %edx
+; SSE-NEXT:    movq %xmm3, %rsi
+; SSE-NEXT:    rep bsfq %rsi, %rdi
+; SSE-NEXT:    rep bsfq %rax, %rax
+; SSE-NEXT:    addl $64, %eax
+; SSE-NEXT:    testq %rsi, %rsi
+; SSE-NEXT:    cmovnel %edi, %eax
+; SSE-NEXT:    subl $-128, %eax
+; SSE-NEXT:    ptest %xmm2, %xmm2
+; SSE-NEXT:    cmovnel %edx, %eax
+; SSE-NEXT:    addl $256, %eax # imm = 0x100
+; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    ptest %xmm0, %xmm0
+; SSE-NEXT:    cmovnel %ecx, %eax
+; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: vector_cttz_undef_i512:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT:    vmovq %xmm1, %rdx
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX2-NEXT:    vmovq %xmm1, %r8
+; AVX2-NEXT:    vmovq %xmm0, %r9
+; AVX2-NEXT:    tzcntq %r9, %r10
+; AVX2-NEXT:    tzcntq %rdi, %r11
+; AVX2-NEXT:    addl $64, %r11d
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %r10d, %r11d
+; AVX2-NEXT:    xorl %r10d, %r10d
+; AVX2-NEXT:    tzcntq %r8, %r10
+; AVX2-NEXT:    tzcntq %rsi, %rsi
+; AVX2-NEXT:    addl $64, %esi
+; AVX2-NEXT:    testq %r8, %r8
+; AVX2-NEXT:    cmovnel %r10d, %esi
+; AVX2-NEXT:    subl $-128, %esi
+; AVX2-NEXT:    orq %rdi, %r9
+; AVX2-NEXT:    cmovnel %r11d, %esi
+; AVX2-NEXT:    xorl %edi, %edi
+; AVX2-NEXT:    tzcntq %rdx, %rdi
+; AVX2-NEXT:    xorl %r8d, %r8d
+; AVX2-NEXT:    tzcntq %rcx, %r8
+; AVX2-NEXT:    addl $64, %r8d
+; AVX2-NEXT:    testq %rdx, %rdx
+; AVX2-NEXT:    cmovnel %edi, %r8d
+; AVX2-NEXT:    vmovq %xmm2, %rdi
+; AVX2-NEXT:    xorl %r9d, %r9d
+; AVX2-NEXT:    tzcntq %rdi, %r9
+; AVX2-NEXT:    tzcntq %rax, %rax
+; AVX2-NEXT:    addl $64, %eax
+; AVX2-NEXT:    testq %rdi, %rdi
+; AVX2-NEXT:    cmovnel %r9d, %eax
+; AVX2-NEXT:    subl $-128, %eax
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    cmovnel %r8d, %eax
+; AVX2-NEXT:    addl $256, %eax # imm = 0x100
+; AVX2-NEXT:    vptest %ymm0, %ymm0
+; AVX2-NEXT:    cmovnel %esi, %eax
+; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: vector_cttz_undef_i512:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vpandnq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512F-NEXT:    vpsubq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: vector_cttz_undef_i512:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT:    vpandnq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512]
+; AVX512VL-NEXT:    vplzcntq %zmm1, %zmm1
+; AVX512VL-NEXT:    vpsubq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512POPCNT-LABEL: vector_cttz_undef_i512:
+; AVX512POPCNT:       # %bb.0:
+; AVX512POPCNT-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512POPCNT-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT:    vpandnq %zmm1, %zmm0, %zmm1
+; AVX512POPCNT-NEXT:    vpopcntq %zmm1, %zmm1
+; AVX512POPCNT-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512POPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512POPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512POPCNT-NEXT:    vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT:    vzeroupper
+; AVX512POPCNT-NEXT:    retq
+  %a0 = bitcast <16 x i32> %v0 to i512
+  %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 -1)
+  %res = trunc i512 %cnt to i32
+  ret i32 %res
+}
+
 define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind {
 ; SSE-LABEL: test_cttz_undef_i1024:
 ; SSE:       # %bb.0:

From 93d2ef105703254769a8f182300b329dad5ed976 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Tue, 9 Dec 2025 20:20:54 +0200
Subject: [PATCH 39/85] [mlir][bytecode] Add support for deferred
 attribute/type parsing. (#170993)

Add ability to defer parsing and re-enqueueing oneself. This enables
changing CallSiteLoc parsing to not recurse as deeply: previously this
could fail (especially on large inputs in debug mode the recursion could
overflow). Add a default depth cutoff, this could be a parameter later
if needed.
---
 mlir/lib/Bytecode/Reader/BytecodeReader.cpp | 257 ++++++++++++++++----
 mlir/unittests/Bytecode/BytecodeTest.cpp    |  37 +++
 2 files changed, 241 insertions(+), 53 deletions(-)

diff --git a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
index 1659437e1eb24..dd367b5922558 100644
--- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
+++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
@@ -27,6 +27,7 @@
 
 #include <cstddef>
 #include <cstdint>
+#include <deque>
 #include <list>
 #include <memory>
 #include <numeric>
@@ -830,6 +831,23 @@ namespace {
 /// This class provides support for reading attribute and type entries from the
 /// bytecode. Attribute and Type entries are read lazily on demand, so we use
 /// this reader to manage when to actually parse them from the bytecode.
+///
+/// The parsing of attributes & types are generally recursive, this can lead to
+/// stack overflows for deeply nested structures, so we track a few extra pieces
+/// of information to avoid this:
+///
+/// - `depth`: The current depth while parsing nested attributes. We defer on
+///   parsing deeply nested attributes to avoid potential stack overflows. The
+///   deferred parsing is achieved by reporting a failure when parsing a nested
+///   attribute/type and registering the index of the encountered attribute/type
+///   in the deferred parsing worklist. Hence, a failure with deffered entry
+///   does not constitute a failure, it also requires that folks return on
+///   first failure rather than attempting additional parses.
+/// - `deferredWorklist`: A list of attribute/type indices that we could not
+///   parse due to hitting the depth limit. The worklist is used to capture the
+///   indices of attributes/types that need to be parsed/reparsed when we hit
+///   the depth limit. This enables moving the tracking of what needs to be
+///   parsed to the heap.
 class AttrTypeReader {
   /// This class represents a single attribute or type entry.
   template <typename T>
@@ -863,12 +881,34 @@ class AttrTypeReader {
              ArrayRef<uint8_t> sectionData,
              ArrayRef<uint8_t> offsetSectionData);
 
+  LogicalResult readAttribute(uint64_t index, Attribute &result,
+                              uint64_t depth = 0) {
+    return readEntry(attributes, index, result, "attribute", depth);
+  }
+
+  LogicalResult readType(uint64_t index, Type &result, uint64_t depth = 0) {
+    return readEntry(types, index, result, "type", depth);
+  }
+
   /// Resolve the attribute or type at the given index. Returns nullptr on
   /// failure.
-  Attribute resolveAttribute(size_t index) {
-    return resolveEntry(attributes, index, "Attribute");
+  Attribute resolveAttribute(size_t index, uint64_t depth = 0) {
+    return resolveEntry(attributes, index, "Attribute", depth);
+  }
+  Type resolveType(size_t index, uint64_t depth = 0) {
+    return resolveEntry(types, index, "Type", depth);
+  }
+
+  Attribute getAttributeOrSentinel(size_t index) {
+    if (index >= attributes.size())
+      return nullptr;
+    return attributes[index].entry;
+  }
+  Type getTypeOrSentinel(size_t index) {
+    if (index >= types.size())
+      return nullptr;
+    return types[index].entry;
   }
-  Type resolveType(size_t index) { return resolveEntry(types, index, "Type"); }
 
   /// Parse a reference to an attribute or type using the given reader.
   LogicalResult parseAttribute(EncodingReader &reader, Attribute &result) {
@@ -909,23 +949,33 @@ class AttrTypeReader {
                             llvm::getTypeName<T>(), ", but got: ", baseResult);
   }
 
+  /// Add an index to the deferred worklist for re-parsing.
+  void addDeferredParsing(uint64_t index) { deferredWorklist.push_back(index); }
+
 private:
   /// Resolve the given entry at `index`.
   template <typename T>
-  T resolveEntry(SmallVectorImpl<Entry<T>> &entries, size_t index,
-                 StringRef entryType);
+  T resolveEntry(SmallVectorImpl<Entry<T>> &entries, uint64_t index,
+                 StringRef entryType, uint64_t depth = 0);
 
-  /// Parse an entry using the given reader that was encoded using the textual
-  /// assembly format.
+  /// Read the entry at the given index, returning failure if the entry is not
+  /// yet resolved.
   template <typename T>
-  LogicalResult parseAsmEntry(T &result, EncodingReader &reader,
-                              StringRef entryType);
+  LogicalResult readEntry(SmallVectorImpl<Entry<T>> &entries, uint64_t index,
+                          T &result, StringRef entryType, uint64_t depth);
 
   /// Parse an entry using the given reader that was encoded using a custom
   /// bytecode format.
   template <typename T>
   LogicalResult parseCustomEntry(Entry<T> &entry, EncodingReader &reader,
-                                 StringRef entryType);
+                                 StringRef entryType, uint64_t index,
+                                 uint64_t depth);
+
+  /// Parse an entry using the given reader that was encoded using the textual
+  /// assembly format.
+  template <typename T>
+  LogicalResult parseAsmEntry(T &result, EncodingReader &reader,
+                              StringRef entryType);
 
   /// The string section reader used to resolve string references when parsing
   /// custom encoded attribute/type entries.
@@ -951,6 +1001,10 @@ class AttrTypeReader {
 
   /// Reference to the parser configuration.
   const ParserConfig &parserConfig;
+
+  /// Worklist for deferred attribute/type parsing. This is used to handle
+  /// deeply nested structures like CallSiteLoc iteratively.
+  std::vector<uint64_t> deferredWorklist;
 };
 
 class DialectReader : public DialectBytecodeReader {
@@ -959,10 +1013,11 @@ class DialectReader : public DialectBytecodeReader {
                 const StringSectionReader &stringReader,
                 const ResourceSectionReader &resourceReader,
                 const llvm::StringMap<BytecodeDialect *> &dialectsMap,
-                EncodingReader &reader, uint64_t &bytecodeVersion)
+                EncodingReader &reader, uint64_t &bytecodeVersion,
+                uint64_t depth = 0)
       : attrTypeReader(attrTypeReader), stringReader(stringReader),
         resourceReader(resourceReader), dialectsMap(dialectsMap),
-        reader(reader), bytecodeVersion(bytecodeVersion) {}
+        reader(reader), bytecodeVersion(bytecodeVersion), depth(depth) {}
 
   InFlightDiagnostic emitError(const Twine &msg) const override {
     return reader.emitError(msg);
@@ -998,14 +1053,40 @@ class DialectReader : public DialectBytecodeReader {
   // IR
   //===--------------------------------------------------------------------===//
 
+  /// The maximum depth to eagerly parse nested attributes/types before
+  /// deferring.
+  static constexpr uint64_t maxAttrTypeDepth = 5;
+
   LogicalResult readAttribute(Attribute &result) override {
-    return attrTypeReader.parseAttribute(reader, result);
+    uint64_t index;
+    if (failed(reader.parseVarInt(index)))
+      return failure();
+    if (depth > maxAttrTypeDepth) {
+      if (Attribute attr = attrTypeReader.getAttributeOrSentinel(index)) {
+        result = attr;
+        return success();
+      }
+      attrTypeReader.addDeferredParsing(index);
+      return failure();
+    }
+    return attrTypeReader.readAttribute(index, result, depth + 1);
   }
   LogicalResult readOptionalAttribute(Attribute &result) override {
     return attrTypeReader.parseOptionalAttribute(reader, result);
   }
   LogicalResult readType(Type &result) override {
-    return attrTypeReader.parseType(reader, result);
+    uint64_t index;
+    if (failed(reader.parseVarInt(index)))
+      return failure();
+    if (depth > maxAttrTypeDepth) {
+      if (Type type = attrTypeReader.getTypeOrSentinel(index)) {
+        result = type;
+        return success();
+      }
+      attrTypeReader.addDeferredParsing(index);
+      return failure();
+    }
+    return attrTypeReader.readType(index, result, depth + 1);
   }
 
   FailureOr<AsmDialectResourceHandle> readResourceHandle() override {
@@ -1095,6 +1176,7 @@ class DialectReader : public DialectBytecodeReader {
   const llvm::StringMap<BytecodeDialect *> &dialectsMap;
   EncodingReader &reader;
   uint64_t &bytecodeVersion;
+  uint64_t depth;
 };
 
 /// Wraps the properties section and handles reading properties out of it.
@@ -1239,68 +1321,110 @@ LogicalResult AttrTypeReader::initialize(
 
 template <typename T>
 T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries, size_t index,
-                               StringRef entryType) {
+                               StringRef entryType, uint64_t depth) {
   if (index >= entries.size()) {
     emitError(fileLoc) << "invalid " << entryType << " index: " << index;
     return {};
   }
 
-  // If the entry has already been resolved, there is nothing left to do.
-  Entry<T> &entry = entries[index];
-  if (entry.entry)
-    return entry.entry;
+  // Fast path: Try direct parsing without worklist overhead. This handles the
+  // common case where there are no deferred dependencies.
+  assert(deferredWorklist.empty());
+  T result;
+  if (succeeded(readEntry(entries, index, result, entryType, depth))) {
+    assert(deferredWorklist.empty());
+    return result;
+  }
+  if (deferredWorklist.empty()) {
+    // Failed with no deferred entries is error.
+    return T();
+  }
 
-  // Parse the entry.
-  EncodingReader reader(entry.data, fileLoc);
+  // Slow path: Use worklist to handle deferred dependencies. Use a deque to
+  // iteratively resolve entries with dependencies.
+  // - Pop from front to process
+  // - Push new dependencies to front (depth-first)
+  // - Move failed entries to back (retry after dependencies)
+  std::deque<size_t> worklist;
+  llvm::DenseSet<size_t> inWorklist;
 
-  // Parse based on how the entry was encoded.
-  if (entry.hasCustomEncoding) {
-    if (failed(parseCustomEntry(entry, reader, entryType)))
-      return T();
-  } else if (failed(parseAsmEntry(entry.entry, reader, entryType))) {
-    return T();
+  // Add the original index and any dependencies from the fast path attempt.
+  worklist.push_back(index);
+  inWorklist.insert(index);
+  for (uint64_t idx : llvm::reverse(deferredWorklist)) {
+    if (inWorklist.insert(idx).second)
+      worklist.push_front(idx);
   }
 
-  if (!reader.empty()) {
-    reader.emitError("unexpected trailing bytes after " + entryType + " entry");
-    return T();
+  while (!worklist.empty()) {
+    size_t currentIndex = worklist.front();
+    worklist.pop_front();
+
+    // Clear the deferred worklist before parsing to capture any new entries.
+    deferredWorklist.clear();
+
+    T result;
+    if (succeeded(readEntry(entries, currentIndex, result, entryType, depth))) {
+      inWorklist.erase(currentIndex);
+      continue;
+    }
+
+    if (deferredWorklist.empty()) {
+      // Parsing failed with no deferred entries which implies an error.
+      return T();
+    }
+
+    // Move this entry to the back to retry after dependencies.
+    worklist.push_back(currentIndex);
+
+    // Add dependencies to the front (in reverse so they maintain order).
+    for (uint64_t idx : llvm::reverse(deferredWorklist)) {
+      if (inWorklist.insert(idx).second)
+        worklist.push_front(idx);
+    }
+    deferredWorklist.clear();
   }
-  return entry.entry;
+  return entries[index].entry;
 }
 
 template <typename T>
-LogicalResult AttrTypeReader::parseAsmEntry(T &result, EncodingReader &reader,
-                                            StringRef entryType) {
-  StringRef asmStr;
-  if (failed(reader.parseNullTerminatedString(asmStr)))
-    return failure();
+LogicalResult AttrTypeReader::readEntry(SmallVectorImpl<Entry<T>> &entries,
+                                        uint64_t index, T &result,
+                                        StringRef entryType, uint64_t depth) {
+  if (index >= entries.size())
+    return emitError(fileLoc) << "invalid " << entryType << " index: " << index;
 
-  // Invoke the MLIR assembly parser to parse the entry text.
-  size_t numRead = 0;
-  MLIRContext *context = fileLoc->getContext();
-  if constexpr (std::is_same_v<T, Type>)
-    result =
-        ::parseType(asmStr, context, &numRead, /*isKnownNullTerminated=*/true);
-  else
-    result = ::parseAttribute(asmStr, context, Type(), &numRead,
-                              /*isKnownNullTerminated=*/true);
-  if (!result)
+  // If the entry has already been resolved, return it.
+  Entry<T> &entry = entries[index];
+  if (entry.entry) {
+    result = entry.entry;
+    return success();
+  }
+
+  // If the entry hasn't been resolved, try to parse it.
+  EncodingReader reader(entry.data, fileLoc);
+  LogicalResult parseResult =
+      entry.hasCustomEncoding
+          ? parseCustomEntry(entry, reader, entryType, index, depth)
+          : parseAsmEntry(entry.entry, reader, entryType);
+  if (failed(parseResult))
     return failure();
 
-  // Ensure there weren't dangling characters after the entry.
-  if (numRead != asmStr.size()) {
-    return reader.emitError("trailing characters found after ", entryType,
-                            " assembly format: ", asmStr.drop_front(numRead));
-  }
+  if (!reader.empty())
+    return reader.emitError("unexpected trailing bytes after " + entryType +
+                            " entry");
+
+  result = entry.entry;
   return success();
 }
 
 template <typename T>
 LogicalResult AttrTypeReader::parseCustomEntry(Entry<T> &entry,
                                                EncodingReader &reader,
-                                               StringRef entryType) {
+                                               StringRef entryType,
+                                               uint64_t index, uint64_t depth) {
   DialectReader dialectReader(*this, stringReader, resourceReader, dialectsMap,
-                              reader, bytecodeVersion);
+                              reader, bytecodeVersion, depth);
   if (failed(entry.dialect->load(dialectReader, fileLoc.getContext())))
     return failure();
 
@@ -1350,6 +1474,33 @@ LogicalResult AttrTypeReader::parseCustomEntry(Entry<T> &entry,
   return success(!!entry.entry);
 }
 
+template <typename T>
+LogicalResult AttrTypeReader::parseAsmEntry(T &result, EncodingReader &reader,
+                                            StringRef entryType) {
+  StringRef asmStr;
+  if (failed(reader.parseNullTerminatedString(asmStr)))
+    return failure();
+
+  // Invoke the MLIR assembly parser to parse the entry text.
+  size_t numRead = 0;
+  MLIRContext *context = fileLoc->getContext();
+  if constexpr (std::is_same_v<T, Type>)
+    result =
+        ::parseType(asmStr, context, &numRead, /*isKnownNullTerminated=*/true);
+  else
+    result = ::parseAttribute(asmStr, context, Type(), &numRead,
+                              /*isKnownNullTerminated=*/true);
+  if (!result)
+    return failure();
+
+  // Ensure there weren't dangling characters after the entry.
+  if (numRead != asmStr.size()) {
+    return reader.emitError("trailing characters found after ", entryType,
+                            " assembly format: ", asmStr.drop_front(numRead));
+  }
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // Bytecode Reader
 //===----------------------------------------------------------------------===//
diff --git a/mlir/unittests/Bytecode/BytecodeTest.cpp b/mlir/unittests/Bytecode/BytecodeTest.cpp
index d7b442f6832d0..30e7ed9b6cb7e 100644
--- a/mlir/unittests/Bytecode/BytecodeTest.cpp
+++ b/mlir/unittests/Bytecode/BytecodeTest.cpp
@@ -15,6 +15,7 @@
 #include "mlir/IR/OwningOpRef.h"
 #include "mlir/Parser/Parser.h"
 
+#include "mlir/IR/BuiltinOps.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Alignment.h"
 #include "llvm/Support/Endian.h"
@@ -228,3 +229,39 @@ TEST(Bytecode, OpWithoutProperties) {
   EXPECT_TRUE(OperationEquivalence::computeHash(op.get()) ==
               OperationEquivalence::computeHash(roundtripped));
 }
+
+TEST(Bytecode, DeepCallSiteLoc) {
+  MLIRContext context;
+  ParserConfig config(&context);
+
+  // Create a deep CallSiteLoc chain to test iterative parsing.
+  Location baseLoc = FileLineColLoc::get(&context, "test.mlir", 1, 1);
+  Location loc = baseLoc;
+  constexpr int kDepth = 1000;
+  for (int i = 0; i < kDepth; ++i) {
+    loc = CallSiteLoc::get(loc, baseLoc);
+  }
+
+  // Create a simple module with the deep location.
+  Builder builder(&context);
+  OwningOpRef<ModuleOp> module =
+      ModuleOp::create(loc, /*attributes=*/std::nullopt);
+  ASSERT_TRUE(module);
+
+  // Write to bytecode.
+  std::string bytecode;
+  llvm::raw_string_ostream os(bytecode);
+  ASSERT_TRUE(succeeded(writeBytecodeToFile(module.get(), os)));
+
+  // Parse it back using the bytecode reader.
+  std::unique_ptr<Block> block = std::make_unique<Block>();
+  ASSERT_TRUE(succeeded(readBytecodeFile(
+      llvm::MemoryBufferRef(bytecode, "string-buffer"), block.get(), config)));
+
+  // Verify we got the roundtripped module.
+  ASSERT_FALSE(block->empty());
+  Operation *roundTripped = &block->front();
+
+  // Verify the location matches.
+  EXPECT_EQ(module.get()->getLoc(), roundTripped->getLoc());
+}

From cc25ac424a856554edb152c09f8217e95b59a7b6 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <Krzysztof.Parzyszek@amd.com>
Date: Tue, 9 Dec 2025 12:45:45 -0600
Subject: [PATCH 40/85] [flang][OpenMP] Use DirId() instead of DirName().v, NFC
 (#171484)

---
 flang/lib/Parser/openmp-parsers.cpp         | 2 +-
 flang/lib/Semantics/check-omp-loop.cpp      | 4 ++--
 flang/lib/Semantics/check-omp-structure.cpp | 2 +-
 flang/lib/Semantics/resolve-directives.cpp  | 2 +-
 flang/lib/Semantics/rewrite-parse-tree.cpp  | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index 6b3bcd00c5bec..a0e106d70a5fd 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -1920,7 +1920,7 @@ struct OmpLoopConstructParser {
     auto loopItem{LoopNestParser{} || ompLoopConstruct};
 
     if (auto &&begin{OmpBeginDirectiveParser(dirs_).Parse(state)}) {
-      auto loopDir{begin->DirName().v};
+      auto loopDir{begin->DirId()};
       auto assoc{llvm::omp::getDirectiveAssociation(loopDir)};
       if (assoc == llvm::omp::Association::LoopNest) {
         if (auto &&item{attempt(loopItem).Parse(state)}) {
diff --git a/flang/lib/Semantics/check-omp-loop.cpp b/flang/lib/Semantics/check-omp-loop.cpp
index f25cf7eb33817..726dbe865834d 100644
--- a/flang/lib/Semantics/check-omp-loop.cpp
+++ b/flang/lib/Semantics/check-omp-loop.cpp
@@ -271,7 +271,7 @@ void OmpStructureChecker::CheckNestedBlock(const parser::OpenMPLoopConstruct &x,
     } else if (parser::Unwrap<parser::DoConstruct>(stmt)) {
       ++nestedCount;
     } else if (auto *omp{parser::Unwrap<parser::OpenMPLoopConstruct>(stmt)}) {
-      if (!IsLoopTransforming(omp->BeginDir().DirName().v)) {
+      if (!IsLoopTransforming(omp->BeginDir().DirId())) {
         context_.Say(omp->source,
             "Only loop-transforming OpenMP constructs are allowed inside OpenMP loop constructs"_err_en_US);
       }
@@ -324,7 +324,7 @@ void OmpStructureChecker::CheckFullUnroll(
   // since it won't contain a loop.
   if (const parser::OpenMPLoopConstruct *nested{x.GetNestedConstruct()}) {
     auto &nestedSpec{nested->BeginDir()};
-    if (nestedSpec.DirName().v == llvm::omp::Directive::OMPD_unroll) {
+    if (nestedSpec.DirId() == llvm::omp::Directive::OMPD_unroll) {
       bool isPartial{
           llvm::any_of(nestedSpec.Clauses().v, [](const parser::OmpClause &c) {
             return c.Id() == llvm::omp::Clause::OMPC_partial;
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index b6b0f86eb4cce..7776f0d1f21f9 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -2800,7 +2800,7 @@ void OmpStructureChecker::Leave(const parser::OpenMPCancelConstruct &) {
 void OmpStructureChecker::Enter(const parser::OpenMPCriticalConstruct &x) {
   const parser::OmpBeginDirective &beginSpec{x.BeginDir()};
   const std::optional<parser::OmpEndDirective> &endSpec{x.EndDir()};
-  PushContextAndClauseSets(beginSpec.DirName().source, beginSpec.DirName().v);
+  PushContextAndClauseSets(beginSpec.DirName().source, beginSpec.DirId());
 
   const auto &block{std::get<parser::Block>(x.t)};
   CheckNoBranching(
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 44be5ffd684a2..6211643b08970 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -2477,7 +2477,7 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPSectionConstruct &x) {
 
 bool OmpAttributeVisitor::Pre(const parser::OpenMPCriticalConstruct &x) {
   const parser::OmpBeginDirective &beginSpec{x.BeginDir()};
-  PushContext(beginSpec.DirName().source, beginSpec.DirName().v);
+  PushContext(beginSpec.DirName().source, beginSpec.DirId());
   GetContext().withinConstruct = true;
   return true;
 }
diff --git a/flang/lib/Semantics/rewrite-parse-tree.cpp b/flang/lib/Semantics/rewrite-parse-tree.cpp
index 285eaac1e2c8f..60e3e6ab3f5f1 100644
--- a/flang/lib/Semantics/rewrite-parse-tree.cpp
+++ b/flang/lib/Semantics/rewrite-parse-tree.cpp
@@ -118,7 +118,7 @@ static bool ReturnsDataPointer(const Symbol &symbol) {
 }
 
 static bool LoopConstructIsSIMD(parser::OpenMPLoopConstruct *ompLoop) {
-  return llvm::omp::allSimdSet.test(ompLoop->BeginDir().DirName().v);
+  return llvm::omp::allSimdSet.test(ompLoop->BeginDir().DirId());
 }
 
 // Remove non-SIMD OpenMPConstructs once they are parsed.

From 719826d33df18a2d386c008b11521275e9ec1d74 Mon Sep 17 00:00:00 2001
From: nerix <nerixdev@outlook.de>
Date: Tue, 9 Dec 2025 19:52:02 +0100
Subject: [PATCH 41/85] [LLDB] Run MSVC STL optional test with PDB (#171486)

Similar to the other PRs, this runs the `std::optional` test with PDB.
Since we don't know that variables use typedefs, we check for the full
name when testing PDB.
---
 .../optional/TestDataFormatterGenericOptional.py   | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/optional/TestDataFormatterGenericOptional.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/optional/TestDataFormatterGenericOptional.py
index 7bb4f75de4e59..c88e83bb5b1f4 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/optional/TestDataFormatterGenericOptional.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/optional/TestDataFormatterGenericOptional.py
@@ -5,6 +5,8 @@
 
 
 class GenericOptionalDataFormatterTestCase(TestBase):
+    TEST_WITH_PDB_DEBUG_INFO = True
+
     def do_test_with_run_command(self):
         """Test that that file and class static variables display correctly."""
 
@@ -55,7 +57,11 @@ def cleanup():
         self.expect(
             "frame var numbers",
             substrs=[
-                "(optional_int_vect) numbers =  Has Value=true  {",
+                (
+                    "(std::optional<std::vector<int, std::allocator<int>>>) numbers =  Has Value=true  {"
+                    if self.getDebugInfo() == "pdb"
+                    else "(optional_int_vect) numbers =  Has Value=true  {"
+                ),
                 "Value = size=4 {",
                 "[0] = 1",
                 "[1] = 2",
@@ -69,7 +75,11 @@ def cleanup():
         self.expect(
             "frame var ostring",
             substrs=[
-                "(optional_string) ostring =  Has Value=true  {",
+                (
+                    "(std::optional<std::basic_string<char, std::char_traits<char>, std::allocator<char>>>) ostring =  Has Value=true  {"
+                    if self.getDebugInfo() == "pdb"
+                    else "(optional_string) ostring =  Has Value=true  {"
+                ),
                 'Value = "hello"',
                 "}",
             ],

From 03160c186e90463402786b611826a1702420e1c7 Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Tue, 9 Dec 2025 19:52:17 +0100
Subject: [PATCH 42/85] [X86] fix typo: `MCVTTP2SIS` -> `MCVTTP2UIS` (#171229)

This LLVM IR

https://godbolt.org/z/5bM1vrMY1

```llvm
define <4 x i32> @masked(<2 x double> %a, <4 x i32> %src, i8 noundef zeroext %mask) unnamed_addr #0 {
  %r = tail call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128(<2 x double> %a, <4 x i32> %src, i8 noundef %mask)
  ret <4 x i32> %r
}

define <4 x i32> @unmasked(<2 x double> %a) unnamed_addr #0 {
  %r = tail call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128(<2 x double> %a, <4 x i32> zeroinitializer, i8 noundef -1)
  ret <4 x i32> %r
}

declare <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128(<2 x double>, <4 x i32>, i8) unnamed_addr

attributes #0 = { mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(none) uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" "target-features"="+avx10.2-512" }
```

produces

```asm
masked:                                 # @masked
        kmovd   k1, edi
        vcvttpd2dqs     xmm1 {k1}, xmm0
        vmovaps xmm0, xmm1
        ret
unmasked:                               # @unmasked
        vcvttpd2udqs    xmm0, xmm0
        ret
```

So, when a mask is used, somehow the signed version of this instruction
is selected. I suspect this is a typo.
---
 llvm/lib/Target/X86/X86IntrinsicsInfo.h             |  2 +-
 llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 99665b5872fe2..88ade87e1dca8 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -534,7 +534,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
     X86_INTRINSIC_DATA(avx10_mask_vcvttpd2qqs_round_512, INTR_TYPE_1OP_MASK,
                        X86ISD::CVTTP2SIS, X86ISD::CVTTP2SIS_SAE),
     X86_INTRINSIC_DATA(avx10_mask_vcvttpd2udqs_128, CVTPD2DQ_MASK,
-                       X86ISD::CVTTP2UIS, X86ISD::MCVTTP2SIS),
+                       X86ISD::CVTTP2UIS, X86ISD::MCVTTP2UIS),
     X86_INTRINSIC_DATA(avx10_mask_vcvttpd2udqs_256, INTR_TYPE_1OP_MASK,
                        X86ISD::CVTTP2UIS, 0),
     X86_INTRINSIC_DATA(avx10_mask_vcvttpd2udqs_round_512, INTR_TYPE_1OP_MASK,
diff --git a/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll
index 38d54cff6dc23..00db1fb07c78d 100644
--- a/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll
@@ -652,14 +652,14 @@ define <4 x i32> @test_int_x86_mask_vcvtt_pd2udqs_128(<2 x double> %x0, <4 x i32
 ; X64-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vcvttpd2dqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc8]
+; X64-NEXT:    vcvttpd2udqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc8]
 ; X64-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X64-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vcvttpd2dqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc8]
+; X86-NEXT:    vcvttpd2udqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc8]
 ; X86-NEXT:    vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X86-NEXT:    retl # encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128( <2 x double> %x0, <4 x i32> %src, i8 %mask)
@@ -670,13 +670,13 @@ define <4 x i32> @test_int_x86_maskz_vcvtt_pd2udqs_128_z(<2 x double> %x0, i8 %m
 ; X64-LABEL: test_int_x86_maskz_vcvtt_pd2udqs_128_z:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vcvttpd2dqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6d,0xc0]
+; X64-NEXT:    vcvttpd2udqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6c,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86-LABEL: test_int_x86_maskz_vcvtt_pd2udqs_128_z:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vcvttpd2dqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6d,0xc0]
+; X86-NEXT:    vcvttpd2udqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6c,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128( <2 x double> %x0, <4 x i32> zeroinitializer, i8 %mask)
   ret <4 x i32> %res
@@ -686,13 +686,13 @@ define <4 x i32> @test_int_x86_mask_vcvtt_pd2udqs_128_undef(<2 x double> %x0, i8
 ; X64-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128_undef:
 ; X64:       # %bb.0:
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vcvttpd2dqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc0]
+; X64-NEXT:    vcvttpd2udqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc0]
 ; X64-NEXT:    retq # encoding: [0xc3]
 ;
 ; X86-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128_undef:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vcvttpd2dqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc0]
+; X86-NEXT:    vcvttpd2udqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc0]
 ; X86-NEXT:    retl # encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128( <2 x double> %x0, <4 x i32> undef, i8 %mask)
   ret <4 x i32> %res

From 5052b6ce1d3a1d1b36333f036525ec8cff6c9a10 Mon Sep 17 00:00:00 2001
From: Anshil Gandhi <95053726+gandhi56@users.noreply.github.com>
Date: Tue, 9 Dec 2025 13:59:36 -0500
Subject: [PATCH 43/85] [AMDGPU] Scavenge a VGPR to eliminate a frame index
 (#166979)

If the subtarget supports flat scratch SVS mode and there is no SGPR
available to replace a frame index, convert a scratch instruction in SS
form into SV form and replace the frame index with a scavenged VGPR.
Resolves #155902

Co-authored-by: Matt Arsenault <matthew.arsenault@amd.com>
---
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |  32 +-
 .../flat-scratch-alloca-issue-155902.ll       | 469 ++++++++++++++++++
 2 files changed, 498 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ad3828fba2187..66586e8bc234a 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -2983,10 +2983,36 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                   : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
                                                   MI, false, 0, !UseSGPR);
 
-      // TODO: for flat scratch another attempt can be made with a VGPR index
-      //       if no SGPRs can be scavenged.
-      if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
+      if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) {
+        int SVOpcode = AMDGPU::getFlatScratchInstSVfromSS(MI->getOpcode());
+        if (ST.hasFlatScratchSVSMode() && SVOpcode != -1) {
+          Register TmpVGPR = RS->scavengeRegisterBackwards(
+              AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
+
+          // Materialize the frame register.
+          auto MIB =
+              BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR);
+          if (FrameReg)
+            MIB.addReg(FrameReg);
+          else
+            MIB.addImm(Offset);
+
+          // Add the offset to the frame register.
+          if (FrameReg && Offset)
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), FrameReg)
+                .addReg(FrameReg, RegState::Kill)
+                .addImm(Offset);
+
+          BuildMI(*MBB, MI, DL, TII->get(SVOpcode))
+              .add(MI->getOperand(0)) // $vdata
+              .addReg(TmpVGPR)        // $vaddr
+              .addImm(0)              // Offset
+              .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::cpol));
+          MI->eraseFromParent();
+          return true;
+        }
         report_fatal_error("Cannot scavenge register in FI elimination!");
+      }
 
       if (!TmpSReg) {
         // Use frame register and restore it after.
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll
new file mode 100644
index 0000000000000..26acb4604cbcb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll
@@ -0,0 +1,469 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck %s --check-prefix=GFX950
+
+; Ensure we don't crash with: "Cannot scavenge register in FI elimination!"
+define amdgpu_kernel void @issue155902(i64 %arg, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15, i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23, i64 %arg24, i64 %arg25, i64 %arg26, i64 %arg27, i64 %arg28, i64 %arg29, i64 %arg30, i64 %arg31, i64 %arg32, i64 %arg33, i64 %arg34, i64 %arg35, i64 %arg36, i64 %arg37, i64 %arg38, i64 %arg39, i64 %arg40, i64 %arg41, i64 %arg42, i64 %arg43, i64 %arg44, i64 %arg45, i64 %arg46, i64 %arg47, i64 %arg48, i64 %arg49) {
+; GFX950-LABEL: issue155902:
+; GFX950:       ; %bb.0: ; %bb
+; GFX950-NEXT:    s_mov_b32 s33, 0x4008
+; GFX950-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
+; GFX950-NEXT:    v_writelane_b32 v2, s33, 0
+; GFX950-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX950-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
+; GFX950-NEXT:    s_load_dwordx2 vcc, s[2:3], 0x8
+; GFX950-NEXT:    s_load_dwordx2 s[98:99], s[2:3], 0x10
+; GFX950-NEXT:    s_load_dwordx2 s[96:97], s[2:3], 0x18
+; GFX950-NEXT:    s_load_dwordx2 s[94:95], s[2:3], 0x20
+; GFX950-NEXT:    s_load_dwordx2 s[92:93], s[2:3], 0x28
+; GFX950-NEXT:    s_load_dwordx2 s[90:91], s[2:3], 0x30
+; GFX950-NEXT:    s_load_dwordx2 s[88:89], s[2:3], 0x38
+; GFX950-NEXT:    s_load_dwordx2 s[86:87], s[2:3], 0x40
+; GFX950-NEXT:    s_load_dwordx2 s[84:85], s[2:3], 0x48
+; GFX950-NEXT:    s_load_dwordx2 s[82:83], s[2:3], 0x50
+; GFX950-NEXT:    s_load_dwordx2 s[80:81], s[2:3], 0x58
+; GFX950-NEXT:    s_load_dwordx2 s[78:79], s[2:3], 0x60
+; GFX950-NEXT:    s_load_dwordx2 s[76:77], s[2:3], 0x68
+; GFX950-NEXT:    s_load_dwordx2 s[74:75], s[2:3], 0x70
+; GFX950-NEXT:    s_load_dwordx2 s[72:73], s[2:3], 0x78
+; GFX950-NEXT:    s_load_dwordx2 s[70:71], s[2:3], 0x80
+; GFX950-NEXT:    s_load_dwordx2 s[68:69], s[2:3], 0x88
+; GFX950-NEXT:    s_load_dwordx2 s[66:67], s[2:3], 0x90
+; GFX950-NEXT:    s_load_dwordx2 s[64:65], s[2:3], 0x98
+; GFX950-NEXT:    s_load_dwordx2 s[62:63], s[2:3], 0xa0
+; GFX950-NEXT:    s_load_dwordx2 s[60:61], s[2:3], 0xa8
+; GFX950-NEXT:    s_load_dwordx2 s[58:59], s[2:3], 0xb0
+; GFX950-NEXT:    s_load_dwordx2 s[56:57], s[2:3], 0xb8
+; GFX950-NEXT:    s_load_dwordx2 s[54:55], s[2:3], 0xc0
+; GFX950-NEXT:    s_load_dwordx2 s[52:53], s[2:3], 0xc8
+; GFX950-NEXT:    s_load_dwordx2 s[50:51], s[2:3], 0xd0
+; GFX950-NEXT:    s_load_dwordx2 s[48:49], s[2:3], 0xd8
+; GFX950-NEXT:    s_load_dwordx2 s[46:47], s[2:3], 0xe0
+; GFX950-NEXT:    s_load_dwordx2 s[44:45], s[2:3], 0xe8
+; GFX950-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0xf0
+; GFX950-NEXT:    s_load_dwordx2 s[40:41], s[2:3], 0xf8
+; GFX950-NEXT:    s_load_dwordx2 s[38:39], s[2:3], 0x100
+; GFX950-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x108
+; GFX950-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x110
+; GFX950-NEXT:    s_load_dwordx2 s[30:31], s[2:3], 0x118
+; GFX950-NEXT:    s_load_dwordx2 s[28:29], s[2:3], 0x120
+; GFX950-NEXT:    s_load_dwordx2 s[26:27], s[2:3], 0x128
+; GFX950-NEXT:    s_load_dwordx2 s[24:25], s[2:3], 0x130
+; GFX950-NEXT:    s_load_dwordx2 s[22:23], s[2:3], 0x138
+; GFX950-NEXT:    s_load_dwordx2 s[20:21], s[2:3], 0x140
+; GFX950-NEXT:    s_load_dwordx2 s[18:19], s[2:3], 0x148
+; GFX950-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x150
+; GFX950-NEXT:    s_load_dwordx2 s[14:15], s[2:3], 0x158
+; GFX950-NEXT:    s_load_dwordx2 s[12:13], s[2:3], 0x160
+; GFX950-NEXT:    s_load_dwordx2 s[10:11], s[2:3], 0x168
+; GFX950-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x170
+; GFX950-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x178
+; GFX950-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x180
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x188
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX950-NEXT:    v_mov_b32_e32 v3, 0x4008
+; GFX950-NEXT:    buffer_wbl2 sc0 sc1
+; GFX950-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    scratch_store_dwordx2 v3, v[0:1], off sc0 sc1
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s33
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], 0x384
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s33 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s0, v2, 0
+; GFX950-NEXT:    s_nop 4
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], vcc
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[98:99]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[96:97]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[94:95]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[92:93]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[90:91]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[88:89]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[86:87]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[84:85]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[82:83]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[80:81]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[78:79]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[76:77]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[74:75]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[72:73]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[70:71]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[68:69]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[66:67]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[64:65]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[62:63]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[60:61]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[58:59]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[56:57]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[54:55]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[52:53]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[50:51]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[48:49]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[46:47]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[44:45]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[42:43]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[40:41]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[38:39]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[36:37]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[34:35]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[30:31]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[28:29]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[26:27]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[22:23]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[20:21]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[18:19]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[16:17]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[14:15]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[10:11]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    s_endpgm
+bb:
+  %alloca.big = alloca [4096 x i32], align 4, addrspace(5)
+  %alloca304 = alloca [2 x i64], align 8, addrspace(5)
+  %alloca307 = alloca i64, align 8, addrspace(5)
+  store [2 x i64] zeroinitializer, ptr addrspace(5) %alloca304, align 8
+  store i64 900, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg1, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg2, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg3, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg4, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg5, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg6, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg7, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg8, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg9, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg10, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg11, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg12, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg13, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg14, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg15, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg16, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg17, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg18, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg19, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg20, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg21, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg22, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg23, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg24, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg25, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg26, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg27, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg28, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg29, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg30, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg31, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg32, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg33, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg34, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg35, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg36, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg37, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg38, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg39, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg40, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg41, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg42, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg43, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg44, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg45, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg46, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg47, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg48, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg49, ptr addrspace(5) %alloca307, align 8
+  ret void
+}
+
+define amdgpu_kernel void @issue155902_fp(i64 %arg, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15, i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23, i64 %arg24, i64 %arg25, i64 %arg26, i64 %arg27, i64 %arg28, i64 %arg29, i64 %arg30, i64 %arg31, i64 %arg32, i64 %arg33, i64 %arg34, i64 %arg35, i64 %arg36, i64 %arg37, i64 %arg38, i64 %arg39, i64 %arg40, i64 %arg41, i64 %arg42, i64 %arg43, i64 %arg44, i64 %arg45, i64 %arg46, i64 %arg47, i64 %arg48, i64 %arg49) #0 {
+; GFX950-LABEL: issue155902_fp:
+; GFX950:       ; %bb.0: ; %bb
+; GFX950-NEXT:    s_mov_b32 s33, 0
+; GFX950-NEXT:    s_add_i32 s1, s33, 0x4008
+; GFX950-NEXT:    s_mov_b32 s0, s1
+; GFX950-NEXT:    ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
+; GFX950-NEXT:    v_writelane_b32 v2, s0, 0
+; GFX950-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX950-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x0
+; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-NEXT:    v_writelane_b32 v2, s4, 1
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_writelane_b32 v2, s5, 2
+; GFX950-NEXT:    s_load_dwordx2 vcc, s[2:3], 0x8
+; GFX950-NEXT:    s_load_dwordx2 s[98:99], s[2:3], 0x10
+; GFX950-NEXT:    s_load_dwordx2 s[96:97], s[2:3], 0x18
+; GFX950-NEXT:    s_load_dwordx2 s[94:95], s[2:3], 0x20
+; GFX950-NEXT:    s_load_dwordx2 s[92:93], s[2:3], 0x28
+; GFX950-NEXT:    s_load_dwordx2 s[90:91], s[2:3], 0x30
+; GFX950-NEXT:    s_load_dwordx2 s[88:89], s[2:3], 0x38
+; GFX950-NEXT:    s_load_dwordx2 s[86:87], s[2:3], 0x40
+; GFX950-NEXT:    s_load_dwordx2 s[84:85], s[2:3], 0x48
+; GFX950-NEXT:    s_load_dwordx2 s[82:83], s[2:3], 0x50
+; GFX950-NEXT:    s_load_dwordx2 s[80:81], s[2:3], 0x58
+; GFX950-NEXT:    s_load_dwordx2 s[78:79], s[2:3], 0x60
+; GFX950-NEXT:    s_load_dwordx2 s[76:77], s[2:3], 0x68
+; GFX950-NEXT:    s_load_dwordx2 s[74:75], s[2:3], 0x70
+; GFX950-NEXT:    s_load_dwordx2 s[72:73], s[2:3], 0x78
+; GFX950-NEXT:    s_load_dwordx2 s[70:71], s[2:3], 0x80
+; GFX950-NEXT:    s_load_dwordx2 s[68:69], s[2:3], 0x88
+; GFX950-NEXT:    s_load_dwordx2 s[66:67], s[2:3], 0x90
+; GFX950-NEXT:    s_load_dwordx2 s[64:65], s[2:3], 0x98
+; GFX950-NEXT:    s_load_dwordx2 s[62:63], s[2:3], 0xa0
+; GFX950-NEXT:    s_load_dwordx2 s[60:61], s[2:3], 0xa8
+; GFX950-NEXT:    s_load_dwordx2 s[58:59], s[2:3], 0xb0
+; GFX950-NEXT:    s_load_dwordx2 s[56:57], s[2:3], 0xb8
+; GFX950-NEXT:    s_load_dwordx2 s[54:55], s[2:3], 0xc0
+; GFX950-NEXT:    s_load_dwordx2 s[52:53], s[2:3], 0xc8
+; GFX950-NEXT:    s_load_dwordx2 s[50:51], s[2:3], 0xd0
+; GFX950-NEXT:    s_load_dwordx2 s[48:49], s[2:3], 0xd8
+; GFX950-NEXT:    s_load_dwordx2 s[46:47], s[2:3], 0xe0
+; GFX950-NEXT:    s_load_dwordx2 s[44:45], s[2:3], 0xe8
+; GFX950-NEXT:    s_load_dwordx2 s[42:43], s[2:3], 0xf0
+; GFX950-NEXT:    s_load_dwordx2 s[40:41], s[2:3], 0xf8
+; GFX950-NEXT:    s_load_dwordx2 s[38:39], s[2:3], 0x100
+; GFX950-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x108
+; GFX950-NEXT:    s_load_dwordx2 s[34:35], s[2:3], 0x110
+; GFX950-NEXT:    s_load_dwordx2 s[30:31], s[2:3], 0x118
+; GFX950-NEXT:    s_load_dwordx2 s[28:29], s[2:3], 0x120
+; GFX950-NEXT:    s_load_dwordx2 s[26:27], s[2:3], 0x128
+; GFX950-NEXT:    s_load_dwordx2 s[24:25], s[2:3], 0x130
+; GFX950-NEXT:    s_load_dwordx2 s[22:23], s[2:3], 0x138
+; GFX950-NEXT:    s_load_dwordx2 s[20:21], s[2:3], 0x140
+; GFX950-NEXT:    s_load_dwordx2 s[18:19], s[2:3], 0x148
+; GFX950-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x150
+; GFX950-NEXT:    s_load_dwordx2 s[14:15], s[2:3], 0x158
+; GFX950-NEXT:    s_load_dwordx2 s[12:13], s[2:3], 0x160
+; GFX950-NEXT:    s_load_dwordx2 s[10:11], s[2:3], 0x168
+; GFX950-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x170
+; GFX950-NEXT:    s_load_dwordx2 s[6:7], s[2:3], 0x178
+; GFX950-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x180
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x188
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX950-NEXT:    s_add_i32 s1, s33, 0x4008
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s1 offset:8
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], 0x384
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_readlane_b32 s0, v2, 1
+; GFX950-NEXT:    v_readlane_b32 s1, v2, 2
+; GFX950-NEXT:    s_nop 1
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-NEXT:    v_readlane_b32 s0, v2, 0
+; GFX950-NEXT:    s_nop 4
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], vcc
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[98:99]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[96:97]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[94:95]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[92:93]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[90:91]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[88:89]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[86:87]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[84:85]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[82:83]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[80:81]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[78:79]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[76:77]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[74:75]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[72:73]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[70:71]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[68:69]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[66:67]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[64:65]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[62:63]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[60:61]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[58:59]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[56:57]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[54:55]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[52:53]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[50:51]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[48:49]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[46:47]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[44:45]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[42:43]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[40:41]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[38:39]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[36:37]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[34:35]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[30:31]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[28:29]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[26:27]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[24:25]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[22:23]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[20:21]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[18:19]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[16:17]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[14:15]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[10:11]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX950-NEXT:    scratch_store_dwordx2 off, v[0:1], s0 offset:16
+; GFX950-NEXT:    s_endpgm
+bb:
+  %alloca.big = alloca [4096 x i32], align 4, addrspace(5)
+  %alloca304 = alloca [2 x i64], align 8, addrspace(5)
+  %alloca307 = alloca i64, align 8, addrspace(5)
+  store [2 x i64] zeroinitializer, ptr addrspace(5) %alloca304, align 8
+  store i64 900, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg1, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg2, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg3, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg4, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg5, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg6, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg7, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg8, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg9, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg10, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg11, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg12, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg13, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg14, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg15, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg16, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg17, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg18, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg19, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg20, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg21, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg22, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg23, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg24, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg25, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg26, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg27, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg28, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg29, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg30, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg31, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg32, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg33, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg34, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg35, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg36, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg37, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg38, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg39, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg40, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg41, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg42, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg43, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg44, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg45, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg46, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg47, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg48, ptr addrspace(5) %alloca307, align 8
+  store i64 %arg49, ptr addrspace(5) %alloca307, align 8
+  ret void
+}
+
+attributes #0 = { "frame-pointer"="all" }

From 24117f75ad9d7bbb439e8e1bd596fdcf0aa8d6e2 Mon Sep 17 00:00:00 2001
From: Erick Velez <erickvelez7@gmail.com>
Date: Tue, 9 Dec 2025 11:50:20 -0800
Subject: [PATCH 44/85] [clang-doc] Replace HTML generation with Mustache
 backend (#170199)

Removes the legacy HTML backend and replaces it with the Mustache
backend.
---
 clang-tools-extra/clang-doc/CMakeLists.txt    |    1 -
 clang-tools-extra/clang-doc/Generators.cpp    |    2 -
 clang-tools-extra/clang-doc/Generators.h      |    1 -
 clang-tools-extra/clang-doc/HTMLGenerator.cpp | 1212 ++---------------
 .../clang-doc/HTMLMustacheGenerator.cpp       |  179 ---
 clang-tools-extra/clang-doc/support/Utils.cpp |    3 +-
 clang-tools-extra/clang-doc/support/Utils.h   |    4 +-
 .../clang-doc/tool/ClangDocMain.cpp           |   99 +-
 .../test/clang-doc/DR-131697.cpp              |    1 -
 clang-tools-extra/test/clang-doc/assets.cpp   |   16 -
 .../clang-doc/basic-project.mustache.test     |    2 +-
 .../test/clang-doc/basic-project.test         |  267 ----
 .../test/clang-doc/comments-in-macros.cpp     |   17 +-
 .../test/clang-doc/conversion_function.cpp    |    7 +-
 clang-tools-extra/test/clang-doc/enum.cpp     |  306 ++---
 .../test/clang-doc/long-name.cpp              |    2 +-
 .../test/clang-doc/mustache-index.cpp         |    2 +-
 .../clang-doc/mustache-separate-namespace.cpp |    2 +-
 .../test/clang-doc/namespace.cpp              |  327 ++---
 .../test/clang-doc/test-path-abs.cpp          |    7 -
 .../unittests/clang-doc/CMakeLists.txt        |    1 -
 .../unittests/clang-doc/HTMLGeneratorTest.cpp |  467 +------
 .../clang-doc/HTMLMustacheGeneratorTest.cpp   |   66 -
 23 files changed, 378 insertions(+), 2613 deletions(-)
 delete mode 100644 clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp
 delete mode 100644 clang-tools-extra/test/clang-doc/test-path-abs.cpp
 delete mode 100644 clang-tools-extra/unittests/clang-doc/HTMLMustacheGeneratorTest.cpp

diff --git a/clang-tools-extra/clang-doc/CMakeLists.txt b/clang-tools-extra/clang-doc/CMakeLists.txt
index 5989e5fe60cf3..7a375d7cd0524 100644
--- a/clang-tools-extra/clang-doc/CMakeLists.txt
+++ b/clang-tools-extra/clang-doc/CMakeLists.txt
@@ -16,7 +16,6 @@ add_clang_library(clangDoc STATIC
   Representation.cpp
   Serialize.cpp
   YAMLGenerator.cpp
-  HTMLMustacheGenerator.cpp
   JSONGenerator.cpp
 
   DEPENDS
diff --git a/clang-tools-extra/clang-doc/Generators.cpp b/clang-tools-extra/clang-doc/Generators.cpp
index ba46609d0e7fc..d6c1cc948ce30 100644
--- a/clang-tools-extra/clang-doc/Generators.cpp
+++ b/clang-tools-extra/clang-doc/Generators.cpp
@@ -243,8 +243,6 @@ void Generator::addInfoToIndex(Index &Idx, const doc::Info *Info) {
 [[maybe_unused]] static int YAMLGeneratorAnchorDest = YAMLGeneratorAnchorSource;
 [[maybe_unused]] static int MDGeneratorAnchorDest = MDGeneratorAnchorSource;
 [[maybe_unused]] static int HTMLGeneratorAnchorDest = HTMLGeneratorAnchorSource;
-[[maybe_unused]] static int MHTMLGeneratorAnchorDest =
-    MHTMLGeneratorAnchorSource;
 [[maybe_unused]] static int JSONGeneratorAnchorDest = JSONGeneratorAnchorSource;
 } // namespace doc
 } // namespace clang
diff --git a/clang-tools-extra/clang-doc/Generators.h b/clang-tools-extra/clang-doc/Generators.h
index 847722646b029..a50f1ac25eda9 100644
--- a/clang-tools-extra/clang-doc/Generators.h
+++ b/clang-tools-extra/clang-doc/Generators.h
@@ -137,7 +137,6 @@ struct MustacheGenerator : public Generator {
 extern volatile int YAMLGeneratorAnchorSource;
 extern volatile int MDGeneratorAnchorSource;
 extern volatile int HTMLGeneratorAnchorSource;
-extern volatile int MHTMLGeneratorAnchorSource;
 extern volatile int JSONGeneratorAnchorSource;
 
 } // namespace doc
diff --git a/clang-tools-extra/clang-doc/HTMLGenerator.cpp b/clang-tools-extra/clang-doc/HTMLGenerator.cpp
index 7c8c16b8e8aca..6f58c3d00fa28 100644
--- a/clang-tools-extra/clang-doc/HTMLGenerator.cpp
+++ b/clang-tools-extra/clang-doc/HTMLGenerator.cpp
@@ -5,1145 +5,169 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the implementation of the HTMLGenerator class,
+/// which is a Clang-Doc generator for HTML using Mustache templates.
+///
+//===----------------------------------------------------------------------===//
 
 #include "Generators.h"
 #include "Representation.h"
 #include "support/File.h"
-#include "clang/Basic/Version.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/JSON.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <optional>
-#include <string>
 
 using namespace llvm;
+using namespace llvm::json;
+using namespace llvm::mustache;
 
 namespace clang {
 namespace doc {
 
-namespace {
-
-class HTMLTag {
-public:
-  // Any other tag can be added if required
-  enum TagType {
-    TAG_A,
-    TAG_DIV,
-    TAG_FOOTER,
-    TAG_H1,
-    TAG_H2,
-    TAG_H3,
-    TAG_HEADER,
-    TAG_LI,
-    TAG_LINK,
-    TAG_MAIN,
-    TAG_META,
-    TAG_OL,
-    TAG_P,
-    TAG_SCRIPT,
-    TAG_SPAN,
-    TAG_TITLE,
-    TAG_UL,
-    TAG_TABLE,
-    TAG_THEAD,
-    TAG_TBODY,
-    TAG_TR,
-    TAG_TD,
-    TAG_TH
-  };
-
-  HTMLTag() = default;
-  constexpr HTMLTag(TagType Value) : Value(Value) {}
-
-  operator TagType() const { return Value; }
-  operator bool() = delete;
-
-  bool isSelfClosing() const;
-  StringRef toString() const;
-
-private:
-  TagType Value;
-};
-
-enum NodeType {
-  NODE_TEXT,
-  NODE_TAG,
-};
-
-struct HTMLNode {
-  HTMLNode(NodeType Type) : Type(Type) {}
-  virtual ~HTMLNode() = default;
-
-  virtual void render(llvm::raw_ostream &OS, int IndentationLevel) = 0;
-  NodeType Type; // Type of node
-};
-
-struct TextNode : public HTMLNode {
-  TextNode(const Twine &Text)
-      : HTMLNode(NodeType::NODE_TEXT), Text(Text.str()) {}
-
-  std::string Text; // Content of node
-  void render(llvm::raw_ostream &OS, int IndentationLevel) override;
-};
-
-struct TagNode : public HTMLNode {
-  TagNode(HTMLTag Tag) : HTMLNode(NodeType::NODE_TAG), Tag(Tag) {}
-  TagNode(HTMLTag Tag, const Twine &Text) : TagNode(Tag) {
-    Children.emplace_back(std::make_unique<TextNode>(Text.str()));
-  }
-
-  HTMLTag Tag; // Name of HTML Tag (p, div, h1)
-  std::vector<std::unique_ptr<HTMLNode>> Children; // List of child nodes
-  std::vector<std::pair<std::string, std::string>>
-      Attributes; // List of key-value attributes for tag
-
-  void render(llvm::raw_ostream &OS, int IndentationLevel) override;
-};
-
-struct HTMLFile {
-  std::vector<std::unique_ptr<HTMLNode>> Children; // List of child nodes
-  void render(llvm::raw_ostream &OS) {
-    OS << "<!DOCTYPE html>\n";
-    for (const auto &C : Children) {
-      C->render(OS, 0);
-      OS << "\n";
-    }
-  }
-};
-
-} // namespace
-
-bool HTMLTag::isSelfClosing() const {
-  switch (Value) {
-  case HTMLTag::TAG_META:
-  case HTMLTag::TAG_LINK:
-    return true;
-  case HTMLTag::TAG_A:
-  case HTMLTag::TAG_DIV:
-  case HTMLTag::TAG_FOOTER:
-  case HTMLTag::TAG_H1:
-  case HTMLTag::TAG_H2:
-  case HTMLTag::TAG_H3:
-  case HTMLTag::TAG_HEADER:
-  case HTMLTag::TAG_LI:
-  case HTMLTag::TAG_MAIN:
-  case HTMLTag::TAG_OL:
-  case HTMLTag::TAG_P:
-  case HTMLTag::TAG_SCRIPT:
-  case HTMLTag::TAG_SPAN:
-  case HTMLTag::TAG_TITLE:
-  case HTMLTag::TAG_UL:
-  case HTMLTag::TAG_TABLE:
-  case HTMLTag::TAG_THEAD:
-  case HTMLTag::TAG_TBODY:
-  case HTMLTag::TAG_TR:
-  case HTMLTag::TAG_TD:
-  case HTMLTag::TAG_TH:
-    return false;
-  }
-  llvm_unreachable("Unhandled HTMLTag::TagType");
-}
-
-StringRef HTMLTag::toString() const {
-  switch (Value) {
-  case HTMLTag::TAG_A:
-    return "a";
-  case HTMLTag::TAG_DIV:
-    return "div";
-  case HTMLTag::TAG_FOOTER:
-    return "footer";
-  case HTMLTag::TAG_H1:
-    return "h1";
-  case HTMLTag::TAG_H2:
-    return "h2";
-  case HTMLTag::TAG_H3:
-    return "h3";
-  case HTMLTag::TAG_HEADER:
-    return "header";
-  case HTMLTag::TAG_LI:
-    return "li";
-  case HTMLTag::TAG_LINK:
-    return "link";
-  case HTMLTag::TAG_MAIN:
-    return "main";
-  case HTMLTag::TAG_META:
-    return "meta";
-  case HTMLTag::TAG_OL:
-    return "ol";
-  case HTMLTag::TAG_P:
-    return "p";
-  case HTMLTag::TAG_SCRIPT:
-    return "script";
-  case HTMLTag::TAG_SPAN:
-    return "span";
-  case HTMLTag::TAG_TITLE:
-    return "title";
-  case HTMLTag::TAG_UL:
-    return "ul";
-  case HTMLTag::TAG_TABLE:
-    return "table";
-  case HTMLTag::TAG_THEAD:
-    return "thead";
-  case HTMLTag::TAG_TBODY:
-    return "tbody";
-  case HTMLTag::TAG_TR:
-    return "tr";
-  case HTMLTag::TAG_TD:
-    return "td";
-  case HTMLTag::TAG_TH:
-    return "th";
-  }
-  llvm_unreachable("Unhandled HTMLTag::TagType");
-}
-
-void TextNode::render(llvm::raw_ostream &OS, int IndentationLevel) {
-  OS.indent(IndentationLevel * 2);
-  printHTMLEscaped(Text, OS);
-}
-
-void TagNode::render(llvm::raw_ostream &OS, int IndentationLevel) {
-  // Children nodes are rendered in the same line if all of them are text nodes
-  bool InlineChildren = true;
-  for (const auto &C : Children)
-    if (C->Type == NodeType::NODE_TAG) {
-      InlineChildren = false;
-      break;
-    }
-  OS.indent(IndentationLevel * 2);
-  OS << "<" << Tag.toString();
-  for (const auto &A : Attributes)
-    OS << " " << A.first << "=\"" << A.second << "\"";
-  if (Tag.isSelfClosing()) {
-    OS << "/>";
-    return;
-  }
-  OS << ">";
-  if (!InlineChildren)
-    OS << "\n";
-  bool NewLineRendered = true;
-  for (const auto &C : Children) {
-    int ChildrenIndentation =
-        InlineChildren || !NewLineRendered ? 0 : IndentationLevel + 1;
-    C->render(OS, ChildrenIndentation);
-    if (!InlineChildren && (C == Children.back() ||
-                            (C->Type != NodeType::NODE_TEXT ||
-                             (&C + 1)->get()->Type != NodeType::NODE_TEXT))) {
-      OS << "\n";
-      NewLineRendered = true;
-    } else
-      NewLineRendered = false;
-  }
-  if (!InlineChildren)
-    OS.indent(IndentationLevel * 2);
-  OS << "</" << Tag.toString() << ">";
-}
-
-template <typename Derived, typename Base,
-          typename = std::enable_if<std::is_base_of<Derived, Base>::value>>
-static void appendVector(std::vector<Derived> &&New,
-                         std::vector<Base> &Original) {
-  std::move(New.begin(), New.end(), std::back_inserter(Original));
-}
-
-// HTML generation
-
-static std::vector<std::unique_ptr<TagNode>>
-genStylesheetsHTML(StringRef InfoPath, const ClangDocContext &CDCtx) {
-  std::vector<std::unique_ptr<TagNode>> Out;
-  for (const auto &FilePath : CDCtx.UserStylesheets) {
-    auto LinkNode = std::make_unique<TagNode>(HTMLTag::TAG_LINK);
-    LinkNode->Attributes.emplace_back("rel", "stylesheet");
-    SmallString<128> StylesheetPath = computeRelativePath("", InfoPath);
-    llvm::sys::path::append(StylesheetPath,
-                            llvm::sys::path::filename(FilePath));
-    // Paths in HTML must be in posix-style
-    llvm::sys::path::native(StylesheetPath, llvm::sys::path::Style::posix);
-    LinkNode->Attributes.emplace_back("href", std::string(StylesheetPath));
-    Out.emplace_back(std::move(LinkNode));
-  }
-  return Out;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genJsScriptsHTML(StringRef InfoPath, const ClangDocContext &CDCtx) {
-  std::vector<std::unique_ptr<TagNode>> Out;
-
-  // index_json.js is part of every generated HTML file
-  SmallString<128> IndexJSONPath = computeRelativePath("", InfoPath);
-  auto IndexJSONNode = std::make_unique<TagNode>(HTMLTag::TAG_SCRIPT);
-  llvm::sys::path::append(IndexJSONPath, "index_json.js");
-  llvm::sys::path::native(IndexJSONPath, llvm::sys::path::Style::posix);
-  IndexJSONNode->Attributes.emplace_back("src", std::string(IndexJSONPath));
-  Out.emplace_back(std::move(IndexJSONNode));
-
-  for (const auto &FilePath : CDCtx.JsScripts) {
-    SmallString<128> ScriptPath = computeRelativePath("", InfoPath);
-    auto ScriptNode = std::make_unique<TagNode>(HTMLTag::TAG_SCRIPT);
-    llvm::sys::path::append(ScriptPath, llvm::sys::path::filename(FilePath));
-    // Paths in HTML must be in posix-style
-    llvm::sys::path::native(ScriptPath, llvm::sys::path::Style::posix);
-    ScriptNode->Attributes.emplace_back("src", std::string(ScriptPath));
-    Out.emplace_back(std::move(ScriptNode));
-  }
-  return Out;
-}
-
-static std::unique_ptr<TagNode> genLink(const Twine &Text, const Twine &Link) {
-  auto LinkNode = std::make_unique<TagNode>(HTMLTag::TAG_A, Text);
-  LinkNode->Attributes.emplace_back("href", Link.str());
-  return LinkNode;
-}
-
-static std::unique_ptr<HTMLNode>
-genReference(const Reference &Type, StringRef CurrentDirectory,
-             std::optional<StringRef> JumpToSection = std::nullopt) {
-  if (Type.Path.empty()) {
-    if (!JumpToSection)
-      return std::make_unique<TextNode>(Type.Name);
-    return genLink(Type.Name, "#" + *JumpToSection);
-  }
-  llvm::SmallString<64> Path = Type.getRelativeFilePath(CurrentDirectory);
-  llvm::sys::path::append(Path, Type.getFileBaseName() + ".html");
-
-  // Paths in HTML must be in posix-style
-  llvm::sys::path::native(Path, llvm::sys::path::Style::posix);
-  if (JumpToSection)
-    Path += ("#" + *JumpToSection).str();
-  return genLink(Type.Name, Path);
-}
-
-static std::vector<std::unique_ptr<HTMLNode>>
-genReferenceList(const llvm::SmallVectorImpl<Reference> &Refs,
-                 const StringRef &CurrentDirectory) {
-  std::vector<std::unique_ptr<HTMLNode>> Out;
-  for (const auto &R : Refs) {
-    if (&R != Refs.begin())
-      Out.emplace_back(std::make_unique<TextNode>(", "));
-    Out.emplace_back(genReference(R, CurrentDirectory));
-  }
-  return Out;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genHTML(const EnumInfo &I, const ClangDocContext &CDCtx);
-static std::vector<std::unique_ptr<TagNode>>
-genHTML(const FunctionInfo &I, const ClangDocContext &CDCtx,
-        StringRef ParentInfoDir);
-static std::unique_ptr<TagNode> genHTML(const std::vector<CommentInfo> &C);
-
-static std::vector<std::unique_ptr<TagNode>>
-genEnumsBlock(const std::vector<EnumInfo> &Enums,
-              const ClangDocContext &CDCtx) {
-  if (Enums.empty())
-    return {};
-
-  std::vector<std::unique_ptr<TagNode>> Out;
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_H2, "Enums"));
-  Out.back()->Attributes.emplace_back("id", "Enums");
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_DIV));
-  auto &DivBody = Out.back();
-  for (const auto &E : Enums) {
-    std::vector<std::unique_ptr<TagNode>> Nodes = genHTML(E, CDCtx);
-    appendVector(std::move(Nodes), DivBody->Children);
-  }
-  return Out;
-}
-
-static std::unique_ptr<TagNode>
-genEnumMembersBlock(const llvm::SmallVector<EnumValueInfo, 4> &Members) {
-  if (Members.empty())
-    return nullptr;
-
-  auto List = std::make_unique<TagNode>(HTMLTag::TAG_TBODY);
-
-  for (const auto &M : Members) {
-    auto TRNode = std::make_unique<TagNode>(HTMLTag::TAG_TR);
-    TRNode->Children.emplace_back(
-        std::make_unique<TagNode>(HTMLTag::TAG_TD, M.Name));
-    // Use user supplied value if it exists, otherwise use the value
-    if (!M.ValueExpr.empty()) {
-      TRNode->Children.emplace_back(
-          std::make_unique<TagNode>(HTMLTag::TAG_TD, M.ValueExpr));
-    } else {
-      TRNode->Children.emplace_back(
-          std::make_unique<TagNode>(HTMLTag::TAG_TD, M.Value));
-    }
-    if (!M.Description.empty()) {
-      auto TD = std::make_unique<TagNode>(HTMLTag::TAG_TD);
-      TD->Children.emplace_back(genHTML(M.Description));
-      TRNode->Children.emplace_back(std::move(TD));
-    }
-    List->Children.emplace_back(std::move(TRNode));
-  }
-  return List;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genFunctionsBlock(const std::vector<FunctionInfo> &Functions,
-                  const ClangDocContext &CDCtx, StringRef ParentInfoDir) {
-  if (Functions.empty())
-    return {};
-
-  std::vector<std::unique_ptr<TagNode>> Out;
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_H2, "Functions"));
-  Out.back()->Attributes.emplace_back("id", "Functions");
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_DIV));
-  auto &DivBody = Out.back();
-  for (const auto &F : Functions) {
-    std::vector<std::unique_ptr<TagNode>> Nodes =
-        genHTML(F, CDCtx, ParentInfoDir);
-    appendVector(std::move(Nodes), DivBody->Children);
-  }
-  return Out;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genRecordMembersBlock(const llvm::SmallVector<MemberTypeInfo, 4> &Members,
-                      StringRef ParentInfoDir) {
-  if (Members.empty())
-    return {};
-
-  std::vector<std::unique_ptr<TagNode>> Out;
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_H2, "Members"));
-  Out.back()->Attributes.emplace_back("id", "Members");
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_UL));
-  auto &ULBody = Out.back();
-  for (const auto &M : Members) {
-    StringRef Access = getAccessSpelling(M.Access);
-    auto LIBody = std::make_unique<TagNode>(HTMLTag::TAG_LI);
-    auto MemberDecl = std::make_unique<TagNode>(HTMLTag::TAG_DIV);
-    if (!Access.empty())
-      MemberDecl->Children.emplace_back(
-          std::make_unique<TextNode>(Access + " "));
-    if (M.IsStatic)
-      MemberDecl->Children.emplace_back(std::make_unique<TextNode>("static "));
-    MemberDecl->Children.emplace_back(genReference(M.Type, ParentInfoDir));
-    MemberDecl->Children.emplace_back(std::make_unique<TextNode>(" " + M.Name));
-    if (!M.Description.empty())
-      LIBody->Children.emplace_back(genHTML(M.Description));
-    LIBody->Children.emplace_back(std::move(MemberDecl));
-    ULBody->Children.emplace_back(std::move(LIBody));
-  }
-  return Out;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genReferencesBlock(const std::vector<Reference> &References,
-                   llvm::StringRef Title, StringRef ParentPath) {
-  if (References.empty())
-    return {};
-
-  std::vector<std::unique_ptr<TagNode>> Out;
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_H2, Title));
-  Out.back()->Attributes.emplace_back("id", std::string(Title));
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_UL));
-  auto &ULBody = Out.back();
-  for (const auto &R : References) {
-    auto LiNode = std::make_unique<TagNode>(HTMLTag::TAG_LI);
-    LiNode->Children.emplace_back(genReference(R, ParentPath));
-    ULBody->Children.emplace_back(std::move(LiNode));
-  }
-  return Out;
-}
-static std::unique_ptr<TagNode> writeSourceFileRef(const ClangDocContext &CDCtx,
-                                                   const Location &L) {
-
-  if (!L.IsFileInRootDir && !CDCtx.RepositoryUrl)
-    return std::make_unique<TagNode>(
-        HTMLTag::TAG_P, "Defined at line " + std::to_string(L.StartLineNumber) +
-                            " of file " + L.Filename);
-
-  SmallString<128> FileURL(CDCtx.RepositoryUrl.value_or(""));
-  llvm::sys::path::append(
-      FileURL, llvm::sys::path::Style::posix,
-      // If we're on Windows, the file name will be in the wrong format, and
-      // append won't convert the full path being appended to the correct
-      // format, so we need to do that here.
-      llvm::sys::path::convert_to_slash(
-          L.Filename,
-          // The style here is the current style of the path, not the one we're
-          // targeting. If the string is already in the posix style, it will do
-          // nothing.
-          llvm::sys::path::Style::windows));
-  auto Node = std::make_unique<TagNode>(HTMLTag::TAG_P);
-  Node->Children.emplace_back(std::make_unique<TextNode>("Defined at line "));
-  auto LocNumberNode = std::make_unique<TagNode>(
-      HTMLTag::TAG_A, std::to_string(L.StartLineNumber));
-  // The links to a specific line in the source code use the github /
-  // googlesource notation so it won't work for all hosting pages.
-  LocNumberNode->Attributes.emplace_back(
-      "href",
-      formatv("{0}#{1}{2}", FileURL, CDCtx.RepositoryLinePrefix.value_or(""),
-              L.StartLineNumber));
-  Node->Children.emplace_back(std::move(LocNumberNode));
-  Node->Children.emplace_back(std::make_unique<TextNode>(" of file "));
-  auto LocFileNode = std::make_unique<TagNode>(
-      HTMLTag::TAG_A, llvm::sys::path::filename(FileURL));
-  LocFileNode->Attributes.emplace_back("href", std::string(FileURL));
-  Node->Children.emplace_back(std::move(LocFileNode));
-  return Node;
-}
-
-static void maybeWriteSourceFileRef(std::vector<std::unique_ptr<TagNode>> &Out,
-                                    const ClangDocContext &CDCtx,
-                                    const std::optional<Location> &DefLoc) {
-  if (DefLoc)
-    Out.emplace_back(writeSourceFileRef(CDCtx, *DefLoc));
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genHTML(const Index &Index, StringRef InfoPath, bool IsOutermostList);
-
-// Generates a list of child nodes for the HTML head tag
-// It contains a meta node, link nodes to import CSS files, and script nodes to
-// import JS files
-static std::vector<std::unique_ptr<TagNode>>
-genFileHeadNodes(StringRef Title, StringRef InfoPath,
-                 const ClangDocContext &CDCtx) {
-  std::vector<std::unique_ptr<TagNode>> Out;
-  auto MetaNode = std::make_unique<TagNode>(HTMLTag::TAG_META);
-  MetaNode->Attributes.emplace_back("charset", "utf-8");
-  Out.emplace_back(std::move(MetaNode));
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_TITLE, Title));
-  std::vector<std::unique_ptr<TagNode>> StylesheetsNodes =
-      genStylesheetsHTML(InfoPath, CDCtx);
-  appendVector(std::move(StylesheetsNodes), Out);
-  std::vector<std::unique_ptr<TagNode>> JsNodes =
-      genJsScriptsHTML(InfoPath, CDCtx);
-  appendVector(std::move(JsNodes), Out);
-  return Out;
-}
-
-// Generates a header HTML node that can be used for any file
-// It contains the project name
-static std::unique_ptr<TagNode> genFileHeaderNode(StringRef ProjectName) {
-  auto HeaderNode = std::make_unique<TagNode>(HTMLTag::TAG_HEADER, ProjectName);
-  HeaderNode->Attributes.emplace_back("id", "project-title");
-  return HeaderNode;
-}
-
-// Generates a main HTML node that has all the main content of an info file
-// It contains both indexes and the info's documented information
-// This function should only be used for the info files (not for the file that
-// only has the general index)
-static std::unique_ptr<TagNode> genInfoFileMainNode(
-    StringRef InfoPath,
-    std::vector<std::unique_ptr<TagNode>> &MainContentInnerNodes,
-    const Index &InfoIndex) {
-  auto MainNode = std::make_unique<TagNode>(HTMLTag::TAG_MAIN);
-
-  auto LeftSidebarNode = std::make_unique<TagNode>(HTMLTag::TAG_DIV);
-  LeftSidebarNode->Attributes.emplace_back("id", "sidebar-left");
-  LeftSidebarNode->Attributes.emplace_back("path", std::string(InfoPath));
-  LeftSidebarNode->Attributes.emplace_back(
-      "class", "col-xs-6 col-sm-3 col-md-2 sidebar sidebar-offcanvas-left");
-
-  auto MainContentNode = std::make_unique<TagNode>(HTMLTag::TAG_DIV);
-  MainContentNode->Attributes.emplace_back("id", "main-content");
-  MainContentNode->Attributes.emplace_back(
-      "class", "col-xs-12 col-sm-9 col-md-8 main-content");
-  appendVector(std::move(MainContentInnerNodes), MainContentNode->Children);
-
-  auto RightSidebarNode = std::make_unique<TagNode>(HTMLTag::TAG_DIV);
-  RightSidebarNode->Attributes.emplace_back("id", "sidebar-right");
-  RightSidebarNode->Attributes.emplace_back(
-      "class", "col-xs-6 col-sm-6 col-md-2 sidebar sidebar-offcanvas-right");
-  std::vector<std::unique_ptr<TagNode>> InfoIndexHTML =
-      genHTML(InfoIndex, InfoPath, true);
-  appendVector(std::move(InfoIndexHTML), RightSidebarNode->Children);
-
-  MainNode->Children.emplace_back(std::move(LeftSidebarNode));
-  MainNode->Children.emplace_back(std::move(MainContentNode));
-  MainNode->Children.emplace_back(std::move(RightSidebarNode));
-
-  return MainNode;
-}
-
-// Generates a footer HTML node that can be used for any file
-// It contains clang-doc's version
-static std::unique_ptr<TagNode> genFileFooterNode() {
-  auto FooterNode = std::make_unique<TagNode>(HTMLTag::TAG_FOOTER);
-  auto SpanNode = std::make_unique<TagNode>(
-      HTMLTag::TAG_SPAN, clang::getClangToolFullVersion("clang-doc"));
-  SpanNode->Attributes.emplace_back("class", "no-break");
-  FooterNode->Children.emplace_back(std::move(SpanNode));
-  return FooterNode;
-}
-
-// Generates a complete HTMLFile for an Info
-static HTMLFile
-genInfoFile(StringRef Title, StringRef InfoPath,
-            std::vector<std::unique_ptr<TagNode>> &MainContentNodes,
-            const Index &InfoIndex, const ClangDocContext &CDCtx) {
-  HTMLFile F;
-
-  std::vector<std::unique_ptr<TagNode>> HeadNodes =
-      genFileHeadNodes(Title, InfoPath, CDCtx);
-  std::unique_ptr<TagNode> HeaderNode = genFileHeaderNode(CDCtx.ProjectName);
-  std::unique_ptr<TagNode> MainNode =
-      genInfoFileMainNode(InfoPath, MainContentNodes, InfoIndex);
-  std::unique_ptr<TagNode> FooterNode = genFileFooterNode();
-
-  appendVector(std::move(HeadNodes), F.Children);
-  F.Children.emplace_back(std::move(HeaderNode));
-  F.Children.emplace_back(std::move(MainNode));
-  F.Children.emplace_back(std::move(FooterNode));
-
-  return F;
-}
-
-template <typename T,
-          typename = std::enable_if<std::is_base_of<T, Info>::value>>
-static Index genInfoIndexItem(const std::vector<T> &Infos, StringRef Title) {
-  Index Idx(Title, Title);
-  for (const auto &C : Infos)
-    Idx.Children.emplace_back(C.extractName(),
-                              llvm::toHex(llvm::toStringRef(C.USR)));
-  return Idx;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genHTML(const Index &Index, StringRef InfoPath, bool IsOutermostList) {
-  std::vector<std::unique_ptr<TagNode>> Out;
-  if (!Index.Name.empty()) {
-    Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_SPAN));
-    auto &SpanBody = Out.back();
-    if (!Index.JumpToSection)
-      SpanBody->Children.emplace_back(genReference(Index, InfoPath));
-    else
-      SpanBody->Children.emplace_back(
-          genReference(Index, InfoPath, Index.JumpToSection->str()));
-  }
-  if (Index.Children.empty())
-    return Out;
-  // Only the outermost list should use ol, the others should use ul
-  HTMLTag ListHTMLTag = IsOutermostList ? HTMLTag::TAG_OL : HTMLTag::TAG_UL;
-  Out.emplace_back(std::make_unique<TagNode>(ListHTMLTag));
-  const auto &UlBody = Out.back();
-  for (const auto &C : Index.Children) {
-    auto LiBody = std::make_unique<TagNode>(HTMLTag::TAG_LI);
-    std::vector<std::unique_ptr<TagNode>> Nodes = genHTML(C, InfoPath, false);
-    appendVector(std::move(Nodes), LiBody->Children);
-    UlBody->Children.emplace_back(std::move(LiBody));
-  }
-  return Out;
-}
-
-static std::unique_ptr<HTMLNode> genHTML(const CommentInfo &I) {
-  switch (I.Kind) {
-  case CommentKind::CK_FullComment: {
-    auto FullComment = std::make_unique<TagNode>(HTMLTag::TAG_DIV);
-    for (const auto &Child : I.Children) {
-      std::unique_ptr<HTMLNode> Node = genHTML(*Child);
-      if (Node)
-        FullComment->Children.emplace_back(std::move(Node));
-    }
-    return std::move(FullComment);
-  }
-
-  case CommentKind::CK_ParagraphComment: {
-    auto ParagraphComment = std::make_unique<TagNode>(HTMLTag::TAG_P);
-    for (const auto &Child : I.Children) {
-      std::unique_ptr<HTMLNode> Node = genHTML(*Child);
-      if (Node)
-        ParagraphComment->Children.emplace_back(std::move(Node));
-    }
-    if (ParagraphComment->Children.empty())
-      return nullptr;
-    return std::move(ParagraphComment);
-  }
-
-  case CommentKind::CK_BlockCommandComment: {
-    auto BlockComment = std::make_unique<TagNode>(HTMLTag::TAG_DIV);
-    BlockComment->Children.emplace_back(
-        std::make_unique<TagNode>(HTMLTag::TAG_DIV, I.Name));
-    for (const auto &Child : I.Children) {
-      std::unique_ptr<HTMLNode> Node = genHTML(*Child);
-      if (Node)
-        BlockComment->Children.emplace_back(std::move(Node));
-    }
-    if (BlockComment->Children.empty())
-      return nullptr;
-    return std::move(BlockComment);
-  }
-
-  case CommentKind::CK_TextComment: {
-    if (I.Text.empty())
-      return nullptr;
-    return std::make_unique<TextNode>(I.Text);
-  }
-
-  // For now, return nullptr for unsupported comment kinds
-  case CommentKind::CK_InlineCommandComment:
-  case CommentKind::CK_HTMLStartTagComment:
-  case CommentKind::CK_HTMLEndTagComment:
-  case CommentKind::CK_ParamCommandComment:
-  case CommentKind::CK_TParamCommandComment:
-  case CommentKind::CK_VerbatimBlockComment:
-  case CommentKind::CK_VerbatimBlockLineComment:
-  case CommentKind::CK_VerbatimLineComment:
-  case CommentKind::CK_Unknown:
-    return nullptr;
-  }
-  llvm_unreachable("Unhandled CommentKind");
-}
-
-static std::unique_ptr<TagNode> genHTML(const std::vector<CommentInfo> &C) {
-  auto CommentBlock = std::make_unique<TagNode>(HTMLTag::TAG_DIV);
-  for (const auto &Child : C) {
-    if (std::unique_ptr<HTMLNode> Node = genHTML(Child))
-      CommentBlock->Children.emplace_back(std::move(Node));
-  }
-  return CommentBlock;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genHTML(const EnumInfo &I, const ClangDocContext &CDCtx) {
-  std::vector<std::unique_ptr<TagNode>> Out;
-  std::string EnumType = I.Scoped ? "enum class " : "enum ";
-  // Determine if enum members have comments attached
-  bool HasComments = llvm::any_of(
-      I.Members, [](const EnumValueInfo &M) { return !M.Description.empty(); });
-  std::unique_ptr<TagNode> Table =
-      std::make_unique<TagNode>(HTMLTag::TAG_TABLE);
-  std::unique_ptr<TagNode> THead =
-      std::make_unique<TagNode>(HTMLTag::TAG_THEAD);
-  std::unique_ptr<TagNode> TRow = std::make_unique<TagNode>(HTMLTag::TAG_TR);
-  std::unique_ptr<TagNode> TD =
-      std::make_unique<TagNode>(HTMLTag::TAG_TH, EnumType + I.Name);
-  // Span 3 columns if enum has comments
-  TD->Attributes.emplace_back("colspan", HasComments ? "3" : "2");
-
-  Table->Attributes.emplace_back("id", llvm::toHex(llvm::toStringRef(I.USR)));
-  TRow->Children.emplace_back(std::move(TD));
-  THead->Children.emplace_back(std::move(TRow));
-  Table->Children.emplace_back(std::move(THead));
-
-  if (std::unique_ptr<TagNode> Node = genEnumMembersBlock(I.Members))
-    Table->Children.emplace_back(std::move(Node));
-
-  Out.emplace_back(std::move(Table));
-
-  maybeWriteSourceFileRef(Out, CDCtx, I.DefLoc);
-
-  if (!I.Description.empty())
-    Out.emplace_back(genHTML(I.Description));
-
-  return Out;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genHTML(const FunctionInfo &I, const ClangDocContext &CDCtx,
-        StringRef ParentInfoDir) {
-  std::vector<std::unique_ptr<TagNode>> Out;
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_H3, I.Name));
-  // USR is used as id for functions instead of name to disambiguate function
-  // overloads.
-  Out.back()->Attributes.emplace_back("id",
-                                      llvm::toHex(llvm::toStringRef(I.USR)));
-
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_P));
-  auto &FunctionHeader = Out.back();
-
-  std::string Access = getAccessSpelling(I.Access).str();
-  if (Access != "")
-    FunctionHeader->Children.emplace_back(
-        std::make_unique<TextNode>(Access + " "));
-  if (I.IsStatic)
-    FunctionHeader->Children.emplace_back(
-        std::make_unique<TextNode>("static "));
-  if (I.ReturnType.Type.Name != "") {
-    FunctionHeader->Children.emplace_back(
-        genReference(I.ReturnType.Type, ParentInfoDir));
-    FunctionHeader->Children.emplace_back(std::make_unique<TextNode>(" "));
-  }
-  FunctionHeader->Children.emplace_back(
-      std::make_unique<TextNode>(I.Name + "("));
-
-  for (const auto &P : I.Params) {
-    if (&P != I.Params.begin())
-      FunctionHeader->Children.emplace_back(std::make_unique<TextNode>(", "));
-    FunctionHeader->Children.emplace_back(genReference(P.Type, ParentInfoDir));
-    FunctionHeader->Children.emplace_back(
-        std::make_unique<TextNode>(" " + P.Name));
-  }
-  FunctionHeader->Children.emplace_back(std::make_unique<TextNode>(")"));
-
-  maybeWriteSourceFileRef(Out, CDCtx, I.DefLoc);
-
-  if (!I.Description.empty())
-    Out.emplace_back(genHTML(I.Description));
-
-  return Out;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genHTML(const NamespaceInfo &I, Index &InfoIndex, const ClangDocContext &CDCtx,
-        std::string &InfoTitle) {
-  std::vector<std::unique_ptr<TagNode>> Out;
-  if (I.Name.str() == "")
-    InfoTitle = "Global Namespace";
-  else
-    InfoTitle = ("namespace " + I.Name).str();
+static std::unique_ptr<MustacheTemplateFile> NamespaceTemplate = nullptr;
 
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_H1, InfoTitle));
+static std::unique_ptr<MustacheTemplateFile> RecordTemplate = nullptr;
 
-  if (!I.Description.empty())
-    Out.emplace_back(genHTML(I.Description));
-
-  llvm::SmallString<64> BasePath = I.getRelativeFilePath("");
-
-  std::vector<std::unique_ptr<TagNode>> ChildNamespaces =
-      genReferencesBlock(I.Children.Namespaces, "Namespaces", BasePath);
-  appendVector(std::move(ChildNamespaces), Out);
-  std::vector<std::unique_ptr<TagNode>> ChildRecords =
-      genReferencesBlock(I.Children.Records, "Records", BasePath);
-  appendVector(std::move(ChildRecords), Out);
-
-  std::vector<std::unique_ptr<TagNode>> ChildFunctions =
-      genFunctionsBlock(I.Children.Functions, CDCtx, BasePath);
-  appendVector(std::move(ChildFunctions), Out);
-  std::vector<std::unique_ptr<TagNode>> ChildEnums =
-      genEnumsBlock(I.Children.Enums, CDCtx);
-  appendVector(std::move(ChildEnums), Out);
-
-  if (!I.Children.Namespaces.empty())
-    InfoIndex.Children.emplace_back("Namespaces", "Namespaces");
-  if (!I.Children.Records.empty())
-    InfoIndex.Children.emplace_back("Records", "Records");
-  if (!I.Children.Functions.empty())
-    InfoIndex.Children.emplace_back(
-        genInfoIndexItem(I.Children.Functions, "Functions"));
-  if (!I.Children.Enums.empty())
-    InfoIndex.Children.emplace_back(
-        genInfoIndexItem(I.Children.Enums, "Enums"));
-
-  return Out;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genHTML(const RecordInfo &I, Index &InfoIndex, const ClangDocContext &CDCtx,
-        std::string &InfoTitle) {
-  std::vector<std::unique_ptr<TagNode>> Out;
-  InfoTitle = (getTagType(I.TagType) + " " + I.Name).str();
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_H1, InfoTitle));
-
-  maybeWriteSourceFileRef(Out, CDCtx, I.DefLoc);
-
-  if (!I.Description.empty())
-    Out.emplace_back(genHTML(I.Description));
-
-  std::vector<std::unique_ptr<HTMLNode>> Parents =
-      genReferenceList(I.Parents, I.Path);
-  std::vector<std::unique_ptr<HTMLNode>> VParents =
-      genReferenceList(I.VirtualParents, I.Path);
-  if (!Parents.empty() || !VParents.empty()) {
-    Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_P));
-    auto &PBody = Out.back();
-    PBody->Children.emplace_back(std::make_unique<TextNode>("Inherits from "));
-    if (Parents.empty())
-      appendVector(std::move(VParents), PBody->Children);
-    else if (VParents.empty())
-      appendVector(std::move(Parents), PBody->Children);
-    else {
-      appendVector(std::move(Parents), PBody->Children);
-      PBody->Children.emplace_back(std::make_unique<TextNode>(", "));
-      appendVector(std::move(VParents), PBody->Children);
-    }
-  }
-
-  std::vector<std::unique_ptr<TagNode>> Members =
-      genRecordMembersBlock(I.Members, I.Path);
-  appendVector(std::move(Members), Out);
-  std::vector<std::unique_ptr<TagNode>> ChildRecords =
-      genReferencesBlock(I.Children.Records, "Records", I.Path);
-  appendVector(std::move(ChildRecords), Out);
-
-  std::vector<std::unique_ptr<TagNode>> ChildFunctions =
-      genFunctionsBlock(I.Children.Functions, CDCtx, I.Path);
-  appendVector(std::move(ChildFunctions), Out);
-  std::vector<std::unique_ptr<TagNode>> ChildEnums =
-      genEnumsBlock(I.Children.Enums, CDCtx);
-  appendVector(std::move(ChildEnums), Out);
-
-  if (!I.Members.empty())
-    InfoIndex.Children.emplace_back("Members", "Members");
-  if (!I.Children.Records.empty())
-    InfoIndex.Children.emplace_back("Records", "Records");
-  if (!I.Children.Functions.empty())
-    InfoIndex.Children.emplace_back(
-        genInfoIndexItem(I.Children.Functions, "Functions"));
-  if (!I.Children.Enums.empty())
-    InfoIndex.Children.emplace_back(
-        genInfoIndexItem(I.Children.Enums, "Enums"));
-
-  return Out;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genHTML(const TypedefInfo &I, const ClangDocContext &CDCtx,
-        std::string &InfoTitle) {
-  // TODO support typedefs in HTML.
-  return {};
-}
-
-/// Generator for HTML documentation.
-class HTMLGenerator : public Generator {
+class HTMLGenerator : public MustacheGenerator {
 public:
   static const char *Format;
-
+  Error createResources(ClangDocContext &CDCtx) override;
+  Error generateDocForInfo(Info *I, raw_ostream &OS,
+                           const ClangDocContext &CDCtx) override;
+  Error setupTemplateFiles(const ClangDocContext &CDCtx) override;
+  Error generateDocForJSON(json::Value &JSON, raw_fd_ostream &OS,
+                           const ClangDocContext &CDCtx, StringRef ObjTypeStr,
+                           StringRef RelativeRootPath) override;
+  // Populates templates with CSS stylesheets, JS scripts paths.
+  Error setupTemplateResources(const ClangDocContext &CDCtx, json::Value &V,
+                               SmallString<128> RelativeRootPath);
   llvm::Error generateDocumentation(
       StringRef RootDir, llvm::StringMap<std::unique_ptr<doc::Info>> Infos,
       const ClangDocContext &CDCtx, std::string DirName) override;
-  llvm::Error createResources(ClangDocContext &CDCtx) override;
-  llvm::Error generateDocForInfo(Info *I, llvm::raw_ostream &OS,
-                                 const ClangDocContext &CDCtx) override;
 };
 
-const char *HTMLGenerator::Format = "html";
-
-llvm::Error HTMLGenerator::generateDocumentation(
-    StringRef RootDir, llvm::StringMap<std::unique_ptr<doc::Info>> Infos,
-    const ClangDocContext &CDCtx, std::string DirName) {
-  // Track which directories we already tried to create.
-  llvm::StringSet<> CreatedDirs;
+Error HTMLGenerator::setupTemplateFiles(const ClangDocContext &CDCtx) {
+  // Template files need to use the native path when they're opened,
+  // but have to be used in POSIX style when used in HTML.
+  auto ConvertToNative = [](std::string &&Path) -> std::string {
+    SmallString<128> PathBuf(Path);
+    llvm::sys::path::native(PathBuf);
+    return PathBuf.str().str();
+  };
 
-  // Collect all output by file name and create the nexessary directories.
-  llvm::StringMap<std::vector<doc::Info *>> FileToInfos;
-  for (const auto &Group : Infos) {
-    doc::Info *Info = Group.getValue().get();
+  std::string NamespaceFilePath =
+      ConvertToNative(CDCtx.MustacheTemplates.lookup("namespace-template"));
+  std::string ClassFilePath =
+      ConvertToNative(CDCtx.MustacheTemplates.lookup("class-template"));
+  std::string CommentFilePath =
+      ConvertToNative(CDCtx.MustacheTemplates.lookup("comment-template"));
+  std::string FunctionFilePath =
+      ConvertToNative(CDCtx.MustacheTemplates.lookup("function-template"));
+  std::string EnumFilePath =
+      ConvertToNative(CDCtx.MustacheTemplates.lookup("enum-template"));
+  std::vector<std::pair<StringRef, StringRef>> Partials = {
+      {"Comments", CommentFilePath},
+      {"FunctionPartial", FunctionFilePath},
+      {"EnumPartial", EnumFilePath}};
+
+  if (Error Err = setupTemplate(NamespaceTemplate, NamespaceFilePath, Partials))
+    return Err;
 
-    llvm::SmallString<128> Path;
-    llvm::sys::path::native(RootDir, Path);
-    llvm::sys::path::append(Path, Info->getRelativeFilePath(""));
-    if (!CreatedDirs.contains(Path)) {
-      if (std::error_code Err = llvm::sys::fs::create_directories(Path);
-          Err != std::error_code()) {
-        return llvm::createStringError(Err, "Failed to create directory '%s'.",
-                                       Path.c_str());
-      }
-      CreatedDirs.insert(Path);
-    }
+  if (Error Err = setupTemplate(RecordTemplate, ClassFilePath, Partials))
+    return Err;
 
-    llvm::sys::path::append(Path, Info->getFileBaseName() + ".html");
-    FileToInfos[Path].push_back(Info);
-  }
+  return Error::success();
+}
 
-  for (const auto &Group : FileToInfos) {
-    std::error_code FileErr;
-    llvm::raw_fd_ostream InfoOS(Group.getKey(), FileErr,
-                                llvm::sys::fs::OF_Text);
-    if (FileErr) {
-      return llvm::createStringError(FileErr, "Error opening file '%s'",
-                                     Group.getKey().str().c_str());
-    }
+Error HTMLGenerator::setupTemplateResources(const ClangDocContext &CDCtx,
+                                            json::Value &V,
+                                            SmallString<128> RelativeRootPath) {
+  V.getAsObject()->insert({"ProjectName", CDCtx.ProjectName});
+  json::Value StylesheetArr = Array();
+  sys::path::native(RelativeRootPath, sys::path::Style::posix);
 
-    // TODO: https://github.com/llvm/llvm-project/issues/59073
-    // If there are multiple Infos for this file name (for example, template
-    // specializations), this will generate multiple complete web pages (with
-    // <DOCTYPE> and <title>, etc.) concatenated together. This generator needs
-    // some refactoring to be able to output the headers separately from the
-    // contents.
-    for (const auto &Info : Group.getValue()) {
-      if (llvm::Error Err = generateDocForInfo(Info, InfoOS, CDCtx)) {
-        return Err;
-      }
-    }
+  auto *SSA = StylesheetArr.getAsArray();
+  SSA->reserve(CDCtx.UserStylesheets.size());
+  for (const auto &FilePath : CDCtx.UserStylesheets) {
+    SmallString<128> StylesheetPath = RelativeRootPath;
+    sys::path::append(StylesheetPath, sys::path::Style::posix,
+                      sys::path::filename(FilePath));
+    SSA->emplace_back(StylesheetPath);
+  }
+  V.getAsObject()->insert({"Stylesheets", StylesheetArr});
+
+  json::Value ScriptArr = Array();
+  auto *SCA = ScriptArr.getAsArray();
+  SCA->reserve(CDCtx.JsScripts.size());
+  for (auto Script : CDCtx.JsScripts) {
+    SmallString<128> JsPath = RelativeRootPath;
+    sys::path::append(JsPath, sys::path::Style::posix,
+                      sys::path::filename(Script));
+    SCA->emplace_back(JsPath);
+  }
+  V.getAsObject()->insert({"Scripts", ScriptArr});
+  return Error::success();
+}
+
+Error HTMLGenerator::generateDocForJSON(json::Value &JSON, raw_fd_ostream &OS,
+                                        const ClangDocContext &CDCtx,
+                                        StringRef ObjTypeStr,
+                                        StringRef RelativeRootPath) {
+  if (ObjTypeStr == "namespace") {
+    if (auto Err = setupTemplateResources(CDCtx, JSON, RelativeRootPath))
+      return Err;
+    assert(NamespaceTemplate && "NamespaceTemplate is nullptr.");
+    NamespaceTemplate->render(JSON, OS);
+  } else if (ObjTypeStr == "record") {
+    if (auto Err = setupTemplateResources(CDCtx, JSON, RelativeRootPath))
+      return Err;
+    assert(RecordTemplate && "RecordTemplate is nullptr.");
+    RecordTemplate->render(JSON, OS);
   }
-
-  return llvm::Error::success();
+  return Error::success();
 }
 
-llvm::Error HTMLGenerator::generateDocForInfo(Info *I, llvm::raw_ostream &OS,
-                                              const ClangDocContext &CDCtx) {
-  std::string InfoTitle;
-  std::vector<std::unique_ptr<TagNode>> MainContentNodes;
-  Index InfoIndex;
+Error HTMLGenerator::generateDocForInfo(Info *I, raw_ostream &OS,
+                                        const ClangDocContext &CDCtx) {
   switch (I->IT) {
-  case InfoType::IT_namespace:
-    MainContentNodes = genHTML(*static_cast<clang::doc::NamespaceInfo *>(I),
-                               InfoIndex, CDCtx, InfoTitle);
-    break;
-  case InfoType::IT_record:
-    MainContentNodes = genHTML(*static_cast<clang::doc::RecordInfo *>(I),
-                               InfoIndex, CDCtx, InfoTitle);
-    break;
   case InfoType::IT_enum:
-    MainContentNodes = genHTML(*static_cast<clang::doc::EnumInfo *>(I), CDCtx);
-    break;
   case InfoType::IT_function:
-    MainContentNodes =
-        genHTML(*static_cast<clang::doc::FunctionInfo *>(I), CDCtx, "");
-    break;
   case InfoType::IT_typedef:
-    MainContentNodes =
-        genHTML(*static_cast<clang::doc::TypedefInfo *>(I), CDCtx, InfoTitle);
-    break;
-  case InfoType::IT_concept:
-  case InfoType::IT_variable:
-  case InfoType::IT_friend:
-    break;
-  case InfoType::IT_default:
-    return llvm::createStringError(llvm::inconvertibleErrorCode(),
-                                   "unexpected info type");
-  }
-
-  HTMLFile F = genInfoFile(InfoTitle, I->getRelativeFilePath(""),
-                           MainContentNodes, InfoIndex, CDCtx);
-  F.render(OS);
-
-  return llvm::Error::success();
-}
-
-static std::string getRefType(InfoType IT) {
-  switch (IT) {
-  case InfoType::IT_default:
-    return "default";
   case InfoType::IT_namespace:
-    return "namespace";
   case InfoType::IT_record:
-    return "record";
-  case InfoType::IT_function:
-    return "function";
-  case InfoType::IT_enum:
-    return "enum";
-  case InfoType::IT_typedef:
-    return "typedef";
   case InfoType::IT_concept:
-    return "concept";
   case InfoType::IT_variable:
-    return "variable";
   case InfoType::IT_friend:
-    return "friend";
-  }
-  llvm_unreachable("Unknown InfoType");
-}
-
-static llvm::Error serializeIndex(ClangDocContext &CDCtx) {
-  std::error_code OK;
-  std::error_code FileErr;
-  llvm::SmallString<128> FilePath;
-  llvm::sys::path::native(CDCtx.OutDirectory, FilePath);
-  llvm::sys::path::append(FilePath, "index_json.js");
-  llvm::raw_fd_ostream OS(FilePath, FileErr, llvm::sys::fs::OF_Text);
-  if (FileErr != OK) {
-    return llvm::createStringError(llvm::inconvertibleErrorCode(),
-                                   "error creating index file: " +
-                                       FileErr.message());
-  }
-  llvm::SmallString<128> RootPath(CDCtx.OutDirectory);
-  if (llvm::sys::path::is_relative(RootPath)) {
-    llvm::sys::fs::make_absolute(RootPath);
+    break;
+  case InfoType::IT_default:
+    return createStringError(inconvertibleErrorCode(), "unexpected InfoType");
   }
-  // Replace the escaped characters with a forward slash. It shouldn't matter
-  // when rendering the webpage in a web browser. This helps to prevent the
-  // JavaScript from escaping characters incorrectly, and introducing  bad paths
-  // in the URLs.
-  std::string RootPathEscaped = RootPath.str().str();
-  llvm::replace(RootPathEscaped, '\\', '/');
-  OS << "var RootPath = \"" << RootPathEscaped << "\";\n";
-
-  llvm::SmallString<128> Base(CDCtx.Base);
-  std::string BaseEscaped = Base.str().str();
-  llvm::replace(BaseEscaped, '\\', '/');
-  OS << "var Base = \"" << BaseEscaped << "\";\n";
-
-  CDCtx.Idx.sort();
-  llvm::json::OStream J(OS, 2);
-  std::function<void(Index)> IndexToJSON = [&](const Index &I) {
-    J.object([&] {
-      J.attribute("USR", toHex(llvm::toStringRef(I.USR)));
-      J.attribute("Name", I.Name);
-      J.attribute("RefType", getRefType(I.RefType));
-      J.attribute("Path", I.getRelativeFilePath(""));
-      J.attributeArray("Children", [&] {
-        for (const Index &C : I.Children)
-          IndexToJSON(C);
-      });
-    });
-  };
-  OS << "async function LoadIndex() {\nreturn";
-  IndexToJSON(CDCtx.Idx);
-  OS << ";\n}";
-  return llvm::Error::success();
+  return Error::success();
 }
 
-// Generates a main HTML node that has the main content of the file that shows
-// only the general index
-// It contains the general index with links to all the generated files
-static std::unique_ptr<TagNode> genIndexFileMainNode() {
-  auto MainNode = std::make_unique<TagNode>(HTMLTag::TAG_MAIN);
-
-  auto LeftSidebarNode = std::make_unique<TagNode>(HTMLTag::TAG_DIV);
-  LeftSidebarNode->Attributes.emplace_back("id", "sidebar-left");
-  LeftSidebarNode->Attributes.emplace_back("path", "");
-  LeftSidebarNode->Attributes.emplace_back(
-      "class", "col-xs-6 col-sm-3 col-md-2 sidebar sidebar-offcanvas-left");
-  LeftSidebarNode->Attributes.emplace_back("style", "flex: 0 100%;");
-
-  MainNode->Children.emplace_back(std::move(LeftSidebarNode));
-
-  return MainNode;
+Error HTMLGenerator::createResources(ClangDocContext &CDCtx) {
+  std::string ResourcePath(CDCtx.OutDirectory + "/html");
+  for (const auto &FilePath : CDCtx.UserStylesheets)
+    if (Error Err = copyFile(FilePath, ResourcePath))
+      return Err;
+  for (const auto &FilePath : CDCtx.JsScripts)
+    if (Error Err = copyFile(FilePath, ResourcePath))
+      return Err;
+  return Error::success();
 }
 
-static llvm::Error genIndex(const ClangDocContext &CDCtx) {
-  std::error_code FileErr, OK;
-  llvm::SmallString<128> IndexPath;
-  llvm::sys::path::native(CDCtx.OutDirectory, IndexPath);
-  llvm::sys::path::append(IndexPath, "index.html");
-  llvm::raw_fd_ostream IndexOS(IndexPath, FileErr, llvm::sys::fs::OF_Text);
-  if (FileErr != OK) {
-    return llvm::createStringError(llvm::inconvertibleErrorCode(),
-                                   "error creating main index: " +
-                                       FileErr.message());
-  }
-
-  HTMLFile F;
-
-  std::vector<std::unique_ptr<TagNode>> HeadNodes =
-      genFileHeadNodes("Index", "", CDCtx);
-  std::unique_ptr<TagNode> HeaderNode = genFileHeaderNode(CDCtx.ProjectName);
-  std::unique_ptr<TagNode> MainNode = genIndexFileMainNode();
-  std::unique_ptr<TagNode> FooterNode = genFileFooterNode();
-
-  appendVector(std::move(HeadNodes), F.Children);
-  F.Children.emplace_back(std::move(HeaderNode));
-  F.Children.emplace_back(std::move(MainNode));
-  F.Children.emplace_back(std::move(FooterNode));
-
-  F.render(IndexOS);
-
-  return llvm::Error::success();
+Error HTMLGenerator::generateDocumentation(
+    StringRef RootDir, llvm::StringMap<std::unique_ptr<doc::Info>> Infos,
+    const ClangDocContext &CDCtx, std::string DirName) {
+  return MustacheGenerator::generateDocumentation(RootDir, std::move(Infos),
+                                                  CDCtx, "html");
 }
 
-llvm::Error HTMLGenerator::createResources(ClangDocContext &CDCtx) {
-  auto Err = serializeIndex(CDCtx);
-  if (Err)
-    return Err;
-  Err = genIndex(CDCtx);
-  if (Err)
-    return Err;
-
-  for (const auto &FilePath : CDCtx.UserStylesheets) {
-    Err = copyFile(FilePath, CDCtx.OutDirectory);
-    if (Err)
-      return Err;
-  }
-  for (const auto &FilePath : CDCtx.JsScripts) {
-    Err = copyFile(FilePath, CDCtx.OutDirectory);
-    if (Err)
-      return Err;
-  }
-  return llvm::Error::success();
-}
+const char *HTMLGenerator::Format = "html";
 
-static GeneratorRegistry::Add<HTMLGenerator> HTML(HTMLGenerator::Format,
-                                                  "Generator for HTML output.");
+static GeneratorRegistry::Add<HTMLGenerator>
+    HTML(HTMLGenerator::Format, "Generator for mustache HTML output.");
 
 // This anchor is used to force the linker to link in the generated object
 // file and thus register the generator.
diff --git a/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp b/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp
deleted file mode 100644
index d33b77feb84be..0000000000000
--- a/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp
+++ /dev/null
@@ -1,179 +0,0 @@
-///===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file contains the implementation of the MustacheHTMLGenerator class,
-/// which is Clang-Doc generator for HTML using Mustache templates.
-///
-//===----------------------------------------------------------------------===//
-
-#include "Generators.h"
-#include "Representation.h"
-#include "support/File.h"
-#include "clang/Basic/Diagnostic.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/Path.h"
-
-using namespace llvm;
-using namespace llvm::json;
-using namespace llvm::mustache;
-
-namespace clang {
-namespace doc {
-
-static std::unique_ptr<MustacheTemplateFile> NamespaceTemplate = nullptr;
-
-static std::unique_ptr<MustacheTemplateFile> RecordTemplate = nullptr;
-
-class MustacheHTMLGenerator : public MustacheGenerator {
-public:
-  static const char *Format;
-  Error createResources(ClangDocContext &CDCtx) override;
-  Error generateDocForInfo(Info *I, raw_ostream &OS,
-                           const ClangDocContext &CDCtx) override;
-  Error setupTemplateFiles(const ClangDocContext &CDCtx) override;
-  Error generateDocForJSON(json::Value &JSON, raw_fd_ostream &OS,
-                           const ClangDocContext &CDCtx, StringRef ObjTypeStr,
-                           StringRef RelativeRootPath) override;
-  // Populates templates with CSS stylesheets, JS scripts paths.
-  Error setupTemplateResources(const ClangDocContext &CDCtx, json::Value &V,
-                               SmallString<128> RelativeRootPath);
-  llvm::Error generateDocumentation(
-      StringRef RootDir, llvm::StringMap<std::unique_ptr<doc::Info>> Infos,
-      const ClangDocContext &CDCtx, std::string DirName) override;
-};
-
-Error MustacheHTMLGenerator::setupTemplateFiles(const ClangDocContext &CDCtx) {
-  // Template files need to use the native path when they're opened,
-  // but have to be used in POSIX style when used in HTML.
-  auto ConvertToNative = [](std::string &&Path) -> std::string {
-    SmallString<128> PathBuf(Path);
-    llvm::sys::path::native(PathBuf);
-    return PathBuf.str().str();
-  };
-
-  std::string NamespaceFilePath =
-      ConvertToNative(CDCtx.MustacheTemplates.lookup("namespace-template"));
-  std::string ClassFilePath =
-      ConvertToNative(CDCtx.MustacheTemplates.lookup("class-template"));
-  std::string CommentFilePath =
-      ConvertToNative(CDCtx.MustacheTemplates.lookup("comment-template"));
-  std::string FunctionFilePath =
-      ConvertToNative(CDCtx.MustacheTemplates.lookup("function-template"));
-  std::string EnumFilePath =
-      ConvertToNative(CDCtx.MustacheTemplates.lookup("enum-template"));
-  std::vector<std::pair<StringRef, StringRef>> Partials = {
-      {"Comments", CommentFilePath},
-      {"FunctionPartial", FunctionFilePath},
-      {"EnumPartial", EnumFilePath}};
-
-  if (Error Err = setupTemplate(NamespaceTemplate, NamespaceFilePath, Partials))
-    return Err;
-
-  if (Error Err = setupTemplate(RecordTemplate, ClassFilePath, Partials))
-    return Err;
-
-  return Error::success();
-}
-
-Error MustacheHTMLGenerator::setupTemplateResources(
-    const ClangDocContext &CDCtx, json::Value &V,
-    SmallString<128> RelativeRootPath) {
-  V.getAsObject()->insert({"ProjectName", CDCtx.ProjectName});
-  json::Value StylesheetArr = Array();
-  sys::path::native(RelativeRootPath, sys::path::Style::posix);
-
-  auto *SSA = StylesheetArr.getAsArray();
-  SSA->reserve(CDCtx.UserStylesheets.size());
-  for (const auto &FilePath : CDCtx.UserStylesheets) {
-    SmallString<128> StylesheetPath = RelativeRootPath;
-    sys::path::append(StylesheetPath, sys::path::Style::posix,
-                      sys::path::filename(FilePath));
-    SSA->emplace_back(StylesheetPath);
-  }
-  V.getAsObject()->insert({"Stylesheets", StylesheetArr});
-
-  json::Value ScriptArr = Array();
-  auto *SCA = ScriptArr.getAsArray();
-  SCA->reserve(CDCtx.JsScripts.size());
-  for (auto Script : CDCtx.JsScripts) {
-    SmallString<128> JsPath = RelativeRootPath;
-    sys::path::append(JsPath, sys::path::Style::posix,
-                      sys::path::filename(Script));
-    SCA->emplace_back(JsPath);
-  }
-  V.getAsObject()->insert({"Scripts", ScriptArr});
-  return Error::success();
-}
-
-Error MustacheHTMLGenerator::generateDocForJSON(json::Value &JSON,
-                                                raw_fd_ostream &OS,
-                                                const ClangDocContext &CDCtx,
-                                                StringRef ObjTypeStr,
-                                                StringRef RelativeRootPath) {
-  if (ObjTypeStr == "namespace") {
-    if (auto Err = setupTemplateResources(CDCtx, JSON, RelativeRootPath))
-      return Err;
-    assert(NamespaceTemplate && "NamespaceTemplate is nullptr.");
-    NamespaceTemplate->render(JSON, OS);
-  } else if (ObjTypeStr == "record") {
-    if (auto Err = setupTemplateResources(CDCtx, JSON, RelativeRootPath))
-      return Err;
-    assert(RecordTemplate && "RecordTemplate is nullptr.");
-    RecordTemplate->render(JSON, OS);
-  }
-  return Error::success();
-}
-
-Error MustacheHTMLGenerator::generateDocForInfo(Info *I, raw_ostream &OS,
-                                                const ClangDocContext &CDCtx) {
-  switch (I->IT) {
-  case InfoType::IT_enum:
-  case InfoType::IT_function:
-  case InfoType::IT_typedef:
-  case InfoType::IT_namespace:
-  case InfoType::IT_record:
-  case InfoType::IT_concept:
-  case InfoType::IT_variable:
-  case InfoType::IT_friend:
-    break;
-  case InfoType::IT_default:
-    return createStringError(inconvertibleErrorCode(), "unexpected InfoType");
-  }
-  return Error::success();
-}
-
-Error MustacheHTMLGenerator::createResources(ClangDocContext &CDCtx) {
-  std::string ResourcePath(CDCtx.OutDirectory + "/html");
-  for (const auto &FilePath : CDCtx.UserStylesheets)
-    if (Error Err = copyFile(FilePath, ResourcePath))
-      return Err;
-  for (const auto &FilePath : CDCtx.JsScripts)
-    if (Error Err = copyFile(FilePath, ResourcePath))
-      return Err;
-  return Error::success();
-}
-
-Error MustacheHTMLGenerator::generateDocumentation(
-    StringRef RootDir, llvm::StringMap<std::unique_ptr<doc::Info>> Infos,
-    const ClangDocContext &CDCtx, std::string DirName) {
-  return MustacheGenerator::generateDocumentation(RootDir, std::move(Infos),
-                                                  CDCtx, "html");
-}
-
-const char *MustacheHTMLGenerator::Format = "mustache";
-
-static GeneratorRegistry::Add<MustacheHTMLGenerator>
-    MHTML(MustacheHTMLGenerator::Format, "Generator for mustache HTML output.");
-
-// This anchor is used to force the linker to link in the generated object
-// file and thus register the generator.
-volatile int MHTMLGeneratorAnchorSource = 0;
-
-} // namespace doc
-} // namespace clang
diff --git a/clang-tools-extra/clang-doc/support/Utils.cpp b/clang-tools-extra/clang-doc/support/Utils.cpp
index 897a7ad0adb79..f410bfcf956d4 100644
--- a/clang-tools-extra/clang-doc/support/Utils.cpp
+++ b/clang-tools-extra/clang-doc/support/Utils.cpp
@@ -28,8 +28,7 @@ SmallString<128> appendPathPosix(StringRef Base, StringRef Path) {
   return Default;
 }
 
-void getMustacheHtmlFiles(StringRef AssetsPath,
-                          clang::doc::ClangDocContext &CDCtx) {
+void getHtmlFiles(StringRef AssetsPath, clang::doc::ClangDocContext &CDCtx) {
   assert(!AssetsPath.empty());
   assert(sys::fs::is_directory(AssetsPath));
 
diff --git a/clang-tools-extra/clang-doc/support/Utils.h b/clang-tools-extra/clang-doc/support/Utils.h
index 8161c37503f81..f4ed9ec42dce4 100644
--- a/clang-tools-extra/clang-doc/support/Utils.h
+++ b/clang-tools-extra/clang-doc/support/Utils.h
@@ -20,7 +20,7 @@ llvm::SmallString<128> appendPathNative(llvm::StringRef Base,
 llvm::SmallString<128> appendPathPosix(llvm::StringRef Base,
                                        llvm::StringRef Path);
 
-void getMustacheHtmlFiles(llvm::StringRef AssetsPath,
-                          clang::doc::ClangDocContext &CDCtx);
+void getHtmlFiles(llvm::StringRef AssetsPath,
+                  clang::doc::ClangDocContext &CDCtx);
 
 #endif
diff --git a/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp b/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
index cb98c200442a6..ee4c449718871 100644
--- a/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
+++ b/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
@@ -111,21 +111,20 @@ Turn on time profiler. Generates clang-doc-tracing.json)"),
                                       llvm::cl::init(false),
                                       llvm::cl::cat(ClangDocCategory));
 
-enum OutputFormatTy { md, yaml, html, mustache, json };
-
-static llvm::cl::opt<OutputFormatTy> FormatEnum(
-    "format", llvm::cl::desc("Format for outputted docs."),
-    llvm::cl::values(clEnumValN(OutputFormatTy::yaml, "yaml",
-                                "Documentation in YAML format."),
-                     clEnumValN(OutputFormatTy::md, "md",
-                                "Documentation in MD format."),
-                     clEnumValN(OutputFormatTy::html, "html",
-                                "Documentation in HTML format."),
-                     clEnumValN(OutputFormatTy::mustache, "mustache",
-                                "Documentation in mustache HTML format"),
-                     clEnumValN(OutputFormatTy::json, "json",
-                                "Documentation in JSON format")),
-    llvm::cl::init(OutputFormatTy::yaml), llvm::cl::cat(ClangDocCategory));
+enum OutputFormatTy { md, yaml, html, json };
+
+static llvm::cl::opt<OutputFormatTy>
+    FormatEnum("format", llvm::cl::desc("Format for outputted docs."),
+               llvm::cl::values(clEnumValN(OutputFormatTy::yaml, "yaml",
+                                           "Documentation in YAML format."),
+                                clEnumValN(OutputFormatTy::md, "md",
+                                           "Documentation in MD format."),
+                                clEnumValN(OutputFormatTy::html, "html",
+                                           "Documentation in HTML format."),
+                                clEnumValN(OutputFormatTy::json, "json",
+                                           "Documentation in JSON format")),
+               llvm::cl::init(OutputFormatTy::yaml),
+               llvm::cl::cat(ClangDocCategory));
 
 static llvm::ExitOnError ExitOnErr;
 
@@ -137,8 +136,6 @@ static std::string getFormatString() {
     return "md";
   case OutputFormatTy::html:
     return "html";
-  case OutputFormatTy::mustache:
-    return "mustache";
   case OutputFormatTy::json:
     return "json";
   }
@@ -175,61 +172,12 @@ static llvm::Error getAssetFiles(clang::doc::ClangDocContext &CDCtx) {
   return llvm::Error::success();
 }
 
-static llvm::Error getDefaultAssetFiles(const char *Argv0,
-                                        clang::doc::ClangDocContext &CDCtx) {
-  void *MainAddr = (void *)(intptr_t)getExecutablePath;
-  std::string ClangDocPath = getExecutablePath(Argv0, MainAddr);
-  llvm::SmallString<128> NativeClangDocPath;
-  llvm::sys::path::native(ClangDocPath, NativeClangDocPath);
-
-  llvm::SmallString<128> AssetsPath;
-  AssetsPath = llvm::sys::path::parent_path(NativeClangDocPath);
-  llvm::sys::path::append(AssetsPath, "..", "share", "clang-doc");
-  llvm::SmallString<128> DefaultStylesheet =
-      appendPathNative(AssetsPath, "clang-doc-default-stylesheet.css");
-  llvm::SmallString<128> IndexJS = appendPathNative(AssetsPath, "index.js");
-
-  if (!llvm::sys::fs::is_regular_file(IndexJS))
-    return llvm::createStringError(llvm::inconvertibleErrorCode(),
-                                   "default index.js file missing at " +
-                                       IndexJS + "\n");
-
-  if (!llvm::sys::fs::is_regular_file(DefaultStylesheet))
-    return llvm::createStringError(
-        llvm::inconvertibleErrorCode(),
-        "default clang-doc-default-stylesheet.css file missing at " +
-            DefaultStylesheet + "\n");
-
-  CDCtx.UserStylesheets.insert(CDCtx.UserStylesheets.begin(),
-                               std::string(DefaultStylesheet));
-  CDCtx.JsScripts.emplace_back(IndexJS.str());
-
-  return llvm::Error::success();
-}
-
-static llvm::Error getHtmlAssetFiles(const char *Argv0,
-                                     clang::doc::ClangDocContext &CDCtx) {
-  if (!UserAssetPath.empty() &&
-      !llvm::sys::fs::is_directory(std::string(UserAssetPath))) {
-    unsigned ID = CDCtx.Diags.getCustomDiagID(
-        DiagnosticsEngine::Warning,
-        "Asset path supply is not a directory: %0 falling back to default");
-    CDCtx.Diags.Report(ID) << UserAssetPath;
-  }
-  if (llvm::sys::fs::is_directory(std::string(UserAssetPath)))
-    return getAssetFiles(CDCtx);
-  return getDefaultAssetFiles(Argv0, CDCtx);
-}
-
-static llvm::Error getMustacheHtmlFiles(const char *Argv0,
-                                        clang::doc::ClangDocContext &CDCtx) {
+static llvm::Error getHtmlFiles(const char *Argv0,
+                                clang::doc::ClangDocContext &CDCtx) {
   bool IsDir = llvm::sys::fs::is_directory(UserAssetPath);
-  if (!UserAssetPath.empty() && !IsDir) {
-    unsigned ID = CDCtx.Diags.getCustomDiagID(
-        DiagnosticsEngine::Note,
-        "Asset path supply is not a directory: %0 falling back to default");
-    CDCtx.Diags.Report(ID) << UserAssetPath;
-  }
+  if (!UserAssetPath.empty() && !IsDir)
+    llvm::outs() << "Asset path supply is not a directory: " << UserAssetPath
+                 << " falling back to default\n";
   if (IsDir) {
     if (auto Err = getAssetFiles(CDCtx))
       return Err;
@@ -243,7 +191,7 @@ static llvm::Error getMustacheHtmlFiles(const char *Argv0,
   AssetsPath = llvm::sys::path::parent_path(NativeClangDocPath);
   llvm::sys::path::append(AssetsPath, "..", "share", "clang-doc");
 
-  getMustacheHtmlFiles(AssetsPath, CDCtx);
+  getHtmlFiles(AssetsPath, CDCtx);
 
   return llvm::Error::success();
 }
@@ -337,11 +285,8 @@ Example usage for a project using a compile commands database:
         SourceRoot, RepositoryUrl, RepositoryCodeLinePrefix, BaseDirectory,
         {UserStylesheets.begin(), UserStylesheets.end()}, Diags, FTimeTrace);
 
-    if (Format == "html") {
-      ExitOnErr(getHtmlAssetFiles(argv[0], CDCtx));
-    } else if (Format == "mustache") {
-      ExitOnErr(getMustacheHtmlFiles(argv[0], CDCtx));
-    }
+    if (Format == "html")
+      ExitOnErr(getHtmlFiles(argv[0], CDCtx));
 
     llvm::timeTraceProfilerBegin("Executor Launch", "total runtime");
     // Mapping phase
diff --git a/clang-tools-extra/test/clang-doc/DR-131697.cpp b/clang-tools-extra/test/clang-doc/DR-131697.cpp
index 06168e6642f62..9025bbf910813 100644
--- a/clang-tools-extra/test/clang-doc/DR-131697.cpp
+++ b/clang-tools-extra/test/clang-doc/DR-131697.cpp
@@ -1,7 +1,6 @@
 // RUN: rm -rf %t && mkdir -p %t
 // RUN: split-file %s %t
 // RUN: clang-doc -format=html %t/compile_commands.json %t/main.cpp
-// RUN: clang-doc -format=mustache %t/compile_commands.json %t/main.cpp
 
 //--- main.cpp
 
diff --git a/clang-tools-extra/test/clang-doc/assets.cpp b/clang-tools-extra/test/clang-doc/assets.cpp
index 9acb64a10b4fe..853dfe53d09f0 100644
--- a/clang-tools-extra/test/clang-doc/assets.cpp
+++ b/clang-tools-extra/test/clang-doc/assets.cpp
@@ -1,24 +1,8 @@
 // RUN: rm -rf %t && mkdir %t
 // RUN: clang-doc --format=html --output=%t --asset=%S/Inputs/test-assets --executor=standalone %s --base base_dir
-// RUN: clang-doc --format=mustache --output=%t --asset=%S/Inputs/test-assets --executor=standalone %s --base base_dir
-// RUN: FileCheck %s -input-file=%t/index.html -check-prefix=INDEX
-// RUN: FileCheck %s -input-file=%t/test.css -check-prefix=CSS
-// RUN: FileCheck %s -input-file=%t/test.js -check-prefix=JS
-
 // RUN: FileCheck %s -input-file=%t/html/test.css -check-prefix=CSS
 // RUN: FileCheck %s -input-file=%t/html/test.js -check-prefix=JS
 
-// INDEX: <!DOCTYPE html>
-// INDEX-NEXT: <meta charset="utf-8"/>
-// INDEX-NEXT: <title>Index</title>
-// INDEX-NEXT: <link rel="stylesheet" href="test.css"/>
-// INDEX-NEXT: <script src="index_json.js"></script>
-// INDEX-NEXT: <script src="test.js"></script>
-// INDEX-NEXT: <header id="project-title"></header>
-// INDEX-NEXT: <main>
-// INDEX-NEXT:   <div id="sidebar-left" path="" class="col-xs-6 col-sm-3 col-md-2 sidebar sidebar-offcanvas-left" style="flex: 0 100%;"></div>
-// INDEX-NEXT: </main>
-
 // CSS: body {
 // CSS-NEXT:     padding: 0;
 // CSS-NEXT: }
diff --git a/clang-tools-extra/test/clang-doc/basic-project.mustache.test b/clang-tools-extra/test/clang-doc/basic-project.mustache.test
index 9f7de6e689313..282ca73384c3f 100644
--- a/clang-tools-extra/test/clang-doc/basic-project.mustache.test
+++ b/clang-tools-extra/test/clang-doc/basic-project.mustache.test
@@ -1,7 +1,7 @@
 // RUN: rm -rf %t && mkdir -p %t/docs %t/build
 // RUN: sed 's|$test_dir|%/S|g' %S/Inputs/basic-project/database_template.json > %t/build/compile_commands.json
 
-// RUN: clang-doc --format=mustache --output=%t/docs --executor=all-TUs %t/build/compile_commands.json
+// RUN: clang-doc --format=html --output=%t/docs --executor=all-TUs %t/build/compile_commands.json
 // RUN: FileCheck %s -input-file=%t/docs/html/GlobalNamespace/_ZTV5Shape.html -check-prefix=HTML-SHAPE
 // RUN: FileCheck %s -input-file=%t/docs/html/GlobalNamespace/_ZTV10Calculator.html -check-prefix=HTML-CALC
 // RUN: FileCheck %s -input-file=%t/docs/html/GlobalNamespace/_ZTV9Rectangle.html -check-prefix=HTML-RECTANGLE
diff --git a/clang-tools-extra/test/clang-doc/basic-project.test b/clang-tools-extra/test/clang-doc/basic-project.test
index 9c1ed29973d79..9220dc6974508 100644
--- a/clang-tools-extra/test/clang-doc/basic-project.test
+++ b/clang-tools-extra/test/clang-doc/basic-project.test
@@ -1,31 +1,6 @@
 // RUN: rm -rf %t && mkdir -p %t/docs %t/build
 // RUN: sed 's|$test_dir|%/S|g' %S/Inputs/basic-project/database_template.json > %t/build/compile_commands.json
 
-// RUN: clang-doc --format=html --output=%t/docs --executor=all-TUs %t/build/compile_commands.json
-// RUN: FileCheck %s -input-file=%t/docs/index_json.js -check-prefix=JSON-INDEX
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Shape.html -check-prefix=HTML-SHAPE
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Calculator.html -check-prefix=HTML-CALC
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Rectangle.html -check-prefix=HTML-RECTANGLE
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Circle.html -check-prefix=HTML-CIRCLE
-
-// RUN: clang-doc --format=html --output=%t/docs-with-prefix --executor=all-TUs %t/build/compile_commands.json --repository=https://repository.com --repository-line-prefix=L
-// RUN: FileCheck %s -input-file=%t/docs-with-prefix/GlobalNamespace/Shape.html -check-prefixes=HTML-SHAPE,SHAPE-LINE-PREFIX
-// RUN: FileCheck %s -input-file=%t/docs-with-prefix/GlobalNamespace/Calculator.html -check-prefixes=HTML-CALC,CALC-LINE-PREFIX
-// RUN: FileCheck %s -input-file=%t/docs-with-prefix/GlobalNamespace/Rectangle.html -check-prefixes=HTML-RECTANGLE,RECTANGLE-LINE-PREFIX
-// RUN: FileCheck %s -input-file=%t/docs-with-prefix/GlobalNamespace/Circle.html -check-prefixes=HTML-CIRCLE,CIRCLE-LINE-PREFIX
-
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Shape.html -check-prefixes=HTML-SHAPE,SHAPE-NO-REPOSITORY
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Calculator.html -check-prefixes=HTML-CALC,CALC-NO-REPOSITORY
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Rectangle.html -check-prefixes=HTML-RECTANGLE,RECTANGLE-NO-REPOSITORY
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Circle.html -check-prefixes=HTML-CIRCLE,CIRCLE-NO-REPOSITORY
-
-// RUN: clang-doc --format=html --output=%t/docs --executor=all-TUs %t/build/compile_commands.json --repository=https://repository.com
-// RUN: FileCheck %s -input-file=%t/docs/index_json.js -check-prefixes=JSON-INDEX
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Shape.html -check-prefixes=HTML-SHAPE,SHAPE-REPOSITORY
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Calculator.html -check-prefixes=HTML-CALC,CALC-REPOSITORY
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Rectangle.html -check-prefixes=HTML-RECTANGLE,RECTANGLE-REPOSITORY
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Circle.html -check-prefixes=HTML-CIRCLE,CIRCLE-REPOSITORY
-
 // RUN: clang-doc --format=md --output=%t/docs --executor=all-TUs %t/build/compile_commands.json
 // RUN: FileCheck %s -input-file=%t/docs/all_files.md -check-prefixes=MD-ALL-FILES
 // RUN: FileCheck %s -input-file=%t/docs/index.md -check-prefixes=MD-INDEX
@@ -81,248 +56,6 @@
 // JSON-INDEX-NEXT: };
 // JSON-INDEX-NEXT: }
 
-//      HTML-SHAPE: <h1>class Shape</h1>
-//   SHAPE-NO-REPOSITORY: <p>Defined at line 8 of file .{{.}}include{{.}}Shape.h</p>
-//      SHAPE-REPOSITORY: <p>
-// SHAPE-REPOSITORY-NEXT: Defined at line
-// SHAPE-REPOSITORY-NEXT: <a href="https://repository.com/./include/Shape.h#8">8</a>
-//     SHAPE-LINE-PREFIX: <a href="https://repository.com/./include/Shape.h#L8">8</a>
-// SHAPE-REPOSITORY-NEXT: of file
-// SHAPE-REPOSITORY-NEXT: <a href="https://repository.com/./include/Shape.h">Shape.h</a>
-// SHAPE-REPOSITORY-NEXT: </p>
-//      HTML-SHAPE: <div>brief</div>
-//      HTML-SHAPE: <p> Abstract base class for shapes.</p>
-//      HTML-SHAPE: <p> Provides a common interface for different types of shapes.</p>
-//      HTML-SHAPE: <h2 id="Functions">Functions</h2>
-//      HTML-SHAPE: <h3 id="{{([0-9A-F]{40})}}">area</h3>
-//      HTML-SHAPE: <p>public double area()</p>
-//      HTML-SHAPE: <div>brief</div>
-//      HTML-SHAPE: <p> Calculates the area of the shape.</p>
-//      HTML-SHAPE: <h3 id="{{([0-9A-F]{40})}}">perimeter</h3>
-//      HTML-SHAPE: <p>public double perimeter()</p>
-//      HTML-SHAPE: <div>brief</div>
-//      HTML-SHAPE: <p> Calculates the perimeter of the shape.</p>
-//      HTML-SHAPE: <div>return</div>
-//      HTML-SHAPE: <p> double The perimeter of the shape.</p>
-//      HTML-SHAPE: <h3 id="{{([0-9A-F]{40})}}">~Shape</h3>
-//      HTML-SHAPE: <p>public void ~Shape()</p>
-
-//   SHAPE-NO-REPOSITORY: Defined at line 13 of file .{{.}}include{{.}}Shape.h
-//      SHAPE-REPOSITORY: Defined at line 
-// SHAPE-REPOSITORY-NEXT: <a href="https://repository.com/./include/Shape.h#13">13</a>
-//     SHAPE-LINE-PREFIX: <a href="https://repository.com/./include/Shape.h#L13">13</a>
-// SHAPE-REPOSITORY-NEXT: of file 
-// SHAPE-REPOSITORY-NEXT: <a href="https://repository.com/./include/Shape.h">Shape.h</a>
-
-//      HTML-SHAPE: <div>brief</div>
-//      HTML-SHAPE: <p> Virtual destructor.</p>
-
-//      HTML-CALC: <h1>class Calculator</h1>
-// CALC-NO-REPOSITORY: <p>Defined at line 8 of file .{{.}}include{{.}}Calculator.h</p>
-//      CALC-REPOSITORY: <p>
-// CALC-REPOSITORY-NEXT: Defined at line 
-// CALC-REPOSITORY-NEXT: <a href="https://repository.com/./include/Calculator.h#8">8</a>
-//     CALC-LINE-PREFIX: <a href="https://repository.com/./include/Calculator.h#L8">8</a>
-// CALC-REPOSITORY-NEXT: of file 
-// CALC-REPOSITORY-NEXT: <a href="https://repository.com/./include/Calculator.h">Calculator.h</a>
-// CALC-REPOSITORY-NEXT: </p>
-//      HTML-CALC: <div>brief</div>
-//      HTML-CALC: <p> A simple calculator class.</p>
-//      HTML-CALC: <p> Provides basic arithmetic operations.</p>
-
-//      HTML-CALC: <h2 id="Members">Members</h2>
-//      HTML-CALC: <div>brief</div>
-//      HTML-CALC: <p> Holds a public value.</p>
-//      HTML-CALC: <div>public int public_val</div>
-//      HTML-CALC: <div>brief</div>
-//      HTML-CALC: <p> A static value.</p>
-//      HTML-CALC: <div>public static const int static_val</div>
-
-//      HTML-CALC: <h2 id="Functions">Functions</h2>
-//      HTML-CALC: <h3 id="{{([0-9A-F]{40})}}">add</h3>
-//      HTML-CALC: <p>public int add(int a, int b)</p>
-//   CALC-NO-REPOSITORY: Defined at line 3 of file .{{.}}src{{.}}Calculator.cpp
-//      CALC-REPOSITORY: Defined at line 
-// CALC-REPOSITORY-NEXT: <a href="https://repository.com/./src/Calculator.cpp#3">3</a>
-//     CALC-LINE-PREFIX: <a href="https://repository.com/./src/Calculator.cpp#L3">3</a>
-// CALC-REPOSITORY-NEXT: of file 
-// CALC-REPOSITORY-NEXT: <a href="https://repository.com/./src/Calculator.cpp">Calculator.cpp</a>
-
-//      HTML-CALC: <div>brief</div>
-//      HTML-CALC: <p> Adds two integers.</p>
-//      HTML-CALC: <div>return</div>
-//      HTML-CALC: <p> int The sum of a and b.</p>
-//      HTML-CALC: <h3 id="{{([0-9A-F]{40})}}">subtract</h3>
-//      HTML-CALC: <p>public int subtract(int a, int b)</p>
-//   CALC-NO-REPOSITORY: Defined at line 7 of file .{{.}}src{{.}}Calculator.cpp
-//      CALC-REPOSITORY: Defined at line 
-// CALC-REPOSITORY-NEXT: <a href="https://repository.com/./src/Calculator.cpp#7">7</a>
-//     CALC-LINE-PREFIX: <a href="https://repository.com/./src/Calculator.cpp#L7">7</a>
-// CALC-REPOSITORY-NEXT: of file 
-// CALC-REPOSITORY-NEXT: <a href="https://repository.com/./src/Calculator.cpp">Calculator.cpp</a>
-
-//      HTML-CALC: <div>brief</div>
-//      HTML-CALC: <p> Subtracts the second integer from the first.</p>
-//      HTML-CALC: <div>return</div>
-//      HTML-CALC: <p> int The result of a - b.</p>
-//      HTML-CALC: <h3 id="{{([0-9A-F]{40})}}">multiply</h3>
-//      HTML-CALC: <p>public int multiply(int a, int b)</p>
-//   CALC-NO-REPOSITORY: Defined at line 11 of file .{{.}}src{{.}}Calculator.cpp
-//      CALC-REPOSITORY: Defined at line 
-// CALC-REPOSITORY-NEXT: <a href="https://repository.com/./src/Calculator.cpp#11">11</a>
-//     CALC-LINE-PREFIX: <a href="https://repository.com/./src/Calculator.cpp#L11">11</a>
-// CALC-REPOSITORY-NEXT: of file 
-// CALC-REPOSITORY-NEXT: <a href="https://repository.com/./src/Calculator.cpp">Calculator.cpp</a>
-
-//      HTML-CALC: <div>brief</div>
-//      HTML-CALC: <p> Multiplies two integers.</p>
-//      HTML-CALC: <div>return</div>
-//      HTML-CALC: <p> int The product of a and b.</p>
-//      HTML-CALC: <h3 id="{{([0-9A-F]{40})}}">divide</h3>
-//      HTML-CALC: <p>public double divide(int a, int b)</p>
-//   CALC-NO-REPOSITORY: Defined at line 15 of file .{{.}}src{{.}}Calculator.cpp
-//      CALC-REPOSITORY: Defined at line 
-// CALC-REPOSITORY-NEXT: <a href="https://repository.com/./src/Calculator.cpp#15">15</a>
-//     CALC-LINE-PREFIX: <a href="https://repository.com/./src/Calculator.cpp#L15">15</a>
-// CALC-REPOSITORY-NEXT: of file 
-// CALC-REPOSITORY-NEXT: <a href="https://repository.com/./src/Calculator.cpp">Calculator.cpp</a>
-
-//      HTML-CALC: <div>brief</div>
-//      HTML-CALC: <p> Divides the first integer by the second.</p>
-//      HTML-CALC: <div>return</div>
-//      HTML-CALC: <p> double The result of a / b.</p>
-//      HTML-CALC: <div>throw</div>
-//      HTML-CALC: <p>if b is zero.</p>
-
-//      HTML-CALC: <p>public static int mod(int a, int b)</p>
-//   CALC-NO-REPOSITORY: Defined at line 54 of file .{{.}}include{{.}}Calculator.h
-//      CALC-REPOSITORY: Defined at line 
-// CALC-REPOSITORY-NEXT: <a href="https://repository.com/./include/Calculator.h#54">54</a>
-//     CALC-LINE-PREFIX: <a href="https://repository.com/./include/Calculator.h#L54">54</a>
-// CALC-REPOSITORY-NEXT: of file 
-// CALC-REPOSITORY-NEXT: <a href="https://repository.com/./include/Calculator.h">Calculator.h</a>
-//      HTML-CALC: <div>brief</div>
-//      HTML-CALC: <p> Performs the mod operation on integers.</p>
-//      HTML-CALC: <div>return</div>
-//      HTML-CALC: <p> The result of a % b.</p>
-
-//      HTML-RECTANGLE: <h1>class Rectangle</h1>
-// RECTANGLE-NO-REPOSITORY: <p>Defined at line 10 of file .{{.}}include{{.}}Rectangle.h</p>
-//      RECTANGLE-REPOSITORY: <p>
-// RECTANGLE-REPOSITORY-NEXT: Defined at line
-// RECTANGLE-REPOSITORY-NEXT: <a href="https://repository.com/./include/Rectangle.h#10">10</a>
-//     RECTANGLE-LINE-PREFIX: <a href="https://repository.com/./include/Rectangle.h#L10">10</a>
-// RECTANGLE-REPOSITORY-NEXT: of file 
-// RECTANGLE-REPOSITORY-NEXT: <a href="https://repository.com/./include/Rectangle.h">Rectangle.h</a>
-// RECTANGLE-REPOSITORY-NEXT: </p>
-
-//      HTML-RECTANGLE: <p> Represents a rectangle with a given width and height.</p>
-//      HTML-RECTANGLE: <p>
-//      HTML-RECTANGLE:   Inherits from
-//      HTML-RECTANGLE:   <a href="Shape.html">Shape</a>
-//      HTML-RECTANGLE: </p>
-//      HTML-RECTANGLE: <h2 id="Members">Members</h2>
-//      HTML-RECTANGLE: <p> Width of the rectangle.</p>
-//      HTML-RECTANGLE: <div>private double width_</div>
-//      HTML-RECTANGLE: <p> Height of the rectangle.</p>
-//      HTML-RECTANGLE: <div>private double height_</div>
-//      HTML-RECTANGLE: <h2 id="Functions">Functions</h2>
-//      HTML-RECTANGLE: <h3 id="{{([0-9A-F]{40})}}">Rectangle</h3>
-//      HTML-RECTANGLE: <p>public void Rectangle(double width, double height)</p>
-//   RECTANGLE-NO-REPOSITORY: Defined at line 3 of file .{{.}}src{{.}}Rectangle.cpp
-//      RECTANGLE-REPOSITORY: Defined at line
-// RECTANGLE-REPOSITORY-NEXT: <a href="https://repository.com/./src/Rectangle.cpp#3">3</a>
-//     RECTANGLE-LINE-PREFIX: <a href="https://repository.com/./src/Rectangle.cpp#L3">3</a>
-// RECTANGLE-REPOSITORY-NEXT: of file 
-// RECTANGLE-REPOSITORY-NEXT: <a href="https://repository.com/./src/Rectangle.cpp">Rectangle.cpp</a>
-
-//      HTML-RECTANGLE: <div>brief</div>
-//      HTML-RECTANGLE: <p> Constructs a new Rectangle object.</p>
-//      HTML-RECTANGLE: <h3 id="{{([0-9A-F]{40})}}">area</h3>
-//      HTML-RECTANGLE: <p>public double area()</p>
-//   RECTANGLE-NO-REPOSITORY: Defined at line 6 of file .{{.}}src{{.}}Rectangle.cpp
-//      RECTANGLE-REPOSITORY: Defined at line
-// RECTANGLE-REPOSITORY-NEXT: <a href="https://repository.com/./src/Rectangle.cpp#6">6</a>
-//     RECTANGLE-LINE-PREFIX: <a href="https://repository.com/./src/Rectangle.cpp#L6">6</a>
-// RECTANGLE-REPOSITORY-NEXT: of file 
-// RECTANGLE-REPOSITORY-NEXT: <a href="https://repository.com/./src/Rectangle.cpp">Rectangle.cpp</a>
-
-//      HTML-RECTANGLE: <div>brief</div>
-//      HTML-RECTANGLE: <p> Calculates the area of the rectangle.</p>
-//      HTML-RECTANGLE: <div>return</div>
-//      HTML-RECTANGLE: <p> double The area of the rectangle.</p>
-//      HTML-RECTANGLE: <h3 id="{{([0-9A-F]{40})}}">perimeter</h3>
-//      HTML-RECTANGLE: <p>public double perimeter()</p>
-//   RECTANGLE-NO-REPOSITORY: Defined at line 10 of file .{{.}}src{{.}}Rectangle.cpp
-//      RECTANGLE-REPOSITORY: Defined at line
-// RECTANGLE-REPOSITORY-NEXT: <a href="https://repository.com/./src/Rectangle.cpp#10">10</a>
-// RECTANGLE-LINE-PREFIX: <a href="https://repository.com/./src/Rectangle.cpp#L10">10</a>
-// RECTANGLE-REPOSITORY-NEXT: of file 
-// RECTANGLE-REPOSITORY-NEXT: <a href="https://repository.com/./src/Rectangle.cpp">Rectangle.cpp</a>
-//      HTML-RECTANGLE: <div>brief</div>
-//      HTML-RECTANGLE: <p> Calculates the perimeter of the rectangle.</p>
-//      HTML-RECTANGLE: <div>return</div>
-//      HTML-RECTANGLE: <p> double The perimeter of the rectangle.</p>
-
-//      HTML-CIRCLE: <h1>class Circle</h1>
-// CIRCLE-NO-REPOSITORY: <p>Defined at line 10 of file .{{.}}include{{.}}Circle.h</p>
-//      CIRCLE-REPOSITORY: <p>
-// CIRCLE-REPOSITORY-NEXT: Defined at line 
-// CIRCLE-REPOSITORY-NEXT: <a href="https://repository.com/./include/Circle.h#10">10</a>
-//     CIRCLE-LINE-PREFIX: <a href="https://repository.com/./include/Circle.h#L10">10</a>
-// CIRCLE-REPOSITORY-NEXT: of file 
-// CIRCLE-REPOSITORY-NEXT: <a href="https://repository.com/./include/Circle.h">Circle.h</a>
-// CIRCLE-REPOSITORY-NEXT: </p>
-
-//      HTML-CIRCLE: <div>brief</div>
-//      HTML-CIRCLE: <p> Circle class derived from Shape.</p>
-//      HTML-CIRCLE: <p> Represents a circle with a given radius.</p>
-//      HTML-CIRCLE: <p>
-//      HTML-CIRCLE:   Inherits from
-//      HTML-CIRCLE:   <a href="Shape.html">Shape</a>
-//      HTML-CIRCLE: </p>
-//      HTML-CIRCLE: <h2 id="Members">Members</h2>
-//      HTML-CIRCLE: <p> Radius of the circle.</p>
-//      HTML-CIRCLE: <div>private double radius_</div>
-//      HTML-CIRCLE: <h2 id="Functions">Functions</h2>
-//      HTML-CIRCLE: <h3 id="{{([0-9A-F]{40})}}">Circle</h3>
-//      HTML-CIRCLE: <p>public void Circle(double radius)</p>
-//   CIRCLE-NO-REPOSITORY: Defined at line 3 of file .{{.}}src{{.}}Circle.cpp
-//      CIRCLE-REPOSITORY: Defined at line 
-// CIRCLE-REPOSITORY-NEXT: <a href="https://repository.com/./src/Circle.cpp#3">3</a>
-//     CIRCLE-LINE-PREFIX: <a href="https://repository.com/./src/Circle.cpp#L3">3</a>
-// CIRCLE-REPOSITORY-NEXT:  of file 
-// CIRCLE-REPOSITORY-NEXT: <a href="https://repository.com/./src/Circle.cpp">Circle.cpp</a>
-
-//      HTML-CIRCLE: <div>brief</div>
-//      HTML-CIRCLE: <p> Constructs a new Circle object.</p>
-//      HTML-CIRCLE: <h3 id="{{([0-9A-F]{40})}}">area</h3>
-//      HTML-CIRCLE: <p>public double area()</p>
-//   CIRCLE-NO-REPOSITORY: Defined at line 5 of file .{{.}}src{{.}}Circle.cpp
-//      CIRCLE-REPOSITORY: Defined at line 
-// CIRCLE-REPOSITORY-NEXT: <a href="https://repository.com/./src/Circle.cpp#5">5</a>
-//     CIRCLE-LINE-PREFIX: <a href="https://repository.com/./src/Circle.cpp#L5">5</a>
-// CIRCLE-REPOSITORY-NEXT:  of file 
-// CIRCLE-REPOSITORY-NEXT: <a href="https://repository.com/./src/Circle.cpp">Circle.cpp</a>
-
-//      HTML-CIRCLE: <div>brief</div>
-//      HTML-CIRCLE: <p> Calculates the area of the circle.</p>
-//      HTML-CIRCLE: <div>return</div>
-//      HTML-CIRCLE: <p> double The area of the circle.</p>
-//      HTML-CIRCLE: <h3 id="{{([0-9A-F]{40})}}">perimeter</h3>
-//      HTML-CIRCLE: <p>public double perimeter()</p>
-//   CIRCLE-NO-REPOSITORY: Defined at line  9 of file .{{.}}src{{.}}Circle.cpp
-//      CIRCLE-REPOSITORY: Defined at line 
-// CIRCLE-REPOSITORY-NEXT: <a href="https://repository.com/./src/Circle.cpp#9">9</a>
-//     CIRCLE-LINE-PREFIX: <a href="https://repository.com/./src/Circle.cpp#L9">9</a>
-// CIRCLE-REPOSITORY-NEXT:  of file 
-// CIRCLE-REPOSITORY-NEXT: <a href="https://repository.com/./src/Circle.cpp">Circle.cpp</a>
-
-//      HTML-CIRCLE: <div>brief</div>
-//      HTML-CIRCLE: <p> Calculates the perimeter of the circle.</p>
-//      HTML-CIRCLE: <div>return</div>
-//      HTML-CIRCLE: <p> double The perimeter of the circle.</p>
-
 // MD-CALC: # class Calculator
 // MD-CALC: *Defined at .{{[\/]}}include{{[\/]}}Calculator.h#8*
 // MD-CALC: **brief** A simple calculator class.
diff --git a/clang-tools-extra/test/clang-doc/comments-in-macros.cpp b/clang-tools-extra/test/clang-doc/comments-in-macros.cpp
index 0c70fadb7f9ac..bc0ec46b72a05 100644
--- a/clang-tools-extra/test/clang-doc/comments-in-macros.cpp
+++ b/clang-tools-extra/test/clang-doc/comments-in-macros.cpp
@@ -6,8 +6,8 @@
 // RUN: FileCheck %s < %t/GlobalNamespace/MyClass.md --check-prefix=MD-MYCLASS
 
 // RUN: clang-doc --format=html --doxygen --output=%t --executor=standalone %s
-// RUN: FileCheck %s < %t/GlobalNamespace/MyClass.html --check-prefix=HTML-MYCLASS-LINE
-// RUN: FileCheck %s < %t/GlobalNamespace/MyClass.html --check-prefix=HTML-MYCLASS
+// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV7MyClass.html --check-prefix=HTML-MYCLASS-LINE
+// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV7MyClass.html --check-prefix=HTML-MYCLASS
 
 #define DECLARE_METHODS                                           \
     /**   							  
@@ -21,15 +21,18 @@
 // MD-MYCLASS: *public int Add(int a, int b)*
 // MD-MYCLASS: **brief** Declare a method to calculate the sum of two numbers
 
-// HTML-MYCLASS: <p>public int Add(int a, int b)</p>
-// HTML-MYCLASS: <div>brief</div>
-// HTML-MYCLASS: <p> Declare a method to calculate the sum of two numbers</p>
+
+// HTML-MYCLASS: <pre><code class="language-cpp code-clang-doc">int Add (int a, int b)</code></pre>
+// HTML-MYCLASS: <div>
+// HTML-MYCLASS:     <div>
+// HTML-MYCLASS:         <p> Declare a method to calculate the sum of two numbers</p>
+// HTML-MYCLASS:     </div>
 
 
 class MyClass {
 public:
-// MD-MYCLASS-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}comments-in-macros.cpp#[[@LINE+2]]*
-// HTML-MYCLASS-LINE: <p>Defined at line [[@LINE+1]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}comments-in-macros.cpp</p>
+// MD-MYCLASS-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}comments-in-macros.cpp#[[@LINE-2]]*
+// HTML-MYCLASS-LINE: <p>Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}comments-in-macros.cpp</p>
     DECLARE_METHODS
 };
 
diff --git a/clang-tools-extra/test/clang-doc/conversion_function.cpp b/clang-tools-extra/test/clang-doc/conversion_function.cpp
index 0200a578219ee..63df5d6f50d39 100644
--- a/clang-tools-extra/test/clang-doc/conversion_function.cpp
+++ b/clang-tools-extra/test/clang-doc/conversion_function.cpp
@@ -4,7 +4,7 @@
 // RUN: find %t/ -regex ".*/[0-9A-F]*.yaml" -exec cat {} ";" | FileCheck %s --check-prefix=CHECK-YAML
 
 // RUN: clang-doc --format=html --output=%t --executor=standalone %s 
-// RUN: FileCheck %s < %t/GlobalNamespace/MyStruct.html --check-prefix=CHECK-HTML
+// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV8MyStruct.html --check-prefix=CHECK-HTML
 
 template <typename T>
 struct MyStruct {
@@ -14,5 +14,6 @@ struct MyStruct {
 // Output correct conversion names.
 // CHECK-YAML:         Name:            'operator T'
 
-// CHECK-HTML: <h3 id="{{[0-9A-F]*}}">operator T</h3>
-// CHECK-HTML: <p>public T operator T()</p>
+// CHECK-HTML: <div id="{{([0-9A-F]{40})}}">
+// CHECK-HTML:     <pre><code class="language-cpp code-clang-doc">T operator T ()</code></pre>
+// CHECK-HTML: </div>
diff --git a/clang-tools-extra/test/clang-doc/enum.cpp b/clang-tools-extra/test/clang-doc/enum.cpp
index 3ba834e0b2e70..bb0d51fc3b36c 100644
--- a/clang-tools-extra/test/clang-doc/enum.cpp
+++ b/clang-tools-extra/test/clang-doc/enum.cpp
@@ -1,19 +1,12 @@
 // RUN: rm -rf %t && mkdir -p %t
 // RUN: clang-doc --format=html --doxygen --output=%t --executor=standalone %s
 // RUN: clang-doc --format=md --doxygen --output=%t --executor=standalone %s
-// RUN: clang-doc --format=mustache --doxygen --output=%t --executor=standalone %s
-// RUN: FileCheck %s < %t/GlobalNamespace/index.html --check-prefix=HTML-INDEX-LINE
-// RUN: FileCheck %s < %t/GlobalNamespace/index.html --check-prefix=HTML-INDEX
-// RUN: FileCheck %s < %t/GlobalNamespace/Animals.html --check-prefix=HTML-ANIMAL-LINE
-// RUN: FileCheck %s < %t/GlobalNamespace/Animals.html --check-prefix=HTML-ANIMAL
-// RUN: FileCheck %s < %t/Vehicles/index.html --check-prefix=HTML-VEHICLES-LINE
-// RUN: FileCheck %s < %t/Vehicles/index.html --check-prefix=HTML-VEHICLES
-// RUN: FileCheck %s < %t/html/GlobalNamespace/index.html --check-prefix=MUSTACHE-INDEX-LINE
-// RUN: FileCheck %s < %t/html/GlobalNamespace/index.html --check-prefix=MUSTACHE-INDEX
-// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV7Animals.html --check-prefix=MUSTACHE-ANIMAL-LINE
-// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV7Animals.html --check-prefix=MUSTACHE-ANIMAL
-// RUN: FileCheck %s < %t/html/Vehicles/index.html --check-prefix=MUSTACHE-VEHICLES-LINE
-// RUN: FileCheck %s < %t/html/Vehicles/index.html --check-prefix=MUSTACHE-VEHICLES
+// RUN: FileCheck %s < %t/html/GlobalNamespace/index.html --check-prefix=HTML-INDEX-LINE
+// RUN: FileCheck %s < %t/html/GlobalNamespace/index.html --check-prefix=HTML-INDEX
+// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV7Animals.html --check-prefix=HTML-ANIMAL-LINE
+// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV7Animals.html --check-prefix=HTML-ANIMAL
+// RUN: FileCheck %s < %t/html/Vehicles/index.html --check-prefix=HTML-VEHICLES-LINE
+// RUN: FileCheck %s < %t/html/Vehicles/index.html --check-prefix=HTML-VEHICLES
 // RUN: FileCheck %s < %t/GlobalNamespace/index.md --check-prefix=MD-INDEX-LINE
 // RUN: FileCheck %s < %t/GlobalNamespace/index.md --check-prefix=MD-INDEX
 // RUN: FileCheck %s < %t/GlobalNamespace/Animals.md --check-prefix=MD-ANIMAL-LINE
@@ -28,8 +21,7 @@
  */
 enum Color {
   // MD-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]*
-  // HTML-INDEX-LINE: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp</p>
-  // MUSTACHE-INDEX-LINE-NOT: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp</p>
+  // HTML-INDEX-LINE-NOT: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp</p>
   Red,   ///< Comment 1
   Green, ///< Comment 2
   Blue   ///< Comment 3
@@ -43,48 +35,36 @@ enum Color {
 // MD-INDEX: | Blue |
 // MD-INDEX: **brief** For specifying RGB colors
 
-// HTML-INDEX: <th colspan="3">enum Color</th>
-// HTML-INDEX: <td>Red</td>
-// HTML-INDEX: <td>0</td>
-// HTML-INDEX: <p> Comment 1</p>
-// HTML-INDEX: <td>Green</td>
-// HTML-INDEX: <td>1</td>
-// HTML-INDEX: <p> Comment 2</p>
-// HTML-INDEX: <td>Blue</td>
-// HTML-INDEX: <td>2</td>
-// HTML-INDEX: <p> Comment 3</p>
-
-// MUSTACHE-INDEX:     <div>
-// MUSTACHE-INDEX:         <pre><code class="language-cpp code-clang-doc">enum Color</code></pre>
-// MUSTACHE-INDEX:     </div>
-// MUSTACHE-INDEX:     <table class="table-wrapper">
-// MUSTACHE-INDEX:         <tbody>
-// MUSTACHE-INDEX:             <tr>
-// MUSTACHE-INDEX:                 <th>Name</th>
-// MUSTACHE-INDEX:                 <th>Value</th>
-// MUSTACHE-INDEX:             </tr>
-// MUSTACHE-INDEX:             <tr>
-// MUSTACHE-INDEX:                 <td>Red</td>
-// MUSTACHE-INDEX:                 <td>0</td>
-// MUSTACHE-INDEX:             </tr>
-// MUSTACHE-INDEX:             <tr>
-// MUSTACHE-INDEX:                 <td>Green</td>
-// MUSTACHE-INDEX:                 <td>1</td>
-// MUSTACHE-INDEX:             </tr>
-// MUSTACHE-INDEX:             <tr>
-// MUSTACHE-INDEX:                 <td>Blue</td>
-// MUSTACHE-INDEX:                 <td>2</td>
-// MUSTACHE-INDEX:             </tr>
-// MUSTACHE-INDEX:         </tbody>
-// MUSTACHE-INDEX:     </table>
+// HTML-INDEX:     <div>
+// HTML-INDEX:         <pre><code class="language-cpp code-clang-doc">enum Color</code></pre>
+// HTML-INDEX:     </div>
+// HTML-INDEX:     <table class="table-wrapper">
+// HTML-INDEX:         <tbody>
+// HTML-INDEX:             <tr>
+// HTML-INDEX:                 <th>Name</th>
+// HTML-INDEX:                 <th>Value</th>
+// HTML-INDEX:             </tr>
+// HTML-INDEX:             <tr>
+// HTML-INDEX:                 <td>Red</td>
+// HTML-INDEX:                 <td>0</td>
+// HTML-INDEX:             </tr>
+// HTML-INDEX:             <tr>
+// HTML-INDEX:                 <td>Green</td>
+// HTML-INDEX:                 <td>1</td>
+// HTML-INDEX:             </tr>
+// HTML-INDEX:             <tr>
+// HTML-INDEX:                 <td>Blue</td>
+// HTML-INDEX:                 <td>2</td>
+// HTML-INDEX:             </tr>
+// HTML-INDEX:         </tbody>
+// HTML-INDEX:     </table>
 
 /**
  * @brief Shape Types
  */
 enum class Shapes {
   // MD-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]*
-  // HTML-INDEX-LINE: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp</p>
-  // MUSTACHE-INDEX-LINE-NOT: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp</p>
+  // HTML-INDEX-LINE-NOT: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp</p>
 
   /// Comment 1
   Circle,
@@ -100,86 +80,60 @@ enum class Shapes {
 // MD-INDEX: | Triangle |
 // MD-INDEX: **brief** Shape Types
 
-// HTML-INDEX: <th colspan="3">enum class Shapes</th>
-// HTML-INDEX: <td>Circle</td>
-// HTML-INDEX: <td>0</td>
-// HTML-INDEX: <p> Comment 1</p>
-// HTML-INDEX: <td>Rectangle</td>
-// HTML-INDEX: <td>1</td>
-// HTML-INDEX: <p> Comment 2</p>
-// HTML-INDEX: <td>Triangle</td>
-// HTML-INDEX: <td>2</td>
-// HTML-INDEX: <p> Comment 3</p>
-
 // COM: FIXME: Serialize "enum class" in template
-// MUSTACHE-INDEX:     <div>
-// MUSTACHE-INDEX:         <pre><code class="language-cpp code-clang-doc">enum Shapes</code></pre>
-// MUSTACHE-INDEX:     </div>
-// MUSTACHE-INDEX:     <table class="table-wrapper">
-// MUSTACHE-INDEX:         <tbody>
-// MUSTACHE-INDEX:             <tr>
-// MUSTACHE-INDEX:                 <th>Name</th>
-// MUSTACHE-INDEX:                 <th>Value</th>
-// MUSTACHE-INDEX:             </tr>
-// MUSTACHE-INDEX:             <tr>
-// MUSTACHE-INDEX:                 <td>Circle</td>
-// MUSTACHE-INDEX:                 <td>0</td>
-// MUSTACHE-INDEX:             </tr>
-// MUSTACHE-INDEX:             <tr>
-// MUSTACHE-INDEX:                 <td>Rectangle</td>
-// MUSTACHE-INDEX:                 <td>1</td>
-// MUSTACHE-INDEX:             </tr>
-// MUSTACHE-INDEX:             <tr>
-// MUSTACHE-INDEX:                 <td>Triangle</td>
-// MUSTACHE-INDEX:                 <td>2</td>
-// MUSTACHE-INDEX:             </tr>
-// MUSTACHE-INDEX:         </tbody>
-// MUSTACHE-INDEX:     </table>
+// HTML-INDEX:     <div>
+// HTML-INDEX:         <pre><code class="language-cpp code-clang-doc">enum Shapes</code></pre>
+// HTML-INDEX:     </div>
+// HTML-INDEX:     <table class="table-wrapper">
+// HTML-INDEX:         <tbody>
+// HTML-INDEX:             <tr>
+// HTML-INDEX:                 <th>Name</th>
+// HTML-INDEX:                 <th>Value</th>
+// HTML-INDEX:             </tr>
+// HTML-INDEX:             <tr>
+// HTML-INDEX:                 <td>Circle</td>
+// HTML-INDEX:                 <td>0</td>
+// HTML-INDEX:             </tr>
+// HTML-INDEX:             <tr>
+// HTML-INDEX:                 <td>Rectangle</td>
+// HTML-INDEX:                 <td>1</td>
+// HTML-INDEX:             </tr>
+// HTML-INDEX:             <tr>
+// HTML-INDEX:                 <td>Triangle</td>
+// HTML-INDEX:                 <td>2</td>
+// HTML-INDEX:             </tr>
+// HTML-INDEX:         </tbody>
+// HTML-INDEX:     </table>
 
 // COM: FIXME: Add enums declared inside of classes to class template
 class Animals {
   // MD-ANIMAL-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]*
   // HTML-ANIMAL-LINE: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp</p>
-  // MUSTACHE-ANIMAL-LINE: <p>Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp</p>
 public:
   /**
    * @brief specify what animal the class is
    */
   enum AnimalType {
     // MD-ANIMAL-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]*
-    // HTML-ANIMAL-LINE: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp</p>
-    // MUSTACHE-ANIMAL-LINE-NOT: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp</p>
+    // HTML-ANIMAL-LINE-NOT: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp</p>
     Dog,   ///< Man's best friend
     Cat,   ///< Man's other best friend
     Iguana ///< A lizard
   };
 };
 
-// HTML-ANIMAL: <h1>class Animals</h1>
-// HTML-ANIMAL: <h2 id="Enums">Enums</h2>
-// HTML-ANIMAL: <th colspan="3">enum AnimalType</th>
-// HTML-ANIMAL: <td>Dog</td>
-// HTML-ANIMAL: <td>0</td>
-// HTML-ANIMAL: <p> Man&apos;s best friend</p>
-// HTML-ANIMAL: <td>Cat</td>
-// HTML-ANIMAL: <td>1</td>
-// HTML-ANIMAL: <p> Man&apos;s other best friend</p>
-// HTML-ANIMAL: <td>Iguana</td>
-// HTML-ANIMAL: <td>2</td>
-// HTML-ANIMAL: <p> A lizard</p>
-
-// MUSTACHE-ANIMAL-NOT: <h1>class Animals</h1>
-// MUSTACHE-ANIMAL-NOT: <h2 id="Enums">Enums</h2>
-// MUSTACHE-ANIMAL-NOT: <th colspan="3">enum AnimalType</th>
-// MUSTACHE-ANIMAL-NOT: <td>Dog</td>
-// MUSTACHE-ANIMAL-NOT: <td>0</td>
-// MUSTACHE-ANIMAL-NOT: <p> Man&apos;s best friend</p>
-// MUSTACHE-ANIMAL-NOT: <td>Cat</td>
-// MUSTACHE-ANIMAL-NOT: <td>1</td>
-// MUSTACHE-ANIMAL-NOT: <p> Man&apos;s other best friend</p>
-// MUSTACHE-ANIMAL-NOT: <td>Iguana</td>
-// MUSTACHE-ANIMAL-NOT: <td>2</td>
-// MUSTACHE-ANIMAL-NOT: <p> A lizard</p>
+// HTML-ANIMAL-NOT: <h1>class Animals</h1>
+// HTML-ANIMAL-NOT: <h2 id="Enums">Enums</h2>
+// HTML-ANIMAL-NOT: <th colspan="3">enum AnimalType</th>
+// HTML-ANIMAL-NOT: <td>Dog</td>
+// HTML-ANIMAL-NOT: <td>0</td>
+// HTML-ANIMAL-NOT: <p> Man&apos;s best friend</p>
+// HTML-ANIMAL-NOT: <td>Cat</td>
+// HTML-ANIMAL-NOT: <td>1</td>
+// HTML-ANIMAL-NOT: <p> Man&apos;s other best friend</p>
+// HTML-ANIMAL-NOT: <td>Iguana</td>
+// HTML-ANIMAL-NOT: <td>2</td>
+// HTML-ANIMAL-NOT: <p> A lizard</p>
 
 // MD-ANIMAL: # class Animals
 // MD-ANIMAL: ## Enums
@@ -196,8 +150,7 @@ namespace Vehicles {
  */
 enum Car {
   // MD-VEHICLES-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]*
-  // HTML-VEHICLES-LINE: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp</p>
-  // MUSTACHE-VEHICLES-LINE: Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
+  // HTML-VEHICLES-LINE: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
 
   Sedan,    ///< Comment 1
   SUV,      ///< Comment 2
@@ -216,48 +169,33 @@ enum Car {
 // MD-VEHICLES: | Hatchback |
 // MD-VEHICLES: **brief** specify type of car
 
-// HTML-VEHICLES: <h1>namespace Vehicles</h1>
-// HTML-VEHICLES: <th colspan="3">enum Car</th>
-// HTML-VEHICLES: <td>Sedan</td>
-// HTML-VEHICLES: <td>0</td>
-// HTML-VEHICLES: <p> Comment 1</p>
-// HTML-VEHICLES: <td>SUV</td>
-// HTML-VEHICLES: <td>1</td>
-// HTML-VEHICLES: <p> Comment 2</p>
-// HTML-VEHICLES: <td>Pickup</td>
-// HTML-VEHICLES: <td>2</td>
-// HTML-VEHICLES: <p> Comment 3</p>
-// HTML-VEHICLES: <td>Hatchback</td>
-// HTML-VEHICLES: <td>3</td>
-// HTML-VEHICLES: <p> Comment 4</p>
-
-// MUSTACHE-VEHICLES:     <div>
-// MUSTACHE-VEHICLES:         <pre><code class="language-cpp code-clang-doc">enum Car</code></pre>
-// MUSTACHE-VEHICLES:      </div>
-// MUSTACHE-VEHICLES:      <table class="table-wrapper">
-// MUSTACHE-VEHICLES:          <tbody>
-// MUSTACHE-VEHICLES:              <tr>
-// MUSTACHE-VEHICLES:                  <th>Name</th>
-// MUSTACHE-VEHICLES:                  <th>Value</th>
-// MUSTACHE-VEHICLES:              </tr>
-// MUSTACHE-VEHICLES:              <tr>
-// MUSTACHE-VEHICLES:                  <td>Sedan</td>
-// MUSTACHE-VEHICLES:                  <td>0</td>
-// MUSTACHE-VEHICLES:              </tr>
-// MUSTACHE-VEHICLES:              <tr>
-// MUSTACHE-VEHICLES:                  <td>SUV</td>
-// MUSTACHE-VEHICLES:                  <td>1</td>
-// MUSTACHE-VEHICLES:              </tr>
-// MUSTACHE-VEHICLES:              <tr>
-// MUSTACHE-VEHICLES:                  <td>Pickup</td>
-// MUSTACHE-VEHICLES:                  <td>2</td>
-// MUSTACHE-VEHICLES:              </tr>
-// MUSTACHE-VEHICLES:              <tr>
-// MUSTACHE-VEHICLES:                  <td>Hatchback</td>
-// MUSTACHE-VEHICLES:                  <td>3</td>
-// MUSTACHE-VEHICLES:              </tr>
-// MUSTACHE-VEHICLES:          </tbody>
-// MUSTACHE-VEHICLES:      </table>
+// HTML-VEHICLES:     <div>
+// HTML-VEHICLES:         <pre><code class="language-cpp code-clang-doc">enum Car</code></pre>
+// HTML-VEHICLES:      </div>
+// HTML-VEHICLES:      <table class="table-wrapper">
+// HTML-VEHICLES:          <tbody>
+// HTML-VEHICLES:              <tr>
+// HTML-VEHICLES:                  <th>Name</th>
+// HTML-VEHICLES:                  <th>Value</th>
+// HTML-VEHICLES:              </tr>
+// HTML-VEHICLES:              <tr>
+// HTML-VEHICLES:                  <td>Sedan</td>
+// HTML-VEHICLES:                  <td>0</td>
+// HTML-VEHICLES:              </tr>
+// HTML-VEHICLES:              <tr>
+// HTML-VEHICLES:                  <td>SUV</td>
+// HTML-VEHICLES:                  <td>1</td>
+// HTML-VEHICLES:              </tr>
+// HTML-VEHICLES:              <tr>
+// HTML-VEHICLES:                  <td>Pickup</td>
+// HTML-VEHICLES:                  <td>2</td>
+// HTML-VEHICLES:              </tr>
+// HTML-VEHICLES:              <tr>
+// HTML-VEHICLES:                  <td>Hatchback</td>
+// HTML-VEHICLES:                  <td>3</td>
+// HTML-VEHICLES:              </tr>
+// HTML-VEHICLES:          </tbody>
+// HTML-VEHICLES:      </table>
 
 enum ColorUserSpecified {
   RedUserSpecified = 'A',
@@ -271,34 +209,26 @@ enum ColorUserSpecified {
 // MD-INDEX: | GreenUserSpecified |
 // MD-INDEX: | BlueUserSpecified |
 
-// HTML-INDEX: <th colspan="2">enum ColorUserSpecified</th>
-// HTML-INDEX: <td>RedUserSpecified</td>
-// HTML-INDEX: <td>&apos;A&apos;</td>
-// HTML-INDEX: <td>GreenUserSpecified</td>
-// HTML-INDEX: <td>2</td>
-// HTML-INDEX: <td>BlueUserSpecified</td>
-// HTML-INDEX: <td>&apos;C&apos;</td>
-
-// MUSTACHE-INDEX:     <div>
-// MUSTACHE-INDEX:         <pre><code class="language-cpp code-clang-doc">enum ColorUserSpecified</code></pre>
-// MUSTACHE-INDEX:     </div>
-// MUSTACHE-INDEX:     <table class="table-wrapper">
-// MUSTACHE-INDEX:         <tbody>
-// MUSTACHE-INDEX:             <tr>
-// MUSTACHE-INDEX:                 <th>Name</th>
-// MUSTACHE-INDEX:                 <th>Value</th>
-// MUSTACHE-INDEX:             </tr>
-// MUSTACHE-INDEX:             <tr>
-// MUSTACHE-INDEX:                 <td>RedUserSpecified</td>
-// MUSTACHE-INDEX:                 <td>&#39;A&#39;</td>
-// MUSTACHE-INDEX:             </tr>
-// MUSTACHE-INDEX:             <tr>
-// MUSTACHE-INDEX:                 <td>GreenUserSpecified</td>
-// MUSTACHE-INDEX:                 <td>2</td>
-// MUSTACHE-INDEX:             </tr>
-// MUSTACHE-INDEX:             <tr>
-// MUSTACHE-INDEX:                 <td>BlueUserSpecified</td>
-// MUSTACHE-INDEX:                 <td>&#39;C&#39;</td>
-// MUSTACHE-INDEX:             </tr>
-// MUSTACHE-INDEX:         </tbody>
-// MUSTACHE-INDEX:     </table>
+// HTML-INDEX:     <div>
+// HTML-INDEX:         <pre><code class="language-cpp code-clang-doc">enum ColorUserSpecified</code></pre>
+// HTML-INDEX:     </div>
+// HTML-INDEX:     <table class="table-wrapper">
+// HTML-INDEX:         <tbody>
+// HTML-INDEX:             <tr>
+// HTML-INDEX:                 <th>Name</th>
+// HTML-INDEX:                 <th>Value</th>
+// HTML-INDEX:             </tr>
+// HTML-INDEX:             <tr>
+// HTML-INDEX:                 <td>RedUserSpecified</td>
+// HTML-INDEX:                 <td>&#39;A&#39;</td>
+// HTML-INDEX:             </tr>
+// HTML-INDEX:             <tr>
+// HTML-INDEX:                 <td>GreenUserSpecified</td>
+// HTML-INDEX:                 <td>2</td>
+// HTML-INDEX:             </tr>
+// HTML-INDEX:             <tr>
+// HTML-INDEX:                 <td>BlueUserSpecified</td>
+// HTML-INDEX:                 <td>&#39;C&#39;</td>
+// HTML-INDEX:             </tr>
+// HTML-INDEX:         </tbody>
+// HTML-INDEX:     </table>
diff --git a/clang-tools-extra/test/clang-doc/long-name.cpp b/clang-tools-extra/test/clang-doc/long-name.cpp
index 77e50b1553ad5..e4a5e29f973d5 100644
--- a/clang-tools-extra/test/clang-doc/long-name.cpp
+++ b/clang-tools-extra/test/clang-doc/long-name.cpp
@@ -1,7 +1,7 @@
 // FIXME: This test seems to break on windows, so disable it for now.
 // UNSUPPORTED: system-windows
 // RUN: rm -rf %t && mkdir -p %t
-// RUN: clang-doc --output=%t --format=mustache --executor=standalone %s
+// RUN: clang-doc --output=%t --format=html --executor=standalone %s
 // RUN: ls %t/json/GlobalNamespace | FileCheck %s -check-prefix=CHECK-JSON
 // RUN: ls %t/html/GlobalNamespace | FileCheck %s -check-prefix=CHECK-HTML
 
diff --git a/clang-tools-extra/test/clang-doc/mustache-index.cpp b/clang-tools-extra/test/clang-doc/mustache-index.cpp
index 709cc82bf85bb..0aa6e21c37cac 100644
--- a/clang-tools-extra/test/clang-doc/mustache-index.cpp
+++ b/clang-tools-extra/test/clang-doc/mustache-index.cpp
@@ -1,5 +1,5 @@
 // RUN: rm -rf %t && mkdir -p %t
-// RUN: clang-doc --format=mustache --output=%t --executor=standalone %s 
+// RUN: clang-doc --format=html --output=%t --executor=standalone %s 
 // RUN: FileCheck %s < %t/html/GlobalNamespace/index.html
 
 enum Color {
diff --git a/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp b/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp
index dfc81df134596..add8a221feb40 100644
--- a/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp
+++ b/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp
@@ -1,5 +1,5 @@
 // RUN: rm -rf %t && mkdir -p %t
-// RUN: clang-doc --format=mustache --output=%t --executor=standalone %s 
+// RUN: clang-doc --format=html --output=%t --executor=standalone %s 
 // RUN: FileCheck %s < %t/html/MyNamespace/index.html
 // RUN: FileCheck %s < %t/html/GlobalNamespace/index.html --check-prefix=CHECK-GLOBAL
 
diff --git a/clang-tools-extra/test/clang-doc/namespace.cpp b/clang-tools-extra/test/clang-doc/namespace.cpp
index adf7ab7d946ab..029f9974e775e 100644
--- a/clang-tools-extra/test/clang-doc/namespace.cpp
+++ b/clang-tools-extra/test/clang-doc/namespace.cpp
@@ -1,24 +1,6 @@
 // RUN: rm -rf %t && mkdir -p %t
 // RUN: clang-doc --format=html --output=%t --executor=standalone %s
 // RUN: clang-doc --format=md --output=%t --executor=standalone %s
-// RUN: clang-doc --format=mustache --output=%t --executor=standalone %s
-// RUN: FileCheck %s < %t/index_json.js -check-prefix=JSON-INDEX
-// RUN: FileCheck %s < %t/@nonymous_namespace/AnonClass.html -check-prefix=HTML-ANON-CLASS-LINE
-// RUN: FileCheck %s < %t/@nonymous_namespace/AnonClass.html -check-prefix=HTML-ANON-CLASS
-// RUN: FileCheck %s < %t/@nonymous_namespace/index.html -check-prefix=HTML-ANON-INDEX-LINE
-// RUN: FileCheck %s < %t/@nonymous_namespace/index.html -check-prefix=HTML-ANON-INDEX
-// RUN: FileCheck %s < %t/AnotherNamespace/ClassInAnotherNamespace.html -check-prefix=HTML-ANOTHER-CLASS-LINE
-// RUN: FileCheck %s < %t/AnotherNamespace/ClassInAnotherNamespace.html -check-prefix=HTML-ANOTHER-CLASS
-// RUN: FileCheck %s < %t/AnotherNamespace/index.html -check-prefix=HTML-ANOTHER-INDEX-LINE
-// RUN: FileCheck %s < %t/AnotherNamespace/index.html -check-prefix=HTML-ANOTHER-INDEX
-// RUN: FileCheck %s < %t/PrimaryNamespace/NestedNamespace/ClassInNestedNamespace.html -check-prefix=HTML-NESTED-CLASS-LINE
-// RUN: FileCheck %s < %t/PrimaryNamespace/NestedNamespace/ClassInNestedNamespace.html -check-prefix=HTML-NESTED-CLASS
-// RUN: FileCheck %s < %t/PrimaryNamespace/NestedNamespace/index.html -check-prefix=HTML-NESTED-INDEX-LINE
-// RUN: FileCheck %s < %t/PrimaryNamespace/NestedNamespace/index.html -check-prefix=HTML-NESTED-INDEX
-// RUN: FileCheck %s < %t/PrimaryNamespace/index.html -check-prefix=HTML-PRIMARY-INDEX-LINE
-// RUN: FileCheck %s < %t/PrimaryNamespace/index.html -check-prefix=HTML-PRIMARY-INDEX
-// RUN: FileCheck %s < %t/PrimaryNamespace/ClassInPrimaryNamespace.html -check-prefix=HTML-PRIMARY-CLASS-LINE
-// RUN: FileCheck %s < %t/PrimaryNamespace/ClassInPrimaryNamespace.html -check-prefix=HTML-PRIMARY-CLASS
 // RUN: FileCheck %s < %t/@nonymous_namespace/AnonClass.md -check-prefix=MD-ANON-CLASS-LINE
 // RUN: FileCheck %s < %t/@nonymous_namespace/AnonClass.md -check-prefix=MD-ANON-CLASS
 // RUN: FileCheck %s < %t/@nonymous_namespace/index.md -check-prefix=MD-ANON-INDEX-LINE
@@ -35,26 +17,26 @@
 // RUN: FileCheck %s < %t/PrimaryNamespace/index.md -check-prefix=MD-PRIMARY-INDEX
 // RUN: FileCheck %s < %t/PrimaryNamespace/ClassInPrimaryNamespace.md -check-prefix=MD-PRIMARY-CLASS-LINE
 // RUN: FileCheck %s < %t/PrimaryNamespace/ClassInPrimaryNamespace.md -check-prefix=MD-PRIMARY-CLASS
-// RUN: FileCheck %s < %t/GlobalNamespace/index.html -check-prefix=HTML-GLOBAL-INDEX
+// RUN: FileCheck %s < %t/html/GlobalNamespace/index.html -check-prefix=HTML-GLOBAL-INDEX
 // RUN: FileCheck %s < %t/GlobalNamespace/index.md -check-prefix=MD-GLOBAL-INDEX
 // RUN: FileCheck %s < %t/all_files.md -check-prefix=MD-ALL-FILES
 // RUN: FileCheck %s < %t/index.md -check-prefix=MD-INDEX
-// RUN: FileCheck %s < %t/html/@nonymous_namespace/_ZTVN12_GLOBAL__N_19AnonClassE.html -check-prefix=MUSTACHE-ANON-CLASS-LINE
-// RUN: FileCheck %s < %t/html/@nonymous_namespace/_ZTVN12_GLOBAL__N_19AnonClassE.html -check-prefix=MUSTACHE-ANON-CLASS
-// RUN: FileCheck %s < %t/html/@nonymous_namespace/index.html -check-prefix=MUSTACHE-ANON-INDEX-LINE
-// RUN: FileCheck %s < %t/html/@nonymous_namespace/index.html -check-prefix=MUSTACHE-ANON-INDEX
-// RUN: FileCheck %s < %t/html/AnotherNamespace/_ZTVN16AnotherNamespace23ClassInAnotherNamespaceE.html -check-prefix=MUSTACHE-ANOTHER-CLASS-LINE
-// RUN: FileCheck %s < %t/html/AnotherNamespace/_ZTVN16AnotherNamespace23ClassInAnotherNamespaceE.html -check-prefix=MUSTACHE-ANOTHER-CLASS
-// RUN: FileCheck %s < %t/html/AnotherNamespace/index.html -check-prefix=MUSTACHE-ANOTHER-INDEX-LINE
-// RUN: FileCheck %s < %t/html/AnotherNamespace/index.html -check-prefix=MUSTACHE-ANOTHER-INDEX
-// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/_ZTVN16PrimaryNamespace15NestedNamespace22ClassInNestedNamespaceE.html -check-prefix=MUSTACHE-NESTED-CLASS-LINE
-// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/_ZTVN16PrimaryNamespace15NestedNamespace22ClassInNestedNamespaceE.html -check-prefix=MUSTACHE-NESTED-CLASS
-// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/index.html -check-prefix=MUSTACHE-NESTED-INDEX-LINE
-// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/index.html -check-prefix=MUSTACHE-NESTED-INDEX
-// RUN: FileCheck %s < %t/html/PrimaryNamespace/index.html -check-prefix=MUSTACHE-PRIMARY-INDEX-LINE
-// RUN: FileCheck %s < %t/html/PrimaryNamespace/index.html -check-prefix=MUSTACHE-PRIMARY-INDEX
-// RUN: FileCheck %s < %t/html/PrimaryNamespace/_ZTVN16PrimaryNamespace23ClassInPrimaryNamespaceE.html -check-prefix=MUSTACHE-PRIMARY-CLASS-LINE
-// RUN: FileCheck %s < %t/html/PrimaryNamespace/_ZTVN16PrimaryNamespace23ClassInPrimaryNamespaceE.html -check-prefix=MUSTACHE-PRIMARY-CLASS
+// RUN: FileCheck %s < %t/html/@nonymous_namespace/_ZTVN12_GLOBAL__N_19AnonClassE.html -check-prefix=HTML-ANON-CLASS-LINE
+// RUN: FileCheck %s < %t/html/@nonymous_namespace/_ZTVN12_GLOBAL__N_19AnonClassE.html -check-prefix=HTML-ANON-CLASS
+// RUN: FileCheck %s < %t/html/@nonymous_namespace/index.html -check-prefix=HTML-ANON-INDEX-LINE
+// RUN: FileCheck %s < %t/html/@nonymous_namespace/index.html -check-prefix=HTML-ANON-INDEX
+// RUN: FileCheck %s < %t/html/AnotherNamespace/_ZTVN16AnotherNamespace23ClassInAnotherNamespaceE.html -check-prefix=HTML-ANOTHER-CLASS-LINE
+// RUN: FileCheck %s < %t/html/AnotherNamespace/_ZTVN16AnotherNamespace23ClassInAnotherNamespaceE.html -check-prefix=HTML-ANOTHER-CLASS
+// RUN: FileCheck %s < %t/html/AnotherNamespace/index.html -check-prefix=HTML-ANOTHER-INDEX-LINE
+// RUN: FileCheck %s < %t/html/AnotherNamespace/index.html -check-prefix=HTML-ANOTHER-INDEX
+// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/_ZTVN16PrimaryNamespace15NestedNamespace22ClassInNestedNamespaceE.html -check-prefix=HTML-NESTED-CLASS-LINE
+// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/_ZTVN16PrimaryNamespace15NestedNamespace22ClassInNestedNamespaceE.html -check-prefix=HTML-NESTED-CLASS
+// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/index.html -check-prefix=HTML-NESTED-INDEX-LINE
+// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/index.html -check-prefix=HTML-NESTED-INDEX
+// RUN: FileCheck %s < %t/html/PrimaryNamespace/index.html -check-prefix=HTML-PRIMARY-INDEX-LINE
+// RUN: FileCheck %s < %t/html/PrimaryNamespace/index.html -check-prefix=HTML-PRIMARY-INDEX
+// RUN: FileCheck %s < %t/html/PrimaryNamespace/_ZTVN16PrimaryNamespace23ClassInPrimaryNamespaceE.html -check-prefix=HTML-PRIMARY-CLASS-LINE
+// RUN: FileCheck %s < %t/html/PrimaryNamespace/_ZTVN16PrimaryNamespace23ClassInPrimaryNamespaceE.html -check-prefix=HTML-PRIMARY-CLASS
 
 // COM: FIXME: Add global functions to the namespace template
 // COM: FIXME: Add namespaces to the namespace template
@@ -63,17 +45,14 @@
 namespace {
 void anonFunction() {}
 // MD-ANON-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]*
-// HTML-ANON-INDEX-LINE: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
-// MUSTACHE-ANON-INDEX-LINE-NOT: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
+// HTML-ANON-INDEX-LINE-NOT: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
 
 class AnonClass {};
 // MD-ANON-CLASS-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]*
 // HTML-ANON-CLASS-LINE: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
-// MUSTACHE-ANON-CLASS-LINE: <p>Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
 
 // MD-ANON-CLASS: # class AnonClass
-// HTML-ANON-CLASS: <h1>class AnonClass</h1>
-// MUSTACHE-ANON-CLASS: <h1 class="hero__title-large">class AnonClass</h1>
+// HTML-ANON-CLASS: <h1 class="hero__title-large">class AnonClass</h1>
 } // namespace
 
 // MD-ANON-INDEX: # namespace @nonymous_namespace
@@ -84,69 +63,51 @@ class AnonClass {};
 // MD-ANON-INDEX: ### anonFunction
 // MD-ANON-INDEX: *void anonFunction()*
 
-// HTML-ANON-INDEX: <h1>namespace @nonymous_namespace</h1>
-// HTML-ANON-INDEX: <p> Anonymous Namespace</p>
-// HTML-ANON-INDEX: <h2 id="Records">Records</h2>
-// HTML-ANON-INDEX: <a href="AnonClass.html">AnonClass</a>
-// HTML-ANON-INDEX: <h2 id="Functions">Functions</h2>
-// HTML-ANON-INDEX: <h3 id="{{([0-9A-F]{40})}}">anonFunction</h3>
-// HTML-ANON-INDEX: <p>void anonFunction()</p>
-
-// MUSTACHE-ANON-INDEX: <h2> @nonymous_namespace</h2>
-// MUSTACHE-ANON-INDEX:     <h2>Inner Classes</h2>
-// MUSTACHE-ANON-INDEX:         <ul class="class-container">
-// MUSTACHE-ANON-INDEX:             <li id="{{([0-9A-F]{40})}}" style="max-height: 40px;">
-// MUSTACHE-ANON-INDEX:                 <a href="_ZTVN12_GLOBAL__N_19AnonClassE.html">
-// MUSTACHE-ANON-INDEX:                     <pre><code class="language-cpp code-clang-doc">class AnonClass</code></pre>
-// MUSTACHE-ANON-INDEX:                 </a>
-// MUSTACHE-ANON-INDEX:             </li>
-// MUSTACHE-ANON-INDEX-NOT: <h2 id="Functions">Functions</h2>
-// MUSTACHE-ANON-INDEX-NOT: <h3 id="{{([0-9A-F]{40})}}">anonFunction</h3>
-// MUSTACHE-ANON-INDEX-NOT: <p>void anonFunction()</p>
+// HTML-ANON-INDEX: <h2> @nonymous_namespace</h2>
+// HTML-ANON-INDEX:     <h2>Inner Classes</h2>
+// HTML-ANON-INDEX:         <ul class="class-container">
+// HTML-ANON-INDEX:             <li id="{{([0-9A-F]{40})}}" style="max-height: 40px;">
+// HTML-ANON-INDEX:                 <a href="_ZTVN12_GLOBAL__N_19AnonClassE.html">
+// HTML-ANON-INDEX:                     <pre><code class="language-cpp code-clang-doc">class AnonClass</code></pre>
+// HTML-ANON-INDEX:                 </a>
+// HTML-ANON-INDEX:             </li>
+// HTML-ANON-INDEX-NOT: <h2 id="Functions">Functions</h2>
+// HTML-ANON-INDEX-NOT: <h3 id="{{([0-9A-F]{40})}}">anonFunction</h3>
+// HTML-ANON-INDEX-NOT: <p>void anonFunction()</p>
 
 // Primary Namespace
 namespace PrimaryNamespace {
 // Function in PrimaryNamespace
 void functionInPrimaryNamespace() {}
 // MD-PRIMARY-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]*
-// HTML-PRIMARY-INDEX-LINE: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
-// MUSTACHE-PRIMARY-INDEX-LINE-NOT: <p>Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
+// HTML-PRIMARY-INDEX-LINE-NOT: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
 
 // Class in PrimaryNamespace
 class ClassInPrimaryNamespace {};
 // MD-PRIMARY-CLASS-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]*
 // HTML-PRIMARY-CLASS-LINE: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
-// MUSTACHE-PRIMARY-CLASS-LINE: <p>Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
 
 // MD-PRIMARY-CLASS: # class ClassInPrimaryNamespace
 // MD-PRIMARY-CLASS: Class in PrimaryNamespace
 
-// HTML-PRIMARY-CLASS: <h1>class ClassInPrimaryNamespace</h1>
-// HTML-PRIMARY-CLASS: <p> Class in PrimaryNamespace</p>
-
-// MUSTACHE-PRIMARY-CLASS: <h1 class="hero__title-large">class ClassInPrimaryNamespace</h1>
+// HTML-PRIMARY-CLASS: <h1 class="hero__title-large">class ClassInPrimaryNamespace</h1>
 
 // Nested namespace
 namespace NestedNamespace {
 // Function in NestedNamespace
 void functionInNestedNamespace() {}
 // MD-NESTED-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]*
-// HTML-NESTED-INDEX-LINE: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
-// MUSTACHE-NESTED-INDEX-LINE-NOT: <p>Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
+// HTML-NESTED-INDEX-LINE-NOT: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
 
 // Class in NestedNamespace
 class ClassInNestedNamespace {};
 // MD-NESTED-CLASS-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]*
 // HTML-NESTED-CLASS-LINE: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
-// MUSTACHE-NESTED-CLASS-LINE: <p>Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
 
 // MD-NESTED-CLASS: # class ClassInNestedNamespace
 // MD-NESTED-CLASS: Class in NestedNamespace
 
-// HTML-NESTED-CLASS: <h1>class ClassInNestedNamespace</h1>
-// HTML-NESTED-CLASS: <p> Class in NestedNamespace</p>
-
-// MUSTACHE-NESTED-CLASS: <h1 class="hero__title-large">class ClassInNestedNamespace</h1>
+// HTML-NESTED-CLASS: <h1 class="hero__title-large">class ClassInNestedNamespace</h1>
 } // namespace NestedNamespace
 
 // MD-NESTED-INDEX: # namespace NestedNamespace
@@ -158,28 +119,19 @@ class ClassInNestedNamespace {};
 // MD-NESTED-INDEX: *void functionInNestedNamespace()*
 // MD-NESTED-INDEX: Function in NestedNamespace
 
-// HTML-NESTED-INDEX: <h1>namespace NestedNamespace</h1>
-// HTML-NESTED-INDEX: <p> Nested namespace</p>
-// HTML-NESTED-INDEX: <h2 id="Records">Records</h2>
-// HTML-NESTED-INDEX: <a href="ClassInNestedNamespace.html">ClassInNestedNamespace</a>
-// HTML-NESTED-INDEX: <h2 id="Functions">Functions</h2>
-// HTML-NESTED-INDEX: <h3 id="{{([0-9A-F]{40})}}">functionInNestedNamespace</h3>
-// HTML-NESTED-INDEX: <p>void functionInNestedNamespace()</p>
-// HTML-NESTED-INDEX: <p> Function in NestedNamespace</p>
-
-// MUSTACHE-NESTED-INDEX: <h2> NestedNamespace</h2>
-// MUSTACHE-NESTED-INDEX:     <h2>Inner Classes</h2>
-// MUSTACHE-NESTED-INDEX:     <ul class="class-container">
-// MUSTACHE-NESTED-INDEX:         <li id="{{([0-9A-F]{40})}}" style="max-height: 40px;">
-// MUSTACHE-NESTED-INDEX:             <a href="_ZTVN16PrimaryNamespace15NestedNamespace22ClassInNestedNamespaceE.html">
-// MUSTACHE-NESTED-INDEX:                 <pre><code class="language-cpp code-clang-doc">class ClassInNestedNamespace</code></pre>
-// MUSTACHE-NESTED-INDEX:             </a>
-// MUSTACHE-NESTED-INDEX:         </li>
-// MUSTACHE-NESTED-INDEX:     </ul>
-// MUSTACHE-NESTED-INDEX-NOT: <h2 id="Functions">Functions</h2>
-// MUSTACHE-NESTED-INDEX-NOT: <h3 id="{{([0-9A-F]{40})}}">functionInNestedNamespace</h3>
-// MUSTACHE-NESTED-INDEX-NOT: <p>void functionInNestedNamespace()</p>
-// MUSTACHE-NESTED-INDEX-NOT: <p> Function in NestedNamespace</p>
+// HTML-NESTED-INDEX: <h2> NestedNamespace</h2>
+// HTML-NESTED-INDEX:     <h2>Inner Classes</h2>
+// HTML-NESTED-INDEX:     <ul class="class-container">
+// HTML-NESTED-INDEX:         <li id="{{([0-9A-F]{40})}}" style="max-height: 40px;">
+// HTML-NESTED-INDEX:             <a href="_ZTVN16PrimaryNamespace15NestedNamespace22ClassInNestedNamespaceE.html">
+// HTML-NESTED-INDEX:                 <pre><code class="language-cpp code-clang-doc">class ClassInNestedNamespace</code></pre>
+// HTML-NESTED-INDEX:             </a>
+// HTML-NESTED-INDEX:         </li>
+// HTML-NESTED-INDEX:     </ul>
+// HTML-NESTED-INDEX-NOT: <h2 id="Functions">Functions</h2>
+// HTML-NESTED-INDEX-NOT: <h3 id="{{([0-9A-F]{40})}}">functionInNestedNamespace</h3>
+// HTML-NESTED-INDEX-NOT: <p>void functionInNestedNamespace()</p>
+// HTML-NESTED-INDEX-NOT: <p> Function in NestedNamespace</p>
 } // namespace PrimaryNamespace
 
 // MD-PRIMARY-INDEX: # namespace PrimaryNamespace
@@ -193,54 +145,38 @@ class ClassInNestedNamespace {};
 // MD-PRIMARY-INDEX: *void functionInPrimaryNamespace()*
 // MD-PRIMARY-INDEX:  Function in PrimaryNamespace
 
-// HTML-PRIMARY-INDEX: <h1>namespace PrimaryNamespace</h1>
-// HTML-PRIMARY-INDEX: <p> Primary Namespace</p>
-// HTML-PRIMARY-INDEX: <h2 id="Namespaces">Namespaces</h2>
-// HTML-PRIMARY-INDEX: <a href="NestedNamespace{{[\/]}}index.html">NestedNamespace</a>
-// HTML-PRIMARY-INDEX: <h2 id="Records">Records</h2>
-// HTML-PRIMARY-INDEX: <a href="ClassInPrimaryNamespace.html">ClassInPrimaryNamespace</a>
-// HTML-PRIMARY-INDEX: <h2 id="Functions">Functions</h2>
-// HTML-PRIMARY-INDEX: <h3 id="{{([0-9A-F]{40})}}">functionInPrimaryNamespace</h3>
-// HTML-PRIMARY-INDEX: <p>void functionInPrimaryNamespace()</p>
-// HTML-PRIMARY-INDEX: <p> Function in PrimaryNamespace</p>
-
-// MUSTACHE-PRIMARY-INDEX: <h2> PrimaryNamespace</h2>
-// MUSTACHE-PRIMARY-INDEX-NOT: <h2 id="Namespaces">Namespaces</h2>
-// MUSTACHE-PRIMARY-INDEX-NOT: <a href="NestedNamespace{{[\/]}}index.html">NestedNamespace</a>
-// MUSTACHE-PRIMARY-INDEX      <h2>Inner Classes</h2>
-// MUSTACHE-PRIMARY-INDEX          <ul class="class-container">
-// MUSTACHE-PRIMARY-INDEX              <li id="{{([0-9A-F]{40})}}" style="max-height: 40px;">
-// MUSTACHE-PRIMARY-INDEX                  <a href="_ZTVN16PrimaryNamespace23ClassInPrimaryNamespaceE.html">
-// MUSTACHE-PRIMARY-INDEX                      <pre><code class="language-cpp code-clang-doc">class ClassInPrimaryNamespace</code></pre>
-// MUSTACHE-PRIMARY-INDEX                  </a>
-// MUSTACHE-PRIMARY-INDEX              </li>
-// MUSTACHE-PRIMARY-INDEX          </ul>
-// MUSTACHE-PRIMARY-INDEX-NOT: <h2 id="Functions">Functions</h2>
-// MUSTACHE-PRIMARY-INDEX-NOT: <h3 id="{{([0-9A-F]{40})}}">functionInPrimaryNamespace</h3>
-// MUSTACHE-PRIMARY-INDEX-NOT: <p>void functionInPrimaryNamespace()</p>
-// MUSTACHE-PRIMARY-INDEX-NOT: <p> Function in PrimaryNamespace</p>
+// HTML-PRIMARY-INDEX: <h2> PrimaryNamespace</h2>
+// HTML-PRIMARY-INDEX-NOT: <h2 id="Namespaces">Namespaces</h2>
+// HTML-PRIMARY-INDEX-NOT: <a href="NestedNamespace{{[\/]}}index.html">NestedNamespace</a>
+// HTML-PRIMARY-INDEX      <h2>Inner Classes</h2>
+// HTML-PRIMARY-INDEX          <ul class="class-container">
+// HTML-PRIMARY-INDEX              <li id="{{([0-9A-F]{40})}}" style="max-height: 40px;">
+// HTML-PRIMARY-INDEX                  <a href="_ZTVN16PrimaryNamespace23ClassInPrimaryNamespaceE.html">
+// HTML-PRIMARY-INDEX                      <pre><code class="language-cpp code-clang-doc">class ClassInPrimaryNamespace</code></pre>
+// HTML-PRIMARY-INDEX                  </a>
+// HTML-PRIMARY-INDEX              </li>
+// HTML-PRIMARY-INDEX          </ul>
+// HTML-PRIMARY-INDEX-NOT: <h2 id="Functions">Functions</h2>
+// HTML-PRIMARY-INDEX-NOT: <h3 id="{{([0-9A-F]{40})}}">functionInPrimaryNamespace</h3>
+// HTML-PRIMARY-INDEX-NOT: <p>void functionInPrimaryNamespace()</p>
+// HTML-PRIMARY-INDEX-NOT: <p> Function in PrimaryNamespace</p>
 
 // AnotherNamespace
 namespace AnotherNamespace {
 // Function in AnotherNamespace
 void functionInAnotherNamespace() {}
 // MD-ANOTHER-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]*
-// HTML-ANOTHER-INDEX-LINE: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
-// MUSTACHE-ANOTHER-INDEX-LINE-NOT: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
+// HTML-ANOTHER-INDEX-LINE-NOT: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
 
 // Class in AnotherNamespace
 class ClassInAnotherNamespace {};
 // MD-ANOTHER-CLASS-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]*
 // HTML-ANOTHER-CLASS-LINE: <p>Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
-// MUSTACHE-ANOTHER-CLASS-LINE: <p>Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp</p>
 
 // MD-ANOTHER-CLASS: # class ClassInAnotherNamespace
 // MD-ANOTHER-CLASS:  Class in AnotherNamespace
 
-// HTML-ANOTHER-CLASS: <h1>class ClassInAnotherNamespace</h1>
-// HTML-ANOTHER-CLASS: <p> Class in AnotherNamespace</p>
-
-// MUSTACHE-ANOTHER-CLASS: <h1 class="hero__title-large">class ClassInAnotherNamespace</h1>
+// HTML-ANOTHER-CLASS: <h1 class="hero__title-large">class ClassInAnotherNamespace</h1>
 
 } // namespace AnotherNamespace
 
@@ -253,120 +189,27 @@ class ClassInAnotherNamespace {};
 // MD-ANOTHER-INDEX: *void functionInAnotherNamespace()*
 // MD-ANOTHER-INDEX: Function in AnotherNamespace
 
-// HTML-ANOTHER-INDEX: <h1>namespace AnotherNamespace</h1>
-// HTML-ANOTHER-INDEX: <p> AnotherNamespace</p>
-// HTML-ANOTHER-INDEX: <h2 id="Records">Records</h2>
-// HTML-ANOTHER-INDEX: <a href="ClassInAnotherNamespace.html">ClassInAnotherNamespace</a>
-// HTML-ANOTHER-INDEX: <h2 id="Functions">Functions</h2>
-// HTML-ANOTHER-INDEX: <h3 id="{{([0-9A-F]{40})}}">functionInAnotherNamespace</h3>
-// HTML-ANOTHER-INDEX: <p>void functionInAnotherNamespace()</p>
-// HTML-ANOTHER-INDEX: <p> Function in AnotherNamespace</p>
-
-// MUSTACHE-ANOTHER-INDEX: <h2> AnotherNamespace</h2>
-// MUSTACHE-ANOTHER-INDEX:     <h2>Inner Classes</h2>
-// MUSTACHE-ANOTHER-INDEX:     <ul class="class-container">
-// MUSTACHE-ANOTHER-INDEX:         <li id="{{([0-9A-F]{40})}}" style="max-height: 40px;">
-// MUSTACHE-ANOTHER-INDEX:             <a href="_ZTVN16AnotherNamespace23ClassInAnotherNamespaceE.html">
-// MUSTACHE-ANOTHER-INDEX:                 <pre><code class="language-cpp code-clang-doc">class ClassInAnotherNamespace</code></pre>
-// MUSTACHE-ANOTHER-INDEX:             </a>
-// MUSTACHE-ANOTHER-INDEX:         </li>
-// MUSTACHE-ANOTHER-INDEX:     </ul>
-// MUSTACHE-ANOTHER-INDEX-NOT: <h2 id="Functions">Functions</h2>
-// MUSTACHE-ANOTHER-INDEX-NOT: <h3 id="{{([0-9A-F]{40})}}">functionInAnotherNamespace</h3>
-// MUSTACHE-ANOTHER-INDEX-NOT: <p>void functionInAnotherNamespace()</p>
-// MUSTACHE-ANOTHER-INDEX-NOT: <p> Function in AnotherNamespace</p>
-
-// JSON-INDEX: async function LoadIndex() {
-// JSON-INDEX-NEXT: return{
-// JSON-INDEX-NEXT:   "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT:   "Name": "",
-// JSON-INDEX-NEXT:   "RefType": "default",
-// JSON-INDEX-NEXT:   "Path": "",
-// JSON-INDEX-NEXT:   "Children": [
-// JSON-INDEX-NEXT:     {
-// JSON-INDEX-NEXT:       "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT:       "Name": "@nonymous_namespace",
-// JSON-INDEX-NEXT:       "RefType": "namespace",
-// JSON-INDEX-NEXT:       "Path": "@nonymous_namespace",
-// JSON-INDEX-NEXT:       "Children": [
-// JSON-INDEX-NEXT:         {
-// JSON-INDEX-NEXT:           "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT:           "Name": "AnonClass",
-// JSON-INDEX-NEXT:           "RefType": "record",
-// JSON-INDEX-NEXT:           "Path": "@nonymous_namespace",
-// JSON-INDEX-NEXT:           "Children": []
-// JSON-INDEX-NEXT:         }
-// JSON-INDEX-NEXT:       ]
-// JSON-INDEX-NEXT:     },
-// JSON-INDEX-NEXT:     {
-// JSON-INDEX-NEXT:       "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT:       "Name": "AnotherNamespace",
-// JSON-INDEX-NEXT:       "RefType": "namespace",
-// JSON-INDEX-NEXT:       "Path": "AnotherNamespace",
-// JSON-INDEX-NEXT:       "Children": [
-// JSON-INDEX-NEXT:         {
-// JSON-INDEX-NEXT:           "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT:           "Name": "ClassInAnotherNamespace",
-// JSON-INDEX-NEXT:           "RefType": "record",
-// JSON-INDEX-NEXT:           "Path": "AnotherNamespace",
-// JSON-INDEX-NEXT:           "Children": []
-// JSON-INDEX-NEXT:         }
-// JSON-INDEX-NEXT:       ]
-// JSON-INDEX-NEXT:     },
-// JSON-INDEX-NEXT:     {
-// JSON-INDEX-NEXT:       "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT:       "Name": "GlobalNamespace",
-// JSON-INDEX-NEXT:       "RefType": "namespace",
-// JSON-INDEX-NEXT:       "Path": "GlobalNamespace",
-// JSON-INDEX-NEXT:       "Children": []
-// JSON-INDEX-NEXT:     },
-// JSON-INDEX-NEXT:     {
-// JSON-INDEX-NEXT:       "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT:       "Name": "PrimaryNamespace",
-// JSON-INDEX-NEXT:       "RefType": "namespace",
-// JSON-INDEX-NEXT:       "Path": "PrimaryNamespace",
-// JSON-INDEX-NEXT:       "Children": [
-// JSON-INDEX-NEXT:         {
-// JSON-INDEX-NEXT:           "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT:           "Name": "ClassInPrimaryNamespace",
-// JSON-INDEX-NEXT:           "RefType": "record",
-// JSON-INDEX-NEXT:           "Path": "PrimaryNamespace",
-// JSON-INDEX-NEXT:           "Children": []
-// JSON-INDEX-NEXT:         },
-// JSON-INDEX-NEXT:         {
-// JSON-INDEX-NEXT:           "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT:           "Name": "NestedNamespace",
-// JSON-INDEX-NEXT:           "RefType": "namespace",
-// JSON-INDEX-NEXT:           "Path": "PrimaryNamespace{{[\/]+}}NestedNamespace",
-// JSON-INDEX-NEXT:           "Children": [
-// JSON-INDEX-NEXT:             {
-// JSON-INDEX-NEXT:               "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT:               "Name": "ClassInNestedNamespace",
-// JSON-INDEX-NEXT:               "RefType": "record",
-// JSON-INDEX-NEXT:               "Path": "PrimaryNamespace{{[\/]+}}NestedNamespace",
-// JSON-INDEX-NEXT:               "Children": []
-// JSON-INDEX-NEXT:             }
-// JSON-INDEX-NEXT:           ]
-// JSON-INDEX-NEXT:         }
-// JSON-INDEX-NEXT:       ]
-// JSON-INDEX-NEXT:     }
-// JSON-INDEX-NEXT:   ]
-// JSON-INDEX-NEXT: };
-// JSON-INDEX-NEXT: }
-
-// HTML-GLOBAL-INDEX: <div id="main-content" class="col-xs-12 col-sm-9 col-md-8 main-content">
-// HTML-GLOBAL-INDEX: <h1>Global Namespace</h1>
-// HTML-GLOBAL-INDEX: <h2 id="Namespaces">Namespaces</h2>
-// HTML-GLOBAL-INDEX: <li>@nonymous_namespace</li>
-// HTML-GLOBAL-INDEX: <li>AnotherNamespace</li>
-// HTML-GLOBAL-INDEX: <li>PrimaryNamespace</li>
-
-// MUSTACHE-GLOBAL-INDEX: <div id="main-content" class="col-xs-12 col-sm-9 col-md-8 main-content">
-// MUSTACHE-GLOBAL-INDEX: <h1>Global Namespace</h1>
-// MUSTACHE-GLOBAL-INDEX: <h2 id="Namespaces">Namespaces</h2>
-// MUSTACHE-GLOBAL-INDEX: <li>@nonymous_namespace</li>
-// MUSTACHE-GLOBAL-INDEX: <li>AnotherNamespace</li>
-// MUSTACHE-GLOBAL-INDEX: <li>PrimaryNamespace</li>
+// HTML-ANOTHER-INDEX: <h2> AnotherNamespace</h2>
+// HTML-ANOTHER-INDEX:     <h2>Inner Classes</h2>
+// HTML-ANOTHER-INDEX:     <ul class="class-container">
+// HTML-ANOTHER-INDEX:         <li id="{{([0-9A-F]{40})}}" style="max-height: 40px;">
+// HTML-ANOTHER-INDEX:             <a href="_ZTVN16AnotherNamespace23ClassInAnotherNamespaceE.html">
+// HTML-ANOTHER-INDEX:                 <pre><code class="language-cpp code-clang-doc">class ClassInAnotherNamespace</code></pre>
+// HTML-ANOTHER-INDEX:             </a>
+// HTML-ANOTHER-INDEX:         </li>
+// HTML-ANOTHER-INDEX:     </ul>
+// HTML-ANOTHER-INDEX-NOT: <h2 id="Functions">Functions</h2>
+// HTML-ANOTHER-INDEX-NOT: <h3 id="{{([0-9A-F]{40})}}">functionInAnotherNamespace</h3>
+// HTML-ANOTHER-INDEX-NOT: <p>void functionInAnotherNamespace()</p>
+// HTML-ANOTHER-INDEX-NOT: <p> Function in AnotherNamespace</p>
+
+// COM: FIXME: Add namespaces to namespace template
+// HTML-GLOBAL-INDEX-NOT: <div id="main-content" class="col-xs-12 col-sm-9 col-md-8 main-content">
+// HTML-GLOBAL-INDEX-NOT: <h1>Global Namespace</h1>
+// HTML-GLOBAL-INDEX-NOT: <h2 id="Namespaces">Namespaces</h2>
+// HTML-GLOBAL-INDEX-NOT: <li>@nonymous_namespace</li>
+// HTML-GLOBAL-INDEX-NOT: <li>AnotherNamespace</li>
+// HTML-GLOBAL-INDEX-NOT: <li>PrimaryNamespace</li>
 
 // MD-GLOBAL-INDEX: # Global Namespace
 // MD-GLOBAL-INDEX: ## Namespaces
diff --git a/clang-tools-extra/test/clang-doc/test-path-abs.cpp b/clang-tools-extra/test/clang-doc/test-path-abs.cpp
deleted file mode 100644
index 8875a3a73ab7e..0000000000000
--- a/clang-tools-extra/test/clang-doc/test-path-abs.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-// RUN: rm -rf %t && mkdir -p %t
-// RUN: clang-doc --format=html --executor=standalone %s --output=%t --base base_dir
-// RUN: FileCheck %s -input-file=%t/index_json.js  -check-prefix=JSON-INDEX
-
-// JSON-INDEX: var RootPath = "{{.*}}test-path-abs.cpp.tmp";
-// JSON-INDEX-NEXT: var Base = "base_dir";
-
diff --git a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
index 18166acf9bbca..01b34ec9a791e 100644
--- a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
+++ b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
@@ -26,7 +26,6 @@ add_extra_unittest(ClangDocTests
   ClangDocTest.cpp
   GeneratorTest.cpp
   HTMLGeneratorTest.cpp
-  HTMLMustacheGeneratorTest.cpp
   MDGeneratorTest.cpp
   MergeTest.cpp
   SerializeTest.cpp
diff --git a/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp b/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp
index 2fe443d9db5c5..cf510afe214dd 100644
--- a/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp
@@ -9,17 +9,22 @@
 #include "ClangDocTest.h"
 #include "Generators.h"
 #include "Representation.h"
+#include "config.h"
+#include "support/Utils.h"
 #include "clang/Basic/Version.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
-namespace clang {
-namespace doc {
+using namespace llvm;
+using namespace testing;
+using namespace clang;
+using namespace clang::doc;
 
-static const std::string ClangDocVersion =
-    clang::getClangToolFullVersion("clang-doc");
+static const std::string ClangDocVersion = getClangToolFullVersion("clang-doc");
 
 static std::unique_ptr<Generator> getHTMLGenerator() {
-  auto G = doc::findGeneratorByName("html");
+  auto G = findGeneratorByName("html");
   if (!G)
     return nullptr;
   return std::move(G.get());
@@ -50,454 +55,10 @@ class HTMLGeneratorTest : public ClangDocContextTest {
   }
 };
 
-TEST_F(HTMLGeneratorTest, emitNamespaceHTML) {
-  NamespaceInfo I;
-  I.Name = "Namespace";
-  I.Namespace.emplace_back(EmptySID, "A", InfoType::IT_namespace);
-
-  I.Children.Namespaces.emplace_back(EmptySID, "ChildNamespace",
-                                     InfoType::IT_namespace,
-                                     "Namespace::ChildNamespace", "Namespace");
-  I.Children.Records.emplace_back(EmptySID, "ChildStruct", InfoType::IT_record,
-                                  "Namespace::ChildStruct", "Namespace");
-  I.Children.Functions.emplace_back();
-  I.Children.Functions.back().Access = AccessSpecifier::AS_none;
-  I.Children.Functions.back().Name = "OneFunction";
-  I.Children.Enums.emplace_back();
-  I.Children.Enums.back().Name = "OneEnum";
-
-  auto G = getHTMLGenerator();
-  assert(G);
-  std::string Buffer;
-  llvm::raw_string_ostream Actual(Buffer);
-  ClangDocContext CDCtx = getClangDocContext({"user-provided-stylesheet.css"});
-  auto Err = G->generateDocForInfo(&I, Actual, CDCtx);
-  assert(!Err);
-  std::string Expected = R"raw(<!DOCTYPE html>
-<meta charset="utf-8"/>
-<title>namespace Namespace</title>
-<link rel="stylesheet" href="../clang-doc-default-stylesheet.css"/>
-<link rel="stylesheet" href="../user-provided-stylesheet.css"/>
-<script src="../index_json.js"></script>
-<script src="../index.js"></script>
-<header id="project-title">test-project</header>
-<main>
-  <div id="sidebar-left" path="Namespace" class="col-xs-6 col-sm-3 col-md-2 sidebar sidebar-offcanvas-left"></div>
-  <div id="main-content" class="col-xs-12 col-sm-9 col-md-8 main-content">
-    <h1>namespace Namespace</h1>
-    <h2 id="Namespaces">Namespaces</h2>
-    <ul>
-      <li>
-        <a href="ChildNamespace/index.html">ChildNamespace</a>
-      </li>
-    </ul>
-    <h2 id="Records">Records</h2>
-    <ul>
-      <li>
-        <a href="ChildStruct.html">ChildStruct</a>
-      </li>
-    </ul>
-    <h2 id="Functions">Functions</h2>
-    <div>
-      <h3 id="0000000000000000000000000000000000000000">OneFunction</h3>
-      <p>OneFunction()</p>
-    </div>
-    <h2 id="Enums">Enums</h2>
-    <div>
-      <table id="0000000000000000000000000000000000000000">
-        <thead>
-          <tr>
-            <th colspan="2">enum OneEnum</th>
-          </tr>
-        </thead>
-      </table>
-    </div>
-  </div>
-  <div id="sidebar-right" class="col-xs-6 col-sm-6 col-md-2 sidebar sidebar-offcanvas-right">
-    <ol>
-      <li>
-        <span>
-          <a href="#Namespaces">Namespaces</a>
-        </span>
-      </li>
-      <li>
-        <span>
-          <a href="#Records">Records</a>
-        </span>
-      </li>
-      <li>
-        <span>
-          <a href="#Functions">Functions</a>
-        </span>
-        <ul>
-          <li>
-            <span>
-              <a href="#0000000000000000000000000000000000000000">OneFunction</a>
-            </span>
-          </li>
-        </ul>
-      </li>
-      <li>
-        <span>
-          <a href="#Enums">Enums</a>
-        </span>
-        <ul>
-          <li>
-            <span>
-              <a href="#0000000000000000000000000000000000000000">OneEnum</a>
-            </span>
-          </li>
-        </ul>
-      </li>
-    </ol>
-  </div>
-</main>
-<footer>
-  <span class="no-break">)raw" +
-                         ClangDocVersion + R"raw(</span>
-</footer>
-)raw";
-
-  EXPECT_EQ(Expected, Actual.str());
-}
-
-TEST_F(HTMLGeneratorTest, emitRecordHTML) {
-  RecordInfo I;
-  I.Name = "r";
-  I.Path = "X/Y/Z";
-  I.Namespace.emplace_back(EmptySID, "A", InfoType::IT_namespace);
-
-  I.DefLoc = Location(10, 10, "dir/test.cpp", true);
-  I.Loc.emplace_back(12, 12, "test.cpp");
-
-  SmallString<16> PathTo;
-  llvm::sys::path::native("path/to", PathTo);
-  I.Members.emplace_back(TypeInfo("int"), "X", AccessSpecifier::AS_private);
-  I.TagType = TagTypeKind::Class;
-  I.Parents.emplace_back(EmptySID, "F", InfoType::IT_record, "F", PathTo);
-  I.VirtualParents.emplace_back(EmptySID, "G", InfoType::IT_record);
-
-  I.Children.Records.emplace_back(EmptySID, "ChildStruct", InfoType::IT_record,
-                                  "X::Y::Z::r::ChildStruct", "X/Y/Z/r");
-  I.Children.Functions.emplace_back();
-  I.Children.Functions.back().Name = "OneFunction";
-  I.Children.Enums.emplace_back();
-  I.Children.Enums.back().Name = "OneEnum";
-
-  auto G = getHTMLGenerator();
-  assert(G);
-  std::string Buffer;
-  llvm::raw_string_ostream Actual(Buffer);
-  ClangDocContext CDCtx = getClangDocContext({}, "http://www.repository.com");
-  auto Err = G->generateDocForInfo(&I, Actual, CDCtx);
-  assert(!Err);
-  std::string Expected = R"raw(<!DOCTYPE html>
-<meta charset="utf-8"/>
-<title>class r</title>
-<link rel="stylesheet" href="../../../clang-doc-default-stylesheet.css"/>
-<script src="../../../index_json.js"></script>
-<script src="../../../index.js"></script>
-<header id="project-title">test-project</header>
-<main>
-  <div id="sidebar-left" path="X/Y/Z" class="col-xs-6 col-sm-3 col-md-2 sidebar sidebar-offcanvas-left"></div>
-  <div id="main-content" class="col-xs-12 col-sm-9 col-md-8 main-content">
-    <h1>class r</h1>
-    <p>
-      Defined at line 
-      <a href="http://www.repository.com/dir/test.cpp#10">10</a>
-       of file 
-      <a href="http://www.repository.com/dir/test.cpp">test.cpp</a>
-    </p>
-    <p>
-      Inherits from 
-      <a href="../../../path/to/F.html">F</a>
-      , G
-    </p>
-    <h2 id="Members">Members</h2>
-    <ul>
-      <li>
-        <div>private int X</div>
-      </li>
-    </ul>
-    <h2 id="Records">Records</h2>
-    <ul>
-      <li>
-        <a href="../../../X/Y/Z/r/ChildStruct.html">ChildStruct</a>
-      </li>
-    </ul>
-    <h2 id="Functions">Functions</h2>
-    <div>
-      <h3 id="0000000000000000000000000000000000000000">OneFunction</h3>
-      <p>public OneFunction()</p>
-    </div>
-    <h2 id="Enums">Enums</h2>
-    <div>
-      <table id="0000000000000000000000000000000000000000">
-        <thead>
-          <tr>
-            <th colspan="2">enum OneEnum</th>
-          </tr>
-        </thead>
-      </table>
-    </div>
-  </div>
-  <div id="sidebar-right" class="col-xs-6 col-sm-6 col-md-2 sidebar sidebar-offcanvas-right">
-    <ol>
-      <li>
-        <span>
-          <a href="#Members">Members</a>
-        </span>
-      </li>
-      <li>
-        <span>
-          <a href="#Records">Records</a>
-        </span>
-      </li>
-      <li>
-        <span>
-          <a href="#Functions">Functions</a>
-        </span>
-        <ul>
-          <li>
-            <span>
-              <a href="#0000000000000000000000000000000000000000">OneFunction</a>
-            </span>
-          </li>
-        </ul>
-      </li>
-      <li>
-        <span>
-          <a href="#Enums">Enums</a>
-        </span>
-        <ul>
-          <li>
-            <span>
-              <a href="#0000000000000000000000000000000000000000">OneEnum</a>
-            </span>
-          </li>
-        </ul>
-      </li>
-    </ol>
-  </div>
-</main>
-<footer>
-  <span class="no-break">)raw" +
-                         ClangDocVersion + R"raw(</span>
-</footer>
-)raw";
-
-  EXPECT_EQ(Expected, Actual.str());
-}
-
-TEST_F(HTMLGeneratorTest, emitFunctionHTML) {
-  FunctionInfo I;
-  I.Name = "f";
-  I.Namespace.emplace_back(EmptySID, "A", InfoType::IT_namespace);
-
-  I.DefLoc = Location(10, 10, "dir/test.cpp", true);
-  I.Loc.emplace_back(12, 12, "test.cpp");
-
-  I.Access = AccessSpecifier::AS_none;
-
-  SmallString<16> PathTo;
-  llvm::sys::path::native("path/to", PathTo);
-  I.ReturnType = TypeInfo(
-      Reference(EmptySID, "float", InfoType::IT_default, "float", PathTo));
-  I.Params.emplace_back(TypeInfo("int", PathTo), "P");
-  I.IsMethod = true;
-  I.Parent = Reference(EmptySID, "Parent", InfoType::IT_record);
-
-  auto G = getHTMLGenerator();
-  assert(G);
-  std::string Buffer;
-  llvm::raw_string_ostream Actual(Buffer);
-  ClangDocContext CDCtx = getClangDocContext({}, "https://www.repository.com");
-  auto Err = G->generateDocForInfo(&I, Actual, CDCtx);
-  assert(!Err);
-  std::string Expected = R"raw(<!DOCTYPE html>
-<meta charset="utf-8"/>
-<title></title>
-<link rel="stylesheet" href="clang-doc-default-stylesheet.css"/>
-<script src="index_json.js"></script>
-<script src="index.js"></script>
-<header id="project-title">test-project</header>
-<main>
-  <div id="sidebar-left" path="" class="col-xs-6 col-sm-3 col-md-2 sidebar sidebar-offcanvas-left"></div>
-  <div id="main-content" class="col-xs-12 col-sm-9 col-md-8 main-content">
-    <h3 id="0000000000000000000000000000000000000000">f</h3>
-    <p>
-      <a href="path/to/float.html">float</a>
-       f(
-      <a href="path/to/int.html">int</a>
-       P)
-    </p>
-    <p>
-      Defined at line 
-      <a href="https://www.repository.com/dir/test.cpp#10">10</a>
-       of file 
-      <a href="https://www.repository.com/dir/test.cpp">test.cpp</a>
-    </p>
-  </div>
-  <div id="sidebar-right" class="col-xs-6 col-sm-6 col-md-2 sidebar sidebar-offcanvas-right"></div>
-</main>
-<footer>
-  <span class="no-break">)raw" +
-                         ClangDocVersion + R"raw(</span>
-</footer>
-)raw";
-
-  EXPECT_EQ(Expected, Actual.str());
-}
-
-TEST_F(HTMLGeneratorTest, emitEnumHTML) {
-  EnumInfo I;
-  I.Name = "e";
-  I.Namespace.emplace_back(EmptySID, "A", InfoType::IT_namespace);
-
-  I.DefLoc = Location(10, 10, "test.cpp", true);
-  I.Loc.emplace_back(12, 12, "test.cpp");
-
-  I.Members.emplace_back("X");
-  I.Scoped = true;
-
+TEST_F(HTMLGeneratorTest, createResources) {
   auto G = getHTMLGenerator();
-  assert(G);
-  std::string Buffer;
-  llvm::raw_string_ostream Actual(Buffer);
-  ClangDocContext CDCtx = getClangDocContext({}, "www.repository.com");
-  auto Err = G->generateDocForInfo(&I, Actual, CDCtx);
-  assert(!Err);
-  std::string Expected = R"raw(<!DOCTYPE html>
-<meta charset="utf-8"/>
-<title></title>
-<link rel="stylesheet" href="clang-doc-default-stylesheet.css"/>
-<script src="index_json.js"></script>
-<script src="index.js"></script>
-<header id="project-title">test-project</header>
-<main>
-  <div id="sidebar-left" path="" class="col-xs-6 col-sm-3 col-md-2 sidebar sidebar-offcanvas-left"></div>
-  <div id="main-content" class="col-xs-12 col-sm-9 col-md-8 main-content">
-    <table id="0000000000000000000000000000000000000000">
-      <thead>
-        <tr>
-          <th colspan="2">enum class e</th>
-        </tr>
-      </thead>
-      <tbody>
-        <tr>
-          <td>X</td>
-          <td>0</td>
-        </tr>
-      </tbody>
-    </table>
-    <p>
-      Defined at line 
-      <a href="https://www.repository.com/test.cpp#10">10</a>
-       of file 
-      <a href="https://www.repository.com/test.cpp">test.cpp</a>
-    </p>
-  </div>
-  <div id="sidebar-right" class="col-xs-6 col-sm-6 col-md-2 sidebar sidebar-offcanvas-right"></div>
-</main>
-<footer>
-  <span class="no-break">)raw" +
-                         ClangDocVersion + R"raw(</span>
-</footer>
-)raw";
-
-  EXPECT_EQ(Expected, Actual.str());
-}
-
-TEST_F(HTMLGeneratorTest, emitCommentHTML) {
-  FunctionInfo I;
-  I.Name = "f";
-  I.DefLoc = Location(10, 10, "test.cpp", true);
-  I.ReturnType = TypeInfo("void");
-  I.Params.emplace_back(TypeInfo("int"), "I");
-  I.Params.emplace_back(TypeInfo("int"), "J");
-  I.Access = AccessSpecifier::AS_none;
-
-  CommentInfo Top;
-  Top.Kind = CommentKind::CK_FullComment;
-
-  Top.Children.emplace_back(std::make_unique<CommentInfo>());
-  CommentInfo *BlankLine = Top.Children.back().get();
-  BlankLine->Kind = CommentKind::CK_ParagraphComment;
-  BlankLine->Children.emplace_back(std::make_unique<CommentInfo>());
-  BlankLine->Children.back()->Kind = CommentKind::CK_TextComment;
-
-  Top.Children.emplace_back(std::make_unique<CommentInfo>());
-  CommentInfo *Brief = Top.Children.back().get();
-  Brief->Kind = CommentKind::CK_ParagraphComment;
-  Brief->Children.emplace_back(std::make_unique<CommentInfo>());
-  Brief->Children.back()->Kind = CommentKind::CK_TextComment;
-  Brief->Children.back()->Name = "ParagraphComment";
-  Brief->Children.back()->Text = " Brief description.";
-
-  Top.Children.emplace_back(std::make_unique<CommentInfo>());
-  CommentInfo *Extended = Top.Children.back().get();
-  Extended->Kind = CommentKind::CK_ParagraphComment;
-  Extended->Children.emplace_back(std::make_unique<CommentInfo>());
-  Extended->Children.back()->Kind = CommentKind::CK_TextComment;
-  Extended->Children.back()->Text = " Extended description that";
-  Extended->Children.emplace_back(std::make_unique<CommentInfo>());
-  Extended->Children.back()->Kind = CommentKind::CK_TextComment;
-  Extended->Children.back()->Text = " continues onto the next line.";
-
-  Top.Children.emplace_back(std::make_unique<CommentInfo>());
-  CommentInfo *Entities = Top.Children.back().get();
-  Entities->Kind = CommentKind::CK_ParagraphComment;
-  Entities->Children.emplace_back(std::make_unique<CommentInfo>());
-  Entities->Children.back()->Kind = CommentKind::CK_TextComment;
-  Entities->Children.back()->Name = "ParagraphComment";
-  Entities->Children.back()->Text =
-      " Comment with html entities: &, <, >, \", \'.";
-
-  I.Description.emplace_back(std::move(Top));
-
-  auto G = getHTMLGenerator();
-  assert(G);
-  std::string Buffer;
-  llvm::raw_string_ostream Actual(Buffer);
+  ASSERT_THAT(G, NotNull()) << "Could not find HTMLGenerator";
   ClangDocContext CDCtx = getClangDocContext();
-  auto Err = G->generateDocForInfo(&I, Actual, CDCtx);
-  assert(!Err);
-  std::string Expected = R"raw(<!DOCTYPE html>
-<meta charset="utf-8"/>
-<title></title>
-<link rel="stylesheet" href="clang-doc-default-stylesheet.css"/>
-<script src="index_json.js"></script>
-<script src="index.js"></script>
-<header id="project-title">test-project</header>
-<main>
-  <div id="sidebar-left" path="" class="col-xs-6 col-sm-3 col-md-2 sidebar sidebar-offcanvas-left"></div>
-  <div id="main-content" class="col-xs-12 col-sm-9 col-md-8 main-content">
-    <h3 id="0000000000000000000000000000000000000000">f</h3>
-    <p>void f(int I, int J)</p>
-    <p>
-      Defined at line 
-      <a href="test.cpp#10">10</a>
-       of file 
-      <a href="test.cpp">test.cpp</a>
-    </p>
-    <div>
-      <div>
-        <p> Brief description.</p>
-        <p> Extended description that continues onto the next line.</p>
-        <p> Comment with html entities: &amp;, &lt;, &gt;, &quot;, &apos;.</p>
-      </div>
-    </div>
-  </div>
-  <div id="sidebar-right" class="col-xs-6 col-sm-6 col-md-2 sidebar sidebar-offcanvas-right"></div>
-</main>
-<footer>
-  <span class="no-break">)raw" +
-                         ClangDocVersion + R"raw(</span>
-</footer>
-)raw";
-
-  EXPECT_EQ(Expected, Actual.str());
+  EXPECT_THAT_ERROR(G->createResources(CDCtx), Failed())
+      << "Empty UserStylesheets or JsScripts should fail!";
 }
-
-} // namespace doc
-} // namespace clang
diff --git a/clang-tools-extra/unittests/clang-doc/HTMLMustacheGeneratorTest.cpp b/clang-tools-extra/unittests/clang-doc/HTMLMustacheGeneratorTest.cpp
deleted file mode 100644
index 7bbd299d7b59c..0000000000000
--- a/clang-tools-extra/unittests/clang-doc/HTMLMustacheGeneratorTest.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-//===-- clang-doc/HTMLMustacheGeneratorTest.cpp ---------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "ClangDocTest.h"
-#include "Generators.h"
-#include "Representation.h"
-#include "config.h"
-#include "support/Utils.h"
-#include "clang/Basic/Version.h"
-#include "llvm/Testing/Support/Error.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-
-using namespace llvm;
-using namespace testing;
-using namespace clang;
-using namespace clang::doc;
-
-// FIXME: Don't enable unit tests that can read files. Remove once we can use
-// lit to test these properties.
-#define ENABLE_LOCAL_TEST 0
-
-static const std::string ClangDocVersion = getClangToolFullVersion("clang-doc");
-
-static std::unique_ptr<Generator> getHTMLMustacheGenerator() {
-  auto G = findGeneratorByName("mustache");
-  if (!G)
-    return nullptr;
-  return std::move(G.get());
-}
-
-class HTMLMustacheGeneratorTest : public ClangDocContextTest {
-protected:
-  ClangDocContext
-  getClangDocContext(std::vector<std::string> UserStylesheets = {},
-                     StringRef RepositoryUrl = "",
-                     StringRef RepositoryLinePrefix = "", StringRef Base = "") {
-    ClangDocContext CDCtx{nullptr,
-                          "test-project",
-                          false,
-                          "",
-                          "",
-                          RepositoryUrl,
-                          RepositoryLinePrefix,
-                          Base,
-                          UserStylesheets,
-                          Diags,
-                          false};
-    CDCtx.UserStylesheets.insert(CDCtx.UserStylesheets.begin(), "");
-    CDCtx.JsScripts.emplace_back("");
-    return CDCtx;
-  }
-};
-
-TEST_F(HTMLMustacheGeneratorTest, createResources) {
-  auto G = getHTMLMustacheGenerator();
-  ASSERT_THAT(G, NotNull()) << "Could not find HTMLMustacheGenerator";
-  ClangDocContext CDCtx = getClangDocContext();
-  EXPECT_THAT_ERROR(G->createResources(CDCtx), Failed())
-      << "Empty UserStylesheets or JsScripts should fail!";
-}

From 4bff9fdb908b1f9a3710e4570d249e24bd2aae4d Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Tue, 9 Dec 2025 19:50:45 +0000
Subject: [PATCH 45/85] [gn build] Port 24117f75ad9d

---
 llvm/utils/gn/secondary/clang-tools-extra/clang-doc/BUILD.gn     | 1 -
 .../gn/secondary/clang-tools-extra/unittests/clang-doc/BUILD.gn  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-doc/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-doc/BUILD.gn
index 5815537318b3a..280d72f4d36b5 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-doc/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-doc/BUILD.gn
@@ -22,7 +22,6 @@ static_library("clang-doc") {
     "ClangDoc.cpp",
     "Generators.cpp",
     "HTMLGenerator.cpp",
-    "HTMLMustacheGenerator.cpp",
     "JSONGenerator.cpp",
     "MDGenerator.cpp",
     "Mapper.cpp",
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/unittests/clang-doc/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/unittests/clang-doc/BUILD.gn
index 4cbd3b96d8ff4..427a64e7a8b10 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/unittests/clang-doc/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/unittests/clang-doc/BUILD.gn
@@ -40,7 +40,6 @@ unittest("ClangDocTests") {
     "ClangDocTest.cpp",
     "GeneratorTest.cpp",
     "HTMLGeneratorTest.cpp",
-    "HTMLMustacheGeneratorTest.cpp",
     "JSONGeneratorTest.cpp",
     "MDGeneratorTest.cpp",
     "MergeTest.cpp",

From dc92bd03c965a200724a51ec6d5e89f2d449f1dd Mon Sep 17 00:00:00 2001
From: Andrew Haberlandt <ndrewh@users.noreply.github.com>
Date: Tue, 9 Dec 2025 12:05:43 -0800
Subject: [PATCH 46/85] [sanitizer_common] [Darwin] Replace pty with pipe on
 posix_spawn path for spawning symbolizer (#170809)

Due to a legacy incompatibility with `atos`, we were allocating a pty
whenever we spawned the symbolizer. This is no longer necessary and we
can use a regular ol' pipe.

This PR is split into two commits:
- The first removes the pty allocation and replaces it with a pipe. This
relocates the `CreateTwoHighNumberedPipes` call to be common to the
`posix_spawn` and `StartSubprocess` path.
- The second commit adds the `child_stdin_fd_` field to
`SymbolizerProcess`, storing the read end of the stdin pipe. By holding
on to this fd for the lifetime of the symbolizer, we are able to avoid
getting SIGPIPE (which would occur when we write to a pipe whose
read-end had been closed due to the death of the symbolizer). This will
be very close to solving #120915, but this PR is intentionally not
touching the non-posix_spawn path.

rdar://165894284
---
 .../lib/sanitizer_common/sanitizer_mac.cpp    | 97 ++++++-------------
 .../lib/sanitizer_common/sanitizer_posix.h    |  3 +-
 .../sanitizer_symbolizer_internal.h           |  6 +-
 .../sanitizer_symbolizer_libcdep.cpp          | 12 ++-
 .../sanitizer_symbolizer_posix_libcdep.cpp    | 37 +++----
 5 files changed, 65 insertions(+), 90 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
index a6f757173728b..3f8de8dd064a5 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
@@ -281,53 +281,43 @@ int internal_sysctlbyname(const char *sname, void *oldp, uptr *oldlenp,
                       (size_t)newlen);
 }
 
-static fd_t internal_spawn_impl(const char *argv[], const char *envp[],
-                                pid_t *pid) {
-  fd_t primary_fd = kInvalidFd;
-  fd_t secondary_fd = kInvalidFd;
-
+bool internal_spawn(const char* argv[], const char* envp[], pid_t* pid,
+                    fd_t fd_stdin, fd_t fd_stdout) {
+  // NOTE: Caller ensures that fd_stdin and fd_stdout are not 0, 1, or 2, since
+  // this can break communication.
+  //
+  // NOTE: Caller is responsible for closing fd_stdin after the process has
+  // died.
+
+  int res;
   auto fd_closer = at_scope_exit([&] {
-    internal_close(primary_fd);
-    internal_close(secondary_fd);
+    // NOTE: We intentionally do not close fd_stdin since this can
+    // cause us to receive a fatal SIGPIPE if the process dies.
+    internal_close(fd_stdout);
   });
 
-  // We need a new pseudoterminal to avoid buffering problems. The 'atos' tool
-  // in particular detects when it's talking to a pipe and forgets to flush the
-  // output stream after sending a response.
-  primary_fd = posix_openpt(O_RDWR);
-  if (primary_fd == kInvalidFd)
-    return kInvalidFd;
-
-  int res = grantpt(primary_fd) || unlockpt(primary_fd);
-  if (res != 0) return kInvalidFd;
-
-  // Use TIOCPTYGNAME instead of ptsname() to avoid threading problems.
-  char secondary_pty_name[128];
-  res = ioctl(primary_fd, TIOCPTYGNAME, secondary_pty_name);
-  if (res == -1) return kInvalidFd;
-
-  secondary_fd = internal_open(secondary_pty_name, O_RDWR);
-  if (secondary_fd == kInvalidFd)
-    return kInvalidFd;
-
   // File descriptor actions
   posix_spawn_file_actions_t acts;
   res = posix_spawn_file_actions_init(&acts);
-  if (res != 0) return kInvalidFd;
+  if (res != 0)
+    return false;
 
   auto acts_cleanup = at_scope_exit([&] {
     posix_spawn_file_actions_destroy(&acts);
   });
 
-  res = posix_spawn_file_actions_adddup2(&acts, secondary_fd, STDIN_FILENO) ||
-        posix_spawn_file_actions_adddup2(&acts, secondary_fd, STDOUT_FILENO) ||
-        posix_spawn_file_actions_addclose(&acts, secondary_fd);
-  if (res != 0) return kInvalidFd;
+  res = posix_spawn_file_actions_adddup2(&acts, fd_stdin, STDIN_FILENO) ||
+        posix_spawn_file_actions_adddup2(&acts, fd_stdout, STDOUT_FILENO) ||
+        posix_spawn_file_actions_addclose(&acts, fd_stdin) ||
+        posix_spawn_file_actions_addclose(&acts, fd_stdout);
+  if (res != 0)
+    return false;
 
   // Spawn attributes
   posix_spawnattr_t attrs;
   res = posix_spawnattr_init(&attrs);
-  if (res != 0) return kInvalidFd;
+  if (res != 0)
+    return false;
 
   auto attrs_cleanup  = at_scope_exit([&] {
     posix_spawnattr_destroy(&attrs);
@@ -336,50 +326,17 @@ static fd_t internal_spawn_impl(const char *argv[], const char *envp[],
   // In the spawned process, close all file descriptors that are not explicitly
   // described by the file actions object. This is Darwin-specific extension.
   res = posix_spawnattr_setflags(&attrs, POSIX_SPAWN_CLOEXEC_DEFAULT);
-  if (res != 0) return kInvalidFd;
+  if (res != 0)
+    return false;
 
   // posix_spawn
   char **argv_casted = const_cast<char **>(argv);
   char **envp_casted = const_cast<char **>(envp);
   res = posix_spawn(pid, argv[0], &acts, &attrs, argv_casted, envp_casted);
-  if (res != 0) return kInvalidFd;
-
-  // Disable echo in the new terminal, disable CR.
-  struct termios termflags;
-  tcgetattr(primary_fd, &termflags);
-  termflags.c_oflag &= ~ONLCR;
-  termflags.c_lflag &= ~ECHO;
-  tcsetattr(primary_fd, TCSANOW, &termflags);
-
-  // On success, do not close primary_fd on scope exit.
-  fd_t fd = primary_fd;
-  primary_fd = kInvalidFd;
-
-  return fd;
-}
-
-fd_t internal_spawn(const char *argv[], const char *envp[], pid_t *pid) {
-  // The client program may close its stdin and/or stdout and/or stderr thus
-  // allowing open/posix_openpt to reuse file descriptors 0, 1 or 2. In this
-  // case the communication is broken if either the parent or the child tries to
-  // close or duplicate these descriptors. We temporarily reserve these
-  // descriptors here to prevent this.
-  fd_t low_fds[3];
-  size_t count = 0;
-
-  for (; count < 3; count++) {
-    low_fds[count] = posix_openpt(O_RDWR);
-    if (low_fds[count] >= STDERR_FILENO)
-      break;
-  }
-
-  fd_t fd = internal_spawn_impl(argv, envp, pid);
-
-  for (; count > 0; count--) {
-    internal_close(low_fds[count]);
-  }
+  if (res != 0)
+    return false;
 
-  return fd;
+  return true;
 }
 
 uptr internal_rename(const char *oldpath, const char *newpath) {
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_posix.h
index b5491c540dc08..063408b8360c1 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_posix.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix.h
@@ -67,7 +67,8 @@ uptr internal_ptrace(int request, int pid, void *addr, void *data);
 uptr internal_waitpid(int pid, int *status, int options);
 
 int internal_fork();
-fd_t internal_spawn(const char *argv[], const char *envp[], pid_t *pid);
+bool internal_spawn(const char* argv[], const char* envp[], pid_t* pid,
+                    fd_t stdin, fd_t stdout);
 
 int internal_sysctl(const int *name, unsigned int namelen, void *oldp,
                     uptr *oldlenp, const void *newp, uptr newlen);
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h
index 2345aee985541..6442a2980bf2f 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h
@@ -83,7 +83,7 @@ class SymbolizerProcess {
   const char *SendCommand(const char *command);
 
  protected:
-  ~SymbolizerProcess() {}
+  ~SymbolizerProcess();
 
   /// The maximum number of arguments required to invoke a tool process.
   static const unsigned kArgVMax = 16;
@@ -114,6 +114,10 @@ class SymbolizerProcess {
   fd_t input_fd_;
   fd_t output_fd_;
 
+  // We hold on to the child's stdin fd (the read end of the pipe)
+  // so that when we write to it, we don't get a SIGPIPE
+  fd_t child_stdin_fd_;
+
   InternalMmapVector<char> buffer_;
 
   static const uptr kMaxTimesRestarted = 5;
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp
index 565701c85d978..cc31d3d8056f9 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp
@@ -476,10 +476,11 @@ const char *LLVMSymbolizer::FormatAndSendCommand(const char *command_prefix,
   return symbolizer_process_->SendCommand(buffer_);
 }
 
-SymbolizerProcess::SymbolizerProcess(const char *path, bool use_posix_spawn)
+SymbolizerProcess::SymbolizerProcess(const char* path, bool use_posix_spawn)
     : path_(path),
       input_fd_(kInvalidFd),
       output_fd_(kInvalidFd),
+      child_stdin_fd_(kInvalidFd),
       times_restarted_(0),
       failed_to_start_(false),
       reported_invalid_path_(false),
@@ -488,6 +489,11 @@ SymbolizerProcess::SymbolizerProcess(const char *path, bool use_posix_spawn)
   CHECK_NE(path_[0], '\0');
 }
 
+SymbolizerProcess::~SymbolizerProcess() {
+  if (child_stdin_fd_ != kInvalidFd)
+    CloseFile(child_stdin_fd_);
+}
+
 static bool IsSameModule(const char *path) {
   if (const char *ProcessName = GetProcessName()) {
     if (const char *SymbolizerName = StripModuleName(path)) {
@@ -533,6 +539,10 @@ bool SymbolizerProcess::Restart() {
     CloseFile(input_fd_);
   if (output_fd_ != kInvalidFd)
     CloseFile(output_fd_);
+  if (child_stdin_fd_ != kInvalidFd) {
+    CloseFile(child_stdin_fd_);
+    child_stdin_fd_ = kInvalidFd;  // Don't free in destructor
+  }
   return StartSymbolizerSubprocess();
 }
 
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
index 7eb0c9756d64a..29c73e3e1cac1 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
@@ -156,30 +156,30 @@ bool SymbolizerProcess::StartSymbolizerSubprocess() {
     Printf("\n");
   }
 
+  fd_t infd[2] = {}, outfd[2] = {};
+  if (!CreateTwoHighNumberedPipes(infd, outfd)) {
+    Report(
+        "WARNING: Can't create a socket pair to start "
+        "external symbolizer (errno: %d)\n",
+        errno);
+    return false;
+  }
+
   if (use_posix_spawn_) {
 #  if SANITIZER_APPLE
-    fd_t fd = internal_spawn(argv, const_cast<const char **>(GetEnvP()), &pid);
-    if (fd == kInvalidFd) {
+    bool success = internal_spawn(argv, const_cast<const char**>(GetEnvP()),
+                                  &pid, outfd[0], infd[1]);
+    if (!success) {
       Report("WARNING: failed to spawn external symbolizer (errno: %d)\n",
              errno);
+      internal_close(infd[0]);
+      internal_close(outfd[1]);
       return false;
     }
-
-    input_fd_ = fd;
-    output_fd_ = fd;
 #  else   // SANITIZER_APPLE
     UNIMPLEMENTED();
 #  endif  // SANITIZER_APPLE
   } else {
-    fd_t infd[2] = {}, outfd[2] = {};
-    if (!CreateTwoHighNumberedPipes(infd, outfd)) {
-      Report(
-          "WARNING: Can't create a socket pair to start "
-          "external symbolizer (errno: %d)\n",
-          errno);
-      return false;
-    }
-
     pid = StartSubprocess(path_, argv, GetEnvP(), /* stdin */ outfd[0],
                           /* stdout */ infd[1]);
     if (pid < 0) {
@@ -187,11 +187,14 @@ bool SymbolizerProcess::StartSymbolizerSubprocess() {
       internal_close(outfd[1]);
       return false;
     }
-
-    input_fd_ = infd[0];
-    output_fd_ = outfd[1];
   }
 
+  input_fd_ = infd[0];
+  output_fd_ = outfd[1];
+
+  // We intentionally hold on to the read-end so that we don't get a SIGPIPE
+  child_stdin_fd_ = outfd[0];
+
   CHECK_GT(pid, 0);
 
   // Check that symbolizer subprocess started successfully.

From 7a5e2c9358eabff3d9eb66141590ac453a2e0e08 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 9 Dec 2025 20:06:13 +0000
Subject: [PATCH 47/85] [LV] Add test with threshold=0 and metadata forcing
 vectorization.

Test case for the mis-compile mentioned in
https://github.com/llvm/llvm-project/pull/166247#issuecomment-3631471588

The issue is that we don't generate a runtime check even though it is
required to vectorize.
---
 ...ime-check-threshold-with-force-metadata.ll | 109 ++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll

diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
new file mode 100644
index 0000000000000..b7d36fe7928e5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=0 -S %s | FileCheck --check-prefix=LIMIT0 %s
+; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefix=LIMIT1 %s
+
+; FIXME: Currently this miscompiles with -vectorize-memory-check-threshold=0;
+; no runtime check is generated even though one is needed and !noalias
+; annotations are added.
+define i16 @runtime_checks_needed(ptr %src, ptr %dst) {
+; LIMIT0-LABEL: define i16 @runtime_checks_needed(
+; LIMIT0-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) {
+; LIMIT0-NEXT:  [[ENTRY:.*:]]
+; LIMIT0-NEXT:    br label %[[VECTOR_PH:.*]]
+; LIMIT0:       [[VECTOR_PH]]:
+; LIMIT0-NEXT:    [[TMP0:%.*]] = load i16, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]]
+; LIMIT0-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+; LIMIT0-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
+; LIMIT0-NEXT:    br label %[[VECTOR_BODY:.*]]
+; LIMIT0:       [[VECTOR_BODY]]:
+; LIMIT0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; LIMIT0-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]]
+; LIMIT0-NEXT:    store <2 x i16> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; LIMIT0-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; LIMIT0-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; LIMIT0-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; LIMIT0:       [[MIDDLE_BLOCK]]:
+; LIMIT0-NEXT:    br label %[[EXIT:.*]]
+; LIMIT0:       [[EXIT]]:
+; LIMIT0-NEXT:    ret i16 [[TMP0]]
+;
+; LIMIT1-LABEL: define i16 @runtime_checks_needed(
+; LIMIT1-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) {
+; LIMIT1-NEXT:  [[ENTRY:.*:]]
+; LIMIT1-NEXT:    br label %[[VECTOR_MEMCHECK:.*]]
+; LIMIT1:       [[VECTOR_MEMCHECK]]:
+; LIMIT1-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 2000
+; LIMIT1-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 2
+; LIMIT1-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; LIMIT1-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
+; LIMIT1-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; LIMIT1-NEXT:    br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; LIMIT1:       [[VECTOR_PH]]:
+; LIMIT1-NEXT:    [[TMP0:%.*]] = load i16, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]]
+; LIMIT1-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+; LIMIT1-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
+; LIMIT1-NEXT:    br label %[[VECTOR_BODY:.*]]
+; LIMIT1:       [[VECTOR_BODY]]:
+; LIMIT1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; LIMIT1-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]]
+; LIMIT1-NEXT:    store <2 x i16> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; LIMIT1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; LIMIT1-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; LIMIT1-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; LIMIT1:       [[MIDDLE_BLOCK]]:
+; LIMIT1-NEXT:    br label %[[EXIT:.*]]
+; LIMIT1:       [[SCALAR_PH]]:
+; LIMIT1-NEXT:    br label %[[LOOP:.*]]
+; LIMIT1:       [[LOOP]]:
+; LIMIT1-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; LIMIT1-NEXT:    [[L:%.*]] = load i16, ptr [[SRC]], align 1
+; LIMIT1-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; LIMIT1-NEXT:    store i16 [[L]], ptr [[GEP_DST]], align 1
+; LIMIT1-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; LIMIT1-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; LIMIT1-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; LIMIT1:       [[EXIT]]:
+; LIMIT1-NEXT:    [[L_LCSSA:%.*]] = phi i16 [ [[L]], %[[LOOP]] ], [ [[TMP0]], %[[MIDDLE_BLOCK]] ]
+; LIMIT1-NEXT:    ret i16 [[L_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %l = load i16, ptr %src, align 1
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %l, ptr %gep.dst, align 1
+  %iv.next = add nsw nuw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  ret i16 %l
+}
+
+!0 = distinct !{!0, !2, !3}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 2}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+;.
+; LIMIT0: [[META0]] = !{[[META1:![0-9]+]]}
+; LIMIT0: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+; LIMIT0: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
+; LIMIT0: [[META3]] = !{[[META4:![0-9]+]]}
+; LIMIT0: [[META4]] = distinct !{[[META4]], [[META2]]}
+; LIMIT0: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
+; LIMIT0: [[META6]] = !{!"llvm.loop.isvectorized", i32 1}
+; LIMIT0: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.
+; LIMIT1: [[META0]] = !{[[META1:![0-9]+]]}
+; LIMIT1: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+; LIMIT1: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
+; LIMIT1: [[META3]] = !{[[META4:![0-9]+]]}
+; LIMIT1: [[META4]] = distinct !{[[META4]], [[META2]]}
+; LIMIT1: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
+; LIMIT1: [[META6]] = !{!"llvm.loop.isvectorized", i32 1}
+; LIMIT1: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"}
+; LIMIT1: [[LOOP8]] = distinct !{[[LOOP8]], [[META6]]}
+;.

From b3d05e680709dbe7988c25c144dabdadc0c4cf27 Mon Sep 17 00:00:00 2001
From: Fateme Hosseini <quic_fhossein@quicinc.com>
Date: Tue, 9 Dec 2025 14:07:52 -0600
Subject: [PATCH 48/85]  [Hexagon] Add HVX V81 builtins (#170680)

Expose the HVXV81 abs, conversion, comparison, log2, negate and mixed
subtract intrinsics so Clang can emit the new instructions.
---
 clang/include/clang/Basic/BuiltinsHexagon.td | 66 ++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/clang/include/clang/Basic/BuiltinsHexagon.td b/clang/include/clang/Basic/BuiltinsHexagon.td
index cf18359e7bf60..00f84cd72a051 100644
--- a/clang/include/clang/Basic/BuiltinsHexagon.td
+++ b/clang/include/clang/Basic/BuiltinsHexagon.td
@@ -2146,3 +2146,69 @@ let Features = HVXV79.Features in {
   def V6_vsub_hf_f8 : HexagonBuiltin<"_Vector<32, int>(_Vector<16, int>, _Vector<16, int>)">;
   def V6_vsub_hf_f8_128B : HexagonBuiltin<"_Vector<64, int>(_Vector<32, int>, _Vector<32, int>)">;
 }
+
+// V81 HVX Instructions.
+let Features = HVXV81.Features in {
+  def V6_vabs_qf16_hf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vabs_qf16_hf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vabs_qf16_qf16 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vabs_qf16_qf16_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vabs_qf32_qf32 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vabs_qf32_qf32_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vabs_qf32_sf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vabs_qf32_sf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_valign4 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, int)">;
+  def V6_valign4_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>, _Vector<32, int>, int)">;
+  def V6_vconv_bf_qf32 : HexagonBuiltin<"_Vector<16, int>(_Vector<32, int>)">;
+  def V6_vconv_bf_qf32_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<64, int>)">;
+  def V6_vconv_f8_qf16 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vconv_f8_qf16_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vconv_h_hf_rnd : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vconv_h_hf_rnd_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vconv_qf16_f8 : HexagonBuiltin<"_Vector<32, int>(_Vector<16, int>)">;
+  def V6_vconv_qf16_f8_128B : HexagonBuiltin<"_Vector<64, int>(_Vector<32, int>)">;
+  def V6_vconv_qf16_hf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vconv_qf16_hf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vconv_qf16_qf16 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vconv_qf16_qf16_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vconv_qf32_qf32 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vconv_qf32_qf32_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vconv_qf32_sf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vconv_qf32_sf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_veqhf : HexagonBuiltin<"_Vector<64, bool>(_Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqhf_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqhf_and : HexagonBuiltin<"_Vector<64, bool>(_Vector<64, bool>, _Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqhf_and_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<128, bool>, _Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqhf_or : HexagonBuiltin<"_Vector<64, bool>(_Vector<64, bool>, _Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqhf_or_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<128, bool>, _Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqhf_xor : HexagonBuiltin<"_Vector<64, bool>(_Vector<64, bool>, _Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqhf_xor_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<128, bool>, _Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqsf : HexagonBuiltin<"_Vector<64, bool>(_Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqsf_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqsf_and : HexagonBuiltin<"_Vector<64, bool>(_Vector<64, bool>, _Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqsf_and_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<128, bool>, _Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqsf_or : HexagonBuiltin<"_Vector<64, bool>(_Vector<64, bool>, _Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqsf_or_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<128, bool>, _Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqsf_xor : HexagonBuiltin<"_Vector<64, bool>(_Vector<64, bool>, _Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqsf_xor_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<128, bool>, _Vector<32, int>, _Vector<32, int>)">;
+  def V6_vilog2_hf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vilog2_hf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vilog2_qf16 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vilog2_qf16_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vilog2_qf32 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vilog2_qf32_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vilog2_sf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vilog2_sf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vneg_qf16_hf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vneg_qf16_hf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vneg_qf16_qf16 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vneg_qf16_qf16_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vneg_qf32_qf32 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vneg_qf32_qf32_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vneg_qf32_sf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vneg_qf32_sf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vsub_hf_mix : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def V6_vsub_hf_mix_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>, _Vector<32, int>)">;
+  def V6_vsub_sf_mix : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def V6_vsub_sf_mix_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>, _Vector<32, int>)">;
+}

From 0eb00eff475dd3950d8a1e7db14f3905d67d119e Mon Sep 17 00:00:00 2001
From: Ryosuke Niwa <rniwa@webkit.org>
Date: Tue, 9 Dec 2025 12:15:56 -0800
Subject: [PATCH 49/85] [alpha.webkit.RetainPtrCtorAdoptChecker] Don't treat
 assignment to an +1 out argument as a leak (#161633)

Make RetainPtrCtorAdoptChecker recognize an assignment to an +1 out
argument so that it won't emit a memory leak warning.
---
 .../WebKit/RetainPtrCtorAdoptChecker.cpp      | 36 +++++++++++++++----
 .../WebKit/retain-ptr-ctor-adopt-use.mm       |  8 +++++
 2 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp
index 955b8d19a820c..07ef699a5d883 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp
@@ -355,15 +355,37 @@ class RetainPtrCtorAdoptChecker
   void visitBinaryOperator(const BinaryOperator *BO) const {
     if (!BO->isAssignmentOp())
       return;
-    if (!isa<ObjCIvarRefExpr>(BO->getLHS()))
-      return;
+    auto *LHS = BO->getLHS();
     auto *RHS = BO->getRHS()->IgnoreParenCasts();
-    const Expr *Inner = nullptr;
-    if (isAllocInit(RHS, &Inner)) {
-      CreateOrCopyFnCall.insert(RHS);
-      if (Inner)
-        CreateOrCopyFnCall.insert(Inner);
+    if (isa<ObjCIvarRefExpr>(LHS)) {
+      const Expr *Inner = nullptr;
+      if (isAllocInit(RHS, &Inner)) {
+        CreateOrCopyFnCall.insert(RHS);
+        if (Inner)
+          CreateOrCopyFnCall.insert(Inner);
+      }
+      return;
     }
+    auto *UO = dyn_cast<UnaryOperator>(LHS);
+    if (!UO)
+      return;
+    auto OpCode = UO->getOpcode();
+    if (OpCode != UO_Deref)
+      return;
+    auto *DerefTarget = UO->getSubExpr();
+    if (!DerefTarget)
+      return;
+    DerefTarget = DerefTarget->IgnoreParenCasts();
+    auto *DRE = dyn_cast<DeclRefExpr>(DerefTarget);
+    if (!DRE)
+      return;
+    auto *Decl = DRE->getDecl();
+    if (!Decl)
+      return;
+    if (!isa<ParmVarDecl>(Decl) || !isCreateOrCopy(RHS))
+      return;
+    if (Decl->hasAttr<CFReturnsRetainedAttr>())
+      CreateOrCopyFnCall.insert(RHS);
   }
 
   void visitReturnStmt(const ReturnStmt *RS, const Decl *DeclWithIssue) const {
diff --git a/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm b/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm
index 45705615f3196..427affdbbd601 100644
--- a/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm
+++ b/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm
@@ -190,6 +190,14 @@ void adopt_retainptr() {
   auto bar = adoptNS([allocSomeObj() init]);
 }
 
+CFTypeRef make_cf_obj() CF_RETURNS_RETAINED {
+  return CFArrayCreateMutable(kCFAllocatorDefault, 1);
+}
+
+void get_cf_obj(CFTypeRef* CF_RETURNS_RETAINED result) {
+  *result = CFArrayCreateMutable(kCFAllocatorDefault, 1);
+}
+
 RetainPtr<CFArrayRef> return_arg(CFArrayRef arg) {
   return arg;
 }

From f9326ffb7ede55229a81907cc03aaa8c523520ab Mon Sep 17 00:00:00 2001
From: Ryosuke Niwa <rniwa@webkit.org>
Date: Tue, 9 Dec 2025 12:28:22 -0800
Subject: [PATCH 50/85] [WebKit checkers] Treat a weak property / variable as
 safe (#163689)

Treat a weak Objective-C property, ivar, member variable, and local
variable as safe.
---
 .../WebKit/RawPtrRefLambdaCapturesChecker.cpp |  2 ++
 .../WebKit/RawPtrRefLocalVarsChecker.cpp      |  2 ++
 .../WebKit/RawPtrRefMemberChecker.cpp         |  7 ++++-
 .../unretained-lambda-captures-weak-arc.mm    | 22 +++++++++++++
 .../WebKit/unretained-lambda-captures-weak.mm | 22 +++++++++++++
 .../WebKit/unretained-local-vars-weak-arc.mm  | 13 ++++++++
 .../WebKit/unretained-local-vars-weak.mm      | 13 ++++++++
 .../WebKit/unretained-members-weak-arc.mm     | 29 +++++++++++++++++
 .../WebKit/unretained-members-weak.mm         | 31 +++++++++++++++++++
 9 files changed, 140 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak-arc.mm
 create mode 100644 clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak.mm
 create mode 100644 clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak-arc.mm
 create mode 100644 clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak.mm
 create mode 100644 clang/test/Analysis/Checkers/WebKit/unretained-members-weak-arc.mm
 create mode 100644 clang/test/Analysis/Checkers/WebKit/unretained-members-weak.mm

diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp
index f60d1936b7584..f3fadeaefc491 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp
@@ -587,6 +587,8 @@ class UnretainedLambdaCapturesChecker : public RawPtrRefLambdaCapturesChecker {
   }
 
   std::optional<bool> isUnsafePtr(QualType QT) const final {
+    if (QT.hasStrongOrWeakObjCLifetime())
+      return false;
     return RTC->isUnretained(QT);
   }
 
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp
index c13df47920f72..f2235e7c25ab2 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp
@@ -433,6 +433,8 @@ class UnretainedLocalVarsChecker final : public RawPtrRefLocalVarsChecker {
     RTC = RetainTypeChecker();
   }
   std::optional<bool> isUnsafePtr(const QualType T) const final {
+    if (T.hasStrongOrWeakObjCLifetime())
+      return false;
     return RTC->isUnretained(T);
   }
   bool isSafePtr(const CXXRecordDecl *Record) const final {
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp
index ace639ce7ab18..0e23ae34ea212 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp
@@ -231,8 +231,11 @@ class RawPtrRefMemberChecker
     // "assign" property doesn't retain even under ARC so treat it as unsafe.
     bool ignoreARC =
         !PD->isReadOnly() && PD->getSetterKind() == ObjCPropertyDecl::Assign;
+    bool IsWeak =
+        PD->getPropertyAttributes() & ObjCPropertyAttribute::kind_weak;
+    bool HasSafeAttr = PD->isRetaining() || IsWeak;
     auto IsUnsafePtr = isUnsafePtr(QT, ignoreARC);
-    return {IsUnsafePtr && *IsUnsafePtr && !PD->isRetaining(), PropType};
+    return {IsUnsafePtr && *IsUnsafePtr && !HasSafeAttr, PropType};
   }
 
   bool shouldSkipDecl(const RecordDecl *RD) const {
@@ -363,6 +366,8 @@ class NoUnretainedMemberChecker final : public RawPtrRefMemberChecker {
   }
 
   std::optional<bool> isUnsafePtr(QualType QT, bool ignoreARC) const final {
+    if (QT.hasStrongOrWeakObjCLifetime())
+      return false;
     return RTC->isUnretained(QT, ignoreARC);
   }
 
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak-arc.mm b/clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak-arc.mm
new file mode 100644
index 0000000000000..a52bc7c9a5572
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak-arc.mm
@@ -0,0 +1,22 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UnretainedLambdaCapturesChecker -fobjc-runtime-has-weak -fobjc-weak -fobjc-arc -verify %s
+// expected-no-diagnostics
+
+#include "objc-mock-types.h"
+
+void someFunction();
+template <typename Callback> void call(Callback callback) {
+  someFunction();
+  callback();
+}
+
+NSString *provideStr();
+SomeObj *provideSomeObj();
+
+void foo() {
+  __weak NSString *weakStr = provideStr();
+  __weak SomeObj *weakObj = provideSomeObj();
+  auto lambda = [weakStr, weakObj]() {
+    return [weakStr length] + [weakObj value];
+  };
+  call(lambda);
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak.mm b/clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak.mm
new file mode 100644
index 0000000000000..7439d7f8bb93b
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak.mm
@@ -0,0 +1,22 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UnretainedLambdaCapturesChecker -fobjc-runtime-has-weak -fobjc-weak -verify %s
+// expected-no-diagnostics
+
+#include "objc-mock-types.h"
+
+void someFunction();
+template <typename Callback> void call(Callback callback) {
+  someFunction();
+  callback();
+}
+
+NSString *provideStr();
+SomeObj *provideSomeObj();
+
+void foo() {
+  __weak NSString *weakStr = provideStr();
+  __weak SomeObj *weakObj = provideSomeObj();
+  auto lambda = [weakStr, weakObj]() {
+    return [weakStr length] + [weakObj value];
+  };
+  call(lambda);
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak-arc.mm b/clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak-arc.mm
new file mode 100644
index 0000000000000..8c709b5921227
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak-arc.mm
@@ -0,0 +1,13 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UnretainedLocalVarsChecker -fobjc-runtime-has-weak -fobjc-weak -fobjc-arc -verify %s
+// expected-no-diagnostics
+
+#include "objc-mock-types.h"
+
+NSString *provideStr();
+SomeObj *provideSomeObj();
+
+int foo() {
+  __weak NSString *weakStr = provideStr();
+  __weak SomeObj *weakObj = provideSomeObj();
+  return [weakStr length] + [weakObj value];
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak.mm b/clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak.mm
new file mode 100644
index 0000000000000..3ac4ff9d1e4cb
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak.mm
@@ -0,0 +1,13 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UnretainedLocalVarsChecker -fobjc-runtime-has-weak -fobjc-weak -verify %s
+// expected-no-diagnostics
+
+#include "objc-mock-types.h"
+
+NSString *provideStr();
+SomeObj *provideSomeObj();
+
+int foo() {
+  __weak NSString *weakStr = provideStr();
+  __weak SomeObj *weakObj = provideSomeObj();
+  return [weakStr length] + [weakObj value];
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-members-weak-arc.mm b/clang/test/Analysis/Checkers/WebKit/unretained-members-weak-arc.mm
new file mode 100644
index 0000000000000..c0aaac09e68d8
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-members-weak-arc.mm
@@ -0,0 +1,29 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.NoUnretainedMemberChecker -fobjc-runtime-has-weak -fobjc-weak -fobjc-arc -verify %s
+// expected-no-diagnostics
+
+#include "objc-mock-types.h"
+
+struct Foo {
+  __weak NSString *weakPtr = nullptr;
+  Foo();
+  ~Foo();
+  void bar();
+};
+
+@interface ObjectWithWeakProperty : NSObject
+@property(nonatomic, weak) NSString *weak_prop;
+@end
+
+@implementation ObjectWithWeakProperty
+@end
+
+NS_REQUIRES_PROPERTY_DEFINITIONS
+@interface NoSynthesisObjectWithWeakProperty : NSObject
+@property(nonatomic, readonly, weak) NSString *weak_prop;
+@end
+
+@implementation NoSynthesisObjectWithWeakProperty
+- (NSString *)weak_prop {
+  return nil;
+}
+@end
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-members-weak.mm b/clang/test/Analysis/Checkers/WebKit/unretained-members-weak.mm
new file mode 100644
index 0000000000000..422cf6189446d
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-members-weak.mm
@@ -0,0 +1,31 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.NoUnretainedMemberChecker -fobjc-runtime-has-weak -fobjc-weak -verify %s
+// expected-no-diagnostics
+
+#include "objc-mock-types.h"
+
+struct Foo {
+  __weak NSString *weakPtr = nullptr;
+  Foo();
+  ~Foo();
+  void bar();
+};
+
+@interface ObjectWithWeakProperty : NSObject
+@property(nonatomic, weak) NSString *weak_prop;
+@end
+
+@implementation ObjectWithWeakProperty
+@end
+
+NS_REQUIRES_PROPERTY_DEFINITIONS
+@interface NoSynthesisObjectWithWeakProperty : NSObject
+@property(nonatomic, readonly, weak) NSString *weak_prop;
+@end
+
+@implementation NoSynthesisObjectWithWeakProperty {
+  __weak NSNumber *weak_ivar;
+}
+- (NSString *)weak_prop {
+  return nil;
+}
+@end

From 06f0758282bfa5457ea779e66c79fdff34e58320 Mon Sep 17 00:00:00 2001
From: Ryosuke Niwa <rniwa@webkit.org>
Date: Tue, 9 Dec 2025 12:29:11 -0800
Subject: [PATCH 51/85] [alpha.webkit.UnretainedCallArgsChecker] Recognize
 [allocObj() init] pattern (#161019)

Generalize the check for recognizing [[Obj alloc] init] to also
recognize [allocObj() init]. We do this by utilizing isAllocInit
function in RetainPtrCtorAdoptChecker.
---
 .../Checkers/WebKit/ASTUtils.cpp              | 45 +++++++++++++++++++
 .../StaticAnalyzer/Checkers/WebKit/ASTUtils.h |  4 ++
 .../WebKit/RawPtrRefCallArgsChecker.cpp       |  9 +---
 .../WebKit/RetainPtrCtorAdoptChecker.cpp      | 44 ------------------
 .../Checkers/WebKit/unretained-call-args.mm   |  3 ++
 5 files changed, 54 insertions(+), 51 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
index 84adbf318e9f8..e46bba7be4aea 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
@@ -321,6 +321,51 @@ bool isExprToGetCheckedPtrCapableMember(const clang::Expr *E) {
   return result && *result;
 }
 
+bool isAllocInit(const Expr *E, const Expr **InnerExpr) {
+  auto *ObjCMsgExpr = dyn_cast<ObjCMessageExpr>(E);
+  if (auto *POE = dyn_cast<PseudoObjectExpr>(E)) {
+    if (unsigned ExprCount = POE->getNumSemanticExprs()) {
+      auto *Expr = POE->getSemanticExpr(ExprCount - 1)->IgnoreParenCasts();
+      ObjCMsgExpr = dyn_cast<ObjCMessageExpr>(Expr);
+      if (InnerExpr)
+        *InnerExpr = ObjCMsgExpr;
+    }
+  }
+  if (!ObjCMsgExpr)
+    return false;
+  auto Selector = ObjCMsgExpr->getSelector();
+  auto NameForFirstSlot = Selector.getNameForSlot(0);
+  if (NameForFirstSlot.starts_with("alloc") ||
+      NameForFirstSlot.starts_with("copy") ||
+      NameForFirstSlot.starts_with("mutableCopy"))
+    return true;
+  if (!NameForFirstSlot.starts_with("init") &&
+      !NameForFirstSlot.starts_with("_init"))
+    return false;
+  if (!ObjCMsgExpr->isInstanceMessage())
+    return false;
+  auto *Receiver = ObjCMsgExpr->getInstanceReceiver();
+  if (!Receiver)
+    return false;
+  Receiver = Receiver->IgnoreParenCasts();
+  if (auto *Inner = dyn_cast<ObjCMessageExpr>(Receiver)) {
+    if (InnerExpr)
+      *InnerExpr = Inner;
+    auto InnerSelector = Inner->getSelector();
+    return InnerSelector.getNameForSlot(0).starts_with("alloc");
+  } else if (auto *CE = dyn_cast<CallExpr>(Receiver)) {
+    if (InnerExpr)
+      *InnerExpr = CE;
+    if (auto *Callee = CE->getDirectCallee()) {
+      if (Callee->getDeclName().isIdentifier()) {
+        auto CalleeName = Callee->getName();
+        return CalleeName.starts_with("alloc");
+      }
+    }
+  }
+  return false;
+}
+
 class EnsureFunctionVisitor
     : public ConstStmtVisitor<EnsureFunctionVisitor, bool> {
 public:
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h
index 9fff456b7e8b8..d0a3e471365e2 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h
@@ -77,6 +77,10 @@ bool isConstOwnerPtrMemberExpr(const clang::Expr *E);
 /// supports CheckedPtr.
 bool isExprToGetCheckedPtrCapableMember(const clang::Expr *E);
 
+/// \returns true if \p E is a [[alloc] init] pattern expression.
+/// Sets \p InnerExpr to the inner function call or selector invocation.
+bool isAllocInit(const Expr *E, const Expr **InnerExpr = nullptr);
+
 /// \returns true if E is a CXXMemberCallExpr which returns a const smart
 /// pointer type.
 class EnsureFunctionAnalysis {
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp
index 791e70998477f..dcc14a0aecdf7 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp
@@ -177,16 +177,11 @@ class RawPtrRefCallArgsChecker
     if (BR->getSourceManager().isInSystemHeader(E->getExprLoc()))
       return;
 
-    auto Selector = E->getSelector();
     if (auto *Receiver = E->getInstanceReceiver()) {
       std::optional<bool> IsUnsafe = isUnsafePtr(E->getReceiverType());
       if (IsUnsafe && *IsUnsafe && !isPtrOriginSafe(Receiver)) {
-        if (auto *InnerMsg = dyn_cast<ObjCMessageExpr>(Receiver)) {
-          auto InnerSelector = InnerMsg->getSelector();
-          if (InnerSelector.getNameForSlot(0) == "alloc" &&
-              Selector.getNameForSlot(0).starts_with("init"))
-            return;
-        }
+        if (isAllocInit(E))
+          return;
         reportBugOnReceiver(Receiver, D);
       }
     }
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp
index 07ef699a5d883..2af9067f8f808 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp
@@ -445,50 +445,6 @@ class RetainPtrCtorAdoptChecker
     return std::nullopt;
   }
 
-  bool isAllocInit(const Expr *E, const Expr **InnerExpr = nullptr) const {
-    auto *ObjCMsgExpr = dyn_cast<ObjCMessageExpr>(E);
-    if (auto *POE = dyn_cast<PseudoObjectExpr>(E)) {
-      if (unsigned ExprCount = POE->getNumSemanticExprs()) {
-        auto *Expr = POE->getSemanticExpr(ExprCount - 1)->IgnoreParenCasts();
-        ObjCMsgExpr = dyn_cast<ObjCMessageExpr>(Expr);
-        if (InnerExpr)
-          *InnerExpr = ObjCMsgExpr;
-      }
-    }
-    if (!ObjCMsgExpr)
-      return false;
-    auto Selector = ObjCMsgExpr->getSelector();
-    auto NameForFirstSlot = Selector.getNameForSlot(0);
-    if (NameForFirstSlot == "alloc" || NameForFirstSlot.starts_with("copy") ||
-        NameForFirstSlot.starts_with("mutableCopy"))
-      return true;
-    if (!NameForFirstSlot.starts_with("init") &&
-        !NameForFirstSlot.starts_with("_init"))
-      return false;
-    if (!ObjCMsgExpr->isInstanceMessage())
-      return false;
-    auto *Receiver = ObjCMsgExpr->getInstanceReceiver();
-    if (!Receiver)
-      return false;
-    Receiver = Receiver->IgnoreParenCasts();
-    if (auto *Inner = dyn_cast<ObjCMessageExpr>(Receiver)) {
-      if (InnerExpr)
-        *InnerExpr = Inner;
-      auto InnerSelector = Inner->getSelector();
-      return InnerSelector.getNameForSlot(0) == "alloc";
-    } else if (auto *CE = dyn_cast<CallExpr>(Receiver)) {
-      if (InnerExpr)
-        *InnerExpr = CE;
-      if (auto *Callee = CE->getDirectCallee()) {
-        if (Callee->getDeclName().isIdentifier()) {
-          auto CalleeName = Callee->getName();
-          return CalleeName.starts_with("alloc");
-        }
-      }
-    }
-    return false;
-  }
-
   bool isCreateOrCopy(const Expr *E) const {
     auto *CE = dyn_cast<CallExpr>(E);
     if (!CE)
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm
index 8bef24f93ceed..cfc214fae33e5 100644
--- a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm
@@ -625,6 +625,8 @@ void foo() {
 
 } // namespace template_function
 
+SomeObj *allocObj();
+
 @interface TestObject : NSObject
 - (void)doWork:(NSString *)msg, ...;
 - (void)doWorkOnSelf;
@@ -647,6 +649,7 @@ - (void)doWorkOnSelf {
   [self doWork:__null];
   [self doWork:nil];
   [NSApp run];
+  adoptNS([allocObj() init]);
 }
 
 - (SomeObj *)getSomeObj {

From 8a115b6934a90441d77ea54af73e7aaaa1394b38 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 9 Dec 2025 20:37:20 +0000
Subject: [PATCH 52/85] [LV] Mark checks as never succeeding for high cost
 cutoff.

When GeneratedRTChecks::create bails out due to exceeding the cost
threshold, no runtime checks are generated and we must not proceed
assuming checks have been generated.

Mark the checks as never succeeding, to make sure we don't try to
vectorize assuming the runtime checks hold. This fixes a case where we
previously incorrectly vectorized assuming runtime checks had been
generated when forcing vectorization via metadate.

Fixes the mis-compile mentioned in
https://github.com/llvm/llvm-project/pull/166247#issuecomment-3631471588
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  6 ++-
 ...ime-check-threshold-with-force-metadata.ll | 39 +++++++------------
 2 files changed, 19 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 15d0fa41bd902..79cdae25e38da 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1827,8 +1827,12 @@ class GeneratedRTChecks {
     // profile info.
     CostTooHigh =
         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
-    if (CostTooHigh)
+    if (CostTooHigh) {
+      // Mark runtime checks as never succeeding when they exceed the threshold.
+      MemRuntimeCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
+      SCEVCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
       return;
+    }
 
     BasicBlock *LoopHeader = L->getHeader();
     BasicBlock *Preheader = L->getLoopPreheader();
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
index b7d36fe7928e5..5376eb86882b7 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
@@ -2,29 +2,23 @@
 ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=0 -S %s | FileCheck --check-prefix=LIMIT0 %s
 ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefix=LIMIT1 %s
 
-; FIXME: Currently this miscompiles with -vectorize-memory-check-threshold=0;
-; no runtime check is generated even though one is needed and !noalias
-; annotations are added.
+; Make sure we do not incorrectly vectorize with -vectorize-memory-check-threshold=0;
+; no runtime check is generated and the loop should not be vectorized.
 define i16 @runtime_checks_needed(ptr %src, ptr %dst) {
 ; LIMIT0-LABEL: define i16 @runtime_checks_needed(
 ; LIMIT0-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) {
-; LIMIT0-NEXT:  [[ENTRY:.*:]]
-; LIMIT0-NEXT:    br label %[[VECTOR_PH:.*]]
-; LIMIT0:       [[VECTOR_PH]]:
-; LIMIT0-NEXT:    [[TMP0:%.*]] = load i16, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]]
-; LIMIT0-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
-; LIMIT0-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
-; LIMIT0-NEXT:    br label %[[VECTOR_BODY:.*]]
-; LIMIT0:       [[VECTOR_BODY]]:
-; LIMIT0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; LIMIT0-NEXT:  [[ENTRY:.*]]:
+; LIMIT0-NEXT:    br label %[[LOOP:.*]]
+; LIMIT0:       [[LOOP]]:
+; LIMIT0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
+; LIMIT0-NEXT:    [[L:%.*]] = load i16, ptr [[SRC]], align 1
 ; LIMIT0-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]]
-; LIMIT0-NEXT:    store <2 x i16> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
-; LIMIT0-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; LIMIT0-NEXT:    store i16 [[L]], ptr [[TMP1]], align 1
+; LIMIT0-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1
 ; LIMIT0-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; LIMIT0-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; LIMIT0:       [[MIDDLE_BLOCK]]:
-; LIMIT0-NEXT:    br label %[[EXIT:.*]]
+; LIMIT0-NEXT:    br i1 [[TMP2]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
 ; LIMIT0:       [[EXIT]]:
+; LIMIT0-NEXT:    [[TMP0:%.*]] = phi i16 [ [[L]], %[[LOOP]] ]
 ; LIMIT0-NEXT:    ret i16 [[TMP0]]
 ;
 ; LIMIT1-LABEL: define i16 @runtime_checks_needed(
@@ -88,14 +82,9 @@ exit:
 !3 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 ;.
-; LIMIT0: [[META0]] = !{[[META1:![0-9]+]]}
-; LIMIT0: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
-; LIMIT0: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
-; LIMIT0: [[META3]] = !{[[META4:![0-9]+]]}
-; LIMIT0: [[META4]] = distinct !{[[META4]], [[META2]]}
-; LIMIT0: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
-; LIMIT0: [[META6]] = !{!"llvm.loop.isvectorized", i32 1}
-; LIMIT0: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"}
+; LIMIT0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; LIMIT0: [[META1]] = !{!"llvm.loop.vectorize.width", i32 2}
+; LIMIT0: [[META2]] = !{!"llvm.loop.vectorize.enable", i1 true}
 ;.
 ; LIMIT1: [[META0]] = !{[[META1:![0-9]+]]}
 ; LIMIT1: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}

From af3c3ecb181994146df20c82c3046727b62bc269 Mon Sep 17 00:00:00 2001
From: Philip Ginsbach-Chen <philip.ginsbach@cantab.net>
Date: Tue, 9 Dec 2025 20:41:44 +0000
Subject: [PATCH 53/85] [AArch64] recognise trn1/trn2 with flipped operands
 (#169858)

This PR is very similar to #167235, but applied to `trn` rather than
`zip`. There are two further differences:
- The `@combine_v8i16_8first` and `@combine_v8i16_8firstundef` test
  cases in `arm64-zip.ll` didn't have equivalents in `arm64-trn.ll`, so
  this PR adds new test cases `@vtrni8_8first`, `@vtrni8_9first`,
  `@vtrni8_89first_undef`.
- `AArch64TTIImpl::getShuffleCost` calls `isZIPMask`, but not
  `isTRNMask`. It relies on `Kind == TTI::SK_Transpose` instead (which
  in turn is based on `ShuffleVectorInst::isTransposeMask` through
  `improveShuffleKindFromMask`).
Therefore, this PR does not itself influence the slp-vectorizer. In a
follow-up PR, I intend to override
`AArch64TTIImpl::improveShuffleKindFromMask` to ensure we get
`ShuffleKind::SK_Transpose` based on the new `isTRNMask`. In fact, that
follow-up change is the actual motivation for this PR, as it will result
in
  ```C++
  int8x16_t g(int8_t x)
  {
    return (int8x16_t) { 0, x, 1, x, 2, x, 3, x,
                         4, x, 5, x, 6, x, 7, x };
  }
  ```
  from #137447 being optimised by the slp-vectorizer.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  16 +-
 .../Target/AArch64/AArch64PerfectShuffle.h    |  52 +++--
 .../GISel/AArch64PostLegalizerLowering.cpp    |   7 +-
 llvm/test/CodeGen/AArch64/arm64-trn.ll        |  57 ++++++
 .../AArch64/fixed-vector-deinterleave.ll      |   8 +-
 llvm/test/CodeGen/AArch64/reduce-shuffle.ll   | 185 +++++++++---------
 6 files changed, 202 insertions(+), 123 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e140aabb9bbeb..30eb19036ddda 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14965,9 +14965,10 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
     return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
   }
-  if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
+  if (isTRNMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
-    return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
+    return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2,
+                       OperandOrder == 0 ? V2 : V1);
   }
 
   if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
@@ -16679,7 +16680,7 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
           isREVMask(M, EltSize, NumElts, 16) ||
           isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
           isSingletonEXTMask(M, VT, DummyUnsigned) ||
-          isTRNMask(M, NumElts, DummyUnsigned) ||
+          isTRNMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
           isUZPMask(M, NumElts, DummyUnsigned) ||
           isZIPMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
           isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
@@ -31798,10 +31799,13 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
                     OperandOrder == 0 ? Op1 : Op2,
                     OperandOrder == 0 ? Op2 : Op1));
 
-  if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
+  if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
+                OperandOrder)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
-    return convertFromScalableVector(
-        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
+    SDValue TRN =
+        DAG.getNode(Opc, DL, ContainerVT, OperandOrder == 0 ? Op1 : Op2,
+                    OperandOrder == 0 ? Op2 : Op1);
+    return convertFromScalableVector(DAG, VT, TRN);
   }
 
   if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index ef8786d0ad0e1..c7d6b31291197 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -6699,33 +6699,53 @@ inline bool isUZPMask(ArrayRef<int> M, unsigned NumElts,
 }
 
 /// Return true for trn1 or trn2 masks of the form:
-///  <0, 8, 2, 10, 4, 12, 6, 14> or
-///  <1, 9, 3, 11, 5, 13, 7, 15>
+///  <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0, OperandOrderOut = 0) or
+///  <1, 9, 3, 11, 5, 13, 7, 15> (WhichResultOut = 1, OperandOrderOut = 0) or
+///  <8, 0, 10, 2, 12, 4, 14, 6> (WhichResultOut = 0, OperandOrderOut = 1) or
+///  <9, 1, 11, 3, 13, 5, 15, 7> (WhichResultOut = 1, OperandOrderOut = 1) or
 inline bool isTRNMask(ArrayRef<int> M, unsigned NumElts,
-                      unsigned &WhichResultOut) {
+                      unsigned &WhichResultOut, unsigned &OperandOrderOut) {
   if (NumElts % 2 != 0)
     return false;
-  // Check the first non-undef element for trn1 vs trn2.
-  unsigned WhichResult = 2;
+
+  // "Result" corresponds to "WhichResultOut", selecting between trn1 and trn2.
+  // "Order" corresponds to "OperandOrderOut", selecting the order of operands
+  // for the instruction (flipped or not).
+  bool Result0Order0 = true; // WhichResultOut = 0, OperandOrderOut = 0
+  bool Result1Order0 = true; // WhichResultOut = 1, OperandOrderOut = 0
+  bool Result0Order1 = true; // WhichResultOut = 0, OperandOrderOut = 1
+  bool Result1Order1 = true; // WhichResultOut = 1, OperandOrderOut = 1
+  // Check all elements match.
   for (unsigned i = 0; i != NumElts; i += 2) {
     if (M[i] >= 0) {
-      WhichResult = ((unsigned)M[i] == i ? 0 : 1);
-      break;
+      unsigned EvenElt = (unsigned)M[i];
+      if (EvenElt != i)
+        Result0Order0 = false;
+      if (EvenElt != i + 1)
+        Result1Order0 = false;
+      if (EvenElt != NumElts + i)
+        Result0Order1 = false;
+      if (EvenElt != NumElts + i + 1)
+        Result1Order1 = false;
     }
     if (M[i + 1] >= 0) {
-      WhichResult = ((unsigned)M[i + 1] == i + NumElts ? 0 : 1);
-      break;
+      unsigned OddElt = (unsigned)M[i + 1];
+      if (OddElt != NumElts + i)
+        Result0Order0 = false;
+      if (OddElt != NumElts + i + 1)
+        Result1Order0 = false;
+      if (OddElt != i)
+        Result0Order1 = false;
+      if (OddElt != i + 1)
+        Result1Order1 = false;
     }
   }
-  if (WhichResult == 2)
+
+  if (Result0Order0 + Result1Order0 + Result0Order1 + Result1Order1 != 1)
     return false;
 
-  for (unsigned i = 0; i < NumElts; i += 2) {
-    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
-        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
-      return false;
-  }
-  WhichResultOut = WhichResult;
+  WhichResultOut = (Result0Order0 || Result0Order1) ? 0 : 1;
+  OperandOrderOut = (Result0Order0 || Result1Order0) ? 0 : 1;
   return true;
 }
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 4fba593b3d0fb..221a7bcd881bb 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -215,14 +215,15 @@ bool matchTRN(MachineInstr &MI, MachineRegisterInfo &MRI,
               ShuffleVectorPseudo &MatchInfo) {
   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
   unsigned WhichResult;
+  unsigned OperandOrder;
   ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
   Register Dst = MI.getOperand(0).getReg();
   unsigned NumElts = MRI.getType(Dst).getNumElements();
-  if (!isTRNMask(ShuffleMask, NumElts, WhichResult))
+  if (!isTRNMask(ShuffleMask, NumElts, WhichResult, OperandOrder))
     return false;
   unsigned Opc = (WhichResult == 0) ? AArch64::G_TRN1 : AArch64::G_TRN2;
-  Register V1 = MI.getOperand(1).getReg();
-  Register V2 = MI.getOperand(2).getReg();
+  Register V1 = MI.getOperand(OperandOrder == 0 ? 1 : 2).getReg();
+  Register V2 = MI.getOperand(OperandOrder == 0 ? 2 : 1).getReg();
   MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
   return true;
 }
diff --git a/llvm/test/CodeGen/AArch64/arm64-trn.ll b/llvm/test/CodeGen/AArch64/arm64-trn.ll
index fe245d01a7a6d..120c2d13a7ab7 100644
--- a/llvm/test/CodeGen/AArch64/arm64-trn.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-trn.ll
@@ -246,6 +246,63 @@ define <4 x float> @vtrnQf(ptr %A, ptr %B) nounwind {
 	ret <4 x float> %tmp5
 }
 
+define <8 x i8> @vtrni8_trn1_flipped(<8 x i8> %A, <8 x i8> %B) nounwind {
+; CHECKLE-LABEL: vtrni8_trn1_flipped:
+; CHECKLE:       // %bb.0:
+; CHECKLE-NEXT:    trn1 v0.8b, v1.8b, v0.8b
+; CHECKLE-NEXT:    ret
+;
+; CHECKBE-LABEL: vtrni8_trn1_flipped:
+; CHECKBE:       // %bb.0:
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    rev64 v1.8b, v1.8b
+; CHECKBE-NEXT:    trn1 v0.8b, v1.8b, v0.8b
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    ret
+  %tmp1 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @vtrni8_trn2_flipped(<8 x i8> %A, <8 x i8> %B) nounwind {
+; CHECKLE-LABEL: vtrni8_trn2_flipped:
+; CHECKLE:       // %bb.0:
+; CHECKLE-NEXT:    trn2 v0.8b, v1.8b, v0.8b
+; CHECKLE-NEXT:    ret
+;
+; CHECKBE-LABEL: vtrni8_trn2_flipped:
+; CHECKBE:       // %bb.0:
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    rev64 v1.8b, v1.8b
+; CHECKBE-NEXT:    trn2 v0.8b, v1.8b, v0.8b
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    ret
+  %tmp1 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> <i32 9, i32 1, i32 11, i32 3, i32 13, i32 5, i32 15, i32 7>
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @vtrni8_both_flipped_with_poison_values(<8 x i8> %A, <8 x i8> %B) nounwind {
+; CHECKLE-LABEL: vtrni8_both_flipped_with_poison_values:
+; CHECKLE:       // %bb.0:
+; CHECKLE-NEXT:    trn1 v2.8b, v1.8b, v0.8b
+; CHECKLE-NEXT:    trn2 v0.8b, v1.8b, v0.8b
+; CHECKLE-NEXT:    add v0.8b, v2.8b, v0.8b
+; CHECKLE-NEXT:    ret
+;
+; CHECKBE-LABEL: vtrni8_both_flipped_with_poison_values:
+; CHECKBE:       // %bb.0:
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    rev64 v1.8b, v1.8b
+; CHECKBE-NEXT:    trn1 v2.8b, v1.8b, v0.8b
+; CHECKBE-NEXT:    trn2 v0.8b, v1.8b, v0.8b
+; CHECKBE-NEXT:    add v0.8b, v2.8b, v0.8b
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    ret
+  %tmp1 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> <i32 poison, i32 0, i32 poison, i32 2, i32 poison, i32 4, i32 14, i32 6>
+  %tmp2 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> <i32 poison, i32 1, i32 poison, i32 3, i32 13, i32 5, i32 15, i32 poison>
+  %tmp3 = add <8 x i8> %tmp1, %tmp2
+  ret <8 x i8> %tmp3
+}
+
 ; Undef shuffle indices (even at the start of the shuffle mask) should not prevent matching to VTRN:
 
 define <8 x i8> @vtrni8_undef(ptr %A, ptr %B) nounwind {
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
index 282e0503dd7be..8e75d69be5062 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
@@ -6,12 +6,10 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
 ; CHECK-SD-LABEL: vector_deinterleave_v2f16_v4f16:
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    dup v2.2s, v0.s[1]
-; CHECK-SD-NEXT:    mov v1.16b, v2.16b
-; CHECK-SD-NEXT:    zip1 v2.4h, v0.4h, v2.4h
-; CHECK-SD-NEXT:    mov v1.h[0], v0.h[1]
+; CHECK-SD-NEXT:    dup v1.2s, v0.s[1]
+; CHECK-SD-NEXT:    zip1 v2.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    trn2 v1.4h, v0.4h, v1.4h
 ; CHECK-SD-NEXT:    fmov d0, d2
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: vector_deinterleave_v2f16_v4f16:
diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
index 072f6f4e8f73e..39beffcf85783 100644
--- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
@@ -36,93 +36,93 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
 ; CHECK-NEXT:    zip1 v5.4s, v3.4s, v0.4s
 ; CHECK-NEXT:    trn1 v6.4s, v3.4s, v0.4s
 ; CHECK-NEXT:    zip2 v0.4s, v3.4s, v0.4s
-; CHECK-NEXT:    ext v16.16b, v1.16b, v1.16b, #12
-; CHECK-NEXT:    zip2 v17.4s, v1.4s, v2.4s
-; CHECK-NEXT:    zip2 v7.4s, v2.4s, v1.4s
-; CHECK-NEXT:    zip1 v18.4s, v2.4s, v1.4s
+; CHECK-NEXT:    ext v7.16b, v1.16b, v1.16b, #12
+; CHECK-NEXT:    zip2 v16.4s, v1.4s, v2.4s
+; CHECK-NEXT:    zip1 v17.4s, v2.4s, v1.4s
+; CHECK-NEXT:    trn2 v18.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    uzp2 v4.4s, v4.4s, v1.4s
 ; CHECK-NEXT:    ext v3.16b, v3.16b, v5.16b, #8
-; CHECK-NEXT:    mov v1.s[0], v2.s[1]
-; CHECK-NEXT:    ext v2.16b, v2.16b, v16.16b, #12
-; CHECK-NEXT:    mov v17.d[1], v6.d[1]
-; CHECK-NEXT:    mov v7.d[1], v6.d[1]
+; CHECK-NEXT:    zip2 v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    ext v2.16b, v2.16b, v7.16b, #12
+; CHECK-NEXT:    mov v16.d[1], v6.d[1]
+; CHECK-NEXT:    mov v18.d[1], v5.d[1]
 ; CHECK-NEXT:    mov v4.d[1], v0.d[1]
-; CHECK-NEXT:    mov v18.d[1], v3.d[1]
-; CHECK-NEXT:    mov v1.d[1], v5.d[1]
+; CHECK-NEXT:    mov v17.d[1], v3.d[1]
+; CHECK-NEXT:    mov v1.d[1], v6.d[1]
 ; CHECK-NEXT:    mov v2.d[1], v0.d[1]
-; CHECK-NEXT:    add v0.4s, v4.4s, v17.4s
-; CHECK-NEXT:    add v3.4s, v1.4s, v18.4s
-; CHECK-NEXT:    sub v1.4s, v18.4s, v1.4s
-; CHECK-NEXT:    sub v2.4s, v7.4s, v2.4s
+; CHECK-NEXT:    add v0.4s, v4.4s, v16.4s
+; CHECK-NEXT:    add v3.4s, v18.4s, v17.4s
+; CHECK-NEXT:    sub v6.4s, v17.4s, v18.4s
+; CHECK-NEXT:    sub v1.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    rev64 v4.4s, v0.4s
 ; CHECK-NEXT:    rev64 v5.4s, v3.4s
-; CHECK-NEXT:    sub v6.4s, v1.4s, v2.4s
-; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    sub v2.4s, v6.4s, v1.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v6.4s
 ; CHECK-NEXT:    mov v4.d[1], v0.d[1]
 ; CHECK-NEXT:    mov v5.d[1], v3.d[1]
-; CHECK-NEXT:    rev64 v2.4s, v6.4s
+; CHECK-NEXT:    rev64 v6.4s, v2.4s
 ; CHECK-NEXT:    rev64 v7.4s, v1.4s
 ; CHECK-NEXT:    sub v3.4s, v3.4s, v4.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v5.4s
-; CHECK-NEXT:    sub v2.4s, v6.4s, v2.4s
+; CHECK-NEXT:    sub v4.4s, v2.4s, v6.4s
 ; CHECK-NEXT:    sub v5.4s, v1.4s, v7.4s
-; CHECK-NEXT:    addp v4.4s, v3.4s, v6.4s
+; CHECK-NEXT:    addp v2.4s, v3.4s, v2.4s
 ; CHECK-NEXT:    addp v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    rev64 v6.4s, v0.4s
 ; CHECK-NEXT:    rev64 v7.4s, v3.4s
-; CHECK-NEXT:    ext v16.16b, v4.16b, v2.16b, #4
+; CHECK-NEXT:    ext v16.16b, v2.16b, v4.16b, #4
 ; CHECK-NEXT:    ext v17.16b, v1.16b, v5.16b, #4
 ; CHECK-NEXT:    sub v0.4s, v0.4s, v6.4s
 ; CHECK-NEXT:    sub v3.4s, v3.4s, v7.4s
-; CHECK-NEXT:    mov v7.16b, v2.16b
-; CHECK-NEXT:    zip2 v6.4s, v16.4s, v4.4s
+; CHECK-NEXT:    mov v7.16b, v4.16b
+; CHECK-NEXT:    zip2 v6.4s, v16.4s, v2.4s
 ; CHECK-NEXT:    mov v16.16b, v5.16b
 ; CHECK-NEXT:    zip2 v17.4s, v17.4s, v1.4s
 ; CHECK-NEXT:    ext v18.16b, v0.16b, v1.16b, #4
-; CHECK-NEXT:    mov v7.s[2], v4.s[3]
+; CHECK-NEXT:    mov v7.s[2], v2.s[3]
 ; CHECK-NEXT:    mov v21.16b, v3.16b
 ; CHECK-NEXT:    mov v16.s[2], v1.s[3]
 ; CHECK-NEXT:    ext v5.16b, v5.16b, v17.16b, #12
 ; CHECK-NEXT:    zip1 v17.4s, v1.4s, v1.4s
-; CHECK-NEXT:    ext v2.16b, v2.16b, v6.16b, #12
+; CHECK-NEXT:    ext v4.16b, v4.16b, v6.16b, #12
 ; CHECK-NEXT:    ext v18.16b, v18.16b, v18.16b, #4
 ; CHECK-NEXT:    mov v19.16b, v7.16b
-; CHECK-NEXT:    ext v6.16b, v3.16b, v4.16b, #8
-; CHECK-NEXT:    mov v21.s[2], v4.s[1]
+; CHECK-NEXT:    ext v6.16b, v3.16b, v2.16b, #8
+; CHECK-NEXT:    mov v21.s[2], v2.s[1]
 ; CHECK-NEXT:    mov v20.16b, v16.16b
-; CHECK-NEXT:    mov v19.s[1], v4.s[2]
+; CHECK-NEXT:    mov v19.s[1], v2.s[2]
 ; CHECK-NEXT:    trn2 v0.4s, v17.4s, v0.4s
 ; CHECK-NEXT:    sub v16.4s, v16.4s, v5.4s
 ; CHECK-NEXT:    mov v17.16b, v18.16b
 ; CHECK-NEXT:    ext v3.16b, v6.16b, v3.16b, #4
-; CHECK-NEXT:    sub v7.4s, v7.4s, v2.4s
+; CHECK-NEXT:    sub v7.4s, v7.4s, v4.4s
 ; CHECK-NEXT:    mov v20.s[1], v1.s[2]
 ; CHECK-NEXT:    mov v17.s[0], v1.s[1]
 ; CHECK-NEXT:    mov v1.16b, v21.16b
-; CHECK-NEXT:    add v2.4s, v19.4s, v2.4s
-; CHECK-NEXT:    uzp2 v3.4s, v6.4s, v3.4s
+; CHECK-NEXT:    add v4.4s, v19.4s, v4.4s
 ; CHECK-NEXT:    add v5.4s, v20.4s, v5.4s
-; CHECK-NEXT:    mov v1.s[1], v4.s[0]
-; CHECK-NEXT:    sub v4.4s, v0.4s, v18.4s
-; CHECK-NEXT:    mov v2.d[1], v7.d[1]
+; CHECK-NEXT:    mov v1.s[1], v2.s[0]
+; CHECK-NEXT:    uzp2 v2.4s, v6.4s, v3.4s
+; CHECK-NEXT:    sub v3.4s, v0.4s, v18.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v17.4s
+; CHECK-NEXT:    mov v4.d[1], v7.d[1]
 ; CHECK-NEXT:    mov v5.d[1], v16.d[1]
-; CHECK-NEXT:    sub v6.4s, v21.4s, v3.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mov v0.d[1], v4.d[1]
-; CHECK-NEXT:    cmlt v4.8h, v2.8h, #0
-; CHECK-NEXT:    cmlt v3.8h, v5.8h, #0
+; CHECK-NEXT:    sub v6.4s, v21.4s, v2.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    mov v0.d[1], v3.d[1]
+; CHECK-NEXT:    cmlt v3.8h, v4.8h, #0
+; CHECK-NEXT:    cmlt v2.8h, v5.8h, #0
 ; CHECK-NEXT:    mov v1.d[1], v6.d[1]
-; CHECK-NEXT:    add v2.4s, v4.4s, v2.4s
 ; CHECK-NEXT:    cmlt v6.8h, v0.8h, #0
-; CHECK-NEXT:    add v5.4s, v3.4s, v5.4s
-; CHECK-NEXT:    eor v2.16b, v2.16b, v4.16b
+; CHECK-NEXT:    add v4.4s, v3.4s, v4.4s
+; CHECK-NEXT:    add v5.4s, v2.4s, v5.4s
 ; CHECK-NEXT:    cmlt v7.8h, v1.8h, #0
 ; CHECK-NEXT:    add v0.4s, v6.4s, v0.4s
-; CHECK-NEXT:    eor v3.16b, v5.16b, v3.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
+; CHECK-NEXT:    eor v2.16b, v5.16b, v2.16b
 ; CHECK-NEXT:    add v1.4s, v7.4s, v1.4s
 ; CHECK-NEXT:    eor v0.16b, v0.16b, v6.16b
-; CHECK-NEXT:    add v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    add v2.4s, v2.4s, v3.4s
 ; CHECK-NEXT:    eor v1.16b, v1.16b, v7.16b
 ; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
@@ -255,77 +255,76 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
 ; CHECK-NEXT:    saddw v3.4s, v6.4s, v3.4h
 ; CHECK-NEXT:    saddw v2.4s, v7.4s, v2.4h
 ; CHECK-NEXT:    zip1 v4.4s, v1.4s, v0.4s
-; CHECK-NEXT:    trn1 v18.4s, v1.4s, v0.4s
+; CHECK-NEXT:    trn1 v6.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    zip2 v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    uzp2 v5.4s, v3.4s, v2.4s
-; CHECK-NEXT:    mov v7.16b, v3.16b
-; CHECK-NEXT:    zip1 v6.4s, v2.4s, v3.4s
-; CHECK-NEXT:    zip2 v16.4s, v3.4s, v2.4s
+; CHECK-NEXT:    zip1 v7.4s, v2.4s, v3.4s
+; CHECK-NEXT:    trn2 v16.4s, v2.4s, v3.4s
+; CHECK-NEXT:    ext v18.16b, v3.16b, v3.16b, #12
 ; CHECK-NEXT:    ext v17.16b, v1.16b, v4.16b, #8
-; CHECK-NEXT:    mov v7.s[0], v2.s[1]
-; CHECK-NEXT:    ext v1.16b, v3.16b, v3.16b, #12
+; CHECK-NEXT:    zip2 v1.4s, v3.4s, v2.4s
 ; CHECK-NEXT:    uzp2 v5.4s, v5.4s, v3.4s
+; CHECK-NEXT:    mov v16.d[1], v4.d[1]
 ; CHECK-NEXT:    zip2 v3.4s, v2.4s, v3.4s
-; CHECK-NEXT:    mov v16.d[1], v18.d[1]
-; CHECK-NEXT:    mov v6.d[1], v17.d[1]
-; CHECK-NEXT:    mov v7.d[1], v4.d[1]
-; CHECK-NEXT:    ext v1.16b, v2.16b, v1.16b, #12
+; CHECK-NEXT:    ext v2.16b, v2.16b, v18.16b, #12
+; CHECK-NEXT:    mov v7.d[1], v17.d[1]
+; CHECK-NEXT:    mov v1.d[1], v6.d[1]
 ; CHECK-NEXT:    mov v5.d[1], v0.d[1]
-; CHECK-NEXT:    mov v3.d[1], v18.d[1]
-; CHECK-NEXT:    add v2.4s, v7.4s, v6.4s
-; CHECK-NEXT:    mov v1.d[1], v0.d[1]
-; CHECK-NEXT:    add v4.4s, v5.4s, v16.4s
-; CHECK-NEXT:    rev64 v5.4s, v2.4s
-; CHECK-NEXT:    rev64 v0.4s, v4.4s
-; CHECK-NEXT:    sub v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    sub v3.4s, v6.4s, v7.4s
-; CHECK-NEXT:    mov v5.d[1], v2.d[1]
-; CHECK-NEXT:    add v6.4s, v1.4s, v3.4s
-; CHECK-NEXT:    sub v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    mov v0.d[1], v4.d[1]
-; CHECK-NEXT:    add v4.4s, v4.4s, v5.4s
-; CHECK-NEXT:    sub v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    zip1 v2.4s, v4.4s, v6.4s
-; CHECK-NEXT:    uzp2 v3.4s, v4.4s, v6.4s
-; CHECK-NEXT:    zip2 v16.4s, v4.4s, v6.4s
-; CHECK-NEXT:    zip1 v5.4s, v0.4s, v1.4s
-; CHECK-NEXT:    trn1 v7.4s, v0.4s, v1.4s
-; CHECK-NEXT:    zip2 v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    trn2 v2.4s, v4.4s, v2.4s
-; CHECK-NEXT:    uzp2 v3.4s, v3.4s, v4.4s
-; CHECK-NEXT:    mov v4.s[1], v6.s[1]
-; CHECK-NEXT:    ext v0.16b, v0.16b, v5.16b, #8
-; CHECK-NEXT:    mov v16.d[1], v7.d[1]
-; CHECK-NEXT:    mov v3.d[1], v1.d[1]
-; CHECK-NEXT:    mov v4.d[1], v5.d[1]
+; CHECK-NEXT:    mov v3.d[1], v6.d[1]
 ; CHECK-NEXT:    mov v2.d[1], v0.d[1]
-; CHECK-NEXT:    add v0.4s, v16.4s, v3.4s
-; CHECK-NEXT:    sub v3.4s, v3.4s, v16.4s
-; CHECK-NEXT:    add v1.4s, v4.4s, v2.4s
-; CHECK-NEXT:    sub v2.4s, v2.4s, v4.4s
+; CHECK-NEXT:    add v4.4s, v16.4s, v7.4s
+; CHECK-NEXT:    sub v6.4s, v7.4s, v16.4s
+; CHECK-NEXT:    add v1.4s, v5.4s, v1.4s
+; CHECK-NEXT:    sub v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    rev64 v5.4s, v4.4s
+; CHECK-NEXT:    rev64 v0.4s, v1.4s
+; CHECK-NEXT:    add v3.4s, v2.4s, v6.4s
+; CHECK-NEXT:    sub v2.4s, v6.4s, v2.4s
+; CHECK-NEXT:    mov v5.d[1], v4.d[1]
+; CHECK-NEXT:    mov v0.d[1], v1.d[1]
+; CHECK-NEXT:    add v1.4s, v1.4s, v5.4s
+; CHECK-NEXT:    sub v0.4s, v4.4s, v0.4s
+; CHECK-NEXT:    zip1 v4.4s, v1.4s, v3.4s
+; CHECK-NEXT:    uzp2 v5.4s, v1.4s, v3.4s
+; CHECK-NEXT:    zip2 v7.4s, v1.4s, v3.4s
+; CHECK-NEXT:    zip1 v6.4s, v0.4s, v2.4s
+; CHECK-NEXT:    trn1 v16.4s, v0.4s, v2.4s
+; CHECK-NEXT:    zip2 v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    trn2 v4.4s, v1.4s, v4.4s
+; CHECK-NEXT:    uzp2 v5.4s, v5.4s, v1.4s
+; CHECK-NEXT:    mov v1.s[1], v3.s[1]
+; CHECK-NEXT:    ext v0.16b, v0.16b, v6.16b, #8
+; CHECK-NEXT:    mov v7.d[1], v16.d[1]
+; CHECK-NEXT:    mov v5.d[1], v2.d[1]
+; CHECK-NEXT:    mov v1.d[1], v6.d[1]
+; CHECK-NEXT:    mov v4.d[1], v0.d[1]
+; CHECK-NEXT:    add v0.4s, v7.4s, v5.4s
+; CHECK-NEXT:    sub v3.4s, v5.4s, v7.4s
+; CHECK-NEXT:    add v2.4s, v1.4s, v4.4s
+; CHECK-NEXT:    sub v1.4s, v4.4s, v1.4s
 ; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #4
 ; CHECK-NEXT:    zip2 v6.4s, v0.4s, v3.4s
 ; CHECK-NEXT:    zip2 v7.4s, v3.4s, v0.4s
-; CHECK-NEXT:    ext v5.16b, v1.16b, v1.16b, #4
-; CHECK-NEXT:    zip2 v16.4s, v2.4s, v1.4s
-; CHECK-NEXT:    zip2 v17.4s, v1.4s, v2.4s
+; CHECK-NEXT:    ext v5.16b, v2.16b, v2.16b, #4
+; CHECK-NEXT:    zip2 v16.4s, v1.4s, v2.4s
+; CHECK-NEXT:    zip2 v17.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    zip1 v0.4s, v0.4s, v3.4s
-; CHECK-NEXT:    zip1 v1.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    ext v18.16b, v4.16b, v3.16b, #8
-; CHECK-NEXT:    ext v19.16b, v5.16b, v2.16b, #8
+; CHECK-NEXT:    ext v19.16b, v5.16b, v1.16b, #8
+; CHECK-NEXT:    zip1 v1.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    add v2.4s, v16.4s, v7.4s
 ; CHECK-NEXT:    sub v3.4s, v6.4s, v17.4s
-; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ext v4.16b, v18.16b, v4.16b, #4
-; CHECK-NEXT:    cmlt v1.8h, v3.8h, #0
 ; CHECK-NEXT:    cmlt v6.8h, v2.8h, #0
 ; CHECK-NEXT:    ext v5.16b, v19.16b, v5.16b, #4
+; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    cmlt v1.8h, v3.8h, #0
 ; CHECK-NEXT:    add v2.4s, v6.4s, v2.4s
 ; CHECK-NEXT:    add v3.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    add v4.4s, v5.4s, v4.4s
 ; CHECK-NEXT:    cmlt v5.8h, v0.8h, #0
-; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
 ; CHECK-NEXT:    eor v2.16b, v2.16b, v6.16b
+; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
 ; CHECK-NEXT:    cmlt v7.8h, v4.8h, #0
 ; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
 ; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
@@ -480,7 +479,7 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
 ; CHECK-NEXT:    sub v3.4s, v3.4s, v7.4s
 ; CHECK-NEXT:    uzp2 v4.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    uzp1 v7.4s, v1.4s, v0.4s
-; CHECK-NEXT:    mov v6.s[3], v5.s[2]
+; CHECK-NEXT:    trn1 v6.4s, v6.4s, v5.4s
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    zip2 v17.4s, v2.4s, v3.4s
 ; CHECK-NEXT:    zip1 v2.4s, v2.4s, v3.4s

From 0895163097b66f944b0e0a59960eab1deaf36684 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Tue, 9 Dec 2025 12:43:01 -0800
Subject: [PATCH 54/85] [bazel] Port 24117f75ad9d7bbb439e8e1bd596fdcf0aa8d6e2
 (#171497)

This patch removed some source files that were explicitly enumerated in
the bazel files. Remove them so that the build passes.
---
 .../clang-tools-extra/clang-doc/BUILD.bazel                     | 2 --
 1 file changed, 2 deletions(-)

diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel
index 45dada5884cb4..179658cadb0e2 100644
--- a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel
@@ -28,7 +28,6 @@ cc_library(
         exclude = [
             "Generators.cpp",
             "HTMLGenerator.cpp",
-            "HTMLMustacheGenerator.cpp",
             "MDGenerator.cpp",
             "YAMLGenerator.cpp",
         ],
@@ -53,7 +52,6 @@ cc_library(
     srcs = [
         "Generators.cpp",
         "HTMLGenerator.cpp",
-        "HTMLMustacheGenerator.cpp",
         "MDGenerator.cpp",
         "YAMLGenerator.cpp",
     ],

From 019a2947719b979a4192ad4baa96e155e240f145 Mon Sep 17 00:00:00 2001
From: Medha Tiwari <75640645+medhatiwari@users.noreply.github.com>
Date: Wed, 10 Dec 2025 02:19:42 +0530
Subject: [PATCH 55/85] [CIR][X86] Implement xsave/xrstor builtins Fixes part
 of #167752 (#170877)

Handle xsave/xrstor family of X86 builtins in ClangIR

Part of #167752

---------

Signed-off-by: Medha Tiwari <medhatiwari@ibm.com>
---
 clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp    |  71 ++++++-
 .../CIR/CodeGenBuiltins/X86/xsave-builtins.c  | 194 ++++++++++++++++++
 2 files changed, 264 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/xsave-builtins.c

diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 855134ba2b249..62836ce0f7537 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -544,9 +544,78 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_xsaves:
   case X86::BI__builtin_ia32_xsaves64:
   case X86::BI__builtin_ia32_xsetbv:
-  case X86::BI_xsetbv:
+  case X86::BI_xsetbv: {
+    mlir::Location loc = getLoc(expr->getExprLoc());
+    StringRef intrinsicName;
+    switch (builtinID) {
+    default:
+      llvm_unreachable("Unexpected builtin");
+    case X86::BI__builtin_ia32_xsave:
+      intrinsicName = "x86.xsave";
+      break;
+    case X86::BI__builtin_ia32_xsave64:
+      intrinsicName = "x86.xsave64";
+      break;
+    case X86::BI__builtin_ia32_xrstor:
+      intrinsicName = "x86.xrstor";
+      break;
+    case X86::BI__builtin_ia32_xrstor64:
+      intrinsicName = "x86.xrstor64";
+      break;
+    case X86::BI__builtin_ia32_xsaveopt:
+      intrinsicName = "x86.xsaveopt";
+      break;
+    case X86::BI__builtin_ia32_xsaveopt64:
+      intrinsicName = "x86.xsaveopt64";
+      break;
+    case X86::BI__builtin_ia32_xrstors:
+      intrinsicName = "x86.xrstors";
+      break;
+    case X86::BI__builtin_ia32_xrstors64:
+      intrinsicName = "x86.xrstors64";
+      break;
+    case X86::BI__builtin_ia32_xsavec:
+      intrinsicName = "x86.xsavec";
+      break;
+    case X86::BI__builtin_ia32_xsavec64:
+      intrinsicName = "x86.xsavec64";
+      break;
+    case X86::BI__builtin_ia32_xsaves:
+      intrinsicName = "x86.xsaves";
+      break;
+    case X86::BI__builtin_ia32_xsaves64:
+      intrinsicName = "x86.xsaves64";
+      break;
+    case X86::BI__builtin_ia32_xsetbv:
+    case X86::BI_xsetbv:
+      intrinsicName = "x86.xsetbv";
+      break;
+    }
+
+    // The xsave family of instructions take a 64-bit mask that specifies
+    // which processor state components to save/restore. The hardware expects
+    // this mask split into two 32-bit registers: EDX (high 32 bits) and
+    // EAX (low 32 bits).
+    mlir::Type i32Ty = builder.getSInt32Ty();
+
+    // Mhi = (uint32_t)(ops[1] >> 32) - extract high 32 bits via right shift
+    cir::ConstantOp shift32 = builder.getSInt64(32, loc);
+    mlir::Value mhi = builder.createShift(loc, ops[1], shift32.getResult(),
+                                          /*isShiftLeft=*/false);
+    mhi = builder.createIntCast(mhi, i32Ty);
+
+    // Mlo = (uint32_t)ops[1] - extract low 32 bits by truncation
+    mlir::Value mlo = builder.createIntCast(ops[1], i32Ty);
+
+    return emitIntrinsicCallOp(builder, loc, intrinsicName, voidTy,
+                               mlir::ValueRange{ops[0], mhi, mlo});
+  }
   case X86::BI__builtin_ia32_xgetbv:
   case X86::BI_xgetbv:
+    // xgetbv reads the extended control register specified by ops[0] (ECX)
+    // and returns the 64-bit value
+    return emitIntrinsicCallOp(builder, getLoc(expr->getExprLoc()),
+                               "x86.xgetbv", builder.getUInt64Ty(), ops[0]);
   case X86::BI__builtin_ia32_storedqudi128_mask:
   case X86::BI__builtin_ia32_storedqusi128_mask:
   case X86::BI__builtin_ia32_storedquhi128_mask:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/xsave-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/xsave-builtins.c
new file mode 100644
index 0000000000000..23d6edc6c8c6e
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/xsave-builtins.c
@@ -0,0 +1,194 @@
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +xsave -target-feature +xsaveopt -target-feature +xsavec -target-feature +xsaves -fclangir -emit-cir -o %t.cir -Wall -Werror
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +xsave -target-feature +xsaveopt -target-feature +xsavec -target-feature +xsaves -fclangir -emit-llvm -o %t.ll -Wall -Werror
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +xsave -target-feature +xsaveopt -target-feature +xsavec -target-feature +xsaves -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+
+void test_xsave(void *p, unsigned long long m) {
+  // CIR-LABEL: test_xsave
+  // CIR: [[P:%.*]] = cir.load {{.*}} : !cir.ptr<!cir.ptr<!void>>, !cir.ptr<!void>
+  // CIR: [[M:%.*]] = cir.load {{.*}} : !cir.ptr<!u64i>, !u64i
+  // CIR: [[CONST:%.*]] = cir.const #cir.int<32> : !s64i
+  // CIR: [[SHIFT:%.*]] = cir.shift(right, [[M]] : !u64i, [[CONST]] : !s64i) -> !u64i
+  // CIR: [[CAST1:%.*]] = cir.cast integral [[SHIFT]] : !u64i -> !s32i
+  // CIR: [[CAST2:%.*]] = cir.cast integral [[M]] : !u64i -> !s32i
+  // CIR: cir.call_llvm_intrinsic "x86.xsave" [[P]], [[CAST1]], [[CAST2]]
+
+  // LLVM-LABEL: test_xsave
+  // LLVM: [[LP:%.*]] = load ptr, ptr
+  // LLVM: [[LM:%.*]] = load i64, ptr
+  // LLVM: [[LSHIFT:%.*]] = lshr i64 [[LM]], 32
+  // LLVM: [[LCAST1:%.*]] = trunc i64 [[LSHIFT]] to i32
+  // LLVM: [[LCAST2:%.*]] = trunc i64 [[LM]] to i32
+  // LLVM: call void @llvm.x86.xsave(ptr [[LP]], i32 [[LCAST1]], i32 [[LCAST2]])
+
+  // OGCG-LABEL: test_xsave
+  // OGCG: [[OP:%.*]] = load ptr, ptr
+  // OGCG: [[OM:%.*]] = load i64, ptr
+  // OGCG: [[OSHIFT:%.*]] = lshr i64 [[OM]], 32
+  // OGCG: [[OCAST1:%.*]] = trunc i64 [[OSHIFT]] to i32
+  // OGCG: [[OCAST2:%.*]] = trunc i64 [[OM]] to i32
+  // OGCG: call void @llvm.x86.xsave(ptr [[OP]], i32 [[OCAST1]], i32 [[OCAST2]])
+  __builtin_ia32_xsave(p, m);
+}
+
+// The following tests use the same pattern as test_xsave (load, shift, cast, cast, intrinsic call).
+// Only the intrinsic name differs, so we just check the intrinsic call.
+
+void test_xsave64(void *p, unsigned long long m) {
+  // CIR-LABEL: test_xsave64
+  // CIR: cir.call_llvm_intrinsic "x86.xsave64"
+
+  // LLVM-LABEL: test_xsave64
+  // LLVM: call void @llvm.x86.xsave64
+
+  // OGCG-LABEL: test_xsave64
+  // OGCG: call void @llvm.x86.xsave64
+  __builtin_ia32_xsave64(p, m);
+}
+
+void test_xrstor(void *p, unsigned long long m) {
+  // CIR-LABEL: test_xrstor
+  // CIR: cir.call_llvm_intrinsic "x86.xrstor"
+
+  // LLVM-LABEL: test_xrstor
+  // LLVM: call void @llvm.x86.xrstor
+
+  // OGCG-LABEL: test_xrstor
+  // OGCG: call void @llvm.x86.xrstor
+  __builtin_ia32_xrstor(p, m);
+}
+
+void test_xrstor64(void *p, unsigned long long m) {
+  // CIR-LABEL: test_xrstor64
+  // CIR: cir.call_llvm_intrinsic "x86.xrstor64"
+
+  // LLVM-LABEL: test_xrstor64
+  // LLVM: call void @llvm.x86.xrstor64
+
+  // OGCG-LABEL: test_xrstor64
+  // OGCG: call void @llvm.x86.xrstor64
+  __builtin_ia32_xrstor64(p, m);
+}
+
+void test_xsaveopt(void *p, unsigned long long m) {
+  // CIR-LABEL: test_xsaveopt
+  // CIR: cir.call_llvm_intrinsic "x86.xsaveopt"
+
+  // LLVM-LABEL: test_xsaveopt
+  // LLVM: call void @llvm.x86.xsaveopt
+
+  // OGCG-LABEL: test_xsaveopt
+  // OGCG: call void @llvm.x86.xsaveopt
+  __builtin_ia32_xsaveopt(p, m);
+}
+
+void test_xsaveopt64(void *p, unsigned long long m) {
+  // CIR-LABEL: test_xsaveopt64
+  // CIR: cir.call_llvm_intrinsic "x86.xsaveopt64"
+
+  // LLVM-LABEL: test_xsaveopt64
+  // LLVM: call void @llvm.x86.xsaveopt64
+
+  // OGCG-LABEL: test_xsaveopt64
+  // OGCG: call void @llvm.x86.xsaveopt64
+  __builtin_ia32_xsaveopt64(p, m);
+}
+
+void test_xsavec(void *p, unsigned long long m) {
+  // CIR-LABEL: test_xsavec
+  // CIR: cir.call_llvm_intrinsic "x86.xsavec"
+
+  // LLVM-LABEL: test_xsavec
+  // LLVM: call void @llvm.x86.xsavec
+
+  // OGCG-LABEL: test_xsavec
+  // OGCG: call void @llvm.x86.xsavec
+  __builtin_ia32_xsavec(p, m);
+}
+
+void test_xsavec64(void *p, unsigned long long m) {
+  // CIR-LABEL: test_xsavec64
+  // CIR: cir.call_llvm_intrinsic "x86.xsavec64"
+
+  // LLVM-LABEL: test_xsavec64
+  // LLVM: call void @llvm.x86.xsavec64
+
+  // OGCG-LABEL: test_xsavec64
+  // OGCG: call void @llvm.x86.xsavec64
+  __builtin_ia32_xsavec64(p, m);
+}
+
+void test_xsaves(void *p, unsigned long long m) {
+  // CIR-LABEL: test_xsaves
+  // CIR: cir.call_llvm_intrinsic "x86.xsaves"
+
+  // LLVM-LABEL: test_xsaves
+  // LLVM: call void @llvm.x86.xsaves
+
+  // OGCG-LABEL: test_xsaves
+  // OGCG: call void @llvm.x86.xsaves
+  __builtin_ia32_xsaves(p, m);
+}
+
+void test_xsaves64(void *p, unsigned long long m) {
+  // CIR-LABEL: test_xsaves64
+  // CIR: cir.call_llvm_intrinsic "x86.xsaves64"
+
+  // LLVM-LABEL: test_xsaves64
+  // LLVM: call void @llvm.x86.xsaves64
+
+  // OGCG-LABEL: test_xsaves64
+  // OGCG: call void @llvm.x86.xsaves64
+  __builtin_ia32_xsaves64(p, m);
+}
+
+void test_xrstors(void *p, unsigned long long m) {
+  // CIR-LABEL: test_xrstors
+  // CIR: cir.call_llvm_intrinsic "x86.xrstors"
+
+  // LLVM-LABEL: test_xrstors
+  // LLVM: call void @llvm.x86.xrstors
+
+  // OGCG-LABEL: test_xrstors
+  // OGCG: call void @llvm.x86.xrstors
+  __builtin_ia32_xrstors(p, m);
+}
+
+void test_xrstors64(void *p, unsigned long long m) {
+  // CIR-LABEL: test_xrstors64
+  // CIR: cir.call_llvm_intrinsic "x86.xrstors64"
+
+  // LLVM-LABEL: test_xrstors64
+  // LLVM: call void @llvm.x86.xrstors64
+
+  // OGCG-LABEL: test_xrstors64
+  // OGCG: call void @llvm.x86.xrstors64
+  __builtin_ia32_xrstors64(p, m);
+}
+
+unsigned long long test_xgetbv(unsigned int a) {
+  // CIR-LABEL: test_xgetbv
+  // CIR: cir.call_llvm_intrinsic "x86.xgetbv"
+
+  // LLVM-LABEL: test_xgetbv
+  // LLVM: call i64 @llvm.x86.xgetbv
+
+  // OGCG-LABEL: test_xgetbv
+  // OGCG: call i64 @llvm.x86.xgetbv
+  return __builtin_ia32_xgetbv(a);
+}
+
+void test_xsetbv(unsigned int a, unsigned long long m) {
+  // CIR-LABEL: test_xsetbv
+  // CIR: cir.call_llvm_intrinsic "x86.xsetbv"
+
+  // LLVM-LABEL: test_xsetbv
+  // LLVM: call void @llvm.x86.xsetbv
+
+  // OGCG-LABEL: test_xsetbv
+  // OGCG: call void @llvm.x86.xsetbv
+  __builtin_ia32_xsetbv(a, m);
+}
+

From 87bf5ee23863bc0b467ee44b2184b2c134a98464 Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor@nvidia.com>
Date: Tue, 9 Dec 2025 12:55:47 -0800
Subject: [PATCH 56/85] [CIR] Add basic support for data member pointers
 (#170939)

This adds the minimum support for C++ data member pointer variables.
---
 clang/include/clang/CIR/Dialect/IR/CIRAttrs.h |  1 +
 .../include/clang/CIR/Dialect/IR/CIRAttrs.td  | 51 +++++++++++
 .../CIR/Dialect/IR/CIRTypeConstraints.td      | 10 ++-
 .../include/clang/CIR/Dialect/IR/CIRTypes.td  | 32 ++++++-
 clang/include/clang/CIR/MissingFeatures.h     |  5 ++
 clang/lib/CIR/CodeGen/CIRGenBuilder.h         |  5 ++
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp    |  7 +-
 clang/lib/CIR/CodeGen/CIRGenModule.cpp        | 20 +++++
 clang/lib/CIR/CodeGen/CIRGenModule.h          |  2 +
 clang/lib/CIR/CodeGen/CIRGenTypes.cpp         | 15 ++++
 clang/lib/CIR/Dialect/IR/CIRAttrs.cpp         | 32 +++++++
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp       |  6 ++
 clang/lib/CIR/Dialect/IR/CIRTypes.cpp         | 20 +++++
 .../lib/CIR/Dialect/Transforms/CMakeLists.txt |  3 +
 .../Transforms/TargetLowering/CIRCXXABI.cpp   | 20 +++++
 .../Transforms/TargetLowering/CIRCXXABI.h     | 55 ++++++++++++
 .../Transforms/TargetLowering/CMakeLists.txt  | 20 +++++
 .../TargetLowering/LowerItaniumCXXABI.cpp     | 90 +++++++++++++++++++
 .../Transforms/TargetLowering/LowerModule.cpp | 87 ++++++++++++++++++
 .../Transforms/TargetLowering/LowerModule.h   | 55 ++++++++++++
 .../CIR/Lowering/DirectToLLVM/CMakeLists.txt  |  5 ++
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 34 ++++++-
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.h   |  2 +
 .../CIR/CodeGen/pointer-to-data-member.cpp    | 32 +++++++
 clang/test/CIR/IR/invalid-data-member.cir     | 27 ++++++
 clang/utils/TableGen/CIRLoweringEmitter.cpp   |  7 +-
 26 files changed, 631 insertions(+), 12 deletions(-)
 create mode 100644 clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.cpp
 create mode 100644 clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h
 create mode 100644 clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt
 create mode 100644 clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerItaniumCXXABI.cpp
 create mode 100644 clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.cpp
 create mode 100644 clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.h
 create mode 100644 clang/test/CIR/CodeGen/pointer-to-data-member.cpp
 create mode 100644 clang/test/CIR/IR/invalid-data-member.cir

diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.h b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.h
index 03a6a97dc8c2e..858d4d6350bed 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.h
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.h
@@ -35,6 +35,7 @@ namespace cir {
 class ArrayType;
 class BoolType;
 class ComplexType;
+class DataMemberType;
 class IntType;
 class MethodType;
 class PointerType;
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
index 98d4636dafc29..c0279a0b20670 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
@@ -447,6 +447,57 @@ def CIR_ConstPtrAttr : CIR_Attr<"ConstPtr", "ptr", [TypedAttrInterface]> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// DataMemberAttr
+//===----------------------------------------------------------------------===//
+
+def CIR_DataMemberAttr : CIR_Attr<"DataMember", "data_member", [
+  TypedAttrInterface
+]> {
+  let summary = "Holds a constant data member pointer value";
+  let parameters = (ins AttributeSelfTypeParameter<
+                            "", "cir::DataMemberType">:$type,
+                        OptionalParameter<
+                            "std::optional<unsigned>">:$member_index);
+  let description = [{
+    A data member attribute is a literal attribute that represents a constant
+    pointer-to-data-member value.
+
+    The `member_index` parameter represents the index of the pointed-to member
+    within its containing record. It is an optional parameter; lack of this
+    parameter indicates a null pointer-to-data-member value.
+
+    Example:
+    ```
+    #ptr = #cir.data_member<1> : !cir.data_member<!s32i in !rec_22Point22>
+
+    #null = #cir.data_member<null> : !cir.data_member<!s32i in !rec_22Point22>
+    ```
+  }];
+
+  let builders = [
+    AttrBuilderWithInferredContext<(ins "cir::DataMemberType":$type), [{
+      return $_get(type.getContext(), type, std::nullopt);
+    }]>,
+    AttrBuilderWithInferredContext<(ins "cir::DataMemberType":$type,
+                                        "unsigned":$member_index), [{
+      return $_get(type.getContext(), type, member_index);
+    }]>,
+  ];
+
+  let genVerifyDecl = 1;
+
+  let assemblyFormat = [{
+    `<` ($member_index^):(`null`)? `>`
+  }];
+
+  let extraClassDeclaration = [{
+    bool isNullPtr() const {
+      return !getMemberIndex().has_value();
+    }
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // GlobalViewAttr
 //===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td b/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td
index ddca98eac93ab..89762249ed0c4 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td
@@ -309,6 +309,13 @@ def CIR_AnyFloatOrVecOfFloatType
     let cppFunctionName = "isFPOrVectorOfFPType";
 }
 
+//===----------------------------------------------------------------------===//
+// Data member type predicates
+//===----------------------------------------------------------------------===//
+
+def CIR_AnyDataMemberType : CIR_TypeBase<"::cir::DataMemberType",
+    "data member type">;
+
 //===----------------------------------------------------------------------===//
 // VPtr type predicates
 //===----------------------------------------------------------------------===//
@@ -322,7 +329,8 @@ def CIR_PtrToVPtr : CIR_PtrToType<CIR_AnyVPtrType>;
 //===----------------------------------------------------------------------===//
 
 defvar CIR_ScalarTypes = [
-    CIR_AnyBoolType, CIR_AnyIntType, CIR_AnyFloatType, CIR_AnyPtrType
+    CIR_AnyBoolType, CIR_AnyIntType, CIR_AnyFloatType, CIR_AnyPtrType,
+    CIR_AnyDataMemberType, CIR_AnyVPtrType
 ];
 
 def CIR_AnyScalarType : AnyTypeOf<CIR_ScalarTypes, "cir scalar type"> {
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
index 3e062add6633a..59b97f0c6d39a 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
@@ -305,6 +305,36 @@ def CIR_PointerType : CIR_Type<"Pointer", "ptr", [
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// CIR_DataMemberType
+//===----------------------------------------------------------------------===//
+
+def CIR_DataMemberType : CIR_Type<"DataMember", "data_member",
+    [DeclareTypeInterfaceMethods<DataLayoutTypeInterface>]
+> {
+  let summary = "CIR type that represents a pointer-to-data-member in C++";
+  let description = [{
+    `cir.data_member` models a pointer-to-data-member in C++. Values of this
+    type are essentially offsets of the pointed-to member within one of its
+    containing record.
+  }];
+
+  let parameters = (ins "mlir::Type":$member_ty,
+                        "cir::RecordType":$class_ty);
+
+  let builders = [
+    TypeBuilderWithInferredContext<(ins
+      "mlir::Type":$member_ty, "cir::RecordType":$class_ty
+    ), [{
+      return $_get(member_ty.getContext(), member_ty, class_ty);
+    }]>,
+  ];
+
+  let assemblyFormat = [{
+    `<` $member_ty `in` $class_ty `>`
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // CIR_VPtrType
 //===----------------------------------------------------------------------===//
@@ -693,7 +723,7 @@ def CIRRecordType : Type<
 def CIR_AnyType : AnyTypeOf<[
   CIR_VoidType, CIR_BoolType, CIR_ArrayType, CIR_VectorType, CIR_IntType,
   CIR_AnyFloatType, CIR_PointerType, CIR_FuncType, CIR_RecordType,
-  CIR_ComplexType, CIR_VPtrType
+  CIR_ComplexType, CIR_VPtrType, CIR_DataMemberType
 ]>;
 
 #endif // CLANG_CIR_DIALECT_IR_CIRTYPES_TD
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 826a4b13f5c0c..b2d94709016fa 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -189,6 +189,10 @@ struct MissingFeatures {
   static bool globalCtorLexOrder() { return false; }
   static bool globalCtorAssociatedData() { return false; }
 
+  // LowerModule handling
+  static bool lowerModuleCodeGenOpts() { return false; }
+  static bool lowerModuleLangOpts() { return false; }
+
   // Misc
   static bool aarch64SIMDIntrinsics() { return false; }
   static bool aarch64SMEIntrinsics() { return false; }
@@ -292,6 +296,7 @@ struct MissingFeatures {
   static bool lowerModeOptLevel() { return false; }
   static bool loweringPrepareX86CXXABI() { return false; }
   static bool loweringPrepareAArch64XXABI() { return false; }
+  static bool makeTripleAlwaysPresent() { return false; }
   static bool maybeHandleStaticInExternC() { return false; }
   static bool mergeAllConstants() { return false; }
   static bool metaDataNode() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index 85b38120169fd..bf13eeeaea60a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -189,6 +189,11 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
     return getType<cir::RecordType>(nameAttr, kind);
   }
 
+  cir::DataMemberAttr getDataMemberAttr(cir::DataMemberType ty,
+                                        unsigned memberIndex) {
+    return cir::DataMemberAttr::get(ty, memberIndex);
+  }
+
   // Return true if the value is a null constant such as null pointer, (+0.0)
   // for floating-point or zero initializer
   bool isNullValue(mlir::Attribute attr) const {
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 6820e2a403288..25ce1ba26da09 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -731,11 +731,8 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
   }
 
   mlir::Value VisitUnaryAddrOf(const UnaryOperator *e) {
-    if (llvm::isa<MemberPointerType>(e->getType())) {
-      cgf.cgm.errorNYI(e->getSourceRange(), "Address of member pointer");
-      return builder.getNullPtr(cgf.convertType(e->getType()),
-                                cgf.getLoc(e->getExprLoc()));
-    }
+    if (llvm::isa<MemberPointerType>(e->getType()))
+      return cgf.cgm.emitMemberPointerConstant(e);
 
     return cgf.emitLValue(e->getSubExpr()).getPointer();
   }
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index e1894c040dd53..41a5d9db83e2b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -1464,6 +1464,26 @@ void CIRGenModule::emitExplicitCastExprType(const ExplicitCastExpr *e,
          "emitExplicitCastExprType");
 }
 
+mlir::Value CIRGenModule::emitMemberPointerConstant(const UnaryOperator *e) {
+  assert(!cir::MissingFeatures::cxxABI());
+
+  mlir::Location loc = getLoc(e->getSourceRange());
+
+  const auto *decl = cast<DeclRefExpr>(e->getSubExpr())->getDecl();
+
+  // A member function pointer.
+  if (isa<CXXMethodDecl>(decl)) {
+    errorNYI(e->getSourceRange(), "emitMemberPointerConstant: method pointer");
+    return {};
+  }
+
+  // Otherwise, a member data pointer.
+  auto ty = mlir::cast<cir::DataMemberType>(convertType(e->getType()));
+  const auto *fieldDecl = cast<FieldDecl>(decl);
+  return cir::ConstantOp::create(
+      builder, loc, builder.getDataMemberAttr(ty, fieldDecl->getFieldIndex()));
+}
+
 void CIRGenModule::emitDeclContext(const DeclContext *dc) {
   for (Decl *decl : dc->decls()) {
     // Unlike other DeclContexts, the contents of an ObjCImplDecl at TU scope
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 59eb5f8938129..9c0961579718d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -497,6 +497,8 @@ class CIRGenModule : public CIRGenTypeCache {
   /// the given type. This is usually, but not always, an LLVM null constant.
   mlir::TypedAttr emitNullConstantForBase(const CXXRecordDecl *record);
 
+  mlir::Value emitMemberPointerConstant(const UnaryOperator *e);
+
   llvm::StringRef getMangledName(clang::GlobalDecl gd);
 
   void emitTentativeDefinition(const VarDecl *d);
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
index 24b106b4bcee7..7f000ece8a494 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
@@ -482,6 +482,21 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
     break;
   }
 
+  case Type::MemberPointer: {
+    const auto *mpt = cast<MemberPointerType>(ty);
+
+    mlir::Type memberTy = convertType(mpt->getPointeeType());
+    auto clsTy = mlir::cast<cir::RecordType>(
+        convertType(QualType(mpt->getQualifier().getAsType(), 0)));
+    if (mpt->isMemberDataPointer()) {
+      resultType = cir::DataMemberType::get(memberTy, clsTy);
+    } else {
+      assert(!cir::MissingFeatures::methodType());
+      cgm.errorNYI(SourceLocation(), "MethodType");
+    }
+    break;
+  }
+
   case Type::FunctionNoProto:
   case Type::FunctionProto:
     resultType = convertFunctionTypeInternal(type);
diff --git a/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp b/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp
index ee296f171e0d9..59d7765198f9e 100644
--- a/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp
@@ -269,6 +269,38 @@ ConstComplexAttr::verify(function_ref<InFlightDiagnostic()> emitError,
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// DataMemberAttr definitions
+//===----------------------------------------------------------------------===//
+
+LogicalResult
+DataMemberAttr::verify(function_ref<InFlightDiagnostic()> emitError,
+                       cir::DataMemberType ty,
+                       std::optional<unsigned> memberIndex) {
+  // DataMemberAttr without a given index represents a null value.
+  if (!memberIndex.has_value())
+    return success();
+
+  cir::RecordType recTy = ty.getClassTy();
+  if (recTy.isIncomplete())
+    return emitError()
+           << "incomplete 'cir.record' cannot be used to build a non-null "
+              "data member pointer";
+
+  unsigned memberIndexValue = memberIndex.value();
+  if (memberIndexValue >= recTy.getNumElements())
+    return emitError()
+           << "member index of a #cir.data_member attribute is out of range";
+
+  mlir::Type memberTy = recTy.getMembers()[memberIndexValue];
+  if (memberTy != ty.getMemberTy())
+    return emitError()
+           << "member type of a #cir.data_member attribute must match the "
+              "attribute type";
+
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // CIR ConstArrayAttr
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index ec8cae62d6bc8..38a2cecbb8617 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -357,6 +357,12 @@ static LogicalResult checkConstantTypes(mlir::Operation *op, mlir::Type opType,
     return success();
   }
 
+  if (isa<cir::DataMemberAttr>(attrType)) {
+    // More detailed type verifications are already done in
+    // DataMemberAttr::verify. Don't need to repeat here.
+    return success();
+  }
+
   if (isa<cir::ZeroAttr>(attrType)) {
     if (isa<cir::RecordType, cir::ArrayType, cir::VectorType, cir::ComplexType>(
             opType))
diff --git a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
index bb87056048ec5..9a37a4f4e3996 100644
--- a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
@@ -750,6 +750,26 @@ BoolType::getABIAlignment(const ::mlir::DataLayout &dataLayout,
   return 1;
 }
 
+//===----------------------------------------------------------------------===//
+//  DataMemberType Definitions
+//===----------------------------------------------------------------------===//
+
+llvm::TypeSize
+DataMemberType::getTypeSizeInBits(const ::mlir::DataLayout &dataLayout,
+                                  ::mlir::DataLayoutEntryListRef params) const {
+  // FIXME: consider size differences under different ABIs
+  assert(!MissingFeatures::cxxABI());
+  return llvm::TypeSize::getFixed(64);
+}
+
+uint64_t
+DataMemberType::getABIAlignment(const ::mlir::DataLayout &dataLayout,
+                                ::mlir::DataLayoutEntryListRef params) const {
+  // FIXME: consider alignment differences under different ABIs
+  assert(!MissingFeatures::cxxABI());
+  return 8;
+}
+
 //===----------------------------------------------------------------------===//
 //  VPtrType Definitions
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt b/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt
index 3fc5b06b74e4d..e3b7106c1d6b9 100644
--- a/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt
+++ b/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(TargetLowering)
+
 add_clang_library(MLIRCIRTransforms
   CIRCanonicalize.cpp
   CIRSimplify.cpp
@@ -21,4 +23,5 @@ add_clang_library(MLIRCIRTransforms
 
   MLIRCIR
   MLIRCIRInterfaces
+  MLIRCIRTargetLowering
 )
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.cpp
new file mode 100644
index 0000000000000..86cf7ebdc8f50
--- /dev/null
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.cpp
@@ -0,0 +1,20 @@
+//===- CIRCXXABI.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file partially mimics clang/lib/CodeGen/CGCXXABI.cpp. The queries are
+// adapted to operate on the CIR dialect, however.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CIRCXXABI.h"
+
+namespace cir {
+
+CIRCXXABI::~CIRCXXABI() {}
+
+} // namespace cir
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h
new file mode 100644
index 0000000000000..003cd78eb3f26
--- /dev/null
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h
@@ -0,0 +1,55 @@
+//===----- CIRCXXABI.h - Interface to C++ ABIs for CIR Dialect --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file partially mimics the CodeGen/CGCXXABI.h class. The main difference
+// is that this is adapted to operate on the CIR dialect.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_CIRCXXABI_H
+#define CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_CIRCXXABI_H
+
+#include "mlir/Transforms/DialectConversion.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
+
+namespace cir {
+
+// Forward declarations.
+class LowerModule;
+
+class CIRCXXABI {
+  friend class LowerModule;
+
+protected:
+  LowerModule &lm;
+
+  CIRCXXABI(LowerModule &lm) : lm(lm) {}
+
+public:
+  virtual ~CIRCXXABI();
+
+  /// Lower the given data member pointer type to its ABI type. The returned
+  /// type is also a CIR type.
+  virtual mlir::Type
+  lowerDataMemberType(cir::DataMemberType type,
+                      const mlir::TypeConverter &typeConverter) const = 0;
+
+  /// Lower the given data member pointer constant to a constant of the ABI
+  /// type. The returned constant is represented as an attribute as well.
+  virtual mlir::TypedAttr
+  lowerDataMemberConstant(cir::DataMemberAttr attr,
+                          const mlir::DataLayout &layout,
+                          const mlir::TypeConverter &typeConverter) const = 0;
+};
+
+/// Creates an Itanium-family ABI.
+std::unique_ptr<CIRCXXABI> createItaniumCXXABI(LowerModule &lm);
+
+} // namespace cir
+
+#endif // CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_CIRCXXABI_H
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt
new file mode 100644
index 0000000000000..158c42e729536
--- /dev/null
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt
@@ -0,0 +1,20 @@
+add_clang_library(MLIRCIRTargetLowering
+  CIRCXXABI.cpp
+  LowerModule.cpp
+  LowerItaniumCXXABI.cpp
+
+  DEPENDS
+  clangBasic
+
+  LINK_COMPONENTS
+  TargetParser
+
+  LINK_LIBS PUBLIC
+
+  clangBasic
+  MLIRIR
+  MLIRPass
+  MLIRDLTIDialect
+  MLIRCIR
+  MLIRCIRInterfaces
+)
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerItaniumCXXABI.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerItaniumCXXABI.cpp
new file mode 100644
index 0000000000000..7089990343dc0
--- /dev/null
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerItaniumCXXABI.cpp
@@ -0,0 +1,90 @@
+//===---- LowerItaniumCXXABI.cpp - Emit CIR code Itanium-specific code  ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides CIR lowering logic targeting the Itanium C++ ABI. The class in
+// this file generates records that follow the Itanium C++ ABI, which is
+// documented at:
+//  https://itanium-cxx-abi.github.io/cxx-abi/abi.html
+//  https://itanium-cxx-abi.github.io/cxx-abi/abi-eh.html
+//
+// It also supports the closely-related ARM ABI, documented at:
+// https://developer.arm.com/documentation/ihi0041/g/
+//
+// This file partially mimics clang/lib/CodeGen/ItaniumCXXABI.cpp. The queries
+// are adapted to operate on the CIR dialect, however.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CIRCXXABI.h"
+#include "LowerModule.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace cir {
+
+namespace {
+
+class LowerItaniumCXXABI : public CIRCXXABI {
+public:
+  LowerItaniumCXXABI(LowerModule &lm) : CIRCXXABI(lm) {}
+
+  /// Lower the given data member pointer type to its ABI type. The returned
+  /// type is also a CIR type.
+  virtual mlir::Type
+  lowerDataMemberType(cir::DataMemberType type,
+                      const mlir::TypeConverter &typeConverter) const override;
+
+  mlir::TypedAttr lowerDataMemberConstant(
+      cir::DataMemberAttr attr, const mlir::DataLayout &layout,
+      const mlir::TypeConverter &typeConverter) const override;
+};
+
+} // namespace
+
+std::unique_ptr<CIRCXXABI> createItaniumCXXABI(LowerModule &lm) {
+  return std::make_unique<LowerItaniumCXXABI>(lm);
+}
+
+static cir::IntType getPtrDiffCIRTy(LowerModule &lm) {
+  const clang::TargetInfo &target = lm.getTarget();
+  clang::TargetInfo::IntType ptrdiffTy =
+      target.getPtrDiffType(clang::LangAS::Default);
+  return cir::IntType::get(lm.getMLIRContext(), target.getTypeWidth(ptrdiffTy),
+                           target.isTypeSigned(ptrdiffTy));
+}
+
+mlir::Type LowerItaniumCXXABI::lowerDataMemberType(
+    cir::DataMemberType type, const mlir::TypeConverter &typeConverter) const {
+  // Itanium C++ ABI 2.3.1:
+  //   A data member pointer is represented as the data member's offset in bytes
+  //   from the address point of an object of the base type, as a ptrdiff_t.
+  return getPtrDiffCIRTy(lm);
+}
+
+mlir::TypedAttr LowerItaniumCXXABI::lowerDataMemberConstant(
+    cir::DataMemberAttr attr, const mlir::DataLayout &layout,
+    const mlir::TypeConverter &typeConverter) const {
+  uint64_t memberOffset;
+  if (attr.isNullPtr()) {
+    // Itanium C++ ABI 2.3:
+    //   A NULL pointer is represented as -1.
+    memberOffset = -1ull;
+  } else {
+    // Itanium C++ ABI 2.3:
+    //   A pointer to data member is an offset from the base address of
+    //   the class object containing it, represented as a ptrdiff_t
+    unsigned memberIndex = attr.getMemberIndex().value();
+    memberOffset =
+        attr.getType().getClassTy().getElementOffset(layout, memberIndex);
+  }
+
+  mlir::Type abiTy = lowerDataMemberType(attr.getType(), typeConverter);
+  return cir::IntAttr::get(abiTy, memberOffset);
+}
+
+} // namespace cir
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.cpp
new file mode 100644
index 0000000000000..7576e20ac8f54
--- /dev/null
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.cpp
@@ -0,0 +1,87 @@
+//===--- LowerModule.cpp - Lower CIR Module to a Target -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file partially mimics clang/lib/CodeGen/CodeGenModule.cpp. The queries
+// are adapted to operate on the CIR dialect, however.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LowerModule.h"
+#include "CIRCXXABI.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/PatternMatch.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/TargetInfo.h"
+#include "clang/Basic/TargetOptions.h"
+#include "clang/CIR/MissingFeatures.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace cir {
+
+static std::unique_ptr<CIRCXXABI> createCXXABI(LowerModule &lm) {
+  switch (lm.getCXXABIKind()) {
+  case clang::TargetCXXABI::AppleARM64:
+  case clang::TargetCXXABI::Fuchsia:
+  case clang::TargetCXXABI::GenericAArch64:
+  case clang::TargetCXXABI::GenericARM:
+  case clang::TargetCXXABI::iOS:
+  case clang::TargetCXXABI::WatchOS:
+  case clang::TargetCXXABI::GenericMIPS:
+  case clang::TargetCXXABI::GenericItanium:
+  case clang::TargetCXXABI::WebAssembly:
+  case clang::TargetCXXABI::XL:
+    return createItaniumCXXABI(lm);
+  case clang::TargetCXXABI::Microsoft:
+    llvm_unreachable("Windows ABI NYI");
+  }
+
+  llvm_unreachable("invalid C++ ABI kind");
+}
+
+LowerModule::LowerModule(clang::LangOptions langOpts,
+                         clang::CodeGenOptions codeGenOpts,
+                         mlir::ModuleOp &module,
+                         std::unique_ptr<clang::TargetInfo> target,
+                         mlir::PatternRewriter &rewriter)
+    : module(module), target(std::move(target)), abi(createCXXABI(*this)),
+      rewriter(rewriter) {}
+
+// TODO: not to create it every time
+std::unique_ptr<LowerModule>
+createLowerModule(mlir::ModuleOp module, mlir::PatternRewriter &rewriter) {
+  // Fetch target information.
+  llvm::Triple triple(mlir::cast<mlir::StringAttr>(
+                          module->getAttr(cir::CIRDialect::getTripleAttrName()))
+                          .getValue());
+  clang::TargetOptions targetOptions;
+  targetOptions.Triple = triple.str();
+  auto targetInfo = clang::targets::AllocateTarget(triple, targetOptions);
+
+  // FIXME(cir): This just uses the default language options. We need to account
+  // for custom options.
+  // Create context.
+  assert(!cir::MissingFeatures::lowerModuleLangOpts());
+  clang::LangOptions langOpts;
+
+  // FIXME(cir): This just uses the default code generation options. We need to
+  // account for custom options.
+  assert(!cir::MissingFeatures::lowerModuleCodeGenOpts());
+  clang::CodeGenOptions codeGenOpts;
+
+  if (auto optInfo = mlir::cast_if_present<cir::OptInfoAttr>(
+          module->getAttr(cir::CIRDialect::getOptInfoAttrName()))) {
+    codeGenOpts.OptimizationLevel = optInfo.getLevel();
+    codeGenOpts.OptimizeSize = optInfo.getSize();
+  }
+
+  return std::make_unique<LowerModule>(std::move(langOpts),
+                                       std::move(codeGenOpts), module,
+                                       std::move(targetInfo), rewriter);
+}
+
+} // namespace cir
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.h b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.h
new file mode 100644
index 0000000000000..440e307f571e9
--- /dev/null
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.h
@@ -0,0 +1,55 @@
+//===--- LowerModule.h - Abstracts CIR's module lowering --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file partially mimics clang/lib/CodeGen/CodeGenModule.h. The queries are
+// adapted to operate on the CIR dialect, however.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_LOWERMODULE_H
+#define CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_LOWERMODULE_H
+
+#include "CIRCXXABI.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "clang/Basic/CodeGenOptions.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/TargetInfo.h"
+#include "clang/CIR/Dialect/IR/CIRDialect.h"
+#include "clang/CIR/MissingFeatures.h"
+#include <memory>
+
+namespace cir {
+
+class LowerModule {
+  mlir::ModuleOp module;
+  const std::unique_ptr<clang::TargetInfo> target;
+  std::unique_ptr<CIRCXXABI> abi;
+  [[maybe_unused]] mlir::PatternRewriter &rewriter;
+
+public:
+  LowerModule(clang::LangOptions langOpts, clang::CodeGenOptions codeGenOpts,
+              mlir::ModuleOp &module, std::unique_ptr<clang::TargetInfo> target,
+              mlir::PatternRewriter &rewriter);
+  ~LowerModule() = default;
+
+  clang::TargetCXXABI::Kind getCXXABIKind() const {
+    assert(!cir::MissingFeatures::lowerModuleLangOpts());
+    return target->getCXXABI().getKind();
+  }
+
+  CIRCXXABI &getCXXABI() const { return *abi; }
+  const clang::TargetInfo &getTarget() const { return *target; }
+  mlir::MLIRContext *getMLIRContext() { return module.getContext(); }
+};
+
+std::unique_ptr<LowerModule> createLowerModule(mlir::ModuleOp module,
+                                               mlir::PatternRewriter &rewriter);
+
+} // namespace cir
+
+#endif // CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_LOWERMODULE_H
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt b/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt
index 7baff3412a84e..2525e02ae8f85 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt
@@ -18,7 +18,12 @@ add_clang_library(clangCIRLoweringDirectToLLVM
   clangCIRLoweringCommon
   ${dialect_libs}
   MLIRCIR
+  MLIRCIRTargetLowering
   MLIRBuiltinToLLVMIRTranslation
   MLIRLLVMToLLVMIRTranslation
   MLIRIR
   )
+
+target_include_directories(clangCIRLoweringDirectToLLVM PRIVATE
+  ${CLANG_SOURCE_DIR}/lib/CIR/Dialect/Transforms/TargetLowering
+  )
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index cc911cfc7d778..88ca8033b48ea 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -1755,6 +1755,14 @@ mlir::LogicalResult CIRToLLVMConstantOpLowering::matchAndRewrite(
       return mlir::success();
     }
     attr = op.getValue();
+  } else if (mlir::isa<cir::DataMemberType>(op.getType())) {
+    assert(lowerMod && "lower module is not available");
+    auto dataMember = mlir::cast<cir::DataMemberAttr>(op.getValue());
+    mlir::DataLayout layout(op->getParentOfType<mlir::ModuleOp>());
+    mlir::TypedAttr abiValue = lowerMod->getCXXABI().lowerDataMemberConstant(
+        dataMember, layout, *typeConverter);
+    rewriter.replaceOpWithNewOp<ConstantOp>(op, abiValue);
+    return mlir::success();
   } else if (const auto arrTy = mlir::dyn_cast<cir::ArrayType>(op.getType())) {
     const auto constArr = mlir::dyn_cast<cir::ConstArrayAttr>(op.getValue());
     if (!constArr && !isa<cir::ZeroAttr, cir::UndefAttr>(op.getValue()))
@@ -2839,8 +2847,20 @@ mlir::LogicalResult CIRToLLVMSelectOpLowering::matchAndRewrite(
   return mlir::success();
 }
 
+std::unique_ptr<cir::LowerModule> prepareLowerModule(mlir::ModuleOp module) {
+  mlir::PatternRewriter rewriter{module->getContext()};
+  // If the triple is not present, e.g. CIR modules parsed from text, we
+  // cannot init LowerModule properly. This happens in some lowering tests,
+  // but it should not happen in real compilation.
+  assert(!cir::MissingFeatures::makeTripleAlwaysPresent());
+  if (!module->hasAttr(cir::CIRDialect::getTripleAttrName()))
+    return {};
+  return cir::createLowerModule(module, rewriter);
+}
+
 static void prepareTypeConverter(mlir::LLVMTypeConverter &converter,
-                                 mlir::DataLayout &dataLayout) {
+                                 mlir::DataLayout &dataLayout,
+                                 cir::LowerModule *lowerModule) {
   converter.addConversion([&](cir::PointerType type) -> mlir::Type {
     unsigned addrSpace =
         type.getAddrSpace() ? type.getAddrSpace().getValue().getUInt() : 0;
@@ -2850,6 +2870,13 @@ static void prepareTypeConverter(mlir::LLVMTypeConverter &converter,
     assert(!cir::MissingFeatures::addressSpace());
     return mlir::LLVM::LLVMPointerType::get(type.getContext());
   });
+  converter.addConversion(
+      [&, lowerModule](cir::DataMemberType type) -> mlir::Type {
+        assert(lowerModule && "CXXABI is not available");
+        mlir::Type abiType =
+            lowerModule->getCXXABI().lowerDataMemberType(type, converter);
+        return converter.convertType(abiType);
+      });
   converter.addConversion([&](cir::ArrayType type) -> mlir::Type {
     mlir::Type ty =
         convertTypeForMemory(converter, dataLayout, type.getElementType());
@@ -3118,7 +3145,8 @@ void ConvertCIRToLLVMPass::runOnOperation() {
   mlir::ModuleOp module = getOperation();
   mlir::DataLayout dl(module);
   mlir::LLVMTypeConverter converter(&getContext());
-  prepareTypeConverter(converter, dl);
+  std::unique_ptr<cir::LowerModule> lowerModule = prepareLowerModule(module);
+  prepareTypeConverter(converter, dl, lowerModule.get());
 
   mlir::RewritePatternSet patterns(&getContext());
 
@@ -3126,7 +3154,7 @@ void ConvertCIRToLLVMPass::runOnOperation() {
 #define GET_LLVM_LOWERING_PATTERNS_LIST
 #include "clang/CIR/Dialect/IR/CIRLowering.inc"
 #undef GET_LLVM_LOWERING_PATTERNS_LIST
-      >(converter, patterns.getContext(), dl);
+      >(converter, patterns.getContext(), lowerModule.get(), dl);
 
   processCIRAttrs(module);
 
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
index 0591de545b81d..d32f8603ee0be 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
@@ -12,6 +12,8 @@
 #ifndef CLANG_CIR_LOWERTOLLVM_H
 #define CLANG_CIR_LOWERTOLLVM_H
 
+#include "LowerModule.h"
+
 #include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Transforms/DialectConversion.h"
diff --git a/clang/test/CIR/CodeGen/pointer-to-data-member.cpp b/clang/test/CIR/CodeGen/pointer-to-data-member.cpp
new file mode 100644
index 0000000000000..b116d21f01170
--- /dev/null
+++ b/clang/test/CIR/CodeGen/pointer-to-data-member.cpp
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -fclangir -Wno-unused-value -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -fclangir -Wno-unused-value -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -Wno-unused-value -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
+
+struct Point {
+  int x;
+  int y;
+  int z;
+};
+
+auto test1() -> int Point::* {
+  return &Point::y;
+}
+
+// CIR: cir.func {{.*}} @_Z5test1v() -> !cir.data_member<!s32i in !rec_Point> {
+// CIR:   %[[RETVAL:.*]] = cir.alloca !cir.data_member<!s32i in !rec_Point>, !cir.ptr<!cir.data_member<!s32i in !rec_Point>>, ["__retval"]
+// CIR:   %[[MEMBER:.*]] = cir.const #cir.data_member<1> : !cir.data_member<!s32i in !rec_Point>
+// CIR:   cir.store %[[MEMBER]], %[[RETVAL]] : !cir.data_member<!s32i in !rec_Point>, !cir.ptr<!cir.data_member<!s32i in !rec_Point>>
+// CIR:   %[[RET:.*]] = cir.load %[[RETVAL]] : !cir.ptr<!cir.data_member<!s32i in !rec_Point>>, !cir.data_member<!s32i in !rec_Point>
+// CIR:   cir.return %[[RET]] : !cir.data_member<!s32i in !rec_Point>
+
+// LLVM: define {{.*}} i64 @_Z5test1v()
+// LLVM:   %[[RETVAL:.*]] = alloca i64
+// LLVM:   store i64 4, ptr %[[RETVAL]]
+// LLVM:   %[[RET:.*]] = load i64, ptr %[[RETVAL]]
+// LLVM:   ret i64 %[[RET]]
+
+// OGCG: define {{.*}} i64 @_Z5test1v()
+// OGCG:   ret i64 4
diff --git a/clang/test/CIR/IR/invalid-data-member.cir b/clang/test/CIR/IR/invalid-data-member.cir
new file mode 100644
index 0000000000000..2941777404973
--- /dev/null
+++ b/clang/test/CIR/IR/invalid-data-member.cir
@@ -0,0 +1,27 @@
+// RUN: cir-opt %s -verify-diagnostics -split-input-file
+
+// -----
+
+!u16i = !cir.int<u, 16>
+!u32i = !cir.int<u, 32>
+!struct1 = !cir.record<struct "Struct1" {!u16i, !u32i}>
+
+// expected-error@+1 {{member type of a #cir.data_member attribute must match the attribute type}}
+#invalid_member_ty = #cir.data_member<0> : !cir.data_member<!u32i in !struct1>
+
+// -----
+
+!u16i = !cir.int<u, 16>
+!incomplete_struct = !cir.record<struct "Incomplete" incomplete>
+
+// expected-error@+1 {{incomplete 'cir.record' cannot be used to build a non-null data member pointer}}
+#incomplete_cls_member = #cir.data_member<0> : !cir.data_member<!u16i in !incomplete_struct>
+
+// -----
+
+!u16i = !cir.int<u, 16>
+!u32i = !cir.int<u, 32>
+!struct1 = !cir.record<struct "Struct1" {!u16i, !u32i}>
+
+// expected-error@+1 {{member index of a #cir.data_member attribute is out of range}}
+#invalid_member_ty = #cir.data_member<2> : !cir.data_member<!u32i in !struct1>
diff --git a/clang/utils/TableGen/CIRLoweringEmitter.cpp b/clang/utils/TableGen/CIRLoweringEmitter.cpp
index 80dc209c69a7b..c81b8941f9a39 100644
--- a/clang/utils/TableGen/CIRLoweringEmitter.cpp
+++ b/clang/utils/TableGen/CIRLoweringEmitter.cpp
@@ -60,6 +60,7 @@ void GenerateLLVMLoweringPattern(llvm::StringRef OpName,
 
   Code << "class " << PatternName
        << " : public mlir::OpConversionPattern<cir::" << OpName << "> {\n";
+  Code << "  [[maybe_unused]] cir::LowerModule *lowerMod;\n";
   Code << "  [[maybe_unused]] mlir::DataLayout const &dataLayout;\n";
   Code << "\n";
 
@@ -69,10 +70,12 @@ void GenerateLLVMLoweringPattern(llvm::StringRef OpName,
 
   Code << "  " << PatternName
        << "(mlir::TypeConverter const "
-          "&typeConverter, mlir::MLIRContext *context, mlir::DataLayout const "
+          "&typeConverter, mlir::MLIRContext *context, "
+          "cir::LowerModule *lowerMod, mlir::DataLayout const "
           "&dataLayout)\n";
   Code << "    : OpConversionPattern<cir::" << OpName
-       << ">(typeConverter, context), dataLayout(dataLayout)";
+       << ">(typeConverter, context), lowerMod(lowerMod), "
+          "dataLayout(dataLayout)";
   if (IsRecursive) {
     Code << " {\n";
     Code << "    setHasBoundedRewriteRecursion();\n";

From 8f3c8dabc64517132c9c438ff467e75f39fbf8a6 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Tue, 9 Dec 2025 13:09:19 -0800
Subject: [PATCH 57/85] [MLIR][Bytecode] Use consistent types for resolveEntry
 (#171502)

uint64_t and size_t are not the same across all platforms. This was
causing build failures when building this file for wasm:

llvm-project/mlir/lib/Bytecode/Reader/BytecodeReader.cpp:1323:19: error:
out-of-line definition of 'resolveEntry' does not match any declaration
in '(anonymous namespace)::AttrTypeReader'
1323 | T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>>
&entries, size_t index,
      |                   ^~~~~~~~~~~~

third_party/llvm/llvm-project/mlir/lib/Bytecode/Reader/BytecodeReader.cpp:851:7:
note: AttrTypeReader defined here
  851 | class AttrTypeReader {
      |       ^~~~~~~~~~~~~~
1 error generated.

Use uint64_t everywhere to ensure portability.
---
 mlir/lib/Bytecode/Reader/BytecodeReader.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
index dd367b5922558..0ac5fc5358ea5 100644
--- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
+++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
@@ -1320,8 +1320,9 @@ LogicalResult AttrTypeReader::initialize(
 }
 
 template <typename T>
-T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries, size_t index,
-                               StringRef entryType, uint64_t depth) {
+T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries,
+                               uint64_t index, StringRef entryType,
+                               uint64_t depth) {
   if (index >= entries.size()) {
     emitError(fileLoc) << "invalid " << entryType << " index: " << index;
     return {};

From 926cbddc185e035a4266f25203e81eec8960f114 Mon Sep 17 00:00:00 2001
From: Andrew Haberlandt <ndrewh@users.noreply.github.com>
Date: Tue, 9 Dec 2025 13:22:54 -0800
Subject: [PATCH 58/85] [sanitizer_common] child_stdin_fd_ should only be set
 on posix_spawn path (#171508)

#170809 added the child_stdin_fd_ field on SymbolizerProcess to allow
the parent process to hold on to the read in of the child's stdin pipe.
This was to avoid SIGPIPE.

However, the `StartSubprocess` path still closes the stdin fd in the
parent here:

https://github.com/llvm/llvm-project/blob/7f5ed91684c808444ede24eb01ad9af73b5806e5/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp#L525-L535

This could cause a double-close of this fd (problematic in the case of
fd reuse).

This moves the `child_stdin_fd_` field to only be initialized on the
posix_spawn path. This should ensure #170809 only truly affects Darwin.
---
 .../sanitizer_symbolizer_posix_libcdep.cpp                 | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
index 29c73e3e1cac1..ab6aee7c9fba7 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
@@ -176,6 +176,10 @@ bool SymbolizerProcess::StartSymbolizerSubprocess() {
       internal_close(outfd[1]);
       return false;
     }
+
+    // We intentionally hold on to the read-end so that we don't get a SIGPIPE
+    child_stdin_fd_ = outfd[0];
+
 #  else   // SANITIZER_APPLE
     UNIMPLEMENTED();
 #  endif  // SANITIZER_APPLE
@@ -192,9 +196,6 @@ bool SymbolizerProcess::StartSymbolizerSubprocess() {
   input_fd_ = infd[0];
   output_fd_ = outfd[1];
 
-  // We intentionally hold on to the read-end so that we don't get a SIGPIPE
-  child_stdin_fd_ = outfd[0];
-
   CHECK_GT(pid, 0);
 
   // Check that symbolizer subprocess started successfully.

From 64ee4bf73411d1462fec0c2c2b5c44aaaa3a1903 Mon Sep 17 00:00:00 2001
From: Christopher Ferris <cferris1000@users.noreply.github.com>
Date: Tue, 9 Dec 2025 13:30:46 -0800
Subject: [PATCH 59/85] [scudo] Refactor initialization of TSDs. (#169738)

Instead of getting a lock and then checking/modifying the Initialization
variable, make it an atomic. Doing this, we can remove one of the
mutexes in shared TSDs and avoid any potential lock contention in both
shared TSDs and exclusive TSDs if multiple threads do allocation
operations at the same time.

Add two new tests that make sure no crashes occur if multiple threads
try and do allocations at the same time.
---
 .../scudo/standalone/tests/combined_test.cpp  | 88 +++++++++++++++++++
 .../lib/scudo/standalone/tsd_exclusive.h      | 19 ++--
 compiler-rt/lib/scudo/standalone/tsd_shared.h | 50 ++++++-----
 3 files changed, 127 insertions(+), 30 deletions(-)

diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
index 1d4208b6a2aa0..b70b9c9269fed 100644
--- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
@@ -18,6 +18,7 @@
 #include "size_class_map.h"
 
 #include <algorithm>
+#include <atomic>
 #include <condition_variable>
 #include <memory>
 #include <mutex>
@@ -1396,6 +1397,7 @@ TEST(ScudoCombinedTest, FullUsableSizeMTE) {
   VerifyExactUsableSize<AllocatorT>(*Allocator);
   VerifyIterateOverUsableSize<AllocatorT>(*Allocator);
 }
+
 // Verify that no special quarantine blocks appear in iterateOverChunks.
 TEST(ScudoCombinedTest, QuarantineIterateOverChunks) {
   using AllocatorT = TestAllocator<TestQuarantineConfig>;
@@ -1426,3 +1428,89 @@ TEST(ScudoCombinedTest, QuarantineIterateOverChunks) {
                        << std::hex << Base << " Size " << std::dec << Size;
   }
 }
+
+struct InitSizeClassConfig {
+  static const scudo::uptr NumBits = 1;
+  static const scudo::uptr MinSizeLog = 10;
+  static const scudo::uptr MidSizeLog = 10;
+  static const scudo::uptr MaxSizeLog = 13;
+  static const scudo::u16 MaxNumCachedHint = 8;
+  static const scudo::uptr MaxBytesCachedLog = 12;
+  static const scudo::uptr SizeDelta = 0;
+};
+
+struct TestInitSizeConfig {
+  static const bool MaySupportMemoryTagging = false;
+  static const bool QuarantineDisabled = true;
+
+  struct Primary {
+    using SizeClassMap = scudo::FixedSizeClassMap<InitSizeClassConfig>;
+    static const scudo::uptr RegionSizeLog = 21U;
+    static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN;
+    static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX;
+    typedef scudo::uptr CompactPtrT;
+    static const scudo::uptr CompactPtrScale = 0;
+    static const bool EnableRandomOffset = true;
+    static const scudo::uptr MapSizeIncrement = 1UL << 18;
+    static const scudo::uptr GroupSizeLog = 18;
+  };
+  template <typename Config>
+  using PrimaryT = scudo::SizeClassAllocator64<Config>;
+
+  struct Secondary {
+    template <typename Config>
+    using CacheT = scudo::MapAllocatorNoCache<Config>;
+  };
+
+  template <typename Config> using SecondaryT = scudo::MapAllocator<Config>;
+};
+
+struct TestInitSizeTSDSharedConfig : public TestInitSizeConfig {
+  template <class A> using TSDRegistryT = scudo::TSDRegistrySharedT<A, 4U, 4U>;
+};
+
+struct TestInitSizeTSDExclusiveConfig : public TestInitSizeConfig {
+  template <class A> using TSDRegistryT = scudo::TSDRegistryExT<A>;
+};
+
+template <class AllocatorT> void RunStress() {
+  auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT());
+
+  // This test is designed to try and have many threads trying to initialize
+  // the TSD at the same time. Make sure this doesn't crash.
+  std::atomic_bool StartRunning = false;
+  std::vector<std::thread *> threads;
+  for (size_t I = 0; I < 16; I++) {
+    threads.emplace_back(new std::thread([&Allocator, &StartRunning]() {
+      while (!StartRunning.load())
+        ;
+
+      void *Ptr = Allocator->allocate(10, Origin);
+      EXPECT_TRUE(Ptr != nullptr);
+      // Make sure this value is not optimized away.
+      asm volatile("" : : "r,m"(Ptr) : "memory");
+      Allocator->deallocate(Ptr, Origin);
+    }));
+  }
+
+  StartRunning = true;
+
+  for (auto *thread : threads) {
+    thread->join();
+    delete thread;
+  }
+}
+
+TEST(ScudoCombinedTest, StressThreadInitTSDShared) {
+  using AllocatorT = scudo::Allocator<TestInitSizeTSDSharedConfig>;
+  // Run the stress test a few times.
+  for (size_t I = 0; I < 10; I++)
+    RunStress<AllocatorT>();
+}
+
+TEST(ScudoCombinedTest, StressThreadInitTSDExclusive) {
+  using AllocatorT = scudo::Allocator<TestInitSizeTSDExclusiveConfig>;
+  // Run the stress test a few times.
+  for (size_t I = 0; I < 10; I++)
+    RunStress<AllocatorT>();
+}
diff --git a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h
index a58ba6505089f..75921f2be3ffe 100644
--- a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h
+++ b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h
@@ -52,17 +52,20 @@ template <class Allocator> struct TSDRegistryExT {
     bool UnlockRequired;
   };
 
-  void init(Allocator *Instance) REQUIRES(Mutex) {
-    DCHECK(!Initialized);
+  void init(Allocator *Instance) EXCLUDES(Mutex) {
+    ScopedLock L(Mutex);
+    // If more than one thread is initializing at the exact same moment, the
+    // threads that lose don't need to do anything.
+    if (UNLIKELY(atomic_load_relaxed(&Initialized) != 0))
+      return;
     Instance->init();
     CHECK_EQ(pthread_key_create(&PThreadKey, teardownThread<Allocator>), 0);
     FallbackTSD.init(Instance);
-    Initialized = true;
+    atomic_store_relaxed(&Initialized, 1);
   }
 
-  void initOnceMaybe(Allocator *Instance) EXCLUDES(Mutex) {
-    ScopedLock L(Mutex);
-    if (LIKELY(Initialized))
+  void initOnceMaybe(Allocator *Instance) {
+    if (LIKELY(atomic_load_relaxed(&Initialized) != 0))
       return;
     init(Instance); // Sets Initialized.
   }
@@ -81,7 +84,7 @@ template <class Allocator> struct TSDRegistryExT {
     FallbackTSD = {};
     State = {};
     ScopedLock L(Mutex);
-    Initialized = false;
+    atomic_store_relaxed(&Initialized, 0);
   }
 
   void drainCaches(Allocator *Instance) {
@@ -158,7 +161,7 @@ template <class Allocator> struct TSDRegistryExT {
   }
 
   pthread_key_t PThreadKey = {};
-  bool Initialized GUARDED_BY(Mutex) = false;
+  atomic_u8 Initialized = {};
   atomic_u8 Disabled = {};
   TSD<Allocator> FallbackTSD;
   HybridMutex Mutex;
diff --git a/compiler-rt/lib/scudo/standalone/tsd_shared.h b/compiler-rt/lib/scudo/standalone/tsd_shared.h
index 404e984e1f5e9..425a028c955aa 100644
--- a/compiler-rt/lib/scudo/standalone/tsd_shared.h
+++ b/compiler-rt/lib/scudo/standalone/tsd_shared.h
@@ -47,20 +47,24 @@ struct TSDRegistrySharedT {
     TSD<Allocator> *CurrentTSD;
   };
 
-  void init(Allocator *Instance) REQUIRES(Mutex) {
-    DCHECK(!Initialized);
+  void init(Allocator *Instance) EXCLUDES(Mutex) {
+    ScopedLock L(Mutex);
+    // If more than one thread is initializing at the exact same moment, the
+    // threads that lose don't need to do anything.
+    if (UNLIKELY(atomic_load_relaxed(&Initialized) != 0))
+      return;
+
     Instance->init();
     for (u32 I = 0; I < TSDsArraySize; I++)
       TSDs[I].init(Instance);
     const u32 NumberOfCPUs = getNumberOfCPUs();
     setNumberOfTSDs((NumberOfCPUs == 0) ? DefaultTSDCount
                                         : Min(NumberOfCPUs, DefaultTSDCount));
-    Initialized = true;
+    atomic_store_relaxed(&Initialized, 1);
   }
 
-  void initOnceMaybe(Allocator *Instance) EXCLUDES(Mutex) {
-    ScopedLock L(Mutex);
-    if (LIKELY(Initialized))
+  void initOnceMaybe(Allocator *Instance) {
+    if (LIKELY(atomic_load_relaxed(&Initialized) != 0))
       return;
     init(Instance); // Sets Initialized.
   }
@@ -72,11 +76,11 @@ struct TSDRegistrySharedT {
     }
     setCurrentTSD(nullptr);
     ScopedLock L(Mutex);
-    Initialized = false;
+    atomic_store_relaxed(&Initialized, 0);
   }
 
   void drainCaches(Allocator *Instance) {
-    ScopedLock L(MutexTSDs);
+    ScopedLock L(Mutex);
     for (uptr I = 0; I < NumberOfTSDs; ++I) {
       TSDs[I].lock();
       Instance->drainCache(&TSDs[I]);
@@ -93,7 +97,6 @@ struct TSDRegistrySharedT {
 
   void disable() NO_THREAD_SAFETY_ANALYSIS {
     Mutex.lock();
-    MutexTSDs.lock();
     for (u32 I = 0; I < TSDsArraySize; I++)
       TSDs[I].lock();
   }
@@ -101,13 +104,14 @@ struct TSDRegistrySharedT {
   void enable() NO_THREAD_SAFETY_ANALYSIS {
     for (s32 I = static_cast<s32>(TSDsArraySize - 1); I >= 0; I--)
       TSDs[I].unlock();
-    MutexTSDs.unlock();
     Mutex.unlock();
   }
 
   bool setOption(Option O, sptr Value) {
-    if (O == Option::MaxTSDsCount)
+    if (O == Option::MaxTSDsCount) {
+      ScopedLock L(Mutex);
       return setNumberOfTSDs(static_cast<u32>(Value));
+    }
     if (O == Option::ThreadDisableMemInit)
       setDisableMemInit(Value);
     // Not supported by the TSD Registry, but not an error either.
@@ -116,8 +120,8 @@ struct TSDRegistrySharedT {
 
   bool getDisableMemInit() const { return *getTlsPtr() & 1; }
 
-  void getStats(ScopedString *Str) EXCLUDES(MutexTSDs) {
-    ScopedLock L(MutexTSDs);
+  void getStats(ScopedString *Str) EXCLUDES(Mutex) {
+    ScopedLock L(Mutex);
 
     Str->append("Stats: SharedTSDs: %u available; total %u\n", NumberOfTSDs,
                 TSDsArraySize);
@@ -171,8 +175,7 @@ struct TSDRegistrySharedT {
     return reinterpret_cast<TSD<Allocator> *>(*getTlsPtr() & ~1ULL);
   }
 
-  bool setNumberOfTSDs(u32 N) EXCLUDES(MutexTSDs) {
-    ScopedLock L(MutexTSDs);
+  bool setNumberOfTSDs(u32 N) REQUIRES(Mutex) {
     if (N < NumberOfTSDs)
       return false;
     if (N > TSDsArraySize)
@@ -213,14 +216,14 @@ struct TSDRegistrySharedT {
   // TSDs is an array of locks which is not supported for marking thread-safety
   // capability.
   NOINLINE TSD<Allocator> *getTSDAndLockSlow(TSD<Allocator> *CurrentTSD)
-      EXCLUDES(MutexTSDs) {
+      EXCLUDES(Mutex) {
     // Use the Precedence of the current TSD as our random seed. Since we are
     // in the slow path, it means that tryLock failed, and as a result it's
     // very likely that said Precedence is non-zero.
     const u32 R = static_cast<u32>(CurrentTSD->getPrecedence());
     u32 N, Inc;
     {
-      ScopedLock L(MutexTSDs);
+      ScopedLock L(Mutex);
       N = NumberOfTSDs;
       DCHECK_NE(NumberOfCoPrimes, 0U);
       Inc = CoPrimes[R % NumberOfCoPrimes];
@@ -257,12 +260,15 @@ struct TSDRegistrySharedT {
   }
 
   atomic_u32 CurrentIndex = {};
-  u32 NumberOfTSDs GUARDED_BY(MutexTSDs) = 0;
-  u32 NumberOfCoPrimes GUARDED_BY(MutexTSDs) = 0;
-  u32 CoPrimes[TSDsArraySize] GUARDED_BY(MutexTSDs) = {};
-  bool Initialized GUARDED_BY(Mutex) = false;
+  u32 NumberOfTSDs GUARDED_BY(Mutex) = 0;
+  u32 NumberOfCoPrimes GUARDED_BY(Mutex) = 0;
+  u32 CoPrimes[TSDsArraySize] GUARDED_BY(Mutex) = {};
+  atomic_u8 Initialized = {};
+  // Used for global initialization and TSDs access.
+  // Acquiring the global initialization should only lock once in normal
+  // operation, which is why using it for TSDs access should not cause
+  // any interference.
   HybridMutex Mutex;
-  HybridMutex MutexTSDs;
   TSD<Allocator> TSDs[TSDsArraySize];
 };
 

From f29d06029f1ccae7359440a5d33d06406d10df31 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Tue, 9 Dec 2025 21:32:09 +0000
Subject: [PATCH 60/85] Revert "[LV] Mark checks as never succeeding for high
 cost cutoff."

This reverts commit 8a115b6934a90441d77ea54af73e7aaaa1394b38.

This broke premerge. https://lab.llvm.org/staging/#/builders/192/builds/13326

/home/gha/llvm-project/clang/test/Frontend/optimization-remark-options.c:10:11: remark: loop not vectorized: cannot prove it is safe to reorder floating-point operations; allow reordering by specifying '#pragma clang loop vectorize(enable)' before the loop or by providing the compiler option '-ffast-math'
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  6 +--
 ...ime-check-threshold-with-force-metadata.ll | 39 ++++++++++++-------
 2 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 79cdae25e38da..15d0fa41bd902 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1827,12 +1827,8 @@ class GeneratedRTChecks {
     // profile info.
     CostTooHigh =
         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
-    if (CostTooHigh) {
-      // Mark runtime checks as never succeeding when they exceed the threshold.
-      MemRuntimeCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
-      SCEVCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
+    if (CostTooHigh)
       return;
-    }
 
     BasicBlock *LoopHeader = L->getHeader();
     BasicBlock *Preheader = L->getLoopPreheader();
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
index 5376eb86882b7..b7d36fe7928e5 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
@@ -2,23 +2,29 @@
 ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=0 -S %s | FileCheck --check-prefix=LIMIT0 %s
 ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefix=LIMIT1 %s
 
-; Make sure we do not incorrectly vectorize with -vectorize-memory-check-threshold=0;
-; no runtime check is generated and the loop should not be vectorized.
+; FIXME: Currently this miscompiles with -vectorize-memory-check-threshold=0;
+; no runtime check is generated even though one is needed and !noalias
+; annotations are added.
 define i16 @runtime_checks_needed(ptr %src, ptr %dst) {
 ; LIMIT0-LABEL: define i16 @runtime_checks_needed(
 ; LIMIT0-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) {
-; LIMIT0-NEXT:  [[ENTRY:.*]]:
-; LIMIT0-NEXT:    br label %[[LOOP:.*]]
-; LIMIT0:       [[LOOP]]:
-; LIMIT0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ]
-; LIMIT0-NEXT:    [[L:%.*]] = load i16, ptr [[SRC]], align 1
+; LIMIT0-NEXT:  [[ENTRY:.*:]]
+; LIMIT0-NEXT:    br label %[[VECTOR_PH:.*]]
+; LIMIT0:       [[VECTOR_PH]]:
+; LIMIT0-NEXT:    [[TMP0:%.*]] = load i16, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]]
+; LIMIT0-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+; LIMIT0-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
+; LIMIT0-NEXT:    br label %[[VECTOR_BODY:.*]]
+; LIMIT0:       [[VECTOR_BODY]]:
+; LIMIT0-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; LIMIT0-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]]
-; LIMIT0-NEXT:    store i16 [[L]], ptr [[TMP1]], align 1
-; LIMIT0-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1
+; LIMIT0-NEXT:    store <2 x i16> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; LIMIT0-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; LIMIT0-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; LIMIT0-NEXT:    br i1 [[TMP2]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
+; LIMIT0-NEXT:    br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; LIMIT0:       [[MIDDLE_BLOCK]]:
+; LIMIT0-NEXT:    br label %[[EXIT:.*]]
 ; LIMIT0:       [[EXIT]]:
-; LIMIT0-NEXT:    [[TMP0:%.*]] = phi i16 [ [[L]], %[[LOOP]] ]
 ; LIMIT0-NEXT:    ret i16 [[TMP0]]
 ;
 ; LIMIT1-LABEL: define i16 @runtime_checks_needed(
@@ -82,9 +88,14 @@ exit:
 !3 = !{!"llvm.loop.vectorize.enable", i1 true}
 
 ;.
-; LIMIT0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; LIMIT0: [[META1]] = !{!"llvm.loop.vectorize.width", i32 2}
-; LIMIT0: [[META2]] = !{!"llvm.loop.vectorize.enable", i1 true}
+; LIMIT0: [[META0]] = !{[[META1:![0-9]+]]}
+; LIMIT0: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+; LIMIT0: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
+; LIMIT0: [[META3]] = !{[[META4:![0-9]+]]}
+; LIMIT0: [[META4]] = distinct !{[[META4]], [[META2]]}
+; LIMIT0: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
+; LIMIT0: [[META6]] = !{!"llvm.loop.isvectorized", i32 1}
+; LIMIT0: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"}
 ;.
 ; LIMIT1: [[META0]] = !{[[META1:![0-9]+]]}
 ; LIMIT1: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}

From d86bc19ad79896b956c828a8f9c2c0d94d131466 Mon Sep 17 00:00:00 2001
From: Erick Velez <erickvelez7@gmail.com>
Date: Tue, 9 Dec 2025 13:37:00 -0800
Subject: [PATCH 61/85] [clang-doc] Do not serialize empty text comments
 (#169087)

---
 clang-tools-extra/clang-doc/JSONGenerator.cpp | 43 +++++++-
 .../clang-doc/basic-project.mustache.test     | 97 -------------------
 .../test/clang-doc/json/class.cpp             |  3 -
 3 files changed, 40 insertions(+), 103 deletions(-)

diff --git a/clang-tools-extra/clang-doc/JSONGenerator.cpp b/clang-tools-extra/clang-doc/JSONGenerator.cpp
index 97c599a3f605c..77aa8794561e4 100644
--- a/clang-tools-extra/clang-doc/JSONGenerator.cpp
+++ b/clang-tools-extra/clang-doc/JSONGenerator.cpp
@@ -84,8 +84,23 @@ serializeLocation(const Location &Loc,
   return LocationObj;
 }
 
+/// Insert comments into a key in the Description object.
+///
+/// \param Comment Either an Object or Array, depending on the comment type
+/// \param Key     The type (Brief, Code, etc.) of comment to be inserted
 static void insertComment(Object &Description, json::Value &Comment,
                           StringRef Key) {
+  // The comment has a Children array for the actual text, with meta attributes
+  // alongside it in the Object.
+  if (auto *Obj = Comment.getAsObject()) {
+    if (auto *Children = Obj->getArray("Children"); Children->empty())
+      return;
+  }
+  // The comment is just an array of text comments.
+  else if (auto *Array = Comment.getAsArray(); Array->empty()) {
+    return;
+  }
+
   auto DescriptionIt = Description.find(Key);
 
   if (DescriptionIt == Description.end()) {
@@ -98,10 +113,28 @@ static void insertComment(Object &Description, json::Value &Comment,
   }
 }
 
+/// Takes the nested "Children" array from a comment Object.
+///
+/// \return a json::Array of comments, possible json::Value::Kind::Null
 static json::Value extractTextComments(Object *ParagraphComment) {
   if (!ParagraphComment)
-    return json::Object();
-  return *ParagraphComment->get("Children");
+    return json::Value(nullptr);
+  json::Value *Children = ParagraphComment->get("Children");
+  if (!Children)
+    return json::Value(nullptr);
+  auto ChildrenArray = *Children->getAsArray();
+  auto ChildrenIt = ChildrenArray.begin();
+  while (ChildrenIt != ChildrenArray.end()) {
+    auto *ChildObj = ChildrenIt->getAsObject();
+    assert(ChildObj && "Invalid JSON object in Comment");
+    auto TextComment = ChildObj->getString("TextComment");
+    if (!TextComment || TextComment->empty()) {
+      ChildrenIt = ChildrenArray.erase(ChildrenIt);
+      continue;
+    }
+    ++ChildrenIt;
+  }
+  return ChildrenArray;
 }
 
 static json::Value extractVerbatimComments(json::Array VerbatimLines) {
@@ -131,7 +164,8 @@ static Object serializeComment(const CommentInfo &I, Object &Description) {
 
   switch (I.Kind) {
   case CommentKind::CK_TextComment: {
-    Obj.insert({commentKindToString(I.Kind), I.Text});
+    if (!I.Text.empty())
+      Obj.insert({commentKindToString(I.Kind), I.Text});
     return Obj;
   }
 
@@ -265,6 +299,9 @@ serializeCommonAttributes(const Info &I, json::Object &Obj,
       if (auto *ParagraphComment = Comment.getAsObject();
           ParagraphComment->get("ParagraphComment")) {
         auto TextCommentsArray = extractTextComments(ParagraphComment);
+        if (TextCommentsArray.kind() == json::Value::Null ||
+            TextCommentsArray.getAsArray()->empty())
+          continue;
         insertComment(Description, TextCommentsArray, "ParagraphComments");
       }
     }
diff --git a/clang-tools-extra/test/clang-doc/basic-project.mustache.test b/clang-tools-extra/test/clang-doc/basic-project.mustache.test
index 282ca73384c3f..b985a39265de7 100644
--- a/clang-tools-extra/test/clang-doc/basic-project.mustache.test
+++ b/clang-tools-extra/test/clang-doc/basic-project.mustache.test
@@ -65,9 +65,6 @@ HTML-SHAPE:                         <div>
 HTML-SHAPE:                             <p> Abstract base class for shapes.</p>
 HTML-SHAPE:                         </div>
 HTML-SHAPE:                         <div>
-HTML-SHAPE:                             <p></p>
-HTML-SHAPE:                         </div>
-HTML-SHAPE:                         <div>
 HTML-SHAPE:                             <p> Provides a common interface for different types of shapes.</p>
 HTML-SHAPE:                         </div>
 HTML-SHAPE:                     </div>
@@ -83,12 +80,6 @@ HTML-SHAPE:                             <div>
 HTML-SHAPE:                                 <div>
 HTML-SHAPE:                                     <p> Calculates the area of the shape.</p>
 HTML-SHAPE:                                 </div>
-HTML-SHAPE:                                 <div>
-HTML-SHAPE:                                     <p></p>
-HTML-SHAPE:                                 </div>
-HTML-SHAPE:                                 <div>
-HTML-SHAPE:                                     <p></p>
-HTML-SHAPE:                                 </div>
 HTML-SHAPE:                                 <h3>Returns</h3>
 HTML-SHAPE:                                 <p> double The area of the shape.</p>
 HTML-SHAPE:                             </div>
@@ -101,12 +92,6 @@ HTML-SHAPE:                             <div>
 HTML-SHAPE:                                 <div>
 HTML-SHAPE:                                     <p> Calculates the perimeter of the shape.</p>
 HTML-SHAPE:                                 </div>
-HTML-SHAPE:                                 <div>
-HTML-SHAPE:                                     <p></p>
-HTML-SHAPE:                                 </div>
-HTML-SHAPE:                                 <div>
-HTML-SHAPE:                                     <p></p>
-HTML-SHAPE:                                 </div>
 HTML-SHAPE:                                 <h3>Returns</h3>
 HTML-SHAPE:                                 <p> double The perimeter of the shape.</p>
 HTML-SHAPE:                             </div>
@@ -119,9 +104,6 @@ HTML-SHAPE:                             <div>
 HTML-SHAPE:                                 <div>
 HTML-SHAPE:                                     <p> Virtual destructor.</p>
 HTML-SHAPE:                                 </div>
-HTML-SHAPE:                                 <div>
-HTML-SHAPE:                                     <p></p>
-HTML-SHAPE:                                 </div>
 HTML-SHAPE:                             </div>
 HTML-SHAPE:                         </div>
 HTML-SHAPE:                     </div>
@@ -210,9 +192,6 @@ HTML-CALC:                         <div>
 HTML-CALC:                             <p> A simple calculator class.</p>
 HTML-CALC:                         </div>
 HTML-CALC:                         <div>
-HTML-CALC:                             <p></p>
-HTML-CALC:                         </div>
-HTML-CALC:                         <div>
 HTML-CALC:                             <p> Provides basic arithmetic operations.</p>
 HTML-CALC:                         </div>
 HTML-CALC:                     </div>
@@ -239,12 +218,6 @@ HTML-CALC:                             <div>
 HTML-CALC:                                 <div>
 HTML-CALC:                                     <p> Adds two integers.</p>
 HTML-CALC:                                 </div>
-HTML-CALC:                                 <div>
-HTML-CALC:                                     <p></p>
-HTML-CALC:                                 </div>
-HTML-CALC:                                 <div>
-HTML-CALC:                                     <p></p>
-HTML-CALC:                                 </div>
 HTML-CALC:                                 <h3>Parameters</h3>
 HTML-CALC:                                 <div>
 HTML-CALC:                                     <b>a</b>   First integer.
@@ -264,12 +237,6 @@ HTML-CALC:                             <div>
 HTML-CALC:                                 <div>
 HTML-CALC:                                     <p> Subtracts the second integer from the first.</p>
 HTML-CALC:                                 </div>
-HTML-CALC:                                 <div>
-HTML-CALC:                                     <p></p>
-HTML-CALC:                                 </div>
-HTML-CALC:                                 <div>
-HTML-CALC:                                     <p></p>
-HTML-CALC:                                 </div>
 HTML-CALC:                                 <h3>Parameters</h3>
 HTML-CALC:                                 <div>
 HTML-CALC:                                     <b>a</b>   First integer.
@@ -289,12 +256,6 @@ HTML-CALC:                             <div>
 HTML-CALC:                                 <div>
 HTML-CALC:                                     <p> Multiplies two integers.</p>
 HTML-CALC:                                 </div>
-HTML-CALC:                                 <div>
-HTML-CALC:                                     <p></p>
-HTML-CALC:                                 </div>
-HTML-CALC:                                 <div>
-HTML-CALC:                                     <p></p>
-HTML-CALC:                                 </div>
 HTML-CALC:                                 <h3>Parameters</h3>
 HTML-CALC:                                 <div>
 HTML-CALC:                                     <b>a</b>   First integer.
@@ -314,12 +275,6 @@ HTML-CALC:                             <div>
 HTML-CALC:                                 <div>
 HTML-CALC:                                     <p> Divides the first integer by the second.</p>
 HTML-CALC:                                 </div>
-HTML-CALC:                                 <div>
-HTML-CALC:                                     <p></p>
-HTML-CALC:                                 </div>
-HTML-CALC:                                 <div>
-HTML-CALC:                                     <p></p>
-HTML-CALC:                                 </div>
 HTML-CALC:                                 <h3>Parameters</h3>
 HTML-CALC:                                 <div>
 HTML-CALC:                                     <b>a</b>   First integer.
@@ -329,7 +284,6 @@ HTML-CALC:                                     <b>b</b>   Second integer.
 HTML-CALC:                                 </div>
 HTML-CALC:                                 <h3>Returns</h3>
 HTML-CALC:                                 <p> double The result of a / b.</p>
-HTML-CALC:                                 <p></p>
 HTML-CALC:                                 <h3>Throws</h3>
 HTML-CALC:                                 <div>
 HTML-CALC:                                     <b>std::invalid_argument</b> if b is zero.
@@ -344,12 +298,6 @@ HTML-CALC:                             <div>
 HTML-CALC:                                 <div>
 HTML-CALC:                                     <p> Performs the mod operation on integers.</p>
 HTML-CALC:                                 </div>
-HTML-CALC:                                 <div>
-HTML-CALC:                                     <p></p>
-HTML-CALC:                                 </div>
-HTML-CALC:                                 <div>
-HTML-CALC:                                     <p></p>
-HTML-CALC:                                 </div>
 HTML-CALC:                                 <h3>Parameters</h3>
 HTML-CALC:                                 <div>
 HTML-CALC:                                     <b>a</b>   First integer.
@@ -431,9 +379,6 @@ HTML-RECTANGLE:                         <div>
 HTML-RECTANGLE:                             <p> Rectangle class derived from Shape.</p>
 HTML-RECTANGLE:                         </div>
 HTML-RECTANGLE:                         <div>
-HTML-RECTANGLE:                             <p></p>
-HTML-RECTANGLE:                         </div>
-HTML-RECTANGLE:                         <div>
 HTML-RECTANGLE:                             <p> Represents a rectangle with a given width and height.</p>
 HTML-RECTANGLE:                         </div>
 HTML-RECTANGLE:                     </div>
@@ -449,12 +394,6 @@ HTML-RECTANGLE:                             <div>
 HTML-RECTANGLE:                                 <div>
 HTML-RECTANGLE:                                     <p> Constructs a new Rectangle object.</p>
 HTML-RECTANGLE:                                 </div>
-HTML-RECTANGLE:                                 <div>
-HTML-RECTANGLE:                                     <p></p>
-HTML-RECTANGLE:                                 </div>
-HTML-RECTANGLE:                                 <div>
-HTML-RECTANGLE:                                     <p></p>
-HTML-RECTANGLE:                                 </div>
 HTML-RECTANGLE:                                 <h3>Parameters</h3>
 HTML-RECTANGLE:                                 <div>
 HTML-RECTANGLE:                                     <b>width</b>   Width of the rectangle.
@@ -472,12 +411,6 @@ HTML-RECTANGLE:                             <div>
 HTML-RECTANGLE:                                 <div>
 HTML-RECTANGLE:                                     <p> Calculates the area of the rectangle.</p>
 HTML-RECTANGLE:                                 </div>
-HTML-RECTANGLE:                                 <div>
-HTML-RECTANGLE:                                     <p></p>
-HTML-RECTANGLE:                                 </div>
-HTML-RECTANGLE:                                 <div>
-HTML-RECTANGLE:                                     <p></p>
-HTML-RECTANGLE:                                 </div>
 HTML-RECTANGLE:                                 <h3>Returns</h3>
 HTML-RECTANGLE:                                 <p> double The area of the rectangle.</p>
 HTML-RECTANGLE:                             </div>
@@ -490,12 +423,6 @@ HTML-RECTANGLE:                             <div>
 HTML-RECTANGLE:                                 <div>
 HTML-RECTANGLE:                                     <p> Calculates the perimeter of the rectangle.</p>
 HTML-RECTANGLE:                                 </div>
-HTML-RECTANGLE:                                 <div>
-HTML-RECTANGLE:                                     <p></p>
-HTML-RECTANGLE:                                 </div>
-HTML-RECTANGLE:                                 <div>
-HTML-RECTANGLE:                                     <p></p>
-HTML-RECTANGLE:                                 </div>
 HTML-RECTANGLE:                                 <h3>Returns</h3>
 HTML-RECTANGLE:                                 <p> double The perimeter of the rectangle.</p>
 HTML-RECTANGLE:                             </div>
@@ -570,9 +497,6 @@ HTML-CIRCLE:                         <div>
 HTML-CIRCLE:                             <p> Circle class derived from Shape.</p>
 HTML-CIRCLE:                         </div>
 HTML-CIRCLE:                         <div>
-HTML-CIRCLE:                             <p></p>
-HTML-CIRCLE:                         </div>
-HTML-CIRCLE:                         <div>
 HTML-CIRCLE:                             <p> Represents a circle with a given radius.</p>
 HTML-CIRCLE:                         </div>
 HTML-CIRCLE:                     </div>
@@ -588,12 +512,6 @@ HTML-CIRCLE:                             <div>
 HTML-CIRCLE:                                 <div>
 HTML-CIRCLE:                                     <p> Constructs a new Circle object.</p>
 HTML-CIRCLE:                                 </div>
-HTML-CIRCLE:                                 <div>
-HTML-CIRCLE:                                     <p></p>
-HTML-CIRCLE:                                 </div>
-HTML-CIRCLE:                                 <div>
-HTML-CIRCLE:                                     <p></p>
-HTML-CIRCLE:                                 </div>
 HTML-CIRCLE:                                 <h3>Parameters</h3>
 HTML-CIRCLE:                                 <div>
 HTML-CIRCLE:                                     <b>radius</b>   Radius of the circle.
@@ -608,12 +526,6 @@ HTML-CIRCLE:                             <div>
 HTML-CIRCLE:                                 <div>
 HTML-CIRCLE:                                     <p> Calculates the area of the circle.</p>
 HTML-CIRCLE:                                 </div>
-HTML-CIRCLE:                                 <div>
-HTML-CIRCLE:                                     <p></p>
-HTML-CIRCLE:                                 </div>
-HTML-CIRCLE:                                 <div>
-HTML-CIRCLE:                                     <p></p>
-HTML-CIRCLE:                                 </div>
 HTML-CIRCLE:                                 <h3>Returns</h3>
 HTML-CIRCLE:                                 <p> double The area of the circle.</p>
 HTML-CIRCLE:                             </div>
@@ -626,15 +538,6 @@ HTML-CIRCLE:                             <div>
 HTML-CIRCLE:                                 <div>
 HTML-CIRCLE:                                     <p> Calculates the perimeter of the circle.</p>
 HTML-CIRCLE:                                 </div>
-HTML-CIRCLE:                                 <div>
-HTML-CIRCLE:                                     <p></p>
-HTML-CIRCLE:                                 </div>
-HTML-CIRCLE:                                 <div>
-HTML-CIRCLE:                                     <p></p>
-HTML-CIRCLE:                                 </div>
-HTML-CIRCLE:                                 <div>
-HTML-CIRCLE:                                     <p></p>
-HTML-CIRCLE:                                 </div>
 HTML-CIRCLE:                                 <h3>Returns</h3>
 HTML-CIRCLE:                                 <p> double The perimeter of the circle.</p>
 HTML-CIRCLE:                                 <h3>Code</h3>
diff --git a/clang-tools-extra/test/clang-doc/json/class.cpp b/clang-tools-extra/test/clang-doc/json/class.cpp
index 91160585bef1a..8bf9402adf054 100644
--- a/clang-tools-extra/test/clang-doc/json/class.cpp
+++ b/clang-tools-extra/test/clang-doc/json/class.cpp
@@ -47,9 +47,6 @@ struct MyClass {
 // CHECK-NEXT:          },
 // CHECK-NEXT:          {
 // CHECK-NEXT:            "TextComment": " It has some nice methods and fields."
-// CHECK-NEXT:          },
-// CHECK-NEXT:          {
-// CHECK-NEXT:            "TextComment": ""
 // CHECK-NEXT:          }
 // CHECK:         "DocumentationFileName": "_ZTV7MyClass",
 // CHECK:         "Enums": [

From 3310c0be583451e5770b6afbedd926ff3781356f Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra@codasip.com>
Date: Tue, 9 Dec 2025 21:38:58 +0000
Subject: [PATCH 62/85] [VPlan] Strip TODO to consolidate
 (ActiveLaneMask|Widen)PHI (#171392)

They cannot be consolidated, as WidenPHI is not a header PHI, while
ActtiveLaneMaskPHI is.
---
 llvm/lib/Transforms/Vectorize/VPlan.h          | 2 --
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 --
 2 files changed, 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index fd02493fa2c78..afb654ed882f4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3600,8 +3600,6 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
 
 /// A recipe for generating the active lane mask for the vector loop that is
 /// used to predicate the vector operations.
-/// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
-/// remove VPActiveLaneMaskPHIRecipe.
 class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
 public:
   VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 519a104b9484f..b0c8564ad231a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -4407,8 +4407,6 @@ void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
-// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
-// remove VPActiveLaneMaskPHIRecipe.
 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
   BasicBlock *VectorPH =
       State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));

From d233e787f0adfa2acd2e6c67aa2c362f9cf47ab4 Mon Sep 17 00:00:00 2001
From: Michael Kruse <llvm-project@meinersbur.de>
Date: Tue, 9 Dec 2025 22:36:44 +0100
Subject: [PATCH 63/85] Revert "[Flang] Move builtin .mod generation into
 runtimes (Reapply #137828) (#169638)"

This reverts commit 7675fc79c802cf7f6a95660f6ee59bf6cb62102f.

Requested in PR:
https://github.com/llvm/llvm-project/pull/169638#issuecomment-3634227707
---
 clang/include/clang/Driver/ToolChain.h        |   4 -
 clang/include/clang/Options/Options.td        |   2 +-
 clang/lib/Driver/Driver.cpp                   |  11 -
 clang/lib/Driver/ToolChain.cpp                |   6 -
 clang/lib/Driver/ToolChains/AMDGPU.cpp        |   3 +-
 clang/lib/Driver/ToolChains/Flang.cpp         |   8 -
 clang/lib/Driver/ToolChains/HIPAMD.cpp        |   3 +-
 clang/lib/Driver/ToolChains/HIPSPV.cpp        |   3 +-
 flang-rt/CMakeLists.txt                       | 122 +++++--
 flang-rt/cmake/modules/AddFlangRT.cmake       |  38 +--
 .../cmake/modules/AddFlangRTOffload.cmake     |  15 +-
 .../cmake/modules/FlangRTIntrospection.cmake  |  36 ---
 .../cmake/modules}/GetToolchainDirs.cmake     |  11 -
 flang-rt/lib/CMakeLists.txt                   |  15 +-
 flang-rt/lib/runtime/CMakeLists.txt           | 173 +++-------
 flang-rt/test/lit.site.cfg.py.in              |   2 +-
 flang-rt/unittests/CMakeLists.txt             |   5 +-
 flang/CMakeLists.txt                          |   1 +
 .../flang/Frontend/CompilerInvocation.h       |   7 -
 flang/lib/Frontend/CompilerInvocation.cpp     |  26 +-
 flang/lib/Semantics/semantics.cpp             |  13 +-
 flang/module/.clang-format                    |   1 +
 .../module}/__cuda_builtins.f90               |   0
 .../module}/__cuda_device.f90                 |   0
 .../module}/__fortran_builtins.f90            |   2 +-
 .../module}/__fortran_ieee_exceptions.f90     |   2 +-
 .../module}/__fortran_type_info.f90           |   0
 .../module}/__ppc_intrinsics.f90              |   0
 .../runtime => flang/module}/__ppc_types.f90  |   0
 .../module}/cooperative_groups.f90            |   1 -
 .../runtime => flang/module}/cudadevice.f90   |   0
 .../runtime => flang/module}/flang_debug.f90  |   0
 .../module}/ieee_arithmetic.f90               |   2 +-
 .../module}/ieee_exceptions.f90               |   0
 .../module}/ieee_features.f90                 |   0
 .../module}/iso_c_binding.f90                 |   0
 .../module}/iso_fortran_env.f90               |   2 +-
 .../module}/iso_fortran_env_impl.f90          |   0
 .../lib/runtime => flang/module}/mma.f90      |   0
 flang/test/CMakeLists.txt                     |  31 +-
 flang/test/Driver/Inputs/ieee_arithmetic.mod  |   1 -
 flang/test/Driver/Inputs/iso_fortran_env.mod  |   1 -
 flang/test/Driver/intrinsic-module-path.F90   |  55 ----
 flang/test/Driver/intrinsic-module-path.f90   |  23 ++
 flang/test/Driver/lto-fatlto.f90              |   4 +-
 flang/test/Driver/pp-fixed-form.f90           |   4 +-
 .../Lower/HLFIR/type-bound-call-mismatch.f90  |   2 +-
 flang/test/Lower/OpenMP/simd_aarch64.f90      |   7 +-
 .../target-enter-data-default-openmp52.f90    |   4 +-
 flang/test/Preprocessing/fixed-free.f         |   2 +-
 flang/test/Preprocessing/no-pp-if.f90         |   2 +-
 flang/test/Semantics/bug163242.f90            |   2 +-
 flang/test/Semantics/bug164303.f90            |   2 +-
 flang/test/lit.cfg.py                         |  94 +-----
 flang/test/lit.site.cfg.py.in                 |   9 +-
 flang/tools/CMakeLists.txt                    |   1 +
 flang/tools/bbc/bbc.cpp                       |  13 +-
 flang/tools/f18/CMakeLists.txt                | 171 ++++++++++
 flang/tools/f18/dump.cpp                      |  42 +++
 llvm/runtimes/CMakeLists.txt                  |  24 +-
 openmp/CMakeLists.txt                         |  11 +-
 openmp/README.rst                             |   7 +-
 .../modules/LibompCheckFortranFlag.cmake      |  29 ++
 openmp/cmake/modules/LibompHandleFlags.cmake  |  11 +
 openmp/module/CMakeLists.txt                  |  89 +++--
 openmp/runtime/CMakeLists.txt                 |   2 +-
 openmp/runtime/cmake/LibompExports.cmake      |  10 +-
 openmp/runtime/test/lit.cfg                   |   1 -
 openmp/runtime/test/lit.site.cfg.in           |   1 -
 runtimes/CMakeLists.txt                       | 303 +-----------------
 70 files changed, 595 insertions(+), 877 deletions(-)
 delete mode 100644 flang-rt/cmake/modules/FlangRTIntrospection.cmake
 rename {cmake/Modules => flang-rt/cmake/modules}/GetToolchainDirs.cmake (94%)
 create mode 100644 flang/module/.clang-format
 rename {flang-rt/lib/runtime => flang/module}/__cuda_builtins.f90 (100%)
 rename {flang-rt/lib/runtime => flang/module}/__cuda_device.f90 (100%)
 rename {flang-rt/lib/runtime => flang/module}/__fortran_builtins.f90 (99%)
 rename {flang-rt/lib/runtime => flang/module}/__fortran_ieee_exceptions.f90 (98%)
 rename {flang-rt/lib/runtime => flang/module}/__fortran_type_info.f90 (100%)
 rename {flang-rt/lib/runtime => flang/module}/__ppc_intrinsics.f90 (100%)
 rename {flang-rt/lib/runtime => flang/module}/__ppc_types.f90 (100%)
 rename {flang-rt/lib/runtime => flang/module}/cooperative_groups.f90 (96%)
 rename {flang-rt/lib/runtime => flang/module}/cudadevice.f90 (100%)
 rename {flang-rt/lib/runtime => flang/module}/flang_debug.f90 (100%)
 rename {flang-rt/lib/runtime => flang/module}/ieee_arithmetic.f90 (99%)
 rename {flang-rt/lib/runtime => flang/module}/ieee_exceptions.f90 (100%)
 rename {flang-rt/lib/runtime => flang/module}/ieee_features.f90 (100%)
 rename {flang-rt/lib/runtime => flang/module}/iso_c_binding.f90 (100%)
 rename {flang-rt/lib/runtime => flang/module}/iso_fortran_env.f90 (98%)
 rename {flang-rt/lib/runtime => flang/module}/iso_fortran_env_impl.f90 (100%)
 rename {flang-rt/lib/runtime => flang/module}/mma.f90 (100%)
 delete mode 100644 flang/test/Driver/intrinsic-module-path.F90
 create mode 100644 flang/test/Driver/intrinsic-module-path.f90
 create mode 100644 flang/tools/f18/CMakeLists.txt
 create mode 100644 flang/tools/f18/dump.cpp
 create mode 100644 openmp/cmake/modules/LibompCheckFortranFlag.cmake

diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h
index 26af88242eb3e..1425714d34110 100644
--- a/clang/include/clang/Driver/ToolChain.h
+++ b/clang/include/clang/Driver/ToolChain.h
@@ -538,10 +538,6 @@ class ToolChain {
   // Returns Triple without the OSs version.
   llvm::Triple getTripleWithoutOSVersion() const;
 
-  /// Returns the target-specific path for Flang's intrinsic modules in the
-  /// resource directory if it exists.
-  std::optional<std::string> getDefaultIntrinsicModuleDir() const;
-
   // Returns the target specific runtime path if it exists.
   std::optional<std::string> getRuntimePath() const;
 
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 90fc78db45d05..cac122d296624 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -6124,7 +6124,7 @@ def prebind : Flag<["-"], "prebind">;
 def preload : Flag<["-"], "preload">;
 def print_file_name_EQ : Joined<["-", "--"], "print-file-name=">,
   HelpText<"Print the full library path of <file>">, MetaVarName<"<file>">,
-  Visibility<[ClangOption, FlangOption, CLOption]>;
+  Visibility<[ClangOption, CLOption]>;
 def print_ivar_layout : Flag<["-"], "print-ivar-layout">,
   Visibility<[ClangOption, CC1Option]>,
   HelpText<"Enable Objective-C Ivar layout bitmap print trace">,
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 6f0d8078825de..8644a271a04b5 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -6607,17 +6607,6 @@ std::string Driver::GetFilePath(StringRef Name, const ToolChain &TC) const {
   if (llvm::sys::fs::exists(Twine(P)))
     return std::string(P);
 
-  // With Flang, also look for instrinsic modules
-  if (IsFlangMode()) {
-    if (std::optional<std::string> IntrPath =
-            TC.getDefaultIntrinsicModuleDir()) {
-      SmallString<128> P(*IntrPath);
-      llvm::sys::path::append(P, Name);
-      if (llvm::sys::fs::exists(Twine(P)))
-        return std::string(P);
-    }
-  }
-
   SmallString<128> D(Dir);
   llvm::sys::path::append(D, "..", Name);
   if (llvm::sys::fs::exists(Twine(D)))
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index f8520725b9b03..77a2c73f0d446 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -1020,12 +1020,6 @@ ToolChain::getTargetSubDirPath(StringRef BaseDir) const {
   return {};
 }
 
-std::optional<std::string> ToolChain::getDefaultIntrinsicModuleDir() const {
-  SmallString<128> P(D.ResourceDir);
-  llvm::sys::path::append(P, "finclude", "flang");
-  return getTargetSubDirPath(P);
-}
-
 std::optional<std::string> ToolChain::getRuntimePath() const {
   SmallString<128> P(D.ResourceDir);
   llvm::sys::path::append(P, "lib");
diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp
index d3515c8fa2362..87ccd40372681 100644
--- a/clang/lib/Driver/ToolChains/AMDGPU.cpp
+++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp
@@ -858,8 +858,7 @@ void AMDGPUToolChain::addClangTargetOptions(
   // Default to "hidden" visibility, as object level linking will not be
   // supported for the foreseeable future.
   if (!DriverArgs.hasArg(options::OPT_fvisibility_EQ,
-                         options::OPT_fvisibility_ms_compat) &&
-      !getDriver().IsFlangMode()) {
+                         options::OPT_fvisibility_ms_compat)) {
     CC1Args.push_back("-fvisibility=hidden");
     CC1Args.push_back("-fapply-global-visibility-to-externs");
   }
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 2c7a06f2893b9..2f5e93d139858 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -1069,14 +1069,6 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA,
   CmdArgs.push_back("-resource-dir");
   CmdArgs.push_back(D.ResourceDir.c_str());
 
-  // Default intrinsic module dirs must be added after any user-provided
-  // -fintrinsic-modules-path to have lower precedence
-  if (std::optional<std::string> IntrModPath =
-          TC.getDefaultIntrinsicModuleDir()) {
-    CmdArgs.push_back("-fintrinsic-modules-path");
-    CmdArgs.push_back(Args.MakeArgString(*IntrModPath));
-  }
-
   // Offloading related options
   addOffloadOptions(C, Inputs, JA, Args, CmdArgs);
 
diff --git a/clang/lib/Driver/ToolChains/HIPAMD.cpp b/clang/lib/Driver/ToolChains/HIPAMD.cpp
index 8bfd3c887452c..f2f64922cb404 100644
--- a/clang/lib/Driver/ToolChains/HIPAMD.cpp
+++ b/clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -274,8 +274,7 @@ void HIPAMDToolChain::addClangTargetOptions(
   // Default to "hidden" visibility, as object level linking will not be
   // supported for the foreseeable future.
   if (!DriverArgs.hasArg(options::OPT_fvisibility_EQ,
-                         options::OPT_fvisibility_ms_compat) &&
-      !getDriver().IsFlangMode()) {
+                         options::OPT_fvisibility_ms_compat)) {
     CC1Args.append({"-fvisibility=hidden"});
     CC1Args.push_back("-fapply-global-visibility-to-externs");
   }
diff --git a/clang/lib/Driver/ToolChains/HIPSPV.cpp b/clang/lib/Driver/ToolChains/HIPSPV.cpp
index 261adc2b1c440..be0f49d8e1497 100644
--- a/clang/lib/Driver/ToolChains/HIPSPV.cpp
+++ b/clang/lib/Driver/ToolChains/HIPSPV.cpp
@@ -143,8 +143,7 @@ void HIPSPVToolChain::addClangTargetOptions(
   // Default to "hidden" visibility, as object level linking will not be
   // supported for the foreseeable future.
   if (!DriverArgs.hasArg(options::OPT_fvisibility_EQ,
-                         options::OPT_fvisibility_ms_compat) &&
-      !getDriver().IsFlangMode())
+                         options::OPT_fvisibility_ms_compat))
     CC1Args.append(
         {"-fvisibility=hidden", "-fapply-global-visibility-to-externs"});
 
diff --git a/flang-rt/CMakeLists.txt b/flang-rt/CMakeLists.txt
index 20ca6dc7124bc..50b8e834776fb 100644
--- a/flang-rt/CMakeLists.txt
+++ b/flang-rt/CMakeLists.txt
@@ -23,6 +23,40 @@ set(FLANG_RT_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
 set(FLANG_RT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}")
 set(FLANG_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../flang")
 
+# CMake 3.24 is the first version of CMake that directly recognizes Flang.
+# LLVM's requirement is only CMake 3.20, teach CMake 3.20-3.23 how to use Flang.
+if (CMAKE_VERSION VERSION_LESS "3.24")
+  cmake_path(GET CMAKE_Fortran_COMPILER STEM _Fortran_COMPILER_STEM)
+  if (_Fortran_COMPILER_STEM STREQUAL "flang-new" OR _Fortran_COMPILER_STEM STREQUAL "flang")
+    include(CMakeForceCompiler)
+    CMAKE_FORCE_Fortran_COMPILER("${CMAKE_Fortran_COMPILER}" "LLVMFlang")
+
+    set(CMAKE_Fortran_COMPILER_ID "LLVMFlang")
+    set(CMAKE_Fortran_COMPILER_VERSION "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}")
+
+    set(CMAKE_Fortran_SUBMODULE_SEP "-")
+    set(CMAKE_Fortran_SUBMODULE_EXT ".mod")
+
+    set(CMAKE_Fortran_PREPROCESS_SOURCE
+      "<CMAKE_Fortran_COMPILER> -cpp <DEFINES> <INCLUDES> <FLAGS> -E <SOURCE> > <PREPROCESSED_SOURCE>")
+
+    set(CMAKE_Fortran_FORMAT_FIXED_FLAG "-ffixed-form")
+    set(CMAKE_Fortran_FORMAT_FREE_FLAG "-ffree-form")
+
+    set(CMAKE_Fortran_MODDIR_FLAG "-module-dir")
+
+    set(CMAKE_Fortran_COMPILE_OPTIONS_PREPROCESS_ON "-cpp")
+    set(CMAKE_Fortran_COMPILE_OPTIONS_PREPROCESS_OFF "-nocpp")
+    set(CMAKE_Fortran_POSTPROCESS_FLAG "-ffixed-line-length-72")
+
+    set(CMAKE_Fortran_COMPILE_OPTIONS_TARGET "--target=")
+
+    set(CMAKE_Fortran_LINKER_WRAPPER_FLAG "-Wl,")
+    set(CMAKE_Fortran_LINKER_WRAPPER_FLAG_SEP ",")
+  endif ()
+endif ()
+enable_language(Fortran)
+
 
 list(APPEND CMAKE_MODULE_PATH
     "${FLANG_RT_SOURCE_DIR}/cmake/modules"
@@ -31,22 +65,69 @@ list(APPEND CMAKE_MODULE_PATH
 include(AddFlangRT)
 include(GetToolchainDirs)
 include(FlangCommon)
-include(FlangRTIntrospection)
 include(HandleCompilerRT)
 include(ExtendPath)
-include(CheckFortranSourceCompiles)
-include(CMakePushCheckState)
 
 
 ############################
 # Build Mode Introspection #
 ############################
 
+# Determine whether we are in the runtimes/runtimes-bins directory of a
+# bootstrap build.
+set(LLVM_TREE_AVAILABLE OFF)
+if (LLVM_LIBRARY_OUTPUT_INTDIR AND LLVM_RUNTIME_OUTPUT_INTDIR AND PACKAGE_VERSION)
+  set(LLVM_TREE_AVAILABLE ON)
+endif()
+
 # Path to LLVM development tools (FileCheck, llvm-lit, not, ...)
 set(LLVM_TOOLS_DIR "${LLVM_BINARY_DIR}/bin")
 
-# Fortran compiler not optional for building Flang-RT
-enable_language(Fortran)
+# Determine build and install paths.
+# The build path is absolute, but the install dir is relative, CMake's install
+# command has to apply CMAKE_INSTALL_PREFIX itself.
+get_toolchain_library_subdir(toolchain_lib_subdir)
+if (LLVM_TREE_AVAILABLE)
+  # In a bootstrap build emit the libraries into a default search path in the
+  # build directory of the just-built compiler. This allows using the
+  # just-built compiler without specifying paths to runtime libraries.
+  #
+  # Despite Clang in the name, get_clang_resource_dir does not depend on Clang
+  # being added to the build. Flang uses the same resource dir as clang.
+  include(GetClangResourceDir)
+  get_clang_resource_dir(FLANG_RT_OUTPUT_RESOURCE_DIR PREFIX "${LLVM_LIBRARY_OUTPUT_INTDIR}/..")
+  get_clang_resource_dir(FLANG_RT_INSTALL_RESOURCE_PATH_DEFAULT)
+
+  extend_path(FLANG_RT_OUTPUT_RESOURCE_LIB_DIR "${FLANG_RT_OUTPUT_RESOURCE_DIR}" "${toolchain_lib_subdir}")
+else ()
+  # In a standalone runtimes build, do not write into LLVM_BINARY_DIR. It may be
+  # read-only and/or shared by multiple runtimes with different build
+  # configurations (e.g. Debug/Release). Use the runtime's own lib dir like any
+  # non-toolchain library.
+  # For the install prefix, still use the resource dir assuming that Flang will
+  # be installed there using the same prefix. This is to not have a difference
+  # between bootstrap and standalone runtimes builds.
+  set(FLANG_RT_OUTPUT_RESOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
+  set(FLANG_RT_INSTALL_RESOURCE_PATH_DEFAULT "lib${LLVM_LIBDIR_SUFFIX}/clang/${LLVM_VERSION_MAJOR}")
+
+  extend_path(FLANG_RT_OUTPUT_RESOURCE_LIB_DIR "${FLANG_RT_OUTPUT_RESOURCE_DIR}" "lib${LLVM_LIBDIR_SUFFIX}")
+endif ()
+set(FLANG_RT_INSTALL_RESOURCE_PATH "${FLANG_RT_INSTALL_RESOURCE_PATH_DEFAULT}"
+    CACHE PATH "Path to install runtime libraries to (default: clang resource dir)")
+extend_path(FLANG_RT_INSTALL_RESOURCE_LIB_PATH "${FLANG_RT_INSTALL_RESOURCE_PATH}" "${toolchain_lib_subdir}")
+cmake_path(NORMAL_PATH FLANG_RT_OUTPUT_RESOURCE_DIR)
+cmake_path(NORMAL_PATH FLANG_RT_INSTALL_RESOURCE_PATH)
+# FIXME: For the libflang_rt.so, the toolchain resource lib dir is not a good
+#        destination because it is not a ld.so default search path.
+#        The machine where the executable is eventually executed may not be the
+#        machine where the Flang compiler and its resource dir is installed, so
+#        setting RPath by the driver is not an solution. It should belong into
+#        /usr/lib/<triple>/libflang_rt.so, like e.g. libgcc_s.so.
+#        But the linker as invoked by the Flang driver also requires
+#        libflang_rt.so to be found when linking and the resource lib dir is
+#        the only reliable location.
+cmake_path(NORMAL_PATH FLANG_RT_OUTPUT_RESOURCE_LIB_DIR)
+cmake_path(NORMAL_PATH FLANG_RT_INSTALL_RESOURCE_LIB_PATH)
 
 
 #################
@@ -56,6 +137,8 @@ enable_language(Fortran)
 # Important: flang-rt user options must be prefixed with "FLANG_RT_". Variables
 # with this prefix will be forwarded in bootstrap builds.
 
+option(FLANG_RT_INCLUDE_TESTS "Generate build targets for the flang-rt unit and regression-tests." "${LLVM_INCLUDE_TESTS}")
+
 # Provide an interface to link against the LLVM libc/libc++ projects directly.
 set(FLANG_RT_SUPPORTED_PROVIDERS system llvm)
 set(FLANG_RT_LIBC_PROVIDER "system" CACHE STRING "Specify C library to use. Supported values are ${FLANG_RT_SUPPORTED_PROVIDERS}.")
@@ -68,14 +151,7 @@ if (NOT "${FLANG_RT_LIBCXX_PROVIDER}" IN_LIST FLANG_RT_SUPPORTED_PROVIDERS)
   message(FATAL_ERROR "Unsupported library: '${FLANG_RT_LIBCXX_PROVIDER}'. Supported values are ${FLANG_RT_SUPPORTED_PROVIDERS}.")
 endif ()
 
-if (LLVM_RUNTIMES_TARGET MATCHES "^amdgcn|^nvptx")
-  # Compiling libraries for offload targets is currently experimental;
-  # Only build the builtin modules by default.
-  set(FLANG_RT_ENABLE_STATIC_default OFF)
-else ()
-  set(FLANG_RT_ENABLE_STATIC_default ON)
-endif ()
-option(FLANG_RT_ENABLE_STATIC "Build Flang-RT as a static library." "${FLANG_RT_ENABLE_STATIC_default}")
+option(FLANG_RT_ENABLE_STATIC "Build Flang-RT as a static library." ON)
 if (WIN32)
   # Windows DLL currently not implemented.
   set(FLANG_RT_ENABLE_SHARED OFF)
@@ -88,14 +164,11 @@ else ()
   #       breaking change unless the driver is changed.
   option(FLANG_RT_ENABLE_SHARED "Build Flang-RT as a shared library." OFF)
 endif ()
-
-
-if (FLANG_RT_ENABLE_STATIC OR FLANG_RT_ENABLE_SHARED)
-  option(FLANG_RT_INCLUDE_TESTS "Generate build targets for the flang-rt unit and regression-tests." "${LLVM_INCLUDE_TESTS}")
-else ()
-  # Tests require at least one of the libraries
-  message(STATUS "Flang-RT testing disabled without either FLANG_RT_ENABLE_STATIC OR FLANG_RT_ENABLE_SHARED")
-  set(FLANG_RT_INCLUDE_TESTS OFF)
+if (NOT FLANG_RT_ENABLE_STATIC AND NOT FLANG_RT_ENABLE_SHARED)
+  message(FATAL_ERROR "
+      Must build at least one type of library
+      (FLANG_RT_ENABLE_STATIC=ON, FLANG_RT_ENABLE_SHARED=ON, or both)
+    ")
 endif ()
 
 
@@ -113,7 +186,7 @@ elseif (FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA")
   option(FLANG_RT_CUDA_RUNTIME_PTX_WITHOUT_GLOBAL_VARS "Do not compile global variables' definitions when producing PTX library" OFF)
 elseif (FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "OpenMP")
   # Support for OpenMP offloading
-  set(FLANG_RT_DEVICE_ARCHITECTURES "${RUNTIMES_DEVICE_ARCHITECTURES}" CACHE STRING
+  set(FLANG_RT_DEVICE_ARCHITECTURES "all" CACHE STRING
       "List of OpenMP device architectures to be used to compile the Fortran runtime (e.g. 'gfx1103;sm_90')"
     )
 
@@ -161,10 +234,6 @@ check_cxx_source_compiles(
   "
   HAVE_DECL_STRERROR_S)
 
-# Look for support of REAL(16), if not already defined via command
-# line via -DFORTRAN_SUPPORTS_REAL16=YES/NO
-check_fortran_quadmath_support()
-
 # Search for clang_rt.builtins library. Need in addition to msvcrt.
 if (WIN32)
   find_compiler_rt_library(builtins FLANG_RT_BUILTINS_LIBRARY)
@@ -262,7 +331,6 @@ else ()
   add_custom_target(check-flang-rt)
 endif()
 
-
 ###################
 # Install headers #
 ###################
diff --git a/flang-rt/cmake/modules/AddFlangRT.cmake b/flang-rt/cmake/modules/AddFlangRT.cmake
index 0ba485a79beb8..923507764d691 100644
--- a/flang-rt/cmake/modules/AddFlangRT.cmake
+++ b/flang-rt/cmake/modules/AddFlangRT.cmake
@@ -94,10 +94,6 @@ function (add_flangrt_library name)
     set(build_object ON)
   elseif (build_static AND build_shared)
     set(build_object ON)
-  elseif (NOT build_static AND NOT build_shared)
-    # If not building a library, still build the object files
-    # Needed to generate the .mod files as byproduct
-    set(build_object ON)
   endif ()
 
   # srctargets: targets that contain source files
@@ -172,18 +168,14 @@ function (add_flangrt_library name)
     if (BUILD_SHARED_LIBS)
       if (build_shared)
         set(default_target "${name_shared}")
-      elseif (build_static)
-        set(default_target "${name_static}")
       else ()
-        set(default_target "${name_object}")
+        set(default_target "${name_static}")
       endif ()
     else ()
       if (build_static)
         set(default_target "${name_static}")
-      elseif (build_shared)
-        set(default_target "${name_shared}")
       else ()
-        set(default_target "${name_object}")
+        set(default_target "${name_shared}")
       endif ()
     endif ()
     add_library(${name}.default ALIAS "${default_target}")
@@ -198,12 +190,6 @@ function (add_flangrt_library name)
     endif ()
   endif ()
 
-  if (build_object)
-    add_library(${name}.compile ALIAS "${name_object}")
-  else ()
-    add_library(${name}.compile ALIAS "${default_target}")
-  endif ()
-
   foreach (tgtname IN LISTS libtargets)
     if (NOT WIN32)
       # Use same stem name for .a and .so. Common in UNIX environments.
@@ -233,14 +219,6 @@ function (add_flangrt_library name)
     # Minimum required C++ version for Flang-RT, even if CMAKE_CXX_STANDARD is defined to something else.
     target_compile_features(${tgtname} PRIVATE cxx_std_17)
 
-    target_compile_options(${tgtname} PRIVATE
-      # Always enable preprocessor regardless of file extension
-      "$<$<COMPILE_LANGUAGE:Fortran>:-cpp>"
-
-      # Missing type descriptors are expected for intrinsic modules
-      "$<$<COMPILE_LANGUAGE:Fortran>:SHELL:-mmlir;SHELL:-ignore-missing-type-desc>"
-    )
-
     # When building the flang runtime if LTO is enabled the archive file
     # contains LLVM IR rather than object code. Currently flang is not
     # LTO aware so cannot link this file to compiled Fortran code.
@@ -248,10 +226,6 @@ function (add_flangrt_library name)
       target_compile_options(${tgtname} PRIVATE -fno-lto)
     endif ()
 
-    if (FORTRAN_SUPPORTS_REAL16)
-      target_compile_definitions(${tgtname} PRIVATE FLANG_SUPPORT_R16=1)
-    endif ()
-
     # Use compiler-specific options to disable exceptions and RTTI.
     if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
       target_compile_options(${tgtname} PRIVATE
@@ -370,13 +344,13 @@ function (add_flangrt_library name)
     if (ARG_INSTALL_WITH_TOOLCHAIN)
       set_target_properties(${tgtname}
         PROPERTIES
-          ARCHIVE_OUTPUT_DIRECTORY "${RUNTIMES_OUTPUT_RESOURCE_LIB_DIR}"
-          LIBRARY_OUTPUT_DIRECTORY "${RUNTIMES_OUTPUT_RESOURCE_LIB_DIR}"
+          ARCHIVE_OUTPUT_DIRECTORY "${FLANG_RT_OUTPUT_RESOURCE_LIB_DIR}"
+          LIBRARY_OUTPUT_DIRECTORY "${FLANG_RT_OUTPUT_RESOURCE_LIB_DIR}"
         )
 
       install(TARGETS ${tgtname}
-          ARCHIVE DESTINATION "${RUNTIMES_INSTALL_RESOURCE_LIB_PATH}"
-          LIBRARY DESTINATION "${RUNTIMES_INSTALL_RESOURCE_LIB_PATH}"
+          ARCHIVE DESTINATION "${FLANG_RT_INSTALL_RESOURCE_LIB_PATH}"
+          LIBRARY DESTINATION "${FLANG_RT_INSTALL_RESOURCE_LIB_PATH}"
         )
     endif ()
 
diff --git a/flang-rt/cmake/modules/AddFlangRTOffload.cmake b/flang-rt/cmake/modules/AddFlangRTOffload.cmake
index 5b6464741c9c9..cbc69f3a9656a 100644
--- a/flang-rt/cmake/modules/AddFlangRTOffload.cmake
+++ b/flang-rt/cmake/modules/AddFlangRTOffload.cmake
@@ -88,16 +88,16 @@ macro(enable_omp_offload_compilation name files)
         "${FLANG_RT_DEVICE_ARCHITECTURES}"
         )
 
-      set(OMP_COMPILE_OPTIONS $<$<COMPILE_LANGUAGE:C,CXX>:
+      set(OMP_COMPILE_OPTIONS
         -fopenmp
         -fvisibility=hidden
         -fopenmp-cuda-mode
         --offload-arch=${compile_for_architectures}
         # Force LTO for the device part.
         -foffload-lto
-        >)
-      set_property(SOURCE ${files} APPEND
-        PROPERTY COMPILE_DEFINITIONS ${OMP_COMPILE_OPTIONS}
+        )
+      set_source_files_properties(${files} PROPERTIES COMPILE_OPTIONS
+        "${OMP_COMPILE_OPTIONS}"
         )
       target_link_options(${name}.static PUBLIC ${OMP_COMPILE_OPTIONS})
 
@@ -105,13 +105,6 @@ macro(enable_omp_offload_compilation name files)
       set_source_files_properties(${files}
         PROPERTIES COMPILE_DEFINITIONS OMP_OFFLOAD_BUILD
         )
-
-     # If building flang-rt together with libomp, ensure that libomp is built
-     # first and found because -fopenmp will try to link it.
-     if (TARGET omp)
-       add_dependencies(${name} omp)
-       target_link_options(${name}.static PUBLIC "-L$<TARGET_FILE_DIR:omp>")
-     endif ()
     else()
       message(FATAL_ERROR
         "Flang-rt build with OpenMP offload is not supported for these compilers:\n"
diff --git a/flang-rt/cmake/modules/FlangRTIntrospection.cmake b/flang-rt/cmake/modules/FlangRTIntrospection.cmake
deleted file mode 100644
index 10830ff1ac57e..0000000000000
--- a/flang-rt/cmake/modules/FlangRTIntrospection.cmake
+++ /dev/null
@@ -1,36 +0,0 @@
-#===-- cmake/modules/FlangRTIntrospection.cmake ----------------------------===#
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-#===------------------------------------------------------------------------===#
-
-include(CMakePushCheckState)
-
-# Check whether the Fortran compiler supports real(16)/quadmath types
-#
-# Implementation notes:
-#
-#  * FORTRAN_SUPPORTS_REAL16 can be set externally in a bootstrapping-runtimes
-#    build to ensure consistency of real(16) support between compiler and
-#    runtime.
-#
-#  * cmake_push_check_state/cmake_pop_check_state is insufficient to isolate
-#    a compiler introspection environment, see
-#    https://gitlab.kitware.com/cmake/cmake/-/issues/27419
-#    Additionally wrap it in a function namespace.
-function (check_fortran_quadmath_support)
-  cmake_push_check_state(RESET)
-  set(CMAKE_REQUIRED_FLAGS "-ffree-form")
-  set(CMAKE_TRY_COMPILE_TARGET_TYPE "STATIC_LIBRARY") # Skip link step
-  check_fortran_source_compiles([[
-      subroutine test_quadmath
-        real(16) :: var1
-      end
-    ]]
-    FORTRAN_SUPPORTS_REAL16
-  )
-  cmake_pop_check_state()
-endfunction ()
-
diff --git a/cmake/Modules/GetToolchainDirs.cmake b/flang-rt/cmake/modules/GetToolchainDirs.cmake
similarity index 94%
rename from cmake/Modules/GetToolchainDirs.cmake
rename to flang-rt/cmake/modules/GetToolchainDirs.cmake
index ce2f8c294b2bc..fba12502b5946 100644
--- a/cmake/Modules/GetToolchainDirs.cmake
+++ b/flang-rt/cmake/modules/GetToolchainDirs.cmake
@@ -47,17 +47,6 @@ function (get_toolchain_library_subdir outvar)
 endfunction ()
 
 
-# Corresponds to Flang's ToolChain::getDefaultIntrinsicModuleDir().
-function (get_toolchain_module_subdir outvar)
-  set(outval "finclude/flang")
-
-  get_toolchain_arch_dirname(arch_dirname)
-  set(outval "${outval}/${arch_dirname}")
-
-  set(${outvar} "${outval}" PARENT_SCOPE)
-endfunction ()
-
-
 # Corresponds to Clang's ToolChain::getOSLibName(). Adapted from Compiler-RT.
 function (get_toolchain_os_dirname outvar)
   if (ANDROID)
diff --git a/flang-rt/lib/CMakeLists.txt b/flang-rt/lib/CMakeLists.txt
index 8738ae7b9c4b6..aee51dcc9fa24 100644
--- a/flang-rt/lib/CMakeLists.txt
+++ b/flang-rt/lib/CMakeLists.txt
@@ -6,16 +6,11 @@
 #
 #===------------------------------------------------------------------------===#
 
-if (FLANG_RT_ENABLE_STATIC OR FLANG_RT_ENABLE_SHARED)
-  add_subdirectory(quadmath)
-  add_subdirectory(runtime)
-  if (FLANG_RT_INCLUDE_CUF)
-    add_subdirectory(cuda)
-  endif()
-else ()
-  # Generate modules files only, skip the libraries
-  add_subdirectory(runtime)
-endif ()
+add_subdirectory(quadmath)
+add_subdirectory(runtime)
+if (FLANG_RT_INCLUDE_CUF)
+  add_subdirectory(cuda)
+endif()
 
 if (FLANG_RT_INCLUDE_TESTS)
   add_subdirectory(Testing)
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index 4be679d1ca8ac..e8f70bd544e0b 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -12,30 +12,6 @@ find_package(Backtrace)
 set(HAVE_BACKTRACE ${Backtrace_FOUND})
 set(BACKTRACE_HEADER ${Backtrace_HEADER})
 
-# Module sources that are required by other modules
-set(intrinsics_sources
-  __fortran_builtins.f90
-  __cuda_builtins.f90
-)
-
-# Fortran sources for builtin .mod files
-set(module_sources
-  __fortran_ieee_exceptions.f90
-  __fortran_type_info.f90
-  flang_debug.f90
-  iso_fortran_env.f90
-  ieee_arithmetic.f90
-  ieee_exceptions.f90
-  ieee_features.f90
-  iso_c_binding.f90
-  iso_fortran_env.f90
-  iso_fortran_env_impl.f90
-
-  __cuda_device.f90
-  cooperative_groups.f90
-  cudadevice.f90
-)
-
 # List of files that are buildable for all devices.
 set(supported_sources
   ${FLANG_SOURCE_DIR}/lib/Decimal/binary-to-decimal.cpp
@@ -97,6 +73,7 @@ set(supported_sources
 
 # List of source not used for GPU offloading.
 set(host_sources
+  ${FLANG_SOURCE_DIR}/module/iso_fortran_env_impl.f90
   command.cpp
   complex-powi.cpp
   complex-reduction.c
@@ -172,91 +149,45 @@ file(GLOB_RECURSE private_headers
   "${FLANG_SOURCE_DIR}/lib/Common/*.h"
   )
 
-if (LLVM_TARGET_TRIPLE MATCHES "^ppc|^powerpc")
-  list(APPEND intrinsics_sources
-    __ppc_types.f90
-  )
-  list(APPEND module_sources
-    __ppc_intrinsics.f90
-    mma.f90
-  )
-endif ()
-
-# Compile as CUDA-Fortran, not directly supported by CMake
-set_property(SOURCE
-    __cuda_device.f90
-    cooperative_groups.f90
-    cudadevice.f90
-  APPEND PROPERTY
-    COMPILE_OPTIONS --offload-host-only -xcuda
-)
 
 # Import changes from flang_rt.quadmath
-set(f128_sources "")
-if (TARGET FortranFloat128MathILib)
-  get_target_property(_propval
-    FortranFloat128MathILib INTERFACE_SOURCES
+get_target_property(f128_sources
+  FortranFloat128MathILib INTERFACE_SOURCES
+  )
+if (f128_sources)
+  # The interface may define special macros for Float128Math files,
+  # so we need to propagate them.
+  get_target_property(f128_defs
+    FortranFloat128MathILib INTERFACE_COMPILE_DEFINITIONS
     )
-  if (_propval)
-    set(f128_sources "${_propval}")
-    # The interface may define special macros for Float128Math files,
-    # so we need to propagate them.
-    get_target_property(f128_defs
-      FortranFloat128MathILib INTERFACE_COMPILE_DEFINITIONS
-      )
-    set_property(SOURCE ${f128_sources}
-      APPEND PROPERTY COMPILE_DEFINITIONS
-      ${f128_defs}
-      )
-    get_target_property(f128_include_dirs
-      FortranFloat128MathILib INTERFACE_INCLUDE_DIRECTORIES
-      )
-    set_property(SOURCE ${f128_sources}
-      APPEND PROPERTY INCLUDE_DIRECTORIES
-      ${f128_include_dirs}
-      )
-  endif ()
+  set_property(SOURCE ${f128_sources}
+    APPEND PROPERTY COMPILE_DEFINITIONS
+    ${f128_defs}
+    )
+  get_target_property(f128_include_dirs
+    FortranFloat128MathILib INTERFACE_INCLUDE_DIRECTORIES
+    )
+  set_property(SOURCE ${f128_sources}
+    APPEND PROPERTY INCLUDE_DIRECTORIES
+    ${f128_include_dirs}
+    )
+else ()
+  set(f128_sources "")
 endif ()
 
-if (NOT FLANG_RT_ENABLE_STATIC AND NOT FLANG_RT_ENABLE_SHARED)
-  # If not compiling the library, only build the modules.
-  set(sources ${module_sources})
-elseif (LLVM_RUNTIMES_TARGET MATCHES "^amdgcn|^nvptx")
-  set(sources ${gpu_sources} ${module_sources})
+if ("${LLVM_RUNTIMES_TARGET}" MATCHES "^amdgcn|^nvptx")
+  set(sources ${gpu_sources})
+elseif(FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "CUDA")
+  set(sources ${supported_sources})
 else ()
-  set(sources ${supported_sources} ${host_sources} ${module_sources} ${f128_sources})
+  set(sources ${supported_sources} ${host_sources} ${f128_sources})
 endif ()
 
 
-# check-flang depends on flang-rt-mod to build intrinsic modules
-if (NOT TARGET flang-rt-mod)
-  add_custom_target(flang-rt-mod)
-endif ()
-
 if (NOT WIN32)
-  # CMake ignores intrinsic USE dependencies
-  # CMake has an option Fortran_BUILDING_INSTRINSIC_MODULES/Fortran_BUILDING_INTRINSIC_MODULES
-  # to disable this behavior, unfortunately it does not work with Ninja
-  # (https://gitlab.kitware.com/cmake/cmake/-/issues/26803)
-  # As a workaround, we build those intrinsic modules first such that the main
-  # runtime can depend on it.
-  add_flangrt_library(flang_rt.intrinsics.obj OBJECT
-    ${intrinsics_sources}
-  )
-
-  # This barrier exists to force all of the intrinsic modules of
-  # flang_rt.intrinsics.obj to be built before anything that depends on it.
-  # Without it, CMake/Ninja seem to think that the modules of
-  # flang_rt.intrinsics.obj can be built concurrently to those in
-  # flang_rt.runtime.
-  add_custom_target(flang_rt.intrinsics
-    COMMENT "Intrinsic module dependency barrier"
-  )
-  add_dependencies(flang_rt.intrinsics flang_rt.intrinsics.obj)
-
   add_flangrt_library(flang_rt.runtime STATIC SHARED
-    ${sources} $<TARGET_OBJECTS:flang_rt.intrinsics.obj>
-    LINK_LIBRARIES flang_rt.intrinsics.obj ${Backtrace_LIBRARY}
+    ${sources}
+    LINK_LIBRARIES ${Backtrace_LIBRARY}
     INSTALL_WITH_TOOLCHAIN
     ADDITIONAL_HEADERS ${public_headers} ${private_headers}
   )
@@ -265,17 +196,8 @@ if (NOT WIN32)
   enable_omp_offload_compilation(flang_rt.runtime "${supported_sources}")
 
   # Select a default runtime, which is used for unit and regression tests.
-  if (FLANG_RT_INCLUDE_TESTS)
-    get_target_property(default_target flang_rt.runtime.default ALIASED_TARGET)
-    add_library(flang_rt.runtime.unittest ALIAS "${default_target}")
-  endif ()
-
-  # Select a target that compiles the sources to build the public module files.
-  get_target_property(compile_target flang_rt.runtime.compile ALIASED_TARGET)
-  flang_module_target(flang_rt.intrinsics.obj PUBLIC)
-  flang_module_target(${compile_target} PUBLIC)
-  add_dependencies(${compile_target} flang_rt.intrinsics)
-  add_dependencies(flang-rt-mod flang_rt.intrinsics ${compile_target})
+  get_target_property(default_target flang_rt.runtime.default ALIASED_TARGET)
+  add_library(flang_rt.runtime.unittest ALIAS "${default_target}")
 else()
   # Target for building all versions of the runtime
   add_custom_target(flang_rt.runtime)
@@ -283,23 +205,12 @@ else()
 
   function (add_win_flangrt_runtime libtype suffix msvc_lib)
     set(name "flang_rt.runtime.${suffix}")
-
-    add_flangrt_library(${name}.intrinsics.obj OBJECT
-      ${intrinsics_sources}
-    )
-    add_custom_target(${name}.intrinsics
-      COMMAND echo "${name} Dependency barrier"
-      COMMENT "Intrinsic module dependency barrier"
-    )
-    add_dependencies(${name}.intrinsics ${name}.intrinsics.obj)
-
     add_flangrt_library(${name} ${libtype}
-        ${sources} $<TARGET_OBJECTS:${name}.intrinsics.obj>
+        ${sources}
         ${ARGN}
-        LINK_LIBRARIES ${name}.intrinsics.obj ${Backtrace_LIBRARY}
+        LINK_LIBRARIES ${Backtrace_LIBRARY}
         ADDITIONAL_HEADERS ${public_headers} ${private_headers}
       )
-    get_target_property(compile_target ${name}.compile ALIASED_TARGET)
 
     if (msvc_lib)
       set_target_properties(${name}
@@ -309,19 +220,11 @@ else()
     endif ()
 
     # Setting an unique Fortran_MODULE_DIRECTORY is required for each variant to
-    # write a different .mod file. One of them has to be selected to be the
-    # public module that is to be installed. We select the first.
-    if (_has_public_intrinsics)
-      set(is_public "")
-    else ()
-      set(is_public PUBLIC)
-      add_dependencies(flang-rt-mod ${name}.intrinsics ${compile_target})
-      set(_has_public_intrinsics "YES" PARENT_SCOPE)
-    endif ()
-
-    flang_module_target(${name}.intrinsics.obj ${is_public})
-    flang_module_target(${compile_target} ${is_public})
-    add_dependencies(${compile_target} ${name}.intrinsics)
+    # write a different .mod file.
+    set_target_properties(${name}
+        PROPERTIES
+          Fortran_MODULE_DIRECTORY "module.${suffix}"
+      )
 
     enable_cuda_compilation(${name} "${supported_sources}")
     enable_omp_offload_compilation(${name} "${supported_sources}")
diff --git a/flang-rt/test/lit.site.cfg.py.in b/flang-rt/test/lit.site.cfg.py.in
index 0e9dc08b59925..662d076b1fe24 100644
--- a/flang-rt/test/lit.site.cfg.py.in
+++ b/flang-rt/test/lit.site.cfg.py.in
@@ -6,7 +6,7 @@ config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
 config.flang_source_dir = "@FLANG_SOURCE_DIR@"
 config.flang_rt_source_dir = "@FLANG_RT_SOURCE_DIR@"
 config.flang_rt_binary_test_dir = os.path.dirname(__file__)
-config.flang_rt_output_resource_lib_dir = "@RUNTIMES_OUTPUT_RESOURCE_LIB_DIR@"
+config.flang_rt_output_resource_lib_dir = "@FLANG_RT_OUTPUT_RESOURCE_LIB_DIR@"
 config.flang_rt_experimental_offload_support = "@FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT@"
 config.cc = "@CMAKE_C_COMPILER@"
 config.flang = "@CMAKE_Fortran_COMPILER@"
diff --git a/flang-rt/unittests/CMakeLists.txt b/flang-rt/unittests/CMakeLists.txt
index 75b76e93cb56d..e1ab73d7d9301 100644
--- a/flang-rt/unittests/CMakeLists.txt
+++ b/flang-rt/unittests/CMakeLists.txt
@@ -49,8 +49,9 @@ function(add_flangrt_unittest_offload_properties target)
   # FIXME: replace 'native' in --offload-arch option with the list
   #        of targets that Fortran Runtime was built for.
   if (FLANG_RT_EXPERIMENTAL_OFFLOAD_SUPPORT STREQUAL "OpenMP")
-    set_property(TARGET ${target} APPEND
-      PROPERTY LINK_OPTIONS -fopenmp --offload-arch=native
+    set_target_properties(${target}
+      PROPERTIES LINK_OPTIONS
+      "-fopenmp;--offload-arch=native"
       )
   endif()
 endfunction()
diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index df38846998b9e..c01eb56d5e496 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -273,6 +273,7 @@ set(FLANG_TOOLS_INSTALL_DIR "${CMAKE_INSTALL_BINDIR}" CACHE PATH
     "Path for binary subdirectory (defaults to '${CMAKE_INSTALL_BINDIR}')")
 mark_as_advanced(FLANG_TOOLS_INSTALL_DIR)
 
+set(FLANG_INTRINSIC_MODULES_DIR ${CMAKE_BINARY_DIR}/include/flang)
 set(FLANG_INCLUDE_DIR ${FLANG_BINARY_DIR}/include)
 
 # TODO: Remove when libclangDriver is lifted out of Clang
diff --git a/flang/include/flang/Frontend/CompilerInvocation.h b/flang/include/flang/Frontend/CompilerInvocation.h
index feaee28a53349..d294955af780e 100644
--- a/flang/include/flang/Frontend/CompilerInvocation.h
+++ b/flang/include/flang/Frontend/CompilerInvocation.h
@@ -92,10 +92,6 @@ class CompilerInvocation : public CompilerInvocationBase {
   // intrinsic of iso_fortran_env.
   std::string allCompilerInvocOpts;
 
-  /// Location of the resource directory containing files specific to this
-  /// instance/version of Flang.
-  std::string resourceDir;
-
   /// Semantic options
   // TODO: Merge with or translate to frontendOpts. We shouldn't need two sets
   // of options.
@@ -181,9 +177,6 @@ class CompilerInvocation : public CompilerInvocationBase {
   getSemanticsCtx(Fortran::parser::AllCookedSources &allCookedSources,
                   const llvm::TargetMachine &);
 
-  std::string &getResourceDir() { return resourceDir; }
-  const std::string &getResourceDir() const { return resourceDir; }
-
   std::string &getModuleDir() { return moduleDir; }
   const std::string &getModuleDir() const { return moduleDir; }
 
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index ace24e09d86ec..b6c4e6303cdac 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -884,6 +884,16 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
   return diags.getNumErrors() == numErrorsBefore;
 }
 
+// Generate the path to look for intrinsic modules
+static std::string getIntrinsicDir(const char *argv) {
+  // TODO: Find a system independent API
+  llvm::SmallString<128> driverPath;
+  driverPath.assign(llvm::sys::fs::getMainExecutable(argv, nullptr));
+  llvm::sys::path::remove_filename(driverPath);
+  driverPath.append("/../include/flang/");
+  return std::string(driverPath);
+}
+
 // Generate the path to look for OpenMP headers
 static std::string getOpenMPHeadersDir(const char *argv) {
   llvm::SmallString<128> includePath;
@@ -1557,14 +1567,6 @@ bool CompilerInvocation::createFromArgs(
     success = false;
   }
 
-  // User-specified or default resource dir
-  if (const llvm::opt::Arg *a =
-          args.getLastArg(clang::options::OPT_resource_dir))
-    invoc.resourceDir = a->getValue();
-  else
-    invoc.resourceDir = clang::GetResourcesPath(
-        llvm::sys::fs::getMainExecutable(argv0, nullptr));
-
   // -flang-experimental-hlfir
   if (args.hasArg(clang::options::OPT_flang_experimental_hlfir) ||
       args.hasArg(clang::options::OPT_emit_hlfir)) {
@@ -1831,11 +1833,9 @@ void CompilerInvocation::setFortranOpts() {
       preprocessorOptions.searchDirectoriesFromIntrModPath.begin(),
       preprocessorOptions.searchDirectoriesFromIntrModPath.end());
 
-  // Add the ordered list of -fintrinsic-modules-path
-  fortranOptions.intrinsicModuleDirectories.insert(
-      fortranOptions.intrinsicModuleDirectories.end(),
-      preprocessorOptions.searchDirectoriesFromIntrModPath.begin(),
-      preprocessorOptions.searchDirectoriesFromIntrModPath.end());
+  //  Add the default intrinsic module directory
+  fortranOptions.intrinsicModuleDirectories.emplace_back(
+      getIntrinsicDir(getArgv0()));
 
   // Add the directory supplied through -J/-module-dir to the list of search
   // directories
diff --git a/flang/lib/Semantics/semantics.cpp b/flang/lib/Semantics/semantics.cpp
index f4b90bf01abe3..2606d997b1cd7 100644
--- a/flang/lib/Semantics/semantics.cpp
+++ b/flang/lib/Semantics/semantics.cpp
@@ -621,15 +621,12 @@ bool Semantics::Perform() {
     const auto *frontModule{std::get_if<common::Indirection<parser::Module>>(
         &program_.v.front().u)};
     if (frontModule &&
-        std::get<parser::Statement<parser::ModuleStmt>>(frontModule->value().t)
-                .statement.v.source == "__fortran_builtins") {
+        (std::get<parser::Statement<parser::ModuleStmt>>(frontModule->value().t)
+                    .statement.v.source == "__fortran_builtins" ||
+            std::get<parser::Statement<parser::ModuleStmt>>(
+                frontModule->value().t)
+                    .statement.v.source == "__ppc_types")) {
       // Don't try to read the builtins module when we're actually building it.
-    } else if (frontModule &&
-        std::get<parser::Statement<parser::ModuleStmt>>(frontModule->value().t)
-                .statement.v.source == "__ppc_types") {
-      // Don't try to read the UsePPCBuiltinTypesModule() we are currently
-      // building, but __fortran_builtins is needed to build it.
-      context_.UseFortranBuiltinsModule();
     } else if (frontModule &&
         (std::get<parser::Statement<parser::ModuleStmt>>(frontModule->value().t)
                     .statement.v.source == "__ppc_intrinsics" ||
diff --git a/flang/module/.clang-format b/flang/module/.clang-format
new file mode 100644
index 0000000000000..e3845288a2aec
--- /dev/null
+++ b/flang/module/.clang-format
@@ -0,0 +1 @@
+DisableFormat: true
diff --git a/flang-rt/lib/runtime/__cuda_builtins.f90 b/flang/module/__cuda_builtins.f90
similarity index 100%
rename from flang-rt/lib/runtime/__cuda_builtins.f90
rename to flang/module/__cuda_builtins.f90
diff --git a/flang-rt/lib/runtime/__cuda_device.f90 b/flang/module/__cuda_device.f90
similarity index 100%
rename from flang-rt/lib/runtime/__cuda_device.f90
rename to flang/module/__cuda_device.f90
diff --git a/flang-rt/lib/runtime/__fortran_builtins.f90 b/flang/module/__fortran_builtins.f90
similarity index 99%
rename from flang-rt/lib/runtime/__fortran_builtins.f90
rename to flang/module/__fortran_builtins.f90
index cede255de7c2c..a9b60508785db 100644
--- a/flang-rt/lib/runtime/__fortran_builtins.f90
+++ b/flang/module/__fortran_builtins.f90
@@ -6,7 +6,7 @@
 !
 !===------------------------------------------------------------------------===!
 
-#include '../../../flang/include/flang/Runtime/magic-numbers.h'
+#include '../include/flang/Runtime/magic-numbers.h'
 
 ! These naming shenanigans prevent names from Fortran intrinsic modules
 ! from being usable on INTRINSIC statements, and force the program
diff --git a/flang-rt/lib/runtime/__fortran_ieee_exceptions.f90 b/flang/module/__fortran_ieee_exceptions.f90
similarity index 98%
rename from flang-rt/lib/runtime/__fortran_ieee_exceptions.f90
rename to flang/module/__fortran_ieee_exceptions.f90
index ff5c6b44317f8..3ac9b993186aa 100644
--- a/flang-rt/lib/runtime/__fortran_ieee_exceptions.f90
+++ b/flang/module/__fortran_ieee_exceptions.f90
@@ -11,7 +11,7 @@
 ! here under another name so that IEEE_ARITHMETIC can USE it and export its
 ! declarations without clashing with a non-intrinsic module in a program.
 
-#include '../../../flang/include/flang/Runtime/magic-numbers.h'
+#include '../include/flang/Runtime/magic-numbers.h'
 
 module __fortran_ieee_exceptions
   use __fortran_builtins, only: &
diff --git a/flang-rt/lib/runtime/__fortran_type_info.f90 b/flang/module/__fortran_type_info.f90
similarity index 100%
rename from flang-rt/lib/runtime/__fortran_type_info.f90
rename to flang/module/__fortran_type_info.f90
diff --git a/flang-rt/lib/runtime/__ppc_intrinsics.f90 b/flang/module/__ppc_intrinsics.f90
similarity index 100%
rename from flang-rt/lib/runtime/__ppc_intrinsics.f90
rename to flang/module/__ppc_intrinsics.f90
diff --git a/flang-rt/lib/runtime/__ppc_types.f90 b/flang/module/__ppc_types.f90
similarity index 100%
rename from flang-rt/lib/runtime/__ppc_types.f90
rename to flang/module/__ppc_types.f90
diff --git a/flang-rt/lib/runtime/cooperative_groups.f90 b/flang/module/cooperative_groups.f90
similarity index 96%
rename from flang-rt/lib/runtime/cooperative_groups.f90
rename to flang/module/cooperative_groups.f90
index 5ca0c3aa1f3a5..8bb4af3afa791 100644
--- a/flang-rt/lib/runtime/cooperative_groups.f90
+++ b/flang/module/cooperative_groups.f90
@@ -11,7 +11,6 @@
 module cooperative_groups
 
 use, intrinsic :: __fortran_builtins, only: c_devptr => __builtin_c_devptr
-use :: cudadevice ! implicit dependency, made explicit for CMake
 
 implicit none
 
diff --git a/flang-rt/lib/runtime/cudadevice.f90 b/flang/module/cudadevice.f90
similarity index 100%
rename from flang-rt/lib/runtime/cudadevice.f90
rename to flang/module/cudadevice.f90
diff --git a/flang-rt/lib/runtime/flang_debug.f90 b/flang/module/flang_debug.f90
similarity index 100%
rename from flang-rt/lib/runtime/flang_debug.f90
rename to flang/module/flang_debug.f90
diff --git a/flang-rt/lib/runtime/ieee_arithmetic.f90 b/flang/module/ieee_arithmetic.f90
similarity index 99%
rename from flang-rt/lib/runtime/ieee_arithmetic.f90
rename to flang/module/ieee_arithmetic.f90
index 02cfae2dc6b18..4e938a2daaa91 100644
--- a/flang-rt/lib/runtime/ieee_arithmetic.f90
+++ b/flang/module/ieee_arithmetic.f90
@@ -8,7 +8,7 @@
 
 ! Fortran 2018 Clause 17
 
-#include '../../../flang/include/flang/Runtime/magic-numbers.h'
+#include '../include/flang/Runtime/magic-numbers.h'
 
 module ieee_arithmetic
   ! F18 Clause 17.1p1:
diff --git a/flang-rt/lib/runtime/ieee_exceptions.f90 b/flang/module/ieee_exceptions.f90
similarity index 100%
rename from flang-rt/lib/runtime/ieee_exceptions.f90
rename to flang/module/ieee_exceptions.f90
diff --git a/flang-rt/lib/runtime/ieee_features.f90 b/flang/module/ieee_features.f90
similarity index 100%
rename from flang-rt/lib/runtime/ieee_features.f90
rename to flang/module/ieee_features.f90
diff --git a/flang-rt/lib/runtime/iso_c_binding.f90 b/flang/module/iso_c_binding.f90
similarity index 100%
rename from flang-rt/lib/runtime/iso_c_binding.f90
rename to flang/module/iso_c_binding.f90
diff --git a/flang-rt/lib/runtime/iso_fortran_env.f90 b/flang/module/iso_fortran_env.f90
similarity index 98%
rename from flang-rt/lib/runtime/iso_fortran_env.f90
rename to flang/module/iso_fortran_env.f90
index 2dc38bd1acfe5..3729b95a339f3 100644
--- a/flang-rt/lib/runtime/iso_fortran_env.f90
+++ b/flang/module/iso_fortran_env.f90
@@ -8,7 +8,7 @@
 
 ! See Fortran 2023, subclause 16.10.2
 
-#include '../../../flang/include/flang/Runtime/magic-numbers.h'
+#include '../include/flang/Runtime/magic-numbers.h'
 
 module iso_fortran_env
 
diff --git a/flang-rt/lib/runtime/iso_fortran_env_impl.f90 b/flang/module/iso_fortran_env_impl.f90
similarity index 100%
rename from flang-rt/lib/runtime/iso_fortran_env_impl.f90
rename to flang/module/iso_fortran_env_impl.f90
diff --git a/flang-rt/lib/runtime/mma.f90 b/flang/module/mma.f90
similarity index 100%
rename from flang-rt/lib/runtime/mma.f90
rename to flang/module/mma.f90
diff --git a/flang/test/CMakeLists.txt b/flang/test/CMakeLists.txt
index c43ef8b29b30d..8c8e92faa787a 100644
--- a/flang/test/CMakeLists.txt
+++ b/flang/test/CMakeLists.txt
@@ -2,31 +2,11 @@
 # for use by Lit, and delegates to LLVM's lit test handlers.
 add_subdirectory(lib)
 
-set(FLANG_TEST_Fortran_FLAGS "" CACHE STRING "Additional Fortran flags for running tests, such as -fintrinsic-modules-path=<path>")
-
-if ("flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES)
-  set(FLANG_TEST_ENABLE_MODULES_default ON)
-else ()
-  set(FLANG_TEST_ENABLE_MODULES_default OFF)
-endif ()
-option(FLANG_TEST_ENABLE_MODULES "Force-enable tests that require intrinsic modules from Flang-RT" "${FLANG_TEST_ENABLE_MODULES_default}")
-
-
-if ("openmp" IN_LIST LLVM_ENABLE_RUNTIMES AND FLANG_TEST_ENABLE_MODULES AND NOT FLANG_STANDALONE_BUILD)
-  set(FLANG_TEST_ENABLE_OPENMP_default ON)
-else ()
-  set(FLANG_TEST_ENABLE_OPENMP_default OFF)
-endif ()
-option(FLANG_TEST_ENABLE_OPENMP "Force-enable tests that require modules from OpenMP" "${FLANG_TEST_ENABLE_OPENMP_default}")
-
-
 llvm_canonicalize_cmake_booleans(
   FLANG_STANDALONE_BUILD
   LLVM_BUILD_EXAMPLES
   LLVM_BYE_LINK_INTO_TOOLS
   LLVM_ENABLE_PLUGINS
-  FLANG_TEST_ENABLE_MODULES
-  FLANG_TEST_ENABLE_OPENMP
 )
 
 set(FLANG_TOOLS_DIR ${FLANG_BINARY_DIR}/bin)
@@ -79,6 +59,7 @@ set(FLANG_TEST_PARAMS
 
 set(FLANG_TEST_DEPENDS
   flang
+  module_files
   fir-opt
   tco
   bbc
@@ -120,14 +101,8 @@ if (LLVM_BUILD_EXAMPLES)
     )
 endif ()
 
-if ("flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES AND NOT FLANG_STANDALONE_BUILD)
-  # For intrinsic module files (in flang-rt/)
-  list(APPEND FLANG_TEST_DEPENDS "flang-rt-mod")
-
-  if ("openmp" IN_LIST LLVM_ENABLE_RUNTIMES)
-    # For omplib.mod and omplib_kinds.mod (in openmp/)
-    list(APPEND FLANG_TEST_DEPENDS "libomp-mod")
-  endif ()
+if ("openmp" IN_LIST LLVM_ENABLE_RUNTIMES AND NOT FLANG_STANDALONE_BUILD)
+  list(APPEND FLANG_TEST_DEPENDS "libomp-mod")
 endif ()
 
 add_custom_target(flang-test-depends DEPENDS ${FLANG_TEST_DEPENDS})
diff --git a/flang/test/Driver/Inputs/ieee_arithmetic.mod b/flang/test/Driver/Inputs/ieee_arithmetic.mod
index 264ff8d035628..30fd57801970b 100644
--- a/flang/test/Driver/Inputs/ieee_arithmetic.mod
+++ b/flang/test/Driver/Inputs/ieee_arithmetic.mod
@@ -1,6 +1,5 @@
 ! DUMMY module
 ! Added for testing purposes. The contents of this file are currently not relevant.
-! Using this file will cause an error because of missing checksum
 module ieee_arithmetic
 type::ieee_round_type
 integer(1),private::mode=0_1
diff --git a/flang/test/Driver/Inputs/iso_fortran_env.mod b/flang/test/Driver/Inputs/iso_fortran_env.mod
index c53375d78dec5..689297d52027b 100644
--- a/flang/test/Driver/Inputs/iso_fortran_env.mod
+++ b/flang/test/Driver/Inputs/iso_fortran_env.mod
@@ -1,6 +1,5 @@
 ! DUMMY module
 ! Added for testing purposes. The contents of this file are currently not relevant.
-! Using this file will cause an error because of missing checksum
 module iso_fortran_env
 use __fortran_builtins,only:event_type=>__builtin_event_type
 use __fortran_builtins,only:lock_type=>__builtin_lock_type
diff --git a/flang/test/Driver/intrinsic-module-path.F90 b/flang/test/Driver/intrinsic-module-path.F90
deleted file mode 100644
index 3317eb776f0a1..0000000000000
--- a/flang/test/Driver/intrinsic-module-path.F90
+++ /dev/null
@@ -1,55 +0,0 @@
-! Ensure argument -fintrinsic-modules-path works as expected.
-
-!-----------------------------------------
-! FLANG DRIVER
-!-----------------------------------------
-! NOTE: Depending on how Flang is built, the default intrinsics may have higher
-!       or lower priority than -fintrinsic-modules-path added here. Using
-!       basictestmoduleone.mod from Inputs/module-dir/ will trigger an error.
-
-! RUN:     %flang -fsyntax-only -### %s 2>&1 | FileCheck %s --check-prefix=DEFAULTPATH
-
-! RUN:     %flang -fsyntax-only -DINTRINSICS_DEFAULT %s
-! RUN: not %flang -fsyntax-only -DINTRINSICS_INPUTONE %s 2>&1 | FileCheck %s --check-prefix=NOINPUTONE
-! RUN: not %flang -fsyntax-only -DINTRINSICS_INPUTTWO %s 2>&1 | FileCheck %s --check-prefix=NOINPUTTWO
-! RUN:     %flang -fsyntax-only -DINTRINSICS_DEFAULT -DINTRINSICS_INPUTTWO -fintrinsic-modules-path=%S/Inputs/module-dir/ %s
-! RUN:     %flang -fsyntax-only -DINTRINSICS_INPUTONE -fintrinsic-modules-path=%S/Inputs/ %s
-! RUN:     %flang -fsyntax-only -DINTRINSICS_INPUTONE -DINTRINSICS_INPUTTWO -fintrinsic-modules-path=%S/Inputs/ -fintrinsic-modules-path=%S/Inputs/module-dir/ %s
-! RUN: not %flang -fsyntax-only -DINTRINSICS_INPUTONE -DINTRINSICS_INPUTTWO -fintrinsic-modules-path=%S/Inputs/module-dir/ -fintrinsic-modules-path=%S/Inputs/ %s 2>&1 | FileCheck %s --check-prefix=WRONGINPUTONE
-
-
-!-----------------------------------------
-! FLANG FRONTEND (flang -fc1)
-!-----------------------------------------
-! NOTE: %flang_cc1 the default intrinsics path always has higher priority than
-!       -fintrinsic-modules-path added here. Accidentally using
-!       ieee_arithmetic/iso_fortran_env from the Inputs/ directory will trigger
-!       an error (e.g. when the default intrinsics dir is empty).
-
-! RUN:     %flang_fc1 -fsyntax-only -DINTRINSICS_DEFAULT %s
-! RUN: not %flang_fc1 -fsyntax-only -DINTRINSICS_DEFAULT -DINTRINSICS_INPUTONE %s 2>&1 | FileCheck %s --check-prefix=NOINPUTONE
-! RUN: not %flang_fc1 -fsyntax-only -DINTRINSICS_DEFAULT -DINTRINSICS_INPUTTWO %s 2>&1 | FileCheck %s --check-prefix=NOINPUTTWO
-! RUN:     %flang_fc1 -fsyntax-only -DINTRINSICS_DEFAULT -DINTRINSICS_INPUTTWO -fintrinsic-modules-path=%S/Inputs/module-dir %s
-! RUN:     %flang_fc1 -fsyntax-only -DINTRINSICS_DEFAULT -DINTRINSICS_INPUTONE -fintrinsic-modules-path=%S/Inputs/ %s
-! RUN:     %flang_fc1 -fsyntax-only -DINTRINSICS_DEFAULT -DINTRINSICS_INPUTONE -DINTRINSICS_INPUTTWO -fintrinsic-modules-path=%S/Inputs/ -fintrinsic-modules-path=%S/Inputs/module-dir/ %s
-! RUN: not %flang_fc1 -fsyntax-only -DINTRINSICS_DEFAULT -DINTRINSICS_INPUTONE -DINTRINSICS_INPUTTWO -fintrinsic-modules-path=%S/Inputs/module-dir -fintrinsic-modules-path=%S/Inputs/ %s 2>&1 | FileCheck %s --check-prefix=WRONGINPUTONE
-
-
-! DEFAULTPATH: flang{{.*}}-fc1{{.*}}-fintrinsic-modules-path
-
-! NOINPUTONE: Source file 'basictestmoduleone.mod' was not found
-! NOINPUTTWO: Source file 'basictestmoduletwo.mod' was not found
-! WRONGINPUTONE: 't1' not found in module 'basictestmoduleone'
-
-program test_intrinsic_module_path
-#ifdef INTRINSICS_DEFAULT
-   use ieee_arithmetic, only: ieee_round_type
-   use iso_fortran_env, only: team_type, event_type, lock_type
-#endif
-#ifdef INTRINSICS_INPUTONE
-   use basictestmoduleone, only: t1
-#endif
-#ifdef INTRINSICS_INPUTTWO
-   use basictestmoduletwo, only: t2
-#endif
-end program
diff --git a/flang/test/Driver/intrinsic-module-path.f90 b/flang/test/Driver/intrinsic-module-path.f90
new file mode 100644
index 0000000000000..615d8f9a1730a
--- /dev/null
+++ b/flang/test/Driver/intrinsic-module-path.f90
@@ -0,0 +1,23 @@
+! Ensure argument -fintrinsic-modules-path works as expected.
+! WITHOUT the option, the default location for the module is checked and no error generated.
+! With the option GIVEN, the module with the same name is PREPENDED, and considered over the
+! default one, causing a CHECKSUM error.
+
+!-----------------------------------------
+! FRONTEND FLANG DRIVER (flang -fc1)
+!-----------------------------------------
+! RUN: %flang_fc1 -fsyntax-only %s  2>&1 | FileCheck %s --allow-empty --check-prefix=WITHOUT
+! RUN: not %flang_fc1 -fsyntax-only -fintrinsic-modules-path %S/Inputs/ %s  2>&1 | FileCheck %s --check-prefix=GIVEN
+! RUN: not %flang_fc1 -fsyntax-only -fintrinsic-modules-path=%S/Inputs/ %s  2>&1 | FileCheck %s --check-prefix=GIVEN
+
+! WITHOUT-NOT: 'ieee_arithmetic.mod' was not found
+! WITHOUT-NOT: 'iso_fortran_env.mod' was not found
+
+! GIVEN: error: Cannot use module file for module 'ieee_arithmetic': File has invalid checksum
+! GIVEN: error: Cannot use module file for module 'iso_fortran_env': File has invalid checksum
+
+
+program test_intrinsic_module_path
+   use ieee_arithmetic, only: ieee_round_type
+   use iso_fortran_env, only: team_type, event_type, lock_type
+end program
diff --git a/flang/test/Driver/lto-fatlto.f90 b/flang/test/Driver/lto-fatlto.f90
index 9adf264f114dc..2ea251eafacbf 100644
--- a/flang/test/Driver/lto-fatlto.f90
+++ b/flang/test/Driver/lto-fatlto.f90
@@ -1,7 +1,7 @@
 ! REQUIRES: x86-registered-target
 ! checks fatlto objects: that valid bitcode is included in the object file generated.
 
-! RUN: %flang_fc1 -triple x86_64-unknown-linux-gnu -flto -ffat-lto-objects -emit-obj %s -o %t.o
+! RUN: %flang -fc1 -triple x86_64-unknown-linux-gnu -flto -ffat-lto-objects -emit-obj %s -o %t.o
 ! RUN: llvm-readelf -S %t.o | FileCheck %s --check-prefixes=ELF
 ! RUN: llvm-objcopy --dump-section=.llvm.lto=%t.bc %t.o
 ! RUN: llvm-dis %t.bc -o - | FileCheck %s --check-prefixes=DIS
@@ -11,7 +11,7 @@
 ! DIS-NEXT:  ret void
 ! DIS-NEXT: }
 
-! RUN: %flang_fc1 -triple x86_64-unknown-linux-gnu -flto -ffat-lto-objects -S %s -o - | FileCheck %s --check-prefixes=ASM
+! RUN: %flang -fc1 -triple x86_64-unknown-linux-gnu -flto -ffat-lto-objects -S %s -o - | FileCheck %s --check-prefixes=ASM
 
 !      ASM: .section        .llvm.lto,"e",@llvm_lto
 ! ASM-NEXT: .Lllvm.embedded.object:
diff --git a/flang/test/Driver/pp-fixed-form.f90 b/flang/test/Driver/pp-fixed-form.f90
index fb9e1dd0d1e6b..bb869cd3341a7 100644
--- a/flang/test/Driver/pp-fixed-form.f90
+++ b/flang/test/Driver/pp-fixed-form.f90
@@ -8,12 +8,12 @@
 
 !RUN: %flang -save-temps -### -ffree-form %S/Inputs/free-form-test.f90  2>&1 | FileCheck %s --check-prefix=FREE-FLAG
 FREE-FLAG:           "-fc1" {{.*}} "-o" "free-form-test.i" {{.*}} "-x" "f95" "{{.*}}/free-form-test.f90"
-FREE-FLAG-NEXT:      "-fc1" {{.*}} "-emit-llvm-bc" {{.*}}"-ffree-form"
+FREE-FLAG-NEXT:      "-fc1" {{.*}} "-emit-llvm-bc" "-ffree-form"
 FREE-FLAG-NOT:       "-ffixed-form"
 FREE-FLAG-SAME:      "-x" "f95-cpp-input" "free-form-test.i"
 
 !RUN: %flang -save-temps -### -ffixed-form %S/Inputs/fixed-form-test.f  2>&1 | FileCheck %s --check-prefix=FIXED-FLAG
 FIXED-FLAG:          "-fc1" {{.*}} "-o" "fixed-form-test.i" {{.*}} "-x" "f95" "{{.*}}/fixed-form-test.f"
-FIXED-FLAG-NEXT:     "-fc1" {{.*}} "-emit-llvm-bc" {{.*}}"-ffixed-form"
+FIXED-FLAG-NEXT:     "-fc1" {{.*}} "-emit-llvm-bc" "-ffixed-form"
 FIXED-FLAG-NOT:      "-ffixed-form"
 FIXED-FLAG-SAME:     "-x" "f95-cpp-input" "fixed-form-test.i"
diff --git a/flang/test/Lower/HLFIR/type-bound-call-mismatch.f90 b/flang/test/Lower/HLFIR/type-bound-call-mismatch.f90
index 29a9784b984b6..2e0c72ccfe048 100644
--- a/flang/test/Lower/HLFIR/type-bound-call-mismatch.f90
+++ b/flang/test/Lower/HLFIR/type-bound-call-mismatch.f90
@@ -1,6 +1,6 @@
 ! Test interface that lowering handles small interface mismatch with
 ! type bound procedures.
-! RUN: %bbc_bare -emit-hlfir %s -o - -I nw | FileCheck %s
+! RUN: bbc -emit-hlfir %s -o - -I nw | FileCheck %s
 
 module dispatch_mismatch
 type t
diff --git a/flang/test/Lower/OpenMP/simd_aarch64.f90 b/flang/test/Lower/OpenMP/simd_aarch64.f90
index 2e4136273c75b..735237223bcb5 100644
--- a/flang/test/Lower/OpenMP/simd_aarch64.f90
+++ b/flang/test/Lower/OpenMP/simd_aarch64.f90
@@ -1,11 +1,6 @@
-! Tests for 2.9.3.1 Simd and target dependent default alignment for AArch64
+! Tests for 2.9.3.1 Simd and target dependent defult alignment for AArch64
 ! The default alignment for AARCH64 is 0 so we do not emit aligned clause
 ! REQUIRES: aarch64-registered-target
-
-! Requires aarch64 iso_c_binding.mod which currently is only available if your host is also aarch64
-! FIXME: Make flang a cross-compiler
-! UNSUPPORTED: true
-
 ! RUN: %flang_fc1 -triple aarch64-unknown-linux-gnu -emit-hlfir -fopenmp %s -o - | FileCheck  %s
 subroutine simdloop_aligned_cptr(A)
     use iso_c_binding
diff --git a/flang/test/Lower/OpenMP/target-enter-data-default-openmp52.f90 b/flang/test/Lower/OpenMP/target-enter-data-default-openmp52.f90
index df3f88eabbfc1..a717b38a76593 100644
--- a/flang/test/Lower/OpenMP/target-enter-data-default-openmp52.f90
+++ b/flang/test/Lower/OpenMP/target-enter-data-default-openmp52.f90
@@ -1,7 +1,7 @@
 ! This test checks the lowering and application of default map types for the target enter/exit data constructs and map clauses
 
-!RUN: %flang_fc1 -emit-fir -fopenmp -fopenmp-version=52 -o - %s | FileCheck %s --check-prefix=CHECK-52
-!RUN: not %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 -o - %s 2>&1| FileCheck %s --check-prefix=CHECK-51
+!RUN: %flang -fc1 -emit-fir -fopenmp -fopenmp-version=52 -o - %s | FileCheck %s --check-prefix=CHECK-52
+!RUN: not %flang -fc1 -emit-fir -fopenmp -fopenmp-version=51 -o - %s 2>&1| FileCheck %s --check-prefix=CHECK-51
 
 module test
   real, allocatable :: A
diff --git a/flang/test/Preprocessing/fixed-free.f b/flang/test/Preprocessing/fixed-free.f
index 7140bc6aec360..95f63a4d71e4c 100644
--- a/flang/test/Preprocessing/fixed-free.f
+++ b/flang/test/Preprocessing/fixed-free.f
@@ -1,5 +1,5 @@
 !RUN: %flang -E %s 2>&1 | FileCheck %s
-!RUN: %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck --allow-empty %s
+!RUN: %flang -fc1 -fsyntax-only %s 2>&1 | FileCheck --allow-empty %s
 !CHECK-NOT: dir$
 !CHECK-NOT: error:
 !dir$ fixed
diff --git a/flang/test/Preprocessing/no-pp-if.f90 b/flang/test/Preprocessing/no-pp-if.f90
index ab08a4f838a90..3e49df3deb251 100644
--- a/flang/test/Preprocessing/no-pp-if.f90
+++ b/flang/test/Preprocessing/no-pp-if.f90
@@ -1,4 +1,4 @@
-!RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
+!RUN: %flang -fc1 -fdebug-unparse %s 2>&1 | FileCheck %s
 !CHECK-NOT: ERROR STOP
 !CHECK: CONTINUE
 #if defined UNDEFINED
diff --git a/flang/test/Semantics/bug163242.f90 b/flang/test/Semantics/bug163242.f90
index 3c0a2b6b32229..5e020aeb4dc0d 100644
--- a/flang/test/Semantics/bug163242.f90
+++ b/flang/test/Semantics/bug163242.f90
@@ -1,4 +1,4 @@
-!RUN: %flang_fc1 -fsyntax-only %s | FileCheck --allow-empty %s
+!RUN: %flang -fc1 -fsyntax-only %s | FileCheck --allow-empty %s
 !CHECK-NOT: error:
 character(0), allocatable :: ch
 allocate(character(-1) :: ch)
diff --git a/flang/test/Semantics/bug164303.f90 b/flang/test/Semantics/bug164303.f90
index 39af27e914248..c356c07392577 100644
--- a/flang/test/Semantics/bug164303.f90
+++ b/flang/test/Semantics/bug164303.f90
@@ -1,4 +1,4 @@
-!RUN: %flang_fc1 -fsyntax-only %s 2>&1 | FileCheck --allow-empty %s
+!RUN: %flang -fc1 -fsyntax-only %s 2>&1 | FileCheck --allow-empty %s
 module foo_mod
   use, intrinsic :: iso_fortran_env
   use, intrinsic :: iso_c_binding
diff --git a/flang/test/lit.cfg.py b/flang/test/lit.cfg.py
index c3050dc876ec7..4221354df34a2 100644
--- a/flang/test/lit.cfg.py
+++ b/flang/test/lit.cfg.py
@@ -139,95 +139,18 @@
 if config.default_sysroot:
     config.available_features.add("default_sysroot")
 
-
-flang_exe = lit.util.which("flang", config.flang_llvm_tools_dir)
-if not flang_exe:
-    lit_config.fatal(f"Could not identify flang executable")
-
-# Intrinsic paths that are added implicitly by the `flang` driver, but have to be added manually when invoking the frontend `flang -fc1`.
-flang_driver_search_args = []
-
-# Intrinsic paths that are added to `flang` as well as `flang -fc1`.
-flang_extra_search_args = list(config.flang_test_fortran_flags)
-
-
-def get_resource_module_intrinsic_dir(modfile):
-    # Determine the intrinsic module search path that is added by the driver. If
-    # skipping the driver using -fc1, we need to append the path manually.
-    flang_intrinsics_dir = subprocess.check_output(
-        [flang_exe, *config.flang_test_fortran_flags, f"-print-file-name={modfile}"],
-        text=True,
-    ).strip()
-    flang_intrinsics_dir = os.path.dirname(flang_intrinsics_dir)
-    return flang_intrinsics_dir or None
-
-
-intrinsics_mod_path = get_resource_module_intrinsic_dir("__fortran_builtins.mod")
-if intrinsics_mod_path:
-    flang_driver_search_args += [f"-fintrinsic-modules-path={intrinsics_mod_path}"]
-
-openmp_mod_path = get_resource_module_intrinsic_dir("omp_lib.mod")
-if openmp_mod_path and openmp_mod_path != intrinsics_mod_path:
-    flang_driver_search_args += [f"-fintrinsic-modules-path={openmp_mod_path}"]
-
-
-# If intrinsic modules are not available, disable tests unless they are marked as 'module-independent'.
-config.available_features.add("module-independent")
-if config.flang_test_enable_modules or intrinsics_mod_path:
-    config.available_features.add("flangrt-modules")
-else:
-    lit_config.warning(
-        f"Intrinsic modules not in driver default paths: disabling most tests; Use FLANG_TEST_ENABLE_MODULES=ON to force-enable"
-    )
-    config.limit_to_features.add("module-independent")
-
-# Determine if OpenMP runtime was built (enable OpenMP tests via REQUIRES in test file)
-if config.flang_test_enable_openmp or openmp_mod_path:
-    config.available_features.add("openmp_runtime")
-
-    # Search path for omp_lib.h with LLVM_ENABLE_RUNTIMES=openmp
-    # FIXME: openmp should write this file into the resource directory
-    flang_extra_search_args += [
-        "-I",
-        f"{config.flang_obj_root}/../../runtimes/runtimes-bins/openmp/runtime/src",
-    ]
-else:
-    lit_config.warning(
-        f"OpenMP modules found not in driver default paths: OpenMP tests disabled; Use FLANG_TEST_ENABLE_OPENMP=ON to force-enable"
-    )
-
-
-lit_config.note(f"using flang: {flang_exe}")
-lit_config.note(
-    f"using flang implicit search paths: {' '.join(flang_driver_search_args)}"
-)
-lit_config.note(f"using flang extra search paths: {' '.join(flang_extra_search_args)}")
-
 # For each occurrence of a flang tool name, replace it with the full path to
 # the build directory holding that tool.
 tools = [
-    ToolSubst(
-        "bbc",
-        command=FindTool("bbc"),
-        extra_args=flang_driver_search_args + flang_extra_search_args,
-        unresolved="fatal",
-    ),
     ToolSubst(
         "%flang",
-        command=flang_exe,
-        extra_args=flang_extra_search_args,
+        command=FindTool("flang"),
         unresolved="fatal",
     ),
     ToolSubst(
         "%flang_fc1",
-        command=flang_exe,
-        extra_args=["-fc1"] + flang_driver_search_args + flang_extra_search_args,
-        unresolved="fatal",
-    ),
-    # Variant that does not implicitly add intrinsic search paths
-    ToolSubst(
-        "%bbc_bare",
-        command=FindTool("bbc"),
+        command=FindTool("flang"),
+        extra_args=["-fc1"],
         unresolved="fatal",
     ),
 ]
@@ -270,7 +193,16 @@ def get_resource_module_intrinsic_dir(modfile):
 if result:
     config.environment["LIBPGMATH"] = True
 
-config.substitutions.append(("%openmp_flags", "-fopenmp"))
+# Determine if OpenMP runtime was built (enable OpenMP tests via REQUIRES in test file)
+openmp_flags_substitution = "-fopenmp"
+if config.have_openmp_rtl:
+    config.available_features.add("openmp_runtime")
+    # For the enabled OpenMP tests, add a substitution that is needed in the tests to find
+    # the omp_lib.{h,mod} files, depending on whether the OpenMP runtime was built as a
+    # project or runtime.
+    if config.openmp_module_dir:
+        openmp_flags_substitution += f" -J {config.openmp_module_dir}"
+config.substitutions.append(("%openmp_flags", openmp_flags_substitution))
 
 # Add features and substitutions to test F128 math support.
 # %f128-lib substitution may be used to generate check prefixes
diff --git a/flang/test/lit.site.cfg.py.in b/flang/test/lit.site.cfg.py.in
index 25f2a88d068d7..cc1f4fa6cc9c5 100644
--- a/flang/test/lit.site.cfg.py.in
+++ b/flang/test/lit.site.cfg.py.in
@@ -13,12 +13,10 @@ config.lit_tools_dir = "@LLVM_LIT_TOOLS_DIR@"
 config.errc_messages = "@LLVM_LIT_ERRC_MESSAGES@"
 config.flang_obj_root = "@FLANG_BINARY_DIR@"
 config.flang_tools_dir = lit_config.substitute("@FLANG_TOOLS_DIR@")
+config.flang_intrinsic_modules_dir = "@FLANG_INTRINSIC_MODULES_DIR@"
 config.flang_headers_dir = "@HEADER_BINARY_DIR@"
 config.flang_llvm_tools_dir = "@CMAKE_BINARY_DIR@/bin"
 config.flang_test_triple = "@FLANG_TEST_TARGET_TRIPLE@"
-config.flang_test_fortran_flags = "@FLANG_TEST_Fortran_FLAGS@".split()
-config.flang_test_enable_modules = @FLANG_TEST_ENABLE_MODULES@
-config.flang_test_enable_openmp = @FLANG_TEST_ENABLE_OPENMP@
 config.flang_examples = @LLVM_BUILD_EXAMPLES@
 config.python_executable = "@PYTHON_EXECUTABLE@"
 config.flang_standalone_build = @FLANG_STANDALONE_BUILD@
@@ -27,6 +25,11 @@ config.linked_bye_extension = @LLVM_BYE_LINK_INTO_TOOLS@
 config.osx_sysroot = path(r"@CMAKE_OSX_SYSROOT@")
 config.targets_to_build = "@TARGETS_TO_BUILD@"
 config.default_sysroot = "@DEFAULT_SYSROOT@"
+config.have_openmp_rtl = ("@LLVM_TOOL_OPENMP_BUILD@" == "TRUE") or ("openmp" in "@LLVM_ENABLE_RUNTIMES@".lower().split(";"))
+if "openmp" in "@LLVM_ENABLE_RUNTIMES@".lower().split(";"):
+    config.openmp_module_dir = "@CMAKE_BINARY_DIR@/runtimes/runtimes-bins/openmp/runtime/src"
+else:
+    config.openmp_module_dir = None
 config.flang_runtime_f128_math_lib = "@FLANG_RUNTIME_F128_MATH_LIB@"
 config.have_ldbl_mant_dig_113 = "@HAVE_LDBL_MANT_DIG_113@"
 
diff --git a/flang/tools/CMakeLists.txt b/flang/tools/CMakeLists.txt
index 1b297af74cae7..1d2d2c608faf9 100644
--- a/flang/tools/CMakeLists.txt
+++ b/flang/tools/CMakeLists.txt
@@ -7,6 +7,7 @@
 #===------------------------------------------------------------------------===#
 
 add_subdirectory(bbc)
+add_subdirectory(f18)
 add_subdirectory(flang-driver)
 add_subdirectory(tco)
 add_subdirectory(f18-parse-demo)
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index ab35fb3af6c2a..8b12da3a7b50a 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -98,11 +98,6 @@ static llvm::cl::alias
                           llvm::cl::desc("intrinsic module directory"),
                           llvm::cl::aliasopt(intrinsicIncludeDirs));
 
-static llvm::cl::alias
-    intrinsicModulePath("fintrinsic-modules-path",
-                        llvm::cl::desc("intrinsic module search paths"),
-                        llvm::cl::aliasopt(intrinsicIncludeDirs));
-
 static llvm::cl::opt<std::string>
     moduleDir("module", llvm::cl::desc("module output directory (default .)"),
               llvm::cl::init("."));
@@ -579,6 +574,14 @@ int main(int argc, char **argv) {
 
   if (includeDirs.size() == 0) {
     includeDirs.push_back(".");
+    // Default Fortran modules should be installed in include/flang (a sibling
+    // to the bin) directory.
+    intrinsicIncludeDirs.push_back(
+        llvm::sys::path::parent_path(
+            llvm::sys::path::parent_path(
+                llvm::sys::fs::getMainExecutable(argv[0], nullptr)))
+            .str() +
+        "/include/flang");
   }
 
   Fortran::parser::Options options;
diff --git a/flang/tools/f18/CMakeLists.txt b/flang/tools/f18/CMakeLists.txt
new file mode 100644
index 0000000000000..58ea782ce213e
--- /dev/null
+++ b/flang/tools/f18/CMakeLists.txt
@@ -0,0 +1,171 @@
+set(LLVM_LINK_COMPONENTS
+  FrontendOpenACC
+  FrontendOpenMP
+  Support
+  )
+
+# Define the list of Fortran module files for which it is
+# sufficient to generate the module file via -fsyntax-only.
+set(MODULES
+  "__fortran_builtins"
+  "__fortran_ieee_exceptions"
+  "__fortran_type_info"
+  "__ppc_types"
+  "__ppc_intrinsics"
+  "mma"
+  "__cuda_builtins"
+  "__cuda_device"
+  "cooperative_groups"
+  "cudadevice"
+  "ieee_arithmetic"
+  "ieee_exceptions"
+  "ieee_features"
+  "iso_c_binding"
+  "iso_fortran_env"
+  "iso_fortran_env_impl"
+  "flang_debug"
+)
+
+# Check if 128-bit float computations can be done via long double.
+check_cxx_source_compiles(
+  "#include <cfloat>
+   #if LDBL_MANT_DIG != 113
+   #error LDBL_MANT_DIG != 113
+   #endif
+   int main() { return 0; }
+  "
+  HAVE_LDBL_MANT_DIG_113)
+
+# Figure out whether we can support REAL(KIND=16)
+if (FLANG_RUNTIME_F128_MATH_LIB)
+  set(FLANG_SUPPORT_R16 "1")
+elseif (HAVE_LDBL_MANT_DIG_113)
+  set(FLANG_SUPPORT_R16 "1")
+else()
+  set(FLANG_SUPPORT_R16 "0")
+endif()
+
+# Init variable to hold extra object files coming from the Fortran modules;
+# these module files will be contributed from the CMakeLists in flang/tools/f18.
+set(module_objects "")
+
+# Create module files directly from the top-level module source directory.
+# If CMAKE_CROSSCOMPILING, then the newly built flang executable was
+# cross compiled, and thus can't be executed on the build system and thus
+# can't be used for generating module files.
+if (NOT CMAKE_CROSSCOMPILING)
+  foreach(filename ${MODULES})
+    set(depends "")
+    set(opts "")
+    if(${filename} STREQUAL "__fortran_builtins" OR
+       ${filename} STREQUAL "__ppc_types")
+    elseif(${filename} STREQUAL "__ppc_intrinsics" OR
+           ${filename} STREQUAL "mma")
+      set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__ppc_types.mod)
+    elseif(${filename} STREQUAL "__cuda_device" OR
+           ${filename} STREQUAL "cudadevice" OR
+           ${filename} STREQUAL "cooperative_groups")
+      set(opts -fc1 -xcuda)
+      if(${filename} STREQUAL "__cuda_device")
+        set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__cuda_builtins.mod)
+      elseif(${filename} STREQUAL "cudadevice")
+        set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__cuda_device.mod)
+      elseif(${filename} STREQUAL "cooperative_groups")
+        set(depends ${FLANG_INTRINSIC_MODULES_DIR}/cudadevice.mod)
+      endif()
+    else()
+      set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_builtins.mod)
+      if(${filename} STREQUAL "iso_fortran_env")
+        set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/iso_fortran_env_impl.mod)
+      endif()
+      if(${filename} STREQUAL "ieee_arithmetic" OR
+         ${filename} STREQUAL "ieee_exceptions")
+        set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_ieee_exceptions.mod)
+      endif()
+    endif()
+    if(NOT ${filename} STREQUAL "__fortran_type_info" AND NOT ${filename} STREQUAL "__fortran_builtins")
+      set(depends ${depends} ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_type_info.mod)
+    endif()
+
+    # The module contains PPC vector types that needs the PPC target.
+    if(${filename} STREQUAL "__ppc_intrinsics" OR
+       ${filename} STREQUAL "mma")
+      if (PowerPC IN_LIST LLVM_TARGETS_TO_BUILD)
+        set(opts "--target=ppc64le")
+      else()
+        # Do not compile PPC module if the target is not available.
+        continue()
+      endif()
+    endif()
+
+    set(decls "")
+    if (FLANG_SUPPORT_R16)
+      set(decls "-DFLANG_SUPPORT_R16")
+    endif()
+
+    # Some modules have an implementation part that needs to be added to the
+    # flang_rt.runtime library.
+    set(compile_with "-fsyntax-only")
+    set(object_output "")
+    set(include_in_link FALSE)
+
+    set(base ${FLANG_INTRINSIC_MODULES_DIR}/${filename})
+    # TODO: We may need to flag this with conditional, in case Flang is built w/o OpenMP support
+    add_custom_command(OUTPUT ${base}.mod ${object_output}
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${FLANG_INTRINSIC_MODULES_DIR}
+      COMMAND flang ${opts} ${decls} -cpp ${compile_with} -module-dir ${FLANG_INTRINSIC_MODULES_DIR}
+        ${FLANG_SOURCE_DIR}/module/${filename}.f90
+      DEPENDS flang ${FLANG_SOURCE_DIR}/module/${filename}.f90 ${FLANG_SOURCE_DIR}/module/__fortran_builtins.f90 ${depends}
+    )
+    list(APPEND MODULE_FILES ${base}.mod)
+    install(FILES ${base}.mod DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/flang" COMPONENT flang-module-interfaces)
+
+    # If a module has been compiled into an object file, add the file to
+    # the link line for the flang_rt.runtime library.
+    if(include_in_link)
+      list(APPEND module_objects ${object_output})
+    endif()
+  endforeach()
+
+  # Set a CACHE variable that is visible to the CMakeLists.txt in runtime/, so that
+  # the compiled Fortran modules can be added to the link line of the flang_rt.runtime
+  # library.
+  set(FORTRAN_MODULE_OBJECTS ${module_objects} CACHE INTERNAL "" FORCE)
+
+  # Special case for omp_lib.mod, because its source comes from openmp/runtime/src/include.
+  # It also produces two module files: omp_lib.mod and omp_lib_kinds.mod.  Compile these
+  # files only if OpenMP support has been configured.
+  if (LLVM_TOOL_OPENMP_BUILD)
+    message(STATUS "OpenMP runtime support enabled via LLVM_ENABLE_PROJECTS, building omp_lib.mod")
+    set(base ${FLANG_INTRINSIC_MODULES_DIR}/omp_lib)
+    add_custom_command(OUTPUT ${base}.mod ${base}_kinds.mod
+      COMMAND ${CMAKE_COMMAND} -E make_directory ${FLANG_INTRINSIC_MODULES_DIR}
+      COMMAND flang -cpp -fsyntax-only ${opts} -module-dir ${FLANG_INTRINSIC_MODULES_DIR}
+        ${CMAKE_BINARY_DIR}/projects/openmp/runtime/src/omp_lib.F90
+      DEPENDS flang ${FLANG_INTRINSIC_MODULES_DIR}/iso_c_binding.mod ${CMAKE_BINARY_DIR}/projects/openmp/runtime/src/omp_lib.F90 ${depends}
+    )
+    list(APPEND MODULE_FILES ${base}.mod ${base}_kinds.mod)
+    install(FILES ${base}.mod ${base}_kinds.mod DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/flang" COMPONENT flang-module-interfaces)
+  elseif ("openmp" IN_LIST LLVM_ENABLE_RUNTIMES)
+    message(STATUS "OpenMP runtime support enabled via LLVM_ENABLE_RUNTIMES, assuming omp_lib.mod is built there")
+  else()
+    message(WARNING "Not building omp_lib.mod, no OpenMP runtime in either LLVM_ENABLE_PROJECTS or LLVM_ENABLE_RUNTIMES")
+  endif()
+  add_llvm_install_targets(install-flang-module-interfaces
+    COMPONENT flang-module-interfaces)
+endif()
+
+add_custom_target(module_files ALL DEPENDS ${MODULE_FILES})
+set_target_properties(module_files PROPERTIES FOLDER "Flang/Resources")
+
+# TODO Move this to a more suitable location
+# Copy the generated omp_lib.h header file, if OpenMP support has been configured.
+if (LLVM_TOOL_OPENMP_BUILD)
+  message(STATUS "OpenMP runtime support enabled via LLVM_ENABLE_PROJECTS, building omp_lib.h")
+  file(COPY ${CMAKE_BINARY_DIR}/projects/openmp/runtime/src/omp_lib.h DESTINATION "${CMAKE_BINARY_DIR}/include/flang/OpenMP/" FILE_PERMISSIONS OWNER_READ OWNER_WRITE)
+  install(FILES ${CMAKE_BINARY_DIR}/include/flang/OpenMP/omp_lib.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/flang/OpenMP")
+elseif ("openmp" IN_LIST LLVM_ENABLE_RUNTIMES)
+  message(STATUS "OpenMP runtime support enabled via LLVM_ENABLE_RUNTIMES, assuming omp_lib.h is built there")
+else()
+  message(STATUS "Not copying omp_lib.h, no OpenMP runtime in either LLVM_ENABLE_PROJECTS or LLVM_ENABLE_RUNTIMES")
+endif()
diff --git a/flang/tools/f18/dump.cpp b/flang/tools/f18/dump.cpp
new file mode 100644
index 0000000000000..f11b5aedf4c6a
--- /dev/null
+++ b/flang/tools/f18/dump.cpp
@@ -0,0 +1,42 @@
+//===-- tools/f18/dump.cpp ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// This file defines Dump routines available for calling from the debugger.
+// Each is based on operator<< for that type. There are overloadings for
+// reference and pointer, and for dumping to a provided raw_ostream or errs().
+
+#ifdef DEBUGF18
+
+#include "llvm/Support/raw_ostream.h"
+
+#define DEFINE_DUMP(ns, name) \
+  namespace ns { \
+  class name; \
+  llvm::raw_ostream &operator<<(llvm::raw_ostream &, const name &); \
+  } \
+  void Dump(llvm::raw_ostream &os, const ns::name &x) { os << x << '\n'; } \
+  void Dump(llvm::raw_ostream &os, const ns::name *x) { \
+    if (x == nullptr) \
+      os << "null\n"; \
+    else \
+      Dump(os, *x); \
+  } \
+  void Dump(const ns::name &x) { Dump(llvm::errs(), x); } \
+  void Dump(const ns::name *x) { Dump(llvm::errs(), *x); }
+
+namespace Fortran {
+DEFINE_DUMP(parser, Name)
+DEFINE_DUMP(parser, CharBlock)
+DEFINE_DUMP(semantics, Symbol)
+DEFINE_DUMP(semantics, Scope)
+DEFINE_DUMP(semantics, IntrinsicTypeSpec)
+DEFINE_DUMP(semantics, DerivedTypeSpec)
+DEFINE_DUMP(semantics, DeclTypeSpec)
+} // namespace Fortran
+
+#endif
diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt
index 7f8f239efbe68..88fb323e5e449 100644
--- a/llvm/runtimes/CMakeLists.txt
+++ b/llvm/runtimes/CMakeLists.txt
@@ -252,11 +252,6 @@ function(runtime_default_target)
     # OpenMP tests
     list(APPEND extra_targets "libomp-mod")
   endif ()
-  if ("flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES)
-    # The target flang-rt-mod is a dependee of check-flang needed to run its
-    # tests.
-    list(APPEND extra_targets "flang-rt-mod")
-  endif ()
 
   if(LLVM_INCLUDE_TESTS)
     set_property(GLOBAL APPEND PROPERTY LLVM_ALL_LIT_TESTSUITES "@${LLVM_BINARY_DIR}/runtimes/runtimes-bins/lit.tests")
@@ -556,15 +551,18 @@ if(build_runtimes)
   if ("openmp" IN_LIST LLVM_ENABLE_RUNTIMES AND "flang" IN_LIST LLVM_ENABLE_PROJECTS)
     list(APPEND extra_args ENABLE_FORTRAN)
   endif()
-  if("flang" IN_LIST LLVM_ENABLE_PROJECTS)
-    # Ensure REAL(16) support in runtimes to be consistent with compiler
-    if(FLANG_RUNTIME_F128_MATH_LIB OR HAVE_LDBL_MANT_DIG_113)
-      list(APPEND extra_cmake_args "-DFORTRAN_SUPPORTS_REAL16=TRUE")
-    else()
-      list(APPEND extra_cmake_args "-DFORTRAN_SUPPORTS_REAL16=FALSE")
-    endif()
-  endif()
   if("openmp" IN_LIST LLVM_ENABLE_RUNTIMES OR "offload" IN_LIST LLVM_ENABLE_RUNTIMES)
+    if (${LLVM_TOOL_FLANG_BUILD})
+      message(STATUS "Configuring build of omp_lib.mod and omp_lib_kinds.mod via flang")
+      set(LIBOMP_FORTRAN_MODULES_COMPILER "${CMAKE_BINARY_DIR}/bin/flang")
+      set(LIBOMP_MODULES_INSTALL_PATH "${CMAKE_INSTALL_INCLUDEDIR}/flang")
+      # TODO: This is a workaround until flang becomes a first-class project
+      # in llvm/CMakeList.txt.  Until then, this line ensures that flang is
+      # built before "openmp" is built as a runtime project.  Besides "flang"
+      # to build the compiler, we also need to add "module_files" to make sure
+      # that all .mod files are also properly build.
+      list(APPEND extra_deps "flang" "module_files")
+    endif()
     foreach(dep opt llvm-link llvm-extract clang llvm-offload-binary clang-nvlink-wrapper)
       if(TARGET ${dep})
         list(APPEND extra_deps ${dep})
diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt
index 44d831623014c..df568419824a6 100644
--- a/openmp/CMakeLists.txt
+++ b/openmp/CMakeLists.txt
@@ -134,8 +134,6 @@ set(OPENMP_TEST_FLAGS "" CACHE STRING
   "Extra compiler flags to send to the test compiler.")
 set(OPENMP_TEST_OPENMP_FLAGS ${OPENMP_TEST_COMPILER_OPENMP_FLAGS} CACHE STRING
   "OpenMP compiler flag to use for testing OpenMP runtime libraries.")
-set(OPENMP_TEST_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}" CACHE STRING
-  "Additional compiler flags to use for testing Fortran programs (e.g. additional module search paths via -fintrinsic-modules-path )")
 
 set(ENABLE_LIBOMPTARGET ON)
 # Currently libomptarget cannot be compiled on Windows or MacOS X.
@@ -159,18 +157,13 @@ else()
   get_clang_resource_dir(LIBOMP_HEADERS_INSTALL_PATH SUBDIR include)
 endif()
 
-if(RUNTIMES_FLANG_MODULES_ENABLED)
-  add_subdirectory(module)
-else()
-  message(STATUS "Building omp_lib.mod disabled by RUNTIMES_FLANG_MODULES_ENABLED='${RUNTIMES_FLANG_MODULES_ENABLED}'; libomp-mod does nothing")
-  add_custom_target(libomp-mod)
-endif()
-
 # Use the current compiler target to determine the appropriate runtime to build.
 if("${LLVM_DEFAULT_TARGET_TRIPLE}" MATCHES "^amdgcn|^nvptx" OR
    "${CMAKE_CXX_COMPILER_TARGET}" MATCHES "^amdgcn|^nvptx")
   add_subdirectory(device)
 else()
+  add_subdirectory(module)
+
   # Build host runtime library, after LIBOMPTARGET variables are set since they
   # are needed to enable time profiling support in the OpenMP runtime.
   add_subdirectory(runtime)
diff --git a/openmp/README.rst b/openmp/README.rst
index e16cfa6583082..cc485f9a56ce0 100644
--- a/openmp/README.rst
+++ b/openmp/README.rst
@@ -96,7 +96,7 @@ variable.
 
 **CMAKE_Fortran_COMPILER** = <Fortran compiler name>
   Specify the Fortran compiler. This option is only needed when
-  **RUNTIMES_FLANG_MODULES_ENABLED** is ``ON``.  So typically, a Fortran
+  **LIBOMP_FORTRAN_MODULES** is ``ON`` (see below).  So typically, a Fortran
   compiler is not needed during the build.
 
 **CMAKE_ASM_MASM_COMPILER** = ``ml|ml64``
@@ -166,6 +166,9 @@ Options for ``libomp``
 
     Static libraries are not supported on Windows*.
 
+**LIBOMP_FORTRAN_MODULES** = ``OFF|ON``
+  Create the Fortran modules (requires Fortran compiler).
+
 macOS* Fat Libraries
 """"""""""""""""""""
 On macOS* machines, it is possible to build universal (or fat) libraries which
@@ -340,7 +343,7 @@ Advanced Builds with Various Options
 
   .. code-block:: console
 
-    $ cmake -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DCMAKE_Fortran_COMPILER=ifort -DRUNTIMES_FLANG_MODULES_ENABLED=on ..
+    $ cmake -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DCMAKE_Fortran_COMPILER=ifort -DLIBOMP_FORTRAN_MODULES=on ..
 
 - Have CMake find the C/C++ compiler and specify additional flags for the
   preprocessor and C++ compiler.
diff --git a/openmp/cmake/modules/LibompCheckFortranFlag.cmake b/openmp/cmake/modules/LibompCheckFortranFlag.cmake
new file mode 100644
index 0000000000000..344389f989388
--- /dev/null
+++ b/openmp/cmake/modules/LibompCheckFortranFlag.cmake
@@ -0,0 +1,29 @@
+#
+#//===----------------------------------------------------------------------===//
+#//
+#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#// See https://llvm.org/LICENSE.txt for license information.
+#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#//
+#//===----------------------------------------------------------------------===//
+#
+
+# Checking a fortran compiler flag
+# There is no real trivial way to do this in CMake, so we implement it here
+# this will have ${boolean} = TRUE if the flag succeeds, otherwise false.
+function(libomp_check_fortran_flag flag boolean)
+  if(NOT DEFINED "${boolean}")
+    set(retval TRUE)
+    set(fortran_source
+"      program hello
+           print *, \"Hello World!\"
+      end program hello")
+
+    # Compiling as a part of runtimes introduces ARCH-unknown-linux-gnu as a
+    # part of a working directory.  So adding a guard for unknown.
+    set(failed_regexes "[Ee]rror;[Uu]nknown[^-];[Ss]kipping")
+    include(CheckFortranSourceCompiles)
+    check_fortran_source_compiles("${fortran_source}" ${boolean} FAIL_REGEX "${failed_regexes}")
+    set(${boolean} ${${boolean}} PARENT_SCOPE)
+  endif()
+endfunction()
diff --git a/openmp/cmake/modules/LibompHandleFlags.cmake b/openmp/cmake/modules/LibompHandleFlags.cmake
index a27c8cc407e11..c36a88fb862ae 100644
--- a/openmp/cmake/modules/LibompHandleFlags.cmake
+++ b/openmp/cmake/modules/LibompHandleFlags.cmake
@@ -156,6 +156,17 @@ function(libomp_get_libflags libflags)
   set(${libflags} ${libflags_local_list} PARENT_SCOPE)
 endfunction()
 
+# Fortran flags
+function(libomp_get_fflags fflags)
+  set(fflags_local)
+  if(CMAKE_SIZEOF_VOID_P EQUAL 4)
+    libomp_append(fflags_local -m32 LIBOMP_HAVE_M32_FORTRAN_FLAG)
+  endif()
+  set(fflags_local ${fflags_local} ${LIBOMP_FFLAGS})
+  libomp_setup_flags(fflags_local)
+  set(${fflags} ${fflags_local} PARENT_SCOPE)
+endfunction()
+
 # Python generate-defs.py flags (For Windows only)
 function(libomp_get_gdflags gdflags)
   set(gdflags_local)
diff --git a/openmp/module/CMakeLists.txt b/openmp/module/CMakeLists.txt
index 5a54546fcd285..48f5b0f7a2e86 100644
--- a/openmp/module/CMakeLists.txt
+++ b/openmp/module/CMakeLists.txt
@@ -6,38 +6,71 @@
 #//
 #//===----------------------------------------------------------------------===//
 
-# Build the module files if a Fortran compiler is available.
-# Only LLVM_ENABLE_RUNTIMES=openmp is supported, LLVM_ENABLE_PROJECTS=openmp
-# has been deprecated.
+include(LibompCheckFortranFlag)
 
-configure_file(omp_lib.F90.var "{CMAKE_CURRENT_BINARY_DIR}/omp_lib.F90" @ONLY)
-configure_file(omp_lib.h.var "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src/omp_lib.h" @ONLY)
+set(LIBOMP_FFLAGS "" CACHE STRING
+  "Appended user specified Fortran compiler flags.  These are only used if LIBOMP_FORTRAN_MODULES==TRUE.")
+
+# Enabling Fortran if it is needed
+if (LIBOMP_FORTRAN_MODULES)
+  enable_language(Fortran)
 
-# One compilation step creates both omp_lib.mod and omp_lib_kinds.mod. Only
-# these files are used, the object file itself can be discarded.
-# FIXME: Adding it to libomp.so would allow implementing Fortran API in Fortran
-add_library(libomp-mod OBJECT
-  "{CMAKE_CURRENT_BINARY_DIR}/omp_lib.F90"
-)
-set_target_properties(libomp-mod PROPERTIES FOLDER "OpenMP/Fortran Modules")
-
-# The following requests explicit building of the Fortran module files
-# Workaround for gfortran to build modules with the
-# omp_sched_monotonic integer parameter
-if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
-  target_compile_options(libomp-mod PRIVATE -fno-range-check)
+  libomp_check_fortran_flag(-m32 LIBOMP_HAVE_M32_FORTRAN_FLAG)
 endif ()
 
-flang_module_target(libomp-mod PUBLIC)
-if (FORTRAN_MODULE_DEPS)
-  add_dependencies(libomp-mod ${FORTRAN_MODULE_DEPS})
+# Building the Fortran module files
+# One compilation step creates both omp_lib.mod and omp_lib_kinds.mod
+configure_file(omp_lib.F90.var omp_lib.F90 @ONLY)
+configure_file(omp_lib.h.var "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src/omp_lib.h" @ONLY)
+
+set(BUILD_FORTRAN_MODULES False)
+if (LIBOMP_FORTRAN_MODULES_COMPILER)
+  # If libomp is built as an LLVM runtime and the flang compiler is available,
+  # compile the Fortran module files.
+  message(STATUS "configuring openmp to build Fortran module files using '${LIBOMP_FORTRAN_MODULES_COMPILER}'")
+  set(LIBOMP_FORTRAN_SOURCE_FILE omp_lib.F90)
+  add_custom_target(libomp-mod ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src/omp_lib.mod" "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src/omp_lib_kinds.mod")
+  add_custom_command(
+    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src/omp_lib.mod" "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src/omp_lib_kinds.mod"
+    COMMAND ${LIBOMP_FORTRAN_MODULES_COMPILER} -cpp -fsyntax-only ${LIBOMP_FORTRAN_SOURCE_FILE} "-J${CMAKE_CURRENT_BINARY_DIR}/../runtime/src"
+    DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/${LIBOMP_FORTRAN_SOURCE_FILE}"
+  )
+  set(BUILD_FORTRAN_MODULES True)
+elseif (LIBOMP_FORTRAN_MODULES)
+  # The following requests explicit building of the Fortran module files
+  # Workaround for gfortran to build modules with the
+  # omp_sched_monotonic integer parameter
+  if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
+    set(ADDITIONAL_Fortran_FLAGS "-fno-range-check")
+  endif ()
+  add_custom_target(libomp-mod ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src/omp_lib.mod" "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src/omp_lib_kinds.mod")
+  set_target_properties(libomp-mod PROPERTIES FOLDER "OpenMP/Misc")
+  libomp_get_fflags(LIBOMP_CONFIGURED_FFLAGS)
+  if (CMAKE_Fortran_COMPILER_SUPPORTS_F90)
+    set(LIBOMP_FORTRAN_SOURCE_FILE omp_lib.F90)
+  else ()
+    message(FATAL_ERROR "Fortran module build requires Fortran 90 compiler")
+  endif ()
+  add_custom_command(
+    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src/omp_lib.mod" "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src/omp_lib_kinds.mod"
+    COMMAND ${CMAKE_Fortran_COMPILER} -c ${ADDITIONAL_Fortran_FLAGS}
+            ${LIBOMP_CONFIGURED_FFLAGS} ${LIBOMP_FORTRAN_SOURCE_FILE} "-J${CMAKE_CURRENT_BINARY_DIR}/../runtime/src"
+    DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/${LIBOMP_FORTRAN_SOURCE_FILE}"
+  )
+  set_property(DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src" PROPERTY ADDITIONAL_MAKE_CLEAN_FILES "omp_lib${CMAKE_C_OUTPUT_EXTENSION}")
+  set(BUILD_FORTRAN_MODULES True)
 endif ()
 
-set(destination "${LIBOMP_HEADERS_INSTALL_PATH}")
-if (LIBOMP_MODULES_INSTALL_PATH)
-  set(destination "${LIBOMP_MODULES_INSTALL_PATH}")
+
+if (BUILD_FORTRAN_MODULES)
+  set(destination "${LIBOMP_HEADERS_INSTALL_PATH}")
+  if (LIBOMP_MODULES_INSTALL_PATH)
+    set(destination "${LIBOMP_MODULES_INSTALL_PATH}")
+  endif ()
+  install(FILES
+    "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src/omp_lib.mod"
+    "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src/omp_lib_kinds.mod"
+    "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src/omp_lib.h"
+    DESTINATION ${destination}
+  )
 endif ()
-install(FILES
-  "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src/omp_lib.h"
-  DESTINATION ${destination}
-)
diff --git a/openmp/runtime/CMakeLists.txt b/openmp/runtime/CMakeLists.txt
index 1917af595288f..93948b941f0dc 100644
--- a/openmp/runtime/CMakeLists.txt
+++ b/openmp/runtime/CMakeLists.txt
@@ -422,7 +422,7 @@ if(${OPENMP_STANDALONE_BUILD})
   libomp_say("Build Type           -- ${CMAKE_BUILD_TYPE}")
   libomp_say("Library Kind         -- ${LIBOMP_LIBRARY_KIND}")
   libomp_say("Library Type         -- ${LIBOMP_LIB_TYPE}")
-  libomp_say("Fortran Modules      -- ${RUNTIMES_FLANG_MODULES_ENABLED}")
+  libomp_say("Fortran Modules      -- ${LIBOMP_FORTRAN_MODULES}")
   # will say development if all zeros
   if(${LIBOMP_VERSION_BUILD} STREQUAL 00000000)
     set(LIBOMP_BUILD Development)
diff --git a/openmp/runtime/cmake/LibompExports.cmake b/openmp/runtime/cmake/LibompExports.cmake
index 39bc8cebbe178..6dfe4f4569aa1 100644
--- a/openmp/runtime/cmake/LibompExports.cmake
+++ b/openmp/runtime/cmake/LibompExports.cmake
@@ -56,21 +56,21 @@ set(LIBOMP_EXPORTS_LIB_DIR "${LIBOMP_EXPORTS_DIR}/${libomp_platform}${libomp_suf
 # Put headers in exports/ directory post build
 add_custom_command(TARGET omp POST_BUILD
   COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_CMN_DIR}
-  COMMAND ${CMAKE_COMMAND} -E copy ${LIBOMP_HEADERS_INTDIR}/omp.h ${LIBOMP_EXPORTS_CMN_DIR}
-  COMMAND ${CMAKE_COMMAND} -E copy ${LIBOMP_HEADERS_INTDIR}/ompx.h ${LIBOMP_EXPORTS_CMN_DIR}
+  COMMAND ${CMAKE_COMMAND} -E copy omp.h ${LIBOMP_EXPORTS_CMN_DIR}
+  COMMAND ${CMAKE_COMMAND} -E copy ompx.h ${LIBOMP_EXPORTS_CMN_DIR}
 )
 if(${LIBOMP_OMPT_SUPPORT})
   add_custom_command(TARGET omp POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E copy omp-tools.h ${LIBOMP_EXPORTS_CMN_DIR}
   )
 endif()
-if(RUNTIMES_FLANG_MODULES_ENABLED)
+if(${LIBOMP_FORTRAN_MODULES})
   # We cannot attach a POST_BUILD command to libomp-mod, so instead attach it
   # to omp and ensure that libomp-mod is built before by adding a dependency
   add_custom_command(TARGET omp POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBOMP_EXPORTS_MOD_DIR}
-    COMMAND ${CMAKE_COMMAND} -E copy ${RUNTIMES_OUTPUT_RESOURCE_MOD_DIR}/omp_lib.mod ${LIBOMP_EXPORTS_MOD_DIR}
-    COMMAND ${CMAKE_COMMAND} -E copy ${RUNTIMES_OUTPUT_RESOURCE_MOD_DIR}/omp_lib_kinds.mod ${LIBOMP_EXPORTS_MOD_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy omp_lib.mod ${LIBOMP_EXPORTS_MOD_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy omp_lib_kinds.mod ${LIBOMP_EXPORTS_MOD_DIR}
   )
   add_dependencies(omp libomp-mod)
   add_custom_command(TARGET omp POST_BUILD
diff --git a/openmp/runtime/test/lit.cfg b/openmp/runtime/test/lit.cfg
index a4e3aa505638a..72da1ba1411f8 100644
--- a/openmp/runtime/test/lit.cfg
+++ b/openmp/runtime/test/lit.cfg
@@ -48,7 +48,6 @@ if config.test_fortran_compiler:
         ToolSubst(
             "%flang",
             command=config.test_fortran_compiler,
-            extra_args=config.test_fortran_flags.split(),
             unresolved="fatal",
         ),
     ], [config.llvm_tools_dir])
diff --git a/openmp/runtime/test/lit.site.cfg.in b/openmp/runtime/test/lit.site.cfg.in
index fd0ad30228026..cc8b3b252d7d1 100644
--- a/openmp/runtime/test/lit.site.cfg.in
+++ b/openmp/runtime/test/lit.site.cfg.in
@@ -8,7 +8,6 @@ config.test_compiler_has_omp_h = @OPENMP_TEST_COMPILER_HAS_OMP_H@
 config.test_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@"
 config.test_not = "@OPENMP_NOT_EXECUTABLE@"
 config.test_openmp_flags = "@OPENMP_TEST_OPENMP_FLAGS@"
-config.test_fortran_flags = "@OPENMP_TEST_Fortran_FLAGS@"
 config.test_extra_flags = "@OPENMP_TEST_FLAGS@"
 config.libomp_obj_root = "@CMAKE_CURRENT_BINARY_DIR@"
 config.library_dir = "@LIBOMP_LIBRARY_DIR@"
diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt
index a5bb2d975272e..5220b9353fed7 100644
--- a/runtimes/CMakeLists.txt
+++ b/runtimes/CMakeLists.txt
@@ -85,195 +85,6 @@ include(CheckLibraryExists)
 include(LLVMCheckCompilerLinkerFlag)
 include(CheckCCompilerFlag)
 include(CheckCXXCompilerFlag)
-include(GetToolchainDirs)
-include(ExtendPath)
-
-# Check whether the Fortran compiler already access to builtin modules. Sets
-# HAVE_FORTRAN_INTRINSIC_MODS when returning.
-#
-# This must be wrapped in a function because
-# cmake_push_check_state/cmake_pop_check_state is insufficient to isolate
-# a compiler introspection environment, see
-# https://gitlab.kitware.com/cmake/cmake/-/issues/27419
-function (check_fortran_builtins_available)
-  if (CMAKE_Fortran_COMPILER_FORCED AND CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
-    # CMake's try_compile does not take a user-defined
-    # CMAKE_Fortran_PREPROCESS_SOURCE into account. Instead of test-compiling,
-    # ask Flang directly for the builtin module files.
-    if (NOT DEFINED HAVE_FORTRAN_HAS_ISO_C_BINDING_MOD)
-      message(STATUS "Performing Test ISO_C_BINDING_PATH")
-      execute_process(
-        COMMAND ${CMAKE_Fortran_COMPILER} ${CMAKE_Fortran_FLAGS} "-print-file-name=iso_c_binding.mod"
-        OUTPUT_VARIABLE ISO_C_BINDING_PATH
-        OUTPUT_STRIP_TRAILING_WHITESPACE
-        ERROR_QUIET
-      )
-      set(HAVE_FORTRAN_HAS_ISO_C_BINDING_MOD "" )
-      if (EXISTS "${ISO_C_BINDING_PATH}")
-        message(STATUS "Performing Test ISO_C_BINDING_PATH -- Success")
-        set(HAVE_FORTRAN_HAS_ISO_C_BINDING_MOD TRUE CACHE INTERNAL "Existence result of ${CMAKE_Fortran_COMPILER} -print-file-name=iso_c_binding.mod")
-      else ()
-        message(STATUS "Performing Test ISO_C_BINDING_PATH -- Failed")
-        set(HAVE_FORTRAN_HAS_ISO_C_BINDING_MOD FALSE CACHE INTERNAL "Existence result of ${CMAKE_Fortran_COMPILER} -print-file-name=iso_c_binding.mod")
-      endif ()
-    endif ()
-  else ()
-    cmake_push_check_state(RESET)
-    set(CMAKE_TRY_COMPILE_TARGET_TYPE "STATIC_LIBRARY")
-    check_fortran_source_compiles("
-      subroutine testroutine
-        use iso_c_binding
-      end subroutine
-      " HAVE_FORTRAN_HAS_ISO_C_BINDING_MOD SRC_EXT F90)
-    cmake_pop_check_state()
-  endif ()
-  set(HAVE_FORTRAN_INTRINSIC_MODS "${HAVE_FORTRAN_HAS_ISO_C_BINDING_MOD}" PARENT_SCOPE)
-endfunction ()
-
-
-# Set options to compile Fortran module files.
-#
-# Usage:
-#
-# flang_module_target(name
-#   PUBLIC
-#     Modules files are to be used by other Fortran sources. If a library is
-#     compiled multiple times (e.g. static/shared, or msvcrt variants), only
-#     one of those can be public module files; non-public modules are still
-#     generated but to be forgotten deep inside the build directory to not
-#     conflict with each other.
-#     Also, installs the module with the toolchain.
-# )
-function (flang_module_target tgtname)
-  set(options PUBLIC)
-  cmake_parse_arguments(ARG
-    "${options}"
-    ""
-    ""
-    ${ARGN})
-
-  if (NOT RUNTIMES_FLANG_MODULES_ENABLED)
-    message(WARNING "Cannot build module files for ${tgtname} when RUNTIMES_FLANG_MODULES_ENABLED is ${RUNTIMES_FLANG_MODULES_ENABLED}")
-    return ()
-  endif ()
-
-  target_compile_options(${tgtname} PRIVATE
-      # Let non-public modules find the public module files
-      "$<$<COMPILE_LANGUAGE:Fortran>:-fintrinsic-modules-path=${RUNTIMES_OUTPUT_RESOURCE_MOD_DIR}>"
-
-      # Flang bug workaround: Reformating of cooked token buffer causes identifier to be split between lines
-      "$<$<COMPILE_LANGUAGE:Fortran>:SHELL:-Xflang;SHELL:-fno-reformat>"
-    )
-
-  if (LLVM_RUNTIMES_TARGET MATCHES "^nvptx")
-    foreach (_arch IN LISTS RUNTIMES_DEVICE_ARCHITECTURES)
-      target_compile_options(${tgtname} PRIVATE
-       "$<$<COMPILE_LANGUAGE:Fortran>:-march=${_arch}>"
-      )
-    endforeach()
-  endif ()
-
-  if (ARG_PUBLIC)
-    set_target_properties(${tgtname}
-      PROPERTIES
-        Fortran_MODULE_DIRECTORY "${RUNTIMES_OUTPUT_RESOURCE_MOD_DIR}"
-      )
-  else ()
-    set_target_properties(${tgtname}
-      PROPERTIES
-        Fortran_MODULE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${tgtname}.mod"
-      )
-  endif ()
-endfunction ()
-
-# Check whether the build environment supports building Fortran modules
-# flang-rt and openmp are the only runtimes that contain Fortran modules.
-set(FORTRAN_MODULE_DEPS "")
-if (CMAKE_Fortran_COMPILER AND ("flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES OR "openmp" IN_LIST LLVM_ENABLE_RUNTIMES))
-  cmake_path(GET CMAKE_Fortran_COMPILER STEM _Fortran_COMPILER_STEM)
-
-  if (_Fortran_COMPILER_STEM STREQUAL "flang-new" OR _Fortran_COMPILER_STEM STREQUAL "flang")
-    # CMake 3.24 is the first version of CMake that directly recognizes Flang.
-    # LLVM's requirement is only CMake 3.20, teach CMake 3.20-3.23 how to use Flang, if used.
-    if (CMAKE_VERSION VERSION_LESS "3.24")
-      include(CMakeForceCompiler)
-      CMAKE_FORCE_Fortran_COMPILER("${CMAKE_Fortran_COMPILER}" "LLVMFlang")
-
-      set(CMAKE_Fortran_COMPILER_ID "LLVMFlang")
-      set(CMAKE_Fortran_COMPILER_VERSION "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}")
-
-      set(CMAKE_Fortran_SUBMODULE_SEP "-")
-      set(CMAKE_Fortran_SUBMODULE_EXT ".mod")
-
-      set(CMAKE_Fortran_PREPROCESS_SOURCE
-        "<CMAKE_Fortran_COMPILER> -cpp <DEFINES> <INCLUDES> <FLAGS> -E <SOURCE> > <PREPROCESSED_SOURCE>")
-
-      set(CMAKE_Fortran_FORMAT_FIXED_FLAG "-ffixed-form")
-      set(CMAKE_Fortran_FORMAT_FREE_FLAG "-ffree-form")
-
-      set(CMAKE_Fortran_MODDIR_FLAG "-J")
-
-      set(CMAKE_Fortran_COMPILE_OPTIONS_PREPROCESS_ON "-cpp")
-      set(CMAKE_Fortran_COMPILE_OPTIONS_PREPROCESS_OFF "-nocpp")
-      set(CMAKE_Fortran_POSTPROCESS_FLAG "-ffixed-line-length-72")
-
-      set(CMAKE_Fortran_LINKER_WRAPPER_FLAG "-Wl,")
-      set(CMAKE_Fortran_LINKER_WRAPPER_FLAG_SEP ",")
-
-      set(CMAKE_Fortran_VERBOSE_FLAG "-v")
-
-      set(CMAKE_Fortran_LINK_MODE DRIVER)
-    endif ()
-
-    # Optimization flags are only passed after CMake 3.27.4
-    # https://gitlab.kitware.com/cmake/cmake/-/commit/1140087adea98bd8d8974e4c18979f4949b52c34
-    if (CMAKE_VERSION VERSION_LESS "3.27.4")
-      string(APPEND CMAKE_Fortran_FLAGS_DEBUG_INIT " -O0 -g")
-      string(APPEND CMAKE_Fortran_FLAGS_RELWITHDEBINFO_INIT " -O2 -g")
-      string(APPEND CMAKE_Fortran_FLAGS_RELEASE_INIT " -O3")
-    endif ()
-
-    # Only CMake 3.28+ pass --target= to Flang. But for cross-compiling, including
-    # to nvptx amd amdgpu targets, passing the target triple is essential.
-    # https://gitlab.kitware.com/cmake/cmake/-/commit/e9af7b968756e72553296ecdcde6f36606a0babf
-    if (CMAKE_VERSION VERSION_LESS "3.28")
-      set(CMAKE_Fortran_COMPILE_OPTIONS_TARGET "--target=")
-    endif ()
-  endif ()
-
-  include(CheckFortranSourceCompiles)
-  include(CheckLanguage)
-
-  set(RUNTIMES_FLANG_MODULES_ENABLED_default OFF)
-  check_language(Fortran)
-  if (CMAKE_Fortran_COMPILER)
-    enable_language(Fortran)
-
-    if (CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang" AND "flang-rt" IN_LIST LLVM_ENABLE_RUNTIMES)
-      # In a bootstrapping build (or any runtimes-build that includes flang-rt),
-      # the intrinsic modules are not built yet. Targets can depend on
-      # flang-rt-mod to ensure that flang-rt's modules are built first.
-      set(FORTRAN_MODULE_DEPS flang-rt-mod)
-      set(RUNTIMES_FLANG_MODULES_ENABLED_default ON)
-    else ()
-      # Check whether building modules works, avoid causing the entire build to
-      # fail because of Fortran. The primary situation we want to support here
-      # is Flang, or its intrinsic modules were built separately in a
-      # non-bootstrapping build.
-      check_fortran_builtins_available()
-      if (HAVE_FORTRAN_INTRINSIC_MODS)
-        set(RUNTIMES_FLANG_MODULES_ENABLED_default ON)
-        message(STATUS "${LLVM_SUBPROJECT_TITLE}: Non-bootstrapping Fortran modules build (${CMAKE_Fortran_COMPILER_ID} located at ${CMAKE_Fortran_COMPILER})")
-      else ()
-        message(STATUS "Not compiling Flang modules: Not passing smoke check")
-      endif ()
-    endif ()
-  endif ()
-
-  option(RUNTIMES_FLANG_MODULES_ENABLED "Build Fortran modules" "${RUNTIMES_FLANG_MODULES_ENABLED_default}")
-else ()
-  set(RUNTIMES_FLANG_MODULES_ENABLED NO)
-endif ()
 
 
 # Determine whether we are in the runtimes/runtimes-bins directory of a
@@ -283,6 +94,17 @@ if (LLVM_LIBRARY_DIR AND LLVM_TOOLS_BINARY_DIR AND PACKAGE_VERSION)
   set(LLVM_TREE_AVAILABLE ON)
 endif()
 
+if(LLVM_TREE_AVAILABLE)
+  # Setting these variables will allow the sub-build to put their outputs into
+  # the library and bin directories of the top-level build.
+  set(LLVM_LIBRARY_OUTPUT_INTDIR ${LLVM_LIBRARY_DIR})
+  set(LLVM_RUNTIME_OUTPUT_INTDIR ${LLVM_TOOLS_BINARY_DIR})
+else()
+  # Use own build directory for artifact output.
+  set(LLVM_LIBRARY_OUTPUT_INTDIR "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib${LLVM_LIBDIR_SUFFIX}")
+  set(LLVM_RUNTIME_OUTPUT_INTDIR "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin")
+endif()
+
 # CMake omits default compiler include paths, but in runtimes build, we use
 # -nostdinc and -nostdinc++ and control include paths manually so this behavior
 # is undesirable. Filtering CMAKE_{LANG}_IMPLICIT_INCLUDE_DIRECTORIES to remove
@@ -414,109 +236,6 @@ if(CMAKE_C_COMPILER_ID MATCHES "Clang")
   endif()
 endif()
 
-# Determine output and install paths based on LLVM_TARGET_TRIPLE
-if(LLVM_TREE_AVAILABLE)
-  # In a bootstrap build emit the libraries into a default search path in the
-  # build directory of the just-built compiler. This allows using the
-  # just-built compiler without specifying paths to runtime libraries.
-  # LLVM_LIBRARY_OUTPUT_INTDIR/LLVM_RUNTIME_OUTPUT_INTDIR is used by
-  # AddLLVM.cmake as artifact output locations.
-  set(LLVM_LIBRARY_OUTPUT_INTDIR ${LLVM_LIBRARY_DIR})
-  set(LLVM_RUNTIME_OUTPUT_INTDIR ${LLVM_TOOLS_BINARY_DIR})
-
-  # Despite Clang in the name, get_clang_resource_dir does not depend on Clang
-  # being added to the build. Flang uses the same resource dir as Clang.
-  include(GetClangResourceDir)
-  get_clang_resource_dir(RUNTIMES_OUTPUT_RESOURCE_DIR PREFIX "${LLVM_LIBRARY_OUTPUT_INTDIR}/..")
-  get_clang_resource_dir(RUNTIMES_INSTALL_RESOURCE_PATH_DEFAULT)
-else()
-  # In a standalone runtimes build, do not write into LLVM_BINARY_DIR. It may be
-  # read-only and/or shared by multiple runtimes with different build
-  # configurations (e.g. Debug/Release). Use the runtime's own lib dir like any
-  # non-toolchain library. Use own build directory for artifact output.
-  set(LLVM_LIBRARY_OUTPUT_INTDIR "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib${LLVM_LIBDIR_SUFFIX}")
-  set(LLVM_RUNTIME_OUTPUT_INTDIR "${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin")
-
-  # For the install prefix, still use the resource dir assuming that Flang will
-  # be installed there using the same prefix. This is to not have a difference
-  # between bootstrap and standalone runtimes builds.
-  set(RUNTIMES_OUTPUT_RESOURCE_DIR "${CMAKE_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX}/clang/${LLVM_VERSION_MAJOR}")
-  set(RUNTIMES_INSTALL_RESOURCE_PATH_DEFAULT "lib${LLVM_LIBDIR_SUFFIX}/clang/${LLVM_VERSION_MAJOR}")
-endif()
-
-# Determine build and install paths.
-# The build path is absolute, but the install dir is relative, CMake's install
-# command has to apply CMAKE_INSTALL_PREFIX itself.
-# FIXME: For shared libraries, the toolchain resource lib dir is not a good
-#        destination because it is not a ld.so default search path.
-#        The machine where the executable is eventually executed may not be the
-#        machine where the Flang compiler and its resource dir is installed, so
-#        setting RPath by the driver is not an solution. It should belong into
-#        /usr/lib/<triple>/lib<name>.so, like e.g. libgcc_s.so.
-#        But the linker as invoked by the Flang driver also requires
-#        libflang_rt.so to be found when linking and the resource lib dir is
-#        the only reliable location.
-get_toolchain_library_subdir(toolchain_lib_subdir)
-extend_path(RUNTIMES_OUTPUT_RESOURCE_LIB_DIR "${RUNTIMES_OUTPUT_RESOURCE_DIR}" "${toolchain_lib_subdir}")
-
-set(RUNTIMES_INSTALL_RESOURCE_PATH "${RUNTIMES_INSTALL_RESOURCE_PATH_DEFAULT}" CACHE PATH "Path to install headers, runtime libraries, and Fortran modules to (default: Clang resource dir)")
-extend_path(RUNTIMES_INSTALL_RESOURCE_LIB_PATH "${RUNTIMES_INSTALL_RESOURCE_PATH}" "${toolchain_lib_subdir}")
-
-cmake_path(NORMAL_PATH RUNTIMES_OUTPUT_RESOURCE_DIR)
-cmake_path(NORMAL_PATH RUNTIMES_INSTALL_RESOURCE_PATH)
-cmake_path(NORMAL_PATH RUNTIMES_OUTPUT_RESOURCE_LIB_DIR)
-cmake_path(NORMAL_PATH RUNTIMES_INSTALL_RESOURCE_LIB_PATH)
-
-
-if (RUNTIMES_FLANG_MODULES_ENABLED)
-  if (CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
-    # Flang expects its builtin modules in Clang's resource directory
-    get_toolchain_module_subdir(toolchain_mod_subdir)
-    extend_path(RUNTIMES_OUTPUT_RESOURCE_MOD_DIR "${RUNTIMES_OUTPUT_RESOURCE_DIR}" "${toolchain_mod_subdir}")
-    extend_path(RUNTIMES_INSTALL_RESOURCE_MOD_PATH "${RUNTIMES_INSTALL_RESOURCE_PATH}" "${toolchain_mod_subdir}")
-  else ()
-    # For non-Flang compilers, avoid the risk of Flang accidentally picking them up.
-    extend_path(RUNTIMES_OUTPUT_RESOURCE_MOD_DIR "${RUNTIMES_OUTPUT_RESOURCE_DIR}" "finclude-${CMAKE_Fortran_COMPILER_ID}")
-    extend_path(RUNTIMES_INSTALL_RESOURCE_MOD_PATH "${RUNTIMES_INSTALL_RESOURCE_PATH}" "finclude-${CMAKE_Fortran_COMPILER_ID}")
-  endif ()
-  cmake_path(NORMAL_PATH RUNTIMES_OUTPUT_RESOURCE_MOD_DIR)
-  cmake_path(NORMAL_PATH RUNTIMES_INSTALL_RESOURCE_MOD_PATH)
-
-  # No way to find out which mod files are built by a target, so install the
-  # entire output directory
-  # https://stackoverflow.com/questions/52712416/cmake-fortran-module-directory-to-be-used-with-add-library
-  set(destination "${RUNTIMES_INSTALL_RESOURCE_MOD_PATH}/..")
-  cmake_path(NORMAL_PATH destination)
-    install(DIRECTORY "${RUNTIMES_OUTPUT_RESOURCE_MOD_DIR}"
-      DESTINATION "${destination}"
-    )
-endif ()
-
-
-if (RUNTIMES_FLANG_MODULES_ENABLED)
-  set(RUNTIMES_DEVICE_ARCHITECTURES "all" CACHE STRING
-      "List of offload device architectures to be used to compile the Fortran offload code (e.g. 'gfx1103;sm_90')"
-    )
-
-  if (RUNTIMES_DEVICE_ARCHITECTURES STREQUAL "all")
-    # TODO: support auto detection on the build system.
-    set(all_amdgpu_architectures
-      "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906"
-      "gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030"
-      "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036"
-      "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151"
-      "gfx1152;gfx1153")
-    set(all_nvptx_architectures
-      "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62"
-      "sm_70;sm_72;sm_75;sm_80;sm_86;sm_89;sm_90")
-    set(all_gpu_architectures
-      "${all_amdgpu_architectures};${all_nvptx_architectures}")
-      set(RUNTIMES_DEVICE_ARCHITECTURES ${all_gpu_architectures})
-  endif()
-  list(REMOVE_DUPLICATES RUNTIMES_DEVICE_ARCHITECTURES)
-endif ()
-
-
 option(LLVM_INCLUDE_TESTS "Generate build targets for the runtimes unit tests." ON)
 option(LLVM_INCLUDE_DOCS "Generate build targets for the runtimes documentation." ON)
 option(LLVM_ENABLE_SPHINX "Use Sphinx to generate the runtimes documentation." OFF)

From 9e2b8b0254ccd26dc6f3c7abf053646429d9f15d Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Tue, 9 Dec 2025 14:36:32 -0800
Subject: [PATCH 64/85] [lldb] Remove CommandReturnObject::AppendRawError
 (#171517)

Remove `CommandReturnObject::AppendRawError` and replace its two uses
with `AppendError`, which correctly prefixes the message with `error:`.
The comment for the method is outdated and the prefixing is clearly
desired in both situations.
---
 lldb/include/lldb/Interpreter/CommandReturnObject.h      | 2 --
 lldb/source/Commands/CommandObjectMultiword.cpp          | 2 +-
 lldb/source/Interpreter/CommandInterpreter.cpp           | 2 +-
 lldb/source/Interpreter/CommandReturnObject.cpp          | 9 ---------
 .../functionalities/abbreviation/TestAbbreviations.py    | 2 +-
 .../ambigous_commands/TestAmbiguousCommands.py           | 2 +-
 .../functionalities/wrong_commands/TestWrongCommands.py  | 4 +++-
 .../Commands/command-wrong-subcommand-error-msg.test     | 4 ++--
 8 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/lldb/include/lldb/Interpreter/CommandReturnObject.h b/lldb/include/lldb/Interpreter/CommandReturnObject.h
index 0742f1b836f5e..f6e60840256a4 100644
--- a/lldb/include/lldb/Interpreter/CommandReturnObject.h
+++ b/lldb/include/lldb/Interpreter/CommandReturnObject.h
@@ -129,8 +129,6 @@ class CommandReturnObject {
 
   void AppendError(llvm::StringRef in_string);
 
-  void AppendRawError(llvm::StringRef in_string);
-
   void AppendErrorWithFormat(const char *format, ...)
       __attribute__((format(printf, 2, 3)));
 
diff --git a/lldb/source/Commands/CommandObjectMultiword.cpp b/lldb/source/Commands/CommandObjectMultiword.cpp
index a369557cca845..e08b33cdce940 100644
--- a/lldb/source/Commands/CommandObjectMultiword.cpp
+++ b/lldb/source/Commands/CommandObjectMultiword.cpp
@@ -205,7 +205,7 @@ void CommandObjectMultiword::Execute(const char *args_string,
             .str());
   }
   error_msg.append("\n");
-  result.AppendRawError(error_msg.c_str());
+  result.AppendError(error_msg);
 }
 
 std::string CommandObjectMultiword::GetSubcommandsHintText() {
diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp
index ffcc9ceeb2a93..cb6acfc9c29a9 100644
--- a/lldb/source/Interpreter/CommandInterpreter.cpp
+++ b/lldb/source/Interpreter/CommandInterpreter.cpp
@@ -3708,7 +3708,7 @@ CommandInterpreter::ResolveCommandImpl(std::string &command_line,
           for (uint32_t i = 0; i < num_matches; ++i) {
             error_msg.Printf("\t%s\n", matches.GetStringAtIndex(i));
           }
-          result.AppendRawError(error_msg.GetString());
+          result.AppendError(error_msg.GetString());
         }
       } else {
         // We didn't have only one match, otherwise we wouldn't get here.
diff --git a/lldb/source/Interpreter/CommandReturnObject.cpp b/lldb/source/Interpreter/CommandReturnObject.cpp
index ef5bfae1bd1bd..85b058e97a679 100644
--- a/lldb/source/Interpreter/CommandReturnObject.cpp
+++ b/lldb/source/Interpreter/CommandReturnObject.cpp
@@ -173,15 +173,6 @@ StructuredData::ObjectSP CommandReturnObject::GetErrorData() {
   return Serialize(m_diagnostics);
 }
 
-// Similar to AppendError, but do not prepend 'Status: ' to message, and don't
-// append "\n" to the end of it.
-
-void CommandReturnObject::AppendRawError(llvm::StringRef in_string) {
-  SetStatus(eReturnStatusFailed);
-  assert(!in_string.empty() && "Expected a non-empty error message");
-  GetErrorStream() << in_string;
-}
-
 void CommandReturnObject::SetStatus(ReturnStatus status) { m_status = status; }
 
 ReturnStatus CommandReturnObject::GetStatus() const { return m_status; }
diff --git a/lldb/test/API/functionalities/abbreviation/TestAbbreviations.py b/lldb/test/API/functionalities/abbreviation/TestAbbreviations.py
index cc767edaaa619..5dd4f6bee56a3 100644
--- a/lldb/test/API/functionalities/abbreviation/TestAbbreviations.py
+++ b/lldb/test/API/functionalities/abbreviation/TestAbbreviations.py
@@ -41,7 +41,7 @@ def test_command_abbreviations_and_aliases(self):
         # "pl" could be "platform" or "plugin".
         command_interpreter.ResolveCommand("pl", result)
         self.assertFalse(result.Succeeded())
-        self.assertTrue(result.GetError().startswith("Ambiguous command"))
+        self.assertTrue(result.GetError().startswith("error: Ambiguous command"))
 
         # Make sure an unabbreviated command is not mangled.
         command_interpreter.ResolveCommand(
diff --git a/lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py b/lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py
index 14c66fefea7ef..31b67d72ce469 100644
--- a/lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py
+++ b/lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py
@@ -24,7 +24,7 @@ def test_ambiguous_command_with_alias(self):
         self.assertFalse(result.Succeeded())
         self.assertEqual(
             result.GetError(),
-            "Ambiguous command 'co'. Possible matches:\n\tcommand\n\tcontinue\n\tcorefile\n",
+            "error: Ambiguous command 'co'. Possible matches:\n\tcommand\n\tcontinue\n\tcorefile\n",
         )
 
         command_interpreter.HandleCommand("command unalias continue", result)
diff --git a/lldb/test/API/functionalities/wrong_commands/TestWrongCommands.py b/lldb/test/API/functionalities/wrong_commands/TestWrongCommands.py
index 6d2ce2bb3e709..25f95f3061eb4 100644
--- a/lldb/test/API/functionalities/wrong_commands/TestWrongCommands.py
+++ b/lldb/test/API/functionalities/wrong_commands/TestWrongCommands.py
@@ -17,7 +17,9 @@ def test_ambiguous_command(self):
 
         command_interpreter.HandleCommand("g", result)
         self.assertFalse(result.Succeeded())
-        self.assertRegex(result.GetError(), "Ambiguous command 'g'. Possible matches:")
+        self.assertRegex(
+            result.GetError(), "error: Ambiguous command 'g'. Possible matches:"
+        )
         self.assertRegex(result.GetError(), "gui")
         self.assertRegex(result.GetError(), "gdb-remote")
         self.assertEqual(1, result.GetError().count("gdb-remote"))
diff --git a/lldb/test/Shell/Commands/command-wrong-subcommand-error-msg.test b/lldb/test/Shell/Commands/command-wrong-subcommand-error-msg.test
index 5365f81c19587..968a4d6fb9682 100644
--- a/lldb/test/Shell/Commands/command-wrong-subcommand-error-msg.test
+++ b/lldb/test/Shell/Commands/command-wrong-subcommand-error-msg.test
@@ -4,5 +4,5 @@
 # RUN: not %lldb -b -o 'breakpoint foo' %t.out -o exit 2>&1 | FileCheck %s --check-prefix BP-MSG
 # RUN: not %lldb -b -o 'watchpoint set foo' %t.out -o exit 2>&1 | FileCheck %s --check-prefix WP-MSG
 # CHECK: at main.c:2:21
-# BP-MSG: "foo" is not a valid subcommand of "breakpoint". Valid subcommands are: add, clear, command, delete, disable, and others. Use "help breakpoint" to find out more.
-# WP-MSG: "foo" is not a valid subcommand of "watchpoint set". Valid subcommands are: expression, variable. Use "help watchpoint set" to find out more.
\ No newline at end of file
+# BP-MSG: error: "foo" is not a valid subcommand of "breakpoint". Valid subcommands are: add, clear, command, delete, disable, and others. Use "help breakpoint" to find out more.
+# WP-MSG: error: "foo" is not a valid subcommand of "watchpoint set". Valid subcommands are: expression, variable. Use "help watchpoint set" to find out more.

From 794218bc53a42bd87048317506e8794deb0dc8be Mon Sep 17 00:00:00 2001
From: Prajwal Nadig <pnadig@apple.com>
Date: Tue, 9 Dec 2025 23:41:46 +0100
Subject: [PATCH 65/85] [ExtractAPI] Format typedef params correctly (#171516)

Typically, pointer types are formatted in a way where the identifier
comes right after the type definition without a space separating them,
e.g. `int *foo`, where the type is `int *` and the identifier is `foo`.
However, if a type alias to a pointer type is used, the emitted
declaration fragments are incorrect due to the missing space between the
type and identifier, like in the below example:

```
typedef int *T;
// The declaration fragment contains `Tbar` instead of `T bar`
void foo(T bar);
```

This patch checks if pointer types are aliased, and inserts the space
correctly if so.

rdar://132022003
---
 clang/lib/ExtractAPI/DeclarationFragments.cpp |  5 +-
 clang/test/ExtractAPI/typedef.c               | 81 ++++++++++++++++++-
 2 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/clang/lib/ExtractAPI/DeclarationFragments.cpp b/clang/lib/ExtractAPI/DeclarationFragments.cpp
index e5eda46df8056..7f2778713eade 100644
--- a/clang/lib/ExtractAPI/DeclarationFragments.cpp
+++ b/clang/lib/ExtractAPI/DeclarationFragments.cpp
@@ -633,7 +633,10 @@ DeclarationFragmentsBuilder::getFragmentsForParam(const ParmVarDecl *Param) {
                 DeclarationFragments::FragmentKind::InternalParam);
   } else {
     Fragments.append(std::move(TypeFragments));
-    if (!T->isAnyPointerType() && !T->isBlockPointerType())
+    // If the type is a type alias, append the space
+    // even if the underlying type is a pointer type.
+    if (T->isTypedefNameType() ||
+        (!T->isAnyPointerType() && !T->isBlockPointerType()))
       Fragments.appendSpace();
     Fragments
         .append(Param->getName(),
diff --git a/clang/test/ExtractAPI/typedef.c b/clang/test/ExtractAPI/typedef.c
index a4c3619bfd210..6099cd62f8776 100644
--- a/clang/test/ExtractAPI/typedef.c
+++ b/clang/test/ExtractAPI/typedef.c
@@ -1,5 +1,5 @@
 // RUN: rm -rf %t
-// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \
+// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing -fblocks \
 // RUN:   -triple arm64-apple-macosx -x objective-c-header %s -o %t/output.symbols.json -verify
 
 // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix MYINT
@@ -90,4 +90,83 @@ void foo(BarPtr value);
 void baz(BarPtr *value);
 // CHECK-NOT: struct Bar *
 
+// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix BLOCKPTR
+typedef int (^CustomType)(const unsigned int *, unsigned long);
+void bar(CustomType block);
+
+// BLOCKPTR-LABEL: "!testLabel": "c:@F@bar",
+// BLOCKPTR:           "declarationFragments": [
+// BLOCKPTR-NEXT:        {
+// BLOCKPTR-NEXT:          "kind": "typeIdentifier",
+// BLOCKPTR-NEXT:          "preciseIdentifier": "c:v",
+// BLOCKPTR-NEXT:          "spelling": "void"
+// BLOCKPTR-NEXT:        },
+// BLOCKPTR-NEXT:        {
+// BLOCKPTR-NEXT:          "kind": "text",
+// BLOCKPTR-NEXT:          "spelling": " "
+// BLOCKPTR-NEXT:        },
+// BLOCKPTR-NEXT:        {
+// BLOCKPTR-NEXT:          "kind": "identifier",
+// BLOCKPTR-NEXT:          "spelling": "bar"
+// BLOCKPTR-NEXT:        },
+// BLOCKPTR-NEXT:        {
+// BLOCKPTR-NEXT:          "kind": "text",
+// BLOCKPTR-NEXT:          "spelling": "("
+// BLOCKPTR-NEXT:        },
+// BLOCKPTR-NEXT:        {
+// BLOCKPTR-NEXT:          "kind": "typeIdentifier",
+// BLOCKPTR-NEXT:          "preciseIdentifier": "c:typedef.c@T@CustomType",
+// BLOCKPTR-NEXT:          "spelling": "CustomType"
+// BLOCKPTR-NEXT:        },
+// BLOCKPTR-NEXT:        {
+// BLOCKPTR-NEXT:          "kind": "text",
+// BLOCKPTR-NEXT:          "spelling": " "
+// BLOCKPTR-NEXT:        },
+// BLOCKPTR-NEXT:        {
+// BLOCKPTR-NEXT:          "kind": "internalParam",
+// BLOCKPTR-NEXT:          "spelling": "block"
+// BLOCKPTR-NEXT:        },
+// BLOCKPTR-NEXT:        {
+// BLOCKPTR-NEXT:          "kind": "text",
+// BLOCKPTR-NEXT:          "spelling": ");"
+// BLOCKPTR-NEXT:        }
+// BLOCKPTR-NEXT:      ],
+// BLOCKPTR-NEXT:      "functionSignature": {
+// BLOCKPTR-NEXT:        "parameters": [
+// BLOCKPTR-NEXT:          {
+// BLOCKPTR-NEXT:            "declarationFragments": [
+// BLOCKPTR-NEXT:              {
+// BLOCKPTR-NEXT:                "kind": "typeIdentifier",
+// BLOCKPTR-NEXT:                "preciseIdentifier": "c:typedef.c@T@CustomType",
+// BLOCKPTR-NEXT:                "spelling": "CustomType"
+// BLOCKPTR-NEXT:              },
+// BLOCKPTR-NEXT:              {
+// BLOCKPTR-NEXT:                "kind": "text",
+// BLOCKPTR-NEXT:                "spelling": " "
+// BLOCKPTR-NEXT:              },
+// BLOCKPTR-NEXT:              {
+// BLOCKPTR-NEXT:                "kind": "internalParam",
+// BLOCKPTR-NEXT:                "spelling": "block"
+// BLOCKPTR-NEXT:              }
+// BLOCKPTR-NEXT:            ],
+// BLOCKPTR-NEXT:            "name": "block"
+// BLOCKPTR-NEXT:          }
+// BLOCKPTR-NEXT:        ],
+// BLOCKPTR-NEXT:        "returns": [
+// BLOCKPTR-NEXT:          {
+// BLOCKPTR-NEXT:            "kind": "typeIdentifier",
+// BLOCKPTR-NEXT:            "preciseIdentifier": "c:v",
+// BLOCKPTR-NEXT:            "spelling": "void"
+// BLOCKPTR-NEXT:          }
+// BLOCKPTR-NEXT:        ]
+// BLOCKPTR-NEXT:      },
+// BLOCKPTR:           "identifier": {
+// BLOCKPTR-NEXT:        "interfaceLanguage": "objective-c",
+// BLOCKPTR-NEXT:        "precise": "c:@F@bar"
+// BLOCKPTR-NEXT:      },
+// BLOCKPTR:           "kind": {
+// BLOCKPTR-NEXT:        "displayName": "Function",
+// BLOCKPTR-NEXT:        "identifier": "objective-c.func"
+// BLOCKPTR-NEXT:      },
+
 // expected-no-diagnostics

From fb29a6e4075894e799dd2fb29399c478a4df4462 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper@sifive.com>
Date: Tue, 9 Dec 2025 14:42:49 -0800
Subject: [PATCH 66/85] [RISCV] Fix formatting in RISCVInstrInfoXSf.td. NFC
 (#171500)

---
 llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
index f7b4914135711..c07ed8596f009 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -935,7 +935,7 @@ let Predicates = [HasVendorXSfcease] in {
     let rd = 0b00000;
     let rs1 = 0b00000;
     let rs2 = 0b00101;
-}
+  }
 }
 
 let Predicates = [HasVendorXSfvfbfexp16e] in {

From 84b9e444454a06ddf01f568164e3d5c8d956707d Mon Sep 17 00:00:00 2001
From: adams381 <adams@nvidia.com>
Date: Tue, 9 Dec 2025 17:09:25 -0600
Subject: [PATCH 67/85] [CIR] Add Function Argument Demotion support (#170915)

This PR migrates the Function Argument Demotion feature from the
incubator repository to upstream. The feature handles K&R-style function
parameters that are promoted (e.g., short->int, float->double) and
demotes them back to their declared types.

## Changes
- Add emitArgumentDemotion helper function for type demotion
- Create emitFunctionProlog function to handle function prologue setup
(addresses existing TODO to move parameter handling logic)
- Move parameter handling logic into emitFunctionProlog
- Add test case kr-func-promote.c to verify the feature

Tested: All CIR tests pass (320/321, 99.69%). The one unsupported test
is an expected failure.
---
 clang/lib/CIR/CodeGen/CIRGenFunction.cpp | 92 ++++++++++++++++++------
 clang/lib/CIR/CodeGen/CIRGenFunction.h   |  5 ++
 clang/test/CIR/CodeGen/kr-func-promote.c | 42 +++++++++++
 3 files changed, 119 insertions(+), 20 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/kr-func-promote.c

diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
index 492cbb69fabb3..6b2e60a551198 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp
@@ -16,6 +16,7 @@
 #include "CIRGenCall.h"
 #include "CIRGenValue.h"
 #include "mlir/IR/Location.h"
+#include "clang/AST/Attr.h"
 #include "clang/AST/ExprCXX.h"
 #include "clang/AST/GlobalDecl.h"
 #include "clang/CIR/MissingFeatures.h"
@@ -422,28 +423,35 @@ cir::TryOp CIRGenFunction::LexicalScope::getClosestTryParent() {
   return nullptr;
 }
 
-void CIRGenFunction::startFunction(GlobalDecl gd, QualType returnType,
-                                   cir::FuncOp fn, cir::FuncType funcType,
-                                   FunctionArgList args, SourceLocation loc,
-                                   SourceLocation startLoc) {
-  assert(!curFn &&
-         "CIRGenFunction can only be used for one function at a time");
+/// An argument came in as a promoted argument; demote it back to its
+/// declared type.
+static mlir::Value emitArgumentDemotion(CIRGenFunction &cgf, const VarDecl *var,
+                                        mlir::Value value) {
+  mlir::Type ty = cgf.convertType(var->getType());
 
-  curFn = fn;
+  // This can happen with promotions that actually don't change the
+  // underlying type, like the enum promotions.
+  if (value.getType() == ty)
+    return value;
 
-  const Decl *d = gd.getDecl();
+  assert((mlir::isa<cir::IntType>(ty) || cir::isAnyFloatingPointType(ty)) &&
+         "unexpected promotion type");
 
-  didCallStackSave = false;
-  curCodeDecl = d;
-  const auto *fd = dyn_cast_or_null<FunctionDecl>(d);
-  curFuncDecl = d->getNonClosureContext();
+  if (mlir::isa<cir::IntType>(ty))
+    return cgf.getBuilder().CIRBaseBuilderTy::createIntCast(value, ty);
 
-  prologueCleanupDepth = ehStack.stable_begin();
+  return cgf.getBuilder().createFloatingCast(value, ty);
+}
 
-  mlir::Block *entryBB = &fn.getBlocks().front();
-  builder.setInsertionPointToStart(entryBB);
+void CIRGenFunction::emitFunctionProlog(const FunctionArgList &args,
+                                        mlir::Block *entryBB,
+                                        const FunctionDecl *fd,
+                                        SourceLocation bodyBeginLoc) {
+  // Naked functions don't have prologues.
+  if (fd && fd->hasAttr<NakedAttr>()) {
+    cgm.errorNYI(bodyBeginLoc, "naked function decl");
+  }
 
-  // TODO(cir): this should live in `emitFunctionProlog
   // Declare all the function arguments in the symbol table.
   for (const auto nameValue : llvm::zip(args, entryBB->getArguments())) {
     const VarDecl *paramVar = std::get<0>(nameValue);
@@ -466,20 +474,64 @@ void CIRGenFunction::startFunction(GlobalDecl gd, QualType returnType,
                       cast<ParmVarDecl>(paramVar)->isKNRPromoted();
     assert(!cir::MissingFeatures::constructABIArgDirectExtend());
     if (isPromoted)
-      cgm.errorNYI(fd->getSourceRange(), "Function argument demotion");
+      paramVal = emitArgumentDemotion(*this, paramVar, paramVal);
 
     // Location of the store to the param storage tracked as beginning of
     // the function body.
-    mlir::Location fnBodyBegin = getLoc(fd->getBody()->getBeginLoc());
+    mlir::Location fnBodyBegin = getLoc(bodyBeginLoc);
     builder.CIRBaseBuilderTy::createStore(fnBodyBegin, paramVal, addrVal);
   }
   assert(builder.getInsertionBlock() && "Should be valid");
+}
+
+void CIRGenFunction::startFunction(GlobalDecl gd, QualType returnType,
+                                   cir::FuncOp fn, cir::FuncType funcType,
+                                   FunctionArgList args, SourceLocation loc,
+                                   SourceLocation startLoc) {
+  assert(!curFn &&
+         "CIRGenFunction can only be used for one function at a time");
+
+  curFn = fn;
+
+  const Decl *d = gd.getDecl();
+
+  didCallStackSave = false;
+  curCodeDecl = d;
+  const auto *fd = dyn_cast_or_null<FunctionDecl>(d);
+  curFuncDecl = d->getNonClosureContext();
+
+  prologueCleanupDepth = ehStack.stable_begin();
+
+  mlir::Block *entryBB = &fn.getBlocks().front();
+  builder.setInsertionPointToStart(entryBB);
+
+  // Determine the function body begin location for the prolog.
+  // If fd is null or has no body, use startLoc as fallback.
+  SourceLocation bodyBeginLoc = startLoc;
+  if (fd) {
+    if (Stmt *body = fd->getBody())
+      bodyBeginLoc = body->getBeginLoc();
+    else
+      bodyBeginLoc = fd->getLocation();
+  }
+
+  emitFunctionProlog(args, entryBB, fd, bodyBeginLoc);
 
   // When the current function is not void, create an address to store the
   // result value.
-  if (!returnType->isVoidType())
-    emitAndUpdateRetAlloca(returnType, getLoc(fd->getBody()->getEndLoc()),
+  if (!returnType->isVoidType()) {
+    // Determine the function body end location.
+    // If fd is null or has no body, use loc as fallback.
+    SourceLocation bodyEndLoc = loc;
+    if (fd) {
+      if (Stmt *body = fd->getBody())
+        bodyEndLoc = body->getEndLoc();
+      else
+        bodyEndLoc = fd->getLocation();
+    }
+    emitAndUpdateRetAlloca(returnType, getLoc(bodyEndLoc),
                            getContext().getTypeAlignInChars(returnType));
+  }
 
   if (isa_and_nonnull<CXXMethodDecl>(d) &&
       cast<CXXMethodDecl>(d)->isInstance()) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 0df812bcfb94e..15322ee72a1b0 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -954,6 +954,11 @@ class CIRGenFunction : public CIRGenTypeCache {
   clang::QualType buildFunctionArgList(clang::GlobalDecl gd,
                                        FunctionArgList &args);
 
+  /// Emit the function prologue: declare function arguments in the symbol
+  /// table.
+  void emitFunctionProlog(const FunctionArgList &args, mlir::Block *entryBB,
+                          const FunctionDecl *fd, SourceLocation bodyBeginLoc);
+
   /// Emit code for the start of a function.
   /// \param loc       The location to be associated with the function.
   /// \param startLoc  The location of the function body.
diff --git a/clang/test/CIR/CodeGen/kr-func-promote.c b/clang/test/CIR/CodeGen/kr-func-promote.c
new file mode 100644
index 0000000000000..5fed068a30398
--- /dev/null
+++ b/clang/test/CIR/CodeGen/kr-func-promote.c
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c89 -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c89 -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c89 -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG
+
+// CIR: cir.func {{.*}}@foo(%arg0: !s32i
+// CIR:   %0 = cir.alloca !s16i, !cir.ptr<!s16i>, ["x", init]
+// CIR:   %1 = cir.cast integral %arg0 : !s32i -> !s16i
+// CIR:   cir.store %1, %0 : !s16i, !cir.ptr<!s16i>
+// expected-warning@+1 {{a function definition without a prototype is deprecated}}
+void foo(x) short x; {}
+
+// LLVM: define{{.*}} void @foo(i32 %0)
+// LLVM:   %[[X_PTR:.*]] = alloca i16, i64 1, align 2
+// LLVM:   %[[X:.*]] = trunc i32 %0 to i16
+// LLVM:   store i16 %[[X]], ptr %[[X_PTR]], align 2
+
+// OGCG: define{{.*}} void @foo(i32 noundef %0)
+// OGCG: entry:
+// OGCG:   %[[X_PTR:.*]] = alloca i16, align 2
+// OGCG:   %[[X:.*]] = trunc i32 %0 to i16
+// OGCG:   store i16 %[[X]], ptr %[[X_PTR]], align 2
+
+// CIR: cir.func{{.*}}no_proto dso_local @bar(%arg0: !cir.double
+// CIR:   %0 = cir.alloca !cir.float, !cir.ptr<!cir.float>, ["f", init]
+// CIR:   %1 = cir.cast floating %arg0 : !cir.double -> !cir.float
+// CIR:   cir.store %1, %0 : !cir.float, !cir.ptr<!cir.float>
+// expected-warning@+1 {{a function definition without a prototype is deprecated}}
+void bar(f) float f; {}
+
+// LLVM: define{{.*}} void @bar(double %0)
+// LLVM:   %[[F_PTR:.*]] = alloca float, i64 1, align 4
+// LLVM:   %[[F:.*]] = fptrunc double %0 to float
+// LLVM:   store float %[[F]], ptr %[[F_PTR]], align 4
+
+// OGCG: define{{.*}} void @bar(double noundef %0)
+// OGCG: entry:
+// OGCG:   %[[F_PTR:.*]] = alloca float, align 4
+// OGCG:   %[[F:.*]] = fptrunc double %0 to float
+// OGCG:   store float %[[F]], ptr %[[F_PTR]], align 4

From 0a2e56df64c936bacc746aeb94878d66ee00dec3 Mon Sep 17 00:00:00 2001
From: adams381 <adams@nvidia.com>
Date: Tue, 9 Dec 2025 17:11:01 -0600
Subject: [PATCH 68/85] [CIR] Add support for thread-local storage (TLS)
 (#168662)

This commit adds full support for thread-local storage variables in
ClangIR, including code generation, lowering to LLVM IR, and
comprehensive testing.

Changes include:
- Added CIR_TLSModel enum with 4 TLS models (GeneralDynamic,
LocalDynamic, InitialExec, LocalExec) to CIROps.td
- Extended GlobalOp with optional tls_model attribute
- Extended GetGlobalOp with thread_local unit attribute
- Added verification to ensure thread_local GetGlobalOp references
globals with tls_model set
- Implemented GetDefaultCIRTLSModel() and setTLSMode() in CIRGenModule
- Updated getAddrOfGlobalVar() to handle TLS access
- Removed MissingFeatures assertions for TLS operations
- Added lowering of GetGlobalOp with TLS to llvm.threadlocal.address
intrinsic
- Added lowering of GlobalOp with tls_model to LLVM thread_local globals
- Added comprehensive test with CIR, LLVM, and OGCG checks

Known limitations (matching incubator):
- Static local TLS variables not yet implemented
- TLS_Dynamic with wrapper functions not yet implemented

Fixes #153270
---
 .../CIR/Dialect/Builder/CIRBaseBuilder.h      | 12 +++---
 clang/include/clang/CIR/Dialect/IR/CIROps.td  | 17 +++++++-
 clang/lib/CIR/CodeGen/CIRGenDecl.cpp          |  3 +-
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp          |  5 +--
 clang/lib/CIR/CodeGen/CIRGenModule.cpp        | 39 ++++++++++++++++---
 clang/lib/CIR/CodeGen/CIRGenModule.h          |  7 ++++
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp       |  5 ++-
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 12 +++---
 clang/test/CIR/CodeGen/tls.c                  | 29 ++++++++++++++
 clang/test/CIR/IR/invalid-tls.cir             | 13 +++++++
 10 files changed, 121 insertions(+), 21 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/tls.c
 create mode 100644 clang/test/CIR/IR/invalid-tls.cir

diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index aa47c4bce189b..7ee3785ef74f0 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -308,14 +308,16 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
     return cir::GlobalViewAttr::get(type, symbol, indices);
   }
 
-  mlir::Value createGetGlobal(mlir::Location loc, cir::GlobalOp global) {
+  mlir::Value createGetGlobal(mlir::Location loc, cir::GlobalOp global,
+                              bool threadLocal = false) {
     assert(!cir::MissingFeatures::addressSpace());
-    return cir::GetGlobalOp::create(
-        *this, loc, getPointerTo(global.getSymType()), global.getSymName());
+    return cir::GetGlobalOp::create(*this, loc,
+                                    getPointerTo(global.getSymType()),
+                                    global.getSymNameAttr(), threadLocal);
   }
 
-  mlir::Value createGetGlobal(cir::GlobalOp global) {
-    return createGetGlobal(global.getLoc(), global);
+  mlir::Value createGetGlobal(cir::GlobalOp global, bool threadLocal = false) {
+    return createGetGlobal(global.getLoc(), global, threadLocal);
   }
 
   /// Create a copy with inferred length.
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 9bd24cf0bcf27..04648bee848f1 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -2078,6 +2078,13 @@ def CIR_GlobalLinkageKind : CIR_I32EnumAttr<
 // properties of a global variable will be added over time as more of ClangIR
 // is upstreamed.
 
+def CIR_TLSModel : CIR_I32EnumAttr<"TLS_Model", "TLS model", [
+  I32EnumAttrCase<"GeneralDynamic", 0, "tls_dyn">,
+  I32EnumAttrCase<"LocalDynamic", 1, "tls_local_dyn">,
+  I32EnumAttrCase<"InitialExec", 2, "tls_init_exec">,
+  I32EnumAttrCase<"LocalExec", 3, "tls_local_exec">
+]>;
+
 def CIR_GlobalOp : CIR_Op<"global", [
   DeclareOpInterfaceMethods<RegionBranchOpInterface>,
   DeclareOpInterfaceMethods<CIRGlobalValueInterface>,
@@ -2106,6 +2113,7 @@ def CIR_GlobalOp : CIR_Op<"global", [
                        OptionalAttr<StrAttr>:$sym_visibility,
                        TypeAttr:$sym_type,
                        CIR_GlobalLinkageKind:$linkage,
+                       OptionalAttr<CIR_TLSModel>:$tls_model,
                        OptionalAttr<AnyAttr>:$initial_value,
                        UnitAttr:$comdat,
                        UnitAttr:$constant,
@@ -2121,6 +2129,7 @@ def CIR_GlobalOp : CIR_Op<"global", [
     (`constant` $constant^)?
     $linkage
     (`comdat` $comdat^)?
+    ($tls_model^)?
     (`dso_local` $dso_local^)?
     $sym_name
     custom<GlobalOpTypeAndInitialValue>($sym_type, $initial_value,
@@ -2184,16 +2193,22 @@ def CIR_GetGlobalOp : CIR_Op<"get_global", [
     undefined. The resulting type must always be a `!cir.ptr<...>` type with the
     same address space as the global variable.
 
+    Addresses of thread local globals can only be retrieved if this operation
+    is marked `thread_local`, which indicates the address isn't constant.
+
     Example:
     ```mlir
     %x = cir.get_global @gv : !cir.ptr<i32>
+    ...
+    %y = cir.get_global thread_local @tls_gv : !cir.ptr<i32>
     ```
   }];
 
-  let arguments = (ins FlatSymbolRefAttr:$name);
+  let arguments = (ins FlatSymbolRefAttr:$name, UnitAttr:$tls);
   let results = (outs Res<CIR_PointerType, "", []>:$addr);
 
   let assemblyFormat = [{
+    (`thread_local` $tls^)?
     $name `:` qualified(type($addr)) attr-dict
   }];
 }
diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
index 948245ceab2cd..12b153af36c3e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp
@@ -454,7 +454,8 @@ CIRGenModule::getOrCreateStaticVarDecl(const VarDecl &d,
   if (supportsCOMDAT() && gv.isWeakForLinker())
     gv.setComdat(true);
 
-  assert(!cir::MissingFeatures::opGlobalThreadLocal());
+  if (d.getTLSKind())
+    errorNYI(d.getSourceRange(), "getOrCreateStaticVarDecl: TLS");
 
   setGVProperties(gv, &d);
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index 5d509e37f4621..cac046c41e30d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -282,7 +282,6 @@ static LValue emitGlobalVarDeclLValue(CIRGenFunction &cgf, const Expr *e,
   QualType t = e->getType();
 
   // If it's thread_local, emit a call to its wrapper function instead.
-  assert(!cir::MissingFeatures::opGlobalThreadLocal());
   if (vd->getTLSKind() == VarDecl::TLS_Dynamic)
     cgf.cgm.errorNYI(e->getSourceRange(),
                      "emitGlobalVarDeclLValue: thread_local variable");
@@ -318,7 +317,6 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr,
                                        bool isVolatile, QualType ty,
                                        LValueBaseInfo baseInfo, bool isInit,
                                        bool isNontemporal) {
-  assert(!cir::MissingFeatures::opLoadStoreThreadLocal());
 
   if (const auto *clangVecTy = ty->getAs<clang::VectorType>()) {
     // Boolean vectors use `iN` as storage type.
@@ -569,7 +567,8 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, LValue lvalue,
 mlir::Value CIRGenFunction::emitLoadOfScalar(Address addr, bool isVolatile,
                                              QualType ty, SourceLocation loc,
                                              LValueBaseInfo baseInfo) {
-  assert(!cir::MissingFeatures::opLoadStoreThreadLocal());
+  // Traditional LLVM codegen handles thread local separately, CIR handles
+  // as part of getAddrOfGlobalVar (GetGlobalOp).
   mlir::Type eltTy = addr.getElementType();
 
   if (const auto *clangVecTy = ty->getAs<clang::VectorType>()) {
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 41a5d9db83e2b..eaa9e946e243d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -682,8 +682,11 @@ CIRGenModule::getOrCreateCIRGlobal(StringRef mangledName, mlir::Type ty,
 
     setLinkageForGV(gv, d);
 
-    if (d->getTLSKind())
-      errorNYI(d->getSourceRange(), "thread local global variable");
+    if (d->getTLSKind()) {
+      if (d->getTLSKind() == VarDecl::TLS_Dynamic)
+        errorNYI(d->getSourceRange(), "TLS dynamic");
+      setTLSMode(gv, *d);
+    }
 
     setGVProperties(gv, d);
 
@@ -738,12 +741,11 @@ mlir::Value CIRGenModule::getAddrOfGlobalVar(const VarDecl *d, mlir::Type ty,
   if (!ty)
     ty = getTypes().convertTypeForMem(astTy);
 
-  assert(!cir::MissingFeatures::opGlobalThreadLocal());
-
+  bool tlsAccess = d->getTLSKind() != VarDecl::TLS_None;
   cir::GlobalOp g = getOrCreateCIRGlobal(d, ty, isForDefinition);
   mlir::Type ptrTy = builder.getPointerTo(g.getSymType());
   return cir::GetGlobalOp::create(builder, getLoc(d->getSourceRange()), ptrTy,
-                                  g.getSymName());
+                                  g.getSymNameAttr(), tlsAccess);
 }
 
 cir::GlobalViewAttr CIRGenModule::getAddrOfGlobalVarAttr(const VarDecl *d) {
@@ -1953,6 +1955,33 @@ void CIRGenModule::setGVPropertiesAux(mlir::Operation *op,
   assert(!cir::MissingFeatures::opGlobalPartition());
 }
 
+cir::TLS_Model CIRGenModule::getDefaultCIRTLSModel() const {
+  switch (getCodeGenOpts().getDefaultTLSModel()) {
+  case CodeGenOptions::GeneralDynamicTLSModel:
+    return cir::TLS_Model::GeneralDynamic;
+  case CodeGenOptions::LocalDynamicTLSModel:
+    return cir::TLS_Model::LocalDynamic;
+  case CodeGenOptions::InitialExecTLSModel:
+    return cir::TLS_Model::InitialExec;
+  case CodeGenOptions::LocalExecTLSModel:
+    return cir::TLS_Model::LocalExec;
+  }
+  llvm_unreachable("Invalid TLS model!");
+}
+
+void CIRGenModule::setTLSMode(mlir::Operation *op, const VarDecl &d) {
+  assert(d.getTLSKind() && "setting TLS mode on non-TLS var!");
+
+  cir::TLS_Model tlm = getDefaultCIRTLSModel();
+
+  // Override the TLS model if it is explicitly specified.
+  if (d.getAttr<TLSModelAttr>())
+    errorNYI(d.getSourceRange(), "TLS model attribute");
+
+  auto global = cast<cir::GlobalOp>(op);
+  global.setTlsModel(tlm);
+}
+
 void CIRGenModule::setFunctionAttributes(GlobalDecl globalDecl,
                                          cir::FuncOp func,
                                          bool isIncompleteFunction,
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 9c0961579718d..de263f4868507 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -447,6 +447,13 @@ class CIRGenModule : public CIRGenTypeCache {
   void setGVProperties(mlir::Operation *op, const NamedDecl *d) const;
   void setGVPropertiesAux(mlir::Operation *op, const NamedDecl *d) const;
 
+  /// Set TLS mode for the given operation based on the given variable
+  /// declaration.
+  void setTLSMode(mlir::Operation *op, const VarDecl &d);
+
+  /// Get TLS mode from CodeGenOptions.
+  cir::TLS_Model getDefaultCIRTLSModel() const;
+
   /// Set function attributes for a function declaration.
   void setFunctionAttributes(GlobalDecl gd, cir::FuncOp f,
                              bool isIncompleteFunction, bool isThunk);
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 38a2cecbb8617..0f546cb254db4 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -1737,7 +1737,10 @@ cir::GetGlobalOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
   if (auto g = dyn_cast<GlobalOp>(op)) {
     symTy = g.getSymType();
     assert(!cir::MissingFeatures::addressSpace());
-    assert(!cir::MissingFeatures::opGlobalThreadLocal());
+    // Verify that for thread local global access, the global needs to
+    // be marked with tls bits.
+    if (getTls() && !g.getTlsModel())
+      return emitOpError("access to global not marked thread local");
   } else if (auto f = dyn_cast<FuncOp>(op)) {
     symTy = f.getFunctionType();
   } else {
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 88ca8033b48ea..0ad3360e1357e 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -2058,7 +2058,11 @@ mlir::LogicalResult CIRToLLVMGetGlobalOpLowering::matchAndRewrite(
   mlir::Operation *newop = mlir::LLVM::AddressOfOp::create(
       rewriter, op.getLoc(), type, op.getName());
 
-  assert(!cir::MissingFeatures::opGlobalThreadLocal());
+  if (op.getTls()) {
+    // Handle access to TLS via intrinsic.
+    newop = mlir::LLVM::ThreadlocalAddressOp::create(rewriter, op.getLoc(),
+                                                     type, newop->getResult(0));
+  }
 
   rewriter.replaceOp(op, newop);
   return mlir::success();
@@ -2079,8 +2083,7 @@ void CIRToLLVMGlobalOpLowering::setupRegionInitializedLLVMGlobalOp(
   assert(!cir::MissingFeatures::addressSpace());
   const unsigned addrSpace = 0;
   const bool isDsoLocal = op.getDsoLocal();
-  assert(!cir::MissingFeatures::opGlobalThreadLocal());
-  const bool isThreadLocal = false;
+  const bool isThreadLocal = (bool)op.getTlsModelAttr();
   const uint64_t alignment = op.getAlignment().value_or(0);
   const mlir::LLVM::Linkage linkage = convertLinkage(op.getLinkage());
   const StringRef symbol = op.getSymName();
@@ -2140,8 +2143,7 @@ mlir::LogicalResult CIRToLLVMGlobalOpLowering::matchAndRewrite(
   assert(!cir::MissingFeatures::addressSpace());
   const unsigned addrSpace = 0;
   const bool isDsoLocal = op.getDsoLocal();
-  assert(!cir::MissingFeatures::opGlobalThreadLocal());
-  const bool isThreadLocal = false;
+  const bool isThreadLocal = (bool)op.getTlsModelAttr();
   const uint64_t alignment = op.getAlignment().value_or(0);
   const mlir::LLVM::Linkage linkage = convertLinkage(op.getLinkage());
   const StringRef symbol = op.getSymName();
diff --git a/clang/test/CIR/CodeGen/tls.c b/clang/test/CIR/CodeGen/tls.c
new file mode 100644
index 0000000000000..582a716f0fa73
--- /dev/null
+++ b/clang/test/CIR/CodeGen/tls.c
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ogcg.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ogcg.ll %s
+
+extern __thread int b;
+// CIR: cir.global "private" external tls_dyn @b : !s32i
+
+__thread int a;
+// CIR: cir.global external tls_dyn @a = #cir.int<0> : !s32i
+
+int c(void) { return *&b; }
+// CIR: cir.func no_inline dso_local @c() -> !s32i
+// CIR:   %[[TLS_ADDR:.*]] = cir.get_global thread_local @b : !cir.ptr<!s32i>
+
+// LLVM: @b = external thread_local global i32
+// LLVM: @a = thread_local global i32 0
+
+// LLVM-LABEL: @c
+// LLVM: = call ptr @llvm.threadlocal.address.p0(ptr @b)
+
+// OGCG: @b = external thread_local{{.*}} global i32
+// OGCG: @a = thread_local{{.*}} global i32 0
+
+// OGCG-LABEL: define{{.*}} @c
+// OGCG: call{{.*}} ptr @llvm.threadlocal.address.p0(ptr{{.*}} @b)
+
diff --git a/clang/test/CIR/IR/invalid-tls.cir b/clang/test/CIR/IR/invalid-tls.cir
new file mode 100644
index 0000000000000..36df7fdb1e619
--- /dev/null
+++ b/clang/test/CIR/IR/invalid-tls.cir
@@ -0,0 +1,13 @@
+// RUN: cir-opt %s -verify-diagnostics -split-input-file
+
+!s32i = !cir.int<s, 32>
+
+module {
+  cir.global "private" external @non_tls : !s32i
+  cir.func @error() {
+    // expected-error@+1 {{access to global not marked thread local}}
+    %0 = cir.get_global thread_local @non_tls : !cir.ptr<!s32i>
+    cir.return
+  }
+}
+

From 27651133e213a6a1eb4d0e47837625cee3613111 Mon Sep 17 00:00:00 2001
From: anjenner <161845516+anjenner@users.noreply.github.com>
Date: Tue, 9 Dec 2025 23:13:33 +0000
Subject: [PATCH 69/85] AMDGPU: Drop and upgrade
 llvm.amdgcn.atomic.csub/cond.sub to atomicrmw (#105553)

These both perform conditional subtraction, returning the minuend and
zero respectively, if the difference is negative.
---
 llvm/docs/AMDGPUUsage.rst                     |   5 -
 llvm/docs/ReleaseNotes.md                     |   4 +
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   4 -
 llvm/lib/IR/AutoUpgrade.cpp                   |  11 +-
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td  |   4 -
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |   2 -
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |   2 -
 llvm/lib/Target/AMDGPU/BUFInstructions.td     |  54 ++--
 llvm/lib/Target/AMDGPU/DSInstructions.td      |   9 -
 llvm/lib/Target/AMDGPU/FLATInstructions.td    |  11 -
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  15 +-
 .../UniformityAnalysis/AMDGPU/atomics.ll      |  57 ----
 llvm/test/Bitcode/amdgcn-atomic.ll            | 147 ++++++++++
 .../llvm.amdgcn.global.atomic.csub.ll         | 270 ------------------
 llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll  | 236 ++++++++-------
 .../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll     | 219 --------------
 16 files changed, 317 insertions(+), 733 deletions(-)
 delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
 delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 3e7a5dfc504ae..7ecf1c1124894 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1527,11 +1527,6 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
 
                                                    The iglp_opt strategy implementations are subject to change.
 
-  llvm.amdgcn.atomic.cond.sub.u32                  Provides direct access to flat_atomic_cond_sub_u32, global_atomic_cond_sub_u32
-                                                   and ds_cond_sub_u32 based on address space on gfx12 targets. This
-                                                   performs a subtraction only if the memory value is greater than or
-                                                   equal to the data value.
-
   llvm.amdgcn.s.barrier.signal.isfirst             Provides access to the s_barrier_signal_first instruction;
                                                    additionally ensures that the result value is valid even when the
                                                    intrinsic is used from a wave that is not running in a workgroup.
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 8ec46c661974b..1b85145efbf4a 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -114,6 +114,10 @@ Changes to the AArch64 Backend
 Changes to the AMDGPU Backend
 -----------------------------
 
+* Removed `llvm.amdgcn.atomic.cond.sub.u32` and
+  `llvm.amdgcn.atomic.csub.u32` intrinsics. Users should use the
+  `atomicrmw` instruction with `usub_cond` and `usub_sat` instead.
+
 Changes to the ARM Backend
 --------------------------
 
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 03488f8389aa2..64d3dd6c3b701 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2888,8 +2888,6 @@ class AMDGPUAtomicRtn<LLVMType vt, LLVMType pt = llvm_anyptr_ty> : Intrinsic <
   [IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "",
   [SDNPMemOperand]>;
 
-def int_amdgcn_global_atomic_csub : AMDGPUAtomicRtn<llvm_i32_ty>;
-
 // uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>,
 //                                           <ray_dir>, <ray_inv_dir>, <texture_descr>
 // <node_ptr> is i32 or i64.
@@ -3137,8 +3135,6 @@ def int_amdgcn_flat_atomic_fmax_num   : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
 def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
 def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
 
-def int_amdgcn_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty>;
-
 class AMDGPULoadIntrinsic<LLVMType ptr_ty>:
   Intrinsic<
     [llvm_any_ty],
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index e67f1ecd96bb1..2202b08e3cf0d 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1274,9 +1274,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
       }
 
       if (Name.consume_front("atomic.")) {
-        if (Name.starts_with("inc") || Name.starts_with("dec")) {
-          // These were replaced with atomicrmw uinc_wrap and udec_wrap, so
-          // there's no new declaration.
+        if (Name.starts_with("inc") || Name.starts_with("dec") ||
+            Name.starts_with("cond.sub") || Name.starts_with("csub")) {
+          // These were replaced with atomicrmw uinc_wrap, udec_wrap, usub_cond
+          // and usub_sat so there's no new declaration.
           NewFn = nullptr;
           return true;
         }
@@ -4606,7 +4607,9 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
           .StartsWith("global.atomic.fmin", AtomicRMWInst::FMin)
           .StartsWith("flat.atomic.fmin", AtomicRMWInst::FMin)
           .StartsWith("global.atomic.fmax", AtomicRMWInst::FMax)
-          .StartsWith("flat.atomic.fmax", AtomicRMWInst::FMax);
+          .StartsWith("flat.atomic.fmax", AtomicRMWInst::FMax)
+          .StartsWith("atomic.cond.sub", AtomicRMWInst::USubCond)
+          .StartsWith("atomic.csub", AtomicRMWInst::USubSat);
 
   unsigned NumOperands = CI->getNumOperands();
   if (NumOperands < 3) // Malformed bitcode.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index dd86cb5d3d5a6..2a99dacba52a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -636,15 +636,11 @@ multiclass local_addr_space_atomic_op {
     }
 }
 
-defm int_amdgcn_global_atomic_csub : noret_op;
 defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
 defm int_amdgcn_flat_atomic_fmin_num : noret_op;
 defm int_amdgcn_flat_atomic_fmax_num : noret_op;
 defm int_amdgcn_global_atomic_fmin_num : noret_op;
 defm int_amdgcn_global_atomic_fmax_num : noret_op;
-defm int_amdgcn_atomic_cond_sub_u32 : local_addr_space_atomic_op;
-defm int_amdgcn_atomic_cond_sub_u32 : flat_addr_space_atomic_op;
-defm int_amdgcn_atomic_cond_sub_u32 : global_addr_space_atomic_op;
 
 multiclass noret_binary_atomic_op<SDNode atomic_op> {
   let HasNoUse = true in
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index a7955ee2dac40..ce4cc799543f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5309,12 +5309,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
       break;
     }
-    case Intrinsic::amdgcn_global_atomic_csub:
     case Intrinsic::amdgcn_global_atomic_fmin_num:
     case Intrinsic::amdgcn_global_atomic_fmax_num:
     case Intrinsic::amdgcn_flat_atomic_fmin_num:
     case Intrinsic::amdgcn_flat_atomic_fmax_num:
-    case Intrinsic::amdgcn_atomic_cond_sub_u32:
     case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
     case Intrinsic::amdgcn_global_load_tr_b64:
     case Intrinsic::amdgcn_global_load_tr_b128:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index fe452f008c95c..58a9b5511f2d0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -237,8 +237,6 @@ def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
 def : SourceOfDivergence<int_r600_read_tidig_x>;
 def : SourceOfDivergence<int_r600_read_tidig_y>;
 def : SourceOfDivergence<int_r600_read_tidig_z>;
-def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>;
-def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>;
 def : SourceOfDivergence<int_amdgcn_global_atomic_ordered_add_b64>;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index b97b7385dc1ff..bb0e9380e956d 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -783,37 +783,20 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
 
 multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
                                      RegisterOperand vdataClass,
-                                     ValueType vdataType,
-                                     SDPatternOperator atomic> {
+                                     ValueType vdataType> {
   let FPAtomic = vdataType.isFP in {
-    def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0,
-      [(set vdataType:$vdata,
-       (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset),
-               vdataType:$vdata_in))]>,
-      MUBUFAddr64Table <0, NAME # "_RTN">;
-
-    def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0,
-      [(set vdataType:$vdata,
-       (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset),
-                vdataType:$vdata_in))]>,
-      MUBUFAddr64Table <1, NAME # "_RTN">;
-
+    def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0>,
+                      MUBUFAddr64Table <0, NAME # "_RTN">;
+    def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0>,
+                      MUBUFAddr64Table <1, NAME # "_RTN">;
     def _OFFEN_RTN  : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn,  vdataClass, 0>;
     def _IDXEN_RTN  : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn,  vdataClass, 0>;
     def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, 0>;
 
-    def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1,
-      [(set vdataType:$vdata,
-       (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset),
-               vdataType:$vdata_in))]>,
-      MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">;
-
-    def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1,
-      [(set vdataType:$vdata,
-       (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset),
-                vdataType:$vdata_in))]>,
-      MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">;
-
+    def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1>,
+                      MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">;
+    def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1>,
+                      MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">;
     def _VBUFFER_OFFEN_RTN  : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.OffEn,  vdataClass, 1>;
     def _VBUFFER_IDXEN_RTN  : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.IdxEn,  vdataClass, 1>;
     def _VBUFFER_BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.BothEn, vdataClass, 1>;
@@ -822,10 +805,9 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
 
 multiclass MUBUF_Pseudo_Atomics <string opName,
                                  RegisterOperand vdataClass,
-                                 ValueType vdataType,
-                                 SDPatternOperator atomic = null_frag> :
+                                 ValueType vdataType> :
   MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType>,
-  MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType, atomic>;
+  MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType>;
 
 
 //===----------------------------------------------------------------------===//
@@ -1096,7 +1078,7 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
 
 let OtherPredicates = [HasGFX10_BEncoding] in {
   defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics <
-    "buffer_atomic_csub", VGPROp_32, i32, int_amdgcn_global_atomic_csub
+    "buffer_atomic_csub", VGPROp_32, i32
   >;
 }
 
@@ -1117,22 +1099,22 @@ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc",
 let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
 
 defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_fcmpswap", AVLdSt_64, v2f32, null_frag
+  "buffer_atomic_fcmpswap", AVLdSt_64, v2f32
 >;
 }
 
 let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in {
 defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_fmin", AVLdSt_32, f32, null_frag
+  "buffer_atomic_fmin", AVLdSt_32, f32
 >;
 defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_fmax", AVLdSt_32, f32, null_frag
+  "buffer_atomic_fmax", AVLdSt_32, f32
 >;
 }
 
 let SubtargetPredicate = isGFX6GFX7GFX10 in {
 defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <
-  "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64, null_frag
+  "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64
 >;
 }
 
@@ -1201,12 +1183,12 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
 
 let SubtargetPredicate = HasAtomicFaddRtnInsts in
 defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN<
-  "buffer_atomic_add_f32", AVLdSt_32, f32, null_frag
+  "buffer_atomic_add_f32", AVLdSt_32, f32
 >;
 
 let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in
 defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
-  "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16, null_frag
+  "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16
 >;
 
 let SubtargetPredicate = isGFX12Plus in {
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 4b3bd0c09e076..3a53cef96473c 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -886,15 +886,6 @@ defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc_gfx9<"ds_sub_clamp_rtn_u32", VGPROp_3
 def DS_BPERMUTE_FI_B32    : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32",
                                              int_amdgcn_ds_bpermute_fi_b32>;
 
-multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
-                                  ValueType vt, string frag> {
-  def : DSAtomicRetPat<inst, vt,
-                        !cast<PatFrag>(frag#"_local_addrspace")>;
-  def : DSAtomicRetPat<noRetInst, vt,
-                        !cast<PatFrag>(frag#"_noret_local_addrspace"), /* complexity */ 1>;
-}
-
-defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">;
 } // let SubtargetPredicate = isGFX12Plus
 
 let SubtargetPredicate = isGFX1250Plus in {
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 0e60c73aa5db7..9e38af91c7ccf 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1562,10 +1562,6 @@ multiclass FlatAtomicNoRtnPatBase <string base_inst_name, string node, ValueType
   }
 }
 
-multiclass FlatAtomicNoRtnPatWithAddrSpace<string inst, string node, string addrSpaceSuffix,
-                                           ValueType vt> :
-  FlatAtomicNoRtnPatBase<inst, node # "_noret_" # addrSpaceSuffix, vt, vt>;
-
 multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
                           ValueType data_vt = vt, bit isIntr = 0> :
   FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt), vt, data_vt>;
@@ -1590,10 +1586,6 @@ multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt,
   }
 }
 
-multiclass FlatAtomicRtnPatWithAddrSpace<string inst, string intr, string addrSpaceSuffix,
-                                         ValueType vt> :
-  FlatAtomicRtnPatBase<inst, intr # "_" # addrSpaceSuffix, vt, vt>;
-
 multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt,
                              ValueType data_vt = vt, bit isIntr = 0> :
   FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt), vt, data_vt>;
@@ -2189,9 +2181,6 @@ let SubtargetPredicate = HasAtomicCondSubClampFlatInsts in {
 defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
 defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>;
 
-defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
-defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
-
 let OtherPredicates = [HasD16LoadStore] in {
 defm : FlatStorePats <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
 defm : FlatStorePats <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a871d978dfbc8..0f91b319b16d4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1522,15 +1522,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
     return true;
   }
-  case Intrinsic::amdgcn_global_atomic_csub: {
-    Info.opc = ISD::INTRINSIC_W_CHAIN;
-    Info.memVT = MVT::getVT(CI.getType());
-    Info.ptrVal = CI.getOperand(0);
-    Info.align.reset();
-    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
-                  MachineMemOperand::MOVolatile;
-    return true;
-  }
   case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
   case Intrinsic::amdgcn_image_bvh_intersect_ray:
   case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
@@ -1551,8 +1542,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   case Intrinsic::amdgcn_global_atomic_fmax_num:
   case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
   case Intrinsic::amdgcn_flat_atomic_fmin_num:
-  case Intrinsic::amdgcn_flat_atomic_fmax_num:
-  case Intrinsic::amdgcn_atomic_cond_sub_u32: {
+  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::getVT(CI.getType());
     Info.ptrVal = CI.getOperand(0);
@@ -1727,7 +1717,6 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
                                             Type *&AccessTy) const {
   Value *Ptr = nullptr;
   switch (II->getIntrinsicID()) {
-  case Intrinsic::amdgcn_atomic_cond_sub_u32:
   case Intrinsic::amdgcn_cluster_load_b128:
   case Intrinsic::amdgcn_cluster_load_b64:
   case Intrinsic::amdgcn_cluster_load_b32:
@@ -1750,7 +1739,6 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
   case Intrinsic::amdgcn_flat_load_monitor_b128:
   case Intrinsic::amdgcn_flat_load_monitor_b32:
   case Intrinsic::amdgcn_flat_load_monitor_b64:
-  case Intrinsic::amdgcn_global_atomic_csub:
   case Intrinsic::amdgcn_global_atomic_fmax_num:
   case Intrinsic::amdgcn_global_atomic_fmin_num:
   case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
@@ -18390,7 +18378,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
   case AMDGPUISD::BUFFER_ATOMIC_INC:
   case AMDGPUISD::BUFFER_ATOMIC_DEC:
   case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
-  case AMDGPUISD::BUFFER_ATOMIC_CSUB:
   case AMDGPUISD::BUFFER_ATOMIC_FADD:
   case AMDGPUISD::BUFFER_ATOMIC_FMIN:
   case AMDGPUISD::BUFFER_ATOMIC_FMAX:
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
index 15355ea139205..d9e51c39c2042 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
@@ -15,62 +15,5 @@ define amdgpu_kernel void @test2(ptr %ptr, i32 %cmp, i32 %new) {
   ret void
 }
 
-; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val)
-define amdgpu_kernel void @test_atomic_csub_i32(ptr addrspace(1) %ptr, i32 %val) #0 {
-  %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val)
-  store i32 %ret, ptr addrspace(1) %ptr, align 4
-  ret void
-}
-
-; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
-define amdgpu_kernel void @test_ds_atomic_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) #0 {
-entry:
-  %gep = getelementptr i32, ptr addrspace(3) %addr, i32 4
-  %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
-  store i32 %val, ptr addrspace(3) %use
-  ret void
-}
-
-; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
-define amdgpu_kernel void @test_flat_atomic_cond_sub_u32(ptr %addr, i32 %in, ptr %use) #0 {
-entry:
-  %gep = getelementptr i32, ptr %addr, i32 4
-  %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
-  store i32 %val, ptr %use
-  ret void
-}
-
-; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
-define amdgpu_kernel void @test_global_atomic_cond_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) #0 {
-entry:
-  %gep = getelementptr i32, ptr addrspace(1) %addr, i32 4
-  %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
-  store i32 %val, ptr addrspace(1) %use
-  ret void
-}
-
-; CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
-define float @test_raw_buffer_atomic_cond_sub_u32(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-entry:
-  %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
-  %r = bitcast i32 %orig to float
-  ret float %r
-}
-
-; CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
-define float @test_struct_buffer_atomic_cond_sub_u32(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-entry:
-  %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
-  %r = bitcast i32 %orig to float
-  ret float %r
-}
-
-declare i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) nocapture, i32) #1
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3), i32) #1
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32) #1
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32) #1
-declare i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32) #1
-declare i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32, i32) #1
-
 attributes #0 = { nounwind }
 attributes #1 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll
index 3e28cd050fc88..e9194ea1e14fe 100644
--- a/llvm/test/Bitcode/amdgcn-atomic.ll
+++ b/llvm/test/Bitcode/amdgcn-atomic.ll
@@ -420,5 +420,152 @@ define double @upgrade_amdgcn_global_atomic_fmax_f64_p1_f64(ptr addrspace(1) %pt
 
 attributes #0 = { argmemonly nounwind willreturn }
 
+define void @atomic_usub_cond(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
+  ; CHECK: atomicrmw usub_cond ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+    %result0 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false)
+
+  ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+    %result1 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false)
+
+  ; CHECK: atomicrmw usub_cond ptr addrspace(3) %ptr3, i32 46 syncscope("agent") seq_cst, align 4
+    %result2 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p3(ptr addrspace(3) %ptr3, i32 46, i32 0, i32 0, i1 false)
+
+  ; CHECK: atomicrmw usub_cond ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+    %result3 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 0, i1 false)
+
+  ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+    %result4 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 0, i64 0, i1 false)
+
+  ; CHECK: atomicrmw usub_cond ptr addrspace(3) %ptr3, i64 46 syncscope("agent") seq_cst, align 8
+    %result5 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p3(ptr addrspace(3) %ptr3, i64 46, i64 0, i64 0, i1 false)
+  ret void
+}
+
+define void @atomic_usub_sat(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
+  ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+    %result0 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false)
+
+  ; CHECK: atomicrmw usub_sat ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+    %result1 = call i32 @llvm.amdgcn.atomic.csub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false)
+
+  ; CHECK: atomicrmw usub_sat ptr addrspace(3) %ptr3, i32 46 syncscope("agent") seq_cst, align 4
+    %result2 = call i32 @llvm.amdgcn.atomic.csub.i32.p3(ptr addrspace(3) %ptr3, i32 46, i32 0, i32 0, i1 false)
+
+  ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+    %result3 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 0, i1 false)
+
+  ; CHECK: atomicrmw usub_sat ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+    %result4 = call i64 @llvm.amdgcn.atomic.csub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 0, i64 0, i1 false)
+
+  ; CHECK: atomicrmw usub_sat ptr addrspace(3) %ptr3, i64 46 syncscope("agent") seq_cst, align 8
+    %result5 = call i64 @llvm.amdgcn.atomic.csub.i64.p3(ptr addrspace(3) %ptr3, i64 46, i64 0, i64 0, i1 false)
+  ret void
+}
+
+; Test some invalid ordering handling
+define void @ordering_usub_cond_usub_sat(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
+  ; CHECK: atomicrmw volatile usub_cond ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+    %result0 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p0(ptr %ptr0, i32 42, i32 -1, i32 0, i1 true)
+
+  ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+    %result1 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 true)
+
+  ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+    %result2 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 1, i32 0, i1 false)
+
+  ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") monotonic, align 4
+    %result3 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 2, i32 0, i1 true)
+
+  ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+    %result4 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 3, i32 0, i1 false)
+
+  ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+    %result5 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 4, i1 true)
+
+  ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+    %result6 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 5, i1 false)
+
+  ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+    %result7 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 6, i1 true)
+
+  ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+    %result8 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 7, i1 false)
+
+  ; CHECK:= atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+    %result9 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 8, i1 true)
+
+  ; CHECK:= atomicrmw volatile usub_sat ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+    %result10 = call i32 @llvm.amdgcn.atomic.csub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 3, i32 0, i1 true)
+
+  ; CHECK: atomicrmw volatile usub_cond ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+    %result11 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p0(ptr %ptr0, i64 42, i64 -1, i64 0, i1 true)
+
+  ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+    %result12 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 0, i64 0, i1 true)
+
+  ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+    %result13 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 1, i64 0, i1 false)
+
+  ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") monotonic, align 8
+    %result14 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 2, i64 0, i1 true)
+
+  ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+    %result15 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 3, i64 0, i1 false)
+
+  ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+    %result16 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 4, i1 true)
+
+  ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+    %result17 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 5, i1 false)
+
+  ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+    %result18 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 6, i1 true)
+
+  ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+    %result19 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 7, i1 false)
+
+  ; CHECK:= atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+    %result20 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 8, i1 true)
+
+  ; CHECK:= atomicrmw volatile usub_sat ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+    %result21 = call i64 @llvm.amdgcn.atomic.csub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 3, i64 0, i1 true)
+  ret void
+}
+
+define void @immarg_violations_usub_sat(ptr %ptr0, i32 %val32, i1 %val1, i64 %val64) {
+  ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+    %result0 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 %val32, i32 0, i1 false)
+
+  ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") monotonic, align 4
+    %result1 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 2, i32 %val32, i1 false)
+
+  ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") monotonic, align 4
+    %result2 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 2, i32 0, i1 %val1)
+
+  ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+    %result3 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 %val64, i64 0, i1 false)
+
+  ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") monotonic, align 8
+    %result4 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 2, i64 %val64, i1 false)
+
+  ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") monotonic, align 8
+    %result5 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 2, i64 0, i1 %val1)
+  ret void
+}
+
+declare i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.cond.sub.i32.p3(ptr addrspace(3) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.cond.sub.i32.p0(ptr nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.cond.sub.i64.p3(ptr addrspace(3) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.cond.sub.i64.p0(ptr nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+
+declare i32 @llvm.amdgcn.atomic.csub.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.csub.i32.p3(ptr addrspace(3) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.csub.i64.p1(ptr addrspace(1) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.csub.i64.p3(ptr addrspace(3) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+
 ; CHECK: !0 = !{i32 5, i32 6}
 ; CHECK: !1 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
deleted file mode 100644
index bff4771319454..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ /dev/null
@@ -1,270 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
-
-define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl1_inv
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl1_inv
-; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_wb scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
-  %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst, !amdgpu.no.remote.memory !0
-  ret i32 %ret
-}
-
-define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_offset:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl1_inv
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_offset:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl1_inv
-; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_offset:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_wb scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
-  %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0
-  ret i32 %ret
-}
-
-define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl1_inv
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_nortn:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl1_inv
-; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_nortn:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_wb scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
-  %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst, !amdgpu.no.remote.memory !0
-  ret void
-}
-
-define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_offset_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl1_inv
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_offset_nortn:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl1_inv
-; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_offset_nortn:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    global_wb scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
-  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
-  %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0
-  ret void
-}
-
-define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_sgpr_base_offset:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dword s2, s[8:9], 0x8
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0x1000
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    global_atomic_csub v0, v1, v0, s[0:1] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl1_inv
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    global_store_dword v[0:1], v0, off
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: global_atomic_csub_sgpr_base_offset:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT:    global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl1_inv
-; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: global_atomic_csub_sgpr_base_offset:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT:    global_wb scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    global_store_b32 v[0:1], v0, off
-; GFX12-NEXT:    s_endpgm
-  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
-  %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0
-  store i32 %ret, ptr addrspace(1) poison
-  ret void
-}
-
-define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x1
-; GFX10-NEXT:    s_load_dword s2, s[8:9], 0x8
-; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0x1000
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v0, s2
-; GFX10-NEXT:    global_atomic_csub v0, v1, v0, s[0:1] glc
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    buffer_gl1_inv
-; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    s_endpgm
-;
-; GFX11-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_clause 0x1
-; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT:    global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    buffer_gl1_inv
-; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    s_endpgm
-;
-; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT:    global_wb scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_storecnt 0x0
-; GFX12-NEXT:    global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_inv scope:SCOPE_SYS
-; GFX12-NEXT:    s_endpgm
-  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
-  %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0
-  ret void
-}
-
-attributes #0 = { nounwind willreturn }
-attributes #1 = { argmemonly nounwind }
-
-!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
index 4af2d58b01518..d281492c647f1 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
@@ -2,190 +2,222 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s
 
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3), i32)
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32)
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32)
-
-define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) {
-; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32:
+define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32(ptr %addr, i32 %in) {
+; GFX12-SDAG-LABEL: flat_atomic_usub_cond_no_rtn_u32:
 ; GFX12-SDAG:       ; %bb.0: ; %entry
 ; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], -16
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT:    flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
-; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32:
+; GFX12-GISEL-LABEL: flat_atomic_usub_cond_no_rtn_u32:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
 ; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, -16
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, -1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr inbounds i32, ptr %addr, i32 -4
-  %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
+  %gep = getelementptr i32, ptr %addr, i32 -4
+  %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
-; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced:
+define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
+; GFX12-SDAG-LABEL: flat_atomic_usub_cond_no_rtn_u32_forced:
 ; GFX12-SDAG:       ; %bb.0: ; %entry
 ; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], -16
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT:    flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
-; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced:
+; GFX12-GISEL-LABEL: flat_atomic_usub_cond_no_rtn_u32_forced:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
 ; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, -16
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, -1
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr inbounds i32, ptr %addr, i32 -4
-  %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
+  %gep = getelementptr i32, ptr %addr, i32 -4
+  %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr %use) {
-; GFX12-SDAG-LABEL: flat_atomic_cond_sub_rtn_u32:
+define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr %use) {
+; GFX12-SDAG-LABEL: flat_atomic_usub_cond_rtn_u32:
 ; GFX12-SDAG:       ; %bb.0: ; %entry
 ; GFX12-SDAG-NEXT:    s_clause 0x1
 ; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX12-SDAG-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
 ; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT:    flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    flat_atomic_cond_sub_u32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-SDAG-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
-; GFX12-GISEL-LABEL: flat_atomic_cond_sub_rtn_u32:
+; GFX12-GISEL-LABEL: flat_atomic_usub_cond_rtn_u32:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
 ; GFX12-GISEL-NEXT:    s_clause 0x1
 ; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX12-GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 16
+; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    flat_atomic_cond_sub_u32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
 ; GFX12-GISEL-NEXT:    flat_store_b32 v[0:1], v2
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
-  %gep = getelementptr inbounds i32, ptr %addr, i32 4
-  %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
+  %gep = getelementptr i32, ptr %addr, i32 4
+  %val = atomicrmw usub_cond ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr %use
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) {
-; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32:
+define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_no_rtn_u32:
 ; GFX12-SDAG:       ; %bb.0: ; %entry
 ; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], -16
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT:    flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
-; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32:
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_no_rtn_u32:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
 ; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, -16
-; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, -1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX12-GISEL-NEXT:    flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4
-  %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
+  %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
-; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced:
+define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_no_rtn_u32_forced:
 ; GFX12-SDAG:       ; %bb.0: ; %entry
 ; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], -16
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT:    flat_atomic_cond_sub_u32 v[0:1], v2
+; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
-; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced:
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_no_rtn_u32_forced:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
 ; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, -16
-; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, -1
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX12-GISEL-NEXT:    flat_atomic_cond_sub_u32 v[0:1], v2
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4
-  %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
+  %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) {
-; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32:
+define amdgpu_kernel void @global_atomic_usub_cond_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_rtn_u32:
 ; GFX12-SDAG:       ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT:    s_clause 0x1
 ; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX12-SDAG-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 16
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT:    flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT:    global_atomic_cond_sub_u32 v1, v0, v1, s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX12-SDAG-NEXT:    s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT:    global_store_b32 v1, v0, s[4:5]
+; GFX12-SDAG-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
-; GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32:
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_rtn_u32:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
 ; GFX12-GISEL-NEXT:    s_clause 0x1
 ; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX12-GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT:    s_add_co_u32 s0, s0, 16
-; GFX12-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX12-GISEL-NEXT:    flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT:    global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT:    s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT:    global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT:    global_inv scope:SCOPE_SYS
 ; GFX12-GISEL-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(1) %addr, i32 4
-  %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
+  %val = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(1) %use
   ret void
 }
 
-define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) {
-; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32:
+define amdgpu_kernel void @ds_usub_cond_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) {
+; GFX12-SDAG-LABEL: ds_usub_cond_no_rtn_u32:
 ; GFX12-SDAG:       ; %bb.0: ; %entry
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -193,9 +225,11 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %i
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
 ; GFX12-SDAG-NEXT:    ds_cond_sub_u32 v0, v1
+; GFX12-SDAG-NEXT:    s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
-; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32:
+; GFX12-GISEL-LABEL: ds_usub_cond_no_rtn_u32:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
@@ -203,15 +237,17 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %i
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
 ; GFX12-GISEL-NEXT:    ds_cond_sub_u32 v0, v1
+; GFX12-GISEL-NEXT:    s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4
-  %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
+  %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
-; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced:
+define amdgpu_kernel void @ds_usub_cond_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
+; GFX12-SDAG-LABEL: ds_usub_cond_no_rtn_u32_forced:
 ; GFX12-SDAG:       ; %bb.0: ; %entry
 ; GFX12-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -219,9 +255,11 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr,
 ; GFX12-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
 ; GFX12-SDAG-NEXT:    ds_cond_sub_u32 v0, v1
+; GFX12-SDAG-NEXT:    s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
-; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced:
+; GFX12-GISEL-LABEL: ds_usub_cond_no_rtn_u32_forced:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
 ; GFX12-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
@@ -229,38 +267,44 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr,
 ; GFX12-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
 ; GFX12-GISEL-NEXT:    ds_cond_sub_u32 v0, v1
+; GFX12-GISEL-NEXT:    s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT:    global_inv scope:SCOPE_SE
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4
-  %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
+  %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   ret void
 }
 
-define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) {
-; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32:
+define amdgpu_kernel void @ds_usub_cond_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) {
+; GFX12-SDAG-LABEL: ds_usub_cond_rtn_u32:
 ; GFX12-SDAG:       ; %bb.0: ; %entry
 ; GFX12-SDAG-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX12-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-SDAG-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX12-SDAG-NEXT:    ds_cond_sub_rtn_u32 v0, v0, v1 offset:16
-; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-SDAG-NEXT:    s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-SDAG-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-SDAG-NEXT:    ds_store_b32 v1, v0
 ; GFX12-SDAG-NEXT:    s_endpgm
 ;
-; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32:
+; GFX12-GISEL-LABEL: ds_usub_cond_rtn_u32:
 ; GFX12-GISEL:       ; %bb.0: ; %entry
 ; GFX12-GISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX12-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-GISEL-NEXT:    v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
 ; GFX12-GISEL-NEXT:    ds_cond_sub_rtn_u32 v0, v1, v0 offset:16
-; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-GISEL-NEXT:    s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX12-GISEL-NEXT:    ds_store_b32 v1, v0
 ; GFX12-GISEL-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i32, ptr addrspace(3) %addr, i32 4
-  %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
+  %val = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
   store i32 %val, ptr addrspace(3) %use
   ret void
 }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
deleted file mode 100644
index 243cd59c6a821..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
+++ /dev/null
@@ -1,219 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
-
-define float @raw_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_return:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, s16
-; GFX12-NEXT:    buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
-main_body:
-  %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
-  %r = bitcast i32 %orig to float
-  ret float %r
-}
-
-define void @raw_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_no_return:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, s16
-; GFX12-NEXT:    buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
-main_body:
-  %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
-  ret void
-}
-
-define void @raw_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_no_return_forced:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, s16
-; GFX12-NEXT:    buffer_atomic_cond_sub_u32 v0, off, s[0:3], null
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
-main_body:
-  %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
-  ret void
-}
-
-define float @raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_return:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, s16
-; GFX12-NEXT:    s_mov_b32 s4, 4
-; GFX12-NEXT:    buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
-main_body:
-  %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0)
-  %r = bitcast i32 %orig to float
-  ret float %r
-}
-
-define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_no_return:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, s16
-; GFX12-NEXT:    s_mov_b32 s4, 4
-; GFX12-NEXT:    buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
-main_body:
-  %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0)
-  ret void
-}
-
-define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_no_return_forced:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_mov_b32_e32 v0, s16
-; GFX12-NEXT:    s_mov_b32 s4, 4
-; GFX12-NEXT:    buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
-main_body:
-  %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0)
-  ret void
-}
-
-define float @struct_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_return:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s16
-; GFX12-NEXT:    buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
-main_body:
-  %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
-  %r = bitcast i32 %orig to float
-  ret float %r
-}
-
-define void @struct_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_no_return:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16
-; GFX12-NEXT:    buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
-main_body:
-  %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
-  ret void
-}
-
-define void @struct_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_no_return_forced:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16
-; GFX12-NEXT:    buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
-main_body:
-  %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
-  ret void
-}
-
-define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_return:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s16
-; GFX12-NEXT:    s_mov_b32 s4, 4
-; GFX12-NEXT:    buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
-main_body:
-  %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0)
-  %r = bitcast i32 %orig to float
-  ret float %r
-}
-
-define void @struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_no_return:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16
-; GFX12-NEXT:    s_mov_b32 s4, 4
-; GFX12-NEXT:    buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
-main_body:
-  %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0)
-  ret void
-}
-
-define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_no_return_forced:
-; GFX12:       ; %bb.0: ; %main_body
-; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT:    s_wait_expcnt 0x0
-; GFX12-NEXT:    s_wait_samplecnt 0x0
-; GFX12-NEXT:    s_wait_bvhcnt 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16
-; GFX12-NEXT:    s_mov_b32 s4, 4
-; GFX12-NEXT:    buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen
-; GFX12-NEXT:    s_setpc_b64 s[30:31]
-main_body:
-  %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0)
-  ret void
-}
-
-declare i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32) #0
-declare i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32, i32) #0
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind "target-features"="+atomic-csub-no-rtn-insts" }
-

From 7062cd63d0c258ac499c82dc233259aecac29b06 Mon Sep 17 00:00:00 2001
From: Farzon Lotfi <farzonlotfi@microsoft.com>
Date: Tue, 9 Dec 2025 18:30:25 -0500
Subject: [PATCH 70/85] [Matrix] Add a row\col major toggle in the clang driver
 (#167628)

fixes #167621

- define the new options in `Options.td` limit the naming to row-major
or column-major.
- In `ToolChains/Clang.cpp` limit the opt usage to only when
`-fenable-matrix` is used.

---------

Co-authored-by: Florian Hahn <flo@fhahn.com>
---
 clang/docs/LanguageExtensions.rst             |  6 +++
 clang/docs/MatrixTypes.rst                    |  4 ++
 clang/docs/ReleaseNotes.rst                   |  1 +
 .../clang/Basic/DiagnosticDriverKinds.td      |  3 ++
 .../clang/Basic/DiagnosticSemaKinds.td        |  3 ++
 clang/include/clang/Basic/LangOptions.def     |  1 +
 clang/include/clang/Basic/LangOptions.h       |  7 +++
 clang/include/clang/Options/Options.td        |  6 +++
 clang/lib/Driver/ToolChains/Clang.cpp         | 12 +++++
 clang/lib/Frontend/CompilerInvocation.cpp     | 30 ++++++++++++
 clang/lib/Sema/SemaChecking.cpp               | 19 +++++++
 clang/test/Driver/fmatrix-memory-layout.c     | 49 +++++++++++++++++++
 .../Sema/matrix-col-major-builtin-disable.c   | 13 +++++
 .../Frontend/CompilerInvocationTest.cpp       | 36 ++++++++++++++
 14 files changed, 190 insertions(+)
 create mode 100644 clang/test/Driver/fmatrix-memory-layout.c
 create mode 100644 clang/test/Sema/matrix-col-major-builtin-disable.c

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 87d38e7d99e50..c4b86b203d383 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -1073,6 +1073,12 @@ The matrix type extension supports explicit casts. Implicit type conversion betw
     i = static_cast<matrix_5_5<int>>(d);
   }
 
+The matrix type extension supports column and row major memory layouts, but not
+all builtins are supported with row-major layout. The layout defaults to column
+major and can be specified using `-fmatrix-memory-layout`. To enable column 
+major layout, use `-fmatrix-memory-layout=column-major`, and for row major
+layout use `-fmatrix-memory-layout=row-major`
+
 Half-Precision Floating Point
 =============================
 
diff --git a/clang/docs/MatrixTypes.rst b/clang/docs/MatrixTypes.rst
index b3a2c8cf53670..8701baa2d0f1c 100644
--- a/clang/docs/MatrixTypes.rst
+++ b/clang/docs/MatrixTypes.rst
@@ -287,6 +287,10 @@ part of the draft specification.
 The elements of a  value of a matrix type are laid out in column-major order
 without padding.
 
+To change memory layout to row major use the `-fmatrix-memory-layout` flag.
+This flag supports two flag argument values either `column-major` or
+`row-major` used like so `-fmatrix-memory-layout=column-major`.` 
+
 We propose to provide a Clang option to override this behavior and allow
 contraction of those operations (e.g. *-ffp-contract=matrix*).
 
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 1cd465e25947a..690d2a389ef24 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -331,6 +331,7 @@ New Compiler Flags
 - New option ``-fsanitize-debug-trap-reasons=`` added to control emitting trap reasons into the debug info when compiling with trapping UBSan (e.g. ``-fsanitize-trap=undefined``).
 - New options for enabling allocation token instrumentation: ``-fsanitize=alloc-token``, ``-falloc-token-max=``, ``-fsanitize-alloc-token-fast-abi``, ``-fsanitize-alloc-token-extended``.
 - The ``-resource-dir`` option is now displayed in the list of options shown by ``--help``.
+- New option ``-fmatrix-memory-layout`` added to control the memory layout of Clang matrix types. (e.g. ``-fmatrix-memory-layout=column-major`` or ``-fmatrix-memory-layout=row-major``).
 
 Lanai Support
 ^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index aeffe96e806bd..3e6f7a2a430ff 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -792,6 +792,9 @@ def err_cc1_round_trip_mismatch : Error<
 def err_cc1_unbounded_vscale_min : Error<
   "minimum vscale must be an unsigned integer greater than 0">;
 
+def err_conflicting_matrix_layout_flags: Error<
+  "-fmatrix-memory-layout=%0 conflicts with -mllvm -matrix-default-layout=%1">;
+
 def err_drv_using_omit_rtti_component_without_no_rtti : Error<
   "-fexperimental-omit-vtable-rtti call only be used with -fno-rtti">;
 
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index fec278c21a89e..28803829f387d 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13041,6 +13041,9 @@ def err_builtin_trivially_relocate_invalid_arg_type: Error <
 
 def err_builtin_matrix_disabled: Error<
   "matrix types extension is disabled. Pass -fenable-matrix to enable it">;
+def err_builtin_matrix_major_order_disabled: Error<
+  "matrix %select{row|column}0 major %select{load|store}1 is not supported with "
+  "-fmatrix-memory-layout=%select{column|row}0-major. Pass -fmatrix-memory-layout=%select{row|column}0-major to enable it">;
 def err_matrix_index_not_integer: Error<
   "matrix %select{row|column}0 index is not an integer">;
 def err_matrix_index_outside_range: Error<
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 40fc66ea12e34..e515c0cee79eb 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -434,6 +434,7 @@ ENUM_LANGOPT(RegisterStaticDestructors, RegisterStaticDestructorsKind, 2,
 LANGOPT(RegCall4, 1, 0, NotCompatible, "Set __regcall4 as a default calling convention to respect __regcall ABI v.4")
 
 LANGOPT(MatrixTypes, 1, 0, NotCompatible, "Enable or disable the builtin matrix type")
+ENUM_LANGOPT(DefaultMatrixMemoryLayout, MatrixMemoryLayout, 1, MatrixMemoryLayout::MatrixColMajor, NotCompatible, "Defines the default memory Layout for matrices")
 VALUE_LANGOPT(MaxMatrixDimension, 32, (1 << 20) - 1, NotCompatible, "maximum allowed matrix dimension")
 
 LANGOPT(CXXAssumptions, 1, 1, NotCompatible, "Enable or disable codegen and compile-time checks for C++23's [[assume]] attribute")
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index 61ee0275283fc..859ed57996983 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -251,6 +251,13 @@ class LangOptionsBase {
     FEM_UnsetOnCommandLine = 3
   };
 
+  enum class MatrixMemoryLayout : unsigned {
+    // Use column-major layout for matrices
+    MatrixColMajor = 0,
+    // Use row-major layout for matrices
+    MatrixRowMajor = 1,
+  };
+
   enum ExcessPrecisionKind { FPP_Standard, FPP_Fast, FPP_None };
 
   enum class LaxVectorConversionKind {
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index cac122d296624..e55146f0c7823 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -4692,6 +4692,12 @@ def fenable_matrix : Flag<["-"], "fenable-matrix">, Group<f_Group>,
     HelpText<"Enable matrix data type and related builtin functions">,
     MarshallingInfoFlag<LangOpts<"MatrixTypes">, hlsl.KeyPath>;
 
+def fmatrix_memory_layout_EQ : Joined<["-"], "fmatrix-memory-layout=">,
+    Visibility<[ClangOption, CC1Option]>,
+    HelpText<"Sets the matrix memory layout (row-major or column-major)">,
+    Values<"row-major,column-major">,
+    Group<f_Group>;
+
 defm raw_string_literals : BoolFOption<"raw-string-literals",
     LangOpts<"RawStringLiterals">, Default<std#".hasRawStringLiterals()">,
     PosFlag<SetTrue, [], [], "Enable">,
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 7187d1a158e01..542b70b3e9d4c 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -5703,6 +5703,18 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back("-fenable-matrix");
     CmdArgs.push_back("-mllvm");
     CmdArgs.push_back("-enable-matrix");
+    // Only handle default layout if matrix is enabled
+    if (const Arg *A = Args.getLastArg(options::OPT_fmatrix_memory_layout_EQ)) {
+      StringRef Val = A->getValue();
+      if (Val == "row-major" || Val == "column-major") {
+        CmdArgs.push_back(Args.MakeArgString("-fmatrix-memory-layout=" + Val));
+        CmdArgs.push_back("-mllvm");
+        CmdArgs.push_back(Args.MakeArgString("-matrix-default-layout=" + Val));
+
+      } else {
+        D.Diag(diag::err_drv_invalid_value) << A->getAsString(Args) << Val;
+      }
+    }
   }
 
   CodeGenOptions::FramePointerKind FPKeepKind =
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index b42c263abd2c7..477406f2526c0 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -3951,6 +3951,15 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts,
     StringRef S = llvm::getAllocTokenModeAsString(*Opts.AllocTokenMode);
     GenerateArg(Consumer, OPT_falloc_token_mode_EQ, S);
   }
+  // Generate args for matrix types.
+  if (Opts.MatrixTypes) {
+    if (Opts.getDefaultMatrixMemoryLayout() ==
+        LangOptions::MatrixMemoryLayout::MatrixColMajor)
+      GenerateArg(Consumer, OPT_fmatrix_memory_layout_EQ, "column-major");
+    if (Opts.getDefaultMatrixMemoryLayout() ==
+        LangOptions::MatrixMemoryLayout::MatrixRowMajor)
+      GenerateArg(Consumer, OPT_fmatrix_memory_layout_EQ, "row-major");
+  }
 }
 
 bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
@@ -4545,6 +4554,27 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
       Diags.Report(diag::err_drv_invalid_value) << Arg->getAsString(Args) << S;
   }
 
+  // Enable options for matrix types.
+  if (Opts.MatrixTypes) {
+    if (const Arg *A = Args.getLastArg(OPT_fmatrix_memory_layout_EQ)) {
+      StringRef ClangValue = A->getValue();
+      if (ClangValue == "row-major")
+        Opts.setDefaultMatrixMemoryLayout(
+            LangOptions::MatrixMemoryLayout::MatrixRowMajor);
+      else
+        Opts.setDefaultMatrixMemoryLayout(
+            LangOptions::MatrixMemoryLayout::MatrixColMajor);
+
+      for (Arg *A : Args.filtered(options::OPT_mllvm)) {
+        StringRef OptValue = A->getValue();
+        if (OptValue.consume_front("-matrix-default-layout=") &&
+            ClangValue != OptValue)
+          Diags.Report(diag::err_conflicting_matrix_layout_flags)
+              << ClangValue << OptValue;
+      }
+    }
+  }
+
   // Validate options for HLSL
   if (Opts.HLSL) {
     // TODO: Revisit restricting SPIR-V to logical once we've figured out how to
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 02c838bc4a862..67482f1d56da1 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -16511,6 +16511,13 @@ ExprResult Sema::BuiltinMatrixColumnMajorLoad(CallExpr *TheCall,
     return ExprError();
   }
 
+  if (getLangOpts().getDefaultMatrixMemoryLayout() !=
+      LangOptions::MatrixMemoryLayout::MatrixColMajor) {
+    Diag(TheCall->getBeginLoc(), diag::err_builtin_matrix_major_order_disabled)
+        << /*column*/ 1 << /*load*/ 0;
+    return ExprError();
+  }
+
   if (checkArgCount(TheCall, 4))
     return ExprError();
 
@@ -16623,6 +16630,18 @@ ExprResult Sema::BuiltinMatrixColumnMajorLoad(CallExpr *TheCall,
 
 ExprResult Sema::BuiltinMatrixColumnMajorStore(CallExpr *TheCall,
                                                ExprResult CallResult) {
+  if (!getLangOpts().MatrixTypes) {
+    Diag(TheCall->getBeginLoc(), diag::err_builtin_matrix_disabled);
+    return ExprError();
+  }
+
+  if (getLangOpts().getDefaultMatrixMemoryLayout() !=
+      LangOptions::MatrixMemoryLayout::MatrixColMajor) {
+    Diag(TheCall->getBeginLoc(), diag::err_builtin_matrix_major_order_disabled)
+        << /*column*/ 1 << /*store*/ 1;
+    return ExprError();
+  }
+
   if (checkArgCount(TheCall, 3))
     return ExprError();
 
diff --git a/clang/test/Driver/fmatrix-memory-layout.c b/clang/test/Driver/fmatrix-memory-layout.c
new file mode 100644
index 0000000000000..f05cd8f26c004
--- /dev/null
+++ b/clang/test/Driver/fmatrix-memory-layout.c
@@ -0,0 +1,49 @@
+// RUN: %clang --target=x86_64-linux-gnu -fenable-matrix -fmatrix-memory-layout=column-major %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-COL-MAJOR
+// CHECK-COL-MAJOR:  -fenable-matrix
+// CHECK-COL-MAJOR:  -mllvm
+// CHECK-COL-MAJOR:  -enable-matrix
+// CHECK-COL-MAJOR:  -fmatrix-memory-layout=column-major
+// CHECK-COL-MAJOR:  -mllvm
+// CHECK-COL-MAJOR:  -matrix-default-layout=column-major
+
+// RUN: %clang --target=x86_64-linux-gnu -fenable-matrix -fmatrix-memory-layout=row-major %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ROW-MAJOR
+// CHECK-ROW-MAJOR:  -fenable-matrix
+// CHECK-ROW-MAJOR:  -mllvm
+// CHECK-ROW-MAJOR:  -enable-matrix
+// CHECK-ROW-MAJOR:  -fmatrix-memory-layout=row-major
+// CHECK-ROW-MAJOR:  -mllvm
+// CHECK-ROW-MAJOR:  -matrix-default-layout=row-major
+
+// RUN: not %clang --target=x86_64-linux-gnu -fenable-matrix -fmatrix-memory-layout=error-major %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR-MAJOR
+// CHECK-ERROR-MAJOR: error: invalid value 'error-major' in '-fmatrix-memory-layout=error-major'
+
+// RUN: %clang --target=x86_64-linux-gnu -fmatrix-memory-layout=column-major %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-COL-MAJOR-DISABLED
+// CHECK-COL-MAJOR-DISABLED: clang: warning: argument unused during compilation: '-fmatrix-memory-layout=column-major'
+// CHECK-COL-MAJOR-DISABLED-NOT:  -fenable-matrix
+// CHECK-COL-MAJOR-DISABLED-NOT:  -mllvm
+// CHECK-COL-MAJOR-DISABLED-NOT:  -enable-matrix
+// CHECK-COL-MAJOR-DISABLED-NOT:  -fmatrix-memory-layout=column-major
+// CHECK-COL-MAJOR-DISABLED-NOT:  -mllvm
+// CHECK-COL-MAJOR-DISABLED-NOT:  -matrix-default-layout=column-major
+
+// RUN: %clang --target=x86_64-linux-gnu -fmatrix-memory-layout=row-major %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ROW-MAJOR-DISABLED
+// CHECK-ROW-MAJOR-DISABLED: clang: warning: argument unused during compilation: '-fmatrix-memory-layout=row-major'
+// CHECK-ROW-MAJOR-DISABLED-NOT:  -fenable-matrix
+// CHECK-ROW-MAJOR-DISABLED-NOT:  -mllvm
+// CHECK-ROW-MAJOR-DISABLED-NOT:  -enable-matrix
+// CHECK-ROW-MAJOR-DISABLED-NOT:  -fmatrix-memory-layout=row-major
+// CHECK-ROW-MAJOR-DISABLED-NOT:  -mllvm
+// CHECK-ROW-MAJOR-DISABLED-NOT:  -matrix-default-layout=row-major
+
+// RUN: %clang --target=x86_64-linux-gnu -fenable-matrix  %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MATRIX-ENABLED
+// CHECK-MATRIX-ENABLED:  -fenable-matrix
+// CHECK-MATRIX-ENABLED:  -mllvm
+// CHECK-MATRIX-ENABLED:  -enable-matrix
+// CHECK-MATRIX-ENABLED-NOT:  -fmatrix-memory-layout=row-major
+// CHECK-MATRIX-ENABLED-NOT:  -fmatrix-memory-layout=column-major
+// CHECK-MATRIX-ENABLED-NOT:  -mllvm
+// CHECK-MATRIX-ENABLED-NOT:  -matrix-default-layout=row-major
+// CHECK-MATRIX-ENABLED-NOT:  -matrix-default-layout=column-major
+
+// RUN: not %clang --target=x86_64-linux-gnu -fenable-matrix -fmatrix-memory-layout=column-major -mllvm -matrix-default-layout=row-major %s -fsyntax-only 2>&1 | FileCheck %s --check-prefix=CHECK-MISMATCH-MAJOR
+// CHECK-MISMATCH-MAJOR: error: -fmatrix-memory-layout=column-major conflicts with -mllvm -matrix-default-layout=row-major
diff --git a/clang/test/Sema/matrix-col-major-builtin-disable.c b/clang/test/Sema/matrix-col-major-builtin-disable.c
new file mode 100644
index 0000000000000..b86d4ebe54f93
--- /dev/null
+++ b/clang/test/Sema/matrix-col-major-builtin-disable.c
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 %s -fenable-matrix -fmatrix-memory-layout=row-major -pedantic -verify -triple=x86_64-apple-darwin9
+
+typedef float sx5x10_t __attribute__((matrix_type(5, 10)));
+
+void column_major_load(float *p1) {
+  sx5x10_t a1 = __builtin_matrix_column_major_load(p1, 5, 11, 5);
+  // expected-error@-1 {{matrix column major load is not supported with -fmatrix-memory-layout=row-major. Pass -fmatrix-memory-layout=column-major to enable it}}
+}
+
+void column_major_store(sx5x10_t *m1, float *p1) {
+  __builtin_matrix_column_major_store(*m1, p1, 1);
+  // expected-error@-1 {{matrix column major store is not supported with -fmatrix-memory-layout=row-major. Pass -fmatrix-memory-layout=column-major to enable it}}
+}
diff --git a/clang/unittests/Frontend/CompilerInvocationTest.cpp b/clang/unittests/Frontend/CompilerInvocationTest.cpp
index 1332422688fe6..64d0f4c38f11c 100644
--- a/clang/unittests/Frontend/CompilerInvocationTest.cpp
+++ b/clang/unittests/Frontend/CompilerInvocationTest.cpp
@@ -738,6 +738,18 @@ TEST_F(CommandLineTest, ConditionalParsingIfHLSLFlagPresent) {
   CompilerInvocation::CreateFromArgs(Invocation, Args, *Diags);
 
   ASSERT_EQ(Invocation.getLangOpts().MaxMatrixDimension, 4u);
+  ASSERT_EQ(Invocation.getLangOpts().getDefaultMatrixMemoryLayout(),
+            LangOptions::MatrixMemoryLayout::MatrixColMajor);
+
+  Invocation.generateCC1CommandLine(GeneratedArgs, *this);
+}
+
+TEST_F(CommandLineTest, ConditionalParsingHLSLRowMajor) {
+  const char *Args[] = {"-xhlsl", "-fmatrix-memory-layout=row-major"};
+
+  CompilerInvocation::CreateFromArgs(Invocation, Args, *Diags);
+  ASSERT_EQ(Invocation.getLangOpts().getDefaultMatrixMemoryLayout(),
+            LangOptions::MatrixMemoryLayout::MatrixRowMajor);
 
   Invocation.generateCC1CommandLine(GeneratedArgs, *this);
 }
@@ -748,6 +760,30 @@ TEST_F(CommandLineTest, ConditionalParsingIfHLSLFlagNotPresent) {
   CompilerInvocation::CreateFromArgs(Invocation, Args, *Diags);
 
   ASSERT_EQ(Invocation.getLangOpts().MaxMatrixDimension, 1048575u);
+  ASSERT_EQ(Invocation.getLangOpts().getDefaultMatrixMemoryLayout(),
+            LangOptions::MatrixMemoryLayout::MatrixColMajor);
+
+  Invocation.generateCC1CommandLine(GeneratedArgs, *this);
+}
+
+TEST_F(CommandLineTest, ConditionalParsingClangRowMajor) {
+  const char *Args[] = {"-fenable-matrix", "-fmatrix-memory-layout=row-major"};
+
+  CompilerInvocation::CreateFromArgs(Invocation, Args, *Diags);
+
+  ASSERT_EQ(Invocation.getLangOpts().getDefaultMatrixMemoryLayout(),
+            LangOptions::MatrixMemoryLayout::MatrixRowMajor);
+
+  Invocation.generateCC1CommandLine(GeneratedArgs, *this);
+}
+
+TEST_F(CommandLineTest, ConditionalParsingIgnoreRowMajorIfMatrixNotEnabled) {
+  const char *Args[] = {"-fmatrix-memory-layout=row-major"};
+
+  CompilerInvocation::CreateFromArgs(Invocation, Args, *Diags);
+
+  ASSERT_EQ(Invocation.getLangOpts().getDefaultMatrixMemoryLayout(),
+            LangOptions::MatrixMemoryLayout::MatrixColMajor);
 
   Invocation.generateCC1CommandLine(GeneratedArgs, *this);
 }

From a9bcedf49f813977a6ba5e770cf0041bfc2fa6b1 Mon Sep 17 00:00:00 2001
From: Alex Voicu <alexandru.voicu@amd.com>
Date: Tue, 9 Dec 2025 23:35:23 +0000
Subject: [PATCH 71/85] [NFC][SPIRV] Fix breakage introduced by #170798
 (#171513)

Adding support for i128 missed a few quirks of legalisation, which were
masked previously by early erroring out on bitwidth > 64. i128 uses
should be legal, we decide whether or not the resulting module is viable
(i.e. if the required extensions are present) in the ModuleAnalysis
pass.
---
 llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 17 +++++++++--------
 llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp  |  4 ++--
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index 2078bfee2e767..30703ee40be06 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -115,18 +115,19 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
                            v4s1, v4s8, v4s16, v4s32, v4s64};
 
   auto allScalarsAndVectors = {
-      s1,   s8,   s16,   s32,   s64,   v2s1,  v2s8,  v2s16,  v2s32,  v2s64,
-      v3s1, v3s8, v3s16, v3s32, v3s64, v4s1,  v4s8,  v4s16,  v4s32,  v4s64,
-      v8s1, v8s8, v8s16, v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64};
+      s1,    s8,    s16,   s32,   s64,    s128,   v2s1,  v2s8,
+      v2s16, v2s32, v2s64, v3s1,  v3s8,   v3s16,  v3s32, v3s64,
+      v4s1,  v4s8,  v4s16, v4s32, v4s64,  v8s1,   v8s8,  v8s16,
+      v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64};
 
-  auto allIntScalarsAndVectors = {s8,    s16,   s32,   s64,    v2s8,   v2s16,
-                                  v2s32, v2s64, v3s8,  v3s16,  v3s32,  v3s64,
-                                  v4s8,  v4s16, v4s32, v4s64,  v8s8,   v8s16,
-                                  v8s32, v8s64, v16s8, v16s16, v16s32, v16s64};
+  auto allIntScalarsAndVectors = {
+      s8,    s16,   s32,   s64,   s128,   v2s8,   v2s16, v2s32, v2s64,
+      v3s8,  v3s16, v3s32, v3s64, v4s8,   v4s16,  v4s32, v4s64, v8s8,
+      v8s16, v8s32, v8s64, v16s8, v16s16, v16s32, v16s64};
 
   auto allBoolScalarsAndVectors = {s1, v2s1, v3s1, v4s1, v8s1, v16s1};
 
-  auto allIntScalars = {s8, s16, s32, s64};
+  auto allIntScalars = {s8, s16, s32, s64, s128};
 
   auto allFloatScalarsAndF16Vector2AndVector4s = {s16, s32, s64, v2s16, v4s16};
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index acc726717743d..9ddbeee92ffb6 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -388,11 +388,11 @@ static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR,
 
 // To support current approach and limitations wrt. bit width here we widen a
 // scalar register with a bit width greater than 1 to valid sizes and cap it to
-// 64 width.
+// 128 width.
 static unsigned widenBitWidthToNextPow2(unsigned BitWidth) {
   if (BitWidth == 1)
     return 1; // No need to widen 1-bit values
-  return std::min(std::max(1u << Log2_32_Ceil(BitWidth), 8u), 64u);
+  return std::min(std::max(1u << Log2_32_Ceil(BitWidth), 8u), 128u);
 }
 
 static void widenScalarType(Register Reg, MachineRegisterInfo &MRI) {

From 782f50792f99740f6cb371f66511f9778104f095 Mon Sep 17 00:00:00 2001
From: Derek Schuff <dschuff@chromium.org>
Date: Tue, 9 Dec 2025 15:41:06 -0800
Subject: [PATCH 72/85] [lldb][Wasm] Handle imports when parsing Wasm name
 sections (#170960)

LLDB can use the wasm name section to populate its symbol table and get
names for functions. However the index space used in the name section is
the "function index space" which includes imported as well as locally
defined functions.
---
 .../ObjectFile/wasm/ObjectFileWasm.cpp        | 84 ++++++++++++++++---
 .../Plugins/ObjectFile/wasm/ObjectFileWasm.h  |  1 +
 .../test/Shell/Symtab/Inputs/simple.wasm.yaml | 68 +++++++++------
 lldb/test/Shell/Symtab/symtab-wasm.test       | 10 ++-
 4 files changed, 125 insertions(+), 38 deletions(-)

diff --git a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp
index 0bedb5e753b77..4ab4cac68c5f8 100644
--- a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp
+++ b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp
@@ -307,6 +307,41 @@ struct WasmFunction {
   uint32_t size = 0;
 };
 
+static llvm::Expected<uint32_t> ParseImports(DataExtractor &import_data) {
+  // Currently this function just returns the number of imported functions.
+  // If we want to do anything with global names in the future, we'll also
+  // need to know those.
+  llvm::DataExtractor data = import_data.GetAsLLVM();
+  llvm::DataExtractor::Cursor c(0);
+
+  llvm::Expected<uint32_t> count = GetULEB32(data, c);
+  if (!count)
+    return count.takeError();
+
+  uint32_t function_imports = 0;
+  for (uint32_t i = 0; c && i < *count; ++i) {
+    // We don't need module and field names, so we can just get them as raw
+    // strings and discard.
+    if (!GetWasmString(data, c))
+      return llvm::createStringError("failed to parse module name");
+    if (!GetWasmString(data, c))
+      return llvm::createStringError("failed to parse field name");
+
+    uint8_t kind = data.getU8(c);
+    if (kind == llvm::wasm::WASM_EXTERNAL_FUNCTION)
+      function_imports++;
+
+    // For function imports, this is a type index. For others it's different.
+    // We don't need it, just need to parse it to advance the cursor.
+    data.getULEB128(c);
+  }
+
+  if (!c)
+    return c.takeError();
+
+  return function_imports;
+}
+
 static llvm::Expected<std::vector<WasmFunction>>
 ParseFunctions(DataExtractor &data) {
   lldb::offset_t offset = 0;
@@ -410,7 +445,8 @@ static llvm::Expected<std::vector<WasmSegment>> ParseData(DataExtractor &data) {
 static llvm::Expected<std::vector<Symbol>>
 ParseNames(SectionSP code_section_sp, DataExtractor &name_data,
            const std::vector<WasmFunction> &functions,
-           std::vector<WasmSegment> &segments) {
+           std::vector<WasmSegment> &segments,
+           uint32_t num_imported_functions) {
 
   llvm::DataExtractor data = name_data.GetAsLLVM();
   llvm::DataExtractor::Cursor c(0);
@@ -434,15 +470,29 @@ ParseNames(SectionSP code_section_sp, DataExtractor &name_data,
         llvm::Expected<std::string> name = GetWasmString(data, c);
         if (!name)
           return name.takeError();
-        if (*idx >= functions.size())
+        if (*idx >= num_imported_functions + functions.size())
           continue;
-        symbols.emplace_back(
-            symbols.size(), *name, lldb::eSymbolTypeCode,
-            /*external=*/false, /*is_debug=*/false, /*is_trampoline=*/false,
-            /*is_artificial=*/false, code_section_sp,
-            functions[i].section_offset, functions[i].size,
-            /*size_is_valid=*/true, /*contains_linker_annotations=*/false,
-            /*flags=*/0);
+
+        if (*idx < num_imported_functions) {
+          symbols.emplace_back(symbols.size(), *name, lldb::eSymbolTypeCode,
+                               /*external=*/true, /*is_debug=*/false,
+                               /*is_trampoline=*/false,
+                               /*is_artificial=*/false,
+                               /*section_sp=*/lldb::SectionSP(),
+                               /*value=*/0, /*size=*/0,
+                               /*size_is_valid=*/false,
+                               /*contains_linker_annotations=*/false,
+                               /*flags=*/0);
+        } else {
+          const WasmFunction &func = functions[*idx - num_imported_functions];
+          symbols.emplace_back(symbols.size(), *name, lldb::eSymbolTypeCode,
+                               /*external=*/false, /*is_debug=*/false,
+                               /*is_trampoline=*/false, /*is_artificial=*/false,
+                               code_section_sp, func.section_offset, func.size,
+                               /*size_is_valid=*/true,
+                               /*contains_linker_annotations=*/false,
+                               /*flags=*/0);
+        }
       }
     } break;
     case llvm::wasm::WASM_NAMES_DATA_SEGMENT: {
@@ -590,6 +640,20 @@ void ObjectFileWasm::CreateSections(SectionList &unified_section_list) {
     }
   }
 
+  // Parse the import section. The number of functions is needed because the
+  // function index space used in the name section includes imports.
+  if (std::optional<section_info> info =
+          GetSectionInfo(llvm::wasm::WASM_SEC_IMPORT)) {
+    DataExtractor import_data = ReadImageData(info->offset, info->size);
+    llvm::Expected<uint32_t> num_imports = ParseImports(import_data);
+    if (!num_imports) {
+      LLDB_LOG_ERROR(log, num_imports.takeError(),
+                     "Failed to parse Wasm import section: {0}");
+    } else {
+      m_num_imported_functions = *num_imports;
+    }
+  }
+
   // Parse the data section.
   std::optional<section_info> data_info =
       GetSectionInfo(llvm::wasm::WASM_SEC_DATA);
@@ -609,7 +673,7 @@ void ObjectFileWasm::CreateSections(SectionList &unified_section_list) {
     DataExtractor names_data = ReadImageData(info->offset, info->size);
     llvm::Expected<std::vector<Symbol>> symbols = ParseNames(
         m_sections_up->FindSectionByType(lldb::eSectionTypeCode, false),
-        names_data, functions, segments);
+        names_data, functions, segments, m_num_imported_functions);
     if (!symbols) {
       LLDB_LOG_ERROR(log, symbols.takeError(),
                      "Failed to parse Wasm names: {0}");
diff --git a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.h b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.h
index 86ecbf26803cf..17fe23c1131d2 100644
--- a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.h
+++ b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.h
@@ -146,6 +146,7 @@ class ObjectFileWasm : public ObjectFile {
   /// \}
 
   std::vector<section_info> m_sect_infos;
+  uint32_t m_num_imported_functions = 0;
   std::vector<Symbol> m_symbols;
   ArchSpec m_arch;
   UUID m_uuid;
diff --git a/lldb/test/Shell/Symtab/Inputs/simple.wasm.yaml b/lldb/test/Shell/Symtab/Inputs/simple.wasm.yaml
index 088d6163d6b0b..8f9742c4edff8 100644
--- a/lldb/test/Shell/Symtab/Inputs/simple.wasm.yaml
+++ b/lldb/test/Shell/Symtab/Inputs/simple.wasm.yaml
@@ -10,6 +10,7 @@
 #   int j = 2;
 #   return add(i, j);
 # }
+# Additional imports have been manually added and indexes adjusted after compiling
 --- !WASM
 FileHeader:
   Version:         0x1
@@ -29,6 +30,21 @@ Sections:
         ParamTypes:      []
         ReturnTypes:
           - I32
+  - Type:            IMPORT
+    Imports:
+      - Module:          env
+        Field:           foo
+        Kind:            FUNCTION
+        SigIndex:        0
+      - Module:          env
+        Field:           another_import
+        Kind:            FUNCTION
+        SigIndex:        0
+      - Module:          env
+        Field:           bar
+        Kind:            GLOBAL
+        GlobalType:      I32
+        GlobalMutable:   true
   - Type:            FUNCTION
     FunctionTypes:   [ 0, 1, 2, 1 ]
   - Type:            TABLE
@@ -44,73 +60,73 @@ Sections:
       - Minimum:         0x2
   - Type:            GLOBAL
     Globals:
-      - Index:           0
+      - Index:           1
         Type:            I32
         Mutable:         true
         InitExpr:
           Opcode:          I32_CONST
           Value:           66576
-      - Index:           1
+      - Index:           2
         Type:            I32
         Mutable:         false
         InitExpr:
           Opcode:          I32_CONST
           Value:           1036
-      - Index:           2
+      - Index:           3
         Type:            I32
         Mutable:         false
         InitExpr:
           Opcode:          I32_CONST
           Value:           1024
-      - Index:           3
+      - Index:           4
         Type:            I32
         Mutable:         false
         InitExpr:
           Opcode:          I32_CONST
           Value:           1040
-      - Index:           4
+      - Index:           5
         Type:            I32
         Mutable:         false
         InitExpr:
           Opcode:          I32_CONST
           Value:           1040
-      - Index:           5
+      - Index:           6
         Type:            I32
         Mutable:         false
         InitExpr:
           Opcode:          I32_CONST
           Value:           66576
-      - Index:           6
+      - Index:           7
         Type:            I32
         Mutable:         false
         InitExpr:
           Opcode:          I32_CONST
           Value:           1024
-      - Index:           7
+      - Index:           8
         Type:            I32
         Mutable:         false
         InitExpr:
           Opcode:          I32_CONST
           Value:           66576
-      - Index:           8
+      - Index:           9
         Type:            I32
         Mutable:         false
         InitExpr:
           Opcode:          I32_CONST
           Value:           131072
-      - Index:           9
+      - Index:           10
         Type:            I32
         Mutable:         false
         InitExpr:
           Opcode:          I32_CONST
           Value:           0
-      - Index:           10
+      - Index:           11
         Type:            I32
         Mutable:         false
         InitExpr:
           Opcode:          I32_CONST
           Value:           1
-      - Index:           11
+      - Index:           12
         Type:            I32
         Mutable:         false
         InitExpr:
@@ -123,16 +139,16 @@ Sections:
         Index:           0
       - Name:            __wasm_call_ctors
         Kind:            FUNCTION
-        Index:           0
+        Index:           2
       - Name:            add
         Kind:            FUNCTION
-        Index:           1
+        Index:           3
       - Name:            __original_main
         Kind:            FUNCTION
-        Index:           2
+        Index:           4
       - Name:            main
         Kind:            FUNCTION
-        Index:           3
+        Index:           5
       - Name:            str
         Kind:            GLOBAL
         Index:           1
@@ -174,20 +190,20 @@ Sections:
         Index:           11
   - Type:            CODE
     Functions:
-      - Index:           0
+      - Index:           2
         Locals:          []
         Body:            0B
-      - Index:           1
+      - Index:           3
         Locals:
           - Type:            I32
             Count:           1
         Body:            23808080800041106B21022002200036020C20022001360208200228020C20022802086A0F0B
-      - Index:           2
+      - Index:           4
         Locals:
           - Type:            I32
             Count:           2
         Body:            23808080800041106B210020002480808080002000410036020C2000410136020820004102360204200028020820002802041081808080002101200041106A24808080800020010F0B
-      - Index:           3
+      - Index:           5
         Locals:          []
         Body:            1082808080000F0B
   - Type:            DATA
@@ -208,15 +224,19 @@ Sections:
     Name:            name
     FunctionNames:
       - Index:           0
-        Name:            __wasm_call_ctors
+        Name:            foo
       - Index:           1
-        Name:            add
+        Name:            another_import
       - Index:           2
-        Name:            __original_main
+        Name:            __wasm_call_ctors
       - Index:           3
+        Name:            add
+      - Index:           4
+        Name:            __original_main
+      - Index:           5
         Name:            main
     GlobalNames:
-      - Index:           0
+      - Index:           1
         Name:            __stack_pointer
     DataSegmentNames:
       - Index:           0
diff --git a/lldb/test/Shell/Symtab/symtab-wasm.test b/lldb/test/Shell/Symtab/symtab-wasm.test
index 524691b897322..e8628dbd04fb9 100644
--- a/lldb/test/Shell/Symtab/symtab-wasm.test
+++ b/lldb/test/Shell/Symtab/symtab-wasm.test
@@ -1,16 +1,18 @@
 # RUN: yaml2obj %S/Inputs/simple.wasm.yaml -o %t.wasm
 
 # RUN: %lldb %t.wasm -o 'image dump symtab' | FileCheck %s --check-prefix SYMTAB
+SYMTAB: Code 0x0000000000000000 0x0000000000000000 0x00000000 foo
+SYMTAB: Code 0x0000000000000000 0x0000000000000000 0x00000000 another_import
 SYMTAB: Code 0x0000000000000002 0x0000000000000002 0x00000000 __wasm_call_ctors
 SYMTAB: Code 0x0000000000000005 0x0000000000000029 0x00000000 add
 SYMTAB: Code 0x000000000000002f 0x000000000000004c 0x00000000 __original_main
 SYMTAB: Code 0x000000000000007c 0x0000000000000009 0x00000000 main
 
 # RUN: %lldb %t.wasm -o 'image dump sections' | FileCheck %s --check-prefix SECTIONS
-SECTIONS: 0x0000000000000001 code                   [0x0000000000000000-0x0000000000000085)  ---  0x000001a1 0x00000085 0x00000000 symtab-wasm.test.tmp.wasm.code
-SECTIONS: 0x0000000000000040 wasm-name                                                       ---  0x00000251 0x00000059 0x00000000 symtab-wasm.test.tmp.wasm.name
-SECTIONS: 0x0000000000000100 data                   [0x0000000000000400-0x0000000000000409)  ---  0x00000233 0x00000009 0x00000000 symtab-wasm.test.tmp.wasm..rodata
-SECTIONS: 0x0000000000000200 data                   [0x000000000000040c-0x0000000000000410)  ---  0x00000242 0x00000004 0x00000000 symtab-wasm.test.tmp.wasm..data
+SECTIONS: 0x0000000000000001 code                   [0x0000000000000000-0x0000000000000085)  ---  0x000001d2 0x00000085 0x00000000 symtab-wasm.test.tmp.wasm.code
+SECTIONS: 0x0000000000000040 wasm-name                                                       ---  0x00000282 0x0000006e 0x00000000 symtab-wasm.test.tmp.wasm.name
+SECTIONS: 0x0000000000000100 data                   [0x0000000000000400-0x0000000000000409)  ---  0x00000264 0x00000009 0x00000000 symtab-wasm.test.tmp.wasm..rodata
+SECTIONS: 0x0000000000000200 data                   [0x000000000000040c-0x0000000000000410)  ---  0x00000273 0x00000004 0x00000000 symtab-wasm.test.tmp.wasm..data
 
 # RUN: %lldb %t.wasm -o 'x/s 0x0000000000000400' | FileCheck %s --check-prefix STR
 STR: "data str"

From dda715df2d455e8fe0fcb8c2acef961933de1b7c Mon Sep 17 00:00:00 2001
From: Maksim Panchenko <maks@fb.com>
Date: Tue, 9 Dec 2025 15:46:43 -0800
Subject: [PATCH 73/85] [BOLT][DWARF] Improve reporting on missing DWOs
 (#171506)

List all required missing DWO files and report a summary with
recommendations on how to proceed.
---
 bolt/lib/Core/BinaryContext.cpp | 25 ++++++++++++++++++++-----
 bolt/test/dwarf5-missing-dwo.c  | 18 ++++++++++++++++++
 2 files changed, 38 insertions(+), 5 deletions(-)
 create mode 100644 bolt/test/dwarf5-missing-dwo.c

diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index 51bc867a839cf..b06eee2f415a7 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -1888,6 +1888,9 @@ void BinaryContext::preprocessDebugInfo() {
 
   preprocessDWODebugInfo();
 
+  // Check if required DWO files are missing.
+  uint64_t NumMissingDWOs = 0;
+
   // Populate MCContext with DWARF files from all units.
   StringRef GlobalPrefix = AsmInfo->getPrivateGlobalPrefix();
   for (const std::unique_ptr<DWARFUnit> &CU : DwCtx->compile_units()) {
@@ -1909,19 +1912,23 @@ void BinaryContext::preprocessDebugInfo() {
       std::optional<MD5::MD5Result> Checksum;
       if (LineTable->Prologue.ContentTypes.HasMD5)
         Checksum = LineTable->Prologue.FileNames[0].Checksum;
-      std::optional<const char *> Name =
+      const char *Name =
           dwarf::toString(CU->getUnitDIE().find(dwarf::DW_AT_name), nullptr);
       if (std::optional<uint64_t> DWOID = CU->getDWOId()) {
         auto Iter = DWOCUs.find(*DWOID);
         if (Iter == DWOCUs.end()) {
-          this->errs() << "BOLT-ERROR: DWO CU was not found for " << Name
-                       << '\n';
-          exit(1);
+          const char *DWOName =
+              dwarf::toString(CU->getUnitDIE().find(dwarf::DW_AT_dwo_name),
+                              "<missing DW_AT_dwo_name>");
+          this->errs() << "BOLT-ERROR: unable to load " << DWOName
+                       << " for DWO_id 0x" << Twine::utohexstr(*DWOID) << '\n';
+          NumMissingDWOs++;
+          continue;
         }
         Name = dwarf::toString(
             Iter->second->getUnitDIE().find(dwarf::DW_AT_name), nullptr);
       }
-      BinaryLineTable.setRootFile(CU->getCompilationDir(), *Name, Checksum,
+      BinaryLineTable.setRootFile(CU->getCompilationDir(), Name, Checksum,
                                   std::nullopt);
     }
 
@@ -1956,6 +1963,14 @@ void BinaryContext::preprocessDebugInfo() {
                             DwarfVersion));
     }
   }
+
+  if (NumMissingDWOs) {
+    this->errs() << "BOLT-ERROR: " << NumMissingDWOs
+                 << " required DWO file(s) not found. Unable to update debug"
+                    " info. Use --comp-dir-override to locate the file(s) or"
+                    " --update-debug-sections=0 to remove debug info\n";
+    exit(1);
+  }
 }
 
 bool BinaryContext::shouldEmit(const BinaryFunction &Function) const {
diff --git a/bolt/test/dwarf5-missing-dwo.c b/bolt/test/dwarf5-missing-dwo.c
new file mode 100644
index 0000000000000..25de1a53aa5ac
--- /dev/null
+++ b/bolt/test/dwarf5-missing-dwo.c
@@ -0,0 +1,18 @@
+// Check that llvm-bolt correctly reports a missing DWO file while updating
+// debug info.
+//
+// RUN: %clang %cflags -g -dwarf5 -gsplit-dwarf=single -c %s -o %t.o
+// RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
+// RUN: rm %t.o
+// RUN: not llvm-bolt %t.exe -o %t.bolt --update-debug-sections \
+// RUN:   2>&1 | FileCheck %s -DDWO=%t.o
+//
+// Check that llvm-bolt succeeds on the same binary with disabled debug info
+// update.
+//
+// RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections=0
+
+// CHECK:      BOLT-ERROR: unable to load [[DWO]]
+// CHECK-NEXT: BOLT-ERROR: 1 required DWO file(s) not found
+
+int main() { return 0; }

From 29760ce5d9b9eb54f85992c67408074e55f7ea6d Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Tue, 9 Dec 2025 16:02:40 -0800
Subject: [PATCH 74/85] [lldb] Fix capitalization in ambiguous command error
 (#171519)

We follow LLVM's style guide for diagnostics, which instructs to start
the first sentence with a lowercase letter, and finish the last sentence
without a period, if it would end in one otherwise.
---
 lldb/source/Interpreter/CommandInterpreter.cpp              | 6 ++----
 .../API/functionalities/abbreviation/TestAbbreviations.py   | 2 +-
 .../ambigous_commands/TestAmbiguousCommands.py              | 2 +-
 .../API/functionalities/wrong_commands/TestWrongCommands.py | 2 +-
 4 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp
index cb6acfc9c29a9..afc1753e21c46 100644
--- a/lldb/source/Interpreter/CommandInterpreter.cpp
+++ b/lldb/source/Interpreter/CommandInterpreter.cpp
@@ -3702,12 +3702,10 @@ CommandInterpreter::ResolveCommandImpl(std::string &command_line,
           done = static_cast<bool>(cmd_obj);
         } else {
           StreamString error_msg;
-          error_msg.Printf("Ambiguous command '%s'. Possible matches:\n",
+          error_msg.Printf("ambiguous command '%s'. Possible matches:\n",
                            next_word.c_str());
-
-          for (uint32_t i = 0; i < num_matches; ++i) {
+          for (uint32_t i = 0; i < num_matches; ++i)
             error_msg.Printf("\t%s\n", matches.GetStringAtIndex(i));
-          }
           result.AppendError(error_msg.GetString());
         }
       } else {
diff --git a/lldb/test/API/functionalities/abbreviation/TestAbbreviations.py b/lldb/test/API/functionalities/abbreviation/TestAbbreviations.py
index 5dd4f6bee56a3..e5bbb9ee2eb9d 100644
--- a/lldb/test/API/functionalities/abbreviation/TestAbbreviations.py
+++ b/lldb/test/API/functionalities/abbreviation/TestAbbreviations.py
@@ -41,7 +41,7 @@ def test_command_abbreviations_and_aliases(self):
         # "pl" could be "platform" or "plugin".
         command_interpreter.ResolveCommand("pl", result)
         self.assertFalse(result.Succeeded())
-        self.assertTrue(result.GetError().startswith("error: Ambiguous command"))
+        self.assertTrue(result.GetError().startswith("error: ambiguous command"))
 
         # Make sure an unabbreviated command is not mangled.
         command_interpreter.ResolveCommand(
diff --git a/lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py b/lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py
index 31b67d72ce469..f114820d16347 100644
--- a/lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py
+++ b/lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py
@@ -24,7 +24,7 @@ def test_ambiguous_command_with_alias(self):
         self.assertFalse(result.Succeeded())
         self.assertEqual(
             result.GetError(),
-            "error: Ambiguous command 'co'. Possible matches:\n\tcommand\n\tcontinue\n\tcorefile\n",
+            "error: ambiguous command 'co'. Possible matches:\n\tcommand\n\tcontinue\n\tcorefile\n",
         )
 
         command_interpreter.HandleCommand("command unalias continue", result)
diff --git a/lldb/test/API/functionalities/wrong_commands/TestWrongCommands.py b/lldb/test/API/functionalities/wrong_commands/TestWrongCommands.py
index 25f95f3061eb4..941a909aa93aa 100644
--- a/lldb/test/API/functionalities/wrong_commands/TestWrongCommands.py
+++ b/lldb/test/API/functionalities/wrong_commands/TestWrongCommands.py
@@ -18,7 +18,7 @@ def test_ambiguous_command(self):
         command_interpreter.HandleCommand("g", result)
         self.assertFalse(result.Succeeded())
         self.assertRegex(
-            result.GetError(), "error: Ambiguous command 'g'. Possible matches:"
+            result.GetError(), "error: ambiguous command 'g'. Possible matches:"
         )
         self.assertRegex(result.GetError(), "gui")
         self.assertRegex(result.GetError(), "gdb-remote")

From 21147e7c95c03f554d4a7fb9b55b8e459357eb49 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <razvan.lupusoru@gmail.com>
Date: Tue, 9 Dec 2025 16:03:53 -0800
Subject: [PATCH 75/85] [mlir][acc] Add loop tiling utilities for OpenACC
 (#171490)

Add utilities in OpenACCUtilsTiling.h/.cpp to support tiling
transformations on acc.loop operations:

- uncollapseLoops: Expand collapsed loops with multiple IVs into nested
loop structures when tile count exceeds collapse count
- tileACCLoops: Transform loop nests into tile and element loops based
on provided tile sizes, with automatic resolution of unknown tile sizes
(tile(*) represented as -1)

These utilities prepare for the ACCLoopTiling pass which handles the
OpenACC loop tile directive.

---------

Co-authored-by: Vijay Kandiah <vkandiah@nvidia.com>
---
 .../mlir/Dialect/OpenACC/OpenACCUtilsTiling.h |  83 +++++
 mlir/lib/Dialect/OpenACC/Utils/CMakeLists.txt |   3 +
 .../OpenACC/Utils/OpenACCUtilsTiling.cpp      | 311 ++++++++++++++++
 mlir/unittests/Dialect/OpenACC/CMakeLists.txt |   1 +
 .../OpenACC/OpenACCUtilsTilingTest.cpp        | 348 ++++++++++++++++++
 5 files changed, 746 insertions(+)
 create mode 100644 mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsTiling.h
 create mode 100644 mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsTiling.cpp
 create mode 100644 mlir/unittests/Dialect/OpenACC/OpenACCUtilsTilingTest.cpp

diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsTiling.h b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsTiling.h
new file mode 100644
index 0000000000000..6fcb706aa3488
--- /dev/null
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsTiling.h
@@ -0,0 +1,83 @@
+//===- OpenACCUtilsTiling.h - OpenACC Loop Tiling Utilities -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains utility functions for tiling OpenACC loops.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_OPENACC_OPENACCUTILSTILING_H_
+#define MLIR_DIALECT_OPENACC_OPENACCUTILSTILING_H_
+
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/PatternMatch.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace mlir {
+namespace acc {
+
+/// Uncollapse tile loops with multiple IVs and collapseCount < tileCount.
+/// This is used to prepare loops for tiling when the collapse count is less
+/// than the tile count.
+///
+/// \param origLoop The original loop operation to uncollapse.
+/// \param tileCount The number of tile dimensions.
+/// \param collapseCount The collapse count from the original loop.
+/// \param rewriter The rewriter to use for modifications.
+/// \return A vector of uncollapsed loop operations.
+llvm::SmallVector<mlir::acc::LoopOp>
+uncollapseLoops(mlir::acc::LoopOp origLoop, unsigned tileCount,
+                unsigned collapseCount, mlir::RewriterBase &rewriter);
+
+/// Tile ACC loops according to the given tile sizes.
+///
+/// Tiling a 2-level nested loop will create two 'tile' loops containing two
+/// 'element' loops. The transformation looks like:
+///
+/// Before Tiling:
+/// \code
+/// #pragma acc loop tile(tile_size1, tile_size2)
+///  for (i = lb1; i < ub1; i += step1) { // original loop
+///    for (j = lb2; j < ub2; j += step2) {
+///      a[i,j] = i + j;
+///    }
+///  }
+/// \endcode
+///
+/// After Tiling:
+/// \code
+///  for (i = lb1; i < ub1; i += (step1 * tile_size1)) { // tile loop 1
+///    for (j = lb2; j < ub2; j += (step2 * tile_size2)) { // tile loop 2
+///      for (ii = i; ii < min(ub1, (step1 * tile_size1) + i); ii += step1) {
+///      // element loop 1
+///        for (jj = j; jj < min(ub2, (step2 * tile_size2) + j); jj += step2)
+///        { // element loop 2
+///          a[ii,jj] = i + j;
+///        }
+///      }
+///    }
+///  }
+/// \endcode
+///
+/// Unknown tile sizes (represented as -1 in acc dialect for `tile(*)`) are
+/// resolved to the provided default tile size.
+///
+/// \param tileLoops The loops to tile (outermost first).
+/// \param tileSizes The tile sizes for each dimension. Values of -1 are
+///        treated as unknown and resolved to defaultTileSize.
+/// \param defaultTileSize The default tile size to use for unknown (*) tiles.
+/// \param rewriter The rewriter to use for modifications.
+/// \return The outermost loop after tiling.
+mlir::acc::LoopOp tileACCLoops(llvm::SmallVector<mlir::acc::LoopOp> &tileLoops,
+                               const llvm::SmallVector<mlir::Value> &tileSizes,
+                               int32_t defaultTileSize,
+                               mlir::RewriterBase &rewriter);
+
+} // namespace acc
+} // namespace mlir
+
+#endif // MLIR_DIALECT_OPENACC_OPENACCUTILSTILING_H_
diff --git a/mlir/lib/Dialect/OpenACC/Utils/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/Utils/CMakeLists.txt
index 68e124625921f..c3de4f7e3e282 100644
--- a/mlir/lib/Dialect/OpenACC/Utils/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenACC/Utils/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_mlir_dialect_library(MLIROpenACCUtils
+  OpenACCUtilsTiling.cpp
   OpenACCUtils.cpp
 
   ADDITIONAL_HEADER_DIRS
@@ -14,7 +15,9 @@ add_mlir_dialect_library(MLIROpenACCUtils
   MLIROpenACCTypeInterfacesIncGen
 
   LINK_LIBS PUBLIC
+  MLIRArithDialect
   MLIROpenACCDialect
   MLIRIR
   MLIRSupport
+  MLIRTransformUtils
 )
diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsTiling.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsTiling.cpp
new file mode 100644
index 0000000000000..bf82d247028b9
--- /dev/null
+++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsTiling.cpp
@@ -0,0 +1,311 @@
+//===- OpenACCUtilsTiling.cpp - OpenACC Loop Tiling Utilities -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains utility functions for tiling OpenACC loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenACC/OpenACCUtilsTiling.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Transforms/RegionUtils.h"
+
+// Resolve unknown tile sizes (represented as -1 for tile(*)) to the default.
+static mlir::Value resolveUnknownTileSize(mlir::Value tileSize,
+                                          int32_t defaultTileSize,
+                                          mlir::RewriterBase &rewriter,
+                                          mlir::Location loc) {
+  auto constVal = mlir::getConstantIntValue(tileSize);
+  if (constVal && *constVal < 0)
+    return mlir::arith::ConstantOp::create(
+        rewriter, loc, rewriter.getI32Type(),
+        rewriter.getI32IntegerAttr(defaultTileSize));
+  return tileSize;
+}
+
+// Remove vector/worker attributes from loop
+static void removeWorkerVectorFromLoop(mlir::acc::LoopOp loop) {
+  if (loop.hasVector() || loop.getVectorValue()) {
+    loop.removeVectorAttr();
+    loop.removeVectorOperandsDeviceTypeAttr();
+  } else if (loop.hasWorker() || loop.getWorkerValue()) {
+    loop.removeWorkerAttr();
+    loop.removeWorkerNumOperandsDeviceTypeAttr();
+  }
+}
+
+// Create a new ACC loop with new steps, lb, ub from original loop
+static mlir::acc::LoopOp
+createACCLoopFromOriginal(mlir::acc::LoopOp origLoop,
+                          mlir::RewriterBase &rewriter, mlir::ValueRange lb,
+                          mlir::ValueRange ub, mlir::ValueRange step,
+                          mlir::DenseBoolArrayAttr inclusiveUBAttr,
+                          mlir::acc::CombinedConstructsTypeAttr combinedAttr,
+                          mlir::Location loc, bool preserveCollapse) {
+  mlir::ArrayAttr collapseAttr = mlir::ArrayAttr{};
+  mlir::ArrayAttr collapseDeviceTypeAttr = mlir::ArrayAttr{};
+  if (preserveCollapse) {
+    collapseAttr = origLoop.getCollapseAttr();
+    collapseDeviceTypeAttr = origLoop.getCollapseDeviceTypeAttr();
+  }
+  auto newLoop = mlir::acc::LoopOp::create(
+      rewriter, loc, origLoop->getResultTypes(), lb, ub, step, inclusiveUBAttr,
+      collapseAttr, collapseDeviceTypeAttr, origLoop.getGangOperands(),
+      origLoop.getGangOperandsArgTypeAttr(),
+      origLoop.getGangOperandsSegmentsAttr(),
+      origLoop.getGangOperandsDeviceTypeAttr(), origLoop.getWorkerNumOperands(),
+      origLoop.getWorkerNumOperandsDeviceTypeAttr(),
+      origLoop.getVectorOperands(), origLoop.getVectorOperandsDeviceTypeAttr(),
+      origLoop.getSeqAttr(), origLoop.getIndependentAttr(),
+      origLoop.getAuto_Attr(), origLoop.getGangAttr(), origLoop.getWorkerAttr(),
+      origLoop.getVectorAttr(), mlir::ValueRange{}, mlir::DenseI32ArrayAttr{},
+      mlir::ArrayAttr{}, origLoop.getCacheOperands(),
+      origLoop.getPrivateOperands(), origLoop.getFirstprivateOperands(),
+      origLoop.getReductionOperands(), combinedAttr);
+  return newLoop;
+}
+
+// Create inner loop inside input loop
+static mlir::acc::LoopOp
+createInnerLoop(mlir::acc::LoopOp inputLoop, mlir::RewriterBase &rewriter,
+                mlir::ValueRange lb, mlir::ValueRange ub, mlir::ValueRange step,
+                mlir::DenseBoolArrayAttr inclusiveUBAttr, mlir::Location loc) {
+  mlir::acc::LoopOp elementLoop = createACCLoopFromOriginal(
+      inputLoop, rewriter, lb, ub, step, inclusiveUBAttr,
+      mlir::acc::CombinedConstructsTypeAttr{}, loc, /*preserveCollapse*/ false);
+
+  // Remove gang/worker attributes from inner loops
+  rewriter.startOpModification(elementLoop);
+  if (inputLoop.hasGang() ||
+      inputLoop.getGangValue(mlir::acc::GangArgType::Num) ||
+      inputLoop.getGangValue(mlir::acc::GangArgType::Dim) ||
+      inputLoop.getGangValue(mlir::acc::GangArgType::Static)) {
+    elementLoop.removeGangAttr();
+    elementLoop.removeGangOperandsArgTypeAttr();
+    elementLoop.removeGangOperandsSegmentsAttr();
+    elementLoop.removeGangOperandsDeviceTypeAttr();
+  }
+  if (inputLoop.hasVector() || inputLoop.getVectorValue()) {
+    elementLoop.removeWorkerAttr();
+    elementLoop.removeWorkerNumOperandsDeviceTypeAttr();
+  }
+  rewriter.finalizeOpModification(elementLoop);
+
+  // Create empty block in elementLoop and add IV argument
+  mlir::Block *blk = rewriter.createBlock(&elementLoop.getRegion(),
+                                          elementLoop.getRegion().begin());
+  rewriter.setInsertionPointToEnd(blk);
+  mlir::acc::YieldOp::create(rewriter, loc);
+  elementLoop.getBody().addArgument(
+      inputLoop.getBody().getArgument(0).getType(), loc);
+
+  return elementLoop;
+}
+
+// Move ops from source to target Loop and replace uses of IVs
+static void moveOpsAndReplaceIVs(mlir::acc::LoopOp sourceLoop,
+                                 mlir::acc::LoopOp targetLoop,
+                                 llvm::ArrayRef<mlir::Value> newIVs,
+                                 llvm::ArrayRef<mlir::Value> origIVs,
+                                 size_t nOps, mlir::RewriterBase &rewriter) {
+  // Move ops from source to target loop [begin, begin + nOps - 1)
+  mlir::Block::iterator begin = sourceLoop.getBody().begin();
+  targetLoop.getBody().getOperations().splice(
+      targetLoop.getBody().getOperations().begin(),
+      sourceLoop.getBody().getOperations(), begin, std::next(begin, nOps - 1));
+
+  // Replace uses of origIV with newIV
+  for (auto [i, newIV] : llvm::enumerate(newIVs))
+    mlir::replaceAllUsesInRegionWith(origIVs[i], newIV, targetLoop.getRegion());
+}
+
+mlir::acc::LoopOp
+mlir::acc::tileACCLoops(llvm::SmallVector<mlir::acc::LoopOp> &tileLoops,
+                        const llvm::SmallVector<mlir::Value> &tileSizes,
+                        int32_t defaultTileSize, mlir::RewriterBase &rewriter) {
+  // Tile collapsed and/or nested loops
+  mlir::acc::LoopOp outerLoop = tileLoops[0];
+  const mlir::Location loc = outerLoop.getLoc();
+
+  // Resolve unknown tile sizes (tile(*) represented as -1)
+  llvm::SmallVector<mlir::Value> resolvedTileSizes;
+  rewriter.setInsertionPoint(outerLoop);
+  for (mlir::Value tileSize : tileSizes)
+    resolvedTileSizes.push_back(
+        resolveUnknownTileSize(tileSize, defaultTileSize, rewriter, loc));
+
+  mlir::acc::LoopOp innerLoop = tileLoops[tileLoops.size() - 1];
+  llvm::SmallVector<mlir::Value, 3> origIVs;
+  llvm::SmallVector<mlir::Value, 3> origSteps;
+  llvm::SmallVector<mlir::Value, 3> origUBs;
+  llvm::SmallVector<mlir::Value, 3> newSteps;
+  llvm::SmallVector<mlir::Value, 3> newUBs;
+  llvm::SmallVector<mlir::Value, 3> newIVs;
+  size_t nOps = innerLoop.getBody().getOperations().size();
+
+  // Extract original inclusiveUBs
+  llvm::SmallVector<bool> inclusiveUBs;
+  for (auto tileLoop : tileLoops) {
+    for (auto [j, step] : llvm::enumerate(tileLoop.getStep())) {
+      // inclusiveUBs are present on the IR from Fortran frontend for DO loops
+      // but might not be present from other frontends (python)
+      // So check if it exists
+      if (tileLoop.getInclusiveUpperboundAttr())
+        inclusiveUBs.push_back(
+            tileLoop.getInclusiveUpperboundAttr().asArrayRef()[j]);
+      else
+        inclusiveUBs.push_back(false);
+    }
+  }
+
+  // Extract original ivs, UBs, steps, and calculate new steps
+  rewriter.setInsertionPoint(outerLoop);
+  for (auto [i, tileLoop] : llvm::enumerate(tileLoops)) {
+    for (auto arg : tileLoop.getBody().getArguments())
+      origIVs.push_back(arg);
+    for (auto ub : tileLoop.getUpperbound())
+      origUBs.push_back(ub);
+
+    llvm::SmallVector<mlir::Value, 3> currentLoopSteps;
+    for (auto [j, step] : llvm::enumerate(tileLoop.getStep())) {
+      origSteps.push_back(step);
+      if (i + j >= resolvedTileSizes.size()) {
+        currentLoopSteps.push_back(step);
+      } else {
+        mlir::Value tileSize = resolvedTileSizes[i + j];
+        auto newLoopStep =
+            mlir::arith::MulIOp::create(rewriter, loc, step, tileSize);
+        currentLoopSteps.push_back(newLoopStep);
+        newSteps.push_back(newLoopStep);
+      }
+    }
+
+    rewriter.startOpModification(tileLoop);
+    tileLoop.getStepMutable().clear();
+    tileLoop.getStepMutable().append(currentLoopSteps);
+    rewriter.finalizeOpModification(tileLoop);
+  }
+
+  // Calculate new upper bounds for element loops
+  for (size_t i = 0; i < newSteps.size(); i++) {
+    rewriter.setInsertionPoint(innerLoop.getBody().getTerminator());
+    // UpperBound: min(origUB, origIV+(originalStep*tile_size))
+    auto stepped =
+        mlir::arith::AddIOp::create(rewriter, loc, origIVs[i], newSteps[i]);
+    mlir::Value newUB = stepped;
+    if (inclusiveUBs[i]) {
+      // Handle InclusiveUB
+      // UpperBound: min(origUB, origIV+(originalStep*tile_size - 1))
+      auto c1 = mlir::arith::ConstantOp::create(
+          rewriter, loc, newSteps[i].getType(),
+          rewriter.getIntegerAttr(newSteps[i].getType(), 1));
+      newUB = mlir::arith::SubIOp::create(rewriter, loc, stepped, c1);
+    }
+    newUBs.push_back(
+        mlir::arith::MinSIOp::create(rewriter, loc, origUBs[i], newUB));
+  }
+
+  // Create and insert nested elementLoopOps before terminator of outer loopOp
+  mlir::acc::LoopOp currentLoop = innerLoop;
+  for (size_t i = 0; i < resolvedTileSizes.size(); i++) {
+    rewriter.setInsertionPoint(currentLoop.getBody().getTerminator());
+    mlir::DenseBoolArrayAttr inclusiveUBAttr = mlir::DenseBoolArrayAttr{};
+    if (inclusiveUBs[i])
+      inclusiveUBAttr = rewriter.getDenseBoolArrayAttr({true});
+
+    mlir::acc::LoopOp elementLoop =
+        createInnerLoop(innerLoop, rewriter, mlir::ValueRange{origIVs[i]},
+                        mlir::ValueRange{newUBs[i]},
+                        mlir::ValueRange{origSteps[i]}, inclusiveUBAttr, loc);
+
+    // Remove vector/worker attributes from inner element loops except
+    // outermost element loop
+    if (i > 0) {
+      rewriter.startOpModification(elementLoop);
+      removeWorkerVectorFromLoop(elementLoop);
+      rewriter.finalizeOpModification(elementLoop);
+    }
+    newIVs.push_back(elementLoop.getBody().getArgument(0));
+    currentLoop = elementLoop;
+  }
+
+  // Remove vector/worker attributes from outer tile loops
+  for (auto tileLoop : tileLoops) {
+    rewriter.startOpModification(tileLoop);
+    removeWorkerVectorFromLoop(tileLoop);
+    rewriter.finalizeOpModification(tileLoop);
+  }
+
+  // Move ops from inner tile loop to inner element loop and replace IV uses
+  moveOpsAndReplaceIVs(innerLoop, currentLoop, newIVs, origIVs, nOps, rewriter);
+
+  return outerLoop;
+}
+
+llvm::SmallVector<mlir::acc::LoopOp>
+mlir::acc::uncollapseLoops(mlir::acc::LoopOp origLoop, unsigned tileCount,
+                           unsigned collapseCount,
+                           mlir::RewriterBase &rewriter) {
+  llvm::SmallVector<mlir::acc::LoopOp, 3> newLoops;
+  llvm::SmallVector<mlir::Value, 3> newIVs;
+  mlir::Location loc = origLoop.getLoc();
+  llvm::SmallVector<bool> newInclusiveUBs;
+  llvm::SmallVector<mlir::Value, 3> lbs, ubs, steps;
+  for (unsigned i = 0; i < collapseCount; i++) {
+    // inclusiveUpperbound attribute might not be set, default to false
+    bool inclusiveUB = false;
+    if (origLoop.getInclusiveUpperboundAttr())
+      inclusiveUB = origLoop.getInclusiveUpperboundAttr().asArrayRef()[i];
+    newInclusiveUBs.push_back(inclusiveUB);
+    lbs.push_back(origLoop.getLowerbound()[i]);
+    ubs.push_back(origLoop.getUpperbound()[i]);
+    steps.push_back(origLoop.getStep()[i]);
+  }
+  mlir::acc::LoopOp outerLoop = createACCLoopFromOriginal(
+      origLoop, rewriter, lbs, ubs, steps,
+      rewriter.getDenseBoolArrayAttr(newInclusiveUBs),
+      origLoop.getCombinedAttr(), loc, /*preserveCollapse*/ true);
+  mlir::Block *blk = rewriter.createBlock(&outerLoop.getRegion(),
+                                          outerLoop.getRegion().begin());
+  rewriter.setInsertionPointToEnd(blk);
+  mlir::acc::YieldOp::create(rewriter, loc);
+  for (unsigned i = 0; i < collapseCount; i++) {
+    outerLoop.getBody().addArgument(origLoop.getBody().getArgument(i).getType(),
+                                    loc);
+    newIVs.push_back(outerLoop.getBody().getArgument(i));
+  }
+  newLoops.push_back(outerLoop);
+
+  mlir::acc::LoopOp currentLoopOp = outerLoop;
+  for (unsigned i = collapseCount; i < tileCount; i++) {
+    rewriter.setInsertionPoint(currentLoopOp.getBody().getTerminator());
+    bool inclusiveUB = false;
+    if (origLoop.getInclusiveUpperboundAttr())
+      inclusiveUB = origLoop.getInclusiveUpperboundAttr().asArrayRef()[i];
+    mlir::DenseBoolArrayAttr inclusiveUBAttr =
+        rewriter.getDenseBoolArrayAttr({inclusiveUB});
+    mlir::acc::LoopOp innerLoop = createInnerLoop(
+        origLoop, rewriter, mlir::ValueRange{origLoop.getLowerbound()[i]},
+        mlir::ValueRange{origLoop.getUpperbound()[i]},
+        mlir::ValueRange{origLoop.getStep()[i]}, inclusiveUBAttr, loc);
+    newIVs.push_back(innerLoop.getBody().getArgument(0));
+    newLoops.push_back(innerLoop);
+    currentLoopOp = innerLoop;
+  }
+  // Move ops from origLoop to innermost loop and replace uses of IVs
+  size_t nOps = origLoop.getBody().getOperations().size();
+  llvm::SmallVector<mlir::Value, 3> origIVs;
+  for (auto arg : origLoop.getBody().getArguments())
+    origIVs.push_back(arg);
+  moveOpsAndReplaceIVs(origLoop, currentLoopOp, newIVs, origIVs, nOps,
+                       rewriter);
+
+  return newLoops;
+}
diff --git a/mlir/unittests/Dialect/OpenACC/CMakeLists.txt b/mlir/unittests/Dialect/OpenACC/CMakeLists.txt
index c8c2bb96b0539..060c8b8d2679d 100644
--- a/mlir/unittests/Dialect/OpenACC/CMakeLists.txt
+++ b/mlir/unittests/Dialect/OpenACC/CMakeLists.txt
@@ -2,6 +2,7 @@ add_mlir_unittest(MLIROpenACCTests
   OpenACCOpsTest.cpp
   OpenACCOpsInterfacesTest.cpp
   OpenACCUtilsTest.cpp
+  OpenACCUtilsTilingTest.cpp
 )
 mlir_target_link_libraries(MLIROpenACCTests
   PRIVATE
diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTilingTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTilingTest.cpp
new file mode 100644
index 0000000000000..95bc1eab7d3fe
--- /dev/null
+++ b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTilingTest.cpp
@@ -0,0 +1,348 @@
+//===- OpenACCUtilsTilingTest.cpp - Unit tests for loop tiling utilities --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenACC/OpenACCUtilsTiling.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/OwningOpRef.h"
+#include "gtest/gtest.h"
+
+using namespace mlir;
+using namespace mlir::acc;
+
+//===----------------------------------------------------------------------===//
+// Test Fixture
+//===----------------------------------------------------------------------===//
+
+class OpenACCUtilsTilingTest : public ::testing::Test {
+protected:
+  OpenACCUtilsTilingTest() : b(&context), loc(UnknownLoc::get(&context)) {
+    context.loadDialect<acc::OpenACCDialect, arith::ArithDialect,
+                        memref::MemRefDialect, func::FuncDialect>();
+  }
+
+  // Create a simple LoopOp with specified bounds using the simple builder
+  acc::LoopOp createLoopOp(OpBuilder &builder, ValueRange lbs, ValueRange ubs,
+                           ValueRange steps) {
+    auto loopOp = acc::LoopOp::create(builder, loc, lbs, ubs, steps,
+                                      acc::LoopParMode::loop_independent);
+
+    // Add body block with IV arguments and yield
+    Region &region = loopOp.getRegion();
+    Block *block = builder.createBlock(&region, region.begin());
+    for (Value lb : lbs)
+      block->addArgument(lb.getType(), loc);
+    builder.setInsertionPointToEnd(block);
+    acc::YieldOp::create(builder, loc);
+
+    return loopOp;
+  }
+
+  // Helper to count nested acc.loop ops within a loop
+  unsigned countNestedLoops(acc::LoopOp loop) {
+    unsigned count = 0;
+    loop.getBody().walk([&](acc::LoopOp) { ++count; });
+    return count;
+  }
+
+  // Helper to collect all nested acc.loop ops in order
+  SmallVector<acc::LoopOp> collectNestedLoops(acc::LoopOp loop) {
+    SmallVector<acc::LoopOp> loops;
+    loop.getBody().walk(
+        [&](acc::LoopOp nestedLoop) { loops.push_back(nestedLoop); });
+    return loops;
+  }
+
+  MLIRContext context;
+  OpBuilder b;
+  Location loc;
+};
+
+//===----------------------------------------------------------------------===//
+// tileACCLoops Tests
+//===----------------------------------------------------------------------===//
+
+TEST_F(OpenACCUtilsTilingTest, tileACCLoopsSingleLoop) {
+  // Create a module to hold the function
+  OwningOpRef<ModuleOp> module = ModuleOp::create(loc);
+  Block *moduleBlock = module->getBody();
+
+  OpBuilder::InsertionGuard guard(b);
+  b.setInsertionPointToStart(moduleBlock);
+
+  // Create a function
+  auto funcType = b.getFunctionType({}, {});
+  OwningOpRef<func::FuncOp> funcOp =
+      func::FuncOp::create(b, loc, "test_func", funcType);
+  Block *funcBlock = funcOp->addEntryBlock();
+
+  b.setInsertionPointToStart(funcBlock);
+
+  // Create loop bounds
+  Value lb =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(0));
+  Value ub =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(100));
+  Value step =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(1));
+  Value tileSize =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(4));
+
+  // Create the loop
+  acc::LoopOp loopOp = createLoopOp(b, {lb}, {ub}, {step});
+
+  // Tile the loop using IRRewriter
+  IRRewriter rewriter(&context);
+  rewriter.setInsertionPoint(loopOp);
+
+  SmallVector<acc::LoopOp> loopsToTile = {loopOp};
+  SmallVector<Value> tileSizes = {tileSize};
+
+  acc::LoopOp tiledLoop =
+      tileACCLoops(loopsToTile, tileSizes, /*defaultTileSize=*/128, rewriter);
+
+  // Verify the tiled loop was created
+  EXPECT_TRUE(tiledLoop != nullptr);
+  EXPECT_FALSE(tiledLoop.getBody().empty());
+
+  // After tiling a single loop with tile(4), we should have:
+  // - 1 tile loop (the outer loop)
+  // - 1 element loop nested inside
+  // Total: 1 nested loop inside the tile loop
+  EXPECT_EQ(countNestedLoops(tiledLoop), 1u);
+
+  // The tile loop (outer) should have 1 IV
+  EXPECT_EQ(tiledLoop.getBody().getNumArguments(), 1u);
+
+  // Collect nested loops and verify
+  auto nestedLoops = collectNestedLoops(tiledLoop);
+  EXPECT_EQ(nestedLoops.size(), 1u);
+  // The element loop should have 1 IV
+  if (!nestedLoops.empty())
+    EXPECT_EQ(nestedLoops[0].getBody().getNumArguments(), 1u);
+}
+
+TEST_F(OpenACCUtilsTilingTest, tileACCLoopsNestedLoops) {
+  // Create a module to hold the function
+  OwningOpRef<ModuleOp> module = ModuleOp::create(loc);
+  Block *moduleBlock = module->getBody();
+
+  OpBuilder::InsertionGuard guard(b);
+  b.setInsertionPointToStart(moduleBlock);
+
+  // Create a function
+  auto funcType = b.getFunctionType({}, {});
+  OwningOpRef<func::FuncOp> funcOp =
+      func::FuncOp::create(b, loc, "test_func", funcType);
+  Block *funcBlock = funcOp->addEntryBlock();
+
+  b.setInsertionPointToStart(funcBlock);
+
+  // Create loop bounds for outer loop
+  Value lb1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(0));
+  Value ub1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(100));
+  Value step1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(1));
+
+  // Create loop bounds for inner loop
+  Value lb2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(0));
+  Value ub2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(50));
+  Value step2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(1));
+
+  // Tile sizes
+  Value tileSize1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(4));
+  Value tileSize2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(8));
+
+  // Create outer loop
+  acc::LoopOp outerLoop = createLoopOp(b, {lb1}, {ub1}, {step1});
+
+  // Create inner loop inside outer loop
+  b.setInsertionPoint(outerLoop.getBody().getTerminator());
+  acc::LoopOp innerLoop = createLoopOp(b, {lb2}, {ub2}, {step2});
+
+  // Tile the loops
+  IRRewriter rewriter(&context);
+  rewriter.setInsertionPoint(outerLoop);
+
+  SmallVector<acc::LoopOp> loopsToTile = {outerLoop, innerLoop};
+  SmallVector<Value> tileSizes = {tileSize1, tileSize2};
+
+  acc::LoopOp tiledLoop =
+      tileACCLoops(loopsToTile, tileSizes, /*defaultTileSize=*/128, rewriter);
+
+  // Verify the tiled loop nest was created
+  EXPECT_TRUE(tiledLoop != nullptr);
+  EXPECT_FALSE(tiledLoop.getBody().empty());
+
+  // After tiling a 2-level nested loop with tile(4,8), we should have:
+  // tile_loop_1 -> tile_loop_2 -> element_loop_1 -> element_loop_2
+  // Total: 3 nested loops inside the outermost tile loop
+  unsigned nestedCount = countNestedLoops(tiledLoop);
+  EXPECT_EQ(nestedCount, 3u);
+
+  // The outermost tile loop should have 1 IV
+  EXPECT_EQ(tiledLoop.getBody().getNumArguments(), 1u);
+
+  // Collect all nested loops and verify each has 1 IV
+  auto nestedLoops = collectNestedLoops(tiledLoop);
+  EXPECT_EQ(nestedLoops.size(), 3u);
+  for (auto loop : nestedLoops)
+    EXPECT_EQ(loop.getBody().getNumArguments(), 1u);
+}
+
+//===----------------------------------------------------------------------===//
+// uncollapseLoops Tests
+//===----------------------------------------------------------------------===//
+
+TEST_F(OpenACCUtilsTilingTest, uncollapseLoopsBasic) {
+  // Create a module to hold the function
+  OwningOpRef<ModuleOp> module = ModuleOp::create(loc);
+  Block *moduleBlock = module->getBody();
+
+  OpBuilder::InsertionGuard guard(b);
+  b.setInsertionPointToStart(moduleBlock);
+
+  // Create a function
+  auto funcType = b.getFunctionType({}, {});
+  OwningOpRef<func::FuncOp> funcOp =
+      func::FuncOp::create(b, loc, "test_func", funcType);
+  Block *funcBlock = funcOp->addEntryBlock();
+
+  b.setInsertionPointToStart(funcBlock);
+
+  // Create loop bounds for a collapsed 2-level loop
+  Value lb1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(0));
+  Value ub1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(10));
+  Value step1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(1));
+  Value lb2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(0));
+  Value ub2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(20));
+  Value step2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(1));
+
+  // Create a collapsed loop with 2 IVs
+  acc::LoopOp collapsedLoop =
+      createLoopOp(b, {lb1, lb2}, {ub1, ub2}, {step1, step2});
+
+  // Set the collapse attribute
+  collapsedLoop.setCollapseForDeviceTypes(&context, {acc::DeviceType::None},
+                                          llvm::APInt(64, 1));
+
+  // Uncollapse the loop: tileCount=2, collapseCount=1
+  IRRewriter rewriter(&context);
+  rewriter.setInsertionPoint(collapsedLoop);
+
+  SmallVector<acc::LoopOp> uncollapsedLoops = uncollapseLoops(
+      collapsedLoop, /*tileCount=*/2, /*collapseCount=*/1, rewriter);
+
+  // Should produce 2 loops (one outer with collapse=1, one inner)
+  EXPECT_EQ(uncollapsedLoops.size(), 2u);
+
+  if (uncollapsedLoops.size() >= 2) {
+    // Verify the outer loop has 1 IV (collapseCount=1)
+    acc::LoopOp outerLoop = uncollapsedLoops[0];
+    EXPECT_EQ(outerLoop.getBody().getNumArguments(), 1u);
+    EXPECT_EQ(outerLoop.getLowerbound().size(), 1u);
+    EXPECT_EQ(outerLoop.getUpperbound().size(), 1u);
+    EXPECT_EQ(outerLoop.getStep().size(), 1u);
+
+    // Verify the inner loop has 1 IV
+    acc::LoopOp innerLoop = uncollapsedLoops[1];
+    EXPECT_EQ(innerLoop.getBody().getNumArguments(), 1u);
+    EXPECT_EQ(innerLoop.getLowerbound().size(), 1u);
+    EXPECT_EQ(innerLoop.getUpperbound().size(), 1u);
+    EXPECT_EQ(innerLoop.getStep().size(), 1u);
+
+    // Verify nesting: inner loop should be inside outer loop
+    unsigned nestedCount = countNestedLoops(outerLoop);
+    EXPECT_EQ(nestedCount, 1u);
+  }
+}
+
+TEST_F(OpenACCUtilsTilingTest, uncollapseLoopsThreeLevels) {
+  // Test uncollapsing with 3 levels: collapse(2) with tile(3)
+  OwningOpRef<ModuleOp> module = ModuleOp::create(loc);
+  Block *moduleBlock = module->getBody();
+
+  OpBuilder::InsertionGuard guard(b);
+  b.setInsertionPointToStart(moduleBlock);
+
+  auto funcType = b.getFunctionType({}, {});
+  OwningOpRef<func::FuncOp> funcOp =
+      func::FuncOp::create(b, loc, "test_func", funcType);
+  Block *funcBlock = funcOp->addEntryBlock();
+
+  b.setInsertionPointToStart(funcBlock);
+
+  // Create 3 sets of bounds
+  Value lb1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(0));
+  Value ub1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(10));
+  Value step1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(1));
+  Value lb2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(0));
+  Value ub2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(20));
+  Value step2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(1));
+  Value lb3 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(0));
+  Value ub3 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(30));
+  Value step3 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(1));
+
+  // Create a collapsed loop with 3 IVs
+  acc::LoopOp collapsedLoop =
+      createLoopOp(b, {lb1, lb2, lb3}, {ub1, ub2, ub3}, {step1, step2, step3});
+
+  // Set collapse(2)
+  collapsedLoop.setCollapseForDeviceTypes(&context, {acc::DeviceType::None},
+                                          llvm::APInt(64, 2));
+
+  // Uncollapse: tileCount=3, collapseCount=2
+  // This should create: outer loop with 2 IVs, then 1 inner loop
+  IRRewriter rewriter(&context);
+  rewriter.setInsertionPoint(collapsedLoop);
+
+  SmallVector<acc::LoopOp> uncollapsedLoops = uncollapseLoops(
+      collapsedLoop, /*tileCount=*/3, /*collapseCount=*/2, rewriter);
+
+  // Should produce 2 loops
+  EXPECT_EQ(uncollapsedLoops.size(), 2u);
+
+  if (uncollapsedLoops.size() >= 2) {
+    // Outer loop should have 2 IVs (from collapse=2)
+    acc::LoopOp outerLoop = uncollapsedLoops[0];
+    EXPECT_EQ(outerLoop.getBody().getNumArguments(), 2u);
+    EXPECT_EQ(outerLoop.getLowerbound().size(), 2u);
+
+    // Inner loop should have 1 IV (the 3rd dimension)
+    acc::LoopOp innerLoop = uncollapsedLoops[1];
+    EXPECT_EQ(innerLoop.getBody().getNumArguments(), 1u);
+    EXPECT_EQ(innerLoop.getLowerbound().size(), 1u);
+  }
+}

From 687986eb50c10f5dab4b11aeb65bd1c03a102986 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Tue, 9 Dec 2025 16:15:58 -0800
Subject: [PATCH 76/85] [Docs] Add documentation for LLVM_ENABLE_CURL (#170928)

---
 llvm/docs/CMake.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst
index 7e95545425f2d..de8be0ad2dded 100644
--- a/llvm/docs/CMake.rst
+++ b/llvm/docs/CMake.rst
@@ -478,6 +478,12 @@ its enabled sub-projects. Nearly all of these variable names begin with
 **LLVM_ENABLE_BINDINGS**:BOOL
   If disabled, do not try to build the OCaml bindings.
 
+**LLVM_ENABLE_CURL**:
+  Used to decide if LLVM tools, should support downloading information
+  (particularly debug info from ``llvm-debuginfod``) over HTTP. Allowed
+  values are ``OFF`` (default), ``ON``, and ``FORCE_ON`` (error if libcurl
+  is not found).
+
 **LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING**:STRING
   Enhances Debugify's ability to detect line number errors by storing extra
   information inside Instructions, removing false positives from Debugify's

From 53cd4ab41373d07776dc46b75c6759d980f0fbef Mon Sep 17 00:00:00 2001
From: Arthur Eubanks <aeubanks@google.com>
Date: Tue, 9 Dec 2025 16:22:17 -0800
Subject: [PATCH 77/85] Revert "[AArch64] Run optimizeTerminators earlier too."
 (#171505)

Reverts llvm/llvm-project#170907

Causes crashes, see
https://github.com/llvm/llvm-project/pull/170907#issuecomment-3634271414
---
 llvm/lib/CodeGen/ShrinkWrap.cpp               |  2 -
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp  | 47 ------------
 llvm/lib/Target/AArch64/AArch64InstrInfo.h    |  2 -
 .../AArch64RedundantCondBranchPass.cpp        | 46 +++++++++++-
 .../AArch64RedundantCopyElimination.cpp       |  6 +-
 .../CodeGen/AArch64/arm64-shrink-wrapping.ll  | 74 +++++++++++++------
 .../block-placement-optimize-branches.ll      | 32 +++++---
 .../AArch64/lr-reserved-for-ra-live-in.ll     |  2 +
 llvm/test/CodeGen/AArch64/pr164181.ll         | 66 +++++++++--------
 llvm/test/CodeGen/AArch64/pr166870.ll         | 21 +++---
 ...ch64_generated_funcs.ll.generated.expected | 23 ++++--
 ...64_generated_funcs.ll.nogenerated.expected | 22 ++++--
 12 files changed, 199 insertions(+), 144 deletions(-)

diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp
index 6a6d58c705487..83581052560cb 100644
--- a/llvm/lib/CodeGen/ShrinkWrap.cpp
+++ b/llvm/lib/CodeGen/ShrinkWrap.cpp
@@ -618,8 +618,6 @@ bool ShrinkWrapImpl::postShrinkWrapping(bool HasCandidate, MachineFunction &MF,
 
   DenseSet<const MachineBasicBlock *> DirtyBBs;
   for (MachineBasicBlock &MBB : MF) {
-    if (!MDT->isReachableFromEntry(&MBB))
-      continue;
     if (MBB.isEHPad()) {
       DirtyBBs.insert(&MBB);
       continue;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index f82180fc57b99..ac4c1acedf05c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -708,53 +708,6 @@ unsigned AArch64InstrInfo::insertBranch(
   return 2;
 }
 
-bool llvm::optimizeTerminators(MachineBasicBlock *MBB,
-                               const TargetInstrInfo &TII) {
-  for (MachineInstr &MI : MBB->terminators()) {
-    unsigned Opc = MI.getOpcode();
-    switch (Opc) {
-    case AArch64::CBZW:
-    case AArch64::CBZX:
-    case AArch64::TBZW:
-    case AArch64::TBZX:
-      // CBZ/TBZ with WZR/XZR -> unconditional B
-      if (MI.getOperand(0).getReg() == AArch64::WZR ||
-          MI.getOperand(0).getReg() == AArch64::XZR) {
-        DEBUG_WITH_TYPE("optimizeTerminators",
-                        dbgs() << "Removing always taken branch: " << MI);
-        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
-        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
-        for (auto *S : Succs)
-          if (S != Target)
-            MBB->removeSuccessor(S);
-        DebugLoc DL = MI.getDebugLoc();
-        while (MBB->rbegin() != &MI)
-          MBB->rbegin()->eraseFromParent();
-        MI.eraseFromParent();
-        BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
-        return true;
-      }
-      break;
-    case AArch64::CBNZW:
-    case AArch64::CBNZX:
-    case AArch64::TBNZW:
-    case AArch64::TBNZX:
-      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
-      if (MI.getOperand(0).getReg() == AArch64::WZR ||
-          MI.getOperand(0).getReg() == AArch64::XZR) {
-        DEBUG_WITH_TYPE("optimizeTerminators",
-                        dbgs() << "Removing never taken branch: " << MI);
-        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
-        MI.getParent()->removeSuccessor(Target);
-        MI.eraseFromParent();
-        return true;
-      }
-      break;
-    }
-  }
-  return false;
-}
-
 // Find the original register that VReg is copied from.
 static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
   while (Register::isVirtualRegister(VReg)) {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index d237721450644..2de2e0d73901f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -705,8 +705,6 @@ int isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &Offset,
                               unsigned *OutUnscaledOp = nullptr,
                               int64_t *EmittableOffset = nullptr);
 
-bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII);
-
 static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; }
 
 static inline bool isCondBranchOpcode(int Opc) {
diff --git a/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp b/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
index 1a5a9f0a6018b..1b990796ec9da 100644
--- a/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp
@@ -14,7 +14,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64.h"
-#include "AArch64InstrInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
@@ -46,6 +45,51 @@ INITIALIZE_PASS(AArch64RedundantCondBranch, "aarch64-redundantcondbranch",
                 "AArch64 Redundant Conditional Branch Elimination pass", false,
                 false)
 
+static bool optimizeTerminators(MachineBasicBlock *MBB,
+                                const TargetInstrInfo &TII) {
+  for (MachineInstr &MI : make_early_inc_range(MBB->terminators())) {
+    unsigned Opc = MI.getOpcode();
+    switch (Opc) {
+    case AArch64::CBZW:
+    case AArch64::CBZX:
+    case AArch64::TBZW:
+    case AArch64::TBZX:
+      // CBZ/TBZ with WZR/XZR -> unconditional B
+      if (MI.getOperand(0).getReg() == AArch64::WZR ||
+          MI.getOperand(0).getReg() == AArch64::XZR) {
+        LLVM_DEBUG(dbgs() << "Removing redundant branch: " << MI);
+        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
+        SmallVector<MachineBasicBlock *> Succs(MBB->successors());
+        for (auto *S : Succs)
+          if (S != Target)
+            MBB->removeSuccessor(S);
+        DebugLoc DL = MI.getDebugLoc();
+        while (MBB->rbegin() != &MI)
+          MBB->rbegin()->eraseFromParent();
+        MI.eraseFromParent();
+        BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target);
+        return true;
+      }
+      break;
+    case AArch64::CBNZW:
+    case AArch64::CBNZX:
+    case AArch64::TBNZW:
+    case AArch64::TBNZX:
+      // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor
+      if (MI.getOperand(0).getReg() == AArch64::WZR ||
+          MI.getOperand(0).getReg() == AArch64::XZR) {
+        LLVM_DEBUG(dbgs() << "Removing redundant branch: " << MI);
+        MachineBasicBlock *Target = TII.getBranchDestBlock(MI);
+        MI.getParent()->removeSuccessor(Target);
+        MI.eraseFromParent();
+        return true;
+      }
+      break;
+    }
+  }
+  return false;
+}
+
 bool AArch64RedundantCondBranch::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
diff --git a/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
index 9dc721eb7315d..84015e5061768 100644
--- a/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
@@ -50,7 +50,6 @@
 //        to use WZR/XZR directly in some cases.
 //===----------------------------------------------------------------------===//
 #include "AArch64.h"
-#include "AArch64InstrInfo.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/iterator_range.h"
@@ -476,7 +475,6 @@ bool AArch64RedundantCopyElimination::runOnMachineFunction(
     return false;
   TRI = MF.getSubtarget().getRegisterInfo();
   MRI = &MF.getRegInfo();
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 
   // Resize the clobbered and used register unit trackers.  We do this once per
   // function.
@@ -486,10 +484,8 @@ bool AArch64RedundantCopyElimination::runOnMachineFunction(
   OptBBUsedRegs.init(*TRI);
 
   bool Changed = false;
-  for (MachineBasicBlock &MBB : MF) {
-    Changed |= optimizeTerminators(&MBB, TII);
+  for (MachineBasicBlock &MBB : MF)
     Changed |= optimizeBlock(&MBB);
-  }
   return Changed;
 }
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
index b837a361bd287..724c8b3fc9170 100644
--- a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -735,15 +735,21 @@ define void @infiniteloop() {
 ; ENABLE-NEXT:    .cfi_offset w29, -16
 ; ENABLE-NEXT:    .cfi_offset w19, -24
 ; ENABLE-NEXT:    .cfi_offset w20, -32
+; ENABLE-NEXT:  ; %bb.1: ; %if.then
 ; ENABLE-NEXT:    sub x19, sp, #16
 ; ENABLE-NEXT:    mov sp, x19
 ; ENABLE-NEXT:    mov w20, wzr
-; ENABLE-NEXT:  LBB10_1: ; %for.body
+; ENABLE-NEXT:  LBB10_2: ; %for.body
 ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; ENABLE-NEXT:    bl _something
 ; ENABLE-NEXT:    add w20, w0, w20
 ; ENABLE-NEXT:    str w20, [x19]
-; ENABLE-NEXT:    b LBB10_1
+; ENABLE-NEXT:    b LBB10_2
+; ENABLE-NEXT:  ; %bb.3: ; %if.end
+; ENABLE-NEXT:    sub sp, x29, #16
+; ENABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; ENABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; ENABLE-NEXT:    ret
 ;
 ; DISABLE-LABEL: infiniteloop:
 ; DISABLE:       ; %bb.0: ; %entry
@@ -755,15 +761,21 @@ define void @infiniteloop() {
 ; DISABLE-NEXT:    .cfi_offset w29, -16
 ; DISABLE-NEXT:    .cfi_offset w19, -24
 ; DISABLE-NEXT:    .cfi_offset w20, -32
+; DISABLE-NEXT:  ; %bb.1: ; %if.then
 ; DISABLE-NEXT:    sub x19, sp, #16
 ; DISABLE-NEXT:    mov sp, x19
 ; DISABLE-NEXT:    mov w20, wzr
-; DISABLE-NEXT:  LBB10_1: ; %for.body
+; DISABLE-NEXT:  LBB10_2: ; %for.body
 ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; DISABLE-NEXT:    bl _something
 ; DISABLE-NEXT:    add w20, w0, w20
 ; DISABLE-NEXT:    str w20, [x19]
-; DISABLE-NEXT:    b LBB10_1
+; DISABLE-NEXT:    b LBB10_2
+; DISABLE-NEXT:  ; %bb.3: ; %if.end
+; DISABLE-NEXT:    sub sp, x29, #16
+; DISABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; DISABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; DISABLE-NEXT:    ret
 entry:
   br i1 undef, label %if.then, label %if.end
 
@@ -794,10 +806,11 @@ define void @infiniteloop2() {
 ; ENABLE-NEXT:    .cfi_offset w29, -16
 ; ENABLE-NEXT:    .cfi_offset w19, -24
 ; ENABLE-NEXT:    .cfi_offset w20, -32
+; ENABLE-NEXT:  ; %bb.1: ; %if.then
 ; ENABLE-NEXT:    sub x8, sp, #16
 ; ENABLE-NEXT:    mov sp, x8
 ; ENABLE-NEXT:    mov w9, wzr
-; ENABLE-NEXT:  LBB11_1: ; %for.body
+; ENABLE-NEXT:  LBB11_2: ; %for.body
 ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; ENABLE-NEXT:    ; InlineAsm Start
 ; ENABLE-NEXT:    mov x10, #0 ; =0x0
@@ -808,7 +821,12 @@ define void @infiniteloop2() {
 ; ENABLE-NEXT:    ; InlineAsm Start
 ; ENABLE-NEXT:    nop
 ; ENABLE-NEXT:    ; InlineAsm End
-; ENABLE-NEXT:    b LBB11_1
+; ENABLE-NEXT:    b LBB11_2
+; ENABLE-NEXT:  ; %bb.3: ; %if.end
+; ENABLE-NEXT:    sub sp, x29, #16
+; ENABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; ENABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; ENABLE-NEXT:    ret
 ;
 ; DISABLE-LABEL: infiniteloop2:
 ; DISABLE:       ; %bb.0: ; %entry
@@ -820,10 +838,11 @@ define void @infiniteloop2() {
 ; DISABLE-NEXT:    .cfi_offset w29, -16
 ; DISABLE-NEXT:    .cfi_offset w19, -24
 ; DISABLE-NEXT:    .cfi_offset w20, -32
+; DISABLE-NEXT:  ; %bb.1: ; %if.then
 ; DISABLE-NEXT:    sub x8, sp, #16
 ; DISABLE-NEXT:    mov sp, x8
 ; DISABLE-NEXT:    mov w9, wzr
-; DISABLE-NEXT:  LBB11_1: ; %for.body
+; DISABLE-NEXT:  LBB11_2: ; %for.body
 ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; DISABLE-NEXT:    ; InlineAsm Start
 ; DISABLE-NEXT:    mov x10, #0 ; =0x0
@@ -834,7 +853,12 @@ define void @infiniteloop2() {
 ; DISABLE-NEXT:    ; InlineAsm Start
 ; DISABLE-NEXT:    nop
 ; DISABLE-NEXT:    ; InlineAsm End
-; DISABLE-NEXT:    b LBB11_1
+; DISABLE-NEXT:    b LBB11_2
+; DISABLE-NEXT:  ; %bb.3: ; %if.end
+; DISABLE-NEXT:    sub sp, x29, #16
+; DISABLE-NEXT:    ldp x29, x30, [sp, #16] ; 16-byte Folded Reload
+; DISABLE-NEXT:    ldp x20, x19, [sp], #32 ; 16-byte Folded Reload
+; DISABLE-NEXT:    ret
 entry:
   br i1 undef, label %if.then, label %if.end
 
@@ -865,43 +889,49 @@ if.end:
 define void @infiniteloop3() {
 ; ENABLE-LABEL: infiniteloop3:
 ; ENABLE:       ; %bb.0: ; %entry
+; ENABLE-NEXT:  ; %bb.1: ; %loop2a.preheader
 ; ENABLE-NEXT:    mov x8, xzr
 ; ENABLE-NEXT:    mov x9, xzr
 ; ENABLE-NEXT:    mov x11, xzr
-; ENABLE-NEXT:    b LBB12_2
-; ENABLE-NEXT:  LBB12_1: ; %loop2b
-; ENABLE-NEXT:    ; in Loop: Header=BB12_2 Depth=1
+; ENABLE-NEXT:    b LBB12_3
+; ENABLE-NEXT:  LBB12_2: ; %loop2b
+; ENABLE-NEXT:    ; in Loop: Header=BB12_3 Depth=1
 ; ENABLE-NEXT:    str x10, [x11]
 ; ENABLE-NEXT:    mov x11, x10
-; ENABLE-NEXT:  LBB12_2: ; %loop1
+; ENABLE-NEXT:  LBB12_3: ; %loop1
 ; ENABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; ENABLE-NEXT:    mov x10, x9
 ; ENABLE-NEXT:    ldr x9, [x8]
-; ENABLE-NEXT:    cbnz x8, LBB12_1
-; ENABLE-NEXT:  ; %bb.3: ; in Loop: Header=BB12_2 Depth=1
+; ENABLE-NEXT:    cbnz x8, LBB12_2
+; ENABLE-NEXT:  ; %bb.4: ; in Loop: Header=BB12_3 Depth=1
 ; ENABLE-NEXT:    mov x8, x10
 ; ENABLE-NEXT:    mov x11, x10
-; ENABLE-NEXT:    b LBB12_2
+; ENABLE-NEXT:    b LBB12_3
+; ENABLE-NEXT:  ; %bb.5: ; %end
+; ENABLE-NEXT:    ret
 ;
 ; DISABLE-LABEL: infiniteloop3:
 ; DISABLE:       ; %bb.0: ; %entry
+; DISABLE-NEXT:  ; %bb.1: ; %loop2a.preheader
 ; DISABLE-NEXT:    mov x8, xzr
 ; DISABLE-NEXT:    mov x9, xzr
 ; DISABLE-NEXT:    mov x11, xzr
-; DISABLE-NEXT:    b LBB12_2
-; DISABLE-NEXT:  LBB12_1: ; %loop2b
-; DISABLE-NEXT:    ; in Loop: Header=BB12_2 Depth=1
+; DISABLE-NEXT:    b LBB12_3
+; DISABLE-NEXT:  LBB12_2: ; %loop2b
+; DISABLE-NEXT:    ; in Loop: Header=BB12_3 Depth=1
 ; DISABLE-NEXT:    str x10, [x11]
 ; DISABLE-NEXT:    mov x11, x10
-; DISABLE-NEXT:  LBB12_2: ; %loop1
+; DISABLE-NEXT:  LBB12_3: ; %loop1
 ; DISABLE-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; DISABLE-NEXT:    mov x10, x9
 ; DISABLE-NEXT:    ldr x9, [x8]
-; DISABLE-NEXT:    cbnz x8, LBB12_1
-; DISABLE-NEXT:  ; %bb.3: ; in Loop: Header=BB12_2 Depth=1
+; DISABLE-NEXT:    cbnz x8, LBB12_2
+; DISABLE-NEXT:  ; %bb.4: ; in Loop: Header=BB12_3 Depth=1
 ; DISABLE-NEXT:    mov x8, x10
 ; DISABLE-NEXT:    mov x11, x10
-; DISABLE-NEXT:    b LBB12_2
+; DISABLE-NEXT:    b LBB12_3
+; DISABLE-NEXT:  ; %bb.5: ; %end
+; DISABLE-NEXT:    ret
 entry:
   br i1 undef, label %loop2a, label %body
 
diff --git a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
index 6e6fb6f367867..7c3a567d1b336 100644
--- a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
+++ b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll
@@ -8,14 +8,20 @@
 define i8 @foo_optsize(i32 %v4) optsize {
 ; CHECK-LABEL: foo_optsize:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cbnz w0, .LBB0_2
-; CHECK-NEXT:  // %bb.1: // %b2
-; CHECK-NEXT:    mov w0, #1 // =0x1
+; CHECK-NEXT:    b .LBB0_2
+; CHECK-NEXT:  .LBB0_1:
+; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB0_2: // %b1
-; CHECK-NEXT:    cmp w0, #1
-; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    cbnz w0, .LBB0_4
+; CHECK-NEXT:  // %bb.3: // %b2
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_4: // %b1
+; CHECK-NEXT:    cmp w0, #1
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  // %bb.5: // %b3
+; CHECK-NEXT:    b .LBB0_1
 entry:
   %v2 = icmp eq i32 0, 0
   br i1 %v2, label %b1, label %b4
@@ -41,14 +47,20 @@ b4:
 define i8 @foo_optspeed(i32 %v4) {
 ; CHECK-LABEL: foo_optspeed:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    cbnz w0, .LBB1_2
-; CHECK-NEXT:  // %bb.1: // %b2
-; CHECK-NEXT:    mov w0, #1 // =0x1
+; CHECK-NEXT:    b .LBB1_2
+; CHECK-NEXT:  .LBB1_1:
+; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB1_2: // %b1
-; CHECK-NEXT:    cmp w0, #1
-; CHECK-NEXT:    mov w0, wzr
+; CHECK-NEXT:    cbnz w0, .LBB1_4
+; CHECK-NEXT:  // %bb.3: // %b2
+; CHECK-NEXT:    mov w0, #1 // =0x1
 ; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB1_4: // %b1
+; CHECK-NEXT:    cmp w0, #1
+; CHECK-NEXT:    b.ne .LBB1_1
+; CHECK-NEXT:  // %bb.5: // %b3
+; CHECK-NEXT:    b .LBB1_1
 entry:
   %v2 = icmp eq i32 0, 0
   br i1 %v2, label %b1, label %b4
diff --git a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
index 708ba621c26d8..29427146e8a43 100644
--- a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
+++ b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll
@@ -21,8 +21,10 @@ define i32 @check_lr_liveness(ptr %arg) #1 {
   ; CHECK-NEXT:   B %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1.bb:
+  ; CHECK-NEXT:   successors: %bb.3(0x2aaaaaab), %bb.2(0x55555555)
   ; CHECK-NEXT:   liveins: $w0, $lr
   ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CBNZW $wzr, %bb.3
   ; CHECK-NEXT:   B %bb.2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.bb1:
diff --git a/llvm/test/CodeGen/AArch64/pr164181.ll b/llvm/test/CodeGen/AArch64/pr164181.ll
index 72b2a77e51c06..abb090ccbcaed 100644
--- a/llvm/test/CodeGen/AArch64/pr164181.ll
+++ b/llvm/test/CodeGen/AArch64/pr164181.ll
@@ -29,11 +29,11 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var
 ; CHECK-NEXT:    str w4, [sp, #72] // 4-byte Spill
 ; CHECK-NEXT:    str w3, [sp, #112] // 4-byte Spill
 ; CHECK-NEXT:    str w5, [sp, #36] // 4-byte Spill
-; CHECK-NEXT:    tbz w5, #0, .LBB0_40
+; CHECK-NEXT:    tbz w5, #0, .LBB0_43
 ; CHECK-NEXT:  // %bb.1: // %for.body41.lr.ph
 ; CHECK-NEXT:    ldr x4, [sp, #312]
 ; CHECK-NEXT:    ldr x14, [sp, #280]
-; CHECK-NEXT:    tbz w0, #0, .LBB0_39
+; CHECK-NEXT:    tbz w0, #0, .LBB0_42
 ; CHECK-NEXT:  // %bb.2: // %for.body41.us.preheader
 ; CHECK-NEXT:    ldrb w8, [sp, #368]
 ; CHECK-NEXT:    ldrb w12, [sp, #256]
@@ -92,7 +92,7 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var
 ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
 ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
 ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
-; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
+; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
 ; CHECK-NEXT:    ldr w8, [sp, #20] // 4-byte Reload
 ; CHECK-NEXT:    mov x12, x24
 ; CHECK-NEXT:    str x24, [sp, #48] // 8-byte Spill
@@ -117,7 +117,7 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var
 ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
 ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
 ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
-; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
+; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
 ; CHECK-NEXT:    str x12, [sp, #40] // 8-byte Spill
 ; CHECK-NEXT:    cmn x24, #30
 ; CHECK-NEXT:    mov x12, #-30 // =0xffffffffffffffe2
@@ -142,7 +142,7 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var
 ; CHECK-NEXT:    // Child Loop BB0_10 Depth 4
 ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
 ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
-; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
+; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
 ; CHECK-NEXT:    ldr x8, [sp, #64] // 8-byte Reload
 ; CHECK-NEXT:    mov w14, #1152 // =0x480
 ; CHECK-NEXT:    mov w24, #1 // =0x1
@@ -176,7 +176,7 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var
 ; CHECK-NEXT:    // => This Loop Header: Depth=4
 ; CHECK-NEXT:    // Child Loop BB0_11 Depth 5
 ; CHECK-NEXT:    // Child Loop BB0_28 Depth 5
-; CHECK-NEXT:    // Child Loop BB0_36 Depth 5
+; CHECK-NEXT:    // Child Loop BB0_39 Depth 5
 ; CHECK-NEXT:    ldr w8, [sp, #116] // 4-byte Reload
 ; CHECK-NEXT:    and w8, w8, w8, asr #31
 ; CHECK-NEXT:    str w8, [sp, #128] // 4-byte Spill
@@ -281,23 +281,31 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var
 ; CHECK-NEXT:    mov x24, xzr
 ; CHECK-NEXT:    mul w12, w12, w22
 ; CHECK-NEXT:    mov x22, x5
-; CHECK-NEXT:    tbz w0, #0, .LBB0_33
-; CHECK-NEXT:  .LBB0_28: // %if.then222.us
+; CHECK-NEXT:    tbz w0, #0, .LBB0_36
+; CHECK-NEXT:  .LBB0_28: // %for.body194.us
 ; CHECK-NEXT:    // Parent Loop BB0_4 Depth=1
 ; CHECK-NEXT:    // Parent Loop BB0_6 Depth=2
 ; CHECK-NEXT:    // Parent Loop BB0_8 Depth=3
 ; CHECK-NEXT:    // Parent Loop BB0_10 Depth=4
 ; CHECK-NEXT:    // => This Inner Loop Header: Depth=5
+; CHECK-NEXT:  // %bb.29: // %if.then222.us
+; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
 ; CHECK-NEXT:    adrp x27, :got:var_32
 ; CHECK-NEXT:    ldur w8, [x19, #-12]
 ; CHECK-NEXT:    ldr x27, [x27, :got_lo12:var_32]
 ; CHECK-NEXT:    strh w8, [x27]
 ; CHECK-NEXT:    sxtb w8, w25
-; CHECK-NEXT:    strb w3, [x16]
 ; CHECK-NEXT:    bic w25, w8, w8, asr #31
+; CHECK-NEXT:    b .LBB0_31
+; CHECK-NEXT:    .p2align 5, , 16
+; CHECK-NEXT:  // %bb.30:
+; CHECK-NEXT:    mov w25, wzr
+; CHECK-NEXT:  .LBB0_31: // %if.end239.us
+; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT:    strb w3, [x16]
 ; CHECK-NEXT:    tst w13, #0xff
-; CHECK-NEXT:    b.eq .LBB0_30
-; CHECK-NEXT:  // %bb.29: // %if.then254.us
+; CHECK-NEXT:    b.eq .LBB0_33
+; CHECK-NEXT:  // %bb.32: // %if.then254.us
 ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
 ; CHECK-NEXT:    ldrh w8, [x26, x14, lsl #1]
 ; CHECK-NEXT:    adrp x27, :got:var_35
@@ -306,7 +314,7 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var
 ; CHECK-NEXT:    csel x8, xzr, x7, eq
 ; CHECK-NEXT:    str x8, [x27]
 ; CHECK-NEXT:    strh w1, [x17]
-; CHECK-NEXT:  .LBB0_30: // %if.end282.us
+; CHECK-NEXT:  .LBB0_33: // %if.end282.us
 ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
 ; CHECK-NEXT:    orr x27, x24, x4
 ; CHECK-NEXT:    adrp x8, :got:var_39
@@ -317,14 +325,14 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var
 ; CHECK-NEXT:    str x8, [x18]
 ; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    cbnz x2, .LBB0_27
-; CHECK-NEXT:  // %bb.31: // %if.then327.us
+; CHECK-NEXT:  // %bb.34: // %if.then327.us
 ; CHECK-NEXT:    // in Loop: Header=BB0_28 Depth=5
 ; CHECK-NEXT:    cbz w8, .LBB0_25
-; CHECK-NEXT:  // %bb.32: // in Loop: Header=BB0_28 Depth=5
+; CHECK-NEXT:  // %bb.35: // in Loop: Header=BB0_28 Depth=5
 ; CHECK-NEXT:    mov w4, wzr
 ; CHECK-NEXT:    b .LBB0_26
 ; CHECK-NEXT:    .p2align 5, , 16
-; CHECK-NEXT:  .LBB0_33: // %for.cond376.preheader.us
+; CHECK-NEXT:  .LBB0_36: // %for.cond376.preheader.us
 ; CHECK-NEXT:    // in Loop: Header=BB0_10 Depth=4
 ; CHECK-NEXT:    mov w3, #1152 // =0x480
 ; CHECK-NEXT:    mov x22, xzr
@@ -335,24 +343,24 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var
 ; CHECK-NEXT:    madd x14, x14, x3, x11
 ; CHECK-NEXT:    mov w28, w30
 ; CHECK-NEXT:    mov w3, #-7680 // =0xffffe200
-; CHECK-NEXT:    b .LBB0_36
+; CHECK-NEXT:    b .LBB0_39
 ; CHECK-NEXT:    .p2align 5, , 16
-; CHECK-NEXT:  .LBB0_34: // %if.then466.us
-; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
+; CHECK-NEXT:  .LBB0_37: // %if.then466.us
+; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
 ; CHECK-NEXT:    ldr x28, [sp, #152] // 8-byte Reload
 ; CHECK-NEXT:    ldr x3, [sp, #136] // 8-byte Reload
 ; CHECK-NEXT:    sxtb w4, w4
 ; CHECK-NEXT:    bic w4, w4, w4, asr #31
 ; CHECK-NEXT:    str x3, [x28]
 ; CHECK-NEXT:    mov w3, #-7680 // =0xffffe200
-; CHECK-NEXT:  .LBB0_35: // %for.inc505.us
-; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
+; CHECK-NEXT:  .LBB0_38: // %for.inc505.us
+; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
 ; CHECK-NEXT:    add x22, x22, #1
 ; CHECK-NEXT:    add x27, x27, #1
 ; CHECK-NEXT:    mov w28, wzr
 ; CHECK-NEXT:    cmp x27, #0
 ; CHECK-NEXT:    b.hs .LBB0_9
-; CHECK-NEXT:  .LBB0_36: // %for.body380.us
+; CHECK-NEXT:  .LBB0_39: // %for.body380.us
 ; CHECK-NEXT:    // Parent Loop BB0_4 Depth=1
 ; CHECK-NEXT:    // Parent Loop BB0_6 Depth=2
 ; CHECK-NEXT:    // Parent Loop BB0_8 Depth=3
@@ -364,18 +372,18 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var
 ; CHECK-NEXT:    strh w28, [x11]
 ; CHECK-NEXT:    csel w28, w21, w3, ne
 ; CHECK-NEXT:    str w28, [x20]
-; CHECK-NEXT:    cbz x15, .LBB0_35
-; CHECK-NEXT:  // %bb.37: // %if.then436.us
-; CHECK-NEXT:    // in Loop: Header=BB0_36 Depth=5
+; CHECK-NEXT:    cbz x15, .LBB0_38
+; CHECK-NEXT:  // %bb.40: // %if.then436.us
+; CHECK-NEXT:    // in Loop: Header=BB0_39 Depth=5
 ; CHECK-NEXT:    ldrh w28, [x14]
-; CHECK-NEXT:    cbnz w28, .LBB0_34
-; CHECK-NEXT:  // %bb.38: // in Loop: Header=BB0_36 Depth=5
+; CHECK-NEXT:    cbnz w28, .LBB0_37
+; CHECK-NEXT:  // %bb.41: // in Loop: Header=BB0_39 Depth=5
 ; CHECK-NEXT:    mov w4, wzr
-; CHECK-NEXT:    b .LBB0_35
-; CHECK-NEXT:  .LBB0_39: // %for.body41
+; CHECK-NEXT:    b .LBB0_38
+; CHECK-NEXT:  .LBB0_42: // %for.body41
 ; CHECK-NEXT:    strb wzr, [x4]
 ; CHECK-NEXT:    strb wzr, [x14]
-; CHECK-NEXT:  .LBB0_40: // %for.cond563.preheader
+; CHECK-NEXT:  .LBB0_43: // %for.cond563.preheader
 ; CHECK-NEXT:    ldp x20, x19, [sp, #224] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x22, x21, [sp, #208] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x24, x23, [sp, #192] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/pr166870.ll b/llvm/test/CodeGen/AArch64/pr166870.ll
index 6f54b0465fbfd..48967d30485ab 100644
--- a/llvm/test/CodeGen/AArch64/pr166870.ll
+++ b/llvm/test/CodeGen/AArch64/pr166870.ll
@@ -26,11 +26,12 @@ define i32 @widget(i32 %arg, i32 %arg1, i1 %arg2, ptr %arg3, i1 %arg4) #0 nounwi
 ; CHECK-NEXT:    mov x21, x1
 ; CHECK-NEXT:    bl baz
 ; CHECK-NEXT:    mov w0, #0 // =0x0
+; CHECK-NEXT:  // %bb.5: // %bb6
 ; CHECK-NEXT:    mov w10, #1 // =0x1
+; CHECK-NEXT:    cbnz w10, .LBB0_11
+; CHECK-NEXT:  // %bb.6: // %bb7
 ; CHECK-NEXT:    cbnz w10, .LBB0_10
-; CHECK-NEXT:  // %bb.5: // %bb7
-; CHECK-NEXT:    cbnz w10, .LBB0_9
-; CHECK-NEXT:  // %bb.6: // %bb8
+; CHECK-NEXT:  // %bb.7: // %bb8
 ; CHECK-NEXT:    mov x8, x21
 ; CHECK-NEXT:    mov x9, x20
 ; CHECK-NEXT:    mov w20, #0 // =0x0
@@ -38,17 +39,17 @@ define i32 @widget(i32 %arg, i32 %arg1, i1 %arg2, ptr %arg3, i1 %arg4) #0 nounwi
 ; CHECK-NEXT:    mov x21, x9
 ; CHECK-NEXT:    mov w8, w8
 ; CHECK-NEXT:    mov x22, x8
-; CHECK-NEXT:  .LBB0_7: // %bb10
+; CHECK-NEXT:  .LBB0_8: // %bb10
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    strb w20, [x19]
-; CHECK-NEXT:    cbnz x21, .LBB0_7
-; CHECK-NEXT:  // %bb.8: // %bb12
-; CHECK-NEXT:    // in Loop: Header=BB0_7 Depth=1
+; CHECK-NEXT:    cbnz x21, .LBB0_8
+; CHECK-NEXT:  // %bb.9: // %bb12
+; CHECK-NEXT:    // in Loop: Header=BB0_8 Depth=1
 ; CHECK-NEXT:    bl snork
-; CHECK-NEXT:    cbnz x22, .LBB0_7
-; CHECK-NEXT:  .LBB0_9:
-; CHECK-NEXT:    mov w0, #0 // =0x0
+; CHECK-NEXT:    cbnz x22, .LBB0_8
 ; CHECK-NEXT:  .LBB0_10:
+; CHECK-NEXT:    mov w0, #0 // =0x0
+; CHECK-NEXT:  .LBB0_11:
 ; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x22, x21, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldr x30, [sp], #48 // 8-byte Folded Reload
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
index 215b11a746759..f4815dc331056 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected
@@ -71,21 +71,27 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" }
 ; CHECK-NEXT:    .cfi_def_cfa w29, 16
 ; CHECK-NEXT:    .cfi_offset w30, -8
 ; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_remember_state
 ; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    mov w9, #2 // =0x2
 ; CHECK-NEXT:    stur xzr, [x29, #-8]
-; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
-; CHECK-NEXT:    ldur w8, [x29, #-8]
-; CHECK-NEXT:    cbz w8, .LBB0_2
+; CHECK-NEXT:    b .LBB0_3
 ; CHECK-NEXT:  // %bb.1:
-; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    str w8, [sp, #16]
-; CHECK-NEXT:    b .LBB0_3
+; CHECK-NEXT:    ldur w8, [x29, #-8]
+; CHECK-NEXT:    cbz w8, .LBB0_4
 ; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    .cfi_restore_state
 ; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    mov w9, #2 // =0x2
-; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+; CHECK-NEXT:    str w8, [sp, #16]
+; CHECK-NEXT:    b .LBB0_5
 ; CHECK-NEXT:  .LBB0_3:
+; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+; CHECK-NEXT:    ldur w8, [x29, #-8]
+; CHECK-NEXT:    cbnz w8, .LBB0_2
+; CHECK-NEXT:  .LBB0_4:
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+; CHECK-NEXT:  .LBB0_5:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 48
 ; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
@@ -128,6 +134,7 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" }
 ;
 ; CHECK-LABEL: OUTLINED_FUNCTION_0:
 ; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w9, #2 // =0x2
 ; CHECK-NEXT:    stp w9, w8, [x29, #-12]
 ; CHECK-NEXT:    mov w9, #3 // =0x3
 ; CHECK-NEXT:    mov w8, #4 // =0x4
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
index bf7cf2b54983b..bc3500522672d 100644
--- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected
@@ -12,21 +12,27 @@ define dso_local i32 @check_boundaries() #0 {
 ; CHECK-NEXT:    .cfi_def_cfa w29, 16
 ; CHECK-NEXT:    .cfi_offset w30, -8
 ; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_remember_state
 ; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    mov w9, #2 // =0x2
 ; CHECK-NEXT:    stur xzr, [x29, #-8]
-; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
-; CHECK-NEXT:    ldur w8, [x29, #-8]
-; CHECK-NEXT:    cbz w8, .LBB0_2
+; CHECK-NEXT:    b .LBB0_3
 ; CHECK-NEXT:  // %bb.1:
-; CHECK-NEXT:    mov w8, #1 // =0x1
 ; CHECK-NEXT:    str w8, [sp, #16]
-; CHECK-NEXT:    b .LBB0_3
+; CHECK-NEXT:    ldur w8, [x29, #-8]
+; CHECK-NEXT:    cbz w8, .LBB0_4
 ; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    .cfi_restore_state
 ; CHECK-NEXT:    mov w8, #1 // =0x1
-; CHECK-NEXT:    mov w9, #2 // =0x2
-; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+; CHECK-NEXT:    str w8, [sp, #16]
+; CHECK-NEXT:    b .LBB0_5
 ; CHECK-NEXT:  .LBB0_3:
+; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+; CHECK-NEXT:    ldur w8, [x29, #-8]
+; CHECK-NEXT:    cbnz w8, .LBB0_2
+; CHECK-NEXT:  .LBB0_4:
+; CHECK-NEXT:    mov w8, #1 // =0x1
+; CHECK-NEXT:    bl OUTLINED_FUNCTION_0
+; CHECK-NEXT:  .LBB0_5:
 ; CHECK-NEXT:    mov w0, wzr
 ; CHECK-NEXT:    .cfi_def_cfa wsp, 48
 ; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload

From eab9394a45276f6eba9e0530b2964325efb725e5 Mon Sep 17 00:00:00 2001
From: Naveen Seth Hanig <naveen.hanig@outlook.com>
Date: Wed, 10 Dec 2025 01:32:39 +0100
Subject: [PATCH 78/85] [clang][DependencyScanning] Fix spelling mistake for
 DiagnosticsEngineWithDiagOpts (#171535)

---
 .../clang/DependencyScanning/DependencyScannerImpl.h   | 10 +++++-----
 clang/lib/DependencyScanning/DependencyScannerImpl.cpp |  4 ++--
 .../DependencyScanning/DependencyScanningWorker.cpp    |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/clang/include/clang/DependencyScanning/DependencyScannerImpl.h b/clang/include/clang/DependencyScanning/DependencyScannerImpl.h
index 352a0ad44fb7f..f79f21acf52f9 100644
--- a/clang/include/clang/DependencyScanning/DependencyScannerImpl.h
+++ b/clang/include/clang/DependencyScanning/DependencyScannerImpl.h
@@ -63,15 +63,15 @@ class DependencyScanningAction {
 std::unique_ptr<DiagnosticOptions>
 createDiagOptions(ArrayRef<std::string> CommandLine);
 
-struct DignosticsEngineWithDiagOpts {
+struct DiagnosticsEngineWithDiagOpts {
   // We need to bound the lifetime of the DiagOpts used to create the
   // DiganosticsEngine with the DiagnosticsEngine itself.
   std::unique_ptr<DiagnosticOptions> DiagOpts;
   IntrusiveRefCntPtr<DiagnosticsEngine> DiagEngine;
 
-  DignosticsEngineWithDiagOpts(ArrayRef<std::string> CommandLine,
-                               IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS,
-                               DiagnosticConsumer &DC);
+  DiagnosticsEngineWithDiagOpts(ArrayRef<std::string> CommandLine,
+                                IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS,
+                                DiagnosticConsumer &DC);
 };
 
 struct TextDiagnosticsPrinterWithOutput {
@@ -151,7 +151,7 @@ class CompilerInstanceWithContext {
   // DiagConsumer may points to DiagPrinterWithOS->DiagPrinter, or a custom
   // DiagnosticConsumer passed in from initialize.
   DiagnosticConsumer *DiagConsumer = nullptr;
-  std::unique_ptr<DignosticsEngineWithDiagOpts> DiagEngineWithCmdAndOpts;
+  std::unique_ptr<DiagnosticsEngineWithDiagOpts> DiagEngineWithCmdAndOpts;
 
   // Context - compiler invocation
   // Compilation's command's arguments may be owned by Alloc when expanded from
diff --git a/clang/lib/DependencyScanning/DependencyScannerImpl.cpp b/clang/lib/DependencyScanning/DependencyScannerImpl.cpp
index acd05cc50daa8..beec192e3031d 100644
--- a/clang/lib/DependencyScanning/DependencyScannerImpl.cpp
+++ b/clang/lib/DependencyScanning/DependencyScannerImpl.cpp
@@ -367,7 +367,7 @@ dependencies::createDiagOptions(ArrayRef<std::string> CommandLine) {
   return DiagOpts;
 }
 
-DignosticsEngineWithDiagOpts::DignosticsEngineWithDiagOpts(
+DiagnosticsEngineWithDiagOpts::DiagnosticsEngineWithDiagOpts(
     ArrayRef<std::string> CommandLine,
     IntrusiveRefCntPtr<llvm::vfs::FileSystem> FS, DiagnosticConsumer &DC) {
   std::vector<const char *> CCommandLine(CommandLine.size(), nullptr);
@@ -725,7 +725,7 @@ bool CompilerInstanceWithContext::initialize(DiagnosticConsumer *DC) {
   std::tie(OverlayFS, CommandLine) = initVFSForByNameScanning(
       Worker.DepFS, CommandLine, CWD, "ScanningByName");
 
-  DiagEngineWithCmdAndOpts = std::make_unique<DignosticsEngineWithDiagOpts>(
+  DiagEngineWithCmdAndOpts = std::make_unique<DiagnosticsEngineWithDiagOpts>(
       CommandLine, OverlayFS, *DiagConsumer);
 
   std::tie(Driver, Compilation) = buildCompilation(
diff --git a/clang/lib/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/DependencyScanning/DependencyScanningWorker.cpp
index 7b03abd8e3138..5982dcf160083 100644
--- a/clang/lib/DependencyScanning/DependencyScanningWorker.cpp
+++ b/clang/lib/DependencyScanning/DependencyScanningWorker.cpp
@@ -100,7 +100,7 @@ bool DependencyScanningWorker::scanDependencies(
     FS = std::move(OverlayFS);
   }
 
-  DignosticsEngineWithDiagOpts DiagEngineWithCmdAndOpts(CommandLine, FS, DC);
+  DiagnosticsEngineWithDiagOpts DiagEngineWithCmdAndOpts(CommandLine, FS, DC);
   DependencyScanningAction Action(Service, WorkingDirectory, Consumer,
                                   Controller, DepFS);
 

From c155124bb5189cf0c7fd7db14e07cf3932a11796 Mon Sep 17 00:00:00 2001
From: Razvan Lupusoru <razvan.lupusoru@gmail.com>
Date: Tue, 9 Dec 2025 16:54:31 -0800
Subject: [PATCH 79/85] [mlir][acc] Fix build error for tiling API return value
 (#171546)

The build error looks like:
error: could not convert 'newLoops' from 'SmallVector<[...],3>' to
'SmallVector<[...],6>'
  310 |   return newLoops;

The fix is to remove the explicit size in the local declaration for the
SmallVector being returned.
---
 mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsTiling.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsTiling.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsTiling.cpp
index bf82d247028b9..0b344ba2f8316 100644
--- a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsTiling.cpp
+++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsTiling.cpp
@@ -253,7 +253,7 @@ llvm::SmallVector<mlir::acc::LoopOp>
 mlir::acc::uncollapseLoops(mlir::acc::LoopOp origLoop, unsigned tileCount,
                            unsigned collapseCount,
                            mlir::RewriterBase &rewriter) {
-  llvm::SmallVector<mlir::acc::LoopOp, 3> newLoops;
+  llvm::SmallVector<mlir::acc::LoopOp> newLoops;
   llvm::SmallVector<mlir::Value, 3> newIVs;
   mlir::Location loc = origLoop.getLoc();
   llvm::SmallVector<bool> newInclusiveUBs;

From 2d0c14cfb5ca87f15269832269f9e240bad2601d Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman@google.com>
Date: Tue, 9 Dec 2025 17:07:06 -0800
Subject: [PATCH 80/85] [bazel] Port 21147e7c95c03f554d4a7fb9b55b8e459357eb49
 (#171545)

Adds a couple additional deps to OpenACCUtils.
---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 9967aa8e08fd9..e6f577d6665c2 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -10393,6 +10393,8 @@ cc_library(
     ),
     includes = ["include"],
     deps = [
+        ":ArithDialect",
+        ":DialectUtils",
         ":FunctionInterfaces",
         ":IR",
         ":OpenACCDialect",
@@ -10400,6 +10402,7 @@ cc_library(
         ":OpenACCPassIncGen",
         ":OpenACCTypeInterfacesIncGen",
         ":Support",
+        ":TransformUtils",
         ":ViewLikeInterface",
         "//llvm:Support",
         "//llvm:ir_headers",

From b3a5ad14bff5888b6095dbfe31b6ff0bf1200d30 Mon Sep 17 00:00:00 2001
From: Mike Urbach <mikeurbach@gmail.com>
Date: Tue, 9 Dec 2025 18:22:51 -0700
Subject: [PATCH 81/85] [MLIR][IRDL] Add C API for IRDL Variadicity attributes.
 (#171076)

This add the basic APIs to create VariadicityAttr and
VariadicityArrayAttr attributes from the C API. This is necessary for
C API users that want to create IRDL dialect declarations.
---
 mlir/include/mlir-c/Dialect/IRDL.h | 14 ++++++++++++
 mlir/lib/CAPI/Dialect/IRDL.cpp     | 27 ++++++++++++++++++++++++
 mlir/test/CAPI/irdl.c              | 34 ++++++++++++++++++++++++++++++
 3 files changed, 75 insertions(+)

diff --git a/mlir/include/mlir-c/Dialect/IRDL.h b/mlir/include/mlir-c/Dialect/IRDL.h
index c4d6ffd989af9..d87ab864fb33f 100644
--- a/mlir/include/mlir-c/Dialect/IRDL.h
+++ b/mlir/include/mlir-c/Dialect/IRDL.h
@@ -22,6 +22,20 @@ MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(IRDL, irdl);
 /// the module's associated context.
 MLIR_CAPI_EXPORTED MlirLogicalResult mlirLoadIRDLDialects(MlirModule module);
 
+//===----------------------------------------------------------------------===//
+// VariadicityAttr
+//===----------------------------------------------------------------------===//
+
+MLIR_CAPI_EXPORTED MlirAttribute
+mlirIRDLVariadicityAttrGet(MlirContext ctx, MlirStringRef value);
+
+//===----------------------------------------------------------------------===//
+// VariadicityArrayAttr
+//===----------------------------------------------------------------------===//
+
+MLIR_CAPI_EXPORTED MlirAttribute mlirIRDLVariadicityArrayAttrGet(
+    MlirContext ctx, intptr_t nValues, MlirAttribute const *values);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/mlir/lib/CAPI/Dialect/IRDL.cpp b/mlir/lib/CAPI/Dialect/IRDL.cpp
index cb9dc8ceb6795..43f420bf6db05 100644
--- a/mlir/lib/CAPI/Dialect/IRDL.cpp
+++ b/mlir/lib/CAPI/Dialect/IRDL.cpp
@@ -16,3 +16,30 @@ MLIR_DEFINE_CAPI_DIALECT_REGISTRATION(IRDL, irdl, mlir::irdl::IRDLDialect)
 MlirLogicalResult mlirLoadIRDLDialects(MlirModule module) {
   return wrap(mlir::irdl::loadDialects(unwrap(module)));
 }
+
+//===----------------------------------------------------------------------===//
+// VariadicityAttr
+//===----------------------------------------------------------------------===//
+
+MlirAttribute mlirIRDLVariadicityAttrGet(MlirContext ctx, MlirStringRef value) {
+  return wrap(mlir::irdl::VariadicityAttr::get(
+      unwrap(ctx), mlir::irdl::symbolizeVariadicity(unwrap(value)).value()));
+}
+
+//===----------------------------------------------------------------------===//
+// VariadicityArrayAttr
+//===----------------------------------------------------------------------===//
+
+MlirAttribute mlirIRDLVariadicityArrayAttrGet(MlirContext ctx, intptr_t nValues,
+                                              MlirAttribute const *values) {
+  llvm::SmallVector<mlir::Attribute> attrs;
+  llvm::ArrayRef<mlir::Attribute> unwrappedAttrs =
+      unwrapList(nValues, values, attrs);
+
+  llvm::SmallVector<mlir::irdl::VariadicityAttr> variadicities;
+  for (auto attr : unwrappedAttrs)
+    variadicities.push_back(llvm::cast<mlir::irdl::VariadicityAttr>(attr));
+
+  return wrap(
+      mlir::irdl::VariadicityArrayAttr::get(unwrap(ctx), variadicities));
+}
diff --git a/mlir/test/CAPI/irdl.c b/mlir/test/CAPI/irdl.c
index ad52ece6a41ce..20cf35f2501ff 100644
--- a/mlir/test/CAPI/irdl.c
+++ b/mlir/test/CAPI/irdl.c
@@ -12,6 +12,7 @@
 
 #include "mlir-c/Dialect/IRDL.h"
 #include "mlir-c/IR.h"
+#include "mlir-c/Support.h"
 
 const char irdlDialect[] = "\
   irdl.dialect @foo {\
@@ -37,10 +38,39 @@ const char newDialectUsage[] = "\
     \"bar.op\"(%res) : (i32) -> ()\
   }";
 
+void testVariadicityAttributes(MlirContext ctx) {
+  MlirAttribute variadicitySingle =
+      mlirIRDLVariadicityAttrGet(ctx, mlirStringRefCreateFromCString("single"));
+
+  // CHECK: #irdl<variadicity single>
+  mlirAttributeDump(variadicitySingle);
+
+  MlirAttribute variadicityOptional = mlirIRDLVariadicityAttrGet(
+      ctx, mlirStringRefCreateFromCString("optional"));
+
+  // CHECK: #irdl<variadicity optional>
+  mlirAttributeDump(variadicityOptional);
+
+  MlirAttribute variadicityVariadic = mlirIRDLVariadicityAttrGet(
+      ctx, mlirStringRefCreateFromCString("variadic"));
+
+  // CHECK: #irdl<variadicity variadic>
+  mlirAttributeDump(variadicityVariadic);
+
+  MlirAttribute variadicities[] = {variadicitySingle, variadicityOptional,
+                                   variadicityVariadic};
+  MlirAttribute variadicityArray =
+      mlirIRDLVariadicityArrayAttrGet(ctx, 3, variadicities);
+
+  // CHECK: #irdl<variadicity_array[ single, optional,  variadic]>
+  mlirAttributeDump(variadicityArray);
+}
+
 int main(void) {
   MlirContext ctx = mlirContextCreate();
   mlirDialectHandleLoadDialect(mlirGetDialectHandle__irdl__(), ctx);
 
+  // Test loading an IRDL dialect and using it.
   MlirModule dialectDecl =
       mlirModuleCreateParse(ctx, mlirStringRefCreateFromCString(irdlDialect));
 
@@ -53,6 +83,10 @@ int main(void) {
   mlirOperationDump(mlirModuleGetOperation(usingModule));
 
   mlirModuleDestroy(usingModule);
+
+  // Test variadicity attributes.
+  testVariadicityAttributes(ctx);
+
   mlirContextDestroy(ctx);
   return 0;
 }

From f309fabdfc3b6e70968de7e9deef40b3995911b5 Mon Sep 17 00:00:00 2001
From: Jianjian Guan <jacquesguan@me.com>
Date: Wed, 10 Dec 2025 10:06:56 +0800
Subject: [PATCH 82/85] [RISCV][GISel] Fix legalizer of G_INSERT_SUBVECTOR
 (#171091)

Fix early exit condition and use the right type as mask type of
vslideup.
---
 .../Target/RISCV/GISel/RISCVLegalizerInfo.cpp |   4 +-
 .../rvv/legalize-insert-subvector.mir         | 322 +++++++++++++++---
 2 files changed, 276 insertions(+), 50 deletions(-)

diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index 2cc594a33eb0d..a37b34a3758a1 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -1240,7 +1240,7 @@ bool RISCVLegalizerInfo::legalizeInsertSubvector(MachineInstr &MI,
   LLT BigTy = MRI.getType(BigVec);
   LLT LitTy = MRI.getType(LitVec);
 
-  if (Idx == 0 ||
+  if (Idx == 0 &&
       MRI.getVRegDef(BigVec)->getOpcode() == TargetOpcode::G_IMPLICIT_DEF)
     return true;
 
@@ -1314,7 +1314,7 @@ bool RISCVLegalizerInfo::legalizeInsertSubvector(MachineInstr &MI,
   auto Insert = MIB.buildInsertSubvector(InterLitTy, MIB.buildUndef(InterLitTy),
                                          LitVec, 0);
 
-  auto [Mask, _] = buildDefaultVLOps(BigTy, MIB, MRI);
+  auto [Mask, _] = buildDefaultVLOps(InterLitTy, MIB, MRI);
   auto VL = MIB.buildVScale(XLenTy, LitTy.getElementCount().getKnownMinValue());
 
   // If we're inserting into the lowest elements, use a tail undisturbed
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir
index e8b5325bf8ef1..abf7e5efa9952 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir
@@ -10,12 +10,65 @@ legalized:       false
 tracksRegLiveness: true
 body:             |
   bb.0.entry:
-    ; CHECK-LABEL: name: insert_subvector_nxv2i1_nxv4i1_undef_nonzero
-    ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 2 x s1>), 2
-    ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 4 x s1>)
-    ; CHECK-NEXT: PseudoRET implicit $v8
+    ; RV32-LABEL: name: insert_subvector_nxv2i1_nxv4i1_undef_nonzero
+    ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+    ; RV32-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+    ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s64)
+    ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C1]](s64)
+    ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[DEF]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+    ; RV32-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C2]](s64)
+    ; RV32-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C3]](s64)
+    ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF1]](<vscale x 2 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+    ; RV32-NEXT: [[DEF2:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_SUBVECTOR [[DEF2]], [[SELECT1]](<vscale x 2 x s8>), 0
+    ; RV32-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL [[C4]](s64)
+    ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
+    ; RV32-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C5]](s64)
+    ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB
+    ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C5]](s64)
+    ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]]
+    ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 4 x s8>) = G_VSLIDEUP_VL [[SELECT]], [[INSERT_SUBVECTOR]], [[LSHR1]](s64), [[VMSET_VL]](<vscale x 4 x s1>), [[ADD]](s64), 1
+    ; RV32-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; RV32-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C6]](s64)
+    ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), [[VSLIDEUP_VL]](<vscale x 4 x s8>), [[SPLAT_VECTOR4]]
+    ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+    ; RV32-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64-LABEL: name: insert_subvector_nxv2i1_nxv4i1_undef_nonzero
+    ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+    ; RV64-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+    ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+    ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+    ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[DEF]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+    ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+    ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+    ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF1]](<vscale x 2 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+    ; RV64-NEXT: [[DEF2:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_SUBVECTOR [[DEF2]], [[SELECT1]](<vscale x 2 x s8>), 0
+    ; RV64-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL [[C4]](s32)
+    ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
+    ; RV64-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C5]](s32)
+    ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB
+    ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C5]](s32)
+    ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]]
+    ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 4 x s8>) = G_VSLIDEUP_VL [[SELECT]], [[INSERT_SUBVECTOR]], [[LSHR1]](s32), [[VMSET_VL]](<vscale x 4 x s1>), [[ADD]](s32), 1
+    ; RV64-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; RV64-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C6]](s32)
+    ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), [[VSLIDEUP_VL]](<vscale x 4 x s8>), [[SPLAT_VECTOR4]]
+    ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+    ; RV64-NEXT: PseudoRET implicit $v8
     %0:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
     %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
     %2:_(<vscale x 4 x s1>) = G_INSERT_SUBVECTOR %0(<vscale x 4 x s1>), %1, 2
@@ -264,14 +317,63 @@ tracksRegLiveness: true
 body:             |
   bb.0.entry:
     liveins: $v8
-    ; CHECK-LABEL: name: insert_subvector_nxv2i1_nxv4i1_zero
-    ; CHECK: liveins: $v8
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](<vscale x 1 x s1>), 0
-    ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 4 x s1>)
-    ; CHECK-NEXT: PseudoRET implicit $v8
+    ; RV32-LABEL: name: insert_subvector_nxv2i1_nxv4i1_zero
+    ; RV32: liveins: $v8
+    ; RV32-NEXT: {{  $}}
+    ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
+    ; RV32-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+    ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s64)
+    ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C1]](s64)
+    ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+    ; RV32-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C2]](s64)
+    ; RV32-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C3]](s64)
+    ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[DEF]](<vscale x 1 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+    ; RV32-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_SUBVECTOR [[DEF1]], [[SELECT1]](<vscale x 1 x s8>), 0
+    ; RV32-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL [[C4]](s64)
+    ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
+    ; RV32-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C5]](s64)
+    ; RV32-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_(<vscale x 4 x s8>) = G_VMV_V_V_VL [[SELECT]], [[INSERT_SUBVECTOR]], [[LSHR]](s64)
+    ; RV32-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; RV32-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C6]](s64)
+    ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), [[VMV_V_V_VL]](<vscale x 4 x s8>), [[SPLAT_VECTOR4]]
+    ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+    ; RV32-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64-LABEL: name: insert_subvector_nxv2i1_nxv4i1_zero
+    ; RV64: liveins: $v8
+    ; RV64-NEXT: {{  $}}
+    ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s1>) = COPY $v8
+    ; RV64-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+    ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+    ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+    ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[COPY]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+    ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+    ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+    ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_SELECT [[DEF]](<vscale x 1 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+    ; RV64-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_IMPLICIT_DEF
+    ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_INSERT_SUBVECTOR [[DEF1]], [[SELECT1]](<vscale x 1 x s8>), 0
+    ; RV64-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL [[C4]](s32)
+    ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
+    ; RV64-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C5]](s32)
+    ; RV64-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_(<vscale x 4 x s8>) = G_VMV_V_V_VL [[SELECT]], [[INSERT_SUBVECTOR]], [[LSHR]](s32)
+    ; RV64-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; RV64-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C6]](s32)
+    ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), [[VMV_V_V_VL]](<vscale x 4 x s8>), [[SPLAT_VECTOR4]]
+    ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+    ; RV64-NEXT: PseudoRET implicit $v8
     %0:_(<vscale x 4 x s1>) = COPY $v8
     %1:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
     %2:_(<vscale x 4 x s1>) = G_INSERT_SUBVECTOR %0(<vscale x 4 x s1>), %1, 0
@@ -285,14 +387,63 @@ tracksRegLiveness: true
 body:             |
   bb.0.entry:
     liveins: $v8
-    ; CHECK-LABEL: name: insert_subvector_nxv4i1_nxv8i1_zero
-    ; CHECK: liveins: $v8
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](<vscale x 2 x s1>), 0
-    ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 8 x s1>)
-    ; CHECK-NEXT: PseudoRET implicit $v8
+    ; RV32-LABEL: name: insert_subvector_nxv4i1_nxv8i1_zero
+    ; RV32: liveins: $v8
+    ; RV32-NEXT: {{  $}}
+    ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
+    ; RV32-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+    ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C]](s64)
+    ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C1]](s64)
+    ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+    ; RV32-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C2]](s64)
+    ; RV32-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+    ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C3]](s64)
+    ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF]](<vscale x 2 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+    ; RV32-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_SUBVECTOR [[DEF1]], [[SELECT1]](<vscale x 2 x s8>), 0
+    ; RV32-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL [[C4]](s64)
+    ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
+    ; RV32-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C5]](s64)
+    ; RV32-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_(<vscale x 8 x s8>) = G_VMV_V_V_VL [[SELECT]], [[INSERT_SUBVECTOR]], [[LSHR]](s64)
+    ; RV32-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+    ; RV32-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C6]](s64)
+    ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(ne), [[VMV_V_V_VL]](<vscale x 8 x s8>), [[SPLAT_VECTOR4]]
+    ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+    ; RV32-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64-LABEL: name: insert_subvector_nxv4i1_nxv8i1_zero
+    ; RV64: liveins: $v8
+    ; RV64-NEXT: {{  $}}
+    ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 8 x s1>) = COPY $v8
+    ; RV64-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+    ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+    ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+    ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[COPY]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+    ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+    ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+    ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF]](<vscale x 2 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+    ; RV64-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_SUBVECTOR [[DEF1]], [[SELECT1]](<vscale x 2 x s8>), 0
+    ; RV64-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL [[C4]](s32)
+    ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
+    ; RV64-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C5]](s32)
+    ; RV64-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_(<vscale x 8 x s8>) = G_VMV_V_V_VL [[SELECT]], [[INSERT_SUBVECTOR]], [[LSHR]](s32)
+    ; RV64-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; RV64-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C6]](s32)
+    ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(ne), [[VMV_V_V_VL]](<vscale x 8 x s8>), [[SPLAT_VECTOR4]]
+    ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+    ; RV64-NEXT: PseudoRET implicit $v8
     %0:_(<vscale x 8 x s1>) = COPY $v8
     %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
     %2:_(<vscale x 8 x s1>) = G_INSERT_SUBVECTOR %0(<vscale x 8 x s1>), %1, 0
@@ -306,14 +457,43 @@ tracksRegLiveness: true
 body:             |
   bb.0.entry:
     liveins: $v8
-    ; CHECK-LABEL: name: insert_subvector_nxv32i1_nxv64i1_zero
-    ; CHECK: liveins: $v8
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 64 x s1>) = COPY $v8
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 64 x s1>) = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](<vscale x 16 x s1>), 0
-    ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 64 x s1>)
-    ; CHECK-NEXT: PseudoRET implicit $v8
+    ; RV32-LABEL: name: insert_subvector_nxv32i1_nxv64i1_zero
+    ; RV32: liveins: $v8
+    ; RV32-NEXT: {{  $}}
+    ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 64 x s1>) = COPY $v8
+    ; RV32-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s8>) = G_BITCAST [[COPY]](<vscale x 64 x s1>)
+    ; RV32-NEXT: [[BITCAST1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_BITCAST [[DEF]](<vscale x 16 x s1>)
+    ; RV32-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_SUBVECTOR [[DEF1]], [[BITCAST1]](<vscale x 2 x s8>), 0
+    ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL [[C]](s64)
+    ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
+    ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C1]](s64)
+    ; RV32-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_(<vscale x 8 x s8>) = G_VMV_V_V_VL [[BITCAST]], [[INSERT_SUBVECTOR]], [[LSHR]](s64)
+    ; RV32-NEXT: [[BITCAST2:%[0-9]+]]:_(<vscale x 64 x s1>) = G_BITCAST [[VMV_V_V_VL]](<vscale x 8 x s8>)
+    ; RV32-NEXT: $v8 = COPY [[BITCAST2]](<vscale x 64 x s1>)
+    ; RV32-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64-LABEL: name: insert_subvector_nxv32i1_nxv64i1_zero
+    ; RV64: liveins: $v8
+    ; RV64-NEXT: {{  $}}
+    ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 64 x s1>) = COPY $v8
+    ; RV64-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+    ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s8>) = G_BITCAST [[COPY]](<vscale x 64 x s1>)
+    ; RV64-NEXT: [[BITCAST1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_BITCAST [[DEF]](<vscale x 16 x s1>)
+    ; RV64-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_IMPLICIT_DEF
+    ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_INSERT_SUBVECTOR [[DEF1]], [[BITCAST1]](<vscale x 2 x s8>), 0
+    ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL [[C]](s32)
+    ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
+    ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+    ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C1]](s32)
+    ; RV64-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_(<vscale x 8 x s8>) = G_VMV_V_V_VL [[BITCAST]], [[INSERT_SUBVECTOR]], [[LSHR]](s32)
+    ; RV64-NEXT: [[BITCAST2:%[0-9]+]]:_(<vscale x 64 x s1>) = G_BITCAST [[VMV_V_V_VL]](<vscale x 8 x s8>)
+    ; RV64-NEXT: $v8 = COPY [[BITCAST2]](<vscale x 64 x s1>)
+    ; RV64-NEXT: PseudoRET implicit $v8
     %0:_(<vscale x 64 x s1>) = COPY $v8
     %1:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
     %2:_(<vscale x 64 x s1>) = G_INSERT_SUBVECTOR %0(<vscale x 64 x s1>), %1, 0
@@ -329,14 +509,37 @@ tracksRegLiveness: true
 body:             |
   bb.0.entry:
     liveins: $v8
-    ; CHECK-LABEL: name: insert_subvector_nxv1i8_nxv2i8_zero
-    ; CHECK: liveins: $v8
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](<vscale x 1 x s8>), 0
-    ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 2 x s8>)
-    ; CHECK-NEXT: PseudoRET implicit $v8
+    ; RV32-LABEL: name: insert_subvector_nxv1i8_nxv2i8_zero
+    ; RV32: liveins: $v8
+    ; RV32-NEXT: {{  $}}
+    ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+    ; RV32-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    ; RV32-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_SUBVECTOR [[DEF1]], [[DEF]](<vscale x 1 x s8>), 0
+    ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL [[C]](s64)
+    ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
+    ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C1]](s64)
+    ; RV32-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_(<vscale x 2 x s8>) = G_VMV_V_V_VL [[COPY]], [[INSERT_SUBVECTOR]], [[LSHR]](s64)
+    ; RV32-NEXT: $v8 = COPY [[VMV_V_V_VL]](<vscale x 2 x s8>)
+    ; RV32-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64-LABEL: name: insert_subvector_nxv1i8_nxv2i8_zero
+    ; RV64: liveins: $v8
+    ; RV64-NEXT: {{  $}}
+    ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 2 x s8>) = COPY $v8
+    ; RV64-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+    ; RV64-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+    ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_SUBVECTOR [[DEF1]], [[DEF]](<vscale x 1 x s8>), 0
+    ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL [[C]](s32)
+    ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
+    ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C1]](s32)
+    ; RV64-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_(<vscale x 2 x s8>) = G_VMV_V_V_VL [[COPY]], [[INSERT_SUBVECTOR]], [[LSHR]](s32)
+    ; RV64-NEXT: $v8 = COPY [[VMV_V_V_VL]](<vscale x 2 x s8>)
+    ; RV64-NEXT: PseudoRET implicit $v8
     %0:_(<vscale x 2 x s8>) = COPY $v8
     %1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
     %2:_(<vscale x 2 x s8>) = G_INSERT_SUBVECTOR %0(<vscale x 2 x s8>), %1, 0
@@ -350,14 +553,37 @@ tracksRegLiveness: true
 body:             |
   bb.0.entry:
     liveins: $v8
-    ; CHECK-LABEL: name: insert_subvector_nxv2i16_nxv4i16_zero
-    ; CHECK: liveins: $v8
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](<vscale x 1 x s16>), 0
-    ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 4 x s16>)
-    ; CHECK-NEXT: PseudoRET implicit $v8
+    ; RV32-LABEL: name: insert_subvector_nxv2i16_nxv4i16_zero
+    ; RV32: liveins: $v8
+    ; RV32-NEXT: {{  $}}
+    ; RV32-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+    ; RV32-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    ; RV32-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_SUBVECTOR [[DEF1]], [[DEF]](<vscale x 1 x s16>), 0
+    ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
+    ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL [[C]](s64)
+    ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
+    ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+    ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C1]](s64)
+    ; RV32-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_(<vscale x 4 x s16>) = G_VMV_V_V_VL [[COPY]], [[INSERT_SUBVECTOR]], [[LSHR]](s64)
+    ; RV32-NEXT: $v8 = COPY [[VMV_V_V_VL]](<vscale x 4 x s16>)
+    ; RV32-NEXT: PseudoRET implicit $v8
+    ;
+    ; RV64-LABEL: name: insert_subvector_nxv2i16_nxv4i16_zero
+    ; RV64: liveins: $v8
+    ; RV64-NEXT: {{  $}}
+    ; RV64-NEXT: [[COPY:%[0-9]+]]:_(<vscale x 4 x s16>) = COPY $v8
+    ; RV64-NEXT: [[DEF:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+    ; RV64-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+    ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_SUBVECTOR [[DEF1]], [[DEF]](<vscale x 1 x s16>), 0
+    ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+    ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL [[C]](s32)
+    ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
+    ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C1]](s32)
+    ; RV64-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_(<vscale x 4 x s16>) = G_VMV_V_V_VL [[COPY]], [[INSERT_SUBVECTOR]], [[LSHR]](s32)
+    ; RV64-NEXT: $v8 = COPY [[VMV_V_V_VL]](<vscale x 4 x s16>)
+    ; RV64-NEXT: PseudoRET implicit $v8
     %0:_(<vscale x 4 x s16>) = COPY $v8
     %1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
     %2:_(<vscale x 4 x s16>) = G_INSERT_SUBVECTOR %0(<vscale x 4 x s16>), %1, 0
@@ -524,7 +750,7 @@ body:             |
     ; RV32-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
     ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_SUBVECTOR [[DEF1]], [[DEF]](<vscale x 1 x s16>), 0
     ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1
-    ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL [[C]](s64)
+    ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL [[C]](s64)
     ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
     ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
     ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C1]](s64)
@@ -542,7 +768,7 @@ body:             |
     ; RV64-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
     ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_SUBVECTOR [[DEF1]], [[DEF]](<vscale x 1 x s16>), 0
     ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
-    ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL [[C]](s32)
+    ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL [[C]](s32)
     ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
     ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
     ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C1]](s32)

From 17b13e9e3f43d7e1d7036e8b03451148e8250b45 Mon Sep 17 00:00:00 2001
From: KAWASHIMA Takahiro <t-kawashima@fujitsu.com>
Date: Wed, 10 Dec 2025 11:42:32 +0900
Subject: [PATCH 83/85] [flang][docs] Correct the title in Unsigned.md
 (#171553)

Probablly the original author copied `Extensions.md` and forgot to
change the title.
---
 flang/docs/Unsigned.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/docs/Unsigned.md b/flang/docs/Unsigned.md
index 5c90e2aa185bc..5815822cd53ad 100644
--- a/flang/docs/Unsigned.md
+++ b/flang/docs/Unsigned.md
@@ -6,7 +6,7 @@
   
 -->
 
-# Fortran Extensions supported by Flang
+# Flang support for UNSIGNED type
 
 ```{contents}
 ---

From 09eb25f1fa47b3776e291fb96bd8664e027de27d Mon Sep 17 00:00:00 2001
From: KAWASHIMA Takahiro <t-kawashima@fujitsu.com>
Date: Wed, 10 Dec 2025 11:43:39 +0900
Subject: [PATCH 84/85] [flang][docs] Replace Flang to Classic Flang in old doc
 (#171558)

This document was written when Flang was known as F18 and Classic Flang
was known as Flang. The term "Flang" in this document refers to Classic
Flang, except in the first paragraph.

Also, a trivial HTML error is fixed.
---
 flang/docs/OptionComparison.md | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/flang/docs/OptionComparison.md b/flang/docs/OptionComparison.md
index 934750a07e9bb..c416742c8217b 100644
--- a/flang/docs/OptionComparison.md
+++ b/flang/docs/OptionComparison.md
@@ -8,7 +8,7 @@
 
 # Compiler options comparison
 
-This document catalogs the options processed by Flang's peers/competitors.  Much of the document is taken up by a set of tables that list the options categorized into different topics.  Some of the table headings link to more information about the contents of the tables.  For example, the table on **Standards conformance** options links to <a href=#standards">notes on Standards conformance</a>.
+This document catalogs the options processed by Flang's peers/competitors.  Much of the document is taken up by a set of tables that list the options categorized into different topics.  Some of the table headings link to more information about the contents of the tables.  For example, the table on **Standards conformance** options links to <a href="#standards">notes on Standards conformance</a>.
 
 **There's also important information in the ___[Appendix section](#appendix)___ near the end of the document on how this data was gathered and what ___is___ and ___is not___ included in this document.**  
 
@@ -28,7 +28,7 @@ Note that compilers may support language features without having an option for t
    <td><strong>IBM</strong> </td>
    <td><strong>Intel</strong> </td>
    <td><strong>PGI</strong> </td>
-   <td><strong>Flang</strong> </td>
+   <td><strong>Classic Flang</strong> </td>
   </tr>
   <tr>
    <td>Overall conformance </td>
@@ -112,7 +112,7 @@ fall-intrinsics
    </td>
    <td><strong>PGI</strong>
    </td>
-   <td><strong>Flang</strong>
+   <td><strong>Classic Flang</strong>
    </td>
   </tr>
   <tr>
@@ -241,7 +241,7 @@ fd-lines-as-comments
    </td>
    <td><strong>PGI</strong>
    </td>
-   <td><strong>Flang</strong>
+   <td><strong>Classic Flang</strong>
    </td>
   </tr>
   <tr>
@@ -410,7 +410,7 @@ fd-lines-as-comments
    </td>
    <td><strong>PGI</strong>
    </td>
-   <td><strong>Flang</strong>
+   <td><strong>Classic Flang</strong>
    </td>
   </tr>
   <tr>
@@ -471,7 +471,7 @@ fd-lines-as-comments
    </td>
    <td><strong>PGI</strong>
    </td>
-   <td><strong>Flang</strong>
+   <td><strong>Classic Flang</strong>
    </td>
   </tr>
   <tr>
@@ -604,7 +604,7 @@ Mr8intrinsics
    </td>
    <td><strong>PGI</strong>
    </td>
-   <td><strong>Flang</strong>
+   <td><strong>Classic Flang</strong>
    </td>
   </tr>
   <tr>
@@ -739,7 +739,7 @@ fdefault-integer-8
    </td>
    <td><strong>PGI</strong>
    </td>
-   <td><strong>Flang</strong>
+   <td><strong>Classic Flang</strong>
    </td>
   </tr>
   <tr>
@@ -810,7 +810,7 @@ fdefault-integer-8
    </td>
    <td><strong>PGI</strong>
    </td>
-   <td><strong>Flang</strong>
+   <td><strong>Classic Flang</strong>
    </td>
   </tr>
   <tr>
@@ -952,7 +952,7 @@ Msave
    </td>
    <td><strong>PGI</strong>
    </td>
-   <td><strong>Flang</strong>
+   <td><strong>Classic Flang</strong>
    </td>
   </tr>
   <tr>
@@ -1027,7 +1027,7 @@ Msave
    </td>
    <td><strong>PGI</strong>
    </td>
-   <td><strong>Flang</strong>
+   <td><strong>Classic Flang</strong>
    </td>
   </tr>
   <tr>
@@ -1098,7 +1098,7 @@ Mcuda
    </td>
    <td><strong>PGI</strong>
    </td>
-   <td><strong>Flang</strong>
+   <td><strong>Classic Flang</strong>
    </td>
   </tr>
   <tr>
@@ -1220,7 +1220,7 @@ IBM Fortran's options allow the source line length to be specified with the opti
 *   **GNU:** For both "ffixed-line-length-_n_" and "ffree-line-length-_n_" options, characters are ignored after the specified length.  The default for fixed is 72.  The default for free is 132.  For free, you can specify 'none' as the length, which means that all characters in the line are meaningful.
 *   **IBM:** For **fixed**, the default is 72.  For **free**, there's no default, but the maximum length for either form is 132.
 *   **Intel:** The default is 72 for **fixed** and 132 for **free**.
-*   **PGI, Flang:** 
+*   **PGI, Classic Flang:** 
     * in free form, it is an error if the line is longer than 1000 characters
     * in fixed form by default, characters after column 72 are ignored
     * in fixed form with -Mextend, characters after column 132 are ignored
@@ -1233,7 +1233,7 @@ IBM Fortran's options allow the source line length to be specified with the opti
 *   **GNU:** The "-fbackslash" option the interpretation of backslashes in string literals from a single backslash character to "C-style" escape characters. The following combinations are expanded \a, \b, \f, \n, \r, \t, \v, \\, and \0 to the ASCII characters alert, backspace, form feed, newline, carriage return, horizontal tab, vertical tab, backslash, and NUL, respectively. Additionally, \xnn, \unnnn and \Unnnnnnnn (where each n is a hexadecimal digit) are translated into the Unicode characters corresponding to the specified code points. All other combinations of a character preceded by \ are unexpanded.
 *   **Intel:** The option "-assume bscc" tells the compiler to treat the backslash character (\) as a C-style control (escape) character syntax in character literals. "nobscc" specifies that the backslash character is treated as a normal character in character literals.  This is the default.
 
-**"$" in symbol names:** Allowing "$" in names is controlled by an option in GNU and is the default behavior in IBM and Intel.  Presumably, these compilers issue warnings when standard conformance options are enabled.  Dollar signs in names don't seem to be allowed in Cray, PGI, or Flang.
+**"$" in symbol names:** Allowing "$" in names is controlled by an option in GNU and is the default behavior in IBM and Intel.  Presumably, these compilers issue warnings when standard conformance options are enabled.  Dollar signs in names don't seem to be allowed in Cray, PGI, or Classic Flang.
 
 **<a name="do"></a>DO loop handling**
 
@@ -1328,7 +1328,7 @@ Here's the list of compilers surveyed, hot linked to the source of data on it.
 *   [NAG Fortran Release 6.2](https://www.nag.co.uk/nagware/np/r62_doc/manual/compiler_2_4.html)
 *   [Oracle Fortran version 819-0492-10](https://docs.oracle.com/cd/E19059-01/stud.10/819-0492/3_options.html)
 *   PGI -- [Compiler Reference version 19.1](https://www.pgroup.com/resources/docs/19.1/x86/pgi-ref-guide/index.htm#cmdln-options-ref), [Fortran Reference Guide version 17](https://www.pgroup.com/doc/pgi17fortref.pdf)
-*   [Flang](https://github.com/flang-compiler/flang/wiki/Using-Flang) -- information from GitHub
+*   [Classic Flang](https://github.com/flang-compiler/flang/wiki/Using-Flang) -- information from GitHub
 
 This document has been kept relatively small by providing links to much of the information about options rather than duplicating that information.  For IBM, Intel, and some PGI options, there are direct links.  But direct links were not possible for Cray, GNU and some PGI options.
 

From eb611eaf5f033407711aea12a21eccb1ba8ff223 Mon Sep 17 00:00:00 2001
From: Ron Lieberman <ron.lieberman@amd.com>
Date: Tue, 9 Dec 2025 21:18:16 -0600
Subject: [PATCH 85/85] cleanup revert_patch.txt

---
 revert_patches.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/revert_patches.txt b/revert_patches.txt
index 57ede019ece2b..133652271d4e4 100644
--- a/revert_patches.txt
+++ b/revert_patches.txt
@@ -5,9 +5,6 @@ d57230c7 [AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions (#100485)
 breaks build of ROCmValidationSuite
 [C2y] Support WG14 N3457, the __COUNTER__ macro (#162662)
 ---
-complicated build, deferring
-[llvm][mlir][OpenMP] Support translation for linear clause in omp.wsloop
----
 needs more work to land
 [Flang] Move builtin .mod generation into runtimes (Reapply #137828)
 ---