diff --git a/.ci/generate_test_report_lib.py b/.ci/generate_test_report_lib.py
index 9a4fc6030d5ec..5edde254eb73d 100644
--- a/.ci/generate_test_report_lib.py
+++ b/.ci/generate_test_report_lib.py
@@ -267,7 +267,7 @@ def plural(num_tests):
         report.extend(
             [
                 "",
-                "All tests passed but another part of the build **failed**. "
+                "All executed tests passed, but another part of the build **failed**. "
                 "Information about the build failure could not be automatically "
                 "obtained.",
                 "",
@@ -278,7 +278,7 @@ def plural(num_tests):
         report.extend(
             [
                 "",
-                "All tests passed but another part of the build **failed**. Click on "
+                "All executed tests passed, but another part of the build **failed**. Click on "
                 "a failure below to see the details.",
                 "",
             ]
diff --git a/.ci/generate_test_report_lib_test.py b/.ci/generate_test_report_lib_test.py
index b9e992e0f798b..06279d672f3c3 100644
--- a/.ci/generate_test_report_lib_test.py
+++ b/.ci/generate_test_report_lib_test.py
@@ -343,7 +343,7 @@ def test_no_failures_build_failed(self):
 
           * 1 test passed
 
-          All tests passed but another part of the build **failed**. Information about the build failure could not be automatically obtained.
+          All executed tests passed, but another part of the build **failed**. Information about the build failure could not be automatically obtained.
 
           Download the build's log file to see the details.
 
@@ -390,7 +390,7 @@ def test_no_failures_build_failed_ninja_log(self):
 
           * 1 test passed
 
-          All tests passed but another part of the build **failed**. Click on a failure below to see the details.
+          All executed tests passed, but another part of the build **failed**. Click on a failure below to see the details.
 
          test/4.stamp
@@ -476,7 +476,7 @@ def test_no_failures_multiple_build_failed_ninja_log(self):
 
           * 1 test passed
 
-          All tests passed but another part of the build **failed**. Click on a failure below to see the details.
+          All executed tests passed, but another part of the build **failed**. Click on a failure below to see the details.
 
          touch test/2.stamp
@@ -978,7 +978,7 @@ def test_generate_report_end_to_end(self):
 
           * 1 test passed
 
-          All tests passed but another part of the build **failed**. Click on a failure below to see the details.
+          All executed tests passed, but another part of the build **failed**. Click on a failure below to see the details.
          test/4.stamp
diff --git a/.github/workflows/libc-overlay-tests.yml b/.github/workflows/libc-overlay-tests.yml
index 807377564fa13..6bb01d502050e 100644
--- a/.github/workflows/libc-overlay-tests.yml
+++ b/.github/workflows/libc-overlay-tests.yml
@@ -16,7 +16,7 @@ jobs:
       # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations.
       fail-fast: false
       matrix:
-        os: [ubuntu-24.04, ubuntu-24.04-arm, windows-2022, windows-2025, macos-14]
+        os: [ubuntu-24.04, ubuntu-24.04-arm, windows-2022, windows-2025, macos-15]
         include:
           # TODO: add linux gcc when it is fixed
           - os: ubuntu-24.04
@@ -35,7 +35,7 @@ jobs:
           compiler:
             c_compiler: clang-cl
             cpp_compiler: clang-cl
-        - os: macos-14
+        - os: macos-15
           compiler:
             c_compiler: clang
             cpp_compiler: clang++
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index 51bc867a839cf..b06eee2f415a7 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -1888,6 +1888,9 @@ void BinaryContext::preprocessDebugInfo() {
 
   preprocessDWODebugInfo();
 
+  // Check if required DWO files are missing.
+  uint64_t NumMissingDWOs = 0;
+
   // Populate MCContext with DWARF files from all units.
   StringRef GlobalPrefix = AsmInfo->getPrivateGlobalPrefix();
   for (const std::unique_ptr<DWARFUnit> &CU : DwCtx->compile_units()) {
@@ -1909,19 +1912,23 @@ void BinaryContext::preprocessDebugInfo() {
     std::optional<MD5::MD5Result> Checksum;
     if (LineTable->Prologue.ContentTypes.HasMD5)
       Checksum = LineTable->Prologue.FileNames[0].Checksum;
-    std::optional<const char *> Name =
+    const char *Name =
         dwarf::toString(CU->getUnitDIE().find(dwarf::DW_AT_name), nullptr);
     if (std::optional<uint64_t> DWOID = CU->getDWOId()) {
       auto Iter = DWOCUs.find(*DWOID);
       if (Iter == DWOCUs.end()) {
-        this->errs() << "BOLT-ERROR: DWO CU was not found for " << Name
-                     << '\n';
-        exit(1);
+        const char *DWOName =
+            dwarf::toString(CU->getUnitDIE().find(dwarf::DW_AT_dwo_name),
+                            "");
+        this->errs() << "BOLT-ERROR: unable to load " << DWOName
+                     << " for DWO_id 0x" << Twine::utohexstr(*DWOID) << '\n';
+        NumMissingDWOs++;
+        continue;
       }
       Name = dwarf::toString(
           Iter->second->getUnitDIE().find(dwarf::DW_AT_name), nullptr);
     }
-    BinaryLineTable.setRootFile(CU->getCompilationDir(), *Name, Checksum,
+    BinaryLineTable.setRootFile(CU->getCompilationDir(), Name, Checksum,
                                 std::nullopt);
   }
 
@@ -1956,6 +1963,14 @@ void BinaryContext::preprocessDebugInfo() {
           DwarfVersion));
     }
   }
+
+  if (NumMissingDWOs) {
+    this->errs() << "BOLT-ERROR: " << NumMissingDWOs
+                 << " required DWO file(s) not found. Unable to update debug"
+                    " info. Use --comp-dir-override to locate the file(s) or"
+                    " --update-debug-sections=0 to remove debug info\n";
+    exit(1);
+  }
 }
 
 bool BinaryContext::shouldEmit(const BinaryFunction &Function) const {
diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp
index 01b350b2f11fe..d38a7fadb0767 100644
--- a/bolt/lib/Passes/PAuthGadgetScanner.cpp
+++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp
@@ -547,7 +547,7 @@ class SrcSafetyAnalysis {
 
     // Being trusted is a strictly stronger property than being
     // safe-to-dereference.
-    assert(!Next.TrustedRegs.test(Next.SafeToDerefRegs) &&
+    assert(Next.TrustedRegs.subsetOf(Next.SafeToDerefRegs) &&
            "SafeToDerefRegs should contain all TrustedRegs");
 
     return Next;
diff --git a/bolt/test/dwarf5-missing-dwo.c b/bolt/test/dwarf5-missing-dwo.c
new file mode 100644
index 0000000000000..25de1a53aa5ac
--- /dev/null
+++ b/bolt/test/dwarf5-missing-dwo.c
@@ -0,0 +1,18 @@
+// Check that llvm-bolt correctly reports a missing DWO file while updating
+// debug info.
+//
+// RUN: %clang %cflags -gdwarf-5 -gsplit-dwarf=single -c %s -o %t.o
+// RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
+// RUN: rm %t.o
+// RUN: not llvm-bolt %t.exe -o %t.bolt --update-debug-sections \
+// RUN:   2>&1 | FileCheck %s -DDWO=%t.o
+//
+// Check that llvm-bolt succeeds on the same binary with disabled debug info
+// update.
+//
+// RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections=0
+
+// CHECK: BOLT-ERROR: unable to load [[DWO]]
+// CHECK-NEXT: BOLT-ERROR: 1 required DWO file(s) not found
+
+int main() { return 0; }
diff --git a/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
index 1bfeeaed3715a..0f5e9a38da2ba 100644
--- a/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
+++ b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
@@ -11,12 +11,15 @@
 // RUN:   -fno-asynchronous-unwind-tables \
 // RUN:   %s -o %t.exe -Wl,-q
 // RUN: llvm-bolt %t.exe -o %t.bolt | FileCheck %s --check-prefix=CHECK
-//
-// CHECK: PointerAuthCFIAnalyzer ran on 3 functions. Ignored
-// CHECK-NOT: 0 functions (0.00%) because of CFI inconsistencies
-// CHECK-SAME: 1 functions (33.33%) because of CFI inconsistencies
-// CHECK-NEXT: BOLT-WARNING: PointerAuthCFIAnalyzer only supports asynchronous
-// CHECK-SAME: unwind tables. For C compilers, see -fasynchronous-unwind-tables.
+
+// The number of functions with .cfi_negate_ra_state in the binary is
+// platform-dependent.
+// CHECK: BOLT-INFO: PointerAuthCFIAnalyzer ran on {{[0-9]+}} functions.
+// CHECK-SAME: Ignored {{[0-9]}} functions ({{[0-9.]+}}%) because of CFI
+// CHECK-SAME: inconsistencies
+// CHECK-NEXT: BOLT-WARNING: PointerAuthCFIAnalyzer only supports
+// CHECK-SAME: asynchronous unwind tables. For C compilers, see
+// CHECK-SAME: -fasynchronous-unwind-tables.
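A note on the PAuthGadgetScanner change above: the new assert names the invariant directly, that TrustedRegs is a subset of SafeToDerefRegs. Below is a minimal standalone sketch of the same invariant check (not part of the patch), written against the long-standing llvm::BitVector API: A.test(B) is true iff A has a set bit that B lacks, so !A.test(B) is exactly the subset predicate that subsetOf spells out. The register indices are invented for illustration.

#include "llvm/ADT/BitVector.h"
#include <cassert>

int main() {
  llvm::BitVector SafeToDerefRegs(8), TrustedRegs(8);
  SafeToDerefRegs.set(1);
  SafeToDerefRegs.set(3);
  TrustedRegs.set(3); // trusted implies safe-to-dereference

  // BitVector::test(RHS) is true iff this set has a bit that RHS lacks,
  // so the negation asserts TrustedRegs <= SafeToDerefRegs.
  assert(!TrustedRegs.test(SafeToDerefRegs) &&
         "SafeToDerefRegs should contain all TrustedRegs");

  TrustedRegs.set(5); // hypothetical reg 5: trusted but not safe-to-deref,
                      // so test() now reports the violated subset relation
  assert(TrustedRegs.test(SafeToDerefRegs) && "subset invariant violated");
  return 0;
}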
#include #include diff --git a/clang-tools-extra/clang-doc/CMakeLists.txt b/clang-tools-extra/clang-doc/CMakeLists.txt index 5989e5fe60cf3..7a375d7cd0524 100644 --- a/clang-tools-extra/clang-doc/CMakeLists.txt +++ b/clang-tools-extra/clang-doc/CMakeLists.txt @@ -16,7 +16,6 @@ add_clang_library(clangDoc STATIC Representation.cpp Serialize.cpp YAMLGenerator.cpp - HTMLMustacheGenerator.cpp JSONGenerator.cpp DEPENDS diff --git a/clang-tools-extra/clang-doc/Generators.cpp b/clang-tools-extra/clang-doc/Generators.cpp index ba46609d0e7fc..d6c1cc948ce30 100644 --- a/clang-tools-extra/clang-doc/Generators.cpp +++ b/clang-tools-extra/clang-doc/Generators.cpp @@ -243,8 +243,6 @@ void Generator::addInfoToIndex(Index &Idx, const doc::Info *Info) { [[maybe_unused]] static int YAMLGeneratorAnchorDest = YAMLGeneratorAnchorSource; [[maybe_unused]] static int MDGeneratorAnchorDest = MDGeneratorAnchorSource; [[maybe_unused]] static int HTMLGeneratorAnchorDest = HTMLGeneratorAnchorSource; -[[maybe_unused]] static int MHTMLGeneratorAnchorDest = - MHTMLGeneratorAnchorSource; [[maybe_unused]] static int JSONGeneratorAnchorDest = JSONGeneratorAnchorSource; } // namespace doc } // namespace clang diff --git a/clang-tools-extra/clang-doc/Generators.h b/clang-tools-extra/clang-doc/Generators.h index 847722646b029..a50f1ac25eda9 100644 --- a/clang-tools-extra/clang-doc/Generators.h +++ b/clang-tools-extra/clang-doc/Generators.h @@ -137,7 +137,6 @@ struct MustacheGenerator : public Generator { extern volatile int YAMLGeneratorAnchorSource; extern volatile int MDGeneratorAnchorSource; extern volatile int HTMLGeneratorAnchorSource; -extern volatile int MHTMLGeneratorAnchorSource; extern volatile int JSONGeneratorAnchorSource; } // namespace doc diff --git a/clang-tools-extra/clang-doc/HTMLGenerator.cpp b/clang-tools-extra/clang-doc/HTMLGenerator.cpp index 7c8c16b8e8aca..6f58c3d00fa28 100644 --- a/clang-tools-extra/clang-doc/HTMLGenerator.cpp +++ b/clang-tools-extra/clang-doc/HTMLGenerator.cpp @@ -5,1145 +5,169 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the implementation of the HTMLGenerator class, +/// which is a Clang-Doc generator for HTML using Mustache templates. 
+/// +//===----------------------------------------------------------------------===// #include "Generators.h" #include "Representation.h" #include "support/File.h" -#include "clang/Basic/Version.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/StringSet.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/JSON.h" +#include "llvm/Support/Error.h" #include "llvm/Support/Path.h" -#include "llvm/Support/raw_ostream.h" -#include -#include -#include using namespace llvm; +using namespace llvm::json; +using namespace llvm::mustache; namespace clang { namespace doc { -namespace { - -class HTMLTag { -public: - // Any other tag can be added if required - enum TagType { - TAG_A, - TAG_DIV, - TAG_FOOTER, - TAG_H1, - TAG_H2, - TAG_H3, - TAG_HEADER, - TAG_LI, - TAG_LINK, - TAG_MAIN, - TAG_META, - TAG_OL, - TAG_P, - TAG_SCRIPT, - TAG_SPAN, - TAG_TITLE, - TAG_UL, - TAG_TABLE, - TAG_THEAD, - TAG_TBODY, - TAG_TR, - TAG_TD, - TAG_TH - }; - - HTMLTag() = default; - constexpr HTMLTag(TagType Value) : Value(Value) {} - - operator TagType() const { return Value; } - operator bool() = delete; - - bool isSelfClosing() const; - StringRef toString() const; - -private: - TagType Value; -}; - -enum NodeType { - NODE_TEXT, - NODE_TAG, -}; - -struct HTMLNode { - HTMLNode(NodeType Type) : Type(Type) {} - virtual ~HTMLNode() = default; - - virtual void render(llvm::raw_ostream &OS, int IndentationLevel) = 0; - NodeType Type; // Type of node -}; - -struct TextNode : public HTMLNode { - TextNode(const Twine &Text) - : HTMLNode(NodeType::NODE_TEXT), Text(Text.str()) {} - - std::string Text; // Content of node - void render(llvm::raw_ostream &OS, int IndentationLevel) override; -}; - -struct TagNode : public HTMLNode { - TagNode(HTMLTag Tag) : HTMLNode(NodeType::NODE_TAG), Tag(Tag) {} - TagNode(HTMLTag Tag, const Twine &Text) : TagNode(Tag) { - Children.emplace_back(std::make_unique(Text.str())); - } - - HTMLTag Tag; // Name of HTML Tag (p, div, h1) - std::vector> Children; // List of child nodes - std::vector> - Attributes; // List of key-value attributes for tag - - void render(llvm::raw_ostream &OS, int IndentationLevel) override; -}; - -struct HTMLFile { - std::vector> Children; // List of child nodes - void render(llvm::raw_ostream &OS) { - OS << "\n"; - for (const auto &C : Children) { - C->render(OS, 0); - OS << "\n"; - } - } -}; - -} // namespace - -bool HTMLTag::isSelfClosing() const { - switch (Value) { - case HTMLTag::TAG_META: - case HTMLTag::TAG_LINK: - return true; - case HTMLTag::TAG_A: - case HTMLTag::TAG_DIV: - case HTMLTag::TAG_FOOTER: - case HTMLTag::TAG_H1: - case HTMLTag::TAG_H2: - case HTMLTag::TAG_H3: - case HTMLTag::TAG_HEADER: - case HTMLTag::TAG_LI: - case HTMLTag::TAG_MAIN: - case HTMLTag::TAG_OL: - case HTMLTag::TAG_P: - case HTMLTag::TAG_SCRIPT: - case HTMLTag::TAG_SPAN: - case HTMLTag::TAG_TITLE: - case HTMLTag::TAG_UL: - case HTMLTag::TAG_TABLE: - case HTMLTag::TAG_THEAD: - case HTMLTag::TAG_TBODY: - case HTMLTag::TAG_TR: - case HTMLTag::TAG_TD: - case HTMLTag::TAG_TH: - return false; - } - llvm_unreachable("Unhandled HTMLTag::TagType"); -} - -StringRef HTMLTag::toString() const { - switch (Value) { - case HTMLTag::TAG_A: - return "a"; - case HTMLTag::TAG_DIV: - return "div"; - case HTMLTag::TAG_FOOTER: - return "footer"; - case HTMLTag::TAG_H1: - return "h1"; - case HTMLTag::TAG_H2: - return "h2"; - case HTMLTag::TAG_H3: - return "h3"; - case HTMLTag::TAG_HEADER: - return "header"; - case HTMLTag::TAG_LI: - return "li"; - case 
HTMLTag::TAG_LINK: - return "link"; - case HTMLTag::TAG_MAIN: - return "main"; - case HTMLTag::TAG_META: - return "meta"; - case HTMLTag::TAG_OL: - return "ol"; - case HTMLTag::TAG_P: - return "p"; - case HTMLTag::TAG_SCRIPT: - return "script"; - case HTMLTag::TAG_SPAN: - return "span"; - case HTMLTag::TAG_TITLE: - return "title"; - case HTMLTag::TAG_UL: - return "ul"; - case HTMLTag::TAG_TABLE: - return "table"; - case HTMLTag::TAG_THEAD: - return "thead"; - case HTMLTag::TAG_TBODY: - return "tbody"; - case HTMLTag::TAG_TR: - return "tr"; - case HTMLTag::TAG_TD: - return "td"; - case HTMLTag::TAG_TH: - return "th"; - } - llvm_unreachable("Unhandled HTMLTag::TagType"); -} - -void TextNode::render(llvm::raw_ostream &OS, int IndentationLevel) { - OS.indent(IndentationLevel * 2); - printHTMLEscaped(Text, OS); -} - -void TagNode::render(llvm::raw_ostream &OS, int IndentationLevel) { - // Children nodes are rendered in the same line if all of them are text nodes - bool InlineChildren = true; - for (const auto &C : Children) - if (C->Type == NodeType::NODE_TAG) { - InlineChildren = false; - break; - } - OS.indent(IndentationLevel * 2); - OS << "<" << Tag.toString(); - for (const auto &A : Attributes) - OS << " " << A.first << "=\"" << A.second << "\""; - if (Tag.isSelfClosing()) { - OS << "/>"; - return; - } - OS << ">"; - if (!InlineChildren) - OS << "\n"; - bool NewLineRendered = true; - for (const auto &C : Children) { - int ChildrenIndentation = - InlineChildren || !NewLineRendered ? 0 : IndentationLevel + 1; - C->render(OS, ChildrenIndentation); - if (!InlineChildren && (C == Children.back() || - (C->Type != NodeType::NODE_TEXT || - (&C + 1)->get()->Type != NodeType::NODE_TEXT))) { - OS << "\n"; - NewLineRendered = true; - } else - NewLineRendered = false; - } - if (!InlineChildren) - OS.indent(IndentationLevel * 2); - OS << ""; -} - -template ::value>> -static void appendVector(std::vector &&New, - std::vector &Original) { - std::move(New.begin(), New.end(), std::back_inserter(Original)); -} - -// HTML generation - -static std::vector> -genStylesheetsHTML(StringRef InfoPath, const ClangDocContext &CDCtx) { - std::vector> Out; - for (const auto &FilePath : CDCtx.UserStylesheets) { - auto LinkNode = std::make_unique(HTMLTag::TAG_LINK); - LinkNode->Attributes.emplace_back("rel", "stylesheet"); - SmallString<128> StylesheetPath = computeRelativePath("", InfoPath); - llvm::sys::path::append(StylesheetPath, - llvm::sys::path::filename(FilePath)); - // Paths in HTML must be in posix-style - llvm::sys::path::native(StylesheetPath, llvm::sys::path::Style::posix); - LinkNode->Attributes.emplace_back("href", std::string(StylesheetPath)); - Out.emplace_back(std::move(LinkNode)); - } - return Out; -} - -static std::vector> -genJsScriptsHTML(StringRef InfoPath, const ClangDocContext &CDCtx) { - std::vector> Out; - - // index_json.js is part of every generated HTML file - SmallString<128> IndexJSONPath = computeRelativePath("", InfoPath); - auto IndexJSONNode = std::make_unique(HTMLTag::TAG_SCRIPT); - llvm::sys::path::append(IndexJSONPath, "index_json.js"); - llvm::sys::path::native(IndexJSONPath, llvm::sys::path::Style::posix); - IndexJSONNode->Attributes.emplace_back("src", std::string(IndexJSONPath)); - Out.emplace_back(std::move(IndexJSONNode)); - - for (const auto &FilePath : CDCtx.JsScripts) { - SmallString<128> ScriptPath = computeRelativePath("", InfoPath); - auto ScriptNode = std::make_unique(HTMLTag::TAG_SCRIPT); - llvm::sys::path::append(ScriptPath, llvm::sys::path::filename(FilePath)); - // 
Paths in HTML must be in posix-style - llvm::sys::path::native(ScriptPath, llvm::sys::path::Style::posix); - ScriptNode->Attributes.emplace_back("src", std::string(ScriptPath)); - Out.emplace_back(std::move(ScriptNode)); - } - return Out; -} - -static std::unique_ptr genLink(const Twine &Text, const Twine &Link) { - auto LinkNode = std::make_unique(HTMLTag::TAG_A, Text); - LinkNode->Attributes.emplace_back("href", Link.str()); - return LinkNode; -} - -static std::unique_ptr -genReference(const Reference &Type, StringRef CurrentDirectory, - std::optional JumpToSection = std::nullopt) { - if (Type.Path.empty()) { - if (!JumpToSection) - return std::make_unique(Type.Name); - return genLink(Type.Name, "#" + *JumpToSection); - } - llvm::SmallString<64> Path = Type.getRelativeFilePath(CurrentDirectory); - llvm::sys::path::append(Path, Type.getFileBaseName() + ".html"); - - // Paths in HTML must be in posix-style - llvm::sys::path::native(Path, llvm::sys::path::Style::posix); - if (JumpToSection) - Path += ("#" + *JumpToSection).str(); - return genLink(Type.Name, Path); -} - -static std::vector> -genReferenceList(const llvm::SmallVectorImpl &Refs, - const StringRef &CurrentDirectory) { - std::vector> Out; - for (const auto &R : Refs) { - if (&R != Refs.begin()) - Out.emplace_back(std::make_unique(", ")); - Out.emplace_back(genReference(R, CurrentDirectory)); - } - return Out; -} - -static std::vector> -genHTML(const EnumInfo &I, const ClangDocContext &CDCtx); -static std::vector> -genHTML(const FunctionInfo &I, const ClangDocContext &CDCtx, - StringRef ParentInfoDir); -static std::unique_ptr genHTML(const std::vector &C); - -static std::vector> -genEnumsBlock(const std::vector &Enums, - const ClangDocContext &CDCtx) { - if (Enums.empty()) - return {}; - - std::vector> Out; - Out.emplace_back(std::make_unique(HTMLTag::TAG_H2, "Enums")); - Out.back()->Attributes.emplace_back("id", "Enums"); - Out.emplace_back(std::make_unique(HTMLTag::TAG_DIV)); - auto &DivBody = Out.back(); - for (const auto &E : Enums) { - std::vector> Nodes = genHTML(E, CDCtx); - appendVector(std::move(Nodes), DivBody->Children); - } - return Out; -} - -static std::unique_ptr -genEnumMembersBlock(const llvm::SmallVector &Members) { - if (Members.empty()) - return nullptr; - - auto List = std::make_unique(HTMLTag::TAG_TBODY); - - for (const auto &M : Members) { - auto TRNode = std::make_unique(HTMLTag::TAG_TR); - TRNode->Children.emplace_back( - std::make_unique(HTMLTag::TAG_TD, M.Name)); - // Use user supplied value if it exists, otherwise use the value - if (!M.ValueExpr.empty()) { - TRNode->Children.emplace_back( - std::make_unique(HTMLTag::TAG_TD, M.ValueExpr)); - } else { - TRNode->Children.emplace_back( - std::make_unique(HTMLTag::TAG_TD, M.Value)); - } - if (!M.Description.empty()) { - auto TD = std::make_unique(HTMLTag::TAG_TD); - TD->Children.emplace_back(genHTML(M.Description)); - TRNode->Children.emplace_back(std::move(TD)); - } - List->Children.emplace_back(std::move(TRNode)); - } - return List; -} - -static std::vector> -genFunctionsBlock(const std::vector &Functions, - const ClangDocContext &CDCtx, StringRef ParentInfoDir) { - if (Functions.empty()) - return {}; - - std::vector> Out; - Out.emplace_back(std::make_unique(HTMLTag::TAG_H2, "Functions")); - Out.back()->Attributes.emplace_back("id", "Functions"); - Out.emplace_back(std::make_unique(HTMLTag::TAG_DIV)); - auto &DivBody = Out.back(); - for (const auto &F : Functions) { - std::vector> Nodes = - genHTML(F, CDCtx, ParentInfoDir); - 
appendVector(std::move(Nodes), DivBody->Children); - } - return Out; -} - -static std::vector> -genRecordMembersBlock(const llvm::SmallVector &Members, - StringRef ParentInfoDir) { - if (Members.empty()) - return {}; - - std::vector> Out; - Out.emplace_back(std::make_unique(HTMLTag::TAG_H2, "Members")); - Out.back()->Attributes.emplace_back("id", "Members"); - Out.emplace_back(std::make_unique(HTMLTag::TAG_UL)); - auto &ULBody = Out.back(); - for (const auto &M : Members) { - StringRef Access = getAccessSpelling(M.Access); - auto LIBody = std::make_unique(HTMLTag::TAG_LI); - auto MemberDecl = std::make_unique(HTMLTag::TAG_DIV); - if (!Access.empty()) - MemberDecl->Children.emplace_back( - std::make_unique(Access + " ")); - if (M.IsStatic) - MemberDecl->Children.emplace_back(std::make_unique("static ")); - MemberDecl->Children.emplace_back(genReference(M.Type, ParentInfoDir)); - MemberDecl->Children.emplace_back(std::make_unique(" " + M.Name)); - if (!M.Description.empty()) - LIBody->Children.emplace_back(genHTML(M.Description)); - LIBody->Children.emplace_back(std::move(MemberDecl)); - ULBody->Children.emplace_back(std::move(LIBody)); - } - return Out; -} - -static std::vector> -genReferencesBlock(const std::vector &References, - llvm::StringRef Title, StringRef ParentPath) { - if (References.empty()) - return {}; - - std::vector> Out; - Out.emplace_back(std::make_unique(HTMLTag::TAG_H2, Title)); - Out.back()->Attributes.emplace_back("id", std::string(Title)); - Out.emplace_back(std::make_unique(HTMLTag::TAG_UL)); - auto &ULBody = Out.back(); - for (const auto &R : References) { - auto LiNode = std::make_unique(HTMLTag::TAG_LI); - LiNode->Children.emplace_back(genReference(R, ParentPath)); - ULBody->Children.emplace_back(std::move(LiNode)); - } - return Out; -} -static std::unique_ptr writeSourceFileRef(const ClangDocContext &CDCtx, - const Location &L) { - - if (!L.IsFileInRootDir && !CDCtx.RepositoryUrl) - return std::make_unique( - HTMLTag::TAG_P, "Defined at line " + std::to_string(L.StartLineNumber) + - " of file " + L.Filename); - - SmallString<128> FileURL(CDCtx.RepositoryUrl.value_or("")); - llvm::sys::path::append( - FileURL, llvm::sys::path::Style::posix, - // If we're on Windows, the file name will be in the wrong format, and - // append won't convert the full path being appended to the correct - // format, so we need to do that here. - llvm::sys::path::convert_to_slash( - L.Filename, - // The style here is the current style of the path, not the one we're - // targeting. If the string is already in the posix style, it will do - // nothing. - llvm::sys::path::Style::windows)); - auto Node = std::make_unique(HTMLTag::TAG_P); - Node->Children.emplace_back(std::make_unique("Defined at line ")); - auto LocNumberNode = std::make_unique( - HTMLTag::TAG_A, std::to_string(L.StartLineNumber)); - // The links to a specific line in the source code use the github / - // googlesource notation so it won't work for all hosting pages. 
- LocNumberNode->Attributes.emplace_back( - "href", - formatv("{0}#{1}{2}", FileURL, CDCtx.RepositoryLinePrefix.value_or(""), - L.StartLineNumber)); - Node->Children.emplace_back(std::move(LocNumberNode)); - Node->Children.emplace_back(std::make_unique(" of file ")); - auto LocFileNode = std::make_unique( - HTMLTag::TAG_A, llvm::sys::path::filename(FileURL)); - LocFileNode->Attributes.emplace_back("href", std::string(FileURL)); - Node->Children.emplace_back(std::move(LocFileNode)); - return Node; -} - -static void maybeWriteSourceFileRef(std::vector> &Out, - const ClangDocContext &CDCtx, - const std::optional &DefLoc) { - if (DefLoc) - Out.emplace_back(writeSourceFileRef(CDCtx, *DefLoc)); -} - -static std::vector> -genHTML(const Index &Index, StringRef InfoPath, bool IsOutermostList); - -// Generates a list of child nodes for the HTML head tag -// It contains a meta node, link nodes to import CSS files, and script nodes to -// import JS files -static std::vector> -genFileHeadNodes(StringRef Title, StringRef InfoPath, - const ClangDocContext &CDCtx) { - std::vector> Out; - auto MetaNode = std::make_unique(HTMLTag::TAG_META); - MetaNode->Attributes.emplace_back("charset", "utf-8"); - Out.emplace_back(std::move(MetaNode)); - Out.emplace_back(std::make_unique(HTMLTag::TAG_TITLE, Title)); - std::vector> StylesheetsNodes = - genStylesheetsHTML(InfoPath, CDCtx); - appendVector(std::move(StylesheetsNodes), Out); - std::vector> JsNodes = - genJsScriptsHTML(InfoPath, CDCtx); - appendVector(std::move(JsNodes), Out); - return Out; -} - -// Generates a header HTML node that can be used for any file -// It contains the project name -static std::unique_ptr genFileHeaderNode(StringRef ProjectName) { - auto HeaderNode = std::make_unique(HTMLTag::TAG_HEADER, ProjectName); - HeaderNode->Attributes.emplace_back("id", "project-title"); - return HeaderNode; -} - -// Generates a main HTML node that has all the main content of an info file -// It contains both indexes and the info's documented information -// This function should only be used for the info files (not for the file that -// only has the general index) -static std::unique_ptr genInfoFileMainNode( - StringRef InfoPath, - std::vector> &MainContentInnerNodes, - const Index &InfoIndex) { - auto MainNode = std::make_unique(HTMLTag::TAG_MAIN); - - auto LeftSidebarNode = std::make_unique(HTMLTag::TAG_DIV); - LeftSidebarNode->Attributes.emplace_back("id", "sidebar-left"); - LeftSidebarNode->Attributes.emplace_back("path", std::string(InfoPath)); - LeftSidebarNode->Attributes.emplace_back( - "class", "col-xs-6 col-sm-3 col-md-2 sidebar sidebar-offcanvas-left"); - - auto MainContentNode = std::make_unique(HTMLTag::TAG_DIV); - MainContentNode->Attributes.emplace_back("id", "main-content"); - MainContentNode->Attributes.emplace_back( - "class", "col-xs-12 col-sm-9 col-md-8 main-content"); - appendVector(std::move(MainContentInnerNodes), MainContentNode->Children); - - auto RightSidebarNode = std::make_unique(HTMLTag::TAG_DIV); - RightSidebarNode->Attributes.emplace_back("id", "sidebar-right"); - RightSidebarNode->Attributes.emplace_back( - "class", "col-xs-6 col-sm-6 col-md-2 sidebar sidebar-offcanvas-right"); - std::vector> InfoIndexHTML = - genHTML(InfoIndex, InfoPath, true); - appendVector(std::move(InfoIndexHTML), RightSidebarNode->Children); - - MainNode->Children.emplace_back(std::move(LeftSidebarNode)); - MainNode->Children.emplace_back(std::move(MainContentNode)); - MainNode->Children.emplace_back(std::move(RightSidebarNode)); - - return MainNode; -} - 
-// Generates a footer HTML node that can be used for any file -// It contains clang-doc's version -static std::unique_ptr genFileFooterNode() { - auto FooterNode = std::make_unique(HTMLTag::TAG_FOOTER); - auto SpanNode = std::make_unique( - HTMLTag::TAG_SPAN, clang::getClangToolFullVersion("clang-doc")); - SpanNode->Attributes.emplace_back("class", "no-break"); - FooterNode->Children.emplace_back(std::move(SpanNode)); - return FooterNode; -} - -// Generates a complete HTMLFile for an Info -static HTMLFile -genInfoFile(StringRef Title, StringRef InfoPath, - std::vector> &MainContentNodes, - const Index &InfoIndex, const ClangDocContext &CDCtx) { - HTMLFile F; - - std::vector> HeadNodes = - genFileHeadNodes(Title, InfoPath, CDCtx); - std::unique_ptr HeaderNode = genFileHeaderNode(CDCtx.ProjectName); - std::unique_ptr MainNode = - genInfoFileMainNode(InfoPath, MainContentNodes, InfoIndex); - std::unique_ptr FooterNode = genFileFooterNode(); - - appendVector(std::move(HeadNodes), F.Children); - F.Children.emplace_back(std::move(HeaderNode)); - F.Children.emplace_back(std::move(MainNode)); - F.Children.emplace_back(std::move(FooterNode)); - - return F; -} - -template ::value>> -static Index genInfoIndexItem(const std::vector &Infos, StringRef Title) { - Index Idx(Title, Title); - for (const auto &C : Infos) - Idx.Children.emplace_back(C.extractName(), - llvm::toHex(llvm::toStringRef(C.USR))); - return Idx; -} - -static std::vector> -genHTML(const Index &Index, StringRef InfoPath, bool IsOutermostList) { - std::vector> Out; - if (!Index.Name.empty()) { - Out.emplace_back(std::make_unique(HTMLTag::TAG_SPAN)); - auto &SpanBody = Out.back(); - if (!Index.JumpToSection) - SpanBody->Children.emplace_back(genReference(Index, InfoPath)); - else - SpanBody->Children.emplace_back( - genReference(Index, InfoPath, Index.JumpToSection->str())); - } - if (Index.Children.empty()) - return Out; - // Only the outermost list should use ol, the others should use ul - HTMLTag ListHTMLTag = IsOutermostList ? 
HTMLTag::TAG_OL : HTMLTag::TAG_UL; - Out.emplace_back(std::make_unique(ListHTMLTag)); - const auto &UlBody = Out.back(); - for (const auto &C : Index.Children) { - auto LiBody = std::make_unique(HTMLTag::TAG_LI); - std::vector> Nodes = genHTML(C, InfoPath, false); - appendVector(std::move(Nodes), LiBody->Children); - UlBody->Children.emplace_back(std::move(LiBody)); - } - return Out; -} - -static std::unique_ptr genHTML(const CommentInfo &I) { - switch (I.Kind) { - case CommentKind::CK_FullComment: { - auto FullComment = std::make_unique(HTMLTag::TAG_DIV); - for (const auto &Child : I.Children) { - std::unique_ptr Node = genHTML(*Child); - if (Node) - FullComment->Children.emplace_back(std::move(Node)); - } - return std::move(FullComment); - } - - case CommentKind::CK_ParagraphComment: { - auto ParagraphComment = std::make_unique(HTMLTag::TAG_P); - for (const auto &Child : I.Children) { - std::unique_ptr Node = genHTML(*Child); - if (Node) - ParagraphComment->Children.emplace_back(std::move(Node)); - } - if (ParagraphComment->Children.empty()) - return nullptr; - return std::move(ParagraphComment); - } - - case CommentKind::CK_BlockCommandComment: { - auto BlockComment = std::make_unique(HTMLTag::TAG_DIV); - BlockComment->Children.emplace_back( - std::make_unique(HTMLTag::TAG_DIV, I.Name)); - for (const auto &Child : I.Children) { - std::unique_ptr Node = genHTML(*Child); - if (Node) - BlockComment->Children.emplace_back(std::move(Node)); - } - if (BlockComment->Children.empty()) - return nullptr; - return std::move(BlockComment); - } - - case CommentKind::CK_TextComment: { - if (I.Text.empty()) - return nullptr; - return std::make_unique(I.Text); - } - - // For now, return nullptr for unsupported comment kinds - case CommentKind::CK_InlineCommandComment: - case CommentKind::CK_HTMLStartTagComment: - case CommentKind::CK_HTMLEndTagComment: - case CommentKind::CK_ParamCommandComment: - case CommentKind::CK_TParamCommandComment: - case CommentKind::CK_VerbatimBlockComment: - case CommentKind::CK_VerbatimBlockLineComment: - case CommentKind::CK_VerbatimLineComment: - case CommentKind::CK_Unknown: - return nullptr; - } - llvm_unreachable("Unhandled CommentKind"); -} - -static std::unique_ptr genHTML(const std::vector &C) { - auto CommentBlock = std::make_unique(HTMLTag::TAG_DIV); - for (const auto &Child : C) { - if (std::unique_ptr Node = genHTML(Child)) - CommentBlock->Children.emplace_back(std::move(Node)); - } - return CommentBlock; -} - -static std::vector> -genHTML(const EnumInfo &I, const ClangDocContext &CDCtx) { - std::vector> Out; - std::string EnumType = I.Scoped ? "enum class " : "enum "; - // Determine if enum members have comments attached - bool HasComments = llvm::any_of( - I.Members, [](const EnumValueInfo &M) { return !M.Description.empty(); }); - std::unique_ptr Table = - std::make_unique(HTMLTag::TAG_TABLE); - std::unique_ptr THead = - std::make_unique(HTMLTag::TAG_THEAD); - std::unique_ptr TRow = std::make_unique(HTMLTag::TAG_TR); - std::unique_ptr TD = - std::make_unique(HTMLTag::TAG_TH, EnumType + I.Name); - // Span 3 columns if enum has comments - TD->Attributes.emplace_back("colspan", HasComments ? 
"3" : "2"); - - Table->Attributes.emplace_back("id", llvm::toHex(llvm::toStringRef(I.USR))); - TRow->Children.emplace_back(std::move(TD)); - THead->Children.emplace_back(std::move(TRow)); - Table->Children.emplace_back(std::move(THead)); - - if (std::unique_ptr Node = genEnumMembersBlock(I.Members)) - Table->Children.emplace_back(std::move(Node)); - - Out.emplace_back(std::move(Table)); - - maybeWriteSourceFileRef(Out, CDCtx, I.DefLoc); - - if (!I.Description.empty()) - Out.emplace_back(genHTML(I.Description)); - - return Out; -} - -static std::vector> -genHTML(const FunctionInfo &I, const ClangDocContext &CDCtx, - StringRef ParentInfoDir) { - std::vector> Out; - Out.emplace_back(std::make_unique(HTMLTag::TAG_H3, I.Name)); - // USR is used as id for functions instead of name to disambiguate function - // overloads. - Out.back()->Attributes.emplace_back("id", - llvm::toHex(llvm::toStringRef(I.USR))); - - Out.emplace_back(std::make_unique(HTMLTag::TAG_P)); - auto &FunctionHeader = Out.back(); - - std::string Access = getAccessSpelling(I.Access).str(); - if (Access != "") - FunctionHeader->Children.emplace_back( - std::make_unique(Access + " ")); - if (I.IsStatic) - FunctionHeader->Children.emplace_back( - std::make_unique("static ")); - if (I.ReturnType.Type.Name != "") { - FunctionHeader->Children.emplace_back( - genReference(I.ReturnType.Type, ParentInfoDir)); - FunctionHeader->Children.emplace_back(std::make_unique(" ")); - } - FunctionHeader->Children.emplace_back( - std::make_unique(I.Name + "(")); - - for (const auto &P : I.Params) { - if (&P != I.Params.begin()) - FunctionHeader->Children.emplace_back(std::make_unique(", ")); - FunctionHeader->Children.emplace_back(genReference(P.Type, ParentInfoDir)); - FunctionHeader->Children.emplace_back( - std::make_unique(" " + P.Name)); - } - FunctionHeader->Children.emplace_back(std::make_unique(")")); - - maybeWriteSourceFileRef(Out, CDCtx, I.DefLoc); - - if (!I.Description.empty()) - Out.emplace_back(genHTML(I.Description)); - - return Out; -} - -static std::vector> -genHTML(const NamespaceInfo &I, Index &InfoIndex, const ClangDocContext &CDCtx, - std::string &InfoTitle) { - std::vector> Out; - if (I.Name.str() == "") - InfoTitle = "Global Namespace"; - else - InfoTitle = ("namespace " + I.Name).str(); +static std::unique_ptr NamespaceTemplate = nullptr; - Out.emplace_back(std::make_unique(HTMLTag::TAG_H1, InfoTitle)); +static std::unique_ptr RecordTemplate = nullptr; - if (!I.Description.empty()) - Out.emplace_back(genHTML(I.Description)); - - llvm::SmallString<64> BasePath = I.getRelativeFilePath(""); - - std::vector> ChildNamespaces = - genReferencesBlock(I.Children.Namespaces, "Namespaces", BasePath); - appendVector(std::move(ChildNamespaces), Out); - std::vector> ChildRecords = - genReferencesBlock(I.Children.Records, "Records", BasePath); - appendVector(std::move(ChildRecords), Out); - - std::vector> ChildFunctions = - genFunctionsBlock(I.Children.Functions, CDCtx, BasePath); - appendVector(std::move(ChildFunctions), Out); - std::vector> ChildEnums = - genEnumsBlock(I.Children.Enums, CDCtx); - appendVector(std::move(ChildEnums), Out); - - if (!I.Children.Namespaces.empty()) - InfoIndex.Children.emplace_back("Namespaces", "Namespaces"); - if (!I.Children.Records.empty()) - InfoIndex.Children.emplace_back("Records", "Records"); - if (!I.Children.Functions.empty()) - InfoIndex.Children.emplace_back( - genInfoIndexItem(I.Children.Functions, "Functions")); - if (!I.Children.Enums.empty()) - InfoIndex.Children.emplace_back( - 
genInfoIndexItem(I.Children.Enums, "Enums"));
-
-  return Out;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genHTML(const RecordInfo &I, Index &InfoIndex, const ClangDocContext &CDCtx,
-        std::string &InfoTitle) {
-  std::vector<std::unique_ptr<TagNode>> Out;
-  InfoTitle = (getTagType(I.TagType) + " " + I.Name).str();
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_H1, InfoTitle));
-
-  maybeWriteSourceFileRef(Out, CDCtx, I.DefLoc);
-
-  if (!I.Description.empty())
-    Out.emplace_back(genHTML(I.Description));
-
-  std::vector<std::unique_ptr<HTMLNode>> Parents =
-      genReferenceList(I.Parents, I.Path);
-  std::vector<std::unique_ptr<HTMLNode>> VParents =
-      genReferenceList(I.VirtualParents, I.Path);
-  if (!Parents.empty() || !VParents.empty()) {
-    Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_P));
-    auto &PBody = Out.back();
-    PBody->Children.emplace_back(std::make_unique<TextNode>("Inherits from "));
-    if (Parents.empty())
-      appendVector(std::move(VParents), PBody->Children);
-    else if (VParents.empty())
-      appendVector(std::move(Parents), PBody->Children);
-    else {
-      appendVector(std::move(Parents), PBody->Children);
-      PBody->Children.emplace_back(std::make_unique<TextNode>(", "));
-      appendVector(std::move(VParents), PBody->Children);
-    }
-  }
-
-  std::vector<std::unique_ptr<TagNode>> Members =
-      genRecordMembersBlock(I.Members, I.Path);
-  appendVector(std::move(Members), Out);
-  std::vector<std::unique_ptr<TagNode>> ChildRecords =
-      genReferencesBlock(I.Children.Records, "Records", I.Path);
-  appendVector(std::move(ChildRecords), Out);
-
-  std::vector<std::unique_ptr<TagNode>> ChildFunctions =
-      genFunctionsBlock(I.Children.Functions, CDCtx, I.Path);
-  appendVector(std::move(ChildFunctions), Out);
-  std::vector<std::unique_ptr<TagNode>> ChildEnums =
-      genEnumsBlock(I.Children.Enums, CDCtx);
-  appendVector(std::move(ChildEnums), Out);
-
-  if (!I.Members.empty())
-    InfoIndex.Children.emplace_back("Members", "Members");
-  if (!I.Children.Records.empty())
-    InfoIndex.Children.emplace_back("Records", "Records");
-  if (!I.Children.Functions.empty())
-    InfoIndex.Children.emplace_back(
-        genInfoIndexItem(I.Children.Functions, "Functions"));
-  if (!I.Children.Enums.empty())
-    InfoIndex.Children.emplace_back(
-        genInfoIndexItem(I.Children.Enums, "Enums"));
-
-  return Out;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genHTML(const TypedefInfo &I, const ClangDocContext &CDCtx,
-        std::string &InfoTitle) {
-  // TODO support typedefs in HTML.
-  return {};
-}
-
-/// Generator for HTML documentation.
-class HTMLGenerator : public Generator {
+class HTMLGenerator : public MustacheGenerator {
 public:
   static const char *Format;
-
+  Error createResources(ClangDocContext &CDCtx) override;
+  Error generateDocForInfo(Info *I, raw_ostream &OS,
+                           const ClangDocContext &CDCtx) override;
+  Error setupTemplateFiles(const ClangDocContext &CDCtx) override;
+  Error generateDocForJSON(json::Value &JSON, raw_fd_ostream &OS,
+                           const ClangDocContext &CDCtx, StringRef ObjTypeStr,
+                           StringRef RelativeRootPath) override;
+  // Populates templates with CSS stylesheets, JS scripts paths.
+  Error setupTemplateResources(const ClangDocContext &CDCtx, json::Value &V,
+                               SmallString<128> RelativeRootPath);
   llvm::Error generateDocumentation(
       StringRef RootDir, llvm::StringMap<std::unique_ptr<doc::Info>> Infos,
       const ClangDocContext &CDCtx, std::string DirName) override;
-  llvm::Error createResources(ClangDocContext &CDCtx) override;
-  llvm::Error generateDocForInfo(Info *I, llvm::raw_ostream &OS,
-                                 const ClangDocContext &CDCtx) override;
 };
 
-const char *HTMLGenerator::Format = "html";
-
-llvm::Error HTMLGenerator::generateDocumentation(
-    StringRef RootDir, llvm::StringMap<std::unique_ptr<doc::Info>> Infos,
-    const ClangDocContext &CDCtx, std::string DirName) {
-  // Track which directories we already tried to create.
-  llvm::StringSet<> CreatedDirs;
+Error HTMLGenerator::setupTemplateFiles(const ClangDocContext &CDCtx) {
+  // Template files need to use the native path when they're opened,
+  // but have to be used in POSIX style when used in HTML.
+  auto ConvertToNative = [](std::string &&Path) -> std::string {
+    SmallString<128> PathBuf(Path);
+    llvm::sys::path::native(PathBuf);
+    return PathBuf.str().str();
+  };
 
-  // Collect all output by file name and create the nexessary directories.
-  llvm::StringMap<std::vector<doc::Info *>> FileToInfos;
-  for (const auto &Group : Infos) {
-    doc::Info *Info = Group.getValue().get();
+  std::string NamespaceFilePath =
+      ConvertToNative(CDCtx.MustacheTemplates.lookup("namespace-template"));
+  std::string ClassFilePath =
+      ConvertToNative(CDCtx.MustacheTemplates.lookup("class-template"));
+  std::string CommentFilePath =
+      ConvertToNative(CDCtx.MustacheTemplates.lookup("comment-template"));
+  std::string FunctionFilePath =
+      ConvertToNative(CDCtx.MustacheTemplates.lookup("function-template"));
+  std::string EnumFilePath =
+      ConvertToNative(CDCtx.MustacheTemplates.lookup("enum-template"));
+  std::vector<std::pair<StringRef, StringRef>> Partials = {
+      {"Comments", CommentFilePath},
+      {"FunctionPartial", FunctionFilePath},
+      {"EnumPartial", EnumFilePath}};
+
+  if (Error Err = setupTemplate(NamespaceTemplate, NamespaceFilePath, Partials))
+    return Err;
 
-    llvm::SmallString<128> Path;
-    llvm::sys::path::native(RootDir, Path);
-    llvm::sys::path::append(Path, Info->getRelativeFilePath(""));
-    if (!CreatedDirs.contains(Path)) {
-      if (std::error_code Err = llvm::sys::fs::create_directories(Path);
-          Err != std::error_code()) {
-        return llvm::createStringError(Err, "Failed to create directory '%s'.",
-                                       Path.c_str());
-      }
-      CreatedDirs.insert(Path);
-    }
+  if (Error Err = setupTemplate(RecordTemplate, ClassFilePath, Partials))
+    return Err;
 
-    llvm::sys::path::append(Path, Info->getFileBaseName() + ".html");
-    FileToInfos[Path].push_back(Info);
-  }
+  return Error::success();
+}
 
-  for (const auto &Group : FileToInfos) {
-    std::error_code FileErr;
-    llvm::raw_fd_ostream InfoOS(Group.getKey(), FileErr,
-                                llvm::sys::fs::OF_Text);
-    if (FileErr) {
-      return llvm::createStringError(FileErr, "Error opening file '%s'",
-                                     Group.getKey().str().c_str());
-    }
+Error HTMLGenerator::setupTemplateResources(const ClangDocContext &CDCtx,
+                                            json::Value &V,
+                                            SmallString<128> RelativeRootPath) {
+  V.getAsObject()->insert({"ProjectName", CDCtx.ProjectName});
+  json::Value StylesheetArr = Array();
+  sys::path::native(RelativeRootPath, sys::path::Style::posix);
 
-    // TODO: https://github.com/llvm/llvm-project/issues/59073
-    // If there are multiple Infos for this file name (for example, template
-    // specializations), this will generate multiple complete web pages (with
-    // <DOCTYPE> and <title>, etc.) concatenated together. This generator needs
-    // some refactoring to be able to output the headers separately from the
-    // contents.
- for (const auto &Info : Group.getValue()) { - if (llvm::Error Err = generateDocForInfo(Info, InfoOS, CDCtx)) { - return Err; - } - } + auto *SSA = StylesheetArr.getAsArray(); + SSA->reserve(CDCtx.UserStylesheets.size()); + for (const auto &FilePath : CDCtx.UserStylesheets) { + SmallString<128> StylesheetPath = RelativeRootPath; + sys::path::append(StylesheetPath, sys::path::Style::posix, + sys::path::filename(FilePath)); + SSA->emplace_back(StylesheetPath); + } + V.getAsObject()->insert({"Stylesheets", StylesheetArr}); + + json::Value ScriptArr = Array(); + auto *SCA = ScriptArr.getAsArray(); + SCA->reserve(CDCtx.JsScripts.size()); + for (auto Script : CDCtx.JsScripts) { + SmallString<128> JsPath = RelativeRootPath; + sys::path::append(JsPath, sys::path::Style::posix, + sys::path::filename(Script)); + SCA->emplace_back(JsPath); + } + V.getAsObject()->insert({"Scripts", ScriptArr}); + return Error::success(); +} + +Error HTMLGenerator::generateDocForJSON(json::Value &JSON, raw_fd_ostream &OS, + const ClangDocContext &CDCtx, + StringRef ObjTypeStr, + StringRef RelativeRootPath) { + if (ObjTypeStr == "namespace") { + if (auto Err = setupTemplateResources(CDCtx, JSON, RelativeRootPath)) + return Err; + assert(NamespaceTemplate && "NamespaceTemplate is nullptr."); + NamespaceTemplate->render(JSON, OS); + } else if (ObjTypeStr == "record") { + if (auto Err = setupTemplateResources(CDCtx, JSON, RelativeRootPath)) + return Err; + assert(RecordTemplate && "RecordTemplate is nullptr."); + RecordTemplate->render(JSON, OS); } - - return llvm::Error::success(); + return Error::success(); } -llvm::Error HTMLGenerator::generateDocForInfo(Info *I, llvm::raw_ostream &OS, - const ClangDocContext &CDCtx) { - std::string InfoTitle; - std::vector<std::unique_ptr<TagNode>> MainContentNodes; - Index InfoIndex; +Error HTMLGenerator::generateDocForInfo(Info *I, raw_ostream &OS, + const ClangDocContext &CDCtx) { switch (I->IT) { - case InfoType::IT_namespace: - MainContentNodes = genHTML(*static_cast<clang::doc::NamespaceInfo *>(I), - InfoIndex, CDCtx, InfoTitle); - break; - case InfoType::IT_record: - MainContentNodes = genHTML(*static_cast<clang::doc::RecordInfo *>(I), - InfoIndex, CDCtx, InfoTitle); - break; case InfoType::IT_enum: - MainContentNodes = genHTML(*static_cast<clang::doc::EnumInfo *>(I), CDCtx); - break; case InfoType::IT_function: - MainContentNodes = - genHTML(*static_cast<clang::doc::FunctionInfo *>(I), CDCtx, ""); - break; case InfoType::IT_typedef: - MainContentNodes = - genHTML(*static_cast<clang::doc::TypedefInfo *>(I), CDCtx, InfoTitle); - break; - case InfoType::IT_concept: - case InfoType::IT_variable: - case InfoType::IT_friend: - break; - case InfoType::IT_default: - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "unexpected info type"); - } - - HTMLFile F = genInfoFile(InfoTitle, I->getRelativeFilePath(""), - MainContentNodes, InfoIndex, CDCtx); - F.render(OS); - - return llvm::Error::success(); -} - -static std::string getRefType(InfoType IT) { - switch (IT) { - case InfoType::IT_default: - return "default"; case InfoType::IT_namespace: - return "namespace"; case InfoType::IT_record: - return "record"; - case InfoType::IT_function: - return "function"; - case InfoType::IT_enum: - return "enum"; - case InfoType::IT_typedef: - return "typedef"; case InfoType::IT_concept: - return "concept"; case InfoType::IT_variable: - return "variable"; case InfoType::IT_friend: - return "friend"; - } - llvm_unreachable("Unknown InfoType"); -} - -static llvm::Error 
serializeIndex(ClangDocContext &CDCtx) { - std::error_code OK; - std::error_code FileErr; - llvm::SmallString<128> FilePath; - llvm::sys::path::native(CDCtx.OutDirectory, FilePath); - llvm::sys::path::append(FilePath, "index_json.js"); - llvm::raw_fd_ostream OS(FilePath, FileErr, llvm::sys::fs::OF_Text); - if (FileErr != OK) { - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "error creating index file: " + - FileErr.message()); - } - llvm::SmallString<128> RootPath(CDCtx.OutDirectory); - if (llvm::sys::path::is_relative(RootPath)) { - llvm::sys::fs::make_absolute(RootPath); + break; + case InfoType::IT_default: + return createStringError(inconvertibleErrorCode(), "unexpected InfoType"); } - // Replace the escaped characters with a forward slash. It shouldn't matter - // when rendering the webpage in a web browser. This helps to prevent the - // JavaScript from escaping characters incorrectly, and introducing bad paths - // in the URLs. - std::string RootPathEscaped = RootPath.str().str(); - llvm::replace(RootPathEscaped, '\\', '/'); - OS << "var RootPath = \"" << RootPathEscaped << "\";\n"; - - llvm::SmallString<128> Base(CDCtx.Base); - std::string BaseEscaped = Base.str().str(); - llvm::replace(BaseEscaped, '\\', '/'); - OS << "var Base = \"" << BaseEscaped << "\";\n"; - - CDCtx.Idx.sort(); - llvm::json::OStream J(OS, 2); - std::function<void(Index)> IndexToJSON = [&](const Index &I) { - J.object([&] { - J.attribute("USR", toHex(llvm::toStringRef(I.USR))); - J.attribute("Name", I.Name); - J.attribute("RefType", getRefType(I.RefType)); - J.attribute("Path", I.getRelativeFilePath("")); - J.attributeArray("Children", [&] { - for (const Index &C : I.Children) - IndexToJSON(C); - }); - }); - }; - OS << "async function LoadIndex() {\nreturn"; - IndexToJSON(CDCtx.Idx); - OS << ";\n}"; - return llvm::Error::success(); + return Error::success(); } -// Generates a main HTML node that has the main content of the file that shows -// only the general index -// It contains the general index with links to all the generated files -static std::unique_ptr<TagNode> genIndexFileMainNode() { - auto MainNode = std::make_unique<TagNode>(HTMLTag::TAG_MAIN); - - auto LeftSidebarNode = std::make_unique<TagNode>(HTMLTag::TAG_DIV); - LeftSidebarNode->Attributes.emplace_back("id", "sidebar-left"); - LeftSidebarNode->Attributes.emplace_back("path", ""); - LeftSidebarNode->Attributes.emplace_back( - "class", "col-xs-6 col-sm-3 col-md-2 sidebar sidebar-offcanvas-left"); - LeftSidebarNode->Attributes.emplace_back("style", "flex: 0 100%;"); - - MainNode->Children.emplace_back(std::move(LeftSidebarNode)); - - return MainNode; +Error HTMLGenerator::createResources(ClangDocContext &CDCtx) { + std::string ResourcePath(CDCtx.OutDirectory + "/html"); + for (const auto &FilePath : CDCtx.UserStylesheets) + if (Error Err = copyFile(FilePath, ResourcePath)) + return Err; + for (const auto &FilePath : CDCtx.JsScripts) + if (Error Err = copyFile(FilePath, ResourcePath)) + return Err; + return Error::success(); } -static llvm::Error genIndex(const ClangDocContext &CDCtx) { - std::error_code FileErr, OK; - llvm::SmallString<128> IndexPath; - llvm::sys::path::native(CDCtx.OutDirectory, IndexPath); - llvm::sys::path::append(IndexPath, "index.html"); - llvm::raw_fd_ostream IndexOS(IndexPath, FileErr, llvm::sys::fs::OF_Text); - if (FileErr != OK) { - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "error creating main index: " + - FileErr.message()); - } - - HTMLFile F; - - 
std::vector<std::unique_ptr<TagNode>> HeadNodes = - genFileHeadNodes("Index", "", CDCtx); - std::unique_ptr<TagNode> HeaderNode = genFileHeaderNode(CDCtx.ProjectName); - std::unique_ptr<TagNode> MainNode = genIndexFileMainNode(); - std::unique_ptr<TagNode> FooterNode = genFileFooterNode(); - - appendVector(std::move(HeadNodes), F.Children); - F.Children.emplace_back(std::move(HeaderNode)); - F.Children.emplace_back(std::move(MainNode)); - F.Children.emplace_back(std::move(FooterNode)); - - F.render(IndexOS); - - return llvm::Error::success(); +Error HTMLGenerator::generateDocumentation( + StringRef RootDir, llvm::StringMap<std::unique_ptr<doc::Info>> Infos, + const ClangDocContext &CDCtx, std::string DirName) { + return MustacheGenerator::generateDocumentation(RootDir, std::move(Infos), + CDCtx, "html"); } -llvm::Error HTMLGenerator::createResources(ClangDocContext &CDCtx) { - auto Err = serializeIndex(CDCtx); - if (Err) - return Err; - Err = genIndex(CDCtx); - if (Err) - return Err; - - for (const auto &FilePath : CDCtx.UserStylesheets) { - Err = copyFile(FilePath, CDCtx.OutDirectory); - if (Err) - return Err; - } - for (const auto &FilePath : CDCtx.JsScripts) { - Err = copyFile(FilePath, CDCtx.OutDirectory); - if (Err) - return Err; - } - return llvm::Error::success(); -} +const char *HTMLGenerator::Format = "html"; -static GeneratorRegistry::Add<HTMLGenerator> HTML(HTMLGenerator::Format, - "Generator for HTML output."); +static GeneratorRegistry::Add<HTMLGenerator> + HTML(HTMLGenerator::Format, "Generator for mustache HTML output."); // This anchor is used to force the linker to link in the generated object // file and thus register the generator. diff --git a/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp b/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp deleted file mode 100644 index d33b77feb84be..0000000000000 --- a/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp +++ /dev/null @@ -1,179 +0,0 @@ -///===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file contains the implementation of the MustacheHTMLGenerator class, -/// which is Clang-Doc generator for HTML using Mustache templates. 
-///
-//===----------------------------------------------------------------------===//
-
-#include "Generators.h"
-#include "Representation.h"
-#include "support/File.h"
-#include "clang/Basic/Diagnostic.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/Path.h"
-
-using namespace llvm;
-using namespace llvm::json;
-using namespace llvm::mustache;
-
-namespace clang {
-namespace doc {
-
-static std::unique_ptr<MustacheTemplateFile> NamespaceTemplate = nullptr;
-
-static std::unique_ptr<MustacheTemplateFile> RecordTemplate = nullptr;
-
-class MustacheHTMLGenerator : public MustacheGenerator {
-public:
-  static const char *Format;
-  Error createResources(ClangDocContext &CDCtx) override;
-  Error generateDocForInfo(Info *I, raw_ostream &OS,
-                           const ClangDocContext &CDCtx) override;
-  Error setupTemplateFiles(const ClangDocContext &CDCtx) override;
-  Error generateDocForJSON(json::Value &JSON, raw_fd_ostream &OS,
-                           const ClangDocContext &CDCtx, StringRef ObjTypeStr,
-                           StringRef RelativeRootPath) override;
-  // Populates templates with CSS stylesheets, JS scripts paths.
-  Error setupTemplateResources(const ClangDocContext &CDCtx, json::Value &V,
-                               SmallString<128> RelativeRootPath);
-  llvm::Error generateDocumentation(
-      StringRef RootDir, llvm::StringMap<std::unique_ptr<doc::Info>> Infos,
-      const ClangDocContext &CDCtx, std::string DirName) override;
-};
-
-Error MustacheHTMLGenerator::setupTemplateFiles(const ClangDocContext &CDCtx) {
-  // Template files need to use the native path when they're opened,
-  // but have to be used in POSIX style when used in HTML.
-  auto ConvertToNative = [](std::string &&Path) -> std::string {
-    SmallString<128> PathBuf(Path);
-    llvm::sys::path::native(PathBuf);
-    return PathBuf.str().str();
-  };
-
-  std::string NamespaceFilePath =
-      ConvertToNative(CDCtx.MustacheTemplates.lookup("namespace-template"));
-  std::string ClassFilePath =
-      ConvertToNative(CDCtx.MustacheTemplates.lookup("class-template"));
-  std::string CommentFilePath =
-      ConvertToNative(CDCtx.MustacheTemplates.lookup("comment-template"));
-  std::string FunctionFilePath =
-      ConvertToNative(CDCtx.MustacheTemplates.lookup("function-template"));
-  std::string EnumFilePath =
-      ConvertToNative(CDCtx.MustacheTemplates.lookup("enum-template"));
-  std::vector<std::pair<StringRef, StringRef>> Partials = {
-      {"Comments", CommentFilePath},
-      {"FunctionPartial", FunctionFilePath},
-      {"EnumPartial", EnumFilePath}};
-
-  if (Error Err = setupTemplate(NamespaceTemplate, NamespaceFilePath, Partials))
-    return Err;
-
-  if (Error Err = setupTemplate(RecordTemplate, ClassFilePath, Partials))
-    return Err;
-
-  return Error::success();
-}
-
-Error MustacheHTMLGenerator::setupTemplateResources(
-    const ClangDocContext &CDCtx, json::Value &V,
-    SmallString<128> RelativeRootPath) {
-  V.getAsObject()->insert({"ProjectName", CDCtx.ProjectName});
-  json::Value StylesheetArr = Array();
-  sys::path::native(RelativeRootPath, sys::path::Style::posix);
-
-  auto *SSA = StylesheetArr.getAsArray();
-  SSA->reserve(CDCtx.UserStylesheets.size());
-  for (const auto &FilePath : CDCtx.UserStylesheets) {
-    SmallString<128> StylesheetPath = RelativeRootPath;
-    sys::path::append(StylesheetPath, sys::path::Style::posix,
-                      sys::path::filename(FilePath));
-    SSA->emplace_back(StylesheetPath);
-  }
-  V.getAsObject()->insert({"Stylesheets", StylesheetArr});
-
-  json::Value ScriptArr = Array();
-  auto *SCA = ScriptArr.getAsArray();
-  SCA->reserve(CDCtx.JsScripts.size());
-  for (auto Script : CDCtx.JsScripts) {
-    SmallString<128> JsPath = RelativeRootPath;
-    sys::path::append(JsPath, sys::path::Style::posix,
-                      sys::path::filename(Script));
-    SCA->emplace_back(JsPath);
-  }
-  V.getAsObject()->insert({"Scripts", ScriptArr});
-  return Error::success();
-}
-
-Error MustacheHTMLGenerator::generateDocForJSON(json::Value &JSON,
-                                                raw_fd_ostream &OS,
-                                                const ClangDocContext &CDCtx,
-                                                StringRef ObjTypeStr,
-                                                StringRef RelativeRootPath) {
-  if (ObjTypeStr == "namespace") {
-    if (auto Err = setupTemplateResources(CDCtx, JSON, RelativeRootPath))
-      return Err;
-    assert(NamespaceTemplate && "NamespaceTemplate is nullptr.");
-    NamespaceTemplate->render(JSON, OS);
-  } else if (ObjTypeStr == "record") {
-    if (auto Err = setupTemplateResources(CDCtx, JSON, RelativeRootPath))
-      return Err;
-    assert(RecordTemplate && "RecordTemplate is nullptr.");
-    RecordTemplate->render(JSON, OS);
-  }
-  return Error::success();
-}
-
-Error MustacheHTMLGenerator::generateDocForInfo(Info *I, raw_ostream &OS,
-                                                const ClangDocContext &CDCtx) {
-  switch (I->IT) {
-  case InfoType::IT_enum:
-  case InfoType::IT_function:
-  case InfoType::IT_typedef:
-  case InfoType::IT_namespace:
-  case InfoType::IT_record:
-  case InfoType::IT_concept:
-  case InfoType::IT_variable:
-  case InfoType::IT_friend:
-    break;
-  case InfoType::IT_default:
-    return createStringError(inconvertibleErrorCode(), "unexpected InfoType");
-  }
-  return Error::success();
-}
-
-Error MustacheHTMLGenerator::createResources(ClangDocContext &CDCtx) {
-  std::string ResourcePath(CDCtx.OutDirectory + "/html");
-  for (const auto &FilePath : CDCtx.UserStylesheets)
-    if (Error Err = copyFile(FilePath, ResourcePath))
-      return Err;
-  for (const auto &FilePath : CDCtx.JsScripts)
-    if (Error Err = copyFile(FilePath, ResourcePath))
-      return Err;
-  return Error::success();
-}
-
-Error MustacheHTMLGenerator::generateDocumentation(
-    StringRef RootDir, llvm::StringMap<std::unique_ptr<doc::Info>> Infos,
-    const ClangDocContext &CDCtx, std::string DirName) {
-  return MustacheGenerator::generateDocumentation(RootDir, std::move(Infos),
-                                                  CDCtx, "html");
-}
-
-const char *MustacheHTMLGenerator::Format = "mustache";
-
-static GeneratorRegistry::Add<MustacheHTMLGenerator>
-    MHTML(MustacheHTMLGenerator::Format, "Generator for mustache HTML output.");
-
-// This anchor is used to force the linker to link in the generated object
-// file and thus register the generator.
-volatile int MHTMLGeneratorAnchorSource = 0;
-
-} // namespace doc
-} // namespace clang
diff --git a/clang-tools-extra/clang-doc/JSONGenerator.cpp b/clang-tools-extra/clang-doc/JSONGenerator.cpp
index 97c599a3f605c..77aa8794561e4 100644
--- a/clang-tools-extra/clang-doc/JSONGenerator.cpp
+++ b/clang-tools-extra/clang-doc/JSONGenerator.cpp
@@ -84,8 +84,23 @@ serializeLocation(const Location &Loc,
   return LocationObj;
 }
 
+/// Insert comments into a key in the Description object.
+///
+/// \param Comment Either an Object or Array, depending on the comment type
+/// \param Key The type (Brief, Code, etc.) of comment to be inserted
 static void insertComment(Object &Description, json::Value &Comment,
                           StringRef Key) {
+  // The comment has a Children array for the actual text, with meta attributes
+  // alongside it in the Object.
+  if (auto *Obj = Comment.getAsObject()) {
+    if (auto *Children = Obj->getArray("Children"); Children->empty())
+      return;
+  }
+  // The comment is just an array of text comments.
+  else if (auto *Array = Comment.getAsArray(); Array->empty()) {
+    return;
+  }
+
   auto DescriptionIt = Description.find(Key);
 
   if (DescriptionIt == Description.end()) {
@@ -98,10 +113,28 @@ static void insertComment(Object &Description, json::Value &Comment,
   }
 }
 
+/// Takes the nested "Children" array from a comment Object.
+///
+/// \return a json::Array of comments, possible json::Value::Kind::Null
 static json::Value extractTextComments(Object *ParagraphComment) {
   if (!ParagraphComment)
-    return json::Object();
-  return *ParagraphComment->get("Children");
+    return json::Value(nullptr);
+  json::Value *Children = ParagraphComment->get("Children");
+  if (!Children)
+    return json::Value(nullptr);
+  auto ChildrenArray = *Children->getAsArray();
+  auto ChildrenIt = ChildrenArray.begin();
+  while (ChildrenIt != ChildrenArray.end()) {
+    auto *ChildObj = ChildrenIt->getAsObject();
+    assert(ChildObj && "Invalid JSON object in Comment");
+    auto TextComment = ChildObj->getString("TextComment");
+    if (!TextComment || TextComment->empty()) {
+      ChildrenIt = ChildrenArray.erase(ChildrenIt);
+      continue;
+    }
+    ++ChildrenIt;
+  }
+  return ChildrenArray;
 }
 
 static json::Value extractVerbatimComments(json::Array VerbatimLines) {
@@ -131,7 +164,8 @@ static Object serializeComment(const CommentInfo &I, Object &Description) {
 
   switch (I.Kind) {
   case CommentKind::CK_TextComment: {
-    Obj.insert({commentKindToString(I.Kind), I.Text});
+    if (!I.Text.empty())
+      Obj.insert({commentKindToString(I.Kind), I.Text});
     return Obj;
   }
 
@@ -265,6 +299,9 @@ serializeCommonAttributes(const Info &I, json::Object &Obj,
     if (auto *ParagraphComment = Comment.getAsObject();
         ParagraphComment->get("ParagraphComment")) {
       auto TextCommentsArray = extractTextComments(ParagraphComment);
+      if (TextCommentsArray.kind() == json::Value::Null ||
+          TextCommentsArray.getAsArray()->empty())
+        continue;
       insertComment(Description, TextCommentsArray, "ParagraphComments");
     }
   }
diff --git a/clang-tools-extra/clang-doc/support/Utils.cpp b/clang-tools-extra/clang-doc/support/Utils.cpp
index 897a7ad0adb79..f410bfcf956d4 100644
--- a/clang-tools-extra/clang-doc/support/Utils.cpp
+++ b/clang-tools-extra/clang-doc/support/Utils.cpp
@@ -28,8 +28,7 @@ SmallString<128> appendPathPosix(StringRef Base, StringRef Path) {
   return Default;
 }
 
-void getMustacheHtmlFiles(StringRef AssetsPath,
-                          clang::doc::ClangDocContext &CDCtx) {
+void getHtmlFiles(StringRef AssetsPath, clang::doc::ClangDocContext &CDCtx) {
   assert(!AssetsPath.empty());
   assert(sys::fs::is_directory(AssetsPath));
 
diff --git a/clang-tools-extra/clang-doc/support/Utils.h b/clang-tools-extra/clang-doc/support/Utils.h
index 8161c37503f81..f4ed9ec42dce4 100644
--- a/clang-tools-extra/clang-doc/support/Utils.h
+++ b/clang-tools-extra/clang-doc/support/Utils.h
@@ -20,7 +20,7 @@ llvm::SmallString<128> appendPathNative(llvm::StringRef Base,
 llvm::SmallString<128> appendPathPosix(llvm::StringRef Base,
                                        llvm::StringRef Path);
 
-void getMustacheHtmlFiles(llvm::StringRef AssetsPath,
-                          clang::doc::ClangDocContext &CDCtx);
+void getHtmlFiles(llvm::StringRef AssetsPath,
+                  clang::doc::ClangDocContext &CDCtx);
 
 #endif
diff --git a/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp b/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
index cb98c200442a6..ee4c449718871 100644
--- a/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
+++ b/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
@@ -111,21 +111,20 @@ Turn on time profiler. Generates clang-doc-tracing.json)"),
                                       llvm::cl::init(false),
                                       llvm::cl::cat(ClangDocCategory));
 
-enum OutputFormatTy { md, yaml, html, mustache, json };
-
-static llvm::cl::opt<OutputFormatTy> FormatEnum(
-    "format", llvm::cl::desc("Format for outputted docs."),
-    llvm::cl::values(clEnumValN(OutputFormatTy::yaml, "yaml",
-                                "Documentation in YAML format."),
-                     clEnumValN(OutputFormatTy::md, "md",
-                                "Documentation in MD format."),
-                     clEnumValN(OutputFormatTy::html, "html",
-                                "Documentation in HTML format."),
-                     clEnumValN(OutputFormatTy::mustache, "mustache",
-                                "Documentation in mustache HTML format"),
-                     clEnumValN(OutputFormatTy::json, "json",
-                                "Documentation in JSON format")),
-    llvm::cl::init(OutputFormatTy::yaml), llvm::cl::cat(ClangDocCategory));
+enum OutputFormatTy { md, yaml, html, json };
+
+static llvm::cl::opt<OutputFormatTy>
+    FormatEnum("format", llvm::cl::desc("Format for outputted docs."),
+               llvm::cl::values(clEnumValN(OutputFormatTy::yaml, "yaml",
+                                           "Documentation in YAML format."),
+                                clEnumValN(OutputFormatTy::md, "md",
+                                           "Documentation in MD format."),
+                                clEnumValN(OutputFormatTy::html, "html",
+                                           "Documentation in HTML format."),
+                                clEnumValN(OutputFormatTy::json, "json",
+                                           "Documentation in JSON format")),
+               llvm::cl::init(OutputFormatTy::yaml),
+               llvm::cl::cat(ClangDocCategory));
 
 static llvm::ExitOnError ExitOnErr;
 
@@ -137,8 +136,6 @@ static std::string getFormatString() {
     return "md";
   case OutputFormatTy::html:
     return "html";
-  case OutputFormatTy::mustache:
-    return "mustache";
   case OutputFormatTy::json:
     return "json";
   }
@@ -175,61 +172,12 @@ static llvm::Error getAssetFiles(clang::doc::ClangDocContext &CDCtx) {
   return llvm::Error::success();
 }
 
-static llvm::Error getDefaultAssetFiles(const char *Argv0,
-                                        clang::doc::ClangDocContext &CDCtx) {
-  void *MainAddr = (void *)(intptr_t)getExecutablePath;
-  std::string ClangDocPath = getExecutablePath(Argv0, MainAddr);
-  llvm::SmallString<128> NativeClangDocPath;
-  llvm::sys::path::native(ClangDocPath, NativeClangDocPath);
-
-  llvm::SmallString<128> AssetsPath;
-  AssetsPath = llvm::sys::path::parent_path(NativeClangDocPath);
-  llvm::sys::path::append(AssetsPath, "..", "share", "clang-doc");
-  llvm::SmallString<128> DefaultStylesheet =
-      appendPathNative(AssetsPath, "clang-doc-default-stylesheet.css");
-  llvm::SmallString<128> IndexJS = appendPathNative(AssetsPath, "index.js");
-
-  if (!llvm::sys::fs::is_regular_file(IndexJS))
-    return llvm::createStringError(llvm::inconvertibleErrorCode(),
-                                   "default index.js file missing at " +
-                                       IndexJS + "\n");
-
-  if (!llvm::sys::fs::is_regular_file(DefaultStylesheet))
-    return llvm::createStringError(
-        llvm::inconvertibleErrorCode(),
-        "default clang-doc-default-stylesheet.css file missing at " +
-            DefaultStylesheet + "\n");
-
-  CDCtx.UserStylesheets.insert(CDCtx.UserStylesheets.begin(),
-                               std::string(DefaultStylesheet));
-  CDCtx.JsScripts.emplace_back(IndexJS.str());
-
-  return llvm::Error::success();
-}
-
-static llvm::Error getHtmlAssetFiles(const char *Argv0,
-                                     clang::doc::ClangDocContext &CDCtx) {
-  if (!UserAssetPath.empty() &&
-      !llvm::sys::fs::is_directory(std::string(UserAssetPath))) {
-    unsigned ID = CDCtx.Diags.getCustomDiagID(
-        DiagnosticsEngine::Warning,
-        "Asset path supply is not a directory: %0 falling back to default");
-    CDCtx.Diags.Report(ID) << UserAssetPath;
-  }
-  if (llvm::sys::fs::is_directory(std::string(UserAssetPath)))
-    return getAssetFiles(CDCtx);
-  return getDefaultAssetFiles(Argv0, CDCtx);
-}
-
-static llvm::Error getMustacheHtmlFiles(const char *Argv0,
-                                        clang::doc::ClangDocContext &CDCtx) {
+static llvm::Error getHtmlFiles(const char *Argv0,
+                                clang::doc::ClangDocContext &CDCtx) {
   bool IsDir = llvm::sys::fs::is_directory(UserAssetPath);
-  if (!UserAssetPath.empty() && !IsDir) {
-    unsigned ID = CDCtx.Diags.getCustomDiagID(
-        DiagnosticsEngine::Note,
-        "Asset path supply is not a directory: %0 falling back to default");
-    CDCtx.Diags.Report(ID) << UserAssetPath;
-  }
+  if (!UserAssetPath.empty() && !IsDir)
+    llvm::outs() << "Asset path supply is not a directory: " << UserAssetPath
+                 << " falling back to default\n";
   if (IsDir) {
     if (auto Err = getAssetFiles(CDCtx))
       return Err;
@@ -243,7 +191,7 @@ static llvm::Error getMustacheHtmlFiles(const char *Argv0,
   AssetsPath = llvm::sys::path::parent_path(NativeClangDocPath);
   llvm::sys::path::append(AssetsPath, "..", "share", "clang-doc");
 
-  getMustacheHtmlFiles(AssetsPath, CDCtx);
+  getHtmlFiles(AssetsPath, CDCtx);
 
   return llvm::Error::success();
 }
@@ -337,11 +285,8 @@ Example usage for a project using a compile commands database:
       SourceRoot, RepositoryUrl, RepositoryCodeLinePrefix, BaseDirectory,
       {UserStylesheets.begin(), UserStylesheets.end()}, Diags, FTimeTrace);
 
-  if (Format == "html") {
-    ExitOnErr(getHtmlAssetFiles(argv[0], CDCtx));
-  } else if (Format == "mustache") {
-    ExitOnErr(getMustacheHtmlFiles(argv[0], CDCtx));
-  }
+  if (Format == "html")
+    ExitOnErr(getHtmlFiles(argv[0], CDCtx));
 
   llvm::timeTraceProfilerBegin("Executor Launch", "total runtime");
   // Mapping phase
diff --git a/clang-tools-extra/test/clang-doc/DR-131697.cpp b/clang-tools-extra/test/clang-doc/DR-131697.cpp
index 06168e6642f62..9025bbf910813 100644
--- a/clang-tools-extra/test/clang-doc/DR-131697.cpp
+++ b/clang-tools-extra/test/clang-doc/DR-131697.cpp
@@ -1,7 +1,6 @@
 // RUN: rm -rf %t && mkdir -p %t
 // RUN: split-file %s %t
 // RUN: clang-doc -format=html %t/compile_commands.json %t/main.cpp
-// RUN: clang-doc -format=mustache %t/compile_commands.json %t/main.cpp
 
 //--- main.cpp
 
diff --git a/clang-tools-extra/test/clang-doc/assets.cpp b/clang-tools-extra/test/clang-doc/assets.cpp
index 9acb64a10b4fe..853dfe53d09f0 100644
--- a/clang-tools-extra/test/clang-doc/assets.cpp
+++ b/clang-tools-extra/test/clang-doc/assets.cpp
@@ -1,24 +1,8 @@
 // RUN: rm -rf %t && mkdir %t
 // RUN: clang-doc --format=html --output=%t --asset=%S/Inputs/test-assets --executor=standalone %s --base base_dir
-// RUN: clang-doc --format=mustache --output=%t --asset=%S/Inputs/test-assets --executor=standalone %s --base base_dir
-// RUN: FileCheck %s -input-file=%t/index.html -check-prefix=INDEX
-// RUN: FileCheck %s -input-file=%t/test.css -check-prefix=CSS
-// RUN: FileCheck %s -input-file=%t/test.js -check-prefix=JS
-
 // RUN: FileCheck %s -input-file=%t/html/test.css -check-prefix=CSS
 // RUN: FileCheck %s -input-file=%t/html/test.js -check-prefix=JS
 
-// INDEX: <!DOCTYPE html>
-// INDEX-NEXT: <meta charset="utf-8"/>
-// INDEX-NEXT: <title>Index
-// INDEX-NEXT:
-// INDEX-NEXT:
-// INDEX-NEXT:
-// INDEX-NEXT:
-// INDEX-NEXT:
-// INDEX-NEXT:
-// INDEX-NEXT:
-
 // CSS: body {
 // CSS-NEXT: padding: 0;
 // CSS-NEXT: }
diff --git a/clang-tools-extra/test/clang-doc/basic-project.mustache.test b/clang-tools-extra/test/clang-doc/basic-project.mustache.test
index 9f7de6e689313..b985a39265de7 100644
--- a/clang-tools-extra/test/clang-doc/basic-project.mustache.test
+++ b/clang-tools-extra/test/clang-doc/basic-project.mustache.test
@@ -1,7 +1,7 @@
 // RUN: rm -rf %t && mkdir -p %t/docs %t/build
 // RUN: sed 's|$test_dir|%/S|g' %S/Inputs/basic-project/database_template.json > %t/build/compile_commands.json
-// RUN: clang-doc --format=mustache --output=%t/docs --executor=all-TUs %t/build/compile_commands.json
+// RUN: clang-doc --format=html --output=%t/docs --executor=all-TUs %t/build/compile_commands.json
 // RUN: FileCheck %s -input-file=%t/docs/html/GlobalNamespace/_ZTV5Shape.html -check-prefix=HTML-SHAPE
 // RUN: FileCheck %s -input-file=%t/docs/html/GlobalNamespace/_ZTV10Calculator.html -check-prefix=HTML-CALC
 // RUN: FileCheck %s -input-file=%t/docs/html/GlobalNamespace/_ZTV9Rectangle.html -check-prefix=HTML-RECTANGLE
@@ -65,9 +65,6 @@ HTML-SHAPE:
HTML-SHAPE:

Abstract base class for shapes.

HTML-SHAPE:
HTML-SHAPE:
-HTML-SHAPE:

-HTML-SHAPE:
-HTML-SHAPE:
HTML-SHAPE:

Provides a common interface for different types of shapes.

HTML-SHAPE:
HTML-SHAPE: @@ -83,12 +80,6 @@ HTML-SHAPE:
HTML-SHAPE:
HTML-SHAPE:

Calculates the area of the shape.

HTML-SHAPE:
-HTML-SHAPE:
-HTML-SHAPE:

-HTML-SHAPE:
-HTML-SHAPE:
-HTML-SHAPE:

-HTML-SHAPE:
HTML-SHAPE:

Returns

HTML-SHAPE:

double The area of the shape.

HTML-SHAPE:
@@ -101,12 +92,6 @@ HTML-SHAPE:
HTML-SHAPE:
HTML-SHAPE:

Calculates the perimeter of the shape.

HTML-SHAPE:
-HTML-SHAPE:
-HTML-SHAPE:

-HTML-SHAPE:
-HTML-SHAPE:
-HTML-SHAPE:

-HTML-SHAPE:
HTML-SHAPE:

Returns

HTML-SHAPE:

double The perimeter of the shape.

HTML-SHAPE:
@@ -119,9 +104,6 @@ HTML-SHAPE:
HTML-SHAPE:
HTML-SHAPE:

Virtual destructor.

HTML-SHAPE:
-HTML-SHAPE:
-HTML-SHAPE:

-HTML-SHAPE:
HTML-SHAPE:
HTML-SHAPE: HTML-SHAPE: @@ -210,9 +192,6 @@ HTML-CALC:
HTML-CALC:

A simple calculator class.

HTML-CALC:
HTML-CALC:
-HTML-CALC:

-HTML-CALC:
-HTML-CALC:
HTML-CALC:

Provides basic arithmetic operations.

HTML-CALC:
HTML-CALC: @@ -239,12 +218,6 @@ HTML-CALC:
HTML-CALC:
HTML-CALC:

Adds two integers.

HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
HTML-CALC:

Parameters

HTML-CALC:
HTML-CALC: a First integer. @@ -264,12 +237,6 @@ HTML-CALC:
HTML-CALC:
HTML-CALC:

Subtracts the second integer from the first.

HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
HTML-CALC:

Parameters

HTML-CALC:
HTML-CALC: a First integer. @@ -289,12 +256,6 @@ HTML-CALC:
HTML-CALC:
HTML-CALC:

Multiplies two integers.

HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
HTML-CALC:

Parameters

HTML-CALC:
HTML-CALC: a First integer. @@ -314,12 +275,6 @@ HTML-CALC:
HTML-CALC:
HTML-CALC:

Divides the first integer by the second.

HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
HTML-CALC:

Parameters

HTML-CALC:
HTML-CALC: a First integer. @@ -329,7 +284,6 @@ HTML-CALC: b Second integer. HTML-CALC:
HTML-CALC:

Returns

HTML-CALC:

double The result of a / b.

-HTML-CALC:

HTML-CALC:

Throws

HTML-CALC:
HTML-CALC: std::invalid_argument if b is zero. @@ -344,12 +298,6 @@ HTML-CALC:
HTML-CALC:
HTML-CALC:

Performs the mod operation on integers.

HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
HTML-CALC:

Parameters

HTML-CALC:
HTML-CALC: a First integer. @@ -431,9 +379,6 @@ HTML-RECTANGLE:
HTML-RECTANGLE:

Rectangle class derived from Shape.

HTML-RECTANGLE:
HTML-RECTANGLE:
-HTML-RECTANGLE:

-HTML-RECTANGLE:
-HTML-RECTANGLE:
HTML-RECTANGLE:

Represents a rectangle with a given width and height.

HTML-RECTANGLE:
HTML-RECTANGLE:
@@ -449,12 +394,6 @@ HTML-RECTANGLE:
HTML-RECTANGLE:
HTML-RECTANGLE:

Constructs a new Rectangle object.

HTML-RECTANGLE:
-HTML-RECTANGLE:
-HTML-RECTANGLE:

-HTML-RECTANGLE:
-HTML-RECTANGLE:
-HTML-RECTANGLE:

-HTML-RECTANGLE:
HTML-RECTANGLE:

Parameters

HTML-RECTANGLE:
HTML-RECTANGLE: width Width of the rectangle. @@ -472,12 +411,6 @@ HTML-RECTANGLE:
HTML-RECTANGLE:
HTML-RECTANGLE:

Calculates the area of the rectangle.

HTML-RECTANGLE:
-HTML-RECTANGLE:
-HTML-RECTANGLE:

-HTML-RECTANGLE:
-HTML-RECTANGLE:
-HTML-RECTANGLE:

-HTML-RECTANGLE:
HTML-RECTANGLE:

Returns

HTML-RECTANGLE:

double The area of the rectangle.

HTML-RECTANGLE:
@@ -490,12 +423,6 @@ HTML-RECTANGLE:
HTML-RECTANGLE:
HTML-RECTANGLE:

Calculates the perimeter of the rectangle.

HTML-RECTANGLE:
-HTML-RECTANGLE:
-HTML-RECTANGLE:

-HTML-RECTANGLE:
-HTML-RECTANGLE:
-HTML-RECTANGLE:

-HTML-RECTANGLE:
HTML-RECTANGLE:

Returns

HTML-RECTANGLE:

double The perimeter of the rectangle.

HTML-RECTANGLE:
@@ -570,9 +497,6 @@ HTML-CIRCLE:
HTML-CIRCLE:

Circle class derived from Shape.

HTML-CIRCLE:
HTML-CIRCLE:
-HTML-CIRCLE:

-HTML-CIRCLE:
-HTML-CIRCLE:
HTML-CIRCLE:

Represents a circle with a given radius.

HTML-CIRCLE:
HTML-CIRCLE:
@@ -588,12 +512,6 @@ HTML-CIRCLE:
HTML-CIRCLE:
HTML-CIRCLE:

Constructs a new Circle object.

HTML-CIRCLE:
-HTML-CIRCLE:
-HTML-CIRCLE:

-HTML-CIRCLE:
-HTML-CIRCLE:
-HTML-CIRCLE:

-HTML-CIRCLE:
HTML-CIRCLE:

Parameters

HTML-CIRCLE:
HTML-CIRCLE: radius Radius of the circle. @@ -608,12 +526,6 @@ HTML-CIRCLE:
HTML-CIRCLE:
HTML-CIRCLE:

Calculates the area of the circle.

HTML-CIRCLE:
-HTML-CIRCLE:
-HTML-CIRCLE:

-HTML-CIRCLE:
-HTML-CIRCLE:
-HTML-CIRCLE:

-HTML-CIRCLE:
HTML-CIRCLE:

Returns

HTML-CIRCLE:

double The area of the circle.

HTML-CIRCLE:
@@ -626,15 +538,6 @@ HTML-CIRCLE:
HTML-CIRCLE:
HTML-CIRCLE:

Calculates the perimeter of the circle.

HTML-CIRCLE:
-HTML-CIRCLE:
-HTML-CIRCLE:

-HTML-CIRCLE:
-HTML-CIRCLE:
-HTML-CIRCLE:

-HTML-CIRCLE:
-HTML-CIRCLE:
-HTML-CIRCLE:

-HTML-CIRCLE:
HTML-CIRCLE:

Returns

HTML-CIRCLE:

double The perimeter of the circle.

HTML-CIRCLE:

Code

diff --git a/clang-tools-extra/test/clang-doc/basic-project.test b/clang-tools-extra/test/clang-doc/basic-project.test
index 9c1ed29973d79..9220dc6974508 100644
--- a/clang-tools-extra/test/clang-doc/basic-project.test
+++ b/clang-tools-extra/test/clang-doc/basic-project.test
@@ -1,31 +1,6 @@
 // RUN: rm -rf %t && mkdir -p %t/docs %t/build
 // RUN: sed 's|$test_dir|%/S|g' %S/Inputs/basic-project/database_template.json > %t/build/compile_commands.json
-// RUN: clang-doc --format=html --output=%t/docs --executor=all-TUs %t/build/compile_commands.json
-// RUN: FileCheck %s -input-file=%t/docs/index_json.js -check-prefix=JSON-INDEX
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Shape.html -check-prefix=HTML-SHAPE
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Calculator.html -check-prefix=HTML-CALC
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Rectangle.html -check-prefix=HTML-RECTANGLE
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Circle.html -check-prefix=HTML-CIRCLE
-
-// RUN: clang-doc --format=html --output=%t/docs-with-prefix --executor=all-TUs %t/build/compile_commands.json --repository=https://repository.com --repository-line-prefix=L
-// RUN: FileCheck %s -input-file=%t/docs-with-prefix/GlobalNamespace/Shape.html -check-prefixes=HTML-SHAPE,SHAPE-LINE-PREFIX
-// RUN: FileCheck %s -input-file=%t/docs-with-prefix/GlobalNamespace/Calculator.html -check-prefixes=HTML-CALC,CALC-LINE-PREFIX
-// RUN: FileCheck %s -input-file=%t/docs-with-prefix/GlobalNamespace/Rectangle.html -check-prefixes=HTML-RECTANGLE,RECTANGLE-LINE-PREFIX
-// RUN: FileCheck %s -input-file=%t/docs-with-prefix/GlobalNamespace/Circle.html -check-prefixes=HTML-CIRCLE,CIRCLE-LINE-PREFIX
-
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Shape.html -check-prefixes=HTML-SHAPE,SHAPE-NO-REPOSITORY
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Calculator.html -check-prefixes=HTML-CALC,CALC-NO-REPOSITORY
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Rectangle.html -check-prefixes=HTML-RECTANGLE,RECTANGLE-NO-REPOSITORY
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Circle.html -check-prefixes=HTML-CIRCLE,CIRCLE-NO-REPOSITORY
-
-// RUN: clang-doc --format=html --output=%t/docs --executor=all-TUs %t/build/compile_commands.json --repository=https://repository.com
-// RUN: FileCheck %s -input-file=%t/docs/index_json.js -check-prefixes=JSON-INDEX
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Shape.html -check-prefixes=HTML-SHAPE,SHAPE-REPOSITORY
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Calculator.html -check-prefixes=HTML-CALC,CALC-REPOSITORY
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Rectangle.html -check-prefixes=HTML-RECTANGLE,RECTANGLE-REPOSITORY
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Circle.html -check-prefixes=HTML-CIRCLE,CIRCLE-REPOSITORY
-
 // RUN: clang-doc --format=md --output=%t/docs --executor=all-TUs %t/build/compile_commands.json
 // RUN: FileCheck %s -input-file=%t/docs/all_files.md -check-prefixes=MD-ALL-FILES
 // RUN: FileCheck %s -input-file=%t/docs/index.md -check-prefixes=MD-INDEX
@@ -81,248 +56,6 @@
 // JSON-INDEX-NEXT:   };
 // JSON-INDEX-NEXT: }
 
-// HTML-SHAPE:

class Shape

-// SHAPE-NO-REPOSITORY:

Defined at line 8 of file .{{.}}include{{.}}Shape.h

-// SHAPE-REPOSITORY:

-// SHAPE-REPOSITORY-NEXT: Defined at line -// SHAPE-REPOSITORY-NEXT: 8 -// SHAPE-LINE-PREFIX: 8 -// SHAPE-REPOSITORY-NEXT: of file -// SHAPE-REPOSITORY-NEXT: Shape.h -// SHAPE-REPOSITORY-NEXT:

-// HTML-SHAPE:
brief
-// HTML-SHAPE:

Abstract base class for shapes.

-// HTML-SHAPE:

Provides a common interface for different types of shapes.

-// HTML-SHAPE:

Functions

-// HTML-SHAPE:

area

-// HTML-SHAPE:

public double area()

-// HTML-SHAPE:
brief
-// HTML-SHAPE:

Calculates the area of the shape.

-// HTML-SHAPE:

perimeter

-// HTML-SHAPE:

public double perimeter()

-// HTML-SHAPE:
brief
-// HTML-SHAPE:

Calculates the perimeter of the shape.

-// HTML-SHAPE:
return
-// HTML-SHAPE:

double The perimeter of the shape.

-// HTML-SHAPE:

~Shape

-// HTML-SHAPE:

public void ~Shape()

- -// SHAPE-NO-REPOSITORY: Defined at line 13 of file .{{.}}include{{.}}Shape.h -// SHAPE-REPOSITORY: Defined at line -// SHAPE-REPOSITORY-NEXT: 13 -// SHAPE-LINE-PREFIX: 13 -// SHAPE-REPOSITORY-NEXT: of file -// SHAPE-REPOSITORY-NEXT: Shape.h - -// HTML-SHAPE:
brief
-// HTML-SHAPE:

Virtual destructor.

- -// HTML-CALC:

class Calculator

-// CALC-NO-REPOSITORY:

Defined at line 8 of file .{{.}}include{{.}}Calculator.h

-// CALC-REPOSITORY:

-// CALC-REPOSITORY-NEXT: Defined at line -// CALC-REPOSITORY-NEXT: 8 -// CALC-LINE-PREFIX: 8 -// CALC-REPOSITORY-NEXT: of file -// CALC-REPOSITORY-NEXT: Calculator.h -// CALC-REPOSITORY-NEXT:

-// HTML-CALC:
brief
-// HTML-CALC:

A simple calculator class.

-// HTML-CALC:

Provides basic arithmetic operations.

- -// HTML-CALC:

Members

-// HTML-CALC:
brief
-// HTML-CALC:

Holds a public value.

-// HTML-CALC:
public int public_val
-// HTML-CALC:
brief
-// HTML-CALC:

A static value.

-// HTML-CALC:
public static const int static_val
- -// HTML-CALC:

Functions

-// HTML-CALC:

add

-// HTML-CALC:

public int add(int a, int b)

-// CALC-NO-REPOSITORY: Defined at line 3 of file .{{.}}src{{.}}Calculator.cpp -// CALC-REPOSITORY: Defined at line -// CALC-REPOSITORY-NEXT: 3 -// CALC-LINE-PREFIX: 3 -// CALC-REPOSITORY-NEXT: of file -// CALC-REPOSITORY-NEXT: Calculator.cpp - -// HTML-CALC:
brief
-// HTML-CALC:

Adds two integers.

-// HTML-CALC:
return
-// HTML-CALC:

int The sum of a and b.

-// HTML-CALC:

subtract

-// HTML-CALC:

public int subtract(int a, int b)

-// CALC-NO-REPOSITORY: Defined at line 7 of file .{{.}}src{{.}}Calculator.cpp -// CALC-REPOSITORY: Defined at line -// CALC-REPOSITORY-NEXT: 7 -// CALC-LINE-PREFIX: 7 -// CALC-REPOSITORY-NEXT: of file -// CALC-REPOSITORY-NEXT: Calculator.cpp - -// HTML-CALC:
brief
-// HTML-CALC:

Subtracts the second integer from the first.

-// HTML-CALC:
return
-// HTML-CALC:

int The result of a - b.

-// HTML-CALC:

multiply

-// HTML-CALC:

public int multiply(int a, int b)

-// CALC-NO-REPOSITORY: Defined at line 11 of file .{{.}}src{{.}}Calculator.cpp -// CALC-REPOSITORY: Defined at line -// CALC-REPOSITORY-NEXT: 11 -// CALC-LINE-PREFIX: 11 -// CALC-REPOSITORY-NEXT: of file -// CALC-REPOSITORY-NEXT: Calculator.cpp - -// HTML-CALC:
brief
-// HTML-CALC:

Multiplies two integers.

-// HTML-CALC:
return
-// HTML-CALC:

int The product of a and b.

-// HTML-CALC:

divide

-// HTML-CALC:

public double divide(int a, int b)

-// CALC-NO-REPOSITORY: Defined at line 15 of file .{{.}}src{{.}}Calculator.cpp -// CALC-REPOSITORY: Defined at line -// CALC-REPOSITORY-NEXT: 15 -// CALC-LINE-PREFIX: 15 -// CALC-REPOSITORY-NEXT: of file -// CALC-REPOSITORY-NEXT: Calculator.cpp - -// HTML-CALC:
brief
-// HTML-CALC:

Divides the first integer by the second.

-// HTML-CALC:
return
-// HTML-CALC:

double The result of a / b.

-// HTML-CALC:
throw
-// HTML-CALC:

if b is zero.

- -// HTML-CALC:

public static int mod(int a, int b)

-// CALC-NO-REPOSITORY: Defined at line 54 of file .{{.}}include{{.}}Calculator.h -// CALC-REPOSITORY: Defined at line -// CALC-REPOSITORY-NEXT: 54 -// CALC-LINE-PREFIX: 54 -// CALC-REPOSITORY-NEXT: of file -// CALC-REPOSITORY-NEXT: Calculator.h -// HTML-CALC:
brief
-// HTML-CALC:

Performs the mod operation on integers.

-// HTML-CALC:
return
-// HTML-CALC:

The result of a % b.

- -// HTML-RECTANGLE:

class Rectangle

-// RECTANGLE-NO-REPOSITORY:

Defined at line 10 of file .{{.}}include{{.}}Rectangle.h

-// RECTANGLE-REPOSITORY:

-// RECTANGLE-REPOSITORY-NEXT: Defined at line -// RECTANGLE-REPOSITORY-NEXT: 10 -// RECTANGLE-LINE-PREFIX: 10 -// RECTANGLE-REPOSITORY-NEXT: of file -// RECTANGLE-REPOSITORY-NEXT: Rectangle.h -// RECTANGLE-REPOSITORY-NEXT:

- -// HTML-RECTANGLE:

Represents a rectangle with a given width and height.

-// HTML-RECTANGLE:

-// HTML-RECTANGLE: Inherits from -// HTML-RECTANGLE: Shape -// HTML-RECTANGLE:

-// HTML-RECTANGLE:

Members

-// HTML-RECTANGLE:

Width of the rectangle.

-// HTML-RECTANGLE:
private double width_
-// HTML-RECTANGLE:

Height of the rectangle.

-// HTML-RECTANGLE:
private double height_
-// HTML-RECTANGLE:

Functions

-// HTML-RECTANGLE:

Rectangle

-// HTML-RECTANGLE:

public void Rectangle(double width, double height)

-// RECTANGLE-NO-REPOSITORY: Defined at line 3 of file .{{.}}src{{.}}Rectangle.cpp -// RECTANGLE-REPOSITORY: Defined at line -// RECTANGLE-REPOSITORY-NEXT: 3 -// RECTANGLE-LINE-PREFIX: 3 -// RECTANGLE-REPOSITORY-NEXT: of file -// RECTANGLE-REPOSITORY-NEXT: Rectangle.cpp - -// HTML-RECTANGLE:
brief
-// HTML-RECTANGLE:

Constructs a new Rectangle object.

-// HTML-RECTANGLE:

area

-// HTML-RECTANGLE:

public double area()

-// RECTANGLE-NO-REPOSITORY: Defined at line 6 of file .{{.}}src{{.}}Rectangle.cpp -// RECTANGLE-REPOSITORY: Defined at line -// RECTANGLE-REPOSITORY-NEXT: 6 -// RECTANGLE-LINE-PREFIX: 6 -// RECTANGLE-REPOSITORY-NEXT: of file -// RECTANGLE-REPOSITORY-NEXT: Rectangle.cpp - -// HTML-RECTANGLE:
brief
-// HTML-RECTANGLE:

Calculates the area of the rectangle.

-// HTML-RECTANGLE:
return
-// HTML-RECTANGLE:

double The area of the rectangle.

-// HTML-RECTANGLE:

perimeter

-// HTML-RECTANGLE:

public double perimeter()

-// RECTANGLE-NO-REPOSITORY: Defined at line 10 of file .{{.}}src{{.}}Rectangle.cpp -// RECTANGLE-REPOSITORY: Defined at line -// RECTANGLE-REPOSITORY-NEXT: 10 -// RECTANGLE-LINE-PREFIX: 10 -// RECTANGLE-REPOSITORY-NEXT: of file -// RECTANGLE-REPOSITORY-NEXT: Rectangle.cpp -// HTML-RECTANGLE:
brief
-// HTML-RECTANGLE:

Calculates the perimeter of the rectangle.

-// HTML-RECTANGLE:
return
-// HTML-RECTANGLE:

double The perimeter of the rectangle.

- -// HTML-CIRCLE:

class Circle

-// CIRCLE-NO-REPOSITORY:

Defined at line 10 of file .{{.}}include{{.}}Circle.h

-// CIRCLE-REPOSITORY:

-// CIRCLE-REPOSITORY-NEXT: Defined at line -// CIRCLE-REPOSITORY-NEXT: 10 -// CIRCLE-LINE-PREFIX: 10 -// CIRCLE-REPOSITORY-NEXT: of file -// CIRCLE-REPOSITORY-NEXT: Circle.h -// CIRCLE-REPOSITORY-NEXT:

- -// HTML-CIRCLE:
brief
-// HTML-CIRCLE:

Circle class derived from Shape.

-// HTML-CIRCLE:

Represents a circle with a given radius.

-// HTML-CIRCLE:

-// HTML-CIRCLE: Inherits from -// HTML-CIRCLE: Shape -// HTML-CIRCLE:

-// HTML-CIRCLE:

Members

-// HTML-CIRCLE:

Radius of the circle.

-// HTML-CIRCLE:
private double radius_
-// HTML-CIRCLE:

Functions

-// HTML-CIRCLE:

Circle

-// HTML-CIRCLE:

public void Circle(double radius)

-// CIRCLE-NO-REPOSITORY: Defined at line 3 of file .{{.}}src{{.}}Circle.cpp -// CIRCLE-REPOSITORY: Defined at line -// CIRCLE-REPOSITORY-NEXT: 3 -// CIRCLE-LINE-PREFIX: 3 -// CIRCLE-REPOSITORY-NEXT: of file -// CIRCLE-REPOSITORY-NEXT: Circle.cpp - -// HTML-CIRCLE:
brief
-// HTML-CIRCLE:

Constructs a new Circle object.

-// HTML-CIRCLE:

area

-// HTML-CIRCLE:

public double area()

-// CIRCLE-NO-REPOSITORY: Defined at line 5 of file .{{.}}src{{.}}Circle.cpp -// CIRCLE-REPOSITORY: Defined at line -// CIRCLE-REPOSITORY-NEXT: 5 -// CIRCLE-LINE-PREFIX: 5 -// CIRCLE-REPOSITORY-NEXT: of file -// CIRCLE-REPOSITORY-NEXT: Circle.cpp - -// HTML-CIRCLE:
brief
-// HTML-CIRCLE:

Calculates the area of the circle.

-// HTML-CIRCLE:
return
-// HTML-CIRCLE:

double The area of the circle.

-// HTML-CIRCLE:

perimeter

-// HTML-CIRCLE:

public double perimeter()

-// CIRCLE-NO-REPOSITORY: Defined at line 9 of file .{{.}}src{{.}}Circle.cpp -// CIRCLE-REPOSITORY: Defined at line -// CIRCLE-REPOSITORY-NEXT: 9 -// CIRCLE-LINE-PREFIX: 9 -// CIRCLE-REPOSITORY-NEXT: of file -// CIRCLE-REPOSITORY-NEXT: Circle.cpp - -// HTML-CIRCLE:
brief
-// HTML-CIRCLE:

Calculates the perimeter of the circle.

-// HTML-CIRCLE:
return
-// HTML-CIRCLE:

double The perimeter of the circle.

- // MD-CALC: # class Calculator // MD-CALC: *Defined at .{{[\/]}}include{{[\/]}}Calculator.h#8* // MD-CALC: **brief** A simple calculator class. diff --git a/clang-tools-extra/test/clang-doc/comments-in-macros.cpp b/clang-tools-extra/test/clang-doc/comments-in-macros.cpp index 0c70fadb7f9ac..bc0ec46b72a05 100644 --- a/clang-tools-extra/test/clang-doc/comments-in-macros.cpp +++ b/clang-tools-extra/test/clang-doc/comments-in-macros.cpp @@ -6,8 +6,8 @@ // RUN: FileCheck %s < %t/GlobalNamespace/MyClass.md --check-prefix=MD-MYCLASS // RUN: clang-doc --format=html --doxygen --output=%t --executor=standalone %s -// RUN: FileCheck %s < %t/GlobalNamespace/MyClass.html --check-prefix=HTML-MYCLASS-LINE -// RUN: FileCheck %s < %t/GlobalNamespace/MyClass.html --check-prefix=HTML-MYCLASS +// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV7MyClass.html --check-prefix=HTML-MYCLASS-LINE +// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV7MyClass.html --check-prefix=HTML-MYCLASS #define DECLARE_METHODS \ /** @@ -21,15 +21,18 @@ // MD-MYCLASS: *public int Add(int a, int b)* // MD-MYCLASS: **brief** Declare a method to calculate the sum of two numbers -// HTML-MYCLASS:

public int Add(int a, int b)

-// HTML-MYCLASS:
brief
-// HTML-MYCLASS:

Declare a method to calculate the sum of two numbers

+ +// HTML-MYCLASS:
int Add (int a, int b)
+// HTML-MYCLASS:
+// HTML-MYCLASS:
+// HTML-MYCLASS:

Declare a method to calculate the sum of two numbers

+// HTML-MYCLASS:
 class MyClass {
 public:
 
-// MD-MYCLASS-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}comments-in-macros.cpp#[[@LINE+2]]*
-// HTML-MYCLASS-LINE:

Defined at line [[@LINE+1]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}comments-in-macros.cpp

+// MD-MYCLASS-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}comments-in-macros.cpp#[[@LINE-2]]*
+// HTML-MYCLASS-LINE:

Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}comments-in-macros.cpp

 DECLARE_METHODS
 };
diff --git a/clang-tools-extra/test/clang-doc/conversion_function.cpp b/clang-tools-extra/test/clang-doc/conversion_function.cpp
index 0200a578219ee..63df5d6f50d39 100644
--- a/clang-tools-extra/test/clang-doc/conversion_function.cpp
+++ b/clang-tools-extra/test/clang-doc/conversion_function.cpp
@@ -4,7 +4,7 @@
 // RUN: find %t/ -regex ".*/[0-9A-F]*.yaml" -exec cat {} ";" | FileCheck %s --check-prefix=CHECK-YAML
 
 // RUN: clang-doc --format=html --output=%t --executor=standalone %s
-// RUN: FileCheck %s < %t/GlobalNamespace/MyStruct.html --check-prefix=CHECK-HTML
+// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV8MyStruct.html --check-prefix=CHECK-HTML
 
 template <typename T>
 struct MyStruct {
@@ -14,5 +14,6 @@ struct MyStruct {
 // Output correct conversion names.
 // CHECK-YAML: Name: 'operator T'
-// CHECK-HTML:

operator T

-// CHECK-HTML:

public T operator T()

+// CHECK-HTML:
+// CHECK-HTML:
T operator T ()
+// CHECK-HTML:
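A note on the line-number arithmetic used throughout the rewritten enum.cpp checks below: FileCheck's `[[@LINE]]` expands to the number of the line the check directive itself sits on, so an offset such as `[[@LINE-2]]` pins a "Defined at line ..." expectation to a declaration two lines above it, and the `*-LINE-NOT` prefixes assert that the html index pages no longer embed those location strings at all. A tiny hypothetical fragment (illustrative only, not one of these tests) showing the arithmetic:

```cpp
// Hypothetical fragment; file name and line numbers are illustrative.
enum Sample { A };  // suppose this declaration sits on line 10
// CHECK: Defined at line [[@LINE-1]] of file {{.*}}sample.cpp
// The CHECK directive sits on line 11, so [[@LINE-1]] expands to 10,
// the line of the enum declaration above it.
```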
diff --git a/clang-tools-extra/test/clang-doc/enum.cpp b/clang-tools-extra/test/clang-doc/enum.cpp
index 3ba834e0b2e70..bb0d51fc3b36c 100644
--- a/clang-tools-extra/test/clang-doc/enum.cpp
+++ b/clang-tools-extra/test/clang-doc/enum.cpp
@@ -1,19 +1,12 @@
 // RUN: rm -rf %t && mkdir -p %t
 // RUN: clang-doc --format=html --doxygen --output=%t --executor=standalone %s
 // RUN: clang-doc --format=md --doxygen --output=%t --executor=standalone %s
-// RUN: clang-doc --format=mustache --doxygen --output=%t --executor=standalone %s
-// RUN: FileCheck %s < %t/GlobalNamespace/index.html --check-prefix=HTML-INDEX-LINE
-// RUN: FileCheck %s < %t/GlobalNamespace/index.html --check-prefix=HTML-INDEX
-// RUN: FileCheck %s < %t/GlobalNamespace/Animals.html --check-prefix=HTML-ANIMAL-LINE
-// RUN: FileCheck %s < %t/GlobalNamespace/Animals.html --check-prefix=HTML-ANIMAL
-// RUN: FileCheck %s < %t/Vehicles/index.html --check-prefix=HTML-VEHICLES-LINE
-// RUN: FileCheck %s < %t/Vehicles/index.html --check-prefix=HTML-VEHICLES
-// RUN: FileCheck %s < %t/html/GlobalNamespace/index.html --check-prefix=MUSTACHE-INDEX-LINE
-// RUN: FileCheck %s < %t/html/GlobalNamespace/index.html --check-prefix=MUSTACHE-INDEX
-// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV7Animals.html --check-prefix=MUSTACHE-ANIMAL-LINE
-// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV7Animals.html --check-prefix=MUSTACHE-ANIMAL
-// RUN: FileCheck %s < %t/html/Vehicles/index.html --check-prefix=MUSTACHE-VEHICLES-LINE
-// RUN: FileCheck %s < %t/html/Vehicles/index.html --check-prefix=MUSTACHE-VEHICLES
+// RUN: FileCheck %s < %t/html/GlobalNamespace/index.html --check-prefix=HTML-INDEX-LINE
+// RUN: FileCheck %s < %t/html/GlobalNamespace/index.html --check-prefix=HTML-INDEX
+// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV7Animals.html --check-prefix=HTML-ANIMAL-LINE
+// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV7Animals.html --check-prefix=HTML-ANIMAL
+// RUN: FileCheck %s < %t/html/Vehicles/index.html --check-prefix=HTML-VEHICLES-LINE
+// RUN: FileCheck %s < %t/html/Vehicles/index.html --check-prefix=HTML-VEHICLES
 // RUN: FileCheck %s < %t/GlobalNamespace/index.md --check-prefix=MD-INDEX-LINE
 // RUN: FileCheck %s < %t/GlobalNamespace/index.md --check-prefix=MD-INDEX
 // RUN: FileCheck %s < %t/GlobalNamespace/Animals.md --check-prefix=MD-ANIMAL-LINE
@@ -28,8 +21,7 @@
  */
 enum Color {
   // MD-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]*
-  // HTML-INDEX-LINE:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp

- // MUSTACHE-INDEX-LINE-NOT:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp

+ // HTML-INDEX-LINE-NOT:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp

   Red,   ///< Comment 1
   Green, ///< Comment 2
   Blue   ///< Comment 3
@@ -43,48 +35,36 @@ enum Color {
 // MD-INDEX: | Blue |
 // MD-INDEX: **brief** For specifying RGB colors
 
-// HTML-INDEX: enum Color
-// HTML-INDEX: Red
-// HTML-INDEX: 0
-// HTML-INDEX:

Comment 1

-// HTML-INDEX: Green
-// HTML-INDEX: 1
-// HTML-INDEX:

Comment 2

-// HTML-INDEX: Blue
-// HTML-INDEX: 2
-// HTML-INDEX:

Comment 3

- -// MUSTACHE-INDEX:
-// MUSTACHE-INDEX:
enum Color
-// MUSTACHE-INDEX:
-// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX:
NameValue
Red0
Green1
Blue2
+// HTML-INDEX:
+// HTML-INDEX:
enum Color
+// HTML-INDEX:
+// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX:
NameValue
Red0
Green1
Blue2
/** * @brief Shape Types */ enum class Shapes { // MD-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]* - // HTML-INDEX-LINE:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp

- // MUSTACHE-INDEX-LINE-NOT:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp

+ // HTML-INDEX-LINE-NOT:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp

   /// Comment 1
   Circle,
@@ -100,86 +80,60 @@ enum class Shapes {
 // MD-INDEX: | Triangle |
 // MD-INDEX: **brief** Shape Types
 
-// HTML-INDEX: enum class Shapes
-// HTML-INDEX: Circle
-// HTML-INDEX: 0
-// HTML-INDEX:

Comment 1

-// HTML-INDEX: Rectangle
-// HTML-INDEX: 1
-// HTML-INDEX:

Comment 2

-// HTML-INDEX: Triangle
-// HTML-INDEX: 2
-// HTML-INDEX:

Comment 3

- // COM: FIXME: Serialize "enum class" in template -// MUSTACHE-INDEX:
-// MUSTACHE-INDEX:
enum Shapes
-// MUSTACHE-INDEX:
-// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX:
NameValue
Circle0
Rectangle1
Triangle2
+// HTML-INDEX:
+// HTML-INDEX:
enum Shapes
+// HTML-INDEX:
+// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX:
NameValue
Circle0
Rectangle1
Triangle2
 // COM: FIXME: Add enums declared inside of classes to class template
 class Animals {
   // MD-ANIMAL-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]*
   // HTML-ANIMAL-LINE:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp

- // MUSTACHE-ANIMAL-LINE:

Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp

public: /** * @brief specify what animal the class is */ enum AnimalType { // MD-ANIMAL-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]* - // HTML-ANIMAL-LINE:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp

- // MUSTACHE-ANIMAL-LINE-NOT:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp

+ // HTML-ANIMAL-LINE-NOT:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp

     Dog,   ///< Man's best friend
     Cat,   ///< Man's other best friend
     Iguana ///< A lizard
   };
 };
 
-// HTML-ANIMAL:

class Animals

-// HTML-ANIMAL:

Enums

-// HTML-ANIMAL: enum AnimalType
-// HTML-ANIMAL: Dog
-// HTML-ANIMAL: 0
-// HTML-ANIMAL:

Man's best friend

-// HTML-ANIMAL: Cat
-// HTML-ANIMAL: 1
-// HTML-ANIMAL:

Man's other best friend

-// HTML-ANIMAL: Iguana
-// HTML-ANIMAL: 2
-// HTML-ANIMAL:

A lizard

- -// MUSTACHE-ANIMAL-NOT:

class Animals

-// MUSTACHE-ANIMAL-NOT:

Enums

-// MUSTACHE-ANIMAL-NOT: enum AnimalType -// MUSTACHE-ANIMAL-NOT: Dog -// MUSTACHE-ANIMAL-NOT: 0 -// MUSTACHE-ANIMAL-NOT:

Man's best friend

-// MUSTACHE-ANIMAL-NOT: Cat -// MUSTACHE-ANIMAL-NOT: 1 -// MUSTACHE-ANIMAL-NOT:

Man's other best friend

-// MUSTACHE-ANIMAL-NOT: Iguana -// MUSTACHE-ANIMAL-NOT: 2 -// MUSTACHE-ANIMAL-NOT:

A lizard

+// HTML-ANIMAL-NOT:

class Animals

+// HTML-ANIMAL-NOT:

Enums

+// HTML-ANIMAL-NOT: enum AnimalType +// HTML-ANIMAL-NOT: Dog +// HTML-ANIMAL-NOT: 0 +// HTML-ANIMAL-NOT:

Man's best friend

+// HTML-ANIMAL-NOT: Cat +// HTML-ANIMAL-NOT: 1 +// HTML-ANIMAL-NOT:

Man's other best friend

+// HTML-ANIMAL-NOT: Iguana +// HTML-ANIMAL-NOT: 2 +// HTML-ANIMAL-NOT:

A lizard

 // MD-ANIMAL: # class Animals
 // MD-ANIMAL: ## Enums
@@ -196,8 +150,7 @@ namespace Vehicles {
  */
 enum Car {
   // MD-VEHICLES-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]*
-  // HTML-VEHICLES-LINE:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp

-  // MUSTACHE-VEHICLES-LINE: Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
+  // HTML-VEHICLES-LINE: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
 
   Sedan,  ///< Comment 1
   SUV,    ///< Comment 2
@@ -216,48 +169,33 @@ enum Car {
 // MD-VEHICLES: | Hatchback |
 // MD-VEHICLES: **brief** specify type of car
 
-// HTML-VEHICLES:

namespace Vehicles

-// HTML-VEHICLES: enum Car
-// HTML-VEHICLES: Sedan
-// HTML-VEHICLES: 0
-// HTML-VEHICLES:

Comment 1

-// HTML-VEHICLES: SUV
-// HTML-VEHICLES: 1
-// HTML-VEHICLES:

Comment 2

-// HTML-VEHICLES: Pickup
-// HTML-VEHICLES: 2
-// HTML-VEHICLES:

Comment 3

-// HTML-VEHICLES: Hatchback
-// HTML-VEHICLES: 3
-// HTML-VEHICLES:

Comment 4

- -// MUSTACHE-VEHICLES:
-// MUSTACHE-VEHICLES:
enum Car
-// MUSTACHE-VEHICLES:
-// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES: -// MUSTACHE-VEHICLES:
NameValue
Sedan0
SUV1
Pickup2
Hatchback3
+// HTML-VEHICLES:
+// HTML-VEHICLES:
enum Car
+// HTML-VEHICLES:
+// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES: +// HTML-VEHICLES:
NameValue
Sedan0
SUV1
Pickup2
Hatchback3
 enum ColorUserSpecified {
   RedUserSpecified = 'A',
@@ -271,34 +209,26 @@ enum ColorUserSpecified {
 // MD-INDEX: | GreenUserSpecified |
 // MD-INDEX: | BlueUserSpecified |
 
-// HTML-INDEX: enum ColorUserSpecified
-// HTML-INDEX: RedUserSpecified
-// HTML-INDEX: 'A'
-// HTML-INDEX: GreenUserSpecified
-// HTML-INDEX: 2
-// HTML-INDEX: BlueUserSpecified
-// HTML-INDEX: 'C'
-
-// MUSTACHE-INDEX:
-// MUSTACHE-INDEX:
enum ColorUserSpecified
-// MUSTACHE-INDEX:
-// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX: -// MUSTACHE-INDEX:
NameValue
RedUserSpecified'A'
GreenUserSpecified2
BlueUserSpecified'C'
+// HTML-INDEX:
+// HTML-INDEX:
enum ColorUserSpecified
+// HTML-INDEX:
+// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX: +// HTML-INDEX:
NameValue
RedUserSpecified'A'
GreenUserSpecified2
BlueUserSpecified'C'
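The test updates that follow all track one layout change: generated pages now land under `<output>/html/<namespace path>/<DocumentationFileName>.html`, where the file name is the mangled name the JSON output also records under "DocumentationFileName" (e.g. `_ZTV7MyClass` in json/class.cpp below). A sketch of assembling such a path with llvm::sys::path follows; the helper is an illustrative assumption, not clang-doc's actual function.

```cpp
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Path.h"

// Illustrative helper: builds <OutDir>/html/<NamespacePath>/<DocName>.html,
// matching paths like
// %t/html/PrimaryNamespace/_ZTVN16PrimaryNamespace23ClassInPrimaryNamespaceE.html
// referenced by the RUN lines below. Not part of the patch.
llvm::SmallString<256> docOutputPath(llvm::StringRef OutDir,
                                     llvm::StringRef NamespacePath,
                                     llvm::StringRef DocName) {
  llvm::SmallString<256> Path(OutDir);
  llvm::sys::path::append(Path, "html", NamespacePath);
  llvm::sys::path::append(Path, llvm::Twine(DocName) + ".html");
  return Path;
}
```

Using the mangled name keeps file names unique and filesystem-safe even for anonymous namespaces, which is presumably why the RUN lines below reference names like `_ZTVN12_GLOBAL__N_19AnonClassE.html`.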
diff --git a/clang-tools-extra/test/clang-doc/json/class.cpp b/clang-tools-extra/test/clang-doc/json/class.cpp
index 91160585bef1a..8bf9402adf054 100644
--- a/clang-tools-extra/test/clang-doc/json/class.cpp
+++ b/clang-tools-extra/test/clang-doc/json/class.cpp
@@ -47,9 +47,6 @@ struct MyClass {
 // CHECK-NEXT:     },
 // CHECK-NEXT:     {
 // CHECK-NEXT:       "TextComment": " It has some nice methods and fields."
-// CHECK-NEXT:     },
-// CHECK-NEXT:     {
-// CHECK-NEXT:       "TextComment": ""
 // CHECK-NEXT:     }
 // CHECK: "DocumentationFileName": "_ZTV7MyClass",
 // CHECK: "Enums": [
diff --git a/clang-tools-extra/test/clang-doc/long-name.cpp b/clang-tools-extra/test/clang-doc/long-name.cpp
index 77e50b1553ad5..e4a5e29f973d5 100644
--- a/clang-tools-extra/test/clang-doc/long-name.cpp
+++ b/clang-tools-extra/test/clang-doc/long-name.cpp
@@ -1,7 +1,7 @@
 // FIXME: This test seems to break on windows, so disable it for now.
 // UNSUPPORTED: system-windows
 // RUN: rm -rf %t && mkdir -p %t
-// RUN: clang-doc --output=%t --format=mustache --executor=standalone %s
+// RUN: clang-doc --output=%t --format=html --executor=standalone %s
 // RUN: ls %t/json/GlobalNamespace | FileCheck %s -check-prefix=CHECK-JSON
 // RUN: ls %t/html/GlobalNamespace | FileCheck %s -check-prefix=CHECK-HTML
 
diff --git a/clang-tools-extra/test/clang-doc/mustache-index.cpp b/clang-tools-extra/test/clang-doc/mustache-index.cpp
index 709cc82bf85bb..0aa6e21c37cac 100644
--- a/clang-tools-extra/test/clang-doc/mustache-index.cpp
+++ b/clang-tools-extra/test/clang-doc/mustache-index.cpp
@@ -1,5 +1,5 @@
 // RUN: rm -rf %t && mkdir -p %t
-// RUN: clang-doc --format=mustache --output=%t --executor=standalone %s
+// RUN: clang-doc --format=html --output=%t --executor=standalone %s
 // RUN: FileCheck %s < %t/html/GlobalNamespace/index.html
 
 enum Color {
diff --git a/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp b/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp
index dfc81df134596..add8a221feb40 100644
--- a/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp
+++ b/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp
@@ -1,5 +1,5 @@
 // RUN: rm -rf %t && mkdir -p %t
-// RUN: clang-doc --format=mustache --output=%t --executor=standalone %s
+// RUN: clang-doc --format=html --output=%t --executor=standalone %s
 // RUN: FileCheck %s < %t/html/MyNamespace/index.html
 // RUN: FileCheck %s < %t/html/GlobalNamespace/index.html --check-prefix=CHECK-GLOBAL
 
diff --git a/clang-tools-extra/test/clang-doc/namespace.cpp b/clang-tools-extra/test/clang-doc/namespace.cpp
index adf7ab7d946ab..029f9974e775e 100644
--- a/clang-tools-extra/test/clang-doc/namespace.cpp
+++ b/clang-tools-extra/test/clang-doc/namespace.cpp
@@ -1,24 +1,6 @@
 // RUN: rm -rf %t && mkdir -p %t
 // RUN: clang-doc --format=html --output=%t --executor=standalone %s
 // RUN: clang-doc --format=md --output=%t --executor=standalone %s
-// RUN: clang-doc --format=mustache --output=%t --executor=standalone %s
-// RUN: FileCheck %s < %t/index_json.js -check-prefix=JSON-INDEX
-// RUN: FileCheck %s < %t/@nonymous_namespace/AnonClass.html -check-prefix=HTML-ANON-CLASS-LINE
-// RUN: FileCheck %s < %t/@nonymous_namespace/AnonClass.html -check-prefix=HTML-ANON-CLASS
-// RUN: FileCheck %s < %t/@nonymous_namespace/index.html -check-prefix=HTML-ANON-INDEX-LINE
-// RUN: FileCheck %s < %t/@nonymous_namespace/index.html -check-prefix=HTML-ANON-INDEX
-// RUN: FileCheck %s < %t/AnotherNamespace/ClassInAnotherNamespace.html -check-prefix=HTML-ANOTHER-CLASS-LINE
-// RUN: FileCheck %s < %t/AnotherNamespace/ClassInAnotherNamespace.html -check-prefix=HTML-ANOTHER-CLASS
-// RUN: FileCheck %s < %t/AnotherNamespace/index.html -check-prefix=HTML-ANOTHER-INDEX-LINE
-// RUN: FileCheck %s < %t/AnotherNamespace/index.html -check-prefix=HTML-ANOTHER-INDEX
-// RUN: FileCheck %s < %t/PrimaryNamespace/NestedNamespace/ClassInNestedNamespace.html -check-prefix=HTML-NESTED-CLASS-LINE
-// RUN: FileCheck %s < %t/PrimaryNamespace/NestedNamespace/ClassInNestedNamespace.html -check-prefix=HTML-NESTED-CLASS
-// RUN: FileCheck %s < %t/PrimaryNamespace/NestedNamespace/index.html -check-prefix=HTML-NESTED-INDEX-LINE
-// RUN: FileCheck %s < %t/PrimaryNamespace/NestedNamespace/index.html -check-prefix=HTML-NESTED-INDEX
-// RUN: FileCheck %s < %t/PrimaryNamespace/index.html -check-prefix=HTML-PRIMARY-INDEX-LINE
-// RUN: FileCheck %s < %t/PrimaryNamespace/index.html -check-prefix=HTML-PRIMARY-INDEX
-// RUN: FileCheck %s < %t/PrimaryNamespace/ClassInPrimaryNamespace.html -check-prefix=HTML-PRIMARY-CLASS-LINE
-// RUN: FileCheck %s < %t/PrimaryNamespace/ClassInPrimaryNamespace.html -check-prefix=HTML-PRIMARY-CLASS
 // RUN: FileCheck %s < %t/@nonymous_namespace/AnonClass.md -check-prefix=MD-ANON-CLASS-LINE
 // RUN: FileCheck %s < %t/@nonymous_namespace/AnonClass.md -check-prefix=MD-ANON-CLASS
 // RUN: FileCheck %s < %t/@nonymous_namespace/index.md -check-prefix=MD-ANON-INDEX-LINE
@@ -35,26 +17,26 @@
 // RUN: FileCheck %s < %t/PrimaryNamespace/index.md -check-prefix=MD-PRIMARY-INDEX
 // RUN: FileCheck %s < %t/PrimaryNamespace/ClassInPrimaryNamespace.md -check-prefix=MD-PRIMARY-CLASS-LINE
 // RUN: FileCheck %s < %t/PrimaryNamespace/ClassInPrimaryNamespace.md -check-prefix=MD-PRIMARY-CLASS
-// RUN: FileCheck %s < %t/GlobalNamespace/index.html -check-prefix=HTML-GLOBAL-INDEX
+// RUN: FileCheck %s < %t/html/GlobalNamespace/index.html -check-prefix=HTML-GLOBAL-INDEX
 // RUN: FileCheck %s < %t/GlobalNamespace/index.md -check-prefix=MD-GLOBAL-INDEX
 // RUN: FileCheck %s < %t/all_files.md -check-prefix=MD-ALL-FILES
 // RUN: FileCheck %s < %t/index.md -check-prefix=MD-INDEX
-// RUN: FileCheck %s < %t/html/@nonymous_namespace/_ZTVN12_GLOBAL__N_19AnonClassE.html -check-prefix=MUSTACHE-ANON-CLASS-LINE
-// RUN: FileCheck %s < %t/html/@nonymous_namespace/_ZTVN12_GLOBAL__N_19AnonClassE.html -check-prefix=MUSTACHE-ANON-CLASS
-// RUN: FileCheck %s < %t/html/@nonymous_namespace/index.html -check-prefix=MUSTACHE-ANON-INDEX-LINE
-// RUN: FileCheck %s < %t/html/@nonymous_namespace/index.html -check-prefix=MUSTACHE-ANON-INDEX
-// RUN: FileCheck %s < %t/html/AnotherNamespace/_ZTVN16AnotherNamespace23ClassInAnotherNamespaceE.html -check-prefix=MUSTACHE-ANOTHER-CLASS-LINE
-// RUN: FileCheck %s < %t/html/AnotherNamespace/_ZTVN16AnotherNamespace23ClassInAnotherNamespaceE.html -check-prefix=MUSTACHE-ANOTHER-CLASS
-// RUN: FileCheck %s < %t/html/AnotherNamespace/index.html -check-prefix=MUSTACHE-ANOTHER-INDEX-LINE
-// RUN: FileCheck %s < %t/html/AnotherNamespace/index.html -check-prefix=MUSTACHE-ANOTHER-INDEX
-// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/_ZTVN16PrimaryNamespace15NestedNamespace22ClassInNestedNamespaceE.html -check-prefix=MUSTACHE-NESTED-CLASS-LINE
-// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/_ZTVN16PrimaryNamespace15NestedNamespace22ClassInNestedNamespaceE.html -check-prefix=MUSTACHE-NESTED-CLASS
-// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/index.html -check-prefix=MUSTACHE-NESTED-INDEX-LINE
-// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/index.html -check-prefix=MUSTACHE-NESTED-INDEX
-// RUN: FileCheck %s < %t/html/PrimaryNamespace/index.html -check-prefix=MUSTACHE-PRIMARY-INDEX-LINE
-// RUN: FileCheck %s < %t/html/PrimaryNamespace/index.html -check-prefix=MUSTACHE-PRIMARY-INDEX
-// RUN: FileCheck %s < %t/html/PrimaryNamespace/_ZTVN16PrimaryNamespace23ClassInPrimaryNamespaceE.html -check-prefix=MUSTACHE-PRIMARY-CLASS-LINE
-// RUN: FileCheck %s < %t/html/PrimaryNamespace/_ZTVN16PrimaryNamespace23ClassInPrimaryNamespaceE.html -check-prefix=MUSTACHE-PRIMARY-CLASS
+// RUN: FileCheck %s < %t/html/@nonymous_namespace/_ZTVN12_GLOBAL__N_19AnonClassE.html -check-prefix=HTML-ANON-CLASS-LINE
+// RUN: FileCheck %s < %t/html/@nonymous_namespace/_ZTVN12_GLOBAL__N_19AnonClassE.html -check-prefix=HTML-ANON-CLASS
+// RUN: FileCheck %s < %t/html/@nonymous_namespace/index.html -check-prefix=HTML-ANON-INDEX-LINE
+// RUN: FileCheck %s < %t/html/@nonymous_namespace/index.html -check-prefix=HTML-ANON-INDEX
+// RUN: FileCheck %s < %t/html/AnotherNamespace/_ZTVN16AnotherNamespace23ClassInAnotherNamespaceE.html -check-prefix=HTML-ANOTHER-CLASS-LINE
+// RUN: FileCheck %s < %t/html/AnotherNamespace/_ZTVN16AnotherNamespace23ClassInAnotherNamespaceE.html -check-prefix=HTML-ANOTHER-CLASS
+// RUN: FileCheck %s < %t/html/AnotherNamespace/index.html -check-prefix=HTML-ANOTHER-INDEX-LINE
+// RUN: FileCheck %s < %t/html/AnotherNamespace/index.html -check-prefix=HTML-ANOTHER-INDEX
+// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/_ZTVN16PrimaryNamespace15NestedNamespace22ClassInNestedNamespaceE.html -check-prefix=HTML-NESTED-CLASS-LINE
+// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/_ZTVN16PrimaryNamespace15NestedNamespace22ClassInNestedNamespaceE.html -check-prefix=HTML-NESTED-CLASS
+// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/index.html -check-prefix=HTML-NESTED-INDEX-LINE
+// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/index.html -check-prefix=HTML-NESTED-INDEX
+// RUN: FileCheck %s < %t/html/PrimaryNamespace/index.html -check-prefix=HTML-PRIMARY-INDEX-LINE
+// RUN: FileCheck %s < %t/html/PrimaryNamespace/index.html -check-prefix=HTML-PRIMARY-INDEX
+// RUN: FileCheck %s < %t/html/PrimaryNamespace/_ZTVN16PrimaryNamespace23ClassInPrimaryNamespaceE.html -check-prefix=HTML-PRIMARY-CLASS-LINE
+// RUN: FileCheck %s < %t/html/PrimaryNamespace/_ZTVN16PrimaryNamespace23ClassInPrimaryNamespaceE.html -check-prefix=HTML-PRIMARY-CLASS
 
 // COM: FIXME: Add global functions to the namespace template
 // COM: FIXME: Add namespaces to the namespace template
@@ -63,17 +45,14 @@
 namespace {
 
 void anonFunction() {}
 // MD-ANON-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]*
-// HTML-ANON-INDEX-LINE:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

-// MUSTACHE-ANON-INDEX-LINE-NOT:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

+// HTML-ANON-INDEX-LINE-NOT:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

class AnonClass {}; // MD-ANON-CLASS-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]* // HTML-ANON-CLASS-LINE:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

-// MUSTACHE-ANON-CLASS-LINE:

Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

 // MD-ANON-CLASS: # class AnonClass
 
-// HTML-ANON-CLASS:

class AnonClass

-// MUSTACHE-ANON-CLASS:

class AnonClass

+// HTML-ANON-CLASS:

class AnonClass

} // namespace // MD-ANON-INDEX: # namespace @nonymous_namespace @@ -84,69 +63,51 @@ class AnonClass {}; // MD-ANON-INDEX: ### anonFunction // MD-ANON-INDEX: *void anonFunction()* -// HTML-ANON-INDEX:

namespace @nonymous_namespace

-// HTML-ANON-INDEX:

Anonymous Namespace

-// HTML-ANON-INDEX:

Records

-// HTML-ANON-INDEX: AnonClass -// HTML-ANON-INDEX:

Functions

-// HTML-ANON-INDEX:

anonFunction

-// HTML-ANON-INDEX:

void anonFunction()

- -// MUSTACHE-ANON-INDEX:

@nonymous_namespace

-// MUSTACHE-ANON-INDEX:

Inner Classes

-// MUSTACHE-ANON-INDEX:
    -// MUSTACHE-ANON-INDEX:
  • -// MUSTACHE-ANON-INDEX: -// MUSTACHE-ANON-INDEX:
    class AnonClass
    -// MUSTACHE-ANON-INDEX:
    -// MUSTACHE-ANON-INDEX:
  • -// MUSTACHE-ANON-INDEX-NOT:

    Functions

    -// MUSTACHE-ANON-INDEX-NOT:

    anonFunction

    -// MUSTACHE-ANON-INDEX-NOT:

    void anonFunction()

    +// HTML-ANON-INDEX:

    @nonymous_namespace

    +// HTML-ANON-INDEX:

    Inner Classes

    +// HTML-ANON-INDEX:
      +// HTML-ANON-INDEX:
    • +// HTML-ANON-INDEX: +// HTML-ANON-INDEX:
      class AnonClass
      +// HTML-ANON-INDEX:
      +// HTML-ANON-INDEX:
    • +// HTML-ANON-INDEX-NOT:

      Functions

      +// HTML-ANON-INDEX-NOT:

      anonFunction

      +// HTML-ANON-INDEX-NOT:

      void anonFunction()

 // Primary Namespace
 namespace PrimaryNamespace {
 // Function in PrimaryNamespace
 void functionInPrimaryNamespace() {}
 // MD-PRIMARY-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]*
-// HTML-PRIMARY-INDEX-LINE:

      Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      -// MUSTACHE-PRIMARY-INDEX-LINE-NOT:

      Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      +// HTML-PRIMARY-INDEX-LINE-NOT:

      Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

 // Class in PrimaryNamespace
 class ClassInPrimaryNamespace {};
 // MD-PRIMARY-CLASS-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]*
 // HTML-PRIMARY-CLASS-LINE:

      Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      -// MUSTACHE-PRIMARY-CLASS-LINE:

      Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

 // MD-PRIMARY-CLASS: # class ClassInPrimaryNamespace
 // MD-PRIMARY-CLASS: Class in PrimaryNamespace
 
-// HTML-PRIMARY-CLASS:

      class ClassInPrimaryNamespace

      -// HTML-PRIMARY-CLASS:

      Class in PrimaryNamespace

      - -// MUSTACHE-PRIMARY-CLASS:

      class ClassInPrimaryNamespace

      +// HTML-PRIMARY-CLASS:

      class ClassInPrimaryNamespace

 // Nested namespace
 namespace NestedNamespace {
 // Function in NestedNamespace
 void functionInNestedNamespace() {}
 // MD-NESTED-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]*
-// HTML-NESTED-INDEX-LINE:

      Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      -// MUSTACHE-NESTED-INDEX-LINE-NOT:

      Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      +// HTML-NESTED-INDEX-LINE-NOT:

      Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      // Class in NestedNamespace class ClassInNestedNamespace {}; // MD-NESTED-CLASS-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]* // HTML-NESTED-CLASS-LINE:

      Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      -// MUSTACHE-NESTED-CLASS-LINE:

      Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      // MD-NESTED-CLASS: # class ClassInNestedNamespace // MD-NESTED-CLASS: Class in NestedNamespace -// HTML-NESTED-CLASS:

      class ClassInNestedNamespace

      -// HTML-NESTED-CLASS:

      Class in NestedNamespace

      - -// MUSTACHE-NESTED-CLASS:

      class ClassInNestedNamespace

      +// HTML-NESTED-CLASS:

      class ClassInNestedNamespace

      } // namespace NestedNamespace // MD-NESTED-INDEX: # namespace NestedNamespace @@ -158,28 +119,19 @@ class ClassInNestedNamespace {}; // MD-NESTED-INDEX: *void functionInNestedNamespace()* // MD-NESTED-INDEX: Function in NestedNamespace -// HTML-NESTED-INDEX:

      namespace NestedNamespace

      -// HTML-NESTED-INDEX:

      Nested namespace

      -// HTML-NESTED-INDEX:

      Records

      -// HTML-NESTED-INDEX: ClassInNestedNamespace -// HTML-NESTED-INDEX:

      Functions

      -// HTML-NESTED-INDEX:

      functionInNestedNamespace

      -// HTML-NESTED-INDEX:

      void functionInNestedNamespace()

      -// HTML-NESTED-INDEX:

      Function in NestedNamespace

      - -// MUSTACHE-NESTED-INDEX:

      NestedNamespace

      -// MUSTACHE-NESTED-INDEX:

      Inner Classes

      -// MUSTACHE-NESTED-INDEX: -// MUSTACHE-NESTED-INDEX-NOT:

      Functions

      -// MUSTACHE-NESTED-INDEX-NOT:

      functionInNestedNamespace

      -// MUSTACHE-NESTED-INDEX-NOT:

      void functionInNestedNamespace()

      -// MUSTACHE-NESTED-INDEX-NOT:

      Function in NestedNamespace

      +// HTML-NESTED-INDEX:

      NestedNamespace

      +// HTML-NESTED-INDEX:

      Inner Classes

      +// HTML-NESTED-INDEX: +// HTML-NESTED-INDEX-NOT:

      Functions

      +// HTML-NESTED-INDEX-NOT:

      functionInNestedNamespace

      +// HTML-NESTED-INDEX-NOT:

      void functionInNestedNamespace()

      +// HTML-NESTED-INDEX-NOT:

      Function in NestedNamespace

      } // namespace PrimaryNamespace // MD-PRIMARY-INDEX: # namespace PrimaryNamespace @@ -193,54 +145,38 @@ class ClassInNestedNamespace {}; // MD-PRIMARY-INDEX: *void functionInPrimaryNamespace()* // MD-PRIMARY-INDEX: Function in PrimaryNamespace -// HTML-PRIMARY-INDEX:

      namespace PrimaryNamespace

      -// HTML-PRIMARY-INDEX:

      Primary Namespace

      -// HTML-PRIMARY-INDEX:

      Namespaces

      -// HTML-PRIMARY-INDEX: NestedNamespace -// HTML-PRIMARY-INDEX:

      Records

      -// HTML-PRIMARY-INDEX: ClassInPrimaryNamespace -// HTML-PRIMARY-INDEX:

      Functions

      -// HTML-PRIMARY-INDEX:

      functionInPrimaryNamespace

      -// HTML-PRIMARY-INDEX:

      void functionInPrimaryNamespace()

      -// HTML-PRIMARY-INDEX:

      Function in PrimaryNamespace

      - -// MUSTACHE-PRIMARY-INDEX:

      PrimaryNamespace

      -// MUSTACHE-PRIMARY-INDEX-NOT:

      Namespaces

      -// MUSTACHE-PRIMARY-INDEX-NOT: NestedNamespace -// MUSTACHE-PRIMARY-INDEX

      Inner Classes

      -// MUSTACHE-PRIMARY-INDEX -// MUSTACHE-PRIMARY-INDEX-NOT:

      Functions

      -// MUSTACHE-PRIMARY-INDEX-NOT:

      functionInPrimaryNamespace

      -// MUSTACHE-PRIMARY-INDEX-NOT:

      void functionInPrimaryNamespace()

      -// MUSTACHE-PRIMARY-INDEX-NOT:

      Function in PrimaryNamespace

      +// HTML-PRIMARY-INDEX:

      PrimaryNamespace

      +// HTML-PRIMARY-INDEX-NOT:

      Namespaces

      +// HTML-PRIMARY-INDEX-NOT: NestedNamespace +// HTML-PRIMARY-INDEX

      Inner Classes

      +// HTML-PRIMARY-INDEX +// HTML-PRIMARY-INDEX-NOT:

      Functions

      +// HTML-PRIMARY-INDEX-NOT:

      functionInPrimaryNamespace

      +// HTML-PRIMARY-INDEX-NOT:

      void functionInPrimaryNamespace()

      +// HTML-PRIMARY-INDEX-NOT:

      Function in PrimaryNamespace

      // AnotherNamespace namespace AnotherNamespace { // Function in AnotherNamespace void functionInAnotherNamespace() {} // MD-ANOTHER-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]* -// HTML-ANOTHER-INDEX-LINE:

      Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      -// MUSTACHE-ANOTHER-INDEX-LINE-NOT:

      Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      +// HTML-ANOTHER-INDEX-LINE-NOT:

      Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      // Class in AnotherNamespace class ClassInAnotherNamespace {}; // MD-ANOTHER-CLASS-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]* // HTML-ANOTHER-CLASS-LINE:

      Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      -// MUSTACHE-ANOTHER-CLASS-LINE:

      Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      // MD-ANOTHER-CLASS: # class ClassInAnotherNamespace // MD-ANOTHER-CLASS: Class in AnotherNamespace -// HTML-ANOTHER-CLASS:

      class ClassInAnotherNamespace

      -// HTML-ANOTHER-CLASS:

      Class in AnotherNamespace

      - -// MUSTACHE-ANOTHER-CLASS:

      class ClassInAnotherNamespace

      +// HTML-ANOTHER-CLASS:

      class ClassInAnotherNamespace

      } // namespace AnotherNamespace @@ -253,120 +189,27 @@ class ClassInAnotherNamespace {}; // MD-ANOTHER-INDEX: *void functionInAnotherNamespace()* // MD-ANOTHER-INDEX: Function in AnotherNamespace -// HTML-ANOTHER-INDEX:

      namespace AnotherNamespace

      -// HTML-ANOTHER-INDEX:

      AnotherNamespace

      -// HTML-ANOTHER-INDEX:

      Records

      -// HTML-ANOTHER-INDEX: ClassInAnotherNamespace -// HTML-ANOTHER-INDEX:

      Functions

      -// HTML-ANOTHER-INDEX:

      functionInAnotherNamespace

      -// HTML-ANOTHER-INDEX:

      void functionInAnotherNamespace()

      -// HTML-ANOTHER-INDEX:

      Function in AnotherNamespace

      - -// MUSTACHE-ANOTHER-INDEX:

      AnotherNamespace

      -// MUSTACHE-ANOTHER-INDEX:

      Inner Classes

      -// MUSTACHE-ANOTHER-INDEX: -// MUSTACHE-ANOTHER-INDEX-NOT:

      Functions

      -// MUSTACHE-ANOTHER-INDEX-NOT:

      functionInAnotherNamespace

      -// MUSTACHE-ANOTHER-INDEX-NOT:

      void functionInAnotherNamespace()

      -// MUSTACHE-ANOTHER-INDEX-NOT:

      Function in AnotherNamespace

      - -// JSON-INDEX: async function LoadIndex() { -// JSON-INDEX-NEXT: return{ -// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}", -// JSON-INDEX-NEXT: "Name": "", -// JSON-INDEX-NEXT: "RefType": "default", -// JSON-INDEX-NEXT: "Path": "", -// JSON-INDEX-NEXT: "Children": [ -// JSON-INDEX-NEXT: { -// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}", -// JSON-INDEX-NEXT: "Name": "@nonymous_namespace", -// JSON-INDEX-NEXT: "RefType": "namespace", -// JSON-INDEX-NEXT: "Path": "@nonymous_namespace", -// JSON-INDEX-NEXT: "Children": [ -// JSON-INDEX-NEXT: { -// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}", -// JSON-INDEX-NEXT: "Name": "AnonClass", -// JSON-INDEX-NEXT: "RefType": "record", -// JSON-INDEX-NEXT: "Path": "@nonymous_namespace", -// JSON-INDEX-NEXT: "Children": [] -// JSON-INDEX-NEXT: } -// JSON-INDEX-NEXT: ] -// JSON-INDEX-NEXT: }, -// JSON-INDEX-NEXT: { -// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}", -// JSON-INDEX-NEXT: "Name": "AnotherNamespace", -// JSON-INDEX-NEXT: "RefType": "namespace", -// JSON-INDEX-NEXT: "Path": "AnotherNamespace", -// JSON-INDEX-NEXT: "Children": [ -// JSON-INDEX-NEXT: { -// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}", -// JSON-INDEX-NEXT: "Name": "ClassInAnotherNamespace", -// JSON-INDEX-NEXT: "RefType": "record", -// JSON-INDEX-NEXT: "Path": "AnotherNamespace", -// JSON-INDEX-NEXT: "Children": [] -// JSON-INDEX-NEXT: } -// JSON-INDEX-NEXT: ] -// JSON-INDEX-NEXT: }, -// JSON-INDEX-NEXT: { -// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}", -// JSON-INDEX-NEXT: "Name": "GlobalNamespace", -// JSON-INDEX-NEXT: "RefType": "namespace", -// JSON-INDEX-NEXT: "Path": "GlobalNamespace", -// JSON-INDEX-NEXT: "Children": [] -// JSON-INDEX-NEXT: }, -// JSON-INDEX-NEXT: { -// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}", -// JSON-INDEX-NEXT: "Name": "PrimaryNamespace", -// JSON-INDEX-NEXT: "RefType": "namespace", -// JSON-INDEX-NEXT: "Path": "PrimaryNamespace", -// JSON-INDEX-NEXT: "Children": [ -// JSON-INDEX-NEXT: { -// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}", -// JSON-INDEX-NEXT: "Name": "ClassInPrimaryNamespace", -// JSON-INDEX-NEXT: "RefType": "record", -// JSON-INDEX-NEXT: "Path": "PrimaryNamespace", -// JSON-INDEX-NEXT: "Children": [] -// JSON-INDEX-NEXT: }, -// JSON-INDEX-NEXT: { -// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}", -// JSON-INDEX-NEXT: "Name": "NestedNamespace", -// JSON-INDEX-NEXT: "RefType": "namespace", -// JSON-INDEX-NEXT: "Path": "PrimaryNamespace{{[\/]+}}NestedNamespace", -// JSON-INDEX-NEXT: "Children": [ -// JSON-INDEX-NEXT: { -// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}", -// JSON-INDEX-NEXT: "Name": "ClassInNestedNamespace", -// JSON-INDEX-NEXT: "RefType": "record", -// JSON-INDEX-NEXT: "Path": "PrimaryNamespace{{[\/]+}}NestedNamespace", -// JSON-INDEX-NEXT: "Children": [] -// JSON-INDEX-NEXT: } -// JSON-INDEX-NEXT: ] -// JSON-INDEX-NEXT: } -// JSON-INDEX-NEXT: ] -// JSON-INDEX-NEXT: } -// JSON-INDEX-NEXT: ] -// JSON-INDEX-NEXT: }; -// JSON-INDEX-NEXT: } - -// HTML-GLOBAL-INDEX:
      -// HTML-GLOBAL-INDEX:

      Global Namespace

      -// HTML-GLOBAL-INDEX:

      Namespaces

      -// HTML-GLOBAL-INDEX:
    • @nonymous_namespace
    • -// HTML-GLOBAL-INDEX:
    • AnotherNamespace
    • -// HTML-GLOBAL-INDEX:
    • PrimaryNamespace
    • - -// MUSTACHE-GLOBAL-INDEX:
      -// MUSTACHE-GLOBAL-INDEX:

      Global Namespace

      -// MUSTACHE-GLOBAL-INDEX:

      Namespaces

      -// MUSTACHE-GLOBAL-INDEX:
    • @nonymous_namespace
    • -// MUSTACHE-GLOBAL-INDEX:
    • AnotherNamespace
    • -// MUSTACHE-GLOBAL-INDEX:
    • PrimaryNamespace
    • +// HTML-ANOTHER-INDEX:

      AnotherNamespace

      +// HTML-ANOTHER-INDEX:

      Inner Classes

      +// HTML-ANOTHER-INDEX: +// HTML-ANOTHER-INDEX-NOT:

      Functions

      +// HTML-ANOTHER-INDEX-NOT:

      functionInAnotherNamespace

      +// HTML-ANOTHER-INDEX-NOT:

      void functionInAnotherNamespace()

      +// HTML-ANOTHER-INDEX-NOT:

      Function in AnotherNamespace

      + +// COM: FIXME: Add namespaces to namespace template +// HTML-GLOBAL-INDEX-NOT:
      +// HTML-GLOBAL-INDEX-NOT:

      Global Namespace

      +// HTML-GLOBAL-INDEX-NOT:

      Namespaces

      +// HTML-GLOBAL-INDEX-NOT:
    • @nonymous_namespace
    • +// HTML-GLOBAL-INDEX-NOT:
    • AnotherNamespace
    • +// HTML-GLOBAL-INDEX-NOT:
    • PrimaryNamespace
    • // MD-GLOBAL-INDEX: # Global Namespace // MD-GLOBAL-INDEX: ## Namespaces diff --git a/clang-tools-extra/test/clang-doc/test-path-abs.cpp b/clang-tools-extra/test/clang-doc/test-path-abs.cpp deleted file mode 100644 index 8875a3a73ab7e..0000000000000 --- a/clang-tools-extra/test/clang-doc/test-path-abs.cpp +++ /dev/null @@ -1,7 +0,0 @@ -// RUN: rm -rf %t && mkdir -p %t -// RUN: clang-doc --format=html --executor=standalone %s --output=%t --base base_dir -// RUN: FileCheck %s -input-file=%t/index_json.js -check-prefix=JSON-INDEX - -// JSON-INDEX: var RootPath = "{{.*}}test-path-abs.cpp.tmp"; -// JSON-INDEX-NEXT: var Base = "base_dir"; - diff --git a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt index 18166acf9bbca..01b34ec9a791e 100644 --- a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt +++ b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt @@ -26,7 +26,6 @@ add_extra_unittest(ClangDocTests ClangDocTest.cpp GeneratorTest.cpp HTMLGeneratorTest.cpp - HTMLMustacheGeneratorTest.cpp MDGeneratorTest.cpp MergeTest.cpp SerializeTest.cpp diff --git a/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp b/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp index 2fe443d9db5c5..cf510afe214dd 100644 --- a/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp +++ b/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp @@ -9,17 +9,22 @@ #include "ClangDocTest.h" #include "Generators.h" #include "Representation.h" +#include "config.h" +#include "support/Utils.h" #include "clang/Basic/Version.h" +#include "llvm/Testing/Support/Error.h" +#include "gmock/gmock.h" #include "gtest/gtest.h" -namespace clang { -namespace doc { +using namespace llvm; +using namespace testing; +using namespace clang; +using namespace clang::doc; -static const std::string ClangDocVersion = - clang::getClangToolFullVersion("clang-doc"); +static const std::string ClangDocVersion = getClangToolFullVersion("clang-doc"); static std::unique_ptr getHTMLGenerator() { - auto G = doc::findGeneratorByName("html"); + auto G = findGeneratorByName("html"); if (!G) return nullptr; return std::move(G.get()); @@ -50,454 +55,10 @@ class HTMLGeneratorTest : public ClangDocContextTest { } }; -TEST_F(HTMLGeneratorTest, emitNamespaceHTML) { - NamespaceInfo I; - I.Name = "Namespace"; - I.Namespace.emplace_back(EmptySID, "A", InfoType::IT_namespace); - - I.Children.Namespaces.emplace_back(EmptySID, "ChildNamespace", - InfoType::IT_namespace, - "Namespace::ChildNamespace", "Namespace"); - I.Children.Records.emplace_back(EmptySID, "ChildStruct", InfoType::IT_record, - "Namespace::ChildStruct", "Namespace"); - I.Children.Functions.emplace_back(); - I.Children.Functions.back().Access = AccessSpecifier::AS_none; - I.Children.Functions.back().Name = "OneFunction"; - I.Children.Enums.emplace_back(); - I.Children.Enums.back().Name = "OneEnum"; - - auto G = getHTMLGenerator(); - assert(G); - std::string Buffer; - llvm::raw_string_ostream Actual(Buffer); - ClangDocContext CDCtx = getClangDocContext({"user-provided-stylesheet.css"}); - auto Err = G->generateDocForInfo(&I, Actual, CDCtx); - assert(!Err); - std::string Expected = R"raw( - -namespace Namespace - - - - -
diff --git a/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp b/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp
index 2fe443d9db5c5..cf510afe214dd 100644
--- a/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp
@@ -9,17 +9,22 @@
 #include "ClangDocTest.h"
 #include "Generators.h"
 #include "Representation.h"
+#include "config.h"
+#include "support/Utils.h"
 #include "clang/Basic/Version.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
-namespace clang {
-namespace doc {
+using namespace llvm;
+using namespace testing;
+using namespace clang;
+using namespace clang::doc;
 
-static const std::string ClangDocVersion =
-    clang::getClangToolFullVersion("clang-doc");
+static const std::string ClangDocVersion = getClangToolFullVersion("clang-doc");
 
 static std::unique_ptr<Generator> getHTMLGenerator() {
-  auto G = doc::findGeneratorByName("html");
+  auto G = findGeneratorByName("html");
   if (!G)
     return nullptr;
   return std::move(G.get());
@@ -50,454 +55,10 @@ class HTMLGeneratorTest : public ClangDocContextTest {
   }
 };
 
-TEST_F(HTMLGeneratorTest, emitNamespaceHTML) {
-  NamespaceInfo I;
-  I.Name = "Namespace";
-  I.Namespace.emplace_back(EmptySID, "A", InfoType::IT_namespace);
-
-  I.Children.Namespaces.emplace_back(EmptySID, "ChildNamespace",
-                                     InfoType::IT_namespace,
-                                     "Namespace::ChildNamespace", "Namespace");
-  I.Children.Records.emplace_back(EmptySID, "ChildStruct", InfoType::IT_record,
-                                  "Namespace::ChildStruct", "Namespace");
-  I.Children.Functions.emplace_back();
-  I.Children.Functions.back().Access = AccessSpecifier::AS_none;
-  I.Children.Functions.back().Name = "OneFunction";
-  I.Children.Enums.emplace_back();
-  I.Children.Enums.back().Name = "OneEnum";
-
-  auto G = getHTMLGenerator();
-  assert(G);
-  std::string Buffer;
-  llvm::raw_string_ostream Actual(Buffer);
-  ClangDocContext CDCtx = getClangDocContext({"user-provided-stylesheet.css"});
-  auto Err = G->generateDocForInfo(&I, Actual, CDCtx);
-  assert(!Err);
-  std::string Expected = R"raw(
-namespace Namespace
-test-project
-namespace Namespace
-Namespaces
-Records
-Functions
-OneFunction
-OneFunction()
-Enums
-enum OneEnum
-)raw" +
-                         ClangDocVersion + R"raw(
-)raw";
-
-  EXPECT_EQ(Expected, Actual.str());
-}
-
-TEST_F(HTMLGeneratorTest, emitRecordHTML) {
-  RecordInfo I;
-  I.Name = "r";
-  I.Path = "X/Y/Z";
-  I.Namespace.emplace_back(EmptySID, "A", InfoType::IT_namespace);
-
-  I.DefLoc = Location(10, 10, "dir/test.cpp", true);
-  I.Loc.emplace_back(12, 12, "test.cpp");
-
-  SmallString<16> PathTo;
-  llvm::sys::path::native("path/to", PathTo);
-  I.Members.emplace_back(TypeInfo("int"), "X", AccessSpecifier::AS_private);
-  I.TagType = TagTypeKind::Class;
-  I.Parents.emplace_back(EmptySID, "F", InfoType::IT_record, "F", PathTo);
-  I.VirtualParents.emplace_back(EmptySID, "G", InfoType::IT_record);
-
-  I.Children.Records.emplace_back(EmptySID, "ChildStruct", InfoType::IT_record,
-                                  "X::Y::Z::r::ChildStruct", "X/Y/Z/r");
-  I.Children.Functions.emplace_back();
-  I.Children.Functions.back().Name = "OneFunction";
-  I.Children.Enums.emplace_back();
-  I.Children.Enums.back().Name = "OneEnum";
-
-  auto G = getHTMLGenerator();
-  assert(G);
-  std::string Buffer;
-  llvm::raw_string_ostream Actual(Buffer);
-  ClangDocContext CDCtx = getClangDocContext({}, "http://www.repository.com");
-  auto Err = G->generateDocForInfo(&I, Actual, CDCtx);
-  assert(!Err);
-  std::string Expected = R"raw(
-class r
-test-project
-class r
-Defined at line 10 of file test.cpp
-Inherits from F, G
-Members
-private int X
-Records
-Functions
-OneFunction
-public OneFunction()
-Enums
-enum OneEnum
-)raw" +
-                         ClangDocVersion + R"raw(
-)raw";
-
-  EXPECT_EQ(Expected, Actual.str());
-}
-
-TEST_F(HTMLGeneratorTest, emitFunctionHTML) {
-  FunctionInfo I;
-  I.Name = "f";
-  I.Namespace.emplace_back(EmptySID, "A", InfoType::IT_namespace);
-
-  I.DefLoc = Location(10, 10, "dir/test.cpp", true);
-  I.Loc.emplace_back(12, 12, "test.cpp");
-
-  I.Access = AccessSpecifier::AS_none;
-
-  SmallString<16> PathTo;
-  llvm::sys::path::native("path/to", PathTo);
-  I.ReturnType = TypeInfo(
-      Reference(EmptySID, "float", InfoType::IT_default, "float", PathTo));
-  I.Params.emplace_back(TypeInfo("int", PathTo), "P");
-  I.IsMethod = true;
-  I.Parent = Reference(EmptySID, "Parent", InfoType::IT_record);
-
-  auto G = getHTMLGenerator();
-  assert(G);
-  std::string Buffer;
-  llvm::raw_string_ostream Actual(Buffer);
-  ClangDocContext CDCtx = getClangDocContext({}, "https://www.repository.com");
-  auto Err = G->generateDocForInfo(&I, Actual, CDCtx);
-  assert(!Err);
-  std::string Expected = R"raw(
-test-project
-f
-float f(int P)
-Defined at line 10 of file test.cpp
-)raw" +
-                         ClangDocVersion + R"raw(
-)raw";
-
-  EXPECT_EQ(Expected, Actual.str());
-}
-
-TEST_F(HTMLGeneratorTest, emitEnumHTML) {
-  EnumInfo I;
-  I.Name = "e";
-  I.Namespace.emplace_back(EmptySID, "A", InfoType::IT_namespace);
-
-  I.DefLoc = Location(10, 10, "test.cpp", true);
-  I.Loc.emplace_back(12, 12, "test.cpp");
-
-  I.Members.emplace_back("X");
-  I.Scoped = true;
-
+TEST_F(HTMLGeneratorTest, createResources) {
   auto G = getHTMLGenerator();
-  assert(G);
-  std::string Buffer;
-  llvm::raw_string_ostream Actual(Buffer);
-  ClangDocContext CDCtx = getClangDocContext({}, "www.repository.com");
-  auto Err = G->generateDocForInfo(&I, Actual, CDCtx);
-  assert(!Err);
-  std::string Expected = R"raw(
-test-project
-enum class e
-X0
-Defined at line 10 of file test.cpp
-)raw" +
-                         ClangDocVersion + R"raw(
-)raw";
-
-  EXPECT_EQ(Expected, Actual.str());
-}
-
-TEST_F(HTMLGeneratorTest, emitCommentHTML) {
-  FunctionInfo I;
-  I.Name = "f";
-  I.DefLoc = Location(10, 10, "test.cpp", true);
-  I.ReturnType = TypeInfo("void");
-  I.Params.emplace_back(TypeInfo("int"), "I");
-  I.Params.emplace_back(TypeInfo("int"), "J");
-  I.Access = AccessSpecifier::AS_none;
-
-  CommentInfo Top;
-  Top.Kind = CommentKind::CK_FullComment;
-
-  Top.Children.emplace_back(std::make_unique<CommentInfo>());
-  CommentInfo *BlankLine = Top.Children.back().get();
-  BlankLine->Kind = CommentKind::CK_ParagraphComment;
-  BlankLine->Children.emplace_back(std::make_unique<CommentInfo>());
-  BlankLine->Children.back()->Kind = CommentKind::CK_TextComment;
-
-  Top.Children.emplace_back(std::make_unique<CommentInfo>());
-  CommentInfo *Brief = Top.Children.back().get();
-  Brief->Kind = CommentKind::CK_ParagraphComment;
-  Brief->Children.emplace_back(std::make_unique<CommentInfo>());
-  Brief->Children.back()->Kind = CommentKind::CK_TextComment;
-  Brief->Children.back()->Name = "ParagraphComment";
-  Brief->Children.back()->Text = " Brief description.";
-
-  Top.Children.emplace_back(std::make_unique<CommentInfo>());
-  CommentInfo *Extended = Top.Children.back().get();
-  Extended->Kind = CommentKind::CK_ParagraphComment;
-  Extended->Children.emplace_back(std::make_unique<CommentInfo>());
-  Extended->Children.back()->Kind = CommentKind::CK_TextComment;
-  Extended->Children.back()->Text = " Extended description that";
-  Extended->Children.emplace_back(std::make_unique<CommentInfo>());
-  Extended->Children.back()->Kind = CommentKind::CK_TextComment;
-  Extended->Children.back()->Text = " continues onto the next line.";
-
-  Top.Children.emplace_back(std::make_unique<CommentInfo>());
-  CommentInfo *Entities = Top.Children.back().get();
-  Entities->Kind = CommentKind::CK_ParagraphComment;
-  Entities->Children.emplace_back(std::make_unique<CommentInfo>());
-  Entities->Children.back()->Kind = CommentKind::CK_TextComment;
-  Entities->Children.back()->Name = "ParagraphComment";
-  Entities->Children.back()->Text =
-      " Comment with html entities: &, <, >, \", \'.";
-
-  I.Description.emplace_back(std::move(Top));
-
-  auto G = getHTMLGenerator();
-  assert(G);
-  std::string Buffer;
-  llvm::raw_string_ostream Actual(Buffer);
+  ASSERT_THAT(G, NotNull()) << "Could not find HTMLGenerator";
   ClangDocContext CDCtx = getClangDocContext();
-  auto Err = G->generateDocForInfo(&I, Actual, CDCtx);
-  assert(!Err);
-  std::string Expected = R"raw(
-test-project
-f
-void f(int I, int J)
-Defined at line 10 of file test.cpp
-Brief description.
-Extended description that continues onto the next line.
-Comment with html entities: &, <, >, ", '.
-)raw" +
-                         ClangDocVersion + R"raw(
-)raw";
-
-  EXPECT_EQ(Expected, Actual.str());
+  EXPECT_THAT_ERROR(G->createResources(CDCtx), Failed())
+      << "Empty UserStylesheets or JsScripts should fail!";
 }
-
-} // namespace doc
-} // namespace clang
diff --git a/clang-tools-extra/unittests/clang-doc/HTMLMustacheGeneratorTest.cpp b/clang-tools-extra/unittests/clang-doc/HTMLMustacheGeneratorTest.cpp
deleted file mode 100644
index 7bbd299d7b59c..0000000000000
--- a/clang-tools-extra/unittests/clang-doc/HTMLMustacheGeneratorTest.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-//===-- clang-doc/HTMLMustacheGeneratorTest.cpp ---------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "ClangDocTest.h"
-#include "Generators.h"
-#include "Representation.h"
-#include "config.h"
-#include "support/Utils.h"
-#include "clang/Basic/Version.h"
-#include "llvm/Testing/Support/Error.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-
-using namespace llvm;
-using namespace testing;
-using namespace clang;
-using namespace clang::doc;
-
-// FIXME: Don't enable unit tests that can read files. Remove once we can use
-// lit to test these properties.
-#define ENABLE_LOCAL_TEST 0
-
-static const std::string ClangDocVersion = getClangToolFullVersion("clang-doc");
-
-static std::unique_ptr<Generator> getHTMLMustacheGenerator() {
-  auto G = findGeneratorByName("mustache");
-  if (!G)
-    return nullptr;
-  return std::move(G.get());
-}
-
-class HTMLMustacheGeneratorTest : public ClangDocContextTest {
-protected:
-  ClangDocContext
-  getClangDocContext(std::vector<std::string> UserStylesheets = {},
-                     StringRef RepositoryUrl = "",
-                     StringRef RepositoryLinePrefix = "", StringRef Base = "") {
-    ClangDocContext CDCtx{nullptr,
-                          "test-project",
-                          false,
-                          "",
-                          "",
-                          RepositoryUrl,
-                          RepositoryLinePrefix,
-                          Base,
-                          UserStylesheets,
-                          Diags,
-                          false};
-    CDCtx.UserStylesheets.insert(CDCtx.UserStylesheets.begin(), "");
-    CDCtx.JsScripts.emplace_back("");
-    return CDCtx;
-  }
-};
-
-TEST_F(HTMLMustacheGeneratorTest, createResources) {
-  auto G = getHTMLMustacheGenerator();
-  ASSERT_THAT(G, NotNull()) << "Could not find HTMLMustacheGenerator";
-  ClangDocContext CDCtx = getClangDocContext();
-  EXPECT_THAT_ERROR(G->createResources(CDCtx), Failed())
-      << "Empty UserStylesheets or JsScripts should fail!";
-}
diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index dc5ae6e83d2d7..ed657fc175ccf 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -1068,6 +1068,12 @@ The matrix type extension supports explicit casts. Implicit type conversion betw
     i = static_cast<matrix_5_5<int>>(d);
   }
 
+The matrix type extension supports column-major and row-major memory layouts,
+but not all builtins are supported with the row-major layout. The layout
+defaults to column-major and can be changed with ``-fmatrix-memory-layout``:
+use ``-fmatrix-memory-layout=column-major`` for the column-major layout and
+``-fmatrix-memory-layout=row-major`` for the row-major layout.
+
 Half-Precision Floating Point
 =============================
diff --git a/clang/docs/MatrixTypes.rst b/clang/docs/MatrixTypes.rst
index b3a2c8cf53670..8701baa2d0f1c 100644
--- a/clang/docs/MatrixTypes.rst
+++ b/clang/docs/MatrixTypes.rst
@@ -287,6 +287,10 @@ part of the draft specification.
 The elements of a value of a matrix type are laid out in column-major order
 without padding.
 
+To change the memory layout to row-major, use the ``-fmatrix-memory-layout``
+flag. It accepts one of two values, ``column-major`` or ``row-major``, and is
+used like so: ``-fmatrix-memory-layout=row-major``.
+
 We propose to provide a Clang option to override this behavior and allow
 contraction of those operations (e.g. *-ffp-contract=matrix*).
 
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index a007f6ea812b4..65ab7128d4420 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -326,6 +326,7 @@ New Compiler Flags
 - New option ``-fsanitize-debug-trap-reasons=`` added to control emitting trap reasons into the debug info when compiling with trapping UBSan (e.g. ``-fsanitize-trap=undefined``).
 - New options for enabling allocation token instrumentation: ``-fsanitize=alloc-token``, ``-falloc-token-max=``, ``-fsanitize-alloc-token-fast-abi``, ``-fsanitize-alloc-token-extended``.
 - The ``-resource-dir`` option is now displayed in the list of options shown by ``--help``.
+- New option ``-fmatrix-memory-layout`` added to control the memory layout of Clang matrix types (e.g. ``-fmatrix-memory-layout=column-major`` or ``-fmatrix-memory-layout=row-major``).
 
 Lanai Support
 ^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index cebc992f778c3..9788a19edbe8d 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -980,45 +980,45 @@ TARGET_BUILTIN(__builtin_amdgcn_image_sample_3d_v4f16_f32, "V4xifffQtV4ibii", "nc", "image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_cube_v4f32_f32, "V4fifffQtV4ibii", "nc", "image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_cube_v4f16_f32, "V4xifffQtV4ibii", "nc", "image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1d_v4f32_f32, "V4fifQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1d_v4f16_f32, "V4eifQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1d_v4f16_f32, "V4xifQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1darray_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1darray_v4f16_f32, "V4eiffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1darray_v4f16_f32, "V4xiffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2d_f32_f32, "fiffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2d_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2d_v4f16_f32, "V4eiffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2d_v4f16_f32, "V4xiffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2darray_f32_f32, "fifffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2darray_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2darray_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2darray_v4f16_f32, "V4xifffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_3d_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_3d_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_3d_v4f16_f32, "V4xifffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_cube_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_cube_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_cube_v4f16_f32, "V4xifffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1d_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1d_v4f16_f32, "V4eiffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1d_v4f16_f32, "V4xiffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1darray_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1darray_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1darray_v4f16_f32, "V4xifffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2d_f32_f32, "fifffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2d_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2d_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2d_v4f16_f32, "V4xifffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2darray_f32_f32, "fiffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2darray_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2darray_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2darray_v4f16_f32, "V4xiffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_3d_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_3d_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_3d_v4f16_f32, "V4xiffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_cube_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_cube_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_cube_v4f16_f32, "V4xiffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1d_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1d_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1d_v4f16_f32, "V4xifffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1darray_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1darray_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1darray_v4f16_f32, "V4xiffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2d_f32_f32, "fiffffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2d_v4f32_f32, "V4fiffffffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2d_v4f16_f32, "V4eiffffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2d_v4f16_f32, "V4xiffffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2darray_f32_f32, "fifffffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2darray_v4f32_f32, "V4fifffffffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2darray_v4f16_f32, "V4eifffffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2darray_v4f16_f32, "V4xifffffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_3d_v4f32_f32, "V4fifffffffffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_3d_v4f16_f32, "V4eifffffffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_3d_v4f16_f32, "V4xifffffffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts")
 
 #undef BUILTIN
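To make the ``-fmatrix-memory-layout`` flag documented above concrete, here is a minimal usage sketch. It is illustrative only: the type alias and function names are not part of this patch, and it assumes the flag spelling added to Options.td below.

```c
/* Build with, e.g.:
 *   clang -fenable-matrix -fmatrix-memory-layout=row-major -c matrix_example.c
 */
typedef float m2x2_t __attribute__((matrix_type(2, 2)));

void add_matrices(const m2x2_t *a, const m2x2_t *b, m2x2_t *out) {
  /* Element-wise arithmetic and indexing are layout-independent; the flag
   * only changes the order in which matrix elements are stored in memory. */
  *out = *a + *b;
}
```

Note that, per the new Sema diagnostic introduced below, an explicit column-major load or store builtin is rejected under ``-fmatrix-memory-layout=row-major`` (and vice versa), so layout-specific builtins must match the selected layout.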
diff --git a/clang/include/clang/Basic/BuiltinsHexagon.td b/clang/include/clang/Basic/BuiltinsHexagon.td
index cf18359e7bf60..00f84cd72a051 100644
--- a/clang/include/clang/Basic/BuiltinsHexagon.td
+++ b/clang/include/clang/Basic/BuiltinsHexagon.td
@@ -2146,3 +2146,69 @@ let Features = HVXV79.Features in {
   def V6_vsub_hf_f8 : HexagonBuiltin<"_Vector<32, int>(_Vector<16, int>, _Vector<16, int>)">;
   def V6_vsub_hf_f8_128B : HexagonBuiltin<"_Vector<64, int>(_Vector<32, int>, _Vector<32, int>)">;
 }
+
+// V81 HVX Instructions.
+let Features = HVXV81.Features in {
+  def V6_vabs_qf16_hf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vabs_qf16_hf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vabs_qf16_qf16 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vabs_qf16_qf16_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vabs_qf32_qf32 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vabs_qf32_qf32_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vabs_qf32_sf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vabs_qf32_sf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_valign4 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, int)">;
+  def V6_valign4_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>, _Vector<32, int>, int)">;
+  def V6_vconv_bf_qf32 : HexagonBuiltin<"_Vector<16, int>(_Vector<32, int>)">;
+  def V6_vconv_bf_qf32_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<64, int>)">;
+  def V6_vconv_f8_qf16 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vconv_f8_qf16_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vconv_h_hf_rnd : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vconv_h_hf_rnd_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vconv_qf16_f8 : HexagonBuiltin<"_Vector<32, int>(_Vector<16, int>)">;
+  def V6_vconv_qf16_f8_128B : HexagonBuiltin<"_Vector<64, int>(_Vector<32, int>)">;
+  def V6_vconv_qf16_hf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vconv_qf16_hf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vconv_qf16_qf16 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vconv_qf16_qf16_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vconv_qf32_qf32 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vconv_qf32_qf32_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vconv_qf32_sf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vconv_qf32_sf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_veqhf : HexagonBuiltin<"_Vector<64, bool>(_Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqhf_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqhf_and : HexagonBuiltin<"_Vector<64, bool>(_Vector<64, bool>, _Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqhf_and_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<128, bool>, _Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqhf_or : HexagonBuiltin<"_Vector<64, bool>(_Vector<64, bool>, _Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqhf_or_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<128, bool>, _Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqhf_xor : HexagonBuiltin<"_Vector<64, bool>(_Vector<64, bool>, _Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqhf_xor_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<128, bool>, _Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqsf : HexagonBuiltin<"_Vector<64, bool>(_Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqsf_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqsf_and : HexagonBuiltin<"_Vector<64, bool>(_Vector<64, bool>, _Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqsf_and_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<128, bool>, _Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqsf_or : HexagonBuiltin<"_Vector<64, bool>(_Vector<64, bool>, _Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqsf_or_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<128, bool>, _Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqsf_xor : HexagonBuiltin<"_Vector<64, bool>(_Vector<64, bool>, _Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqsf_xor_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<128, bool>, _Vector<32, int>, _Vector<32, int>)">;
+  def V6_vilog2_hf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vilog2_hf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vilog2_qf16 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vilog2_qf16_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vilog2_qf32 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vilog2_qf32_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vilog2_sf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vilog2_sf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vneg_qf16_hf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vneg_qf16_hf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vneg_qf16_qf16 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vneg_qf16_qf16_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vneg_qf32_qf32 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vneg_qf32_qf32_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vneg_qf32_sf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vneg_qf32_sf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vsub_hf_mix : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def V6_vsub_hf_mix_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>, _Vector<32, int>)">;
+  def V6_vsub_sf_mix : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def V6_vsub_sf_mix_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>, _Vector<32, int>)">;
+}
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index feb3e9a4afe3b..71aee5038d518 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -1088,27 +1088,27 @@ let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
 }
 
 let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">;
 }
 
 let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">;
 }
 
 let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
-  def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">;
 }
 
 let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">;
 }
 
 let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">;
 }
 
 let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
-  def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">;
 }
 
 let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
@@ -4222,12 +4222,12 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
 }
 
 let Features = "avx10.2", Attributes = [NoThrow, RequiredVectorWidth<512>] in {
-  def vpdpwsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-  def vpdpwsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-  def vpdpwusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-  def vpdpwusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-  def vpdpwuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
-  def vpdpwuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">;
+  def vpdpwsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, unsigned short>)">;
+  def vpdpwsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, unsigned short>)">;
+  def vpdpwusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, short>)">;
+  def vpdpwusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, short>)">;
+  def vpdpwuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, unsigned short>)">;
+  def vpdpwuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, unsigned short>)">;
 }
 
 let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
@@ -4235,51 +4235,51 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
-  def vpdpwsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
-  def vpdpwsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
-  def vpdpwsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
-  def vpdpwsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
-  def vpdpwusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
-  def vpdpwusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
-  def vpdpwusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
-  def vpdpwusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
-  def vpdpwuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
-  def vpdpwuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
-  def vpdpwuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">;
+  def vpdpwuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, unsigned short>)">;
 }
 
 let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
-  def vpdpwuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">;
+  def vpdpwuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
 }
 
 let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
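The BuiltinsX86.td changes above only retype the builtin prototypes that Clang checks internally; the user-facing VNNI intrinsics keep their signatures. As a hedged illustration of the semantics (this uses a standard AVX-VNNI intrinsic, not code introduced by this patch), `vpdpwssd` multiplies pairs of adjacent signed 16-bit elements and accumulates the products into 32-bit lanes, which is why `_Vector<8, short>` describes the operands more accurately than `_Vector<4, int>`:

```c
#include <immintrin.h>

/* Requires AVX-VNNI support, e.g. compile with -mavxvnni. */
__m128i dot_accumulate(__m128i acc, __m128i a, __m128i b) {
  /* For each 32-bit lane i:
   *   acc[i] += a16[2*i] * b16[2*i] + a16[2*i + 1] * b16[2*i + 1]
   * where a16 and b16 view a and b as vectors of signed 16-bit integers. */
  return _mm_dpwssd_epi32(acc, a, b);
}
```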
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 99b01d32ad99c..4c27df568d42f 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -813,6 +813,9 @@ def err_cc1_round_trip_mismatch : Error<
 def err_cc1_unbounded_vscale_min : Error<
   "minimum vscale must be an unsigned integer greater than 0">;
 
+def err_conflicting_matrix_layout_flags: Error<
+  "-fmatrix-memory-layout=%0 conflicts with -mllvm -matrix-default-layout=%1">;
+
 def err_drv_using_omit_rtti_component_without_no_rtti : Error<
   "-fexperimental-omit-vtable-rtti call only be used with -fno-rtti">;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index b31dc1ea820b3..127818ec5767b 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -13041,6 +13041,9 @@ def err_builtin_trivially_relocate_invalid_arg_type: Error <
 def err_builtin_matrix_disabled: Error<
   "matrix types extension is disabled. Pass -fenable-matrix to enable it">;
 
+def err_builtin_matrix_major_order_disabled: Error<
+  "matrix %select{row|column}0 major %select{load|store}1 is not supported with "
+  "-fmatrix-memory-layout=%select{column|row}0-major. Pass -fmatrix-memory-layout=%select{row|column}0-major to enable it">;
 def err_matrix_index_not_integer: Error<
   "matrix %select{row|column}0 index is not an integer">;
 def err_matrix_index_outside_range: Error<
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index eae3aa27078da..bd3fb665b4a8d 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -449,6 +449,7 @@ ENUM_LANGOPT(RegisterStaticDestructors, RegisterStaticDestructorsKind, 2,
 LANGOPT(RegCall4, 1, 0, NotCompatible, "Set __regcall4 as a default calling convention to respect __regcall ABI v.4")
 
 LANGOPT(MatrixTypes, 1, 0, NotCompatible, "Enable or disable the builtin matrix type")
+ENUM_LANGOPT(DefaultMatrixMemoryLayout, MatrixMemoryLayout, 1, MatrixMemoryLayout::MatrixColMajor, NotCompatible, "Defines the default memory layout for matrices")
 VALUE_LANGOPT(MaxMatrixDimension, 32, (1 << 20) - 1, NotCompatible, "maximum allowed matrix dimension")
 
 LANGOPT(CXXAssumptions, 1, 1, NotCompatible, "Enable or disable codegen and compile-time checks for C++23's [[assume]] attribute")
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index 3f042f8ddb5a1..859ed57996983 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -251,6 +251,13 @@ class LangOptionsBase {
     FEM_UnsetOnCommandLine = 3
   };
 
+  enum class MatrixMemoryLayout : unsigned {
+    // Use column-major layout for matrices
+    MatrixColMajor = 0,
+    // Use row-major layout for matrices
+    MatrixRowMajor = 1,
+  };
+
   enum ExcessPrecisionKind { FPP_Standard, FPP_Fast, FPP_None };
 
   enum class LaxVectorConversionKind {
@@ -623,6 +630,8 @@ class LangOptions : public LangOptionsBase {
            !ObjCSubscriptingLegacyRuntime;
   }
 
+  bool isCompatibleWithMSVC() const { return MSCompatibilityVersion > 0; }
+
   bool isCompatibleWithMSVC(MSVCMajorVersion MajorVersion) const {
     return MSCompatibilityVersion >= MajorVersion * 100000U;
   }
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index aa47c4bce189b..7ee3785ef74f0 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -308,14 +308,16 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
     return cir::GlobalViewAttr::get(type, symbol, indices);
   }
 
-  mlir::Value createGetGlobal(mlir::Location loc, cir::GlobalOp global) {
+  mlir::Value createGetGlobal(mlir::Location loc, cir::GlobalOp global,
+                              bool threadLocal = false) {
     assert(!cir::MissingFeatures::addressSpace());
-    return cir::GetGlobalOp::create(
-        *this, loc, getPointerTo(global.getSymType()), global.getSymName());
+    return cir::GetGlobalOp::create(*this, loc,
+                                    getPointerTo(global.getSymType()),
+                                    global.getSymNameAttr(), threadLocal);
   }
 
-  mlir::Value createGetGlobal(cir::GlobalOp global) {
-    return createGetGlobal(global.getLoc(), global);
+  mlir::Value createGetGlobal(cir::GlobalOp global, bool threadLocal = false) {
+    return createGetGlobal(global.getLoc(), global, threadLocal);
   }
 
   /// Create a copy with inferred length.
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.h b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.h
index 03a6a97dc8c2e..858d4d6350bed 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.h
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.h
@@ -35,6 +35,7 @@ namespace cir {
 class ArrayType;
 class BoolType;
 class ComplexType;
+class DataMemberType;
 class IntType;
 class MethodType;
 class PointerType;
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
index 98d4636dafc29..c0279a0b20670 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
@@ -447,6 +447,57 @@ def CIR_ConstPtrAttr : CIR_Attr<"ConstPtr", "ptr", [TypedAttrInterface]> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// DataMemberAttr
+//===----------------------------------------------------------------------===//
+
+def CIR_DataMemberAttr : CIR_Attr<"DataMember", "data_member", [
+  TypedAttrInterface
+]> {
+  let summary = "Holds a constant data member pointer value";
+  let parameters = (ins AttributeSelfTypeParameter<
+                            "", "cir::DataMemberType">:$type,
+                        OptionalParameter<
+                            "std::optional<unsigned>">:$member_index);
+  let description = [{
+    A data member attribute is a literal attribute that represents a constant
+    pointer-to-data-member value.
+
+    The `member_index` parameter represents the index of the pointed-to member
+    within its containing record. It is an optional parameter; lack of this
+    parameter indicates a null pointer-to-data-member value.
+
+    Example:
+    ```
+    #ptr = #cir.data_member<1> : !cir.data_member<!s32i in !rec_Point>
+
+    #null = #cir.data_member<null> : !cir.data_member<!s32i in !rec_Point>
+    ```
+  }];
+
+  let builders = [
+    AttrBuilderWithInferredContext<(ins "cir::DataMemberType":$type), [{
+      return $_get(type.getContext(), type, std::nullopt);
+    }]>,
+    AttrBuilderWithInferredContext<(ins "cir::DataMemberType":$type,
+                                        "unsigned":$member_index), [{
+      return $_get(type.getContext(), type, member_index);
+    }]>,
+  ];
+
+  let genVerifyDecl = 1;
+
+  let assemblyFormat = [{
+    `<` ($member_index^):(`null`)? `>`
+  }];
+
+  let extraClassDeclaration = [{
+    bool isNullPtr() const {
+      return !getMemberIndex().has_value();
+    }
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // GlobalViewAttr
 //===----------------------------------------------------------------------===//
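For readers new to ClangIR, this is the C++ construct the new `DataMemberAttr` models. The CIR spellings in the comments echo the examples from the attribute description above and are illustrative only; the record and member names are hypothetical:

```cpp
struct Point {
  int x; // member index 0
  int y; // member index 1
};

// A constant pointer-to-data-member. Under this patch it would be represented
// by a DataMemberAttr holding member index 1, along the lines of:
//   #cir.data_member<1> : !cir.data_member<!s32i in !rec_Point>
int Point::*FieldPtr = &Point::y;

// A null pointer-to-data-member uses the index-less form:
//   #cir.data_member<null> : !cir.data_member<!s32i in !rec_Point>
int Point::*NullFieldPtr = nullptr;
```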
+def CIR_TLSModel : CIR_I32EnumAttr<"TLS_Model", "TLS model", [ + I32EnumAttrCase<"GeneralDynamic", 0, "tls_dyn">, + I32EnumAttrCase<"LocalDynamic", 1, "tls_local_dyn">, + I32EnumAttrCase<"InitialExec", 2, "tls_init_exec">, + I32EnumAttrCase<"LocalExec", 3, "tls_local_exec"> +]>; + def CIR_GlobalOp : CIR_Op<"global", [ DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, @@ -2106,6 +2113,7 @@ def CIR_GlobalOp : CIR_Op<"global", [ OptionalAttr:$sym_visibility, TypeAttr:$sym_type, CIR_GlobalLinkageKind:$linkage, + OptionalAttr:$tls_model, OptionalAttr:$initial_value, UnitAttr:$comdat, UnitAttr:$constant, @@ -2121,6 +2129,7 @@ def CIR_GlobalOp : CIR_Op<"global", [ (`constant` $constant^)? $linkage (`comdat` $comdat^)? + ($tls_model^)? (`dso_local` $dso_local^)? $sym_name custom($sym_type, $initial_value, @@ -2184,16 +2193,22 @@ def CIR_GetGlobalOp : CIR_Op<"get_global", [ undefined. The resulting type must always be a `!cir.ptr<...>` type with the same address space as the global variable. + Addresses of thread local globals can only be retrieved if this operation + is marked `thread_local`, which indicates the address isn't constant. + Example: ```mlir %x = cir.get_global @gv : !cir.ptr + ... + %y = cir.get_global thread_local @tls_gv : !cir.ptr ``` }]; - let arguments = (ins FlatSymbolRefAttr:$name); + let arguments = (ins FlatSymbolRefAttr:$name, UnitAttr:$tls); let results = (outs Res:$addr); let assemblyFormat = [{ + (`thread_local` $tls^)? $name `:` qualified(type($addr)) attr-dict }]; } diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td b/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td index ddca98eac93ab..89762249ed0c4 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td @@ -309,6 +309,13 @@ def CIR_AnyFloatOrVecOfFloatType let cppFunctionName = "isFPOrVectorOfFPType"; } +//===----------------------------------------------------------------------===// +// Data member type predicates +//===----------------------------------------------------------------------===// + +def CIR_AnyDataMemberType : CIR_TypeBase<"::cir::DataMemberType", + "data member type">; + //===----------------------------------------------------------------------===// // VPtr type predicates //===----------------------------------------------------------------------===// @@ -322,7 +329,8 @@ def CIR_PtrToVPtr : CIR_PtrToType; //===----------------------------------------------------------------------===// defvar CIR_ScalarTypes = [ - CIR_AnyBoolType, CIR_AnyIntType, CIR_AnyFloatType, CIR_AnyPtrType + CIR_AnyBoolType, CIR_AnyIntType, CIR_AnyFloatType, CIR_AnyPtrType, + CIR_AnyDataMemberType, CIR_AnyVPtrType ]; def CIR_AnyScalarType : AnyTypeOf { diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td index 3e062add6633a..59b97f0c6d39a 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td @@ -305,6 +305,36 @@ def CIR_PointerType : CIR_Type<"Pointer", "ptr", [ }]; } +//===----------------------------------------------------------------------===// +// CIR_DataMemberType +//===----------------------------------------------------------------------===// + +def CIR_DataMemberType : CIR_Type<"DataMember", "data_member", + [DeclareTypeInterfaceMethods] +> { + let summary = "CIR type that represents a pointer-to-data-member in C++"; + let description = [{ + `cir.data_member` models a pointer-to-data-member in 
C++. Values of this
+ type are essentially offsets of the pointed-to member within its
+ containing record.
+ }];
+
+ let parameters = (ins "mlir::Type":$member_ty,
+ "cir::RecordType":$class_ty);
+
+ let builders = [
+ TypeBuilderWithInferredContext<(ins
+ "mlir::Type":$member_ty, "cir::RecordType":$class_ty
+ ), [{
+ return $_get(member_ty.getContext(), member_ty, class_ty);
+ }]>,
+ ];
+
+ let assemblyFormat = [{
+ `<` $member_ty `in` $class_ty `>`
+ }];
+}
+
 //===----------------------------------------------------------------------===//
 // CIR_VPtrType
 //===----------------------------------------------------------------------===//
@@ -693,7 +723,7 @@ def CIRRecordType : Type< def CIR_AnyType : AnyTypeOf<[ CIR_VoidType, CIR_BoolType, CIR_ArrayType, CIR_VectorType, CIR_IntType, CIR_AnyFloatType, CIR_PointerType, CIR_FuncType, CIR_RecordType, - CIR_ComplexType, CIR_VPtrType + CIR_ComplexType, CIR_VPtrType, CIR_DataMemberType ]>; #endif // CLANG_CIR_DIALECT_IR_CIRTYPES_TD diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 826a4b13f5c0c..b2d94709016fa 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -189,6 +189,10 @@ struct MissingFeatures { static bool globalCtorLexOrder() { return false; } static bool globalCtorAssociatedData() { return false; } + // LowerModule handling + static bool lowerModuleCodeGenOpts() { return false; } + static bool lowerModuleLangOpts() { return false; } + // Misc static bool aarch64SIMDIntrinsics() { return false; } static bool aarch64SMEIntrinsics() { return false; } @@ -292,6 +296,7 @@ struct MissingFeatures { static bool lowerModeOptLevel() { return false; } static bool loweringPrepareX86CXXABI() { return false; } static bool loweringPrepareAArch64XXABI() { return false; } + static bool makeTripleAlwaysPresent() { return false; } static bool maybeHandleStaticInExternC() { return false; } static bool mergeAllConstants() { return false; } static bool metaDataNode() { return false; } diff --git a/clang/include/clang/DependencyScanning/DependencyScannerImpl.h b/clang/include/clang/DependencyScanning/DependencyScannerImpl.h index 352a0ad44fb7f..f79f21acf52f9 100644 --- a/clang/include/clang/DependencyScanning/DependencyScannerImpl.h +++ b/clang/include/clang/DependencyScanning/DependencyScannerImpl.h @@ -63,15 +63,15 @@ class DependencyScanningAction { std::unique_ptr createDiagOptions(ArrayRef CommandLine); -struct DignosticsEngineWithDiagOpts { +struct DiagnosticsEngineWithDiagOpts { // We need to bound the lifetime of the DiagOpts used to create the // DiganosticsEngine with the DiagnosticsEngine itself. std::unique_ptr DiagOpts; IntrusiveRefCntPtr DiagEngine; - DignosticsEngineWithDiagOpts(ArrayRef CommandLine, - IntrusiveRefCntPtr FS, - DiagnosticConsumer &DC); + DiagnosticsEngineWithDiagOpts(ArrayRef CommandLine, + IntrusiveRefCntPtr FS, + DiagnosticConsumer &DC); }; struct TextDiagnosticsPrinterWithOutput { @@ -151,7 +151,7 @@ class CompilerInstanceWithContext { // DiagConsumer may points to DiagPrinterWithOS->DiagPrinter, or a custom // DiagnosticConsumer passed in from initialize.
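Beyond the spelling fix, `DiagnosticsEngineWithDiagOpts` is a lifetime bundle: the engine holds a reference into options that the same struct owns. A generic sketch of the pattern (illustrative types, not the actual clang classes):

```cpp
#include <memory>

struct Options {
  bool showColors = false;
};

struct Engine {
  Options &opts; // non-owning: someone must keep `opts` alive
  explicit Engine(Options &o) : opts(o) {}
};

// Bundling owner and user in one struct pins the lifetimes together:
// members are constructed in declaration order and destroyed in reverse,
// so `engine` never outlives the `opts` it references.
struct EngineWithOptions {
  std::unique_ptr<Options> opts = std::make_unique<Options>();
  Engine engine{*opts};
};

int main() {
  EngineWithOptions bundle;
  bundle.engine.opts.showColors = true;
  return bundle.engine.opts.showColors ? 0 : 1;
}
```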
DiagnosticConsumer *DiagConsumer = nullptr; - std::unique_ptr DiagEngineWithCmdAndOpts; + std::unique_ptr DiagEngineWithCmdAndOpts; // Context - compiler invocation // Compilation's command's arguments may be owned by Alloc when expanded from diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td index 41a1b19935696..84ae211bed0a4 100644 --- a/clang/include/clang/Options/Options.td +++ b/clang/include/clang/Options/Options.td @@ -4797,6 +4797,12 @@ def fenable_matrix : Flag<["-"], "fenable-matrix">, Group, HelpText<"Enable matrix data type and related builtin functions">, MarshallingInfoFlag, hlsl.KeyPath>; +def fmatrix_memory_layout_EQ : Joined<["-"], "fmatrix-memory-layout=">, + Visibility<[ClangOption, CC1Option]>, + HelpText<"Sets the matrix memory layout (row-major or column-major)">, + Values<"row-major,column-major">, + Group; + defm raw_string_literals : BoolFOption<"raw-string-literals", LangOpts<"RawStringLiterals">, Default, PosFlag, diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h index 85b38120169fd..bf13eeeaea60a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h @@ -189,6 +189,11 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { return getType(nameAttr, kind); } + cir::DataMemberAttr getDataMemberAttr(cir::DataMemberType ty, + unsigned memberIndex) { + return cir::DataMemberAttr::get(ty, memberIndex); + } + // Return true if the value is a null constant such as null pointer, (+0.0) // for floating-point or zero initializer bool isNullValue(mlir::Attribute attr) const { diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp index fb17e31bf36d6..62836ce0f7537 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp @@ -151,6 +151,17 @@ computeFullLaneShuffleMask(CIRGenFunction &cgf, const mlir::Value vec, outIndices.resize(numElts); } +static mlir::Value emitX86CompressExpand(CIRGenBuilderTy &builder, + mlir::Location loc, mlir::Value source, + mlir::Value mask, + mlir::Value inputVector, + const std::string &id) { + auto resultTy = cast(mask.getType()); + mlir::Value maskValue = getMaskVecValue( + builder, loc, inputVector, cast(resultTy).getSize()); + return emitIntrinsicCallOp(builder, loc, id, resultTy, + mlir::ValueRange{source, mask, maskValue}); +} static mlir::Value emitX86MaskAddLogic(CIRGenBuilderTy &builder, mlir::Location loc, @@ -533,9 +544,78 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_xsaves: case X86::BI__builtin_ia32_xsaves64: case X86::BI__builtin_ia32_xsetbv: - case X86::BI_xsetbv: + case X86::BI_xsetbv: { + mlir::Location loc = getLoc(expr->getExprLoc()); + StringRef intrinsicName; + switch (builtinID) { + default: + llvm_unreachable("Unexpected builtin"); + case X86::BI__builtin_ia32_xsave: + intrinsicName = "x86.xsave"; + break; + case X86::BI__builtin_ia32_xsave64: + intrinsicName = "x86.xsave64"; + break; + case X86::BI__builtin_ia32_xrstor: + intrinsicName = "x86.xrstor"; + break; + case X86::BI__builtin_ia32_xrstor64: + intrinsicName = "x86.xrstor64"; + break; + case X86::BI__builtin_ia32_xsaveopt: + intrinsicName = "x86.xsaveopt"; + break; + case X86::BI__builtin_ia32_xsaveopt64: + intrinsicName = "x86.xsaveopt64"; + break; + case X86::BI__builtin_ia32_xrstors: + intrinsicName = "x86.xrstors"; + break; + case X86::BI__builtin_ia32_xrstors64: + intrinsicName = "x86.xrstors64"; 
+ break;
+ case X86::BI__builtin_ia32_xsavec:
+ intrinsicName = "x86.xsavec";
+ break;
+ case X86::BI__builtin_ia32_xsavec64:
+ intrinsicName = "x86.xsavec64";
+ break;
+ case X86::BI__builtin_ia32_xsaves:
+ intrinsicName = "x86.xsaves";
+ break;
+ case X86::BI__builtin_ia32_xsaves64:
+ intrinsicName = "x86.xsaves64";
+ break;
+ case X86::BI__builtin_ia32_xsetbv:
+ case X86::BI_xsetbv:
+ intrinsicName = "x86.xsetbv";
+ break;
+ }
+
+ // The xsave family of instructions takes a 64-bit mask that specifies
+ // which processor state components to save/restore. The hardware expects
+ // this mask split into two 32-bit registers: EDX (high 32 bits) and
+ // EAX (low 32 bits).
+ mlir::Type i32Ty = builder.getSInt32Ty();
+
+ // Mhi = (uint32_t)(ops[1] >> 32) - extract high 32 bits via right shift
+ cir::ConstantOp shift32 = builder.getSInt64(32, loc);
+ mlir::Value mhi = builder.createShift(loc, ops[1], shift32.getResult(),
+ /*isShiftLeft=*/false);
+ mhi = builder.createIntCast(mhi, i32Ty);
+
+ // Mlo = (uint32_t)ops[1] - extract low 32 bits by truncation
+ mlir::Value mlo = builder.createIntCast(ops[1], i32Ty);
+
+ return emitIntrinsicCallOp(builder, loc, intrinsicName, voidTy,
+ mlir::ValueRange{ops[0], mhi, mlo});
+ } case X86::BI__builtin_ia32_xgetbv: case X86::BI_xgetbv: + // xgetbv reads the extended control register specified by ops[0] (ECX)
+ // and returns the 64-bit value.
+ return emitIntrinsicCallOp(builder, getLoc(expr->getExprLoc()),
+ "x86.xgetbv", builder.getUInt64Ty(), ops[0]); case X86::BI__builtin_ia32_storedqudi128_mask: case X86::BI__builtin_ia32_storedqusi128_mask: case X86::BI__builtin_ia32_storedquhi128_mask: @@ -712,6 +792,10 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_compressstoreqi128_mask: case X86::BI__builtin_ia32_compressstoreqi256_mask: case X86::BI__builtin_ia32_compressstoreqi512_mask: + cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented X86 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+ return {}; case X86::BI__builtin_ia32_expanddf128_mask: case X86::BI__builtin_ia32_expanddf256_mask: case X86::BI__builtin_ia32_expanddf512_mask: @@ -729,7 +813,11 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_expandhi512_mask: case X86::BI__builtin_ia32_expandqi128_mask: case X86::BI__builtin_ia32_expandqi256_mask: - case X86::BI__builtin_ia32_expandqi512_mask: + case X86::BI__builtin_ia32_expandqi512_mask: {
+ mlir::Location loc = getLoc(expr->getExprLoc());
+ return emitX86CompressExpand(builder, loc, ops[0], ops[1], ops[2],
+ "x86.avx512.mask.expand");
+ } case X86::BI__builtin_ia32_compressdf128_mask: case X86::BI__builtin_ia32_compressdf256_mask: case X86::BI__builtin_ia32_compressdf512_mask: @@ -747,11 +835,11 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_compresshi512_mask: case X86::BI__builtin_ia32_compressqi128_mask: case X86::BI__builtin_ia32_compressqi256_mask: - case X86::BI__builtin_ia32_compressqi512_mask: - cgm.errorNYI(expr->getSourceRange(), - std::string("unimplemented X86 builtin call: ") + - getContext().BuiltinInfo.getName(builtinID)); - return {}; + case X86::BI__builtin_ia32_compressqi512_mask: {
+ mlir::Location loc = getLoc(expr->getExprLoc());
+ return emitX86CompressExpand(builder, loc, ops[0], ops[1], ops[2],
+ "x86.avx512.mask.compress");
+ } case X86::BI__builtin_ia32_gather3div2df: case X86::BI__builtin_ia32_gather3div2di: case
X86::BI__builtin_ia32_gather3div4df: diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp index 948245ceab2cd..12b153af36c3e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp @@ -454,7 +454,8 @@ CIRGenModule::getOrCreateStaticVarDecl(const VarDecl &d, if (supportsCOMDAT() && gv.isWeakForLinker()) gv.setComdat(true); - assert(!cir::MissingFeatures::opGlobalThreadLocal()); + if (d.getTLSKind())
+ errorNYI(d.getSourceRange(), "getOrCreateStaticVarDecl: TLS"); setGVProperties(gv, &d); diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 5d509e37f4621..cac046c41e30d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -282,7 +282,6 @@ static LValue emitGlobalVarDeclLValue(CIRGenFunction &cgf, const Expr *e, QualType t = e->getType(); // If it's thread_local, emit a call to its wrapper function instead. - assert(!cir::MissingFeatures::opGlobalThreadLocal()); if (vd->getTLSKind() == VarDecl::TLS_Dynamic) cgf.cgm.errorNYI(e->getSourceRange(), "emitGlobalVarDeclLValue: thread_local variable"); @@ -318,7 +317,6 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile, QualType ty, LValueBaseInfo baseInfo, bool isInit, bool isNontemporal) { - assert(!cir::MissingFeatures::opLoadStoreThreadLocal()); if (const auto *clangVecTy = ty->getAs()) { // Boolean vectors use `iN` as storage type. @@ -569,7 +567,8 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, LValue lvalue, mlir::Value CIRGenFunction::emitLoadOfScalar(Address addr, bool isVolatile, QualType ty, SourceLocation loc, LValueBaseInfo baseInfo) { - assert(!cir::MissingFeatures::opLoadStoreThreadLocal()); + // Traditional LLVM codegen handles thread-local variables separately; CIR
+ // handles them as part of getAddrOfGlobalVar (GetGlobalOp).
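As an aside on the X86 builtin changes above: the EDX:EAX mask split for the xsave family is ordinary shift-and-truncate arithmetic. A standalone worked example (illustrative, mirroring the createShift/createIntCast sequence):

```cpp
#include <cstdint>

int main() {
  // A hypothetical xsave component mask with bits 0-2 and 33 set.
  std::uint64_t mask = (1ULL << 33) | 0x7ULL;
  // High half for EDX: shift right by 32, then truncate, exactly the
  // createShift + createIntCast pair emitted above.
  std::uint32_t mhi = static_cast<std::uint32_t>(mask >> 32); // 0x00000002
  // Low half for EAX: plain truncation, as in createIntCast(ops[1], i32Ty).
  std::uint32_t mlo = static_cast<std::uint32_t>(mask);       // 0x00000007
  return (mhi == 0x2u && mlo == 0x7u) ? 0 : 1;
}
```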
mlir::Type eltTy = addr.getElementType(); if (const auto *clangVecTy = ty->getAs()) { diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 9043ecab42f15..25ce1ba26da09 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -139,6 +139,11 @@ class ScalarExprEmitter : public StmtVisitor { return {}; } + mlir::Value VisitConstantExpr(ConstantExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: constant expr"); + return {}; + } + mlir::Value VisitPackIndexingExpr(PackIndexingExpr *e) { return Visit(e->getSelectedExpr()); } @@ -159,6 +164,14 @@ class ScalarExprEmitter : public StmtVisitor { mlir::Value VisitCoawaitExpr(CoawaitExpr *s) { return cgf.emitCoawaitExpr(*s).getValue(); } + mlir::Value VisitCoyieldExpr(CoyieldExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: coyield"); + return {}; + } + mlir::Value VisitUnaryCoawait(const UnaryOperator *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: unary coawait"); + return {}; + } mlir::Value emitLoadOfLValue(LValue lv, SourceLocation loc) { return cgf.emitLoadOfLValue(lv, loc).getValue(); @@ -198,6 +211,12 @@ class ScalarExprEmitter : public StmtVisitor { cir::IntAttr::get(type, e->getValue())); } + mlir::Value VisitFixedPointLiteral(const FixedPointLiteral *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: fixed point literal"); + return {}; + } + mlir::Value VisitFloatingLiteral(const FloatingLiteral *e) { mlir::Type type = cgf.convertType(e->getType()); assert(mlir::isa(type) && @@ -229,6 +248,23 @@ class ScalarExprEmitter : public StmtVisitor { mlir::Value VisitOffsetOfExpr(OffsetOfExpr *e); + mlir::Value VisitSizeOfPackExpr(SizeOfPackExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: size of pack"); + return {}; + } + mlir::Value VisitPseudoObjectExpr(PseudoObjectExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: pseudo object"); + return {}; + } + mlir::Value VisitSYCLUniqueStableNameExpr(SYCLUniqueStableNameExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: sycl unique stable name"); + return {}; + } + mlir::Value VisitEmbedExpr(EmbedExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: embed"); + return {}; + } mlir::Value VisitOpaqueValueExpr(OpaqueValueExpr *e) { if (e->isGLValue()) return emitLoadOfLValue(cgf.getOrCreateOpaqueLValueMapping(e), @@ -238,6 +274,38 @@ class ScalarExprEmitter : public StmtVisitor { return cgf.getOrCreateOpaqueRValueMapping(e).getValue(); } + mlir::Value VisitObjCSelectorExpr(ObjCSelectorExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc selector"); + return {}; + } + mlir::Value VisitObjCProtocolExpr(ObjCProtocolExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc protocol"); + return {}; + } + mlir::Value VisitObjCIVarRefExpr(ObjCIvarRefExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc ivar ref"); + return {}; + } + mlir::Value VisitObjCMessageExpr(ObjCMessageExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc message"); + return {}; + } + mlir::Value VisitObjCIsaExpr(ObjCIsaExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc isa"); + return {}; + } + mlir::Value VisitObjCAvailabilityCheckExpr(ObjCAvailabilityCheckExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: objc availability check"); + return {}; + } + + 
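These stubs all share one shape: report `errorNYI` with a tag and yield a null `mlir::Value`. One construct that would currently land in a stub above (illustrative; some such expressions may be constant-folded before reaching the emitter):

```cpp
// sizeof...(pack) stays a SizeOfPackExpr in the AST; emitting it as a
// runtime scalar would hit the "ScalarExprEmitter: size of pack" stub.
template <typename... Ts>
int arity() { return sizeof...(Ts); }

int three = arity<int, char, bool>();
```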
mlir::Value VisitMatrixSubscriptExpr(MatrixSubscriptExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: matrix subscript"); + return {}; + } + mlir::Value VisitCastExpr(CastExpr *e); mlir::Value VisitCallExpr(const CallExpr *e); @@ -319,6 +387,18 @@ class ScalarExprEmitter : public StmtVisitor { mlir::Value VisitInitListExpr(InitListExpr *e); + mlir::Value VisitArrayInitIndexExpr(ArrayInitIndexExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: array init index"); + return {}; + } + + mlir::Value VisitImplicitValueInitExpr(const ImplicitValueInitExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: implicit value init"); + return {}; + } + mlir::Value VisitExplicitCastExpr(ExplicitCastExpr *e) { return VisitCastExpr(e); } @@ -651,11 +731,8 @@ class ScalarExprEmitter : public StmtVisitor { } mlir::Value VisitUnaryAddrOf(const UnaryOperator *e) { - if (llvm::isa(e->getType())) { - cgf.cgm.errorNYI(e->getSourceRange(), "Address of member pointer"); - return builder.getNullPtr(cgf.convertType(e->getType()), - cgf.getLoc(e->getExprLoc())); - } + if (llvm::isa(e->getType())) + return cgf.cgm.emitMemberPointerConstant(e); return cgf.emitLValue(e->getSubExpr()).getPointer(); } @@ -726,6 +803,16 @@ class ScalarExprEmitter : public StmtVisitor { return Visit(e->getSubExpr()); } + // C++ + mlir::Value VisitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: materialize temporary"); + return {}; + } + mlir::Value VisitSourceLocExpr(SourceLocExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: source loc"); + return {}; + } mlir::Value VisitCXXDefaultArgExpr(CXXDefaultArgExpr *dae) { CIRGenFunction::CXXDefaultArgExprScope scope(cgf, dae); return Visit(dae->getExpr()); @@ -745,11 +832,43 @@ class ScalarExprEmitter : public StmtVisitor { cgf.emitCXXDeleteExpr(e); return {}; } - + mlir::Value VisitTypeTraitExpr(const TypeTraitExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: type trait"); + return {}; + } + mlir::Value + VisitConceptSpecializationExpr(const ConceptSpecializationExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: concept specialization"); + return {}; + } + mlir::Value VisitRequiresExpr(const RequiresExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: requires"); + return {}; + } + mlir::Value VisitArrayTypeTraitExpr(const ArrayTypeTraitExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: array type trait"); + return {}; + } + mlir::Value VisitExpressionTraitExpr(const ExpressionTraitExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: expression trait"); + return {}; + } + mlir::Value VisitCXXPseudoDestructorExpr(const CXXPseudoDestructorExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: cxx pseudo destructor"); + return {}; + } mlir::Value VisitCXXThrowExpr(const CXXThrowExpr *e) { cgf.emitCXXThrowExpr(e); return {}; } + mlir::Value VisitCXXNoexceptExpr(CXXNoexceptExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: cxx noexcept"); + return {}; + } /// Emit a conversion from the specified type to the specified destination /// type, both of which are CIR scalar types. 
@@ -1213,6 +1332,52 @@ class ScalarExprEmitter : public StmtVisitor { return maybePromoteBoolResult(resOp.getResult(), resTy); } + mlir::Value VisitBinPtrMemD(const BinaryOperator *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: ptr mem d"); + return {}; + } + + mlir::Value VisitBinPtrMemI(const BinaryOperator *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: ptr mem i"); + return {}; + } + + // Other Operators. + mlir::Value VisitBlockExpr(const BlockExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: block"); + return {}; + } + + mlir::Value VisitChooseExpr(ChooseExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: choose"); + return {}; + } + + mlir::Value VisitObjCStringLiteral(const ObjCStringLiteral *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: objc string literal"); + return {}; + } + mlir::Value VisitObjCBoxedExpr(ObjCBoxedExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc boxed"); + return {}; + } + mlir::Value VisitObjCArrayLiteral(ObjCArrayLiteral *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: objc array literal"); + return {}; + } + mlir::Value VisitObjCDictionaryLiteral(ObjCDictionaryLiteral *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: objc dictionary literal"); + return {}; + } + + mlir::Value VisitAsTypeExpr(AsTypeExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: as type"); + return {}; + } + mlir::Value VisitAtomicExpr(AtomicExpr *e) { return cgf.emitAtomicExpr(e).getValue(); } diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index 492cbb69fabb3..6b2e60a551198 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -16,6 +16,7 @@ #include "CIRGenCall.h" #include "CIRGenValue.h" #include "mlir/IR/Location.h" +#include "clang/AST/Attr.h" #include "clang/AST/ExprCXX.h" #include "clang/AST/GlobalDecl.h" #include "clang/CIR/MissingFeatures.h" @@ -422,28 +423,35 @@ cir::TryOp CIRGenFunction::LexicalScope::getClosestTryParent() { return nullptr; } -void CIRGenFunction::startFunction(GlobalDecl gd, QualType returnType, - cir::FuncOp fn, cir::FuncType funcType, - FunctionArgList args, SourceLocation loc, - SourceLocation startLoc) { - assert(!curFn && - "CIRGenFunction can only be used for one function at a time"); +/// An argument came in as a promoted argument; demote it back to its +/// declared type. +static mlir::Value emitArgumentDemotion(CIRGenFunction &cgf, const VarDecl *var, + mlir::Value value) { + mlir::Type ty = cgf.convertType(var->getType()); - curFn = fn; + // This can happen with promotions that actually don't change the + // underlying type, like the enum promotions. 
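A scalar model of what `emitArgumentDemotion` does (illustrative standalone C++, not the patch's API): a value that arrived promoted, double for a float parameter or int for a small integer, is narrowed back to the declared type.

```cpp
#include <cstdint>

// Mirrors the two branches above: integral types go through an integral
// cast (createIntCast), floating-point types through a narrowing
// floating-point cast (createFloatingCast).
std::int8_t demoteToSChar(int promoted) {
  return static_cast<std::int8_t>(promoted);
}

float demoteToFloat(double promoted) {
  return static_cast<float>(promoted);
}

int main() {
  return (demoteToSChar(65) == 65 && demoteToFloat(2.5) == 2.5f) ? 0 : 1;
}
```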
+ if (value.getType() == ty) + return value; - const Decl *d = gd.getDecl(); + assert((mlir::isa(ty) || cir::isAnyFloatingPointType(ty)) && + "unexpected promotion type"); - didCallStackSave = false; - curCodeDecl = d; - const auto *fd = dyn_cast_or_null(d); - curFuncDecl = d->getNonClosureContext(); + if (mlir::isa(ty)) + return cgf.getBuilder().CIRBaseBuilderTy::createIntCast(value, ty); - prologueCleanupDepth = ehStack.stable_begin(); + return cgf.getBuilder().createFloatingCast(value, ty); +} - mlir::Block *entryBB = &fn.getBlocks().front(); - builder.setInsertionPointToStart(entryBB); +void CIRGenFunction::emitFunctionProlog(const FunctionArgList &args, + mlir::Block *entryBB, + const FunctionDecl *fd, + SourceLocation bodyBeginLoc) { + // Naked functions don't have prologues. + if (fd && fd->hasAttr()) { + cgm.errorNYI(bodyBeginLoc, "naked function decl"); + } - // TODO(cir): this should live in `emitFunctionProlog // Declare all the function arguments in the symbol table. for (const auto nameValue : llvm::zip(args, entryBB->getArguments())) { const VarDecl *paramVar = std::get<0>(nameValue); @@ -466,20 +474,64 @@ void CIRGenFunction::startFunction(GlobalDecl gd, QualType returnType, cast(paramVar)->isKNRPromoted(); assert(!cir::MissingFeatures::constructABIArgDirectExtend()); if (isPromoted) - cgm.errorNYI(fd->getSourceRange(), "Function argument demotion"); + paramVal = emitArgumentDemotion(*this, paramVar, paramVal); // Location of the store to the param storage tracked as beginning of // the function body. - mlir::Location fnBodyBegin = getLoc(fd->getBody()->getBeginLoc()); + mlir::Location fnBodyBegin = getLoc(bodyBeginLoc); builder.CIRBaseBuilderTy::createStore(fnBodyBegin, paramVal, addrVal); } assert(builder.getInsertionBlock() && "Should be valid"); +} + +void CIRGenFunction::startFunction(GlobalDecl gd, QualType returnType, + cir::FuncOp fn, cir::FuncType funcType, + FunctionArgList args, SourceLocation loc, + SourceLocation startLoc) { + assert(!curFn && + "CIRGenFunction can only be used for one function at a time"); + + curFn = fn; + + const Decl *d = gd.getDecl(); + + didCallStackSave = false; + curCodeDecl = d; + const auto *fd = dyn_cast_or_null(d); + curFuncDecl = d->getNonClosureContext(); + + prologueCleanupDepth = ehStack.stable_begin(); + + mlir::Block *entryBB = &fn.getBlocks().front(); + builder.setInsertionPointToStart(entryBB); + + // Determine the function body begin location for the prolog. + // If fd is null or has no body, use startLoc as fallback. + SourceLocation bodyBeginLoc = startLoc; + if (fd) { + if (Stmt *body = fd->getBody()) + bodyBeginLoc = body->getBeginLoc(); + else + bodyBeginLoc = fd->getLocation(); + } + + emitFunctionProlog(args, entryBB, fd, bodyBeginLoc); // When the current function is not void, create an address to store the // result value. - if (!returnType->isVoidType()) - emitAndUpdateRetAlloca(returnType, getLoc(fd->getBody()->getEndLoc()), + if (!returnType->isVoidType()) { + // Determine the function body end location. + // If fd is null or has no body, use loc as fallback. 
+ SourceLocation bodyEndLoc = loc; + if (fd) { + if (Stmt *body = fd->getBody()) + bodyEndLoc = body->getEndLoc(); + else + bodyEndLoc = fd->getLocation(); + } + emitAndUpdateRetAlloca(returnType, getLoc(bodyEndLoc), getContext().getTypeAlignInChars(returnType)); + } if (isa_and_nonnull(d) && cast(d)->isInstance()) { diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 0df812bcfb94e..15322ee72a1b0 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -954,6 +954,11 @@ class CIRGenFunction : public CIRGenTypeCache { clang::QualType buildFunctionArgList(clang::GlobalDecl gd, FunctionArgList &args); + /// Emit the function prologue: declare function arguments in the symbol + /// table. + void emitFunctionProlog(const FunctionArgList &args, mlir::Block *entryBB, + const FunctionDecl *fd, SourceLocation bodyBeginLoc); + /// Emit code for the start of a function. /// \param loc The location to be associated with the function. /// \param startLoc The location of the function body. diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index e1894c040dd53..eaa9e946e243d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -682,8 +682,11 @@ CIRGenModule::getOrCreateCIRGlobal(StringRef mangledName, mlir::Type ty, setLinkageForGV(gv, d); - if (d->getTLSKind()) - errorNYI(d->getSourceRange(), "thread local global variable"); + if (d->getTLSKind()) { + if (d->getTLSKind() == VarDecl::TLS_Dynamic) + errorNYI(d->getSourceRange(), "TLS dynamic"); + setTLSMode(gv, *d); + } setGVProperties(gv, d); @@ -738,12 +741,11 @@ mlir::Value CIRGenModule::getAddrOfGlobalVar(const VarDecl *d, mlir::Type ty, if (!ty) ty = getTypes().convertTypeForMem(astTy); - assert(!cir::MissingFeatures::opGlobalThreadLocal()); - + bool tlsAccess = d->getTLSKind() != VarDecl::TLS_None; cir::GlobalOp g = getOrCreateCIRGlobal(d, ty, isForDefinition); mlir::Type ptrTy = builder.getPointerTo(g.getSymType()); return cir::GetGlobalOp::create(builder, getLoc(d->getSourceRange()), ptrTy, - g.getSymName()); + g.getSymNameAttr(), tlsAccess); } cir::GlobalViewAttr CIRGenModule::getAddrOfGlobalVarAttr(const VarDecl *d) { @@ -1464,6 +1466,26 @@ void CIRGenModule::emitExplicitCastExprType(const ExplicitCastExpr *e, "emitExplicitCastExprType"); } +mlir::Value CIRGenModule::emitMemberPointerConstant(const UnaryOperator *e) { + assert(!cir::MissingFeatures::cxxABI()); + + mlir::Location loc = getLoc(e->getSourceRange()); + + const auto *decl = cast(e->getSubExpr())->getDecl(); + + // A member function pointer. + if (isa(decl)) { + errorNYI(e->getSourceRange(), "emitMemberPointerConstant: method pointer"); + return {}; + } + + // Otherwise, a member data pointer. 
+ auto ty = mlir::cast(convertType(e->getType())); + const auto *fieldDecl = cast(decl); + return cir::ConstantOp::create( + builder, loc, builder.getDataMemberAttr(ty, fieldDecl->getFieldIndex())); +} + void CIRGenModule::emitDeclContext(const DeclContext *dc) { for (Decl *decl : dc->decls()) { // Unlike other DeclContexts, the contents of an ObjCImplDecl at TU scope @@ -1933,6 +1955,33 @@ void CIRGenModule::setGVPropertiesAux(mlir::Operation *op, assert(!cir::MissingFeatures::opGlobalPartition()); } +cir::TLS_Model CIRGenModule::getDefaultCIRTLSModel() const { + switch (getCodeGenOpts().getDefaultTLSModel()) { + case CodeGenOptions::GeneralDynamicTLSModel: + return cir::TLS_Model::GeneralDynamic; + case CodeGenOptions::LocalDynamicTLSModel: + return cir::TLS_Model::LocalDynamic; + case CodeGenOptions::InitialExecTLSModel: + return cir::TLS_Model::InitialExec; + case CodeGenOptions::LocalExecTLSModel: + return cir::TLS_Model::LocalExec; + } + llvm_unreachable("Invalid TLS model!"); +} + +void CIRGenModule::setTLSMode(mlir::Operation *op, const VarDecl &d) { + assert(d.getTLSKind() && "setting TLS mode on non-TLS var!"); + + cir::TLS_Model tlm = getDefaultCIRTLSModel(); + + // Override the TLS model if it is explicitly specified. + if (d.getAttr()) + errorNYI(d.getSourceRange(), "TLS model attribute"); + + auto global = cast(op); + global.setTlsModel(tlm); +} + void CIRGenModule::setFunctionAttributes(GlobalDecl globalDecl, cir::FuncOp func, bool isIncompleteFunction, diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index 59eb5f8938129..de263f4868507 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -447,6 +447,13 @@ class CIRGenModule : public CIRGenTypeCache { void setGVProperties(mlir::Operation *op, const NamedDecl *d) const; void setGVPropertiesAux(mlir::Operation *op, const NamedDecl *d) const; + /// Set TLS mode for the given operation based on the given variable + /// declaration. + void setTLSMode(mlir::Operation *op, const VarDecl &d); + + /// Get TLS mode from CodeGenOptions. + cir::TLS_Model getDefaultCIRTLSModel() const; + /// Set function attributes for a function declaration. void setFunctionAttributes(GlobalDecl gd, cir::FuncOp f, bool isIncompleteFunction, bool isThunk); @@ -497,6 +504,8 @@ class CIRGenModule : public CIRGenTypeCache { /// the given type. This is usually, but not always, an LLVM null constant. 
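`getDefaultCIRTLSModel` maps the `-ftls-model=` setting onto the new `cir::TLS_Model` enum, while a per-variable `tls_model` attribute still takes the `errorNYI` path in `setTLSMode`. At the source level (illustrative):

```cpp
// Uses the translation-unit default model, i.e. whatever -ftls-model=
// selects (GeneralDynamic when unspecified).
thread_local int counter = 0;

// Would need the per-variable override (cir::TLS_Model::InitialExec), which
// setTLSMode above still reports as NYI.
__attribute__((tls_model("initial-exec"))) thread_local int fast_counter = 0;

int bump() { return ++counter + ++fast_counter; }
```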
mlir::TypedAttr emitNullConstantForBase(const CXXRecordDecl *record); + mlir::Value emitMemberPointerConstant(const UnaryOperator *e); + llvm::StringRef getMangledName(clang::GlobalDecl gd); void emitTentativeDefinition(const VarDecl *d); diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp index da7ab0691cb63..f13e7cb32c71e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp @@ -159,6 +159,10 @@ mlir::LogicalResult CIRGenFunction::emitStmt(const Stmt *s, return emitCXXTryStmt(cast(*s)); case Stmt::CXXForRangeStmtClass: return emitCXXForRangeStmt(cast(*s), attr); + case Stmt::CoroutineBodyStmtClass: + return emitCoroutineBody(cast(*s)); + case Stmt::IndirectGotoStmtClass: + return emitIndirectGotoStmt(cast(*s)); case Stmt::OpenACCComputeConstructClass: return emitOpenACCComputeConstruct(cast(*s)); case Stmt::OpenACCLoopConstructClass: @@ -199,11 +203,7 @@ mlir::LogicalResult CIRGenFunction::emitStmt(const Stmt *s, case Stmt::CaseStmtClass: case Stmt::SEHLeaveStmtClass: case Stmt::SYCLKernelCallStmtClass: - case Stmt::CoroutineBodyStmtClass: - return emitCoroutineBody(cast(*s)); case Stmt::CoreturnStmtClass: - case Stmt::IndirectGotoStmtClass: - return emitIndirectGotoStmt(cast(*s)); case Stmt::OMPParallelDirectiveClass: case Stmt::OMPTaskwaitDirectiveClass: case Stmt::OMPTaskyieldDirectiveClass: diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp index 24b106b4bcee7..7f000ece8a494 100644 --- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp @@ -482,6 +482,21 @@ mlir::Type CIRGenTypes::convertType(QualType type) { break; } + case Type::MemberPointer: { + const auto *mpt = cast(ty); + + mlir::Type memberTy = convertType(mpt->getPointeeType()); + auto clsTy = mlir::cast( + convertType(QualType(mpt->getQualifier().getAsType(), 0))); + if (mpt->isMemberDataPointer()) { + resultType = cir::DataMemberType::get(memberTy, clsTy); + } else { + assert(!cir::MissingFeatures::methodType()); + cgm.errorNYI(SourceLocation(), "MethodType"); + } + break; + } + case Type::FunctionNoProto: case Type::FunctionProto: resultType = convertFunctionTypeInternal(type); diff --git a/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp b/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp index ee296f171e0d9..59d7765198f9e 100644 --- a/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp @@ -269,6 +269,38 @@ ConstComplexAttr::verify(function_ref emitError, return success(); } +//===----------------------------------------------------------------------===// +// DataMemberAttr definitions +//===----------------------------------------------------------------------===// + +LogicalResult +DataMemberAttr::verify(function_ref emitError, + cir::DataMemberType ty, + std::optional memberIndex) { + // DataMemberAttr without a given index represents a null value. 
+ if (!memberIndex.has_value()) + return success(); + + cir::RecordType recTy = ty.getClassTy(); + if (recTy.isIncomplete()) + return emitError() + << "incomplete 'cir.record' cannot be used to build a non-null " + "data member pointer"; + + unsigned memberIndexValue = memberIndex.value(); + if (memberIndexValue >= recTy.getNumElements()) + return emitError() + << "member index of a #cir.data_member attribute is out of range"; + + mlir::Type memberTy = recTy.getMembers()[memberIndexValue]; + if (memberTy != ty.getMemberTy()) + return emitError() + << "member type of a #cir.data_member attribute must match the " + "attribute type"; + + return success(); +} + //===----------------------------------------------------------------------===// // CIR ConstArrayAttr //===----------------------------------------------------------------------===// diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index ec8cae62d6bc8..0f546cb254db4 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -357,6 +357,12 @@ static LogicalResult checkConstantTypes(mlir::Operation *op, mlir::Type opType, return success(); } + if (isa(attrType)) { + // More detailed type verifications are already done in + // DataMemberAttr::verify. Don't need to repeat here. + return success(); + } + if (isa(attrType)) { if (isa( opType)) @@ -1731,7 +1737,10 @@ cir::GetGlobalOp::verifySymbolUses(SymbolTableCollection &symbolTable) { if (auto g = dyn_cast(op)) { symTy = g.getSymType(); assert(!cir::MissingFeatures::addressSpace()); - assert(!cir::MissingFeatures::opGlobalThreadLocal()); + // Verify that for thread local global access, the global needs to + // be marked with tls bits. + if (getTls() && !g.getTlsModel()) + return emitOpError("access to global not marked thread local"); } else if (auto f = dyn_cast(op)) { symTy = f.getFunctionType(); } else { diff --git a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp index bb87056048ec5..9a37a4f4e3996 100644 --- a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp @@ -750,6 +750,26 @@ BoolType::getABIAlignment(const ::mlir::DataLayout &dataLayout, return 1; } +//===----------------------------------------------------------------------===// +// DataMemberType Definitions +//===----------------------------------------------------------------------===// + +llvm::TypeSize +DataMemberType::getTypeSizeInBits(const ::mlir::DataLayout &dataLayout, + ::mlir::DataLayoutEntryListRef params) const { + // FIXME: consider size differences under different ABIs + assert(!MissingFeatures::cxxABI()); + return llvm::TypeSize::getFixed(64); +} + +uint64_t +DataMemberType::getABIAlignment(const ::mlir::DataLayout &dataLayout, + ::mlir::DataLayoutEntryListRef params) const { + // FIXME: consider alignment differences under different ABIs + assert(!MissingFeatures::cxxABI()); + return 8; +} + //===----------------------------------------------------------------------===// // VPtrType Definitions //===----------------------------------------------------------------------===// diff --git a/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt b/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt index 3fc5b06b74e4d..e3b7106c1d6b9 100644 --- a/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt +++ b/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt @@ -1,3 +1,5 @@ +add_subdirectory(TargetLowering) + add_clang_library(MLIRCIRTransforms CIRCanonicalize.cpp CIRSimplify.cpp @@ -21,4 
+23,5 @@ add_clang_library(MLIRCIRTransforms MLIRCIR MLIRCIRInterfaces + MLIRCIRTargetLowering ) diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.cpp new file mode 100644 index 0000000000000..86cf7ebdc8f50 --- /dev/null +++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.cpp @@ -0,0 +1,20 @@ +//===- CIRCXXABI.cpp ------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file partially mimics clang/lib/CodeGen/CGCXXABI.cpp. The queries are +// adapted to operate on the CIR dialect, however. +// +//===----------------------------------------------------------------------===// + +#include "CIRCXXABI.h" + +namespace cir { + +CIRCXXABI::~CIRCXXABI() {} + +} // namespace cir diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h new file mode 100644 index 0000000000000..003cd78eb3f26 --- /dev/null +++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h @@ -0,0 +1,55 @@ +//===----- CIRCXXABI.h - Interface to C++ ABIs for CIR Dialect --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file partially mimics the CodeGen/CGCXXABI.h class. The main difference +// is that this is adapted to operate on the CIR dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_CIRCXXABI_H +#define CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_CIRCXXABI_H + +#include "mlir/Transforms/DialectConversion.h" +#include "clang/CIR/Dialect/IR/CIRTypes.h" + +namespace cir { + +// Forward declarations. +class LowerModule; + +class CIRCXXABI { + friend class LowerModule; + +protected: + LowerModule &lm; + + CIRCXXABI(LowerModule &lm) : lm(lm) {} + +public: + virtual ~CIRCXXABI(); + + /// Lower the given data member pointer type to its ABI type. The returned + /// type is also a CIR type. + virtual mlir::Type + lowerDataMemberType(cir::DataMemberType type, + const mlir::TypeConverter &typeConverter) const = 0; + + /// Lower the given data member pointer constant to a constant of the ABI + /// type. The returned constant is represented as an attribute as well. + virtual mlir::TypedAttr + lowerDataMemberConstant(cir::DataMemberAttr attr, + const mlir::DataLayout &layout, + const mlir::TypeConverter &typeConverter) const = 0; +}; + +/// Creates an Itanium-family ABI. 
+std::unique_ptr createItaniumCXXABI(LowerModule &lm); + +} // namespace cir + +#endif // CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_CIRCXXABI_H diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt new file mode 100644 index 0000000000000..158c42e729536 --- /dev/null +++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt @@ -0,0 +1,20 @@ +add_clang_library(MLIRCIRTargetLowering + CIRCXXABI.cpp + LowerModule.cpp + LowerItaniumCXXABI.cpp + + DEPENDS + clangBasic + + LINK_COMPONENTS + TargetParser + + LINK_LIBS PUBLIC + + clangBasic + MLIRIR + MLIRPass + MLIRDLTIDialect + MLIRCIR + MLIRCIRInterfaces +) diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerItaniumCXXABI.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerItaniumCXXABI.cpp new file mode 100644 index 0000000000000..7089990343dc0 --- /dev/null +++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerItaniumCXXABI.cpp @@ -0,0 +1,90 @@ +//===---- LowerItaniumCXXABI.cpp - Emit CIR code Itanium-specific code ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This provides CIR lowering logic targeting the Itanium C++ ABI. The class in +// this file generates records that follow the Itanium C++ ABI, which is +// documented at: +// https://itanium-cxx-abi.github.io/cxx-abi/abi.html +// https://itanium-cxx-abi.github.io/cxx-abi/abi-eh.html +// +// It also supports the closely-related ARM ABI, documented at: +// https://developer.arm.com/documentation/ihi0041/g/ +// +// This file partially mimics clang/lib/CodeGen/ItaniumCXXABI.cpp. The queries +// are adapted to operate on the CIR dialect, however. +// +//===----------------------------------------------------------------------===// + +#include "CIRCXXABI.h" +#include "LowerModule.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "llvm/Support/ErrorHandling.h" + +namespace cir { + +namespace { + +class LowerItaniumCXXABI : public CIRCXXABI { +public: + LowerItaniumCXXABI(LowerModule &lm) : CIRCXXABI(lm) {} + + /// Lower the given data member pointer type to its ABI type. The returned + /// type is also a CIR type. + virtual mlir::Type + lowerDataMemberType(cir::DataMemberType type, + const mlir::TypeConverter &typeConverter) const override; + + mlir::TypedAttr lowerDataMemberConstant( + cir::DataMemberAttr attr, const mlir::DataLayout &layout, + const mlir::TypeConverter &typeConverter) const override; +}; + +} // namespace + +std::unique_ptr createItaniumCXXABI(LowerModule &lm) { + return std::make_unique(lm); +} + +static cir::IntType getPtrDiffCIRTy(LowerModule &lm) { + const clang::TargetInfo &target = lm.getTarget(); + clang::TargetInfo::IntType ptrdiffTy = + target.getPtrDiffType(clang::LangAS::Default); + return cir::IntType::get(lm.getMLIRContext(), target.getTypeWidth(ptrdiffTy), + target.isTypeSigned(ptrdiffTy)); +} + +mlir::Type LowerItaniumCXXABI::lowerDataMemberType( + cir::DataMemberType type, const mlir::TypeConverter &typeConverter) const { + // Itanium C++ ABI 2.3.1: + // A data member pointer is represented as the data member's offset in bytes + // from the address point of an object of the base type, as a ptrdiff_t. 
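The ABI rule quoted above can be checked with plain C++ on a typical Itanium target; the snippet below assumes x86-64-style layout (illustrative only):

```cpp
#include <cstdint>
#include <cstring>

struct S {
  std::int32_t a; // byte offset 0
  std::int32_t b; // byte offset 4
};

int main() {
  std::int32_t S::*pb = &S::b;      // stored as the ptrdiff_t offset 4
  std::int32_t S::*pnull = nullptr; // stored as -1, per Itanium ABI 2.3
  std::intptr_t rawB = 0, rawNull = 0;
  static_assert(sizeof pb == sizeof rawB, "data member pointer ~ ptrdiff_t");
  std::memcpy(&rawB, &pb, sizeof pb);
  std::memcpy(&rawNull, &pnull, sizeof pnull);
  return (rawB == 4 && rawNull == -1) ? 0 : 1;
}
```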
+ return getPtrDiffCIRTy(lm); +} + +mlir::TypedAttr LowerItaniumCXXABI::lowerDataMemberConstant( + cir::DataMemberAttr attr, const mlir::DataLayout &layout, + const mlir::TypeConverter &typeConverter) const { + uint64_t memberOffset; + if (attr.isNullPtr()) { + // Itanium C++ ABI 2.3: + // A NULL pointer is represented as -1. + memberOffset = -1ull; + } else { + // Itanium C++ ABI 2.3: + // A pointer to data member is an offset from the base address of + // the class object containing it, represented as a ptrdiff_t + unsigned memberIndex = attr.getMemberIndex().value(); + memberOffset = + attr.getType().getClassTy().getElementOffset(layout, memberIndex); + } + + mlir::Type abiTy = lowerDataMemberType(attr.getType(), typeConverter); + return cir::IntAttr::get(abiTy, memberOffset); +} + +} // namespace cir diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.cpp new file mode 100644 index 0000000000000..7576e20ac8f54 --- /dev/null +++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.cpp @@ -0,0 +1,87 @@ +//===--- LowerModule.cpp - Lower CIR Module to a Target -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file partially mimics clang/lib/CodeGen/CodeGenModule.cpp. The queries +// are adapted to operate on the CIR dialect, however. +// +//===----------------------------------------------------------------------===// + +#include "LowerModule.h" +#include "CIRCXXABI.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/PatternMatch.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/TargetInfo.h" +#include "clang/Basic/TargetOptions.h" +#include "clang/CIR/MissingFeatures.h" +#include "llvm/Support/ErrorHandling.h" + +namespace cir { + +static std::unique_ptr createCXXABI(LowerModule &lm) { + switch (lm.getCXXABIKind()) { + case clang::TargetCXXABI::AppleARM64: + case clang::TargetCXXABI::Fuchsia: + case clang::TargetCXXABI::GenericAArch64: + case clang::TargetCXXABI::GenericARM: + case clang::TargetCXXABI::iOS: + case clang::TargetCXXABI::WatchOS: + case clang::TargetCXXABI::GenericMIPS: + case clang::TargetCXXABI::GenericItanium: + case clang::TargetCXXABI::WebAssembly: + case clang::TargetCXXABI::XL: + return createItaniumCXXABI(lm); + case clang::TargetCXXABI::Microsoft: + llvm_unreachable("Windows ABI NYI"); + } + + llvm_unreachable("invalid C++ ABI kind"); +} + +LowerModule::LowerModule(clang::LangOptions langOpts, + clang::CodeGenOptions codeGenOpts, + mlir::ModuleOp &module, + std::unique_ptr target, + mlir::PatternRewriter &rewriter) + : module(module), target(std::move(target)), abi(createCXXABI(*this)), + rewriter(rewriter) {} + +// TODO: not to create it every time +std::unique_ptr +createLowerModule(mlir::ModuleOp module, mlir::PatternRewriter &rewriter) { + // Fetch target information. + llvm::Triple triple(mlir::cast( + module->getAttr(cir::CIRDialect::getTripleAttrName())) + .getValue()); + clang::TargetOptions targetOptions; + targetOptions.Triple = triple.str(); + auto targetInfo = clang::targets::AllocateTarget(triple, targetOptions); + + // FIXME(cir): This just uses the default language options. We need to account + // for custom options. + // Create context. 
+ assert(!cir::MissingFeatures::lowerModuleLangOpts()); + clang::LangOptions langOpts; + + // FIXME(cir): This just uses the default code generation options. We need to + // account for custom options. + assert(!cir::MissingFeatures::lowerModuleCodeGenOpts()); + clang::CodeGenOptions codeGenOpts; + + if (auto optInfo = mlir::cast_if_present( + module->getAttr(cir::CIRDialect::getOptInfoAttrName()))) { + codeGenOpts.OptimizationLevel = optInfo.getLevel(); + codeGenOpts.OptimizeSize = optInfo.getSize(); + } + + return std::make_unique(std::move(langOpts), + std::move(codeGenOpts), module, + std::move(targetInfo), rewriter); +} + +} // namespace cir diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.h b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.h new file mode 100644 index 0000000000000..440e307f571e9 --- /dev/null +++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.h @@ -0,0 +1,55 @@ +//===--- LowerModule.h - Abstracts CIR's module lowering --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file partially mimics clang/lib/CodeGen/CodeGenModule.h. The queries are +// adapted to operate on the CIR dialect, however. +// +//===----------------------------------------------------------------------===// + +#ifndef CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_LOWERMODULE_H +#define CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_LOWERMODULE_H + +#include "CIRCXXABI.h" +#include "mlir/IR/BuiltinOps.h" +#include "clang/Basic/CodeGenOptions.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/TargetInfo.h" +#include "clang/CIR/Dialect/IR/CIRDialect.h" +#include "clang/CIR/MissingFeatures.h" +#include + +namespace cir { + +class LowerModule { + mlir::ModuleOp module; + const std::unique_ptr target; + std::unique_ptr abi; + [[maybe_unused]] mlir::PatternRewriter &rewriter; + +public: + LowerModule(clang::LangOptions langOpts, clang::CodeGenOptions codeGenOpts, + mlir::ModuleOp &module, std::unique_ptr target, + mlir::PatternRewriter &rewriter); + ~LowerModule() = default; + + clang::TargetCXXABI::Kind getCXXABIKind() const { + assert(!cir::MissingFeatures::lowerModuleLangOpts()); + return target->getCXXABI().getKind(); + } + + CIRCXXABI &getCXXABI() const { return *abi; } + const clang::TargetInfo &getTarget() const { return *target; } + mlir::MLIRContext *getMLIRContext() { return module.getContext(); } +}; + +std::unique_ptr createLowerModule(mlir::ModuleOp module, + mlir::PatternRewriter &rewriter); + +} // namespace cir + +#endif // CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_LOWERMODULE_H diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt b/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt index 7baff3412a84e..2525e02ae8f85 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt +++ b/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt @@ -18,7 +18,12 @@ add_clang_library(clangCIRLoweringDirectToLLVM clangCIRLoweringCommon ${dialect_libs} MLIRCIR + MLIRCIRTargetLowering MLIRBuiltinToLLVMIRTranslation MLIRLLVMToLLVMIRTranslation MLIRIR ) + +target_include_directories(clangCIRLoweringDirectToLLVM PRIVATE + ${CLANG_SOURCE_DIR}/lib/CIR/Dialect/Transforms/TargetLowering + ) diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp 
b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index cc911cfc7d778..0ad3360e1357e 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -1755,6 +1755,14 @@ mlir::LogicalResult CIRToLLVMConstantOpLowering::matchAndRewrite( return mlir::success(); } attr = op.getValue(); + } else if (mlir::isa(op.getType())) { + assert(lowerMod && "lower module is not available"); + auto dataMember = mlir::cast(op.getValue()); + mlir::DataLayout layout(op->getParentOfType()); + mlir::TypedAttr abiValue = lowerMod->getCXXABI().lowerDataMemberConstant( + dataMember, layout, *typeConverter); + rewriter.replaceOpWithNewOp(op, abiValue); + return mlir::success(); } else if (const auto arrTy = mlir::dyn_cast(op.getType())) { const auto constArr = mlir::dyn_cast(op.getValue()); if (!constArr && !isa(op.getValue())) @@ -2050,7 +2058,11 @@ mlir::LogicalResult CIRToLLVMGetGlobalOpLowering::matchAndRewrite( mlir::Operation *newop = mlir::LLVM::AddressOfOp::create( rewriter, op.getLoc(), type, op.getName()); - assert(!cir::MissingFeatures::opGlobalThreadLocal()); + if (op.getTls()) { + // Handle access to TLS via intrinsic. + newop = mlir::LLVM::ThreadlocalAddressOp::create(rewriter, op.getLoc(), + type, newop->getResult(0)); + } rewriter.replaceOp(op, newop); return mlir::success(); @@ -2071,8 +2083,7 @@ void CIRToLLVMGlobalOpLowering::setupRegionInitializedLLVMGlobalOp( assert(!cir::MissingFeatures::addressSpace()); const unsigned addrSpace = 0; const bool isDsoLocal = op.getDsoLocal(); - assert(!cir::MissingFeatures::opGlobalThreadLocal()); - const bool isThreadLocal = false; + const bool isThreadLocal = (bool)op.getTlsModelAttr(); const uint64_t alignment = op.getAlignment().value_or(0); const mlir::LLVM::Linkage linkage = convertLinkage(op.getLinkage()); const StringRef symbol = op.getSymName(); @@ -2132,8 +2143,7 @@ mlir::LogicalResult CIRToLLVMGlobalOpLowering::matchAndRewrite( assert(!cir::MissingFeatures::addressSpace()); const unsigned addrSpace = 0; const bool isDsoLocal = op.getDsoLocal(); - assert(!cir::MissingFeatures::opGlobalThreadLocal()); - const bool isThreadLocal = false; + const bool isThreadLocal = (bool)op.getTlsModelAttr(); const uint64_t alignment = op.getAlignment().value_or(0); const mlir::LLVM::Linkage linkage = convertLinkage(op.getLinkage()); const StringRef symbol = op.getSymName(); @@ -2839,8 +2849,20 @@ mlir::LogicalResult CIRToLLVMSelectOpLowering::matchAndRewrite( return mlir::success(); } +std::unique_ptr prepareLowerModule(mlir::ModuleOp module) { + mlir::PatternRewriter rewriter{module->getContext()}; + // If the triple is not present, e.g. CIR modules parsed from text, we + // cannot init LowerModule properly. This happens in some lowering tests, + // but it should not happen in real compilation. + assert(!cir::MissingFeatures::makeTripleAlwaysPresent()); + if (!module->hasAttr(cir::CIRDialect::getTripleAttrName())) + return {}; + return cir::createLowerModule(module, rewriter); +} + static void prepareTypeConverter(mlir::LLVMTypeConverter &converter, - mlir::DataLayout &dataLayout) { + mlir::DataLayout &dataLayout, + cir::LowerModule *lowerModule) { converter.addConversion([&](cir::PointerType type) -> mlir::Type { unsigned addrSpace = type.getAddrSpace() ? 
type.getAddrSpace().getValue().getUInt() : 0; @@ -2850,6 +2872,13 @@ static void prepareTypeConverter(mlir::LLVMTypeConverter &converter, assert(!cir::MissingFeatures::addressSpace()); return mlir::LLVM::LLVMPointerType::get(type.getContext()); }); + converter.addConversion( + [&, lowerModule](cir::DataMemberType type) -> mlir::Type { + assert(lowerModule && "CXXABI is not available"); + mlir::Type abiType = + lowerModule->getCXXABI().lowerDataMemberType(type, converter); + return converter.convertType(abiType); + }); converter.addConversion([&](cir::ArrayType type) -> mlir::Type { mlir::Type ty = convertTypeForMemory(converter, dataLayout, type.getElementType()); @@ -3118,7 +3147,8 @@ void ConvertCIRToLLVMPass::runOnOperation() { mlir::ModuleOp module = getOperation(); mlir::DataLayout dl(module); mlir::LLVMTypeConverter converter(&getContext()); - prepareTypeConverter(converter, dl); + std::unique_ptr lowerModule = prepareLowerModule(module); + prepareTypeConverter(converter, dl, lowerModule.get()); mlir::RewritePatternSet patterns(&getContext()); @@ -3126,7 +3156,7 @@ void ConvertCIRToLLVMPass::runOnOperation() { #define GET_LLVM_LOWERING_PATTERNS_LIST #include "clang/CIR/Dialect/IR/CIRLowering.inc" #undef GET_LLVM_LOWERING_PATTERNS_LIST - >(converter, patterns.getContext(), dl); + >(converter, patterns.getContext(), lowerModule.get(), dl); processCIRAttrs(module); diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h index 0591de545b81d..d32f8603ee0be 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h @@ -12,6 +12,8 @@ #ifndef CLANG_CIR_LOWERTOLLVM_H #define CLANG_CIR_LOWERTOLLVM_H +#include "LowerModule.h" + #include "mlir/Dialect/LLVMIR/LLVMAttrs.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Transforms/DialectConversion.h" diff --git a/clang/lib/DependencyScanning/DependencyScannerImpl.cpp b/clang/lib/DependencyScanning/DependencyScannerImpl.cpp index acd05cc50daa8..beec192e3031d 100644 --- a/clang/lib/DependencyScanning/DependencyScannerImpl.cpp +++ b/clang/lib/DependencyScanning/DependencyScannerImpl.cpp @@ -367,7 +367,7 @@ dependencies::createDiagOptions(ArrayRef CommandLine) { return DiagOpts; } -DignosticsEngineWithDiagOpts::DignosticsEngineWithDiagOpts( +DiagnosticsEngineWithDiagOpts::DiagnosticsEngineWithDiagOpts( ArrayRef CommandLine, IntrusiveRefCntPtr FS, DiagnosticConsumer &DC) { std::vector CCommandLine(CommandLine.size(), nullptr); @@ -725,7 +725,7 @@ bool CompilerInstanceWithContext::initialize(DiagnosticConsumer *DC) { std::tie(OverlayFS, CommandLine) = initVFSForByNameScanning( Worker.DepFS, CommandLine, CWD, "ScanningByName"); - DiagEngineWithCmdAndOpts = std::make_unique( + DiagEngineWithCmdAndOpts = std::make_unique( CommandLine, OverlayFS, *DiagConsumer); std::tie(Driver, Compilation) = buildCompilation( diff --git a/clang/lib/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/DependencyScanning/DependencyScanningWorker.cpp index 7b03abd8e3138..5982dcf160083 100644 --- a/clang/lib/DependencyScanning/DependencyScanningWorker.cpp +++ b/clang/lib/DependencyScanning/DependencyScanningWorker.cpp @@ -100,7 +100,7 @@ bool DependencyScanningWorker::scanDependencies( FS = std::move(OverlayFS); } - DignosticsEngineWithDiagOpts DiagEngineWithCmdAndOpts(CommandLine, FS, DC); + DiagnosticsEngineWithDiagOpts DiagEngineWithCmdAndOpts(CommandLine, FS, DC); DependencyScanningAction Action(Service, WorkingDirectory, Consumer, 
Controller, DepFS); diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 8cd2aadd76493..613438dcacee5 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5787,6 +5787,18 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-fenable-matrix"); CmdArgs.push_back("-mllvm"); CmdArgs.push_back("-enable-matrix"); + // Only handle default layout if matrix is enabled + if (const Arg *A = Args.getLastArg(options::OPT_fmatrix_memory_layout_EQ)) { + StringRef Val = A->getValue(); + if (Val == "row-major" || Val == "column-major") { + CmdArgs.push_back(Args.MakeArgString("-fmatrix-memory-layout=" + Val)); + CmdArgs.push_back("-mllvm"); + CmdArgs.push_back(Args.MakeArgString("-matrix-default-layout=" + Val)); + + } else { + D.Diag(diag::err_drv_invalid_value) << A->getAsString(Args) << Val; + } + } } CodeGenOptions::FramePointerKind FPKeepKind = diff --git a/clang/lib/ExtractAPI/DeclarationFragments.cpp b/clang/lib/ExtractAPI/DeclarationFragments.cpp index e5eda46df8056..7f2778713eade 100644 --- a/clang/lib/ExtractAPI/DeclarationFragments.cpp +++ b/clang/lib/ExtractAPI/DeclarationFragments.cpp @@ -633,7 +633,10 @@ DeclarationFragmentsBuilder::getFragmentsForParam(const ParmVarDecl *Param) { DeclarationFragments::FragmentKind::InternalParam); } else { Fragments.append(std::move(TypeFragments)); - if (!T->isAnyPointerType() && !T->isBlockPointerType()) + // If the type is a type alias, append the space + // even if the underlying type is a pointer type. + if (T->isTypedefNameType() || + (!T->isAnyPointerType() && !T->isBlockPointerType())) Fragments.appendSpace(); Fragments .append(Param->getName(), diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 3d37461e3176c..75804413d89d4 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4018,6 +4018,15 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, StringRef S = llvm::getAllocTokenModeAsString(*Opts.AllocTokenMode); GenerateArg(Consumer, OPT_falloc_token_mode_EQ, S); } + // Generate args for matrix types. + if (Opts.MatrixTypes) { + if (Opts.getDefaultMatrixMemoryLayout() == + LangOptions::MatrixMemoryLayout::MatrixColMajor) + GenerateArg(Consumer, OPT_fmatrix_memory_layout_EQ, "column-major"); + if (Opts.getDefaultMatrixMemoryLayout() == + LangOptions::MatrixMemoryLayout::MatrixRowMajor) + GenerateArg(Consumer, OPT_fmatrix_memory_layout_EQ, "row-major"); + } } bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, @@ -4670,6 +4679,27 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, Diags.Report(diag::err_drv_invalid_value) << Arg->getAsString(Args) << S; } + // Enable options for matrix types. 
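// Editor's note (illustrative, not part of the patch): with the driver change
// above, a hypothetical invocation such as
//   clang -fenable-matrix -fmatrix-memory-layout=row-major foo.c
// is expected to forward both the cc1 flag -fmatrix-memory-layout=row-major
// and the backend flag pair -mllvm -matrix-default-layout=row-major, keeping
// the frontend's LangOptions default and LLVM's matrix lowering in sync. The
// cc1 parsing that follows mirrors that mapping and diagnoses a conflicting
// -mllvm -matrix-default-layout= override.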
+ if (Opts.MatrixTypes) { + if (const Arg *A = Args.getLastArg(OPT_fmatrix_memory_layout_EQ)) { + StringRef ClangValue = A->getValue(); + if (ClangValue == "row-major") + Opts.setDefaultMatrixMemoryLayout( + LangOptions::MatrixMemoryLayout::MatrixRowMajor); + else + Opts.setDefaultMatrixMemoryLayout( + LangOptions::MatrixMemoryLayout::MatrixColMajor); + + for (Arg *A : Args.filtered(options::OPT_mllvm)) { + StringRef OptValue = A->getValue(); + if (OptValue.consume_front("-matrix-default-layout=") && + ClangValue != OptValue) + Diags.Report(diag::err_conflicting_matrix_layout_flags) + << ClangValue << OptValue; + } + } + } + // Validate options for HLSL if (Opts.HLSL) { // TODO: Revisit restricting SPIR-V to logical once we've figured out how to diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h index fdb57c7c9e27b..b2215b72c57bc 100644 --- a/clang/lib/Headers/avx10_2_512niintrin.h +++ b/clang/lib/Headers/avx10_2_512niintrin.h @@ -185,8 +185,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuuds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsud_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v32hi)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -206,8 +206,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsud_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsuds_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v32hi)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwsuds_epi32( @@ -227,8 +227,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsuds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusd_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v32hu)__B, + (__v32hi)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -248,8 +248,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusd_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusds_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v32hu)__B, + (__v32hi)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwusds_epi32( @@ -269,8 +269,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuud_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v32hu)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -290,8 +290,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuud_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuuds_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v32hu)__B, + (__v32hu)__C); 
} static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwuuds_epi32( diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h index a1a0338a69e0d..4b8a199af32e5 100644 --- a/clang/lib/Headers/avx512vlvnniintrin.h +++ b/clang/lib/Headers/avx512vlvnniintrin.h @@ -80,8 +80,8 @@ /// ENDFOR /// DST[MAX:256] := 0 /// \endcode -#define _mm256_dpwssd_epi32(S, A, B) \ - ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) +#define _mm256_dpwssd_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v16hi)(A), (__v16hi)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -98,8 +98,9 @@ /// ENDFOR /// DST[MAX:256] := 0 /// \endcode -#define _mm256_dpwssds_epi32(S, A, B) \ - ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) +#define _mm256_dpwssds_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v16hi)(A), \ + (__v16hi)(B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed @@ -157,8 +158,8 @@ /// ENDFOR /// DST[MAX:128] := 0 /// \endcode -#define _mm_dpwssd_epi32(S, A, B) \ - ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) +#define _mm_dpwssd_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v8hi)(A), (__v8hi)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -175,8 +176,8 @@ /// ENDFOR /// DST[MAX:128] := 0 /// \endcode -#define _mm_dpwssds_epi32(S, A, B) \ - ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) +#define _mm_dpwssds_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v8hi)(A), (__v8hi)(B))) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) diff --git a/clang/lib/Headers/avx512vnniintrin.h b/clang/lib/Headers/avx512vnniintrin.h index c386923360de6..2ce88efe4a04f 100644 --- a/clang/lib/Headers/avx512vnniintrin.h +++ b/clang/lib/Headers/avx512vnniintrin.h @@ -68,8 +68,8 @@ _mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v32hi)__A, + (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -91,8 +91,8 @@ _mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v32hi)__A, + (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avxvnniint16intrin.h b/clang/lib/Headers/avxvnniint16intrin.h index 805d249911c17..98d94ee3fcf3a 100644 --- a/clang/lib/Headers/avxvnniint16intrin.h +++ b/clang/lib/Headers/avxvnniint16intrin.h @@ -16,9 +16,10 @@ #define __AVXVNNIINT16INTRIN_H /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a 
__A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W, and store the packed 32-bit +/// results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -40,19 +41,21 @@ /// \code{.operation} /// FOR j := 0 to 3 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 /// ENDFOR /// dst[MAX:128] := 0 /// \endcode #define _mm_dpwsud_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v8hi)(__A), \ + (__v8hu)(__B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W, and store the packed 32-bit +/// results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -74,20 +77,21 @@ /// \code{.operation} /// FOR j := 0 to 7 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) -/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +/// tmp2.dword := +/// SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 /// ENDFOR /// dst[MAX:256] := 0 /// \endcode #define _mm256_dpwsud_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v16hi)(__A), \ + (__v16hu)(__B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W with signed saturation, and store the packed -/// 32-bit results in \a dst. +/// corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W with signed saturation, and store +/// the packed 32-bit results in \a dst.
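/// (Editor's worked example, illustrative values only: for the
/// non-saturating form above, with signed __A.word[0] = -2, __A.word[1] = 3
/// and unsigned __B.word[0] = 4, __B.word[1] = 5, lane 0 computes
/// (-2)*4 + 3*5 = 7, so dst.dword[0] = __W.dword[0] + 7.)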
/// /// \headerfile <immintrin.h> /// @@ -109,20 +113,22 @@ /// \code{.operation} /// FOR j := 0 to 3 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) /// ENDFOR /// dst[MAX:128] := 0 /// \endcode #define _mm_dpwsuds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v8hi)(__A), \ + (__v8hu)(__B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W with signed saturation, and store the packed -/// 32-bit results in \a dst. +/// corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W with signed saturation, and store +/// the packed 32-bit results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -144,19 +150,21 @@ /// \code{.operation} /// FOR j := 0 to 7 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) /// ENDFOR /// dst[MAX:256] := 0 /// \endcode #define _mm256_dpwsuds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v16hi)(__A), \ + (__v16hu)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding signed 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W, and store the packed 32-bit +/// results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -178,19 +186,21 @@ /// \code{.operation} /// FOR j := 0 to 3 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 /// ENDFOR /// dst[MAX:128] := 0 /// \endcode #define _mm_dpwusd_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v8hu)(__A), \ + (__v8hi)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results.
Sum these 2 results with the corresponding -/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding signed 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W, and store the packed 32-bit +/// results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -212,20 +222,21 @@ /// \code{.operation} /// FOR j := 0 to 7 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 /// ENDFOR /// dst[MAX:256] := 0 /// \endcode #define _mm256_dpwusd_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v16hu)(__A), \ + (__v16hi)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W with signed saturation, and store the packed -/// 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding signed 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W with signed saturation, and +/// store the packed 32-bit results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -233,7 +244,7 @@ /// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B) /// \endcode /// -/// This intrinsic corresponds to the \c VPDPWSUDS instruction. +/// This intrinsic corresponds to the \c VPDPWUSDS instruction. /// /// \param __W /// A 128-bit vector of [4 x int]. @@ -247,20 +258,21 @@ /// \code{.operation} /// FOR j := 0 to 3 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) /// ENDFOR /// dst[MAX:128] := 0 /// \endcode #define _mm_dpwusds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v8hu)(__A), \ + (__v8hi)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W with signed saturation, and store the packed -/// 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding signed 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W with signed saturation, and +/// store the packed 32-bit results in \a dst.
/// /// \headerfile <immintrin.h> /// @@ -268,7 +280,7 @@ /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) /// \endcode /// -/// This intrinsic corresponds to the \c VPDPWSUDS instruction. +/// This intrinsic corresponds to the \c VPDPWUSDS instruction. /// /// \param __W /// A 256-bit vector of [8 x int]. @@ -282,19 +294,21 @@ /// \code{.operation} /// FOR j := 0 to 7 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) /// ENDFOR /// dst[MAX:256] := 0 /// \endcode #define _mm256_dpwusds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v16hu)(__A), \ + (__v16hi)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W, and store the packed 32-bit +/// results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -305,30 +319,32 @@ /// This intrinsic corresponds to the \c VPDPWUUD instruction. /// /// \param __W -/// A 128-bit vector of [4 x unsigned int]. +/// A 128-bit vector of [4 x int]. /// \param __A /// A 128-bit vector of [8 x unsigned short]. /// \param __B /// A 128-bit vector of [8 x unsigned short]. /// \returns -/// A 128-bit vector of [4 x unsigned int]. +/// A 128-bit vector of [4 x int]. /// /// \code{.operation} /// FOR j := 0 to 3 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 /// ENDFOR /// dst[MAX:128] := 0 /// \endcode #define _mm_dpwuud_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v8hu)(__A), \ + (__v8hu)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W, and store the packed 32-bit +/// results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -339,31 +355,32 @@ /// This intrinsic corresponds to the \c VPDPWUUD instruction. /// /// \param __W -/// A 256-bit vector of [8 x unsigned int]. +/// A 256-bit vector of [8 x int]. /// \param __A /// A 256-bit vector of [16 x unsigned short].
/// \param __B /// A 256-bit vector of [16 x unsigned short]. /// \returns -/// A 256-bit vector of [8 x unsigned int]. +/// A 256-bit vector of [8 x int]. /// /// \code{.operation} /// FOR j := 0 to 7 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 /// ENDFOR /// dst[MAX:256] := 0 /// \endcode #define _mm256_dpwuud_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v16hu)(__A), \ + (__v16hu)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W with signed saturation, and store the packed -/// 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W with signed saturation, and store +/// the packed 32-bit results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -371,34 +388,35 @@ /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B) /// \endcode /// -/// This intrinsic corresponds to the \c VPDPWSUDS instruction. +/// This intrinsic corresponds to the \c VPDPWUUDS instruction. /// /// \param __W -/// A 128-bit vector of [4 x unsigned int]. +/// A 128-bit vector of [4 x int]. /// \param __A /// A 128-bit vector of [8 x unsigned short]. /// \param __B /// A 128-bit vector of [8 x unsigned short]. /// \returns -/// A 128-bit vector of [4 x unsigned int]. +/// A 128-bit vector of [4 x int]. /// /// \code{.operation} /// FOR j := 0 to 3 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) /// ENDFOR /// dst[MAX:128] := 0 /// \endcode #define _mm_dpwuuds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v8hu)(__A), \ + (__v8hu)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W with signed saturation, and store the packed -/// 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W with signed saturation, and store +/// the packed 32-bit results in \a dst. /// /// \headerfile <immintrin.h> /// @@ -406,27 +424,28 @@ /// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) /// \endcode /// -/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
+/// This intrinsic corresponds to the \c VPDPWUUDS instruction. /// /// \param __W -/// A 256-bit vector of [8 x unsigned int]. +/// A 256-bit vector of [8 x int]. /// \param __A /// A 256-bit vector of [16 x unsigned short]. /// \param __B /// A 256-bit vector of [16 x unsigned short]. /// \returns -/// A 256-bit vector of [8 x unsigned int]. +/// A 256-bit vector of [8 x int]. /// /// \code{.operation} /// FOR j := 0 to 7 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) /// ENDFOR /// dst[MAX:256] := 0 /// \endcode #define _mm256_dpwuuds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v16hu)(__A), \ + (__v16hu)(__B))) #endif // __AVXVNNIINT16INTRIN_H diff --git a/clang/lib/Headers/avxvnniintrin.h b/clang/lib/Headers/avxvnniintrin.h index 3c4c44a930fe2..1d2e8c906effc 100644 --- a/clang/lib/Headers/avxvnniintrin.h +++ b/clang/lib/Headers/avxvnniintrin.h @@ -109,7 +109,8 @@ _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B); + return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v16hi)__A, + (__v16hi)__B); } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with @@ -130,7 +131,8 @@ _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B); + return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v16hi)__A, + (__v16hi)__B); } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with @@ -199,7 +201,8 @@ _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B); + return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v8hi)__A, + (__v8hi)__B); } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with @@ -220,7 +223,8 @@ _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B); + return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v8hi)__A, + (__v8hi)__B); } #undef __DEFAULT_FN_ATTRS128 diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 223ea2cb99c5d..9237825b9ae06 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -16513,6 +16513,13 @@ ExprResult Sema::BuiltinMatrixColumnMajorLoad(CallExpr *TheCall, return ExprError(); } + if (getLangOpts().getDefaultMatrixMemoryLayout() != + LangOptions::MatrixMemoryLayout::MatrixColMajor) { + Diag(TheCall->getBeginLoc(), diag::err_builtin_matrix_major_order_disabled) + << /*column*/ 1 << /*load*/ 0; + return 
ExprError(); + } + if (checkArgCount(TheCall, 4)) return ExprError(); @@ -16625,6 +16632,18 @@ ExprResult Sema::BuiltinMatrixColumnMajorLoad(CallExpr *TheCall, ExprResult Sema::BuiltinMatrixColumnMajorStore(CallExpr *TheCall, ExprResult CallResult) { + if (!getLangOpts().MatrixTypes) { + Diag(TheCall->getBeginLoc(), diag::err_builtin_matrix_disabled); + return ExprError(); + } + + if (getLangOpts().getDefaultMatrixMemoryLayout() != + LangOptions::MatrixMemoryLayout::MatrixColMajor) { + Diag(TheCall->getBeginLoc(), diag::err_builtin_matrix_major_order_disabled) + << /*column*/ 1 << /*store*/ 1; + return ExprError(); + } + if (checkArgCount(TheCall, 3)) return ExprError(); diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 3bc748969065a..1cadaf54b7bdd 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -6627,6 +6627,7 @@ void Sema::checkClassLevelDLLAttribute(CXXRecordDecl *Class) { auto *Ctor = dyn_cast<CXXConstructorDecl>(MD); if ((MD->isMoveAssignmentOperator() || (Ctor && Ctor->isMoveConstructor())) && + getLangOpts().isCompatibleWithMSVC() && !getLangOpts().isCompatibleWithMSVC(LangOptions::MSVC2015)) continue; diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp index 84adbf318e9f8..e46bba7be4aea 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp @@ -321,6 +321,51 @@ bool isExprToGetCheckedPtrCapableMember(const clang::Expr *E) { return result && *result; } +bool isAllocInit(const Expr *E, const Expr **InnerExpr) { + auto *ObjCMsgExpr = dyn_cast<ObjCMessageExpr>(E); + if (auto *POE = dyn_cast<PseudoObjectExpr>(E)) { + if (unsigned ExprCount = POE->getNumSemanticExprs()) { + auto *Expr = POE->getSemanticExpr(ExprCount - 1)->IgnoreParenCasts(); + ObjCMsgExpr = dyn_cast<ObjCMessageExpr>(Expr); + if (InnerExpr) + *InnerExpr = ObjCMsgExpr; + } + } + if (!ObjCMsgExpr) + return false; + auto Selector = ObjCMsgExpr->getSelector(); + auto NameForFirstSlot = Selector.getNameForSlot(0); + if (NameForFirstSlot.starts_with("alloc") || + NameForFirstSlot.starts_with("copy") || + NameForFirstSlot.starts_with("mutableCopy")) + return true; + if (!NameForFirstSlot.starts_with("init") && + !NameForFirstSlot.starts_with("_init")) + return false; + if (!ObjCMsgExpr->isInstanceMessage()) + return false; + auto *Receiver = ObjCMsgExpr->getInstanceReceiver(); + if (!Receiver) + return false; + Receiver = Receiver->IgnoreParenCasts(); + if (auto *Inner = dyn_cast<ObjCMessageExpr>(Receiver)) { + if (InnerExpr) + *InnerExpr = Inner; + auto InnerSelector = Inner->getSelector(); + return InnerSelector.getNameForSlot(0).starts_with("alloc"); + } else if (auto *CE = dyn_cast<CallExpr>(Receiver)) { + if (InnerExpr) + *InnerExpr = CE; + if (auto *Callee = CE->getDirectCallee()) { + if (Callee->getDeclName().isIdentifier()) { + auto CalleeName = Callee->getName(); + return CalleeName.starts_with("alloc"); + } + } + } + return false; +} + class EnsureFunctionVisitor : public ConstStmtVisitor<EnsureFunctionVisitor, bool> { public: diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h index 9fff456b7e8b8..d0a3e471365e2 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h @@ -77,6 +77,10 @@ bool isConstOwnerPtrMemberExpr(const clang::Expr *E); /// supports CheckedPtr. bool isExprToGetCheckedPtrCapableMember(const clang::Expr *E); +/// \returns true if \p E is a [[alloc] init] pattern expression.
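/// (Editor's example, not in the patch: this matches [[SomeObj alloc] init]
/// as well as [allocSomeObj() init], and bare alloc/copy/mutableCopy sends.)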
+/// Sets \p InnerExpr to the inner function call or selector invocation. +bool isAllocInit(const Expr *E, const Expr **InnerExpr = nullptr); + /// \returns true if E is a CXXMemberCallExpr which returns a const smart /// pointer type. class EnsureFunctionAnalysis { diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp index 791e70998477f..dcc14a0aecdf7 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp @@ -177,16 +177,11 @@ class RawPtrRefCallArgsChecker if (BR->getSourceManager().isInSystemHeader(E->getExprLoc())) return; - auto Selector = E->getSelector(); if (auto *Receiver = E->getInstanceReceiver()) { std::optional<bool> IsUnsafe = isUnsafePtr(E->getReceiverType()); if (IsUnsafe && *IsUnsafe && !isPtrOriginSafe(Receiver)) { - if (auto *InnerMsg = dyn_cast<ObjCMessageExpr>(Receiver)) { - auto InnerSelector = InnerMsg->getSelector(); - if (InnerSelector.getNameForSlot(0) == "alloc" && - Selector.getNameForSlot(0).starts_with("init")) - return; - } + if (isAllocInit(E)) + return; reportBugOnReceiver(Receiver, D); } } diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp index f60d1936b7584..f3fadeaefc491 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp @@ -587,6 +587,8 @@ class UnretainedLambdaCapturesChecker : public RawPtrRefLambdaCapturesChecker { } std::optional<bool> isUnsafePtr(QualType QT) const final { + if (QT.hasStrongOrWeakObjCLifetime()) + return false; return RTC->isUnretained(QT); } diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp index c13df47920f72..f2235e7c25ab2 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp @@ -433,6 +433,8 @@ class UnretainedLocalVarsChecker final : public RawPtrRefLocalVarsChecker { RTC = RetainTypeChecker(); } std::optional<bool> isUnsafePtr(const QualType T) const final { + if (T.hasStrongOrWeakObjCLifetime()) + return false; return RTC->isUnretained(T); } bool isSafePtr(const CXXRecordDecl *Record) const final { diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp index ace639ce7ab18..0e23ae34ea212 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp @@ -231,8 +231,11 @@ class RawPtrRefMemberChecker // "assign" property doesn't retain even under ARC so treat it as unsafe.
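// Editor's illustration (not part of the patch) of how the checks below
// classify Objective-C properties:
//   @property(nonatomic, strong) NSString *s; // retaining     -> safe
//   @property(nonatomic, weak)   NSString *w; // IsWeak        -> safe
//   @property(nonatomic, assign) NSString *a; // plain pointer -> unsafe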
bool ignoreARC = !PD->isReadOnly() && PD->getSetterKind() == ObjCPropertyDecl::Assign; + bool IsWeak = + PD->getPropertyAttributes() & ObjCPropertyAttribute::kind_weak; + bool HasSafeAttr = PD->isRetaining() || IsWeak; auto IsUnsafePtr = isUnsafePtr(QT, ignoreARC); - return {IsUnsafePtr && *IsUnsafePtr && !PD->isRetaining(), PropType}; + return {IsUnsafePtr && *IsUnsafePtr && !HasSafeAttr, PropType}; } bool shouldSkipDecl(const RecordDecl *RD) const { @@ -363,6 +366,8 @@ class NoUnretainedMemberChecker final : public RawPtrRefMemberChecker { } std::optional<bool> isUnsafePtr(QualType QT, bool ignoreARC) const final { + if (QT.hasStrongOrWeakObjCLifetime()) + return false; return RTC->isUnretained(QT, ignoreARC); } diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp index 955b8d19a820c..2af9067f8f808 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp @@ -355,15 +355,37 @@ class RetainPtrCtorAdoptChecker void visitBinaryOperator(const BinaryOperator *BO) const { if (!BO->isAssignmentOp()) return; - if (!isa<DeclRefExpr>(BO->getLHS())) - return; + auto *LHS = BO->getLHS(); auto *RHS = BO->getRHS()->IgnoreParenCasts(); - const Expr *Inner = nullptr; - if (isAllocInit(RHS, &Inner)) { - CreateOrCopyFnCall.insert(RHS); - if (Inner) - CreateOrCopyFnCall.insert(Inner); + if (isa<DeclRefExpr>(LHS)) { + const Expr *Inner = nullptr; + if (isAllocInit(RHS, &Inner)) { + CreateOrCopyFnCall.insert(RHS); + if (Inner) + CreateOrCopyFnCall.insert(Inner); + } + return; } + auto *UO = dyn_cast<UnaryOperator>(LHS); + if (!UO) + return; + auto OpCode = UO->getOpcode(); + if (OpCode != UO_Deref) + return; + auto *DerefTarget = UO->getSubExpr(); + if (!DerefTarget) + return; + DerefTarget = DerefTarget->IgnoreParenCasts(); + auto *DRE = dyn_cast<DeclRefExpr>(DerefTarget); + if (!DRE) + return; + auto *Decl = DRE->getDecl(); + if (!Decl) + return; + if (!isa<ParmVarDecl>(Decl) || !isCreateOrCopy(RHS)) + return; + if (Decl->hasAttr<CFReturnsRetainedAttr>()) + CreateOrCopyFnCall.insert(RHS); } void visitReturnStmt(const ReturnStmt *RS, const Decl *DeclWithIssue) const { @@ -423,50 +445,6 @@ class RetainPtrCtorAdoptChecker return std::nullopt; } - bool isAllocInit(const Expr *E, const Expr **InnerExpr = nullptr) const { - auto *ObjCMsgExpr = dyn_cast<ObjCMessageExpr>(E); - if (auto *POE = dyn_cast<PseudoObjectExpr>(E)) { - if (unsigned ExprCount = POE->getNumSemanticExprs()) { - auto *Expr = POE->getSemanticExpr(ExprCount - 1)->IgnoreParenCasts(); - ObjCMsgExpr = dyn_cast<ObjCMessageExpr>(Expr); - if (InnerExpr) - *InnerExpr = ObjCMsgExpr; - } - } - if (!ObjCMsgExpr) - return false; - auto Selector = ObjCMsgExpr->getSelector(); - auto NameForFirstSlot = Selector.getNameForSlot(0); - if (NameForFirstSlot == "alloc" || NameForFirstSlot.starts_with("copy") || - NameForFirstSlot.starts_with("mutableCopy")) - return true; - if (!NameForFirstSlot.starts_with("init") && - !NameForFirstSlot.starts_with("_init")) - return false; - if (!ObjCMsgExpr->isInstanceMessage()) - return false; - auto *Receiver = ObjCMsgExpr->getInstanceReceiver(); - if (!Receiver) - return false; - Receiver = Receiver->IgnoreParenCasts(); - if (auto *Inner = dyn_cast<ObjCMessageExpr>(Receiver)) { - if (InnerExpr) - *InnerExpr = Inner; - auto InnerSelector = Inner->getSelector(); - return InnerSelector.getNameForSlot(0) == "alloc"; - } else if (auto *CE = dyn_cast<CallExpr>(Receiver)) { - if (InnerExpr) - *InnerExpr = CE; - if (auto *Callee = CE->getDirectCallee()) { - if (Callee->getDeclName().isIdentifier()) { -
auto CalleeName = Callee->getName(); - return CalleeName.starts_with("alloc"); - } - } - } - return false; - } - bool isCreateOrCopy(const Expr *E) const { auto *CE = dyn_cast<CallExpr>(E); if (!CE) diff --git a/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm b/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm index 45705615f3196..427affdbbd601 100644 --- a/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm +++ b/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm @@ -190,6 +190,14 @@ void adopt_retainptr() { auto bar = adoptNS([allocSomeObj() init]); } +CFTypeRef make_cf_obj() CF_RETURNS_RETAINED { + return CFArrayCreateMutable(kCFAllocatorDefault, 1); +} + +void get_cf_obj(CFTypeRef* CF_RETURNS_RETAINED result) { + *result = CFArrayCreateMutable(kCFAllocatorDefault, 1); +} + RetainPtr<CFArrayRef> return_arg(CFArrayRef arg) { return arg; } diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm index 8bef24f93ceed..cfc214fae33e5 100644 --- a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm +++ b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm @@ -625,6 +625,8 @@ void foo() { } // namespace template_function +SomeObj *allocObj(); + @interface TestObject : NSObject - (void)doWork:(NSString *)msg, ...; - (void)doWorkOnSelf; @@ -647,6 +649,7 @@ - (void)doWorkOnSelf { [self doWork:__null]; [self doWork:nil]; [NSApp run]; + adoptNS([allocObj() init]); } - (SomeObj *)getSomeObj { diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak-arc.mm b/clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak-arc.mm new file mode 100644 index 0000000000000..a52bc7c9a5572 --- /dev/null +++ b/clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak-arc.mm @@ -0,0 +1,22 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UnretainedLambdaCapturesChecker -fobjc-runtime-has-weak -fobjc-weak -fobjc-arc -verify %s +// expected-no-diagnostics + +#include "objc-mock-types.h" + +void someFunction(); +template <typename Callback> void call(Callback callback) { + someFunction(); + callback(); +} + +NSString *provideStr(); +SomeObj *provideSomeObj(); + +void foo() { + __weak NSString *weakStr = provideStr(); + __weak SomeObj *weakObj = provideSomeObj(); + auto lambda = [weakStr, weakObj]() { + return [weakStr length] + [weakObj value]; + }; + call(lambda); +} diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak.mm b/clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak.mm new file mode 100644 index 0000000000000..7439d7f8bb93b --- /dev/null +++ b/clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak.mm @@ -0,0 +1,22 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UnretainedLambdaCapturesChecker -fobjc-runtime-has-weak -fobjc-weak -verify %s +// expected-no-diagnostics + +#include "objc-mock-types.h" + +void someFunction(); +template <typename Callback> void call(Callback callback) { + someFunction(); + callback(); +} + +NSString *provideStr(); +SomeObj *provideSomeObj(); + +void foo() { + __weak NSString *weakStr = provideStr(); + __weak SomeObj *weakObj = provideSomeObj(); + auto lambda = [weakStr, weakObj]() { + return [weakStr length] + [weakObj value]; + }; + call(lambda); +} diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak-arc.mm b/clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak-arc.mm new file mode 100644 index 0000000000000..8c709b5921227 ---
/dev/null +++ b/clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak-arc.mm @@ -0,0 +1,13 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UnretainedLocalVarsChecker -fobjc-runtime-has-weak -fobjc-weak -fobjc-arc -verify %s +// expected-no-diagnostics + +#include "objc-mock-types.h" + +NSString *provideStr(); +SomeObj *provideSomeObj(); + +int foo() { + __weak NSString *weakStr = provideStr(); + __weak SomeObj *weakObj = provideSomeObj(); + return [weakStr length] + [weakObj value]; +} diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak.mm b/clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak.mm new file mode 100644 index 0000000000000..3ac4ff9d1e4cb --- /dev/null +++ b/clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak.mm @@ -0,0 +1,13 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UnretainedLocalVarsChecker -fobjc-runtime-has-weak -fobjc-weak -verify %s +// expected-no-diagnostics + +#include "objc-mock-types.h" + +NSString *provideStr(); +SomeObj *provideSomeObj(); + +int foo() { + __weak NSString *weakStr = provideStr(); + __weak SomeObj *weakObj = provideSomeObj(); + return [weakStr length] + [weakObj value]; +} diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-members-weak-arc.mm b/clang/test/Analysis/Checkers/WebKit/unretained-members-weak-arc.mm new file mode 100644 index 0000000000000..c0aaac09e68d8 --- /dev/null +++ b/clang/test/Analysis/Checkers/WebKit/unretained-members-weak-arc.mm @@ -0,0 +1,29 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.NoUnretainedMemberChecker -fobjc-runtime-has-weak -fobjc-weak -fobjc-arc -verify %s +// expected-no-diagnostics + +#include "objc-mock-types.h" + +struct Foo { + __weak NSString *weakPtr = nullptr; + Foo(); + ~Foo(); + void bar(); +}; + +@interface ObjectWithWeakProperty : NSObject +@property(nonatomic, weak) NSString *weak_prop; +@end + +@implementation ObjectWithWeakProperty +@end + +NS_REQUIRES_PROPERTY_DEFINITIONS +@interface NoSynthesisObjectWithWeakProperty : NSObject +@property(nonatomic, readonly, weak) NSString *weak_prop; +@end + +@implementation NoSynthesisObjectWithWeakProperty +- (NSString *)weak_prop { + return nil; +} +@end diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-members-weak.mm b/clang/test/Analysis/Checkers/WebKit/unretained-members-weak.mm new file mode 100644 index 0000000000000..422cf6189446d --- /dev/null +++ b/clang/test/Analysis/Checkers/WebKit/unretained-members-weak.mm @@ -0,0 +1,31 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.NoUnretainedMemberChecker -fobjc-runtime-has-weak -fobjc-weak -verify %s +// expected-no-diagnostics + +#include "objc-mock-types.h" + +struct Foo { + __weak NSString *weakPtr = nullptr; + Foo(); + ~Foo(); + void bar(); +}; + +@interface ObjectWithWeakProperty : NSObject +@property(nonatomic, weak) NSString *weak_prop; +@end + +@implementation ObjectWithWeakProperty +@end + +NS_REQUIRES_PROPERTY_DEFINITIONS +@interface NoSynthesisObjectWithWeakProperty : NSObject +@property(nonatomic, readonly, weak) NSString *weak_prop; +@end + +@implementation NoSynthesisObjectWithWeakProperty { + __weak NSNumber *weak_ivar; +} +- (NSString *)weak_prop { + return nil; +} +@end diff --git a/clang/test/CIR/CodeGen/kr-func-promote.c b/clang/test/CIR/CodeGen/kr-func-promote.c new file mode 100644 index 0000000000000..5fed068a30398 --- /dev/null +++ b/clang/test/CIR/CodeGen/kr-func-promote.c @@ -0,0 +1,42 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c89 
-fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c89 -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c89 -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +// CIR: cir.func {{.*}}@foo(%arg0: !s32i +// CIR: %0 = cir.alloca !s16i, !cir.ptr<!s16i>, ["x", init] +// CIR: %1 = cir.cast integral %arg0 : !s32i -> !s16i +// CIR: cir.store %1, %0 : !s16i, !cir.ptr<!s16i> +// expected-warning@+1 {{a function definition without a prototype is deprecated}} +void foo(x) short x; {} + +// LLVM: define{{.*}} void @foo(i32 %0) +// LLVM: %[[X_PTR:.*]] = alloca i16, i64 1, align 2 +// LLVM: %[[X:.*]] = trunc i32 %0 to i16 +// LLVM: store i16 %[[X]], ptr %[[X_PTR]], align 2 + +// OGCG: define{{.*}} void @foo(i32 noundef %0) +// OGCG: entry: +// OGCG: %[[X_PTR:.*]] = alloca i16, align 2 +// OGCG: %[[X:.*]] = trunc i32 %0 to i16 +// OGCG: store i16 %[[X]], ptr %[[X_PTR]], align 2 + +// CIR: cir.func{{.*}}no_proto dso_local @bar(%arg0: !cir.double +// CIR: %0 = cir.alloca !cir.float, !cir.ptr<!cir.float>, ["f", init] +// CIR: %1 = cir.cast floating %arg0 : !cir.double -> !cir.float +// CIR: cir.store %1, %0 : !cir.float, !cir.ptr<!cir.float> +// expected-warning@+1 {{a function definition without a prototype is deprecated}} +void bar(f) float f; {} + +// LLVM: define{{.*}} void @bar(double %0) +// LLVM: %[[F_PTR:.*]] = alloca float, i64 1, align 4 +// LLVM: %[[F:.*]] = fptrunc double %0 to float +// LLVM: store float %[[F]], ptr %[[F_PTR]], align 4 + +// OGCG: define{{.*}} void @bar(double noundef %0) +// OGCG: entry: +// OGCG: %[[F_PTR:.*]] = alloca float, align 4 +// OGCG: %[[F:.*]] = fptrunc double %0 to float +// OGCG: store float %[[F]], ptr %[[F_PTR]], align 4 diff --git a/clang/test/CIR/CodeGen/pointer-to-data-member.cpp b/clang/test/CIR/CodeGen/pointer-to-data-member.cpp new file mode 100644 index 0000000000000..b116d21f01170 --- /dev/null +++ b/clang/test/CIR/CodeGen/pointer-to-data-member.cpp @@ -0,0 +1,32 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -fclangir -Wno-unused-value -emit-cir %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -fclangir -Wno-unused-value -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -Wno-unused-value -emit-llvm %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s + +struct Point { + int x; + int y; + int z; +}; + +auto test1() -> int Point::* { + return &Point::y; +} + +// CIR: cir.func {{.*}} @_Z5test1v() -> !cir.data_member<!s32i in !rec_Point> { +// CIR: %[[RETVAL:.*]] = cir.alloca !cir.data_member<!s32i in !rec_Point>, !cir.ptr<!cir.data_member<!s32i in !rec_Point>>, ["__retval"] +// CIR: %[[MEMBER:.*]] = cir.const #cir.data_member<1> : !cir.data_member<!s32i in !rec_Point> +// CIR: cir.store %[[MEMBER]], %[[RETVAL]] : !cir.data_member<!s32i in !rec_Point>, !cir.ptr<!cir.data_member<!s32i in !rec_Point>> +// CIR: %[[RET:.*]] = cir.load %[[RETVAL]] : !cir.ptr<!cir.data_member<!s32i in !rec_Point>>, !cir.data_member<!s32i in !rec_Point> +// CIR: cir.return %[[RET]] : !cir.data_member<!s32i in !rec_Point> + +// LLVM: define {{.*}} i64 @_Z5test1v() +// LLVM: %[[RETVAL:.*]] = alloca i64 +// LLVM: store i64 4, ptr %[[RETVAL]] +// LLVM: %[[RET:.*]] = load i64, ptr %[[RETVAL]] +// LLVM: ret i64 %[[RET]] + +// OGCG: define {{.*}} i64 @_Z5test1v() +// OGCG: ret i64 4 diff --git a/clang/test/CIR/CodeGen/tls.c b/clang/test/CIR/CodeGen/tls.c new file mode 100644 index
0000000000000..582a716f0fa73 --- /dev/null +++ b/clang/test/CIR/CodeGen/tls.c @@ -0,0 +1,29 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ogcg.ll +// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ogcg.ll %s + +extern __thread int b; +// CIR: cir.global "private" external tls_dyn @b : !s32i + +__thread int a; +// CIR: cir.global external tls_dyn @a = #cir.int<0> : !s32i + +int c(void) { return *&b; } +// CIR: cir.func no_inline dso_local @c() -> !s32i +// CIR: %[[TLS_ADDR:.*]] = cir.get_global thread_local @b : !cir.ptr<!s32i> + +// LLVM: @b = external thread_local global i32 +// LLVM: @a = thread_local global i32 0 + +// LLVM-LABEL: @c +// LLVM: = call ptr @llvm.threadlocal.address.p0(ptr @b) + +// OGCG: @b = external thread_local{{.*}} global i32 +// OGCG: @a = thread_local{{.*}} global i32 0 + +// OGCG-LABEL: define{{.*}} @c +// OGCG: call{{.*}} ptr @llvm.threadlocal.address.p0(ptr{{.*}} @b) + diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c index accf1f60d7c32..9ba3e19d41566 100644 --- a/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c +++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c @@ -199,3 +199,36 @@ __m256i test_mm256_mask_i32gather_epi32(__m256i __v1_old, __mmask8 __mask, __m25 // OGCG: @llvm.x86.avx512.mask.gather3siv8.si return _mm256_mmask_i32gather_epi32(__v1_old, __mask, __index, __addr, 2); } + +__m128d test_mm_mask_expand_pd(__m128d __W, __mmask8 __U, __m128d __A) { + // CIR-LABEL: _mm_mask_expand_pd + // CIR: %[[MASK:.*]] = cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>> + // CIR: %[[SHUF:.*]] = cir.vec.shuffle(%[[MASK]], %[[MASK]] : !cir.vector<8 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x !cir.int<u, 1>> + + // LLVM-LABEL: test_mm_mask_expand_pd + // LLVM: %[[BC:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // LLVM: %[[SHUF:.*]] = shufflevector <8 x i1> %[[BC]], <8 x i1> %[[BC]], <2 x i32> <i32 0, i32 1> + + // OGCG-LABEL: test_mm_mask_expand_pd + // OGCG: %[[BC:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // OGCG: %[[SHUF:.*]] = shufflevector <8 x i1> %[[BC]], <8 x i1> %[[BC]], <2 x i32> <i32 0, i32 1> + + return _mm_mask_expand_pd(__W,__U,__A); +} + +__m128d test_mm_maskz_expand_pd(__mmask8 __U, __m128d __A) { + // CIR-LABEL: _mm_maskz_expand_pd + // CIR: %[[MASK:.*]] = cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>> + // CIR: %[[SHUF:.*]] = cir.vec.shuffle(%[[MASK]], %[[MASK]] : !cir.vector<8 x !cir.int<u, 1>>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x !cir.int<u, 1>> + + // LLVM-LABEL: test_mm_maskz_expand_pd + // LLVM: %[[BC:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // LLVM: %[[SHUF:.*]] = shufflevector <8 x i1> %[[BC]], <8 x i1> %[[BC]], <2 x i32> <i32 0, i32 1> + + // OGCG-LABEL: test_mm_maskz_expand_pd + // OGCG: %[[BC:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // OGCG: %[[SHUF:.*]] = shufflevector <8 x i1> %[[BC]], <8 x i1> %[[BC]], <2 x i32> <i32 0, i32 1> + + return _mm_maskz_expand_pd(__U,__A); +} + diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vlvbmi2-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512vlvbmi2-builtins.c new file mode 100644 index 0000000000000..964971d71eb6c --- /dev/null +++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512vlvbmi2-builtins.c @@
-0,0 +1,171 @@ + +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vlvbmi2 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vlvbmi2 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s + +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vlvbmi2 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vlvbmi2 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s + +#include <immintrin.h> + + +__m128i test_mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D) { + // CIR-LABEL: test_mm_mask_compress_epi16 + // %[[MASK8:.+]] = cir.cast bitcast %{{.+}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>> + // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.compress" %{{.+}}, %{{.+}}, %[[MASK8]]: (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int<u, 1>>) -> !cir.vector<8 x !s16i> + // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<8 x !s16i> -> !cir.vector<2 x !s64i> + + // LLVM-LABEL: test_mm_mask_compress_epi16 + // %[[MASK8:.+]] = bitcast i8 %{{.+}} to <8 x i1> + // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.compress.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK8]]) + // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64> + + // OGCG-LABEL: test_mm_mask_compress_epi16 + // %[[MASK8:.+]] = bitcast i8 %{{.+}} to <8 x i1> + // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.compress.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK8]]) + // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64> + + return _mm_mask_compress_epi16(__S, __U, __D); +} + +__m128i test_mm_maskz_compress_epi16(__mmask8 __U, __m128i __D) { + // CIR-LABEL: test_mm_maskz_compress_epi16 + // %[[MASK8:.+]] = cir.cast bitcast %{{.+}} : !u8i -> !cir.vector<8 x !cir.int<u, 1>> + // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.compress" %{{.+}}, %{{.+}}, %[[MASK8]]: (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int<u, 1>>) -> !cir.vector<8 x !s16i> + // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<8 x !s16i> -> !cir.vector<2 x !s64i> + + // LLVM-LABEL: test_mm_maskz_compress_epi16 + // %[[MASK8:.+]] = bitcast i8 %{{.+}} to <8 x i1> + // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.compress.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK8]]) + // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64> + + // OGCG-LABEL: test_mm_maskz_compress_epi16 + // %[[MASK8:.+]] = bitcast i8 %{{.+}} to <8 x i1> + // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.compress.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK8]]) + // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64> + + return _mm_maskz_compress_epi16(__U, __D); +} + +__m128i test_mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D) { + // CIR-LABEL: test_mm_mask_compress_epi8 + // %[[MASK16:.+]] = cir.cast bitcast %{{.+}} : !u16i -> !cir.vector<16 x !cir.int<u, 1>> + // %[[RES:.+]]
= cir.call_llvm_intrinsic "x86.avx512.mask.compress" %{{.+}}, %{{.+}}, %[[MASK16]]: (!cir.vector<16 x !s8i>, !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int>) -> !cir.vector<16 x !s8i> + // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<16 x !s8i> -> !cir.vector<2 x !s64i> + + // LLVM-LABEL: test_mm_mask_compress_epi8 + // %[[MASK16:.+]] = bitcast i16 %{{.+}} to <16 x i1> + // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.compress.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK16]]) + // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64> + + // OGCG-LABEL: test_mm_mask_compress_epi8 + // %[[MASK16:.+]] = bitcast i16 %{{.+}} to <16 x i1> + // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.compress.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK16]]) + // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64> + + return _mm_mask_compress_epi8(__S, __U, __D); +} + +__m128i test_mm_maskz_compress_epi8(__mmask16 __U, __m128i __D) { + // CIR-LABEL: test_mm_maskz_compress_epi8 + // %[[ZERO:.+]] = cir.call @_mm_setzero_si128() : () -> !cir.vector<2 x !s64i> + // %[[CAST1:.+]] = cir.cast bitcast %[[ZERO]] : !cir.vector<2 x !s64i> -> !cir.vector<16 x !s8i> + // %[[MASK16:.+]] = cir.cast bitcast %{{.+}} : !u16i -> !cir.vector<16 x !cir.int> + // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.compress" %{{.+}}, %[[CAST1]], %[[MASK16]]: (!cir.vector<16 x !s8i>, !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int>) -> !cir.vector<16 x !s8i> + // %[[CAST2:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<16 x !s8i> -> !cir.vector<2 x !s64i> + + // LLVM-LABEL: test_mm_maskz_compress_epi8 + // store <2 x i64> zeroinitializer, ptr %{{.+}}, align 16 + // %[[CAST1:.+]] = bitcast <2 x i64> %{{.+}} to <16 x i8> + // %[[MASK16:.+]] = bitcast i16 %{{.+}} to <16 x i1> + // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.compress.v16i8(<16 x i8> %{{.+}}, <16 x i8> %[[CAST1]], <16 x i1> %[[MASK16]]) + // %[[CAST2:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64> + + // OGCG-LABEL: test_mm_maskz_compress_epi8 + // store <2 x i64> zeroinitializer, ptr %{{.+}}, align 16 + // %[[CAST1:.+]] = bitcast <2 x i64> %{{.+}} to <16 x i8> + // %[[MASK16:.+]] = bitcast i16 %{{.+}} to <16 x i1> + // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.compress.v16i8(<16 x i8> %{{.+}}, <16 x i8> %[[CAST1]], <16 x i1> %[[MASK16]]) + // %[[CAST2:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64> + + return _mm_maskz_compress_epi8(__U, __D); +} + +__m128i test_mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D) { + // CIR-LABEL: test_mm_mask_expand_epi16 + // %[[MASK16:.+]] = cir.cast bitcast %{{.+}} : !u8i -> !cir.vector<8 x !cir.int> + // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.expand" %{{.+}}, %{{.+}}, %[[MASK16]]: (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int>) -> !cir.vector<8 x !s16i> + // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<8 x !s16i> -> !cir.vector<2 x !s64i> + + // LLVM-LABEL: test_mm_mask_expand_epi16 + // %[[MASK16:.+]] = bitcast i8 %{{.+}} to <8 x i1> + // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.expand.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK16]]) + // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64> + + // OGCG-LABEL: test_mm_mask_expand_epi16 + // %[[MASK16:.+]] = bitcast i8 %{{.+}} to <8 x i1> + // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.expand.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK16]]) + // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] 
to <2 x i64> + + return _mm_mask_expand_epi16(__S, __U, __D); +} + +__m128i test_mm_maskz_expand_epi16(__mmask8 __U, __m128i __D) { + // CIR-LABEL: test_mm_maskz_expand_epi16 + // %[[MASK:.+]] = cir.cast bitcast %{{.+}} : !u8i -> !cir.vector<8 x !cir.int> + // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.expand" %{{.+}}, %{{.+}}, %[[MASK]]: (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int>) -> !cir.vector<8 x !s16i> + // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<8 x !s16i> -> !cir.vector<2 x !s64i> + + // LLVM-LABEL: test_mm_maskz_expand_epi16 + // %[[MASK:.+]] = bitcast i8 %{{.+}} to <8 x i1> + // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.expand.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK]]) + // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64> + + // OGCG-LABEL: test_mm_maskz_expand_epi16 + // %[[MASK:.+]] = bitcast i8 %{{.+}} to <8 x i1> + // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.expand.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK]]) + // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64> + + return _mm_maskz_expand_epi16(__U, __D); +} + +__m128i test_mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D) { + // CIR-LABEL: test_mm_mask_expand_epi8 + // %[[MASK:.+]] = cir.cast bitcast %{{.+}} : !u16i -> !cir.vector<16 x !cir.int> + // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.expand" %{{.+}}, %{{.+}}, %[[MASK]]: (!cir.vector<16 x !s8i>, !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int>) -> !cir.vector<16 x !s8i> + // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<16 x !s8i> -> !cir.vector<2 x !s64i> + + // LLVM-LABEL: test_mm_mask_expand_epi8 + // %[[MASK:.+]] = bitcast i16 %{{.+}} to <16 x i1> + // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.expand.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK]]) + // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64> + + // OGCG-LABEL: test_mm_mask_expand_epi8 + // %[[MASK:.+]] = bitcast i16 %{{.+}} to <16 x i1> + // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.expand.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK]]) + // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64> + + return _mm_mask_expand_epi8(__S, __U, __D); +} + +__m128i test_mm_maskz_expand_epi8(__mmask16 __U, __m128i __D) { + // CIR-LABEL: test_mm_maskz_expand_epi8 + // %[[MASK:.+]] = cir.cast bitcast %{{.+}} : !u16i -> !cir.vector<16 x !cir.int> + // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.expand" %{{.+}}, %{{.+}}, %[[MASK]]: (!cir.vector<16 x !s8i>, !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int>) -> !cir.vector<16 x !s8i> + // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<16 x !s8i> -> !cir.vector<2 x !s64i> + + // LLVM-LABEL: test_mm_maskz_expand_epi8 + // %[[MASK:.+]] = bitcast i16 %{{.+}} to <16 x i1> + // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.expand.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK]]) + // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64> + + // OGCG-LABEL: test_mm_maskz_expand_epi8 + // %[[MASK:.+]] = bitcast i16 %{{.+}} to <16 x i1> + // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.expand.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK]]) + // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64> + + return _mm_maskz_expand_epi8(__U, __D); +} diff --git a/clang/test/CIR/CodeGenBuiltins/X86/xsave-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/xsave-builtins.c new file mode 100644 index 
index 0000000000000..23d6edc6c8c6e
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/xsave-builtins.c
@@ -0,0 +1,194 @@
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +xsave -target-feature +xsaveopt -target-feature +xsavec -target-feature +xsaves -fclangir -emit-cir -o %t.cir -Wall -Werror
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +xsave -target-feature +xsaveopt -target-feature +xsavec -target-feature +xsaves -fclangir -emit-llvm -o %t.ll -Wall -Werror
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +xsave -target-feature +xsaveopt -target-feature +xsavec -target-feature +xsaves -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+
+void test_xsave(void *p, unsigned long long m) {
+  // CIR-LABEL: test_xsave
+  // CIR: [[P:%.*]] = cir.load {{.*}} : !cir.ptr<!cir.ptr<!void>>, !cir.ptr<!void>
+  // CIR: [[M:%.*]] = cir.load {{.*}} : !cir.ptr<!u64i>, !u64i
+  // CIR: [[CONST:%.*]] = cir.const #cir.int<32> : !s64i
+  // CIR: [[SHIFT:%.*]] = cir.shift(right, [[M]] : !u64i, [[CONST]] : !s64i) -> !u64i
+  // CIR: [[CAST1:%.*]] = cir.cast integral [[SHIFT]] : !u64i -> !s32i
+  // CIR: [[CAST2:%.*]] = cir.cast integral [[M]] : !u64i -> !s32i
+  // CIR: cir.call_llvm_intrinsic "x86.xsave" [[P]], [[CAST1]], [[CAST2]]
+
+  // LLVM-LABEL: test_xsave
+  // LLVM: [[LP:%.*]] = load ptr, ptr
+  // LLVM: [[LM:%.*]] = load i64, ptr
+  // LLVM: [[LSHIFT:%.*]] = lshr i64 [[LM]], 32
+  // LLVM: [[LCAST1:%.*]] = trunc i64 [[LSHIFT]] to i32
+  // LLVM: [[LCAST2:%.*]] = trunc i64 [[LM]] to i32
+  // LLVM: call void @llvm.x86.xsave(ptr [[LP]], i32 [[LCAST1]], i32 [[LCAST2]])
+
+  // OGCG-LABEL: test_xsave
+  // OGCG: [[OP:%.*]] = load ptr, ptr
+  // OGCG: [[OM:%.*]] = load i64, ptr
+  // OGCG: [[OSHIFT:%.*]] = lshr i64 [[OM]], 32
+  // OGCG: [[OCAST1:%.*]] = trunc i64 [[OSHIFT]] to i32
+  // OGCG: [[OCAST2:%.*]] = trunc i64 [[OM]] to i32
+  // OGCG: call void @llvm.x86.xsave(ptr [[OP]], i32 [[OCAST1]], i32 [[OCAST2]])
+  __builtin_ia32_xsave(p, m);
+}
+
+// The following tests use the same pattern as test_xsave (load, shift, cast, cast, intrinsic call).
+// Only the intrinsic name differs, so we just check the intrinsic call.
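+//
+// For reference, the elided body of each remaining test follows this sketch
+// (illustration only; "NAME" below is a placeholder for the intrinsic suffix,
+// e.g. xsave64 or xrstors, not a FileCheck variable used by this test):
+//   %m    = load i64, ptr ...
+//   %hi   = lshr i64 %m, 32
+//   %hi32 = trunc i64 %hi to i32
+//   %lo32 = trunc i64 %m to i32
+//   call void @llvm.x86.NAME(ptr %p, i32 %hi32, i32 %lo32)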
+ +void test_xsave64(void *p, unsigned long long m) { + // CIR-LABEL: test_xsave64 + // CIR: cir.call_llvm_intrinsic "x86.xsave64" + + // LLVM-LABEL: test_xsave64 + // LLVM: call void @llvm.x86.xsave64 + + // OGCG-LABEL: test_xsave64 + // OGCG: call void @llvm.x86.xsave64 + __builtin_ia32_xsave64(p, m); +} + +void test_xrstor(void *p, unsigned long long m) { + // CIR-LABEL: test_xrstor + // CIR: cir.call_llvm_intrinsic "x86.xrstor" + + // LLVM-LABEL: test_xrstor + // LLVM: call void @llvm.x86.xrstor + + // OGCG-LABEL: test_xrstor + // OGCG: call void @llvm.x86.xrstor + __builtin_ia32_xrstor(p, m); +} + +void test_xrstor64(void *p, unsigned long long m) { + // CIR-LABEL: test_xrstor64 + // CIR: cir.call_llvm_intrinsic "x86.xrstor64" + + // LLVM-LABEL: test_xrstor64 + // LLVM: call void @llvm.x86.xrstor64 + + // OGCG-LABEL: test_xrstor64 + // OGCG: call void @llvm.x86.xrstor64 + __builtin_ia32_xrstor64(p, m); +} + +void test_xsaveopt(void *p, unsigned long long m) { + // CIR-LABEL: test_xsaveopt + // CIR: cir.call_llvm_intrinsic "x86.xsaveopt" + + // LLVM-LABEL: test_xsaveopt + // LLVM: call void @llvm.x86.xsaveopt + + // OGCG-LABEL: test_xsaveopt + // OGCG: call void @llvm.x86.xsaveopt + __builtin_ia32_xsaveopt(p, m); +} + +void test_xsaveopt64(void *p, unsigned long long m) { + // CIR-LABEL: test_xsaveopt64 + // CIR: cir.call_llvm_intrinsic "x86.xsaveopt64" + + // LLVM-LABEL: test_xsaveopt64 + // LLVM: call void @llvm.x86.xsaveopt64 + + // OGCG-LABEL: test_xsaveopt64 + // OGCG: call void @llvm.x86.xsaveopt64 + __builtin_ia32_xsaveopt64(p, m); +} + +void test_xsavec(void *p, unsigned long long m) { + // CIR-LABEL: test_xsavec + // CIR: cir.call_llvm_intrinsic "x86.xsavec" + + // LLVM-LABEL: test_xsavec + // LLVM: call void @llvm.x86.xsavec + + // OGCG-LABEL: test_xsavec + // OGCG: call void @llvm.x86.xsavec + __builtin_ia32_xsavec(p, m); +} + +void test_xsavec64(void *p, unsigned long long m) { + // CIR-LABEL: test_xsavec64 + // CIR: cir.call_llvm_intrinsic "x86.xsavec64" + + // LLVM-LABEL: test_xsavec64 + // LLVM: call void @llvm.x86.xsavec64 + + // OGCG-LABEL: test_xsavec64 + // OGCG: call void @llvm.x86.xsavec64 + __builtin_ia32_xsavec64(p, m); +} + +void test_xsaves(void *p, unsigned long long m) { + // CIR-LABEL: test_xsaves + // CIR: cir.call_llvm_intrinsic "x86.xsaves" + + // LLVM-LABEL: test_xsaves + // LLVM: call void @llvm.x86.xsaves + + // OGCG-LABEL: test_xsaves + // OGCG: call void @llvm.x86.xsaves + __builtin_ia32_xsaves(p, m); +} + +void test_xsaves64(void *p, unsigned long long m) { + // CIR-LABEL: test_xsaves64 + // CIR: cir.call_llvm_intrinsic "x86.xsaves64" + + // LLVM-LABEL: test_xsaves64 + // LLVM: call void @llvm.x86.xsaves64 + + // OGCG-LABEL: test_xsaves64 + // OGCG: call void @llvm.x86.xsaves64 + __builtin_ia32_xsaves64(p, m); +} + +void test_xrstors(void *p, unsigned long long m) { + // CIR-LABEL: test_xrstors + // CIR: cir.call_llvm_intrinsic "x86.xrstors" + + // LLVM-LABEL: test_xrstors + // LLVM: call void @llvm.x86.xrstors + + // OGCG-LABEL: test_xrstors + // OGCG: call void @llvm.x86.xrstors + __builtin_ia32_xrstors(p, m); +} + +void test_xrstors64(void *p, unsigned long long m) { + // CIR-LABEL: test_xrstors64 + // CIR: cir.call_llvm_intrinsic "x86.xrstors64" + + // LLVM-LABEL: test_xrstors64 + // LLVM: call void @llvm.x86.xrstors64 + + // OGCG-LABEL: test_xrstors64 + // OGCG: call void @llvm.x86.xrstors64 + __builtin_ia32_xrstors64(p, m); +} + +unsigned long long test_xgetbv(unsigned int a) { + // CIR-LABEL: test_xgetbv + // CIR: cir.call_llvm_intrinsic 
"x86.xgetbv" + + // LLVM-LABEL: test_xgetbv + // LLVM: call i64 @llvm.x86.xgetbv + + // OGCG-LABEL: test_xgetbv + // OGCG: call i64 @llvm.x86.xgetbv + return __builtin_ia32_xgetbv(a); +} + +void test_xsetbv(unsigned int a, unsigned long long m) { + // CIR-LABEL: test_xsetbv + // CIR: cir.call_llvm_intrinsic "x86.xsetbv" + + // LLVM-LABEL: test_xsetbv + // LLVM: call void @llvm.x86.xsetbv + + // OGCG-LABEL: test_xsetbv + // OGCG: call void @llvm.x86.xsetbv + __builtin_ia32_xsetbv(a, m); +} + diff --git a/clang/test/CIR/IR/invalid-data-member.cir b/clang/test/CIR/IR/invalid-data-member.cir new file mode 100644 index 0000000000000..2941777404973 --- /dev/null +++ b/clang/test/CIR/IR/invalid-data-member.cir @@ -0,0 +1,27 @@ +// RUN: cir-opt %s -verify-diagnostics -split-input-file + +// ----- + +!u16i = !cir.int +!u32i = !cir.int +!struct1 = !cir.record + +// expected-error@+1 {{member type of a #cir.data_member attribute must match the attribute type}} +#invalid_member_ty = #cir.data_member<0> : !cir.data_member + +// ----- + +!u16i = !cir.int +!incomplete_struct = !cir.record + +// expected-error@+1 {{incomplete 'cir.record' cannot be used to build a non-null data member pointer}} +#incomplete_cls_member = #cir.data_member<0> : !cir.data_member + +// ----- + +!u16i = !cir.int +!u32i = !cir.int +!struct1 = !cir.record + +// expected-error@+1 {{member index of a #cir.data_member attribute is out of range}} +#invalid_member_ty = #cir.data_member<2> : !cir.data_member diff --git a/clang/test/CIR/IR/invalid-tls.cir b/clang/test/CIR/IR/invalid-tls.cir new file mode 100644 index 0000000000000..36df7fdb1e619 --- /dev/null +++ b/clang/test/CIR/IR/invalid-tls.cir @@ -0,0 +1,13 @@ +// RUN: cir-opt %s -verify-diagnostics -split-input-file + +!s32i = !cir.int + +module { + cir.global "private" external @non_tls : !s32i + cir.func @error() { + // expected-error@+1 {{access to global not marked thread local}} + %0 = cir.get_global thread_local @non_tls : !cir.ptr + cir.return + } +} + diff --git a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c index 728c9f5652dd9..1ba6e87653a74 100644 --- a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c +++ b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c @@ -176,20 +176,20 @@ __m512i test_mm512_maskz_dpbuuds_epi32(__mmask16 __U, __m512i __W, __m512i __A, /* VNNI INT16 */ __m512i test_mm512_dpwsud_epi32(__m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_dpwsud_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwsud_epi32(__A, __B, __C); } __m512i test_mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { // CHECK-LABEL: @test_mm512_mask_dpwsud_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwsud_epi32(__A, __B, __C, __D); } __m512i test_mm512_maskz_dpwsud_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_dpwsud_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 
x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwsud_epi32(__U, __A, __B, __C); @@ -197,20 +197,20 @@ __m512i test_mm512_maskz_dpwsud_epi32(__mmask16 __U, __m512i __A, __m512i __B, _ __m512i test_mm512_dpwsuds_epi32(__m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_dpwsuds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwsuds_epi32(__A, __B, __C); } __m512i test_mm512_mask_dpwsuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { // CHECK-LABEL: @test_mm512_mask_dpwsuds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwsuds_epi32(__A, __B, __C, __D); } __m512i test_mm512_maskz_dpwsuds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_dpwsuds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwsuds_epi32(__U, __A, __B, __C); @@ -218,20 +218,20 @@ __m512i test_mm512_maskz_dpwsuds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i test_mm512_dpwusd_epi32(__m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_dpwusd_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwusd_epi32(__A, __B, __C); } __m512i test_mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { // CHECK-LABEL: @test_mm512_mask_dpwusd_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwusd_epi32(__A, __B, __C, __D); } __m512i test_mm512_maskz_dpwusd_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_dpwusd_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwusd_epi32(__U, __A, __B, __C); @@ -239,20 +239,20 @@ __m512i test_mm512_maskz_dpwusd_epi32(__mmask16 __U, __m512i __A, __m512i __B, _ __m512i test_mm512_dpwusds_epi32(__m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_dpwusds_epi32( -// CHECK: call 
<16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwusds_epi32(__A, __B, __C); } __m512i test_mm512_mask_dpwusds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { // CHECK-LABEL: @test_mm512_mask_dpwusds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwusds_epi32(__A, __B, __C, __D); } __m512i test_mm512_maskz_dpwusds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_dpwusds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwusds_epi32(__U, __A, __B, __C); @@ -260,20 +260,20 @@ __m512i test_mm512_maskz_dpwusds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i test_mm512_dpwuud_epi32(__m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_dpwuud_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwuud_epi32(__A, __B, __C); } __m512i test_mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { // CHECK-LABEL: @test_mm512_mask_dpwuud_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwuud_epi32(__A, __B, __C, __D); } __m512i test_mm512_maskz_dpwuud_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_dpwuud_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwuud_epi32(__U, __A, __B, __C); @@ -281,20 +281,20 @@ __m512i test_mm512_maskz_dpwuud_epi32(__mmask16 __U, __m512i __A, __m512i __B, _ __m512i test_mm512_dpwuuds_epi32(__m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_dpwuuds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwuuds_epi32(__A, __B, __C); } __m512i test_mm512_mask_dpwuuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { // CHECK-LABEL: @test_mm512_mask_dpwuuds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x 
i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwuuds_epi32(__A, __B, __C, __D); } __m512i test_mm512_maskz_dpwuuds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_dpwuuds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwuuds_epi32(__U, __A, __B, __C); diff --git a/clang/test/CodeGen/X86/avx10_2ni-builtins.c b/clang/test/CodeGen/X86/avx10_2ni-builtins.c index a250d91ae5989..be2719c33c52f 100644 --- a/clang/test/CodeGen/X86/avx10_2ni-builtins.c +++ b/clang/test/CodeGen/X86/avx10_2ni-builtins.c @@ -259,168 +259,168 @@ __m256i test_mm256_maskz_dpbuuds_epi32(__mmask8 __U, __m256i __W, __m256i __A, _ // VNNI INT16 __m128i test_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { // CHECK-LABEL: @test_mm_mask_dpwsud_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpwsud_epi32(__A, __B, __C, __D); } __m128i test_mm_maskz_dpwsud_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: @test_mm_maskz_dpwsud_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwsud_epi32(__U, __A, __B, __C); } __m256i test_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { // CHECK-LABEL: @test_mm256_mask_dpwsud_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwsud_epi32(__A, __B, __C, __D); } __m256i test_mm256_maskz_dpwsud_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: @test_mm256_maskz_dpwsud_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpwsud_epi32(__U, __A, __B, __C); } __m128i test_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { // CHECK-LABEL: @test_mm_mask_dpwsuds_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} 
return _mm_mask_dpwsuds_epi32(__A, __B, __C, __D); } __m128i test_mm_maskz_dpwsuds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: @test_mm_maskz_dpwsuds_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwsuds_epi32(__U, __A, __B, __C); } __m256i test_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { // CHECK-LABEL: @test_mm256_mask_dpwsuds_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwsuds_epi32(__A, __B, __C, __D); } __m256i test_mm256_maskz_dpwsuds_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: @test_mm256_maskz_dpwsuds_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpwsuds_epi32(__U, __A, __B, __C); } __m128i test_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { // CHECK-LABEL: @test_mm_mask_dpwusd_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpwusd_epi32(__A, __B, __C, __D); } __m128i test_mm_maskz_dpwusd_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: @test_mm_maskz_dpwusd_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwusd_epi32(__U, __A, __B, __C); } __m256i test_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { // CHECK-LABEL: @test_mm256_mask_dpwusd_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwusd_epi32(__A, __B, __C, __D); } __m256i test_mm256_maskz_dpwusd_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: @test_mm256_maskz_dpwusd_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpwusd_epi32(__U, __A, __B, __C); } __m128i test_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, 
__m128i __D) { // CHECK-LABEL: @test_mm_mask_dpwusds_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpwusds_epi32(__A, __B, __C, __D); } __m128i test_mm_maskz_dpwusds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: @test_mm_maskz_dpwusds_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwusds_epi32(__U, __A, __B, __C); } __m256i test_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { // CHECK-LABEL: @test_mm256_mask_dpwusds_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwusds_epi32(__A, __B, __C, __D); } __m256i test_mm256_maskz_dpwusds_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: @test_mm256_maskz_dpwusds_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpwusds_epi32(__U, __A, __B, __C); } __m128i test_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { // CHECK-LABEL: @test_mm_mask_dpwuud_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpwuud_epi32(__A, __B, __C, __D); } __m128i test_mm_maskz_dpwuud_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: @test_mm_maskz_dpwuud_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwuud_epi32(__U, __A, __B, __C); } __m256i test_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { // CHECK-LABEL: @test_mm256_mask_dpwuud_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwuud_epi32(__A, __B, __C, __D); } __m256i test_mm256_maskz_dpwuud_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: @test_mm256_maskz_dpwuud_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, 
<8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpwuud_epi32(__U, __A, __B, __C); } __m128i test_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { // CHECK-LABEL: @test_mm_mask_dpwuuds_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpwuuds_epi32(__A, __B, __C, __D); } __m128i test_mm_maskz_dpwuuds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: @test_mm_maskz_dpwuuds_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwuuds_epi32(__U, __A, __B, __C); } __m256i test_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { // CHECK-LABEL: @test_mm256_mask_dpwuuds_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwuuds_epi32(__A, __B, __C, __D); } __m256i test_mm256_maskz_dpwuuds_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: @test_mm256_maskz_dpwuuds_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpwuuds_epi32(__U, __A, __B, __C); } diff --git a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c index f63b5c6e73917..11dbd717a9f77 100644 --- a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c @@ -47,41 +47,41 @@ __m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) { __m256i test_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_dpwssd_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwssd_epi32(__S, __U, __A, __B); } __m256i test_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_dpwssd_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return 
_mm256_maskz_dpwssd_epi32(__U, __S, __A, __B); } __m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpwssd_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwssd_epi32(__S, __A, __B); } __m256i test_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_dpwssds_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwssds_epi32(__S, __U, __A, __B); } __m256i test_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_dpwssds_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_dpwssds_epi32(__U, __S, __A, __B); } __m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpwssds_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwssds_epi32(__S, __A, __B); } @@ -127,41 +127,41 @@ __m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) { __m128i test_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_dpwssd_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpwssd_epi32(__S, __U, __A, __B); } __m128i test_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_dpwssd_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwssd_epi32(__U, __S, __A, __B); } __m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpwssd_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwssd_epi32(__S, __A, __B); } __m128i test_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_mask_dpwssds_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> 
@llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpwssds_epi32(__S, __U, __A, __B); } __m128i test_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_maskz_dpwssds_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwssds_epi32(__U, __S, __A, __B); } __m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpwssds_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwssds_epi32(__S, __A, __B); } diff --git a/clang/test/CodeGen/X86/avx512vnni-builtins.c b/clang/test/CodeGen/X86/avx512vnni-builtins.c index afe80458e37cc..6b8465206eedb 100644 --- a/clang/test/CodeGen/X86/avx512vnni-builtins.c +++ b/clang/test/CodeGen/X86/avx512vnni-builtins.c @@ -47,41 +47,41 @@ __m512i test_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) { __m512i test_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_dpwssd_epi32 - // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwssd_epi32(__S, __U, __A, __B); } __m512i test_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_dpwssd_epi32 - // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwssd_epi32(__U, __S, __A, __B); } __m512i test_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_dpwssd_epi32 - // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwssd_epi32(__S, __A, __B); } __m512i test_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_dpwssds_epi32 - // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwssds_epi32(__S, __U, __A, __B); } __m512i test_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_dpwssds_epi32 - // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> 
%{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwssds_epi32(__U, __S, __A, __B); } __m512i test_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_dpwssds_epi32 - // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) + // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwssds_epi32(__S, __A, __B); } diff --git a/clang/test/CodeGen/X86/avxvnni-builtins.c b/clang/test/CodeGen/X86/avxvnni-builtins.c index 7948e0d57d9bf..6557a26807eb2 100644 --- a/clang/test/CodeGen/X86/avxvnni-builtins.c +++ b/clang/test/CodeGen/X86/avxvnni-builtins.c @@ -19,13 +19,13 @@ __m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) { __m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpwssd_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwssd_epi32(__S, __A, __B); } __m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpwssds_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwssds_epi32(__S, __A, __B); } @@ -43,13 +43,13 @@ __m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) { __m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpwssd_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwssd_epi32(__S, __A, __B); } __m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpwssds_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwssds_epi32(__S, __A, __B); } @@ -67,13 +67,13 @@ __m256i test_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { __m256i test_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpwssd_avx_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwssd_avx_epi32(__S, __A, __B); } __m256i test_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_dpwssds_avx_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwssds_avx_epi32(__S, 
__A, __B); } @@ -91,12 +91,12 @@ __m128i test_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { __m128i test_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpwssd_avx_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwssd_avx_epi32(__S, __A, __B); } __m128i test_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_dpwssds_avx_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwssds_avx_epi32(__S, __A, __B); } diff --git a/clang/test/CodeGen/X86/avxvnniint16-builtins.c b/clang/test/CodeGen/X86/avxvnniint16-builtins.c index 941da9aa223b5..f28fff2c0cfec 100644 --- a/clang/test/CodeGen/X86/avxvnniint16-builtins.c +++ b/clang/test/CodeGen/X86/avxvnniint16-builtins.c @@ -11,72 +11,72 @@ __m128i test_mm_dpwsud_epi32(__m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: test_mm_dpwsud_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwsud_epi32(__A, __B, __C); } __m256i test_mm256_dpwsud_epi32(__m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: test_mm256_dpwsud_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwsud_epi32(__A, __B, __C); } __m128i test_mm_dpwsuds_epi32(__m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: test_mm_dpwsuds_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwsuds_epi32(__A, __B, __C); } __m256i test_mm256_dpwsuds_epi32(__m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: test_mm256_dpwsuds_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwsuds_epi32(__A, __B, __C); } __m128i test_mm_dpwusd_epi32(__m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: test_mm_dpwusd_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwusd_epi32(__A, __B, __C); } __m256i test_mm256_dpwusd_epi32(__m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: test_mm256_dpwusd_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwusd_epi32(__A, __B, __C); } __m128i test_mm_dpwusds_epi32(__m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: 
test_mm_dpwusds_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwusds_epi32(__A, __B, __C); } __m256i test_mm256_dpwusds_epi32(__m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: test_mm256_dpwusds_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwusds_epi32(__A, __B, __C); } __m128i test_mm_dpwuud_epi32(__m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: test_mm_dpwuud_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwuud_epi32(__A, __B, __C); } __m256i test_mm256_dpwuud_epi32(__m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: test_mm256_dpwuud_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwuud_epi32(__A, __B, __C); } __m128i test_mm_dpwuuds_epi32(__m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: test_mm_dpwuuds_epi32 - // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) + // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_dpwuuds_epi32(__A, __B, __C); } __m256i test_mm256_dpwuuds_epi32(__m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: test_mm256_dpwuuds_epi32 - // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) + // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_dpwuuds_epi32(__A, __B, __C); } diff --git a/clang/test/CodeGenCXX/dllexport.cpp b/clang/test/CodeGenCXX/dllexport.cpp index dfbb2762ac85c..2c9e7d36d2cbe 100644 --- a/clang/test/CodeGenCXX/dllexport.cpp +++ b/clang/test/CodeGenCXX/dllexport.cpp @@ -1130,5 +1130,6 @@ class __declspec(dllexport) ACE_Shared_Object { class __declspec(dllexport) ACE_Service_Object : public ACE_Shared_Object {}; // Implicit move constructor declaration. // MSVC2015-DAG: define weak_odr dso_local dllexport {{.+}}ACE_Service_Object@@Q{{.+}}@$$Q +// PS-DAG: define weak_odr dllexport void @_ZN18ACE_Service_ObjectC1EOS_ // The declarations should not be exported. 
// MSVC2013-NOT: define weak_odr dso_local dllexport {{.+}}ACE_Service_Object@@Q{{.+}}@$$Q diff --git a/clang/test/CodeGenCXX/dllimport.cpp b/clang/test/CodeGenCXX/dllimport.cpp index 363f97a8d58ee..ed1c72c5185d3 100644 --- a/clang/test/CodeGenCXX/dllimport.cpp +++ b/clang/test/CodeGenCXX/dllimport.cpp @@ -35,7 +35,7 @@ struct ExplicitSpec_NotImported {}; #define USEMEMFUNC(class, func) void (class::*UNIQ(use)())() { return &class::func; } #define USESTATICMEMFUNC(class, func) void (*UNIQ(use)())() { return &class::func; } #define USECLASS(class) void UNIQ(USE)() { class x; } -#define USECOPYASSIGN(class) class& (class::*UNIQ(use)())(class&) { return &class::operator=; } +#define USECOPYASSIGN(class) class& (class::*UNIQ(use)())(const class&) { return &class::operator=; } #define USEMOVEASSIGN(class) class& (class::*UNIQ(use)())(class&&) { return &class::operator=; } //===----------------------------------------------------------------------===// @@ -649,13 +649,15 @@ struct __declspec(dllimport) T { static int b; // MO1-DAG: @"?b@T@@2HA" = external dllimport global i32 - T& operator=(T&) = default; - // MO1-DAG: define available_externally dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4T@@QAEAAU0@AAU0@@Z" + T& operator=(const T&) = default; + // MO1-DAG: define available_externally dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4T@@QAEAAU0@ABU0@@Z" + // PS-DAG: declare dllimport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN1TaSERKS_ T& operator=(T&&) = default; - // Note: Don't mark inline move operators dllimport because current MSVC versions don't export them. + // Note: Don't mark inline move operators dllimport because MSVC versions before 2015 don't export them. 
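+  // (MSVC 2015 and later do export them: the M19-DAG line below checks the
+  // dllimported available_externally definition, while M18-DAG checks the
+  // locally emitted linkonce_odr copy.)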
 // M18-DAG: define linkonce_odr dso_local x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4T@@QAEAAU0@$$QAU0@@Z"
 // M19-DAG: define available_externally dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4T@@QAEAAU0@$$QAU0@@Z"
+ // PS-DAG: declare dllimport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN1TaSEOS_
 };

 USEMEMFUNC(T, a)
 USESTATICMEMFUNC(T, StaticMethod)
diff --git a/clang/test/CodeGenCXX/mingw-template-dllexport.cpp b/clang/test/CodeGenCXX/mingw-template-dllexport.cpp
index de112d6da53db..9f116c46853b6 100644
--- a/clang/test/CodeGenCXX/mingw-template-dllexport.cpp
+++ b/clang/test/CodeGenCXX/mingw-template-dllexport.cpp
@@ -10,11 +10,16 @@
 template <class T>
 class c {
+public:
+  c(const c &) {}
+  c(c &&) noexcept {}
   void f() {}
 };

 template class __declspec(dllexport) c<int>;
+// CHECK: define {{.*}} dllexport {{.*}} @_ZN1cIiEC1ERKS0_
+// CHECK: define {{.*}} dllexport {{.*}} @_ZN1cIiEC1EOS0_
 // CHECK: define {{.*}} dllexport {{.*}} @_ZN1cIiE1fEv

 extern template class __declspec(dllexport) c<char>;
@@ -27,6 +32,18 @@ template class __declspec(dllexport) c<double>;

 // CHECK-NOT: define {{.*}} dllexport {{.*}} @_ZN1cIdE1fEv

+extern template class __declspec(dllimport) c<short>;
+
+// CHECK: declare dllimport {{.*}} @_ZN1cIsEC1ERKS0_
+// CHECK: declare dllimport {{.*}} @_ZN1cIsEC1EOS0_
+// CHECK: declare dllimport {{.*}} @_ZN1cIsE1fEv
+
+void use_ctors(c<short> &&x) {
+  c<short> y{x};
+  c<short> z{static_cast<c<short> &&>(x)};
+  z.f();
+}
+
 template <typename T>
 struct outer {
   void f();
diff --git a/clang/test/Driver/fmatrix-memory-layout.c b/clang/test/Driver/fmatrix-memory-layout.c
new file mode 100644
index 0000000000000..f05cd8f26c004
--- /dev/null
+++ b/clang/test/Driver/fmatrix-memory-layout.c
@@ -0,0 +1,49 @@
+// RUN: %clang --target=x86_64-linux-gnu -fenable-matrix -fmatrix-memory-layout=column-major %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-COL-MAJOR
+// CHECK-COL-MAJOR: -fenable-matrix
+// CHECK-COL-MAJOR: -mllvm
+// CHECK-COL-MAJOR: -enable-matrix
+// CHECK-COL-MAJOR: -fmatrix-memory-layout=column-major
+// CHECK-COL-MAJOR: -mllvm
+// CHECK-COL-MAJOR: -matrix-default-layout=column-major
+
+// RUN: %clang --target=x86_64-linux-gnu -fenable-matrix -fmatrix-memory-layout=row-major %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ROW-MAJOR
+// CHECK-ROW-MAJOR: -fenable-matrix
+// CHECK-ROW-MAJOR: -mllvm
+// CHECK-ROW-MAJOR: -enable-matrix
+// CHECK-ROW-MAJOR: -fmatrix-memory-layout=row-major
+// CHECK-ROW-MAJOR: -mllvm
+// CHECK-ROW-MAJOR: -matrix-default-layout=row-major
+
+// RUN: not %clang --target=x86_64-linux-gnu -fenable-matrix -fmatrix-memory-layout=error-major %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR-MAJOR
+// CHECK-ERROR-MAJOR: error: invalid value 'error-major' in '-fmatrix-memory-layout=error-major'
+
+// RUN: %clang --target=x86_64-linux-gnu -fmatrix-memory-layout=column-major %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-COL-MAJOR-DISABLED
+// CHECK-COL-MAJOR-DISABLED: clang: warning: argument unused during compilation: '-fmatrix-memory-layout=column-major'
+// CHECK-COL-MAJOR-DISABLED-NOT: -fenable-matrix
+// CHECK-COL-MAJOR-DISABLED-NOT: -mllvm
+// CHECK-COL-MAJOR-DISABLED-NOT: -enable-matrix
+// CHECK-COL-MAJOR-DISABLED-NOT: -fmatrix-memory-layout=column-major
+// CHECK-COL-MAJOR-DISABLED-NOT: -mllvm
+// CHECK-COL-MAJOR-DISABLED-NOT: -matrix-default-layout=column-major
+
+// RUN: %clang --target=x86_64-linux-gnu -fmatrix-memory-layout=row-major %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ROW-MAJOR-DISABLED
+//
CHECK-ROW-MAJOR-DISABLED: clang: warning: argument unused during compilation: '-fmatrix-memory-layout=row-major' +// CHECK-ROW-MAJOR-DISABLED-NOT: -fenable-matrix +// CHECK-ROW-MAJOR-DISABLED-NOT: -mllvm +// CHECK-ROW-MAJOR-DISABLED-NOT: -enable-matrix +// CHECK-ROW-MAJOR-DISABLED-NOT: -fmatrix-memory-layout=row-major +// CHECK-ROW-MAJOR-DISABLED-NOT: -mllvm +// CHECK-ROW-MAJOR-DISABLED-NOT: -matrix-default-layout=row-major + +// RUN: %clang --target=x86_64-linux-gnu -fenable-matrix %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MATRIX-ENABLED +// CHECK-MATRIX-ENABLED: -fenable-matrix +// CHECK-MATRIX-ENABLED: -mllvm +// CHECK-MATRIX-ENABLED: -enable-matrix +// CHECK-MATRIX-ENABLED-NOT: -fmatrix-memory-layout=row-major +// CHECK-MATRIX-ENABLED-NOT: -fmatrix-memory-layout=column-major +// CHECK-MATRIX-ENABLED-NOT: -mllvm +// CHECK-MATRIX-ENABLED-NOT: -matrix-default-layout=row-major +// CHECK-MATRIX-ENABLED-NOT: -matrix-default-layout=column-major + +// RUN: not %clang --target=x86_64-linux-gnu -fenable-matrix -fmatrix-memory-layout=column-major -mllvm -matrix-default-layout=row-major %s -fsyntax-only 2>&1 | FileCheck %s --check-prefix=CHECK-MISMATCH-MAJOR +// CHECK-MISMATCH-MAJOR: error: -fmatrix-memory-layout=column-major conflicts with -mllvm -matrix-default-layout=row-major diff --git a/clang/test/ExtractAPI/typedef.c b/clang/test/ExtractAPI/typedef.c index a4c3619bfd210..6099cd62f8776 100644 --- a/clang/test/ExtractAPI/typedef.c +++ b/clang/test/ExtractAPI/typedef.c @@ -1,5 +1,5 @@ // RUN: rm -rf %t -// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \ +// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing -fblocks \ // RUN: -triple arm64-apple-macosx -x objective-c-header %s -o %t/output.symbols.json -verify // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix MYINT @@ -90,4 +90,83 @@ void foo(BarPtr value); void baz(BarPtr *value); // CHECK-NOT: struct Bar * +// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix BLOCKPTR +typedef int (^CustomType)(const unsigned int *, unsigned long); +void bar(CustomType block); + +// BLOCKPTR-LABEL: "!testLabel": "c:@F@bar", +// BLOCKPTR: "declarationFragments": [ +// BLOCKPTR-NEXT: { +// BLOCKPTR-NEXT: "kind": "typeIdentifier", +// BLOCKPTR-NEXT: "preciseIdentifier": "c:v", +// BLOCKPTR-NEXT: "spelling": "void" +// BLOCKPTR-NEXT: }, +// BLOCKPTR-NEXT: { +// BLOCKPTR-NEXT: "kind": "text", +// BLOCKPTR-NEXT: "spelling": " " +// BLOCKPTR-NEXT: }, +// BLOCKPTR-NEXT: { +// BLOCKPTR-NEXT: "kind": "identifier", +// BLOCKPTR-NEXT: "spelling": "bar" +// BLOCKPTR-NEXT: }, +// BLOCKPTR-NEXT: { +// BLOCKPTR-NEXT: "kind": "text", +// BLOCKPTR-NEXT: "spelling": "(" +// BLOCKPTR-NEXT: }, +// BLOCKPTR-NEXT: { +// BLOCKPTR-NEXT: "kind": "typeIdentifier", +// BLOCKPTR-NEXT: "preciseIdentifier": "c:typedef.c@T@CustomType", +// BLOCKPTR-NEXT: "spelling": "CustomType" +// BLOCKPTR-NEXT: }, +// BLOCKPTR-NEXT: { +// BLOCKPTR-NEXT: "kind": "text", +// BLOCKPTR-NEXT: "spelling": " " +// BLOCKPTR-NEXT: }, +// BLOCKPTR-NEXT: { +// BLOCKPTR-NEXT: "kind": "internalParam", +// BLOCKPTR-NEXT: "spelling": "block" +// BLOCKPTR-NEXT: }, +// BLOCKPTR-NEXT: { +// BLOCKPTR-NEXT: "kind": "text", +// BLOCKPTR-NEXT: "spelling": ");" +// BLOCKPTR-NEXT: } +// BLOCKPTR-NEXT: ], +// BLOCKPTR-NEXT: "functionSignature": { +// BLOCKPTR-NEXT: "parameters": [ +// BLOCKPTR-NEXT: { +// BLOCKPTR-NEXT: "declarationFragments": [ +// BLOCKPTR-NEXT: { +// BLOCKPTR-NEXT: "kind": "typeIdentifier", +// 
BLOCKPTR-NEXT: "preciseIdentifier": "c:typedef.c@T@CustomType", +// BLOCKPTR-NEXT: "spelling": "CustomType" +// BLOCKPTR-NEXT: }, +// BLOCKPTR-NEXT: { +// BLOCKPTR-NEXT: "kind": "text", +// BLOCKPTR-NEXT: "spelling": " " +// BLOCKPTR-NEXT: }, +// BLOCKPTR-NEXT: { +// BLOCKPTR-NEXT: "kind": "internalParam", +// BLOCKPTR-NEXT: "spelling": "block" +// BLOCKPTR-NEXT: } +// BLOCKPTR-NEXT: ], +// BLOCKPTR-NEXT: "name": "block" +// BLOCKPTR-NEXT: } +// BLOCKPTR-NEXT: ], +// BLOCKPTR-NEXT: "returns": [ +// BLOCKPTR-NEXT: { +// BLOCKPTR-NEXT: "kind": "typeIdentifier", +// BLOCKPTR-NEXT: "preciseIdentifier": "c:v", +// BLOCKPTR-NEXT: "spelling": "void" +// BLOCKPTR-NEXT: } +// BLOCKPTR-NEXT: ] +// BLOCKPTR-NEXT: }, +// BLOCKPTR: "identifier": { +// BLOCKPTR-NEXT: "interfaceLanguage": "objective-c", +// BLOCKPTR-NEXT: "precise": "c:@F@bar" +// BLOCKPTR-NEXT: }, +// BLOCKPTR: "kind": { +// BLOCKPTR-NEXT: "displayName": "Function", +// BLOCKPTR-NEXT: "identifier": "objective-c.func" +// BLOCKPTR-NEXT: }, + // expected-no-diagnostics diff --git a/clang/test/Sema/matrix-col-major-builtin-disable.c b/clang/test/Sema/matrix-col-major-builtin-disable.c new file mode 100644 index 0000000000000..b86d4ebe54f93 --- /dev/null +++ b/clang/test/Sema/matrix-col-major-builtin-disable.c @@ -0,0 +1,13 @@ +// RUN: %clang_cc1 %s -fenable-matrix -fmatrix-memory-layout=row-major -pedantic -verify -triple=x86_64-apple-darwin9 + +typedef float sx5x10_t __attribute__((matrix_type(5, 10))); + +void column_major_load(float *p1) { + sx5x10_t a1 = __builtin_matrix_column_major_load(p1, 5, 11, 5); + // expected-error@-1 {{matrix column major load is not supported with -fmatrix-memory-layout=row-major. Pass -fmatrix-memory-layout=column-major to enable it}} +} + +void column_major_store(sx5x10_t *m1, float *p1) { + __builtin_matrix_column_major_store(*m1, p1, 1); + // expected-error@-1 {{matrix column major store is not supported with -fmatrix-memory-layout=row-major. 
Pass -fmatrix-memory-layout=column-major to enable it}} +} diff --git a/clang/unittests/Frontend/CompilerInvocationTest.cpp b/clang/unittests/Frontend/CompilerInvocationTest.cpp index 1332422688fe6..64d0f4c38f11c 100644 --- a/clang/unittests/Frontend/CompilerInvocationTest.cpp +++ b/clang/unittests/Frontend/CompilerInvocationTest.cpp @@ -738,6 +738,18 @@ TEST_F(CommandLineTest, ConditionalParsingIfHLSLFlagPresent) { CompilerInvocation::CreateFromArgs(Invocation, Args, *Diags); ASSERT_EQ(Invocation.getLangOpts().MaxMatrixDimension, 4u); + ASSERT_EQ(Invocation.getLangOpts().getDefaultMatrixMemoryLayout(), + LangOptions::MatrixMemoryLayout::MatrixColMajor); + + Invocation.generateCC1CommandLine(GeneratedArgs, *this); +} + +TEST_F(CommandLineTest, ConditionalParsingHLSLRowMajor) { + const char *Args[] = {"-xhlsl", "-fmatrix-memory-layout=row-major"}; + + CompilerInvocation::CreateFromArgs(Invocation, Args, *Diags); + ASSERT_EQ(Invocation.getLangOpts().getDefaultMatrixMemoryLayout(), + LangOptions::MatrixMemoryLayout::MatrixRowMajor); Invocation.generateCC1CommandLine(GeneratedArgs, *this); } @@ -748,6 +760,30 @@ TEST_F(CommandLineTest, ConditionalParsingIfHLSLFlagNotPresent) { CompilerInvocation::CreateFromArgs(Invocation, Args, *Diags); ASSERT_EQ(Invocation.getLangOpts().MaxMatrixDimension, 1048575u); + ASSERT_EQ(Invocation.getLangOpts().getDefaultMatrixMemoryLayout(), + LangOptions::MatrixMemoryLayout::MatrixColMajor); + + Invocation.generateCC1CommandLine(GeneratedArgs, *this); +} + +TEST_F(CommandLineTest, ConditionalParsingClangRowMajor) { + const char *Args[] = {"-fenable-matrix", "-fmatrix-memory-layout=row-major"}; + + CompilerInvocation::CreateFromArgs(Invocation, Args, *Diags); + + ASSERT_EQ(Invocation.getLangOpts().getDefaultMatrixMemoryLayout(), + LangOptions::MatrixMemoryLayout::MatrixRowMajor); + + Invocation.generateCC1CommandLine(GeneratedArgs, *this); +} + +TEST_F(CommandLineTest, ConditionalParsingIgnoreRowMajorIfMatrixNotEnabled) { + const char *Args[] = {"-fmatrix-memory-layout=row-major"}; + + CompilerInvocation::CreateFromArgs(Invocation, Args, *Diags); + + ASSERT_EQ(Invocation.getLangOpts().getDefaultMatrixMemoryLayout(), + LangOptions::MatrixMemoryLayout::MatrixColMajor); Invocation.generateCC1CommandLine(GeneratedArgs, *this); } diff --git a/clang/utils/ClangVisualizers/clang.natvis b/clang/utils/ClangVisualizers/clang.natvis index 3ecd93902d1bb..0755f0ffcbf56 100644 --- a/clang/utils/ClangVisualizers/clang.natvis +++ b/clang/utils/ClangVisualizers/clang.natvis @@ -44,9 +44,6 @@ For later versions of Visual Studio, no setup is required--> {(clang::DecayedType *)this,na} {(clang::DecayedType *)this,view(left)na} {(clang::DecayedType *)this,view(right)na} - {(clang::ElaboratedType *)this,na} - {(clang::ElaboratedType *)this,view(left)na} - {(clang::ElaboratedType *)this,view(right)na} {*(clang::TemplateTypeParmType *)this} {*(clang::TemplateTypeParmType *)this,view(cpp)} {*(clang::SubstTemplateTypeParmType *)this} @@ -94,7 +91,6 @@ For later versions of Visual Studio, no setup is required--> (clang::IncompleteArrayType *)this *(clang::AttributedType *)this (clang::DecayedType *)this - (clang::ElaboratedType *)this (clang::TemplateTypeParmType *)this (clang::SubstTemplateTypeParmType *)this (clang::RecordType *)this @@ -428,16 +424,6 @@ For later versions of Visual Studio, no setup is required--> (clang::AdjustedType *)this - - {NamedType,view(left)} - {NamedType,view(right)} - {NamedType} - - (clang::ElaboratedTypeKeyword)TypeWithKeywordBits.Keyword - NNS - 
NamedType,view(cmn) - - {TTPDecl->Name,view(cpp)} Non-canonical: {*TTPDecl} diff --git a/clang/utils/TableGen/CIRLoweringEmitter.cpp b/clang/utils/TableGen/CIRLoweringEmitter.cpp index 80dc209c69a7b..c81b8941f9a39 100644 --- a/clang/utils/TableGen/CIRLoweringEmitter.cpp +++ b/clang/utils/TableGen/CIRLoweringEmitter.cpp @@ -60,6 +60,7 @@ void GenerateLLVMLoweringPattern(llvm::StringRef OpName, Code << "class " << PatternName << " : public mlir::OpConversionPattern<cir::" << OpName << "> {\n"; + Code << " [[maybe_unused]] cir::LowerModule *lowerMod;\n"; Code << " [[maybe_unused]] mlir::DataLayout const &dataLayout;\n"; Code << "\n"; @@ -69,10 +70,12 @@ void GenerateLLVMLoweringPattern(llvm::StringRef OpName, Code << " " << PatternName << "(mlir::TypeConverter const " - "&typeConverter, mlir::MLIRContext *context, mlir::DataLayout const " + "&typeConverter, mlir::MLIRContext *context, " + "cir::LowerModule *lowerMod, mlir::DataLayout const " "&dataLayout)\n"; Code << " : OpConversionPattern<cir::" << OpName - << ">(typeConverter, context), dataLayout(dataLayout)"; + << ">(typeConverter, context), lowerMod(lowerMod), " "dataLayout(dataLayout)"; if (IsRecursive) { Code << " {\n"; Code << " setHasBoundedRewriteRecursion();\n"; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp index a6f757173728b..3f8de8dd064a5 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp @@ -281,53 +281,43 @@ int internal_sysctlbyname(const char *sname, void *oldp, uptr *oldlenp, (size_t)newlen); } -static fd_t internal_spawn_impl(const char *argv[], const char *envp[], - pid_t *pid) { - fd_t primary_fd = kInvalidFd; - fd_t secondary_fd = kInvalidFd; - +bool internal_spawn(const char* argv[], const char* envp[], pid_t* pid, + fd_t fd_stdin, fd_t fd_stdout) { + // NOTE: Caller ensures that fd_stdin and fd_stdout are not 0, 1, or 2, since + // this can break communication. + // + // NOTE: Caller is responsible for closing fd_stdin after the process has + // died. + + int res; auto fd_closer = at_scope_exit([&] { - internal_close(primary_fd); - internal_close(secondary_fd); + // NOTE: We intentionally do not close fd_stdin since this can + // cause us to receive a fatal SIGPIPE if the process dies. + internal_close(fd_stdout); }); - // We need a new pseudoterminal to avoid buffering problems. The 'atos' tool - // in particular detects when it's talking to a pipe and forgets to flush the - // output stream after sending a response. - primary_fd = posix_openpt(O_RDWR); - if (primary_fd == kInvalidFd) - return kInvalidFd; - - int res = grantpt(primary_fd) || unlockpt(primary_fd); - if (res != 0) return kInvalidFd; - - // Use TIOCPTYGNAME instead of ptsname() to avoid threading problems.
- char secondary_pty_name[128]; - res = ioctl(primary_fd, TIOCPTYGNAME, secondary_pty_name); - if (res == -1) return kInvalidFd; - - secondary_fd = internal_open(secondary_pty_name, O_RDWR); - if (secondary_fd == kInvalidFd) - return kInvalidFd; - // File descriptor actions posix_spawn_file_actions_t acts; res = posix_spawn_file_actions_init(&acts); - if (res != 0) return kInvalidFd; + if (res != 0) + return false; auto acts_cleanup = at_scope_exit([&] { posix_spawn_file_actions_destroy(&acts); }); - res = posix_spawn_file_actions_adddup2(&acts, secondary_fd, STDIN_FILENO) || - posix_spawn_file_actions_adddup2(&acts, secondary_fd, STDOUT_FILENO) || - posix_spawn_file_actions_addclose(&acts, secondary_fd); - if (res != 0) return kInvalidFd; + res = posix_spawn_file_actions_adddup2(&acts, fd_stdin, STDIN_FILENO) || + posix_spawn_file_actions_adddup2(&acts, fd_stdout, STDOUT_FILENO) || + posix_spawn_file_actions_addclose(&acts, fd_stdin) || + posix_spawn_file_actions_addclose(&acts, fd_stdout); + if (res != 0) + return false; // Spawn attributes posix_spawnattr_t attrs; res = posix_spawnattr_init(&attrs); - if (res != 0) return kInvalidFd; + if (res != 0) + return false; auto attrs_cleanup = at_scope_exit([&] { posix_spawnattr_destroy(&attrs); @@ -336,50 +326,17 @@ static fd_t internal_spawn_impl(const char *argv[], const char *envp[], // In the spawned process, close all file descriptors that are not explicitly // described by the file actions object. This is a Darwin-specific extension. res = posix_spawnattr_setflags(&attrs, POSIX_SPAWN_CLOEXEC_DEFAULT); - if (res != 0) return kInvalidFd; + if (res != 0) + return false; // posix_spawn char **argv_casted = const_cast<char **>(argv); char **envp_casted = const_cast<char **>(envp); res = posix_spawn(pid, argv[0], &acts, &attrs, argv_casted, envp_casted); - if (res != 0) return kInvalidFd; - - // Disable echo in the new terminal, disable CR. - struct termios termflags; - tcgetattr(primary_fd, &termflags); - termflags.c_oflag &= ~ONLCR; - termflags.c_lflag &= ~ECHO; - tcsetattr(primary_fd, TCSANOW, &termflags); - - // On success, do not close primary_fd on scope exit. - fd_t fd = primary_fd; - primary_fd = kInvalidFd; - - return fd; -} - -fd_t internal_spawn(const char *argv[], const char *envp[], pid_t *pid) { - // The client program may close its stdin and/or stdout and/or stderr thus - // allowing open/posix_openpt to reuse file descriptors 0, 1 or 2. In this - // case the communication is broken if either the parent or the child tries to - // close or duplicate these descriptors. We temporarily reserve these - // descriptors here to prevent this.
- fd_t low_fds[3]; - size_t count = 0; - - for (; count < 3; count++) { - low_fds[count] = posix_openpt(O_RDWR); - if (low_fds[count] >= STDERR_FILENO) - break; - } - - fd_t fd = internal_spawn_impl(argv, envp, pid); - - for (; count > 0; count--) { - internal_close(low_fds[count]); - } + if (res != 0) + return false; - return fd; + return true; } uptr internal_rename(const char *oldpath, const char *newpath) { diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_posix.h index b5491c540dc08..063408b8360c1 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_posix.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix.h @@ -67,7 +67,8 @@ uptr internal_ptrace(int request, int pid, void *addr, void *data); uptr internal_waitpid(int pid, int *status, int options); int internal_fork(); -fd_t internal_spawn(const char *argv[], const char *envp[], pid_t *pid); +bool internal_spawn(const char* argv[], const char* envp[], pid_t* pid, + fd_t stdin, fd_t stdout); int internal_sysctl(const int *name, unsigned int namelen, void *oldp, uptr *oldlenp, const void *newp, uptr newlen); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h index 2345aee985541..6442a2980bf2f 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h @@ -83,7 +83,7 @@ class SymbolizerProcess { const char *SendCommand(const char *command); protected: - ~SymbolizerProcess() {} + ~SymbolizerProcess(); /// The maximum number of arguments required to invoke a tool process. static const unsigned kArgVMax = 16; @@ -114,6 +114,10 @@ class SymbolizerProcess { fd_t input_fd_; fd_t output_fd_; + // We hold on to the child's stdin fd (the read end of the pipe) + // so that when we write to it, we don't get a SIGPIPE + fd_t child_stdin_fd_; + InternalMmapVector<char> buffer_; static const uptr kMaxTimesRestarted = 5; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp index 565701c85d978..cc31d3d8056f9 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp @@ -476,10 +476,11 @@ const char *LLVMSymbolizer::FormatAndSendCommand(const char *command_prefix, return symbolizer_process_->SendCommand(buffer_); } -SymbolizerProcess::SymbolizerProcess(const char *path, bool use_posix_spawn) +SymbolizerProcess::SymbolizerProcess(const char* path, bool use_posix_spawn) : path_(path), input_fd_(kInvalidFd), output_fd_(kInvalidFd), + child_stdin_fd_(kInvalidFd), times_restarted_(0), failed_to_start_(false), reported_invalid_path_(false), @@ -488,6 +489,11 @@ SymbolizerProcess::SymbolizerProcess(const char* path, bool use_posix_spawn) CHECK_NE(path_[0], '\0'); } +SymbolizerProcess::~SymbolizerProcess() { + if (child_stdin_fd_ != kInvalidFd) + CloseFile(child_stdin_fd_); +} + static bool IsSameModule(const char *path) { if (const char *ProcessName = GetProcessName()) { if (const char *SymbolizerName = StripModuleName(path)) { @@ -533,6 +539,10 @@ bool SymbolizerProcess::Restart() { CloseFile(input_fd_); if (output_fd_ != kInvalidFd) CloseFile(output_fd_); + if (child_stdin_fd_ != kInvalidFd) { + CloseFile(child_stdin_fd_); + child_stdin_fd_ = kInvalidFd; // Don't free in destructor + } return StartSymbolizerSubprocess(); } diff --git
a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp index 7eb0c9756d64a..ab6aee7c9fba7 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp @@ -156,30 +156,34 @@ bool SymbolizerProcess::StartSymbolizerSubprocess() { Printf("\n"); } + fd_t infd[2] = {}, outfd[2] = {}; + if (!CreateTwoHighNumberedPipes(infd, outfd)) { + Report( + "WARNING: Can't create a socket pair to start " + "external symbolizer (errno: %d)\n", + errno); + return false; + } + if (use_posix_spawn_) { # if SANITIZER_APPLE - fd_t fd = internal_spawn(argv, const_cast<const char **>(GetEnvP()), &pid); - if (fd == kInvalidFd) { + bool success = internal_spawn(argv, const_cast<const char **>(GetEnvP()), + &pid, outfd[0], infd[1]); + if (!success) { Report("WARNING: failed to spawn external symbolizer (errno: %d)\n", errno); + internal_close(infd[0]); + internal_close(outfd[1]); return false; } - input_fd_ = fd; - output_fd_ = fd; + // We intentionally hold on to the read-end so that we don't get a SIGPIPE + child_stdin_fd_ = outfd[0]; + # else // SANITIZER_APPLE UNIMPLEMENTED(); # endif // SANITIZER_APPLE } else { - fd_t infd[2] = {}, outfd[2] = {}; - if (!CreateTwoHighNumberedPipes(infd, outfd)) { - Report( - "WARNING: Can't create a socket pair to start " - "external symbolizer (errno: %d)\n", - errno); - return false; - } - pid = StartSubprocess(path_, argv, GetEnvP(), /* stdin */ outfd[0], /* stdout */ infd[1]); if (pid < 0) { @@ -187,11 +191,11 @@ bool SymbolizerProcess::StartSymbolizerSubprocess() { internal_close(infd[0]); internal_close(outfd[1]); return false; } - - input_fd_ = infd[0]; - output_fd_ = outfd[1]; } + input_fd_ = infd[0]; + output_fd_ = outfd[1]; + CHECK_GT(pid, 0); // Check that symbolizer subprocess started successfully. diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp index 1d4208b6a2aa0..b70b9c9269fed 100644 --- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp @@ -18,6 +18,7 @@ #include "size_class_map.h" #include +#include #include #include #include @@ -1396,6 +1397,7 @@ TEST(ScudoCombinedTest, FullUsableSizeMTE) { VerifyExactUsableSize(*Allocator); VerifyIterateOverUsableSize(*Allocator); } + // Verify that no special quarantine blocks appear in iterateOverChunks.
TEST(ScudoCombinedTest, QuarantineIterateOverChunks) { using AllocatorT = TestAllocator; @@ -1426,3 +1428,89 @@ << std::hex << Base << " Size " << std::dec << Size; } } + +struct InitSizeClassConfig { + static const scudo::uptr NumBits = 1; + static const scudo::uptr MinSizeLog = 10; + static const scudo::uptr MidSizeLog = 10; + static const scudo::uptr MaxSizeLog = 13; + static const scudo::u16 MaxNumCachedHint = 8; + static const scudo::uptr MaxBytesCachedLog = 12; + static const scudo::uptr SizeDelta = 0; +}; + +struct TestInitSizeConfig { + static const bool MaySupportMemoryTagging = false; + static const bool QuarantineDisabled = true; + + struct Primary { + using SizeClassMap = scudo::FixedSizeClassMap<InitSizeClassConfig>; + static const scudo::uptr RegionSizeLog = 21U; + static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX; + typedef scudo::uptr CompactPtrT; + static const scudo::uptr CompactPtrScale = 0; + static const bool EnableRandomOffset = true; + static const scudo::uptr MapSizeIncrement = 1UL << 18; + static const scudo::uptr GroupSizeLog = 18; + }; + template <typename Config> + using PrimaryT = scudo::SizeClassAllocator64<Config>; + + struct Secondary { + template <typename Config> + using CacheT = scudo::MapAllocatorNoCache<Config>; + }; + + template <typename Config> using SecondaryT = scudo::MapAllocator<Config>; +}; + +struct TestInitSizeTSDSharedConfig : public TestInitSizeConfig { + template <typename Allocator> using TSDRegistryT = scudo::TSDRegistrySharedT<Allocator, 8U, 4U>; +}; + +struct TestInitSizeTSDExclusiveConfig : public TestInitSizeConfig { + template <typename Allocator> using TSDRegistryT = scudo::TSDRegistryExT<Allocator>; +}; + +template <typename AllocatorT> void RunStress() { + auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + + // This test is designed to try and have many threads trying to initialize + // the TSD at the same time. Make sure this doesn't crash. + std::atomic_bool StartRunning = false; + std::vector<std::thread *> threads; + for (size_t I = 0; I < 16; I++) { + threads.emplace_back(new std::thread([&Allocator, &StartRunning]() { + while (!StartRunning.load()) + ; + + void *Ptr = Allocator->allocate(10, Origin); + EXPECT_TRUE(Ptr != nullptr); + // Make sure this value is not optimized away. + asm volatile("" : : "r,m"(Ptr) : "memory"); + Allocator->deallocate(Ptr, Origin); + })); + } + + StartRunning = true; + + for (auto *thread : threads) { + thread->join(); + delete thread; + } +} + +TEST(ScudoCombinedTest, StressThreadInitTSDShared) { + using AllocatorT = scudo::Allocator<TestInitSizeTSDSharedConfig>; + // Run the stress test a few times. + for (size_t I = 0; I < 10; I++) + RunStress<AllocatorT>(); +} + +TEST(ScudoCombinedTest, StressThreadInitTSDExclusive) { + using AllocatorT = scudo::Allocator<TestInitSizeTSDExclusiveConfig>; + // Run the stress test a few times. + for (size_t I = 0; I < 10; I++) + RunStress<AllocatorT>(); +} diff --git a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h index a58ba6505089f..75921f2be3ffe 100644 --- a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h +++ b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h @@ -52,17 +52,20 @@ template <class Allocator> struct TSDRegistryExT { bool UnlockRequired; }; - void init(Allocator *Instance) REQUIRES(Mutex) { - DCHECK(!Initialized); + void init(Allocator *Instance) EXCLUDES(Mutex) { + ScopedLock L(Mutex); + // If more than one thread is initializing at the exact same moment, the + // threads that lose don't need to do anything.
+ if (UNLIKELY(atomic_load_relaxed(&Initialized) != 0)) + return; Instance->init(); CHECK_EQ(pthread_key_create(&PThreadKey, teardownThread), 0); FallbackTSD.init(Instance); - Initialized = true; + atomic_store_relaxed(&Initialized, 1); } - void initOnceMaybe(Allocator *Instance) EXCLUDES(Mutex) { - ScopedLock L(Mutex); - if (LIKELY(Initialized)) + void initOnceMaybe(Allocator *Instance) { + if (LIKELY(atomic_load_relaxed(&Initialized) != 0)) return; init(Instance); // Sets Initialized. } @@ -81,7 +84,7 @@ template <class Allocator> struct TSDRegistryExT { FallbackTSD = {}; State = {}; ScopedLock L(Mutex); - Initialized = false; + atomic_store_relaxed(&Initialized, 0); } void drainCaches(Allocator *Instance) { @@ -158,7 +161,7 @@ template <class Allocator> struct TSDRegistryExT { } pthread_key_t PThreadKey = {}; - bool Initialized GUARDED_BY(Mutex) = false; + atomic_u8 Initialized = {}; atomic_u8 Disabled = {}; TSD<Allocator> FallbackTSD; HybridMutex Mutex; diff --git a/compiler-rt/lib/scudo/standalone/tsd_shared.h b/compiler-rt/lib/scudo/standalone/tsd_shared.h index 404e984e1f5e9..425a028c955aa 100644 --- a/compiler-rt/lib/scudo/standalone/tsd_shared.h +++ b/compiler-rt/lib/scudo/standalone/tsd_shared.h @@ -47,20 +47,24 @@ struct TSDRegistrySharedT { TSD<Allocator> *CurrentTSD; }; - void init(Allocator *Instance) REQUIRES(Mutex) { - DCHECK(!Initialized); + void init(Allocator *Instance) EXCLUDES(Mutex) { + ScopedLock L(Mutex); + // If more than one thread is initializing at the exact same moment, the + // threads that lose don't need to do anything. + if (UNLIKELY(atomic_load_relaxed(&Initialized) != 0)) + return; + Instance->init(); for (u32 I = 0; I < TSDsArraySize; I++) TSDs[I].init(Instance); const u32 NumberOfCPUs = getNumberOfCPUs(); setNumberOfTSDs((NumberOfCPUs == 0) ? DefaultTSDCount : Min(NumberOfCPUs, DefaultTSDCount)); - Initialized = true; + atomic_store_relaxed(&Initialized, 1); } - void initOnceMaybe(Allocator *Instance) EXCLUDES(Mutex) { - ScopedLock L(Mutex); - if (LIKELY(Initialized)) + void initOnceMaybe(Allocator *Instance) { + if (LIKELY(atomic_load_relaxed(&Initialized) != 0)) return; init(Instance); // Sets Initialized. } @@ -72,11 +76,11 @@ struct TSDRegistrySharedT { } setCurrentTSD(nullptr); ScopedLock L(Mutex); - Initialized = false; + atomic_store_relaxed(&Initialized, 0); } void drainCaches(Allocator *Instance) { - ScopedLock L(MutexTSDs); + ScopedLock L(Mutex); for (uptr I = 0; I < NumberOfTSDs; ++I) { TSDs[I].lock(); Instance->drainCache(&TSDs[I]); @@ -93,7 +97,6 @@ struct TSDRegistrySharedT { void disable() NO_THREAD_SAFETY_ANALYSIS { Mutex.lock(); - MutexTSDs.lock(); for (u32 I = 0; I < TSDsArraySize; I++) TSDs[I].lock(); } @@ -101,13 +104,14 @@ struct TSDRegistrySharedT { void enable() NO_THREAD_SAFETY_ANALYSIS { for (s32 I = static_cast<s32>(TSDsArraySize - 1); I >= 0; I--) TSDs[I].unlock(); - MutexTSDs.unlock(); Mutex.unlock(); } bool setOption(Option O, sptr Value) { - if (O == Option::MaxTSDsCount) + if (O == Option::MaxTSDsCount) { + ScopedLock L(Mutex); return setNumberOfTSDs(static_cast<u32>(Value)); + } if (O == Option::ThreadDisableMemInit) setDisableMemInit(Value); // Not supported by the TSD Registry, but not an error either.
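For reference, the initialization scheme both TSD registries converge on above is classic double-checked locking: a relaxed atomic flag serves as the lock-free fast path, and the mutex arbitrates the one-time slow path. A minimal standalone sketch of the pattern (class and member names here are illustrative, not Scudo's):

```cpp
#include <atomic>
#include <cstdint>
#include <mutex>

class Registry {
public:
  void initOnceMaybe() {
    // Fast path: once the flag is visible, no lock is taken.
    if (Initialized.load(std::memory_order_relaxed))
      return;
    init();
  }

private:
  void init() {
    std::lock_guard<std::mutex> L(Mutex);
    // Threads that lose the race observe the flag under the lock and back off.
    if (Initialized.load(std::memory_order_relaxed))
      return;
    doOneTimeSetup(); // stand-in for Instance->init(), TSD setup, etc.
    Initialized.store(1, std::memory_order_relaxed);
  }

  void doOneTimeSetup() {}

  std::atomic<uint8_t> Initialized{0};
  std::mutex Mutex;
};
```

Relaxed ordering is only sound when, as in the patch above, the state published by the slow path is subsequently accessed under the same mutex or via other synchronization; a general-purpose variant would use release/acquire ordering or std::call_once.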
@@ -116,8 +120,8 @@ struct TSDRegistrySharedT { bool getDisableMemInit() const { return *getTlsPtr() & 1; } - void getStats(ScopedString *Str) EXCLUDES(MutexTSDs) { - ScopedLock L(MutexTSDs); + void getStats(ScopedString *Str) EXCLUDES(Mutex) { + ScopedLock L(Mutex); Str->append("Stats: SharedTSDs: %u available; total %u\n", NumberOfTSDs, TSDsArraySize); @@ -171,8 +175,7 @@ struct TSDRegistrySharedT { return reinterpret_cast<TSD<Allocator> *>(*getTlsPtr() & ~1ULL); } - bool setNumberOfTSDs(u32 N) EXCLUDES(MutexTSDs) { - ScopedLock L(MutexTSDs); + bool setNumberOfTSDs(u32 N) REQUIRES(Mutex) { if (N < NumberOfTSDs) return false; if (N > TSDsArraySize) @@ -213,14 +216,14 @@ struct TSDRegistrySharedT { // TSDs is an array of locks which is not supported for marking thread-safety // capability. NOINLINE TSD<Allocator> *getTSDAndLockSlow(TSD<Allocator> *CurrentTSD) - EXCLUDES(MutexTSDs) { + EXCLUDES(Mutex) { // Use the Precedence of the current TSD as our random seed. Since we are // in the slow path, it means that tryLock failed, and as a result it's // very likely that said Precedence is non-zero. const u32 R = static_cast<u32>(CurrentTSD->getPrecedence()); u32 N, Inc; { - ScopedLock L(MutexTSDs); + ScopedLock L(Mutex); N = NumberOfTSDs; DCHECK_NE(NumberOfCoPrimes, 0U); Inc = CoPrimes[R % NumberOfCoPrimes]; @@ -257,12 +260,15 @@ struct TSDRegistrySharedT { } atomic_u32 CurrentIndex = {}; - u32 NumberOfTSDs GUARDED_BY(MutexTSDs) = 0; - u32 NumberOfCoPrimes GUARDED_BY(MutexTSDs) = 0; - u32 CoPrimes[TSDsArraySize] GUARDED_BY(MutexTSDs) = {}; - bool Initialized GUARDED_BY(Mutex) = false; + u32 NumberOfTSDs GUARDED_BY(Mutex) = 0; + u32 NumberOfCoPrimes GUARDED_BY(Mutex) = 0; + u32 CoPrimes[TSDsArraySize] GUARDED_BY(Mutex) = {}; + atomic_u8 Initialized = {}; + // Used for global initialization and TSDs access. + // Acquiring the global initialization should only lock once in normal + // operation, which is why using it for TSDs access should not cause + // any interference. HybridMutex Mutex; - HybridMutex MutexTSDs; TSD<Allocator> TSDs[TSDsArraySize]; }; diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp index 958fe40d5db64..0c7257849dc31 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp @@ -2,12 +2,12 @@ // RUN: %clangxx -O2 %s -o %t // // Run with limit should fail: -// RUN: %env_tool_opts=soft_rss_limit_mb=250:quarantine_size=1:allocator_may_return_null=1 %run %t 2>&1 | FileCheck %s -check-prefix=CHECK_MAY_RETURN_1 -// RUN: %env_tool_opts=soft_rss_limit_mb=250:quarantine_size=1:allocator_may_return_null=0 not %run %t 2>&1 | FileCheck %s -check-prefix=CHECK_MAY_RETURN_0 --implicit-check-not="returned null" +// RUN: %env_tool_opts=soft_rss_limit_mb=384:quarantine_size=1:allocator_may_return_null=1 %run %t 2>&1 | FileCheck %s -check-prefix=CHECK_MAY_RETURN_1 +// RUN: %env_tool_opts=soft_rss_limit_mb=384:quarantine_size=1:allocator_may_return_null=0 not %run %t 2>&1 | FileCheck %s -check-prefix=CHECK_MAY_RETURN_0 --implicit-check-not="returned null" // This run uses getrusage. We can only test getrusage when allocator_may_return_null=0 // because getrusage gives us max-rss, not current-rss.
-// RUN: %env_tool_opts=soft_rss_limit_mb=250:quarantine_size=1:allocator_may_return_null=0:can_use_proc_maps_statm=0 not %run %t 2>&1 | FileCheck %s -check-prefix=CHECK_MAY_RETURN_0 --implicit-check-not="returned null" +// RUN: %env_tool_opts=soft_rss_limit_mb=384:quarantine_size=1:allocator_may_return_null=0:can_use_proc_maps_statm=0 not %run %t 2>&1 | FileCheck %s -check-prefix=CHECK_MAY_RETURN_0 --implicit-check-not="returned null" // REQUIRES: stable-runtime // Ubsan does not intercept pthread_create. diff --git a/flang/docs/OptionComparison.md b/flang/docs/OptionComparison.md index 934750a07e9bb..c416742c8217b 100644 --- a/flang/docs/OptionComparison.md +++ b/flang/docs/OptionComparison.md @@ -8,7 +8,7 @@ # Compiler options comparison -This document catalogs the options processed by Flang's peers/competitors. Much of the document is taken up by a set of tables that list the options categorized into different topics. Some of the table headings link to more information about the contents of the tables. For example, the table on **Standards conformance** options links to notes on Standards conformance. +This document catalogs the options processed by Flang's peers/competitors. Much of the document is taken up by a set of tables that list the options categorized into different topics. Some of the table headings link to more information about the contents of the tables. For example, the table on **Standards conformance** options links to notes on Standards conformance. **There's also important information in the ___[Appendix section](#appendix)___ near the end of the document on how this data was gathered and what ___is___ and ___is not___ included in this document.** @@ -28,7 +28,7 @@ Note that compilers may support language features without having an option for t IBM Intel PGI - Flang + Classic Flang Overall conformance @@ -112,7 +112,7 @@ fall-intrinsics PGI - Flang + Classic Flang @@ -241,7 +241,7 @@ fd-lines-as-comments PGI - Flang + Classic Flang @@ -410,7 +410,7 @@ fd-lines-as-comments PGI - Flang + Classic Flang @@ -471,7 +471,7 @@ fd-lines-as-comments PGI - Flang + Classic Flang @@ -604,7 +604,7 @@ Mr8intrinsics PGI - Flang + Classic Flang @@ -739,7 +739,7 @@ fdefault-integer-8 PGI - Flang + Classic Flang @@ -810,7 +810,7 @@ fdefault-integer-8 PGI - Flang + Classic Flang @@ -952,7 +952,7 @@ Msave PGI - Flang + Classic Flang @@ -1027,7 +1027,7 @@ Msave PGI - Flang + Classic Flang @@ -1098,7 +1098,7 @@ Mcuda PGI - Flang + Classic Flang @@ -1220,7 +1220,7 @@ IBM Fortran's options allow the source line length to be specified with the opti * **GNU:** For both "ffixed-line-length-_n_" and "ffree-line-length-_n_" options, characters are ignored after the specified length. The default for fixed is 72. The default for free is 132. For free, you can specify 'none' as the length, which means that all characters in the line are meaningful. * **IBM:** For **fixed**, the default is 72. For **free**, there's no default, but the maximum length for either form is 132. * **Intel:** The default is 72 for **fixed** and 132 for **free**. 
-* **PGI, Flang:** +* **PGI, Classic Flang:** * in free form, it is an error if the line is longer than 1000 characters * in fixed form by default, characters after column 72 are ignored * in fixed form with -Mextend, characters after column 132 are ignored @@ -1233,7 +1233,7 @@ IBM Fortran's options allow the source line length to be specified with the opti * **GNU:** The "-fbackslash" option changes the interpretation of backslashes in string literals from a single backslash character to "C-style" escape characters. The following combinations are expanded \a, \b, \f, \n, \r, \t, \v, \\, and \0 to the ASCII characters alert, backspace, form feed, newline, carriage return, horizontal tab, vertical tab, backslash, and NUL, respectively. Additionally, \xnn, \unnnn and \Unnnnnnnn (where each n is a hexadecimal digit) are translated into the Unicode characters corresponding to the specified code points. All other combinations of a character preceded by \ are unexpanded. * **Intel:** The option "-assume bscc" tells the compiler to treat the backslash character (\) as a C-style control (escape) character syntax in character literals. "nobscc" specifies that the backslash character is treated as a normal character in character literals. This is the default. -**"$" in symbol names:** Allowing "$" in names is controlled by an option in GNU and is the default behavior in IBM and Intel. Presumably, these compilers issue warnings when standard conformance options are enabled. Dollar signs in names don't seem to be allowed in Cray, PGI, or Flang. +**"$" in symbol names:** Allowing "$" in names is controlled by an option in GNU and is the default behavior in IBM and Intel. Presumably, these compilers issue warnings when standard conformance options are enabled. Dollar signs in names don't seem to be allowed in Cray, PGI, or Classic Flang. **DO loop handling** @@ -1328,7 +1328,7 @@ Here's the list of compilers surveyed, hot linked to the source of data on it. * [NAG Fortran Release 6.2](https://www.nag.co.uk/nagware/np/r62_doc/manual/compiler_2_4.html) * [Oracle Fortran version 819-0492-10](https://docs.oracle.com/cd/E19059-01/stud.10/819-0492/3_options.html) * PGI -- [Compiler Reference version 19.1](https://www.pgroup.com/resources/docs/19.1/x86/pgi-ref-guide/index.htm#cmdln-options-ref), [Fortran Reference Guide version 17](https://www.pgroup.com/doc/pgi17fortref.pdf) -* [Flang](https://github.com/flang-compiler/flang/wiki/Using-Flang) -- information from GitHub +* [Classic Flang](https://github.com/flang-compiler/flang/wiki/Using-Flang) -- information from GitHub This document has been kept relatively small by providing links to much of the information about options rather than duplicating that information. For IBM, Intel, and some PGI options, there are direct links. But direct links were not possible for Cray, GNU and some PGI options.
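As an aside to the escape-handling notes above: the single-character expansions gfortran's "-fbackslash" performs are the standard C ones. A small illustrative helper, not taken from any of the surveyed compilers (the \xnn, \unnnn, and \Unnnnnnnn forms additionally consume hex digits and are omitted here):

```cpp
#include <optional>

// Expand the character following a backslash, per the gfortran list above.
// Returns std::nullopt for combinations that are left unexpanded.
std::optional<char> expandSimpleEscape(char c) {
  switch (c) {
  case 'a':  return '\a'; // alert
  case 'b':  return '\b'; // backspace
  case 'f':  return '\f'; // form feed
  case 'n':  return '\n'; // newline
  case 'r':  return '\r'; // carriage return
  case 't':  return '\t'; // horizontal tab
  case 'v':  return '\v'; // vertical tab
  case '\\': return '\\'; // backslash
  case '0':  return '\0'; // NUL
  default:   return std::nullopt;
  }
}
```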
diff --git a/flang/docs/Unsigned.md b/flang/docs/Unsigned.md index 5c90e2aa185bc..5815822cd53ad 100644 --- a/flang/docs/Unsigned.md +++ b/flang/docs/Unsigned.md @@ -6,7 +6,7 @@ --> -# Fortran Extensions supported by Flang +# Flang support for UNSIGNED type ```{contents} --- diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 134702d6e4434..b9ca5cedfe8ec 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -3338,19 +3338,14 @@ static mlir::omp::WsloopOp genCompositeDoSimd( genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps, simdReductionSyms); - DataSharingProcessor wsloopItemDSP( - converter, semaCtx, doItem->clauses, eval, - /*shouldCollectPreDeterminedSymbols=*/false, - /*useDelayedPrivatization=*/true, symTable); + + DataSharingProcessor wsloopItemDSP(converter, semaCtx, doItem->clauses, eval, + /*shouldCollectPreDeterminedSymbols=*/true, + /*useDelayedPrivatization=*/true, + symTable); wsloopItemDSP.processStep1(); wsloopItemDSP.processStep2(&wsloopClauseOps); - DataSharingProcessor simdItemDSP(converter, semaCtx, simdItem->clauses, eval, - /*shouldCollectPreDeterminedSymbols=*/true, - /*useDelayedPrivatization=*/true, symTable); - simdItemDSP.processStep1(); - simdItemDSP.processStep2(&simdClauseOps, simdItem->id); - // Pass the innermost leaf construct's clauses because that's where COLLAPSE // is placed by construct decomposition. mlir::omp::LoopNestOperands loopNestClauseOps; @@ -3369,8 +3364,9 @@ static mlir::omp::WsloopOp genCompositeDoSimd( wsloopOp.setComposite(/*val=*/true); EntryBlockArgs simdArgs; - simdArgs.priv.syms = simdItemDSP.getDelayedPrivSymbols(); - simdArgs.priv.vars = simdClauseOps.privateVars; + // For composite 'do simd', privatization is handled by the wsloop. + // The simd does not create separate private storage for variables already + // privatized by the worksharing construct. 
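+ // (Hence simdArgs.priv is left unset below; the wsloop's private entry + // block arguments carry the privatized values for both leaf constructs.)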
simdArgs.reduction.syms = simdReductionSyms; simdArgs.reduction.vars = simdClauseOps.reductionVars; auto simdOp = @@ -3380,7 +3376,7 @@ static mlir::omp::WsloopOp genCompositeDoSimd( genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, simdItem, loopNestClauseOps, iv, {{wsloopOp, wsloopArgs}, {simdOp, simdArgs}}, - llvm::omp::Directive::OMPD_do_simd, simdItemDSP); + llvm::omp::Directive::OMPD_do_simd, wsloopItemDSP); return wsloopOp; } diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 6b3bcd00c5bec..a0e106d70a5fd 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -1920,7 +1920,7 @@ struct OmpLoopConstructParser { auto loopItem{LoopNestParser{} || ompLoopConstruct}; if (auto &&begin{OmpBeginDirectiveParser(dirs_).Parse(state)}) { - auto loopDir{begin->DirName().v}; + auto loopDir{begin->DirId()}; auto assoc{llvm::omp::getDirectiveAssociation(loopDir)}; if (assoc == llvm::omp::Association::LoopNest) { if (auto &&item{attempt(loopItem).Parse(state)}) { diff --git a/flang/lib/Semantics/check-omp-loop.cpp b/flang/lib/Semantics/check-omp-loop.cpp index f25cf7eb33817..726dbe865834d 100644 --- a/flang/lib/Semantics/check-omp-loop.cpp +++ b/flang/lib/Semantics/check-omp-loop.cpp @@ -271,7 +271,7 @@ void OmpStructureChecker::CheckNestedBlock(const parser::OpenMPLoopConstruct &x, } else if (parser::Unwrap(stmt)) { ++nestedCount; } else if (auto *omp{parser::Unwrap(stmt)}) { - if (!IsLoopTransforming(omp->BeginDir().DirName().v)) { + if (!IsLoopTransforming(omp->BeginDir().DirId())) { context_.Say(omp->source, "Only loop-transforming OpenMP constructs are allowed inside OpenMP loop constructs"_err_en_US); } @@ -324,7 +324,7 @@ void OmpStructureChecker::CheckFullUnroll( // since it won't contain a loop. 
if (const parser::OpenMPLoopConstruct *nested{x.GetNestedConstruct()}) { auto &nestedSpec{nested->BeginDir()}; - if (nestedSpec.DirName().v == llvm::omp::Directive::OMPD_unroll) { + if (nestedSpec.DirId() == llvm::omp::Directive::OMPD_unroll) { bool isPartial{ llvm::any_of(nestedSpec.Clauses().v, [](const parser::OmpClause &c) { return c.Id() == llvm::omp::Clause::OMPC_partial; diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index dfba9e344a046..05364157afc9d 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -2816,7 +2816,7 @@ void OmpStructureChecker::Leave(const parser::OpenMPCancelConstruct &) { void OmpStructureChecker::Enter(const parser::OpenMPCriticalConstruct &x) { const parser::OmpBeginDirective &beginSpec{x.BeginDir()}; const std::optional<parser::OmpEndDirective> &endSpec{x.EndDir()}; - PushContextAndClauseSets(beginSpec.DirName().source, beginSpec.DirName().v); + PushContextAndClauseSets(beginSpec.DirName().source, beginSpec.DirId()); const auto &block{std::get<parser::Block>(x.t)}; CheckNoBranching( diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 5c9d03d9e5d67..66b34c7221814 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -2477,7 +2477,7 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPSectionConstruct &x) { bool OmpAttributeVisitor::Pre(const parser::OpenMPCriticalConstruct &x) { const parser::OmpBeginDirective &beginSpec{x.BeginDir()}; - PushContext(beginSpec.DirName().source, beginSpec.DirName().v); + PushContext(beginSpec.DirName().source, beginSpec.DirId()); GetContext().withinConstruct = true; return true; } diff --git a/flang/lib/Semantics/rewrite-parse-tree.cpp b/flang/lib/Semantics/rewrite-parse-tree.cpp index 285eaac1e2c8f..60e3e6ab3f5f1 100644 --- a/flang/lib/Semantics/rewrite-parse-tree.cpp +++ b/flang/lib/Semantics/rewrite-parse-tree.cpp @@ -118,7 +118,7 @@ static bool ReturnsDataPointer(const Symbol &symbol) { } static bool LoopConstructIsSIMD(parser::OpenMPLoopConstruct *ompLoop) { - return llvm::omp::allSimdSet.test(ompLoop->BeginDir().DirName().v); + return llvm::omp::allSimdSet.test(ompLoop->BeginDir().DirId()); } // Remove non-SIMD OpenMPConstructs once they are parsed. diff --git a/flang/test/Integration/OpenMP/do-simd-firstprivate-lastprivate-runtime.f90 b/flang/test/Integration/OpenMP/do-simd-firstprivate-lastprivate-runtime.f90 new file mode 100644 index 0000000000000..4fef69188e0ee --- /dev/null +++ b/flang/test/Integration/OpenMP/do-simd-firstprivate-lastprivate-runtime.f90 @@ -0,0 +1,48 @@ +! Test runtime behavior of DO SIMD with firstprivate and lastprivate on same variable +! This is the reproducer from issue #168306 + +! REQUIRES: openmp-runtime + +! RUN: %flang_fc1 -fopenmp -emit-llvm %s -o - | FileCheck %s --check-prefix=LLVM +! RUN: %flang -fopenmp %s -o %t && %t | FileCheck %s + +! LLVM-LABEL: define {{.*}} @_QQmain +program main + integer :: a + integer :: i + + a = 10 + !$omp do simd lastprivate(a) firstprivate(a) + do i = 1, 1 + ! Inside loop: a should be 10 (from firstprivate initialization) + ! CHECK: main1 : a = 10 + print *, "main1 : a = ", a + a = 20 + end do + !$omp end do simd + ! After loop: a should be 20 (from lastprivate copy-out) + ! CHECK: main2 : a = 20 + print *, "main2 : a = ", a + + call sub + !
CHECK: pass + print *, 'pass' +end program main + +subroutine sub + integer :: a + integer :: i + + a = 10 + !$omp do simd lastprivate(a) firstprivate(a) + do i = 1, 1 + ! Inside loop: a should be 10 (from firstprivate initialization) + ! CHECK: sub1 : a = 10 + print *, "sub1 : a = ", a + a = 20 + end do + !$omp end do simd + ! After loop: a should be 20 (from lastprivate copy-out) + ! CHECK: sub2 : a = 20 + print *, "sub2 : a = ", a +end subroutine sub diff --git a/flang/test/Lower/OpenMP/do-simd-firstprivate-lastprivate.f90 b/flang/test/Lower/OpenMP/do-simd-firstprivate-lastprivate.f90 new file mode 100644 index 0000000000000..429409926d47b --- /dev/null +++ b/flang/test/Lower/OpenMP/do-simd-firstprivate-lastprivate.f90 @@ -0,0 +1,89 @@ +! Test for DO SIMD with the same variable in both firstprivate and lastprivate clauses +! This tests the fix for issue #168306 + +! RUN: %flang_fc1 -fopenmp -mmlir --enable-delayed-privatization-staging=true -emit-hlfir %s -o - | FileCheck %s + +! Test case 1: Basic test with firstprivate + lastprivate on same variable +! CHECK-LABEL: func.func @_QPdo_simd_first_last_same_var +subroutine do_simd_first_last_same_var() + integer :: a + integer :: i + a = 10 + + ! CHECK: omp.wsloop + ! CHECK-SAME: private(@{{.*}}firstprivate{{.*}} %{{.*}} -> %[[FIRSTPRIV_A:.*]], @{{.*}}private{{.*}} %{{.*}} -> %[[PRIV_I:.*]] : !fir.ref, !fir.ref) + ! CHECK-NEXT: omp.simd + ! CHECK-NOT: private + ! CHECK-NEXT: omp.loop_nest (%[[IV:.*]]) : i32 + !$omp do simd firstprivate(a) lastprivate(a) + do i = 1, 1 + ! CHECK: %[[FIRSTPRIV_A_DECL:.*]]:2 = hlfir.declare %[[FIRSTPRIV_A]] + ! CHECK: %[[PRIV_I_DECL:.*]]:2 = hlfir.declare %[[PRIV_I]] + ! The private copy should be initialized from firstprivate (value 10) + ! and then modified to 20 + a = 20 + end do + !$omp end do simd + ! After the loop, 'a' should be 20 due to lastprivate +end subroutine do_simd_first_last_same_var + +! Test case 2: Test with lastprivate and firstprivate in reverse order +! CHECK-LABEL: func.func @_QPdo_simd_last_first_reverse +subroutine do_simd_last_first_reverse() + integer :: a + integer :: i + a = 10 + + ! CHECK: omp.wsloop + ! CHECK-SAME: private(@{{.*}}firstprivate{{.*}} %{{.*}} -> %[[FIRSTPRIV_A:.*]], @{{.*}}private{{.*}} %{{.*}} -> %[[PRIV_I:.*]] : !fir.ref, !fir.ref) + ! CHECK-NEXT: omp.simd + ! CHECK-NOT: private + !$omp do simd lastprivate(a) firstprivate(a) + do i = 1, 1 + a = 20 + end do + !$omp end do simd +end subroutine do_simd_last_first_reverse + +! Test case 3: Multiple variables with mixed privatization +! CHECK-LABEL: func.func @_QPdo_simd_multiple_vars +subroutine do_simd_multiple_vars() + integer :: a, b, c + integer :: i + a = 10 + b = 20 + c = 30 + + ! CHECK: omp.wsloop + ! CHECK-SAME: private(@{{.*}}firstprivate{{.*}} %{{.*}} -> %{{.*}}, @{{.*}}firstprivate{{.*}} %{{.*}} -> %{{.*}}, @{{.*}}private{{.*}} %{{.*}} -> %{{.*}} : !fir.ref, !fir.ref, !fir.ref) + ! CHECK-NEXT: omp.simd + ! CHECK-NOT: private + !$omp do simd firstprivate(a, b) lastprivate(a) private(c) + do i = 1, 5 + a = a + 1 + b = b + 1 + c = i + end do + !$omp end do simd +end subroutine do_simd_multiple_vars + +! Test case 4: Reproducer from issue #168306 +! CHECK-LABEL: func.func @_QPissue_168306_reproducer +subroutine issue_168306_reproducer() + integer :: a + integer :: i + a = 10 + + ! CHECK: omp.wsloop + ! CHECK-SAME: private(@{{.*}}firstprivate{{.*}} %{{.*}} -> %[[FIRSTPRIV_A:.*]], @{{.*}}private{{.*}} %{{.*}} -> %[[PRIV_I:.*]] : !fir.ref, !fir.ref) + ! CHECK-NEXT: omp.simd + ! 
CHECK-NOT: private + !$omp do simd lastprivate(a) firstprivate(a) + do i = 1, 1 + ! Inside the loop, 'a' should start at 10 (from firstprivate) + ! This is the key behavior that was broken + a = 20 + end do + !$omp end do simd + ! After the loop, 'a' should be 20 (from lastprivate) +end subroutine issue_168306_reproducer diff --git a/flang/test/Lower/OpenMP/order-clause.f90 b/flang/test/Lower/OpenMP/order-clause.f90 index 1f678e02708da..bf007c765b0c1 100644 --- a/flang/test/Lower/OpenMP/order-clause.f90 +++ b/flang/test/Lower/OpenMP/order-clause.f90 @@ -36,15 +36,15 @@ end subroutine do_order !CHECK-LABEL: func.func @_QPdo_simd_order() { subroutine do_simd_order - !CHECK: omp.wsloop order(reproducible:concurrent) { + !CHECK: omp.wsloop order(reproducible:concurrent) !$omp do simd order(concurrent) do i = 1, 10 end do - !CHECK: omp.wsloop order(reproducible:concurrent) { + !CHECK: omp.wsloop order(reproducible:concurrent) !$omp do simd order(reproducible:concurrent) do i = 1, 10 end do - !CHECK: omp.wsloop order(unconstrained:concurrent) { + !CHECK: omp.wsloop order(unconstrained:concurrent) !$omp do simd order(unconstrained:concurrent) do i = 1, 10 end do @@ -53,7 +53,7 @@ end subroutine do_simd_order !CHECK-LABEL: func.func @_QPdo_simd_order_parallel() { subroutine do_simd_order_parallel !CHECK: omp.parallel { - !CHECK: omp.wsloop order(reproducible:concurrent) { + !CHECK: omp.wsloop order(reproducible:concurrent) !$omp parallel do simd order(reproducible:concurrent) do i = 1, 10 end do diff --git a/flang/test/Lower/OpenMP/wsloop-simd.f90 b/flang/test/Lower/OpenMP/wsloop-simd.f90 index 03e35de04cace..b18bc29efb230 100644 --- a/flang/test/Lower/OpenMP/wsloop-simd.f90 +++ b/flang/test/Lower/OpenMP/wsloop-simd.f90 @@ -71,16 +71,13 @@ end subroutine do_simd_reduction subroutine do_simd_private() integer, allocatable :: tmp ! CHECK: omp.wsloop + ! CHECK-SAME: private(@[[PRIV_IVAR_SYM:.*]] %{{.*}} -> %[[PRIV_IVAR:.*]] : !fir.ref) ! CHECK-NEXT: omp.simd - ! CHECK-SAME: private(@[[PRIV_BOX_SYM:.*]] %{{.*}} -> %[[PRIV_BOX:.*]], @[[PRIV_IVAR_SYM:.*]] %{{.*}} -> %[[PRIV_IVAR:.*]] : !fir.ref>>, !fir.ref) ! CHECK-NEXT: omp.loop_nest (%[[IVAR:.*]]) : i32 !$omp do simd private(tmp) do i=1, 10 - ! CHECK: %[[PRIV_BOX_DECL:.*]]:2 = hlfir.declare %[[PRIV_BOX]] ! CHECK: %[[PRIV_IVAR_DECL:.*]]:2 = hlfir.declare %[[PRIV_IVAR]] ! CHECK: hlfir.assign %[[IVAR]] to %[[PRIV_IVAR_DECL]]#0 - ! CHECK: %[[PRIV_BOX_LOAD:.*]] = fir.load %[[PRIV_BOX_DECL]] - ! CHECK: hlfir.assign %{{.*}} to %[[PRIV_BOX_DECL]]#0 ! CHECK: omp.yield tmp = tmp + 1 end do @@ -90,13 +87,11 @@ end subroutine do_simd_private subroutine do_simd_lastprivate_firstprivate() integer :: a ! CHECK: omp.wsloop - ! CHECK-SAME: private(@[[FIRSTPRIVATE_A_SYM:.*]] %{{.*}} -> %[[FIRSTPRIVATE_A:.*]] : !fir.ref) + ! CHECK-SAME: private(@[[FIRSTPRIVATE_A_SYM:.*]] %{{.*}} -> %[[FIRSTPRIVATE_A:.*]], @[[PRIVATE_I_SYM:.*]] %{{.*}} -> %[[PRIVATE_I:.*]] : !fir.ref, !fir.ref) ! CHECK-NEXT: omp.simd - ! CHECK-SAME: private(@[[PRIVATE_A_SYM:.*]] %{{.*}} -> %[[PRIVATE_A:.*]], @[[PRIVATE_I_SYM:.*]] %{{.*}} -> %[[PRIVATE_I:.*]] : !fir.ref, !fir.ref) !$omp do simd lastprivate(a) firstprivate(a) do i = 1, 10 ! CHECK: %[[FIRSTPRIVATE_A_DECL:.*]]:2 = hlfir.declare %[[FIRSTPRIVATE_A]] - ! CHECK: %[[PRIVATE_A_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_A]] ! 
CHECK: %[[PRIVATE_I_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_I]] a = a + 1 end do diff --git a/libcxx/src/include/refstring.h b/libcxx/src/include/refstring.h index 3e0ec7a97c7be..1c73c60f9ced1 100644 --- a/libcxx/src/include/refstring.h +++ b/libcxx/src/include/refstring.h @@ -15,7 +15,7 @@ #include #include -// MacOS and iOS used to ship with libstdc++, and still support old applications +// MacOS used to ship with libstdc++, and still supports old applications // linking against libstdc++. The libc++ and libstdc++ exceptions are supposed // to be ABI compatible, such that they can be thrown from one library and caught // in the other. @@ -25,7 +25,7 @@ // string singleton before manipulating the reference count. This is done so that // if an exception is created with a zero-length string in libstdc++, libc++abi // won't try to delete the memory. -#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) || defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) +#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) # define _LIBCPP_CHECK_FOR_GCC_EMPTY_STRING_STORAGE # include # include diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index ef19a2af0c4d2..59aa43036ce01 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -1295,7 +1295,7 @@ unsigned RelocScan::handleTlsRelocation(RelExpr expr, RelType type, // label, so TLSDESC=>IE will be categorized as R_RELAX_TLS_GD_TO_LE. We fix // the categorization in RISCV::relocateAlloc. if (sym.isPreemptible) { - sym.setFlags(NEEDS_TLSGD_TO_IE); + sym.setFlags(NEEDS_TLSIE); sec->addReloc({ctx.target->adjustTlsExpr(type, R_RELAX_TLS_GD_TO_IE), type, offset, addend, &sym}); } else { @@ -1635,18 +1635,13 @@ void elf::postScanRelocations(Ctx &ctx) { else got->addConstant({R_ABS, ctx.target->tlsOffsetRel, offsetOff, 0, &sym}); } - if (flags & NEEDS_TLSGD_TO_IE) { - got->addEntry(sym); - ctx.mainPart->relaDyn->addSymbolReloc(ctx.target->tlsGotRel, *got, - sym.getGotOffset(ctx), sym); - } if (flags & NEEDS_GOT_DTPREL) { got->addEntry(sym); got->addConstant( {R_ABS, ctx.target->tlsOffsetRel, sym.getGotOffset(ctx), 0, &sym}); } - if ((flags & NEEDS_TLSIE) && !(flags & NEEDS_TLSGD_TO_IE)) + if (flags & NEEDS_TLSIE) addTpOffsetGotEntry(ctx, sym); }; diff --git a/lld/ELF/Symbols.h b/lld/ELF/Symbols.h index a7d61f48ed3d5..034c8734addb8 100644 --- a/lld/ELF/Symbols.h +++ b/lld/ELF/Symbols.h @@ -48,7 +48,7 @@ enum { NEEDS_COPY = 1 << 3, NEEDS_TLSDESC = 1 << 4, NEEDS_TLSGD = 1 << 5, - NEEDS_TLSGD_TO_IE = 1 << 6, + // 1 << 6 unused NEEDS_GOT_DTPREL = 1 << 7, NEEDS_TLSIE = 1 << 8, NEEDS_GOT_AUTH = 1 << 9, @@ -352,7 +352,7 @@ class Symbol { bool needsDynReloc() const { return flags.load(std::memory_order_relaxed) & (NEEDS_COPY | NEEDS_GOT | NEEDS_PLT | NEEDS_TLSDESC | NEEDS_TLSGD | - NEEDS_TLSGD_TO_IE | NEEDS_GOT_DTPREL | NEEDS_TLSIE); + NEEDS_GOT_DTPREL | NEEDS_TLSIE); } void allocateAux(Ctx &ctx) { assert(auxIdx == 0); diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst index 2eb167709dbda..9f76b3a6719c6 100644 --- a/lldb/docs/resources/build.rst +++ b/lldb/docs/resources/build.rst @@ -104,7 +104,7 @@ Build Requirements Please follow the steps below if you only want to **build** lldb. -1. Install `Visual Studio ` with the +1. Install `Visual Studio `_ with the "Desktop Development with C++" workload. Make sure that the latest Windows SDK and the Active Template Library (ATL) are installed. 2.
Install `Git Bash `_ and add diff --git a/lldb/docs/use/map.rst b/lldb/docs/use/map.rst index da566e7afe058..1221546db45da 100644 --- a/lldb/docs/use/map.rst +++ b/lldb/docs/use/map.rst @@ -802,7 +802,7 @@ Print the dynamic type of the result of an expression LLDB does this automatically if determining the dynamic type does not require running the target (in C++, running the target is never needed). This default is -controlled by the `target.prefer-dynamic-value` setting. If that is disabled, it +controlled by the ``target.prefer-dynamic-value`` setting. If that is disabled, it can be re-enabled on a per-command basis: .. code-block:: shell @@ -812,7 +812,7 @@ can be re-enabled on a per-command basis: (lldb) expr -d no-run-target -- someCPPObjectPtr Note that printing of the dynamic type of references is not possible with the -`expr` command. The workaround is to take the address of the reference and +``expr`` command. The workaround is to take the address of the reference and instruct lldb to print the children of the resulting pointer. .. code-block:: shell diff --git a/lldb/include/lldb/Interpreter/CommandReturnObject.h b/lldb/include/lldb/Interpreter/CommandReturnObject.h index 0742f1b836f5e..f6e60840256a4 100644 --- a/lldb/include/lldb/Interpreter/CommandReturnObject.h +++ b/lldb/include/lldb/Interpreter/CommandReturnObject.h @@ -129,8 +129,6 @@ class CommandReturnObject { void AppendError(llvm::StringRef in_string); - void AppendRawError(llvm::StringRef in_string); - void AppendErrorWithFormat(const char *format, ...) __attribute__((format(printf, 2, 3))); diff --git a/lldb/source/Commands/CommandObjectMultiword.cpp b/lldb/source/Commands/CommandObjectMultiword.cpp index a369557cca845..e08b33cdce940 100644 --- a/lldb/source/Commands/CommandObjectMultiword.cpp +++ b/lldb/source/Commands/CommandObjectMultiword.cpp @@ -205,7 +205,7 @@ void CommandObjectMultiword::Execute(const char *args_string, .str()); } error_msg.append("\n"); - result.AppendRawError(error_msg.c_str()); + result.AppendError(error_msg); } std::string CommandObjectMultiword::GetSubcommandsHintText() { diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index ffcc9ceeb2a93..afc1753e21c46 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -3702,13 +3702,11 @@ CommandInterpreter::ResolveCommandImpl(std::string &command_line, done = static_cast(cmd_obj); } else { StreamString error_msg; - error_msg.Printf("Ambiguous command '%s'. Possible matches:\n", + error_msg.Printf("ambiguous command '%s'. Possible matches:\n", next_word.c_str()); - - for (uint32_t i = 0; i < num_matches; ++i) { + for (uint32_t i = 0; i < num_matches; ++i) error_msg.Printf("\t%s\n", matches.GetStringAtIndex(i)); - } - result.AppendRawError(error_msg.GetString()); + result.AppendError(error_msg.GetString()); } } else { // We didn't have only one match, otherwise we wouldn't get here. diff --git a/lldb/source/Interpreter/CommandReturnObject.cpp b/lldb/source/Interpreter/CommandReturnObject.cpp index ef5bfae1bd1bd..85b058e97a679 100644 --- a/lldb/source/Interpreter/CommandReturnObject.cpp +++ b/lldb/source/Interpreter/CommandReturnObject.cpp @@ -173,15 +173,6 @@ StructuredData::ObjectSP CommandReturnObject::GetErrorData() { return Serialize(m_diagnostics); } -// Similar to AppendError, but do not prepend 'Status: ' to message, and don't -// append "\n" to the end of it. 
- -void CommandReturnObject::AppendRawError(llvm::StringRef in_string) { - SetStatus(eReturnStatusFailed); - assert(!in_string.empty() && "Expected a non-empty error message"); - GetErrorStream() << in_string; -} - void CommandReturnObject::SetStatus(ReturnStatus status) { m_status = status; } ReturnStatus CommandReturnObject::GetStatus() const { return m_status; } diff --git a/lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp b/lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp index 5289027fbd8af..8c5ac31aef3f3 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp @@ -203,6 +203,16 @@ class MsvcStlListFrontEnd : public AbstractListFrontEnd { ValueObject *m_tail = nullptr; }; +/// Gets the (forward-)list element type from the head node instead of the +/// template arguments. This is needed with PDB as it doesn't have info about +/// the template arguments. +CompilerType GetMsvcStlElementTypeFromHead(ValueObject &head) { + auto val_sp = head.GetChildMemberWithName("_Myval"); + if (val_sp) + return val_sp->GetCompilerType(); + return CompilerType(); +} + } // end anonymous namespace template @@ -530,6 +540,10 @@ lldb::ChildCacheState MsvcStlForwardListFrontEnd::Update() { m_backend.GetChildAtNamePath({"_Mypair", "_Myval2", "_Myhead"})) m_head = head_sp.get(); + // With PDB, we can't get the element type from the template arguments + if (!m_element_type && m_head) + m_element_type = GetMsvcStlElementTypeFromHead(*m_head); + return ChildCacheState::eRefetch; } @@ -606,6 +620,10 @@ lldb::ChildCacheState MsvcStlListFrontEnd::Update() { m_head = first.get(); m_tail = last.get(); + // With PDB, we can't get the element type from the template arguments + if (!m_element_type && m_head) + m_element_type = GetMsvcStlElementTypeFromHead(*m_head); + return lldb::ChildCacheState::eRefetch; } diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp index 954f269f8860b..ebde8892d8f62 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp @@ -111,7 +111,6 @@ bool ClassDescriptorV2::class_rw_t::Read(Process *process, lldb::addr_t addr) { process->GetAddressByteSize()); lldb::offset_t cursor = 0; - m_flags = extractor.GetU32_unchecked(&cursor); m_version = extractor.GetU32_unchecked(&cursor); m_ro_ptr = extractor.GetAddress_unchecked(&cursor); @@ -119,18 +118,16 @@ bool ClassDescriptorV2::class_rw_t::Read(Process *process, lldb::addr_t addr) { m_ro_ptr = abi_sp->FixCodeAddress(m_ro_ptr); m_method_list_ptr = extractor.GetAddress_unchecked(&cursor); m_properties_ptr = extractor.GetAddress_unchecked(&cursor); - m_firstSubclass = extractor.GetAddress_unchecked(&cursor); - m_nextSiblingClass = extractor.GetAddress_unchecked(&cursor); if (m_ro_ptr & 1) { DataBufferHeap buffer(ptr_size, '\0'); process->ReadMemory(m_ro_ptr ^ 1, buffer.GetBytes(), ptr_size, error); if (error.Fail()) return false; - cursor = 0; DataExtractor extractor(buffer.GetBytes(), ptr_size, process->GetByteOrder(), process->GetAddressByteSize()); + lldb::offset_t cursor = 0; m_ro_ptr = extractor.GetAddress_unchecked(&cursor); if (ABISP abi_sp = process->GetABI()) m_ro_ptr = abi_sp->FixCodeAddress(m_ro_ptr); diff --git 
a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h index 0fff9af438367..8d19b00f1551f 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h @@ -133,9 +133,6 @@ class ClassDescriptorV2 : public ObjCLanguageRuntime::ClassDescriptor { lldb::addr_t m_properties_ptr; lldb::addr_t m_protocols_ptr; - ObjCLanguageRuntime::ObjCISA m_firstSubclass; - ObjCLanguageRuntime::ObjCISA m_nextSiblingClass; - bool Read(Process *process, lldb::addr_t addr); }; diff --git a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp index 0bedb5e753b77..4ab4cac68c5f8 100644 --- a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp +++ b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.cpp @@ -307,6 +307,41 @@ struct WasmFunction { uint32_t size = 0; }; +static llvm::Expected ParseImports(DataExtractor &import_data) { + // Currently this function just returns the number of imported functions. + // If we want to do anything with global names in the future, we'll also + // need to know those. + llvm::DataExtractor data = import_data.GetAsLLVM(); + llvm::DataExtractor::Cursor c(0); + + llvm::Expected count = GetULEB32(data, c); + if (!count) + return count.takeError(); + + uint32_t function_imports = 0; + for (uint32_t i = 0; c && i < *count; ++i) { + // We don't need module and field names, so we can just get them as raw + // strings and discard. + if (!GetWasmString(data, c)) + return llvm::createStringError("failed to parse module name"); + if (!GetWasmString(data, c)) + return llvm::createStringError("failed to parse field name"); + + uint8_t kind = data.getU8(c); + if (kind == llvm::wasm::WASM_EXTERNAL_FUNCTION) + function_imports++; + + // For function imports, this is a type index. For others it's different. + // We don't need it, just need to parse it to advance the cursor. 
+ data.getULEB128(c); + } + + if (!c) + return c.takeError(); + + return function_imports; +} + static llvm::Expected> ParseFunctions(DataExtractor &data) { lldb::offset_t offset = 0; @@ -410,7 +445,8 @@ static llvm::Expected> ParseData(DataExtractor &data) { static llvm::Expected> ParseNames(SectionSP code_section_sp, DataExtractor &name_data, const std::vector &functions, - std::vector &segments) { + std::vector &segments, + uint32_t num_imported_functions) { llvm::DataExtractor data = name_data.GetAsLLVM(); llvm::DataExtractor::Cursor c(0); @@ -434,15 +470,29 @@ ParseNames(SectionSP code_section_sp, DataExtractor &name_data, llvm::Expected name = GetWasmString(data, c); if (!name) return name.takeError(); - if (*idx >= functions.size()) + if (*idx >= num_imported_functions + functions.size()) continue; - symbols.emplace_back( - symbols.size(), *name, lldb::eSymbolTypeCode, - /*external=*/false, /*is_debug=*/false, /*is_trampoline=*/false, - /*is_artificial=*/false, code_section_sp, - functions[i].section_offset, functions[i].size, - /*size_is_valid=*/true, /*contains_linker_annotations=*/false, - /*flags=*/0); + + if (*idx < num_imported_functions) { + symbols.emplace_back(symbols.size(), *name, lldb::eSymbolTypeCode, + /*external=*/true, /*is_debug=*/false, + /*is_trampoline=*/false, + /*is_artificial=*/false, + /*section_sp=*/lldb::SectionSP(), + /*value=*/0, /*size=*/0, + /*size_is_valid=*/false, + /*contains_linker_annotations=*/false, + /*flags=*/0); + } else { + const WasmFunction &func = functions[*idx - num_imported_functions]; + symbols.emplace_back(symbols.size(), *name, lldb::eSymbolTypeCode, + /*external=*/false, /*is_debug=*/false, + /*is_trampoline=*/false, /*is_artificial=*/false, + code_section_sp, func.section_offset, func.size, + /*size_is_valid=*/true, + /*contains_linker_annotations=*/false, + /*flags=*/0); + } } } break; case llvm::wasm::WASM_NAMES_DATA_SEGMENT: { @@ -590,6 +640,20 @@ void ObjectFileWasm::CreateSections(SectionList &unified_section_list) { } } + // Parse the import section. The number of functions is needed because the + // function index space used in the name section includes imports. + if (std::optional info = + GetSectionInfo(llvm::wasm::WASM_SEC_IMPORT)) { + DataExtractor import_data = ReadImageData(info->offset, info->size); + llvm::Expected num_imports = ParseImports(import_data); + if (!num_imports) { + LLDB_LOG_ERROR(log, num_imports.takeError(), + "Failed to parse Wasm import section: {0}"); + } else { + m_num_imported_functions = *num_imports; + } + } + // Parse the data section. 
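As an aside on the hunk above: the name section indexes functions in a single index space in which imports come first, which is why ParseNames now needs the import count. A minimal sketch of the mapping rule follows; the helper names are hypothetical and not part of the patch.

#include <cstdint>

// Sketch only: how a Wasm name-section function index splits into "import"
// vs. "defined" under the convention used in ParseNames above.
static bool isImportedFunction(uint32_t idx, uint32_t num_imported_functions) {
  // Indices [0, num_imported_functions) name imports, which have no body in
  // the code section; hence the symbol with no section, address, or size.
  return idx < num_imported_functions;
}

static uint32_t definedFunctionSlot(uint32_t idx,
                                    uint32_t num_imported_functions) {
  // Indices at or past the import count name locally defined functions;
  // subtracting the import count yields the position in the parsed
  // functions vector.
  return idx - num_imported_functions;
}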
std::optional data_info = GetSectionInfo(llvm::wasm::WASM_SEC_DATA); @@ -609,7 +673,7 @@ void ObjectFileWasm::CreateSections(SectionList &unified_section_list) { DataExtractor names_data = ReadImageData(info->offset, info->size); llvm::Expected> symbols = ParseNames( m_sections_up->FindSectionByType(lldb::eSectionTypeCode, false), - names_data, functions, segments); + names_data, functions, segments, m_num_imported_functions); if (!symbols) { LLDB_LOG_ERROR(log, symbols.takeError(), "Failed to parse Wasm names: {0}"); diff --git a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.h b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.h index 86ecbf26803cf..17fe23c1131d2 100644 --- a/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.h +++ b/lldb/source/Plugins/ObjectFile/wasm/ObjectFileWasm.h @@ -146,6 +146,7 @@ class ObjectFileWasm : public ObjectFile { /// \} std::vector m_sect_infos; + uint32_t m_num_imported_functions = 0; std::vector m_symbols; ArchSpec m_arch; UUID m_uuid; diff --git a/lldb/test/API/functionalities/abbreviation/TestAbbreviations.py b/lldb/test/API/functionalities/abbreviation/TestAbbreviations.py index cc767edaaa619..e5bbb9ee2eb9d 100644 --- a/lldb/test/API/functionalities/abbreviation/TestAbbreviations.py +++ b/lldb/test/API/functionalities/abbreviation/TestAbbreviations.py @@ -41,7 +41,7 @@ def test_command_abbreviations_and_aliases(self): # "pl" could be "platform" or "plugin". command_interpreter.ResolveCommand("pl", result) self.assertFalse(result.Succeeded()) - self.assertTrue(result.GetError().startswith("Ambiguous command")) + self.assertTrue(result.GetError().startswith("error: ambiguous command")) # Make sure an unabbreviated command is not mangled. command_interpreter.ResolveCommand( diff --git a/lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py b/lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py index 14c66fefea7ef..f114820d16347 100644 --- a/lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py +++ b/lldb/test/API/functionalities/ambigous_commands/TestAmbiguousCommands.py @@ -24,7 +24,7 @@ def test_ambiguous_command_with_alias(self): self.assertFalse(result.Succeeded()) self.assertEqual( result.GetError(), - "Ambiguous command 'co'. Possible matches:\n\tcommand\n\tcontinue\n\tcorefile\n", + "error: ambiguous command 'co'. 
Possible matches:\n\tcommand\n\tcontinue\n\tcorefile\n", ) command_interpreter.HandleCommand("command unalias continue", result) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py index 45695c43b42a9..1db0c489bc7f9 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py @@ -9,6 +9,8 @@ class TestDataFormatterGenericForwardList(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): TestBase.setUp(self) self.line = line_number("main.cpp", "// break here") diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py index c0207e6ab5911..fbd021190214b 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py @@ -10,6 +10,8 @@ class GenericListDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): # Call super's setUp(). TestBase.setUp(self) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py index f6174dd786380..9c5daf760b31f 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py @@ -11,6 +11,7 @@ class GenericListDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True NO_DEBUG_INFO_TESTCASE = True def do_test_with_run_command(self): diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/optional/TestDataFormatterGenericOptional.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/optional/TestDataFormatterGenericOptional.py index 7bb4f75de4e59..c88e83bb5b1f4 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/optional/TestDataFormatterGenericOptional.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/optional/TestDataFormatterGenericOptional.py @@ -5,6 +5,8 @@ class GenericOptionalDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def do_test_with_run_command(self): """Test that that file and class static variables display correctly.""" @@ -55,7 +57,11 @@ def cleanup(): self.expect( "frame var numbers", substrs=[ - "(optional_int_vect) numbers = Has Value=true {", + ( + "(std::optional>>) numbers = Has Value=true {" + if self.getDebugInfo() == "pdb" + else "(optional_int_vect) numbers = Has Value=true {" + ), "Value = size=4 {", "[0] = 1", "[1] = 2", @@ -69,7 +75,11 @@ def cleanup(): self.expect( "frame var ostring", substrs=[ - "(optional_string) ostring = Has Value=true {", + ( + "(std::optional, std::allocator>>) ostring = Has Value=true {" + if 
self.getDebugInfo() == "pdb" + else "(optional_string) ostring = Has Value=true {" + ), 'Value = "hello"', "}", ], diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/shared_ptr/TestDataFormatterStdSharedPtr.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/shared_ptr/TestDataFormatterStdSharedPtr.py index d71fbf8d5f81a..fa03fc14dfb83 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/shared_ptr/TestDataFormatterStdSharedPtr.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/shared_ptr/TestDataFormatterStdSharedPtr.py @@ -9,6 +9,8 @@ class TestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def do_test(self): """Test `frame variable` output for `std::shared_ptr` types.""" (_, process, _, bkpt) = lldbutil.run_to_source_breakpoint( @@ -62,7 +64,7 @@ def do_test(self): valobj = self.expect_var_path("sp_user", type="std::shared_ptr") self.assertRegex( valobj.summary, - "element_type @ 0x0*[1-9a-f][0-9a-f]+( strong=1)? weak=0", + f"{'User' if self.getDebugInfo() == 'pdb' else 'element_type'} @ 0x0*[1-9a-f][0-9a-f]+( strong=1)? weak=0", ) self.assertNotEqual(valobj.child[0].unsigned, 0) @@ -77,7 +79,15 @@ def do_test(self): self.assertEqual(str(valobj), '(User) *pointer = (id = 30, name = "steph")') self.expect_var_path("sp_user->id", type="int", value="30") - self.expect_var_path("sp_user->name", type="std::string", summary='"steph"') + self.expect_var_path( + "sp_user->name", + type=( + "std::basic_string, std::allocator>" + if self.getDebugInfo() == "pdb" + else "std::string" + ), + summary='"steph"', + ) valobj = self.expect_var_path( "si", type="std::shared_ptr", summary="47 strong=2 weak=0" diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unique_ptr/TestDataFormatterStdUniquePtr.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unique_ptr/TestDataFormatterStdUniquePtr.py index 0b68b1b532bb0..1516db698798d 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unique_ptr/TestDataFormatterStdUniquePtr.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unique_ptr/TestDataFormatterStdUniquePtr.py @@ -9,6 +9,8 @@ class TestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def do_test(self): """Test `frame variable` output for `std::unique_ptr` types.""" @@ -84,7 +86,15 @@ def do_test(self): self.assertNotEqual(valobj.child[0].unsigned, 0) self.expect_var_path("up_user->id", type="int", value="30") - self.expect_var_path("up_user->name", type="std::string", summary='"steph"') + self.expect_var_path( + "up_user->name", + type=( + "std::basic_string, std::allocator>" + if self.getDebugInfo() == "pdb" + else "std::string" + ), + summary='"steph"', + ) self.runCmd("settings set target.experimental.use-DIL true") self.expect_var_path("ptr_node->value", value="1") diff --git a/lldb/test/API/functionalities/wrong_commands/TestWrongCommands.py b/lldb/test/API/functionalities/wrong_commands/TestWrongCommands.py index 6d2ce2bb3e709..941a909aa93aa 100644 --- a/lldb/test/API/functionalities/wrong_commands/TestWrongCommands.py +++ b/lldb/test/API/functionalities/wrong_commands/TestWrongCommands.py @@ -17,7 +17,9 @@ def test_ambiguous_command(self): command_interpreter.HandleCommand("g", result) self.assertFalse(result.Succeeded()) - self.assertRegex(result.GetError(), "Ambiguous command 'g'. 
Possible matches:") + self.assertRegex( + result.GetError(), "error: ambiguous command 'g'. Possible matches:" + ) self.assertRegex(result.GetError(), "gui") self.assertRegex(result.GetError(), "gdb-remote") self.assertEqual(1, result.GetError().count("gdb-remote")) diff --git a/lldb/test/Shell/Commands/command-wrong-subcommand-error-msg.test b/lldb/test/Shell/Commands/command-wrong-subcommand-error-msg.test index 5365f81c19587..968a4d6fb9682 100644 --- a/lldb/test/Shell/Commands/command-wrong-subcommand-error-msg.test +++ b/lldb/test/Shell/Commands/command-wrong-subcommand-error-msg.test @@ -4,5 +4,5 @@ # RUN: not %lldb -b -o 'breakpoint foo' %t.out -o exit 2>&1 | FileCheck %s --check-prefix BP-MSG # RUN: not %lldb -b -o 'watchpoint set foo' %t.out -o exit 2>&1 | FileCheck %s --check-prefix WP-MSG # CHECK: at main.c:2:21 -# BP-MSG: "foo" is not a valid subcommand of "breakpoint". Valid subcommands are: add, clear, command, delete, disable, and others. Use "help breakpoint" to find out more. -# WP-MSG: "foo" is not a valid subcommand of "watchpoint set". Valid subcommands are: expression, variable. Use "help watchpoint set" to find out more. \ No newline at end of file +# BP-MSG: error: "foo" is not a valid subcommand of "breakpoint". Valid subcommands are: add, clear, command, delete, disable, and others. Use "help breakpoint" to find out more. +# WP-MSG: error: "foo" is not a valid subcommand of "watchpoint set". Valid subcommands are: expression, variable. Use "help watchpoint set" to find out more. diff --git a/lldb/test/Shell/Symtab/Inputs/simple.wasm.yaml b/lldb/test/Shell/Symtab/Inputs/simple.wasm.yaml index 088d6163d6b0b..8f9742c4edff8 100644 --- a/lldb/test/Shell/Symtab/Inputs/simple.wasm.yaml +++ b/lldb/test/Shell/Symtab/Inputs/simple.wasm.yaml @@ -10,6 +10,7 @@ # int j = 2; # return add(i, j); # } +# Additional imports have been manually added and indexes adjusted after compiling --- !WASM FileHeader: Version: 0x1 @@ -29,6 +30,21 @@ Sections: ParamTypes: [] ReturnTypes: - I32 + - Type: IMPORT + Imports: + - Module: env + Field: foo + Kind: FUNCTION + SigIndex: 0 + - Module: env + Field: another_import + Kind: FUNCTION + SigIndex: 0 + - Module: env + Field: bar + Kind: GLOBAL + GlobalType: I32 + GlobalMutable: true - Type: FUNCTION FunctionTypes: [ 0, 1, 2, 1 ] - Type: TABLE @@ -44,73 +60,73 @@ Sections: - Minimum: 0x2 - Type: GLOBAL Globals: - - Index: 0 + - Index: 1 Type: I32 Mutable: true InitExpr: Opcode: I32_CONST Value: 66576 - - Index: 1 + - Index: 2 Type: I32 Mutable: false InitExpr: Opcode: I32_CONST Value: 1036 - - Index: 2 + - Index: 3 Type: I32 Mutable: false InitExpr: Opcode: I32_CONST Value: 1024 - - Index: 3 + - Index: 4 Type: I32 Mutable: false InitExpr: Opcode: I32_CONST Value: 1040 - - Index: 4 + - Index: 5 Type: I32 Mutable: false InitExpr: Opcode: I32_CONST Value: 1040 - - Index: 5 + - Index: 6 Type: I32 Mutable: false InitExpr: Opcode: I32_CONST Value: 66576 - - Index: 6 + - Index: 7 Type: I32 Mutable: false InitExpr: Opcode: I32_CONST Value: 1024 - - Index: 7 + - Index: 8 Type: I32 Mutable: false InitExpr: Opcode: I32_CONST Value: 66576 - - Index: 8 + - Index: 9 Type: I32 Mutable: false InitExpr: Opcode: I32_CONST Value: 131072 - - Index: 9 + - Index: 10 Type: I32 Mutable: false InitExpr: Opcode: I32_CONST Value: 0 - - Index: 10 + - Index: 11 Type: I32 Mutable: false InitExpr: Opcode: I32_CONST Value: 1 - - Index: 11 + - Index: 12 Type: I32 Mutable: false InitExpr: @@ -123,16 +139,16 @@ Sections: Index: 0 - Name: __wasm_call_ctors Kind: FUNCTION - Index: 0 + 
Index: 2 - Name: add Kind: FUNCTION - Index: 1 + Index: 3 - Name: __original_main Kind: FUNCTION - Index: 2 + Index: 4 - Name: main Kind: FUNCTION - Index: 3 + Index: 5 - Name: str Kind: GLOBAL Index: 1 @@ -174,20 +190,20 @@ Sections: Index: 11 - Type: CODE Functions: - - Index: 0 + - Index: 2 Locals: [] Body: 0B - - Index: 1 + - Index: 3 Locals: - Type: I32 Count: 1 Body: 23808080800041106B21022002200036020C20022001360208200228020C20022802086A0F0B - - Index: 2 + - Index: 4 Locals: - Type: I32 Count: 2 Body: 23808080800041106B210020002480808080002000410036020C2000410136020820004102360204200028020820002802041081808080002101200041106A24808080800020010F0B - - Index: 3 + - Index: 5 Locals: [] Body: 1082808080000F0B - Type: DATA @@ -208,15 +224,19 @@ Sections: Name: name FunctionNames: - Index: 0 - Name: __wasm_call_ctors + Name: foo - Index: 1 - Name: add + Name: another_import - Index: 2 - Name: __original_main + Name: __wasm_call_ctors - Index: 3 + Name: add + - Index: 4 + Name: __original_main + - Index: 5 Name: main GlobalNames: - - Index: 0 + - Index: 1 Name: __stack_pointer DataSegmentNames: - Index: 0 diff --git a/lldb/test/Shell/Symtab/symtab-wasm.test b/lldb/test/Shell/Symtab/symtab-wasm.test index 524691b897322..e8628dbd04fb9 100644 --- a/lldb/test/Shell/Symtab/symtab-wasm.test +++ b/lldb/test/Shell/Symtab/symtab-wasm.test @@ -1,16 +1,18 @@ # RUN: yaml2obj %S/Inputs/simple.wasm.yaml -o %t.wasm # RUN: %lldb %t.wasm -o 'image dump symtab' | FileCheck %s --check-prefix SYMTAB +SYMTAB: Code 0x0000000000000000 0x0000000000000000 0x00000000 foo +SYMTAB: Code 0x0000000000000000 0x0000000000000000 0x00000000 another_import SYMTAB: Code 0x0000000000000002 0x0000000000000002 0x00000000 __wasm_call_ctors SYMTAB: Code 0x0000000000000005 0x0000000000000029 0x00000000 add SYMTAB: Code 0x000000000000002f 0x000000000000004c 0x00000000 __original_main SYMTAB: Code 0x000000000000007c 0x0000000000000009 0x00000000 main # RUN: %lldb %t.wasm -o 'image dump sections' | FileCheck %s --check-prefix SECTIONS -SECTIONS: 0x0000000000000001 code [0x0000000000000000-0x0000000000000085) --- 0x000001a1 0x00000085 0x00000000 symtab-wasm.test.tmp.wasm.code -SECTIONS: 0x0000000000000040 wasm-name --- 0x00000251 0x00000059 0x00000000 symtab-wasm.test.tmp.wasm.name -SECTIONS: 0x0000000000000100 data [0x0000000000000400-0x0000000000000409) --- 0x00000233 0x00000009 0x00000000 symtab-wasm.test.tmp.wasm..rodata -SECTIONS: 0x0000000000000200 data [0x000000000000040c-0x0000000000000410) --- 0x00000242 0x00000004 0x00000000 symtab-wasm.test.tmp.wasm..data +SECTIONS: 0x0000000000000001 code [0x0000000000000000-0x0000000000000085) --- 0x000001d2 0x00000085 0x00000000 symtab-wasm.test.tmp.wasm.code +SECTIONS: 0x0000000000000040 wasm-name --- 0x00000282 0x0000006e 0x00000000 symtab-wasm.test.tmp.wasm.name +SECTIONS: 0x0000000000000100 data [0x0000000000000400-0x0000000000000409) --- 0x00000264 0x00000009 0x00000000 symtab-wasm.test.tmp.wasm..rodata +SECTIONS: 0x0000000000000200 data [0x000000000000040c-0x0000000000000410) --- 0x00000273 0x00000004 0x00000000 symtab-wasm.test.tmp.wasm..data # RUN: %lldb %t.wasm -o 'x/s 0x0000000000000400' | FileCheck %s --check-prefix STR STR: "data str" diff --git a/llvm/bindings/ocaml/llvm/llvm_ocaml.c b/llvm/bindings/ocaml/llvm/llvm_ocaml.c index 53158c88be19c..a2d948033724c 100644 --- a/llvm/bindings/ocaml/llvm/llvm_ocaml.c +++ b/llvm/bindings/ocaml/llvm/llvm_ocaml.c @@ -240,7 +240,7 @@ value llvm_dispose_context(value C) { /* unit -> llcontext */ value llvm_global_context(value Unit) { - 
return to_val(getGlobalContextForCAPI()); + return to_val(LLVMGetGlobalContext()); } /* llcontext -> string -> int */ diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index f5ca0750c8882..ef2e7795aab9e 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1528,11 +1528,6 @@ The AMDGPU backend implements the following LLVM IR intrinsics. The iglp_opt strategy implementations are subject to change. - llvm.amdgcn.atomic.cond.sub.u32 Provides direct access to flat_atomic_cond_sub_u32, global_atomic_cond_sub_u32 - and ds_cond_sub_u32 based on address space on gfx12 targets. This - performs a subtraction only if the memory value is greater than or - equal to the data value. - llvm.amdgcn.s.barrier.signal.isfirst Provides access to the s_barrier_signal_first instruction; additionally ensures that the result value is valid even when the intrinsic is used from a wave that is not running in a workgroup. diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst index 7e95545425f2d..de8be0ad2dded 100644 --- a/llvm/docs/CMake.rst +++ b/llvm/docs/CMake.rst @@ -478,6 +478,12 @@ its enabled sub-projects. Nearly all of these variable names begin with **LLVM_ENABLE_BINDINGS**:BOOL If disabled, do not try to build the OCaml bindings. +**LLVM_ENABLE_CURL**: + Used to decide if LLVM tools should support downloading information + (particularly debug info from ``llvm-debuginfod``) over HTTP. Allowed + values are ``OFF`` (default), ``ON``, and ``FORCE_ON`` (error if libcurl + is not found). + **LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING**:STRING Enhances Debugify's ability to detect line number errors by storing extra information inside Instructions, removing false positives from Debugify's diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 22d5b4183fac0..1b85145efbf4a 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -114,6 +114,10 @@ Changes to the AArch64 Backend Changes to the AMDGPU Backend ----------------------------- +* Removed `llvm.amdgcn.atomic.cond.sub.u32` and + `llvm.amdgcn.atomic.csub.u32` intrinsics. Users should use the + `atomicrmw` instruction with `usub_cond` and `usub_sat` instead. + Changes to the ARM Backend -------------------------- @@ -253,6 +257,10 @@ Changes to LLDB LLVM's PDB and CodeView support. You can switch back to the DIA reader with `settings set plugin.symbol-file.pdb.reader dia`. Note that support for the DIA reader will be removed in a future version of LLDB. +* A `--verbose` option was added to the `version` command. When `--verbose` is used, + LLDB's build configuration is included in the command's output. This includes + all the supported targets, along with the presence of (or lack of) optional + features like XML parsing. Changes to BOLT --------------------------------- diff --git a/llvm/include/llvm/ADT/BitVector.h b/llvm/include/llvm/ADT/BitVector.h index cc3f3a9226395..f4645c18a93f0 100644 --- a/llvm/include/llvm/ADT/BitVector.h +++ b/llvm/include/llvm/ADT/BitVector.h @@ -550,7 +550,7 @@ class BitVector { return *this; } - /// test - Check if (This - RHS) is zero. + /// test - Check if (This - RHS) is non-zero. /// This is the same as reset(RHS) and any(). bool test(const BitVector &RHS) const { unsigned ThisWords = Bits.size(); @@ -567,6 +567,9 @@ class BitVector { return false; } + /// subsetOf - Check if This is a subset of RHS.
+ bool subsetOf(const BitVector &RHS) const { return !test(RHS); } + template static BitVector &apply(F &&f, BitVector &Out, BitVector const &Arg, ArgTys const &...Args) { diff --git a/llvm/include/llvm/ADT/SmallBitVector.h b/llvm/include/llvm/ADT/SmallBitVector.h index 5b2a5221b791f..978dc3f073031 100644 --- a/llvm/include/llvm/ADT/SmallBitVector.h +++ b/llvm/include/llvm/ADT/SmallBitVector.h @@ -552,7 +552,8 @@ class SmallBitVector { return *this; } - /// Check if (This - RHS) is zero. This is the same as reset(RHS) and any(). + /// Check if (This - RHS) is non-zero. + /// This is the same as reset(RHS) and any(). bool test(const SmallBitVector &RHS) const { if (isSmall() && RHS.isSmall()) return (getSmallBits() & ~RHS.getSmallBits()) != 0; @@ -571,6 +572,9 @@ class SmallBitVector { return false; } + /// Check if This is a subset of RHS. + bool subsetOf(const SmallBitVector &RHS) const { return !test(RHS); } + SmallBitVector &operator|=(const SmallBitVector &RHS) { resize(std::max(size(), RHS.size())); if (isSmall() && RHS.isSmall()) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index d894ab995e4cc..d1aa7cac6a9a0 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2913,8 +2913,6 @@ class AMDGPUAtomicRtn : Intrinsic < [IntrArgMemOnly, IntrWillReturn, NoCapture>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>; -def int_amdgcn_global_atomic_csub : AMDGPUAtomicRtn; - // uint4 llvm.amdgcn.image.bvh.intersect.ray , , , // , , // is i32 or i64. @@ -3162,8 +3160,6 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn; def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn; def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn; -def int_amdgcn_atomic_cond_sub_u32 : AMDGPUAtomicRtn; - class AMDGPULoadIntrinsic: Intrinsic< [llvm_any_ty], diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index 1dd23f60c7e1e..ec80ba3e1ee81 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -1893,29 +1893,29 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_avx512_vpdpwssd_128 : ClangBuiltin<"__builtin_ia32_vpdpwssd128">, - DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, - llvm_v4i32_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v8i16_ty, + llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssd_256 : ClangBuiltin<"__builtin_ia32_vpdpwssd256">, - DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, - llvm_v8i32_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v16i16_ty, + llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssd_512 : ClangBuiltin<"__builtin_ia32_vpdpwssd512">, - DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, - llvm_v16i32_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v32i16_ty, + llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssds_128 : ClangBuiltin<"__builtin_ia32_vpdpwssds128">, - DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, - llvm_v4i32_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v8i16_ty, + llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssds_256 : ClangBuiltin<"__builtin_ia32_vpdpwssds256">, - DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, - llvm_v8i32_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v16i16_ty, + llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssds_512 : ClangBuiltin<"__builtin_ia32_vpdpwssds512">, - DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, - llvm_v16i32_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v32i16_ty, + llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpbssd_128 : ClangBuiltin<"__builtin_ia32_vpdpbssd128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], @@ -1980,62 +1980,62 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_avx2_vpdpwsud_128 : ClangBuiltin<"__builtin_ia32_vpdpwsud128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwsud_256 : ClangBuiltin<"__builtin_ia32_vpdpwsud256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwsuds_128 : ClangBuiltin<"__builtin_ia32_vpdpwsuds128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwsuds_256 : ClangBuiltin<"__builtin_ia32_vpdpwsuds256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwusd_128 : ClangBuiltin<"__builtin_ia32_vpdpwusd128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwusd_256 : ClangBuiltin<"__builtin_ia32_vpdpwusd256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwusds_128 : ClangBuiltin<"__builtin_ia32_vpdpwusds128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwusds_256 : ClangBuiltin<"__builtin_ia32_vpdpwusds256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwuud_128 : ClangBuiltin<"__builtin_ia32_vpdpwuud128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwuud_256 : ClangBuiltin<"__builtin_ia32_vpdpwuud256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwuuds_128 : ClangBuiltin<"__builtin_ia32_vpdpwuuds128">, DefaultAttrsIntrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], + [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx2_vpdpwuuds_256 : ClangBuiltin<"__builtin_ia32_vpdpwuuds256">, DefaultAttrsIntrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], + [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; } @@ -5031,32 +5031,32 @@ let TargetPrefix = "x86" in { def int_x86_avx10_vpdpwsud_512 : ClangBuiltin<"__builtin_ia32_vpdpwsud512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx10_vpdpwsuds_512 : ClangBuiltin<"__builtin_ia32_vpdpwsuds512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx10_vpdpwusd_512 : ClangBuiltin<"__builtin_ia32_vpdpwusd512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx10_vpdpwusds_512 : 
ClangBuiltin<"__builtin_ia32_vpdpwusds512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx10_vpdpwuud_512 : ClangBuiltin<"__builtin_ia32_vpdpwuud512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx10_vpdpwuuds_512 : ClangBuiltin<"__builtin_ia32_vpdpwuuds512">, DefaultAttrsIntrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], + [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; // VMPSADBW diff --git a/llvm/lib/Analysis/StackLifetime.cpp b/llvm/lib/Analysis/StackLifetime.cpp index 1e20fca965ace..30e0316b882cc 100644 --- a/llvm/lib/Analysis/StackLifetime.cpp +++ b/llvm/lib/Analysis/StackLifetime.cpp @@ -173,7 +173,7 @@ void StackLifetime::calculateLocalLiveness() { BitsIn.resize(NumAllocas, true); // Update block LiveIn set, noting whether it has changed. - if (BitsIn.test(BlockInfo.LiveIn)) { + if (!BitsIn.subsetOf(BlockInfo.LiveIn)) { BlockInfo.LiveIn |= BitsIn; } @@ -198,7 +198,7 @@ void StackLifetime::calculateLocalLiveness() { } // Update block LiveOut set, noting whether it has changed. - if (BitsIn.test(BlockInfo.LiveOut)) { + if (!BitsIn.subsetOf(BlockInfo.LiveOut)) { Changed = true; BlockInfo.LiveOut |= BitsIn; } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index 3ad38c865db0a..855abcacbd4fd 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -167,7 +167,7 @@ bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI, // If this sub-register has a DWARF number and we haven't covered // its range, and its range covers the value, emit a DWARF piece for it. - if (Offset < MaxSize && CurSubReg.test(Coverage)) { + if (Offset < MaxSize && !CurSubReg.subsetOf(Coverage)) { // Emit a piece for any gap in the coverage. 
if (Offset > CurPos) DwarfRegs.push_back(Register::createSubRegister( diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index d9bc042d6807e..d19862ad7c188 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -853,8 +853,8 @@ static PartwordMaskValues createMaskInstrs(IRBuilderBase &Builder, if (AddrAlign < MinWordSize) { PMV.AlignedAddr = Builder.CreateIntrinsic( Intrinsic::ptrmask, {PtrTy, IntTy}, - {Addr, ConstantInt::get(IntTy, ~(uint64_t)(MinWordSize - 1))}, nullptr, - "AlignedAddr"); + {Addr, ConstantInt::getSigned(IntTy, ~(uint64_t)(MinWordSize - 1))}, + nullptr, "AlignedAddr"); Value *AddrInt = Builder.CreatePtrToInt(Addr, IntTy); PtrLSB = Builder.CreateAnd(AddrInt, MinWordSize - 1, "PtrLSB"); diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp index 6a6d58c705487..83581052560cb 100644 --- a/llvm/lib/CodeGen/ShrinkWrap.cpp +++ b/llvm/lib/CodeGen/ShrinkWrap.cpp @@ -618,8 +618,6 @@ bool ShrinkWrapImpl::postShrinkWrapping(bool HasCandidate, MachineFunction &MF, DenseSet DirtyBBs; for (MachineBasicBlock &MBB : MF) { - if (!MDT->isReachableFromEntry(&MBB)) - continue; if (MBB.isEHPad()) { DirtyBBs.insert(&MBB); continue; diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp index f7862641d94b9..4ec8b8b3646c1 100644 --- a/llvm/lib/CodeGen/StackColoring.cpp +++ b/llvm/lib/CodeGen/StackColoring.cpp @@ -815,13 +815,13 @@ void StackColoring::calculateLocalLiveness() { LocalLiveOut |= BlockInfo.Begin; // Update block LiveIn set, noting whether it has changed. - if (LocalLiveIn.test(BlockInfo.LiveIn)) { + if (!LocalLiveIn.subsetOf(BlockInfo.LiveIn)) { changed = true; BlockInfo.LiveIn |= LocalLiveIn; } // Update block LiveOut set, noting whether it has changed. 
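A quick aside before the remaining StackColoring hunk: every test()-to-subsetOf() conversion in these dataflow loops relies on the same identity, shown in the minimal sketch below. The demo function is mine, not part of the patch.

#include "llvm/ADT/BitVector.h"
#include <cassert>

// X.test(Y) reports whether (X - Y) is non-zero, so "X is a subset of Y" is
// exactly !X.test(Y); the new subsetOf() helper just names that negation.
static void subsetOfDemo() {
  llvm::BitVector X(8), Y(8);
  X.set(1);
  X.set(3);
  Y.set(1);
  Y.set(3);
  Y.set(5);
  assert(X.subsetOf(Y));  // every bit set in X is also set in Y
  assert(!Y.subsetOf(X)); // bit 5 of Y is not set in X
  // The liveness updates in these hunks thus read: if the new bits are not
  // already a subset of LiveIn/LiveOut, the set grew and the fixed point has
  // not been reached yet.
}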
- if (LocalLiveOut.test(BlockInfo.LiveOut)) { + if (!LocalLiveOut.subsetOf(BlockInfo.LiveOut)) { changed = true; BlockInfo.LiveOut |= LocalLiveOut; } diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 03e18986d43c7..d47aafb31ebdf 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -125,6 +125,24 @@ static bool upgradeX86MultiplyAddBytes(Function *F, Intrinsic::ID IID, return true; } +// Upgrade the declaration of multiply-and-add-words intrinsics whose input +// arguments' types have changed from vectors of i32 to vectors of i16 +static bool upgradeX86MultiplyAddWords(Function *F, Intrinsic::ID IID, + Function *&NewFn) { + // Check if the input argument types are already vectors of i16 + Type *Arg1Type = F->getFunctionType()->getParamType(1); + Type *Arg2Type = F->getFunctionType()->getParamType(2); + if (Arg1Type->isVectorTy() && + cast(Arg1Type)->getElementType()->isIntegerTy(16) && + Arg2Type->isVectorTy() && + cast(Arg2Type)->getElementType()->isIntegerTy(16)) + return false; + + rename(F); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); + return true; +} + static bool upgradeX86BF16Intrinsic(Function *F, Intrinsic::ID IID, Function *&NewFn) { if (F->getReturnType()->getScalarType()->isBFloatTy()) @@ -590,43 +608,89 @@ static bool upgradeX86IntrinsicFunction(Function *F, StringRef Name, .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) return upgradeX86MultiplyAddBytes(F, ID, NewFn); + } else if (Name.starts_with("vpdpwssd.") || + Name.starts_with("vpdpwssds.")) { + // Added in 21.1 + ID = StringSwitch(Name) + .Case("vpdpwssd.128", Intrinsic::x86_avx512_vpdpwssd_128) + .Case("vpdpwssd.256", Intrinsic::x86_avx512_vpdpwssd_256) + .Case("vpdpwssd.512", Intrinsic::x86_avx512_vpdpwssd_512) + .Case("vpdpwssds.128", Intrinsic::x86_avx512_vpdpwssds_128) + .Case("vpdpwssds.256", Intrinsic::x86_avx512_vpdpwssds_256) + .Case("vpdpwssds.512", Intrinsic::x86_avx512_vpdpwssds_512) + .Default(Intrinsic::not_intrinsic); + if (ID != Intrinsic::not_intrinsic) + return upgradeX86MultiplyAddWords(F, ID, NewFn); } return false; // No other 'x86.avx512.*'.
} - if (Name.consume_front("avx2.vpdpb")) { - // Added in 21.1 - ID = StringSwitch(Name) - .Case("ssd.128", Intrinsic::x86_avx2_vpdpbssd_128) - .Case("ssd.256", Intrinsic::x86_avx2_vpdpbssd_256) - .Case("ssds.128", Intrinsic::x86_avx2_vpdpbssds_128) - .Case("ssds.256", Intrinsic::x86_avx2_vpdpbssds_256) - .Case("sud.128", Intrinsic::x86_avx2_vpdpbsud_128) - .Case("sud.256", Intrinsic::x86_avx2_vpdpbsud_256) - .Case("suds.128", Intrinsic::x86_avx2_vpdpbsuds_128) - .Case("suds.256", Intrinsic::x86_avx2_vpdpbsuds_256) - .Case("uud.128", Intrinsic::x86_avx2_vpdpbuud_128) - .Case("uud.256", Intrinsic::x86_avx2_vpdpbuud_256) - .Case("uuds.128", Intrinsic::x86_avx2_vpdpbuuds_128) - .Case("uuds.256", Intrinsic::x86_avx2_vpdpbuuds_256) - .Default(Intrinsic::not_intrinsic); - if (ID != Intrinsic::not_intrinsic) - return upgradeX86MultiplyAddBytes(F, ID, NewFn); + if (Name.consume_front("avx2.")) { + if (Name.consume_front("vpdpb")) { + // Added in 21.1 + ID = StringSwitch(Name) + .Case("ssd.128", Intrinsic::x86_avx2_vpdpbssd_128) + .Case("ssd.256", Intrinsic::x86_avx2_vpdpbssd_256) + .Case("ssds.128", Intrinsic::x86_avx2_vpdpbssds_128) + .Case("ssds.256", Intrinsic::x86_avx2_vpdpbssds_256) + .Case("sud.128", Intrinsic::x86_avx2_vpdpbsud_128) + .Case("sud.256", Intrinsic::x86_avx2_vpdpbsud_256) + .Case("suds.128", Intrinsic::x86_avx2_vpdpbsuds_128) + .Case("suds.256", Intrinsic::x86_avx2_vpdpbsuds_256) + .Case("uud.128", Intrinsic::x86_avx2_vpdpbuud_128) + .Case("uud.256", Intrinsic::x86_avx2_vpdpbuud_256) + .Case("uuds.128", Intrinsic::x86_avx2_vpdpbuuds_128) + .Case("uuds.256", Intrinsic::x86_avx2_vpdpbuuds_256) + .Default(Intrinsic::not_intrinsic); + if (ID != Intrinsic::not_intrinsic) + return upgradeX86MultiplyAddBytes(F, ID, NewFn); + } else if (Name.consume_front("vpdpw")) { + // Added in 21.1 + ID = StringSwitch(Name) + .Case("sud.128", Intrinsic::x86_avx2_vpdpwsud_128) + .Case("sud.256", Intrinsic::x86_avx2_vpdpwsud_256) + .Case("suds.128", Intrinsic::x86_avx2_vpdpwsuds_128) + .Case("suds.256", Intrinsic::x86_avx2_vpdpwsuds_256) + .Case("usd.128", Intrinsic::x86_avx2_vpdpwusd_128) + .Case("usd.256", Intrinsic::x86_avx2_vpdpwusd_256) + .Case("usds.128", Intrinsic::x86_avx2_vpdpwusds_128) + .Case("usds.256", Intrinsic::x86_avx2_vpdpwusds_256) + .Case("uud.128", Intrinsic::x86_avx2_vpdpwuud_128) + .Case("uud.256", Intrinsic::x86_avx2_vpdpwuud_256) + .Case("uuds.128", Intrinsic::x86_avx2_vpdpwuuds_128) + .Case("uuds.256", Intrinsic::x86_avx2_vpdpwuuds_256) + .Default(Intrinsic::not_intrinsic); + if (ID != Intrinsic::not_intrinsic) + return upgradeX86MultiplyAddWords(F, ID, NewFn); + } return false; // No other 'x86.avx2.*' } - if (Name.consume_front("avx10.vpdpb")) { - // Added in 21.1 - ID = StringSwitch(Name) - .Case("ssd.512", Intrinsic::x86_avx10_vpdpbssd_512) - .Case("ssds.512", Intrinsic::x86_avx10_vpdpbssds_512) - .Case("sud.512", Intrinsic::x86_avx10_vpdpbsud_512) - .Case("suds.512", Intrinsic::x86_avx10_vpdpbsuds_512) - .Case("uud.512", Intrinsic::x86_avx10_vpdpbuud_512) - .Case("uuds.512", Intrinsic::x86_avx10_vpdpbuuds_512) - .Default(Intrinsic::not_intrinsic); - if (ID != Intrinsic::not_intrinsic) - return upgradeX86MultiplyAddBytes(F, ID, NewFn); + if (Name.consume_front("avx10.")) { + if (Name.consume_front("vpdpb")) { + // Added in 21.1 + ID = StringSwitch(Name) + .Case("ssd.512", Intrinsic::x86_avx10_vpdpbssd_512) + .Case("ssds.512", Intrinsic::x86_avx10_vpdpbssds_512) + .Case("sud.512", Intrinsic::x86_avx10_vpdpbsud_512) + .Case("suds.512", Intrinsic::x86_avx10_vpdpbsuds_512) + 
.Case("uud.512", Intrinsic::x86_avx10_vpdpbuud_512) + .Case("uuds.512", Intrinsic::x86_avx10_vpdpbuuds_512) + .Default(Intrinsic::not_intrinsic); + if (ID != Intrinsic::not_intrinsic) + return upgradeX86MultiplyAddBytes(F, ID, NewFn); + } else if (Name.consume_front("vpdpw")) { + ID = StringSwitch(Name) + .Case("sud.512", Intrinsic::x86_avx10_vpdpwsud_512) + .Case("suds.512", Intrinsic::x86_avx10_vpdpwsuds_512) + .Case("usd.512", Intrinsic::x86_avx10_vpdpwusd_512) + .Case("usds.512", Intrinsic::x86_avx10_vpdpwusds_512) + .Case("uud.512", Intrinsic::x86_avx10_vpdpwuud_512) + .Case("uuds.512", Intrinsic::x86_avx10_vpdpwuuds_512) + .Default(Intrinsic::not_intrinsic); + if (ID != Intrinsic::not_intrinsic) + return upgradeX86MultiplyAddWords(F, ID, NewFn); + } return false; // No other 'x86.avx10.*' } @@ -1210,9 +1274,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, } if (Name.consume_front("atomic.")) { - if (Name.starts_with("inc") || Name.starts_with("dec")) { - // These were replaced with atomicrmw uinc_wrap and udec_wrap, so - // there's no new declaration. + if (Name.starts_with("inc") || Name.starts_with("dec") || + Name.starts_with("cond.sub") || Name.starts_with("csub")) { + // These were replaced with atomicrmw uinc_wrap, udec_wrap, usub_cond + // and usub_sat so there's no new declaration. NewFn = nullptr; return true; } @@ -4315,6 +4380,32 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; + + // Input arguments types were incorrectly set to vectors of i32 before but + // they should be vectors of i16. Insert bit cast when encountering the old + // types + if (Args[1]->getType()->isVectorTy() && + cast(Args[1]->getType()) + ->getElementType() + ->isIntegerTy(32) && + Args[2]->getType()->isVectorTy() && + cast(Args[2]->getType()) + ->getElementType() + ->isIntegerTy(32)) { + Type *NewArgType = nullptr; + if (VecWidth == 128) + NewArgType = VectorType::get(Builder.getInt16Ty(), 8, false); + else if (VecWidth == 256) + NewArgType = VectorType::get(Builder.getInt16Ty(), 16, false); + else if (VecWidth == 512) + NewArgType = VectorType::get(Builder.getInt16Ty(), 32, false); + else + llvm_unreachable("Unexpected vector bit width"); + + Args[1] = Builder.CreateBitCast(Args[1], NewArgType); + Args[2] = Builder.CreateBitCast(Args[2], NewArgType); + } + Rep = Builder.CreateIntrinsic(IID, Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); @@ -4516,7 +4607,9 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI, .StartsWith("global.atomic.fmin", AtomicRMWInst::FMin) .StartsWith("flat.atomic.fmin", AtomicRMWInst::FMin) .StartsWith("global.atomic.fmax", AtomicRMWInst::FMax) - .StartsWith("flat.atomic.fmax", AtomicRMWInst::FMax); + .StartsWith("flat.atomic.fmax", AtomicRMWInst::FMax) + .StartsWith("atomic.cond.sub", AtomicRMWInst::USubCond) + .StartsWith("atomic.csub", AtomicRMWInst::USubSat); unsigned NumOperands = CI->getNumOperands(); if (NumOperands < 3) // Malformed bitcode. 
@@ -5390,6 +5483,39 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { NewCall = Builder.CreateCall(NewFn, Args); break; } + case Intrinsic::x86_avx512_vpdpwssd_128: + case Intrinsic::x86_avx512_vpdpwssd_256: + case Intrinsic::x86_avx512_vpdpwssd_512: + case Intrinsic::x86_avx512_vpdpwssds_128: + case Intrinsic::x86_avx512_vpdpwssds_256: + case Intrinsic::x86_avx512_vpdpwssds_512: + case Intrinsic::x86_avx2_vpdpwsud_128: + case Intrinsic::x86_avx2_vpdpwsud_256: + case Intrinsic::x86_avx10_vpdpwsud_512: + case Intrinsic::x86_avx2_vpdpwsuds_128: + case Intrinsic::x86_avx2_vpdpwsuds_256: + case Intrinsic::x86_avx10_vpdpwsuds_512: + case Intrinsic::x86_avx2_vpdpwusd_128: + case Intrinsic::x86_avx2_vpdpwusd_256: + case Intrinsic::x86_avx10_vpdpwusd_512: + case Intrinsic::x86_avx2_vpdpwusds_128: + case Intrinsic::x86_avx2_vpdpwusds_256: + case Intrinsic::x86_avx10_vpdpwusds_512: + case Intrinsic::x86_avx2_vpdpwuud_128: + case Intrinsic::x86_avx2_vpdpwuud_256: + case Intrinsic::x86_avx10_vpdpwuud_512: + case Intrinsic::x86_avx2_vpdpwuuds_128: + case Intrinsic::x86_avx2_vpdpwuuds_256: + case Intrinsic::x86_avx10_vpdpwuuds_512: + unsigned NumElts = CI->getType()->getPrimitiveSizeInBits() / 16; + Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2)}; + Type *NewArgType = VectorType::get(Builder.getInt16Ty(), NumElts, false); + Args[1] = Builder.CreateBitCast(Args[1], NewArgType); + Args[2] = Builder.CreateBitCast(Args[2], NewArgType); + + NewCall = Builder.CreateCall(NewFn, Args); + break; } assert(NewCall && "Should have either set this variable or returned through " "the default case"); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e140aabb9bbeb..30eb19036ddda 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14965,9 +14965,10 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2); } - if (isTRNMask(ShuffleMask, NumElts, WhichResult)) { + if (isTRNMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; - return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2); + return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2, + OperandOrder == 0 ? V2 : V1); } if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { @@ -16679,7 +16680,7 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { isREVMask(M, EltSize, NumElts, 16) || isEXTMask(M, VT, DummyBool, DummyUnsigned) || isSingletonEXTMask(M, VT, DummyUnsigned) || - isTRNMask(M, NumElts, DummyUnsigned) || + isTRNMask(M, NumElts, DummyUnsigned, DummyUnsigned) || isUZPMask(M, NumElts, DummyUnsigned) || isZIPMask(M, NumElts, DummyUnsigned, DummyUnsigned) || isTRN_v_undef_Mask(M, VT, DummyUnsigned) || @@ -31798,10 +31799,13 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE( OperandOrder == 0 ? Op1 : Op2, OperandOrder == 0 ? Op2 : Op1)); - if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) { + if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult, + OperandOrder)) { unsigned Opc = (WhichResult == 0) ? 
AArch64ISD::TRN1 : AArch64ISD::TRN2; - return convertFromScalableVector( - DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2)); + SDValue TRN = + DAG.getNode(Opc, DL, ContainerVT, OperandOrder == 0 ? Op1 : Op2, + OperandOrder == 0 ? Op2 : Op1); + return convertFromScalableVector(DAG, VT, TRN); } if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index f82180fc57b99..ac4c1acedf05c 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -708,53 +708,6 @@ unsigned AArch64InstrInfo::insertBranch( return 2; } -bool llvm::optimizeTerminators(MachineBasicBlock *MBB, - const TargetInstrInfo &TII) { - for (MachineInstr &MI : MBB->terminators()) { - unsigned Opc = MI.getOpcode(); - switch (Opc) { - case AArch64::CBZW: - case AArch64::CBZX: - case AArch64::TBZW: - case AArch64::TBZX: - // CBZ/TBZ with WZR/XZR -> unconditional B - if (MI.getOperand(0).getReg() == AArch64::WZR || - MI.getOperand(0).getReg() == AArch64::XZR) { - DEBUG_WITH_TYPE("optimizeTerminators", - dbgs() << "Removing always taken branch: " << MI); - MachineBasicBlock *Target = TII.getBranchDestBlock(MI); - SmallVector Succs(MBB->successors()); - for (auto *S : Succs) - if (S != Target) - MBB->removeSuccessor(S); - DebugLoc DL = MI.getDebugLoc(); - while (MBB->rbegin() != &MI) - MBB->rbegin()->eraseFromParent(); - MI.eraseFromParent(); - BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target); - return true; - } - break; - case AArch64::CBNZW: - case AArch64::CBNZX: - case AArch64::TBNZW: - case AArch64::TBNZX: - // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor - if (MI.getOperand(0).getReg() == AArch64::WZR || - MI.getOperand(0).getReg() == AArch64::XZR) { - DEBUG_WITH_TYPE("optimizeTerminators", - dbgs() << "Removing never taken branch: " << MI); - MachineBasicBlock *Target = TII.getBranchDestBlock(MI); - MI.getParent()->removeSuccessor(Target); - MI.eraseFromParent(); - return true; - } - break; - } - } - return false; -} - // Find the original register that VReg is copied from. 
static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) { while (Register::isVirtualRegister(VReg)) { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index d237721450644..2de2e0d73901f 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -705,8 +705,6 @@ int isAArch64FrameOffsetLegal(const MachineInstr &MI, StackOffset &Offset, unsigned *OutUnscaledOp = nullptr, int64_t *EmittableOffset = nullptr); -bool optimizeTerminators(MachineBasicBlock *MBB, const TargetInstrInfo &TII); - static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; } static inline bool isCondBranchOpcode(int Opc) { diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h index ef8786d0ad0e1..c7d6b31291197 100644 --- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h +++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h @@ -6699,33 +6699,53 @@ inline bool isUZPMask(ArrayRef M, unsigned NumElts, } /// Return true for trn1 or trn2 masks of the form: -/// <0, 8, 2, 10, 4, 12, 6, 14> or -/// <1, 9, 3, 11, 5, 13, 7, 15> +/// <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0, OperandOrderOut = 0) or +/// <1, 9, 3, 11, 5, 13, 7, 15> (WhichResultOut = 1, OperandOrderOut = 0) or +/// <8, 0, 10, 2, 12, 4, 14, 6> (WhichResultOut = 0, OperandOrderOut = 1) or +/// <9, 1, 11, 3, 13, 5, 15, 7> (WhichResultOut = 1, OperandOrderOut = 1) inline bool isTRNMask(ArrayRef M, unsigned NumElts, - unsigned &WhichResultOut) { + unsigned &WhichResultOut, unsigned &OperandOrderOut) { if (NumElts % 2 != 0) return false; - // Check the first non-undef element for trn1 vs trn2. - unsigned WhichResult = 2; + + // "Result" corresponds to "WhichResultOut", selecting between trn1 and trn2. + // "Order" corresponds to "OperandOrderOut", selecting the order of operands + // for the instruction (flipped or not). + bool Result0Order0 = true; // WhichResultOut = 0, OperandOrderOut = 0 + bool Result1Order0 = true; // WhichResultOut = 1, OperandOrderOut = 0 + bool Result0Order1 = true; // WhichResultOut = 0, OperandOrderOut = 1 + bool Result1Order1 = true; // WhichResultOut = 1, OperandOrderOut = 1 + // Check all elements match. for (unsigned i = 0; i != NumElts; i += 2) { if (M[i] >= 0) { - WhichResult = ((unsigned)M[i] == i ? 0 : 1); - break; + unsigned EvenElt = (unsigned)M[i]; + if (EvenElt != i) + Result0Order0 = false; + if (EvenElt != i + 1) + Result1Order0 = false; + if (EvenElt != NumElts + i) + Result0Order1 = false; + if (EvenElt != NumElts + i + 1) + Result1Order1 = false; } if (M[i + 1] >= 0) { - WhichResult = ((unsigned)M[i + 1] == i + NumElts ? 0 : 1); - break; + unsigned OddElt = (unsigned)M[i + 1]; + if (OddElt != NumElts + i) + Result0Order0 = false; + if (OddElt != NumElts + i + 1) + Result1Order0 = false; + if (OddElt != i) + Result0Order1 = false; + if (OddElt != i + 1) + Result1Order1 = false; } } - if (WhichResult == 2) + + if (Result0Order0 + Result1Order0 + Result0Order1 + Result1Order1 != 1) return false; - for (unsigned i = 0; i < NumElts; i += 2) { - if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || - (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) - return false; - } - WhichResultOut = WhichResult; + WhichResultOut = (Result0Order0 || Result0Order1) ? 0 : 1; + OperandOrderOut = (Result0Order0 || Result1Order0) ?
0 : 1; return true; } diff --git a/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp b/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp index 1a5a9f0a6018b..1b990796ec9da 100644 --- a/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp +++ b/llvm/lib/Target/AArch64/AArch64RedundantCondBranchPass.cpp @@ -14,7 +14,6 @@ //===----------------------------------------------------------------------===// #include "AArch64.h" -#include "AArch64InstrInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/TargetInstrInfo.h" @@ -46,6 +45,51 @@ INITIALIZE_PASS(AArch64RedundantCondBranch, "aarch64-redundantcondbranch", "AArch64 Redundant Conditional Branch Elimination pass", false, false) +static bool optimizeTerminators(MachineBasicBlock *MBB, + const TargetInstrInfo &TII) { + for (MachineInstr &MI : make_early_inc_range(MBB->terminators())) { + unsigned Opc = MI.getOpcode(); + switch (Opc) { + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::TBZW: + case AArch64::TBZX: + // CBZ/TBZ with WZR/XZR -> unconditional B + if (MI.getOperand(0).getReg() == AArch64::WZR || + MI.getOperand(0).getReg() == AArch64::XZR) { + LLVM_DEBUG(dbgs() << "Removing redundant branch: " << MI); + MachineBasicBlock *Target = TII.getBranchDestBlock(MI); + SmallVector Succs(MBB->successors()); + for (auto *S : Succs) + if (S != Target) + MBB->removeSuccessor(S); + DebugLoc DL = MI.getDebugLoc(); + while (MBB->rbegin() != &MI) + MBB->rbegin()->eraseFromParent(); + MI.eraseFromParent(); + BuildMI(MBB, DL, TII.get(AArch64::B)).addMBB(Target); + return true; + } + break; + case AArch64::CBNZW: + case AArch64::CBNZX: + case AArch64::TBNZW: + case AArch64::TBNZX: + // CBNZ/TBNZ with WZR/XZR -> never taken, remove branch and successor + if (MI.getOperand(0).getReg() == AArch64::WZR || + MI.getOperand(0).getReg() == AArch64::XZR) { + LLVM_DEBUG(dbgs() << "Removing redundant branch: " << MI); + MachineBasicBlock *Target = TII.getBranchDestBlock(MI); + MI.getParent()->removeSuccessor(Target); + MI.eraseFromParent(); + return true; + } + break; + } + } + return false; +} + bool AArch64RedundantCondBranch::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; diff --git a/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp index 9dc721eb7315d..84015e5061768 100644 --- a/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp +++ b/llvm/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp @@ -50,7 +50,6 @@ // to use WZR/XZR directly in some cases. //===----------------------------------------------------------------------===// #include "AArch64.h" -#include "AArch64InstrInfo.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" @@ -476,7 +475,6 @@ bool AArch64RedundantCopyElimination::runOnMachineFunction( return false; TRI = MF.getSubtarget().getRegisterInfo(); MRI = &MF.getRegInfo(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); // Resize the clobbered and used register unit trackers. We do this once per // function. 
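The two AArch64 hunks above move the zero-register terminator folding out of AArch64RedundantCopyElimination and into the dedicated AArch64RedundantCondBranch pass. As a reading aid, here is a minimal standalone sketch of the folding rule itself, using invented enums in place of the real AArch64 opcode and register definitions; it is not the pass implementation.

#include <cassert>

// Hypothetical stand-ins for the AArch64 opcodes and registers involved.
enum Opcode { CBZ, CBNZ, TBZ, TBNZ };
enum Reg { WZR, XZR, W0 };
enum class Fold { AlwaysTaken, NeverTaken, None };

// CBZ/TBZ branch when the source is zero, so on WZR/XZR they are always
// taken; CBNZ/TBNZ branch when the source is non-zero, so on WZR/XZR they
// are never taken.
static Fold classifyZeroRegBranch(Opcode Opc, Reg R) {
  if (R != WZR && R != XZR)
    return Fold::None;
  return (Opc == CBZ || Opc == TBZ) ? Fold::AlwaysTaken : Fold::NeverTaken;
}

int main() {
  assert(classifyZeroRegBranch(CBZ, WZR) == Fold::AlwaysTaken); // rewritten to B
  assert(classifyZeroRegBranch(TBNZ, XZR) == Fold::NeverTaken); // erased, successor dropped
  assert(classifyZeroRegBranch(CBNZ, W0) == Fold::None);        // left untouched
  return 0;
}

In the always-taken case the real pass additionally prunes the other successors and emits an unconditional B, as the hunk above shows.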
@@ -486,10 +484,8 @@ bool AArch64RedundantCopyElimination::runOnMachineFunction( OptBBUsedRegs.init(*TRI); bool Changed = false; - for (MachineBasicBlock &MBB : MF) { - Changed |= optimizeTerminators(&MBB, TII); + for (MachineBasicBlock &MBB : MF) Changed |= optimizeBlock(&MBB); - } return Changed; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index 4fba593b3d0fb..221a7bcd881bb 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -215,14 +215,15 @@ bool matchTRN(MachineInstr &MI, MachineRegisterInfo &MRI, ShuffleVectorPseudo &MatchInfo) { assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); unsigned WhichResult; + unsigned OperandOrder; ArrayRef ShuffleMask = MI.getOperand(3).getShuffleMask(); Register Dst = MI.getOperand(0).getReg(); unsigned NumElts = MRI.getType(Dst).getNumElements(); - if (!isTRNMask(ShuffleMask, NumElts, WhichResult)) + if (!isTRNMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) return false; unsigned Opc = (WhichResult == 0) ? AArch64::G_TRN1 : AArch64::G_TRN2; - Register V1 = MI.getOperand(1).getReg(); - Register V2 = MI.getOperand(2).getReg(); + Register V1 = MI.getOperand(OperandOrder == 0 ? 1 : 2).getReg(); + Register V2 = MI.getOperand(OperandOrder == 0 ? 2 : 1).getReg(); MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2}); return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index dd86cb5d3d5a6..2a99dacba52a4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -636,15 +636,11 @@ multiclass local_addr_space_atomic_op { } } -defm int_amdgcn_global_atomic_csub : noret_op; defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op; defm int_amdgcn_flat_atomic_fmin_num : noret_op; defm int_amdgcn_flat_atomic_fmax_num : noret_op; defm int_amdgcn_global_atomic_fmin_num : noret_op; defm int_amdgcn_global_atomic_fmax_num : noret_op; -defm int_amdgcn_atomic_cond_sub_u32 : local_addr_space_atomic_op; -defm int_amdgcn_atomic_cond_sub_u32 : flat_addr_space_atomic_op; -defm int_amdgcn_atomic_cond_sub_u32 : global_addr_space_atomic_op; multiclass noret_binary_atomic_op { let HasNoUse = true in diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index a6d39d136dfb2..10b781b30d1bc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -5309,12 +5309,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size); break; } - case Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_flat_atomic_fmin_num: case Intrinsic::amdgcn_flat_atomic_fmax_num: - case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_global_load_tr_b64: case Intrinsic::amdgcn_global_load_tr_b128: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index fe452f008c95c..58a9b5511f2d0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -237,8 +237,6 @@ def : SourceOfDivergence; def : 
SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index b97b7385dc1ff..bb0e9380e956d 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -783,37 +783,20 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN { + ValueType vdataType> { let FPAtomic = vdataType.isFP in { - def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo , - MUBUFAddr64Table <0, NAME # "_RTN">; - - def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo , - MUBUFAddr64Table <1, NAME # "_RTN">; - + def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo , + MUBUFAddr64Table <0, NAME # "_RTN">; + def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo , + MUBUFAddr64Table <1, NAME # "_RTN">; def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo ; def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo ; def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo ; - def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo , - MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">; - - def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo , - MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">; - + def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo , + MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">; + def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo , + MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">; def _VBUFFER_OFFEN_RTN : MUBUF_AtomicRet_Pseudo ; def _VBUFFER_IDXEN_RTN : MUBUF_AtomicRet_Pseudo ; def _VBUFFER_BOTHEN_RTN : MUBUF_AtomicRet_Pseudo ; @@ -822,10 +805,9 @@ multiclass MUBUF_Pseudo_Atomics_RTN : + ValueType vdataType> : MUBUF_Pseudo_Atomics_NO_RTN, - MUBUF_Pseudo_Atomics_RTN; + MUBUF_Pseudo_Atomics_RTN; //===----------------------------------------------------------------------===// @@ -1096,7 +1078,7 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics < let OtherPredicates = [HasGFX10_BEncoding] in { defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics < - "buffer_atomic_csub", VGPROp_32, i32, int_amdgcn_global_atomic_csub + "buffer_atomic_csub", VGPROp_32, i32 >; } @@ -1117,22 +1099,22 @@ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc", let SubtargetPredicate = isGFX6GFX7GFX10Plus in { defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics < - "buffer_atomic_fcmpswap", AVLdSt_64, v2f32, null_frag + "buffer_atomic_fcmpswap", AVLdSt_64, v2f32 >; } let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in { defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics < - "buffer_atomic_fmin", AVLdSt_32, f32, null_frag + "buffer_atomic_fmin", AVLdSt_32, f32 >; defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics < - "buffer_atomic_fmax", AVLdSt_32, f32, null_frag + "buffer_atomic_fmax", AVLdSt_32, f32 >; } let SubtargetPredicate = isGFX6GFX7GFX10 in { defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics < - "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64, null_frag + "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64 >; } @@ -1201,12 +1183,12 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < let SubtargetPredicate = HasAtomicFaddRtnInsts in defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN< - "buffer_atomic_add_f32", AVLdSt_32, f32, null_frag + "buffer_atomic_add_f32", AVLdSt_32, f32 >; let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN < - "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16, null_frag + "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16 >; let SubtargetPredicate = isGFX12Plus in { diff --git 
a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 4b3bd0c09e076..3a53cef96473c 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -886,15 +886,6 @@ defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc_gfx9<"ds_sub_clamp_rtn_u32", VGPROp_3 def DS_BPERMUTE_FI_B32 : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32", int_amdgcn_ds_bpermute_fi_b32>; -multiclass DSAtomicRetNoRetPatIntrinsic_mc { - def : DSAtomicRetPat(frag#"_local_addrspace")>; - def : DSAtomicRetPat(frag#"_noret_local_addrspace"), /* complexity */ 1>; -} - -defm : DSAtomicRetNoRetPatIntrinsic_mc; } // let SubtargetPredicate = isGFX12Plus let SubtargetPredicate = isGFX1250Plus in { diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 9ac0cb6829e55..60a82c4189b7c 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1562,10 +1562,6 @@ multiclass FlatAtomicNoRtnPatBase : - FlatAtomicNoRtnPatBase; - multiclass FlatAtomicNoRtnPat : FlatAtomicNoRtnPatBase; @@ -1590,10 +1586,6 @@ multiclass FlatAtomicRtnPatBase : - FlatAtomicRtnPatBase; - multiclass FlatAtomicRtnPat : FlatAtomicRtnPatBase; @@ -2255,9 +2247,6 @@ let SubtargetPredicate = HasAtomicCondSubClampFlatInsts in { defm : FlatStorePats ; defm : FlatStorePats ; -defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; -defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>; - let OtherPredicates = [HasD16LoadStore] in { defm : FlatStorePats ; defm : FlatStorePats ; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index c4a0c751b8f67..2b98e25d67702 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1522,15 +1522,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; } - case Intrinsic::amdgcn_global_atomic_csub: { - Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType()); - Info.ptrVal = CI.getOperand(0); - Info.align.reset(); - Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | - MachineMemOperand::MOVolatile; - return true; - } case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: case Intrinsic::amdgcn_image_bvh_intersect_ray: case Intrinsic::amdgcn_image_bvh8_intersect_ray: { @@ -1551,8 +1542,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: case Intrinsic::amdgcn_flat_atomic_fmin_num: - case Intrinsic::amdgcn_flat_atomic_fmax_num: - case Intrinsic::amdgcn_atomic_cond_sub_u32: { + case Intrinsic::amdgcn_flat_atomic_fmax_num: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); @@ -1747,7 +1737,6 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, Type *&AccessTy) const { Value *Ptr = nullptr; switch (II->getIntrinsicID()) { - case Intrinsic::amdgcn_atomic_cond_sub_u32: case Intrinsic::amdgcn_cluster_load_b128: case Intrinsic::amdgcn_cluster_load_b64: case Intrinsic::amdgcn_cluster_load_b32: @@ -1770,7 +1759,6 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II, case Intrinsic::amdgcn_flat_load_monitor_b128: case 
Intrinsic::amdgcn_flat_load_monitor_b32: case Intrinsic::amdgcn_flat_load_monitor_b64: - case Intrinsic::amdgcn_global_atomic_csub: case Intrinsic::amdgcn_global_atomic_fmax_num: case Intrinsic::amdgcn_global_atomic_fmin_num: case Intrinsic::amdgcn_global_atomic_ordered_add_b64: @@ -18412,7 +18400,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N, case AMDGPUISD::BUFFER_ATOMIC_INC: case AMDGPUISD::BUFFER_ATOMIC_DEC: case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP: - case AMDGPUISD::BUFFER_ATOMIC_CSUB: case AMDGPUISD::BUFFER_ATOMIC_FADD: case AMDGPUISD::BUFFER_ATOMIC_FMIN: case AMDGPUISD::BUFFER_ATOMIC_FMAX: diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 690eee91a126f..71d4fe16047dc 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3185,10 +3185,36 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, MI, false, 0, !UseSGPR); - // TODO: for flat scratch another attempt can be made with a VGPR index - // if no SGPRs can be scavenged. - if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) + if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) { + int SVOpcode = AMDGPU::getFlatScratchInstSVfromSS(MI->getOpcode()); + if (ST.hasFlatScratchSVSMode() && SVOpcode != -1) { + Register TmpVGPR = RS->scavengeRegisterBackwards( + AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true); + + // Materialize the frame register. + auto MIB = + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR); + if (FrameReg) + MIB.addReg(FrameReg); + else + MIB.addImm(Offset); + + // Add the offset to the frame register. + if (FrameReg && Offset) + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), FrameReg) + .addReg(FrameReg, RegState::Kill) + .addImm(Offset); + + BuildMI(*MBB, MI, DL, TII->get(SVOpcode)) + .add(MI->getOperand(0)) // $vdata + .addReg(TmpVGPR) // $vaddr + .addImm(0) // Offset + .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::cpol)); + MI->eraseFromParent(); + return true; + } report_fatal_error("Cannot scavenge register in FI elimination!"); + } if (!TmpSReg) { // Use frame register and restore it after. 
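The SIRegisterInfo change above turns the old "Cannot scavenge register" fatal error into a fallback: when no SGPR can be scavenged but the subtarget supports SVS-mode flat scratch instructions, the frame address is materialized in a scavenged VGPR and the SS-form opcode is swapped for its SV form. A decision-level sketch, with invented names standing in for the real scavenger and subtarget queries:

#include <cstdio>

// Invented stand-ins for the real scavenger and subtarget queries.
struct FIEliminationState {
  bool HasFreeSGPR;           // SGPR scavenging succeeded
  bool HasFlatScratchSVSMode; // ST.hasFlatScratchSVSMode()
  bool HasSVVariant;          // getFlatScratchInstSVfromSS(Opc) != -1
};

enum class Strategy { SGPRBase, VGPRBase, Fatal };

// Mirrors the control flow above: prefer the SGPR path; otherwise fall back
// to materializing the address in a VGPR and using the SV-form opcode; only
// report a fatal error when neither is possible.
Strategy pickStrategy(const FIEliminationState &S) {
  if (S.HasFreeSGPR)
    return Strategy::SGPRBase;
  if (S.HasFlatScratchSVSMode && S.HasSVVariant)
    return Strategy::VGPRBase;
  return Strategy::Fatal;
}

int main() {
  // No SGPR available, but SVS-mode scratch instructions exist: the pass now
  // emits v_mov_b32 tmp, fp (plus an add for any immediate offset) instead of
  // aborting.
  FIEliminationState S{false, true, true};
  std::printf("%s\n", pickStrategy(S) == Strategy::VGPRBase ? "VGPRBase" : "other");
  return 0;
}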
diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp index 557a0a3f27819..848337457c997 100644 --- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -137,8 +137,7 @@ namespace { return !Bits.any(); } bool includes(const RegisterSet &Rs) const { - // A.test(B) <=> A-B != {} - return !Rs.Bits.test(Bits); + return Rs.Bits.subsetOf(Bits); } bool intersects(const RegisterSet &Rs) const { return Bits.anyCommon(Rs.Bits); diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index ff876f6595350..18fcd6a4873fb 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -153,8 +153,7 @@ namespace { return !BitVector::any(); } bool includes(const RegisterSet &Rs) const { - // A.BitVector::test(B) <=> A-B != {} - return !Rs.BitVector::test(*this); + return Rs.BitVector::subsetOf(*this); } bool intersects(const RegisterSet &Rs) const { return BitVector::anyCommon(Rs); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index bae9d705f5a7a..025e5b087abed 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -2527,7 +2527,7 @@ HexagonTargetLowering::getBuildVectorConstInts(ArrayRef Values, // Make sure to always cast to IntTy. if (auto *CN = dyn_cast(V.getNode())) { const ConstantInt *CI = CN->getConstantIntValue(); - Consts[i] = ConstantInt::get(IntTy, CI->getValue().getSExtValue()); + Consts[i] = ConstantInt::getSigned(IntTy, CI->getValue().getSExtValue()); } else if (auto *CN = dyn_cast(V.getNode())) { const ConstantFP *CF = CN->getConstantFPValue(); APInt A = CF->getValueAPF().bitcastToAPInt(); diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt index f9c24750c4836..6fe58c25c757d 100644 --- a/llvm/lib/Target/NVPTX/CMakeLists.txt +++ b/llvm/lib/Target/NVPTX/CMakeLists.txt @@ -18,6 +18,7 @@ set(NVPTXCodeGen_sources NVPTXAssignValidGlobalNames.cpp NVPTXAtomicLower.cpp NVPTXCtorDtorLowering.cpp + NVPTXIRPeephole.cpp NVPTXForwardParams.cpp NVPTXFrameLowering.cpp NVPTXGenericToNVVM.cpp diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 95fd05f2a926f..210624fbb235c 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -52,6 +52,7 @@ FunctionPass *createNVPTXLowerAllocaPass(); FunctionPass *createNVPTXLowerUnreachablePass(bool TrapUnreachable, bool NoTrapAfterNoreturn); FunctionPass *createNVPTXTagInvariantLoadsPass(); +FunctionPass *createNVPTXIRPeepholePass(); MachineFunctionPass *createNVPTXPeephole(); MachineFunctionPass *createNVPTXProxyRegErasurePass(); MachineFunctionPass *createNVPTXForwardParamsPass(); @@ -75,12 +76,17 @@ void initializeNVPTXAAWrapperPassPass(PassRegistry &); void initializeNVPTXExternalAAWrapperPass(PassRegistry &); void initializeNVPTXPeepholePass(PassRegistry &); void initializeNVPTXTagInvariantLoadLegacyPassPass(PassRegistry &); +void initializeNVPTXIRPeepholePass(PassRegistry &); void initializeNVPTXPrologEpilogPassPass(PassRegistry &); struct NVVMIntrRangePass : PassInfoMixin { PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; +struct NVPTXIRPeepholePass : PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + struct NVVMReflectPass : PassInfoMixin { NVVMReflectPass() : SmVersion(0) {} 
NVVMReflectPass(unsigned SmVersion) : SmVersion(SmVersion) {} diff --git a/llvm/lib/Target/NVPTX/NVPTXIRPeephole.cpp b/llvm/lib/Target/NVPTX/NVPTXIRPeephole.cpp new file mode 100644 index 0000000000000..bd16c7213b1e7 --- /dev/null +++ b/llvm/lib/Target/NVPTX/NVPTXIRPeephole.cpp @@ -0,0 +1,167 @@ +//===------ NVPTXIRPeephole.cpp - NVPTX IR Peephole --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements IR-level peephole optimizations. These transformations +// run late in the NVPTX IR pass pipeline just before the instruction selection. +// +// Currently, it implements the following transformation(s): +// 1. FMA folding (float/double types): +// Transforms FMUL+FADD/FSUB sequences into FMA intrinsics when the +// 'contract' fast-math flag is present. Supported patterns: +// - fadd(fmul(a, b), c) => fma(a, b, c) +// - fadd(c, fmul(a, b)) => fma(a, b, c) +// - fadd(fmul(a, b), fmul(c, d)) => fma(a, b, fmul(c, d)) +// - fsub(fmul(a, b), c) => fma(a, b, fneg(c)) +// - fsub(a, fmul(b, c)) => fma(fneg(b), c, a) +// - fsub(fmul(a, b), fmul(c, d)) => fma(a, b, fneg(fmul(c, d))) +// +//===----------------------------------------------------------------------===// + +#include "NVPTXUtilities.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" + +#define DEBUG_TYPE "nvptx-ir-peephole" + +using namespace llvm; + +static bool tryFoldBinaryFMul(BinaryOperator *BI) { + Value *Op0 = BI->getOperand(0); + Value *Op1 = BI->getOperand(1); + + auto *FMul0 = dyn_cast(Op0); + auto *FMul1 = dyn_cast(Op1); + + BinaryOperator *FMul = nullptr; + Value *OtherOperand = nullptr; + bool IsFirstOperand = false; + + // Either Op0 or Op1 should be a valid FMul + if (FMul0 && FMul0->getOpcode() == Instruction::FMul && FMul0->hasOneUse() && + FMul0->hasAllowContract()) { + FMul = FMul0; + OtherOperand = Op1; + IsFirstOperand = true; + } else if (FMul1 && FMul1->getOpcode() == Instruction::FMul && + FMul1->hasOneUse() && FMul1->hasAllowContract()) { + FMul = FMul1; + OtherOperand = Op0; + IsFirstOperand = false; + } else { + return false; + } + + bool IsFSub = BI->getOpcode() == Instruction::FSub; + LLVM_DEBUG({ + const char *OpName = IsFSub ? "FSub" : "FAdd"; + dbgs() << "Found " << OpName << " with FMul (single use) as " + << (IsFirstOperand ? 
"first" : "second") << " operand: " << *BI + << "\n"; + }); + + Value *MulOp0 = FMul->getOperand(0); + Value *MulOp1 = FMul->getOperand(1); + IRBuilder<> Builder(BI); + Value *FMA = nullptr; + + if (!IsFSub) { + // fadd(fmul(a, b), c) => fma(a, b, c) + // fadd(c, fmul(a, b)) => fma(a, b, c) + FMA = Builder.CreateIntrinsic(Intrinsic::fma, {BI->getType()}, + {MulOp0, MulOp1, OtherOperand}); + } else { + if (IsFirstOperand) { + // fsub(fmul(a, b), c) => fma(a, b, fneg(c)) + Value *NegOtherOp = + Builder.CreateFNegFMF(OtherOperand, BI->getFastMathFlags()); + FMA = Builder.CreateIntrinsic(Intrinsic::fma, {BI->getType()}, + {MulOp0, MulOp1, NegOtherOp}); + } else { + // fsub(a, fmul(b, c)) => fma(fneg(b), c, a) + Value *NegMulOp0 = + Builder.CreateFNegFMF(MulOp0, FMul->getFastMathFlags()); + FMA = Builder.CreateIntrinsic(Intrinsic::fma, {BI->getType()}, + {NegMulOp0, MulOp1, OtherOperand}); + } + } + + // Combine fast-math flags from the original instructions + auto *FMAInst = cast(FMA); + FastMathFlags BinaryFMF = BI->getFastMathFlags(); + FastMathFlags FMulFMF = FMul->getFastMathFlags(); + FastMathFlags NewFMF = FastMathFlags::intersectRewrite(BinaryFMF, FMulFMF) | + FastMathFlags::unionValue(BinaryFMF, FMulFMF); + FMAInst->setFastMathFlags(NewFMF); + + LLVM_DEBUG({ + const char *OpName = IsFSub ? "FSub" : "FAdd"; + dbgs() << "Replacing " << OpName << " with FMA: " << *FMA << "\n"; + }); + BI->replaceAllUsesWith(FMA); + BI->eraseFromParent(); + FMul->eraseFromParent(); + return true; +} + +static bool foldFMA(Function &F) { + bool Changed = false; + + // Iterate and process float/double FAdd/FSub instructions with allow-contract + for (auto &I : llvm::make_early_inc_range(instructions(F))) { + if (auto *BI = dyn_cast(&I)) { + // Only FAdd and FSub are supported. + if (BI->getOpcode() != Instruction::FAdd && + BI->getOpcode() != Instruction::FSub) + continue; + + // At minimum, the instruction should have allow-contract. + if (!BI->hasAllowContract()) + continue; + + // Only float and double are supported. 
+ if (!BI->getType()->isFloatTy() && !BI->getType()->isDoubleTy()) + continue; + + if (tryFoldBinaryFMul(BI)) + Changed = true; + } + } + return Changed; +} + +namespace { + +struct NVPTXIRPeephole : public FunctionPass { + static char ID; + NVPTXIRPeephole() : FunctionPass(ID) {} + bool runOnFunction(Function &F) override; +}; + +} // namespace + +char NVPTXIRPeephole::ID = 0; +INITIALIZE_PASS(NVPTXIRPeephole, "nvptx-ir-peephole", "NVPTX IR Peephole", + false, false) + +bool NVPTXIRPeephole::runOnFunction(Function &F) { return foldFMA(F); } + +FunctionPass *llvm::createNVPTXIRPeepholePass() { + return new NVPTXIRPeephole(); +} + +PreservedAnalyses NVPTXIRPeepholePass::run(Function &F, + FunctionAnalysisManager &) { + if (!foldFMA(F)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserveSet(); + return PA; +} diff --git a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def index ee37c9826012c..7d645bff7110f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def +++ b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def @@ -40,4 +40,5 @@ FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass()) FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass()) FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this)) FUNCTION_PASS("nvptx-tag-invariant-loads", NVPTXTagInvariantLoadsPass()) +FUNCTION_PASS("nvptx-ir-peephole", NVPTXIRPeepholePass()) #undef FUNCTION_PASS diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index a6837a482608c..74bae28044e66 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -51,6 +51,13 @@ static cl::opt cl::desc("Disable load/store vectorizer"), cl::init(false), cl::Hidden); +// NVPTX IR Peephole is a new pass; this option lets us turn it off in case +// we encounter issues. +static cl::opt + DisableNVPTXIRPeephole("disable-nvptx-ir-peephole", + cl::desc("Disable NVPTX IR Peephole"), + cl::init(false), cl::Hidden); + // TODO: Remove this flag when we are confident with no regressions.
static cl::opt DisableRequireStructuredCFG( "disable-nvptx-require-structured-cfg", @@ -115,6 +122,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() { initializeNVPTXExternalAAWrapperPass(PR); initializeNVPTXPeepholePass(PR); initializeNVPTXTagInvariantLoadLegacyPassPass(PR); + initializeNVPTXIRPeepholePass(PR); initializeNVPTXPrologEpilogPassPass(PR); } @@ -379,6 +387,8 @@ void NVPTXPassConfig::addIRPasses() { addPass(createLoadStoreVectorizerPass()); addPass(createSROAPass()); addPass(createNVPTXTagInvariantLoadsPass()); + if (!DisableNVPTXIRPeephole) + addPass(createNVPTXIRPeepholePass()); } if (ST.hasPTXASUnreachableBug()) { diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index 2cc594a33eb0d..a37b34a3758a1 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -1240,7 +1240,7 @@ bool RISCVLegalizerInfo::legalizeInsertSubvector(MachineInstr &MI, LLT BigTy = MRI.getType(BigVec); LLT LitTy = MRI.getType(LitVec); - if (Idx == 0 || + if (Idx == 0 && MRI.getVRegDef(BigVec)->getOpcode() == TargetOpcode::G_IMPLICIT_DEF) return true; @@ -1314,7 +1314,7 @@ bool RISCVLegalizerInfo::legalizeInsertSubvector(MachineInstr &MI, auto Insert = MIB.buildInsertSubvector(InterLitTy, MIB.buildUndef(InterLitTy), LitVec, 0); - auto [Mask, _] = buildDefaultVLOps(BigTy, MIB, MRI); + auto [Mask, _] = buildDefaultVLOps(InterLitTy, MIB, MRI); auto VL = MIB.buildVScale(XLenTy, LitTy.getElementCount().getKnownMinValue()); // If we're inserting into the lowest elements, use a tail undisturbed diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index b74dba12e8959..7cbb9c0da4874 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -24323,14 +24323,15 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, break; } } else if (Constraint == "vr") { + // Check VM first so that mask types will use that instead of VR. for (const auto *RC : - {&RISCV::VRRegClass, &RISCV::VRM2RegClass, &RISCV::VRM4RegClass, - &RISCV::VRM8RegClass, &RISCV::VRN2M1RegClass, &RISCV::VRN3M1RegClass, - &RISCV::VRN4M1RegClass, &RISCV::VRN5M1RegClass, - &RISCV::VRN6M1RegClass, &RISCV::VRN7M1RegClass, - &RISCV::VRN8M1RegClass, &RISCV::VRN2M2RegClass, - &RISCV::VRN3M2RegClass, &RISCV::VRN4M2RegClass, - &RISCV::VRN2M4RegClass}) { + {&RISCV::VMRegClass, &RISCV::VRRegClass, &RISCV::VRM2RegClass, + &RISCV::VRM4RegClass, &RISCV::VRM8RegClass, &RISCV::VRN2M1RegClass, + &RISCV::VRN3M1RegClass, &RISCV::VRN4M1RegClass, + &RISCV::VRN5M1RegClass, &RISCV::VRN6M1RegClass, + &RISCV::VRN7M1RegClass, &RISCV::VRN8M1RegClass, + &RISCV::VRN2M2RegClass, &RISCV::VRN3M2RegClass, + &RISCV::VRN4M2RegClass, &RISCV::VRN2M4RegClass}) { if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy)) return std::make_pair(0U, RC); @@ -24341,15 +24342,16 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } } } else if (Constraint == "vd") { + // Check VMNoV0 first so that mask types will use that instead of VRNoV0. 
for (const auto *RC : - {&RISCV::VRNoV0RegClass, &RISCV::VRM2NoV0RegClass, - &RISCV::VRM4NoV0RegClass, &RISCV::VRM8NoV0RegClass, - &RISCV::VRN2M1NoV0RegClass, &RISCV::VRN3M1NoV0RegClass, - &RISCV::VRN4M1NoV0RegClass, &RISCV::VRN5M1NoV0RegClass, - &RISCV::VRN6M1NoV0RegClass, &RISCV::VRN7M1NoV0RegClass, - &RISCV::VRN8M1NoV0RegClass, &RISCV::VRN2M2NoV0RegClass, - &RISCV::VRN3M2NoV0RegClass, &RISCV::VRN4M2NoV0RegClass, - &RISCV::VRN2M4NoV0RegClass}) { + {&RISCV::VMNoV0RegClass, &RISCV::VRNoV0RegClass, + &RISCV::VRM2NoV0RegClass, &RISCV::VRM4NoV0RegClass, + &RISCV::VRM8NoV0RegClass, &RISCV::VRN2M1NoV0RegClass, + &RISCV::VRN3M1NoV0RegClass, &RISCV::VRN4M1NoV0RegClass, + &RISCV::VRN5M1NoV0RegClass, &RISCV::VRN6M1NoV0RegClass, + &RISCV::VRN7M1NoV0RegClass, &RISCV::VRN8M1NoV0RegClass, + &RISCV::VRN2M2NoV0RegClass, &RISCV::VRN3M2NoV0RegClass, + &RISCV::VRN4M2NoV0RegClass, &RISCV::VRN2M4NoV0RegClass}) { if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy)) return std::make_pair(0U, RC); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index f7b4914135711..c07ed8596f009 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -935,7 +935,7 @@ let Predicates = [HasVendorXSfcease] in { let rd = 0b00000; let rs1 = 0b00000; let rs2 = 0b00101; -} + } } let Predicates = [HasVendorXSfvfbfexp16e] in { diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 11b7a0a3c691a..f354793eb0eac 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -813,6 +813,7 @@ def VMV0 : VReg; // The register class is added for inline assembly for vector mask types. def VM : VReg; +def VMNoV0 : VReg; defvar VTupM1N2VTs = [riscv_nxv8i8x2, riscv_nxv4i8x2, riscv_nxv2i8x2, riscv_nxv1i8x2]; defvar VTupM1N3VTs = [riscv_nxv8i8x3, riscv_nxv4i8x3, riscv_nxv2i8x3, riscv_nxv1i8x3]; diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp index 62f5e47c5ea3b..42de884840dc9 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp @@ -50,6 +50,14 @@ void SPIRVInstPrinter::printOpConstantVarOps(const MCInst *MI, unsigned IsBitwidth16 = MI->getFlags() & SPIRV::INST_PRINTER_WIDTH16; const unsigned NumVarOps = MI->getNumOperands() - StartIndex; + if (MI->getOpcode() == SPIRV::OpConstantI && NumVarOps > 2) { + // SPV_ALTERA_arbitrary_precision_integers allows for integer widths greater + // than 64, which will be encoded via multiple operands. 
+ for (unsigned I = StartIndex; I != MI->getNumOperands(); ++I) + O << ' ' << MI->getOperand(I).getImm(); + return; + } + assert((NumVarOps == 1 || NumVarOps == 2) && "Unsupported number of bits for literal variable"); diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index d2a8fddc5d8e4..42edad255ce82 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -13,9 +13,14 @@ #include "SPIRVCommandLine.h" #include "MCTargetDesc/SPIRVBaseInfo.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/TargetParser/Triple.h" -#include + +#include #include +#include +#include +#include #define DEBUG_TYPE "spirv-commandline" @@ -176,7 +181,17 @@ bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName, std::set &Vals) { SmallVector Tokens; ArgValue.split(Tokens, ",", -1, false); - std::sort(Tokens.begin(), Tokens.end()); + llvm::sort(Tokens, [](auto &&LHS, auto &&RHS) { + // Handle "all" first so that any subsequent disablement behaves as + // expected: given --spv-ext=all,-foo, we first enable everything and then + // disable foo. This should be revisited and simplified. + if (RHS == "all") + return false; + if (LHS == "all") + return true; + // llvm::sort requires a strict weak ordering, so fall back to a plain + // lexicographic comparison. + return LHS < RHS; + }); std::set EnabledExtensions; diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 0fb44052527f0..5d96a67500dff 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -151,22 +151,22 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeBool(MachineIRBuilder &MIRBuilder) { } unsigned SPIRVGlobalRegistry::adjustOpTypeIntWidth(unsigned Width) const { - if (Width > 64) - report_fatal_error("Unsupported integer width!"); const SPIRVSubtarget &ST = cast(CurMF->getSubtarget()); if (ST.canUseExtension( SPIRV::Extension::SPV_ALTERA_arbitrary_precision_integers) || - ST.canUseExtension(SPIRV::Extension::SPV_INTEL_int4)) + (Width == 4 && ST.canUseExtension(SPIRV::Extension::SPV_INTEL_int4))) return Width; if (Width <= 8) - Width = 8; + return 8; else if (Width <= 16) - Width = 16; + return 16; else if (Width <= 32) - Width = 32; - else - Width = 64; - return Width; + return 32; + else if (Width <= 64) + return 64; + else if (Width <= 128) + return 128; + reportFatalUsageError("Unsupported integer width!"); } SPIRVType *SPIRVGlobalRegistry::getOpTypeInt(unsigned Width, @@ -413,7 +413,7 @@ Register SPIRVGlobalRegistry::createConstInt(const ConstantInt *CI, MIB = MIRBuilder.buildInstr(SPIRV::OpConstantI) .addDef(Res) .addUse(getSPIRVTypeID(SpvType)); - addNumImm(APInt(BitWidth, CI->getZExtValue()), MIB); + addNumImm(CI->getValue(), MIB); } else { MIB = MIRBuilder.buildInstr(SPIRV::OpConstantNull) .addDef(Res) diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 71df4cced434e..30703ee40be06 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -48,6 +48,7 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { const LLT s16 = LLT::scalar(16); const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); + const LLT s128 = LLT::scalar(128); const LLT v16s64 = LLT::fixed_vector(16, 64); const LLT v16s32 = LLT::fixed_vector(16, 32); @@ -114,18 +115,19 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { v4s1, v4s8, v4s16, v4s32, v4s64}; auto
allScalarsAndVectors = { - s1, s8, s16, s32, s64, v2s1, v2s8, v2s16, v2s32, v2s64, - v3s1, v3s8, v3s16, v3s32, v3s64, v4s1, v4s8, v4s16, v4s32, v4s64, - v8s1, v8s8, v8s16, v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64}; + s1, s8, s16, s32, s64, s128, v2s1, v2s8, + v2s16, v2s32, v2s64, v3s1, v3s8, v3s16, v3s32, v3s64, + v4s1, v4s8, v4s16, v4s32, v4s64, v8s1, v8s8, v8s16, + v8s32, v8s64, v16s1, v16s8, v16s16, v16s32, v16s64}; - auto allIntScalarsAndVectors = {s8, s16, s32, s64, v2s8, v2s16, - v2s32, v2s64, v3s8, v3s16, v3s32, v3s64, - v4s8, v4s16, v4s32, v4s64, v8s8, v8s16, - v8s32, v8s64, v16s8, v16s16, v16s32, v16s64}; + auto allIntScalarsAndVectors = { + s8, s16, s32, s64, s128, v2s8, v2s16, v2s32, v2s64, + v3s8, v3s16, v3s32, v3s64, v4s8, v4s16, v4s32, v4s64, v8s8, + v8s16, v8s32, v8s64, v16s8, v16s16, v16s32, v16s64}; auto allBoolScalarsAndVectors = {s1, v2s1, v3s1, v4s1, v8s1, v16s1}; - auto allIntScalars = {s8, s16, s32, s64}; + auto allIntScalars = {s8, s16, s32, s64, s128}; auto allFloatScalarsAndF16Vector2AndVector4s = {s16, s32, s64, v2s16, v4s16}; @@ -307,7 +309,7 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { typeInSet(1, allPtrsScalarsAndVectors))); getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) - .legalFor({s1}) + .legalFor({s1, s128}) .legalFor(allFloatAndIntScalarsAndPtrs) .legalFor(allowedVectorTypes) .moreElementsToNextPow2(0) diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 73432279c3306..7717f18d14a34 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1436,6 +1436,21 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::Int16); else if (BitWidth == 8) Reqs.addCapability(SPIRV::Capability::Int8); + else if (BitWidth == 4 && + ST.canUseExtension(SPIRV::Extension::SPV_INTEL_int4)) { + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_int4); + Reqs.addCapability(SPIRV::Capability::Int4TypeINTEL); + } else if (BitWidth != 32) { + if (!ST.canUseExtension( + SPIRV::Extension::SPV_ALTERA_arbitrary_precision_integers)) + reportFatalUsageError( + "OpTypeInt type with a width other than 8, 16, 32 or 64 bits " + "requires the following SPIR-V extension: " + "SPV_ALTERA_arbitrary_precision_integers"); + Reqs.addExtension( + SPIRV::Extension::SPV_ALTERA_arbitrary_precision_integers); + Reqs.addCapability(SPIRV::Capability::ArbitraryPrecisionIntegersALTERA); + } break; } case SPIRV::OpDot: { diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index acc726717743d..9ddbeee92ffb6 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -388,11 +388,11 @@ static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR, // To support current approach and limitations wrt. bit width here we widen a // scalar register with a bit width greater than 1 to valid sizes and cap it to -// 64 width. +// 128 width. 
static unsigned widenBitWidthToNextPow2(unsigned BitWidth) { if (BitWidth == 1) return 1; // No need to widen 1-bit values - return std::min(std::max(1u << Log2_32_Ceil(BitWidth), 8u), 64u); + return std::min(std::max(1u << Log2_32_Ceil(BitWidth), 8u), 128u); } static void widenScalarType(Register Reg, MachineRegisterInfo &MRI) { diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index d4dd897647cfc..c32ecfb3ef7ac 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -171,6 +171,13 @@ void addNumImm(const APInt &Imm, MachineInstrBuilder &MIB) { // Asm Printer needs this info to print 64-bit operands correctly MIB.getInstr()->setAsmPrinterFlag(SPIRV::ASM_PRINTER_WIDTH64); return; + } else if (Bitwidth <= 128) { + uint32_t LowBits = Imm.getRawData()[0] & 0xffffffff; + uint32_t MidBits0 = (Imm.getRawData()[0] >> 32) & 0xffffffff; + uint32_t MidBits1 = Imm.getRawData()[1] & 0xffffffff; + uint32_t HighBits = (Imm.getRawData()[1] >> 32) & 0xffffffff; + MIB.addImm(LowBits).addImm(MidBits0).addImm(MidBits1).addImm(HighBits); + return; } report_fatal_error("Unsupported constant bitwidth"); } diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 99665b5872fe2..88ade87e1dca8 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -534,7 +534,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx10_mask_vcvttpd2qqs_round_512, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2SIS, X86ISD::CVTTP2SIS_SAE), X86_INTRINSIC_DATA(avx10_mask_vcvttpd2udqs_128, CVTPD2DQ_MASK, - X86ISD::CVTTP2UIS, X86ISD::MCVTTP2SIS), + X86ISD::CVTTP2UIS, X86ISD::MCVTTP2UIS), X86_INTRINSIC_DATA(avx10_mask_vcvttpd2udqs_256, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2UIS, 0), X86_INTRINSIC_DATA(avx10_mask_vcvttpd2udqs_round_512, INTR_TYPE_1OP_MASK, diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 8e7282a4ffefe..85602a5a7575a 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -737,42 +737,119 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) { return nullptr; } -/// Convert a table lookup to shufflevector if the mask is constant. -/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in -/// which case we could lower the shufflevector with rev64 instructions -/// as it's actually a byte reverse. -static Value *simplifyNeonTbl1(const IntrinsicInst &II, - InstCombiner::BuilderTy &Builder) { +/// Convert `tbl`/`tbx` intrinsics to shufflevector if the mask is constant, and +/// at most two source operands are actually referenced. +static Instruction *simplifyNeonTbl(IntrinsicInst &II, InstCombiner &IC, + bool IsExtension) { // Bail out if the mask is not a constant. - auto *C = dyn_cast(II.getArgOperand(1)); + auto *C = dyn_cast(II.getArgOperand(II.arg_size() - 1)); if (!C) return nullptr; - auto *VecTy = cast(II.getType()); - unsigned NumElts = VecTy->getNumElements(); + auto *RetTy = cast(II.getType()); + unsigned NumIndexes = RetTy->getNumElements(); - // Only perform this transformation for <8 x i8> vector types. - if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8) + // Only perform this transformation for <8 x i8> and <16 x i8> vector types. 
+ if (!RetTy->getElementType()->isIntegerTy(8) || + (NumIndexes != 8 && NumIndexes != 16)) return nullptr; - int Indexes[8]; + // For tbx instructions, the first argument is the "fallback" vector, which + // has the same length as the mask and return type. + unsigned int StartIndex = (unsigned)IsExtension; + auto *SourceTy = + cast(II.getArgOperand(StartIndex)->getType()); + // Note that the element count of each source vector does *not* need to be the + // same as the element count of the return type and mask! All source vectors + // must have the same element count as each other, though. + unsigned NumElementsPerSource = SourceTy->getNumElements(); + + // There are no tbl/tbx intrinsics for which the destination size exceeds the + // source size. However, our definitions of the intrinsics, at least in + // IntrinsicsAArch64.td, allow for arbitrary destination vector sizes, so it + // *could* technically happen. + if (NumIndexes > NumElementsPerSource) + return nullptr; + + // The tbl/tbx intrinsics take several source operands followed by a mask + // operand. + unsigned int NumSourceOperands = II.arg_size() - 1 - (unsigned)IsExtension; - for (unsigned I = 0; I < NumElts; ++I) { + // Map input operands to shuffle indices. This also helpfully deduplicates the + // input arguments, in case the same value is passed as an argument multiple + // times. + SmallDenseMap ValueToShuffleSlot; + Value *ShuffleOperands[2] = {PoisonValue::get(SourceTy), + PoisonValue::get(SourceTy)}; + + int Indexes[16]; + for (unsigned I = 0; I < NumIndexes; ++I) { Constant *COp = C->getAggregateElement(I); - if (!COp || !isa(COp)) + if (!COp || (!isa(COp) && !isa(COp))) return nullptr; - Indexes[I] = cast(COp)->getLimitedValue(); + if (isa(COp)) { + Indexes[I] = -1; + continue; + } + + uint64_t Index = cast(COp)->getZExtValue(); + // The index of the input argument that this index references (0 = first + // source argument, etc). + unsigned SourceOperandIndex = Index / NumElementsPerSource; + // The index of the element at that source operand. + unsigned SourceOperandElementIndex = Index % NumElementsPerSource; + + Value *SourceOperand; + if (SourceOperandIndex >= NumSourceOperands) { + // This index is out of bounds. Map it to index into either the fallback + // vector (tbx) or vector of zeroes (tbl). + SourceOperandIndex = NumSourceOperands; + if (IsExtension) { + // For out-of-bounds indices in tbx, choose the `I`th element of the + // fallback. + SourceOperand = II.getArgOperand(0); + SourceOperandElementIndex = I; + } else { + // Otherwise, choose some element from the dummy vector of zeroes (we'll + // always choose the first). + SourceOperand = Constant::getNullValue(SourceTy); + SourceOperandElementIndex = 0; + } + } else { + SourceOperand = II.getArgOperand(SourceOperandIndex + StartIndex); + } + + // The source operand may be the fallback vector, which may not have the + // same number of elements as the source vector. In that case, we *could* + // choose to extend its length with another shufflevector, but it's simpler + // to just bail instead. + if (cast(SourceOperand->getType())->getNumElements() != + NumElementsPerSource) + return nullptr; - // Make sure the mask indices are in range. - if ((unsigned)Indexes[I] >= NumElts) + // We now know the source operand referenced by this index. Make it a + // shufflevector operand, if it isn't already. + unsigned NumSlots = ValueToShuffleSlot.size(); + // This shuffle references more than two sources, and hence cannot be + // represented as a shufflevector. 
+ if (NumSlots == 2 && !ValueToShuffleSlot.contains(SourceOperand)) return nullptr; + + auto [It, Inserted] = + ValueToShuffleSlot.try_emplace(SourceOperand, NumSlots); + if (Inserted) + ShuffleOperands[It->getSecond()] = SourceOperand; + + unsigned RemappedIndex = + (It->getSecond() * NumElementsPerSource) + SourceOperandElementIndex; + Indexes[I] = RemappedIndex; } - auto *V1 = II.getArgOperand(0); - auto *V2 = Constant::getNullValue(V1->getType()); - return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes)); + Value *Shuf = IC.Builder.CreateShuffleVector( + ShuffleOperands[0], ShuffleOperands[1], ArrayRef(Indexes, NumIndexes)); + return IC.replaceInstUsesWith(II, Shuf); } // Returns true iff the 2 intrinsics have the same operands, limiting the @@ -3167,10 +3244,23 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { return CallInst::Create(NewFn, CallArgs); } case Intrinsic::arm_neon_vtbl1: + case Intrinsic::arm_neon_vtbl2: + case Intrinsic::arm_neon_vtbl3: + case Intrinsic::arm_neon_vtbl4: case Intrinsic::aarch64_neon_tbl1: - if (Value *V = simplifyNeonTbl1(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; + case Intrinsic::aarch64_neon_tbl2: + case Intrinsic::aarch64_neon_tbl3: + case Intrinsic::aarch64_neon_tbl4: + return simplifyNeonTbl(*II, *this, /*IsExtension=*/false); + case Intrinsic::arm_neon_vtbx1: + case Intrinsic::arm_neon_vtbx2: + case Intrinsic::arm_neon_vtbx3: + case Intrinsic::arm_neon_vtbx4: + case Intrinsic::aarch64_neon_tbx1: + case Intrinsic::aarch64_neon_tbx2: + case Intrinsic::aarch64_neon_tbx3: + case Intrinsic::aarch64_neon_tbx4: + return simplifyNeonTbl(*II, *this, /*IsExtension=*/true); case Intrinsic::arm_neon_vmulls: case Intrinsic::arm_neon_vmullu: diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index b38e305df0ce4..32ee16c89b4fe 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -5896,52 +5896,118 @@ struct MemorySanitizerVisitor : public InstVisitor { // // Multiply and Add Signed Word Integers // < 4 x i32> @llvm.x86.avx512.vpdpwssd.128 - // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // (< 4 x i32>, < 8 x i16>, < 8 x i16>) // < 8 x i32> @llvm.x86.avx512.vpdpwssd.256 - // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // (< 8 x i32>, <16 x i16>, <16 x i16>) // <16 x i32> @llvm.x86.avx512.vpdpwssd.512 - // (<16 x i32>, <16 x i32>, <16 x i32>) + // (<16 x i32>, <32 x i16>, <32 x i16>) // // Multiply and Add Signed Word Integers With Saturation // < 4 x i32> @llvm.x86.avx512.vpdpwssds.128 - // (< 4 x i32>, < 4 x i32>, < 4 x i32>) + // (< 4 x i32>, < 8 x i16>, < 8 x i16>) // < 8 x i32> @llvm.x86.avx512.vpdpwssds.256 - // (< 8 x i32>, < 8 x i32>, < 8 x i32>) + // (< 8 x i32>, <16 x i16>, <16 x i16>) // <16 x i32> @llvm.x86.avx512.vpdpwssds.512 - // (<16 x i32>, <16 x i32>, <16 x i32>) + // (<16 x i32>, <32 x i16>, <32 x i16>) + // + // Multiply and Add Signed and Unsigned Word Integers + // < 4 x i32> @llvm.x86.avx2.vpdpwsud.128 + // (< 4 x i32>, < 8 x i16>, < 8 x i16>) + // < 8 x i32> @llvm.x86.avx2.vpdpwsud.256 + // (< 8 x i32>, <16 x i16>, <16 x i16>) + // <16 x i32> @llvm.x86.avx10.vpdpwsud.512 + // (<16 x i32>, <32 x i16>, <32 x i16>) + // + // Multiply and Add Signed and Unsigned Word Integers With Saturation + // < 4 x i32> @llvm.x86.avx2.vpdpwsuds.128 + // (< 4 x i32>, < 8 x i16>, < 8 x i16>) + // < 8 x i32> @llvm.x86.avx2.vpdpwsuds.256 + // (< 8 x i32>, <16 x i16>, <16 x i16>) 
+ // <16 x i32> @llvm.x86.avx10.vpdpwsuds.512 + // (<16 x i32>, <32 x i16>, <32 x i16>) + // + // Multiply and Add Unsigned and Signed Word Integers + // < 4 x i32> @llvm.x86.avx2.vpdpwusd.128 + // (< 4 x i32>, < 8 x i16>, < 8 x i16>) + // < 8 x i32> @llvm.x86.avx2.vpdpwusd.256 + // (< 8 x i32>, <16 x i16>, <16 x i16>) + // <16 x i32> @llvm.x86.avx10.vpdpwusd.512 + // (<16 x i32>, <32 x i16>, <32 x i16>) + // + // Multiply and Add Unsigned and Signed Word Integers With Saturation + // < 4 x i32> @llvm.x86.avx2.vpdpwusds.128 + // (< 4 x i32>, < 8 x i16>, < 8 x i16>) + // < 8 x i32> @llvm.x86.avx2.vpdpwusds.256 + // (< 8 x i32>, <16 x i16>, <16 x i16>) + // <16 x i32> @llvm.x86.avx10.vpdpwusds.512 + // (<16 x i32>, <32 x i16>, <32 x i16>) + // + // Multiply and Add Unsigned and Unsigned Word Integers + // < 4 x i32> @llvm.x86.avx2.vpdpwuud.128 + // (< 4 x i32>, < 8 x i16>, < 8 x i16>) + // < 8 x i32> @llvm.x86.avx2.vpdpwuud.256 + // (< 8 x i32>, <16 x i16>, <16 x i16>) + // <16 x i32> @llvm.x86.avx10.vpdpwuud.512 + // (<16 x i32>, <32 x i16>, <32 x i16>) + // + // Multiply and Add Unsigned and Unsigned Word Integers With Saturation + // < 4 x i32> @llvm.x86.avx2.vpdpwuuds.128 + // (< 4 x i32>, < 8 x i16>, < 8 x i16>) + // < 8 x i32> @llvm.x86.avx2.vpdpwuuds.256 + // (< 8 x i32>, <16 x i16>, <16 x i16>) + // <16 x i32> @llvm.x86.avx10.vpdpwuuds.512 + // (<16 x i32>, <32 x i16>, <32 x i16>) // // These intrinsics are auto-upgraded into non-masked forms: // <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x i32>, <8 x i16>, <8 x i16>, i8) // <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x i32>, <8 x i16>, <8 x i16>, i8) // <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <16 x i16>, <16 x i16>, i8) // <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <16 x i16>, <16 x i16>, i8) // <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <32 x i16>, <32 x i16>, i16) // <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <32 x i16>, <32 x i16>, i16) // // <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x i32>, <8 x i16>, <8 x i16>, i8) // <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128 - // (<4 x i32>, <4 x i32>, <4 x i32>, i8) + // (<4 x i32>, <8 x i16>, <8 x i16>, i8) // <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <16 x i16>, <16 x i16>, i8) // <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256 - // (<8 x i32>, <8 x i32>, <8 x i32>, i8) + // (<8 x i32>, <16 x i16>, <16 x i16>, i8) // <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <32 x i16>, <32 x i16>, i16) // <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512 - // (<16 x i32>, <16 x i32>, <16 x i32>, i16) + // (<16 x i32>, <32 x i16>, <32 x i16>, i16) case Intrinsic::x86_avx512_vpdpwssd_128: case Intrinsic::x86_avx512_vpdpwssd_256: case Intrinsic::x86_avx512_vpdpwssd_512: case Intrinsic::x86_avx512_vpdpwssds_128: case Intrinsic::x86_avx512_vpdpwssds_256: case Intrinsic::x86_avx512_vpdpwssds_512: + case Intrinsic::x86_avx2_vpdpwsud_128: + case Intrinsic::x86_avx2_vpdpwsud_256: + case Intrinsic::x86_avx10_vpdpwsud_512: + case 
Intrinsic::x86_avx2_vpdpwsuds_128: + case Intrinsic::x86_avx2_vpdpwsuds_256: + case Intrinsic::x86_avx10_vpdpwsuds_512: + case Intrinsic::x86_avx2_vpdpwusd_128: + case Intrinsic::x86_avx2_vpdpwusd_256: + case Intrinsic::x86_avx10_vpdpwusd_512: + case Intrinsic::x86_avx2_vpdpwusds_128: + case Intrinsic::x86_avx2_vpdpwusds_256: + case Intrinsic::x86_avx10_vpdpwusds_512: + case Intrinsic::x86_avx2_vpdpwuud_128: + case Intrinsic::x86_avx2_vpdpwuud_256: + case Intrinsic::x86_avx10_vpdpwuud_512: + case Intrinsic::x86_avx2_vpdpwuuds_128: + case Intrinsic::x86_avx2_vpdpwuuds_256: + case Intrinsic::x86_avx10_vpdpwuuds_512: handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*ZeroPurifies=*/true, /*EltSizeInBits=*/16); break; diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 63b228efe3b11..68cffd4c18688 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -5805,7 +5805,7 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF, // negated immediate. if (!ICmpScaledV) ICmpScaledV = - ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue()); + ConstantInt::getSigned(IntTy, -(uint64_t)Offset.getFixedValue()); else { Ops.push_back(SE.getUnknown(ICmpScaledV)); ICmpScaledV = ConstantInt::get(IntTy, Offset.getFixedValue()); diff --git a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp index 9f6d89e97180f..66d8fea251cbd 100644 --- a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -335,10 +335,10 @@ Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) { else OrV = Op1 ? Op1 : Op2; - // BitMask is inverted to check if the operands are - // larger than the bypass type - uint64_t BitMask = ~BypassType->getBitMask(); - Value *AndV = Builder.CreateAnd(OrV, BitMask); + // Check whether the operands are larger than the bypass type. + Value *AndV = Builder.CreateAnd( + OrV, APInt::getBitsSetFrom(OrV->getType()->getIntegerBitWidth(), + BypassType->getBitWidth())); // Compare operand values Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index fd02493fa2c78..afb654ed882f4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -3600,8 +3600,6 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe { /// A recipe for generating the active lane mask for the vector loop that is /// used to predicate the vector operations. -/// TODO: It would be good to use the existing VPWidenPHIRecipe instead and -/// remove VPActiveLaneMaskPHIRecipe. class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { public: VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 519a104b9484f..b0c8564ad231a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -4407,8 +4407,6 @@ void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent, } #endif -// TODO: It would be good to use the existing VPWidenPHIRecipe instead and -// remove VPActiveLaneMaskPHIRecipe. 
void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) { BasicBlock *VectorPH = State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0)); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index eb078c783d5f7..852196e589c59 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -44,6 +44,7 @@ using namespace llvm; using namespace VPlanPatternMatch; +using namespace SCEVPatternMatch; bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( VPlan &Plan, @@ -139,14 +140,77 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( return true; } -// Check if a memory operation doesn't alias with memory operations in blocks -// between FirstBB and LastBB using scoped noalias metadata. -// For load hoisting, we only check writes in one direction. -// For store sinking, we check both reads and writes bidirectionally. -static bool canHoistOrSinkWithNoAliasCheck( - const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, - bool CheckReads, - const SmallPtrSetImpl *ExcludeRecipes = nullptr) { +/// Helper for extra no-alias checks via known-safe recipe and SCEV. +class SinkStoreInfo { + const SmallPtrSetImpl &ExcludeRecipes; + VPReplicateRecipe &GroupLeader; + ScalarEvolution &SE; + const Loop &L; + VPTypeAnalysis &TypeInfo; + + // Return true if \p A and \p B are known to not alias for all VFs in the + // plan, checked via the distance between the accesses + bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const { + if (A->getOpcode() != Instruction::Store || + B->getOpcode() != Instruction::Store) + return false; + + VPValue *AddrA = A->getOperand(1); + const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, SE, &L); + VPValue *AddrB = B->getOperand(1); + const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, SE, &L); + if (isa(SCEVA) || isa(SCEVB)) + return false; + + const APInt *Distance; + if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance))) + return false; + + const DataLayout &DL = SE.getDataLayout(); + Type *TyA = TypeInfo.inferScalarType(A->getOperand(0)); + uint64_t SizeA = DL.getTypeStoreSize(TyA); + Type *TyB = TypeInfo.inferScalarType(B->getOperand(0)); + uint64_t SizeB = DL.getTypeStoreSize(TyB); + + // Use the maximum store size to ensure no overlap from either direction. + // Currently only handles fixed sizes, as it is only used for + // replicating VPReplicateRecipes. + uint64_t MaxStoreSize = std::max(SizeA, SizeB); + + auto VFs = B->getParent()->getPlan()->vectorFactors(); + ElementCount MaxVF = *max_element(VFs, ElementCount::isKnownLT); + return Distance->abs().uge( + MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue()); + } + +public: + SinkStoreInfo(const SmallPtrSetImpl &ExcludeRecipes, + VPReplicateRecipe &GroupLeader, ScalarEvolution &SE, + const Loop &L, VPTypeAnalysis &TypeInfo) + : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), SE(SE), L(L), + TypeInfo(TypeInfo) {} + + /// Return true if \p R should be skipped during alias checking, either + /// because it's in the exclude set or because no-alias can be proven via + /// SCEV. + bool shouldSkip(VPRecipeBase &R) const { + auto *Store = dyn_cast(&R); + return ExcludeRecipes.contains(&R) || + (Store && isNoAliasViaDistance(Store, &GroupLeader)); + } +}; + +/// Check if a memory operation doesn't alias with memory operations in blocks +/// between \p FirstBB and \p LastBB using scoped noalias metadata. 
+
+/// Check if a memory operation doesn't alias with memory operations in blocks
+/// between \p FirstBB and \p LastBB using scoped noalias metadata. If
+/// \p SinkInfo is std::nullopt, only recipes that may write to memory are
+/// checked (for load hoisting). Otherwise recipes that both read and write
+/// memory are checked, and SCEV is used to prove no-alias between the group
+/// leader and other replicate recipes (for store sinking).
+static bool
+canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc,
+                               VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
+                               std::optional<SinkStoreInfo> SinkInfo = {}) {
+  bool CheckReads = SinkInfo.has_value();
   if (!MemLoc.AATags.Scope)
     return false;
 
@@ -158,7 +222,7 @@ static bool canHoistOrSinkWithNoAliasCheck(
            "Expected at most one successor in block chain");
     auto *VPBB = cast<VPBasicBlock>(Block);
     for (VPRecipeBase &R : *VPBB) {
-      if (ExcludeRecipes && ExcludeRecipes->contains(&R))
+      if (SinkInfo && SinkInfo->shouldSkip(R))
         continue;
 
       // Skip recipes that don't need checking.
@@ -4273,8 +4337,7 @@ void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE,
 
     // Check that the load doesn't alias with stores between first and last.
     auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
-    if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB,
-                                                    /*CheckReads=*/false))
+    if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
       continue;
 
     // Collect common metadata from all loads in the group.
@@ -4301,7 +4364,9 @@ void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE,
 }
 
 static bool
-canSinkStoreWithNoAliasCheck(ArrayRef<VPReplicateRecipe *> StoresToSink) {
+canSinkStoreWithNoAliasCheck(ArrayRef<VPReplicateRecipe *> StoresToSink,
+                             ScalarEvolution &SE, const Loop &L,
+                             VPTypeAnalysis &TypeInfo) {
   auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
   if (!StoreLoc || !StoreLoc->AATags.Scope)
     return false;
@@ -4313,8 +4378,8 @@ canSinkStoreWithNoAliasCheck(ArrayRef<VPReplicateRecipe *> StoresToSink) {
   VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
   VPBasicBlock *LastBB = StoresToSink.back()->getParent();
 
-  return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB,
-                                        /*CheckReads=*/true, &StoresToSinkSet);
+  SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], SE, L, TypeInfo);
+  return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
 }
 
 void VPlanTransforms::sinkPredicatedStores(VPlan &Plan, ScalarEvolution &SE,
@@ -4325,13 +4390,14 @@
     return;
 
   VPDominatorTree VPDT(Plan);
+  VPTypeAnalysis TypeInfo(Plan);
 
   for (auto &Group : Groups) {
     sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
       return VPDT.properlyDominates(A, B);
     });
 
-    if (!canSinkStoreWithNoAliasCheck(Group))
+    if (!canSinkStoreWithNoAliasCheck(Group, SE, *L, TypeInfo))
       continue;
 
     // Use the last (most dominated) store's location for the unconditional
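To make the two modes of canHoistOrSinkWithNoAliasCheck concrete, here is a hypothetical stand-in model in plain C++ (the types and names are local to this sketch, not LLVM API): when no SinkStoreInfo is supplied, only recipes that may write are inspected (load hoisting); when one is supplied, readers are inspected as well, and recipes the info object can prove safe, via the exclude set or the SCEV distance check, are skipped:

#include <functional>
#include <optional>
#include <vector>

// Stand-in for a recipe between FirstBB and LastBB: whether it may read or
// write memory, and whether scoped-noalias metadata already proves it
// disjoint from the access being moved.
struct RecipeModel {
  bool MayWrite = false;
  bool MayRead = false;
  bool NoAliasByMetadata = false;
};

// ShouldSkip models SinkStoreInfo::shouldSkip; its presence doubles as the
// CheckReads flag, matching the patch above.
static bool canHoistOrSinkModel(
    const std::vector<RecipeModel> &Between,
    std::optional<std::function<bool(const RecipeModel &)>> ShouldSkip) {
  bool CheckReads = ShouldSkip.has_value();
  for (const RecipeModel &R : Between) {
    if (ShouldSkip && (*ShouldSkip)(R))
      continue; // excluded, or proven no-alias via the SCEV distance check
    if ((R.MayWrite || (CheckReads && R.MayRead)) && !R.NoAliasByMetadata)
      return false; // a possibly-aliasing access blocks the motion
  }
  return true;
}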
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
index 15355ea139205..d9e51c39c2042 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
@@ -15,62 +15,5 @@ define amdgpu_kernel void @test2(ptr %ptr, i32 %cmp, i32 %new) {
   ret void
 }
 
-; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val)
-define amdgpu_kernel void @test_atomic_csub_i32(ptr addrspace(1) %ptr, i32 %val) #0 {
-  %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val)
-  store i32 %ret, ptr addrspace(1) %ptr, align 4
-  ret void
-}
-
-; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
-define amdgpu_kernel void @test_ds_atomic_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) #0 {
-entry:
-  %gep = getelementptr i32, ptr addrspace(3) %addr, i32 4
-  %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
-  store i32 %val, ptr addrspace(3) %use
-  ret void
-}
-
-; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
-define amdgpu_kernel void @test_flat_atomic_cond_sub_u32(ptr %addr, i32 %in, ptr %use) #0 {
-entry:
-  %gep = getelementptr i32, ptr %addr, i32 4
-  %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
-  store i32 %val, ptr %use
-  ret void
-}
-
-; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
-define amdgpu_kernel void @test_global_atomic_cond_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) #0 {
-entry:
-  %gep = getelementptr i32, ptr addrspace(1) %addr, i32 4
-  %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
-  store i32 %val, ptr addrspace(1) %use
-  ret void
-}
-
-; CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
-define float @test_raw_buffer_atomic_cond_sub_u32(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-entry:
-  %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
-  %r = bitcast i32 %orig to float
-  ret float %r
-}
-
-; CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
-define float @test_struct_buffer_atomic_cond_sub_u32(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-entry:
-  %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
-  %r = bitcast i32 %orig to float
-  ret float %r
-}
-
-declare i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) nocapture, i32) #1
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3), i32) #1
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32) #1
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32) #1
-declare i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32) #1
-declare i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32, i32) #1
-
 attributes #0 = { nounwind }
 attributes #1 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll
index 3e28cd050fc88..e9194ea1e14fe 100644
--- a/llvm/test/Bitcode/amdgcn-atomic.ll
+++ b/llvm/test/Bitcode/amdgcn-atomic.ll
@@ -420,5 +420,152 @@ define double @upgrade_amdgcn_global_atomic_fmax_f64_p1_f64(ptr addrspace(1) %pt
 
 attributes #0 = { argmemonly nounwind willreturn }
 
+define void @atomic_usub_cond(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
+  ; CHECK: atomicrmw usub_cond ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+  %result0 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false)
+
+  ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+  %result1 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false)
+
+  ; CHECK: atomicrmw usub_cond ptr addrspace(3) %ptr3, i32 46 syncscope("agent") seq_cst, align 4
+
%result2 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p3(ptr addrspace(3) %ptr3, i32 46, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw usub_cond ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8 + %result3 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 0, i1 false) + + ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8 + %result4 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 0, i64 0, i1 false) + + ; CHECK: atomicrmw usub_cond ptr addrspace(3) %ptr3, i64 46 syncscope("agent") seq_cst, align 8 + %result5 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p3(ptr addrspace(3) %ptr3, i64 46, i64 0, i64 0, i1 false) + ret void +} + +define void @atomic_usub_sat(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) { + ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4 + %result0 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw usub_sat ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4 + %result1 = call i32 @llvm.amdgcn.atomic.csub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw usub_sat ptr addrspace(3) %ptr3, i32 46 syncscope("agent") seq_cst, align 4 + %result2 = call i32 @llvm.amdgcn.atomic.csub.i32.p3(ptr addrspace(3) %ptr3, i32 46, i32 0, i32 0, i1 false) + + ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8 + %result3 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 0, i1 false) + + ; CHECK: atomicrmw usub_sat ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8 + %result4 = call i64 @llvm.amdgcn.atomic.csub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 0, i64 0, i1 false) + + ; CHECK: atomicrmw usub_sat ptr addrspace(3) %ptr3, i64 46 syncscope("agent") seq_cst, align 8 + %result5 = call i64 @llvm.amdgcn.atomic.csub.i64.p3(ptr addrspace(3) %ptr3, i64 46, i64 0, i64 0, i1 false) + ret void +} + +; Test some invalid ordering handling +define void @ordering_usub_cond_usub_sat(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) { + ; CHECK: atomicrmw volatile usub_cond ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4 + %result0 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p0(ptr %ptr0, i32 42, i32 -1, i32 0, i1 true) + + ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4 + %result1 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 true) + + ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4 + %result2 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 1, i32 0, i1 false) + + ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") monotonic, align 4 + %result3 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 2, i32 0, i1 true) + + ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4 + %result4 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 3, i32 0, i1 false) + + ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4 + %result5 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 4, i1 true) + + ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4 + %result6 = 
call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 5, i1 false) + + ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4 + %result7 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 6, i1 true) + + ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4 + %result8 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 7, i1 false) + + ; CHECK:= atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4 + %result9 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 8, i1 true) + + ; CHECK:= atomicrmw volatile usub_sat ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4 + %result10 = call i32 @llvm.amdgcn.atomic.csub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 3, i32 0, i1 true) + + ; CHECK: atomicrmw volatile usub_cond ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8 + %result11 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p0(ptr %ptr0, i64 42, i64 -1, i64 0, i1 true) + + ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8 + %result12 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 0, i64 0, i1 true) + + ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8 + %result13 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 1, i64 0, i1 false) + + ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") monotonic, align 8 + %result14 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 2, i64 0, i1 true) + + ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8 + %result15 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 3, i64 0, i1 false) + + ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8 + %result16 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 4, i1 true) + + ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8 + %result17 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 5, i1 false) + + ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8 + %result18 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 6, i1 true) + + ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8 + %result19 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 7, i1 false) + + ; CHECK:= atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8 + %result20 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 8, i1 true) + + ; CHECK:= atomicrmw volatile usub_sat ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8 + %result21 = call i64 @llvm.amdgcn.atomic.csub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 3, i64 0, i1 true) + ret void +} + +define void @immarg_violations_usub_sat(ptr %ptr0, i32 %val32, i1 %val1, i64 %val64) { + ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4 + %result0 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 %val32, i32 0, i1 false) + + ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") monotonic, align 4 + %result1 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, 
i32 42, i32 2, i32 %val32, i1 false) + + ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") monotonic, align 4 + %result2 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 2, i32 0, i1 %val1) + + ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8 + %result3 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 %val64, i64 0, i1 false) + + ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") monotonic, align 8 + %result4 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 2, i64 %val64, i1 false) + + ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") monotonic, align 8 + %result5 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 2, i64 0, i1 %val1) + ret void +} + +declare i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i32 @llvm.amdgcn.atomic.cond.sub.i32.p3(ptr addrspace(3) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i32 @llvm.amdgcn.atomic.cond.sub.i32.p0(ptr nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.cond.sub.i64.p3(ptr addrspace(3) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.cond.sub.i64.p0(ptr nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0 + +declare i32 @llvm.amdgcn.atomic.csub.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i32 @llvm.amdgcn.atomic.csub.i32.p3(ptr addrspace(3) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.csub.i64.p1(ptr addrspace(1) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.csub.i64.p3(ptr addrspace(3) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0 + ; CHECK: !0 = !{i32 5, i32 6} ; CHECK: !1 = !{} diff --git a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll index b837a361bd287..724c8b3fc9170 100644 --- a/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll +++ b/llvm/test/CodeGen/AArch64/arm64-shrink-wrapping.ll @@ -735,15 +735,21 @@ define void @infiniteloop() { ; ENABLE-NEXT: .cfi_offset w29, -16 ; ENABLE-NEXT: .cfi_offset w19, -24 ; ENABLE-NEXT: .cfi_offset w20, -32 +; ENABLE-NEXT: ; %bb.1: ; %if.then ; ENABLE-NEXT: sub x19, sp, #16 ; ENABLE-NEXT: mov sp, x19 ; ENABLE-NEXT: mov w20, wzr -; ENABLE-NEXT: LBB10_1: ; %for.body +; ENABLE-NEXT: LBB10_2: ; %for.body ; ENABLE-NEXT: ; =>This Inner Loop Header: Depth=1 ; ENABLE-NEXT: bl _something ; ENABLE-NEXT: add w20, w0, w20 ; ENABLE-NEXT: str w20, [x19] -; ENABLE-NEXT: b LBB10_1 +; ENABLE-NEXT: b LBB10_2 +; ENABLE-NEXT: ; %bb.3: ; %if.end +; ENABLE-NEXT: sub sp, x29, #16 +; ENABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; ENABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload +; ENABLE-NEXT: ret ; ; DISABLE-LABEL: infiniteloop: ; DISABLE: ; %bb.0: ; %entry @@ -755,15 +761,21 @@ define void @infiniteloop() { ; DISABLE-NEXT: .cfi_offset w29, -16 ; DISABLE-NEXT: .cfi_offset w19, -24 ; DISABLE-NEXT: .cfi_offset w20, -32 +; DISABLE-NEXT: ; %bb.1: ; %if.then ; DISABLE-NEXT: 
sub x19, sp, #16 ; DISABLE-NEXT: mov sp, x19 ; DISABLE-NEXT: mov w20, wzr -; DISABLE-NEXT: LBB10_1: ; %for.body +; DISABLE-NEXT: LBB10_2: ; %for.body ; DISABLE-NEXT: ; =>This Inner Loop Header: Depth=1 ; DISABLE-NEXT: bl _something ; DISABLE-NEXT: add w20, w0, w20 ; DISABLE-NEXT: str w20, [x19] -; DISABLE-NEXT: b LBB10_1 +; DISABLE-NEXT: b LBB10_2 +; DISABLE-NEXT: ; %bb.3: ; %if.end +; DISABLE-NEXT: sub sp, x29, #16 +; DISABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; DISABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload +; DISABLE-NEXT: ret entry: br i1 undef, label %if.then, label %if.end @@ -794,10 +806,11 @@ define void @infiniteloop2() { ; ENABLE-NEXT: .cfi_offset w29, -16 ; ENABLE-NEXT: .cfi_offset w19, -24 ; ENABLE-NEXT: .cfi_offset w20, -32 +; ENABLE-NEXT: ; %bb.1: ; %if.then ; ENABLE-NEXT: sub x8, sp, #16 ; ENABLE-NEXT: mov sp, x8 ; ENABLE-NEXT: mov w9, wzr -; ENABLE-NEXT: LBB11_1: ; %for.body +; ENABLE-NEXT: LBB11_2: ; %for.body ; ENABLE-NEXT: ; =>This Inner Loop Header: Depth=1 ; ENABLE-NEXT: ; InlineAsm Start ; ENABLE-NEXT: mov x10, #0 ; =0x0 @@ -808,7 +821,12 @@ define void @infiniteloop2() { ; ENABLE-NEXT: ; InlineAsm Start ; ENABLE-NEXT: nop ; ENABLE-NEXT: ; InlineAsm End -; ENABLE-NEXT: b LBB11_1 +; ENABLE-NEXT: b LBB11_2 +; ENABLE-NEXT: ; %bb.3: ; %if.end +; ENABLE-NEXT: sub sp, x29, #16 +; ENABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; ENABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload +; ENABLE-NEXT: ret ; ; DISABLE-LABEL: infiniteloop2: ; DISABLE: ; %bb.0: ; %entry @@ -820,10 +838,11 @@ define void @infiniteloop2() { ; DISABLE-NEXT: .cfi_offset w29, -16 ; DISABLE-NEXT: .cfi_offset w19, -24 ; DISABLE-NEXT: .cfi_offset w20, -32 +; DISABLE-NEXT: ; %bb.1: ; %if.then ; DISABLE-NEXT: sub x8, sp, #16 ; DISABLE-NEXT: mov sp, x8 ; DISABLE-NEXT: mov w9, wzr -; DISABLE-NEXT: LBB11_1: ; %for.body +; DISABLE-NEXT: LBB11_2: ; %for.body ; DISABLE-NEXT: ; =>This Inner Loop Header: Depth=1 ; DISABLE-NEXT: ; InlineAsm Start ; DISABLE-NEXT: mov x10, #0 ; =0x0 @@ -834,7 +853,12 @@ define void @infiniteloop2() { ; DISABLE-NEXT: ; InlineAsm Start ; DISABLE-NEXT: nop ; DISABLE-NEXT: ; InlineAsm End -; DISABLE-NEXT: b LBB11_1 +; DISABLE-NEXT: b LBB11_2 +; DISABLE-NEXT: ; %bb.3: ; %if.end +; DISABLE-NEXT: sub sp, x29, #16 +; DISABLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; DISABLE-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload +; DISABLE-NEXT: ret entry: br i1 undef, label %if.then, label %if.end @@ -865,43 +889,49 @@ if.end: define void @infiniteloop3() { ; ENABLE-LABEL: infiniteloop3: ; ENABLE: ; %bb.0: ; %entry +; ENABLE-NEXT: ; %bb.1: ; %loop2a.preheader ; ENABLE-NEXT: mov x8, xzr ; ENABLE-NEXT: mov x9, xzr ; ENABLE-NEXT: mov x11, xzr -; ENABLE-NEXT: b LBB12_2 -; ENABLE-NEXT: LBB12_1: ; %loop2b -; ENABLE-NEXT: ; in Loop: Header=BB12_2 Depth=1 +; ENABLE-NEXT: b LBB12_3 +; ENABLE-NEXT: LBB12_2: ; %loop2b +; ENABLE-NEXT: ; in Loop: Header=BB12_3 Depth=1 ; ENABLE-NEXT: str x10, [x11] ; ENABLE-NEXT: mov x11, x10 -; ENABLE-NEXT: LBB12_2: ; %loop1 +; ENABLE-NEXT: LBB12_3: ; %loop1 ; ENABLE-NEXT: ; =>This Inner Loop Header: Depth=1 ; ENABLE-NEXT: mov x10, x9 ; ENABLE-NEXT: ldr x9, [x8] -; ENABLE-NEXT: cbnz x8, LBB12_1 -; ENABLE-NEXT: ; %bb.3: ; in Loop: Header=BB12_2 Depth=1 +; ENABLE-NEXT: cbnz x8, LBB12_2 +; ENABLE-NEXT: ; %bb.4: ; in Loop: Header=BB12_3 Depth=1 ; ENABLE-NEXT: mov x8, x10 ; ENABLE-NEXT: mov x11, x10 -; ENABLE-NEXT: b LBB12_2 +; ENABLE-NEXT: b LBB12_3 +; ENABLE-NEXT: ; %bb.5: ; %end +; ENABLE-NEXT: ret ; ; 
DISABLE-LABEL: infiniteloop3: ; DISABLE: ; %bb.0: ; %entry +; DISABLE-NEXT: ; %bb.1: ; %loop2a.preheader ; DISABLE-NEXT: mov x8, xzr ; DISABLE-NEXT: mov x9, xzr ; DISABLE-NEXT: mov x11, xzr -; DISABLE-NEXT: b LBB12_2 -; DISABLE-NEXT: LBB12_1: ; %loop2b -; DISABLE-NEXT: ; in Loop: Header=BB12_2 Depth=1 +; DISABLE-NEXT: b LBB12_3 +; DISABLE-NEXT: LBB12_2: ; %loop2b +; DISABLE-NEXT: ; in Loop: Header=BB12_3 Depth=1 ; DISABLE-NEXT: str x10, [x11] ; DISABLE-NEXT: mov x11, x10 -; DISABLE-NEXT: LBB12_2: ; %loop1 +; DISABLE-NEXT: LBB12_3: ; %loop1 ; DISABLE-NEXT: ; =>This Inner Loop Header: Depth=1 ; DISABLE-NEXT: mov x10, x9 ; DISABLE-NEXT: ldr x9, [x8] -; DISABLE-NEXT: cbnz x8, LBB12_1 -; DISABLE-NEXT: ; %bb.3: ; in Loop: Header=BB12_2 Depth=1 +; DISABLE-NEXT: cbnz x8, LBB12_2 +; DISABLE-NEXT: ; %bb.4: ; in Loop: Header=BB12_3 Depth=1 ; DISABLE-NEXT: mov x8, x10 ; DISABLE-NEXT: mov x11, x10 -; DISABLE-NEXT: b LBB12_2 +; DISABLE-NEXT: b LBB12_3 +; DISABLE-NEXT: ; %bb.5: ; %end +; DISABLE-NEXT: ret entry: br i1 undef, label %loop2a, label %body diff --git a/llvm/test/CodeGen/AArch64/arm64-trn.ll b/llvm/test/CodeGen/AArch64/arm64-trn.ll index fe245d01a7a6d..120c2d13a7ab7 100644 --- a/llvm/test/CodeGen/AArch64/arm64-trn.ll +++ b/llvm/test/CodeGen/AArch64/arm64-trn.ll @@ -246,6 +246,63 @@ define <4 x float> @vtrnQf(ptr %A, ptr %B) nounwind { ret <4 x float> %tmp5 } +define <8 x i8> @vtrni8_trn1_flipped(<8 x i8> %A, <8 x i8> %B) nounwind { +; CHECKLE-LABEL: vtrni8_trn1_flipped: +; CHECKLE: // %bb.0: +; CHECKLE-NEXT: trn1 v0.8b, v1.8b, v0.8b +; CHECKLE-NEXT: ret +; +; CHECKBE-LABEL: vtrni8_trn1_flipped: +; CHECKBE: // %bb.0: +; CHECKBE-NEXT: rev64 v0.8b, v0.8b +; CHECKBE-NEXT: rev64 v1.8b, v1.8b +; CHECKBE-NEXT: trn1 v0.8b, v1.8b, v0.8b +; CHECKBE-NEXT: rev64 v0.8b, v0.8b +; CHECKBE-NEXT: ret + %tmp1 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> + ret <8 x i8> %tmp1 +} + +define <8 x i8> @vtrni8_trn2_flipped(<8 x i8> %A, <8 x i8> %B) nounwind { +; CHECKLE-LABEL: vtrni8_trn2_flipped: +; CHECKLE: // %bb.0: +; CHECKLE-NEXT: trn2 v0.8b, v1.8b, v0.8b +; CHECKLE-NEXT: ret +; +; CHECKBE-LABEL: vtrni8_trn2_flipped: +; CHECKBE: // %bb.0: +; CHECKBE-NEXT: rev64 v0.8b, v0.8b +; CHECKBE-NEXT: rev64 v1.8b, v1.8b +; CHECKBE-NEXT: trn2 v0.8b, v1.8b, v0.8b +; CHECKBE-NEXT: rev64 v0.8b, v0.8b +; CHECKBE-NEXT: ret + %tmp1 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> + ret <8 x i8> %tmp1 +} + +define <8 x i8> @vtrni8_both_flipped_with_poison_values(<8 x i8> %A, <8 x i8> %B) nounwind { +; CHECKLE-LABEL: vtrni8_both_flipped_with_poison_values: +; CHECKLE: // %bb.0: +; CHECKLE-NEXT: trn1 v2.8b, v1.8b, v0.8b +; CHECKLE-NEXT: trn2 v0.8b, v1.8b, v0.8b +; CHECKLE-NEXT: add v0.8b, v2.8b, v0.8b +; CHECKLE-NEXT: ret +; +; CHECKBE-LABEL: vtrni8_both_flipped_with_poison_values: +; CHECKBE: // %bb.0: +; CHECKBE-NEXT: rev64 v0.8b, v0.8b +; CHECKBE-NEXT: rev64 v1.8b, v1.8b +; CHECKBE-NEXT: trn1 v2.8b, v1.8b, v0.8b +; CHECKBE-NEXT: trn2 v0.8b, v1.8b, v0.8b +; CHECKBE-NEXT: add v0.8b, v2.8b, v0.8b +; CHECKBE-NEXT: rev64 v0.8b, v0.8b +; CHECKBE-NEXT: ret + %tmp1 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> + %tmp2 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> + %tmp3 = add <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + ; Undef shuffle indices (even at the start of the shuffle mask) should not prevent matching to VTRN: define <8 x i8> @vtrni8_undef(ptr %A, ptr %B) nounwind { diff --git a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll 
index 6e6fb6f367867..7c3a567d1b336 100644 --- a/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll +++ b/llvm/test/CodeGen/AArch64/block-placement-optimize-branches.ll @@ -8,14 +8,20 @@ define i8 @foo_optsize(i32 %v4) optsize { ; CHECK-LABEL: foo_optsize: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cbnz w0, .LBB0_2 -; CHECK-NEXT: // %bb.1: // %b2 -; CHECK-NEXT: mov w0, #1 // =0x1 +; CHECK-NEXT: b .LBB0_2 +; CHECK-NEXT: .LBB0_1: +; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB0_2: // %b1 -; CHECK-NEXT: cmp w0, #1 -; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: cbnz w0, .LBB0_4 +; CHECK-NEXT: // %bb.3: // %b2 +; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_4: // %b1 +; CHECK-NEXT: cmp w0, #1 +; CHECK-NEXT: b.ne .LBB0_1 +; CHECK-NEXT: // %bb.5: // %b3 +; CHECK-NEXT: b .LBB0_1 entry: %v2 = icmp eq i32 0, 0 br i1 %v2, label %b1, label %b4 @@ -41,14 +47,20 @@ b4: define i8 @foo_optspeed(i32 %v4) { ; CHECK-LABEL: foo_optspeed: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cbnz w0, .LBB1_2 -; CHECK-NEXT: // %bb.1: // %b2 -; CHECK-NEXT: mov w0, #1 // =0x1 +; CHECK-NEXT: b .LBB1_2 +; CHECK-NEXT: .LBB1_1: +; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB1_2: // %b1 -; CHECK-NEXT: cmp w0, #1 -; CHECK-NEXT: mov w0, wzr +; CHECK-NEXT: cbnz w0, .LBB1_4 +; CHECK-NEXT: // %bb.3: // %b2 +; CHECK-NEXT: mov w0, #1 // =0x1 ; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_4: // %b1 +; CHECK-NEXT: cmp w0, #1 +; CHECK-NEXT: b.ne .LBB1_1 +; CHECK-NEXT: // %bb.5: // %b3 +; CHECK-NEXT: b .LBB1_1 entry: %v2 = icmp eq i32 0, 0 br i1 %v2, label %b1, label %b4 diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll index 282e0503dd7be..8e75d69be5062 100644 --- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll +++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll @@ -6,12 +6,10 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec ; CHECK-SD-LABEL: vector_deinterleave_v2f16_v4f16: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: dup v2.2s, v0.s[1] -; CHECK-SD-NEXT: mov v1.16b, v2.16b -; CHECK-SD-NEXT: zip1 v2.4h, v0.4h, v2.4h -; CHECK-SD-NEXT: mov v1.h[0], v0.h[1] +; CHECK-SD-NEXT: dup v1.2s, v0.s[1] +; CHECK-SD-NEXT: zip1 v2.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: trn2 v1.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: fmov d0, d2 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: vector_deinterleave_v2f16_v4f16: diff --git a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll index 708ba621c26d8..29427146e8a43 100644 --- a/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll +++ b/llvm/test/CodeGen/AArch64/lr-reserved-for-ra-live-in.ll @@ -21,8 +21,10 @@ define i32 @check_lr_liveness(ptr %arg) #1 { ; CHECK-NEXT: B %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1.bb: + ; CHECK-NEXT: successors: %bb.3(0x2aaaaaab), %bb.2(0x55555555) ; CHECK-NEXT: liveins: $w0, $lr ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: CBNZW $wzr, %bb.3 ; CHECK-NEXT: B %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2.bb1: diff --git a/llvm/test/CodeGen/AArch64/pr164181.ll b/llvm/test/CodeGen/AArch64/pr164181.ll index 72b2a77e51c06..abb090ccbcaed 100644 --- a/llvm/test/CodeGen/AArch64/pr164181.ll +++ b/llvm/test/CodeGen/AArch64/pr164181.ll @@ -29,11 +29,11 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var ; CHECK-NEXT: str w4, [sp, #72] 
// 4-byte Spill ; CHECK-NEXT: str w3, [sp, #112] // 4-byte Spill ; CHECK-NEXT: str w5, [sp, #36] // 4-byte Spill -; CHECK-NEXT: tbz w5, #0, .LBB0_40 +; CHECK-NEXT: tbz w5, #0, .LBB0_43 ; CHECK-NEXT: // %bb.1: // %for.body41.lr.ph ; CHECK-NEXT: ldr x4, [sp, #312] ; CHECK-NEXT: ldr x14, [sp, #280] -; CHECK-NEXT: tbz w0, #0, .LBB0_39 +; CHECK-NEXT: tbz w0, #0, .LBB0_42 ; CHECK-NEXT: // %bb.2: // %for.body41.us.preheader ; CHECK-NEXT: ldrb w8, [sp, #368] ; CHECK-NEXT: ldrb w12, [sp, #256] @@ -92,7 +92,7 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var ; CHECK-NEXT: // Child Loop BB0_10 Depth 4 ; CHECK-NEXT: // Child Loop BB0_11 Depth 5 ; CHECK-NEXT: // Child Loop BB0_28 Depth 5 -; CHECK-NEXT: // Child Loop BB0_36 Depth 5 +; CHECK-NEXT: // Child Loop BB0_39 Depth 5 ; CHECK-NEXT: ldr w8, [sp, #20] // 4-byte Reload ; CHECK-NEXT: mov x12, x24 ; CHECK-NEXT: str x24, [sp, #48] // 8-byte Spill @@ -117,7 +117,7 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var ; CHECK-NEXT: // Child Loop BB0_10 Depth 4 ; CHECK-NEXT: // Child Loop BB0_11 Depth 5 ; CHECK-NEXT: // Child Loop BB0_28 Depth 5 -; CHECK-NEXT: // Child Loop BB0_36 Depth 5 +; CHECK-NEXT: // Child Loop BB0_39 Depth 5 ; CHECK-NEXT: str x12, [sp, #40] // 8-byte Spill ; CHECK-NEXT: cmn x24, #30 ; CHECK-NEXT: mov x12, #-30 // =0xffffffffffffffe2 @@ -142,7 +142,7 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var ; CHECK-NEXT: // Child Loop BB0_10 Depth 4 ; CHECK-NEXT: // Child Loop BB0_11 Depth 5 ; CHECK-NEXT: // Child Loop BB0_28 Depth 5 -; CHECK-NEXT: // Child Loop BB0_36 Depth 5 +; CHECK-NEXT: // Child Loop BB0_39 Depth 5 ; CHECK-NEXT: ldr x8, [sp, #64] // 8-byte Reload ; CHECK-NEXT: mov w14, #1152 // =0x480 ; CHECK-NEXT: mov w24, #1 // =0x1 @@ -176,7 +176,7 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var ; CHECK-NEXT: // => This Loop Header: Depth=4 ; CHECK-NEXT: // Child Loop BB0_11 Depth 5 ; CHECK-NEXT: // Child Loop BB0_28 Depth 5 -; CHECK-NEXT: // Child Loop BB0_36 Depth 5 +; CHECK-NEXT: // Child Loop BB0_39 Depth 5 ; CHECK-NEXT: ldr w8, [sp, #116] // 4-byte Reload ; CHECK-NEXT: and w8, w8, w8, asr #31 ; CHECK-NEXT: str w8, [sp, #128] // 4-byte Spill @@ -281,23 +281,31 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var ; CHECK-NEXT: mov x24, xzr ; CHECK-NEXT: mul w12, w12, w22 ; CHECK-NEXT: mov x22, x5 -; CHECK-NEXT: tbz w0, #0, .LBB0_33 -; CHECK-NEXT: .LBB0_28: // %if.then222.us +; CHECK-NEXT: tbz w0, #0, .LBB0_36 +; CHECK-NEXT: .LBB0_28: // %for.body194.us ; CHECK-NEXT: // Parent Loop BB0_4 Depth=1 ; CHECK-NEXT: // Parent Loop BB0_6 Depth=2 ; CHECK-NEXT: // Parent Loop BB0_8 Depth=3 ; CHECK-NEXT: // Parent Loop BB0_10 Depth=4 ; CHECK-NEXT: // => This Inner Loop Header: Depth=5 +; CHECK-NEXT: // %bb.29: // %if.then222.us +; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5 ; CHECK-NEXT: adrp x27, :got:var_32 ; CHECK-NEXT: ldur w8, [x19, #-12] ; CHECK-NEXT: ldr x27, [x27, :got_lo12:var_32] ; CHECK-NEXT: strh w8, [x27] ; CHECK-NEXT: sxtb w8, w25 -; CHECK-NEXT: strb w3, [x16] ; CHECK-NEXT: bic w25, w8, w8, asr #31 +; CHECK-NEXT: b .LBB0_31 +; CHECK-NEXT: .p2align 5, , 16 +; CHECK-NEXT: // %bb.30: +; CHECK-NEXT: mov w25, wzr +; CHECK-NEXT: .LBB0_31: // %if.end239.us +; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5 +; CHECK-NEXT: strb w3, [x16] ; CHECK-NEXT: tst w13, #0xff -; CHECK-NEXT: b.eq .LBB0_30 -; CHECK-NEXT: // %bb.29: // %if.then254.us +; CHECK-NEXT: b.eq .LBB0_33 +; 
CHECK-NEXT: // %bb.32: // %if.then254.us ; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5 ; CHECK-NEXT: ldrh w8, [x26, x14, lsl #1] ; CHECK-NEXT: adrp x27, :got:var_35 @@ -306,7 +314,7 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var ; CHECK-NEXT: csel x8, xzr, x7, eq ; CHECK-NEXT: str x8, [x27] ; CHECK-NEXT: strh w1, [x17] -; CHECK-NEXT: .LBB0_30: // %if.end282.us +; CHECK-NEXT: .LBB0_33: // %if.end282.us ; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5 ; CHECK-NEXT: orr x27, x24, x4 ; CHECK-NEXT: adrp x8, :got:var_39 @@ -317,14 +325,14 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var ; CHECK-NEXT: str x8, [x18] ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: cbnz x2, .LBB0_27 -; CHECK-NEXT: // %bb.31: // %if.then327.us +; CHECK-NEXT: // %bb.34: // %if.then327.us ; CHECK-NEXT: // in Loop: Header=BB0_28 Depth=5 ; CHECK-NEXT: cbz w8, .LBB0_25 -; CHECK-NEXT: // %bb.32: // in Loop: Header=BB0_28 Depth=5 +; CHECK-NEXT: // %bb.35: // in Loop: Header=BB0_28 Depth=5 ; CHECK-NEXT: mov w4, wzr ; CHECK-NEXT: b .LBB0_26 ; CHECK-NEXT: .p2align 5, , 16 -; CHECK-NEXT: .LBB0_33: // %for.cond376.preheader.us +; CHECK-NEXT: .LBB0_36: // %for.cond376.preheader.us ; CHECK-NEXT: // in Loop: Header=BB0_10 Depth=4 ; CHECK-NEXT: mov w3, #1152 // =0x480 ; CHECK-NEXT: mov x22, xzr @@ -335,24 +343,24 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var ; CHECK-NEXT: madd x14, x14, x3, x11 ; CHECK-NEXT: mov w28, w30 ; CHECK-NEXT: mov w3, #-7680 // =0xffffe200 -; CHECK-NEXT: b .LBB0_36 +; CHECK-NEXT: b .LBB0_39 ; CHECK-NEXT: .p2align 5, , 16 -; CHECK-NEXT: .LBB0_34: // %if.then466.us -; CHECK-NEXT: // in Loop: Header=BB0_36 Depth=5 +; CHECK-NEXT: .LBB0_37: // %if.then466.us +; CHECK-NEXT: // in Loop: Header=BB0_39 Depth=5 ; CHECK-NEXT: ldr x28, [sp, #152] // 8-byte Reload ; CHECK-NEXT: ldr x3, [sp, #136] // 8-byte Reload ; CHECK-NEXT: sxtb w4, w4 ; CHECK-NEXT: bic w4, w4, w4, asr #31 ; CHECK-NEXT: str x3, [x28] ; CHECK-NEXT: mov w3, #-7680 // =0xffffe200 -; CHECK-NEXT: .LBB0_35: // %for.inc505.us -; CHECK-NEXT: // in Loop: Header=BB0_36 Depth=5 +; CHECK-NEXT: .LBB0_38: // %for.inc505.us +; CHECK-NEXT: // in Loop: Header=BB0_39 Depth=5 ; CHECK-NEXT: add x22, x22, #1 ; CHECK-NEXT: add x27, x27, #1 ; CHECK-NEXT: mov w28, wzr ; CHECK-NEXT: cmp x27, #0 ; CHECK-NEXT: b.hs .LBB0_9 -; CHECK-NEXT: .LBB0_36: // %for.body380.us +; CHECK-NEXT: .LBB0_39: // %for.body380.us ; CHECK-NEXT: // Parent Loop BB0_4 Depth=1 ; CHECK-NEXT: // Parent Loop BB0_6 Depth=2 ; CHECK-NEXT: // Parent Loop BB0_8 Depth=3 @@ -364,18 +372,18 @@ define void @f(i1 %var_0, i16 %var_1, i64 %var_2, i8 %var_3, i16 %var_4, i1 %var ; CHECK-NEXT: strh w28, [x11] ; CHECK-NEXT: csel w28, w21, w3, ne ; CHECK-NEXT: str w28, [x20] -; CHECK-NEXT: cbz x15, .LBB0_35 -; CHECK-NEXT: // %bb.37: // %if.then436.us -; CHECK-NEXT: // in Loop: Header=BB0_36 Depth=5 +; CHECK-NEXT: cbz x15, .LBB0_38 +; CHECK-NEXT: // %bb.40: // %if.then436.us +; CHECK-NEXT: // in Loop: Header=BB0_39 Depth=5 ; CHECK-NEXT: ldrh w28, [x14] -; CHECK-NEXT: cbnz w28, .LBB0_34 -; CHECK-NEXT: // %bb.38: // in Loop: Header=BB0_36 Depth=5 +; CHECK-NEXT: cbnz w28, .LBB0_37 +; CHECK-NEXT: // %bb.41: // in Loop: Header=BB0_39 Depth=5 ; CHECK-NEXT: mov w4, wzr -; CHECK-NEXT: b .LBB0_35 -; CHECK-NEXT: .LBB0_39: // %for.body41 +; CHECK-NEXT: b .LBB0_38 +; CHECK-NEXT: .LBB0_42: // %for.body41 ; CHECK-NEXT: strb wzr, [x4] ; CHECK-NEXT: strb wzr, [x14] -; CHECK-NEXT: .LBB0_40: // %for.cond563.preheader +; CHECK-NEXT: 
.LBB0_43: // %for.cond563.preheader ; CHECK-NEXT: ldp x20, x19, [sp, #224] // 16-byte Folded Reload ; CHECK-NEXT: ldp x22, x21, [sp, #208] // 16-byte Folded Reload ; CHECK-NEXT: ldp x24, x23, [sp, #192] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/pr166870.ll b/llvm/test/CodeGen/AArch64/pr166870.ll index 6f54b0465fbfd..48967d30485ab 100644 --- a/llvm/test/CodeGen/AArch64/pr166870.ll +++ b/llvm/test/CodeGen/AArch64/pr166870.ll @@ -26,11 +26,12 @@ define i32 @widget(i32 %arg, i32 %arg1, i1 %arg2, ptr %arg3, i1 %arg4) #0 nounwi ; CHECK-NEXT: mov x21, x1 ; CHECK-NEXT: bl baz ; CHECK-NEXT: mov w0, #0 // =0x0 +; CHECK-NEXT: // %bb.5: // %bb6 ; CHECK-NEXT: mov w10, #1 // =0x1 +; CHECK-NEXT: cbnz w10, .LBB0_11 +; CHECK-NEXT: // %bb.6: // %bb7 ; CHECK-NEXT: cbnz w10, .LBB0_10 -; CHECK-NEXT: // %bb.5: // %bb7 -; CHECK-NEXT: cbnz w10, .LBB0_9 -; CHECK-NEXT: // %bb.6: // %bb8 +; CHECK-NEXT: // %bb.7: // %bb8 ; CHECK-NEXT: mov x8, x21 ; CHECK-NEXT: mov x9, x20 ; CHECK-NEXT: mov w20, #0 // =0x0 @@ -38,17 +39,17 @@ define i32 @widget(i32 %arg, i32 %arg1, i1 %arg2, ptr %arg3, i1 %arg4) #0 nounwi ; CHECK-NEXT: mov x21, x9 ; CHECK-NEXT: mov w8, w8 ; CHECK-NEXT: mov x22, x8 -; CHECK-NEXT: .LBB0_7: // %bb10 +; CHECK-NEXT: .LBB0_8: // %bb10 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: strb w20, [x19] -; CHECK-NEXT: cbnz x21, .LBB0_7 -; CHECK-NEXT: // %bb.8: // %bb12 -; CHECK-NEXT: // in Loop: Header=BB0_7 Depth=1 +; CHECK-NEXT: cbnz x21, .LBB0_8 +; CHECK-NEXT: // %bb.9: // %bb12 +; CHECK-NEXT: // in Loop: Header=BB0_8 Depth=1 ; CHECK-NEXT: bl snork -; CHECK-NEXT: cbnz x22, .LBB0_7 -; CHECK-NEXT: .LBB0_9: -; CHECK-NEXT: mov w0, #0 // =0x0 +; CHECK-NEXT: cbnz x22, .LBB0_8 ; CHECK-NEXT: .LBB0_10: +; CHECK-NEXT: mov w0, #0 // =0x0 +; CHECK-NEXT: .LBB0_11: ; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll index 072f6f4e8f73e..39beffcf85783 100644 --- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll @@ -36,93 +36,93 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: zip1 v5.4s, v3.4s, v0.4s ; CHECK-NEXT: trn1 v6.4s, v3.4s, v0.4s ; CHECK-NEXT: zip2 v0.4s, v3.4s, v0.4s -; CHECK-NEXT: ext v16.16b, v1.16b, v1.16b, #12 -; CHECK-NEXT: zip2 v17.4s, v1.4s, v2.4s -; CHECK-NEXT: zip2 v7.4s, v2.4s, v1.4s -; CHECK-NEXT: zip1 v18.4s, v2.4s, v1.4s +; CHECK-NEXT: ext v7.16b, v1.16b, v1.16b, #12 +; CHECK-NEXT: zip2 v16.4s, v1.4s, v2.4s +; CHECK-NEXT: zip1 v17.4s, v2.4s, v1.4s +; CHECK-NEXT: trn2 v18.4s, v2.4s, v1.4s ; CHECK-NEXT: uzp2 v4.4s, v4.4s, v1.4s ; CHECK-NEXT: ext v3.16b, v3.16b, v5.16b, #8 -; CHECK-NEXT: mov v1.s[0], v2.s[1] -; CHECK-NEXT: ext v2.16b, v2.16b, v16.16b, #12 -; CHECK-NEXT: mov v17.d[1], v6.d[1] -; CHECK-NEXT: mov v7.d[1], v6.d[1] +; CHECK-NEXT: zip2 v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ext v2.16b, v2.16b, v7.16b, #12 +; CHECK-NEXT: mov v16.d[1], v6.d[1] +; CHECK-NEXT: mov v18.d[1], v5.d[1] ; CHECK-NEXT: mov v4.d[1], v0.d[1] -; CHECK-NEXT: mov v18.d[1], v3.d[1] -; CHECK-NEXT: mov v1.d[1], v5.d[1] +; CHECK-NEXT: mov v17.d[1], v3.d[1] +; CHECK-NEXT: mov v1.d[1], v6.d[1] ; CHECK-NEXT: mov v2.d[1], v0.d[1] -; CHECK-NEXT: add v0.4s, v4.4s, v17.4s -; CHECK-NEXT: add v3.4s, v1.4s, v18.4s -; CHECK-NEXT: sub v1.4s, v18.4s, v1.4s -; CHECK-NEXT: sub v2.4s, 
v7.4s, v2.4s +; CHECK-NEXT: add v0.4s, v4.4s, v16.4s +; CHECK-NEXT: add v3.4s, v18.4s, v17.4s +; CHECK-NEXT: sub v6.4s, v17.4s, v18.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v2.4s ; CHECK-NEXT: rev64 v4.4s, v0.4s ; CHECK-NEXT: rev64 v5.4s, v3.4s -; CHECK-NEXT: sub v6.4s, v1.4s, v2.4s -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: sub v2.4s, v6.4s, v1.4s +; CHECK-NEXT: add v1.4s, v1.4s, v6.4s ; CHECK-NEXT: mov v4.d[1], v0.d[1] ; CHECK-NEXT: mov v5.d[1], v3.d[1] -; CHECK-NEXT: rev64 v2.4s, v6.4s +; CHECK-NEXT: rev64 v6.4s, v2.4s ; CHECK-NEXT: rev64 v7.4s, v1.4s ; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s ; CHECK-NEXT: add v0.4s, v0.4s, v5.4s -; CHECK-NEXT: sub v2.4s, v6.4s, v2.4s +; CHECK-NEXT: sub v4.4s, v2.4s, v6.4s ; CHECK-NEXT: sub v5.4s, v1.4s, v7.4s -; CHECK-NEXT: addp v4.4s, v3.4s, v6.4s +; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s ; CHECK-NEXT: addp v1.4s, v0.4s, v1.4s ; CHECK-NEXT: rev64 v6.4s, v0.4s ; CHECK-NEXT: rev64 v7.4s, v3.4s -; CHECK-NEXT: ext v16.16b, v4.16b, v2.16b, #4 +; CHECK-NEXT: ext v16.16b, v2.16b, v4.16b, #4 ; CHECK-NEXT: ext v17.16b, v1.16b, v5.16b, #4 ; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s ; CHECK-NEXT: sub v3.4s, v3.4s, v7.4s -; CHECK-NEXT: mov v7.16b, v2.16b -; CHECK-NEXT: zip2 v6.4s, v16.4s, v4.4s +; CHECK-NEXT: mov v7.16b, v4.16b +; CHECK-NEXT: zip2 v6.4s, v16.4s, v2.4s ; CHECK-NEXT: mov v16.16b, v5.16b ; CHECK-NEXT: zip2 v17.4s, v17.4s, v1.4s ; CHECK-NEXT: ext v18.16b, v0.16b, v1.16b, #4 -; CHECK-NEXT: mov v7.s[2], v4.s[3] +; CHECK-NEXT: mov v7.s[2], v2.s[3] ; CHECK-NEXT: mov v21.16b, v3.16b ; CHECK-NEXT: mov v16.s[2], v1.s[3] ; CHECK-NEXT: ext v5.16b, v5.16b, v17.16b, #12 ; CHECK-NEXT: zip1 v17.4s, v1.4s, v1.4s -; CHECK-NEXT: ext v2.16b, v2.16b, v6.16b, #12 +; CHECK-NEXT: ext v4.16b, v4.16b, v6.16b, #12 ; CHECK-NEXT: ext v18.16b, v18.16b, v18.16b, #4 ; CHECK-NEXT: mov v19.16b, v7.16b -; CHECK-NEXT: ext v6.16b, v3.16b, v4.16b, #8 -; CHECK-NEXT: mov v21.s[2], v4.s[1] +; CHECK-NEXT: ext v6.16b, v3.16b, v2.16b, #8 +; CHECK-NEXT: mov v21.s[2], v2.s[1] ; CHECK-NEXT: mov v20.16b, v16.16b -; CHECK-NEXT: mov v19.s[1], v4.s[2] +; CHECK-NEXT: mov v19.s[1], v2.s[2] ; CHECK-NEXT: trn2 v0.4s, v17.4s, v0.4s ; CHECK-NEXT: sub v16.4s, v16.4s, v5.4s ; CHECK-NEXT: mov v17.16b, v18.16b ; CHECK-NEXT: ext v3.16b, v6.16b, v3.16b, #4 -; CHECK-NEXT: sub v7.4s, v7.4s, v2.4s +; CHECK-NEXT: sub v7.4s, v7.4s, v4.4s ; CHECK-NEXT: mov v20.s[1], v1.s[2] ; CHECK-NEXT: mov v17.s[0], v1.s[1] ; CHECK-NEXT: mov v1.16b, v21.16b -; CHECK-NEXT: add v2.4s, v19.4s, v2.4s -; CHECK-NEXT: uzp2 v3.4s, v6.4s, v3.4s +; CHECK-NEXT: add v4.4s, v19.4s, v4.4s ; CHECK-NEXT: add v5.4s, v20.4s, v5.4s -; CHECK-NEXT: mov v1.s[1], v4.s[0] -; CHECK-NEXT: sub v4.4s, v0.4s, v18.4s -; CHECK-NEXT: mov v2.d[1], v7.d[1] +; CHECK-NEXT: mov v1.s[1], v2.s[0] +; CHECK-NEXT: uzp2 v2.4s, v6.4s, v3.4s +; CHECK-NEXT: sub v3.4s, v0.4s, v18.4s ; CHECK-NEXT: add v0.4s, v0.4s, v17.4s +; CHECK-NEXT: mov v4.d[1], v7.d[1] ; CHECK-NEXT: mov v5.d[1], v16.d[1] -; CHECK-NEXT: sub v6.4s, v21.4s, v3.4s -; CHECK-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mov v0.d[1], v4.d[1] -; CHECK-NEXT: cmlt v4.8h, v2.8h, #0 -; CHECK-NEXT: cmlt v3.8h, v5.8h, #0 +; CHECK-NEXT: sub v6.4s, v21.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mov v0.d[1], v3.d[1] +; CHECK-NEXT: cmlt v3.8h, v4.8h, #0 +; CHECK-NEXT: cmlt v2.8h, v5.8h, #0 ; CHECK-NEXT: mov v1.d[1], v6.d[1] -; CHECK-NEXT: add v2.4s, v4.4s, v2.4s ; CHECK-NEXT: cmlt v6.8h, v0.8h, #0 -; CHECK-NEXT: add v5.4s, v3.4s, v5.4s -; CHECK-NEXT: eor v2.16b, v2.16b, v4.16b +; CHECK-NEXT: add v4.4s, v3.4s, 
v4.4s +; CHECK-NEXT: add v5.4s, v2.4s, v5.4s ; CHECK-NEXT: cmlt v7.8h, v1.8h, #0 ; CHECK-NEXT: add v0.4s, v6.4s, v0.4s -; CHECK-NEXT: eor v3.16b, v5.16b, v3.16b +; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b +; CHECK-NEXT: eor v2.16b, v5.16b, v2.16b ; CHECK-NEXT: add v1.4s, v7.4s, v1.4s ; CHECK-NEXT: eor v0.16b, v0.16b, v6.16b -; CHECK-NEXT: add v2.4s, v3.4s, v2.4s +; CHECK-NEXT: add v2.4s, v2.4s, v3.4s ; CHECK-NEXT: eor v1.16b, v1.16b, v7.16b ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s @@ -255,77 +255,76 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h ; CHECK-NEXT: saddw v2.4s, v7.4s, v2.4h ; CHECK-NEXT: zip1 v4.4s, v1.4s, v0.4s -; CHECK-NEXT: trn1 v18.4s, v1.4s, v0.4s +; CHECK-NEXT: trn1 v6.4s, v1.4s, v0.4s ; CHECK-NEXT: zip2 v0.4s, v1.4s, v0.4s ; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s -; CHECK-NEXT: mov v7.16b, v3.16b -; CHECK-NEXT: zip1 v6.4s, v2.4s, v3.4s -; CHECK-NEXT: zip2 v16.4s, v3.4s, v2.4s +; CHECK-NEXT: zip1 v7.4s, v2.4s, v3.4s +; CHECK-NEXT: trn2 v16.4s, v2.4s, v3.4s +; CHECK-NEXT: ext v18.16b, v3.16b, v3.16b, #12 ; CHECK-NEXT: ext v17.16b, v1.16b, v4.16b, #8 -; CHECK-NEXT: mov v7.s[0], v2.s[1] -; CHECK-NEXT: ext v1.16b, v3.16b, v3.16b, #12 +; CHECK-NEXT: zip2 v1.4s, v3.4s, v2.4s ; CHECK-NEXT: uzp2 v5.4s, v5.4s, v3.4s +; CHECK-NEXT: mov v16.d[1], v4.d[1] ; CHECK-NEXT: zip2 v3.4s, v2.4s, v3.4s -; CHECK-NEXT: mov v16.d[1], v18.d[1] -; CHECK-NEXT: mov v6.d[1], v17.d[1] -; CHECK-NEXT: mov v7.d[1], v4.d[1] -; CHECK-NEXT: ext v1.16b, v2.16b, v1.16b, #12 +; CHECK-NEXT: ext v2.16b, v2.16b, v18.16b, #12 +; CHECK-NEXT: mov v7.d[1], v17.d[1] +; CHECK-NEXT: mov v1.d[1], v6.d[1] ; CHECK-NEXT: mov v5.d[1], v0.d[1] -; CHECK-NEXT: mov v3.d[1], v18.d[1] -; CHECK-NEXT: add v2.4s, v7.4s, v6.4s -; CHECK-NEXT: mov v1.d[1], v0.d[1] -; CHECK-NEXT: add v4.4s, v5.4s, v16.4s -; CHECK-NEXT: rev64 v5.4s, v2.4s -; CHECK-NEXT: rev64 v0.4s, v4.4s -; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s -; CHECK-NEXT: sub v3.4s, v6.4s, v7.4s -; CHECK-NEXT: mov v5.d[1], v2.d[1] -; CHECK-NEXT: add v6.4s, v1.4s, v3.4s -; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mov v0.d[1], v4.d[1] -; CHECK-NEXT: add v4.4s, v4.4s, v5.4s -; CHECK-NEXT: sub v0.4s, v2.4s, v0.4s -; CHECK-NEXT: zip1 v2.4s, v4.4s, v6.4s -; CHECK-NEXT: uzp2 v3.4s, v4.4s, v6.4s -; CHECK-NEXT: zip2 v16.4s, v4.4s, v6.4s -; CHECK-NEXT: zip1 v5.4s, v0.4s, v1.4s -; CHECK-NEXT: trn1 v7.4s, v0.4s, v1.4s -; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s -; CHECK-NEXT: trn2 v2.4s, v4.4s, v2.4s -; CHECK-NEXT: uzp2 v3.4s, v3.4s, v4.4s -; CHECK-NEXT: mov v4.s[1], v6.s[1] -; CHECK-NEXT: ext v0.16b, v0.16b, v5.16b, #8 -; CHECK-NEXT: mov v16.d[1], v7.d[1] -; CHECK-NEXT: mov v3.d[1], v1.d[1] -; CHECK-NEXT: mov v4.d[1], v5.d[1] +; CHECK-NEXT: mov v3.d[1], v6.d[1] ; CHECK-NEXT: mov v2.d[1], v0.d[1] -; CHECK-NEXT: add v0.4s, v16.4s, v3.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v16.4s -; CHECK-NEXT: add v1.4s, v4.4s, v2.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s +; CHECK-NEXT: add v4.4s, v16.4s, v7.4s +; CHECK-NEXT: sub v6.4s, v7.4s, v16.4s +; CHECK-NEXT: add v1.4s, v5.4s, v1.4s +; CHECK-NEXT: sub v2.4s, v3.4s, v2.4s +; CHECK-NEXT: rev64 v5.4s, v4.4s +; CHECK-NEXT: rev64 v0.4s, v1.4s +; CHECK-NEXT: add v3.4s, v2.4s, v6.4s +; CHECK-NEXT: sub v2.4s, v6.4s, v2.4s +; CHECK-NEXT: mov v5.d[1], v4.d[1] +; CHECK-NEXT: mov v0.d[1], v1.d[1] +; CHECK-NEXT: add v1.4s, v1.4s, v5.4s +; CHECK-NEXT: sub v0.4s, v4.4s, v0.4s +; CHECK-NEXT: zip1 v4.4s, v1.4s, v3.4s +; CHECK-NEXT: uzp2 v5.4s, v1.4s, v3.4s +; 
CHECK-NEXT: zip2 v7.4s, v1.4s, v3.4s +; CHECK-NEXT: zip1 v6.4s, v0.4s, v2.4s +; CHECK-NEXT: trn1 v16.4s, v0.4s, v2.4s +; CHECK-NEXT: zip2 v2.4s, v0.4s, v2.4s +; CHECK-NEXT: trn2 v4.4s, v1.4s, v4.4s +; CHECK-NEXT: uzp2 v5.4s, v5.4s, v1.4s +; CHECK-NEXT: mov v1.s[1], v3.s[1] +; CHECK-NEXT: ext v0.16b, v0.16b, v6.16b, #8 +; CHECK-NEXT: mov v7.d[1], v16.d[1] +; CHECK-NEXT: mov v5.d[1], v2.d[1] +; CHECK-NEXT: mov v1.d[1], v6.d[1] +; CHECK-NEXT: mov v4.d[1], v0.d[1] +; CHECK-NEXT: add v0.4s, v7.4s, v5.4s +; CHECK-NEXT: sub v3.4s, v5.4s, v7.4s +; CHECK-NEXT: add v2.4s, v1.4s, v4.4s +; CHECK-NEXT: sub v1.4s, v4.4s, v1.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #4 ; CHECK-NEXT: zip2 v6.4s, v0.4s, v3.4s ; CHECK-NEXT: zip2 v7.4s, v3.4s, v0.4s -; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #4 -; CHECK-NEXT: zip2 v16.4s, v2.4s, v1.4s -; CHECK-NEXT: zip2 v17.4s, v1.4s, v2.4s +; CHECK-NEXT: ext v5.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: zip2 v16.4s, v1.4s, v2.4s +; CHECK-NEXT: zip2 v17.4s, v2.4s, v1.4s ; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s -; CHECK-NEXT: zip1 v1.4s, v1.4s, v2.4s ; CHECK-NEXT: ext v18.16b, v4.16b, v3.16b, #8 -; CHECK-NEXT: ext v19.16b, v5.16b, v2.16b, #8 +; CHECK-NEXT: ext v19.16b, v5.16b, v1.16b, #8 +; CHECK-NEXT: zip1 v1.4s, v2.4s, v1.4s ; CHECK-NEXT: add v2.4s, v16.4s, v7.4s ; CHECK-NEXT: sub v3.4s, v6.4s, v17.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ext v4.16b, v18.16b, v4.16b, #4 -; CHECK-NEXT: cmlt v1.8h, v3.8h, #0 ; CHECK-NEXT: cmlt v6.8h, v2.8h, #0 ; CHECK-NEXT: ext v5.16b, v19.16b, v5.16b, #4 +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: cmlt v1.8h, v3.8h, #0 ; CHECK-NEXT: add v2.4s, v6.4s, v2.4s ; CHECK-NEXT: add v3.4s, v1.4s, v3.4s ; CHECK-NEXT: add v4.4s, v5.4s, v4.4s ; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 -; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b ; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b +; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b ; CHECK-NEXT: cmlt v7.8h, v4.8h, #0 ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s @@ -480,7 +479,7 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: sub v3.4s, v3.4s, v7.4s ; CHECK-NEXT: uzp2 v4.4s, v1.4s, v0.4s ; CHECK-NEXT: uzp1 v7.4s, v1.4s, v0.4s -; CHECK-NEXT: mov v6.s[3], v5.s[2] +; CHECK-NEXT: trn1 v6.4s, v6.4s, v5.4s ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: zip2 v17.4s, v2.4s, v3.4s ; CHECK-NEXT: zip1 v2.4s, v2.4s, v3.4s diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll deleted file mode 100644 index bff4771319454..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll +++ /dev/null @@ -1,270 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12 - -define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) { -; GFX10-LABEL: global_atomic_csub: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc -; GFX10-NEXT: s_waitcnt 
vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_atomic_csub: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: global_atomic_csub: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst, !amdgpu.no.remote.memory !0 - ret i32 %ret -} - -define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) { -; GFX10-LABEL: global_atomic_csub_offset: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_atomic_csub_offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: global_atomic_csub_offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 - %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0 - ret i32 %ret -} - -define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) { -; GFX10-LABEL: global_atomic_csub_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_atomic_csub_nortn: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc 
-; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: global_atomic_csub_nortn: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst, !amdgpu.no.remote.memory !0 - ret void -} - -define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) { -; GFX10-LABEL: global_atomic_csub_offset_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-LABEL: global_atomic_csub_offset_nortn: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: global_atomic_csub_offset_nortn: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] - %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 - %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0 - ret void -} - -define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) %ptr, i32 %data) { -; GFX10-LABEL: global_atomic_csub_sgpr_base_offset: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: global_store_dword v[0:1], v0, off -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_csub_sgpr_base_offset: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: 
global_atomic_csub_u32 v0, v1, v0, s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: global_store_b32 v[0:1], v0, off -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: global_atomic_csub_sgpr_base_offset: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: global_store_b32 v[0:1], v0, off -; GFX12-NEXT: s_endpgm - %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 - %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0 - store i32 %ret, ptr addrspace(1) poison - ret void -} - -define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspace(1) %ptr, i32 %data) { -; GFX10-LABEL: global_atomic_csub_sgpr_base_offset_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_gl1_inv -; GFX10-NEXT: buffer_gl0_inv -; GFX10-NEXT: s_endpgm -; -; GFX11-LABEL: global_atomic_csub_sgpr_base_offset_nortn: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2 -; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_gl1_inv -; GFX11-NEXT: buffer_gl0_inv -; GFX11-NEXT: s_endpgm -; -; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX12-NEXT: global_wb scope:SCOPE_SYS -; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: s_endpgm - %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024 - %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0 - ret void -} - -attributes #0 = { nounwind willreturn } -attributes #1 = { argmemonly nounwind } - -!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll index 4af2d58b01518..d281492c647f1 100644 --- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll +++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll @@ -2,190 +2,222 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s -declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3), i32) -declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32) -declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32) - -define amdgpu_kernel void 
@flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) { -; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32: +define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32(ptr %addr, i32 %in) { +; GFX12-SDAG-LABEL: flat_atomic_usub_cond_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32: +; GFX12-GISEL-LABEL: flat_atomic_usub_cond_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i32, ptr %addr, i32 -4 - %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) + %gep = getelementptr i32, ptr %addr, i32 -4 + %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { -; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: +define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { +; GFX12-SDAG-LABEL: flat_atomic_usub_cond_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced: +; GFX12-GISEL-LABEL: flat_atomic_usub_cond_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_endpgm entry: - %gep = getelementptr inbounds i32, ptr %addr, i32 -4 - %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) + %gep = getelementptr i32, ptr %addr, i32 -4 + %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr %use) { -; GFX12-SDAG-LABEL: flat_atomic_cond_sub_rtn_u32: +define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr %use) { +; GFX12-SDAG-LABEL: flat_atomic_usub_cond_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-SDAG-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: flat_atomic_cond_sub_rtn_u32: +; GFX12-GISEL-LABEL: flat_atomic_usub_cond_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 16 +; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX12-GISEL-NEXT: flat_store_b32 v[0:1], v2 ; GFX12-GISEL-NEXT: 
s_endpgm entry: - %gep = getelementptr inbounds i32, ptr %addr, i32 4 - %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in) + %gep = getelementptr i32, ptr %addr, i32 4 + %val = atomicrmw usub_cond ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 store i32 %val, ptr %use ret void } -define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) { -; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32: +define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) { +; GFX12-SDAG-LABEL: global_atomic_usub_cond_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32: +; GFX12-GISEL-LABEL: global_atomic_usub_cond_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 -; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4 - %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in) + %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { -; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: +define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { +; GFX12-SDAG-LABEL: global_atomic_usub_cond_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS +; 
GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced: +; GFX12-GISEL-LABEL: global_atomic_usub_cond_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16 -; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4 - %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in) + %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) { -; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32: +define amdgpu_kernel void @global_atomic_usub_cond_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) { +; GFX12-SDAG-LABEL: global_atomic_usub_cond_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_clause 0x1 ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[4:5] ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32: +; GFX12-GISEL-LABEL: global_atomic_usub_cond_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_clause 0x1 ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 16 -; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] 
offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5] ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(1) %addr, i32 4 - %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in) + %val = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 store i32 %val, ptr addrspace(1) %use ret void } -define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) { -; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32: +define amdgpu_kernel void @ds_usub_cond_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) { +; GFX12-SDAG-LABEL: ds_usub_cond_no_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -193,9 +225,11 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %i ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 ; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32: +; GFX12-GISEL-LABEL: ds_usub_cond_no_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -203,15 +237,17 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %i ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 ; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4 - %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in) + %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { -; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced: +define amdgpu_kernel void @ds_usub_cond_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" { +; GFX12-SDAG-LABEL: ds_usub_cond_no_rtn_u32_forced: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 @@ -219,9 +255,11 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, ; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 ; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1 +; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced: +; GFX12-GISEL-LABEL: ds_usub_cond_no_rtn_u32_forced: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 @@ -229,38 +267,44 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, ; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 ; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1 +; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; 
GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4 - %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in) + %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 ret void } -define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) { -; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32: +define amdgpu_kernel void @ds_usub_cond_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) { +; GFX12-SDAG-LABEL: ds_usub_cond_rtn_u32: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0 +; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE +; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-SDAG-NEXT: ds_store_b32 v1, v0 ; GFX12-SDAG-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32: +; GFX12-GISEL-LABEL: ds_usub_cond_rtn_u32: ; GFX12-GISEL: ; %bb.0: ; %entry ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0 +; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 ; GFX12-GISEL-NEXT: ds_store_b32 v1, v0 ; GFX12-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i32, ptr addrspace(3) %addr, i32 4 - %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in) + %val = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0 store i32 %val, ptr addrspace(3) %use ret void } + +!0 = !{} diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll new file mode 100644 index 0000000000000..26acb4604cbcb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll @@ -0,0 +1,469 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck %s --check-prefix=GFX950 + +; Ensure we don't crash with: "Cannot scavenge register in FI elimination!" 
+define amdgpu_kernel void @issue155902(i64 %arg, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15, i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23, i64 %arg24, i64 %arg25, i64 %arg26, i64 %arg27, i64 %arg28, i64 %arg29, i64 %arg30, i64 %arg31, i64 %arg32, i64 %arg33, i64 %arg34, i64 %arg35, i64 %arg36, i64 %arg37, i64 %arg38, i64 %arg39, i64 %arg40, i64 %arg41, i64 %arg42, i64 %arg43, i64 %arg44, i64 %arg45, i64 %arg46, i64 %arg47, i64 %arg48, i64 %arg49) { +; GFX950-LABEL: issue155902: +; GFX950: ; %bb.0: ; %bb +; GFX950-NEXT: s_mov_b32 s33, 0x4008 +; GFX950-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane +; GFX950-NEXT: v_writelane_b32 v2, s33, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX950-NEXT: s_load_dwordx2 vcc, s[2:3], 0x8 +; GFX950-NEXT: s_load_dwordx2 s[98:99], s[2:3], 0x10 +; GFX950-NEXT: s_load_dwordx2 s[96:97], s[2:3], 0x18 +; GFX950-NEXT: s_load_dwordx2 s[94:95], s[2:3], 0x20 +; GFX950-NEXT: s_load_dwordx2 s[92:93], s[2:3], 0x28 +; GFX950-NEXT: s_load_dwordx2 s[90:91], s[2:3], 0x30 +; GFX950-NEXT: s_load_dwordx2 s[88:89], s[2:3], 0x38 +; GFX950-NEXT: s_load_dwordx2 s[86:87], s[2:3], 0x40 +; GFX950-NEXT: s_load_dwordx2 s[84:85], s[2:3], 0x48 +; GFX950-NEXT: s_load_dwordx2 s[82:83], s[2:3], 0x50 +; GFX950-NEXT: s_load_dwordx2 s[80:81], s[2:3], 0x58 +; GFX950-NEXT: s_load_dwordx2 s[78:79], s[2:3], 0x60 +; GFX950-NEXT: s_load_dwordx2 s[76:77], s[2:3], 0x68 +; GFX950-NEXT: s_load_dwordx2 s[74:75], s[2:3], 0x70 +; GFX950-NEXT: s_load_dwordx2 s[72:73], s[2:3], 0x78 +; GFX950-NEXT: s_load_dwordx2 s[70:71], s[2:3], 0x80 +; GFX950-NEXT: s_load_dwordx2 s[68:69], s[2:3], 0x88 +; GFX950-NEXT: s_load_dwordx2 s[66:67], s[2:3], 0x90 +; GFX950-NEXT: s_load_dwordx2 s[64:65], s[2:3], 0x98 +; GFX950-NEXT: s_load_dwordx2 s[62:63], s[2:3], 0xa0 +; GFX950-NEXT: s_load_dwordx2 s[60:61], s[2:3], 0xa8 +; GFX950-NEXT: s_load_dwordx2 s[58:59], s[2:3], 0xb0 +; GFX950-NEXT: s_load_dwordx2 s[56:57], s[2:3], 0xb8 +; GFX950-NEXT: s_load_dwordx2 s[54:55], s[2:3], 0xc0 +; GFX950-NEXT: s_load_dwordx2 s[52:53], s[2:3], 0xc8 +; GFX950-NEXT: s_load_dwordx2 s[50:51], s[2:3], 0xd0 +; GFX950-NEXT: s_load_dwordx2 s[48:49], s[2:3], 0xd8 +; GFX950-NEXT: s_load_dwordx2 s[46:47], s[2:3], 0xe0 +; GFX950-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0xe8 +; GFX950-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0xf0 +; GFX950-NEXT: s_load_dwordx2 s[40:41], s[2:3], 0xf8 +; GFX950-NEXT: s_load_dwordx2 s[38:39], s[2:3], 0x100 +; GFX950-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x108 +; GFX950-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x110 +; GFX950-NEXT: s_load_dwordx2 s[30:31], s[2:3], 0x118 +; GFX950-NEXT: s_load_dwordx2 s[28:29], s[2:3], 0x120 +; GFX950-NEXT: s_load_dwordx2 s[26:27], s[2:3], 0x128 +; GFX950-NEXT: s_load_dwordx2 s[24:25], s[2:3], 0x130 +; GFX950-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x138 +; GFX950-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x140 +; GFX950-NEXT: s_load_dwordx2 s[18:19], s[2:3], 0x148 +; GFX950-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x150 +; GFX950-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x158 +; GFX950-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x160 +; GFX950-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x168 +; GFX950-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x170 +; GFX950-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x178 +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x180 +; GFX950-NEXT: s_nop 0 +; 
GFX950-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x188 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x4008 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_store_dwordx2 v3, v[0:1], off sc0 sc1 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s33 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0x384 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_readlane_b32 s0, v2, 0 +; GFX950-NEXT: s_nop 4 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[98:99] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[96:97] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[94:95] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[92:93] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[90:91] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[88:89] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[86:87] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[84:85] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[82:83] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[80:81] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[78:79] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[76:77] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[74:75] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[72:73] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[70:71] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[68:69] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[66:67] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[64:65] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[62:63] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[60:61] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[58:59] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[56:57] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[54:55] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[52:53] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[50:51] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[48:49] +; 
GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[46:47] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[44:45] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[42:43] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[40:41] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[38:39] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[34:35] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[30:31] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[28:29] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[26:27] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[22:23] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[20:21] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[18:19] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[14:15] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[10:11] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: s_endpgm +bb: + %alloca.big = alloca [4096 x i32], align 4, addrspace(5) + %alloca304 = alloca [2 x i64], align 8, addrspace(5) + %alloca307 = alloca i64, align 8, addrspace(5) + store [2 x i64] zeroinitializer, ptr addrspace(5) %alloca304, align 8 + store i64 900, ptr addrspace(5) %alloca307, align 8 + store i64 %arg, ptr addrspace(5) %alloca307, align 8 + store i64 %arg1, ptr addrspace(5) %alloca307, align 8 + store i64 %arg2, ptr addrspace(5) %alloca307, align 8 + store i64 %arg3, ptr addrspace(5) %alloca307, align 8 + store i64 %arg4, ptr addrspace(5) %alloca307, align 8 + store i64 %arg5, ptr addrspace(5) %alloca307, align 8 + store i64 %arg6, ptr addrspace(5) %alloca307, align 8 + store i64 %arg7, ptr addrspace(5) %alloca307, align 8 + store i64 %arg8, ptr addrspace(5) %alloca307, align 8 + store i64 %arg9, ptr addrspace(5) %alloca307, align 8 + store i64 %arg10, ptr addrspace(5) %alloca307, align 8 + store i64 %arg11, ptr addrspace(5) %alloca307, align 8 + store i64 %arg12, ptr addrspace(5) %alloca307, align 8 + store 
i64 %arg13, ptr addrspace(5) %alloca307, align 8 + store i64 %arg14, ptr addrspace(5) %alloca307, align 8 + store i64 %arg15, ptr addrspace(5) %alloca307, align 8 + store i64 %arg16, ptr addrspace(5) %alloca307, align 8 + store i64 %arg17, ptr addrspace(5) %alloca307, align 8 + store i64 %arg18, ptr addrspace(5) %alloca307, align 8 + store i64 %arg19, ptr addrspace(5) %alloca307, align 8 + store i64 %arg20, ptr addrspace(5) %alloca307, align 8 + store i64 %arg21, ptr addrspace(5) %alloca307, align 8 + store i64 %arg22, ptr addrspace(5) %alloca307, align 8 + store i64 %arg23, ptr addrspace(5) %alloca307, align 8 + store i64 %arg24, ptr addrspace(5) %alloca307, align 8 + store i64 %arg25, ptr addrspace(5) %alloca307, align 8 + store i64 %arg26, ptr addrspace(5) %alloca307, align 8 + store i64 %arg27, ptr addrspace(5) %alloca307, align 8 + store i64 %arg28, ptr addrspace(5) %alloca307, align 8 + store i64 %arg29, ptr addrspace(5) %alloca307, align 8 + store i64 %arg30, ptr addrspace(5) %alloca307, align 8 + store i64 %arg31, ptr addrspace(5) %alloca307, align 8 + store i64 %arg32, ptr addrspace(5) %alloca307, align 8 + store i64 %arg33, ptr addrspace(5) %alloca307, align 8 + store i64 %arg34, ptr addrspace(5) %alloca307, align 8 + store i64 %arg35, ptr addrspace(5) %alloca307, align 8 + store i64 %arg36, ptr addrspace(5) %alloca307, align 8 + store i64 %arg37, ptr addrspace(5) %alloca307, align 8 + store i64 %arg38, ptr addrspace(5) %alloca307, align 8 + store i64 %arg39, ptr addrspace(5) %alloca307, align 8 + store i64 %arg40, ptr addrspace(5) %alloca307, align 8 + store i64 %arg41, ptr addrspace(5) %alloca307, align 8 + store i64 %arg42, ptr addrspace(5) %alloca307, align 8 + store i64 %arg43, ptr addrspace(5) %alloca307, align 8 + store i64 %arg44, ptr addrspace(5) %alloca307, align 8 + store i64 %arg45, ptr addrspace(5) %alloca307, align 8 + store i64 %arg46, ptr addrspace(5) %alloca307, align 8 + store i64 %arg47, ptr addrspace(5) %alloca307, align 8 + store i64 %arg48, ptr addrspace(5) %alloca307, align 8 + store i64 %arg49, ptr addrspace(5) %alloca307, align 8 + ret void +} + +define amdgpu_kernel void @issue155902_fp(i64 %arg, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15, i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23, i64 %arg24, i64 %arg25, i64 %arg26, i64 %arg27, i64 %arg28, i64 %arg29, i64 %arg30, i64 %arg31, i64 %arg32, i64 %arg33, i64 %arg34, i64 %arg35, i64 %arg36, i64 %arg37, i64 %arg38, i64 %arg39, i64 %arg40, i64 %arg41, i64 %arg42, i64 %arg43, i64 %arg44, i64 %arg45, i64 %arg46, i64 %arg47, i64 %arg48, i64 %arg49) #0 { +; GFX950-LABEL: issue155902_fp: +; GFX950: ; %bb.0: ; %bb +; GFX950-NEXT: s_mov_b32 s33, 0 +; GFX950-NEXT: s_add_i32 s1, s33, 0x4008 +; GFX950-NEXT: s_mov_b32 s0, s1 +; GFX950-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane +; GFX950-NEXT: v_writelane_b32 v2, s0, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: v_writelane_b32 v2, s4, 1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_writelane_b32 v2, s5, 2 +; GFX950-NEXT: s_load_dwordx2 vcc, s[2:3], 0x8 +; GFX950-NEXT: s_load_dwordx2 s[98:99], s[2:3], 0x10 +; GFX950-NEXT: s_load_dwordx2 s[96:97], s[2:3], 0x18 +; GFX950-NEXT: s_load_dwordx2 s[94:95], s[2:3], 0x20 +; GFX950-NEXT: s_load_dwordx2 s[92:93], s[2:3], 0x28 +; GFX950-NEXT: 
s_load_dwordx2 s[90:91], s[2:3], 0x30 +; GFX950-NEXT: s_load_dwordx2 s[88:89], s[2:3], 0x38 +; GFX950-NEXT: s_load_dwordx2 s[86:87], s[2:3], 0x40 +; GFX950-NEXT: s_load_dwordx2 s[84:85], s[2:3], 0x48 +; GFX950-NEXT: s_load_dwordx2 s[82:83], s[2:3], 0x50 +; GFX950-NEXT: s_load_dwordx2 s[80:81], s[2:3], 0x58 +; GFX950-NEXT: s_load_dwordx2 s[78:79], s[2:3], 0x60 +; GFX950-NEXT: s_load_dwordx2 s[76:77], s[2:3], 0x68 +; GFX950-NEXT: s_load_dwordx2 s[74:75], s[2:3], 0x70 +; GFX950-NEXT: s_load_dwordx2 s[72:73], s[2:3], 0x78 +; GFX950-NEXT: s_load_dwordx2 s[70:71], s[2:3], 0x80 +; GFX950-NEXT: s_load_dwordx2 s[68:69], s[2:3], 0x88 +; GFX950-NEXT: s_load_dwordx2 s[66:67], s[2:3], 0x90 +; GFX950-NEXT: s_load_dwordx2 s[64:65], s[2:3], 0x98 +; GFX950-NEXT: s_load_dwordx2 s[62:63], s[2:3], 0xa0 +; GFX950-NEXT: s_load_dwordx2 s[60:61], s[2:3], 0xa8 +; GFX950-NEXT: s_load_dwordx2 s[58:59], s[2:3], 0xb0 +; GFX950-NEXT: s_load_dwordx2 s[56:57], s[2:3], 0xb8 +; GFX950-NEXT: s_load_dwordx2 s[54:55], s[2:3], 0xc0 +; GFX950-NEXT: s_load_dwordx2 s[52:53], s[2:3], 0xc8 +; GFX950-NEXT: s_load_dwordx2 s[50:51], s[2:3], 0xd0 +; GFX950-NEXT: s_load_dwordx2 s[48:49], s[2:3], 0xd8 +; GFX950-NEXT: s_load_dwordx2 s[46:47], s[2:3], 0xe0 +; GFX950-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0xe8 +; GFX950-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0xf0 +; GFX950-NEXT: s_load_dwordx2 s[40:41], s[2:3], 0xf8 +; GFX950-NEXT: s_load_dwordx2 s[38:39], s[2:3], 0x100 +; GFX950-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x108 +; GFX950-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x110 +; GFX950-NEXT: s_load_dwordx2 s[30:31], s[2:3], 0x118 +; GFX950-NEXT: s_load_dwordx2 s[28:29], s[2:3], 0x120 +; GFX950-NEXT: s_load_dwordx2 s[26:27], s[2:3], 0x128 +; GFX950-NEXT: s_load_dwordx2 s[24:25], s[2:3], 0x130 +; GFX950-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x138 +; GFX950-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x140 +; GFX950-NEXT: s_load_dwordx2 s[18:19], s[2:3], 0x148 +; GFX950-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x150 +; GFX950-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x158 +; GFX950-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x160 +; GFX950-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x168 +; GFX950-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x170 +; GFX950-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x178 +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x180 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x188 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX950-NEXT: s_add_i32 s1, s33, 0x4008 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s1 offset:8 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0x384 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_readlane_b32 s0, v2, 1 +; GFX950-NEXT: v_readlane_b32 s1, v2, 2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_readlane_b32 s0, v2, 0 +; GFX950-NEXT: s_nop 4 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[98:99] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[96:97] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[94:95] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[92:93] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], 
s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[90:91] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[88:89] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[86:87] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[84:85] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[82:83] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[80:81] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[78:79] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[76:77] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[74:75] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[72:73] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[70:71] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[68:69] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[66:67] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[64:65] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[62:63] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[60:61] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[58:59] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[56:57] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[54:55] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[52:53] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[50:51] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[48:49] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[46:47] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[44:45] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[42:43] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[40:41] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[38:39] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[34:35] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[30:31] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[28:29] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[26:27] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 
offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[22:23] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[20:21] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[18:19] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[14:15] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[10:11] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: s_endpgm +bb: + %alloca.big = alloca [4096 x i32], align 4, addrspace(5) + %alloca304 = alloca [2 x i64], align 8, addrspace(5) + %alloca307 = alloca i64, align 8, addrspace(5) + store [2 x i64] zeroinitializer, ptr addrspace(5) %alloca304, align 8 + store i64 900, ptr addrspace(5) %alloca307, align 8 + store i64 %arg, ptr addrspace(5) %alloca307, align 8 + store i64 %arg1, ptr addrspace(5) %alloca307, align 8 + store i64 %arg2, ptr addrspace(5) %alloca307, align 8 + store i64 %arg3, ptr addrspace(5) %alloca307, align 8 + store i64 %arg4, ptr addrspace(5) %alloca307, align 8 + store i64 %arg5, ptr addrspace(5) %alloca307, align 8 + store i64 %arg6, ptr addrspace(5) %alloca307, align 8 + store i64 %arg7, ptr addrspace(5) %alloca307, align 8 + store i64 %arg8, ptr addrspace(5) %alloca307, align 8 + store i64 %arg9, ptr addrspace(5) %alloca307, align 8 + store i64 %arg10, ptr addrspace(5) %alloca307, align 8 + store i64 %arg11, ptr addrspace(5) %alloca307, align 8 + store i64 %arg12, ptr addrspace(5) %alloca307, align 8 + store i64 %arg13, ptr addrspace(5) %alloca307, align 8 + store i64 %arg14, ptr addrspace(5) %alloca307, align 8 + store i64 %arg15, ptr addrspace(5) %alloca307, align 8 + store i64 %arg16, ptr addrspace(5) %alloca307, align 8 + store i64 %arg17, ptr addrspace(5) %alloca307, align 8 + store i64 %arg18, ptr addrspace(5) %alloca307, align 8 + store i64 %arg19, ptr addrspace(5) %alloca307, align 8 + store i64 %arg20, ptr addrspace(5) %alloca307, align 8 + store i64 %arg21, ptr addrspace(5) %alloca307, align 8 + store i64 %arg22, ptr addrspace(5) %alloca307, align 8 + store i64 %arg23, ptr addrspace(5) %alloca307, align 8 + store i64 %arg24, ptr addrspace(5) %alloca307, align 8 + store i64 %arg25, ptr addrspace(5) %alloca307, align 8 + store i64 %arg26, ptr addrspace(5) %alloca307, align 8 + store i64 %arg27, ptr addrspace(5) %alloca307, align 8 + store i64 %arg28, ptr addrspace(5) %alloca307, align 8 + store i64 %arg29, ptr addrspace(5) %alloca307, align 8 + store i64 %arg30, ptr addrspace(5) %alloca307, align 8 + store i64 %arg31, ptr addrspace(5) %alloca307, align 8 + store i64 %arg32, ptr addrspace(5) %alloca307, align 8 + store i64 %arg33, ptr 
addrspace(5) %alloca307, align 8 + store i64 %arg34, ptr addrspace(5) %alloca307, align 8 + store i64 %arg35, ptr addrspace(5) %alloca307, align 8 + store i64 %arg36, ptr addrspace(5) %alloca307, align 8 + store i64 %arg37, ptr addrspace(5) %alloca307, align 8 + store i64 %arg38, ptr addrspace(5) %alloca307, align 8 + store i64 %arg39, ptr addrspace(5) %alloca307, align 8 + store i64 %arg40, ptr addrspace(5) %alloca307, align 8 + store i64 %arg41, ptr addrspace(5) %alloca307, align 8 + store i64 %arg42, ptr addrspace(5) %alloca307, align 8 + store i64 %arg43, ptr addrspace(5) %alloca307, align 8 + store i64 %arg44, ptr addrspace(5) %alloca307, align 8 + store i64 %arg45, ptr addrspace(5) %alloca307, align 8 + store i64 %arg46, ptr addrspace(5) %alloca307, align 8 + store i64 %arg47, ptr addrspace(5) %alloca307, align 8 + store i64 %arg48, ptr addrspace(5) %alloca307, align 8 + store i64 %arg49, ptr addrspace(5) %alloca307, align 8 + ret void +} + +attributes #0 = { "frame-pointer"="all" } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll deleted file mode 100644 index 243cd59c6a821..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll +++ /dev/null @@ -1,219 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12 - -define float @raw_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: raw_buffer_atomic_cond_sub_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -define void @raw_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: raw_buffer_atomic_cond_sub_no_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) - ret void -} - -define void @raw_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { -; GFX12-LABEL: raw_buffer_atomic_cond_sub_no_return_forced: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0) - ret void -} - -define float 
@raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_no_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) - ret void -} - -define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { -; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_no_return_forced: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v0, s16 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0) - ret void -} - -define float @struct_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: struct_buffer_atomic_cond_sub_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s16 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -define void @struct_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: struct_buffer_atomic_cond_sub_no_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] 
-main_body: - %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) - ret void -} - -define void @struct_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { -; GFX12-LABEL: struct_buffer_atomic_cond_sub_no_return_forced: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0) - ret void -} - -define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s16 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) - %r = bitcast i32 %orig to float - ret float %r -} - -define void @struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 { -; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_no_return: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) - ret void -} - -define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 { -; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_no_return_forced: -; GFX12: ; %bb.0: ; %main_body -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16 -; GFX12-NEXT: s_mov_b32 s4, 4 -; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen -; GFX12-NEXT: s_setpc_b64 s[30:31] -main_body: - %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0) - ret void -} - -declare i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32) #0 -declare i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 - -attributes #0 = { nounwind } -attributes #1 = { nounwind "target-features"="+atomic-csub-no-rtn-insts" } - diff --git 
a/llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll
new file mode 100644
index 0000000000000..6d9ad8d3ad436
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll
@@ -0,0 +1,247 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=nvptx-ir-peephole -S | FileCheck %s
+
+target triple = "nvptx64-nvidia-cuda"
+
+; fsub(fmul(a, b), c) => fma(a, b, fneg(c))
+define float @test_fsub_fmul_c(float %a, float %b, float %c) {
+; CHECK-LABEL: define float @test_fsub_fmul_c(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = fneg contract float [[C]]
+; CHECK-NEXT: [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[TMP1]])
+; CHECK-NEXT: ret float [[TMP2]]
+;
+ %mul = fmul contract float %a, %b
+ %sub = fsub contract float %mul, %c
+ ret float %sub
+}
+
+
+; fsub(c, fmul(a, b)) => fma(-a, b, c)
+define float @test_fsub_c_fmul(float %a, float %b, float %c) {
+; CHECK-LABEL: define float @test_fsub_c_fmul(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = fneg contract float [[A]]
+; CHECK-NEXT: [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[TMP1]], float [[B]], float [[C]])
+; CHECK-NEXT: ret float [[TMP2]]
+;
+ %mul = fmul contract float %a, %b
+ %sub = fsub contract float %c, %mul
+ ret float %sub
+}
+
+
+; fsub(fmul(a, b), fmul(c, d)) => fma(a, b, fneg(fmul(c, d)))
+define float @test_fsub_fmul_fmul(float %a, float %b, float %c, float %d) {
+; CHECK-LABEL: define float @test_fsub_fmul_fmul(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]]) {
+; CHECK-NEXT: [[MUL2:%.*]] = fmul contract float [[C]], [[D]]
+; CHECK-NEXT: [[TMP1:%.*]] = fneg contract float [[MUL2]]
+; CHECK-NEXT: [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[TMP1]])
+; CHECK-NEXT: ret float [[TMP2]]
+;
+ %mul1 = fmul contract float %a, %b
+ %mul2 = fmul contract float %c, %d
+ %sub = fsub contract float %mul1, %mul2
+ ret float %sub
+}
+
+
+; fsub(fmul(a, b), fmul(c, d)) => fma(fneg(c), d, fmul(a, b))
+; fmul(a, b) has multiple uses.
+define float @test_fsub_fmul_fmul_multiple_use(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: define float @test_fsub_fmul_fmul_multiple_use( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]]) { +; CHECK-NEXT: [[MUL1:%.*]] = fmul contract float [[A]], [[B]] +; CHECK-NEXT: [[TMP1:%.*]] = fneg contract float [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[TMP1]], float [[D]], float [[MUL1]]) +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP2]], [[MUL1]] +; CHECK-NEXT: ret float [[ADD]] +; + %mul1 = fmul contract float %a, %b + %mul2 = fmul contract float %c, %d + %sub = fsub contract float %mul1, %mul2 + %add = fadd float %sub, %mul1 + ret float %add +} + + +; fsub(fmul(a, b), c) => fma(a, b, fneg(c)) where fsub and fmul are in different BBs +define float @test_fsub_fmul_different_BB(float %a, float %b, float %c, i32 %n) { +; CHECK-LABEL: define float @test_fsub_fmul_different_BB( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[INIT:.*]]: +; CHECK-NEXT: [[CMP_ITER:%.*]] = icmp sgt i32 [[N]], 10 +; CHECK-NEXT: br i1 [[CMP_ITER]], label %[[ITERATION:.*]], label %[[EXIT:.*]] +; CHECK: [[ITERATION]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[INIT]] ], [ [[I_NEXT:%.*]], %[[ITERATION]] ] +; CHECK-NEXT: [[ACC:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT:%.*]], %[[ITERATION]] ] +; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[ACC_NEXT]] = fadd contract float [[ACC]], 1.000000e+00 +; CHECK-NEXT: [[CMP_LOOP:%.*]] = icmp slt i32 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[CMP_LOOP]], label %[[ITERATION]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[C_PHI:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT]], %[[ITERATION]] ] +; CHECK-NEXT: [[TMP0:%.*]] = fneg contract float [[C_PHI]] +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[TMP0]]) +; CHECK-NEXT: ret float [[TMP1]] +; +init: + %mul = fmul contract float %a, %b + %cmp_iter = icmp sgt i32 %n, 10 + br i1 %cmp_iter, label %iteration, label %exit + +iteration: + %i = phi i32 [ 0, %init ], [ %i_next, %iteration ] + %acc = phi float [ %c, %init ], [ %acc_next, %iteration ] + %i_next = add i32 %i, 1 + %acc_next = fadd contract float %acc, 1.0 + %cmp_loop = icmp slt i32 %i_next, %n + br i1 %cmp_loop, label %iteration, label %exit + +exit: + %c_phi = phi float [ %c, %init ], [ %acc_next, %iteration ] + %sub = fsub contract float %mul, %c_phi + ret float %sub +} + + +; fadd(fmul(a, b), c) => fma(a, b, c) +define float @test_fadd_fmul_c(float %a, float %b, float %c) { +; CHECK-LABEL: define float @test_fadd_fmul_c( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[C]]) +; CHECK-NEXT: ret float [[TMP1]] +; + %mul = fmul contract float %a, %b + %add = fadd contract float %mul, %c + ret float %add +} + + +; fadd(c, fmul(a, b)) => fma(a, b, c) +define float @test_fadd_c_fmul(float %a, float %b, float %c) { +; CHECK-LABEL: define float @test_fadd_c_fmul( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[C]]) +; CHECK-NEXT: ret float [[TMP1]] +; + %mul = fmul contract float %a, %b + %add = fadd contract float %c, %mul + ret float %add +} + + +; fadd(fmul(a, b), fmul(c, d)) => fma(a, b, fmul(c, d)) +define float @test_fadd_fmul_fmul(float 
%a, float %b, float %c, float %d) { +; CHECK-LABEL: define float @test_fadd_fmul_fmul( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]]) { +; CHECK-NEXT: [[MUL2:%.*]] = fmul contract float [[C]], [[D]] +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[MUL2]]) +; CHECK-NEXT: ret float [[TMP1]] +; + %mul1 = fmul contract float %a, %b + %mul2 = fmul contract float %c, %d + %add = fadd contract float %mul1, %mul2 + ret float %add +} + + +; fadd(fmul(a, b), c) => fma(a, b, c) where fadd and fmul are in different BBs +define float @test_fadd_fmul_different_BB(float %a, float %b, float %c, i32 %n) { +; CHECK-LABEL: define float @test_fadd_fmul_different_BB( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[INIT:.*]]: +; CHECK-NEXT: [[CMP_ITER:%.*]] = icmp sgt i32 [[N]], 10 +; CHECK-NEXT: br i1 [[CMP_ITER]], label %[[ITERATION:.*]], label %[[EXIT:.*]] +; CHECK: [[ITERATION]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[INIT]] ], [ [[I_NEXT:%.*]], %[[ITERATION]] ] +; CHECK-NEXT: [[ACC:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT:%.*]], %[[ITERATION]] ] +; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[ACC_NEXT]] = fadd contract float [[ACC]], 1.000000e+00 +; CHECK-NEXT: [[CMP_LOOP:%.*]] = icmp slt i32 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[CMP_LOOP]], label %[[ITERATION]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[C_PHI:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT]], %[[ITERATION]] ] +; CHECK-NEXT: [[TMP0:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[C_PHI]]) +; CHECK-NEXT: ret float [[TMP0]] +; +init: + %mul = fmul contract float %a, %b + %cmp_iter = icmp sgt i32 %n, 10 + br i1 %cmp_iter, label %iteration, label %exit + +iteration: + %i = phi i32 [ 0, %init ], [ %i_next, %iteration ] + %acc = phi float [ %c, %init ], [ %acc_next, %iteration ] + %i_next = add i32 %i, 1 + %acc_next = fadd contract float %acc, 1.0 + %cmp_loop = icmp slt i32 %i_next, %n + br i1 %cmp_loop, label %iteration, label %exit + +exit: + %c_phi = phi float [ %c, %init ], [ %acc_next, %iteration ] + %add = fadd contract float %mul, %c_phi + ret float %add +} + + +; These scenarios shouldn't work. 
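+; Rationale (inferred, not stated in the pass sources): the fpext sits between
+; the fmul and the fadd, so the float product is rounded to float before being
+; widened. Fusing into a double-precision fma would skip that rounding step, so
+; the '=>' rewrites sketched below must not fire; the CHECK lines expect the
+; mul/ext/add sequence to survive unchanged.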
+; fadd(fpext(fmul(a, b)), c) => fma(fpext(a), fpext(b), c) +define double @test_fadd_fpext_fmul_c(float %a, float %b, double %c) { +; CHECK-LABEL: define double @test_fadd_fpext_fmul_c( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], double [[C:%.*]]) { +; CHECK-NEXT: [[MUL:%.*]] = fmul contract float [[A]], [[B]] +; CHECK-NEXT: [[EXT:%.*]] = fpext float [[MUL]] to double +; CHECK-NEXT: [[ADD:%.*]] = fadd contract double [[EXT]], [[C]] +; CHECK-NEXT: ret double [[ADD]] +; + %mul = fmul contract float %a, %b + %ext = fpext float %mul to double + %add = fadd contract double %ext, %c + ret double %add +} + + +; fadd(c, fpext(fmul(a, b))) => fma(fpext(a), fpext(b), c) +define double @test_fadd_c_fpext_fmul(float %a, float %b, double %c) { +; CHECK-LABEL: define double @test_fadd_c_fpext_fmul( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], double [[C:%.*]]) { +; CHECK-NEXT: [[MUL:%.*]] = fmul contract float [[A]], [[B]] +; CHECK-NEXT: [[EXT:%.*]] = fpext float [[MUL]] to double +; CHECK-NEXT: [[ADD:%.*]] = fadd contract double [[C]], [[EXT]] +; CHECK-NEXT: ret double [[ADD]] +; + %mul = fmul contract float %a, %b + %ext = fpext float %mul to double + %add = fadd contract double %c, %ext + ret double %add +} + + +; Double precision tests +; fsub(fmul(a, b), c) => fma(a, b, fneg(c)) +define double @test_fsub_fmul_c_double(double %a, double %b, double %c) { +; CHECK-LABEL: define double @test_fsub_fmul_c_double( +; CHECK-SAME: double [[A:%.*]], double [[B:%.*]], double [[C:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = fneg contract double [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = call contract double @llvm.fma.f64(double [[A]], double [[B]], double [[TMP1]]) +; CHECK-NEXT: ret double [[TMP2]] +; + %mul = fmul contract double %a, %b + %sub = fsub contract double %mul, %c + ret double %sub +} + + +; fadd(fmul(a, b), c) => fma(a, b, c) +define double @test_fadd_fmul_c_double(double %a, double %b, double %c) { +; CHECK-LABEL: define double @test_fadd_fmul_c_double( +; CHECK-SAME: double [[A:%.*]], double [[B:%.*]], double [[C:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call contract double @llvm.fma.f64(double [[A]], double [[B]], double [[C]]) +; CHECK-NEXT: ret double [[TMP1]] +; + %mul = fmul contract double %a, %b + %add = fadd contract double %mul, %c + ret double %add +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir index ada76a43639d7..b7cb295648b4e 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir @@ -11,7 +11,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv1i8 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]] @@ -19,7 +19,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv1i8 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]] 
@@ -40,7 +40,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv4i8 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]] @@ -48,7 +48,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv4i8 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]] @@ -98,7 +98,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv64i8 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]] @@ -106,7 +106,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv64i8 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]] @@ -127,7 +127,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv2i16 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]] @@ -135,7 +135,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv2i16 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]] @@ -185,7 +185,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv32i16 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF2_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF2_]] @@ -193,7 +193,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv32i16 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF2_:%[0-9]+]]:vrnov0 
= PseudoVMERGE_VVM_MF2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF2_]] diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir index e8b5325bf8ef1..abf7e5efa9952 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir @@ -10,12 +10,65 @@ legalized: false tracksRegLiveness: true body: | bb.0.entry: - ; CHECK-LABEL: name: insert_subvector_nxv2i1_nxv4i1_undef_nonzero - ; CHECK: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF - ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF - ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](), 2 - ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]]() - ; CHECK-NEXT: PseudoRET implicit $v8 + ; RV32-LABEL: name: insert_subvector_nxv2i1_nxv4i1_undef_nonzero + ; RV32: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C]](s64) + ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C1]](s64) + ; RV32-NEXT: [[SELECT:%[0-9]+]]:_() = G_SELECT [[DEF]](), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]] + ; RV32-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C2]](s64) + ; RV32-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C3]](s64) + ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_() = G_SELECT [[DEF1]](), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]] + ; RV32-NEXT: [[DEF2:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF2]], [[SELECT1]](), 0 + ; RV32-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C4]](s64) + ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB + ; RV32-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C5]](s64) + ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB + ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C5]](s64) + ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]] + ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_() = G_VSLIDEUP_VL [[SELECT]], [[INSERT_SUBVECTOR]], [[LSHR1]](s64), [[VMSET_VL]](), [[ADD]](s64), 1 + ; RV32-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; RV32-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C6]](s64) + ; RV32-NEXT: [[ICMP:%[0-9]+]]:_() = G_ICMP intpred(ne), [[VSLIDEUP_VL]](), [[SPLAT_VECTOR4]] + ; RV32-NEXT: $v8 = COPY [[ICMP]]() + ; RV32-NEXT: PseudoRET implicit $v8 + ; + ; RV64-LABEL: name: insert_subvector_nxv2i1_nxv4i1_undef_nonzero + ; RV64: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C]](s32) + ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C1]](s32) + ; RV64-NEXT: [[SELECT:%[0-9]+]]:_() = G_SELECT [[DEF]](), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]] + ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV64-NEXT: 
[[SPLAT_VECTOR2:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C2]](s32) + ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C3]](s32) + ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_() = G_SELECT [[DEF1]](), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]] + ; RV64-NEXT: [[DEF2:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF2]], [[SELECT1]](), 0 + ; RV64-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C4]](s32) + ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB + ; RV64-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C5]](s32) + ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB + ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C5]](s32) + ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]] + ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_() = G_VSLIDEUP_VL [[SELECT]], [[INSERT_SUBVECTOR]], [[LSHR1]](s32), [[VMSET_VL]](), [[ADD]](s32), 1 + ; RV64-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV64-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C6]](s32) + ; RV64-NEXT: [[ICMP:%[0-9]+]]:_() = G_ICMP intpred(ne), [[VSLIDEUP_VL]](), [[SPLAT_VECTOR4]] + ; RV64-NEXT: $v8 = COPY [[ICMP]]() + ; RV64-NEXT: PseudoRET implicit $v8 %0:_() = G_IMPLICIT_DEF %1:_() = G_IMPLICIT_DEF %2:_() = G_INSERT_SUBVECTOR %0(), %1, 2 @@ -264,14 +317,63 @@ tracksRegLiveness: true body: | bb.0.entry: liveins: $v8 - ; CHECK-LABEL: name: insert_subvector_nxv2i1_nxv4i1_zero - ; CHECK: liveins: $v8 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF - ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](), 0 - ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]]() - ; CHECK-NEXT: PseudoRET implicit $v8 + ; RV32-LABEL: name: insert_subvector_nxv2i1_nxv4i1_zero + ; RV32: liveins: $v8 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV32-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C]](s64) + ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C1]](s64) + ; RV32-NEXT: [[SELECT:%[0-9]+]]:_() = G_SELECT [[COPY]](), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]] + ; RV32-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C2]](s64) + ; RV32-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C3]](s64) + ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_() = G_SELECT [[DEF]](), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]] + ; RV32-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[SELECT1]](), 0 + ; RV32-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C4]](s64) + ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB + ; RV32-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C5]](s64) + ; RV32-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_() = G_VMV_V_V_VL [[SELECT]], [[INSERT_SUBVECTOR]], [[LSHR]](s64) + ; RV32-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; RV32-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_() = G_SPLAT_VECTOR 
[[C6]](s64) + ; RV32-NEXT: [[ICMP:%[0-9]+]]:_() = G_ICMP intpred(ne), [[VMV_V_V_VL]](), [[SPLAT_VECTOR4]] + ; RV32-NEXT: $v8 = COPY [[ICMP]]() + ; RV32-NEXT: PseudoRET implicit $v8 + ; + ; RV64-LABEL: name: insert_subvector_nxv2i1_nxv4i1_zero + ; RV64: liveins: $v8 + ; RV64-NEXT: {{ $}} + ; RV64-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV64-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C]](s32) + ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C1]](s32) + ; RV64-NEXT: [[SELECT:%[0-9]+]]:_() = G_SELECT [[COPY]](), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]] + ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C2]](s32) + ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C3]](s32) + ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_() = G_SELECT [[DEF]](), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]] + ; RV64-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[SELECT1]](), 0 + ; RV64-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C4]](s32) + ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB + ; RV64-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C5]](s32) + ; RV64-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_() = G_VMV_V_V_VL [[SELECT]], [[INSERT_SUBVECTOR]], [[LSHR]](s32) + ; RV64-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV64-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C6]](s32) + ; RV64-NEXT: [[ICMP:%[0-9]+]]:_() = G_ICMP intpred(ne), [[VMV_V_V_VL]](), [[SPLAT_VECTOR4]] + ; RV64-NEXT: $v8 = COPY [[ICMP]]() + ; RV64-NEXT: PseudoRET implicit $v8 %0:_() = COPY $v8 %1:_() = G_IMPLICIT_DEF %2:_() = G_INSERT_SUBVECTOR %0(), %1, 0 @@ -285,14 +387,63 @@ tracksRegLiveness: true body: | bb.0.entry: liveins: $v8 - ; CHECK-LABEL: name: insert_subvector_nxv4i1_nxv8i1_zero - ; CHECK: liveins: $v8 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF - ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](), 0 - ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]]() - ; CHECK-NEXT: PseudoRET implicit $v8 + ; RV32-LABEL: name: insert_subvector_nxv4i1_nxv8i1_zero + ; RV32: liveins: $v8 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV32-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C]](s64) + ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C1]](s64) + ; RV32-NEXT: [[SELECT:%[0-9]+]]:_() = G_SELECT [[COPY]](), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]] + ; RV32-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C2]](s64) + ; RV32-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C3]](s64) + ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_() = G_SELECT [[DEF]](), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]] + ; RV32-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR 
[[DEF1]], [[SELECT1]](), 0 + ; RV32-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C4]](s64) + ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB + ; RV32-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C5]](s64) + ; RV32-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_() = G_VMV_V_V_VL [[SELECT]], [[INSERT_SUBVECTOR]], [[LSHR]](s64) + ; RV32-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; RV32-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C6]](s64) + ; RV32-NEXT: [[ICMP:%[0-9]+]]:_() = G_ICMP intpred(ne), [[VMV_V_V_VL]](), [[SPLAT_VECTOR4]] + ; RV32-NEXT: $v8 = COPY [[ICMP]]() + ; RV32-NEXT: PseudoRET implicit $v8 + ; + ; RV64-LABEL: name: insert_subvector_nxv4i1_nxv8i1_zero + ; RV64: liveins: $v8 + ; RV64-NEXT: {{ $}} + ; RV64-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV64-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C]](s32) + ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C1]](s32) + ; RV64-NEXT: [[SELECT:%[0-9]+]]:_() = G_SELECT [[COPY]](), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]] + ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C2]](s32) + ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C3]](s32) + ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_() = G_SELECT [[DEF]](), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]] + ; RV64-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[SELECT1]](), 0 + ; RV64-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C4]](s32) + ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB + ; RV64-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C5]](s32) + ; RV64-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_() = G_VMV_V_V_VL [[SELECT]], [[INSERT_SUBVECTOR]], [[LSHR]](s32) + ; RV64-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; RV64-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_() = G_SPLAT_VECTOR [[C6]](s32) + ; RV64-NEXT: [[ICMP:%[0-9]+]]:_() = G_ICMP intpred(ne), [[VMV_V_V_VL]](), [[SPLAT_VECTOR4]] + ; RV64-NEXT: $v8 = COPY [[ICMP]]() + ; RV64-NEXT: PseudoRET implicit $v8 %0:_() = COPY $v8 %1:_() = G_IMPLICIT_DEF %2:_() = G_INSERT_SUBVECTOR %0(), %1, 0 @@ -306,14 +457,43 @@ tracksRegLiveness: true body: | bb.0.entry: liveins: $v8 - ; CHECK-LABEL: name: insert_subvector_nxv32i1_nxv64i1_zero - ; CHECK: liveins: $v8 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF - ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](), 0 - ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]]() - ; CHECK-NEXT: PseudoRET implicit $v8 + ; RV32-LABEL: name: insert_subvector_nxv32i1_nxv64i1_zero + ; RV32: liveins: $v8 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV32-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_() = G_BITCAST [[COPY]]() + ; RV32-NEXT: [[BITCAST1:%[0-9]+]]:_() = G_BITCAST [[DEF]]() + ; RV32-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], 
[[BITCAST1]](), 0 + ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C]](s64) + ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB + ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C1]](s64) + ; RV32-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_() = G_VMV_V_V_VL [[BITCAST]], [[INSERT_SUBVECTOR]], [[LSHR]](s64) + ; RV32-NEXT: [[BITCAST2:%[0-9]+]]:_() = G_BITCAST [[VMV_V_V_VL]]() + ; RV32-NEXT: $v8 = COPY [[BITCAST2]]() + ; RV32-NEXT: PseudoRET implicit $v8 + ; + ; RV64-LABEL: name: insert_subvector_nxv32i1_nxv64i1_zero + ; RV64: liveins: $v8 + ; RV64-NEXT: {{ $}} + ; RV64-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV64-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_() = G_BITCAST [[COPY]]() + ; RV64-NEXT: [[BITCAST1:%[0-9]+]]:_() = G_BITCAST [[DEF]]() + ; RV64-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[BITCAST1]](), 0 + ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C]](s32) + ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB + ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C1]](s32) + ; RV64-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_() = G_VMV_V_V_VL [[BITCAST]], [[INSERT_SUBVECTOR]], [[LSHR]](s32) + ; RV64-NEXT: [[BITCAST2:%[0-9]+]]:_() = G_BITCAST [[VMV_V_V_VL]]() + ; RV64-NEXT: $v8 = COPY [[BITCAST2]]() + ; RV64-NEXT: PseudoRET implicit $v8 %0:_() = COPY $v8 %1:_() = G_IMPLICIT_DEF %2:_() = G_INSERT_SUBVECTOR %0(), %1, 0 @@ -329,14 +509,37 @@ tracksRegLiveness: true body: | bb.0.entry: liveins: $v8 - ; CHECK-LABEL: name: insert_subvector_nxv1i8_nxv2i8_zero - ; CHECK: liveins: $v8 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF - ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](), 0 - ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]]() - ; CHECK-NEXT: PseudoRET implicit $v8 + ; RV32-LABEL: name: insert_subvector_nxv1i8_nxv2i8_zero + ; RV32: liveins: $v8 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV32-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[DEF]](), 0 + ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C]](s64) + ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB + ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C1]](s64) + ; RV32-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_() = G_VMV_V_V_VL [[COPY]], [[INSERT_SUBVECTOR]], [[LSHR]](s64) + ; RV32-NEXT: $v8 = COPY [[VMV_V_V_VL]]() + ; RV32-NEXT: PseudoRET implicit $v8 + ; + ; RV64-LABEL: name: insert_subvector_nxv1i8_nxv2i8_zero + ; RV64: liveins: $v8 + ; RV64-NEXT: {{ $}} + ; RV64-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV64-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[DEF]](), 0 + ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C]](s32) + ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = 
G_READ_VLENB + ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C1]](s32) + ; RV64-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_() = G_VMV_V_V_VL [[COPY]], [[INSERT_SUBVECTOR]], [[LSHR]](s32) + ; RV64-NEXT: $v8 = COPY [[VMV_V_V_VL]]() + ; RV64-NEXT: PseudoRET implicit $v8 %0:_() = COPY $v8 %1:_() = G_IMPLICIT_DEF %2:_() = G_INSERT_SUBVECTOR %0(), %1, 0 @@ -350,14 +553,37 @@ tracksRegLiveness: true body: | bb.0.entry: liveins: $v8 - ; CHECK-LABEL: name: insert_subvector_nxv2i16_nxv4i16_zero - ; CHECK: liveins: $v8 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 - ; CHECK-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF - ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[COPY]], [[DEF]](), 0 - ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]]() - ; CHECK-NEXT: PseudoRET implicit $v8 + ; RV32-LABEL: name: insert_subvector_nxv2i16_nxv4i16_zero + ; RV32: liveins: $v8 + ; RV32-NEXT: {{ $}} + ; RV32-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV32-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[DEF]](), 0 + ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C]](s64) + ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB + ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C1]](s64) + ; RV32-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_() = G_VMV_V_V_VL [[COPY]], [[INSERT_SUBVECTOR]], [[LSHR]](s64) + ; RV32-NEXT: $v8 = COPY [[VMV_V_V_VL]]() + ; RV32-NEXT: PseudoRET implicit $v8 + ; + ; RV64-LABEL: name: insert_subvector_nxv2i16_nxv4i16_zero + ; RV64: liveins: $v8 + ; RV64-NEXT: {{ $}} + ; RV64-NEXT: [[COPY:%[0-9]+]]:_() = COPY $v8 + ; RV64-NEXT: [[DEF:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF + ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[DEF]](), 0 + ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C]](s32) + ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB + ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C1]](s32) + ; RV64-NEXT: [[VMV_V_V_VL:%[0-9]+]]:_() = G_VMV_V_V_VL [[COPY]], [[INSERT_SUBVECTOR]], [[LSHR]](s32) + ; RV64-NEXT: $v8 = COPY [[VMV_V_V_VL]]() + ; RV64-NEXT: PseudoRET implicit $v8 %0:_() = COPY $v8 %1:_() = G_IMPLICIT_DEF %2:_() = G_INSERT_SUBVECTOR %0(), %1, 0 @@ -524,7 +750,7 @@ body: | ; RV32-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[DEF]](), 0 ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 - ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C]](s64) + ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C]](s64) ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C1]](s64) @@ -542,7 +768,7 @@ body: | ; RV64-NEXT: [[DEF1:%[0-9]+]]:_() = G_IMPLICIT_DEF ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_() = G_INSERT_SUBVECTOR [[DEF1]], [[DEF]](), 0 ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 - ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C]](s32) + ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_() = G_VMSET_VL [[C]](s32) ; 
RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C1]](s32) diff --git a/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll index 0654fe8bd8d66..3225d649f066e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll @@ -13,7 +13,7 @@ define @foo( %x, @llvm.vp.fmul.nxv1f64( %x, %y, %m, i32 %vl) diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir index c73c2004834db..ece457a09dbdf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir @@ -11,7 +11,7 @@ body: | ; CHECK: liveins: $x1, $v8, $v9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %false:vrnov0 = COPY $v8 - ; CHECK-NEXT: %true:vrnov0 = COPY $v9 + ; CHECK-NEXT: %true:vmnov0 = COPY $v9 ; CHECK-NEXT: %avl:gprnox0 = COPY $x1 ; CHECK-NEXT: %mask:vmv0 = PseudoVMSET_M_B8 %avl, 0 /* e8 */ ; CHECK-NEXT: $v0 = COPY %mask @@ -135,7 +135,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %false:vrnov0 = COPY $v8 ; CHECK-NEXT: %mask:vmv0 = COPY $v0 - ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 1 /* ta, mu */ + ; CHECK-NEXT: %true:vmnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 1 /* ta, mu */ %false:vrnov0 = COPY $v8 %mask:vmv0 = COPY $v0 %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */ diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll index f087efcc5f57b..d3649ef4b6664 100644 --- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll +++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll @@ -15,7 +15,7 @@ define @strided_vpload_nxv1i8_i8(ptr %ptr, i8 signext %stride, ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $x11 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr = COPY $x10 ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vmv0 = COPY [[COPY1]] - ; CHECK-NEXT: [[PseudoVLSE8_V_MF8_MASK:%[0-9]+]]:vrnov0 = PseudoVLSE8_V_MF8_MASK $noreg, [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 1 /* ta, mu */ :: (load unknown-size, align 1) + ; CHECK-NEXT: [[PseudoVLSE8_V_MF8_MASK:%[0-9]+]]:vmnov0 = PseudoVLSE8_V_MF8_MASK $noreg, [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 1 /* ta, mu */ :: (load unknown-size, align 1) ; CHECK-NEXT: $v8 = COPY [[PseudoVLSE8_V_MF8_MASK]] ; CHECK-NEXT: PseudoRET implicit $v8 %load = call @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr %ptr, i8 %stride, %m, i32 %evl) diff --git a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll index 6b6276b838fba..7cbaceae858b3 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll @@ -42,9 +42,9 @@ define i64 @test_vleff_nxv8i8_mask( %maskedoff, ptr %p, &1 | FileCheck %s --check-prefix=CHECK-ERROR + +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_arbitrary_precision_integers %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_arbitrary_precision_integers %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: 
OpTypeInt type with a width other than 8, 16, 32 or 64 bits requires the following SPIR-V extension: SPV_ALTERA_arbitrary_precision_integers + +; CHECK: OpCapability ArbitraryPrecisionIntegersALTERA +; CHECK: OpExtension "SPV_ALTERA_arbitrary_precision_integers" +; CHECK: OpName %[[#TestAdd:]] "test_add" +; CHECK: OpName %[[#TestSub:]] "test_sub" +; CHECK: %[[#Int128Ty:]] = OpTypeInt 128 0 +; CHECK: %[[#Const64Int128:]] = OpConstant %[[#Int128Ty]] 64 0 0 0 + +; CHECK: %[[#TestAdd]] = OpFunction +define spir_func void @test_add(i64 %AL, i64 %AH, i64 %BL, i64 %BH, ptr %RL, ptr %RH) { +entry: +; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]] +; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]] +; CHECK: {{.*}} = OpShiftLeftLogical %[[#Int128Ty]] {{%[0-9]+}} %[[#Const64Int128]] +; CHECK: {{.*}} = OpBitwiseOr %[[#Int128Ty]] +; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]] +; CHECK: {{.*}} = OpIAdd %[[#Int128Ty]] + %tmp1 = zext i64 %AL to i128 + %tmp23 = zext i64 %AH to i128 + %tmp4 = shl i128 %tmp23, 64 + %tmp5 = or i128 %tmp4, %tmp1 + %tmp67 = zext i64 %BL to i128 + %tmp89 = zext i64 %BH to i128 + %tmp11 = shl i128 %tmp89, 64 + %tmp12 = or i128 %tmp11, %tmp67 + %tmp15 = add i128 %tmp12, %tmp5 + %tmp1617 = trunc i128 %tmp15 to i64 + store i64 %tmp1617, ptr %RL + %tmp21 = lshr i128 %tmp15, 64 + %tmp2122 = trunc i128 %tmp21 to i64 + store i64 %tmp2122, ptr %RH + ret void +; CHECK: OpFunctionEnd +} + +; CHECK: %[[#TestSub]] = OpFunction +define spir_func void @test_sub(i64 %AL, i64 %AH, i64 %BL, i64 %BH, ptr %RL, ptr %RH) { +entry: +; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]] +; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]] +; CHECK: {{.*}} = OpShiftLeftLogical %[[#Int128Ty]] {{%[0-9]+}} %[[#Const64Int128]] +; CHECK: {{.*}} = OpBitwiseOr %[[#Int128Ty]] +; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]] +; CHECK: {{.*}} = OpISub %[[#Int128Ty]] + %tmp1 = zext i64 %AL to i128 + %tmp23 = zext i64 %AH to i128 + %tmp4 = shl i128 %tmp23, 64 + %tmp5 = or i128 %tmp4, %tmp1 + %tmp67 = zext i64 %BL to i128 + %tmp89 = zext i64 %BH to i128 + %tmp11 = shl i128 %tmp89, 64 + %tmp12 = or i128 %tmp11, %tmp67 + %tmp15 = sub i128 %tmp5, %tmp12 + %tmp1617 = trunc i128 %tmp15 to i64 + store i64 %tmp1617, ptr %RL + %tmp21 = lshr i128 %tmp15, 64 + %tmp2122 = trunc i128 %tmp21 to i64 + store i64 %tmp2122, ptr %RH + ret void +; CHECK: OpFunctionEnd +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-arith.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-arith.ll new file mode 100644 index 0000000000000..d1de5df499566 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-arith.ll @@ -0,0 +1,27 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR + +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_arbitrary_precision_integers %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_arbitrary_precision_integers %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: OpTypeInt type with a width other than 8, 16, 32 or 64 bits requires the following SPIR-V extension: SPV_ALTERA_arbitrary_precision_integers + +; CHECK: OpCapability ArbitraryPrecisionIntegersALTERA +; CHECK: OpExtension "SPV_ALTERA_arbitrary_precision_integers" +; CHECK: OpName %[[#Foo:]] "foo" +; CHECK: %[[#Int128Ty:]] = OpTypeInt 128 0 + +; CHECK: %[[#Foo]] = OpFunction +define i64 
@foo(i64 %x, i64 %y, i32 %amt) { +; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]] +; CHECK: {{.*}} = OpSConvert %[[#Int128Ty]] +; CHECK: {{.*}} = OpBitwiseOr %[[#Int128Ty]] +; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]] +; CHECK: {{.*}} = OpShiftRightLogical %[[#Int128Ty]] + %tmp0 = zext i64 %x to i128 + %tmp1 = sext i64 %y to i128 + %tmp2 = or i128 %tmp0, %tmp1 + %tmp7 = zext i32 13 to i128 + %tmp3 = lshr i128 %tmp2, %tmp7 + %tmp4 = trunc i128 %tmp3 to i64 + ret i64 %tmp4 +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-switch-lower.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-switch-lower.ll new file mode 100644 index 0000000000000..669e9362605a5 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-switch-lower.ll @@ -0,0 +1,27 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR + +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_arbitrary_precision_integers %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_arbitrary_precision_integers %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: OpTypeInt type with a width other than 8, 16, 32 or 64 bits requires the following SPIR-V extension: SPV_ALTERA_arbitrary_precision_integers + +; CHECK: OpCapability ArbitraryPrecisionIntegersALTERA +; CHECK: OpExtension "SPV_ALTERA_arbitrary_precision_integers" +; CHECK: OpName %[[#Test:]] "test" +; CHECK: OpName %[[#Exit:]] "exit" +; CHECK: %[[#Int128Ty:]] = OpTypeInt 128 0 +; CHECK: %[[#UndefInt128:]] = OpUndef %[[#Int128Ty]] + +; CHECK: %[[#Test]] = OpFunction +define void @test() { +entry: +; CHECK: OpSwitch %[[#UndefInt128]] %[[#Exit]] 0 0 3 0 %[[#Exit]] 0 0 5 0 %[[#Exit]] 0 0 4 0 %[[#Exit]] 0 0 8 0 %[[#Exit]] + switch i128 poison, label %exit [ + i128 55340232221128654848, label %exit + i128 92233720368547758080, label %exit + i128 73786976294838206464, label %exit + i128 147573952589676412928, label %exit + ] +exit: + unreachable +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-but-one.ll b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-but-one.ll index 5ddfc85702540..ecf4807a1d5fc 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-but-one.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-but-one.ll @@ -1,4 +1,5 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=all,-SPV_ALTERA_arbitrary_precision_integers %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=-SPV_ALTERA_arbitrary_precision_integers,all %s -o - | FileCheck %s ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=KHR %s -o - | FileCheck %s ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=khr %s -o - | FileCheck %s diff --git a/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll b/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll index 76d84c1159ee4..860d60ff0d4e1 100644 --- a/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll @@ -97,3 +97,99 @@ define <16 x i32>@test_int_x86_avx10_vpdpbuuds_512(<16 x i32> %x0, <16 x i32> %x %res = call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) ret <16 x i32> %res } + 
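+; Note: the vpdpw* declarations below intentionally use the legacy <16 x i32>
+; operand types rather than the updated <32 x i16> prototypes (see the
+; avx10_2_512ni-intrinsics.ll changes later in this patch), presumably so that
+; the auto-upgrade path is what these tests exercise.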
+declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpwsud_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpwsud_512: +; X86: # %bb.0: +; X86-NEXT: vpdpwsud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpwsud_512: +; X64: # %bb.0: +; X64-NEXT: vpdpwsud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpwsuds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpwsuds_512: +; X86: # %bb.0: +; X86-NEXT: vpdpwsuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd3,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpwsuds_512: +; X64: # %bb.0: +; X64-NEXT: vpdpwsuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd3,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpwusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpwusd_512: +; X86: # %bb.0: +; X86-NEXT: vpdpwusd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd2,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpwusd_512: +; X64: # %bb.0: +; X64-NEXT: vpdpwusd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd2,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpwusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpwusds_512: +; X86: # %bb.0: +; X86-NEXT: vpdpwusds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd3,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpwusds_512: +; X64: # %bb.0: +; X64-NEXT: vpdpwusds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd3,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpwuud_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpwuud_512: +; X86: # %bb.0: +; X86-NEXT: vpdpwuud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd2,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpwuud_512: +; X64: # %bb.0: +; X64-NEXT: vpdpwuud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd2,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <16 x 
i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx10_vpdpwuuds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; X86-LABEL: test_int_x86_avx10_vpdpwuuds_512: +; X86: # %bb.0: +; X86-NEXT: vpdpwuuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd3,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx10_vpdpwuuds_512: +; X64: # %bb.0: +; X64-NEXT: vpdpwuuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd3,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %res +} diff --git a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll index a2aad604f19bc..e9c6cb6a19ba4 100644 --- a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll @@ -220,7 +220,7 @@ declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <64 x i8>, <64 x i8 ; VNNI INT16 -define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) { +define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <32 x i16> %__A, ptr %pB) { ; X86-LABEL: test_mm512_dpwsud_epi32: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -231,12 +231,12 @@ define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; X64: # %bb.0: ; X64-NEXT: vpdpwsud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0x07] ; X64-NEXT: retq # encoding: [0xc3] - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <32 x i16>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) { +define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <32 x i16> %__A, <32 x i16> %__B) { ; X86-LABEL: test_mm512_mask_dpwsuds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] @@ -248,13 +248,13 @@ define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %_ ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpwsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0xd3,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) { +define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) { ; X86-LABEL: test_mm512_maskz_dpwsud_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] @@ -266,14 +266,14 @@ define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %_ ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpwsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0xd2,0xc2] ; X64-NEXT: 
retq # encoding: [0xc3] - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <32 x i16>, <32 x i16>) +declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <32 x i16>, <32 x i16>) define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) { ; X86-LABEL: test_mm512_dpwusd_epi32: diff --git a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll index 1f270d539cdb4..bf7f9375570f9 100644 --- a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll @@ -334,7 +334,7 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>) ; VNNI INT16 -define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) { +define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <8 x i16> %__A, <8 x i16> %__B) { ; X86-LABEL: test_mm_mask_dpwsud_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -346,13 +346,13 @@ define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0xd2,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) { +define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) { ; X86-LABEL: test_mm_maskz_dpwsuds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] @@ -364,13 +364,13 @@ define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0xd3,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) { +define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <16 x i16> %__A, <16 x i16> %__B) { ; X86-LABEL: test_mm256_maskz_dpwsuds_epi32: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: 
[0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -382,13 +382,13 @@ define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0xd3,0xc2]
 ; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B)
 %bst = bitcast i8 %__U to <8 x i1>
 %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
 ret <8 x i32> %res
 }
-define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) {
 ; X86-LABEL: test_mm256_mask_dpwsud_epi32:
 ; X86: # %bb.0:
 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -400,16 +400,16 @@ define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W,
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0xd2,0xc2]
 ; X64-NEXT: retq # encoding: [0xc3]
- %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+ %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B)
 %bst = bitcast i8 %__U to <8 x i1>
 %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
 ret <8 x i32> %res
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <16 x i16>, <16 x i16>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <16 x i16>, <16 x i16>)
 define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
 ; X86-LABEL: test_mm_mask_dpwusd_epi32:
diff --git a/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll
index 38d54cff6dc23..00db1fb07c78d 100644
--- a/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll
@@ -652,14 +652,14 @@ define <4 x i32> @test_int_x86_mask_vcvtt_pd2udqs_128(<2 x double> %x0, <4 x i32
 ; X64-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128:
 ; X64: # %bb.0:
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vcvttpd2dqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc8]
+; X64-NEXT: vcvttpd2udqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc8]
 ; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X64-NEXT: retq # encoding: [0xc3]
 ;
 ; X86-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128:
 ; X86: # %bb.0:
 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vcvttpd2dqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc8]
+; X86-NEXT: vcvttpd2udqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc8]
 ; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X86-NEXT: retl # encoding: [0xc3]
 %res = call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128( <2 x double> %x0, <4 x i32> %src, i8 %mask)
@@ -670,13 +670,13 @@ define <4 x i32> @test_int_x86_maskz_vcvtt_pd2udqs_128_z(<2 x double> %x0, i8 %m
 ; X64-LABEL: test_int_x86_maskz_vcvtt_pd2udqs_128_z:
 ; X64: # %bb.0:
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vcvttpd2dqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6d,0xc0]
+; X64-NEXT: vcvttpd2udqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6c,0xc0]
 ; X64-NEXT: retq # encoding: [0xc3]
 ;
 ; X86-LABEL: test_int_x86_maskz_vcvtt_pd2udqs_128_z:
 ; X86: # %bb.0:
 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vcvttpd2dqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6d,0xc0]
+; X86-NEXT: vcvttpd2udqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6c,0xc0]
 ; X86-NEXT: retl # encoding: [0xc3]
 %res = call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128( <2 x double> %x0, <4 x i32> zeroinitializer, i8 %mask)
 ret <4 x i32> %res
@@ -686,13 +686,13 @@ define <4 x i32> @test_int_x86_mask_vcvtt_pd2udqs_128_undef(<2 x double> %x0, i8
 ; X64-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128_undef:
 ; X64: # %bb.0:
 ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT: vcvttpd2dqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc0]
+; X64-NEXT: vcvttpd2udqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc0]
 ; X64-NEXT: retq # encoding: [0xc3]
 ;
 ; X86-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128_undef:
 ; X86: # %bb.0:
 ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT: vcvttpd2dqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc0]
+; X86-NEXT: vcvttpd2udqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc0]
 ; X86-NEXT: retl # encoding: [0xc3]
 %res = call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128( <2 x double> %x0, <4 x i32> undef, i8 %mask)
 ret <4 x i32> %res
diff --git a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
index b8ebe2a4890a1..ddf0050dbd74a 100644
--- a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll
@@ -178,18 +178,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32
 ret { <4 x i32>, <4 x i32> } %res2
 }
-declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>)
-define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_256:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x52,0xc2]
 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
 ret <8 x i32> %1
 }
-define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, ptr %x2p, <16 x i16> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_256:
 ; X86: # %bb.0:
 ; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
@@ -209,11 +209,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ; X64-NEXT: vpdpwssd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x52,0xda]
 ; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <8 x i32>, ptr %x2p
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %x2 = load <16 x i16>, ptr %x2p
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
 %2 = bitcast i8 %x3 to <8 x i1>
 %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0
- %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x4)
 %5 = bitcast i8 %x3 to <8 x i1>
 %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
 %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
@@ -221,18 +221,18 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32>
 ret { <8 x i32>, <8 x i32> } %res2
 }
-declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>)
-define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_128:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x52,0xc2]
 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
 ret <4 x i32> %1
 }
-define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p, <8 x i16> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_128:
 ; X86: # %bb.0:
 ; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
@@ -252,12 +252,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32>
 ; X64-NEXT: vpdpwssd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x52,0xda]
 ; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <4 x i32>, ptr %x2p
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %x2 = load <8 x i16>, ptr %x2p
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
 %2 = bitcast i8 %x3 to <8 x i1>
 %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0
- %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x4)
 %5 = bitcast i8 %x3 to <8 x i1>
 %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
@@ -266,18 +266,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32
 ret { <4 x i32>, <4 x i32> } %res2
 }
-declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>)
-define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpwssds_256:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x53,0xc2]
 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
 ret <8 x i32> %1
 }
-define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) {
+define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, ptr %x2p, <16 x i16> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_256:
 ; X86: # %bb.0:
 ; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
@@ -297,11 +297,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ; X64-NEXT: vpdpwssds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x53,0xda]
 ; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <8 x i32>, ptr %x2p
- %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %x2 = load <16 x i16>, ptr %x2p
+ %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
 %2 = bitcast i8 %x3 to <8 x i1>
 %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0
- %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4)
+ %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x4)
 %5 = bitcast i8 %x3 to <8 x i1>
 %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer
 %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0
@@ -309,9 +309,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32
 ret { <8 x i32>, <8 x i32> } %res2
 }
-declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>)
-define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p) {
+define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p) {
 ; X86-LABEL: test_int_x86_avx512_vpdpwssds_128:
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
@@ -322,12 +322,12 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1,
 ; X64: # %bb.0:
 ; X64-NEXT: vpdpwssds (%rdi), %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x53,0x07]
 ; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <4 x i32>, ptr %x2p
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %x2 = load <8 x i16>, ptr %x2p
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
 ret <4 x i32> %1
 }
-define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) {
+define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p, <8 x i16> %x4, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_128:
 ; X86: # %bb.0:
 ; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
@@ -347,12 +347,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32
 ; X64-NEXT: vpdpwssds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x53,0xda]
 ; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <4 x i32>, ptr %x2p
- %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %x2 = load <8 x i16>, ptr %x2p
+ %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
 %2 = bitcast i8 %x3 to <8 x i1>
 %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0
- %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4)
+ %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x4)
 %5 = bitcast i8 %x3 to <8 x i1>
 %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll
index 63ff88a7fa4ae..2aabfab1c8666 100644
--- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll
@@ -102,21 +102,39 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpbusds_512(<16 x
 ret { <16 x i32>, <16 x i32> } %res3
 }
-declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
-; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_512:
+define <16 x i32>@test_int_x86_avx512_vpdpwssd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_vpdpwssd:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x52,0xc2]
 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ %res = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
 ret <16 x i32> %res
 }
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
 ; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
+; X64: # %bb.0:
+; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ ret <16 x i32> %res
+}
+
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+; X86-LABEL: test_int_x86_avx512_maskz_vpdpwssd_512:
+; X86: # %bb.0:
 ; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
@@ -125,7 +143,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ; X86-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
-; X64-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
+; X64-LABEL: test_int_x86_avx512_maskz_vpdpwssd_512:
 ; X64: # %bb.0:
 ; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
@@ -141,21 +159,39 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ret { <16 x i32>, <16 x i32> } %res3
 }
-declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpwssds_512:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x53,0xc2]
 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ %res = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
 ret <16 x i32> %res
 }
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
 ; X86: # %bb.0:
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
+; X86-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
+; X64: # %bb.0:
+; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
+; X64-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ ret <16 x i32> %res
+}
+
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+; X86-LABEL: test_int_x86_avx512_maskz_vpdpwssds_512:
+; X86: # %bb.0:
 ; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08]
@@ -164,7 +200,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
 ; X86-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
-; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
+; X64-LABEL: test_int_x86_avx512_maskz_vpdpwssds_512:
 ; X64: # %bb.0:
 ; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
 ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
index 60d0298e057f3..e97b8a5c5503f 100644
--- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll
@@ -86,18 +86,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x
 ret { <16 x i32>, <16 x i32> } %res2
 }
-declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <32 x i16>, <32 x i16>)
-define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_512:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x52,0xc2]
 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2)
 ret <16 x i32> %1
 }
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <32 x i16> %x1, ptr %x2p, <32 x i16> %x4, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_512:
 ; X86: # %bb.0:
 ; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
@@ -116,11 +116,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ; X64-NEXT: vpdpwssd %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x52,0xda]
 ; X64-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <16 x i32>, ptr %x2p
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %x2 = load <32 x i16>, ptr %x2p
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2)
 %2 = bitcast i16 %x3 to <16 x i1>
 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
- %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
+ %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x4)
 %5 = bitcast i16 %x3 to <16 x i1>
 %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
 %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0
@@ -128,18 +128,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i
 ret { <16 x i32>, <16 x i32> } %res2
 }
-declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <32 x i16>, <32 x i16>)
-define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_ask_vpdpwssds_512:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x53,0xc2]
 ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2)
 ret <16 x i32> %1
 }
-define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) {
+define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <32 x i16> %x1, ptr %x2p, <32 x i16> %x4, i16 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_512:
 ; X86: # %bb.0:
 ; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8]
@@ -158,11 +158,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x
 ; X64-NEXT: vpdpwssds %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x53,0xda]
 ; X64-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb]
 ; X64-NEXT: retq # encoding: [0xc3]
- %x2 = load <16 x i32>, ptr %x2p
- %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+ %x2 = load <32 x i16>, ptr %x2p
+ %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2)
 %2 = bitcast i16 %x3 to <16 x i1>
 %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
- %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
+ %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x4)
 %5 = bitcast i16 %x3 to <16 x i1>
 %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer
 %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0
diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll
index 0f4a4f27b9715..f359ecef8ceb3 100644
--- a/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll
@@ -45,3 +45,47 @@ define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8
 %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
 ret <8 x i32> %res
 }
+
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpwssd_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpwssd_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ ret <8 x i32> %res
+}
+
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+
+define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpwssds_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+
+define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+; CHECK-LABEL: test_int_x86_avx_vpdpwssds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ ret <8 x i32> %res
+}
diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
index de8b2a41bf8c8..5748a426c76c3 100644
--- a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll
@@ -68,9 +68,9 @@ define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <16 x i8> %x1, <1
 ret <4 x i32> %res
 }
-declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>)
-define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) {
 ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_256:
 ; AVXVNNI: # %bb.0:
 ; AVXVNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2]
@@ -80,13 +80,13 @@ define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8
 ; AVX512VNNI: # %bb.0:
 ; AVX512VNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x52,0xc2]
 ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
 ret <8 x i32> %res
 }
-declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>)
-define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) {
 ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_128:
 ; AVXVNNI: # %bb.0:
 ; AVXVNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2]
@@ -96,13 +96,13 @@ define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4
 ; AVX512VNNI: # %bb.0:
 ; AVX512VNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x52,0xc2]
 ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
 ret <4 x i32> %res
 }
-declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>)
-define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) {
 ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_256:
 ; AVXVNNI: # %bb.0:
 ; AVXVNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2]
@@ -112,13 +112,13 @@ define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8
 ; AVX512VNNI: # %bb.0:
 ; AVX512VNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x53,0xc2]
 ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
+ %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2)
 ret <8 x i32> %res
 }
-declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>)
-define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) {
 ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_128:
 ; AVXVNNI: # %bb.0:
 ; AVXVNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2]
@@ -128,6 +128,6 @@ define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4
 ; AVX512VNNI: # %bb.0:
 ; AVX512VNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x53,0xc2]
 ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
+ %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2)
 ret <4 x i32> %res
 }
diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll
new file mode 100644
index 0000000000000..abdc296ae1e1c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll
@@ -0,0 +1,185 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10
+; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10
+
+define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd2,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsud_128:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd2,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd2,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsud_256:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd2,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd3,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsuds_128:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd3,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd3,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwsuds_256:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd3,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd2,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusd_128:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd2,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd2,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusd_256:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd2,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd3,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusds_128:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd3,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd3,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwusds_256:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd3,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd2,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuud_128:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd2,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd2,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuud_256:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd2,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+
+define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd3,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuuds_128:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd3,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ ret <4 x i32> %ret
+}
+declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+
+define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd3,0xc2]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX10-LABEL: test_int_x86_avx2_vpdpwuuds_256:
+; AVX10: # %bb.0:
+; AVX10-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd3,0xc2]
+; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ ret <8 x i32> %ret
+}
+declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
index abdc296ae1e1c..7576b12645bd0 100644
--- a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll
@@ -4,7 +4,7 @@
 ; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10
 ; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10
-define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd2,0xc2]
@@ -14,12 +14,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4
 ; AVX10: # %bb.0:
 ; AVX10-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd2,0xc2]
 ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
 ret <4 x i32> %ret
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
-define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd2,0xc2]
@@ -29,12 +29,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8
 ; AVX10: # %bb.0:
 ; AVX10-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd2,0xc2]
 ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
 ret <8 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
-define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd3,0xc2]
@@ -44,12 +44,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4
 ; AVX10: # %bb.0:
 ; AVX10-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd3,0xc2]
 ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
 ret <4 x i32> %ret
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
-define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd3,0xc2]
@@ -59,12 +59,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8
 ; AVX10: # %bb.0:
 ; AVX10-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd3,0xc2]
 ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
 ret <8 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
-define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd2,0xc2]
@@ -74,12 +74,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4
 ; AVX10: # %bb.0:
 ; AVX10-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd2,0xc2]
 ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
 ret <4 x i32> %ret
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
-define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd2,0xc2]
@@ -89,12 +89,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8
 ; AVX10: # %bb.0:
 ; AVX10-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd2,0xc2]
 ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
 ret <8 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
-define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd3,0xc2]
@@ -104,12 +104,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4
 ; AVX10: # %bb.0:
 ; AVX10-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd3,0xc2]
 ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
 ret <4 x i32> %ret
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
-define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd3,0xc2]
@@ -119,12 +119,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8
 ; AVX10: # %bb.0:
 ; AVX10-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd3,0xc2]
 ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
 ret <8 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
-define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd2,0xc2]
@@ -134,12 +134,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4
 ; AVX10: # %bb.0:
 ; AVX10-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd2,0xc2]
 ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
 ret <4 x i32> %ret
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
-define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd2,0xc2]
@@ -149,12 +149,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8
 ; AVX10: # %bb.0:
 ; AVX10-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd2,0xc2]
 ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
 ret <8 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
-define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd3,0xc2]
@@ -164,12 +164,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4
 ; AVX10: # %bb.0:
 ; AVX10-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd3,0xc2]
 ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+ %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
 ret <4 x i32> %ret
 }
-declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C)
+declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C)
-define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) {
+define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) {
 ; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd3,0xc2]
@@ -179,7 +179,7 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8
 ; AVX10: # %bb.0:
 ; AVX10-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd3,0xc2]
 ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3]
- %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
 ret <8 x i32> %ret
 }
-declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
index 0fd555991ae29..749b3ddc96d0d 100644
--- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
@@ -52,6 +52,63 @@ define i32 @load_ctpop_i128(ptr %p0) nounwind {
 ret i32 %res
 }
+define i32 @vector_ctpop_i128(<4 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctpop_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: popcntq %rcx, %rcx
+; SSE-NEXT: popcntq %rax, %rax
+; SSE-NEXT: addl %ecx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctpop_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: popcntq %rax, %rdx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: popcntq %rcx, %rax
+; AVX2-NEXT: addl %edx, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctpop_i128:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT: vmovq %xmm0, %rcx
+; AVX512F-NEXT: popcntq %rax, %rdx
+; AVX512F-NEXT: popcntq %rcx, %rax
+; AVX512F-NEXT: addl %edx, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctpop_i128:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512VL-NEXT: popcntq %rcx, %rcx
+; AVX512VL-NEXT: popcntq %rax, %rax
+; AVX512VL-NEXT: addl %ecx, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctpop_i128:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rax
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512POPCNT-NEXT: popcntq %rcx, %rcx
+; AVX512POPCNT-NEXT: popcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl %ecx, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <4 x i32> %v0 to i128
+ %cnt = call i128 @llvm.ctpop.i128(i128 %a0)
+ %res = trunc i128 %cnt to i32
+ ret i32 %res
+}
+
 define i32 @test_ctpop_i256(i256 %a0) nounwind {
 ; CHECK-LABEL: test_ctpop_i256:
 ; CHECK: # %bb.0:
@@ -183,6 +240,107 @@ define i32 @load_ctpop_i256(ptr %p0) nounwind {
 ret i32 %res
 }
+define i32 @vector_ctpop_i256(<8 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctpop_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: pextrq $1, %xmm0, %rax
+; SSE-NEXT: movq %xmm0, %rcx
+; SSE-NEXT: movq %xmm1, %rdx
+; SSE-NEXT: pextrq $1, %xmm1, %rsi
+; SSE-NEXT: popcntq %rsi, %rsi
+; SSE-NEXT: popcntq %rdx, %rdx
+; SSE-NEXT: addl %esi, %edx
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: popcntq %rax, %rsi
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: popcntq %rcx, %rax
+; SSE-NEXT: addl %esi, %eax
+; SSE-NEXT: addl %edx, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctpop_i256:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT: vmovq %xmm0, %rsi
+; AVX2-NEXT: popcntq %rdx, %rdx
+; AVX2-NEXT: popcntq %rsi, %rsi
+; AVX2-NEXT: addl %edx, %esi
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: popcntq %rax, %rdx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: popcntq %rcx, %rax
+; AVX2-NEXT: addl %edx, %eax
+; AVX2-NEXT: addl %esi, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctpop_i256:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT: vmovq %xmm0, %rcx
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT: vmovq %xmm0, %rsi
+; AVX512F-NEXT: popcntq %rdx, %rdx
+; AVX512F-NEXT: popcntq %rsi, %rsi
+; AVX512F-NEXT: addl %edx, %esi
+; AVX512F-NEXT: popcntq %rax, %rdx
+; AVX512F-NEXT: popcntq %rcx, %rax
+; AVX512F-NEXT: addl %edx, %eax
+; AVX512F-NEXT: addl %esi, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctpop_i256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT: vmovq %xmm0, %rcx
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vmovq %xmm0, %rdx
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512VL-NEXT: popcntq %rsi, %rsi
+; AVX512VL-NEXT: popcntq %rdx, %rdx
+; AVX512VL-NEXT: addl %esi, %edx
+; AVX512VL-NEXT: xorl %esi, %esi
+; AVX512VL-NEXT: popcntq %rax, %rsi
+; AVX512VL-NEXT: xorl %eax, %eax
+; AVX512VL-NEXT: popcntq %rcx, %rax
+; AVX512VL-NEXT: addl %esi, %eax
+; AVX512VL-NEXT: addl %edx, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctpop_i256:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rax
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rcx
+; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512POPCNT-NEXT: popcntq %rsi, %rsi
+; AVX512POPCNT-NEXT: popcntq %rdx, %rdx
+; AVX512POPCNT-NEXT: addl %esi, %edx
+; AVX512POPCNT-NEXT: xorl %esi, %esi
+; AVX512POPCNT-NEXT: popcntq %rax, %rsi
+; AVX512POPCNT-NEXT: xorl %eax, %eax
+; AVX512POPCNT-NEXT: popcntq %rcx, %rax
+; AVX512POPCNT-NEXT: addl %esi, %eax
+; AVX512POPCNT-NEXT: addl %edx, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <8 x i32> %v0 to i256
+ %cnt = call i256 @llvm.ctpop.i256(i256 %a0)
+ %res = trunc i256 %cnt to i32
+ ret i32 %res
+}
+
 define i32 @test_ctpop_i512(i512 %a0) nounwind {
 ; CHECK-LABEL: test_ctpop_i512:
 ; CHECK: # %bb.0:
@@ -404,6 +562,166 @@ define i32 @load_ctpop_i512(ptr %p0) nounwind {
 ret i32 %res
 }
+define i32 @vector_ctpop_i512(<16 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctpop_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: pextrq $1, %xmm0, %rcx
+; SSE-NEXT: movq %xmm1, %rdx
+; SSE-NEXT: pextrq $1, %xmm1, %rsi
+; SSE-NEXT: pextrq $1, %xmm2, %rdi
+; SSE-NEXT: movq %xmm2, %r8
+; SSE-NEXT: movq %xmm3, %r9
+; SSE-NEXT: pextrq $1, %xmm3, %r10
+; SSE-NEXT: popcntq %r10, %r10
+; SSE-NEXT: popcntq %r9, %r9
+; SSE-NEXT: addl %r10d, %r9d
+; SSE-NEXT: popcntq %rdi, %rdi
+; SSE-NEXT: popcntq %r8, %r8
+; SSE-NEXT: addl %edi, %r8d
+; SSE-NEXT: addl %r9d, %r8d
+; SSE-NEXT: popcntq %rsi, %rsi
+; SSE-NEXT: popcntq %rdx, %rdx
+; SSE-NEXT: addl %esi, %edx
+; SSE-NEXT: popcntq %rcx, %rcx
+; SSE-NEXT: popcntq %rax, %rax
+; SSE-NEXT: addl %ecx, %eax
+; SSE-NEXT: addl %edx, %eax
+; SSE-NEXT: addl %r8d, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctpop_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rdx
+; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT: vpextrq $1, %xmm1, %rdi
+; AVX2-NEXT: vmovq %xmm1, %r8
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpextrq $1, %xmm0, %r9
+; AVX2-NEXT: vmovq %xmm0, %r10
+; AVX2-NEXT: popcntq %r9, %r9
+; AVX2-NEXT: popcntq %r10, %r10
+; AVX2-NEXT: addl %r9d, %r10d
+; AVX2-NEXT: popcntq %rdi, %rdi
+; AVX2-NEXT: popcntq %r8, %r8
+; AVX2-NEXT: addl %edi, %r8d
+; AVX2-NEXT: addl %r10d, %r8d
+; AVX2-NEXT: popcntq %rsi, %rsi
+; AVX2-NEXT: popcntq %rdx, %rdx
+; AVX2-NEXT: addl %esi, %edx
+; AVX2-NEXT: popcntq %rcx, %rcx
+; AVX2-NEXT: popcntq %rax, %rax
+; AVX2-NEXT: addl %ecx, %eax
+; AVX2-NEXT: addl %edx, %eax
+; AVX2-NEXT: addl %r8d, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctpop_i512:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vmovq %xmm1, %rax
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT: vmovq %xmm0, %rsi
+; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rdi
+; AVX512F-NEXT: vmovq %xmm1, %r8
+; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; AVX512F-NEXT: vpextrq $1, %xmm0, %r9
+; AVX512F-NEXT: vmovq %xmm0, %r10
+; AVX512F-NEXT: popcntq %r9, %r9
+; AVX512F-NEXT: popcntq %r10, %r10
+; AVX512F-NEXT: addl %r9d, %r10d
+; AVX512F-NEXT: popcntq %rdi, %rdi
+; AVX512F-NEXT: popcntq %r8, %r8
+; AVX512F-NEXT: addl %edi, %r8d
+; AVX512F-NEXT: addl %r10d, %r8d
+; AVX512F-NEXT: popcntq %rdx, %rdx
+; AVX512F-NEXT: popcntq %rsi, %rsi
+; AVX512F-NEXT: addl %edx, %esi
+; AVX512F-NEXT: popcntq %rcx, %rcx
+; AVX512F-NEXT: popcntq %rax, %rax
+; AVX512F-NEXT: addl %ecx, %eax
+; AVX512F-NEXT: addl %esi, %eax
+; AVX512F-NEXT: addl %r8d, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctpop_i512:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT: vmovq %xmm1, %rax
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512VL-NEXT: vmovq %xmm0, %rsi
+; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; AVX512VL-NEXT: vmovq %xmm1, %rdi
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %r8
+; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; AVX512VL-NEXT: vmovq %xmm0, %r9
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %r10
+; AVX512VL-NEXT: popcntq %r10, %r10
+; AVX512VL-NEXT: popcntq %r9, %r9
+; AVX512VL-NEXT: addl %r10d, %r9d
+; AVX512VL-NEXT: popcntq %r8, %r8
+; AVX512VL-NEXT: popcntq %rdi, %rdi
+; AVX512VL-NEXT: addl %r8d, %edi
+; AVX512VL-NEXT: addl %r9d, %edi
+; AVX512VL-NEXT: popcntq %rdx, %rdx
+; AVX512VL-NEXT: popcntq %rsi, %rsi
+; AVX512VL-NEXT: addl %edx, %esi
+; AVX512VL-NEXT: popcntq %rcx, %rcx
+; AVX512VL-NEXT: popcntq %rax, %rax
+; AVX512VL-NEXT: addl %ecx, %eax
+; AVX512VL-NEXT: addl %esi, %eax
+; AVX512VL-NEXT: addl %edi, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctpop_i512:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512POPCNT-NEXT: vmovq %xmm1, %rax
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi
+; AVX512POPCNT-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; AVX512POPCNT-NEXT: vmovq %xmm1, %rdi
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %r8
+; AVX512POPCNT-NEXT: vextracti32x4 $3, %zmm0, %xmm0
+; AVX512POPCNT-NEXT: vmovq %xmm0, %r9
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %r10
+; AVX512POPCNT-NEXT: popcntq %r10, %r10
+; AVX512POPCNT-NEXT: popcntq %r9, %r9
+; AVX512POPCNT-NEXT: addl %r10d, %r9d
+; AVX512POPCNT-NEXT: popcntq %r8, %r8
+; AVX512POPCNT-NEXT: popcntq %rdi, %rdi
+; AVX512POPCNT-NEXT: addl %r8d, %edi
+; AVX512POPCNT-NEXT: addl %r9d, %edi
+; AVX512POPCNT-NEXT: popcntq %rdx, %rdx
+; AVX512POPCNT-NEXT: popcntq %rsi, %rsi
+; AVX512POPCNT-NEXT: addl %edx, %esi
+; AVX512POPCNT-NEXT: popcntq %rcx, %rcx
+; AVX512POPCNT-NEXT: popcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl %ecx, %eax
+; AVX512POPCNT-NEXT: addl %esi, %eax
+; AVX512POPCNT-NEXT: addl %edi, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: vzeroupper
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <16 x i32> %v0 to i512
+ %cnt = call i512 @llvm.ctpop.i512(i512 %a0)
+ %res = trunc i512 %cnt to i32
+ ret i32 %res
+}
+
 define i32 @test_ctpop_i1024(i1024 %a0) nounwind {
 ; SSE-LABEL: test_ctpop_i1024:
 ; SSE: # %bb.0:
@@ -969,6 +1287,75 @@ define i32 @load_ctlz_i128(ptr %p0) nounwind {
 ret i32 %res
 }
+define i32 @vector_ctlz_i128(<4 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %xmm0, %rcx
+; SSE-NEXT: pextrq $1, %xmm0, %rdx
+; SSE-NEXT: bsrq %rdx, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: movl $127, %eax
+; SSE-NEXT: bsrq %rcx, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: vector_ctlz_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: lzcntq %rcx, %rdx
+; AVX2-NEXT: lzcntq %rax, %rax
+; AVX2-NEXT: addl $64, %eax
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %edx, %eax
+; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: vector_ctlz_i128:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512F-NEXT: lzcntq %rcx, %rdx
+; AVX512F-NEXT: lzcntq %rax, %rax
+; AVX512F-NEXT: addl $64, %eax
+; AVX512F-NEXT: testq %rcx, %rcx
+; AVX512F-NEXT: cmovnel %edx, %eax
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: vector_ctlz_i128:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: lzcntq %rcx, %rdx
+; AVX512VL-NEXT: lzcntq %rax, %rax
+; AVX512VL-NEXT: addl $64, %eax
+; AVX512VL-NEXT: testq %rcx, %rcx
+; AVX512VL-NEXT: cmovnel %edx, %eax
+; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512VL-NEXT: retq
+;
+; AVX512POPCNT-LABEL: vector_ctlz_i128:
+; AVX512POPCNT: # %bb.0:
+; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512POPCNT-NEXT: vmovq %xmm0, %rax
+; AVX512POPCNT-NEXT: lzcntq %rcx, %rdx
+; AVX512POPCNT-NEXT: lzcntq %rax, %rax
+; AVX512POPCNT-NEXT: addl $64, %eax
+; AVX512POPCNT-NEXT: testq %rcx, %rcx
+; AVX512POPCNT-NEXT: cmovnel %edx, %eax
+; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512POPCNT-NEXT: retq
+ %a0 = bitcast <4 x i32> %v0 to i128
+ %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 0)
+ %res = trunc i128 %cnt to i32
+ ret i32 %res
+}
+
 define i32 @test_ctlz_i256(i256 %a0) nounwind {
 ; SSE-LABEL: test_ctlz_i256:
 ; SSE: # %bb.0:
@@ -1125,6 +1512,135 @@ define i32 @load_ctlz_i256(ptr %p0) nounwind {
 ret i32 %res
 }
+define i32 @vector_ctlz_i256(<8 x i32> %v0) nounwind {
+; SSE-LABEL: vector_ctlz_i256:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %xmm0, %rcx
+; SSE-NEXT: pextrq $1, %xmm0, %rdx
+; SSE-NEXT: movq %xmm1, %rax
+; SSE-NEXT: pextrq $1, %xmm1, %rsi
+; SSE-NEXT: bsrq %rsi, %rdi
+; SSE-NEXT: xorl $63, %edi
+; SSE-NEXT: bsrq %rax, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: orl $64, %r8d
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %edi, %r8d
+; SSE-NEXT: bsrq %rdx, %rsi
+;
SSE-NEXT: xorl $63, %esi +; SSE-NEXT: movl $127, %eax +; SSE-NEXT: bsrq %rcx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm1, %xmm1 +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: lzcntq %rsi, %rdi +; AVX2-NEXT: lzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: lzcntq %rcx, %rdi +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rsi, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512F-NEXT: lzcntq %rsi, %rdi +; AVX512F-NEXT: lzcntq %rdx, %r8 +; AVX512F-NEXT: addl $64, %r8d +; AVX512F-NEXT: testq %rsi, %rsi +; AVX512F-NEXT: cmovnel %edi, %r8d +; AVX512F-NEXT: lzcntq %rcx, %rdi +; AVX512F-NEXT: lzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edi, %eax +; AVX512F-NEXT: subl $-128, %eax +; AVX512F-NEXT: orq %rsi, %rdx +; AVX512F-NEXT: cmovnel %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rdx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512VL-NEXT: lzcntq %rsi, %rdi +; AVX512VL-NEXT: lzcntq %rdx, %r8 +; AVX512VL-NEXT: addl $64, %r8d +; AVX512VL-NEXT: testq %rsi, %rsi +; AVX512VL-NEXT: cmovnel %edi, %r8d +; AVX512VL-NEXT: lzcntq %rcx, %rdi +; AVX512VL-NEXT: lzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edi, %eax +; AVX512VL-NEXT: subl $-128, %eax +; AVX512VL-NEXT: orq %rsi, %rdx +; AVX512VL-NEXT: cmovnel %r8d, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512POPCNT-NEXT: lzcntq %rsi, %rdi +; AVX512POPCNT-NEXT: lzcntq %rdx, %r8 +; AVX512POPCNT-NEXT: addl $64, %r8d +; AVX512POPCNT-NEXT: testq %rsi, %rsi +; AVX512POPCNT-NEXT: cmovnel %edi, %r8d +; AVX512POPCNT-NEXT: lzcntq %rcx, %rdi +; AVX512POPCNT-NEXT: lzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edi, %eax +; AVX512POPCNT-NEXT: subl $-128, %eax +; AVX512POPCNT-NEXT: orq %rsi, %rdx +; AVX512POPCNT-NEXT: cmovnel 
%r8d, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <8 x i32> %v0 to i256 + %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 0) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + define i32 @test_ctlz_i512(i512 %a0) nounwind { ; SSE-LABEL: test_ctlz_i512: ; SSE: # %bb.0: @@ -1423,10 +1939,155 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind { ; AVX2-NEXT: popq %r15 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: load_ctlz_i512: +; AVX512F-LABEL: load_ctlz_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_ctlz_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512VL-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: load_ctlz_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = load i512, ptr %p0 + %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + +define i32 @vector_ctlz_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_i512: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rdx +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: pextrq $1, %xmm1, %rax +; SSE-NEXT: pextrq $1, %xmm2, %rdi +; SSE-NEXT: movq %xmm2, %rsi +; SSE-NEXT: movq %xmm3, %r8 +; SSE-NEXT: pextrq $1, %xmm3, %r9 +; SSE-NEXT: bsrq %r9, %r10 +; SSE-NEXT: xorl $63, %r10d +; SSE-NEXT: bsrq %r8, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: orl $64, %r8d +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: cmovnel %r10d, %r8d +; SSE-NEXT: bsrq %rdi, %r9 +; SSE-NEXT: xorl $63, %r9d +; SSE-NEXT: bsrq %rsi, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: orl $64, %esi +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %r9d, %esi +; SSE-NEXT: movq %xmm1, %rdi +; SSE-NEXT: subl $-128, %esi +; SSE-NEXT: ptest %xmm3, %xmm3 +; SSE-NEXT: cmovnel %r8d, %esi +; SSE-NEXT: bsrq %rax, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: bsrq %rdi, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: orl $64, %edi +; SSE-NEXT: testq %rax, %rax +; SSE-NEXT: cmovnel %r8d, %edi +; SSE-NEXT: bsrq %rcx, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: movl $127, %eax +; SSE-NEXT: bsrq %rdx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: 
cmovnel %r8d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm1, %xmm1 +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: ptest %xmm2, %xmm2 +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rcx +; AVX2-NEXT: vmovq %xmm2, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vmovq %xmm2, %r8 +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: vmovq %xmm1, %rdi +; AVX2-NEXT: vpextrq $1, %xmm1, %r9 +; AVX2-NEXT: lzcntq %rax, %r10 +; AVX2-NEXT: lzcntq %r8, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %rax, %rax +; AVX2-NEXT: cmovnel %r10d, %r11d +; AVX2-NEXT: xorl %r10d, %r10d +; AVX2-NEXT: lzcntq %r9, %r10 +; AVX2-NEXT: lzcntq %rdi, %rdi +; AVX2-NEXT: addl $64, %edi +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %r10d, %edi +; AVX2-NEXT: subl $-128, %edi +; AVX2-NEXT: orq %rax, %r8 +; AVX2-NEXT: cmovnel %r11d, %edi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rcx, %rax +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: lzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %r8d +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: lzcntq %rsi, %r9 +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: vptest %ymm1, %ymm1 +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_i512: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] -; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vplzcntq %zmm0, %zmm0 ; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 @@ -1435,10 +2096,10 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind { ; AVX512F-NEXT: vmovd %xmm1, %eax ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: load_ctlz_i512: +; AVX512VL-LABEL: vector_ctlz_i512: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] -; AVX512VL-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1 ; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 @@ -1448,10 +2109,10 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512POPCNT-LABEL: load_ctlz_i512: +; AVX512POPCNT-LABEL: vector_ctlz_i512: ; AVX512POPCNT: # %bb.0: -; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] -; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512POPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1 ; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 @@ -1460,7 +2121,7 @@ define i32 
@load_ctlz_i512(ptr %p0) nounwind { ; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: vzeroupper ; AVX512POPCNT-NEXT: retq - %a0 = load i512, ptr %p0 + %a0 = bitcast <16 x i32> %v0 to i512 %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0) %res = trunc i512 %cnt to i32 ret i32 %res @@ -2312,6 +2973,74 @@ define i32 @load_ctlz_undef_i128(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctlz_undef_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_undef_i128: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: bsrq %rcx, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: bsrq %rax, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_undef_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: lzcntq %rcx, %rdx +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_undef_i128: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: lzcntq %rcx, %rdx +; AVX512F-NEXT: lzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edx, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_undef_i128: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: lzcntq %rcx, %rdx +; AVX512VL-NEXT: lzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edx, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_undef_i128: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: lzcntq %rcx, %rdx +; AVX512POPCNT-NEXT: lzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edx, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <4 x i32> %v0 to i128 + %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 -1) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + define i32 @test_ctlz_undef_i256(i256 %a0) nounwind { ; SSE-LABEL: test_ctlz_undef_i256: ; SSE: # %bb.0: @@ -2463,6 +3192,134 @@ define i32 @load_ctlz_undef_i256(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctlz_undef_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_undef_i256: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: movq %xmm1, %rdx +; SSE-NEXT: pextrq $1, %xmm1, %rsi +; SSE-NEXT: bsrq %rsi, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: bsrq %rdx, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: orl $64, %edx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %edx +; SSE-NEXT: bsrq %rcx, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: bsrq %rax, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm1, %xmm1 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # 
kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_undef_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: lzcntq %rsi, %rdi +; AVX2-NEXT: lzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: lzcntq %rcx, %rdi +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rsi, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_undef_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512F-NEXT: lzcntq %rsi, %rdi +; AVX512F-NEXT: lzcntq %rdx, %r8 +; AVX512F-NEXT: addl $64, %r8d +; AVX512F-NEXT: testq %rsi, %rsi +; AVX512F-NEXT: cmovnel %edi, %r8d +; AVX512F-NEXT: lzcntq %rcx, %rdi +; AVX512F-NEXT: lzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edi, %eax +; AVX512F-NEXT: subl $-128, %eax +; AVX512F-NEXT: orq %rsi, %rdx +; AVX512F-NEXT: cmovnel %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_undef_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rdx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512VL-NEXT: lzcntq %rsi, %rdi +; AVX512VL-NEXT: lzcntq %rdx, %r8 +; AVX512VL-NEXT: addl $64, %r8d +; AVX512VL-NEXT: testq %rsi, %rsi +; AVX512VL-NEXT: cmovnel %edi, %r8d +; AVX512VL-NEXT: lzcntq %rcx, %rdi +; AVX512VL-NEXT: lzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edi, %eax +; AVX512VL-NEXT: subl $-128, %eax +; AVX512VL-NEXT: orq %rsi, %rdx +; AVX512VL-NEXT: cmovnel %r8d, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_undef_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512POPCNT-NEXT: lzcntq %rsi, %rdi +; AVX512POPCNT-NEXT: lzcntq %rdx, %r8 +; AVX512POPCNT-NEXT: addl $64, %r8d +; AVX512POPCNT-NEXT: testq %rsi, %rsi +; AVX512POPCNT-NEXT: cmovnel %edi, %r8d +; AVX512POPCNT-NEXT: lzcntq %rcx, %rdi +; AVX512POPCNT-NEXT: lzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edi, %eax +; AVX512POPCNT-NEXT: subl $-128, %eax +; AVX512POPCNT-NEXT: orq %rsi, %rdx +; AVX512POPCNT-NEXT: cmovnel %r8d, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <8 x i32> %v0 to i256 + %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + define i32 
@test_ctlz_undef_i512(i512 %a0) nounwind { ; SSE-LABEL: test_ctlz_undef_i512: ; SSE: # %bb.0: @@ -2796,6 +3653,147 @@ define i32 @load_ctlz_undef_i512(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctlz_undef_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_undef_i512: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: pextrq $1, %xmm1, %rax +; SSE-NEXT: pextrq $1, %xmm2, %rsi +; SSE-NEXT: movq %xmm2, %rdx +; SSE-NEXT: movq %xmm3, %rdi +; SSE-NEXT: pextrq $1, %xmm3, %r8 +; SSE-NEXT: bsrq %r8, %r9 +; SSE-NEXT: xorl $63, %r9d +; SSE-NEXT: bsrq %rdi, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: orl $64, %edi +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %r9d, %edi +; SSE-NEXT: bsrq %rsi, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: bsrq %rdx, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: orl $64, %edx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %r8d, %edx +; SSE-NEXT: movq %xmm0, %rsi +; SSE-NEXT: subl $-128, %edx +; SSE-NEXT: ptest %xmm3, %xmm3 +; SSE-NEXT: movq %xmm1, %r8 +; SSE-NEXT: cmovnel %edi, %edx +; SSE-NEXT: bsrq %rax, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: bsrq %r8, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: orl $64, %r8d +; SSE-NEXT: testq %rax, %rax +; SSE-NEXT: cmovnel %edi, %r8d +; SSE-NEXT: bsrq %rcx, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: bsrq %rsi, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm1, %xmm1 +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: ptest %xmm2, %xmm2 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_undef_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rcx +; AVX2-NEXT: vmovq %xmm2, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vmovq %xmm2, %r8 +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: vmovq %xmm1, %rdi +; AVX2-NEXT: vpextrq $1, %xmm1, %r9 +; AVX2-NEXT: lzcntq %rax, %r10 +; AVX2-NEXT: lzcntq %r8, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %rax, %rax +; AVX2-NEXT: cmovnel %r10d, %r11d +; AVX2-NEXT: xorl %r10d, %r10d +; AVX2-NEXT: lzcntq %r9, %r10 +; AVX2-NEXT: lzcntq %rdi, %rdi +; AVX2-NEXT: addl $64, %edi +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %r10d, %edi +; AVX2-NEXT: subl $-128, %edi +; AVX2-NEXT: orq %rax, %r8 +; AVX2-NEXT: cmovnel %r11d, %edi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rcx, %rax +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: lzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %r8d +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: lzcntq %rsi, %r9 +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: vptest %ymm1, %ymm1 +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_undef_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_undef_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_undef_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512POPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <16 x i32> %v0 to i512 + %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 -1) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind { ; SSE-LABEL: test_ctlz_undef_i1024: ; SSE: # %bb.0: @@ -3636,6 +4634,49 @@ define i32 @load_cttz_i128(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_cttz_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_i128: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: movq %xmm0, %rdx +; SSE-NEXT: rep bsfq %rdx, %rsi +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: tzcntq %rcx, %rdx +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: vector_cttz_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: tzcntq %rcx, %rdx +; AVX512-NEXT: tzcntq %rax, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %a0 = bitcast <4 x i32> %v0 to i128 + %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 0) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + define i32 @test_cttz_i256(i256 %a0) nounwind { ; SSE-LABEL: test_cttz_i256: ; SSE: # %bb.0: @@ -3775,21 +4816,146 @@ define i32 @load_cttz_i256(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512POPCNT-LABEL: load_cttz_i256: +; AVX512POPCNT-LABEL: load_cttz_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq 
{{.*#+}} ymm0 = [256,256,256,256] +; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = load i256, ptr %p0 + %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + +define i32 @vector_cttz_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_i256: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm1, %rcx +; SSE-NEXT: pextrq $1, %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rdx +; SSE-NEXT: rep bsfq %rdx, %rsi +; SSE-NEXT: rep bsfq %rax, %rdi +; SSE-NEXT: addl $64, %edi +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %edi +; SSE-NEXT: movq %xmm1, %rdx +; SSE-NEXT: rep bsfq %rdx, %rsi +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: vmovq %xmm0, %rsi +; AVX2-NEXT: tzcntq %rsi, %rdi +; AVX2-NEXT: tzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: tzcntq %rcx, %rdi +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rdx, %rsi +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_cttz_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vmovq %xmm1, %rcx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: tzcntq %rsi, %rdi +; AVX512F-NEXT: tzcntq %rdx, %r8 +; AVX512F-NEXT: addl $64, %r8d +; AVX512F-NEXT: testq %rsi, %rsi +; AVX512F-NEXT: cmovnel %edi, %r8d +; AVX512F-NEXT: tzcntq %rcx, %rdi +; AVX512F-NEXT: tzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edi, %eax +; AVX512F-NEXT: subl $-128, %eax +; AVX512F-NEXT: orq %rdx, %rsi +; AVX512F-NEXT: cmovnel %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_cttz_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vmovq %xmm1, %rcx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512VL-NEXT: vmovq %xmm0, %rsi +; AVX512VL-NEXT: tzcntq %rsi, %rdi +; AVX512VL-NEXT: tzcntq %rdx, %r8 +; AVX512VL-NEXT: addl $64, %r8d +; AVX512VL-NEXT: testq %rsi, %rsi +; AVX512VL-NEXT: cmovnel %edi, %r8d +; AVX512VL-NEXT: tzcntq %rcx, %rdi +; AVX512VL-NEXT: tzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edi, %eax +; AVX512VL-NEXT: subl $-128, %eax +; AVX512VL-NEXT: orq %rdx, %rsi +; AVX512VL-NEXT: cmovnel %r8d, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_cttz_i256: ; AVX512POPCNT: # %bb.0: -; AVX512POPCNT-NEXT: vmovdqu 
(%rdi), %ymm0 -; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1 -; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1 -; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1 -; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 -; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] -; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} -; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rax +; AVX512POPCNT-NEXT: vmovq %xmm1, %rcx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi +; AVX512POPCNT-NEXT: tzcntq %rsi, %rdi +; AVX512POPCNT-NEXT: tzcntq %rdx, %r8 +; AVX512POPCNT-NEXT: addl $64, %r8d +; AVX512POPCNT-NEXT: testq %rsi, %rsi +; AVX512POPCNT-NEXT: cmovnel %edi, %r8d +; AVX512POPCNT-NEXT: tzcntq %rcx, %rdi +; AVX512POPCNT-NEXT: tzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edi, %eax +; AVX512POPCNT-NEXT: subl $-128, %eax +; AVX512POPCNT-NEXT: orq %rdx, %rsi +; AVX512POPCNT-NEXT: cmovnel %r8d, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512POPCNT-NEXT: vzeroupper ; AVX512POPCNT-NEXT: retq - %a0 = load i256, ptr %p0 + %a0 = bitcast <8 x i32> %v0 to i256 %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0) %res = trunc i256 %cnt to i32 ret i32 %res @@ -4128,6 +5294,148 @@ define i32 @load_cttz_i512(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_cttz_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_i512: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm3, %rdx +; SSE-NEXT: movq %xmm3, %rcx +; SSE-NEXT: pextrq $1, %xmm2, %rax +; SSE-NEXT: pextrq $1, %xmm1, %rsi +; SSE-NEXT: movq %xmm1, %rdi +; SSE-NEXT: pextrq $1, %xmm0, %r8 +; SSE-NEXT: movq %xmm0, %r9 +; SSE-NEXT: rep bsfq %r9, %r10 +; SSE-NEXT: rep bsfq %r8, %r8 +; SSE-NEXT: addl $64, %r8d +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: cmovnel %r10d, %r8d +; SSE-NEXT: rep bsfq %rdi, %r9 +; SSE-NEXT: rep bsfq %rsi, %rsi +; SSE-NEXT: addl $64, %esi +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %r9d, %esi +; SSE-NEXT: movq %xmm2, %rdi +; SSE-NEXT: subl $-128, %esi +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %r8d, %esi +; SSE-NEXT: rep bsfq %rdi, %r8 +; SSE-NEXT: rep bsfq %rax, %r9 +; SSE-NEXT: addl $64, %r9d +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %r8d, %r9d +; SSE-NEXT: rep bsfq %rcx, %rdi +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm2, %xmm2 +; SSE-NEXT: cmovnel %r9d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: vmovq %xmm1, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rsi +; AVX2-NEXT: vpextrq $1, %xmm0, %rdi +; AVX2-NEXT: vmovq %xmm1, %r8 +; AVX2-NEXT: vmovq %xmm0, %r9 +; AVX2-NEXT: tzcntq %r9, %r10 +; AVX2-NEXT: tzcntq %rdi, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %r10d, %r11d +; 
AVX2-NEXT: xorl %r10d, %r10d +; AVX2-NEXT: tzcntq %r8, %r10 +; AVX2-NEXT: tzcntq %rsi, %rsi +; AVX2-NEXT: addl $64, %esi +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %r10d, %esi +; AVX2-NEXT: subl $-128, %esi +; AVX2-NEXT: orq %rdi, %r9 +; AVX2-NEXT: cmovnel %r11d, %esi +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: tzcntq %rdx, %rdi +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: tzcntq %rcx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: vmovq %xmm2, %rdi +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: tzcntq %rdi, %r9 +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: cmovnel %esi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_cttz_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_cttz_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_cttz_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <16 x i32> %v0 to i512 + %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + define i32 @test_cttz_i1024(i1024 %a0) nounwind { ; SSE-LABEL: test_cttz_i1024: ; SSE: # %bb.0: @@ -4930,6 +6238,48 @@ define i32 @load_cttz_undef_i128(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_cttz_undef_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_undef_i128: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rcx +; SSE-NEXT: rep bsfq %rcx, %rdx +; SSE-NEXT: rep bsfq %rax, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed 
$eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_undef_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: tzcntq %rcx, %rdx +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: vector_cttz_undef_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: tzcntq %rcx, %rdx +; AVX512-NEXT: tzcntq %rax, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %a0 = bitcast <4 x i32> %v0 to i128 + %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 -1) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + define i32 @test_cttz_undef_i256(i256 %a0) nounwind { ; SSE-LABEL: test_cttz_undef_i256: ; SSE: # %bb.0: @@ -5084,6 +6434,130 @@ define i32 @load_cttz_undef_i256(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_cttz_undef_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_undef_i256: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rcx +; SSE-NEXT: pextrq $1, %xmm0, %rdx +; SSE-NEXT: movq %xmm0, %rsi +; SSE-NEXT: rep bsfq %rsi, %rdi +; SSE-NEXT: rep bsfq %rdx, %rdx +; SSE-NEXT: addl $64, %edx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %edx +; SSE-NEXT: rep bsfq %rcx, %rsi +; SSE-NEXT: rep bsfq %rax, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_undef_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: vmovq %xmm0, %rsi +; AVX2-NEXT: tzcntq %rsi, %rdi +; AVX2-NEXT: tzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: tzcntq %rcx, %rdi +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rdx, %rsi +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_cttz_undef_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vmovq %xmm1, %rcx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: tzcntq %rsi, %rdi +; AVX512F-NEXT: tzcntq %rdx, %r8 +; AVX512F-NEXT: addl $64, %r8d +; AVX512F-NEXT: testq %rsi, %rsi +; AVX512F-NEXT: cmovnel %edi, %r8d +; AVX512F-NEXT: tzcntq %rcx, %rdi +; AVX512F-NEXT: tzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edi, %eax +; AVX512F-NEXT: subl $-128, %eax +; AVX512F-NEXT: orq %rdx, %rsi +; AVX512F-NEXT: cmovnel %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_cttz_undef_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: 
vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vmovq %xmm1, %rcx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512VL-NEXT: vmovq %xmm0, %rsi +; AVX512VL-NEXT: tzcntq %rsi, %rdi +; AVX512VL-NEXT: tzcntq %rdx, %r8 +; AVX512VL-NEXT: addl $64, %r8d +; AVX512VL-NEXT: testq %rsi, %rsi +; AVX512VL-NEXT: cmovnel %edi, %r8d +; AVX512VL-NEXT: tzcntq %rcx, %rdi +; AVX512VL-NEXT: tzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edi, %eax +; AVX512VL-NEXT: subl $-128, %eax +; AVX512VL-NEXT: orq %rdx, %rsi +; AVX512VL-NEXT: cmovnel %r8d, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_cttz_undef_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rax +; AVX512POPCNT-NEXT: vmovq %xmm1, %rcx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi +; AVX512POPCNT-NEXT: tzcntq %rsi, %rdi +; AVX512POPCNT-NEXT: tzcntq %rdx, %r8 +; AVX512POPCNT-NEXT: addl $64, %r8d +; AVX512POPCNT-NEXT: testq %rsi, %rsi +; AVX512POPCNT-NEXT: cmovnel %edi, %r8d +; AVX512POPCNT-NEXT: tzcntq %rcx, %rdi +; AVX512POPCNT-NEXT: tzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edi, %eax +; AVX512POPCNT-NEXT: subl $-128, %eax +; AVX512POPCNT-NEXT: orq %rdx, %rsi +; AVX512POPCNT-NEXT: cmovnel %r8d, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <8 x i32> %v0 to i256 + %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 -1) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + define i32 @test_cttz_undef_i512(i512 %a0) nounwind { ; SSE-LABEL: test_cttz_undef_i512: ; SSE: # %bb.0: @@ -5409,6 +6883,144 @@ define i32 @load_cttz_undef_i512(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_cttz_undef_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_undef_i512: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm3, %rax +; SSE-NEXT: pextrq $1, %xmm2, %rdx +; SSE-NEXT: pextrq $1, %xmm1, %rcx +; SSE-NEXT: movq %xmm1, %rsi +; SSE-NEXT: pextrq $1, %xmm0, %rdi +; SSE-NEXT: movq %xmm0, %r8 +; SSE-NEXT: rep bsfq %r8, %r9 +; SSE-NEXT: rep bsfq %rdi, %rdi +; SSE-NEXT: addl $64, %edi +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %r9d, %edi +; SSE-NEXT: rep bsfq %rsi, %r8 +; SSE-NEXT: rep bsfq %rcx, %rcx +; SSE-NEXT: addl $64, %ecx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %r8d, %ecx +; SSE-NEXT: movq %xmm2, %rsi +; SSE-NEXT: subl $-128, %ecx +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %edi, %ecx +; SSE-NEXT: rep bsfq %rsi, %rdi +; SSE-NEXT: rep bsfq %rdx, %rdx +; SSE-NEXT: addl $64, %edx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %edx +; SSE-NEXT: movq %xmm3, %rsi +; SSE-NEXT: rep bsfq %rsi, %rdi +; SSE-NEXT: rep bsfq %rax, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm2, %xmm2 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_undef_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vpextrq $1, %xmm1, 
%rcx +; AVX2-NEXT: vmovq %xmm1, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rsi +; AVX2-NEXT: vpextrq $1, %xmm0, %rdi +; AVX2-NEXT: vmovq %xmm1, %r8 +; AVX2-NEXT: vmovq %xmm0, %r9 +; AVX2-NEXT: tzcntq %r9, %r10 +; AVX2-NEXT: tzcntq %rdi, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %r10d, %r11d +; AVX2-NEXT: xorl %r10d, %r10d +; AVX2-NEXT: tzcntq %r8, %r10 +; AVX2-NEXT: tzcntq %rsi, %rsi +; AVX2-NEXT: addl $64, %esi +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %r10d, %esi +; AVX2-NEXT: subl $-128, %esi +; AVX2-NEXT: orq %rdi, %r9 +; AVX2-NEXT: cmovnel %r11d, %esi +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: tzcntq %rdx, %rdi +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: tzcntq %rcx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: vmovq %xmm2, %rdi +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: tzcntq %rdi, %r9 +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: cmovnel %esi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_cttz_undef_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_cttz_undef_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_cttz_undef_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <16 x i32> %v0 to i512 + %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 -1) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind { ; SSE-LABEL: test_cttz_undef_i1024: ; SSE: # %bb.0: diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll index cd576b19f8766..345fa0efa42df 100644 --- a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll @@ -4,16 +4,16 @@ 
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" -declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>) declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>) declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>) declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>) declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -23,11 +23,11 @@ define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a ; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -37,11 +37,11 @@ define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <8 x i16> %a2, <8 x i16> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -51,11 +51,11 @@ define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32 ; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -65,11 +65,11 @@ define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, ; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <16 x i16> %a2, <16 x i16> %a1) ret <8 x i32> %2 } -define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -79,11 +79,11 @@ define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> % ; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -93,11 +93,11 @@ define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <8 x i16> %a2, <8 x i16> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -107,11 +107,11 @@ define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3 ; CHECK-NEXT: {vex} vpdpwssds 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -121,7 +121,7 @@ define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1 ; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <16 x i16> %a2, <16 x i16> %a1) ret <8 x i32> %2 } diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll index 534352f322001..47537c8c4fb6e 100644 --- a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -O3 -disable-peephole -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s -declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) -declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) -declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) -declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) @@ -14,7 +14,7 @@ declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -26,11 +26,11 
@@ define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK-NEXT: # encoding: [0xc4,0xe2,0x72,0xd2,0x44,0x24,0xe8] ; CHECK-NEXT: retq # encoding: [0xc3] %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -42,11 +42,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK-NEXT: # encoding: [0xc4,0xe2,0x76,0xd2,0x44,0x24,0xd8] ; CHECK-NEXT: retq # encoding: [0xc3] %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -58,11 +58,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK-NEXT: # encoding: [0xc4,0xe2,0x72,0xd3,0x44,0x24,0xe8] ; CHECK-NEXT: retq # encoding: [0xc3] %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -74,7 +74,7 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK-NEXT: # encoding: [0xc4,0xe2,0x76,0xd3,0x44,0x24,0xd8] ; CHECK-NEXT: retq # encoding: [0xc3] %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll index 8900085af030d..9d5cabca5fef8 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll @@ -497,12 +497,12 @@ declare <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32>, <64 x i8>, <64 x i8> declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <64 x i8>, <64 x i8>) -define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { +define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <32 x i16> %__A, ptr %pB) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_dpwsud_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -510,87 +510,123 @@ define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable ; CHECK: [[BB5]]: -; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 +; CHECK-NEXT: [[__B:%.*]] = load <32 x i16>, ptr [[PB]], align 64 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]] +; CHECK-NEXT: 
[[TMP14:%.*]] = and <32 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP9]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <32 x i16> [[TMP18]] to <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) +; CHECK-NEXT: store <16 x i32> [[TMP22]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <32 x i16>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpwsuds_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = and <32 x i1> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP19]], [[TMP22]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP23]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; 
CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[TMP18]], <16 x i32> [[TMP1]] ; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], [[__W]] -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP18]] ; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] ; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> [[__W]] ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpwsud_epi32( -; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = and <32 x i1> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP10:%.*]] = and 
<32 x i1> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP20]], [[TMP23]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP24]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP19]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[TMP18]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP18]] ; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] ; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> zeroinitializer ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <32 x i16>, <32 x i16>) +declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <32 x i16>, <32 x i16>) -define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { +define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <32 x i16> %__A, ptr %pB) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_dpwusd_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -598,33 +634,57 @@ define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] 
; CHECK-NEXT: unreachable ; CHECK: [[BB5]]: -; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 +; CHECK-NEXT: [[__B:%.*]] = load <32 x i16>, ptr [[PB]], align 64 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP9]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <32 x i16> [[TMP18]] to <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP21]], [[TMP2]] +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <32 x i16>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpwusds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_mask_dpwusds_epi32(<16 x i32> %__W, i16 zeroext %__U, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpwusds_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call 
void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] @@ -636,23 +696,35 @@ define <16 x i32> @test_mm512_mask_dpwusds_epi32(<16 x i32> %__W, i16 zeroext %_ ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpwusd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_maskz_dpwusd_epi32(i16 zeroext %__U, <16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpwusd_epi32( -; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: 
[[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer @@ -664,21 +736,21 @@ define <16 x i32> @test_mm512_maskz_dpwusd_epi32(i16 zeroext %__U, <16 x i32> %_ ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32>, <32 x i16>, <32 x i16>) +declare <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <32 x i16>, <32 x i16>) -define <16 x i32> @test_mm512_dpwuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { +define <16 x i32> @test_mm512_dpwuud_epi32(<16 x i32> %__W, <32 x i16> %__A, ptr %pB) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_dpwuud_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -686,33 +758,57 @@ define <16 x i32> 
@test_mm512_dpwuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable ; CHECK: [[BB5]]: -; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 +; CHECK-NEXT: [[__B:%.*]] = load <32 x i16>, ptr [[PB]], align 64 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP9]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <32 x i16> [[TMP18]] to <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP21]], [[TMP2]] +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <32 x i16>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpwuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_mask_dpwuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpwuuds_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 
8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] @@ -724,23 +820,35 @@ define <16 x i32> @test_mm512_mask_dpwuuds_epi32(<16 x i32> %__W, i16 zeroext %_ ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpwuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_maskz_dpwuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpwuud_epi32( -; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 ; 
CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer @@ -752,14 +860,14 @@ define <16 x i32> @test_mm512_maskz_dpwuud_epi32(i16 zeroext %__U, <16 x i32> %_ ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32>, <32 x i16>, <32 x i16>) +declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <32 x i16>, <32 x i16>) define { <32 x i16>, <32 x i16>, <32 x i16> } @test_mm512_mask_mpsadbw(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) sanitize_memory { diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll index def7ba3f10770..6a53a1595271e 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll @@ -739,17 +739,29 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <32 x i8>, <32 x i8>) declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x 
i32> @test_mm_mask_dpwsud_epi32( -; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]] @@ -761,23 +773,35 @@ define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpwsuds_epi32( -; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 
x i32> [[__W:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer @@ -789,23 +813,35 @@ define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpwsuds_epi32( -; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] @@ -817,23 +853,35 @@ define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> %res } -define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpwsud_epi32( -; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: 
[[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer @@ -845,28 +893,40 @@ define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <16 x i16>, <16 x i16>) -define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> 
@test_mm_mask_dpwusd_epi32( -; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]] @@ -878,23 +938,35 @@ define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpwusds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_maskz_dpwusds_epi32(i4 zeroext %__U, <4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpwusds_epi32( -; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x 
i32> [[__W:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer @@ -906,23 +978,35 @@ define <4 x i32> @test_mm_maskz_dpwusds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpwusds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_maskz_dpwusds_epi32(<8 x i32> %__W, i8 zeroext %__U, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpwusds_epi32( -; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] @@ -934,23 +1018,35 @@ define <8 x i32> @test_mm256_maskz_dpwusds_epi32(<8 x i32> %__W, i8 zeroext %__U ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> %res } -define <8 x i32> @test_mm256_mask_dpwusd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_mask_dpwusd_epi32(i8 zeroext %__U, <8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpwusd_epi32( -; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: 
[[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer @@ -962,28 +1058,40 @@ define <8 x i32> @test_mm256_mask_dpwusd_epi32(i8 zeroext %__U, <8 x i32> %__W, ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32>, <16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32>, <16 x i16>, <16 x i16>) -define <4 x i32> @test_mm_mask_dpwuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_mask_dpwuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> 
@test_mm_mask_dpwuud_epi32( -; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]] @@ -995,23 +1103,35 @@ define <4 x i32> @test_mm_mask_dpwuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpwuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_maskz_dpwuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpwuuds_epi32( -; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x 
i32> [[__W:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer @@ -1023,23 +1143,35 @@ define <4 x i32> @test_mm_maskz_dpwuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpwuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_maskz_dpwuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpwuuds_epi32( -; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] @@ -1051,23 +1183,35 @@ define <8 x i32> @test_mm256_maskz_dpwuuds_epi32(<8 x i32> %__W, i8 zeroext %__U ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> %res } -define <8 x i32> @test_mm256_mask_dpwuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_mask_dpwuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpwuud_epi32( -; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: 
[[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer @@ -1079,16 +1223,16 @@ define <8 x i32> @test_mm256_mask_dpwuud_epi32(i8 zeroext %__U, <8 x i32> %__W, ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32>, <16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32>, <16 x i16>, <16 x i16>) define { <8 x i16>, <8 x i16>, <8 x i16> } @test_mask_mpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) sanitize_memory { diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll 
b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll index 5e937485ff282..d0daea5e68fea 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll @@ -528,10 +528,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer @@ -546,7 +546,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; @@ -574,10 +574,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer @@ -592,7 +592,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; 
CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] @@ -601,10 +601,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer @@ -619,7 +619,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer @@ -653,10 +653,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP22]], zeroinitializer @@ -671,7 +671,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP22]], <8 x i16> [[TMP5]]) ; CHECK-NEXT: store <4 x i32> 
[[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; @@ -699,10 +699,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer @@ -717,7 +717,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> @@ -728,10 +728,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer @@ -746,7 +746,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: 
[[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> @@ -783,10 +783,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer @@ -801,7 +801,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; @@ -829,10 +829,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer @@ -847,7 +847,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] @@ -856,10 +856,10 @@ define { <8 x i32>, <8 x i32> } 
@test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer @@ -874,7 +874,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer @@ -908,10 +908,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP22]], zeroinitializer @@ -926,7 +926,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP22]], <8 x i16> [[TMP5]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; @@ -954,10 +954,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP9:%.*]] 
= xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer @@ -972,7 +972,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> @@ -983,10 +983,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer @@ -1001,7 +1001,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll 
b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll index 1d3046804b74f..f2b1e16e3d5c2 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll @@ -495,10 +495,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer @@ -513,7 +513,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; @@ -541,10 +541,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer @@ -559,7 +559,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = 
bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] @@ -568,10 +568,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer @@ -586,7 +586,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer @@ -623,10 +623,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP22]], zeroinitializer @@ -641,7 +641,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP22]], <8 x i16> [[TMP5]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr 
@__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; @@ -669,10 +669,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer @@ -687,7 +687,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> @@ -698,10 +698,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer @@ -716,7 +716,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector 
<8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> @@ -757,10 +757,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer @@ -775,7 +775,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; @@ -803,10 +803,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer @@ -821,7 +821,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] @@ -830,10 +830,10 @@ define { <8 x i32>, <8 x i32> } 
@test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer @@ -848,7 +848,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer @@ -896,10 +896,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16 -; CHECK-NEXT: [[TMP26:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP26:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <8 x i16> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <8 x i16> [[TMP12]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i16> [[TMP26]], zeroinitializer @@ -914,7 +914,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <4 x i32> [[TMP23]], zeroinitializer ; CHECK-NEXT: [[TMP25:%.*]] = sext <4 x i1> [[TMP24]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP25]], [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP26]], <8 x i16> [[TMP10]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP9]] ; @@ -943,10 +943,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 
87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer @@ -961,7 +961,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> @@ -972,10 +972,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer @@ -990,7 +990,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll 
b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll index 5c99f8a3a1fb6..4e7598b92abcf 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll @@ -270,10 +270,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer @@ -288,7 +288,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP4]] ; @@ -316,10 +316,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer @@ -334,7 +334,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast 
i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] @@ -343,10 +343,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer @@ -361,7 +361,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer @@ -395,10 +395,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> % ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer @@ -413,7 +413,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> % ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 
x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP4]] ; @@ -441,10 +441,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer @@ -459,7 +459,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] @@ -468,10 +468,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer @@ -486,7 +486,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> 
[[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll index 236ff45c6cd08..0fd60149cc15e 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll @@ -251,10 +251,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer @@ -269,7 +269,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP4]] ; @@ -297,10 +297,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer @@ -315,7 +315,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]] -; 
CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] @@ -324,10 +324,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer @@ -342,7 +342,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer @@ -379,10 +379,10 @@ define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i3 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer @@ -397,7 +397,7 @@ define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i3 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <16 x 
i1> [[TMP19]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP4]] ; @@ -425,10 +425,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer @@ -443,7 +443,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] @@ -452,10 +452,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer @@ -470,7 +470,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32> ; CHECK-NEXT: 
[[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll index 0344fbd5ee2a9..0a6c5f7b089ba 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll @@ -143,10 +143,10 @@ define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer @@ -161,7 +161,7 @@ define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; @@ -178,10 +178,10 @@ define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer @@ -196,7 +196,7 @@ define <4 x 
i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; @@ -213,10 +213,10 @@ define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer @@ -231,7 +231,7 @@ define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; @@ -248,10 +248,10 @@ define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer @@ -266,7 +266,7 @@ define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x 
i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll index 707b46bb8686e..fd9e0b953d002 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll @@ -22,218 +22,362 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwsud_128( -; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) -; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]]) +; CHECK-NEXT: store <4 x i32> [[TMP17]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RET]] 
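The hunk above is representative of every change in avxvnniint16-intrinsics.ll: the old instrumentation approximated the dot product by OR-ing all three parameter shadows together (the removed _MSPROP lines), while the new expectations model the multiply per 16-bit lane. A product lane can carry poison only when a poisoned operand lane meets a lane that is not provably zero, which is what the icmp ne / and / or ladder in the new CHECK lines computes before each pair of 16-bit lanes is collapsed into the 32-bit accumulator lane it feeds. The following is a minimal self-contained sketch of that shadow computation; the value names (%b, %c, %sb, %sc, %sacc) are illustrative placeholders, not names taken from the tests:

define <4 x i32> @shadow_propagation_sketch(<8 x i16> %b, <8 x i16> %sb,
                                            <8 x i16> %c, <8 x i16> %sc,
                                            <4 x i32> %sacc) {
  ; A multiply lane is poisoned if one operand lane is poisoned and the other
  ; is not known to be zero (a guaranteed-zero operand makes the product 0).
  %bpois = icmp ne <8 x i16> %sb, zeroinitializer  ; lane of %b poisoned?
  %cpois = icmp ne <8 x i16> %sc, zeroinitializer  ; lane of %c poisoned?
  %bnz   = icmp ne <8 x i16> %b,  zeroinitializer  ; lane of %b possibly nonzero?
  %cnz   = icmp ne <8 x i16> %c,  zeroinitializer  ; lane of %c possibly nonzero?
  %pp = and <8 x i1> %bpois, %cpois   ; both lanes poisoned
  %pc = and <8 x i1> %bnz,   %cpois   ; %c poisoned, %b possibly nonzero
  %pb = and <8 x i1> %bpois, %cnz     ; %b poisoned, %c possibly nonzero
  %t    = or <8 x i1> %pp, %pc
  %pois = or <8 x i1> %t,  %pb
  ; Widen the per-i16 poison bits, then fold each adjacent pair into the
  ; <4 x i32> accumulator lane it contributes to.
  %p16 = sext <8 x i1> %pois to <8 x i16>
  %p32 = bitcast <8 x i16> %p16 to <4 x i32>
  %nz  = icmp ne <4 x i32> %p32, zeroinitializer
  %sprod = sext <4 x i1> %nz to <4 x i32>
  ; The result accumulates into the i32 operand, so OR in its shadow as well.
  %sret = or <4 x i32> %sprod, %sacc
  ret <4 x i32> %sret
}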
; - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwsud_256( -; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) -; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]]) +; CHECK-NEXT: store <8 x i32> [[TMP17]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RET]] ; - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +define <4 x i32> 
@test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128( -; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) -; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]]) +; CHECK-NEXT: store <4 x i32> [[TMP17]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RET]] ; - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256( -; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 
64), align 8 +; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) -; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]]) +; CHECK-NEXT: store <8 x i32> [[TMP17]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RET]] ; - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwusd_128( -; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: 
[[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RET]] ; - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwusd_256( -; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 
x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RET]] ; - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwusds_128( -; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RET]] ; - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 
x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwusds_256( -; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RET]] ; - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwuud_128( -; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 
x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RET]] ; - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwuud_256( -; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: 
[[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RET]] ; - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128( -; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: 
[[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RET]] ; - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256( -; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]]) ; CHECK-NEXT: store 
<8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RET]] ; - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) diff --git a/llvm/test/Transforms/InstCombine/AArch64/tbl.ll b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll new file mode 100644 index 0000000000000..8a9ca6ce635a3 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll @@ -0,0 +1,261 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +; We can turn a tbl/tbx intrinsic into a shufflevector instruction if the mask +; is constant and references 2 or fewer operands. + +; Basic tbl1 with all in-bounds indices should optimize to shufflevector. +define <16 x i8> @tbl1_basic(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_basic( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl2 with both operands the same should optimize (1 unique source). +define <16 x i8> @tbl2_duplicate_operands(<16 x i8> %a) { +; CHECK-LABEL: @tbl2_duplicate_operands( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl3 referencing 2 unique operands should optimize. +define <16 x i8> @tbl3_two_sources(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbl3_two_sources( +; CHECK-NEXT: [[TBL:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 with alternating duplicate operands should optimize (2 unique sources). +define <16 x i8> @tbl4_duplicate_operands(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbl4_duplicate_operands( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 where mask only references first two operands should optimize. +define <16 x i8> @tbl4_unused_operands(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { +; CHECK-LABEL: @tbl4_unused_operands( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 where mask only references one operand should optimize. 
+define <16 x i8> @tbl4_single_operand_used(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { +; CHECK-LABEL: @tbl4_single_operand_used( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl1 with some OOB indices should optimize (1 source + zero vector = 2 sources). +define <16 x i8> @tbl1_with_oob(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_with_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> , <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl2 with duplicate operands and OOB should optimize (1 unique source + zero vector = 2 sources). +define <16 x i8> @tbl2_duplicate_with_oob(<16 x i8> %a) { +; CHECK-LABEL: @tbl2_duplicate_with_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> , <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl2 with OOB indices should NOT optimize (2 sources + zero vector = 3 sources). +define <16 x i8> @tbl2_with_oob_bail(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbl2_with_oob_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl1 with all OOB indices should optimize to zero vector. +define <16 x i8> @tbl1_all_oob(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_all_oob( +; CHECK-NEXT: ret <16 x i8> zeroinitializer +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl3 referencing all 3 operands should NOT optimize. +define <16 x i8> @tbl3_three_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: @tbl3_three_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 referencing 3 unique operands should NOT optimize. +define <16 x i8> @tbl4_three_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: @tbl4_three_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[A]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 referencing all 4 unique operands should NOT optimize. 
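The bail cases here all reduce to one rule: after deduplicating table operands (and folding out-of-bounds lanes into a zero vector, or the fallback for tbx), a shufflevector can absorb at most two sources, and the following test trips exactly that limit. A minimal sketch of the counting involved, assuming a pre-decoded constant mask — the types and helper names are illustrative, not the actual InstCombine code:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

struct Value; // stand-in for llvm::Value

// Returns true if a constant tbl/tbx mask can be expressed as a
// two-input shufflevector. Out-of-bounds lanes contribute one extra
// virtual source: a zero vector for tbl, the fallback vector for tbx.
static bool canLowerToShuffle(const std::vector<Value *> &TableOps,
                              const std::vector<int64_t> &MaskIdx,
                              Value *FallbackOrNull, unsigned EltsPerOp) {
  std::vector<Value *> Sources;
  auto AddSource = [&](Value *V) {
    if (std::find(Sources.begin(), Sources.end(), V) == Sources.end())
      Sources.push_back(V);
  };
  bool HasOOB = false;
  for (int64_t Idx : MaskIdx) {
    if (Idx < 0)
      continue; // poison lane, constrains nothing
    uint64_t Op = uint64_t(Idx) / EltsPerOp;
    if (Op >= TableOps.size())
      HasOOB = true;
    else
      AddSource(TableOps[Op]);
  }
  if (HasOOB)
    AddSource(FallbackOrNull); // nullptr models the implicit zero vector
  return Sources.size() <= 2;  // shufflevector takes at most two inputs
}
```

Under that model, duplicate and unused operands still fold, while a third distinct source bails.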
+define <16 x i8> @tbl4_four_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { +; CHECK-LABEL: @tbl4_four_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbx1 with no OOB should optimize. +define <16 x i8> @tbx1_no_oob(<16 x i8> %fallback, <16 x i8> %a) { +; CHECK-LABEL: @tbx1_no_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx2 where fallback == second source operand should optimize (deduplicated). +define <16 x i8> @tbx2_fallback_equals_second_source(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbx2_fallback_equals_second_source( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx1 with OOB where fallback == source should optimize (deduplicated). +define <16 x i8> @tbx1_oob_fallback_same_as_source(<16 x i8> %a) { +; CHECK-LABEL: @tbx1_oob_fallback_same_as_source( +; CHECK-NEXT: [[A:%.*]] = shufflevector <16 x i8> [[A1:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[A]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx2 with OOB should NOT optimize (2 sources + fallback = 3 sources). +define <16 x i8> @tbx2_with_oob_bail(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbx2_with_oob_bail( +; CHECK-NEXT: [[TBX:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBX]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx1 with all OOB indices should optimize to fallback. +define <16 x i8> @tbx1_all_oob(<16 x i8> %fallback, <16 x i8> %a) { +; CHECK-LABEL: @tbx1_all_oob( +; CHECK-NEXT: ret <16 x i8> [[FALLBACK:%.*]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx1 with OOB and mismatched fallback/source sizes should NOT optimize. +define <8 x i8> @tbx1_fallback_size_mismatch(<8 x i8> %fallback, <16 x i8> %a) { +; CHECK-LABEL: @tbx1_fallback_size_mismatch( +; CHECK-NEXT: [[TBX:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBX]] +; + %tbx = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; tbx1 with no OOB and mismatched fallback/source sizes should optimize. 
+define <8 x i8> @tbx1_fallback_size_mismatch_no_oob(<8 x i8> %fallback, <16 x i8> %a) { +; CHECK-LABEL: @tbx1_fallback_size_mismatch_no_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbx = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; tbl1 with non-i8 element type should NOT optimize. +define <8 x i16> @tbl1_8x16(<16 x i8> %vec) { +; CHECK-LABEL: @tbl1_8x16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TBL1:%.*]] = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> [[VEC:%.*]], <8 x i16> ) +; CHECK-NEXT: ret <8 x i16> [[TBL1]] +; +entry: + ; `tbl1.v8i16` is not really a thing, but it's good to check. + %tbl1 = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> %vec, <8 x i16> ) + ret <8 x i16> %tbl1 +} + +; tbl1 with non-8/16 element count should NOT optimize. +define <12 x i8> @tbl1_16x8(<16 x i8> %vec) { +; CHECK-LABEL: @tbl1_16x8( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TBL1:%.*]] = call <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8> [[VEC:%.*]], <12 x i8> ) +; CHECK-NEXT: ret <12 x i8> [[TBL1]] +; +entry: + ; `tbl1.v12i8` is not really a thing, but it's good to check. + %tbl1 = call <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8> %vec, <12 x i8> ) + ret <12 x i8> %tbl1 +} + +; Non-constant mask should NOT optimize. +define <16 x i8> @tbl1_non_constant_mask(<16 x i8> %a, <16 x i8> %mask) { +; CHECK-LABEL: @tbl1_non_constant_mask( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[MASK:%.*]]) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %mask) + ret <16 x i8> %tbl +} + +; Mask with some poison elements should optimize, with poison propagating to output. +define <16 x i8> @tbl1_poison_mask_elements(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_poison_mask_elements( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; Mask with all poison elements should optimize to poison. +define <16 x i8> @tbl1_all_poison_mask(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_all_poison_mask( +; CHECK-NEXT: ret <16 x i8> poison +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> poison) + ret <16 x i8> %tbl +} diff --git a/llvm/test/Transforms/InstCombine/AArch64/tbl1.ll b/llvm/test/Transforms/InstCombine/AArch64/tbl1.ll deleted file mode 100644 index 362cc0f6c4493..0000000000000 --- a/llvm/test/Transforms/InstCombine/AArch64/tbl1.ll +++ /dev/null @@ -1,65 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instcombine -S | FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64" - -; Turning a table lookup intrinsic into a shuffle vector instruction -; can be beneficial. If the mask used for the lookup is the constant -; vector {7,6,5,4,3,2,1,0}, then the back-end generates rev64 -; instructions instead. 
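The rev64 remark from the deleted header is easy to reproduce from C: with a descending constant mask the lookup is a pure lane reversal, which the fold exposes as a shufflevector the backend can select as a single rev64. A small ACLE illustration (hypothetical example, AArch64 target assumed; exact codegen may vary):

```cpp
#include <arm_neon.h>

// tbl over a constant descending mask == reverse the low 8 lanes.
// Once folded to a shufflevector, the backend can emit rev64 v0.8b
// instead of materializing a mask and issuing tbl.
uint8x8_t reverse_low_half(uint8x16_t v) {
  const uint8x8_t mask = {7, 6, 5, 4, 3, 2, 1, 0};
  return vqtbl1_u8(v, mask);
}
```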
- -define <8 x i8> @tbl1_8x8(<16 x i8> %vec) { -; CHECK-LABEL: @tbl1_8x8( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[VEC:%.*]], <16 x i8> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i8> [[TMP0]] -; -entry: - %tbl1 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vec, <8 x i8> ) - ret <8 x i8> %tbl1 -} - -; Bail the optimization if a mask index is out of range. -define <8 x i8> @tbl1_8x8_out_of_range(<16 x i8> %vec) { -; CHECK-LABEL: @tbl1_8x8_out_of_range( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TBL1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VEC:%.*]], <8 x i8> ) -; CHECK-NEXT: ret <8 x i8> [[TBL1]] -; -entry: - %tbl1 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vec, <8 x i8> ) - ret <8 x i8> %tbl1 -} - -; Bail the optimization if the size of the return vector is not 8 elements. -define <16 x i8> @tbl1_16x8(<16 x i8> %vec) { -; CHECK-LABEL: @tbl1_16x8( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TBL1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[VEC:%.*]], <16 x i8> ) -; CHECK-NEXT: ret <16 x i8> [[TBL1]] -; -entry: - %tbl1 = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %vec, <16 x i8> ) - ret <16 x i8> %tbl1 -} - -; Bail the optimization if the elements of the return vector are not of type i8. -define <8 x i16> @tbl1_8x16(<16 x i8> %vec) { -; CHECK-LABEL: @tbl1_8x16( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TBL1:%.*]] = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> [[VEC:%.*]], <8 x i16> ) -; CHECK-NEXT: ret <8 x i16> [[TBL1]] -; -entry: - %tbl1 = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> %vec, <8 x i16> ) - ret <8 x i16> %tbl1 -} - -; The type <8 x i16> is not a valid return type for this intrinsic, -; but we want to test that the optimization won't trigger for vector -; elements of type different than i8. -declare <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8>, <8 x i16>) - -declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) -declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl.ll b/llvm/test/Transforms/InstCombine/ARM/tbl.ll new file mode 100644 index 0000000000000..d4d5ec284d0b7 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/ARM/tbl.ll @@ -0,0 +1,215 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv8-arm-none-eabi" + +; We can turn a vtbl/vtbx intrinsic into a shufflevector instruction if the mask +; is constant and references 2 or fewer operands. + +; Basic vtbl1 with all in-bounds indices should optimize to shufflevector. +define <8 x i8> @vtbl1_basic(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_basic( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl2 with both operands the same should be optimized (1 unique source). +define <8 x i8> @vtbl2_duplicate_operands(<8 x i8> %a) { +; CHECK-LABEL: @vtbl2_duplicate_operands( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl3 referencing 2 unique operands should optimize. 
+define <8 x i8> @vtbl3_two_sources(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbl3_two_sources( +; CHECK-NEXT: [[TBL:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 with alternating duplicate operands should optimize (2 unique sources). +define <8 x i8> @vtbl4_duplicate_operands(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbl4_duplicate_operands( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %a, <8 x i8> %b, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 where mask only references first two operands should optimize. +define <8 x i8> @vtbl4_unused_operands(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { +; CHECK-LABEL: @vtbl4_unused_operands( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 where mask only references one operand should optimize. +define <8 x i8> @vtbl4_single_operand_used(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { +; CHECK-LABEL: @vtbl4_single_operand_used( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl1 with some OOB indices should optimize (1 source + zero vector = 2 sources). +define <8 x i8> @vtbl1_with_oob(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_with_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> , <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl2 with duplicate operands and OOB should optimize (1 unique source + zero vector = 2 sources). +define <8 x i8> @vtbl2_duplicate_with_oob(<8 x i8> %a) { +; CHECK-LABEL: @vtbl2_duplicate_with_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> , <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl2 with OOB indices should NOT optimize (2 sources + zero vector = 3 sources). +define <8 x i8> @vtbl2_with_oob_bail(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbl2_with_oob_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %b, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl1 with all OOB indices should optimize to zero vector. +define <8 x i8> @vtbl1_all_oob(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_all_oob( +; CHECK-NEXT: ret <8 x i8> zeroinitializer +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl3 referencing all 3 operands should NOT optimize. 
+define <8 x i8> @vtbl3_three_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK-LABEL: @vtbl3_three_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 referencing 3 unique operands should NOT optimize. +define <8 x i8> @vtbl4_three_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK-LABEL: @vtbl4_three_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[A]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 referencing all 4 unique operands should NOT optimize. +define <8 x i8> @vtbl4_four_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { +; CHECK-LABEL: @vtbl4_four_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbx1 with no OOB should optimize. +define <8 x i8> @vtbx1_no_oob(<8 x i8> %fallback, <8 x i8> %a) { +; CHECK-LABEL: @vtbx1_no_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; vtbx2 where fallback == second source operand should optimize (deduplicated). +define <8 x i8> @vtbx2_fallback_equals_second_source(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbx2_fallback_equals_second_source( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %b, <8 x i8> %a, <8 x i8> %b, <8 x i8> ) + ret <8 x i8> %tbx +} + +; vtbx1 with OOB where fallback == source should optimize (deduplicated). +define <8 x i8> @vtbx1_oob_fallback_same_as_source(<8 x i8> %a) { +; CHECK-LABEL: @vtbx1_oob_fallback_same_as_source( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; vtbx2 with OOB should NOT optimize (2 sources + fallback = 3 sources). +define <8 x i8> @vtbx2_with_oob_bail(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbx2_with_oob_bail( +; CHECK-NEXT: [[TBX:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[FALLBACK:%.*]], <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBX]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> %b, <8 x i8> ) + ret <8 x i8> %tbx +} + +; vtbx1 with all OOB indices should optimize to fallback. 
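For the vtbx forms it is worth spelling out the per-lane semantics the checks rely on. This is a scalar reference model (a hedged sketch, not LLVM code); with every index out of range it degenerates to returning the fallback, exactly as the all-OOB test below expects:

```cpp
#include <array>
#include <cstdint>

// Reference model of vtbx1: in-range indices select from the 8-byte
// table, out-of-range indices keep the corresponding fallback lane.
std::array<uint8_t, 8> vtbx1_ref(const std::array<uint8_t, 8> &fallback,
                                 const std::array<uint8_t, 8> &table,
                                 const std::array<uint8_t, 8> &idx) {
  std::array<uint8_t, 8> out{};
  for (int lane = 0; lane < 8; ++lane)
    out[lane] = idx[lane] < 8 ? table[idx[lane]] : fallback[lane];
  return out;
}
```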
+define <8 x i8> @vtbx1_all_oob(<8 x i8> %fallback, <8 x i8> %a) { +; CHECK-LABEL: @vtbx1_all_oob( +; CHECK-NEXT: ret <8 x i8> [[FALLBACK:%.*]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; Non-constant mask should NOT optimize. +define <8 x i8> @vtbl1_non_constant_mask(<8 x i8> %a, <8 x i8> %mask) { +; CHECK-LABEL: @vtbl1_non_constant_mask( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> [[MASK:%.*]]) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %mask) + ret <8 x i8> %tbl +} + +; Mask with some poison elements should optimize, with poison propagating to output. +define <8 x i8> @vtbl1_poison_mask_elements(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_poison_mask_elements( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; Mask with all poison elements should optimize to poison. +define <8 x i8> @vtbl1_all_poison_mask(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_all_poison_mask( +; CHECK-NEXT: ret <8 x i8> poison +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> poison) + ret <8 x i8> %tbl +} diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl1.ll b/llvm/test/Transforms/InstCombine/ARM/tbl1.ll deleted file mode 100644 index fbec1a2bb7a07..0000000000000 --- a/llvm/test/Transforms/InstCombine/ARM/tbl1.ll +++ /dev/null @@ -1,35 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instcombine -S | FileCheck %s - -target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" -target triple = "armv8-arm-none-eabi" - -; Turning a table lookup intrinsic into a shuffle vector instruction -; can be beneficial. If the mask used for the lookup is the constant -; vector {7,6,5,4,3,2,1,0}, then the back-end generates rev64 -; instructions instead. - -define <8 x i8> @tbl1_8x8(<8 x i8> %vec) { -; CHECK-LABEL: @tbl1_8x8( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC:%.*]], <8 x i8> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i8> [[TMP0]] -; -entry: - %vtbl1 = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %vec, <8 x i8> ) - ret <8 x i8> %vtbl1 -} - -; Bail the optimization if a mask index is out of range. 
-define <8 x i8> @tbl1_8x8_out_of_range(<8 x i8> %vec) { -; CHECK-LABEL: @tbl1_8x8_out_of_range( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[VTBL1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[VEC:%.*]], <8 x i8> ) -; CHECK-NEXT: ret <8 x i8> [[VTBL1]] -; -entry: - %vtbl1 = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %vec, <8 x i8> ) - ret <8 x i8> %vtbl1 -} - -declare <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8>, <8 x i8>) diff --git a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll index cdbe9bb555834..7450fcccbb484 100644 --- a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll @@ -764,7 +764,7 @@ define void @sink_multiple_store_groups_noalias_via_scev(ptr %dst, ptr %src) { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE3:.*]] ] ; CHECK-NEXT: [[INDEX:%.*]] = mul i64 [[INDEX1]], 16 ; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 16 @@ -781,42 +781,30 @@ define void @sink_multiple_store_groups_noalias_via_scev(ptr %dst, ptr %src) { ; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[TMP22]], align 8, !alias.scope [[META78]] ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x double> poison, double [[TMP13]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = insertelement <2 x double> [[TMP15]], double [[TMP14]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = xor <2 x i1> [[TMP10]], splat (i1 true) ; CHECK-NEXT: [[TMP34:%.*]] = fadd <2 x double> [[WIDE_LOAD]], splat (double 8.000000e+00) -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[TMP16]], i32 0 -; CHECK-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; CHECK: [[PRED_STORE_IF]]: ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[DST]], i64 [[IV]] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[TMP34]], i32 0 -; CHECK-NEXT: store double [[TMP19]], ptr [[TMP18]], align 8, !alias.scope [[META81:![0-9]+]], !noalias [[META78]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x ptr> [[TMP31]], ptr [[TMP21]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = select <2 x i1> [[TMP10]], <2 x double> [[WIDE_LOAD]], <2 x double> [[TMP34]] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[TMP20]], i32 0 +; CHECK-NEXT: store double [[TMP32]], ptr [[TMP18]], align 8, !alias.scope [[META81:![0-9]+]], !noalias [[META78]] +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[TMP20]], i32 1 +; CHECK-NEXT: store double [[TMP33]], ptr [[TMP21]], align 8, !alias.scope [[META81]], !noalias [[META78]] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0 +; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP18]], i64 16 +; CHECK-NEXT: store double 1.000000e+01, ptr [[TMP24]], align 8, !alias.scope [[META81]], !noalias [[META78]] ; CHECK-NEXT: br label 
%[[PRED_STORE_CONTINUE]] ; CHECK: [[PRED_STORE_CONTINUE]]: -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP16]], i32 1 -; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3:.*]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 +; CHECK-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3]] ; CHECK: [[PRED_STORE_IF2]]: -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[TMP34]], i32 1 -; CHECK-NEXT: store double [[TMP33]], ptr [[TMP21]], align 8, !alias.scope [[META81]], !noalias [[META78]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP21]], i64 16 +; CHECK-NEXT: store double 1.000000e+01, ptr [[TMP35]], align 8, !alias.scope [[META81]], !noalias [[META78]] ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE3]] ; CHECK: [[PRED_STORE_CONTINUE3]]: -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0 -; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]] -; CHECK: [[PRED_STORE_IF4]]: -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[DST]], i64 [[IV]] -; CHECK-NEXT: store double [[TMP13]], ptr [[TMP31]], align 8, !alias.scope [[META81]], !noalias [[META78]] -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[TMP31]], i64 16 -; CHECK-NEXT: store double 1.000000e+01, ptr [[TMP37]], align 8, !alias.scope [[META81]], !noalias [[META78]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE5]] -; CHECK: [[PRED_STORE_CONTINUE5]]: -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 -; CHECK-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]] -; CHECK: [[PRED_STORE_IF6]]: -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP17]] -; CHECK-NEXT: store double [[TMP14]], ptr [[TMP32]], align 8, !alias.scope [[META81]], !noalias [[META78]] -; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[TMP32]], i64 16 -; CHECK-NEXT: store double 1.000000e+01, ptr [[TMP47]], align 8, !alias.scope [[META81]], !noalias [[META78]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE7]] -; CHECK: [[PRED_STORE_CONTINUE7]]: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2 ; CHECK-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 ; CHECK-NEXT: br i1 [[TMP52]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP83:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll new file mode 100644 index 0000000000000..b7d36fe7928e5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll @@ -0,0 +1,109 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=0 -S %s | FileCheck --check-prefix=LIMIT0 %s +; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefix=LIMIT1 %s + +; FIXME: Currently this miscompiles with -vectorize-memory-check-threshold=0; +; no runtime check is generated even though one is needed and !noalias +; annotations are added. 
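A C analogue makes the hazard concrete; nothing here is part of the patch, it merely restates what the IR below does. The loop re-loads *src on every iteration, so the hoist-and-broadcast form the vector body uses is only sound if dst and src cannot overlap — which is what the missing runtime check was supposed to establish:

```cpp
#include <cstdint>

// Scalar shape of runtime_checks_needed: if src points into dst, the
// store on some iteration changes what later loads of *src observe, so
// hoisting the load out of the loop (as the vector body does) changes
// the result.
void broadcast_store(uint16_t *dst, const uint16_t *src) {
  for (uint64_t i = 0; i < 1000; ++i)
    dst[i] = *src;
}
```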
+define i16 @runtime_checks_needed(ptr %src, ptr %dst) { +; LIMIT0-LABEL: define i16 @runtime_checks_needed( +; LIMIT0-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) { +; LIMIT0-NEXT: [[ENTRY:.*:]] +; LIMIT0-NEXT: br label %[[VECTOR_PH:.*]] +; LIMIT0: [[VECTOR_PH]]: +; LIMIT0-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]] +; LIMIT0-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0 +; LIMIT0-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer +; LIMIT0-NEXT: br label %[[VECTOR_BODY:.*]] +; LIMIT0: [[VECTOR_BODY]]: +; LIMIT0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; LIMIT0-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]] +; LIMIT0-NEXT: store <2 x i16> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; LIMIT0-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; LIMIT0-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; LIMIT0-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; LIMIT0: [[MIDDLE_BLOCK]]: +; LIMIT0-NEXT: br label %[[EXIT:.*]] +; LIMIT0: [[EXIT]]: +; LIMIT0-NEXT: ret i16 [[TMP0]] +; +; LIMIT1-LABEL: define i16 @runtime_checks_needed( +; LIMIT1-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) { +; LIMIT1-NEXT: [[ENTRY:.*:]] +; LIMIT1-NEXT: br label %[[VECTOR_MEMCHECK:.*]] +; LIMIT1: [[VECTOR_MEMCHECK]]: +; LIMIT1-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 2000 +; LIMIT1-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 2 +; LIMIT1-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] +; LIMIT1-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; LIMIT1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; LIMIT1-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; LIMIT1: [[VECTOR_PH]]: +; LIMIT1-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]] +; LIMIT1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0 +; LIMIT1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer +; LIMIT1-NEXT: br label %[[VECTOR_BODY:.*]] +; LIMIT1: [[VECTOR_BODY]]: +; LIMIT1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; LIMIT1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]] +; LIMIT1-NEXT: store <2 x i16> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; LIMIT1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; LIMIT1-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 +; LIMIT1-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; LIMIT1: [[MIDDLE_BLOCK]]: +; LIMIT1-NEXT: br label %[[EXIT:.*]] +; LIMIT1: [[SCALAR_PH]]: +; LIMIT1-NEXT: br label %[[LOOP:.*]] +; LIMIT1: [[LOOP]]: +; LIMIT1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; LIMIT1-NEXT: [[L:%.*]] = load i16, ptr [[SRC]], align 1 +; LIMIT1-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]] +; LIMIT1-NEXT: store i16 [[L]], ptr [[GEP_DST]], align 1 +; LIMIT1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; LIMIT1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 +; 
LIMIT1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] +; LIMIT1: [[EXIT]]: +; LIMIT1-NEXT: [[L_LCSSA:%.*]] = phi i16 [ [[L]], %[[LOOP]] ], [ [[TMP0]], %[[MIDDLE_BLOCK]] ] +; LIMIT1-NEXT: ret i16 [[L_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %l = load i16, ptr %src, align 1 + %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv + store i16 %l, ptr %gep.dst, align 1 + %iv.next = add nsw nuw i64 %iv, 1 + %ec = icmp eq i64 %iv.next, 1000 + br i1 %ec, label %exit, label %loop, !llvm.loop !0 + +exit: + ret i16 %l +} + +!0 = distinct !{!0, !2, !3} +!1 = !{!"llvm.loop.mustprogress"} +!2 = !{!"llvm.loop.vectorize.width", i32 2} +!3 = !{!"llvm.loop.vectorize.enable", i1 true} + +;. +; LIMIT0: [[META0]] = !{[[META1:![0-9]+]]} +; LIMIT0: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; LIMIT0: [[META2]] = distinct !{[[META2]], !"LVerDomain"} +; LIMIT0: [[META3]] = !{[[META4:![0-9]+]]} +; LIMIT0: [[META4]] = distinct !{[[META4]], [[META2]]} +; LIMIT0: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]} +; LIMIT0: [[META6]] = !{!"llvm.loop.isvectorized", i32 1} +; LIMIT0: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"} +;. +; LIMIT1: [[META0]] = !{[[META1:![0-9]+]]} +; LIMIT1: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; LIMIT1: [[META2]] = distinct !{[[META2]], !"LVerDomain"} +; LIMIT1: [[META3]] = !{[[META4:![0-9]+]]} +; LIMIT1: [[META4]] = distinct !{[[META4]], [[META2]]} +; LIMIT1: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]} +; LIMIT1: [[META6]] = !{!"llvm.loop.isvectorized", i32 1} +; LIMIT1: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"} +; LIMIT1: [[LOOP8]] = distinct !{[[LOOP8]], [[META6]]} +;. 
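For context on the force metadata in the test's !llvm.loop list: `llvm.loop.vectorize.width` of 2 plus `llvm.loop.vectorize.enable` is the loop-hint shape clang emits for an explicit vectorization pragma, roughly:

```cpp
// Rough C++ origin of the metadata used above (illustrative only).
void copy(short *dst, const short *src, int n) {
#pragma clang loop vectorize(enable) vectorize_width(2)
  for (int i = 0; i < n; ++i)
    dst[i] = *src;
}
```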
diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected index 215b11a746759..f4815dc331056 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.generated.expected @@ -71,21 +71,27 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: .cfi_remember_state ; CHECK-NEXT: mov w8, #1 // =0x1 -; CHECK-NEXT: mov w9, #2 // =0x2 ; CHECK-NEXT: stur xzr, [x29, #-8] -; CHECK-NEXT: bl OUTLINED_FUNCTION_0 -; CHECK-NEXT: ldur w8, [x29, #-8] -; CHECK-NEXT: cbz w8, .LBB0_2 +; CHECK-NEXT: b .LBB0_3 ; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: str w8, [sp, #16] -; CHECK-NEXT: b .LBB0_3 +; CHECK-NEXT: ldur w8, [x29, #-8] +; CHECK-NEXT: cbz w8, .LBB0_4 ; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: .cfi_restore_state ; CHECK-NEXT: mov w8, #1 // =0x1 -; CHECK-NEXT: mov w9, #2 // =0x2 -; CHECK-NEXT: bl OUTLINED_FUNCTION_0 +; CHECK-NEXT: str w8, [sp, #16] +; CHECK-NEXT: b .LBB0_5 ; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: bl OUTLINED_FUNCTION_0 +; CHECK-NEXT: ldur w8, [x29, #-8] +; CHECK-NEXT: cbnz w8, .LBB0_2 +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: mov w8, #1 // =0x1 +; CHECK-NEXT: bl OUTLINED_FUNCTION_0 +; CHECK-NEXT: .LBB0_5: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: .cfi_def_cfa wsp, 48 ; CHECK-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload @@ -128,6 +134,7 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; ; CHECK-LABEL: OUTLINED_FUNCTION_0: ; CHECK: // %bb.0: +; CHECK-NEXT: mov w9, #2 // =0x2 ; CHECK-NEXT: stp w9, w8, [x29, #-12] ; CHECK-NEXT: mov w9, #3 // =0x3 ; CHECK-NEXT: mov w8, #4 // =0x4 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected index bf7cf2b54983b..bc3500522672d 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/aarch64_generated_funcs.ll.nogenerated.expected @@ -12,21 +12,27 @@ define dso_local i32 @check_boundaries() #0 { ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: .cfi_remember_state ; CHECK-NEXT: mov w8, #1 // =0x1 -; CHECK-NEXT: mov w9, #2 // =0x2 ; CHECK-NEXT: stur xzr, [x29, #-8] -; CHECK-NEXT: bl OUTLINED_FUNCTION_0 -; CHECK-NEXT: ldur w8, [x29, #-8] -; CHECK-NEXT: cbz w8, .LBB0_2 +; CHECK-NEXT: b .LBB0_3 ; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: str w8, [sp, #16] -; CHECK-NEXT: b .LBB0_3 +; CHECK-NEXT: ldur w8, [x29, #-8] +; CHECK-NEXT: cbz w8, .LBB0_4 ; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: .cfi_restore_state ; CHECK-NEXT: mov w8, #1 // =0x1 -; CHECK-NEXT: mov w9, #2 // =0x2 -; CHECK-NEXT: bl OUTLINED_FUNCTION_0 +; CHECK-NEXT: str w8, [sp, #16] +; CHECK-NEXT: b .LBB0_5 ; CHECK-NEXT: .LBB0_3: +; CHECK-NEXT: bl OUTLINED_FUNCTION_0 +; CHECK-NEXT: ldur w8, [x29, #-8] +; CHECK-NEXT: cbnz w8, .LBB0_2 +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: mov w8, 
#1 // =0x1 +; CHECK-NEXT: bl OUTLINED_FUNCTION_0 +; CHECK-NEXT: .LBB0_5: ; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: .cfi_def_cfa wsp, 48 ; CHECK-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload diff --git a/llvm/unittests/ADT/BitVectorTest.cpp b/llvm/unittests/ADT/BitVectorTest.cpp index e13523b8e10c3..d3200b7722ee3 100644 --- a/llvm/unittests/ADT/BitVectorTest.cpp +++ b/llvm/unittests/ADT/BitVectorTest.cpp @@ -11,6 +11,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallBitVector.h" #include "gtest/gtest.h" +#include using namespace llvm; @@ -835,19 +836,27 @@ TYPED_TEST(BitVectorTest, BinOps) { A.resize(65); EXPECT_FALSE(A.anyCommon(B)); EXPECT_FALSE(B.anyCommon(B)); + EXPECT_TRUE(A.subsetOf(B)); + EXPECT_TRUE(B.subsetOf(A)); B.resize(64); A.set(64); EXPECT_FALSE(A.anyCommon(B)); EXPECT_FALSE(B.anyCommon(A)); + EXPECT_FALSE(A.subsetOf(B)); + EXPECT_TRUE(B.subsetOf(A)); B.set(63); EXPECT_FALSE(A.anyCommon(B)); EXPECT_FALSE(B.anyCommon(A)); + EXPECT_FALSE(A.subsetOf(B)); + EXPECT_FALSE(B.subsetOf(A)); A.set(63); EXPECT_TRUE(A.anyCommon(B)); EXPECT_TRUE(B.anyCommon(A)); + EXPECT_FALSE(A.subsetOf(B)); + EXPECT_TRUE(B.subsetOf(A)); B.resize(70); B.set(64); @@ -855,6 +864,87 @@ TYPED_TEST(BitVectorTest, BinOps) { A.resize(64); EXPECT_FALSE(A.anyCommon(B)); EXPECT_FALSE(B.anyCommon(A)); + EXPECT_FALSE(A.subsetOf(B)); + EXPECT_FALSE(B.subsetOf(A)); + + B.set(63); + B.reset(64); + EXPECT_TRUE(A.anyCommon(B)); + EXPECT_TRUE(B.anyCommon(A)); + EXPECT_TRUE(A.subsetOf(B)); + EXPECT_TRUE(B.subsetOf(A)); +} + +template +static inline VecType +createBitVectorFromBits(uint32_t Size, std::initializer_list SetBits) { + VecType V; + V.resize(Size); + for (int BitIndex : SetBits) + V.set(BitIndex); + return V; +} + +TYPED_TEST(BitVectorTest, BinOpsLiteral) { + // More tests of binary operations with more focus on the semantics and + // less focus on mutability. + + auto AnyCommon = [](uint32_t SizeLHS, std::initializer_list SetBitsLHS, + uint32_t SizeRHS, std::initializer_list SetBitsRHS) { + auto LHS = createBitVectorFromBits(SizeLHS, SetBitsLHS); + auto RHS = createBitVectorFromBits(SizeRHS, SetBitsRHS); + return LHS.anyCommon(RHS); + }; + auto SubsetOf = [](uint32_t SizeLHS, std::initializer_list SetBitsLHS, + uint32_t SizeRHS, std::initializer_list SetBitsRHS) { + auto LHS = createBitVectorFromBits(SizeLHS, SetBitsLHS); + auto RHS = createBitVectorFromBits(SizeRHS, SetBitsRHS); + return LHS.subsetOf(RHS); + }; + + // clang-format off + + // Test small-sized vectors. + EXPECT_TRUE (AnyCommon(10, {1, 2, 3}, 10, {3, 4, 5})); + EXPECT_FALSE(AnyCommon(10, {1, 2, 3}, 10, {4, 5})); + + EXPECT_FALSE(SubsetOf(10, {1, 2, 3}, 10, {2, 3, 4})); + EXPECT_TRUE (SubsetOf(10, {2, 3}, 10, {2, 3, 4})); + EXPECT_FALSE(SubsetOf(10, {1, 2, 3}, 10, {2, 3})); + EXPECT_TRUE (SubsetOf(10, {1, 2, 3}, 10, {1, 2, 3})); + + // Test representations of empty sets of various sizes. + EXPECT_FALSE(AnyCommon(10, {}, 10, {})); + EXPECT_FALSE(AnyCommon(10, {}, 123, {})); + EXPECT_FALSE(AnyCommon(123, {}, 10, {})); + EXPECT_FALSE(AnyCommon(123, {}, 123, {})); + EXPECT_TRUE(SubsetOf(10, {}, 10, {})); + EXPECT_TRUE(SubsetOf(10, {}, 123, {})); + EXPECT_TRUE(SubsetOf(123, {}, 10, {})); + EXPECT_TRUE(SubsetOf(123, {}, 123, {})); + + // Test handling of the remainder words. 
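The remainder-word cases that follow are where mixed sizes get interesting. Pulled out of the harness, the semantics being pinned down look like this — a minimal sketch assuming the `subsetOf` API these tests introduce:

```cpp
#include "llvm/ADT/BitVector.h"
#include <cassert>

// Bits past a vector's size read as zero: a longer RHS never breaks
// subset-ness, but a set bit beyond the RHS's size does.
void subsetSemanticsSketch() {
  llvm::BitVector A(10), B(123);
  A.set(1); A.set(2);
  B.set(1); B.set(2); B.set(70);
  assert(A.subsetOf(B));  // {1,2} is contained in {1,2,70}
  assert(!B.subsetOf(A)); // bit 70 lies beyond A's size
}
```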
+ EXPECT_FALSE(AnyCommon(10, {1, 2}, 123, {5, 70})); + EXPECT_TRUE (AnyCommon(10, {1, 2}, 123, {1, 70})); + EXPECT_FALSE(AnyCommon(123, {5, 70}, 10, {1, 2})); + EXPECT_TRUE (AnyCommon(123, {1, 70}, 10, {1, 2})); + + EXPECT_FALSE(AnyCommon(10, {1, 2}, 123, {5})); + EXPECT_TRUE (AnyCommon(10, {1, 2}, 123, {1})); + EXPECT_FALSE(AnyCommon(123, {5}, 10, {1, 2})); + EXPECT_TRUE (AnyCommon(123, {1}, 10, {1, 2})); + + EXPECT_FALSE(SubsetOf(10, {1, 2}, 123, {2, 70})); + EXPECT_TRUE (SubsetOf(10, {1, 2}, 123, {1, 2, 70})); + EXPECT_FALSE(SubsetOf(123, {2, 70}, 10, {1, 2})); + EXPECT_FALSE(SubsetOf(123, {1, 2, 70}, 10, {1, 2})); + + EXPECT_FALSE(SubsetOf(10, {1, 2}, 123, {2})); + EXPECT_TRUE (SubsetOf(10, {1, 2}, 123, {1, 2})); + EXPECT_TRUE (SubsetOf(123, {2}, 10, {1, 2})); + EXPECT_TRUE (SubsetOf(123, {1, 2}, 10, {1, 2})); + + // clang-format on } using RangeList = std::vector>; diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-doc/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-doc/BUILD.gn index 5815537318b3a..280d72f4d36b5 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-doc/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-doc/BUILD.gn @@ -22,7 +22,6 @@ static_library("clang-doc") { "ClangDoc.cpp", "Generators.cpp", "HTMLGenerator.cpp", - "HTMLMustacheGenerator.cpp", "JSONGenerator.cpp", "MDGenerator.cpp", "Mapper.cpp", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/unittests/clang-doc/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/unittests/clang-doc/BUILD.gn index 4cbd3b96d8ff4..427a64e7a8b10 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/unittests/clang-doc/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/unittests/clang-doc/BUILD.gn @@ -40,7 +40,6 @@ unittest("ClangDocTests") { "ClangDocTest.cpp", "GeneratorTest.cpp", "HTMLGeneratorTest.cpp", - "HTMLMustacheGeneratorTest.cpp", "JSONGeneratorTest.cpp", "MDGeneratorTest.cpp", "MergeTest.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/NVPTX/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/NVPTX/BUILD.gn index 7b0db8863c6f0..a6590e86e13b5 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/NVPTX/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/NVPTX/BUILD.gn @@ -44,6 +44,7 @@ static_library("LLVMNVPTXCodeGen") { "NVPTXForwardParams.cpp", "NVPTXFrameLowering.cpp", "NVPTXGenericToNVVM.cpp", + "NVPTXIRPeephole.cpp", "NVPTXISelDAGToDAG.cpp", "NVPTXISelLowering.cpp", "NVPTXImageOptimizer.cpp", diff --git a/mlir/include/mlir-c/Dialect/IRDL.h b/mlir/include/mlir-c/Dialect/IRDL.h index c4d6ffd989af9..d87ab864fb33f 100644 --- a/mlir/include/mlir-c/Dialect/IRDL.h +++ b/mlir/include/mlir-c/Dialect/IRDL.h @@ -22,6 +22,20 @@ MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(IRDL, irdl); /// the module's associated context. 
MLIR_CAPI_EXPORTED MlirLogicalResult mlirLoadIRDLDialects(MlirModule module); +//===----------------------------------------------------------------------===// +// VariadicityAttr +//===----------------------------------------------------------------------===// + +MLIR_CAPI_EXPORTED MlirAttribute +mlirIRDLVariadicityAttrGet(MlirContext ctx, MlirStringRef value); + +//===----------------------------------------------------------------------===// +// VariadicityArrayAttr +//===----------------------------------------------------------------------===// + +MLIR_CAPI_EXPORTED MlirAttribute mlirIRDLVariadicityArrayAttrGet( + MlirContext ctx, intptr_t nValues, MlirAttribute const *values); + #ifdef __cplusplus } #endif diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsTiling.h b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsTiling.h new file mode 100644 index 0000000000000..6fcb706aa3488 --- /dev/null +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCUtilsTiling.h @@ -0,0 +1,83 @@ +//===- OpenACCUtilsTiling.h - OpenACC Loop Tiling Utilities -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains utility functions for tiling OpenACC loops. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_OPENACC_OPENACCUTILSTILING_H_ +#define MLIR_DIALECT_OPENACC_OPENACCUTILSTILING_H_ + +#include "mlir/Dialect/OpenACC/OpenACC.h" +#include "mlir/IR/PatternMatch.h" +#include "llvm/ADT/SmallVector.h" + +namespace mlir { +namespace acc { + +/// Uncollapse tile loops with multiple IVs and collapseCount < tileCount. +/// This is used to prepare loops for tiling when the collapse count is less +/// than the tile count. +/// +/// \param origLoop The original loop operation to uncollapse. +/// \param tileCount The number of tile dimensions. +/// \param collapseCount The collapse count from the original loop. +/// \param rewriter The rewriter to use for modifications. +/// \return A vector of uncollapsed loop operations. +llvm::SmallVector +uncollapseLoops(mlir::acc::LoopOp origLoop, unsigned tileCount, + unsigned collapseCount, mlir::RewriterBase &rewriter); + +/// Tile ACC loops according to the given tile sizes. +/// +/// Tiling a 2-level nested loop will create two 'tile' loops containing two +/// 'element' loops. The transformation looks like: +/// +/// Before Tiling: +/// \code +/// #pragma acc loop tile(tile_size1, tile_size2) +/// for (i = lb1; i < ub1; i += step1) { // original loop +/// for (j = lb2; j < ub2; j += step2) { +/// a[i,j] = i + j; +/// } +/// } +/// \endcode +/// +/// After Tiling: +/// \code +/// for (i = lb1; i < ub1; i += (step1 * tile_size1)) { // tile loop 1 +/// for (j = lb2; j < ub2; j += (step2 * tile_size2)) { // tile loop 2 +/// for (ii = i; ii < min(ub1, (step1 * tile_size1) + i); ii += step1) { +/// // element loop 1 +/// for (jj = j; jj < min(ub2, (step2 * tile_size2) + j); jj += step2) +/// { // element loop 2 +/// a[ii,jj] = i + j; +/// } +/// } +/// } +/// } +/// \endcode +/// +/// Unknown tile sizes (represented as -1 in acc dialect for `tile(*)`) are +/// resolved to the provided default tile size. +/// +/// \param tileLoops The loops to tile (outermost first). +/// \param tileSizes The tile sizes for each dimension. 
Values of -1 are
+/// treated as unknown and resolved to defaultTileSize.
+/// \param defaultTileSize The default tile size to use for unknown (*) tiles.
+/// \param rewriter The rewriter to use for modifications.
+/// \return The outermost loop after tiling.
+mlir::acc::LoopOp tileACCLoops(llvm::SmallVector<mlir::acc::LoopOp> &tileLoops,
+                               const llvm::SmallVector<mlir::Value> &tileSizes,
+                               int32_t defaultTileSize,
+                               mlir::RewriterBase &rewriter);
+
+} // namespace acc
+} // namespace mlir
+
+#endif // MLIR_DIALECT_OPENACC_OPENACCUTILSTILING_H_
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 93c5187b00756..eae0bd4e68a84 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -223,6 +223,14 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
     InterfaceMethod<"Derive a new layout by dropping InstData",
                     "xegpu::DistributeLayoutAttr",
                     "dropInstData">,
+    InterfaceMethod<"Derive a new layout with sg_data, inst_data and lane_data set to 1 for the specified unit dims",
+                    "xegpu::DistributeLayoutAttr",
+                    "setUnitDimData",
+                    /*args=*/(ins "const llvm::SetVector<int64_t> &": $unitDims)>,
+    InterfaceMethod<"Derive a new layout with sg_lane and lane_layout set to 1 for the specified unit dims",
+                    "xegpu::DistributeLayoutAttr",
+                    "setUnitDimLayout",
+                    /*args=*/(ins "const llvm::SetVector<int64_t> &": $unitDims)>,
     InterfaceMethod<[{Delinearizes a linear ID into its multidimensional
                      indices based on the effective layout level.}],
                     "FailureOr<SmallVector<Value>>",
@@ -283,9 +291,14 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
           }
           return true;
         }]>,
-    InterfaceMethod,
+
+    InterfaceMethod
   ];
 }
@@ -487,6 +500,12 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
       return {};
     }
 
+    // Set the layout for the specified unit dims: sg_data, inst_data and
+    // lane_data to 1.
+    DistributeLayoutAttr setUnitDimData(SetVector<int64_t> unitDims);
+
+    // Set the layout for the specified unit dims: sg_lane and lane_layout to 1.
+    DistributeLayoutAttr setUnitDimLayout(SetVector<int64_t> unitDims);
+
     /// Delinearizes a linear ID into its multidimensional indices
     /// based on the effective level of the layout.
     FailureOr<SmallVector<Value>>
@@ -501,6 +520,9 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
 
     /// Check if this is slice of some other layout.
     bool isSliceOf(const xegpu::DistributeLayoutAttr &other) { return false; }
+
+    /// Check if this is identical to some other layout.
+    bool isEqualTo(const xegpu::DistributeLayoutAttr &other);
   }];
 
@@ -649,6 +671,12 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> {
       return SliceAttr::get(getContext(), parent, attr.getDims());
     }
 
+    // Set the layout for the specified unit dims: sg_data, inst_data and
+    // lane_data to 1.
+    DistributeLayoutAttr setUnitDimData(SetVector<int64_t> unitDims);
+
+    // Set the layout for the specified unit dims: sg_lane and lane_layout to 1.
+    DistributeLayoutAttr setUnitDimLayout(SetVector<int64_t> unitDims);
+
     /// flatten a nested SliceAttr, e.g., for 2-level nested SliceAttr
     /// #xegpu.slice<#xegpu.slice<#xegpu.layout, dims = [0]>, dims = [0]>
     /// it will coalesce two slice operations and return a simplified SliceAttr
@@ -670,7 +698,9 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> {
 
     /// Check if this is slice of some other layout.
     bool isSliceOf(const xegpu::DistributeLayoutAttr &other);
-
+
+    /// Check if this is identical to some other layout.
+    bool isEqualTo(const xegpu::DistributeLayoutAttr &other);
   }];
 
   let assemblyFormat = "`<` qualified($parent) `,` `dims` `=` $dims `>`";
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 4fe1087d18879..b54d620c3c0c3 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -405,7 +405,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
                        OptionalAttr: $transpose,
                        OptionalAttr: $l1_hint,
                        OptionalAttr: $l2_hint,
-                       OptionalAttr: $l3_hint,
+                       OptionalAttr: $l3_hint,
                        OptionalAttr:$layout);
   let results = (outs XeGPU_ValueType: $value);
diff --git a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
index 1659437e1eb24..0ac5fc5358ea5 100644
--- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
+++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
@@ -27,6 +27,7 @@
 #include
 #include
+#include <deque>
 #include
 #include
 #include
@@ -830,6 +831,23 @@ namespace {
 /// This class provides support for reading attribute and type entries from the
 /// bytecode. Attribute and Type entries are read lazily on demand, so we use
 /// this reader to manage when to actually parse them from the bytecode.
+///
+/// The parsing of attributes and types is generally recursive, which can lead
+/// to stack overflows for deeply nested structures, so we track a few extra
+/// pieces of information to avoid this:
+///
+/// - `depth`: The current depth while parsing nested attributes. We defer on
+///   parsing deeply nested attributes to avoid potential stack overflows. The
+///   deferred parsing is achieved by reporting a failure when parsing a nested
+///   attribute/type and registering the index of the encountered attribute/type
+///   in the deferred parsing worklist. Hence, a failure with a deferred entry
+///   does not constitute a hard failure; it also requires that callers return
+///   on the first failure rather than attempting additional parses.
+/// - `deferredWorklist`: A list of attribute/type indices that we could not
+///   parse due to hitting the depth limit. The worklist is used to capture the
+///   indices of attributes/types that need to be parsed/reparsed when we hit
+///   the depth limit. This enables moving the tracking of what needs to be
+///   parsed to the heap.
 class AttrTypeReader {
   /// This class represents a single attribute or type entry.
   template <typename T>
@@ -863,12 +881,34 @@ class AttrTypeReader {
                           ArrayRef<uint8_t> sectionData,
                           ArrayRef<uint8_t> offsetSectionData);
 
+  LogicalResult readAttribute(uint64_t index, Attribute &result,
+                              uint64_t depth = 0) {
+    return readEntry(attributes, index, result, "attribute", depth);
+  }
+
+  LogicalResult readType(uint64_t index, Type &result, uint64_t depth = 0) {
+    return readEntry(types, index, result, "type", depth);
+  }
+
   /// Resolve the attribute or type at the given index. Returns nullptr on
   /// failure.
-  Attribute resolveAttribute(size_t index) {
-    return resolveEntry(attributes, index, "Attribute");
+  Attribute resolveAttribute(size_t index, uint64_t depth = 0) {
+    return resolveEntry(attributes, index, "Attribute", depth);
+  }
+  Type resolveType(size_t index, uint64_t depth = 0) {
+    return resolveEntry(types, index, "Type", depth);
+  }
+
+  Attribute getAttributeOrSentinel(size_t index) {
+    if (index >= attributes.size())
+      return nullptr;
+    return attributes[index].entry;
+  }
+  Type getTypeOrSentinel(size_t index) {
+    if (index >= types.size())
+      return nullptr;
+    return types[index].entry;
   }
-  Type resolveType(size_t index) { return resolveEntry(types, index, "Type"); }
 
   /// Parse a reference to an attribute or type using the given reader.
   LogicalResult parseAttribute(EncodingReader &reader, Attribute &result) {
@@ -909,23 +949,33 @@ class AttrTypeReader {
                      llvm::getTypeName<T>(), ", but got: ", baseResult);
   }
 
+  /// Add an index to the deferred worklist for re-parsing.
+  void addDeferredParsing(uint64_t index) { deferredWorklist.push_back(index); }
+
 private:
   /// Resolve the given entry at `index`.
   template <typename T>
-  T resolveEntry(SmallVectorImpl<Entry<T>> &entries, size_t index,
-                 StringRef entryType);
+  T resolveEntry(SmallVectorImpl<Entry<T>> &entries, uint64_t index,
+                 StringRef entryType, uint64_t depth = 0);
 
-  /// Parse an entry using the given reader that was encoded using the textual
-  /// assembly format.
+  /// Read the entry at the given index, returning failure if the entry is not
+  /// yet resolved.
   template <typename T>
-  LogicalResult parseAsmEntry(T &result, EncodingReader &reader,
-                              StringRef entryType);
+  LogicalResult readEntry(SmallVectorImpl<Entry<T>> &entries, uint64_t index,
+                          T &result, StringRef entryType, uint64_t depth);
 
   /// Parse an entry using the given reader that was encoded using a custom
   /// bytecode format.
   template <typename T>
   LogicalResult parseCustomEntry(Entry<T> &entry, EncodingReader &reader,
-                                 StringRef entryType);
+                                 StringRef entryType, uint64_t index,
+                                 uint64_t depth);
+
+  /// Parse an entry using the given reader that was encoded using the textual
+  /// assembly format.
+  template <typename T>
+  LogicalResult parseAsmEntry(T &result, EncodingReader &reader,
+                              StringRef entryType);
 
   /// The string section reader used to resolve string references when parsing
   /// custom encoded attribute/type entries.
@@ -951,6 +1001,10 @@ class AttrTypeReader {
 
   /// Reference to the parser configuration.
   const ParserConfig &parserConfig;
+
+  /// Worklist for deferred attribute/type parsing. This is used to handle
+  /// deeply nested structures like CallSiteLoc iteratively.
+  std::vector<uint64_t> deferredWorklist;
 };
 
 class DialectReader : public DialectBytecodeReader {
 public:
   DialectReader(AttrTypeReader &attrTypeReader,
                 const StringSectionReader &stringReader,
                 const ResourceSectionReader &resourceReader,
                 const llvm::StringMap<BytecodeDialect *> &dialectsMap,
-                EncodingReader &reader, uint64_t &bytecodeVersion)
+                EncodingReader &reader, uint64_t &bytecodeVersion,
+                uint64_t depth = 0)
       : attrTypeReader(attrTypeReader), stringReader(stringReader),
         resourceReader(resourceReader), dialectsMap(dialectsMap),
-        reader(reader), bytecodeVersion(bytecodeVersion) {}
+        reader(reader), bytecodeVersion(bytecodeVersion), depth(depth) {}
 
   InFlightDiagnostic emitError(const Twine &msg) const override {
     return reader.emitError(msg);
@@ -998,14 +1053,40 @@ class DialectReader : public DialectBytecodeReader {
   // IR
   //===--------------------------------------------------------------------===//
 
+  /// The maximum depth to eagerly parse nested attributes/types before
+  /// deferring.
+  static constexpr uint64_t maxAttrTypeDepth = 5;
+
   LogicalResult readAttribute(Attribute &result) override {
-    return attrTypeReader.parseAttribute(reader, result);
+    uint64_t index;
+    if (failed(reader.parseVarInt(index)))
+      return failure();
+    if (depth > maxAttrTypeDepth) {
+      if (Attribute attr = attrTypeReader.getAttributeOrSentinel(index)) {
+        result = attr;
+        return success();
+      }
+      attrTypeReader.addDeferredParsing(index);
+      return failure();
+    }
+    return attrTypeReader.readAttribute(index, result, depth + 1);
   }
   LogicalResult readOptionalAttribute(Attribute &result) override {
     return attrTypeReader.parseOptionalAttribute(reader, result);
   }
 
   LogicalResult readType(Type &result) override {
-    return attrTypeReader.parseType(reader, result);
+    uint64_t index;
+    if (failed(reader.parseVarInt(index)))
+      return failure();
+    if (depth > maxAttrTypeDepth) {
+      if (Type type = attrTypeReader.getTypeOrSentinel(index)) {
+        result = type;
+        return success();
+      }
+      attrTypeReader.addDeferredParsing(index);
+      return failure();
+    }
+    return attrTypeReader.readType(index, result, depth + 1);
   }
 
   FailureOr<AsmDialectResourceHandle> readResourceHandle() override {
@@ -1095,6 +1176,7 @@ class DialectReader : public DialectBytecodeReader {
   const llvm::StringMap<BytecodeDialect *> &dialectsMap;
   EncodingReader &reader;
   uint64_t &bytecodeVersion;
+  uint64_t depth;
 };
 
 /// Wraps the properties section and handles reading properties out of it.
@@ -1238,69 +1320,112 @@ LogicalResult AttrTypeReader::initialize(
 }
 
 template <typename T>
-T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries, size_t index,
-                               StringRef entryType) {
+T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries,
+                               uint64_t index, StringRef entryType,
+                               uint64_t depth) {
   if (index >= entries.size()) {
     emitError(fileLoc) << "invalid " << entryType << " index: " << index;
     return {};
   }
 
-  // If the entry has already been resolved, there is nothing left to do.
-  Entry<T> &entry = entries[index];
-  if (entry.entry)
-    return entry.entry;
+  // Fast path: Try direct parsing without worklist overhead. This handles the
+  // common case where there are no deferred dependencies.
+  assert(deferredWorklist.empty());
+  T result;
+  if (succeeded(readEntry(entries, index, result, entryType, depth))) {
+    assert(deferredWorklist.empty());
+    return result;
+  }
+  if (deferredWorklist.empty()) {
+    // A failure with no deferred entries is an error.
+    return T();
+  }
 
-  // Parse the entry.
-  EncodingReader reader(entry.data, fileLoc);
+  // Slow path: Use a worklist to handle deferred dependencies. Use a deque to
+  // iteratively resolve entries with dependencies:
+  //   - Pop from front to process
+  //   - Push new dependencies to front (depth-first)
+  //   - Move failed entries to back (retry after dependencies)
+  std::deque<uint64_t> worklist;
+  llvm::DenseSet<uint64_t> inWorklist;
 
-  // Parse based on how the entry was encoded.
-  if (entry.hasCustomEncoding) {
-    if (failed(parseCustomEntry(entry, reader, entryType)))
-      return T();
-  } else if (failed(parseAsmEntry(entry.entry, reader, entryType))) {
-    return T();
+  // Add the original index and any dependencies from the fast path attempt.
+  worklist.push_back(index);
+  inWorklist.insert(index);
+  for (uint64_t idx : llvm::reverse(deferredWorklist)) {
+    if (inWorklist.insert(idx).second)
+      worklist.push_front(idx);
   }
 
-  if (!reader.empty()) {
-    reader.emitError("unexpected trailing bytes after " + entryType + " entry");
-    return T();
+  while (!worklist.empty()) {
+    size_t currentIndex = worklist.front();
+    worklist.pop_front();
+
+    // Clear the deferred worklist before parsing to capture any new entries.
+    deferredWorklist.clear();
+
+    T result;
+    if (succeeded(readEntry(entries, currentIndex, result, entryType, depth))) {
+      inWorklist.erase(currentIndex);
+      continue;
+    }
+
+    if (deferredWorklist.empty()) {
+      // Parsing failed with no deferred entries, which implies an error.
+      return T();
+    }
+
+    // Move this entry to the back to retry after its dependencies.
+    worklist.push_back(currentIndex);
+
+    // Add dependencies to the front (in reverse so they maintain order).
+    for (uint64_t idx : llvm::reverse(deferredWorklist)) {
+      if (inWorklist.insert(idx).second)
+        worklist.push_front(idx);
    }
+    deferredWorklist.clear();
   }
-  return entry.entry;
+  return entries[index].entry;
 }
 
 template <typename T>
-LogicalResult AttrTypeReader::parseAsmEntry(T &result, EncodingReader &reader,
-                                            StringRef entryType) {
-  StringRef asmStr;
-  if (failed(reader.parseNullTerminatedString(asmStr)))
-    return failure();
+LogicalResult AttrTypeReader::readEntry(SmallVectorImpl<Entry<T>> &entries,
+                                        uint64_t index, T &result,
+                                        StringRef entryType, uint64_t depth) {
+  if (index >= entries.size())
+    return emitError(fileLoc) << "invalid " << entryType << " index: " << index;
 
-  // Invoke the MLIR assembly parser to parse the entry text.
-  size_t numRead = 0;
-  MLIRContext *context = fileLoc->getContext();
-  if constexpr (std::is_same_v<T, Type>)
-    result =
-        ::parseType(asmStr, context, &numRead, /*isKnownNullTerminated=*/true);
-  else
-    result = ::parseAttribute(asmStr, context, Type(), &numRead,
-                              /*isKnownNullTerminated=*/true);
-  if (!result)
+  // If the entry has already been resolved, return it.
+  Entry<T> &entry = entries[index];
+  if (entry.entry) {
+    result = entry.entry;
+    return success();
+  }
+
+  // If the entry hasn't been resolved, try to parse it.
+  EncodingReader reader(entry.data, fileLoc);
+  LogicalResult parseResult =
+      entry.hasCustomEncoding
+          ? parseCustomEntry(entry, reader, entryType, index, depth)
+          : parseAsmEntry(entry.entry, reader, entryType);
+  if (failed(parseResult))
     return failure();
 
-  // Ensure there weren't dangling characters after the entry.
-  if (numRead != asmStr.size()) {
-    return reader.emitError("trailing characters found after ", entryType,
-                            " assembly format: ", asmStr.drop_front(numRead));
-  }
+  if (!reader.empty())
+    return reader.emitError("unexpected trailing bytes after " + entryType +
+                            " entry");
+
+  result = entry.entry;
   return success();
 }
 
 template <typename T>
 LogicalResult AttrTypeReader::parseCustomEntry(Entry<T> &entry,
                                                EncodingReader &reader,
-                                               StringRef entryType) {
+                                               StringRef entryType,
+                                               uint64_t index, uint64_t depth) {
   DialectReader dialectReader(*this, stringReader, resourceReader, dialectsMap,
-                              reader, bytecodeVersion);
+                              reader, bytecodeVersion, depth);
   if (failed(entry.dialect->load(dialectReader, fileLoc.getContext())))
     return failure();
@@ -1350,6 +1475,33 @@ LogicalResult AttrTypeReader::parseCustomEntry(Entry<T> &entry,
   return success(!!entry.entry);
 }
 
+template <typename T>
+LogicalResult AttrTypeReader::parseAsmEntry(T &result, EncodingReader &reader,
+                                            StringRef entryType) {
+  StringRef asmStr;
+  if (failed(reader.parseNullTerminatedString(asmStr)))
+    return failure();
+
+  // Invoke the MLIR assembly parser to parse the entry text.
+  size_t numRead = 0;
+  MLIRContext *context = fileLoc->getContext();
+  if constexpr (std::is_same_v<T, Type>)
+    result =
+        ::parseType(asmStr, context, &numRead, /*isKnownNullTerminated=*/true);
+  else
+    result = ::parseAttribute(asmStr, context, Type(), &numRead,
+                              /*isKnownNullTerminated=*/true);
+  if (!result)
+    return failure();
+
+  // Ensure there weren't dangling characters after the entry.
+  if (numRead != asmStr.size()) {
+    return reader.emitError("trailing characters found after ", entryType,
+                            " assembly format: ", asmStr.drop_front(numRead));
+  }
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // Bytecode Reader
 //===----------------------------------------------------------------------===//
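The deferral scheme that `readAttribute`/`readType`, `addDeferredParsing`, and the worklist in `resolveEntry` implement is easier to see stripped of the bytecode details. Below is a minimal, self-contained sketch of the same control flow; `ToyEntry`, `ToyResolver`, and `kMaxDepth` are illustrative stand-ins for the real reader (which decodes entries from bytecode and caps recursion with `maxAttrTypeDepth`), and the sketch assumes acyclic dependencies.

```
// Toy model of the reader's depth-limited, worklist-driven resolution.
// Illustrative only: real entries are bytecode-encoded attributes/types.
#include <cstdint>
#include <deque>
#include <utility>
#include <vector>

struct ToyEntry {
  std::vector<uint64_t> deps; // indices that must resolve before this one
  bool resolved = false;
};

class ToyResolver {
  static constexpr uint64_t kMaxDepth = 5; // mirrors maxAttrTypeDepth
  std::vector<ToyEntry> entries;
  std::vector<uint64_t> deferred; // mirrors deferredWorklist

  // Mirrors readEntry: recursive, but defers instead of recursing too deep.
  bool read(uint64_t index, uint64_t depth) {
    ToyEntry &e = entries[index];
    if (e.resolved)
      return true;
    for (uint64_t dep : e.deps) {
      if (depth > kMaxDepth) {
        deferred.push_back(dep); // soft failure: retried from the worklist
        return false;
      }
      if (!read(dep, depth + 1))
        return false; // return on the first failure, as the doc requires
    }
    return e.resolved = true;
  }

public:
  explicit ToyResolver(std::vector<ToyEntry> es) : entries(std::move(es)) {}

  // Mirrors resolveEntry: fast path first, then the deque-based slow path.
  bool resolve(uint64_t index) {
    if (read(index, 0))
      return true; // fast path: no deferrals at all
    if (deferred.empty())
      return false; // hard failure
    std::deque<uint64_t> worklist{index};
    for (auto it = deferred.rbegin(); it != deferred.rend(); ++it)
      worklist.push_front(*it); // dependencies first (depth-first order)
    while (!worklist.empty()) {
      uint64_t cur = worklist.front();
      worklist.pop_front();
      deferred.clear();
      if (read(cur, 0))
        continue;
      if (deferred.empty())
        return false;          // a real error, not a deferral
      worklist.push_back(cur); // retry after its dependencies
      for (auto it = deferred.rbegin(); it != deferred.rend(); ++it)
        worklist.push_front(*it);
    }
    return true;
  }
};
```

(The real implementation also deduplicates worklist entries with a `DenseSet`, which this sketch omits for brevity.)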
diff --git a/mlir/lib/CAPI/Dialect/IRDL.cpp b/mlir/lib/CAPI/Dialect/IRDL.cpp
index cb9dc8ceb6795..43f420bf6db05 100644
--- a/mlir/lib/CAPI/Dialect/IRDL.cpp
+++ b/mlir/lib/CAPI/Dialect/IRDL.cpp
@@ -16,3 +16,30 @@ MLIR_DEFINE_CAPI_DIALECT_REGISTRATION(IRDL, irdl, mlir::irdl::IRDLDialect)
 MlirLogicalResult mlirLoadIRDLDialects(MlirModule module) {
   return wrap(mlir::irdl::loadDialects(unwrap(module)));
 }
+
+//===----------------------------------------------------------------------===//
+// VariadicityAttr
+//===----------------------------------------------------------------------===//
+
+MlirAttribute mlirIRDLVariadicityAttrGet(MlirContext ctx, MlirStringRef value) {
+  return wrap(mlir::irdl::VariadicityAttr::get(
+      unwrap(ctx), mlir::irdl::symbolizeVariadicity(unwrap(value)).value()));
+}
+
+//===----------------------------------------------------------------------===//
+// VariadicityArrayAttr
+//===----------------------------------------------------------------------===//
+
+MlirAttribute mlirIRDLVariadicityArrayAttrGet(MlirContext ctx, intptr_t nValues,
+                                              MlirAttribute const *values) {
+  llvm::SmallVector<Attribute> attrs;
+  llvm::ArrayRef<Attribute> unwrappedAttrs =
+      unwrapList(nValues, values, attrs);
+
+  llvm::SmallVector<mlir::irdl::VariadicityAttr> variadicities;
+  for (auto attr : unwrappedAttrs)
+    variadicities.push_back(llvm::cast<mlir::irdl::VariadicityAttr>(attr));
+
+  return wrap(
+      mlir::irdl::VariadicityArrayAttr::get(unwrap(ctx), variadicities));
+}
diff --git a/mlir/lib/Dialect/OpenACC/Utils/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/Utils/CMakeLists.txt
index 68e124625921f..c3de4f7e3e282 100644
--- a/mlir/lib/Dialect/OpenACC/Utils/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenACC/Utils/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_mlir_dialect_library(MLIROpenACCUtils
+  OpenACCUtilsTiling.cpp
   OpenACCUtils.cpp
 
   ADDITIONAL_HEADER_DIRS
@@ -14,7 +15,9 @@ add_mlir_dialect_library(MLIROpenACCUtils
   MLIROpenACCTypeInterfacesIncGen
 
   LINK_LIBS PUBLIC
+  MLIRArithDialect
   MLIROpenACCDialect
   MLIRIR
   MLIRSupport
+  MLIRTransformUtils
 )
diff --git a/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsTiling.cpp b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsTiling.cpp
new file mode 100644
index 0000000000000..0b344ba2f8316
--- /dev/null
+++ b/mlir/lib/Dialect/OpenACC/Utils/OpenACCUtilsTiling.cpp
@@ -0,0 +1,311 @@
+//===- OpenACCUtilsTiling.cpp - OpenACC Loop Tiling Utilities -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains utility functions for tiling OpenACC loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenACC/OpenACCUtilsTiling.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Transforms/RegionUtils.h"
+
+// Resolve unknown tile sizes (represented as -1 for tile(*)) to the default.
+static mlir::Value resolveUnknownTileSize(mlir::Value tileSize,
+                                          int32_t defaultTileSize,
+                                          mlir::RewriterBase &rewriter,
+                                          mlir::Location loc) {
+  auto constVal = mlir::getConstantIntValue(tileSize);
+  if (constVal && *constVal < 0)
+    return mlir::arith::ConstantOp::create(
+        rewriter, loc, rewriter.getI32Type(),
+        rewriter.getI32IntegerAttr(defaultTileSize));
+  return tileSize;
+}
+
+// Remove vector/worker attributes from loop
+static void removeWorkerVectorFromLoop(mlir::acc::LoopOp loop) {
+  if (loop.hasVector() || loop.getVectorValue()) {
+    loop.removeVectorAttr();
+    loop.removeVectorOperandsDeviceTypeAttr();
+  } else if (loop.hasWorker() || loop.getWorkerValue()) {
+    loop.removeWorkerAttr();
+    loop.removeWorkerNumOperandsDeviceTypeAttr();
+  }
+}
+
+// Create a new ACC loop with new steps, lb, ub from original loop
+static mlir::acc::LoopOp
+createACCLoopFromOriginal(mlir::acc::LoopOp origLoop,
+                          mlir::RewriterBase &rewriter, mlir::ValueRange lb,
+                          mlir::ValueRange ub, mlir::ValueRange step,
+                          mlir::DenseBoolArrayAttr inclusiveUBAttr,
+                          mlir::acc::CombinedConstructsTypeAttr combinedAttr,
+                          mlir::Location loc, bool preserveCollapse) {
+  mlir::ArrayAttr collapseAttr = mlir::ArrayAttr{};
+  mlir::ArrayAttr collapseDeviceTypeAttr = mlir::ArrayAttr{};
+  if (preserveCollapse) {
+    collapseAttr = origLoop.getCollapseAttr();
+    collapseDeviceTypeAttr = origLoop.getCollapseDeviceTypeAttr();
+  }
+  auto newLoop = mlir::acc::LoopOp::create(
+      rewriter, loc, origLoop->getResultTypes(), lb, ub, step, inclusiveUBAttr,
+      collapseAttr, collapseDeviceTypeAttr, origLoop.getGangOperands(),
+      origLoop.getGangOperandsArgTypeAttr(),
+      origLoop.getGangOperandsSegmentsAttr(),
+      origLoop.getGangOperandsDeviceTypeAttr(), origLoop.getWorkerNumOperands(),
+      origLoop.getWorkerNumOperandsDeviceTypeAttr(),
+      origLoop.getVectorOperands(), origLoop.getVectorOperandsDeviceTypeAttr(),
+      origLoop.getSeqAttr(), origLoop.getIndependentAttr(),
+      origLoop.getAuto_Attr(), origLoop.getGangAttr(), origLoop.getWorkerAttr(),
+      origLoop.getVectorAttr(), mlir::ValueRange{}, mlir::DenseI32ArrayAttr{},
+      mlir::ArrayAttr{},
origLoop.getCacheOperands(),
+      origLoop.getPrivateOperands(), origLoop.getFirstprivateOperands(),
+      origLoop.getReductionOperands(), combinedAttr);
+  return newLoop;
+}
+
+// Create inner loop inside input loop
+static mlir::acc::LoopOp
+createInnerLoop(mlir::acc::LoopOp inputLoop, mlir::RewriterBase &rewriter,
+                mlir::ValueRange lb, mlir::ValueRange ub, mlir::ValueRange step,
+                mlir::DenseBoolArrayAttr inclusiveUBAttr, mlir::Location loc) {
+  mlir::acc::LoopOp elementLoop = createACCLoopFromOriginal(
+      inputLoop, rewriter, lb, ub, step, inclusiveUBAttr,
+      mlir::acc::CombinedConstructsTypeAttr{}, loc, /*preserveCollapse*/ false);
+
+  // Remove gang/worker attributes from inner loops
+  rewriter.startOpModification(elementLoop);
+  if (inputLoop.hasGang() ||
+      inputLoop.getGangValue(mlir::acc::GangArgType::Num) ||
+      inputLoop.getGangValue(mlir::acc::GangArgType::Dim) ||
+      inputLoop.getGangValue(mlir::acc::GangArgType::Static)) {
+    elementLoop.removeGangAttr();
+    elementLoop.removeGangOperandsArgTypeAttr();
+    elementLoop.removeGangOperandsSegmentsAttr();
+    elementLoop.removeGangOperandsDeviceTypeAttr();
+  }
+  if (inputLoop.hasVector() || inputLoop.getVectorValue()) {
+    elementLoop.removeWorkerAttr();
+    elementLoop.removeWorkerNumOperandsDeviceTypeAttr();
+  }
+  rewriter.finalizeOpModification(elementLoop);
+
+  // Create empty block in elementLoop and add IV argument
+  mlir::Block *blk = rewriter.createBlock(&elementLoop.getRegion(),
+                                          elementLoop.getRegion().begin());
+  rewriter.setInsertionPointToEnd(blk);
+  mlir::acc::YieldOp::create(rewriter, loc);
+  elementLoop.getBody().addArgument(
+      inputLoop.getBody().getArgument(0).getType(), loc);
+
+  return elementLoop;
+}
+
+// Move ops from source to target Loop and replace uses of IVs
+static void moveOpsAndReplaceIVs(mlir::acc::LoopOp sourceLoop,
+                                 mlir::acc::LoopOp targetLoop,
+                                 llvm::ArrayRef<mlir::Value> newIVs,
+                                 llvm::ArrayRef<mlir::Value> origIVs,
+                                 size_t nOps, mlir::RewriterBase &rewriter) {
+  // Move ops from source to target loop [begin, begin + nOps - 1)
+  mlir::Block::iterator begin = sourceLoop.getBody().begin();
+  targetLoop.getBody().getOperations().splice(
+      targetLoop.getBody().getOperations().begin(),
+      sourceLoop.getBody().getOperations(), begin, std::next(begin, nOps - 1));
+
+  // Replace uses of origIV with newIV
+  for (auto [i, newIV] : llvm::enumerate(newIVs))
+    mlir::replaceAllUsesInRegionWith(origIVs[i], newIV, targetLoop.getRegion());
+}
+
+mlir::acc::LoopOp
+mlir::acc::tileACCLoops(llvm::SmallVector<mlir::acc::LoopOp> &tileLoops,
+                        const llvm::SmallVector<mlir::Value> &tileSizes,
+                        int32_t defaultTileSize, mlir::RewriterBase &rewriter) {
+  // Tile collapsed and/or nested loops
+  mlir::acc::LoopOp outerLoop = tileLoops[0];
+  const mlir::Location loc = outerLoop.getLoc();
+
+  // Resolve unknown tile sizes (tile(*) represented as -1)
+  llvm::SmallVector<mlir::Value> resolvedTileSizes;
+  rewriter.setInsertionPoint(outerLoop);
+  for (mlir::Value tileSize : tileSizes)
+    resolvedTileSizes.push_back(
+        resolveUnknownTileSize(tileSize, defaultTileSize, rewriter, loc));
+
+  mlir::acc::LoopOp innerLoop = tileLoops[tileLoops.size() - 1];
+  llvm::SmallVector<mlir::Value> origIVs;
+  llvm::SmallVector<mlir::Value> origSteps;
+  llvm::SmallVector<mlir::Value> origUBs;
+  llvm::SmallVector<mlir::Value> newSteps;
+  llvm::SmallVector<mlir::Value> newUBs;
+  llvm::SmallVector<mlir::Value> newIVs;
+  size_t nOps = innerLoop.getBody().getOperations().size();
+
+  // Extract original inclusiveUBs
+  llvm::SmallVector<bool> inclusiveUBs;
+  for (auto tileLoop : tileLoops) {
+    for (auto [j, step] : llvm::enumerate(tileLoop.getStep())) {
+      // inclusiveUBs are present in the IR coming from the Fortran frontend
+      // for DO loops, but might not be present for other frontends (e.g.
+      // Python), so check whether the attribute exists.
+      if (tileLoop.getInclusiveUpperboundAttr())
+        inclusiveUBs.push_back(
+            tileLoop.getInclusiveUpperboundAttr().asArrayRef()[j]);
+      else
+        inclusiveUBs.push_back(false);
+    }
+  }
+
+  // Extract original ivs, UBs, steps, and calculate new steps
+  rewriter.setInsertionPoint(outerLoop);
+  for (auto [i, tileLoop] : llvm::enumerate(tileLoops)) {
+    for (auto arg : tileLoop.getBody().getArguments())
+      origIVs.push_back(arg);
+    for (auto ub : tileLoop.getUpperbound())
+      origUBs.push_back(ub);
+
+    llvm::SmallVector<mlir::Value> currentLoopSteps;
+    for (auto [j, step] : llvm::enumerate(tileLoop.getStep())) {
+      origSteps.push_back(step);
+      if (i + j >= resolvedTileSizes.size()) {
+        currentLoopSteps.push_back(step);
+      } else {
+        mlir::Value tileSize = resolvedTileSizes[i + j];
+        auto newLoopStep =
+            mlir::arith::MulIOp::create(rewriter, loc, step, tileSize);
+        currentLoopSteps.push_back(newLoopStep);
+        newSteps.push_back(newLoopStep);
+      }
+    }
+
+    rewriter.startOpModification(tileLoop);
+    tileLoop.getStepMutable().clear();
+    tileLoop.getStepMutable().append(currentLoopSteps);
+    rewriter.finalizeOpModification(tileLoop);
+  }
+
+  // Calculate new upper bounds for element loops
+  for (size_t i = 0; i < newSteps.size(); i++) {
+    rewriter.setInsertionPoint(innerLoop.getBody().getTerminator());
+    // UpperBound: min(origUB, origIV+(originalStep*tile_size))
+    auto stepped =
+        mlir::arith::AddIOp::create(rewriter, loc, origIVs[i], newSteps[i]);
+    mlir::Value newUB = stepped;
+    if (inclusiveUBs[i]) {
+      // Handle InclusiveUB
+      // UpperBound: min(origUB, origIV+(originalStep*tile_size - 1))
+      auto c1 = mlir::arith::ConstantOp::create(
+          rewriter, loc, newSteps[i].getType(),
+          rewriter.getIntegerAttr(newSteps[i].getType(), 1));
+      newUB = mlir::arith::SubIOp::create(rewriter, loc, stepped, c1);
+    }
+    newUBs.push_back(
+        mlir::arith::MinSIOp::create(rewriter, loc, origUBs[i], newUB));
+  }
+
+  // Create and insert nested elementLoopOps before terminator of outer loopOp
+  mlir::acc::LoopOp currentLoop = innerLoop;
+  for (size_t i = 0; i < resolvedTileSizes.size(); i++) {
+    rewriter.setInsertionPoint(currentLoop.getBody().getTerminator());
+    mlir::DenseBoolArrayAttr inclusiveUBAttr = mlir::DenseBoolArrayAttr{};
+    if (inclusiveUBs[i])
+      inclusiveUBAttr = rewriter.getDenseBoolArrayAttr({true});
+
+    mlir::acc::LoopOp elementLoop =
+        createInnerLoop(innerLoop, rewriter, mlir::ValueRange{origIVs[i]},
+                        mlir::ValueRange{newUBs[i]},
+                        mlir::ValueRange{origSteps[i]}, inclusiveUBAttr, loc);
+
+    // Remove vector/worker attributes from inner element loops except
+    // outermost element loop
+    if (i > 0) {
+      rewriter.startOpModification(elementLoop);
+      removeWorkerVectorFromLoop(elementLoop);
+      rewriter.finalizeOpModification(elementLoop);
+    }
+    newIVs.push_back(elementLoop.getBody().getArgument(0));
+    currentLoop = elementLoop;
+  }
+
+  // Remove vector/worker attributes from outer tile loops
+  for (auto tileLoop : tileLoops) {
+    rewriter.startOpModification(tileLoop);
+    removeWorkerVectorFromLoop(tileLoop);
+    rewriter.finalizeOpModification(tileLoop);
+  }
+
+  // Move ops from inner tile loop to inner element loop and replace IV uses
+  moveOpsAndReplaceIVs(innerLoop, currentLoop, newIVs, origIVs, nOps, rewriter);
+
+  return outerLoop;
+}
+
+llvm::SmallVector<mlir::acc::LoopOp>
+mlir::acc::uncollapseLoops(mlir::acc::LoopOp origLoop, unsigned tileCount,
+                           unsigned collapseCount,
+                           mlir::RewriterBase &rewriter) {
+  llvm::SmallVector<mlir::acc::LoopOp> newLoops;
+  llvm::SmallVector<mlir::Value> newIVs;
+
+  mlir::Location loc = origLoop.getLoc();
+  llvm::SmallVector<bool> newInclusiveUBs;
+  llvm::SmallVector<mlir::Value> lbs, ubs, steps;
+  for (unsigned i = 0; i < collapseCount; i++) {
+    // inclusiveUpperbound attribute might not be set, default to false
+    bool inclusiveUB = false;
+    if (origLoop.getInclusiveUpperboundAttr())
+      inclusiveUB = origLoop.getInclusiveUpperboundAttr().asArrayRef()[i];
+    newInclusiveUBs.push_back(inclusiveUB);
+    lbs.push_back(origLoop.getLowerbound()[i]);
+    ubs.push_back(origLoop.getUpperbound()[i]);
+    steps.push_back(origLoop.getStep()[i]);
+  }
+  mlir::acc::LoopOp outerLoop = createACCLoopFromOriginal(
+      origLoop, rewriter, lbs, ubs, steps,
+      rewriter.getDenseBoolArrayAttr(newInclusiveUBs),
+      origLoop.getCombinedAttr(), loc, /*preserveCollapse*/ true);
+  mlir::Block *blk = rewriter.createBlock(&outerLoop.getRegion(),
+                                          outerLoop.getRegion().begin());
+  rewriter.setInsertionPointToEnd(blk);
+  mlir::acc::YieldOp::create(rewriter, loc);
+  for (unsigned i = 0; i < collapseCount; i++) {
+    outerLoop.getBody().addArgument(origLoop.getBody().getArgument(i).getType(),
+                                    loc);
+    newIVs.push_back(outerLoop.getBody().getArgument(i));
+  }
+  newLoops.push_back(outerLoop);
+
+  mlir::acc::LoopOp currentLoopOp = outerLoop;
+  for (unsigned i = collapseCount; i < tileCount; i++) {
+    rewriter.setInsertionPoint(currentLoopOp.getBody().getTerminator());
+    bool inclusiveUB = false;
+    if (origLoop.getInclusiveUpperboundAttr())
+      inclusiveUB = origLoop.getInclusiveUpperboundAttr().asArrayRef()[i];
+    mlir::DenseBoolArrayAttr inclusiveUBAttr =
+        rewriter.getDenseBoolArrayAttr({inclusiveUB});
+    mlir::acc::LoopOp innerLoop = createInnerLoop(
+        origLoop, rewriter, mlir::ValueRange{origLoop.getLowerbound()[i]},
+        mlir::ValueRange{origLoop.getUpperbound()[i]},
+        mlir::ValueRange{origLoop.getStep()[i]}, inclusiveUBAttr, loc);
+    newIVs.push_back(innerLoop.getBody().getArgument(0));
+    newLoops.push_back(innerLoop);
+    currentLoopOp = innerLoop;
+  }
+  // Move ops from origLoop to innermost loop and replace uses of IVs
+  size_t nOps = origLoop.getBody().getOperations().size();
+  llvm::SmallVector<mlir::Value> origIVs;
+  for (auto arg : origLoop.getBody().getArguments())
+    origIVs.push_back(arg);
+  moveOpsAndReplaceIVs(origLoop, currentLoopOp, newIVs, origIVs, nOps,
+                       rewriter);
+
+  return newLoops;
+}
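For context on how the two utilities above are meant to compose: a loop whose `collapse` count is smaller than its `tile` count is first uncollapsed into an explicit nest, and the resulting loops are then tiled. A sketch of such a driver follows; only `uncollapseLoops` and `tileACCLoops` come from this patch, while the wrapper name and the fallback tile size of 32 are this example's assumptions.

```
// Illustrative driver, not part of the patch: prepare a (possibly collapsed)
// acc.loop and tile it with the utilities from OpenACCUtilsTiling.h.
#include "mlir/Dialect/OpenACC/OpenACCUtilsTiling.h"

static mlir::acc::LoopOp
tileWithDefaults(mlir::acc::LoopOp loop,
                 const llvm::SmallVector<mlir::Value> &tileSizes,
                 unsigned collapseCount, mlir::RewriterBase &rewriter) {
  unsigned tileCount = tileSizes.size();
  llvm::SmallVector<mlir::acc::LoopOp> tileLoops;
  if (collapseCount < tileCount) {
    // One acc.loop carrying several IVs becomes an explicit loop nest first.
    tileLoops =
        mlir::acc::uncollapseLoops(loop, tileCount, collapseCount, rewriter);
  } else {
    tileLoops.push_back(loop);
  }
  // -1 entries in tileSizes (i.e. tile(*)) fall back to a default of 32 here;
  // the concrete default is a choice of this example, not of the API.
  return mlir::acc::tileACCLoops(tileLoops, tileSizes, /*defaultTileSize=*/32,
                                 rewriter);
}
```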
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
index e487e4162932f..7a30475e33410 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
@@ -453,6 +453,8 @@ struct ReorderCastOpsOnBroadcast
                                 PatternRewriter &rewriter) const override {
     if (op->getNumOperands() != 1)
       return failure();
+    if (!isa<VectorType>(op->getResult(0).getType()))
+      return failure();
     auto bcastOp = op->getOperand(0).getDefiningOp<vector::BroadcastOp>();
     if (!bcastOp)
       return failure();
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 7ab2e612ed890..1a19ab5fd970b 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -390,6 +390,86 @@ LayoutAttr::computeDistributedCoords(OpBuilder &builder, Location loc,
   return genCoordinates(builder, loc, ids, layout, subShape, shape);
 }
 
+bool LayoutAttr::isEqualTo(const xegpu::DistributeLayoutAttr &other) {
+  if (dyn_cast<SliceAttr>(other))
+    return false;
+
+  return *this == dyn_cast<LayoutAttr>(other);
+}
+
+// Set the layout for the specified unit dims: sg_data, inst_data and lane_data
+// to 1.
+DistributeLayoutAttr
+LayoutAttr::setUnitDimData(SetVector<int64_t> unitDims) {
+  auto sgDataOpt = getSgData();
+  auto instDataOpt = getInstData();
+  auto laneDataOpt = getLaneData();
+
+  SmallVector<int32_t> sgData;
+  SmallVector<int32_t> instData;
+  SmallVector<int32_t> laneData;
+
+  if (sgDataOpt) {
+    sgData = llvm::to_vector(sgDataOpt.asArrayRef());
+  }
+  if (instDataOpt) {
+    instData = llvm::to_vector(instDataOpt.asArrayRef());
+  }
+  if (laneDataOpt) {
+    laneData = llvm::to_vector(laneDataOpt.asArrayRef());
+  }
+
+  for (auto dim : unitDims) {
+    if (dim < static_cast<int64_t>(sgData.size()))
+      sgData[dim] = 1;
+    if (dim < static_cast<int64_t>(instData.size()))
+      instData[dim] = 1;
+    if (dim < static_cast<int64_t>(laneData.size()))
+      laneData[dim] = 1;
+  }
+
+  return LayoutAttr::get(
+      getContext(), getSgLayout(),
+      sgData.empty() ? DenseI32ArrayAttr()
+                     : DenseI32ArrayAttr::get(getContext(), sgData),
+      instData.empty() ? DenseI32ArrayAttr()
+                       : DenseI32ArrayAttr::get(getContext(), instData),
+      getLaneLayout(),
+      laneData.empty() ? DenseI32ArrayAttr()
+                       : DenseI32ArrayAttr::get(getContext(), laneData),
+      getOrder());
+}
+
+// Set the layout for the specified unit dims: sg_lane and lane_layout to 1.
+DistributeLayoutAttr LayoutAttr::setUnitDimLayout(SetVector<int64_t> unitDims) {
+  auto sgLayoutOpt = getSgLayout();
+  auto laneLayoutOpt = getLaneLayout();
+
+  SmallVector<int32_t> sgLayout;
+  SmallVector<int32_t> laneLayout;
+
+  if (sgLayoutOpt) {
+    sgLayout = llvm::to_vector(sgLayoutOpt.asArrayRef());
+  }
+  if (laneLayoutOpt) {
+    laneLayout = llvm::to_vector(laneLayoutOpt.asArrayRef());
+  }
+
+  for (auto dim : unitDims) {
+    if (dim < static_cast<int64_t>(sgLayout.size()))
+      sgLayout[dim] = 1;
+    if (dim < static_cast<int64_t>(laneLayout.size()))
+      laneLayout[dim] = 1;
+  }
+
+  return LayoutAttr::get(
+      getContext(),
+      sgLayout.empty() ? DenseI32ArrayAttr()
+                       : DenseI32ArrayAttr::get(getContext(), sgLayout),
+      getSgData(), getInstData(),
+      laneLayout.empty() ? DenseI32ArrayAttr()
+                         : DenseI32ArrayAttr::get(getContext(), laneLayout),
+      getLaneData(), getOrder());
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_SliceAttr
 //===----------------------------------------------------------------------===//
@@ -510,6 +590,69 @@ bool SliceAttr::isSliceOf(const xegpu::DistributeLayoutAttr &other) {
                       [&](int64_t dim) { return thisDims.contains(dim); });
 }
 
+bool SliceAttr::isEqualTo(const xegpu::DistributeLayoutAttr &other) {
+  if (dyn_cast<LayoutAttr>(other))
+    return false;
+
+  auto flattenedThis = flatten();
+  auto flattenedOther = dyn_cast<SliceAttr>(other).flatten();
+
+  return ((flattenedThis.getParent() == flattenedOther.getParent()) &&
+          (flattenedThis.getDims() == flattenedOther.getDims()));
+}
+
+// Helper function to adjust unit dimensions from sliced space to parent space.
+static SetVector<int64_t>
+adjustUnitDimsWithSliceDims(const SetVector<int64_t> &unitDims,
+                            ArrayRef<int64_t> sliceDims) {
+  // Reconstruct parent's non-sliced dimensions.
+  int64_t parentRank = sliceDims.size() + unitDims.size();
+  llvm::SmallDenseSet<int64_t> slicedDimsSet(sliceDims.begin(),
+                                             sliceDims.end());
+  SmallVector<int64_t> nonSlicedDims;
+  for (int64_t i = 0; i < parentRank; ++i) {
+    if (!slicedDimsSet.contains(i))
+      nonSlicedDims.push_back(i);
+  }
+
+  // Map unit dims from sliced space to parent space.
+  SetVector<int64_t> adjustUnitDims;
+  for (auto dim : unitDims) {
+    if (dim < static_cast<int64_t>(nonSlicedDims.size())) {
+      adjustUnitDims.insert(nonSlicedDims[dim]);
+    }
+  }
+
+  return adjustUnitDims;
+}
+
+// Set the layout for the specified unit dims: sg_data, inst_data and lane_data
+// to 1.
+DistributeLayoutAttr SliceAttr::setUnitDimData(SetVector<int64_t> unitDims) {
+  SliceAttr attr = flatten();
+  ArrayRef<int64_t> sliceDims = attr.getDims().asArrayRef();
+  auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+
+  SetVector<int64_t> adjustUnitDims =
+      adjustUnitDimsWithSliceDims(unitDims, sliceDims);
+
+  return SliceAttr::get(getContext(), parent.setUnitDimData(adjustUnitDims),
+                        attr.getDims());
+}
+
+// Set the layout for the specified unit dims: sg_lane and lane_layout to 1.
+DistributeLayoutAttr SliceAttr::setUnitDimLayout(SetVector<int64_t> unitDims) {
+  SliceAttr attr = flatten();
+  ArrayRef<int64_t> sliceDims = attr.getDims().asArrayRef();
+  auto parent = dyn_cast<LayoutAttr>(attr.getParent());
+
+  SetVector<int64_t> adjustUnitDims =
+      adjustUnitDimsWithSliceDims(unitDims, sliceDims);
+
+  return SliceAttr::get(getContext(), parent.setUnitDimLayout(adjustUnitDims),
+                        attr.getDims());
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_RangeAttr
 //===----------------------------------------------------------------------===//
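The index remapping inside `adjustUnitDimsWithSliceDims` is the subtle part: unit dims are numbered in the rank-reduced space of the slice and must be renumbered into the parent layout's space before the parent can be updated. A standalone sketch of that mapping, with an explicit `parentRank` parameter for clarity (the patch instead derives the rank from the sizes of the two sets; the function name is illustrative):

```
// Map dims from the rank-reduced space of a slice back to the parent's dim
// numbering. With parentRank = 3 and sliceDims = {1}, the sliced space has
// dims {0, 1}, which correspond to parent dims {0, 2}.
#include <cstdint>
#include <set>
#include <vector>

static std::vector<int64_t>
slicedDimsToParentDims(const std::vector<int64_t> &slicedSpaceDims,
                       const std::vector<int64_t> &sliceDims,
                       int64_t parentRank) {
  std::set<int64_t> sliced(sliceDims.begin(), sliceDims.end());
  // Enumerate the parent dims that survive the slice, in order; position i of
  // this list is dim i of the sliced space.
  std::vector<int64_t> surviving;
  for (int64_t d = 0; d < parentRank; ++d)
    if (!sliced.count(d))
      surviving.push_back(d);
  std::vector<int64_t> parentDims;
  for (int64_t d : slicedSpaceDims)
    if (d < static_cast<int64_t>(surviving.size()))
      parentDims.push_back(surviving[d]);
  return parentDims;
}

// slicedDimsToParentDims({0}, {1}, 3) == {0}
// slicedDimsToParentDims({1}, {1}, 3) == {2}
```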
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 59a1ad9dbe189..dc9eb96c169b4 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -580,23 +580,39 @@ void LayoutInfoPropagation::visitVectorBroadCastOp(
   // Only consider vector to vector broadcasts for now.
   VectorType resultTy = broadcast.getResultVectorType();
   VectorType sourceTy = dyn_cast<VectorType>(broadcast.getSourceType());
-  if (!sourceTy) {
-    broadcast.emitWarning("Expecting source type to be a vector type.");
+  // Skip layout propagation for a non-vector source operand.
+  if (!sourceTy)
     return;
-  }
 
-  // Only consider nD -> nD broadcast.
+  // Handle broadcast from low rank to high rank (e.g., 1D to 2D).
   if (sourceTy.getRank() != resultTy.getRank()) {
-    broadcast.emitWarning("Expecting source and result to have same rank.");
+    auto sourceDims = sourceTy.getShape();
+    auto resultDims = resultTy.getShape();
+    SmallVector<int64_t> bcastDims;
+    auto dimDiff = resultTy.getRank() - sourceTy.getRank();
+    // Add the missing leading dims.
+    for (int i = 0; i < dimDiff; i++)
+      bcastDims.push_back(i);
+
+    // For the remaining dims of resultTy: if the sourceTy dim is 1 and the
+    // corresponding result dim is not, it is a broadcasted dim.
+    for (size_t i = 0; i < sourceDims.size(); i++)
+      if ((sourceDims[i] == 1) && (resultDims[i + dimDiff] != 1))
+        bcastDims.push_back(i + dimDiff);
+
+    // Create a slice layout for the source.
+    xegpu::SliceAttr sliceLayout = xegpu::SliceAttr::get(
+        broadcast->getContext(),
+        cast<xegpu::DistributeLayoutAttr>(resultLayout.get()),
+        DenseI64ArrayAttr::get(broadcast->getContext(), bcastDims));
+
+    propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(sliceLayout)));
     return;
   }
+
   SetVector<int64_t> broadcastUnitDims = broadcast.computeBroadcastedUnitDims();
-  if (broadcastUnitDims.size() != 1) {
-    broadcast.emitWarning("Expecting source type to be nD vector only with "
-                          "one broadcasted dimension.");
-    return;
-  }
-  // Propagate the result layout to the source operand.
+  resultLayout = cast<xegpu::DistributeLayoutAttr>(resultLayout.get())
+                     .setUnitDimData(broadcastUnitDims);
   propagateIfChanged(operands[0], operands[0]->meet(resultLayout));
 }
@@ -917,7 +933,7 @@ void LayoutInfoPropagation::visitLoadGatherOp(
 
   } else {
     // The layout is strictly determined by the payload type.
-    auto payloadTy = dyn_cast<VectorType>(load.getValueType());
+    VectorType payloadTy = load.getValueType();
     if (!payloadTy) {
       load.emitWarning("Not propagating, non-vector payload supplied.");
       return;
@@ -987,7 +1003,7 @@ void LayoutInfoPropagation::visitStoreScatterOp(
   // Currently, for 2D StoreScatterOp we expect that the height dimension of
   // the tensor descriptor is equal to the subgroup size. This is ensured by
   // the op verifier.
-  auto payloadTy = dyn_cast<VectorType>(storeScatter.getValueType());
+  VectorType payloadTy = storeScatter.getValueType();
   if (!payloadTy) {
     storeScatter.emitWarning("Not propagating, non-vector payload supplied.");
     return;
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 0d1c5eeeff711..ca81c3cd7be42 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -99,7 +99,6 @@ getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout,
   for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
     if (i < distributionStart)
       continue;
-
     // Check if the dimension can be distributed evenly.
     if (dim % effectiveLaneLayout[i - distributionStart] != 0)
       return failure();
@@ -1424,6 +1423,166 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern {
   }
 };
 
+/// This pattern distributes the `vector.broadcast` operation across lanes in a
+/// warp. The pattern supports three use cases:
+///
+/// 1) Broadcast a low-rank vector to a high-rank vector: The low-rank input
+///    vector must have a slice layout of the result. If the distributed source
+///    and target vector types are identical, this lowers to a no-op;
+///    otherwise, it remains a broadcast but operates on distributed vectors.
+///
+/// 2) Broadcast a same-rank vector with identical layouts for source and
+///    target: The source vector must have unit dimensions, and lane_data must
+///    be unit size for those unit dims. This always lowers to a no-op.
+///
+/// 3) Broadcast a scalar with no layout: This always lowers to a broadcast
+///    from the scalar to the distributed result type.
+///
+/// Example 1 (lowering to a broadcast with distributed types):
+/// ```
+/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) {
+///   %0 = "some_def"() {layout_result_0 =
+///     #xegpu.slice<#xegpu.layout, dims = [0]> } : () -> (vector<32xf32>)
+///   %2 = vector.broadcast %0 {layout_result_0 = #xegpu.layout}
+///     : vector<32xf32> to vector<8x32xf32>
+///   gpu.yield %2 : vector<8x32xf32>
+/// }
+/// ```
+/// is lowered to:
+/// ```
+/// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
+///   %0 = "some_def"() {layout_result_0 =
+///     #xegpu.slice<#xegpu.layout, dims = [0]> } : () -> (vector<32xf32>)
+///   gpu.yield %0 : vector<32xf32>
+/// }
+/// %2 = vector.broadcast %r#0 : vector<1xf32> to vector<8x1xf32>
+/// ```
+///
+/// Example 2 (no-op):
+/// ```
+/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x32xf32>) {
+///   %0 = "some_def"() {layout_result_0 =
+///     #xegpu.slice<#xegpu.layout, dims = [1]> } : () -> (vector<8xf32>)
+///   %1 = vector.shape_cast %0
+///     {layout_result_0 = #xegpu.layout}: vector<8xf32> to vector<8x1xf32>
+///   %2 = vector.broadcast %1
+///     {layout_result_0 = #xegpu.layout}: vector<8x1xf32> to vector<8x32xf32>
+///   gpu.yield %2 : vector<8x32xf32>
+/// }
+/// ```
+/// is lowered to:
+/// ```
+/// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) {
+///   %0 = "some_def"() {layout_result_0 =
+///     #xegpu.slice<#xegpu.layout, dims = [1]> } : () -> (vector<8xf32>)
+///   %1 = vector.shape_cast %0
+///     {layout_result_0 = #xegpu.layout}: vector<8xf32> to vector<8x1xf32>
+///   gpu.yield %1 : vector<8x1xf32>
+/// }
+/// // The broadcast is implicit through layout transformation (no-op)
+/// "some_use"(%r#0)
+/// ```
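For reading the distributed types in the examples above: each distributed dim is the original dim divided by the matching `lane_layout` entry, and shapes that do not divide evenly are rejected, which is the check `getDistVecTypeBasedOnLaneLayout` performs. A minimal sketch of that arithmetic, assuming the lane layout applies to the trailing dims (all names here are illustrative):

```
// vector<16x16xf16> with lane_layout [1, 16] distributes to vector<16x1xf16>
// per lane. Returns nullopt when a dim is not evenly divisible.
#include <cstdint>
#include <optional>
#include <vector>

static std::optional<std::vector<int64_t>>
distributedShape(std::vector<int64_t> shape,
                 const std::vector<int64_t> &laneLayout) {
  if (laneLayout.size() > shape.size())
    return std::nullopt;
  size_t start = shape.size() - laneLayout.size(); // trailing-dim distribution
  for (size_t i = 0; i < laneLayout.size(); ++i) {
    if (shape[start + i] % laneLayout[i] != 0)
      return std::nullopt; // not evenly distributable across lanes
    shape[start + i] /= laneLayout[i];
  }
  return shape;
}

// distributedShape({16, 16}, {1, 16}) -> {{16, 1}}
```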
+struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+                                PatternRewriter &rewriter) const override {
+    OpOperand *yieldOperand =
+        getWarpResult(warpOp, llvm::IsaPred<vector::BroadcastOp>);
+    if (!yieldOperand)
+      return failure();
+    auto broadcastOp =
+        cast<vector::BroadcastOp>(yieldOperand->get().getDefiningOp());
+    unsigned operandIdx = yieldOperand->getOperandNumber();
+
+    VectorType sourceType = dyn_cast<VectorType>(broadcastOp.getSourceType());
+    VectorType destType =
+        dyn_cast<VectorType>(broadcastOp.getResult().getType());
+
+    xegpu::DistributeLayoutAttr sourceLayout =
+        xegpu::getDistributeLayoutAttr(broadcastOp->getOpOperand(0));
+    xegpu::DistributeLayoutAttr resultLayout =
+        xegpu::getDistributeLayoutAttr(broadcastOp.getResult());
+
+    FailureOr<VectorType> sourceDistType;
+    Type sourceElemOrDistType;
+    if (sourceType) {
+
+      // Case 1 and 2: source is a vector type.
+      int64_t rankDiff = destType.getRank() - sourceType.getRank();
+      if (rankDiff > 0) {
+        // Case 1: source is lower-rank than the result.
+        bool isSliceOf = sourceLayout.isSliceOf(resultLayout);
+        if (!isSliceOf)
+          return rewriter.notifyMatchFailure(
+              warpOp,
+              "Broadcast input layout must be a slice of result layout.");
+      }
+      // Case 2: source and result have the same rank.
+      if (rankDiff == 0) {
+        SetVector<int64_t> broadcastUnitDims =
+            broadcastOp.computeBroadcastedUnitDims();
+        resultLayout = resultLayout.setUnitDimData(broadcastUnitDims);
+        bool isEqualTo = sourceLayout.isEqualTo(resultLayout);
+        if (!isEqualTo)
+          return rewriter.notifyMatchFailure(
+              warpOp, "For same-rank broadcast, source must be identical to "
+                      "adjusted result layouts with unit dims.");
+        sourceLayout = sourceLayout.setUnitDimLayout(broadcastUnitDims);
+      }
+
+      sourceDistType =
+          getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType);
+      if (failed(sourceDistType)) {
+        return rewriter.notifyMatchFailure(
+            warpOp, "Failed to distribute the source vector type.");
+      }
+      sourceElemOrDistType = sourceDistType.value();
+
+    } else {
+      // Case 3: source is a scalar type.
+      if (sourceLayout) {
+        return rewriter.notifyMatchFailure(
+            warpOp, "Broadcast from scalar must not have a layout attribute.");
+      }
+      sourceElemOrDistType = broadcastOp.getSourceType();
+    }
+    FailureOr<VectorType> destDistType =
+        getDistVecTypeBasedOnLaneLayout(resultLayout, destType);
+    if (failed(destDistType)) {
+      return rewriter.notifyMatchFailure(
+          warpOp, "Failed to distribute the dest vector type.");
+    }
+
+    SmallVector<size_t> newRetIndices;
+    auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, warpOp, {broadcastOp.getSource()}, sourceElemOrDistType,
+        newRetIndices);
+
+    Value distributedSource = newWarpOp.getResult(newRetIndices[0]);
+
+    Value newBroadcast = distributedSource;
+
+    if (sourceElemOrDistType != destDistType.value()) {
+      rewriter.setInsertionPointAfter(newWarpOp);
+      newBroadcast =
+          vector::BroadcastOp::create(rewriter, newWarpOp.getLoc(),
+                                      destDistType.value(), distributedSource);
+    }
+
+    rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newBroadcast);
+    return success();
+  }
+};
+
 /// Distribute a `vector.shape_cast` op feeding into yield op of an enclosing
 /// `gpu.warp_execute_on_lane_0` region.
 struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern {
@@ -1865,7 +2024,7 @@ void xegpu::populateXeGPUSubgroupDistributePatterns(
   // patterns. Therefore, assign higher benefit.
   patterns
       .add(
+          VectorInsertStridedSliceDistribution, VectorBroadcastDistribution>(
           patterns.getContext(), /*pattern benefit=*/highPatternBenefit);
 }
diff --git a/mlir/lib/Transforms/Utils/Inliner.cpp b/mlir/lib/Transforms/Utils/Inliner.cpp
index 26c965cfc0237..40950312d566f 100644
--- a/mlir/lib/Transforms/Utils/Inliner.cpp
+++ b/mlir/lib/Transforms/Utils/Inliner.cpp
@@ -613,8 +613,8 @@ Inliner::Impl::inlineCallsInSCC(InlinerInterfaceImpl &inlinerIface,
 
   LLVM_DEBUG({
     LDBG() << "* Inliner: Initial calls in SCC are: {";
-    for (unsigned i = 0, e = calls.size(); i < e; ++i)
-      LDBG() << "  " << i << ". " << calls[i].call << ",";
+    for (unsigned I = 0, E = calls.size(); I < E; ++I)
+      LDBG() << "  " << I << ". 
" << calls[I].call << ","; LDBG() << "}"; }); diff --git a/mlir/test/CAPI/irdl.c b/mlir/test/CAPI/irdl.c index ad52ece6a41ce..20cf35f2501ff 100644 --- a/mlir/test/CAPI/irdl.c +++ b/mlir/test/CAPI/irdl.c @@ -12,6 +12,7 @@ #include "mlir-c/Dialect/IRDL.h" #include "mlir-c/IR.h" +#include "mlir-c/Support.h" const char irdlDialect[] = "\ irdl.dialect @foo {\ @@ -37,10 +38,39 @@ const char newDialectUsage[] = "\ \"bar.op\"(%res) : (i32) -> ()\ }"; +void testVariadicityAttributes(MlirContext ctx) { + MlirAttribute variadicitySingle = + mlirIRDLVariadicityAttrGet(ctx, mlirStringRefCreateFromCString("single")); + + // CHECK: #irdl + mlirAttributeDump(variadicitySingle); + + MlirAttribute variadicityOptional = mlirIRDLVariadicityAttrGet( + ctx, mlirStringRefCreateFromCString("optional")); + + // CHECK: #irdl + mlirAttributeDump(variadicityOptional); + + MlirAttribute variadicityVariadic = mlirIRDLVariadicityAttrGet( + ctx, mlirStringRefCreateFromCString("variadic")); + + // CHECK: #irdl + mlirAttributeDump(variadicityVariadic); + + MlirAttribute variadicities[] = {variadicitySingle, variadicityOptional, + variadicityVariadic}; + MlirAttribute variadicityArray = + mlirIRDLVariadicityArrayAttrGet(ctx, 3, variadicities); + + // CHECK: #irdl + mlirAttributeDump(variadicityArray); +} + int main(void) { MlirContext ctx = mlirContextCreate(); mlirDialectHandleLoadDialect(mlirGetDialectHandle__irdl__(), ctx); + // Test loading an IRDL dialect and using it. MlirModule dialectDecl = mlirModuleCreateParse(ctx, mlirStringRefCreateFromCString(irdlDialect)); @@ -53,6 +83,10 @@ int main(void) { mlirOperationDump(mlirModuleGetOperation(usingModule)); mlirModuleDestroy(usingModule); + + // Test variadicity attributes. + testVariadicityAttributes(ctx); + mlirContextDestroy(ctx); return 0; } diff --git a/mlir/test/Dialect/Vector/vector-sink.mlir b/mlir/test/Dialect/Vector/vector-sink.mlir index beaba52af1841..69fba88a14048 100644 --- a/mlir/test/Dialect/Vector/vector-sink.mlir +++ b/mlir/test/Dialect/Vector/vector-sink.mlir @@ -382,6 +382,21 @@ func.func @broadcast_scalar_extsi_scalable(%a : i8) -> vector<2x[4]xi32> { return %r : vector<2x[4]xi32> } +// ----- + +// CHECK-LABEL: func.func @negative_broadcast_cast_non_vector_result +// CHECK-SAME: (%[[ARG:.*]]: i64) +// CHECK: %[[BCAST:.*]] = vector.broadcast %[[ARG]] : i64 to vector<26x7xi64> +// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[BCAST]] : vector<26x7xi64> to !llvm.array<26 x vector<7xi64>> +// CHECK: return %[[CAST]] : !llvm.array<26 x vector<7xi64>> +/// This test ensures that the `ReorderCastOpsOnBroadcast` pattern does not +/// attempt to reorder a cast operation that produces a non-vector result type. 
+func.func @negative_broadcast_cast_non_vector_result(%arg0: i64) -> !llvm.array<26 x vector<7xi64>> { + %0 = vector.broadcast %arg0 : i64 to vector<26x7xi64> + %1 = builtin.unrealized_conversion_cast %0 : vector<26x7xi64> to !llvm.array<26 x vector<7xi64>> + return %1 : !llvm.array<26 x vector<7xi64>> +} + //===----------------------------------------------------------------------===// // [Pattern: ReorderElementwiseOpsOnTranspose] //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index f8b59b87a122b..48e77d867508b 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -640,3 +640,61 @@ func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(%arg0: !xegpu.tensor_desc return } } +// ----- +gpu.module @test { +// CHECK-LABEL: func.func @vector_broadcast_1d_to_2d_broadcast_along_row( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, +// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} +// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16> +// CHECK-NEXT: %[[BROADCAST:.*]] = vector.broadcast %[[REDUCE]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout} : vector<16xf16> to vector<16x16xf16> +func.func @vector_broadcast_1d_to_2d_broadcast_along_row(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0.0000> : vector<16xf16> + %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %4 = vector.multi_reduction , %3, %cst [0] : vector<16x16xf16> to vector<16xf16> + %5 = vector.broadcast %4 : vector<16xf16> to vector<16x16xf16> + xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + return +} +} + +// ----- +gpu.module @test { +// CHECK-LABEL: func.func @vector_broadcast_2d_to_2d_along_column( +// CHECK: %[[REDUCE:.*]] = vector.multi_reduction +// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] : vector<16x16xf16> to vector<16xf16> +// CHECK-NEXT: %[[SHAPECAST:.*]] = vector.shape_cast %[[REDUCE]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout} : vector<16xf16> to vector<16x1xf16> +// CHECK-NEXT: vector.broadcast %[[SHAPECAST]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout} : vector<16x1xf16> to vector<16x16xf16> + +func.func @vector_broadcast_2d_to_2d_along_column(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0.0000> : vector<16xf16> + %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %4 = vector.multi_reduction , %3, %cst [1] : vector<16x16xf16> to vector<16xf16> + %5 = vector.shape_cast %4 : vector<16xf16> to vector<16x1xf16> + %6 = vector.broadcast %5 : vector<16x1xf16> to vector<16x16xf16> + xegpu.store_nd %6, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + return +} +} + +// ----- +gpu.module @test { +// CHECK-LABEL: func.func @vector_broadcast_scalar_to_vector( +// CHECK: %[[CST:.*]] = 
+// CHECK-NEXT: %[[BROADCAST:.*]] = vector.broadcast %[[CST]]
+// CHECK-SAME: {layout_result_0 = #xegpu.layout} : f16 to vector<16x16xf16>
+
+func.func @vector_broadcast_scalar_to_vector(%arg0: !xegpu.tensor_desc<16x16xf16>) {
+  %cst = arith.constant 0.0000 : f16
+  %6 = vector.broadcast %cst : f16 to vector<16x16xf16>
+  xegpu.store_nd %6, %arg0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16>
+  return
+}
+}
\ No newline at end of file
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
index 44ec21359593f..216f3d19cff94 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir
@@ -920,4 +920,69 @@ gpu.func @vector_insert_strided_slice_unsupported_offset(%laneid: index) {
   gpu.return
 }
 
+// CHECK-LABEL: gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane
+// CHECK-SAME: (%[[ARG0:.*]]: index) {
+// CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, vector<1xf16>)
+// CHECK: %[[DEF:.*]] = "some_def"()
+// CHECK: %[[BCAST_INNER:.*]] = vector.broadcast %[[DEF]]
+// CHECK: gpu.yield %[[BCAST_INNER]], %[[DEF]]
+// CHECK: %[[BCAST:.*]] = vector.broadcast %[[R]]#1 : vector<1xf16> to vector<16x1xf16>
+// CHECK: "some_use"(%[[BCAST]])
+gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane(%laneid: index) {
+
+  %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) {
+
+    %1 = "some_def"() : () -> vector<16xf16>
+
+    %2 = vector.broadcast %1 {
+      layout_operand_0 = #xegpu.slice<#xegpu.layout, dims = [0]>,
+      layout_result_0 = #xegpu.layout
+    } : vector<16xf16> to vector<16x16xf16>
+
+    gpu.yield %2 : vector<16x16xf16>
+  }
+  "some_use"(%r) : (vector<16x1xf16>) -> ()
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case
+// CHECK-SAME: (%[[ARG0:.*]]: index)
+// CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, vector<16x1xf16>)
+// CHECK: %[[DEF:.*]] = "some_def"() : () -> vector<16x1xf16>
+// CHECK: %[[BCAST:.*]] = vector.broadcast %[[DEF]]
+// CHECK-SAME: : vector<16x1xf16> to vector<16x16xf16>
+// CHECK: gpu.yield %[[BCAST]], %[[DEF]] : vector<16x16xf16>, vector<16x1xf16>
+// CHECK: "some_use"(%[[R]]#1) : (vector<16x1xf16>) -> ()
+gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case(%arg0: index) {
+  %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<16x1xf16>) {
+    %1 = "some_def"() : () -> vector<16x1xf16>
+    %2 = vector.broadcast %1 {
+      layout_operand_0 = #xegpu.layout,
+      layout_result_0 = #xegpu.layout
+    } : vector<16x1xf16> to vector<16x16xf16>
+    gpu.yield %2 : vector<16x16xf16>
+  }
+  "some_use"(%0) : (vector<16x1xf16>) -> ()
+  gpu.return
+}
+
+// CHECK-LABEL: gpu.func @vector_shape_cast_scalar_to_vector
+// CHECK-SAME: (%[[ARG0:.*]]: index)
+// CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, f16)
+// CHECK: %[[DEF:.*]] = "some_def"()
+// CHECK: %[[BCAST:.*]] = vector.broadcast %[[DEF]] {layout_result_0 = #xegpu.layout} : f16 to vector<16x16xf16>
+// CHECK: gpu.yield %[[BCAST]], %[[DEF]] : vector<16x16xf16>, f16
+// CHECK: %[[RESULT:.*]] = vector.broadcast %[[R]]#1 : f16 to vector<16x1xf16>
+// CHECK: "some_use"(%[[RESULT]])
+gpu.func @vector_shape_cast_scalar_to_vector(%arg0: index) {
+  %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<16x1xf16>) {
+    %1 = "some_def"() : () -> f16
+    %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout} : f16 to vector<16x16xf16>
+    gpu.yield %2 : vector<16x16xf16>
+  }
+  "some_use"(%0) : (vector<16x1xf16>) -> ()
+  gpu.return
+}
+
 }
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index 22177f8f6a15f..e5e3d2a1c1ad5 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -330,3 +330,64 @@ gpu.module @xevm_module{
   gpu.return
 }
 }
+
+// -----
+// CHECK-LABEL: gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane({{.*}}) {
+gpu.module @xevm_module{
+  gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) {
+    %c0 = arith.constant 0 : index
+    %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<0.000000e+00> : vector<16xf16>
+    %tdesc0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16>
+      -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout>
+    %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
+      -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout>
+    %0 = xegpu.load_nd %tdesc0[%c0, %c0] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16>
+    %1 = vector.multi_reduction <add>, %0, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16>
+    // CHECK: %[[BCAST:.*]] = vector.broadcast %{{.*}} : f16 to vector<16xf16>
+    %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout} : vector<16xf16> to vector<16x16xf16>
+    xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout>
+    gpu.return
+  }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case({{.*}}) {
+gpu.module @xevm_module{
+  gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case(%arg0: memref<16xf16>, %arg1: memref<16x16xf16>) {
+    %c0 = arith.constant 0 : index
+    %mask = vector.constant_mask [16] {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} : vector<16xi1>
+    %1 = xegpu.load %arg0[%c0], %mask {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} : memref<16xf16>, index, vector<16xi1> -> vector<16xf16>
+
+    %11 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} : vector<16xf16> to vector<16x1xf16>
+    %2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout} : vector<16x1xf16> to vector<16x16xf16>
+    // CHECK-NOT: vector.broadcast
+    // CHECK-NOT: vector.shape_cast
+
+    %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
+      -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout>
+    // CHECK: xegpu.store_nd {{.*}}, {{.*}}[{{.*}}, {{.*}}]
+    // CHECK-SAME: : vector<16xf16>, !xegpu.tensor_desc<16x16xf16>
+
+    xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout>
+    gpu.return
+  }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @vector_shape_cast_scalar_to_vector({{.*}}) {
+gpu.module @xevm_module{
+  gpu.func @vector_shape_cast_scalar_to_vector(%arg0: memref<16xf16>, %arg1: memref<16x16xf16>) {
+    %c0 = arith.constant 0 : index
+    %9 = gpu.block_id x
+    %10 = arith.index_cast %9 : index to i16
+    %11 = arith.bitcast %10 : i16 to f16
+    // CHECK: vector.broadcast {{.*}} : f16 to vector<16xf16>
+    %2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout} : f16 to vector<16x16xf16>
+    %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16>
+      -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout>
+    xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout>
+    gpu.return
+  }
+}
+
+
diff --git a/mlir/unittests/Bytecode/BytecodeTest.cpp b/mlir/unittests/Bytecode/BytecodeTest.cpp
index d7b442f6832d0..30e7ed9b6cb7e 100644
--- a/mlir/unittests/Bytecode/BytecodeTest.cpp
+++ b/mlir/unittests/Bytecode/BytecodeTest.cpp
@@ -15,6 +15,7 @@
 #include "mlir/IR/OwningOpRef.h"
 #include "mlir/Parser/Parser.h"
 
+#include "mlir/IR/BuiltinOps.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Alignment.h"
 #include "llvm/Support/Endian.h"
@@ -228,3 +229,39 @@ TEST(Bytecode, OpWithoutProperties) {
   EXPECT_TRUE(OperationEquivalence::computeHash(op.get()) ==
               OperationEquivalence::computeHash(roundtripped));
 }
+
+TEST(Bytecode, DeepCallSiteLoc) {
+  MLIRContext context;
+  ParserConfig config(&context);
+
+  // Create a deep CallSiteLoc chain to test iterative parsing.
+  Location baseLoc = FileLineColLoc::get(&context, "test.mlir", 1, 1);
+  Location loc = baseLoc;
+  constexpr int kDepth = 1000;
+  for (int i = 0; i < kDepth; ++i) {
+    loc = CallSiteLoc::get(loc, baseLoc);
+  }
+
+  // Create a simple module with the deep location.
+  Builder builder(&context);
+  OwningOpRef<ModuleOp> module =
+      ModuleOp::create(loc, /*name=*/std::nullopt);
+  ASSERT_TRUE(module);
+
+  // Write to bytecode.
+  std::string bytecode;
+  llvm::raw_string_ostream os(bytecode);
+  ASSERT_TRUE(succeeded(writeBytecodeToFile(module.get(), os)));
+
+  // Parse it back using the bytecode reader.
+  std::unique_ptr<Block> block = std::make_unique<Block>();
+  ASSERT_TRUE(succeeded(readBytecodeFile(
+      llvm::MemoryBufferRef(bytecode, "string-buffer"), block.get(), config)));
+
+  // Verify we got the roundtripped module.
+  ASSERT_FALSE(block->empty());
+  Operation *roundTripped = &block->front();
+
+  // Verify the location matches.
+  EXPECT_EQ(module.get()->getLoc(), roundTripped->getLoc());
+}
diff --git a/mlir/unittests/Dialect/OpenACC/CMakeLists.txt b/mlir/unittests/Dialect/OpenACC/CMakeLists.txt
index c8c2bb96b0539..060c8b8d2679d 100644
--- a/mlir/unittests/Dialect/OpenACC/CMakeLists.txt
+++ b/mlir/unittests/Dialect/OpenACC/CMakeLists.txt
@@ -2,6 +2,7 @@ add_mlir_unittest(MLIROpenACCTests
   OpenACCOpsTest.cpp
   OpenACCOpsInterfacesTest.cpp
   OpenACCUtilsTest.cpp
+  OpenACCUtilsTilingTest.cpp
 )
 
 mlir_target_link_libraries(MLIROpenACCTests
   PRIVATE
diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTilingTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTilingTest.cpp
new file mode 100644
index 0000000000000..95bc1eab7d3fe
--- /dev/null
+++ b/mlir/unittests/Dialect/OpenACC/OpenACCUtilsTilingTest.cpp
@@ -0,0 +1,348 @@
+//===- OpenACCUtilsTilingTest.cpp - Unit tests for loop tiling utilities --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenACC/OpenACCUtilsTiling.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/OwningOpRef.h"
+#include "gtest/gtest.h"
+
+using namespace mlir;
+using namespace mlir::acc;
+
+//===----------------------------------------------------------------------===//
+// Test Fixture
+//===----------------------------------------------------------------------===//
+
+class OpenACCUtilsTilingTest : public ::testing::Test {
+protected:
+  OpenACCUtilsTilingTest() : b(&context), loc(UnknownLoc::get(&context)) {
+    context.loadDialect<acc::OpenACCDialect, arith::ArithDialect,
+                        func::FuncDialect, memref::MemRefDialect>();
+  }
+
+  // Create a LoopOp with the specified bounds using the short-form builder.
+  acc::LoopOp createLoopOp(OpBuilder &builder, ValueRange lbs, ValueRange ubs,
+                           ValueRange steps) {
+    auto loopOp = acc::LoopOp::create(builder, loc, lbs, ubs, steps,
+                                      acc::LoopParMode::loop_independent);
+
+    // Add body block with IV arguments and yield
+    Region &region = loopOp.getRegion();
+    Block *block = builder.createBlock(&region, region.begin());
+    for (Value lb : lbs)
+      block->addArgument(lb.getType(), loc);
+    builder.setInsertionPointToEnd(block);
+    acc::YieldOp::create(builder, loc);
+
+    return loopOp;
+  }
+
+  // Helper to count nested acc.loop ops within a loop
+  unsigned countNestedLoops(acc::LoopOp loop) {
+    unsigned count = 0;
+    loop.getBody().walk([&](acc::LoopOp) { ++count; });
+    return count;
+  }
+
+  // Helper to collect all nested acc.loop ops in order
+  SmallVector<acc::LoopOp> collectNestedLoops(acc::LoopOp loop) {
+    SmallVector<acc::LoopOp> loops;
+    loop.getBody().walk(
+        [&](acc::LoopOp nestedLoop) { loops.push_back(nestedLoop); });
+    return loops;
+  }
+
+  MLIRContext context;
+  OpBuilder b;
+  Location loc;
+};
+
+//===----------------------------------------------------------------------===//
+// tileACCLoops Tests
+//===----------------------------------------------------------------------===//
+
+TEST_F(OpenACCUtilsTilingTest, tileACCLoopsSingleLoop) {
+  // Create a module to hold the function
+  OwningOpRef<ModuleOp> module = ModuleOp::create(loc);
+  Block *moduleBlock = module->getBody();
+
+  OpBuilder::InsertionGuard guard(b);
+  b.setInsertionPointToStart(moduleBlock);
+
+  // Create a function
+  auto funcType = b.getFunctionType({}, {});
+  OwningOpRef<func::FuncOp> funcOp =
+      func::FuncOp::create(b, loc, "test_func", funcType);
+  Block *funcBlock = funcOp->addEntryBlock();
+
+  b.setInsertionPointToStart(funcBlock);
+
+  // Create loop bounds
+  Value lb =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(0));
+  Value ub =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(100));
+  Value step =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(1));
+  Value tileSize =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(4));
+
+  // Create the loop
+  acc::LoopOp loopOp = createLoopOp(b, {lb}, {ub}, {step});
+
+  // Tile the loop using IRRewriter
+  IRRewriter rewriter(&context);
+  rewriter.setInsertionPoint(loopOp);
+
+  SmallVector<acc::LoopOp> loopsToTile = {loopOp};
+  SmallVector<Value> tileSizes = {tileSize};
+
+  acc::LoopOp tiledLoop =
+      tileACCLoops(loopsToTile, tileSizes, /*defaultTileSize=*/128, rewriter);
+
+  // Verify the tiled loop was created
+  EXPECT_TRUE(tiledLoop != nullptr);
+  EXPECT_FALSE(tiledLoop.getBody().empty());
+
+  // After tiling a single loop with tile(4), we should have:
+  // - 1 tile loop (the outer loop)
+  // - 1 element loop nested inside
+  // Total: 1 nested loop inside the tile loop
+  EXPECT_EQ(countNestedLoops(tiledLoop), 1u);
+
+  // The tile loop (outer) should have 1 IV
+  EXPECT_EQ(tiledLoop.getBody().getNumArguments(), 1u);
+
+  // Collect nested loops and verify
+  auto nestedLoops = collectNestedLoops(tiledLoop);
+  EXPECT_EQ(nestedLoops.size(), 1u);
+  // The element loop should have 1 IV
+  if (!nestedLoops.empty())
+    EXPECT_EQ(nestedLoops[0].getBody().getNumArguments(), 1u);
+}
+
+TEST_F(OpenACCUtilsTilingTest, tileACCLoopsNestedLoops) {
+  // Create a module to hold the function
+  OwningOpRef<ModuleOp> module = ModuleOp::create(loc);
+  Block *moduleBlock = module->getBody();
+
+  OpBuilder::InsertionGuard guard(b);
+  b.setInsertionPointToStart(moduleBlock);
+
+  // Create a function
+  auto funcType = b.getFunctionType({}, {});
+  OwningOpRef<func::FuncOp> funcOp =
+      func::FuncOp::create(b, loc, "test_func", funcType);
+  Block *funcBlock = funcOp->addEntryBlock();
+
+  b.setInsertionPointToStart(funcBlock);
+
+  // Create loop bounds for outer loop
+  Value lb1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(0));
+  Value ub1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(100));
+  Value step1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(1));
+
+  // Create loop bounds for inner loop
+  Value lb2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(0));
+  Value ub2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(50));
+  Value step2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(1));
+
+  // Tile sizes
+  Value tileSize1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(4));
+  Value tileSize2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(8));
+
+  // Create outer loop
+  acc::LoopOp outerLoop = createLoopOp(b, {lb1}, {ub1}, {step1});
+
+  // Create inner loop inside outer loop
+  b.setInsertionPoint(outerLoop.getBody().getTerminator());
+  acc::LoopOp innerLoop = createLoopOp(b, {lb2}, {ub2}, {step2});
+
+  // Tile the loops
+  IRRewriter rewriter(&context);
+  rewriter.setInsertionPoint(outerLoop);
+
+  SmallVector<acc::LoopOp> loopsToTile = {outerLoop, innerLoop};
+  SmallVector<Value> tileSizes = {tileSize1, tileSize2};
+
+  acc::LoopOp tiledLoop =
+      tileACCLoops(loopsToTile, tileSizes, /*defaultTileSize=*/128, rewriter);
+
+  // Verify the tiled loop nest was created
+  EXPECT_TRUE(tiledLoop != nullptr);
+  EXPECT_FALSE(tiledLoop.getBody().empty());
+
+  // After tiling a 2-level nested loop with tile(4,8), we should have:
+  // tile_loop_1 -> tile_loop_2 -> element_loop_1 -> element_loop_2
+  // Total: 3 nested loops inside the outermost tile loop
+  unsigned nestedCount = countNestedLoops(tiledLoop);
+  EXPECT_EQ(nestedCount, 3u);
+
+  // The outermost tile loop should have 1 IV
+  EXPECT_EQ(tiledLoop.getBody().getNumArguments(), 1u);
+
+  // Collect all nested loops and verify each has 1 IV
+  auto nestedLoops = collectNestedLoops(tiledLoop);
+  EXPECT_EQ(nestedLoops.size(), 3u);
+  for (auto loop : nestedLoops)
+    EXPECT_EQ(loop.getBody().getNumArguments(), 1u);
+}
+
+//===----------------------------------------------------------------------===//
+// uncollapseLoops Tests
+//===----------------------------------------------------------------------===//
+
+TEST_F(OpenACCUtilsTilingTest, uncollapseLoopsBasic) {
+  // Create a module to hold the function
+  OwningOpRef<ModuleOp> module = ModuleOp::create(loc);
+  Block *moduleBlock = module->getBody();
+
+  OpBuilder::InsertionGuard guard(b);
+  b.setInsertionPointToStart(moduleBlock);
+
+  // Create a function
+  auto funcType = b.getFunctionType({}, {});
+  OwningOpRef<func::FuncOp> funcOp =
+      func::FuncOp::create(b, loc, "test_func", funcType);
+  Block *funcBlock = funcOp->addEntryBlock();
+
+  b.setInsertionPointToStart(funcBlock);
+
+  // Create loop bounds for a collapsed 2-level loop
+  Value lb1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(0));
+  Value ub1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(10));
+  Value step1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(1));
+  Value lb2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(0));
+  Value ub2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(20));
+  Value step2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(1));
+
+  // Create a collapsed loop with 2 IVs
+  acc::LoopOp collapsedLoop =
+      createLoopOp(b, {lb1, lb2}, {ub1, ub2}, {step1, step2});
+
+  // Set the collapse attribute
+  collapsedLoop.setCollapseForDeviceTypes(&context, {acc::DeviceType::None},
+                                          llvm::APInt(64, 1));
+
+  // Uncollapse the loop: tileCount=2, collapseCount=1
+  IRRewriter rewriter(&context);
+  rewriter.setInsertionPoint(collapsedLoop);
+
+  SmallVector<acc::LoopOp> uncollapsedLoops = uncollapseLoops(
+      collapsedLoop, /*tileCount=*/2, /*collapseCount=*/1, rewriter);
+
+  // Should produce 2 loops (one outer with collapse=1, one inner)
+  EXPECT_EQ(uncollapsedLoops.size(), 2u);
+
+  if (uncollapsedLoops.size() >= 2) {
+    // Verify the outer loop has 1 IV (collapseCount=1)
+    acc::LoopOp outerLoop = uncollapsedLoops[0];
+    EXPECT_EQ(outerLoop.getBody().getNumArguments(), 1u);
+    EXPECT_EQ(outerLoop.getLowerbound().size(), 1u);
+    EXPECT_EQ(outerLoop.getUpperbound().size(), 1u);
+    EXPECT_EQ(outerLoop.getStep().size(), 1u);
+
+    // Verify the inner loop has 1 IV
+    acc::LoopOp innerLoop = uncollapsedLoops[1];
+    EXPECT_EQ(innerLoop.getBody().getNumArguments(), 1u);
+    EXPECT_EQ(innerLoop.getLowerbound().size(), 1u);
+    EXPECT_EQ(innerLoop.getUpperbound().size(), 1u);
+    EXPECT_EQ(innerLoop.getStep().size(), 1u);
+
+    // Verify nesting: inner loop should be inside outer loop
+    unsigned nestedCount = countNestedLoops(outerLoop);
+    EXPECT_EQ(nestedCount, 1u);
+  }
+}
+
+TEST_F(OpenACCUtilsTilingTest, uncollapseLoopsThreeLevels) {
+  // Test uncollapsing with 3 levels: collapse(2) with tile(3)
+  OwningOpRef<ModuleOp> module = ModuleOp::create(loc);
+  Block *moduleBlock = module->getBody();
+
+  OpBuilder::InsertionGuard guard(b);
+  b.setInsertionPointToStart(moduleBlock);
+
+  auto funcType = b.getFunctionType({}, {});
+  OwningOpRef<func::FuncOp> funcOp =
+      func::FuncOp::create(b, loc, "test_func", funcType);
+  Block *funcBlock = funcOp->addEntryBlock();
+
+  b.setInsertionPointToStart(funcBlock);
+
+  // Create 3 sets of bounds
+  Value lb1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(0));
+  Value ub1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(10));
+  Value step1 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(1));
+  Value lb2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(0));
+  Value ub2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(20));
+  Value step2 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(1));
+  Value lb3 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(0));
+  Value ub3 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(30));
+  Value step3 =
+      arith::ConstantOp::create(b, loc, b.getIndexType(), b.getIndexAttr(1));
+
+  // Create a collapsed loop with 3 IVs
+  acc::LoopOp collapsedLoop =
+      createLoopOp(b, {lb1, lb2, lb3}, {ub1, ub2, ub3}, {step1, step2, step3});
+
+  // Set collapse(2)
+  collapsedLoop.setCollapseForDeviceTypes(&context, {acc::DeviceType::None},
+                                          llvm::APInt(64, 2));
+
+  // Uncollapse: tileCount=3, collapseCount=2
+  // This should create: outer loop with 2 IVs, then 1 inner loop
+  IRRewriter rewriter(&context);
+  rewriter.setInsertionPoint(collapsedLoop);
+
+  SmallVector<acc::LoopOp> uncollapsedLoops = uncollapseLoops(
+      collapsedLoop, /*tileCount=*/3, /*collapseCount=*/2, rewriter);
+
+  // Should produce 2 loops
+  EXPECT_EQ(uncollapsedLoops.size(), 2u);
+
+  if (uncollapsedLoops.size() >= 2) {
+    // Outer loop should have 2 IVs (from collapse=2)
+    acc::LoopOp outerLoop = uncollapsedLoops[0];
+    EXPECT_EQ(outerLoop.getBody().getNumArguments(), 2u);
+    EXPECT_EQ(outerLoop.getLowerbound().size(), 2u);
+
+    // Inner loop should have 1 IV (the 3rd dimension)
+    acc::LoopOp innerLoop = uncollapsedLoops[1];
+    EXPECT_EQ(innerLoop.getBody().getNumArguments(), 1u);
+    EXPECT_EQ(innerLoop.getLowerbound().size(), 1u);
+  }
+}
diff --git a/revert_patches.txt b/revert_patches.txt
index 57ede019ece2b..133652271d4e4 100644
--- a/revert_patches.txt
+++ b/revert_patches.txt
@@ -5,9 +5,6 @@ d57230c7 [AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions (#100485)
 breaks build of ROCmValidationSuite
 [C2y] Support WG14 N3457, the __COUNTER__ macro (#162662)
 ---
-complicated build, deferring
-[llvm][mlir][OpenMP] Support translation for linear clause in omp.wsloop
----
 needs more work to land
 [Flang] Move builtin .mod generation into runtimes (Reapply #137828)
 ---
diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel
index 45dada5884cb4..179658cadb0e2 100644
--- a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel
@@ -28,7 +28,6 @@ cc_library(
         exclude = [
             "Generators.cpp",
             "HTMLGenerator.cpp",
-            "HTMLMustacheGenerator.cpp",
            "MDGenerator.cpp",
             "YAMLGenerator.cpp",
         ],
@@ -53,7 +52,6 @@ cc_library(
     srcs = [
         "Generators.cpp",
         "HTMLGenerator.cpp",
-        "HTMLMustacheGenerator.cpp",
        "MDGenerator.cpp",
         "YAMLGenerator.cpp",
     ],
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 9967aa8e08fd9..e6f577d6665c2 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -10393,6 +10393,8 @@ cc_library(
     ),
     includes = ["include"],
    deps = [
+        ":ArithDialect",
+        ":DialectUtils",
         ":FunctionInterfaces",
         ":IR",
         ":OpenACCDialect",
@@ -10400,6 +10402,7 @@ cc_library(
         ":OpenACCPassIncGen",
         ":OpenACCTypeInterfacesIncGen",
         ":Support",
+        ":TransformUtils",
         ":ViewLikeInterface",
         "//llvm:Support",
         "//llvm:ir_headers",
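
Editor's aside, not part of the patch: the uncollapseLoops tests above encode a simple trip-count invariant that is worth sanity-checking by hand. The sketch below is illustrative only; the bounds (10, 20, 30) are the ones the uncollapseLoopsThreeLevels test constructs, and the arithmetic mirrors that test's comments about which IVs stay in the outer loop.

  // C++ sketch (illustrative): with collapse(2), the outer acc.loop keeps the
  // first two IVs, so its combined iteration space is ub1 * ub2; the remaining
  // dimension stays in the inner loop.
  #include <cstdio>

  int main() {
    const long ub1 = 10, ub2 = 20, ub3 = 30; // bounds built by the test
    std::printf("outer (2 collapsed IVs): %ld iterations\n", ub1 * ub2); // 200
    std::printf("inner: %ld iterations\n", ub3);                        // 30
    return 0;
  }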