From 800a47d6cd33ea1c2a888ceb67d566366c61e7ed Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Fri, 25 Oct 2024 10:17:16 -0400
Subject: [PATCH 01/39] [libc++][NFC] Fix include guards inside locale_base_api

---
 libcxx/include/__locale_dir/locale_base_api/android.h       | 6 +++---
 .../__locale_dir/locale_base_api/bsd_locale_defaults.h      | 6 +++---
 .../__locale_dir/locale_base_api/bsd_locale_fallbacks.h     | 6 +++---
 libcxx/include/__locale_dir/locale_base_api/fuchsia.h       | 6 +++---
 libcxx/include/__locale_dir/locale_base_api/ibm.h           | 6 +++---
 libcxx/include/__locale_dir/locale_base_api/locale_guard.h  | 6 +++---
 libcxx/include/__locale_dir/locale_base_api/musl.h          | 6 +++---
 libcxx/include/__locale_dir/locale_base_api/newlib.h        | 6 +++---
 libcxx/include/__locale_dir/locale_base_api/openbsd.h       | 6 +++---
 libcxx/include/__locale_dir/locale_base_api/win32.h         | 6 +++---
 10 files changed, 30 insertions(+), 30 deletions(-)
diff --git a/libcxx/include/__locale_dir/locale_base_api/android.h b/libcxx/include/__locale_dir/locale_base_api/android.h
index 9965d8bbf6a2ecc..08ef5407dedf4e0 100644
--- a/libcxx/include/__locale_dir/locale_base_api/android.h
+++ b/libcxx/include/__locale_dir/locale_base_api/android.h
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___LOCALE_LOCALE_BASE_API_ANDROID_H
-#define _LIBCPP___LOCALE_LOCALE_BASE_API_ANDROID_H
+#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_ANDROID_H
+#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_ANDROID_H
 
 #include <stdlib.h>
 
@@ -47,4 +47,4 @@ inline _LIBCPP_HIDE_FROM_ABI double strtod_l(const char* __nptr, char** __endptr
 #  endif // __NDK_MAJOR__ <= 16
 #endif   // __has_include(<android/ndk-version.h>)
 
-#endif // _LIBCPP___LOCALE_LOCALE_BASE_API_ANDROID_H
+#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_ANDROID_H
diff --git a/libcxx/include/__locale_dir/locale_base_api/bsd_locale_defaults.h b/libcxx/include/__locale_dir/locale_base_api/bsd_locale_defaults.h
index 1f9607209842cad..e88eb4fa41d7af9 100644
--- a/libcxx/include/__locale_dir/locale_base_api/bsd_locale_defaults.h
+++ b/libcxx/include/__locale_dir/locale_base_api/bsd_locale_defaults.h
@@ -11,8 +11,8 @@
 // we will define the mapping from an internal macro to the real BSD symbol.
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___LOCALE_LOCALE_BASE_API_BSD_LOCALE_DEFAULTS_H
-#define _LIBCPP___LOCALE_LOCALE_BASE_API_BSD_LOCALE_DEFAULTS_H
+#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_BSD_LOCALE_DEFAULTS_H
+#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_BSD_LOCALE_DEFAULTS_H
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -33,4 +33,4 @@
 #define __libcpp_asprintf_l(...) asprintf_l(__VA_ARGS__)
 #define __libcpp_sscanf_l(...) sscanf_l(__VA_ARGS__)
 
-#endif // _LIBCPP___LOCALE_LOCALE_BASE_API_BSD_LOCALE_DEFAULTS_H
+#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_BSD_LOCALE_DEFAULTS_H
diff --git a/libcxx/include/__locale_dir/locale_base_api/bsd_locale_fallbacks.h b/libcxx/include/__locale_dir/locale_base_api/bsd_locale_fallbacks.h
index 76b94287cd6cc88..5f99c7aea02a96a 100644
--- a/libcxx/include/__locale_dir/locale_base_api/bsd_locale_fallbacks.h
+++ b/libcxx/include/__locale_dir/locale_base_api/bsd_locale_fallbacks.h
@@ -10,8 +10,8 @@
 // of those functions for non-BSD platforms.
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___LOCALE_LOCALE_BASE_API_BSD_LOCALE_FALLBACKS_H
-#define _LIBCPP___LOCALE_LOCALE_BASE_API_BSD_LOCALE_FALLBACKS_H
+#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_BSD_LOCALE_FALLBACKS_H
+#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_BSD_LOCALE_FALLBACKS_H
 
 #include <__locale_dir/locale_base_api/locale_guard.h>
 #include <cstdio>
@@ -123,4 +123,4 @@ inline _LIBCPP_ATTRIBUTE_FORMAT(__scanf__, 3, 4) int __libcpp_sscanf_l(
 
 _LIBCPP_END_NAMESPACE_STD
 
-#endif // _LIBCPP___LOCALE_LOCALE_BASE_API_BSD_LOCALE_FALLBACKS_H
+#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_BSD_LOCALE_FALLBACKS_H
diff --git a/libcxx/include/__locale_dir/locale_base_api/fuchsia.h b/libcxx/include/__locale_dir/locale_base_api/fuchsia.h
index 4c3440f981c6d08..f6ef454ba7ada75 100644
--- a/libcxx/include/__locale_dir/locale_base_api/fuchsia.h
+++ b/libcxx/include/__locale_dir/locale_base_api/fuchsia.h
@@ -7,12 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___LOCALE_LOCALE_BASE_API_FUCHSIA_H
-#define _LIBCPP___LOCALE_LOCALE_BASE_API_FUCHSIA_H
+#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_FUCHSIA_H
+#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_FUCHSIA_H
 
 #include <__support/xlocale/__posix_l_fallback.h>
 #include <__support/xlocale/__strtonum_fallback.h>
 #include <cstdlib>
 #include <cwchar>
 
-#endif // _LIBCPP___LOCALE_LOCALE_BASE_API_FUCHSIA_H
+#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_FUCHSIA_H
diff --git a/libcxx/include/__locale_dir/locale_base_api/ibm.h b/libcxx/include/__locale_dir/locale_base_api/ibm.h
index fa3bc1c3633f5dc..1d1d15df9f7995e 100644
--- a/libcxx/include/__locale_dir/locale_base_api/ibm.h
+++ b/libcxx/include/__locale_dir/locale_base_api/ibm.h
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___LOCALE_LOCALE_BASE_API_IBM_H
-#define _LIBCPP___LOCALE_LOCALE_BASE_API_IBM_H
+#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_IBM_H
+#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_IBM_H
 
 #if defined(__MVS__)
 #  include <__support/ibm/locale_mgmt_zos.h>
@@ -105,4 +105,4 @@ _LIBCPP_ATTRIBUTE_FORMAT(__printf__, 2, 0) int vasprintf(char** strp, const char
   return str_size;
 }
 
-#endif // _LIBCPP___LOCALE_LOCALE_BASE_API_IBM_H
+#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_IBM_H
diff --git a/libcxx/include/__locale_dir/locale_base_api/locale_guard.h b/libcxx/include/__locale_dir/locale_base_api/locale_guard.h
index 2baacb51cd06555..7d15f2d253adc39 100644
--- a/libcxx/include/__locale_dir/locale_base_api/locale_guard.h
+++ b/libcxx/include/__locale_dir/locale_base_api/locale_guard.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___LOCALE_LOCALE_BASE_API_LOCALE_GUARD_H
-#define _LIBCPP___LOCALE_LOCALE_BASE_API_LOCALE_GUARD_H
+#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_LOCALE_GUARD_H
+#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_LOCALE_GUARD_H
 
 #include <__config>
 #include <__locale> // for locale_t
@@ -75,4 +75,4 @@ struct __libcpp_locale_guard {
 
 _LIBCPP_END_NAMESPACE_STD
 
-#endif // _LIBCPP___LOCALE_LOCALE_BASE_API_LOCALE_GUARD_H
+#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_LOCALE_GUARD_H
diff --git a/libcxx/include/__locale_dir/locale_base_api/musl.h b/libcxx/include/__locale_dir/locale_base_api/musl.h
index bf7b849d5863421..1653214cdba1e39 100644
--- a/libcxx/include/__locale_dir/locale_base_api/musl.h
+++ b/libcxx/include/__locale_dir/locale_base_api/musl.h
@@ -14,8 +14,8 @@
 // in Musl.
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___LOCALE_LOCALE_BASE_API_MUSL_H
-#define _LIBCPP___LOCALE_LOCALE_BASE_API_MUSL_H
+#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_MUSL_H
+#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_MUSL_H
 
 #include <cstdlib>
 #include <cwchar>
@@ -28,4 +28,4 @@ inline _LIBCPP_HIDE_FROM_ABI unsigned long long strtoull_l(const char* __nptr, c
   return ::strtoull(__nptr, __endptr, __base);
 }
 
-#endif // _LIBCPP___LOCALE_LOCALE_BASE_API_MUSL_H
+#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_MUSL_H
diff --git a/libcxx/include/__locale_dir/locale_base_api/newlib.h b/libcxx/include/__locale_dir/locale_base_api/newlib.h
index a8c1cff16e6d800..7da10e5889843dd 100644
--- a/libcxx/include/__locale_dir/locale_base_api/newlib.h
+++ b/libcxx/include/__locale_dir/locale_base_api/newlib.h
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___LOCALE_LOCALE_BASE_API_NEWLIB_H
-#define _LIBCPP___LOCALE_LOCALE_BASE_API_NEWLIB_H
+#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_NEWLIB_H
+#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_NEWLIB_H
 
-#endif // _LIBCPP___LOCALE_LOCALE_BASE_API_NEWLIB_H
+#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_NEWLIB_H
diff --git a/libcxx/include/__locale_dir/locale_base_api/openbsd.h b/libcxx/include/__locale_dir/locale_base_api/openbsd.h
index 0c05d6a0f788747..d4fb224e0c80a09 100644
--- a/libcxx/include/__locale_dir/locale_base_api/openbsd.h
+++ b/libcxx/include/__locale_dir/locale_base_api/openbsd.h
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___LOCALE_LOCALE_BASE_API_OPENBSD_H
-#define _LIBCPP___LOCALE_LOCALE_BASE_API_OPENBSD_H
+#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_OPENBSD_H
+#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_OPENBSD_H
 
 #include <__support/xlocale/__strtonum_fallback.h>
 #include <clocale>
@@ -16,4 +16,4 @@
 #include <ctype.h>
 #include <cwctype>
 
-#endif // _LIBCPP___LOCALE_LOCALE_BASE_API_OPENBSD_H
+#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_OPENBSD_H
diff --git a/libcxx/include/__locale_dir/locale_base_api/win32.h b/libcxx/include/__locale_dir/locale_base_api/win32.h
index f66baffb6920456..f488a0dc0d69b3f 100644
--- a/libcxx/include/__locale_dir/locale_base_api/win32.h
+++ b/libcxx/include/__locale_dir/locale_base_api/win32.h
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___LOCALE_LOCALE_BASE_API_WIN32_H
-#define _LIBCPP___LOCALE_LOCALE_BASE_API_WIN32_H
+#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_WIN32_H
+#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_WIN32_H
 
 #include <__config>
 #include <cstddef>
@@ -232,4 +232,4 @@ _LIBCPP_EXPORTED_FROM_ABI int vasprintf_l(char** __ret, locale_t __loc, const ch
 // not-so-pressing FIXME: use locale to determine blank characters
 inline int iswblank_l(wint_t __c, locale_t /*loc*/) { return (__c == L' ' || __c == L'\t'); }
 
-#endif // _LIBCPP___LOCALE_LOCALE_BASE_API_WIN32_H
+#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_WIN32_H

From 577c7dd7cc4c5a9f62f9654cfa30ee9d55709426 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Fri, 25 Oct 2024 15:20:24 +0100
Subject: [PATCH 02/39] [AArch64] Add a phase-ordering test for vectorizing
 predicated selects. NFC

---
 .../AArch64/predicated-reduction.ll           | 294 ++++++++++++++++++
 1 file changed, 294 insertions(+)
 create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll

diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll
new file mode 100644
index 000000000000000..7274e952567693d
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll
@@ -0,0 +1,294 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes="default<O3>" -S < %s  | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "aarch64"
+
+define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef %RAND_BLOCK_LENGTH, ptr noundef %samples, double noundef nofpclass(nan inf) %Y, double noundef nofpclass(nan inf) %Z) {
+; CHECK-LABEL: define nofpclass(nan inf) double @monte_simple(
+; CHECK-SAME: i32 noundef [[NBLOCKS:%.*]], i32 noundef [[RAND_BLOCK_LENGTH:%.*]], ptr nocapture noundef readonly [[SAMPLES:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP8:%.*]] = icmp sgt i32 [[RAND_BLOCK_LENGTH]], 0
+; CHECK-NEXT:    br i1 [[CMP8]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_END:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[RAND_BLOCK_LENGTH]] to i64
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[V1_011:%.*]] = phi double [ 0.000000e+00, %[[FOR_BODY_PREHEADER]] ], [ [[V1_1:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[V0_010:%.*]] = phi double [ 0.000000e+00, %[[FOR_BODY_PREHEADER]] ], [ [[V0_1:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[SAMPLES]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[CONV:%.*]] = fpext float [[TMP0]] to double
+; CHECK-NEXT:    [[MUL:%.*]] = fmul fast double [[Y]], [[CONV]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub fast double [[MUL]], [[Z]]
+; CHECK-NEXT:    [[CMP1:%.*]] = fcmp fast ogt double [[SUB]], 0.000000e+00
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast double [[SUB]], [[V0_010]]
+; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast double [[SUB]], [[SUB]]
+; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast double [[MUL3]], [[V1_011]]
+; CHECK-NEXT:    [[V0_1]] = select i1 [[CMP1]], double [[ADD]], double [[V0_010]]
+; CHECK-NEXT:    [[V1_1]] = select i1 [[CMP1]], double [[ADD4]], double [[V1_011]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY]]
+; CHECK:       [[FOR_END_LOOPEXIT]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast double [[V1_1]], [[V0_1]]
+; CHECK-NEXT:    br label %[[FOR_END]]
+; CHECK:       [[FOR_END]]:
+; CHECK-NEXT:    [[ADD5:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP1]], %[[FOR_END_LOOPEXIT]] ]
+; CHECK-NEXT:    ret double [[ADD5]]
+;
+entry:
+  %nblocks.addr = alloca i32, align 4
+  %RAND_BLOCK_LENGTH.addr = alloca i32, align 4
+  %samples.addr = alloca ptr, align 8
+  %Y.addr = alloca double, align 8
+  %Z.addr = alloca double, align 8
+  %i = alloca i32, align 4
+  %block = alloca i32, align 4
+  %rngVal = alloca double, align 8
+  %callValue = alloca double, align 8
+  %v0 = alloca double, align 8
+  %v1 = alloca double, align 8
+  store i32 %nblocks, ptr %nblocks.addr, align 4
+  store i32 %RAND_BLOCK_LENGTH, ptr %RAND_BLOCK_LENGTH.addr, align 4
+  store ptr %samples, ptr %samples.addr, align 8
+  store double %Y, ptr %Y.addr, align 8
+  store double %Z, ptr %Z.addr, align 8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2
+  call void @llvm.lifetime.start.p0(i64 4, ptr %block) #2
+  call void @llvm.lifetime.start.p0(i64 8, ptr %rngVal) #2
+  call void @llvm.lifetime.start.p0(i64 8, ptr %callValue) #2
+  call void @llvm.lifetime.start.p0(i64 8, ptr %v0) #2
+  store double 0.000000e+00, ptr %v0, align 8
+  call void @llvm.lifetime.start.p0(i64 8, ptr %v1) #2
+  store double 0.000000e+00, ptr %v1, align 8
+  store i32 0, ptr %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, ptr %i, align 4
+  %1 = load i32, ptr %RAND_BLOCK_LENGTH.addr, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %2 = load ptr, ptr %samples.addr, align 8
+  %3 = load i32, ptr %i, align 4
+  %idxprom = sext i32 %3 to i64
+  %arrayidx = getelementptr inbounds float, ptr %2, i64 %idxprom
+  %4 = load float, ptr %arrayidx, align 4
+  %conv = fpext float %4 to double
+  store double %conv, ptr %rngVal, align 8
+  %5 = load double, ptr %Y.addr, align 8
+  %6 = load double, ptr %rngVal, align 8
+  %mul = fmul fast double %5, %6
+  %7 = load double, ptr %Z.addr, align 8
+  %sub = fsub fast double %mul, %7
+  store double %sub, ptr %callValue, align 8
+  %8 = load double, ptr %callValue, align 8
+  %cmp1 = fcmp fast ogt double %8, 0.000000e+00
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %9 = load double, ptr %callValue, align 8
+  %10 = load double, ptr %v0, align 8
+  %add = fadd fast double %10, %9
+  store double %add, ptr %v0, align 8
+  %11 = load double, ptr %callValue, align 8
+  %12 = load double, ptr %callValue, align 8
+  %mul3 = fmul fast double %11, %12
+  %13 = load double, ptr %v1, align 8
+  %add4 = fadd fast double %13, %mul3
+  store double %add4, ptr %v1, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %14 = load i32, ptr %i, align 4
+  %inc = add nsw i32 %14, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %15 = load double, ptr %v0, align 8
+  %16 = load double, ptr %v1, align 8
+  %add5 = fadd fast double %15, %16
+  call void @llvm.lifetime.end.p0(i64 8, ptr %v1) #2
+  call void @llvm.lifetime.end.p0(i64 8, ptr %v0) #2
+  call void @llvm.lifetime.end.p0(i64 8, ptr %callValue) #2
+  call void @llvm.lifetime.end.p0(i64 8, ptr %rngVal) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %block) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2
+  ret double %add5
+}
+
+define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %RAND_BLOCK_LENGTH, ptr noundef %samples, double noundef nofpclass(nan inf) %Y, double noundef nofpclass(nan inf) %Z) {
+; CHECK-LABEL: define nofpclass(nan inf) double @monte_exp(
+; CHECK-SAME: i32 noundef [[NBLOCKS:%.*]], i32 noundef [[RAND_BLOCK_LENGTH:%.*]], ptr noundef [[SAMPLES:%.*]], double noundef nofpclass(nan inf) [[Y:%.*]], double noundef nofpclass(nan inf) [[Z:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[CMP16:%.*]] = icmp sgt i32 [[NBLOCKS]], 0
+; CHECK-NEXT:    br i1 [[CMP16]], label %[[FOR_BODY_LR_PH:.*]], label %[[FOR_END10:.*]]
+; CHECK:       [[FOR_BODY_LR_PH]]:
+; CHECK-NEXT:    [[CMP211:%.*]] = icmp sgt i32 [[RAND_BLOCK_LENGTH]], 0
+; CHECK-NEXT:    br i1 [[CMP211]], label %[[FOR_BODY_US_PREHEADER:.*]], label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY_US_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[RAND_BLOCK_LENGTH]] to i64
+; CHECK-NEXT:    br label %[[FOR_BODY_US:.*]]
+; CHECK:       [[FOR_BODY_US]]:
+; CHECK-NEXT:    [[V1_019_US:%.*]] = phi double [ [[V1_2_US:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US:.*]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ]
+; CHECK-NEXT:    [[V0_018_US:%.*]] = phi double [ [[V0_2_US:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ]
+; CHECK-NEXT:    [[BLOCK_017_US:%.*]] = phi i32 [ [[INC9_US:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0, %[[FOR_BODY_US_PREHEADER]] ]
+; CHECK-NEXT:    tail call void @resample(i32 noundef [[RAND_BLOCK_LENGTH]], ptr noundef [[SAMPLES]])
+; CHECK-NEXT:    br label %[[FOR_BODY3_US:.*]]
+; CHECK:       [[FOR_BODY3_US]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[FOR_BODY_US]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3_US]] ]
+; CHECK-NEXT:    [[V1_114_US:%.*]] = phi double [ [[V1_019_US]], %[[FOR_BODY_US]] ], [ [[V1_2_US]], %[[FOR_BODY3_US]] ]
+; CHECK-NEXT:    [[V0_113_US:%.*]] = phi double [ [[V0_018_US]], %[[FOR_BODY_US]] ], [ [[V0_2_US]], %[[FOR_BODY3_US]] ]
+; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds float, ptr [[SAMPLES]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX_US]], align 4
+; CHECK-NEXT:    [[CONV_US:%.*]] = fpext float [[TMP0]] to double
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast double @llvm.exp2.f64(double [[CONV_US]])
+; CHECK-NEXT:    [[MUL_US:%.*]] = fmul fast double [[TMP1]], [[Y]]
+; CHECK-NEXT:    [[SUB_US:%.*]] = fsub fast double [[MUL_US]], [[Z]]
+; CHECK-NEXT:    [[CMP4_US:%.*]] = fcmp fast ogt double [[SUB_US]], 0.000000e+00
+; CHECK-NEXT:    [[ADD_US:%.*]] = fadd fast double [[SUB_US]], [[V0_113_US]]
+; CHECK-NEXT:    [[MUL6_US:%.*]] = fmul fast double [[SUB_US]], [[SUB_US]]
+; CHECK-NEXT:    [[ADD7_US:%.*]] = fadd fast double [[MUL6_US]], [[V1_114_US]]
+; CHECK-NEXT:    [[V0_2_US]] = select i1 [[CMP4_US]], double [[ADD_US]], double [[V0_113_US]]
+; CHECK-NEXT:    [[V1_2_US]] = select i1 [[CMP4_US]], double [[ADD7_US]], double [[V1_114_US]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND25_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND25_NOT]], label %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]], label %[[FOR_BODY3_US]]
+; CHECK:       [[FOR_COND1_FOR_INC8_CRIT_EDGE_US]]:
+; CHECK-NEXT:    [[INC9_US]] = add nuw nsw i32 [[BLOCK_017_US]], 1
+; CHECK-NEXT:    [[EXITCOND26_NOT:%.*]] = icmp eq i32 [[INC9_US]], [[NBLOCKS]]
+; CHECK-NEXT:    br i1 [[EXITCOND26_NOT]], label %[[FOR_END10]], label %[[FOR_BODY_US]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[BLOCK_017:%.*]] = phi i32 [ [[INC9:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_LR_PH]] ]
+; CHECK-NEXT:    tail call void @resample(i32 noundef [[RAND_BLOCK_LENGTH]], ptr noundef [[SAMPLES]])
+; CHECK-NEXT:    [[INC9]] = add nuw nsw i32 [[BLOCK_017]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[NBLOCKS]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_END10]], label %[[FOR_BODY]]
+; CHECK:       [[FOR_END10]]:
+; CHECK-NEXT:    [[V0_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V0_2_US]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[V1_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V1_2_US]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ADD11:%.*]] = fadd fast double [[V1_0_LCSSA]], [[V0_0_LCSSA]]
+; CHECK-NEXT:    ret double [[ADD11]]
+;
+entry:
+  %nblocks.addr = alloca i32, align 4
+  %RAND_BLOCK_LENGTH.addr = alloca i32, align 4
+  %samples.addr = alloca ptr, align 8
+  %Y.addr = alloca double, align 8
+  %Z.addr = alloca double, align 8
+  %i = alloca i32, align 4
+  %block = alloca i32, align 4
+  %rngVal = alloca double, align 8
+  %callValue = alloca double, align 8
+  %v0 = alloca double, align 8
+  %v1 = alloca double, align 8
+  store i32 %nblocks, ptr %nblocks.addr, align 4
+  store i32 %RAND_BLOCK_LENGTH, ptr %RAND_BLOCK_LENGTH.addr, align 4
+  store ptr %samples, ptr %samples.addr, align 8
+  store double %Y, ptr %Y.addr, align 8
+  store double %Z, ptr %Z.addr, align 8
+  call void @llvm.lifetime.start.p0(i64 4, ptr %i) #4
+  call void @llvm.lifetime.start.p0(i64 4, ptr %block) #4
+  call void @llvm.lifetime.start.p0(i64 8, ptr %rngVal) #4
+  call void @llvm.lifetime.start.p0(i64 8, ptr %callValue) #4
+  call void @llvm.lifetime.start.p0(i64 8, ptr %v0) #4
+  store double 0.000000e+00, ptr %v0, align 8
+  call void @llvm.lifetime.start.p0(i64 8, ptr %v1) #4
+  store double 0.000000e+00, ptr %v1, align 8
+  store i32 0, ptr %block, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc8, %entry
+  %0 = load i32, ptr %block, align 4
+  %1 = load i32, ptr %nblocks.addr, align 4
+  %cmp = icmp slt i32 %0, %1
+  br i1 %cmp, label %for.body, label %for.end10
+
+for.body:                                         ; preds = %for.cond
+  %2 = load i32, ptr %RAND_BLOCK_LENGTH.addr, align 4
+  %3 = load ptr, ptr %samples.addr, align 8
+  call void @resample(i32 noundef %2, ptr noundef %3)
+  store i32 0, ptr %i, align 4
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc, %for.body
+  %4 = load i32, ptr %i, align 4
+  %5 = load i32, ptr %RAND_BLOCK_LENGTH.addr, align 4
+  %cmp2 = icmp slt i32 %4, %5
+  br i1 %cmp2, label %for.body3, label %for.end
+
+for.body3:                                        ; preds = %for.cond1
+  %6 = load ptr, ptr %samples.addr, align 8
+  %7 = load i32, ptr %i, align 4
+  %idxprom = sext i32 %7 to i64
+  %arrayidx = getelementptr inbounds float, ptr %6, i64 %idxprom
+  %8 = load float, ptr %arrayidx, align 4
+  %conv = fpext float %8 to double
+  store double %conv, ptr %rngVal, align 8
+  %9 = load double, ptr %Y.addr, align 8
+  %10 = load double, ptr %rngVal, align 8
+  %11 = call fast double @llvm.exp2.f64(double %10)
+  %mul = fmul fast double %9, %11
+  %12 = load double, ptr %Z.addr, align 8
+  %sub = fsub fast double %mul, %12
+  store double %sub, ptr %callValue, align 8
+  %13 = load double, ptr %callValue, align 8
+  %cmp4 = fcmp fast ogt double %13, 0.000000e+00
+  br i1 %cmp4, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body3
+  %14 = load double, ptr %callValue, align 8
+  %15 = load double, ptr %v0, align 8
+  %add = fadd fast double %15, %14
+  store double %add, ptr %v0, align 8
+  %16 = load double, ptr %callValue, align 8
+  %17 = load double, ptr %callValue, align 8
+  %mul6 = fmul fast double %16, %17
+  %18 = load double, ptr %v1, align 8
+  %add7 = fadd fast double %18, %mul6
+  store double %add7, ptr %v1, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body3
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %19 = load i32, ptr %i, align 4
+  %inc = add nsw i32 %19, 1
+  store i32 %inc, ptr %i, align 4
+  br label %for.cond1
+
+for.end:                                          ; preds = %for.cond1
+  br label %for.inc8
+
+for.inc8:                                         ; preds = %for.end
+  %20 = load i32, ptr %block, align 4
+  %inc9 = add nsw i32 %20, 1
+  store i32 %inc9, ptr %block, align 4
+  br label %for.cond
+
+for.end10:                                        ; preds = %for.cond
+  %21 = load double, ptr %v0, align 8
+  %22 = load double, ptr %v1, align 8
+  %add11 = fadd fast double %21, %22
+  call void @llvm.lifetime.end.p0(i64 8, ptr %v1) #4
+  call void @llvm.lifetime.end.p0(i64 8, ptr %v0) #4
+  call void @llvm.lifetime.end.p0(i64 8, ptr %callValue) #4
+  call void @llvm.lifetime.end.p0(i64 8, ptr %rngVal) #4
+  call void @llvm.lifetime.end.p0(i64 4, ptr %block) #4
+  call void @llvm.lifetime.end.p0(i64 4, ptr %i) #4
+  ret double %add11
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
+declare void @resample(i32 noundef, ptr noundef)
+declare double @llvm.exp2.f64(double)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)

From 9f6c632ecda08bfff76b798c46d5d7cfde57b5e9 Mon Sep 17 00:00:00 2001
From: Andrea Faulds <andrea.faulds@amd.com>
Date: Fri, 25 Oct 2024 16:21:59 +0200
Subject: [PATCH 03/39] [mlir][mlir-spirv-cpu-runner] Move MLIR pass pipeline
 to mlir-opt (#113594)

Adds a new mlir-opt test-only pass, -test-spirv-cpu-runner-pipeline,
which runs the set of MLIR passes needed for the mlir-spirv-cpu-runner,
and removes them from the runner. The tests are changed to invoke
mlir-opt with this flag before running the runner. The eventual goal is
to move all host/device code generation steps out of the runner, like
with some of the other runners.

Recommit of 17e9752267ed9c81c8da87f3a6d0e01f130b0d04. It was reverted
due to a build failure, but the build failure had in fact already been
fixed in e7302319b52e3d231216d54d10622b0698928a96.
---
 mlir/test/lib/Pass/CMakeLists.txt             |  1 +
 .../lib/Pass/TestSPIRVCPURunnerPipeline.cpp   | 47 +++++++++++++++++++
 mlir/test/mlir-spirv-cpu-runner/double.mlir   |  3 +-
 .../mlir-spirv-cpu-runner/simple_add.mlir     |  3 +-
 mlir/tools/mlir-opt/mlir-opt.cpp              |  2 +
 .../mlir-spirv-cpu-runner.cpp                 | 24 ----------
 6 files changed, 54 insertions(+), 26 deletions(-)
 create mode 100644 mlir/test/lib/Pass/TestSPIRVCPURunnerPipeline.cpp

diff --git a/mlir/test/lib/Pass/CMakeLists.txt b/mlir/test/lib/Pass/CMakeLists.txt
index b190f054e50bd1c..f489b7e51e5038a 100644
--- a/mlir/test/lib/Pass/CMakeLists.txt
+++ b/mlir/test/lib/Pass/CMakeLists.txt
@@ -3,6 +3,7 @@ get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
 add_mlir_library(MLIRTestPass
   TestDynamicPipeline.cpp
   TestPassManager.cpp
+  TestSPIRVCPURunnerPipeline.cpp
 
   EXCLUDE_FROM_LIBMLIR
 
diff --git a/mlir/test/lib/Pass/TestSPIRVCPURunnerPipeline.cpp b/mlir/test/lib/Pass/TestSPIRVCPURunnerPipeline.cpp
new file mode 100644
index 000000000000000..ded0d22c31307e9
--- /dev/null
+++ b/mlir/test/lib/Pass/TestSPIRVCPURunnerPipeline.cpp
@@ -0,0 +1,47 @@
+//===------------------ TestSPIRVCPURunnerPipeline.cpp --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements a pipeline for use by mlir-spirv-cpu-runner tests.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/GPUToSPIRV/GPUToSPIRVPass.h"
+#include "mlir/Conversion/SPIRVToLLVM/SPIRVToLLVMPass.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
+#include "mlir/Dialect/SPIRV/Transforms/Passes.h"
+#include "mlir/Pass/PassManager.h"
+
+using namespace mlir;
+
+namespace {
+
+void buildTestSPIRVCPURunnerPipeline(OpPassManager &passManager) {
+  passManager.addPass(createGpuKernelOutliningPass());
+  passManager.addPass(createConvertGPUToSPIRVPass(/*mapMemorySpace=*/true));
+
+  OpPassManager &nestedPM = passManager.nest<spirv::ModuleOp>();
+  nestedPM.addPass(spirv::createSPIRVLowerABIAttributesPass());
+  nestedPM.addPass(spirv::createSPIRVUpdateVCEPass());
+  passManager.addPass(createLowerHostCodeToLLVMPass());
+  passManager.addPass(createConvertSPIRVToLLVMPass());
+}
+
+} // namespace
+
+namespace mlir {
+namespace test {
+void registerTestSPIRVCPURunnerPipeline() {
+  PassPipelineRegistration<>(
+      "test-spirv-cpu-runner-pipeline",
+      "Runs a series of passes for lowering SPIR-V-dialect MLIR to "
+      "LLVM-dialect MLIR intended for mlir-spirv-cpu-runner.",
+      buildTestSPIRVCPURunnerPipeline);
+}
+} // namespace test
+} // namespace mlir
diff --git a/mlir/test/mlir-spirv-cpu-runner/double.mlir b/mlir/test/mlir-spirv-cpu-runner/double.mlir
index cd551ffb1bd0623..35557ba1e94c003 100644
--- a/mlir/test/mlir-spirv-cpu-runner/double.mlir
+++ b/mlir/test/mlir-spirv-cpu-runner/double.mlir
@@ -1,4 +1,5 @@
-// RUN: mlir-spirv-cpu-runner %s -e main --entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_test_spirv_cpu_runner_c_wrappers \
+// RUN: mlir-opt %s -test-spirv-cpu-runner-pipeline \
+// RUN: | mlir-spirv-cpu-runner - -e main --entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_test_spirv_cpu_runner_c_wrappers \
 // RUN: | FileCheck %s
 
 // CHECK: [8,  8,  8,  8,  8,  8]
diff --git a/mlir/test/mlir-spirv-cpu-runner/simple_add.mlir b/mlir/test/mlir-spirv-cpu-runner/simple_add.mlir
index 119e973e45e4a7b..75675a69a675833 100644
--- a/mlir/test/mlir-spirv-cpu-runner/simple_add.mlir
+++ b/mlir/test/mlir-spirv-cpu-runner/simple_add.mlir
@@ -1,4 +1,5 @@
-// RUN: mlir-spirv-cpu-runner %s -e main --entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_test_spirv_cpu_runner_c_wrappers \
+// RUN: mlir-opt %s -test-spirv-cpu-runner-pipeline \
+// RUN: | mlir-spirv-cpu-runner - -e main --entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_test_spirv_cpu_runner_c_wrappers \
 // RUN: | FileCheck %s
 
 // CHECK: data =
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 36b142484bb04a6..002c3900056dee1 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -142,6 +142,7 @@ void registerTestSCFWhileOpBuilderPass();
 void registerTestSCFWrapInZeroTripCheckPasses();
 void registerTestShapeMappingPass();
 void registerTestSliceAnalysisPass();
+void registerTestSPIRVCPURunnerPipeline();
 void registerTestSPIRVFuncSignatureConversion();
 void registerTestSPIRVVectorUnrolling();
 void registerTestTensorCopyInsertionPass();
@@ -278,6 +279,7 @@ void registerTestPasses() {
   mlir::test::registerTestSCFWrapInZeroTripCheckPasses();
   mlir::test::registerTestShapeMappingPass();
   mlir::test::registerTestSliceAnalysisPass();
+  mlir::test::registerTestSPIRVCPURunnerPipeline();
   mlir::test::registerTestSPIRVFuncSignatureConversion();
   mlir::test::registerTestSPIRVVectorUnrolling();
   mlir::test::registerTestTensorCopyInsertionPass();
diff --git a/mlir/tools/mlir-spirv-cpu-runner/mlir-spirv-cpu-runner.cpp b/mlir/tools/mlir-spirv-cpu-runner/mlir-spirv-cpu-runner.cpp
index 7e0b51cac806213..22ad1024db4a0b6 100644
--- a/mlir/tools/mlir-spirv-cpu-runner/mlir-spirv-cpu-runner.cpp
+++ b/mlir/tools/mlir-spirv-cpu-runner/mlir-spirv-cpu-runner.cpp
@@ -12,18 +12,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
-#include "mlir/Conversion/GPUToSPIRV/GPUToSPIRVPass.h"
-#include "mlir/Conversion/SPIRVToLLVM/SPIRVToLLVMPass.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
-#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
-#include "mlir/Dialect/SPIRV/Transforms/Passes.h"
 #include "mlir/ExecutionEngine/JitRunner.h"
 #include "mlir/ExecutionEngine/OptUtils.h"
 #include "mlir/Pass/Pass.h"
@@ -75,23 +69,6 @@ convertMLIRModule(Operation *op, llvm::LLVMContext &context) {
   return mainModule;
 }
 
-static LogicalResult runMLIRPasses(Operation *module,
-                                   JitRunnerOptions &options) {
-  PassManager passManager(module->getContext(),
-                          module->getName().getStringRef());
-  if (failed(applyPassManagerCLOptions(passManager)))
-    return failure();
-  passManager.addPass(createGpuKernelOutliningPass());
-  passManager.addPass(createConvertGPUToSPIRVPass(/*mapMemorySpace=*/true));
-
-  OpPassManager &nestedPM = passManager.nest<spirv::ModuleOp>();
-  nestedPM.addPass(spirv::createSPIRVLowerABIAttributesPass());
-  nestedPM.addPass(spirv::createSPIRVUpdateVCEPass());
-  passManager.addPass(createLowerHostCodeToLLVMPass());
-  passManager.addPass(createConvertSPIRVToLLVMPass());
-  return passManager.run(module);
-}
-
 int main(int argc, char **argv) {
   llvm::InitLLVM y(argc, argv);
 
@@ -99,7 +76,6 @@ int main(int argc, char **argv) {
   llvm::InitializeNativeTargetAsmPrinter();
 
   mlir::JitRunnerConfig jitRunnerConfig;
-  jitRunnerConfig.mlirTransformer = runMLIRPasses;
   jitRunnerConfig.llvmModuleBuilder = convertMLIRModule;
 
   mlir::DialectRegistry registry;

From cbdfb18794026b0d662d7de1fa39c02ad6227abb Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb@igalia.com>
Date: Fri, 25 Oct 2024 15:39:07 +0100
Subject: [PATCH 04/39] [RISCV] Add Supm extension to RVA23 profiles (#113619)

This is mandatory for both RVA23U64 and RVA23S64 in the ratified version
of the specification

<https://github.com/riscv/riscv-profiles/blob/main/src/rva23-profile.adoc>.
---
 llvm/lib/Target/RISCV/RISCVProfiles.td | 3 ++-
 llvm/test/CodeGen/RISCV/attributes.ll  | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVProfiles.td b/llvm/lib/Target/RISCV/RISCVProfiles.td
index 157e087a64da07b..ce7d1973989fc13 100644
--- a/llvm/lib/Target/RISCV/RISCVProfiles.td
+++ b/llvm/lib/Target/RISCV/RISCVProfiles.td
@@ -73,7 +73,8 @@ defvar RVA23U64Features = !listconcat(RVA22U64Features,
                                        FeatureStdExtZcmop,
                                        FeatureStdExtZcb,
                                        FeatureStdExtZfa,
-                                       FeatureStdExtZawrs]);
+                                       FeatureStdExtZawrs,
+                                       FeatureStdExtSupm]);
 
 defvar RVA23S64BaseFeatures = !listconcat(RVA22S64BaseFeatures,
                                           [FeatureStdExtSvnapot,
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index e9743d484f776f0..9be9ddd05ee2900 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -578,8 +578,8 @@
 ; RVA20S64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicsr2p0_zifencei2p0_zmmul1p0_za128rs1p0_ssccptr1p0_sstvala1p0_sstvecd1p0_svade1p0_svbare1p0"
 ; RVA22U64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicsr2p0_zihintpause2p0_zihpm2p0_zmmul1p0_za64rs1p0_zfhmin1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0"
 ; RVA22S64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicsr2p0_zifencei2p0_zihintpause2p0_zihpm2p0_zmmul1p0_za64rs1p0_zfhmin1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_ssccptr1p0_sscounterenw1p0_sstvala1p0_sstvecd1p0_svade1p0_svbare1p0_svinval1p0_svpbmt1p0"
-; RVA23U64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_v1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zawrs1p0_zfa1p0_zfhmin1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_zvbb1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfhmin1p0_zvkb1p0_zvkt1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0"
-; RVA23S64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_v1p0_h1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zifencei2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zawrs1p0_zfa1p0_zfhmin1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_zvbb1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfhmin1p0_zvkb1p0_zvkt1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_shcounterenw1p0_shgatpa1p0_shtvala1p0_shvsatpa1p0_shvstvala1p0_shvstvecd1p0_ssccptr1p0_sscofpmf1p0_sscounterenw1p0_ssnpm1p0_ssstateen1p0_sstc1p0_sstvala1p0_sstvecd1p0_ssu64xl1p0_svade1p0_svbare1p0_svinval1p0_svnapot1p0_svpbmt1p0"
+; RVA23U64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_v1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zawrs1p0_zfa1p0_zfhmin1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_zvbb1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfhmin1p0_zvkb1p0_zvkt1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_supm1p0"
+; RVA23S64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_v1p0_h1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zifencei2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zawrs1p0_zfa1p0_zfhmin1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_zvbb1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfhmin1p0_zvkb1p0_zvkt1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_shcounterenw1p0_shgatpa1p0_shtvala1p0_shvsatpa1p0_shvstvala1p0_shvstvecd1p0_ssccptr1p0_sscofpmf1p0_sscounterenw1p0_ssnpm1p0_ssstateen1p0_sstc1p0_sstvala1p0_sstvecd1p0_ssu64xl1p0_supm1p0_svade1p0_svbare1p0_svinval1p0_svnapot1p0_svpbmt1p0"
 ; RVB23U64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zawrs1p0_zfa1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0"
 ; RVB23S64: .attribute 5, "rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zifencei2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zawrs1p0_zfa1p0_zca1p0_zcb1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_ssccptr1p0_sscofpmf1p0_sscounterenw1p0_sstc1p0_sstvala1p0_sstvecd1p0_ssu64xl1p0_svade1p0_svbare1p0_svinval1p0_svnapot1p0_svpbmt1p0"
 ; RVM23U32: .attribute 5, "rv32i2p1_m2p0_zicbop1p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zimop1p0_zmmul1p0_zca1p0_zcb1p0_zce1p0_zcmop1p0_zcmp1p0_zcmt1p0_zba1p0_zbb1p0_zbs1p0"

From bbc0e631d2d3facd5952aeafc7400761813acc3a Mon Sep 17 00:00:00 2001
From: Thomas Preud'homme <thomas.preudhomme@arm.com>
Date: Fri, 25 Oct 2024 15:41:39 +0100
Subject: [PATCH 05/39] [MLIR] Remove unneeded LLVMDialect.h include in
 ControlFlowToSCF.cpp (#113560)

This fixes the following failure when doing a clean build (in particular
no .ninja* lying around) of lib/libMLIRControlFlowToSCF.a only:
```
In file included from llvm/include/llvm/IR/Module.h:22,
                 from mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h:37,
                 from mlir/lib/Conversion/ControlFlowToSCF/ControlFlowToSCF.cpp:19
llvm/include/llvm/IR/Attributes.h:90:14: fatal error: llvm/IR/Attributes.inc: No such file or directory
```
---
 mlir/lib/Conversion/ControlFlowToSCF/ControlFlowToSCF.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mlir/lib/Conversion/ControlFlowToSCF/ControlFlowToSCF.cpp b/mlir/lib/Conversion/ControlFlowToSCF/ControlFlowToSCF.cpp
index d3ee89743da9db5..1c592d665f3e4c5 100644
--- a/mlir/lib/Conversion/ControlFlowToSCF/ControlFlowToSCF.cpp
+++ b/mlir/lib/Conversion/ControlFlowToSCF/ControlFlowToSCF.cpp
@@ -16,7 +16,6 @@
 #include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/UB/IR/UBOps.h"
 #include "mlir/Pass/Pass.h"

From e47bf3d08d51306f2e534951a1b77043dc540ceb Mon Sep 17 00:00:00 2001
From: Kai Nacke <kai.peter.nacke@ibm.com>
Date: Wed, 23 Oct 2024 16:13:39 -0400
Subject: [PATCH 06/39] [JIT] Fix crash in unit tests

The unit tests `ReOptimizeLayerTest.BasicReOptimization` and `JITLinkRedirectionManagerTest.BasicRedirectionOperation` are failing for me with the error:

```
Program aborted due to an unhandled Error:
Error value was Success. (Note: Success values must still be checked prior to being destroyed).
```

The error is raised when a value is assigned to `Err`, due to the missing `ErrorAsOutParameter`.

The fix is to move the error handling out of the constructor.
---
 .../Orc/JITLinkRedirectableSymbolManager.h    | 33 +++++++++----------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/llvm/include/llvm/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.h b/llvm/include/llvm/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.h
index 52f284c89bdade5..ef42cc5f798fd93 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/JITLinkRedirectableSymbolManager.h
@@ -26,12 +26,16 @@ class JITLinkRedirectableSymbolManager : public RedirectableSymbolManager,
   /// Create redirection manager that uses JITLink based implementaion.
   static Expected<std::unique_ptr<RedirectableSymbolManager>>
   Create(ObjectLinkingLayer &ObjLinkingLayer, JITDylib &JD) {
-    Error Err = Error::success();
-    auto RM = std::unique_ptr<RedirectableSymbolManager>(
-        new JITLinkRedirectableSymbolManager(ObjLinkingLayer, JD, Err));
-    if (Err)
-      return Err;
-    return std::move(RM);
+    auto AnonymousPtrCreator(jitlink::getAnonymousPointerCreator(
+        ObjLinkingLayer.getExecutionSession().getTargetTriple()));
+    auto PtrJumpStubCreator(jitlink::getPointerJumpStubCreator(
+        ObjLinkingLayer.getExecutionSession().getTargetTriple()));
+    if (!AnonymousPtrCreator || !PtrJumpStubCreator)
+      return make_error<StringError>("Architecture not supported",
+                                     inconvertibleErrorCode());
+    return std::unique_ptr<RedirectableSymbolManager>(
+        new JITLinkRedirectableSymbolManager(
+            ObjLinkingLayer, JD, AnonymousPtrCreator, PtrJumpStubCreator));
   }
 
   void emitRedirectableSymbols(std::unique_ptr<MaterializationResponsibility> R,
@@ -52,18 +56,13 @@ class JITLinkRedirectableSymbolManager : public RedirectableSymbolManager,
   constexpr static StringRef JumpStubTableName = "$IND_JUMP_";
   constexpr static StringRef StubPtrTableName = "$__IND_JUMP_PTRS";
 
-  JITLinkRedirectableSymbolManager(ObjectLinkingLayer &ObjLinkingLayer,
-                                   JITDylib &JD, Error &Err)
+  JITLinkRedirectableSymbolManager(
+      ObjectLinkingLayer &ObjLinkingLayer, JITDylib &JD,
+      jitlink::AnonymousPointerCreator &AnonymousPtrCreator,
+      jitlink::PointerJumpStubCreator &PtrJumpStubCreator)
       : ObjLinkingLayer(ObjLinkingLayer), JD(JD),
-        AnonymousPtrCreator(jitlink::getAnonymousPointerCreator(
-            ObjLinkingLayer.getExecutionSession().getTargetTriple())),
-        PtrJumpStubCreator(jitlink::getPointerJumpStubCreator(
-            ObjLinkingLayer.getExecutionSession().getTargetTriple())) {
-    if (!AnonymousPtrCreator || !PtrJumpStubCreator)
-      Err = make_error<StringError>("Architecture not supported",
-                                    inconvertibleErrorCode());
-    if (Err)
-      return;
+        AnonymousPtrCreator(std::move(AnonymousPtrCreator)),
+        PtrJumpStubCreator(std::move(PtrJumpStubCreator)) {
     ObjLinkingLayer.getExecutionSession().registerResourceManager(*this);
   }
 

From aba39c3974c7e43a83a9d647dca9b67caca8572e Mon Sep 17 00:00:00 2001
From: Jonas Paulsson <paulson1@linux.ibm.com>
Date: Fri, 25 Oct 2024 17:40:00 +0200
Subject: [PATCH 07/39] [System] Precommit of test for #112491 (#113704)

---
 .../SystemZ/vec-elt-insertion.ll              | 128 ++++++++++++++++++
 1 file changed, 128 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
new file mode 100644
index 000000000000000..eb8dd72e0304d91
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/vec-elt-insertion.ll
@@ -0,0 +1,128 @@
+; RUN: opt < %s -mtriple=s390x-unknown-linux -mcpu=z16 -S -passes=slp-vectorizer \
+; RUN:   -pass-remarks-output=%t | FileCheck %s
+; RUN: cat %t | FileCheck -check-prefix=REMARK %s
+;
+; NB! This is a pre-commit version (for #112491) with current codegen and remarks.
+;
+; Test functions that (at least currently) only get vectorized if the
+; insertion cost for an element load is counted as free.
+
+; This function needs the free element load to be recognized in SLP
+; getGatherCost().
+define void @fun0(ptr nocapture %0, double %1) {
+; CHECK-LABEL: define void @fun0(
+; CHECK:         fmul double
+; CHECK:         call double @llvm.fmuladd.f64(
+; CHECK-NEXT:    call double @llvm.fmuladd.f64(
+; CHECK-NEXT:    call double @llvm.sqrt.f64(
+; CHECK:         fmul double
+; CHECK:         call double @llvm.fmuladd.f64(
+; CHECK-NEXT:    call double @llvm.fmuladd.f64(
+; CHECK-NEXT:    call double @llvm.sqrt.f64(
+;
+; REMARK-LABEL: Function: fun0
+; REMARK: Args:
+; REMARK-NEXT: - String:          'List vectorization was possible but not beneficial with cost '
+; REMARK-NEXT: - Cost:            '0'
+
+  %3 = fmul double %1, 2.000000e+00
+  %4 = tail call double @llvm.fmuladd.f64(double %3, double %3, double 0.000000e+00)
+  %5 = tail call double @llvm.fmuladd.f64(double %3, double %3, double %4)
+  %sqrt1 = tail call double @llvm.sqrt.f64(double %5)
+  %6 = load double, ptr %0, align 8
+  %7 = fmul double %6, 2.000000e+00
+  %8 = tail call double @llvm.fmuladd.f64(double %7, double %7, double 0.000000e+00)
+  %9 = tail call double @llvm.fmuladd.f64(double %7, double %7, double %8)
+  %sqrt = tail call double @llvm.sqrt.f64(double %9)
+  %10 = fadd double %sqrt1, %sqrt
+  store double %10, ptr %0, align 8
+  ret void
+}
+
+; This function needs the element-load to be recognized in SystemZ
+; getVectorInstrCost().
+define void @fun1(double %0) {
+; CHECK-LABEL: define void @fun1(
+; CHECK:         phi double
+; CHECK-NEXT:    phi double
+; CHECK-NEXT:    phi double
+; CHECK-NEXT:    phi double
+; CHECK-NEXT:    phi double
+; CHECK-NEXT:    phi double
+; CHECK-NEXT:    fsub double
+; CHECK-NEXT:    fsub double
+; CHECK-NEXT:    fmul double
+; CHECK-NEXT:    fmul double
+; CHECK-NEXT:    fsub double
+; CHECK-NEXT:    fsub double
+; CHECK-NEXT:    call double @llvm.fmuladd.f64(
+; CHECK-NEXT:    call double @llvm.fmuladd.f64(
+; CHECK-NEXT:    fsub double
+; CHECK-NEXT:    fsub double
+; CHECK-NEXT:    call double @llvm.fmuladd.f64(
+; CHECK-NEXT:    call double @llvm.fmuladd.f64(
+; CHECK:         fcmp olt double
+; CHECK-NEXT:    fcmp olt double
+; CHECK-NEXT:    or i1
+;
+; REMARK-LABEL: Function: fun1
+; REMARK: Args:
+; REMARK:      - String:          'List vectorization was possible but not beneficial with cost '
+; REMARK-NEXT: - Cost:            '0'
+
+  br label %2
+
+2:
+  %3 = phi double [ poison, %1 ], [ poison, %2 ]
+  %4 = phi double [ undef, %1 ], [ poison, %2 ]
+  %5 = phi double [ 0.000000e+00, %1 ], [ poison, %2 ]
+  %6 = phi double [ 0.000000e+00, %1 ], [ poison, %2 ]
+  %7 = phi double [ 0.000000e+00, %1 ], [ poison, %2 ]
+  %8 = phi double [ 0.000000e+00, %1 ], [ %21, %2 ]
+  %9 = fsub double 0.000000e+00, %8
+  %10 = fsub double 0.000000e+00, %7
+  %11 = fmul double %9, 0.000000e+00
+  %12 = fmul double %10, 0.000000e+00
+  %13 = fsub double 0.000000e+00, %6
+  %14 = fsub double 0.000000e+00, %5
+  %15 = tail call double @llvm.fmuladd.f64(double %13, double %13, double %11)
+  %16 = tail call double @llvm.fmuladd.f64(double %14, double %14, double %12)
+  %17 = fsub double 0.000000e+00, %4
+  %18 = fsub double 0.000000e+00, %3
+  %19 = tail call double @llvm.fmuladd.f64(double %17, double %17, double %15)
+  %20 = tail call double @llvm.fmuladd.f64(double %18, double %18, double %16)
+  %21 = load double, ptr null, align 8
+  %22 = fcmp olt double %19, %0
+  %23 = fcmp olt double %20, 0.000000e+00
+  %24 = or i1 %23, %22
+  br label %2
+}
+
+declare double @llvm.fmuladd.f64(double, double, double)
+
+; This should *not* be vectorized as the insertion into the vector isn't free,
+; which is recognized in SystemZTTImpl::getScalarizationOverhead().
+define void @fun2(ptr %0, ptr %Dst) {
+; CHECK-LABEL: define void @fun2(
+; CHECK: insertelement
+; CHECK: store <2 x i64>
+;
+; REMARK-LABEL: Function: fun2
+; REMARK: Args:
+; REMARK-NEXT: - String:          'Stores SLP vectorized with cost '
+; REMARK-NEXT: - Cost:            '-1'
+
+  %3 = load i64, ptr %0, align 8
+  %4 = icmp eq i64 %3, 0
+  br i1 %4, label %5, label %6
+
+5:
+  ret void
+
+6:
+  %7 = getelementptr i8, ptr %Dst, i64 24
+  store i64 %3, ptr %7, align 8
+  %8 = getelementptr i8, ptr %Dst, i64 16
+  store i64 0, ptr %8, align 8
+  br label %5
+}

From 81e536ec87a108d012cf9156a2c3fc672fb92155 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett@linaro.org>
Date: Fri, 25 Oct 2024 15:43:47 +0000
Subject: [PATCH 08/39] [clang][test] Fix typo in arm-mfp8.cpp

New test added by https://github.com/llvm/llvm-project/pull/97277.
---
 clang/test/AST/arm-mfp8.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/test/AST/arm-mfp8.cpp b/clang/test/AST/arm-mfp8.cpp
index a00d055f7d96794..51bebba067eb9f6 100644
--- a/clang/test/AST/arm-mfp8.cpp
+++ b/clang/test/AST/arm-mfp8.cpp
@@ -69,7 +69,7 @@ class C1 {
 //CHECK-NEXT:  | | `-CompoundStmt {{.*}}
 //CHECK-NEXT:  | |   `-ReturnStmt {{.*}}
 //CHECK-NEXT:  | |     `-ImplicitCastExpr {{.*}} '__mfp8':'__MFloat8_t' <LValueToRValue>
-//CHECK-NEXT:  | |       `-DeclRefExpr {{.*}} '__mfp8':'__MFloat8_t' lvalue ParmVar {{.*}}8 'arg' '__mfp8':'__MFloat8_t'
+//CHECK-NEXT:  | |       `-DeclRefExpr {{.*}} '__mfp8':'__MFloat8_t' lvalue ParmVar {{.*}} 'arg' '__mfp8':'__MFloat8_t'
 //CHECK-NEXT:  | `-CXXMethodDecl {{.*}} func2c '__mfp8 (__mfp8)' static implicit-inline
 //CHECK-NEXT:  |   |-ParmVarDecl {{.*}} arg '__mfp8':'__MFloat8_t'
 //CHECK-NEXT:  |   `-CompoundStmt {{.*}}

From 5c20891b2bb60f82dd82a8e90b111f8c13a13ad4 Mon Sep 17 00:00:00 2001
From: Ryosuke Niwa <rniwa@webkit.org>
Date: Fri, 25 Oct 2024 08:52:56 -0700
Subject: [PATCH 09/39] [WebKit Checkers] Allow a guardian
 CheckedPtr/CheckedRef (#110222)

This PR makes WebKit checkers allow a guardian variable which is
CheckedPtr or CheckedRef in addition to RefPtr or Ref.
---
 .../Checkers/WebKit/ASTUtils.cpp              | 16 +++---
 .../Checkers/WebKit/PtrTypesSemantics.cpp     | 43 +++++++++++++---
 .../Checkers/WebKit/PtrTypesSemantics.h       | 22 ++++++--
 .../WebKit/UncountedCallArgsChecker.cpp       |  2 +
 .../WebKit/UncountedLocalVarsChecker.cpp      |  1 +
 .../Checkers/WebKit/call-args-checked.cpp     | 46 +++++++++++++++++
 .../Analysis/Checkers/WebKit/mock-types.h     | 16 ++++--
 .../Checkers/WebKit/uncounted-local-vars.cpp  | 51 +++++++++++++++++++
 8 files changed, 177 insertions(+), 20 deletions(-)
 create mode 100644 clang/test/Analysis/Checkers/WebKit/call-args-checked.cpp

diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
index b7b2f8a16f07b31..9d34dfd3cea636b 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
@@ -17,6 +17,10 @@
 
 namespace clang {
 
+bool isSafePtr(clang::CXXRecordDecl *Decl) {
+  return isRefCounted(Decl) || isCheckedPtr(Decl);
+}
+
 bool tryToFindPtrOrigin(
     const Expr *E, bool StopAtFirstRefCountedObj,
     std::function<bool(const clang::Expr *, bool)> callback) {
@@ -31,7 +35,7 @@ bool tryToFindPtrOrigin(
     }
     if (auto *tempExpr = dyn_cast<CXXTemporaryObjectExpr>(E)) {
       if (auto *C = tempExpr->getConstructor()) {
-        if (auto *Class = C->getParent(); Class && isRefCounted(Class))
+        if (auto *Class = C->getParent(); Class && isSafePtr(Class))
           return callback(E, true);
         break;
       }
@@ -56,7 +60,7 @@ bool tryToFindPtrOrigin(
       if (StopAtFirstRefCountedObj) {
         if (auto *ConversionFunc =
                 dyn_cast_or_null<FunctionDecl>(cast->getConversionFunction())) {
-          if (isCtorOfRefCounted(ConversionFunc))
+          if (isCtorOfSafePtr(ConversionFunc))
             return callback(E, true);
         }
       }
@@ -68,7 +72,7 @@ bool tryToFindPtrOrigin(
     if (auto *call = dyn_cast<CallExpr>(E)) {
       if (auto *memberCall = dyn_cast<CXXMemberCallExpr>(call)) {
         if (auto *decl = memberCall->getMethodDecl()) {
-          std::optional<bool> IsGetterOfRefCt = isGetterOfRefCounted(decl);
+          std::optional<bool> IsGetterOfRefCt = isGetterOfSafePtr(decl);
           if (IsGetterOfRefCt && *IsGetterOfRefCt) {
             E = memberCall->getImplicitObjectArgument();
             if (StopAtFirstRefCountedObj) {
@@ -87,7 +91,7 @@ bool tryToFindPtrOrigin(
       }
 
       if (auto *callee = call->getDirectCallee()) {
-        if (isCtorOfRefCounted(callee)) {
+        if (isCtorOfRefCounted(callee) || isCtorOfCheckedPtr(callee)) {
           if (StopAtFirstRefCountedObj)
             return callback(E, true);
 
@@ -95,7 +99,7 @@ bool tryToFindPtrOrigin(
           continue;
         }
 
-        if (isRefType(callee->getReturnType()))
+        if (isSafePtrType(callee->getReturnType()))
           return callback(E, true);
 
         if (isSingleton(callee))
@@ -114,7 +118,7 @@ bool tryToFindPtrOrigin(
     }
     if (auto *ObjCMsgExpr = dyn_cast<ObjCMessageExpr>(E)) {
       if (auto *Method = ObjCMsgExpr->getMethodDecl()) {
-        if (isRefType(Method->getReturnType()))
+        if (isSafePtrType(Method->getReturnType()))
           return callback(E, true);
       }
     }
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index 71440e6d08a1c9a..2293dcf1d4bd643 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -135,7 +135,16 @@ bool isCtorOfRefCounted(const clang::FunctionDecl *F) {
          || FunctionName == "Identifier";
 }
 
-bool isRefType(const clang::QualType T) {
+bool isCtorOfCheckedPtr(const clang::FunctionDecl *F) {
+  assert(F);
+  return isCheckedPtr(safeGetName(F));
+}
+
+bool isCtorOfSafePtr(const clang::FunctionDecl *F) {
+  return isCtorOfRefCounted(F) || isCtorOfCheckedPtr(F);
+}
+
+bool isSafePtrType(const clang::QualType T) {
   QualType type = T;
   while (!type.isNull()) {
     if (auto *elaboratedT = type->getAs<ElaboratedType>()) {
@@ -145,7 +154,7 @@ bool isRefType(const clang::QualType T) {
     if (auto *specialT = type->getAs<TemplateSpecializationType>()) {
       if (auto *decl = specialT->getTemplateName().getAsTemplateDecl()) {
         auto name = decl->getNameAsString();
-        return isRefType(name);
+        return isRefType(name) || isCheckedPtr(name);
       }
       return false;
     }
@@ -177,6 +186,12 @@ std::optional<bool> isUncounted(const CXXRecordDecl* Class)
   return (*IsRefCountable);
 }
 
+std::optional<bool> isUnchecked(const CXXRecordDecl *Class) {
+  if (isCheckedPtr(Class))
+    return false; // Cheaper than below
+  return isCheckedPtrCapable(Class);
+}
+
 std::optional<bool> isUncountedPtr(const QualType T) {
   if (T->isPointerType() || T->isReferenceType()) {
     if (auto *CXXRD = T->getPointeeCXXRecordDecl())
@@ -185,8 +200,16 @@ std::optional<bool> isUncountedPtr(const QualType T) {
   return false;
 }
 
-std::optional<bool> isGetterOfRefCounted(const CXXMethodDecl* M)
-{
+std::optional<bool> isUnsafePtr(const QualType T) {
+  if (T->isPointerType() || T->isReferenceType()) {
+    if (auto *CXXRD = T->getPointeeCXXRecordDecl()) {
+      return isUncounted(CXXRD) || isUnchecked(CXXRD);
+    }
+  }
+  return false;
+}
+
+std::optional<bool> isGetterOfSafePtr(const CXXMethodDecl *M) {
   assert(M);
 
   if (isa<CXXMethodDecl>(M)) {
@@ -194,6 +217,9 @@ std::optional<bool> isGetterOfRefCounted(const CXXMethodDecl* M)
     auto className = safeGetName(calleeMethodsClass);
     auto method = safeGetName(M);
 
+    if (isCheckedPtr(className) && (method == "get" || method == "ptr"))
+      return true;
+
     if ((isRefType(className) && (method == "get" || method == "ptr")) ||
         ((className == "String" || className == "AtomString" ||
           className == "AtomStringImpl" || className == "UniqueString" ||
@@ -205,7 +231,12 @@ std::optional<bool> isGetterOfRefCounted(const CXXMethodDecl* M)
     // FIXME: Currently allowing any Ref<T> -> whatever cast.
     if (isRefType(className)) {
       if (auto *maybeRefToRawOperator = dyn_cast<CXXConversionDecl>(M))
-        return isUncountedPtr(maybeRefToRawOperator->getConversionType());
+        return isUnsafePtr(maybeRefToRawOperator->getConversionType());
+    }
+
+    if (isCheckedPtr(className)) {
+      if (auto *maybeRefToRawOperator = dyn_cast<CXXConversionDecl>(M))
+        return isUnsafePtr(maybeRefToRawOperator->getConversionType());
     }
   }
   return false;
@@ -448,7 +479,7 @@ class TrivialFunctionAnalysisVisitor
     if (!Callee)
       return false;
 
-    std::optional<bool> IsGetterOfRefCounted = isGetterOfRefCounted(Callee);
+    std::optional<bool> IsGetterOfRefCounted = isGetterOfSafePtr(Callee);
     if (IsGetterOfRefCounted && *IsGetterOfRefCounted)
       return true;
 
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
index 8e6aadf63b6d679..4b41ca96e1df1d3 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.h
@@ -63,18 +63,30 @@ std::optional<bool> isUncounted(const clang::CXXRecordDecl* Class);
 /// class, false if not, std::nullopt if inconclusive.
 std::optional<bool> isUncountedPtr(const clang::QualType T);
 
-/// \returns true if Name is a RefPtr, Ref, or its variant, false if not.
-bool isRefType(const std::string &Name);
+/// \returns true if \p T is a RefPtr, Ref, CheckedPtr, CheckedRef, or its
+/// variant, false if not.
+bool isSafePtrType(const clang::QualType T);
 
 /// \returns true if \p F creates ref-countable object from uncounted parameter,
 /// false if not.
 bool isCtorOfRefCounted(const clang::FunctionDecl *F);
 
-/// \returns true if \p T is RefPtr, Ref, or its variant, false if not.
-bool isRefType(const clang::QualType T);
+/// \returns true if \p F creates checked ptr object from uncounted parameter,
+/// false if not.
+bool isCtorOfCheckedPtr(const clang::FunctionDecl *F);
+
+/// \returns true if \p F creates ref-countable or checked ptr object from
+/// uncounted parameter, false if not.
+bool isCtorOfSafePtr(const clang::FunctionDecl *F);
+
+/// \returns true if \p Name is RefPtr, Ref, or its variant, false if not.
+bool isRefType(const std::string &Name);
+
+/// \returns true if \p Name is CheckedRef or CheckedPtr, false if not.
+bool isCheckedPtr(const std::string &Name);
 
 /// \returns true if \p M is getter of a ref-counted class, false if not.
-std::optional<bool> isGetterOfRefCounted(const clang::CXXMethodDecl* Method);
+std::optional<bool> isGetterOfSafePtr(const clang::CXXMethodDecl *Method);
 
 /// \returns true if \p F is a conversion between ref-countable or ref-counted
 /// pointer types.
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
index cea3503fa2c314d..1a5a7309a54f167 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedCallArgsChecker.cpp
@@ -96,6 +96,8 @@ class UncountedCallArgsChecker
           auto name = safeGetName(MD);
           if (name == "ref" || name == "deref")
             return;
+          if (name == "incrementPtrCount" || name == "decrementPtrCount")
+            return;
         }
         auto *E = MemberCallExpr->getImplicitObjectArgument();
         QualType ArgType = MemberCallExpr->getObjectType().getCanonicalType();
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp
index 81d21100de878db..5cdf047738abcb2 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/UncountedLocalVarsChecker.cpp
@@ -227,6 +227,7 @@ class UncountedLocalVarsChecker
                       if (MaybeGuardianArgCXXRecord) {
                         if (MaybeGuardian->isLocalVarDecl() &&
                             (isRefCounted(MaybeGuardianArgCXXRecord) ||
+                             isCheckedPtr(MaybeGuardianArgCXXRecord) ||
                              isRefcountedStringsHack(MaybeGuardian)) &&
                             isGuardedScopeEmbeddedInGuardianScope(
                                 V, MaybeGuardian))
diff --git a/clang/test/Analysis/Checkers/WebKit/call-args-checked.cpp b/clang/test/Analysis/Checkers/WebKit/call-args-checked.cpp
new file mode 100644
index 000000000000000..49b6bfcd7cadfdc
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/call-args-checked.cpp
@@ -0,0 +1,46 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UncountedCallArgsChecker -verify %s
+
+#include "mock-types.h"
+
+RefCountableAndCheckable* makeObj();
+CheckedRef<RefCountableAndCheckable> makeObjChecked();
+void someFunction(RefCountableAndCheckable*);
+
+namespace call_args_unchecked_uncounted {
+
+static void foo() {
+  someFunction(makeObj());
+  // expected-warning@-1{{Call argument is uncounted and unsafe [alpha.webkit.UncountedCallArgsChecker]}}
+}
+
+} // namespace call_args_unchecked_uncounted
+
+namespace call_args_checked {
+
+static void foo() {
+  CheckedPtr<RefCountableAndCheckable> ptr = makeObj();
+  someFunction(ptr.get());
+}
+
+static void bar() {
+  someFunction(CheckedPtr { makeObj() }.get());
+}
+
+static void baz() {
+  someFunction(makeObjChecked().ptr());
+}
+
+} // namespace call_args_checked
+
+namespace call_args_default {
+
+void someFunction(RefCountableAndCheckable* = makeObj());
+// expected-warning@-1{{Call argument is uncounted and unsafe [alpha.webkit.UncountedCallArgsChecker]}}
+void otherFunction(RefCountableAndCheckable* = makeObjChecked().ptr());
+
+void foo() {
+  someFunction();
+  otherFunction();
+}
+
+} // namespace call_args_default
diff --git a/clang/test/Analysis/Checkers/WebKit/mock-types.h b/clang/test/Analysis/Checkers/WebKit/mock-types.h
index 933b4c5e62a79cc..8d8a90f0afae0e1 100644
--- a/clang/test/Analysis/Checkers/WebKit/mock-types.h
+++ b/clang/test/Analysis/Checkers/WebKit/mock-types.h
@@ -114,8 +114,8 @@ template <typename T> struct CheckedRef {
 
 public:
   CheckedRef() : t{} {};
-  CheckedRef(T &t) : t(t) { t->incrementPtrCount(); }
-  CheckedRef(const CheckedRef& o) : t(o.t) { if (t) t->incrementPtrCount(); }
+  CheckedRef(T &t) : t(&t) { t.incrementPtrCount(); }
+  CheckedRef(const CheckedRef &o) : t(o.t) { if (t) t->incrementPtrCount(); }
   ~CheckedRef() { if (t) t->decrementPtrCount(); }
   T &get() { return *t; }
   T *ptr() { return t; }
@@ -135,7 +135,7 @@ template <typename T> struct CheckedPtr {
     if (t)
       t->incrementPtrCount();
   }
-  CheckedPtr(Ref<T>&& o)
+  CheckedPtr(Ref<T> &&o)
     : t(o.leakRef())
   { }
   ~CheckedPtr() {
@@ -156,4 +156,14 @@ class CheckedObj {
   void decrementPtrCount();
 };
 
+class RefCountableAndCheckable {
+public:
+  void incrementPtrCount() const;
+  void decrementPtrCount() const;
+  void ref() const;
+  void deref() const;
+  void method();
+  int trivial() { return 0; }
+};
+
 #endif
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
index b5f6b8535bf4181..1c0df42cdda663c 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
@@ -290,6 +290,57 @@ void foo() {
 
 } // namespace local_assignment_to_global
 
+namespace local_refcountable_checkable_object {
+
+RefCountableAndCheckable* provide_obj();
+
+void local_raw_ptr() {
+  RefCountableAndCheckable* a = nullptr;
+  // expected-warning@-1{{Local variable 'a' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
+  a = provide_obj();
+  a->method();
+}
+
+void local_checked_ptr() {
+  CheckedPtr<RefCountableAndCheckable> a = nullptr;
+  a = provide_obj();
+  a->method();
+}
+
+void local_var_with_guardian_checked_ptr() {
+  CheckedPtr<RefCountableAndCheckable> a = provide_obj();
+  {
+    auto* b = a.get();
+    b->method();
+  }
+}
+
+void local_var_with_guardian_checked_ptr_with_assignment() {
+  CheckedPtr<RefCountableAndCheckable> a = provide_obj();
+  {
+    RefCountableAndCheckable* b = a.get();
+    // expected-warning@-1{{Local variable 'b' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
+    b = provide_obj();
+    b->method();
+  }
+}
+
+void local_var_with_guardian_checked_ref() {
+  CheckedRef<RefCountableAndCheckable> a = *provide_obj();
+  {
+    RefCountableAndCheckable& b = a;
+    b.method();
+  }
+}
+
+void static_var() {
+  static RefCountableAndCheckable* a = nullptr;
+  // expected-warning@-1{{Static local variable 'a' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
+  a = provide_obj();
+}
+
+} // namespace local_refcountable_checkable_object
+
 namespace local_var_in_recursive_function {
 
 struct TreeNode {

From 1f2b7ae6d78906df4f0c06961e3c9ed227986acf Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Fri, 25 Oct 2024 12:28:55 -0400
Subject: [PATCH 10/39] [libc++] Refactor locale_guard (#113694)

Rename __libcpp_locale_guard to just __locale_guard, since there's no
reason for it to have __libcpp_ in its name -- it's just an internal
utility.

Also, define __locale_guard regardless of whether
_LIBCPP_LOCALE__L_EXTENSIONS is defined, since that header is only used on
Windows (where it has a custom definition) or from bsd_locale_fallbacks.h,
which is only included when the L extensions are not provided.
---
 libcxx/include/CMakeLists.txt                 |  2 +-
 .../locale_base_api/bsd_locale_fallbacks.h    | 30 ++++++-------
 .../{locale_base_api => }/locale_guard.h      | 42 +++++++++----------
 libcxx/include/module.modulemap               |  2 +-
 libcxx/src/iostream.cpp                       |  4 +-
 libcxx/src/support/win32/locale_win32.cpp     | 34 +++++++--------
 6 files changed, 56 insertions(+), 58 deletions(-)
 rename libcxx/include/__locale_dir/{locale_base_api => }/locale_guard.h (73%)

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 975adc03ec81da0..63aa74e09bb1a27 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -496,11 +496,11 @@ set(files
   __locale_dir/locale_base_api/bsd_locale_fallbacks.h
   __locale_dir/locale_base_api/fuchsia.h
   __locale_dir/locale_base_api/ibm.h
-  __locale_dir/locale_base_api/locale_guard.h
   __locale_dir/locale_base_api/musl.h
   __locale_dir/locale_base_api/newlib.h
   __locale_dir/locale_base_api/openbsd.h
   __locale_dir/locale_base_api/win32.h
+  __locale_dir/locale_guard.h
   __math/abs.h
   __math/copysign.h
   __math/error_functions.h
diff --git a/libcxx/include/__locale_dir/locale_base_api/bsd_locale_fallbacks.h b/libcxx/include/__locale_dir/locale_base_api/bsd_locale_fallbacks.h
index 5f99c7aea02a96a..ae2db6ae70bebcb 100644
--- a/libcxx/include/__locale_dir/locale_base_api/bsd_locale_fallbacks.h
+++ b/libcxx/include/__locale_dir/locale_base_api/bsd_locale_fallbacks.h
@@ -13,7 +13,7 @@
 #ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_BSD_LOCALE_FALLBACKS_H
 #define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_BSD_LOCALE_FALLBACKS_H
 
-#include <__locale_dir/locale_base_api/locale_guard.h>
+#include <__locale_dir/locale_guard.h>
 #include <cstdio>
 #include <stdarg.h>
 #include <stdlib.h>
@@ -29,64 +29,64 @@
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 inline _LIBCPP_HIDE_FROM_ABI decltype(MB_CUR_MAX) __libcpp_mb_cur_max_l(locale_t __l) {
-  __libcpp_locale_guard __current(__l);
+  __locale_guard __current(__l);
   return MB_CUR_MAX;
 }
 
 #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
 inline _LIBCPP_HIDE_FROM_ABI wint_t __libcpp_btowc_l(int __c, locale_t __l) {
-  __libcpp_locale_guard __current(__l);
+  __locale_guard __current(__l);
   return btowc(__c);
 }
 
 inline _LIBCPP_HIDE_FROM_ABI int __libcpp_wctob_l(wint_t __c, locale_t __l) {
-  __libcpp_locale_guard __current(__l);
+  __locale_guard __current(__l);
   return wctob(__c);
 }
 
 inline _LIBCPP_HIDE_FROM_ABI size_t
 __libcpp_wcsnrtombs_l(char* __dest, const wchar_t** __src, size_t __nwc, size_t __len, mbstate_t* __ps, locale_t __l) {
-  __libcpp_locale_guard __current(__l);
+  __locale_guard __current(__l);
   return wcsnrtombs(__dest, __src, __nwc, __len, __ps);
 }
 
 inline _LIBCPP_HIDE_FROM_ABI size_t __libcpp_wcrtomb_l(char* __s, wchar_t __wc, mbstate_t* __ps, locale_t __l) {
-  __libcpp_locale_guard __current(__l);
+  __locale_guard __current(__l);
   return wcrtomb(__s, __wc, __ps);
 }
 
 inline _LIBCPP_HIDE_FROM_ABI size_t
 __libcpp_mbsnrtowcs_l(wchar_t* __dest, const char** __src, size_t __nms, size_t __len, mbstate_t* __ps, locale_t __l) {
-  __libcpp_locale_guard __current(__l);
+  __locale_guard __current(__l);
   return mbsnrtowcs(__dest, __src, __nms, __len, __ps);
 }
 
 inline _LIBCPP_HIDE_FROM_ABI size_t
 __libcpp_mbrtowc_l(wchar_t* __pwc, const char* __s, size_t __n, mbstate_t* __ps, locale_t __l) {
-  __libcpp_locale_guard __current(__l);
+  __locale_guard __current(__l);
   return mbrtowc(__pwc, __s, __n, __ps);
 }
 
 inline _LIBCPP_HIDE_FROM_ABI int __libcpp_mbtowc_l(wchar_t* __pwc, const char* __pmb, size_t __max, locale_t __l) {
-  __libcpp_locale_guard __current(__l);
+  __locale_guard __current(__l);
   return mbtowc(__pwc, __pmb, __max);
 }
 
 inline _LIBCPP_HIDE_FROM_ABI size_t __libcpp_mbrlen_l(const char* __s, size_t __n, mbstate_t* __ps, locale_t __l) {
-  __libcpp_locale_guard __current(__l);
+  __locale_guard __current(__l);
   return mbrlen(__s, __n, __ps);
 }
 #endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
 
 inline _LIBCPP_HIDE_FROM_ABI lconv* __libcpp_localeconv_l(locale_t __l) {
-  __libcpp_locale_guard __current(__l);
+  __locale_guard __current(__l);
   return localeconv();
 }
 
 #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
 inline _LIBCPP_HIDE_FROM_ABI size_t
 __libcpp_mbsrtowcs_l(wchar_t* __dest, const char** __src, size_t __len, mbstate_t* __ps, locale_t __l) {
-  __libcpp_locale_guard __current(__l);
+  __locale_guard __current(__l);
   return mbsrtowcs(__dest, __src, __len, __ps);
 }
 #endif
@@ -95,7 +95,7 @@ inline _LIBCPP_ATTRIBUTE_FORMAT(__printf__, 4, 5) int __libcpp_snprintf_l(
     char* __s, size_t __n, locale_t __l, const char* __format, ...) {
   va_list __va;
   va_start(__va, __format);
-  __libcpp_locale_guard __current(__l);
+  __locale_guard __current(__l);
   int __res = vsnprintf(__s, __n, __format, __va);
   va_end(__va);
   return __res;
@@ -105,7 +105,7 @@ inline _LIBCPP_ATTRIBUTE_FORMAT(__printf__, 3, 4) int __libcpp_asprintf_l(
     char** __s, locale_t __l, const char* __format, ...) {
   va_list __va;
   va_start(__va, __format);
-  __libcpp_locale_guard __current(__l);
+  __locale_guard __current(__l);
   int __res = vasprintf(__s, __format, __va);
   va_end(__va);
   return __res;
@@ -115,7 +115,7 @@ inline _LIBCPP_ATTRIBUTE_FORMAT(__scanf__, 3, 4) int __libcpp_sscanf_l(
     const char* __s, locale_t __l, const char* __format, ...) {
   va_list __va;
   va_start(__va, __format);
-  __libcpp_locale_guard __current(__l);
+  __locale_guard __current(__l);
   int __res = vsscanf(__s, __format, __va);
   va_end(__va);
   return __res;
diff --git a/libcxx/include/__locale_dir/locale_base_api/locale_guard.h b/libcxx/include/__locale_dir/locale_guard.h
similarity index 73%
rename from libcxx/include/__locale_dir/locale_base_api/locale_guard.h
rename to libcxx/include/__locale_dir/locale_guard.h
index 7d15f2d253adc39..e0c414c001c41f1 100644
--- a/libcxx/include/__locale_dir/locale_base_api/locale_guard.h
+++ b/libcxx/include/__locale_dir/locale_guard.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_LOCALE_GUARD_H
-#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_LOCALE_GUARD_H
+#ifndef _LIBCPP___LOCALE_DIR_LOCALE_GUARD_H
+#define _LIBCPP___LOCALE_DIR_LOCALE_GUARD_H
 
 #include <__config>
 #include <__locale> // for locale_t
@@ -19,23 +19,9 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-#if !defined(_LIBCPP_LOCALE__L_EXTENSIONS)
-struct __libcpp_locale_guard {
-  _LIBCPP_HIDE_FROM_ABI __libcpp_locale_guard(locale_t& __loc) : __old_loc_(uselocale(__loc)) {}
-
-  _LIBCPP_HIDE_FROM_ABI ~__libcpp_locale_guard() {
-    if (__old_loc_)
-      uselocale(__old_loc_);
-  }
-
-  locale_t __old_loc_;
-
-  __libcpp_locale_guard(__libcpp_locale_guard const&)            = delete;
-  __libcpp_locale_guard& operator=(__libcpp_locale_guard const&) = delete;
-};
-#elif defined(_LIBCPP_MSVCRT_LIKE)
-struct __libcpp_locale_guard {
-  __libcpp_locale_guard(locale_t __l) : __status(_configthreadlocale(_ENABLE_PER_THREAD_LOCALE)) {
+#if defined(_LIBCPP_MSVCRT_LIKE)
+struct __locale_guard {
+  __locale_guard(locale_t __l) : __status(_configthreadlocale(_ENABLE_PER_THREAD_LOCALE)) {
     // Setting the locale can be expensive even when the locale given is
     // already the current locale, so do an explicit check to see if the
     // current locale is already the one we want.
@@ -51,7 +37,7 @@ struct __libcpp_locale_guard {
       __setlocale(__l.__get_locale());
     }
   }
-  ~__libcpp_locale_guard() {
+  ~__locale_guard() {
     // The CRT documentation doesn't explicitly say, but setlocale() does the
     // right thing when given a semicolon-separated list of locale settings
     // for the different categories in the same format as returned by
@@ -71,8 +57,22 @@ struct __libcpp_locale_guard {
   int __status;
   char* __locale_all = nullptr;
 };
+#else
+struct __locale_guard {
+  _LIBCPP_HIDE_FROM_ABI __locale_guard(locale_t& __loc) : __old_loc_(uselocale(__loc)) {}
+
+  _LIBCPP_HIDE_FROM_ABI ~__locale_guard() {
+    if (__old_loc_)
+      uselocale(__old_loc_);
+  }
+
+  locale_t __old_loc_;
+
+  __locale_guard(__locale_guard const&)            = delete;
+  __locale_guard& operator=(__locale_guard const&) = delete;
+};
 #endif
 
 _LIBCPP_END_NAMESPACE_STD
 
-#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_LOCALE_GUARD_H
+#endif // _LIBCPP___LOCALE_DIR_LOCALE_GUARD_H
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index b429d7cff702b81..c79070c318759db 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -1443,7 +1443,7 @@ module std [system] {
   module locale {
     header "locale"
     header "__locale_dir/locale_base_api.h"
-    header "__locale_dir/locale_base_api/locale_guard.h"
+    header "__locale_dir/locale_guard.h"
     module locale_base_api {
       textual header "__locale_dir/locale_base_api/android.h"
       textual header "__locale_dir/locale_base_api/bsd_locale_defaults.h"
diff --git a/libcxx/src/iostream.cpp b/libcxx/src/iostream.cpp
index c5ad77a01916084..48d2fdb866a332c 100644
--- a/libcxx/src/iostream.cpp
+++ b/libcxx/src/iostream.cpp
@@ -12,7 +12,7 @@
 #include <string>
 
 #ifdef _LIBCPP_MSVCRT_LIKE
-#  include <__locale_dir/locale_base_api/locale_guard.h>
+#  include <__locale_dir/locale_guard.h>
 #endif
 
 #define _str(s) #s
@@ -109,7 +109,7 @@ static void force_locale_initialization() {
   static bool once = []() {
     auto loc = newlocale(LC_ALL_MASK, "C", 0);
     {
-      __libcpp_locale_guard g(loc); // forces initialization of locale TLS
+      __locale_guard g(loc); // forces initialization of locale TLS
       ((void)g);
     }
     freelocale(loc);
diff --git a/libcxx/src/support/win32/locale_win32.cpp b/libcxx/src/support/win32/locale_win32.cpp
index 57ef94932ba0a76..2a08e97b8645b40 100644
--- a/libcxx/src/support/win32/locale_win32.cpp
+++ b/libcxx/src/support/win32/locale_win32.cpp
@@ -11,12 +11,10 @@
 #include <memory>
 #include <type_traits>
 
-#include <__locale_dir/locale_base_api/locale_guard.h>
+#include <__locale_dir/locale_guard.h>
 
 int __libcpp_vasprintf(char** sptr, const char* __restrict fmt, va_list ap);
 
-using std::__libcpp_locale_guard;
-
 // FIXME: base and mask currently unused. Needs manual work to construct the new locale
 locale_t newlocale(int /*mask*/, const char* locale, locale_t /*base*/) {
   return {_create_locale(LC_ALL, locale), locale};
@@ -26,33 +24,33 @@ decltype(MB_CUR_MAX) MB_CUR_MAX_L(locale_t __l) {
 #if defined(_LIBCPP_MSVCRT)
   return ___mb_cur_max_l_func(__l);
 #else
-  __libcpp_locale_guard __current(__l);
+  std::__locale_guard __current(__l);
   return MB_CUR_MAX;
 #endif
 }
 
 lconv* localeconv_l(locale_t& loc) {
-  __libcpp_locale_guard __current(loc);
+  std::__locale_guard __current(loc);
   lconv* lc = localeconv();
   if (!lc)
     return lc;
   return loc.__store_lconv(lc);
 }
 size_t mbrlen_l(const char* __restrict s, size_t n, mbstate_t* __restrict ps, locale_t loc) {
-  __libcpp_locale_guard __current(loc);
+  std::__locale_guard __current(loc);
   return mbrlen(s, n, ps);
 }
 size_t
 mbsrtowcs_l(wchar_t* __restrict dst, const char** __restrict src, size_t len, mbstate_t* __restrict ps, locale_t loc) {
-  __libcpp_locale_guard __current(loc);
+  std::__locale_guard __current(loc);
   return mbsrtowcs(dst, src, len, ps);
 }
 size_t wcrtomb_l(char* __restrict s, wchar_t wc, mbstate_t* __restrict ps, locale_t loc) {
-  __libcpp_locale_guard __current(loc);
+  std::__locale_guard __current(loc);
   return wcrtomb(s, wc, ps);
 }
 size_t mbrtowc_l(wchar_t* __restrict pwc, const char* __restrict s, size_t n, mbstate_t* __restrict ps, locale_t loc) {
-  __libcpp_locale_guard __current(loc);
+  std::__locale_guard __current(loc);
   return mbrtowc(pwc, s, n, ps);
 }
 size_t mbsnrtowcs_l(wchar_t* __restrict dst,
@@ -61,7 +59,7 @@ size_t mbsnrtowcs_l(wchar_t* __restrict dst,
                     size_t len,
                     mbstate_t* __restrict ps,
                     locale_t loc) {
-  __libcpp_locale_guard __current(loc);
+  std::__locale_guard __current(loc);
   return mbsnrtowcs(dst, src, nms, len, ps);
 }
 size_t wcsnrtombs_l(char* __restrict dst,
@@ -70,15 +68,15 @@ size_t wcsnrtombs_l(char* __restrict dst,
                     size_t len,
                     mbstate_t* __restrict ps,
                     locale_t loc) {
-  __libcpp_locale_guard __current(loc);
+  std::__locale_guard __current(loc);
   return wcsnrtombs(dst, src, nwc, len, ps);
 }
 wint_t btowc_l(int c, locale_t loc) {
-  __libcpp_locale_guard __current(loc);
+  std::__locale_guard __current(loc);
   return btowc(c);
 }
 int wctob_l(wint_t c, locale_t loc) {
-  __libcpp_locale_guard __current(loc);
+  std::__locale_guard __current(loc);
   return wctob(c);
 }
 
@@ -90,7 +88,7 @@ int snprintf_l(char* ret, size_t n, locale_t loc, const char* format, ...) {
   int result = __stdio_common_vsprintf(
       _CRT_INTERNAL_LOCAL_PRINTF_OPTIONS | _CRT_INTERNAL_PRINTF_STANDARD_SNPRINTF_BEHAVIOR, ret, n, format, loc, ap);
 #else
-  __libcpp_locale_guard __current(loc);
+  std::__locale_guard __current(loc);
   _LIBCPP_DIAGNOSTIC_PUSH
   _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wformat-nonliteral")
   int result = vsnprintf(ret, n, format, ap);
@@ -108,25 +106,25 @@ int asprintf_l(char** ret, locale_t loc, const char* format, ...) {
   return result;
 }
 int vasprintf_l(char** ret, locale_t loc, const char* format, va_list ap) {
-  __libcpp_locale_guard __current(loc);
+  std::__locale_guard __current(loc);
   return __libcpp_vasprintf(ret, format, ap);
 }
 
 #if !defined(_LIBCPP_MSVCRT)
 float strtof_l(const char* nptr, char** endptr, locale_t loc) {
-  __libcpp_locale_guard __current(loc);
+  std::__locale_guard __current(loc);
   return strtof(nptr, endptr);
 }
 
 long double strtold_l(const char* nptr, char** endptr, locale_t loc) {
-  __libcpp_locale_guard __current(loc);
+  std::__locale_guard __current(loc);
   return strtold(nptr, endptr);
 }
 #endif
 
 #if defined(__MINGW32__) && __MSVCRT_VERSION__ < 0x0800
 size_t strftime_l(char* ret, size_t n, const char* format, const struct tm* tm, locale_t loc) {
-  __libcpp_locale_guard __current(loc);
+  std::__locale_guard __current(loc);
   return strftime(ret, n, format, tm);
 }
 #endif

From ba81e1949a4f25216e2b3ea3a1507a52db88562a Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov@arm.com>
Date: Fri, 25 Oct 2024 17:32:25 +0100
Subject: [PATCH 11/39] [AArch64] Add assembly/disassembly for BFMOP4{A,S}
 (widening) instructions (#113203)

The new instructions are described in
https://developer.arm.com/documentation/ddi0602/2024-09/SME-Instructions
---
 .../lib/Target/AArch64/AArch64SMEInstrInfo.td |   3 +
 llvm/lib/Target/AArch64/SMEInstrFormats.td    |  37 +++
 .../SME2p2/bfmop4as-widening-diagnostics.s    | 220 ++++++++++++++++++
 .../MC/AArch64/SME2p2/bfmop4as-widening.s     | 178 ++++++++++++++
 4 files changed, 438 insertions(+)
 create mode 100644 llvm/test/MC/AArch64/SME2p2/bfmop4as-widening-diagnostics.s
 create mode 100644 llvm/test/MC/AArch64/SME2p2/bfmop4as-widening.s

diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 6044b5bb7d81511..b763aa15a7c3f15 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -1004,6 +1004,9 @@ let Predicates = [HasSME2p2] in {
   def FTMOPA_M2ZZZI_HtoS  : sme_tmopa_32b<0b11000, ZZ_h_mul_r, ZPR16, "ftmopa">;
   def FTMOPA_M2ZZZI_StoS  : sme_tmopa_32b<0b00000, ZZ_s_mul_r, ZPR32, "ftmopa">;
   def BFTMOPA_M2ZZZI_HtoS : sme_tmopa_32b<0b10000, ZZ_h_mul_r, ZPR16, "bftmopa">;
+
+  defm BFMOP4A : sme2_bfmop4as_widening<0, "bfmop4a">;
+  defm BFMOP4S : sme2_bfmop4as_widening<1, "bfmop4s">;
 } // [HasSME2p2]
 
 let Predicates = [HasSME2p2, HasSMEB16B16] in {
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 08929ed5616b2c8..4cfe18eddf481cb 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -5188,3 +5188,40 @@ class sme2_luti4_vector_vg4_strided<bits<2> sz, bits<2> op, string mnemonic>
   let Inst{3-2}   = 0b00;
   let Inst{1-0}   = Zd{1-0};
 }
+
+class sme2_bf16_fp32_quarter_tile_outer_product<bit M, bit N, bit S, string mnemonic, RegisterOperand zn_ty, RegisterOperand zm_ty>
+    : I<(outs TileOp32:$ZAda),
+        (ins TileOp32:$_ZAda, zn_ty:$Zn, zm_ty:$Zm),
+        mnemonic, "\t$ZAda, $Zn, $Zm",
+        "", []>, Sched<[]> {
+  bits<2> ZAda;
+  bits<3> Zn;
+  bits<3> Zm;
+
+  let Inst{31-21} = 0b10000001000;
+  let Inst{20} = M;
+  let Inst{19-17} = Zm;
+  let Inst{16-10} = 0b0000000;
+  let Inst{9} = N;
+  let Inst{8-6} = Zn;
+  let Inst{5} = 0;
+  let Inst{4} = S;
+  let Inst{3-2} = 0b00;
+  let Inst{1-0} = ZAda;
+
+  let Constraints = "$ZAda = $_ZAda";
+}
+
+multiclass sme2_bfmop4as_widening<bit S, string mnemonic> {
+  // Single vectors
+  def _MZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 0, S, mnemonic, ZPR16Mul2_Lo, ZPR16Mul2_Hi>;
+
+  // Multiple and single vectors
+  def _M2ZZ_S : sme2_bf16_fp32_quarter_tile_outer_product<0, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZPR16Mul2_Hi>;
+
+  // Single and multiple vectors
+  def _MZ2Z_S : sme2_bf16_fp32_quarter_tile_outer_product<1, 0, S, mnemonic, ZPR16Mul2_Lo, ZZ_h_mul_r_Hi>;
+
+  // Multiple vectors
+  def _M2Z2Z_S : sme2_bf16_fp32_quarter_tile_outer_product<1, 1, S, mnemonic, ZZ_h_mul_r_Lo, ZZ_h_mul_r_Hi>;
+}
diff --git a/llvm/test/MC/AArch64/SME2p2/bfmop4as-widening-diagnostics.s b/llvm/test/MC/AArch64/SME2p2/bfmop4as-widening-diagnostics.s
new file mode 100644
index 000000000000000..5906bcb07f15d5a
--- /dev/null
+++ b/llvm/test/MC/AArch64/SME2p2/bfmop4as-widening-diagnostics.s
@@ -0,0 +1,220 @@
+// RUN: not llvm-mc -triple=aarch64 -mattr=+sme2p2 < %s 2>&1 | FileCheck %s
+
+// BFMOP4A
+
+// Single vectors
+
+bfmop4a za0.d, z0.h, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s
+
+bfmop4a za4.s, z0.h, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+bfmop4a za0.s, z0.s, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h
+
+bfmop4a za0.s, z15.h, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h
+
+bfmop4a za0.s, z16.h, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h
+
+bfmop4a za0.s, z0.h, z16.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h
+
+bfmop4a za0.s, z12.h, z17.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h
+
+bfmop4a za0.s, z12.h, z14.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h
+
+bfmop4a za0.s, z12.h, z31.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h
+
+// Single and multiple vectors
+
+bfmop4a za0.d, z0.h, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s
+
+bfmop4a za4.s, z0.h, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+bfmop4a za0.s, z0.s, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h
+
+bfmop4a za0.s, z1.h, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h
+
+bfmop4a za0.s, z16.h, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h
+
+bfmop4a za0.s, z0.h, {z16.s-z17.s}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+bfmop4a za0.s, z0.h, {z17.h-z18.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types
+
+bfmop4a za0.s, z0.h, {z12.h-z13.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types
+
+// Multiple and single vectors
+
+bfmop4a za0.d, {z0.h-z1.h}, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s
+
+bfmop4a za4.s, {z0.h-z1.h}, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+bfmop4a za0.s, {z0.s-z1.h}, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+
+bfmop4a za0.s, {z1.h-z2.h}, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types
+
+bfmop4a za0.s, {z16.h-z17.h}, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types
+
+bfmop4a za0.s, {z0.h-z1.h}, z16.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h
+
+bfmop4a za0.s, {z0.h-z1.h}, z17.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h
+
+bfmop4a za0.s, {z0.h-z1.h}, z12.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h
+
+// Multiple vectors
+
+bfmop4a za0.d, {z0.h-z1.h}, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s
+
+bfmop4a za4.s, {z0.h-z1.h}, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+bfmop4a za0.s, {z0.s-z1.s}, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+bfmop4a za0.s, {z1.h-z2.h}, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types
+
+bfmop4a za0.s, {z18.h-z19.h}, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types
+
+bfmop4a za0.s, {z0.h-z1.h}, {z16.s-z17.s}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+bfmop4a za0.s, {z0.h-z1.h}, {z19.h-z20.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types
+
+bfmop4a za0.s, {z0.h-z1.h}, {z10.h-z11.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types
+
+
+// BFMOP4S
+
+// Single vectors
+
+bfmop4s za0.d, z0.h, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s
+
+bfmop4s za4.s, z0.h, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+bfmop4s za0.s, z0.s, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h
+
+bfmop4s za0.s, z15.h, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h
+
+bfmop4s za0.s, z16.h, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h
+
+bfmop4s za0.s, z0.h, z16.s
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h
+
+bfmop4s za0.s, z12.h, z17.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h
+
+bfmop4s za0.s, z12.h, z14.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h
+
+bfmop4s za0.s, z12.h, z31.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h
+
+// Single and multiple vectors
+
+bfmop4s za0.d, z0.h, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s
+
+bfmop4s za4.s, z0.h, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+bfmop4s za0.s, z0.s, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h
+
+bfmop4s za0.s, z1.h, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h
+
+bfmop4s za0.s, z16.h, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z0.h..z14.h
+
+bfmop4s za0.s, z0.h, {z16.s-z17.s}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+bfmop4s za0.s, z0.h, {z17.h-z18.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types
+
+bfmop4s za0.s, z0.h, {z12.h-z13.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types
+
+// Multiple and single vectors
+
+bfmop4s za0.d, {z0.h-z1.h}, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s
+
+bfmop4s za4.s, {z0.h-z1.h}, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+bfmop4s za0.s, {z0.s-z1.h}, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix
+
+bfmop4s za0.s, {z1.h-z2.h}, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types
+
+bfmop4s za0.s, {z16.h-z17.h}, z16.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types
+
+bfmop4s za0.s, {z0.h-z1.h}, z16.d
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h
+
+bfmop4s za0.s, {z0.h-z1.h}, z17.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h
+
+bfmop4s za0.s, {z0.h-z1.h}, z12.h
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected even register in z16.h..z30.h
+
+// Multiple vectors
+
+bfmop4s za0.d, {z0.h-z1.h}, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid matrix operand, expected za[0-3].s
+
+bfmop4s za4.s, {z0.h-z1.h}, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+bfmop4s za0.s, {z0.s-z1.s}, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+bfmop4s za0.s, {z1.h-z2.h}, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types
+
+bfmop4s za0.s, {z18.h-z19.h}, {z16.h-z17.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z0-z14, where the first vector is a multiple of 2 and with matching element types
+
+bfmop4s za0.s, {z0.h-z1.h}, {z16.s-z17.s}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+bfmop4s za0.s, {z0.h-z1.h}, {z19.h-z20.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types
+
+bfmop4s za0.s, {z0.h-z1.h}, {z10.h-z11.h}
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors in the range z16-z30, where the first vector is a multiple of 2 and with matching element types
diff --git a/llvm/test/MC/AArch64/SME2p2/bfmop4as-widening.s b/llvm/test/MC/AArch64/SME2p2/bfmop4as-widening.s
new file mode 100644
index 000000000000000..40d08e503c8bb32
--- /dev/null
+++ b/llvm/test/MC/AArch64/SME2p2/bfmop4as-widening.s
@@ -0,0 +1,178 @@
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \
+// RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \
+// RUN:        | llvm-objdump -d --mattr=+sme2p2 - | FileCheck %s --check-prefix=CHECK-INST
+// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2p2 < %s \
+// RUN:        | llvm-objdump -d --mattr=-sme2p2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN
+// Disassemble encoding and check the re-encoding (-show-encoding) matches.
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2p2 < %s \
+// RUN:        | sed '/.text/d' | sed 's/.*encoding: //g' \
+// RUN:        | llvm-mc -triple=aarch64 -mattr=+sme2p2 -disassemble -show-encoding \
+// RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+
+// BFMOP4A
+
+// Single vectors
+
+bfmop4a za0.s, z0.h, z16.h  // 10000001-00000000-00000000-00000000
+// CHECK-INST: bfmop4a za0.s, z0.h, z16.h
+// CHECK-ENCODING: [0x00,0x00,0x00,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 81000000 <unknown>
+
+bfmop4a za3.s, z14.h, z30.h  // 10000001-00001110-00000001-11000011
+// CHECK-INST: bfmop4a za3.s, z14.h, z30.h
+// CHECK-ENCODING: [0xc3,0x01,0x0e,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 810e01c3 <unknown>
+
+bfmop4a za1.s, z10.h, z20.h  // 10000001-00000100-00000001-01000001
+// CHECK-INST: bfmop4a za1.s, z10.h, z20.h
+// CHECK-ENCODING: [0x41,0x01,0x04,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 81040141 <unknown>
+
+// Single and multiple vectors
+
+bfmop4a za0.s, z0.h, {z16.h-z17.h}  // 10000001-00010000-00000000-00000000
+// CHECK-INST: bfmop4a za0.s, z0.h, { z16.h, z17.h }
+// CHECK-ENCODING: [0x00,0x00,0x10,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 81100000 <unknown>
+
+bfmop4a za3.s, z14.h, {z30.h-z31.h}  // 10000001-00011110-00000001-11000011
+// CHECK-INST: bfmop4a za3.s, z14.h, { z30.h, z31.h }
+// CHECK-ENCODING: [0xc3,0x01,0x1e,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 811e01c3 <unknown>
+
+bfmop4a za2.s, z12.h, {z24.h-z25.h}  // 10000001-00011000-00000001-10000010
+// CHECK-INST: bfmop4a za2.s, z12.h, { z24.h, z25.h }
+// CHECK-ENCODING: [0x82,0x01,0x18,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 81180182 <unknown>
+
+// Multiple and single vectors
+
+bfmop4a za0.s, {z0.h-z1.h}, z16.h  // 10000001-00000000-00000010-00000000
+// CHECK-INST: bfmop4a za0.s, { z0.h, z1.h }, z16.h
+// CHECK-ENCODING: [0x00,0x02,0x00,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 81000200 <unknown>
+
+bfmop4a za3.s, {z14.h-z15.h}, z30.h  // 10000001-00001110-00000011-11000011
+// CHECK-INST: bfmop4a za3.s, { z14.h, z15.h }, z30.h
+// CHECK-ENCODING: [0xc3,0x03,0x0e,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 810e03c3 <unknown>
+
+bfmop4a za2.s, {z12.h-z13.h}, z28.h  // 10000001-00001100-00000011-10000010
+// CHECK-INST: bfmop4a za2.s, { z12.h, z13.h }, z28.h
+// CHECK-ENCODING: [0x82,0x03,0x0c,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 810c0382 <unknown>
+
+// Multiple vectors
+
+bfmop4a za0.s, {z0.h-z1.h}, {z16.h-z17.h}  // 10000001-00010000-00000010-00000000
+// CHECK-INST: bfmop4a za0.s, { z0.h, z1.h }, { z16.h, z17.h }
+// CHECK-ENCODING: [0x00,0x02,0x10,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 81100200 <unknown>
+
+bfmop4a za3.s, {z14.h-z15.h}, {z30.h-z31.h}  // 10000001-00011110-00000011-11000011
+// CHECK-INST: bfmop4a za3.s, { z14.h, z15.h }, { z30.h, z31.h }
+// CHECK-ENCODING: [0xc3,0x03,0x1e,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 811e03c3 <unknown>
+
+bfmop4a za2.s, {z12.h-z13.h}, {z26.h-z27.h}  // 10000001-00011010-00000011-10000010
+// CHECK-INST: bfmop4a za2.s, { z12.h, z13.h }, { z26.h, z27.h }
+// CHECK-ENCODING: [0x82,0x03,0x1a,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 811a0382 <unknown>
+
+
+// BFMOP4S
+
+// Single vectors
+
+bfmop4s za0.s, z0.h, z16.h  // 10000001-00000000-00000000-00010000
+// CHECK-INST: bfmop4s za0.s, z0.h, z16.h
+// CHECK-ENCODING: [0x10,0x00,0x00,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 81000010 <unknown>
+
+bfmop4s za3.s, z14.h, z30.h  // 10000001-00001110-00000001-11010011
+// CHECK-INST: bfmop4s za3.s, z14.h, z30.h
+// CHECK-ENCODING: [0xd3,0x01,0x0e,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 810e01d3 <unknown>
+
+bfmop4s za1.s, z10.h, z20.h  // 10000001-00000100-00000001-01010001
+// CHECK-INST: bfmop4s za1.s, z10.h, z20.h
+// CHECK-ENCODING: [0x51,0x01,0x04,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 81040151 <unknown>
+
+// Single and multiple vectors
+
+bfmop4s za0.s, z0.h, {z16.h-z17.h}  // 10000001-00010000-00000000-00010000
+// CHECK-INST: bfmop4s za0.s, z0.h, { z16.h, z17.h }
+// CHECK-ENCODING: [0x10,0x00,0x10,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 81100010 <unknown>
+
+bfmop4s za3.s, z14.h, {z30.h-z31.h}  // 10000001-00011110-00000001-11010011
+// CHECK-INST: bfmop4s za3.s, z14.h, { z30.h, z31.h }
+// CHECK-ENCODING: [0xd3,0x01,0x1e,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 811e01d3 <unknown>
+
+bfmop4s za2.s, z12.h, {z24.h-z25.h}  // 10000001-00011000-00000001-10010010
+// CHECK-INST: bfmop4s za2.s, z12.h, { z24.h, z25.h }
+// CHECK-ENCODING: [0x92,0x01,0x18,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 81180192 <unknown>
+
+// Multiple and single vectors
+
+bfmop4s za0.s, {z0.h-z1.h}, z16.h  // 10000001-00000000-00000010-00010000
+// CHECK-INST: bfmop4s za0.s, { z0.h, z1.h }, z16.h
+// CHECK-ENCODING: [0x10,0x02,0x00,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 81000210 <unknown>
+
+bfmop4s za3.s, {z14.h-z15.h}, z30.h  // 10000001-00001110-00000011-11010011
+// CHECK-INST: bfmop4s za3.s, { z14.h, z15.h }, z30.h
+// CHECK-ENCODING: [0xd3,0x03,0x0e,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 810e03d3 <unknown>
+
+bfmop4s za2.s, {z12.h-z13.h}, z28.h  // 10000001-00001100-00000011-10010010
+// CHECK-INST: bfmop4s za2.s, { z12.h, z13.h }, z28.h
+// CHECK-ENCODING: [0x92,0x03,0x0c,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 810c0392 <unknown>
+
+// Multiple vectors
+
+bfmop4s za0.s, {z0.h-z1.h}, {z16.h-z17.h}  // 10000001-00010000-00000010-00010000
+// CHECK-INST: bfmop4s za0.s, { z0.h, z1.h }, { z16.h, z17.h }
+// CHECK-ENCODING: [0x10,0x02,0x10,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 81100210 <unknown>
+
+bfmop4s za3.s, {z14.h-z15.h}, {z30.h-z31.h}  // 10000001-00011110-00000011-11010011
+// CHECK-INST: bfmop4s za3.s, { z14.h, z15.h }, { z30.h, z31.h }
+// CHECK-ENCODING: [0xd3,0x03,0x1e,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 811e03d3 <unknown>
+
+bfmop4s za2.s, {z12.h-z13.h}, {z26.h-z27.h}  // 10000001-00011010-00000011-10010010
+// CHECK-INST: bfmop4s za2.s, { z12.h, z13.h }, { z26.h, z27.h }
+// CHECK-ENCODING: [0x92,0x03,0x1a,0x81]
+// CHECK-ERROR: instruction requires: sme2p2
+// CHECK-UNKNOWN: 811a0392 <unknown>

From 4161ca2092d3b92034515190f577aa200ec615bf Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray@arm.com>
Date: Fri, 25 Oct 2024 14:54:56 +0100
Subject: [PATCH 12/39] [NFC][AArch64][LLVM] Update ReleaseNotes.md with
 Armv9.6-A (2024) arch extensions

---
 llvm/docs/ReleaseNotes.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 7cca9116a513451..be51b0af56ddbf7 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -110,6 +110,9 @@ Changes to the AArch64 Backend
   the required alignment space with a sequence of `0x0` bytes (the requested
   fill value) rather than NOPs.
 
+* Assembler/disassembler support has been added for Armv9.6-A (2024)
+  architecture extensions.
+
 Changes to the AMDGPU Backend
 -----------------------------
 

From 2ec5c69b6872b8b474f3d37b9125d3d57d144d1b Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Fri, 25 Oct 2024 09:42:01 -0700
Subject: [PATCH 13/39] Revert "[Sanitizers] Intercept timer_create" (#113710)

Reverts llvm/llvm-project#112285
---
 .../lib/hwasan/hwasan_platform_interceptors.h |  3 ---
 compiler-rt/lib/msan/tests/msan_test.cpp      | 23 -------------------
 .../sanitizer_common_interceptors.inc         | 19 ---------------
 .../sanitizer_platform_interceptors.h         |  3 ---
 .../sanitizer_platform_limits_posix.h         |  4 ----
 5 files changed, 52 deletions(-)

diff --git a/compiler-rt/lib/hwasan/hwasan_platform_interceptors.h b/compiler-rt/lib/hwasan/hwasan_platform_interceptors.h
index e8011014c2331d7..d92b51052194275 100644
--- a/compiler-rt/lib/hwasan/hwasan_platform_interceptors.h
+++ b/compiler-rt/lib/hwasan/hwasan_platform_interceptors.h
@@ -200,9 +200,6 @@
 #undef SANITIZER_INTERCEPT_CLOCK_GETCPUCLOCKID
 #define SANITIZER_INTERCEPT_CLOCK_GETCPUCLOCKID 0
 
-#undef SANITIZER_INTERCEPT_TIMER_CREATE
-#define SANITIZER_INTERCEPT_TIMER_CREATE 0
-
 #undef SANITIZER_INTERCEPT_GETITIMER
 #define SANITIZER_INTERCEPT_GETITIMER 0
 
diff --git a/compiler-rt/lib/msan/tests/msan_test.cpp b/compiler-rt/lib/msan/tests/msan_test.cpp
index ad265acf4c1e39a..41b99fabe84f478 100644
--- a/compiler-rt/lib/msan/tests/msan_test.cpp
+++ b/compiler-rt/lib/msan/tests/msan_test.cpp
@@ -4881,27 +4881,4 @@ TEST(MemorySanitizer, throw_catch) {
     // pass
   }
 }
-
-#if defined(__linux__)
-TEST(MemorySanitizer, timer_create) {
-  timer_t timer;
-  EXPECT_POISONED(timer);
-  int res = timer_create(CLOCK_REALTIME, nullptr, &timer);
-  ASSERT_EQ(0, res);
-  EXPECT_NOT_POISONED(timer);
-
-  // Make sure the timer is usable.
-  struct itimerspec cur_value {};
-  cur_value.it_value.tv_sec = 1;
-  EXPECT_EQ(0, timer_settime(timer, 0, &cur_value, nullptr));
-
-  timer_t timer2;
-  EXPECT_POISONED(timer2);
-  // Use an invalid clock_id to make timer_create fail.
-  res = timer_create(INT_MAX, nullptr, &timer2);
-  ASSERT_EQ(-1, res);
-  EXPECT_POISONED(timer2);
-  timer_delete(timer);
-}
-#endif
 } // namespace
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
index 211f9f70d7e4c6c..b8627f8557afe29 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc
@@ -2289,24 +2289,6 @@ INTERCEPTOR(int, pthread_getcpuclockid, uptr thread,
 #define INIT_CLOCK_GETCPUCLOCKID
 #endif
 
-#if SANITIZER_INTERCEPT_TIMER_CREATE
-INTERCEPTOR(int, timer_create, __sanitizer_clockid_t clockid, void *sevp,
-            __sanitizer_timer_t *timer) {
-  void *ctx;
-  COMMON_INTERCEPTOR_ENTER(ctx, timer_create, clockid, sevp, timer);
-  int res = REAL(timer_create)(clockid, sevp, timer);
-  if (!res && timer) {
-    COMMON_INTERCEPTOR_WRITE_RANGE(ctx, timer, sizeof *timer);
-  }
-  return res;
-}
-
-#  define INIT_TIMER_CREATE \
-    COMMON_INTERCEPT_FUNCTION_GLIBC_VER_MIN(timer_create, "GLIBC_2.3.3");
-#else
-#  define INIT_TIMER_CREATE
-#endif
-
 #if SANITIZER_INTERCEPT_GETITIMER
 INTERCEPTOR(int, getitimer, int which, void *curr_value) {
   void *ctx;
@@ -10284,7 +10266,6 @@ static void InitializeCommonInterceptors() {
   INIT_SETPWENT;
   INIT_CLOCK_GETTIME;
   INIT_CLOCK_GETCPUCLOCKID;
-  INIT_TIMER_CREATE;
   INIT_GETITIMER;
   INIT_TIME;
   INIT_GLOB;
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
index 36fafdc642642bf..6959a6d52d604e0 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h
@@ -237,9 +237,6 @@
   (SI_FREEBSD || SI_NETBSD || SI_LINUX || SI_SOLARIS)
 #define SANITIZER_INTERCEPT_CLOCK_GETCPUCLOCKID \
   (SI_LINUX || SI_FREEBSD || SI_NETBSD)
-// TODO: This should be SI_POSIX, adding Linux first until I have time
-// to verify all timer_t typedefs on other platforms.
-#define SANITIZER_INTERCEPT_TIMER_CREATE SI_LINUX
 #define SANITIZER_INTERCEPT_GETITIMER SI_POSIX
 #define SANITIZER_INTERCEPT_TIME SI_POSIX
 #define SANITIZER_INTERCEPT_GLOB (SI_GLIBC || SI_SOLARIS)
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
index b4ccf7b3d7bef48..e8c81aa8e281637 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
@@ -1517,10 +1517,6 @@ extern const int si_SEGV_ACCERR;
 
 #define SIGACTION_SYMNAME sigaction
 
-#  if SANITIZER_LINUX
-typedef void *__sanitizer_timer_t;
-#  endif
-
 #endif  // SANITIZER_LINUX || SANITIZER_APPLE
 
 #endif

From 9ea6fcd02b172ec12c9d4b9157d4a37765d83421 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot <llvmgnsyncbot@gmail.com>
Date: Fri, 25 Oct 2024 16:47:08 +0000
Subject: [PATCH 14/39] [gn build] Port 1f2b7ae6d789

---
 llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 1630c8004d31575..0586704850a51b2 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -568,11 +568,11 @@ if (current_toolchain == default_toolchain) {
       "__locale_dir/locale_base_api/bsd_locale_fallbacks.h",
       "__locale_dir/locale_base_api/fuchsia.h",
       "__locale_dir/locale_base_api/ibm.h",
-      "__locale_dir/locale_base_api/locale_guard.h",
       "__locale_dir/locale_base_api/musl.h",
       "__locale_dir/locale_base_api/newlib.h",
       "__locale_dir/locale_base_api/openbsd.h",
       "__locale_dir/locale_base_api/win32.h",
+      "__locale_dir/locale_guard.h",
       "__math/abs.h",
       "__math/copysign.h",
       "__math/error_functions.h",

From 305a1ceae371b482375545650ba9fd9e4c165157 Mon Sep 17 00:00:00 2001
From: Alexander Richardson <alexrichardson@google.com>
Date: Fri, 25 Oct 2024 10:02:40 -0700
Subject: [PATCH 15/39] [DataLayout] Refactor storage of non-integral address
 spaces

Instead of storing this as a separate array of non-integral pointers,
add it to the PointerSpec class instead. This will allow for future
simplifications such as splitting the non-integral property into
multiple distinct ones: relocatable (i.e. non-stable representation) and
non-integral representation (i.e. pointers with metadata).

Reviewed By: arsenm

Pull Request: https://github.com/llvm/llvm-project/pull/105734
---
 llvm/include/llvm/IR/DataLayout.h | 28 ++++++++++++++++-----------
 llvm/lib/IR/DataLayout.cpp        | 32 ++++++++++++++++++++++---------
 2 files changed, 40 insertions(+), 20 deletions(-)

diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h
index 8f7ab2f9df389ef..93bd519f5727d80 100644
--- a/llvm/include/llvm/IR/DataLayout.h
+++ b/llvm/include/llvm/IR/DataLayout.h
@@ -78,7 +78,11 @@ class DataLayout {
     Align ABIAlign;
     Align PrefAlign;
     uint32_t IndexBitWidth;
-
+    /// Pointers in this address space don't have a well-defined bitwise
+    /// representation (e.g. may be relocated by a copying garbage collector).
+    /// Additionally, they may also be non-integral (i.e. containing additional
+    /// metadata such as bounds information/permissions).
+    bool IsNonIntegral;
     bool operator==(const PointerSpec &Other) const;
   };
 
@@ -133,10 +137,6 @@ class DataLayout {
   // The StructType -> StructLayout map.
   mutable void *LayoutMap = nullptr;
 
-  /// Pointers in these address spaces are non-integral, and don't have a
-  /// well-defined bitwise representation.
-  SmallVector<unsigned, 8> NonIntegralAddressSpaces;
-
   /// Sets or updates the specification for the given primitive type.
   void setPrimitiveSpec(char Specifier, uint32_t BitWidth, Align ABIAlign,
                         Align PrefAlign);
@@ -147,7 +147,8 @@ class DataLayout {
 
   /// Sets or updates the specification for pointer in the given address space.
   void setPointerSpec(uint32_t AddrSpace, uint32_t BitWidth, Align ABIAlign,
-                      Align PrefAlign, uint32_t IndexBitWidth);
+                      Align PrefAlign, uint32_t IndexBitWidth,
+                      bool IsNonIntegral);
 
   /// Internal helper to get alignment for integer of given bitwidth.
   Align getIntegerAlignment(uint32_t BitWidth, bool abi_or_pref) const;
@@ -165,7 +166,8 @@ class DataLayout {
   Error parsePointerSpec(StringRef Spec);
 
   /// Attempts to parse a single specification.
-  Error parseSpecification(StringRef Spec);
+  Error parseSpecification(StringRef Spec,
+                           SmallVectorImpl<unsigned> &NonIntegralAddressSpaces);
 
   /// Attempts to parse a data layout string.
   Error parseLayoutString(StringRef LayoutString);
@@ -337,13 +339,17 @@ class DataLayout {
 
   /// Return the address spaces containing non-integral pointers.  Pointers in
   /// this address space don't have a well-defined bitwise representation.
-  ArrayRef<unsigned> getNonIntegralAddressSpaces() const {
-    return NonIntegralAddressSpaces;
+  SmallVector<unsigned, 8> getNonIntegralAddressSpaces() const {
+    SmallVector<unsigned, 8> AddrSpaces;
+    for (const PointerSpec &PS : PointerSpecs) {
+      if (PS.IsNonIntegral)
+        AddrSpaces.push_back(PS.AddrSpace);
+    }
+    return AddrSpaces;
   }
 
   bool isNonIntegralAddressSpace(unsigned AddrSpace) const {
-    ArrayRef<unsigned> NonIntegralSpaces = getNonIntegralAddressSpaces();
-    return is_contained(NonIntegralSpaces, AddrSpace);
+    return getPointerSpec(AddrSpace).IsNonIntegral;
   }
 
   bool isNonIntegralPointerType(PointerType *PT) const {
diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp
index d295d1f5785eb9d..a4af0ead07cf616 100644
--- a/llvm/lib/IR/DataLayout.cpp
+++ b/llvm/lib/IR/DataLayout.cpp
@@ -151,7 +151,8 @@ bool DataLayout::PrimitiveSpec::operator==(const PrimitiveSpec &Other) const {
 bool DataLayout::PointerSpec::operator==(const PointerSpec &Other) const {
   return AddrSpace == Other.AddrSpace && BitWidth == Other.BitWidth &&
          ABIAlign == Other.ABIAlign && PrefAlign == Other.PrefAlign &&
-         IndexBitWidth == Other.IndexBitWidth;
+         IndexBitWidth == Other.IndexBitWidth &&
+         IsNonIntegral == Other.IsNonIntegral;
 }
 
 namespace {
@@ -206,7 +207,8 @@ constexpr DataLayout::PrimitiveSpec DefaultVectorSpecs[] = {
 
 // Default pointer type specifications.
 constexpr DataLayout::PointerSpec DefaultPointerSpecs[] = {
-    {0, 64, Align::Constant<8>(), Align::Constant<8>(), 64} // p0:64:64:64:64
+    // p0:64:64:64:64
+    {0, 64, Align::Constant<8>(), Align::Constant<8>(), 64, false},
 };
 
 DataLayout::DataLayout()
@@ -239,13 +241,11 @@ DataLayout &DataLayout::operator=(const DataLayout &Other) {
   PointerSpecs = Other.PointerSpecs;
   StructABIAlignment = Other.StructABIAlignment;
   StructPrefAlignment = Other.StructPrefAlignment;
-  NonIntegralAddressSpaces = Other.NonIntegralAddressSpaces;
   return *this;
 }
 
 bool DataLayout::operator==(const DataLayout &Other) const {
   // NOTE: StringRepresentation might differ, it is not canonicalized.
-  // FIXME: NonIntegralAddressSpaces isn't compared.
   return BigEndian == Other.BigEndian &&
          AllocaAddrSpace == Other.AllocaAddrSpace &&
          ProgramAddrSpace == Other.ProgramAddrSpace &&
@@ -454,11 +454,13 @@ Error DataLayout::parsePointerSpec(StringRef Spec) {
     return createStringError(
         "index size cannot be larger than the pointer size");
 
-  setPointerSpec(AddrSpace, BitWidth, ABIAlign, PrefAlign, IndexBitWidth);
+  setPointerSpec(AddrSpace, BitWidth, ABIAlign, PrefAlign, IndexBitWidth,
+                 false);
   return Error::success();
 }
 
-Error DataLayout::parseSpecification(StringRef Spec) {
+Error DataLayout::parseSpecification(
+    StringRef Spec, SmallVectorImpl<unsigned> &NonIntegralAddressSpaces) {
   // The "ni" specifier is the only two-character specifier. Handle it first.
   if (Spec.starts_with("ni")) {
     // ni:<address space>[:<address space>]...
@@ -614,12 +616,23 @@ Error DataLayout::parseLayoutString(StringRef LayoutString) {
 
   // Split the data layout string into specifications separated by '-' and
   // parse each specification individually, updating internal data structures.
+  SmallVector<unsigned, 8> NonIntegralAddressSpaces;
   for (StringRef Spec : split(LayoutString, '-')) {
     if (Spec.empty())
       return createStringError("empty specification is not allowed");
-    if (Error Err = parseSpecification(Spec))
+    if (Error Err = parseSpecification(Spec, NonIntegralAddressSpaces))
       return Err;
   }
+  // Mark all address spaces that were qualified as non-integral now. This has
+  // to be done later since the non-integral property is not part of the data
+  // layout pointer specification.
+  for (unsigned AS : NonIntegralAddressSpaces) {
+    // If there is no special spec for a given AS, getPointerSpec(AS) returns
+    // the spec for AS0, and we then update that to mark it non-integral.
+    const PointerSpec &PS = getPointerSpec(AS);
+    setPointerSpec(AS, PS.BitWidth, PS.ABIAlign, PS.PrefAlign, PS.IndexBitWidth,
+                   true);
+  }
 
   return Error::success();
 }
@@ -666,16 +679,17 @@ DataLayout::getPointerSpec(uint32_t AddrSpace) const {
 
 void DataLayout::setPointerSpec(uint32_t AddrSpace, uint32_t BitWidth,
                                 Align ABIAlign, Align PrefAlign,
-                                uint32_t IndexBitWidth) {
+                                uint32_t IndexBitWidth, bool IsNonIntegral) {
   auto I = lower_bound(PointerSpecs, AddrSpace, LessPointerAddrSpace());
   if (I == PointerSpecs.end() || I->AddrSpace != AddrSpace) {
     PointerSpecs.insert(I, PointerSpec{AddrSpace, BitWidth, ABIAlign, PrefAlign,
-                                       IndexBitWidth});
+                                       IndexBitWidth, IsNonIntegral});
   } else {
     I->BitWidth = BitWidth;
     I->ABIAlign = ABIAlign;
     I->PrefAlign = PrefAlign;
     I->IndexBitWidth = IndexBitWidth;
+    I->IsNonIntegral = IsNonIntegral;
   }
 }
 

From 9d88543301f262e584a36ea969237a2cf054328b Mon Sep 17 00:00:00 2001
From: Abhina Sree <Abhina.Sreeskantharajan@ibm.com>
Date: Fri, 25 Oct 2024 13:06:02 -0400
Subject: [PATCH 16/39] [AIX] Use internal lit shell for TableGen instead of a
 global setting (#113627)

This is to address the latest lit regressions
https://lab.llvm.org/buildbot/#/builders/64/builds/1285 caused by using
the internal lit shell. This change will limit using the internal lit
shell to TableGen on AIX so we do not hit these regressions.
---
 llvm/test/TableGen/lit.local.cfg  | 8 ++++++++
 llvm/utils/lit/lit/llvm/config.py | 7 -------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/llvm/test/TableGen/lit.local.cfg b/llvm/test/TableGen/lit.local.cfg
index 0e827479cd41235..9d6dfdc14bbfb06 100644
--- a/llvm/test/TableGen/lit.local.cfg
+++ b/llvm/test/TableGen/lit.local.cfg
@@ -1,2 +1,10 @@
+import platform
+import lit.formats
+
 config.suffixes = [".td"]
 config.excludes = ["Common", "Inputs"]
+
+# AIX 'diff' command doesn't support --strip-trailing-cr, but the internal
+# python implementation does, so use that for cross platform compatibility
+if platform.system() == "AIX":
+    config.test_format = lit.formats.ShTest()
diff --git a/llvm/utils/lit/lit/llvm/config.py b/llvm/utils/lit/lit/llvm/config.py
index 1ef5796cd32e448..5f762ec7f3514ab 100644
--- a/llvm/utils/lit/lit/llvm/config.py
+++ b/llvm/utils/lit/lit/llvm/config.py
@@ -57,13 +57,6 @@ def __init__(self, lit_config, config):
                 self.lit_config.note("using lit tools: {}".format(path))
                 lit_path_displayed = True
 
-        if platform.system() == "AIX":
-            # Diff on AIX doesn't have all the required features (see
-            # https://github.com/llvm/llvm-project/pull/108871 and
-            # https://github.com/llvm/llvm-project/pull/112997#issuecomment-2429656192)
-            # so always use the internal shell.
-            self.use_lit_shell = True
-
         if platform.system() == "OS/390":
             self.with_environment("_BPXK_AUTOCVT", "ON")
             self.with_environment("_TAG_REDIR_IN", "TXT")

From f24c1dd08ea71fa7334a85fd2772c2f728de0c56 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 25 Oct 2024 18:11:20 +0100
Subject: [PATCH 17/39] Fix MSVC "signed/unsigned mismatch" warning. NFC.

---
 clang/tools/clang-format/ClangFormat.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp
index 96fb85e99bf5f0f..5522d05744a2b4c 100644
--- a/clang/tools/clang-format/ClangFormat.cpp
+++ b/clang/tools/clang-format/ClangFormat.cpp
@@ -510,7 +510,7 @@ static bool format(StringRef FileName, bool ErrorOnIncompleteFormat = false) {
       reformat(*FormatStyle, *ChangedCode, Ranges, AssumedFileName, &Status);
   Replaces = Replaces.merge(FormatChanges);
   if (DryRun) {
-    return Replaces.size() > (IsJson ? 1 : 0) &&
+    return Replaces.size() > (IsJson ? 1u : 0u) &&
            emitReplacementWarnings(Replaces, AssumedFileName, Code);
   }
   if (OutputXML) {

From e6917e95548f81e7f00b8bca70ce571780e2afc9 Mon Sep 17 00:00:00 2001
From: Vlad Serebrennikov <serebrennikov.vladislav@gmail.com>
Date: Fri, 25 Oct 2024 21:15:21 +0400
Subject: [PATCH 18/39] =?UTF-8?q?[clang][NFC]=20Add=20test=20for=20CWG1898?=
 =?UTF-8?q?=20"Use=20of=20=E2=80=9Cequivalent=E2=80=9D=20in=20overload=20r?=
 =?UTF-8?q?esolution"=20(#113439)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[CWG1898](https://cplusplus.github.io/CWG/issues/1898.html) Use of
“equivalent” in overload resolution
====================

[P1787R6](https://wg21.link/p1787r6):
> CWG1898 is resolved by explicitly using the defined term
parameter-type-list.

Except that now it's called non-object-parameter-type-list, which is
defined in [dcl.fct] [p8](https://eel.is/c++draft/dcl.fct#8) and
[p4](https://eel.is/c++draft/dcl.fct#4).

As for the wording, the first sentence
[\_N4140\_.[over.dcl]/1](https://timsong-cpp.github.io/cppwp/n4140/over.dcl#1)
where the word "equivalent" was used:
> Two function declarations of the same name refer to the same function
if they are in the same scope and have equivalent parameter declarations
([over.load]).

was replaced with what is now known as "corresponding overloads",
defined in
[[basic.scope.scope]/4](https://eel.is/c++draft/basic.scope#scope-4).
The definition is present in P1787R6, but it's hard to reference,
because the "corresponding overloads" term was coined later.
---
 clang/test/CXX/drs/cwg18xx.cpp | 83 ++++++++++++++++++++++++++++++++++
 clang/www/cxx_dr_status.html   |  2 +-
 2 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/clang/test/CXX/drs/cwg18xx.cpp b/clang/test/CXX/drs/cwg18xx.cpp
index 7f0fb8cf589d48c..b059492637bd5cf 100644
--- a/clang/test/CXX/drs/cwg18xx.cpp
+++ b/clang/test/CXX/drs/cwg18xx.cpp
@@ -640,3 +640,86 @@ namespace H {
   struct S s;
 }
 }
+
+namespace cwg1898 { // cwg1898: 2.7
+void e(int) {} // #cwg1898-e
+void e(int) {}
+// expected-error@-1 {{redefinition of 'e'}}
+//   expected-note@#cwg1898-e {{previous definition is here}}
+
+void e2(int) {}
+void e2(long) {} // OK, different type
+
+void f(int) {} // #cwg1898-f
+void f(const int) {}
+// expected-error@-1 {{redefinition of 'f'}}
+//   expected-note@#cwg1898-f {{previous definition is here}}
+
+void g(int) {} // #cwg1898-g
+void g(volatile int) {}
+// since-cxx20-warning@-1 {{volatile-qualified parameter type 'volatile int' is deprecated}}
+// expected-error@-2 {{redefinition of 'g'}}
+//   expected-note@#cwg1898-g {{previous definition is here}}
+
+void h(int *) {} // #cwg1898-h
+void h(int[]) {}
+// expected-error@-1 {{redefinition of 'h'}}
+//   expected-note@#cwg1898-h {{previous definition is here}}
+
+void h2(int *) {} // #cwg1898-h2
+void h2(int[2]) {}
+// expected-error@-1 {{redefinition of 'h2'}}
+//   expected-note@#cwg1898-h2 {{previous definition is here}}
+
+void h3(int (*)[2]) {} // #cwg1898-h3
+void h3(int [3][2]) {}
+// expected-error@-1 {{redefinition of 'h3'}}
+//   expected-note@#cwg1898-h3 {{previous definition is here}}
+
+void h4(int (*)[2]) {}
+void h4(int [3][3]) {} // OK, differ in non-top-level extent of array
+
+void i(int *) {}
+void i(const int *) {} // OK, pointee cv-qualification is not discarded
+
+void i2(int *) {} // #cwg1898-i2
+void i2(int * const) {}
+// expected-error@-1 {{redefinition of 'i2'}}
+//   expected-note@#cwg1898-i2 {{previous definition is here}}
+
+void j(void(*)()) {} // #cwg1898-j
+void j(void()) {}
+// expected-error@-1 {{redefinition of 'j'}}
+//   expected-note@#cwg1898-j {{previous definition is here}}
+
+void j2(void(int)) {} // #cwg1898-j2
+void j2(void(const int)) {}
+// expected-error@-1 {{redefinition of 'j2'}}
+//   expected-note@#cwg1898-j2 {{previous definition is here}}
+
+struct A {
+  void k(int) {} // #cwg1898-k
+  void k(int) {}
+  // expected-error@-1 {{class member cannot be redeclared}}
+  //   expected-note@#cwg1898-k {{previous definition is here}}
+};
+
+struct B : A {
+  void k(int) {} // OK, shadows A::k
+};
+
+void l() {}
+void l(...) {}
+
+#if __cplusplus >= 201103L
+template <typename T>
+void m(T) {}
+template <typename... Ts>
+void m(Ts...) {}
+
+template <typename T, typename U>
+void m2(T, U) {}
+template <typename... Ts, typename U>
+void m2(Ts..., U) {}
+#endif
+} // namespace cwg1898
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 82ba9b370ba5953..6640ed477a241e5 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -11219,7 +11219,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/1898.html">1898</a></td>
     <td>CD6</td>
     <td>Use of &#8220;equivalent&#8221; in overload resolution</td>
-    <td class="unknown" align="center">Unknown</td>
+    <td class="full" align="center">Clang 2.7</td>
   </tr>
   <tr id="1899">
     <td><a href="https://cplusplus.github.io/CWG/issues/1899.html">1899</a></td>

From d3c29e8d2f11742e83e2b80df47391598bf2e857 Mon Sep 17 00:00:00 2001
From: Yijia Gu <yijiagu@google.com>
Date: Fri, 25 Oct 2024 10:24:31 -0700
Subject: [PATCH 19/39] [mlir][test][bazel] add missing deps for TestPass

---
 utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index 34beb758a12dd44..c69f793943beeca 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -547,8 +547,13 @@ cc_library(
         ":TestDialect",
         "//llvm:Support",
         "//mlir:FuncDialect",
+        "//mlir:GPUToSPIRV",
+        "//mlir:GPUTransforms",
         "//mlir:IR",
         "//mlir:Pass",
+        "//mlir:SPIRVDialect",
+        "//mlir:SPIRVToLLVM",
+        "//mlir:SPIRVTransforms",
         "//mlir:Support",
     ],
 )

From 6e7375031a1a3172d5e369cf2c108da2bcf65c8a Mon Sep 17 00:00:00 2001
From: Arvind Sudarsanam <arvind.sudarsanam@intel.com>
Date: Fri, 25 Oct 2024 10:27:42 -0700
Subject: [PATCH 20/39] [clang-linker-wrapper] Add error handling for missing
 linker path (#113613)

In clang-linker-wrapper, we do not explicitly check if --linker-path is
provided.
This PR adds a check to capture this.

Thanks

---------

Signed-off-by: Arvind Sudarsanam <arvind.sudarsanam@intel.com>
---
 clang/test/Driver/linker-wrapper.c                      | 4 ++++
 clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index 068ea2d7d3c663c..470af4d5d70cac7 100644
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -250,3 +250,7 @@ __attribute__((visibility("protected"), used)) int x;
 //       MLLVM-SAME: -Xlinker -mllvm=-pass-remarks=foo,bar
 //  OFFLOAD-OPT-NOT: -Xlinker -mllvm=-pass-remarks=foo,bar
 // OFFLOAD-OPT-SAME: {{$}}
+
+// Error handling when --linker-path is not provided for clang-linker-wrapper
+// RUN: not clang-linker-wrapper 2>&1 | FileCheck --check-prefix=LINKER-PATH-NOT-PROVIDED %s
+// LINKER-PATH-NOT-PROVIDED: linker path missing, must pass 'linker-path'
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 9fea1fdcd5fb466..9fcecaee318a79f 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -370,6 +370,8 @@ Error runLinker(ArrayRef<StringRef> Files, const ArgList &Args) {
   // Render the linker arguments and add the newly created image. We add it
   // after the output file to ensure it is linked with the correct libraries.
   StringRef LinkerPath = Args.getLastArgValue(OPT_linker_path_EQ);
+  if (LinkerPath.empty())
+    return createStringError("linker path missing, must pass 'linker-path'");
   ArgStringList NewLinkerArgs;
   for (const opt::Arg *Arg : Args) {
     // Do not forward arguments only intended for the linker wrapper.

From ac4bd74190fedfbe025ef757ff308dd184a507f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= <andrzej.warzynski@arm.com>
Date: Fri, 25 Oct 2024 10:39:26 -0700
Subject: [PATCH 21/39] [mlir] Add apply_patterns.linalg.pad_vectorization TD
 Op (#112504)

This PR simply wraps `populatePadOpVectorizationPatterns` into a new
Transform Dialect Op: `apply_patterns.linalg.pad_vectorization`.

This change makes it possible to run (and test) the corresponding
patterns _without_:

  `transform.structured.vectorize_children_and_apply_patterns`.

Note that the Op above only supports non-masked vectorisation (i.e. when
the inputs are static), so, effectively, only fixed-width vectorisation
(as opposed to scalable vectorisation). As such, this change is required
to construct vectorization pipelines for tensor.pad targeting scalable
vectors.

To test the new Op and the corresponding patterns, I added
"vectorization-pad-patterns.mlir" - most tests have been extracted from
"vectorization-with-patterns.mlir".
---
 .../Linalg/TransformOps/LinalgTransformOps.td |  20 ++
 .../TransformOps/LinalgTransformOps.cpp       |   5 +
 .../Linalg/Transforms/Vectorization.cpp       |   3 +
 .../Linalg/vectorization-pad-patterns.mlir    | 274 ++++++++++++++++++
 .../Linalg/vectorization-with-patterns.mlir   | 143 ---------
 5 files changed, 302 insertions(+), 143 deletions(-)
 create mode 100644 mlir/test/Dialect/Linalg/vectorization-pad-patterns.mlir

diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 040c04b0410ecf5..abf446887c54425 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -84,6 +84,26 @@ def ApplyFoldAddIntoDestPatternsOp : Op<Transform_Dialect,
   let assemblyFormat = "attr-dict";
 }
 
+def ApplyPadVectorizationPatternsOp : Op<Transform_Dialect,
+    "apply_patterns.linalg.pad_vectorization",
+    [DeclareOpInterfaceMethods<PatternDescriptorOpInterface>]> {
+  let description = [{
+    Apply patterns that vectorize tensor.pad.
+
+    These patterns rewrite tensor.pad Ops using vector.transfer_read and
+    vector.transfer_write operations. This is done either by:
+      1. Folding tensor.pad with an existing vector.transfer_read /
+      vector.transfer_write Op (generated prior to running these patterns).
+      2. Rewriting it (when matched together with a tensor.insert_slice
+      consumer Op) as a vector.transfer_read + vector.transfer_write pair.
+
+    In both cases, these patterns look at producers and consumers for the
+    matched tensor.pad Op to find opportunities for vectorization.
+  }];
+
+  let assemblyFormat = "attr-dict";
+}
+
 //===----------------------------------------------------------------------===//
 // BufferizeToAllocationOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 1f1d8ad89ae2b9b..3d3f0a93a3829bf 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -253,6 +253,11 @@ void transform::ApplyFoldAddIntoDestPatternsOp::populatePatterns(
   linalg::populateFoldAddIntoDestPatterns(patterns);
 }
 
+void transform::ApplyPadVectorizationPatternsOp::populatePatterns(
+    RewritePatternSet &patterns) {
+  linalg::populatePadOpVectorizationPatterns(patterns);
+}
+
 //===----------------------------------------------------------------------===//
 // BufferizeToAllocationOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
index e1b97fbf985df81..0a2457176a1d474 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -2712,6 +2712,9 @@ struct PadOpVectorizationWithInsertSlicePattern
 
 void mlir::linalg::populatePadOpVectorizationPatterns(
     RewritePatternSet &patterns, PatternBenefit baseBenefit) {
+  // TODO: The following pattern implements "decomposition" and
+  // optional "vectorization". Separate "decomposition" into a separate
+  // pre-processing pattern group.
   patterns.add<GenericPadOpVectorizationPattern>(patterns.getContext(),
                                                  baseBenefit);
   // Try these specialized patterns first before resorting to the generic one.
diff --git a/mlir/test/Dialect/Linalg/vectorization-pad-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization-pad-patterns.mlir
new file mode 100644
index 000000000000000..2aa4638af3f0f3b
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/vectorization-pad-patterns.mlir
@@ -0,0 +1,274 @@
+// RUN: mlir-opt %s -transform-interpreter -split-input-file | FileCheck %s
+
+///----------------------------------------------------------------------------------------
+/// [Pattern: PadOpVectorizationWithTransferReadPattern]
+///----------------------------------------------------------------------------------------
+// CHECK-LABEL: func @pad_and_transfer_read
+//  CHECK-SAME:     %[[ARG0:.*]]: tensor<5x6xf32>
+//   CHECK-NOT:   tensor.pad
+//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C5:.*]] = arith.constant 5.0
+//       CHECK:   %[[RESULT:.*]] = vector.transfer_read %[[ARG0]][%[[C0]], %[[C0]]], %[[C5]] : tensor<5x6xf32>, vector<7x9xf32>
+//       CHECK:   return %[[RESULT]]
+func.func @pad_and_transfer_read(%arg0: tensor<5x6xf32>) -> vector<7x9xf32> {
+  %c0 = arith.constant 0 : index
+  %c5 = arith.constant 5.0 : f32
+  %c6 = arith.constant 6.0 : f32
+  %0 = tensor.pad %arg0 low[0, 0] high[5, 7] {
+    ^bb0(%arg1: index, %arg2: index):
+      tensor.yield %c5 : f32
+  } : tensor<5x6xf32> to tensor<10x13xf32>
+  %1 = vector.transfer_read %0[%c0, %c0], %c6
+      : tensor<10x13xf32>, vector<7x9xf32>
+  return %1 : vector<7x9xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.op<"func.func">
+
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.linalg.pad_vectorization
+    } : !transform.op<"func.func">
+    transform.yield
+  }
+}
+
+// -----
+
+///----------------------------------------------------------------------------------------
+/// [Pattern: PadOpVectorizationWithTransferWritePattern]
+///----------------------------------------------------------------------------------------
+func.func private @make_vector() -> vector<7x9xf32>
+
+// CHECK-LABEL: func @pad_and_transfer_write_static_low_and_high
+//  CHECK-SAME:     %[[ARG0:.*]]: tensor<5x6xf32>
+//   CHECK-NOT:   tensor.pad
+//       CHECK:   %[[C0:.*]] = arith.constant 0 : index
+//       CHECK:   %[[VEC0:.*]] = call @make_vector() : () -> vector<7x9xf32>
+//       CHECK:   %[[RESULT:.*]] = vector.transfer_write %[[VEC0]], %[[ARG0]][%[[C0]], %[[C0]]] : vector<7x9xf32>, tensor<5x6xf32>
+//       CHECK:   return %[[RESULT]]
+func.func @pad_and_transfer_write_static_low_and_high(
+    %arg0: tensor<5x6xf32>) -> tensor<5x6xf32> {
+  %c0 = arith.constant 0 : index
+  %c5 = arith.constant 5.0 : f32
+  %0 = tensor.pad %arg0 low[0, 0] high[5, 7] {
+    ^bb0(%arg2: index, %arg3: index):
+      tensor.yield %c5 : f32
+  } : tensor<5x6xf32> to tensor<10x13xf32>
+  %1 = call @make_vector() : () -> vector<7x9xf32>
+  %2 = vector.transfer_write %1, %0[%c0, %c0]
+      : vector<7x9xf32>, tensor<10x13xf32>
+  %3 = tensor.extract_slice %2[0, 0] [5, 6] [1, 1] : tensor<10x13xf32> to tensor<5x6xf32>
+  return %3 : tensor<5x6xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.op<"func.func">
+
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.linalg.pad_vectorization
+    } : !transform.op<"func.func">
+    transform.yield
+  }
+}
+
+// -----
+
+func.func private @make_vector() -> vector<7x9xf32>
+
+// CHECK-LABEL: func @pad_and_transfer_write_static_low_dynamic_high
+//  CHECK-SAME:     %[[ARG0:.*]]: tensor<?x?xf32>, %[[SIZE:.*]]: index, %[[PADDING:.*]]: index
+//   CHECK-NOT:   tensor.pad
+//       CHECK:   %[[C0:.*]] = arith.constant 0 : index
+//       CHECK:   %[[SUB:.*]] = tensor.extract_slice %[[ARG0]][0, 0] [%[[SIZE]], 6] [1, 1] : tensor<?x?xf32> to tensor<?x6xf32>
+//       CHECK:   %[[VEC0:.*]] = call @make_vector() : () -> vector<7x9xf32>
+//       CHECK:   %[[RESULT:.*]] = vector.transfer_write %[[VEC0]], %[[SUB]][%[[C0]], %[[C0]]] : vector<7x9xf32>, tensor<?x6xf32>
+//       CHECK:   return %[[RESULT]]
+func.func @pad_and_transfer_write_static_low_dynamic_high(
+    %arg0: tensor<?x?xf32>, %size: index, %padding: index) -> tensor<?x6xf32> {
+  %c0 = arith.constant 0 : index
+  %c5 = arith.constant 5.0 : f32
+  %s = tensor.extract_slice %arg0[0, 0] [%size, 6] [1, 1]
+      : tensor<?x?xf32> to tensor<?x6xf32>
+  %0 = tensor.pad %s low[0, 0] high[%padding, 7] {
+    ^bb0(%arg2: index, %arg3: index):
+      tensor.yield %c5 : f32
+  } : tensor<?x6xf32> to tensor<?x13xf32>
+  %1 = call @make_vector() : () -> vector<7x9xf32>
+  %2 = vector.transfer_write %1, %0[%c0, %c0]
+      : vector<7x9xf32>, tensor<?x13xf32>
+  %3 = tensor.extract_slice %2[0, 0] [%size, 6] [1, 1] : tensor<?x13xf32> to tensor<?x6xf32>
+  return %3 : tensor<?x6xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.op<"func.func">
+
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.linalg.pad_vectorization
+    } : !transform.op<"func.func">
+    transform.yield
+  }
+}
+
+
+// -----
+
+///----------------------------------------------------------------------------------------
+/// [Pattern: PadOpVectorizationWithInsertSlicePattern]
+///----------------------------------------------------------------------------------------
+
+func.func private @make_vector() -> tensor<12x13xf32>
+
+// CHECK-LABEL: func @pad_and_insert_slice_source
+//  CHECK-SAME:     %[[ARG0:.*]]: tensor<5x6xf32>
+//   CHECK-NOT:   tensor.pad
+//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
+//   CHECK-DAG:   %[[C5:.*]] = arith.constant 5.0
+//       CHECK:   %[[VEC0:.*]] = call @make_vector() : () -> tensor<12x13xf32>
+//       CHECK:   %[[READ:.*]] = vector.transfer_read %[[ARG0]][%[[C0]], %[[C0]]], %[[C5]] : tensor<5x6xf32>, vector<7x9xf32>
+//       CHECK:   %[[WRITE:.*]] = vector.transfer_write %[[READ]], %[[VEC0]][%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<7x9xf32>, tensor<12x13xf32>
+//       CHECK:   return %[[WRITE]]
+func.func @pad_and_insert_slice_source(
+    %arg0: tensor<5x6xf32>) -> tensor<12x13xf32> {
+  %c0 = arith.constant 0 : index
+  %c5 = arith.constant 5.0 : f32
+  %0 = tensor.pad %arg0 low[0, 0] high[2, 3] {
+    ^bb0(%arg2: index, %arg3: index):
+      tensor.yield %c5 : f32
+  } : tensor<5x6xf32> to tensor<7x9xf32>
+  %1 = call @make_vector() : () -> tensor<12x13xf32>
+  %r = tensor.insert_slice %0 into %1[0, 0][7, 9][1, 1] : tensor<7x9xf32> into tensor<12x13xf32>
+  return %r : tensor<12x13xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.op<"func.func">
+
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.linalg.pad_vectorization
+    } : !transform.op<"func.func">
+    transform.yield
+  }
+}
+
+
+// -----
+
+///----------------------------------------------------------------------------------------
+/// tensor::PadOp -> tensor::EmptyOp + linalg::FillOp/tensor::GenerateOp + tensor::InsertSliceOp
+/// [Pattern: GenericPadOpVectorizationPattern]
+///----------------------------------------------------------------------------------------
+
+func.func private @make_vector() -> tensor<12x13xf32>
+
+// Same as @pad_and_insert_slice_dest in vectorization-with-patterns.mlir, but
+// over here linalg::fill is not vectorized (patterns for linalg.fill are not
+// included here)
+// CHECK-LABEL:   func.func @pad_and_insert_slice_dest(
+// CHECK-SAME:      %[[ARG_0:.*]]: tensor<1x5x6xf32>) -> tensor<1x12x13xf32> {
+//  CHECK-NOT:     tensor.pad
+//  CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+//  CHECK-DAG:     %[[PAD:.*]] = arith.constant 5.000000e+00 : f32
+//      CHECK:     %[[EMPTY:.*]] = tensor.empty() : tensor<1x12x13xf32>
+//      CHECK:     %[[FILL:.*]] = linalg.fill ins(%[[PAD]] : f32) outs(%[[EMPTY]] : tensor<1x12x13xf32>) -> tensor<1x12x13xf32>
+//      CHECK:     %[[READ:.*]] = vector.transfer_read %[[ARG_0]]{{\[}}%[[C0]], %[[C0]], %[[C0]]], %[[PAD]] {in_bounds = [true, true, true]} : tensor<1x5x6xf32>, vector<1x5x6xf32>
+//      CHECK:     %[[WRITE:.*]] = vector.transfer_write %[[READ]], %[[FILL]]{{\[}}%[[C0]], %[[C0]], %[[C0]]] {in_bounds = [true, true, true]} : vector<1x5x6xf32>, tensor<1x12x13xf32>
+//      CHECK:     %[[VEC:.*]] = call @make_vector() : () -> tensor<12x13xf32>
+//      CHECK:     %[[RES:.*]] = tensor.insert_slice %[[VEC]] into %[[WRITE]][0, 0, 0] [1, 12, 13] [1, 1, 1] : tensor<12x13xf32> into tensor<1x12x13xf32>
+//      CHECK:     return %[[RES]] : tensor<1x12x13xf32>
+
+func.func @pad_and_insert_slice_dest(
+    %arg0: tensor<1x5x6xf32>) -> tensor<1x12x13xf32> {
+  %c5 = arith.constant 5.0 : f32
+  %0 = tensor.pad %arg0 low[0, 0, 0] high[0, 7, 7] {
+    ^bb0(%arg2: index, %arg3: index, %arg4: index):
+      tensor.yield %c5 : f32
+  } : tensor<1x5x6xf32> to tensor<1x12x13xf32>
+  %1 = call @make_vector() : () -> tensor<12x13xf32>
+  %r = tensor.insert_slice %1 into %0[0, 0, 0][1, 12, 13][1, 1, 1] : tensor<12x13xf32> into tensor<1x12x13xf32>
+  return %r : tensor<1x12x13xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.op<"func.func">
+
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.linalg.pad_vectorization
+    } : !transform.op<"func.func">
+    transform.yield
+  }
+}
+
+// -----
+func.func private @make_vector() -> vector<7x9xf32>
+
+// Variant of @pad_and_transfer_write_static_low_and_high
+
+// CHECK-LABEL: func @pad_and_transfer_write_static_non_zero_low_pad
+//   CHECK-NOT:   tensor.pad
+//       CHECK:   linalg.fill
+func.func @pad_and_transfer_write_static_non_zero_low_pad(
+    %arg0: tensor<5x6xf32>) -> tensor<5x6xf32> {
+  %c0 = arith.constant 0 : index
+  %c5 = arith.constant 5.0 : f32
+  %0 = tensor.pad %arg0 low[0, 1] high[5, 6] {
+    ^bb0(%arg2: index, %arg3: index):
+      tensor.yield %c5 : f32
+  } : tensor<5x6xf32> to tensor<10x13xf32>
+  %1 = call @make_vector() : () -> vector<7x9xf32>
+  %2 = vector.transfer_write %1, %0[%c0, %c0]
+      : vector<7x9xf32>, tensor<10x13xf32>
+  %3 = tensor.extract_slice %2[0, 0] [5, 6] [1, 1] : tensor<10x13xf32> to tensor<5x6xf32>
+  return %3 : tensor<5x6xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.op<"func.func">
+
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.linalg.pad_vectorization
+    } : !transform.op<"func.func">
+    transform.yield
+  }
+}
+
+// -----
+func.func private @make_vector() -> vector<7x9xf32>
+
+// Variant of @pad_and_transfer_write_static_low_and_high
+
+// CHECK-LABEL: func @pad_and_transfer_write_static_non_zero_offset
+//   CHECK-NOT:   tensor.pad
+//       CHECK:   linalg.fill
+func.func @pad_and_transfer_write_static_non_zero_offset(
+    %arg0: tensor<5x6xf32>) -> tensor<5x6xf32> {
+  %c0 = arith.constant 0 : index
+  %c5 = arith.constant 5.0 : f32
+  %0 = tensor.pad %arg0 low[0, 1] high[5, 6] {
+    ^bb0(%arg2: index, %arg3: index):
+      tensor.yield %c5 : f32
+  } : tensor<5x6xf32> to tensor<10x13xf32>
+  %1 = call @make_vector() : () -> vector<7x9xf32>
+  %2 = vector.transfer_write %1, %0[%c0, %c0]
+      : vector<7x9xf32>, tensor<10x13xf32>
+  %3 = tensor.extract_slice %2[0, 1] [5, 6] [1, 1] : tensor<10x13xf32> to tensor<5x6xf32>
+  return %3 : tensor<5x6xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %func_op = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.op<"func.func">
+
+    transform.apply_patterns to %func_op {
+      transform.apply_patterns.linalg.pad_vectorization
+    } : !transform.op<"func.func">
+    transform.yield
+  }
+}
diff --git a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
index 1c6a786bfa436d9..189507d97d6dc2f 100644
--- a/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
+++ b/mlir/test/Dialect/Linalg/vectorization-with-patterns.mlir
@@ -935,149 +935,6 @@ module attributes {transform.with_named_sequence} {
   }
 }
 
-// -----
-
-// CHECK-LABEL: func @pad_and_transfer_read
-//  CHECK-SAME:     %[[ARG0:.*]]: tensor<5x6xf32>
-//   CHECK-NOT:   tensor.pad
-//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//   CHECK-DAG:   %[[C5:.*]] = arith.constant 5.0
-//       CHECK:   %[[RESULT:.*]] = vector.transfer_read %[[ARG0]][%[[C0]], %[[C0]]], %[[C5]] : tensor<5x6xf32>, vector<7x9xf32>
-//       CHECK:   return %[[RESULT]]
-func.func @pad_and_transfer_read(%arg0: tensor<5x6xf32>) -> vector<7x9xf32> {
-  %c0 = arith.constant 0 : index
-  %c5 = arith.constant 5.0 : f32
-  %c6 = arith.constant 6.0 : f32
-  %0 = tensor.pad %arg0 low[0, 0] high[5, 7] {
-    ^bb0(%arg1: index, %arg2: index):
-      tensor.yield %c5 : f32
-  } : tensor<5x6xf32> to tensor<10x13xf32>
-  %1 = vector.transfer_read %0[%c0, %c0], %c6
-      : tensor<10x13xf32>, vector<7x9xf32>
-  return %1 : vector<7x9xf32>
-}
-
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["tensor.pad"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %2 = transform.structured.vectorize_children_and_apply_patterns %1 { vectorize_padding } : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
-// -----
-
-func.func private @make_vector() -> vector<7x9xf32>
-
-// CHECK-LABEL: func @pad_and_transfer_write_static
-//  CHECK-SAME:     %[[ARG0:.*]]: tensor<5x6xf32>
-//   CHECK-NOT:   tensor.pad
-//       CHECK:   %[[C0:.*]] = arith.constant 0 : index
-//       CHECK:   %[[VEC0:.*]] = call @make_vector() : () -> vector<7x9xf32>
-//       CHECK:   %[[RESULT:.*]] = vector.transfer_write %[[VEC0]], %[[ARG0]][%[[C0]], %[[C0]]] : vector<7x9xf32>, tensor<5x6xf32>
-//       CHECK:   return %[[RESULT]]
-func.func @pad_and_transfer_write_static(
-    %arg0: tensor<5x6xf32>) -> tensor<5x6xf32> {
-  %c0 = arith.constant 0 : index
-  %c5 = arith.constant 5.0 : f32
-  %0 = tensor.pad %arg0 low[0, 0] high[5, 7] {
-    ^bb0(%arg2: index, %arg3: index):
-      tensor.yield %c5 : f32
-  } : tensor<5x6xf32> to tensor<10x13xf32>
-  %1 = call @make_vector() : () -> vector<7x9xf32>
-  %2 = vector.transfer_write %1, %0[%c0, %c0]
-      : vector<7x9xf32>, tensor<10x13xf32>
-  %3 = tensor.extract_slice %2[0, 0] [5, 6] [1, 1] : tensor<10x13xf32> to tensor<5x6xf32>
-  return %3 : tensor<5x6xf32>
-}
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %3 = transform.structured.match ops{["tensor.pad"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %4 = transform.get_parent_op %3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %5 = transform.structured.vectorize_children_and_apply_patterns %4  { vectorize_padding } : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
-
-// -----
-
-func.func private @make_vector() -> vector<7x9xf32>
-
-// CHECK-LABEL: func @pad_and_transfer_write_dynamic_static
-//  CHECK-SAME:     %[[ARG0:.*]]: tensor<?x?xf32>, %[[SIZE:.*]]: index, %[[PADDING:.*]]: index
-//   CHECK-NOT:   tensor.pad
-//       CHECK:   %[[C0:.*]] = arith.constant 0 : index
-//       CHECK:   %[[SUB:.*]] = tensor.extract_slice %[[ARG0]][0, 0] [%[[SIZE]], 6] [1, 1] : tensor<?x?xf32> to tensor<?x6xf32>
-//       CHECK:   %[[VEC0:.*]] = call @make_vector() : () -> vector<7x9xf32>
-//       CHECK:   %[[RESULT:.*]] = vector.transfer_write %[[VEC0]], %[[SUB]][%[[C0]], %[[C0]]] : vector<7x9xf32>, tensor<?x6xf32>
-//       CHECK:   return %[[RESULT]]
-func.func @pad_and_transfer_write_dynamic_static(
-    %arg0: tensor<?x?xf32>, %size: index, %padding: index) -> tensor<?x6xf32> {
-  %c0 = arith.constant 0 : index
-  %c5 = arith.constant 5.0 : f32
-  %s = tensor.extract_slice %arg0[0, 0] [%size, 6] [1, 1]
-      : tensor<?x?xf32> to tensor<?x6xf32>
-  %0 = tensor.pad %s low[0, 0] high[%padding, 7] {
-    ^bb0(%arg2: index, %arg3: index):
-      tensor.yield %c5 : f32
-  } : tensor<?x6xf32> to tensor<?x13xf32>
-  %1 = call @make_vector() : () -> vector<7x9xf32>
-  %2 = vector.transfer_write %1, %0[%c0, %c0]
-      : vector<7x9xf32>, tensor<?x13xf32>
-  %3 = tensor.extract_slice %2[0, 0] [%size, 6] [1, 1] : tensor<?x13xf32> to tensor<?x6xf32>
-  return %3 : tensor<?x6xf32>
-}
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %3 = transform.structured.match ops{["tensor.pad"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %4 = transform.get_parent_op %3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %5 = transform.structured.vectorize_children_and_apply_patterns %4 { vectorize_padding } : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
-
-// -----
-
-func.func private @make_vector() -> tensor<12x13xf32>
-
-// CHECK-LABEL: func @pad_and_insert_slice_source
-//  CHECK-SAME:     %[[ARG0:.*]]: tensor<5x6xf32>
-//   CHECK-NOT:   tensor.pad
-//   CHECK-DAG:   %[[C0:.*]] = arith.constant 0 : index
-//   CHECK-DAG:   %[[C5:.*]] = arith.constant 5.0
-//       CHECK:   %[[VEC0:.*]] = call @make_vector() : () -> tensor<12x13xf32>
-//       CHECK:   %[[READ:.*]] = vector.transfer_read %[[ARG0]][%[[C0]], %[[C0]]], %[[C5]] : tensor<5x6xf32>, vector<7x9xf32>
-//       CHECK:   %[[WRITE:.*]] = vector.transfer_write %[[READ]], %[[VEC0]][%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<7x9xf32>, tensor<12x13xf32>
-//       CHECK:   return %[[WRITE]]
-func.func @pad_and_insert_slice_source(
-    %arg0: tensor<5x6xf32>) -> tensor<12x13xf32> {
-  %c0 = arith.constant 0 : index
-  %c5 = arith.constant 5.0 : f32
-  %0 = tensor.pad %arg0 low[0, 0] high[2, 3] {
-    ^bb0(%arg2: index, %arg3: index):
-      tensor.yield %c5 : f32
-  } : tensor<5x6xf32> to tensor<7x9xf32>
-  %1 = call @make_vector() : () -> tensor<12x13xf32>
-  %r = tensor.insert_slice %0 into %1[0, 0][7, 9][1, 1] : tensor<7x9xf32> into tensor<12x13xf32>
-  return %r : tensor<12x13xf32>
-}
-
-module attributes {transform.with_named_sequence} {
-  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
-    %3 = transform.structured.match ops{["tensor.pad"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %4 = transform.get_parent_op %3 {isolated_from_above} : (!transform.any_op) -> !transform.any_op
-    %5 = transform.structured.vectorize_children_and_apply_patterns %4  { vectorize_padding } : (!transform.any_op) -> !transform.any_op
-    transform.yield
-  }
-}
-
-
 // -----
 
 func.func private @make_vector() -> tensor<12x13xf32>

From 14db06946839729befd6bd3ced8142547f5fd139 Mon Sep 17 00:00:00 2001
From: ssijaric-nv <ssijaric@nvidia.com>
Date: Fri, 25 Oct 2024 10:47:39 -0700
Subject: [PATCH 22/39] [InstCombine] Fix a cycle when folding fneg(select)
 with scalable vector types (#112465)

The two folding operations are causing a cycle for the following case with
scalable vector types:

define <vscale x 2 x double> @test_fneg_select_abs(<vscale x 2 x i1>
%cond, <vscale x 2 x double> %b) {
%1 = select <vscale x 2 x i1> %cond, <vscale x 2 x double>
zeroinitializer, <vscale x 2 x double> %b
  %2 = fneg fast <vscale x 2 x double> %1
  ret <vscale x 2 x double> %2
}

1) fold fneg:  -(Cond ? C : Y) -> Cond ? -C : -Y

2) fold select: (Cond ? -X : -Y) -> -(Cond ? X : Y)

1) results in the following since '<vscale x 2 x double>
zeroinitializer' passes
the check for the immediate constant:

%.neg = fneg fast <vscale x 2 x double> zeroinitializer
%b.neg = fneg fast <vscale x 2 x double> %b
%1 = select fast <vscale x 2 x i1> %cond, <vscale x 2 x double> %.neg,
<vscale x 2 x double> %b.neg

and so we end up going back and forth between 1) and 2).

Attempt to fold scalable vector constants, so that we end up with a
splat instead:

define <vscale x 2 x double> @test_fneg_select_abs(<vscale x 2 x i1>
%cond, <vscale x 2 x double> %b) {
  %b.neg = fneg fast <vscale x 2 x double> %b
%1 = select fast <vscale x 2 x i1> %cond, <vscale x 2 x double>
shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x
double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double>
poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x double>
%b.neg
  ret <vscale x 2 x double> %1
}
---
 llvm/lib/IR/ConstantFold.cpp                | 29 ++++++++++---------
 llvm/test/Transforms/InstCombine/fneg.ll    | 32 +++++++++++++++++++++
 llvm/test/Transforms/InstSimplify/fp-nan.ll |  6 ++--
 3 files changed, 49 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index 57d9a03c9c22b83..07dfbc41e79b005 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -581,26 +581,27 @@ Constant *llvm::ConstantFoldUnaryInstruction(unsigned Opcode, Constant *C) {
     case Instruction::FNeg:
       return ConstantFP::get(C->getContext(), neg(CV));
     }
-  } else if (auto *VTy = dyn_cast<FixedVectorType>(C->getType())) {
-
-    Type *Ty = IntegerType::get(VTy->getContext(), 32);
+  } else if (auto *VTy = dyn_cast<VectorType>(C->getType())) {
     // Fast path for splatted constants.
     if (Constant *Splat = C->getSplatValue())
       if (Constant *Elt = ConstantFoldUnaryInstruction(Opcode, Splat))
         return ConstantVector::getSplat(VTy->getElementCount(), Elt);
 
-    // Fold each element and create a vector constant from those constants.
-    SmallVector<Constant *, 16> Result;
-    for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
-      Constant *ExtractIdx = ConstantInt::get(Ty, i);
-      Constant *Elt = ConstantExpr::getExtractElement(C, ExtractIdx);
-      Constant *Res = ConstantFoldUnaryInstruction(Opcode, Elt);
-      if (!Res)
-        return nullptr;
-      Result.push_back(Res);
-    }
+    if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) {
+      // Fold each element and create a vector constant from those constants.
+      Type *Ty = IntegerType::get(FVTy->getContext(), 32);
+      SmallVector<Constant *, 16> Result;
+      for (unsigned i = 0, e = FVTy->getNumElements(); i != e; ++i) {
+        Constant *ExtractIdx = ConstantInt::get(Ty, i);
+        Constant *Elt = ConstantExpr::getExtractElement(C, ExtractIdx);
+        Constant *Res = ConstantFoldUnaryInstruction(Opcode, Elt);
+        if (!Res)
+          return nullptr;
+        Result.push_back(Res);
+      }
 
-    return ConstantVector::get(Result);
+      return ConstantVector::get(Result);
+    }
   }
 
   // We don't know how to fold this.
diff --git a/llvm/test/Transforms/InstCombine/fneg.ll b/llvm/test/Transforms/InstCombine/fneg.ll
index 3c4088832feaaa6..6a9b3309bb347ec 100644
--- a/llvm/test/Transforms/InstCombine/fneg.ll
+++ b/llvm/test/Transforms/InstCombine/fneg.ll
@@ -1109,4 +1109,36 @@ define float @test_fneg_select_maxnum(float %x) {
   ret float %neg
 }
 
+; Check that there's no infinite loop.
+define <vscale x 2 x double> @test_fneg_select_svec(<vscale x 2 x i1> %cond, <vscale x 2 x double> %b) {
+; CHECK-LABEL: @test_fneg_select_svec(
+; CHECK-NEXT:    [[TMP2:%.*]] = fneg fast <vscale x 2 x double> [[TMP1:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = select fast <vscale x 2 x i1> [[COND:%.*]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x double> [[TMP2]]
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP3]]
+;
+  %1 = select <vscale x 2 x i1> %cond, <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> %b
+  %2 = fneg fast <vscale x 2 x double> %1
+  ret <vscale x 2 x double> %2
+}
+
+define <vscale x 2 x double> @test_fneg_select_svec_2(<vscale x 2 x i1> %cond, <vscale x 2 x double> %a) {
+; CHECK-LABEL: @test_fneg_select_svec_2(
+; CHECK-NEXT:    [[A_NEG:%.*]] = fneg fast <vscale x 2 x double> [[A:%.*]]
+; CHECK-NEXT:    [[TMP1:%.*]] = select fast <vscale x 2 x i1> [[COND:%.*]], <vscale x 2 x double> [[A_NEG]], <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT:    ret <vscale x 2 x double> [[TMP1]]
+;
+  %1 = select <vscale x 2 x i1> %cond, <vscale x 2 x double> %a, <vscale x 2 x double> zeroinitializer
+  %2 = fneg fast <vscale x 2 x double> %1
+  ret <vscale x 2 x double> %2
+}
+
+define <vscale x 2 x double> @test_fneg_select_svec_3(<vscale x 2 x i1> %cond, <vscale x 2 x double> %b) {
+; CHECK-LABEL: @test_fneg_select_svec_3(
+; CHECK-NEXT:    ret <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double -0.000000e+00, i64 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
+;
+  %1 = select <vscale x 2 x i1> %cond, <vscale x 2 x double> zeroinitializer, <vscale x 2 x double> zeroinitializer
+  %2 = fneg fast <vscale x 2 x double> %1
+  ret <vscale x 2 x double> %2
+}
+
 !0 = !{}
diff --git a/llvm/test/Transforms/InstSimplify/fp-nan.ll b/llvm/test/Transforms/InstSimplify/fp-nan.ll
index bb557500822c143..06b23200bafff81 100644
--- a/llvm/test/Transforms/InstSimplify/fp-nan.ll
+++ b/llvm/test/Transforms/InstSimplify/fp-nan.ll
@@ -237,8 +237,7 @@ define <2 x double> @unary_fneg_nan_2(<2 x double> %x) {
 ; FIXME: This doesn't behave the same way as the fixed-length vectors above
 define <vscale x 1 x double> @unary_fneg_nan_2_scalable_vec_0() {
 ; CHECK-LABEL: @unary_fneg_nan_2_scalable_vec_0(
-; CHECK-NEXT:    [[R:%.*]] = fneg <vscale x 1 x double> shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double 0xFFF1234567890ABC, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <vscale x 1 x double> [[R]]
+; CHECK-NEXT:    ret <vscale x 1 x double> shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double 0x7FF1234567890ABC, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer)
 ;
   %r = fneg <vscale x 1 x double> splat (double 0xFFF1234567890ABC)
   ret <vscale x 1 x double> %r
@@ -247,8 +246,7 @@ define <vscale x 1 x double> @unary_fneg_nan_2_scalable_vec_0() {
 ; FIXME: This doesn't behave the same way as the fixed-length vectors above
 define <vscale x 1 x double> @unary_fneg_nan_2_scalable_vec_1() {
 ; CHECK-LABEL: @unary_fneg_nan_2_scalable_vec_1(
-; CHECK-NEXT:    [[R:%.*]] = fneg <vscale x 1 x double> shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double 0x7FF0000000000001, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <vscale x 1 x double> [[R]]
+; CHECK-NEXT:    ret <vscale x 1 x double> shufflevector (<vscale x 1 x double> insertelement (<vscale x 1 x double> poison, double 0xFFF0000000000001, i64 0), <vscale x 1 x double> poison, <vscale x 1 x i32> zeroinitializer)
 ;
   %r = fneg <vscale x 1 x double> splat (double 0x7FF0000000000001)
   ret <vscale x 1 x double> %r

From 843c2fbe7f983c2a2059f753e4494f06fb645a9e Mon Sep 17 00:00:00 2001
From: Kiran Chandramohan <kiran.chandramohan@arm.com>
Date: Fri, 25 Oct 2024 18:57:01 +0100
Subject: [PATCH 23/39] Add parser+semantics support for scope construct
 (#113700)

Test parsing, semantics and a couple of basic semantic checks for
block/worksharing constructs.
Add TODO message in lowering.
---
 .../flang/Semantics/openmp-directive-sets.h   |  2 ++
 flang/lib/Lower/OpenMP/OpenMP.cpp             | 12 ++++++++++
 flang/lib/Parser/openmp-parsers.cpp           |  1 +
 flang/lib/Parser/unparse.cpp                  |  3 +++
 flang/lib/Semantics/check-omp-structure.cpp   |  7 +++++-
 flang/lib/Semantics/resolve-directives.cpp    |  2 ++
 flang/test/Lower/OpenMP/Todo/scope.f90        | 13 ++++++++++
 flang/test/Parser/OpenMP/scope.f90            | 24 +++++++++++++++++++
 .../test/Semantics/OpenMP/invalid-branch.f90  |  8 +++++++
 flang/test/Semantics/OpenMP/nested01.f90      |  7 ++++++
 llvm/include/llvm/Frontend/OpenMP/OMP.td      | 10 +++++++-
 11 files changed, 87 insertions(+), 2 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/Todo/scope.f90
 create mode 100644 flang/test/Parser/OpenMP/scope.f90

diff --git a/flang/include/flang/Semantics/openmp-directive-sets.h b/flang/include/flang/Semantics/openmp-directive-sets.h
index 8eb736bb098fe4e..50d6d5b59ef7dd9 100644
--- a/flang/include/flang/Semantics/openmp-directive-sets.h
+++ b/flang/include/flang/Semantics/openmp-directive-sets.h
@@ -211,6 +211,7 @@ static const OmpDirectiveSet blockConstructSet{
     Directive::OMPD_parallel,
     Directive::OMPD_parallel_masked,
     Directive::OMPD_parallel_workshare,
+    Directive::OMPD_scope,
     Directive::OMPD_single,
     Directive::OMPD_target,
     Directive::OMPD_target_data,
@@ -281,6 +282,7 @@ static const OmpDirectiveSet workShareSet{
         Directive::OMPD_workshare,
         Directive::OMPD_parallel_workshare,
         Directive::OMPD_parallel_sections,
+        Directive::OMPD_scope,
         Directive::OMPD_sections,
         Directive::OMPD_single,
     } | allDoSet,
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index fc54da8babe63e9..01a40d6e2204ef2 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1650,6 +1650,15 @@ genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
   return sectionsOp;
 }
 
+static void genScopeOp(lower::AbstractConverter &converter,
+                       lower::SymMap &symTable,
+                       semantics::SemanticsContext &semaCtx,
+                       lower::pft::Evaluation &eval, mlir::Location loc,
+                       const ConstructQueue &queue,
+                       ConstructQueue::const_iterator item) {
+  TODO(loc, "Scope construct");
+}
+
 static mlir::omp::SingleOp
 genSingleOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
             semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
@@ -2478,6 +2487,9 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
   case llvm::omp::Directive::OMPD_simd:
     genStandaloneSimd(converter, symTable, semaCtx, eval, loc, queue, item);
     break;
+  case llvm::omp::Directive::OMPD_scope:
+    genScopeOp(converter, symTable, semaCtx, eval, loc, queue, item);
+    break;
   case llvm::omp::Directive::OMPD_single:
     genSingleOp(converter, symTable, semaCtx, eval, loc, queue, item);
     break;
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index 59a8757e58e8cc4..e740c421ca80276 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -697,6 +697,7 @@ TYPE_PARSER(construct<OmpBlockDirective>(first(
     "PARALLEL MASKED" >> pure(llvm::omp::Directive::OMPD_parallel_masked),
     "PARALLEL WORKSHARE" >> pure(llvm::omp::Directive::OMPD_parallel_workshare),
     "PARALLEL" >> pure(llvm::omp::Directive::OMPD_parallel),
+    "SCOPE" >> pure(llvm::omp::Directive::OMPD_scope),
     "SINGLE" >> pure(llvm::omp::Directive::OMPD_single),
     "TARGET DATA" >> pure(llvm::omp::Directive::OMPD_target_data),
     "TARGET PARALLEL" >> pure(llvm::omp::Directive::OMPD_target_parallel),
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index 04df988223e8f8d..19ceb2a3ebc3178 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -2386,6 +2386,9 @@ class UnparseVisitor {
     case llvm::omp::Directive::OMPD_parallel:
       Word("PARALLEL ");
       break;
+    case llvm::omp::Directive::OMPD_scope:
+      Word("SCOPE ");
+      break;
     case llvm::omp::Directive::OMPD_single:
       Word("SINGLE ");
       break;
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 46486907ceb9e1f..1c2cf304d0ee95f 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -972,6 +972,7 @@ void OmpStructureChecker::Enter(const parser::OpenMPBlockConstruct &x) {
     HasInvalidWorksharingNesting(
         beginDir.source, llvm::omp::nestedWorkshareErrSet);
     break;
+  case llvm::omp::Directive::OMPD_scope:
   case llvm::omp::Directive::OMPD_single:
     // TODO: This check needs to be extended while implementing nesting of
     // regions checks.
@@ -1864,6 +1865,9 @@ void OmpStructureChecker::Enter(const parser::OmpEndBlockDirective &x) {
   const auto &dir{std::get<parser::OmpBlockDirective>(x.t)};
   ResetPartialContext(dir.source);
   switch (dir.v) {
+  case llvm::omp::Directive::OMPD_scope:
+    PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_end_scope);
+    break;
   // 2.7.3 end-single-clause -> copyprivate-clause |
   //                            nowait-clause
   case llvm::omp::Directive::OMPD_single:
@@ -1886,7 +1890,8 @@ void OmpStructureChecker::Enter(const parser::OmpEndBlockDirective &x) {
 // end_workshareare popped as they are pushed while entering the
 // EndBlockDirective.
 void OmpStructureChecker::Leave(const parser::OmpEndBlockDirective &x) {
-  if ((GetContext().directive == llvm::omp::Directive::OMPD_end_single) ||
+  if ((GetContext().directive == llvm::omp::Directive::OMPD_end_scope) ||
+      (GetContext().directive == llvm::omp::Directive::OMPD_end_single) ||
       (GetContext().directive == llvm::omp::Directive::OMPD_end_workshare)) {
     dirContext_.pop_back();
   }
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 33936ba4c2b34f1..513e42bee976a9a 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -1526,6 +1526,7 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPBlockConstruct &x) {
   case llvm::omp::Directive::OMPD_master:
   case llvm::omp::Directive::OMPD_ordered:
   case llvm::omp::Directive::OMPD_parallel:
+  case llvm::omp::Directive::OMPD_scope:
   case llvm::omp::Directive::OMPD_single:
   case llvm::omp::Directive::OMPD_target:
   case llvm::omp::Directive::OMPD_target_data:
@@ -1557,6 +1558,7 @@ void OmpAttributeVisitor::Post(const parser::OpenMPBlockConstruct &x) {
   case llvm::omp::Directive::OMPD_masked:
   case llvm::omp::Directive::OMPD_parallel_masked:
   case llvm::omp::Directive::OMPD_parallel:
+  case llvm::omp::Directive::OMPD_scope:
   case llvm::omp::Directive::OMPD_single:
   case llvm::omp::Directive::OMPD_target:
   case llvm::omp::Directive::OMPD_task:
diff --git a/flang/test/Lower/OpenMP/Todo/scope.f90 b/flang/test/Lower/OpenMP/Todo/scope.f90
new file mode 100644
index 000000000000000..16a067dc8f256be
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/scope.f90
@@ -0,0 +1,13 @@
+! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s -fopenmp-version=51 2>&1 | FileCheck %s
+! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=51 2>&1 | FileCheck %s
+
+! CHECK: not yet implemented: Scope construct
+program omp_scope
+  integer i
+  i = 10
+
+  !$omp scope private(i)
+  print *, "omp scope", i
+  !$omp end scope
+
+end program omp_scope
diff --git a/flang/test/Parser/OpenMP/scope.f90 b/flang/test/Parser/OpenMP/scope.f90
new file mode 100644
index 000000000000000..6574136311e7187
--- /dev/null
+++ b/flang/test/Parser/OpenMP/scope.f90
@@ -0,0 +1,24 @@
+! RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=51 %s | FileCheck --ignore-case %s
+! RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=51 %s | FileCheck --check-prefix="PARSE-TREE" %s
+
+program omp_scope
+  integer i
+  i = 10
+
+!CHECK: !$OMP SCOPE  PRIVATE(i)
+!CHECK: !$OMP END SCOPE
+
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPBlockConstruct
+!PARSE-TREE: OmpBeginBlockDirective
+!PARSE-TREE: OmpBlockDirective -> llvm::omp::Directive = scope
+!PARSE-TREE: OmpClauseList -> OmpClause -> Private -> OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'i'
+!PARSE-TREE: Block
+!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> PrintStmt
+!PARSE-TREE: OmpEndBlockDirective
+!PARSE-TREE: OmpBlockDirective -> llvm::omp::Directive = scope
+!PARSE-TREE: OmpClauseList -> OmpClause -> Nowait
+
+  !$omp scope private(i)
+  print *, "omp scope", i
+  !$omp end scope nowait
+end program omp_scope
diff --git a/flang/test/Semantics/OpenMP/invalid-branch.f90 b/flang/test/Semantics/OpenMP/invalid-branch.f90
index ed9e4d268f65a8c..28aab8b122f3f2c 100644
--- a/flang/test/Semantics/OpenMP/invalid-branch.f90
+++ b/flang/test/Semantics/OpenMP/invalid-branch.f90
@@ -105,4 +105,12 @@ program omp_invalid_branch
   !$omp end parallel
   9 print *, "2nd alternate return"
 
+  !CHECK: invalid branch into an OpenMP structured block
+  goto 100
+  !$omp scope
+    100 continue
+    !CHECK: invalid branch leaving an OpenMP structured block
+    goto 200
+  !$omp end scope
+  200 continue
 end program
diff --git a/flang/test/Semantics/OpenMP/nested01.f90 b/flang/test/Semantics/OpenMP/nested01.f90
index 49c964ab86aa6bd..0936e4c1b45a5db 100644
--- a/flang/test/Semantics/OpenMP/nested01.f90
+++ b/flang/test/Semantics/OpenMP/nested01.f90
@@ -25,6 +25,13 @@
    !$omp end target
   enddo
   
+  !$omp do
+  do i = 1, N
+     !ERROR: A worksharing region may not be closely nested inside a worksharing, explicit task, taskloop, critical, ordered, atomic, or master region
+     !$omp scope
+     !$omp end scope
+  end do
+  !$omp end do
 
   !$omp do
   do i = 1, N
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 1834ad4d037f3d9..d592f369a17f92c 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -892,7 +892,7 @@ def OMP_Scan : Directive<"scan"> {
   let association = AS_Separating;
   let category = CA_Subsidiary;
 }
-def OMP_scope : Directive<"scope"> {
+def OMP_Scope : Directive<"scope"> {
   let allowedClauses = [
     VersionedClause<OMPC_Private, 51>,
     VersionedClause<OMPC_Reduction, 51>,
@@ -905,6 +905,14 @@ def OMP_scope : Directive<"scope"> {
   let association = AS_Block;
   let category = CA_Executable;
 }
+def OMP_EndScope : Directive<"end scope"> {
+  let allowedOnceClauses = [
+    VersionedClause<OMPC_NoWait>,
+  ];
+  let leafConstructs = OMP_Scope.leafConstructs;
+  let association = OMP_Scope.association;
+  let category = OMP_Scope.category;
+}
 def OMP_Section : Directive<"section"> {
   let association = AS_Separating;
   let category = CA_Subsidiary;

From 144ddca9ed6a439ad8a421c3ff2ea763532341ba Mon Sep 17 00:00:00 2001
From: Teresa Johnson <tejohnson@google.com>
Date: Fri, 25 Oct 2024 11:09:57 -0700
Subject: [PATCH 24/39] [MemProf] Avoid duplicate edges between nodes (#113337)

The recent change to add support for cloning indirect calls
inadvertently caused duplicate edges to be created between the same
caller/callee pair. This is due to the new moveCalleeEdgeToNewCaller
not properly guarding the addition of a new edge (ironically I was
testing for that in an assertion, but failed to handle that case
specially otherwise). Now simply move the context ids over to any
existing edge.

This issue in turn led to some assumptions in cloning being violated,
resulting in a later crash.

Add a test for this case to checkNode.
---
 .../IPO/MemProfContextDisambiguation.cpp      | 21 +++++++++++-
 llvm/test/ThinLTO/X86/memprof-icp.ll          | 34 +++++++++++++++----
 2 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index 4efd683dfca3633..905186edcbecc40 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -1352,6 +1352,17 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
     }
     assert(NodeContextIds == CalleeEdgeContextIds);
   }
+  // FIXME: Since this checking is only invoked under an option, we should
+  // change the error checking from using assert to something that will trigger
+  // an error on a release build.
+#ifndef NDEBUG
+  // Make sure we don't end up with duplicate edges between the same caller and
+  // callee.
+  DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
+  for (const auto &E : Node->CalleeEdges)
+    NodeSet.insert(E->Callee);
+  assert(NodeSet.size() == Node->CalleeEdges.size());
+#endif
 }
 
 template <typename DerivedCCG, typename FuncTy, typename CallTy>
@@ -3125,7 +3136,15 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
     // from the same callers as the old node. That should be true in the current
     // use case, where we will remove None-type edges after copying over all
     // caller edges from the callee.
-    assert(IsNewNode || NewCaller->findEdgeFromCaller(OldCallerEdge->Caller));
+    auto *ExistingCallerEdge =
+        NewCaller->findEdgeFromCaller(OldCallerEdge->Caller);
+    assert(IsNewNode || ExistingCallerEdge);
+    if (ExistingCallerEdge) {
+      ExistingCallerEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
+                                                 EdgeContextIdsToMove.end());
+      ExistingCallerEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
+      continue;
+    }
     auto NewEdge = std::make_shared<ContextEdge>(
         NewCaller, OldCallerEdge->Caller,
         computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
diff --git a/llvm/test/ThinLTO/X86/memprof-icp.ll b/llvm/test/ThinLTO/X86/memprof-icp.ll
index f17e19e1f77ef25..99e071898765567 100644
--- a/llvm/test/ThinLTO/X86/memprof-icp.ll
+++ b/llvm/test/ThinLTO/X86/memprof-icp.ll
@@ -186,9 +186,13 @@
 ; REMARKS-MAIN: created clone _ZN2B03barEj.memprof.1
 ; REMARKS-MAIN: call in clone _ZN2B03barEj marked with memprof allocation attribute notcold
 ; REMARKS-MAIN: call in clone _ZN2B03barEj.memprof.1 marked with memprof allocation attribute cold
+; REMARKS-MAIN: call in clone _ZN2B03barEj marked with memprof allocation attribute notcold
+; REMARKS-MAIN: call in clone _ZN2B03barEj.memprof.1 marked with memprof allocation attribute cold
 ; REMARKS-MAIN: created clone _ZN1B3barEj.memprof.1
 ; REMARKS-MAIN: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold
 ; REMARKS-MAIN: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold
+; REMARKS-MAIN: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold
+; REMARKS-MAIN: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold
 ; REMARKS-FOO: created clone _Z3fooR2B0j.memprof.1
 ;; In each version of foo we should have promoted the indirect call to two conditional
 ;; direct calls, one to B::bar and one to B0::bar. The cloned version of foo should call
@@ -208,10 +212,10 @@
 ; REMARKS-FOO: call in clone _ZN1B3barEj marked with memprof allocation attribute notcold
 ; REMARKS-FOO: call in clone _ZN1B3barEj.memprof.1 marked with memprof allocation attribute cold
 
-; STATS: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during whole program analysis
-; STATS-BE: 4 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
-; STATS: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during whole program analysis
-; STATS-BE: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
+; STATS: 4 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during whole program analysis
+; STATS-BE: 8 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
+; STATS: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during whole program analysis
+; STATS-BE: 8 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
 ; STATS: 3 memprof-context-disambiguation - Number of function clones created during whole program analysis
 ; STATS-BE: 5 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
 
@@ -247,8 +251,8 @@
 ; IR: attributes #[[NOTCOLD]] = {{.*}} "memprof"="notcold"
 ; IR: attributes #[[COLD]] = {{.*}} "memprof"="cold"
 
-; STATS-BE-DISTRIB: 2 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
-; STATS-BE-DISTRIB: 2 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
+; STATS-BE-DISTRIB: 4 memprof-context-disambiguation - Number of cold static allocations (possibly cloned) during ThinLTO backend
+; STATS-BE-DISTRIB: 4 memprof-context-disambiguation - Number of not cold static allocations (possibly cloned) during ThinLTO backend
 ; STATS-BE-DISTRIB: 3 memprof-context-disambiguation - Number of function clones created during ThinLTO backend
 
 ;--- foo.ll
@@ -298,6 +302,9 @@ declare i32 @_Z3fooR2B0j(ptr, i32)
 define i32 @_ZN2B03barEj(ptr %this, i32 %s) {
 entry:
   %call = tail call ptr @_Znwm(i64 noundef 4) #0, !memprof !33, !callsite !38
+  ;; Second allocation in this function, to ensure that indirect edges to the
+  ;; same callee are partitioned correctly.
+  %call2 = tail call ptr @_Znwm(i64 noundef 4) #0, !memprof !45, !callsite !50
   store volatile i32 0, ptr %call, align 4
   ret i32 0
 }
@@ -311,6 +318,9 @@ declare void @_ZdlPvm()
 define i32 @_ZN1B3barEj(ptr %this, i32 %s) {
 entry:
   %call = tail call ptr @_Znwm(i64 noundef 4) #0, !memprof !39, !callsite !44
+  ;; Second allocation in this function, to ensure that indirect edges to the
+  ;; same callee are partitioned correctly.
+  %call2 = tail call ptr @_Znwm(i64 noundef 4) #0, !memprof !51, !callsite !56
   store volatile i32 0, ptr %call, align 4
   ret i32 0
 }
@@ -367,3 +377,15 @@ attributes #0 = { builtin allocsize(0) }
 !42 = !{!43, !"cold"}
 !43 = !{i64 4457553070050523782, i64 -2101080423462424381, i64 -6490791336773930154}
 !44 = !{i64 4457553070050523782}
+!45 = !{!46, !48}
+!46 = !{!47, !"notcold"}
+!47 = !{i64 456, i64 -2101080423462424381, i64 5188446645037944434}
+!48 = !{!49, !"cold"}
+!49 = !{i64 456, i64 -2101080423462424381, i64 5583420417449503557}
+!50 = !{i64 456}
+!51 = !{!52, !54}
+!52 = !{!53, !"notcold"}
+!53 = !{i64 789, i64 -2101080423462424381, i64 132626519179914298}
+!54 = !{!55, !"cold"}
+!55 = !{i64 789, i64 -2101080423462424381, i64 -6490791336773930154}
+!56 = !{i64 789}

From f4db221258cb44a8f9804ce852c0403328de39b2 Mon Sep 17 00:00:00 2001
From: "Stephan T. Lavavej" <stl@nuwen.net>
Date: Fri, 25 Oct 2024 11:12:41 -0700
Subject: [PATCH 25/39] [libc++][test] Use
 `ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings)` for `-Wno-psabi` (#113608)

MSVC doesn't understand `-Wno-psabi`, which was introduced here by
@ldionne in #106077.

Using `ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings)` (implemented by
#75317) avoids passing this to MSVC.
---
 .../std/atomics/atomics.lockfree/is_always_lock_free.pass.cpp   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.pass.cpp b/libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.pass.cpp
index 5130758d5efd52d..abb12d6a3c24730 100644
--- a/libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.lockfree/is_always_lock_free.pass.cpp
@@ -18,7 +18,7 @@
 
 // Ignore diagnostic about vector types changing the ABI on some targets, since
 // that is irrelevant for this test.
-// ADDITIONAL_COMPILE_FLAGS: -Wno-psabi
+// ADDITIONAL_COMPILE_FLAGS(gcc-style-warnings): -Wno-psabi
 
 #include <atomic>
 #include <cassert>

From a0c318938a528cfbef509a2516b36dd2411a52b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ga=C3=ABtan=20Bossu?=
 <41161573+gbossu@users.noreply.github.com>
Date: Fri, 25 Oct 2024 20:19:22 +0200
Subject: [PATCH 26/39] [CodeGen][NFC] Properly split MachineLICM and
 EarlyMachineLICM (#113573)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both are based on MachineLICMBase, and the functionality there is
"switched" based on a PreRegAlloc flag. This commit is simply about
trusting the original value of that flag, defined by the `MachineLICM`
and `EarlyMachineLICM` classes.

The `PreRegAlloc` flag used to be overwritten based on MRI.isSSA(),
which is unreliable due to how it is inferred by the MIRParser. I see
that we can now define isSSA in MIR (thanks @gargaroff ), meaning the
fix isn’t really needed anymore, but redefining that flag still feels
wrong.

Note that I'm looking into upstreaming more changes to MachineLICM, see
[the discourse
thread](https://discourse.llvm.org/t/extending-post-regalloc-machinelicm/82725).
---
 llvm/lib/CodeGen/MachineLICM.cpp                         | 6 ------
 llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir    | 9 +--------
 llvm/test/CodeGen/AMDGPU/licm-regpressure.mir            | 4 ++--
 llvm/test/CodeGen/AMDGPU/licm-valu.mir                   | 4 ++--
 llvm/test/CodeGen/X86/unfoldMemoryOperand.mir            | 2 +-
 llvm/test/DebugInfo/MIR/X86/mlicm-hoist-pre-regalloc.mir | 4 ++--
 6 files changed, 8 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 793ad75759ccb86..7ea07862b839d02 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -391,12 +391,6 @@ bool MachineLICMImpl::run(MachineFunction &MF) {
   MRI = &MF.getRegInfo();
   SchedModel.init(&ST);
 
-  // FIXME: Remove this assignment or convert to an assert? (dead variable PreRegAlloc)
-  // MachineLICM and PostRAMachineLICM were distinguished by introducing
-  // EarlyMachineLICM and MachineLICM respectively to avoid "using an unreliable
-  // MRI::isSSA() check to determine whether register allocation has happened"
-  // (See 4a7c8e7).
-  PreRegAlloc = MRI->isSSA();
   HasProfileData = MF.getFunction().hasProfileData();
 
   if (PreRegAlloc)
diff --git a/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir b/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir
index 406025c4fde3022..90ff68d30a3a0e5 100644
--- a/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir
+++ b/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir
@@ -3,9 +3,6 @@
 ---
 name: test
 tracksRegLiveness: true
-isSSA: false
-registers:
-  - { id: 0, class: gpr64 }
 stack:
   - { id: 0, size: 8, type: spill-slot }
 body: |
@@ -30,14 +27,11 @@ body: |
 
   bb.2:
     liveins: $x0
-    %0 = COPY $x0
 ...
+
 ---
 name: test2
 tracksRegLiveness: true
-isSSA: false
-registers:
-  - { id: 0, class: gpr64 }
 stack:
   - { id: 0, size: 8, type: spill-slot }
 body: |
@@ -62,5 +56,4 @@ body: |
 
   bb.2:
     liveins: $x0
-    %0 = COPY $x0
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir
index e63009fdcb43cf2..dd478f94e1039ec 100644
--- a/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir
+++ b/llvm/test/CodeGen/AMDGPU/licm-regpressure.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass machinelicm -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes machinelicm -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass early-machinelicm -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes early-machinelicm -o - %s | FileCheck -check-prefix=GCN %s
 
 # MachineLICM shall limit hoisting of V_CVT instructions out of the loop keeping
 # register pressure within the budget. VGPR budget at occupancy 10 is 24 vgprs.
diff --git a/llvm/test/CodeGen/AMDGPU/licm-valu.mir b/llvm/test/CodeGen/AMDGPU/licm-valu.mir
index b4f5e057f532b51..6a28eee19d503cf 100644
--- a/llvm/test/CodeGen/AMDGPU/licm-valu.mir
+++ b/llvm/test/CodeGen/AMDGPU/licm-valu.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=machinelicm -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=machinelicm -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=early-machinelicm -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=early-machinelicm -o - %s | FileCheck -check-prefix=GCN %s
 
 ---
 name: hoist_move
diff --git a/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir b/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir
index ff3d9ca378dbd52..135b14d6836a090 100644
--- a/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir
+++ b/llvm/test/CodeGen/X86/unfoldMemoryOperand.mir
@@ -1,6 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=x86_64-- -passes machinelicm -mcpu=skx -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=x86_64-- -passes early-machinelicm -mcpu=skx -verify-machineinstrs -o - %s | FileCheck %s
 --- |
   @x = dso_local global i32 0, align 4
   @z = dso_local local_unnamed_addr global [1024 x i32] zeroinitializer, align 16
diff --git a/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-pre-regalloc.mir b/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-pre-regalloc.mir
index d4d59e14724ebe7..b65a0e71af1dd2d 100644
--- a/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-pre-regalloc.mir
+++ b/llvm/test/DebugInfo/MIR/X86/mlicm-hoist-pre-regalloc.mir
@@ -1,6 +1,6 @@
 --- | 
-  ; RUN: llc -run-pass=machinelicm -o - %s | FileCheck %s
-  ; RUN: llc -passes=machinelicm -o - %s | FileCheck %s
+  ; RUN: llc -run-pass=early-machinelicm -o - %s | FileCheck %s
+  ; RUN: llc -passes=early-machinelicm -o - %s | FileCheck %s
   ; Line numbers should not be retained when loop invariant instructions are hoisted.
   ; Doing so causes poor stepping bevavior.
   ;

From eccdb2489483ca58d2cb35bc38967a8e33117575 Mon Sep 17 00:00:00 2001
From: Ye Luo <yeluo@anl.gov>
Date: Fri, 25 Oct 2024 13:19:58 -0500
Subject: [PATCH 27/39] [OpenMP] Create versioned libgomp softlinks (#112973)

Add libgomp.1.dylib for MacOS and libgomp.so.1 for Linux

Linkers on Mac and Linux pick up versioned libgomp dynamic library
files. The existing softlinks (libgomp.dylib for MacOS and libgomp.so
for Linux) are insufficient. This helps alleviate the issue of mixing
libgomp and libomp at runtime.
---
 openmp/runtime/src/CMakeLists.txt | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt
index 439cc20963a1298..61c0bacc9f20629 100644
--- a/openmp/runtime/src/CMakeLists.txt
+++ b/openmp/runtime/src/CMakeLists.txt
@@ -253,6 +253,17 @@ if(NOT WIN32)
       libiomp5${LIBOMP_LIBRARY_SUFFIX}
     WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR}
   )
+  if(LIBOMP_ENABLE_SHARED)
+    if(APPLE)
+      set(VERSIONED_LIBGOMP_NAME libgomp.1${LIBOMP_LIBRARY_SUFFIX})
+    else()
+      set(VERSIONED_LIBGOMP_NAME libgomp${LIBOMP_LIBRARY_SUFFIX}.1)
+    endif()
+    add_custom_command(TARGET omp POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} ${VERSIONED_LIBGOMP_NAME}
+      WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR}
+    )
+  endif()
 endif()
 
 # Definitions for testing, for reuse when testing libomptarget-nvptx.
@@ -439,13 +450,18 @@ else()
 
   if(${LIBOMP_INSTALL_ALIASES})
     # Create aliases (symlinks) of the library for backwards compatibility
+    extend_path(outdir "${CMAKE_INSTALL_PREFIX}" "${OPENMP_INSTALL_LIBDIR}")
     set(LIBOMP_ALIASES "libgomp;libiomp5")
     foreach(alias IN LISTS LIBOMP_ALIASES)
-      extend_path(outdir "${CMAKE_INSTALL_PREFIX}" "${OPENMP_INSTALL_LIBDIR}")
       install(CODE "execute_process(COMMAND \"\${CMAKE_COMMAND}\" -E create_symlink \"${LIBOMP_LIB_FILE}\"
         \"${alias}${LIBOMP_LIBRARY_SUFFIX}\" WORKING_DIRECTORY
         \"\$ENV{DESTDIR}${outdir}\")")
     endforeach()
+    if(LIBOMP_ENABLE_SHARED)
+      install(CODE "execute_process(COMMAND \"\${CMAKE_COMMAND}\" -E create_symlink \"${LIBOMP_LIB_FILE}\"
+        \"${VERSIONED_LIBGOMP_NAME}\" WORKING_DIRECTORY
+        \"\$ENV{DESTDIR}${outdir}\")")
+    endif()
   endif()
 endif()
 install(

From 88cc7ac0cc43a739c25f6988c1bfe3949ca4da62 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Fri, 25 Oct 2024 14:21:43 -0400
Subject: [PATCH 28/39] [libc++][NFC] Remove unused functions from
 posix_l_fallbacks (#113709)

---
 .../__support/xlocale/__posix_l_fallback.h     | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/libcxx/include/__support/xlocale/__posix_l_fallback.h b/libcxx/include/__support/xlocale/__posix_l_fallback.h
index 8a3a6f27f48dde9..c83589181747094 100644
--- a/libcxx/include/__support/xlocale/__posix_l_fallback.h
+++ b/libcxx/include/__support/xlocale/__posix_l_fallback.h
@@ -25,24 +25,10 @@
 #  include <wctype.h>
 #endif
 
-inline _LIBCPP_HIDE_FROM_ABI int isalnum_l(int __c, locale_t) { return ::isalnum(__c); }
-
-inline _LIBCPP_HIDE_FROM_ABI int isalpha_l(int __c, locale_t) { return ::isalpha(__c); }
-
-inline _LIBCPP_HIDE_FROM_ABI int iscntrl_l(int __c, locale_t) { return ::iscntrl(__c); }
-
 inline _LIBCPP_HIDE_FROM_ABI int isdigit_l(int __c, locale_t) { return ::isdigit(__c); }
 
-inline _LIBCPP_HIDE_FROM_ABI int isgraph_l(int __c, locale_t) { return ::isgraph(__c); }
-
 inline _LIBCPP_HIDE_FROM_ABI int islower_l(int __c, locale_t) { return ::islower(__c); }
 
-inline _LIBCPP_HIDE_FROM_ABI int isprint_l(int __c, locale_t) { return ::isprint(__c); }
-
-inline _LIBCPP_HIDE_FROM_ABI int ispunct_l(int __c, locale_t) { return ::ispunct(__c); }
-
-inline _LIBCPP_HIDE_FROM_ABI int isspace_l(int __c, locale_t) { return ::isspace(__c); }
-
 inline _LIBCPP_HIDE_FROM_ABI int isupper_l(int __c, locale_t) { return ::isupper(__c); }
 
 inline _LIBCPP_HIDE_FROM_ABI int isxdigit_l(int __c, locale_t) { return ::isxdigit(__c); }
@@ -52,8 +38,6 @@ inline _LIBCPP_HIDE_FROM_ABI int toupper_l(int __c, locale_t) { return ::toupper
 inline _LIBCPP_HIDE_FROM_ABI int tolower_l(int __c, locale_t) { return ::tolower(__c); }
 
 #ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
-inline _LIBCPP_HIDE_FROM_ABI int iswalnum_l(wint_t __c, locale_t) { return ::iswalnum(__c); }
-
 inline _LIBCPP_HIDE_FROM_ABI int iswalpha_l(wint_t __c, locale_t) { return ::iswalpha(__c); }
 
 inline _LIBCPP_HIDE_FROM_ABI int iswblank_l(wint_t __c, locale_t) { return ::iswblank(__c); }
@@ -62,8 +46,6 @@ inline _LIBCPP_HIDE_FROM_ABI int iswcntrl_l(wint_t __c, locale_t) { return ::isw
 
 inline _LIBCPP_HIDE_FROM_ABI int iswdigit_l(wint_t __c, locale_t) { return ::iswdigit(__c); }
 
-inline _LIBCPP_HIDE_FROM_ABI int iswgraph_l(wint_t __c, locale_t) { return ::iswgraph(__c); }
-
 inline _LIBCPP_HIDE_FROM_ABI int iswlower_l(wint_t __c, locale_t) { return ::iswlower(__c); }
 
 inline _LIBCPP_HIDE_FROM_ABI int iswprint_l(wint_t __c, locale_t) { return ::iswprint(__c); }

From 4ac0e7e400fe2a66d1fd5d5d1fa1c899dfb16716 Mon Sep 17 00:00:00 2001
From: Gang Chen <gangc@amd.com>
Date: Fri, 25 Oct 2024 11:24:47 -0700
Subject: [PATCH 29/39] [AMDGPU] Add a type for the named barrier (#113614)

---
 clang/include/clang/Basic/AMDGPUTypes.def     |  8 ++++
 clang/lib/CodeGen/CGDebugInfo.cpp             |  7 ++++
 clang/lib/CodeGen/CodeGenTypes.cpp            |  4 ++
 clang/test/AST/ast-dump-amdgpu-types.c        | 13 ++++--
 .../CodeGen/amdgpu-barrier-type-debug-info.c  |  8 ++++
 .../CodeGenCXX/amdgpu-barrier-typeinfo.cpp    | 10 +++++
 clang/test/CodeGenHIP/amdgpu-barrier-type.hip | 42 +++++++++++++++++++
 clang/test/SemaCXX/amdgpu-barrier.cpp         | 17 ++++++++
 clang/test/SemaHIP/amdgpu-barrier.hip         | 20 +++++++++
 clang/test/SemaOpenCL/amdgpu-barrier.cl       | 12 ++++++
 clang/test/SemaOpenMP/amdgpu-barrier.cpp      | 17 ++++++++
 llvm/lib/IR/Type.cpp                          | 14 +++++++
 12 files changed, 168 insertions(+), 4 deletions(-)
 create mode 100644 clang/test/CodeGen/amdgpu-barrier-type-debug-info.c
 create mode 100644 clang/test/CodeGenCXX/amdgpu-barrier-typeinfo.cpp
 create mode 100644 clang/test/CodeGenHIP/amdgpu-barrier-type.hip
 create mode 100644 clang/test/SemaCXX/amdgpu-barrier.cpp
 create mode 100644 clang/test/SemaHIP/amdgpu-barrier.hip
 create mode 100644 clang/test/SemaOpenCL/amdgpu-barrier.cl
 create mode 100644 clang/test/SemaOpenMP/amdgpu-barrier.cpp

diff --git a/clang/include/clang/Basic/AMDGPUTypes.def b/clang/include/clang/Basic/AMDGPUTypes.def
index e47e544fdc82c1c..d3dff446f9edf01 100644
--- a/clang/include/clang/Basic/AMDGPUTypes.def
+++ b/clang/include/clang/Basic/AMDGPUTypes.def
@@ -15,7 +15,15 @@
   AMDGPU_TYPE(Name, Id, SingletonId, Width, Align)
 #endif
 
+#ifndef AMDGPU_NAMED_BARRIER_TYPE
+#define AMDGPU_NAMED_BARRIER_TYPE(Name, Id, SingletonId, Width, Align, Scope) \
+  AMDGPU_TYPE(Name, Id, SingletonId, Width, Align)
+#endif
+
 AMDGPU_OPAQUE_PTR_TYPE("__amdgpu_buffer_rsrc_t", AMDGPUBufferRsrc, AMDGPUBufferRsrcTy, 128, 128, 8)
 
+AMDGPU_NAMED_BARRIER_TYPE("__amdgpu_named_workgroup_barrier_t", AMDGPUNamedWorkgroupBarrier, AMDGPUNamedWorkgroupBarrierTy, 128, 32, 0)
+
 #undef AMDGPU_TYPE
 #undef AMDGPU_OPAQUE_PTR_TYPE
+#undef AMDGPU_NAMED_BARRIER_TYPE
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 59a761c2303c951..5fd6cfa63e6efab 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -916,6 +916,13 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) {
                                      TheCU, TheCU->getFile(), 0);              \
     return SingletonId;                                                        \
   }
+#define AMDGPU_NAMED_BARRIER_TYPE(Name, Id, SingletonId, Width, Align, Scope)  \
+  case BuiltinType::Id: {                                                      \
+    if (!SingletonId)                                                          \
+      SingletonId =                                                            \
+          DBuilder.createBasicType(Name, Width, llvm::dwarf::DW_ATE_unsigned); \
+    return SingletonId;                                                        \
+  }
 #include "clang/Basic/AMDGPUTypes.def"
   case BuiltinType::UChar:
   case BuiltinType::Char_U:
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index f87184fc77832ca..09191a4901f4932 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -564,6 +564,10 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
 #define AMDGPU_OPAQUE_PTR_TYPE(Name, Id, SingletonId, Width, Align, AS)        \
   case BuiltinType::Id:                                                        \
     return llvm::PointerType::get(getLLVMContext(), AS);
+#define AMDGPU_NAMED_BARRIER_TYPE(Name, Id, SingletonId, Width, Align, Scope)  \
+  case BuiltinType::Id:                                                        \
+    return llvm::TargetExtType::get(getLLVMContext(), "amdgcn.named.barrier",  \
+                                    {}, {Scope});
 #include "clang/Basic/AMDGPUTypes.def"
 #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/HLSLIntangibleTypes.def"
diff --git a/clang/test/AST/ast-dump-amdgpu-types.c b/clang/test/AST/ast-dump-amdgpu-types.c
index e032d678f1a09e8..f01461cdba2374e 100644
--- a/clang/test/AST/ast-dump-amdgpu-types.c
+++ b/clang/test/AST/ast-dump-amdgpu-types.c
@@ -1,10 +1,15 @@
 // REQUIRES: amdgpu-registered-target
 // Test without serialization:
-// RUN: %clang_cc1 -triple amdgcn -ast-dump -ast-dump-filter __amdgpu_buffer_rsrc_t %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn -ast-dump -ast-dump-filter __amdgpu_buffer_rsrc_t %s | FileCheck %s -check-prefix=BUFFER-RSRC
+// RUN: %clang_cc1 -triple amdgcn -ast-dump -ast-dump-filter __amdgpu_named_workgroup_barrier %s | FileCheck %s -check-prefix=WORKGROUP-BARRIER
 //
 // Test with serialization:
 // RUN: %clang_cc1 -triple amdgcn -emit-pch -o %t %s
-// RUN: %clang_cc1 -x c -triple amdgcn -include-pch %t -ast-dump-all -ast-dump-filter __amdgpu_buffer_rsrc_t /dev/null | sed -e "s/ <undeserialized declarations>//" -e "s/ imported//" | FileCheck %s
+// RUN: %clang_cc1 -x c -triple amdgcn -include-pch %t -ast-dump-all -ast-dump-filter __amdgpu_buffer_rsrc_t /dev/null | sed -e "s/ <undeserialized declarations>//" -e "s/ imported//" | FileCheck %s -check-prefix=BUFFER-RSRC
+// RUN: %clang_cc1 -x c -triple amdgcn -include-pch %t -ast-dump-all -ast-dump-filter __amdgpu_named_workgroup_barrier /dev/null | sed -e "s/ <undeserialized declarations>//" -e "s/ imported//" | FileCheck %s -check-prefix=WORKGROUP-BARRIER
 
-// CHECK: TypedefDecl {{.*}} implicit __amdgpu_buffer_rsrc_t
-// CHECK-NEXT: -BuiltinType {{.*}} '__amdgpu_buffer_rsrc_t'
+// BUFFER-RSRC: TypedefDecl {{.*}} implicit __amdgpu_buffer_rsrc_t
+// BUFFER-RSRC-NEXT: -BuiltinType {{.*}} '__amdgpu_buffer_rsrc_t'
+
+// WORKGROUP-BARRIER: TypedefDecl {{.*}} implicit __amdgpu_named_workgroup_barrier_t
+// WORKGROUP-BARRIER-NEXT: -BuiltinType {{.*}} '__amdgpu_named_workgroup_barrier_t'
diff --git a/clang/test/CodeGen/amdgpu-barrier-type-debug-info.c b/clang/test/CodeGen/amdgpu-barrier-type-debug-info.c
new file mode 100644
index 000000000000000..f595f1b222c4f65
--- /dev/null
+++ b/clang/test/CodeGen/amdgpu-barrier-type-debug-info.c
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn -emit-llvm -o - %s -debug-info-kind=limited 2>&1 | FileCheck %s
+
+// CHECK: name: "__amdgpu_named_workgroup_barrier_t",{{.*}}baseType: ![[BT:[0-9]+]]
+// CHECK: [[BT]] = !DIBasicType(name: "__amdgpu_named_workgroup_barrier_t", size: 128, encoding: DW_ATE_unsigned)
+void test_locals(void) {
+  __amdgpu_named_workgroup_barrier_t k0;
+}
diff --git a/clang/test/CodeGenCXX/amdgpu-barrier-typeinfo.cpp b/clang/test/CodeGenCXX/amdgpu-barrier-typeinfo.cpp
new file mode 100644
index 000000000000000..a47f217dcd3db67
--- /dev/null
+++ b/clang/test/CodeGenCXX/amdgpu-barrier-typeinfo.cpp
@@ -0,0 +1,10 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn %s -emit-llvm -o - | FileCheck %s
+
+namespace std { class type_info; };
+
+auto &b0 = typeid(__amdgpu_named_workgroup_barrier_t);
+
+// CHECK-DAG: @_ZTSu34__amdgpu_named_workgroup_barrier_t = {{.*}} c"u34__amdgpu_named_workgroup_barrier_t\00"
+// CHECK-DAG: @_ZTIu34__amdgpu_named_workgroup_barrier_t = {{.*}} @_ZTVN10__cxxabiv123__fundamental_type_infoE, {{.*}} @_ZTSu34__amdgpu_named_workgroup_barrier_t
+
diff --git a/clang/test/CodeGenHIP/amdgpu-barrier-type.hip b/clang/test/CodeGenHIP/amdgpu-barrier-type.hip
new file mode 100644
index 000000000000000..229e8b3c737c6aa
--- /dev/null
+++ b/clang/test/CodeGenHIP/amdgpu-barrier-type.hip
@@ -0,0 +1,42 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
+ // REQUIRES: amdgpu-registered-target
+ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm -o - %s | FileCheck %s
+
+#define __shared__ __attribute__((shared))
+
+__shared__ __amdgpu_named_workgroup_barrier_t bar;
+__shared__ __amdgpu_named_workgroup_barrier_t arr[2];
+__shared__ struct {
+  __amdgpu_named_workgroup_barrier_t x;
+  __amdgpu_named_workgroup_barrier_t y;
+} str;
+
+__amdgpu_named_workgroup_barrier_t *getBar();
+void useBar(__amdgpu_named_workgroup_barrier_t *);
+
+// CHECK-LABEL: define {{[^@]+}}@_Z7testSemPu34__amdgpu_named_workgroup_barrier_t
+// CHECK-SAME: (ptr noundef [[P:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-NEXT:    [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// CHECK-NEXT:    [[P_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[P_ADDR]] to ptr
+// CHECK-NEXT:    store ptr [[P]], ptr [[P_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8
+// CHECK-NEXT:    call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef [[TMP0]]) #[[ATTR2:[0-9]+]]
+// CHECK-NEXT:    call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef addrspacecast (ptr addrspace(1) @bar to ptr)) #[[ATTR2]]
+// CHECK-NEXT:    call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef getelementptr inbounds ([2 x target("amdgcn.named.barrier", 0)], ptr addrspacecast (ptr addrspace(1) @arr to ptr), i64 0, i64 1)) #[[ATTR2]]
+// CHECK-NEXT:    call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef getelementptr inbounds nuw ([[STRUCT_ANON:%.*]], ptr addrspacecast (ptr addrspace(1) @str to ptr), i32 0, i32 1)) #[[ATTR2]]
+// CHECK-NEXT:    [[CALL:%.*]] = call noundef ptr @_Z6getBarv() #[[ATTR2]]
+// CHECK-NEXT:    call void @_Z6useBarPu34__amdgpu_named_workgroup_barrier_t(ptr noundef [[CALL]]) #[[ATTR2]]
+// CHECK-NEXT:    [[CALL1:%.*]] = call noundef ptr @_Z6getBarv() #[[ATTR2]]
+// CHECK-NEXT:    ret ptr [[CALL1]]
+//
+__amdgpu_named_workgroup_barrier_t *testSem(__amdgpu_named_workgroup_barrier_t *p) {
+  useBar(p);
+  useBar(&bar);
+  useBar(&arr[1]);
+  useBar(&str.y);
+  useBar(getBar());
+  return getBar();
+}
diff --git a/clang/test/SemaCXX/amdgpu-barrier.cpp b/clang/test/SemaCXX/amdgpu-barrier.cpp
new file mode 100644
index 000000000000000..a171433727dda41
--- /dev/null
+++ b/clang/test/SemaCXX/amdgpu-barrier.cpp
@@ -0,0 +1,17 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -std=gnu++11 -triple amdgcn -Wno-unused-value %s
+
+void foo() {
+  int n = 100;
+  __amdgpu_named_workgroup_barrier_t v = 0; // expected-error {{cannot initialize a variable of type '__amdgpu_named_workgroup_barrier_t' with an rvalue of type 'int'}}
+  static_cast<__amdgpu_named_workgroup_barrier_t>(n); // expected-error {{static_cast from 'int' to '__amdgpu_named_workgroup_barrier_t' is not allowed}}
+  dynamic_cast<__amdgpu_named_workgroup_barrier_t>(n); // expected-error {{invalid target type '__amdgpu_named_workgroup_barrier_t' for dynamic_cast; target type must be a reference or pointer type to a defined class}}
+  reinterpret_cast<__amdgpu_named_workgroup_barrier_t>(n); // expected-error {{reinterpret_cast from 'int' to '__amdgpu_named_workgroup_barrier_t' is not allowed}}
+  int c(v); // expected-error {{cannot initialize a variable of type 'int' with an lvalue of type '__amdgpu_named_workgroup_barrier_t'}}
+  __amdgpu_named_workgroup_barrier_t k;
+  int *ip = (int *)k; // expected-error {{cannot cast from type '__amdgpu_named_workgroup_barrier_t' to pointer type 'int *'}}
+  void *vp = (void *)k; // expected-error {{cannot cast from type '__amdgpu_named_workgroup_barrier_t' to pointer type 'void *'}}
+}
+
+static_assert(sizeof(__amdgpu_named_workgroup_barrier_t) == 16, "wrong size");
+static_assert(alignof(__amdgpu_named_workgroup_barrier_t) == 4, "wrong alignment");
diff --git a/clang/test/SemaHIP/amdgpu-barrier.hip b/clang/test/SemaHIP/amdgpu-barrier.hip
new file mode 100644
index 000000000000000..ccd99b1e2c1f261
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-barrier.hip
@@ -0,0 +1,20 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s
+
+#define __device__ __attribute__((device))
+
+__device__ void foo() {
+  int n = 100;
+  __amdgpu_named_workgroup_barrier_t v = 0; // expected-error {{cannot initialize a variable of type '__amdgpu_named_workgroup_barrier_t' with an rvalue of type 'int'}}
+  static_cast<__amdgpu_named_workgroup_barrier_t>(n); // expected-error {{static_cast from 'int' to '__amdgpu_named_workgroup_barrier_t' is not allowed}}
+  dynamic_cast<__amdgpu_named_workgroup_barrier_t>(n); // expected-error {{invalid target type '__amdgpu_named_workgroup_barrier_t' for dynamic_cast; target type must be a reference or pointer type to a defined class}}
+  reinterpret_cast<__amdgpu_named_workgroup_barrier_t>(n); // expected-error {{reinterpret_cast from 'int' to '__amdgpu_named_workgroup_barrier_t' is not allowed}}
+  int c(v); // expected-error {{cannot initialize a variable of type 'int' with an lvalue of type '__amdgpu_named_workgroup_barrier_t'}}
+  __amdgpu_named_workgroup_barrier_t k;
+  int *ip = (int *)k; // expected-error {{cannot cast from type '__amdgpu_named_workgroup_barrier_t' to pointer type 'int *'}}
+  void *vp = (void *)k; // expected-error {{cannot cast from type '__amdgpu_named_workgroup_barrier_t' to pointer type 'void *'}}
+}
+
+static_assert(sizeof(__amdgpu_named_workgroup_barrier_t) == 16, "wrong size");
+static_assert(alignof(__amdgpu_named_workgroup_barrier_t) == 4, "wrong alignment");
diff --git a/clang/test/SemaOpenCL/amdgpu-barrier.cl b/clang/test/SemaOpenCL/amdgpu-barrier.cl
new file mode 100644
index 000000000000000..150c311c7c59303
--- /dev/null
+++ b/clang/test/SemaOpenCL/amdgpu-barrier.cl
@@ -0,0 +1,12 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -verify -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -Wno-unused-value %s
+// RUN: %clang_cc1 -verify -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -Wno-unused-value %s
+
+void foo() {
+    int n = 100;
+    __amdgpu_named_workgroup_barrier_t v = 0; // expected-error {{initializing '__private __amdgpu_named_workgroup_barrier_t' with an expression of incompatible type 'int'}}
+    int c = v; // expected-error {{initializing '__private int' with an expression of incompatible type '__private __amdgpu_named_workgroup_barrier_t'}}
+    __amdgpu_named_workgroup_barrier_t k;
+    int *ip = (int *)k; // expected-error {{operand of type '__amdgpu_named_workgroup_barrier_t' where arithmetic or pointer type is required}}
+    void *vp = (void *)k; // expected-error {{operand of type '__amdgpu_named_workgroup_barrier_t' where arithmetic or pointer type is required}}
+ }
diff --git a/clang/test/SemaOpenMP/amdgpu-barrier.cpp b/clang/test/SemaOpenMP/amdgpu-barrier.cpp
new file mode 100644
index 000000000000000..70aaefd080885e6
--- /dev/null
+++ b/clang/test/SemaOpenMP/amdgpu-barrier.cpp
@@ -0,0 +1,17 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -Wno-unused-value %s
+
+void foo() {
+#pragma omp target
+  {
+    int n = 100;
+    __amdgpu_named_workgroup_barrier_t v = 0; // expected-error {{cannot initialize a variable of type '__amdgpu_named_workgroup_barrier_t' with an rvalue of type 'int'}}
+    static_cast<__amdgpu_named_workgroup_barrier_t>(n); // expected-error {{static_cast from 'int' to '__amdgpu_named_workgroup_barrier_t' is not allowed}}
+    dynamic_cast<__amdgpu_named_workgroup_barrier_t>(n); // expected-error {{invalid target type '__amdgpu_named_workgroup_barrier_t' for dynamic_cast; target type must be a reference or pointer type to a defined class}}
+    reinterpret_cast<__amdgpu_named_workgroup_barrier_t>(n); // expected-error {{reinterpret_cast from 'int' to '__amdgpu_named_workgroup_barrier_t' is not allowed}}
+    int c(v); // expected-error {{cannot initialize a variable of type 'int' with an lvalue of type '__amdgpu_named_workgroup_barrier_t'}}
+    __amdgpu_named_workgroup_barrier_t k;
+    int *ip = (int *)k; // expected-error {{cannot cast from type '__amdgpu_named_workgroup_barrier_t' to pointer type 'int *'}}
+    void *vp = (void *)k; // expected-error {{cannot cast from type '__amdgpu_named_workgroup_barrier_t' to pointer type 'void *'}}
+  }
+ }
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index 912b1a3960ef196..e311cde415174a9 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -834,6 +834,14 @@ Expected<TargetExtType *> TargetExtType::checkParams(TargetExtType *TTy) {
         "target extension type riscv.vector.tuple should have one "
         "type parameter and one integer parameter");
 
+  // Opaque types in the AMDGPU name space.
+  if (TTy->Name == "amdgcn.named.barrier" &&
+      (TTy->getNumTypeParameters() != 0 || TTy->getNumIntParameters() != 1)) {
+    return createStringError("target extension type amdgcn.named.barrier "
+                             "should have no type parameters "
+                             "and one integer parameter");
+  }
+
   return TTy;
 }
 
@@ -879,6 +887,12 @@ static TargetTypeInfo getTargetTypeInfo(const TargetExtType *Ty) {
   if (Name.starts_with("dx."))
     return TargetTypeInfo(PointerType::get(C, 0));
 
+  // Opaque types in the AMDGPU name space.
+  if (Name == "amdgcn.named.barrier") {
+    return TargetTypeInfo(FixedVectorType::get(Type::getInt32Ty(C), 4),
+                          TargetExtType::CanBeGlobal);
+  }
+
   return TargetTypeInfo(Type::getVoidTy(C));
 }
 

From 61946687bc68ccba763571cb420049b9a3749dfe Mon Sep 17 00:00:00 2001
From: Jan Svoboda <jan_svoboda@apple.com>
Date: Fri, 25 Oct 2024 11:33:44 -0700
Subject: [PATCH 30/39] [clang][modules] Shrink the size of `Module::Headers`
 (#113395)

This patch shrinks the size of the `Module` class from 2112B to 1624B. I
wasn't able to get good data on the actual impact on memory usage, but
given my `clang-scan-deps` workload at hand (with tens of thousands of
instances), I think there should be some win here. This also speeds up
my benchmark by under 0.1%.
---
 .../modularize/CoverageChecker.cpp            |  7 ++---
 .../modularize/ModularizeUtilities.cpp        | 14 ++-------
 clang/include/clang/Basic/Module.h            | 31 ++++++++++++++-----
 clang/lib/Basic/Module.cpp                    |  2 +-
 clang/lib/Frontend/FrontendAction.cpp         |  2 +-
 clang/lib/Lex/ModuleMap.cpp                   | 21 +++++++------
 clang/lib/Serialization/ASTWriter.cpp         |  4 +--
 7 files changed, 45 insertions(+), 36 deletions(-)

diff --git a/clang-tools-extra/modularize/CoverageChecker.cpp b/clang-tools-extra/modularize/CoverageChecker.cpp
index 0e76c539aa3c839..b536ee00497c03f 100644
--- a/clang-tools-extra/modularize/CoverageChecker.cpp
+++ b/clang-tools-extra/modularize/CoverageChecker.cpp
@@ -223,10 +223,9 @@ bool CoverageChecker::collectModuleHeaders(const Module &Mod) {
       return false;
   }
 
-  for (auto &HeaderKind : Mod.Headers)
-    for (auto &Header : HeaderKind)
-      ModuleMapHeadersSet.insert(
-          ModularizeUtilities::getCanonicalPath(Header.Entry.getName()));
+  for (const auto &Header : Mod.getAllHeaders())
+    ModuleMapHeadersSet.insert(
+        ModularizeUtilities::getCanonicalPath(Header.Entry.getName()));
 
   for (auto *Submodule : Mod.submodules())
     collectModuleHeaders(*Submodule);
diff --git a/clang-tools-extra/modularize/ModularizeUtilities.cpp b/clang-tools-extra/modularize/ModularizeUtilities.cpp
index b202b3aae8f8a3a..476e13770a94f6c 100644
--- a/clang-tools-extra/modularize/ModularizeUtilities.cpp
+++ b/clang-tools-extra/modularize/ModularizeUtilities.cpp
@@ -358,7 +358,7 @@ bool ModularizeUtilities::collectModuleHeaders(const clang::Module &Mod) {
   } else if (std::optional<clang::Module::DirectoryName> UmbrellaDir =
                  Mod.getUmbrellaDirAsWritten()) {
     // If there normal headers, assume these are umbrellas and skip collection.
-    if (Mod.Headers->size() == 0) {
+    if (Mod.getHeaders(Module::HK_Normal).empty()) {
       // Collect headers in umbrella directory.
       if (!collectUmbrellaHeaders(UmbrellaDir->Entry.getName(),
                                   UmbrellaDependents))
@@ -371,16 +371,8 @@ bool ModularizeUtilities::collectModuleHeaders(const clang::Module &Mod) {
   // modules or because they are meant to be included by another header,
   // and thus should be ignored by modularize.
 
-  int NormalHeaderCount = Mod.Headers[clang::Module::HK_Normal].size();
-
-  for (int Index = 0; Index < NormalHeaderCount; ++Index) {
-    DependentsVector NormalDependents;
-    // Collect normal header.
-    const clang::Module::Header &Header(
-      Mod.Headers[clang::Module::HK_Normal][Index]);
-    std::string HeaderPath = getCanonicalPath(Header.Entry.getName());
-    HeaderFileNames.push_back(HeaderPath);
-  }
+  for (const auto &Header : Mod.getHeaders(clang::Module::HK_Normal))
+    HeaderFileNames.push_back(getCanonicalPath(Header.Entry.getName()));
 
   int MissingCountThisModule = Mod.MissingHeaders.size();
 
diff --git a/clang/include/clang/Basic/Module.h b/clang/include/clang/Basic/Module.h
index 9c5d33fbb562cc9..1ab3b5e5f81567f 100644
--- a/clang/include/clang/Basic/Module.h
+++ b/clang/include/clang/Basic/Module.h
@@ -253,8 +253,6 @@ class alignas(8) Module {
     HK_PrivateTextual,
     HK_Excluded
   };
-  static const int NumHeaderKinds = HK_Excluded + 1;
-
   /// Information about a header directive as found in the module map
   /// file.
   struct Header {
@@ -263,17 +261,36 @@ class alignas(8) Module {
     FileEntryRef Entry;
   };
 
-  /// Information about a directory name as found in the module map
-  /// file.
+private:
+  static const int NumHeaderKinds = HK_Excluded + 1;
+  // The begin index for a HeaderKind is also the end index of HeaderKind - 1.
+  // The extra element at the end acts as the end index of the last HeaderKind.
+  unsigned HeaderKindBeginIndex[NumHeaderKinds + 1] = {};
+  SmallVector<Header, 2> HeadersStorage;
+
+public:
+  ArrayRef<Header> getAllHeaders() const { return HeadersStorage; }
+  ArrayRef<Header> getHeaders(HeaderKind HK) const {
+    assert(HK < NumHeaderKinds && "Invalid Module::HeaderKind");
+    auto BeginIt = HeadersStorage.begin() + HeaderKindBeginIndex[HK];
+    auto EndIt = HeadersStorage.begin() + HeaderKindBeginIndex[HK + 1];
+    return {BeginIt, EndIt};
+  }
+  void addHeader(HeaderKind HK, Header H) {
+    assert(HK < NumHeaderKinds && "Invalid Module::HeaderKind");
+    auto EndIt = HeadersStorage.begin() + HeaderKindBeginIndex[HK + 1];
+    HeadersStorage.insert(EndIt, std::move(H));
+    for (unsigned HKI = HK + 1; HKI != NumHeaderKinds + 1; ++HKI)
+      ++HeaderKindBeginIndex[HKI];
+  }
+
+  /// Information about a directory name as found in the module map file.
   struct DirectoryName {
     std::string NameAsWritten;
     std::string PathRelativeToRootModuleDirectory;
     DirectoryEntryRef Entry;
   };
 
-  /// The headers that are part of this module.
-  SmallVector<Header, 2> Headers[5];
-
   /// Stored information about a header directive that was found in the
   /// module map file but has not been resolved to a file.
   struct UnresolvedHeaderDirective {
diff --git a/clang/lib/Basic/Module.cpp b/clang/lib/Basic/Module.cpp
index ad52fccff5dc7ff..a7a3f6b37efef17 100644
--- a/clang/lib/Basic/Module.cpp
+++ b/clang/lib/Basic/Module.cpp
@@ -528,7 +528,7 @@ void Module::print(raw_ostream &OS, unsigned Indent, bool Dump) const {
 
   for (auto &K : Kinds) {
     assert(&K == &Kinds[K.Kind] && "kinds in wrong order");
-    for (auto &H : Headers[K.Kind]) {
+    for (auto &H : getHeaders(K.Kind)) {
       OS.indent(Indent + 2);
       OS << K.Prefix << "header \"";
       OS.write_escaped(H.NameAsWritten);
diff --git a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp
index 81eea9c4c4dc58e..8264bd702fe43fb 100644
--- a/clang/lib/Frontend/FrontendAction.cpp
+++ b/clang/lib/Frontend/FrontendAction.cpp
@@ -358,7 +358,7 @@ static std::error_code collectModuleHeaderIncludes(
 
   // Add includes for each of these headers.
   for (auto HK : {Module::HK_Normal, Module::HK_Private}) {
-    for (Module::Header &H : Module->Headers[HK]) {
+    for (const Module::Header &H : Module->getHeaders(HK)) {
       Module->addTopHeader(H.Entry);
       // Use the path as specified in the module map file. We'll look for this
       // file relative to the module build directory (the directory containing
diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp
index 0a02a63deba3dc1..bc76a54abd95adf 100644
--- a/clang/lib/Lex/ModuleMap.cpp
+++ b/clang/lib/Lex/ModuleMap.cpp
@@ -472,12 +472,12 @@ static bool violatesPrivateInclude(Module *RequestingModule,
     // as obtained from the lookup and as obtained from the module.
     // This check is not cheap, so enable it only for debugging.
     bool IsPrivate = false;
-    SmallVectorImpl<Module::Header> *HeaderList[] = {
-        &Header.getModule()->Headers[Module::HK_Private],
-        &Header.getModule()->Headers[Module::HK_PrivateTextual]};
-    for (auto *Hs : HeaderList)
+    ArrayRef<Module::Header> HeaderList[] = {
+        Header.getModule()->getHeaders(Module::HK_Private),
+        Header.getModule()->getHeaders(Module::HK_PrivateTextual)};
+    for (auto Hs : HeaderList)
       IsPrivate |= llvm::any_of(
-          *Hs, [&](const Module::Header &H) { return H.Entry == IncFileEnt; });
+          Hs, [&](const Module::Header &H) { return H.Entry == IncFileEnt; });
     assert(IsPrivate && "inconsistent headers and roles");
   }
 #endif
@@ -1296,27 +1296,28 @@ void ModuleMap::addHeader(Module *Mod, Module::Header Header,
                           ModuleHeaderRole Role, bool Imported) {
   KnownHeader KH(Mod, Role);
 
+  FileEntryRef HeaderEntry = Header.Entry;
+
   // Only add each header to the headers list once.
   // FIXME: Should we diagnose if a header is listed twice in the
   // same module definition?
-  auto &HeaderList = Headers[Header.Entry];
+  auto &HeaderList = Headers[HeaderEntry];
   if (llvm::is_contained(HeaderList, KH))
     return;
 
   HeaderList.push_back(KH);
-  Mod->Headers[headerRoleToKind(Role)].push_back(Header);
+  Mod->addHeader(headerRoleToKind(Role), std::move(Header));
 
   bool isCompilingModuleHeader = Mod->isForBuilding(LangOpts);
   if (!Imported || isCompilingModuleHeader) {
     // When we import HeaderFileInfo, the external source is expected to
     // set the isModuleHeader flag itself.
-    HeaderInfo.MarkFileModuleHeader(Header.Entry, Role,
-                                    isCompilingModuleHeader);
+    HeaderInfo.MarkFileModuleHeader(HeaderEntry, Role, isCompilingModuleHeader);
   }
 
   // Notify callbacks that we just added a new header.
   for (const auto &Cb : Callbacks)
-    Cb->moduleMapAddHeader(Header.Entry.getName());
+    Cb->moduleMapAddHeader(HeaderEntry.getName());
 }
 
 FileID ModuleMap::getContainingModuleMapFileID(const Module *Module) const {
diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp
index 494890284d2f2c1..b576822fa704c89 100644
--- a/clang/lib/Serialization/ASTWriter.cpp
+++ b/clang/lib/Serialization/ASTWriter.cpp
@@ -3070,9 +3070,9 @@ void ASTWriter::WriteSubmodules(Module *WritingModule) {
         Module::HK_PrivateTextual},
       {SUBMODULE_EXCLUDED_HEADER, ExcludedHeaderAbbrev, Module::HK_Excluded}
     };
-    for (auto &HL : HeaderLists) {
+    for (const auto &HL : HeaderLists) {
       RecordData::value_type Record[] = {HL.RecordKind};
-      for (auto &H : Mod->Headers[HL.HeaderKind])
+      for (const auto &H : Mod->getHeaders(HL.HeaderKind))
         Stream.EmitRecordWithBlob(HL.Abbrev, Record, H.NameAsWritten);
     }
 

From 9648271a3c5adf875680833ac74eb4bafb48678d Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 25 Oct 2024 20:39:45 +0200
Subject: [PATCH 31/39] [LV] Pass flag indicating epilogue is vectorized to
 executePlan (NFC)

This clarifies the flag, which is now only set to true when the VPlan for
the epilogue vector loop is being executed; executing the plan for the
main vector loop now passes false.
---
 .../Vectorize/LoopVectorizationPlanner.h           |  8 ++++----
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp    | 14 +++++++-------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 1c8d541ef2c51fd..b2745c81dec8885 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -435,9 +435,9 @@ class LoopVectorizationPlanner {
   /// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
   /// according to the best selected \p VF and  \p UF.
   ///
-  /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue
-  /// vectorization re-using plans for both the main and epilogue vector loops.
-  /// It should be removed once the re-use issue has been fixed.
+  /// TODO: \p VectorizingEpilogue indicates if the executed VPlan is for the
+  /// epilogue vector loop. It should be removed once the issue of re-using
+  /// plans for both the main and epilogue vector loops has been fixed.
   /// \p ExpandedSCEVs is passed during execution of the plan for epilogue loop
   /// to re-use expansion results generated during main plan execution.
   ///
@@ -447,7 +447,7 @@ class LoopVectorizationPlanner {
   DenseMap<const SCEV *, Value *>
   executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
               InnerLoopVectorizer &LB, DominatorTree *DT,
-              bool IsEpilogueVectorization,
+              bool VectorizingEpilogue,
               const DenseMap<const SCEV *, Value *> *ExpandedSCEVs = nullptr);
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e1173ddd71af9c5..865f5e3d2e588da 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7626,16 +7626,16 @@ static void createAndCollectMergePhiForReduction(
 
 DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
     ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
-    InnerLoopVectorizer &ILV, DominatorTree *DT, bool IsEpilogueVectorization,
+    InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue,
     const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
   assert(BestVPlan.hasVF(BestVF) &&
          "Trying to execute plan with unsupported VF");
   assert(BestVPlan.hasUF(BestUF) &&
          "Trying to execute plan with unsupported UF");
   assert(
-      (IsEpilogueVectorization || !ExpandedSCEVs) &&
+      ((VectorizingEpilogue && ExpandedSCEVs) ||
+       (!VectorizingEpilogue && !ExpandedSCEVs)) &&
       "expanded SCEVs to reuse can only be used during epilogue vectorization");
-  (void)IsEpilogueVectorization;
 
   // TODO: Move to VPlan transform stage once the transition to the VPlan-based
   // cost model is complete for better cost estimates.
@@ -7661,8 +7661,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   if (!ILV.getTripCount())
     ILV.setTripCount(State.get(BestVPlan.getTripCount(), VPLane(0)));
   else
-    assert(IsEpilogueVectorization && "should only re-use the existing trip "
-                                      "count during epilogue vectorization");
+    assert(VectorizingEpilogue && "should only re-use the existing trip "
+                                  "count during epilogue vectorization");
 
   // 1. Set up the skeleton for vectorization, including vector pre-header and
   // middle block. The vector loop is created during VPlan execution.
@@ -7715,7 +7715,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   for (VPRecipeBase &R : *ExitVPBB) {
     createAndCollectMergePhiForReduction(
         dyn_cast<VPInstruction>(&R), State, OrigLoop,
-        State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs);
+        State.CFG.VPBB2IRBB[ExitVPBB], VectorizingEpilogue);
   }
 
   // 2.6. Maintain Loop Hints
@@ -10233,7 +10233,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
 
         std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
         auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
-                                             *BestMainPlan, MainILV, DT, true);
+                                             *BestMainPlan, MainILV, DT, false);
         ++LoopsVectorized;
 
         // Second pass vectorizes the epilogue and adjusts the control flow

From 8c4bc1e75de27adfbaead34b895b0efbaf17bd02 Mon Sep 17 00:00:00 2001
From: Matthias Springer <me@m-sp.org>
Date: Fri, 25 Oct 2024 11:44:20 -0700
Subject: [PATCH 32/39] [mlir][Transforms] Merge 1:1 and 1:N type converters
 (#113032)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 1:N type converter derives from the 1:1 type converter and extends
it with 1:N target materializations. This commit merges the two type
converters and stores 1:N target materializations in the 1:1 type
converter. This is in preparation for merging the 1:1 and 1:N dialect
conversion infrastructures.

1:1 target materializations (producing a single `Value`) will remain
valid. An additional API is added to the type converter to register 1:N
target materializations (producing a `SmallVector<Value>`). Internally,
all target materializations are stored as 1:N materializations.

The 1:N type converter is removed.

Note for LLVM integration: If you are using the `OneToNTypeConverter`,
simply switch all occurrences to `TypeConverter`.

---------

Co-authored-by: Markus Böck <markus.boeck02@gmail.com>
---
 .../Dialect/SparseTensor/Transforms/Passes.h  |  2 +-
 .../mlir/Transforms/DialectConversion.h       | 62 ++++++++++++++-----
 .../mlir/Transforms/OneToNTypeConversion.h    | 45 +-------------
 .../ArmSME/Transforms/VectorLegalization.cpp  |  2 +-
 .../Transforms/Utils/DialectConversion.cpp    | 26 ++++++--
 .../Transforms/Utils/OneToNTypeConversion.cpp | 44 +++++--------
 .../TestOneToNTypeConversionPass.cpp          | 18 ++++--
 7 files changed, 101 insertions(+), 98 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
index 6ccbc40bdd6034a..2e9c297f20182af 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h
@@ -150,7 +150,7 @@ std::unique_ptr<Pass> createLowerForeachToSCFPass();
 //===----------------------------------------------------------------------===//
 
 /// Type converter for iter_space and iterator.
-struct SparseIterationTypeConverter : public OneToNTypeConverter {
+struct SparseIterationTypeConverter : public TypeConverter {
   SparseIterationTypeConverter();
 };
 
diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h
index 5ff36160dd61620..5e5957170e646c3 100644
--- a/mlir/include/mlir/Transforms/DialectConversion.h
+++ b/mlir/include/mlir/Transforms/DialectConversion.h
@@ -173,7 +173,9 @@ class TypeConverter {
   /// conversion has finished.
   ///
   /// Note: Target materializations may optionally accept an additional Type
-  /// parameter, which is the original type of the SSA value.
+  /// parameter, which is the original type of the SSA value. Furthermore, `T`
+  /// can be a TypeRange; in that case, the function must return a
+  /// SmallVector<Value>.
 
   /// This method registers a materialization that will be called when
   /// converting (potentially multiple) block arguments that were the result of
@@ -210,6 +212,9 @@ class TypeConverter {
   /// will be invoked with: outputType = "t3", inputs = "v2",
   // originalType = "t1". Note  that the original type "t1" cannot be recovered
   /// from just "t3" and "v2"; that's why the originalType parameter exists.
+  ///
+  /// Note: During a 1:N conversion, the result types can be a TypeRange. In
+  /// that case the materialization produces a SmallVector<Value>.
   template <typename FnT, typename T = typename llvm::function_traits<
                               std::decay_t<FnT>>::template arg_t<1>>
   void addTargetMaterialization(FnT &&callback) {
@@ -316,6 +321,11 @@ class TypeConverter {
   Value materializeTargetConversion(OpBuilder &builder, Location loc,
                                     Type resultType, ValueRange inputs,
                                     Type originalType = {}) const;
+  SmallVector<Value> materializeTargetConversion(OpBuilder &builder,
+                                                 Location loc,
+                                                 TypeRange resultType,
+                                                 ValueRange inputs,
+                                                 Type originalType = {}) const;
 
   /// Convert an attribute present `attr` from within the type `type` using
   /// the registered conversion functions. If no applicable conversion has been
@@ -340,9 +350,9 @@ class TypeConverter {
 
   /// The signature of the callback used to materialize a target conversion.
   ///
-  /// Arguments: builder, result type, inputs, location, original type
-  using TargetMaterializationCallbackFn =
-      std::function<Value(OpBuilder &, Type, ValueRange, Location, Type)>;
+  /// Arguments: builder, result types, inputs, location, original type
+  using TargetMaterializationCallbackFn = std::function<SmallVector<Value>(
+      OpBuilder &, TypeRange, ValueRange, Location, Type)>;
 
   /// The signature of the callback used to convert a type attribute.
   using TypeAttributeConversionCallbackFn =
@@ -409,22 +419,46 @@ class TypeConverter {
   /// callback.
   ///
   /// With callback of form:
-  /// `Value(OpBuilder &, T, ValueRange, Location, Type)`
+  /// - Value(OpBuilder &, T, ValueRange, Location, Type)
+  /// - SmallVector<Value>(OpBuilder &, TypeRange, ValueRange, Location, Type)
   template <typename T, typename FnT>
   std::enable_if_t<
       std::is_invocable_v<FnT, OpBuilder &, T, ValueRange, Location, Type>,
       TargetMaterializationCallbackFn>
   wrapTargetMaterialization(FnT &&callback) const {
     return [callback = std::forward<FnT>(callback)](
-               OpBuilder &builder, Type resultType, ValueRange inputs,
-               Location loc, Type originalType) -> Value {
-      if (T derivedType = dyn_cast<T>(resultType))
-        return callback(builder, derivedType, inputs, loc, originalType);
-      return Value();
+               OpBuilder &builder, TypeRange resultTypes, ValueRange inputs,
+               Location loc, Type originalType) -> SmallVector<Value> {
+      SmallVector<Value> result;
+      if constexpr (std::is_same<T, TypeRange>::value) {
+        // This is a 1:N target materialization. Return the produced values
+        // directly.
+        result = callback(builder, resultTypes, inputs, loc, originalType);
+      } else if constexpr (std::is_assignable<Type, T>::value) {
+        // This is a 1:1 target materialization. Invoke the callback only if a
+        // single SSA value is requested.
+        if (resultTypes.size() == 1) {
+          // Invoke the callback only if the type class of the callback matches
+          // the requested result type.
+          if (T derivedType = dyn_cast<T>(resultTypes.front())) {
+            // 1:1 materializations produce single values, but we store 1:N
+            // target materialization functions in the type converter. Wrap the
+            // result value in a SmallVector<Value>.
+            Value val =
+                callback(builder, derivedType, inputs, loc, originalType);
+            if (val)
+              result.push_back(val);
+          }
+        }
+      } else {
+        static_assert(sizeof(T) == 0, "T must be a Type or a TypeRange");
+      }
+      return result;
     };
   }
   /// With callback of form:
-  /// `Value(OpBuilder &, T, ValueRange, Location)`
+  /// - Value(OpBuilder &, T, ValueRange, Location)
+  /// - SmallVector<Value>(OpBuilder &, TypeRange, ValueRange, Location)
   template <typename T, typename FnT>
   std::enable_if_t<
       std::is_invocable_v<FnT, OpBuilder &, T, ValueRange, Location>,
@@ -432,9 +466,9 @@ class TypeConverter {
   wrapTargetMaterialization(FnT &&callback) const {
     return wrapTargetMaterialization<T>(
         [callback = std::forward<FnT>(callback)](
-            OpBuilder &builder, T resultType, ValueRange inputs, Location loc,
-            Type originalType) -> Value {
-          return callback(builder, resultType, inputs, loc);
+            OpBuilder &builder, T resultTypes, ValueRange inputs, Location loc,
+            Type originalType) {
+          return callback(builder, resultTypes, inputs, loc);
         });
   }
 
diff --git a/mlir/include/mlir/Transforms/OneToNTypeConversion.h b/mlir/include/mlir/Transforms/OneToNTypeConversion.h
index c59a3a52f028f32..7b4dd65cbff7b2d 100644
--- a/mlir/include/mlir/Transforms/OneToNTypeConversion.h
+++ b/mlir/include/mlir/Transforms/OneToNTypeConversion.h
@@ -33,49 +33,6 @@
 
 namespace mlir {
 
-/// Extends `TypeConverter` with 1:N target materializations. Such
-/// materializations have to provide the "reverse" of 1:N type conversions,
-/// i.e., they need to materialize N values with target types into one value
-/// with a source type (which isn't possible in the base class currently).
-class OneToNTypeConverter : public TypeConverter {
-public:
-  /// Callback that expresses user-provided materialization logic from the given
-  /// value to N values of the given types. This is useful for expressing target
-  /// materializations for 1:N type conversions, which materialize one value in
-  /// a source type as N values in target types.
-  using OneToNMaterializationCallbackFn =
-      std::function<std::optional<SmallVector<Value>>(OpBuilder &, TypeRange,
-                                                      Value, Location)>;
-
-  /// Creates the mapping of the given range of original types to target types
-  /// of the conversion and stores that mapping in the given (signature)
-  /// conversion. This function simply calls
-  /// `TypeConverter::convertSignatureArgs` and exists here with a different
-  /// name to reflect the broader semantic.
-  LogicalResult computeTypeMapping(TypeRange types,
-                                   SignatureConversion &result) const {
-    return convertSignatureArgs(types, result);
-  }
-
-  /// Applies one of the user-provided 1:N target materializations. If several
-  /// exists, they are tried out in the reverse order in which they have been
-  /// added until the first one succeeds. If none succeeds, the functions
-  /// returns `std::nullopt`.
-  std::optional<SmallVector<Value>>
-  materializeTargetConversion(OpBuilder &builder, Location loc,
-                              TypeRange resultTypes, Value input) const;
-
-  /// Adds a 1:N target materialization to the converter. Such materializations
-  /// build IR that converts N values with target types into 1 value of the
-  /// source type.
-  void addTargetMaterialization(OneToNMaterializationCallbackFn &&callback) {
-    oneToNTargetMaterializations.emplace_back(std::move(callback));
-  }
-
-private:
-  SmallVector<OneToNMaterializationCallbackFn> oneToNTargetMaterializations;
-};
-
 /// Stores a 1:N mapping of types and provides several useful accessors. This
 /// class extends `SignatureConversion`, which already supports 1:N type
 /// mappings but lacks some accessors into the mapping as well as access to the
@@ -295,7 +252,7 @@ class OneToNOpConversionPattern : public OneToNConversionPattern {
 /// not fail if some ops or types remain unconverted (i.e., the conversion is
 /// only "partial").
 LogicalResult
-applyPartialOneToNConversion(Operation *op, OneToNTypeConverter &typeConverter,
+applyPartialOneToNConversion(Operation *op, TypeConverter &typeConverter,
                              const FrozenRewritePatternSet &patterns);
 
 /// Add a pattern to the given pattern list to convert the signature of a
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
index 4968c4fc463d04b..e908a536e6fb271 100644
--- a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
+++ b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
@@ -921,7 +921,7 @@ struct VectorLegalizationPass
     : public arm_sme::impl::VectorLegalizationBase<VectorLegalizationPass> {
   void runOnOperation() override {
     auto *context = &getContext();
-    OneToNTypeConverter converter;
+    TypeConverter converter;
     RewritePatternSet patterns(context);
     converter.addConversion([](Type type) { return type; });
     converter.addConversion(
diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp
index 3cfcaa965f3546a..3d0c81867e0cc26 100644
--- a/mlir/lib/Transforms/Utils/DialectConversion.cpp
+++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp
@@ -2831,11 +2831,29 @@ Value TypeConverter::materializeTargetConversion(OpBuilder &builder,
                                                  Location loc, Type resultType,
                                                  ValueRange inputs,
                                                  Type originalType) const {
+  SmallVector<Value> result = materializeTargetConversion(
+      builder, loc, TypeRange(resultType), inputs, originalType);
+  if (result.empty())
+    return nullptr;
+  assert(result.size() == 1 && "expected single result");
+  return result.front();
+}
+
+SmallVector<Value> TypeConverter::materializeTargetConversion(
+    OpBuilder &builder, Location loc, TypeRange resultTypes, ValueRange inputs,
+    Type originalType) const {
   for (const TargetMaterializationCallbackFn &fn :
-       llvm::reverse(targetMaterializations))
-    if (Value result = fn(builder, resultType, inputs, loc, originalType))
-      return result;
-  return nullptr;
+       llvm::reverse(targetMaterializations)) {
+    SmallVector<Value> result =
+        fn(builder, resultTypes, inputs, loc, originalType);
+    if (result.empty())
+      continue;
+    assert(TypeRange(result) == resultTypes &&
+           "callback produced incorrect number of values or values with "
+           "incorrect types");
+    return result;
+  }
+  return {};
 }
 
 std::optional<TypeConverter::SignatureConversion>
diff --git a/mlir/lib/Transforms/Utils/OneToNTypeConversion.cpp b/mlir/lib/Transforms/Utils/OneToNTypeConversion.cpp
index 19e29d48623e04c..c208716891ef1f4 100644
--- a/mlir/lib/Transforms/Utils/OneToNTypeConversion.cpp
+++ b/mlir/lib/Transforms/Utils/OneToNTypeConversion.cpp
@@ -17,20 +17,6 @@
 using namespace llvm;
 using namespace mlir;
 
-std::optional<SmallVector<Value>>
-OneToNTypeConverter::materializeTargetConversion(OpBuilder &builder,
-                                                 Location loc,
-                                                 TypeRange resultTypes,
-                                                 Value input) const {
-  for (const OneToNMaterializationCallbackFn &fn :
-       llvm::reverse(oneToNTargetMaterializations)) {
-    if (std::optional<SmallVector<Value>> result =
-            fn(builder, resultTypes, input, loc))
-      return *result;
-  }
-  return std::nullopt;
-}
-
 TypeRange OneToNTypeMapping::getConvertedTypes(unsigned originalTypeNo) const {
   TypeRange convertedTypes = getConvertedTypes();
   if (auto mapping = getInputMapping(originalTypeNo))
@@ -268,20 +254,20 @@ Block *OneToNPatternRewriter::applySignatureConversion(
 LogicalResult
 OneToNConversionPattern::matchAndRewrite(Operation *op,
                                          PatternRewriter &rewriter) const {
-  auto *typeConverter = getTypeConverter<OneToNTypeConverter>();
+  auto *typeConverter = getTypeConverter();
 
   // Construct conversion mapping for results.
   Operation::result_type_range originalResultTypes = op->getResultTypes();
   OneToNTypeMapping resultMapping(originalResultTypes);
-  if (failed(typeConverter->computeTypeMapping(originalResultTypes,
-                                               resultMapping)))
+  if (failed(typeConverter->convertSignatureArgs(originalResultTypes,
+                                                 resultMapping)))
     return failure();
 
   // Construct conversion mapping for operands.
   Operation::operand_type_range originalOperandTypes = op->getOperandTypes();
   OneToNTypeMapping operandMapping(originalOperandTypes);
-  if (failed(typeConverter->computeTypeMapping(originalOperandTypes,
-                                               operandMapping)))
+  if (failed(typeConverter->convertSignatureArgs(originalOperandTypes,
+                                                 operandMapping)))
     return failure();
 
   // Cast operands to target types.
@@ -318,7 +304,7 @@ namespace mlir {
 // inserted by this pass are annotated with a string attribute that also
 // documents which kind of the cast (source, argument, or target).
 LogicalResult
-applyPartialOneToNConversion(Operation *op, OneToNTypeConverter &typeConverter,
+applyPartialOneToNConversion(Operation *op, TypeConverter &typeConverter,
                              const FrozenRewritePatternSet &patterns) {
 #ifndef NDEBUG
   // Remember existing unrealized casts. This data structure is only used in
@@ -370,15 +356,13 @@ applyPartialOneToNConversion(Operation *op, OneToNTypeConverter &typeConverter,
       // Target materialization.
       assert(!areOperandTypesLegal && areResultsTypesLegal &&
              operands.size() == 1 && "found unexpected target cast");
-      std::optional<SmallVector<Value>> maybeResults =
-          typeConverter.materializeTargetConversion(
-              rewriter, castOp->getLoc(), resultTypes, operands.front());
-      if (!maybeResults) {
+      materializedResults = typeConverter.materializeTargetConversion(
+          rewriter, castOp->getLoc(), resultTypes, operands.front());
+      if (materializedResults.empty()) {
         emitError(castOp->getLoc())
             << "failed to create target materialization";
         return failure();
       }
-      materializedResults = maybeResults.value();
     } else {
       // Source and argument materializations.
       assert(areOperandTypesLegal && !areResultsTypesLegal &&
@@ -427,18 +411,18 @@ class FunctionOpInterfaceSignatureConversion : public OneToNConversionPattern {
                                 const OneToNTypeMapping &resultMapping,
                                 ValueRange convertedOperands) const override {
     auto funcOp = cast<FunctionOpInterface>(op);
-    auto *typeConverter = getTypeConverter<OneToNTypeConverter>();
+    auto *typeConverter = getTypeConverter();
 
     // Construct mapping for function arguments.
     OneToNTypeMapping argumentMapping(funcOp.getArgumentTypes());
-    if (failed(typeConverter->computeTypeMapping(funcOp.getArgumentTypes(),
-                                                 argumentMapping)))
+    if (failed(typeConverter->convertSignatureArgs(funcOp.getArgumentTypes(),
+                                                   argumentMapping)))
       return failure();
 
     // Construct mapping for function results.
     OneToNTypeMapping funcResultMapping(funcOp.getResultTypes());
-    if (failed(typeConverter->computeTypeMapping(funcOp.getResultTypes(),
-                                                 funcResultMapping)))
+    if (failed(typeConverter->convertSignatureArgs(funcOp.getResultTypes(),
+                                                   funcResultMapping)))
       return failure();
 
     // Nothing to do if the op doesn't have any non-identity conversions for its
diff --git a/mlir/test/lib/Conversion/OneToNTypeConversion/TestOneToNTypeConversionPass.cpp b/mlir/test/lib/Conversion/OneToNTypeConversion/TestOneToNTypeConversionPass.cpp
index 5c03ac12d1e58ce..b18dfd8bb22cb15 100644
--- a/mlir/test/lib/Conversion/OneToNTypeConversion/TestOneToNTypeConversionPass.cpp
+++ b/mlir/test/lib/Conversion/OneToNTypeConversion/TestOneToNTypeConversionPass.cpp
@@ -147,9 +147,14 @@ populateDecomposeTuplesTestPatterns(const TypeConverter &typeConverter,
 ///
 /// This function has been copied (with small adaptions) from
 /// TestDecomposeCallGraphTypes.cpp.
-static std::optional<SmallVector<Value>>
-buildGetTupleElementOps(OpBuilder &builder, TypeRange resultTypes, Value input,
-                        Location loc) {
+static SmallVector<Value> buildGetTupleElementOps(OpBuilder &builder,
+                                                  TypeRange resultTypes,
+                                                  ValueRange inputs,
+                                                  Location loc) {
+  if (inputs.size() != 1)
+    return {};
+  Value input = inputs.front();
+
   TupleType inputType = dyn_cast<TupleType>(input.getType());
   if (!inputType)
     return {};
@@ -222,7 +227,7 @@ void TestOneToNTypeConversionPass::runOnOperation() {
   auto *context = &getContext();
 
   // Assemble type converter.
-  OneToNTypeConverter typeConverter;
+  TypeConverter typeConverter;
 
   typeConverter.addConversion([](Type type) { return type; });
   typeConverter.addConversion(
@@ -234,6 +239,11 @@ void TestOneToNTypeConversionPass::runOnOperation() {
   typeConverter.addArgumentMaterialization(buildMakeTupleOp);
   typeConverter.addSourceMaterialization(buildMakeTupleOp);
   typeConverter.addTargetMaterialization(buildGetTupleElementOps);
+  // Test the other target materialization variant that takes the original type
+  // as an additional argument. This materialization function always fails.
+  typeConverter.addTargetMaterialization(
+      [](OpBuilder &builder, TypeRange resultTypes, ValueRange inputs,
+         Location loc, Type originalType) -> SmallVector<Value> { return {}; });
 
   // Assemble patterns.
   RewritePatternSet patterns(context);

From e724226da753f10fd36fbb0ea392f04ab0fdbdab Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Fri, 25 Oct 2024 12:35:33 +0100
Subject: [PATCH 33/39] [VPlan] Return cost of 0 for VPWidenCastRecipe without
 underlying value.

In some cases, VPWidenCastRecipes are created but not considered in the
legacy cost model, including truncates/extends when evaluating a reduction
in a smaller type. Return 0 for such casts for now, to avoid divergences
between VPlan and legacy cost models.

Fixes https://github.com/llvm/llvm-project/issues/113526.
---
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  5 ++
 .../LoopVectorize/X86/cost-model.ll           | 65 +++++++++++++++++++
 2 files changed, 70 insertions(+)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 0eb4f7c7c88cee7..2080b77157b6ca2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1524,6 +1524,11 @@ void VPWidenCastRecipe::execute(VPTransformState &State) {
 
 InstructionCost VPWidenCastRecipe::computeCost(ElementCount VF,
                                                VPCostContext &Ctx) const {
+  // TODO: In some cases, VPWidenCastRecipes are created but not considered in
+  // the legacy cost model, including truncates/extends when evaluating a
+  // reduction in a smaller type.
+  if (!getUnderlyingValue())
+    return 0;
   // Computes the CastContextHint from a recipes that may access memory.
   auto ComputeCCH = [&](const VPRecipeBase *R) -> TTI::CastContextHint {
     if (VF.isScalar())
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 73647919aac3602..29e54fabad0c1bb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -1037,6 +1037,71 @@ exit:
   ret i64 %red.mul
 }
 
+; Test case for https://github.com/llvm/llvm-project/issues/113526.
+define i32 @narrowed_reduction(ptr %a, i1 %cmp) #0 {
+; CHECK-LABEL: @narrowed_reduction(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP:%.*]] to i32
+; CHECK-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[CONV]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = and <16 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i32> [[VEC_PHI1]], <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP2:%.*]] = or <16 x i32> [[TMP0]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = or <16 x i32> [[TMP1]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <16 x i32> [[TMP2]] to <16 x i1>
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i1>
+; CHECK-NEXT:    [[TMP6]] = zext <16 x i1> [[TMP4]] to <16 x i32>
+; CHECK-NEXT:    [[TMP7]] = zext <16 x i1> [[TMP5]] to <16 x i32>
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc <16 x i32> [[TMP6]] to <16 x i1>
+; CHECK-NEXT:    [[TMP10:%.*]] = trunc <16 x i32> [[TMP7]] to <16 x i1>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = or <16 x i1> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[BIN_RDX]])
+; CHECK-NEXT:    [[TMP12:%.*]] = zext i1 [[TMP11]] to i32
+; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 1, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[OR13:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[OR13]], 1
+; CHECK-NEXT:    [[OR]] = or i32 [[AND]], [[CONV]]
+; CHECK-NEXT:    [[INC]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV]], 0
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[LOOP]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i32 [[OR_LCSSA]]
+;
+entry:
+  %conv = zext i1 %cmp to i32
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 1, %entry ], [ %inc, %loop ]
+  %or13 = phi i32 [ 0, %entry ], [ %or, %loop ]
+  %and = and i32 %or13, 1
+  %or = or i32 %and, %conv
+  %inc = add i32 %iv, 1
+  %ec = icmp eq i32 %iv, 0
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret i32 %or
+}
+
 declare void @llvm.assume(i1 noundef) #0
 
 attributes #0 = { "target-cpu"="penryn" }

From 75252e29ea6a0959f3c1670e641a03fc18fc65fa Mon Sep 17 00:00:00 2001
From: Jan Svoboda <jan_svoboda@apple.com>
Date: Fri, 25 Oct 2024 12:40:59 -0700
Subject: [PATCH 34/39] [clang][serialization] Bump `NUM_PREDEF_TYPE_IDS`

This fixes a build error caused by 4ac0e7e400fe2a66d1fd5d5d1fa1c899dfb16716.
---
 clang/include/clang/Serialization/ASTBitCodes.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index 99232fd21357904..3ddbc5fcd26c44f 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -1149,7 +1149,7 @@ enum PredefinedTypeIDs {
 ///
 /// Type IDs for non-predefined types will start at
 /// NUM_PREDEF_TYPE_IDs.
-const unsigned NUM_PREDEF_TYPE_IDS = 512;
+const unsigned NUM_PREDEF_TYPE_IDS = 513;
 
 // Ensure we do not overrun the predefined types we reserved
 // in the enum PredefinedTypeIDs above.

From 6c9bbbc818ae8a0d2849dbc1ebd84a220cc27d20 Mon Sep 17 00:00:00 2001
From: vporpo <vporpodas@google.com>
Date: Fri, 25 Oct 2024 12:47:19 -0700
Subject: [PATCH 35/39] [SandboxVec][Legality] Reject non-instructions
 (#113190)

---
 .../Vectorize/SandboxVectorizer/Legality.h     | 10 +++++++++-
 .../Vectorize/SandboxVectorizer/Legality.cpp   | 18 +++++++++++++++++-
 .../SandboxVectorizer/Passes/BottomUpVec.cpp   |  2 +-
 .../SandboxVectorizer/LegalityTest.cpp         | 13 ++++++++++++-
 4 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
index bcfafd75d4caaf5..d4b0b54375b0267 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
@@ -28,6 +28,7 @@ enum class LegalityResultID {
 
 /// The reason for vectorizing or not vectorizing.
 enum class ResultReason {
+  NotInstructions,
   DiffOpcodes,
   DiffTypes,
 };
@@ -46,6 +47,8 @@ struct ToStr {
 
   static const char *getVecReason(ResultReason Reason) {
     switch (Reason) {
+    case ResultReason::NotInstructions:
+      return "NotInstructions";
     case ResultReason::DiffOpcodes:
       return "DiffOpcodes";
     case ResultReason::DiffTypes:
@@ -67,6 +70,10 @@ class LegalityResult {
   LegalityResult(LegalityResultID ID) : ID(ID) {}
   friend class LegalityAnalysis;
 
+  /// We shouldn't need copies.
+  LegalityResult(const LegalityResult &) = delete;
+  LegalityResult &operator=(const LegalityResult &) = delete;
+
 public:
   virtual ~LegalityResult() {}
   LegalityResultID getSubclassID() const { return ID; }
@@ -90,6 +97,7 @@ class LegalityResultWithReason : public LegalityResult {
   friend class Pack; // For constructor.
 
 public:
+  ResultReason getReason() const { return Reason; }
 #ifndef NDEBUG
   void print(raw_ostream &OS) const override {
     LegalityResult::print(OS);
@@ -138,7 +146,7 @@ class LegalityAnalysis {
   }
   /// Checks if it's legal to vectorize the instructions in \p Bndl.
   /// \Returns a LegalityResult object owned by LegalityAnalysis.
-  LegalityResult &canVectorize(ArrayRef<Value *> Bndl);
+  const LegalityResult &canVectorize(ArrayRef<Value *> Bndl);
 };
 
 } // namespace llvm::sandboxir
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
index 0e2cd83c37b0cd0..f1c4577cece78af 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
@@ -7,11 +7,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h"
+#include "llvm/SandboxIR/Instruction.h"
+#include "llvm/SandboxIR/Utils.h"
 #include "llvm/SandboxIR/Value.h"
 #include "llvm/Support/Debug.h"
 
 namespace llvm::sandboxir {
 
+#define DEBUG_TYPE "SBVec:Legality"
+
 #ifndef NDEBUG
 void LegalityResult::dump() const {
   print(dbgs());
@@ -26,7 +30,19 @@ LegalityAnalysis::notVectorizableBasedOnOpcodesAndTypes(
   return std::nullopt;
 }
 
-LegalityResult &LegalityAnalysis::canVectorize(ArrayRef<Value *> Bndl) {
+static void dumpBndl(ArrayRef<Value *> Bndl) {
+  for (auto *V : Bndl)
+    dbgs() << *V << "\n";
+}
+
+const LegalityResult &LegalityAnalysis::canVectorize(ArrayRef<Value *> Bndl) {
+  // If Bndl contains values other than instructions, we need to Pack.
+  if (any_of(Bndl, [](auto *V) { return !isa<Instruction>(V); })) {
+    LLVM_DEBUG(dbgs() << "Not vectorizing: Not Instructions:\n";
+               dumpBndl(Bndl););
+    return createLegalityResult<Pack>(ResultReason::NotInstructions);
+  }
+
   if (auto ReasonOpt = notVectorizableBasedOnOpcodesAndTypes(Bndl))
     return createLegalityResult<Pack>(*ReasonOpt);
 
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
index f11420e47f3e1f9..ede41cd661b559a 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
@@ -40,7 +40,7 @@ static SmallVector<Value *, 4> getOperand(ArrayRef<Value *> Bndl,
 }
 
 void BottomUpVec::vectorizeRec(ArrayRef<Value *> Bndl) {
-  auto LegalityRes = Legality.canVectorize(Bndl);
+  const auto &LegalityRes = Legality.canVectorize(Bndl);
   switch (LegalityRes.getSubclassID()) {
   case LegalityResultID::Widen: {
     auto *I = cast<Instruction>(Bndl[0]);
diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp
index 76e5a5ce5aed920..56c6bf5f1ef1f5c 100644
--- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp
@@ -52,8 +52,16 @@ define void @foo(ptr %ptr) {
   auto *St1 = cast<sandboxir::StoreInst>(&*It++);
 
   sandboxir::LegalityAnalysis Legality;
-  auto Result = Legality.canVectorize({St0, St1});
+  const auto &Result = Legality.canVectorize({St0, St1});
   EXPECT_TRUE(isa<sandboxir::Widen>(Result));
+
+  {
+    // Check NotInstructions
+    auto &Result = Legality.canVectorize({F, St0});
+    EXPECT_TRUE(isa<sandboxir::Pack>(Result));
+    EXPECT_EQ(cast<sandboxir::Pack>(Result).getReason(),
+              sandboxir::ResultReason::NotInstructions);
+  }
 }
 
 #ifndef NDEBUG
@@ -68,6 +76,9 @@ TEST_F(LegalityTest, LegalityResultDump) {
   sandboxir::LegalityAnalysis Legality;
   EXPECT_TRUE(
       Matches(Legality.createLegalityResult<sandboxir::Widen>(), "Widen"));
+  EXPECT_TRUE(Matches(Legality.createLegalityResult<sandboxir::Pack>(
+                          sandboxir::ResultReason::NotInstructions),
+                      "Pack Reason: NotInstructions"));
   EXPECT_TRUE(Matches(Legality.createLegalityResult<sandboxir::Pack>(
                           sandboxir::ResultReason::DiffOpcodes),
                       "Pack Reason: DiffOpcodes"));

From eb9f4756bc3daaa4b19f4f46521dc05180814de4 Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vporpodas@google.com>
Date: Fri, 25 Oct 2024 12:52:31 -0700
Subject: [PATCH 36/39] Revert "[SandboxVec][Legality] Reject non-instructions
 (#113190)"

This reverts commit 6c9bbbc818ae8a0d2849dbc1ebd84a220cc27d20.
---
 .../Vectorize/SandboxVectorizer/Legality.h     | 10 +---------
 .../Vectorize/SandboxVectorizer/Legality.cpp   | 18 +-----------------
 .../SandboxVectorizer/Passes/BottomUpVec.cpp   |  2 +-
 .../SandboxVectorizer/LegalityTest.cpp         | 13 +------------
 4 files changed, 4 insertions(+), 39 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
index d4b0b54375b0267..bcfafd75d4caaf5 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
@@ -28,7 +28,6 @@ enum class LegalityResultID {
 
 /// The reason for vectorizing or not vectorizing.
 enum class ResultReason {
-  NotInstructions,
   DiffOpcodes,
   DiffTypes,
 };
@@ -47,8 +46,6 @@ struct ToStr {
 
   static const char *getVecReason(ResultReason Reason) {
     switch (Reason) {
-    case ResultReason::NotInstructions:
-      return "NotInstructions";
     case ResultReason::DiffOpcodes:
       return "DiffOpcodes";
     case ResultReason::DiffTypes:
@@ -70,10 +67,6 @@ class LegalityResult {
   LegalityResult(LegalityResultID ID) : ID(ID) {}
   friend class LegalityAnalysis;
 
-  /// We shouldn't need copies.
-  LegalityResult(const LegalityResult &) = delete;
-  LegalityResult &operator=(const LegalityResult &) = delete;
-
 public:
   virtual ~LegalityResult() {}
   LegalityResultID getSubclassID() const { return ID; }
@@ -97,7 +90,6 @@ class LegalityResultWithReason : public LegalityResult {
   friend class Pack; // For constructor.
 
 public:
-  ResultReason getReason() const { return Reason; }
 #ifndef NDEBUG
   void print(raw_ostream &OS) const override {
     LegalityResult::print(OS);
@@ -146,7 +138,7 @@ class LegalityAnalysis {
   }
   /// Checks if it's legal to vectorize the instructions in \p Bndl.
   /// \Returns a LegalityResult object owned by LegalityAnalysis.
-  const LegalityResult &canVectorize(ArrayRef<Value *> Bndl);
+  LegalityResult &canVectorize(ArrayRef<Value *> Bndl);
 };
 
 } // namespace llvm::sandboxir
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
index f1c4577cece78af..0e2cd83c37b0cd0 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
@@ -7,15 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h"
-#include "llvm/SandboxIR/Instruction.h"
-#include "llvm/SandboxIR/Utils.h"
 #include "llvm/SandboxIR/Value.h"
 #include "llvm/Support/Debug.h"
 
 namespace llvm::sandboxir {
 
-#define DEBUG_TYPE "SBVec:Legality"
-
 #ifndef NDEBUG
 void LegalityResult::dump() const {
   print(dbgs());
@@ -30,19 +26,7 @@ LegalityAnalysis::notVectorizableBasedOnOpcodesAndTypes(
   return std::nullopt;
 }
 
-static void dumpBndl(ArrayRef<Value *> Bndl) {
-  for (auto *V : Bndl)
-    dbgs() << *V << "\n";
-}
-
-const LegalityResult &LegalityAnalysis::canVectorize(ArrayRef<Value *> Bndl) {
-  // If Bndl contains values other than instructions, we need to Pack.
-  if (any_of(Bndl, [](auto *V) { return !isa<Instruction>(V); })) {
-    LLVM_DEBUG(dbgs() << "Not vectorizing: Not Instructions:\n";
-               dumpBndl(Bndl););
-    return createLegalityResult<Pack>(ResultReason::NotInstructions);
-  }
-
+LegalityResult &LegalityAnalysis::canVectorize(ArrayRef<Value *> Bndl) {
   if (auto ReasonOpt = notVectorizableBasedOnOpcodesAndTypes(Bndl))
     return createLegalityResult<Pack>(*ReasonOpt);
 
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
index ede41cd661b559a..f11420e47f3e1f9 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
@@ -40,7 +40,7 @@ static SmallVector<Value *, 4> getOperand(ArrayRef<Value *> Bndl,
 }
 
 void BottomUpVec::vectorizeRec(ArrayRef<Value *> Bndl) {
-  const auto &LegalityRes = Legality.canVectorize(Bndl);
+  auto LegalityRes = Legality.canVectorize(Bndl);
   switch (LegalityRes.getSubclassID()) {
   case LegalityResultID::Widen: {
     auto *I = cast<Instruction>(Bndl[0]);
diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp
index 56c6bf5f1ef1f5c..76e5a5ce5aed920 100644
--- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp
@@ -52,16 +52,8 @@ define void @foo(ptr %ptr) {
   auto *St1 = cast<sandboxir::StoreInst>(&*It++);
 
   sandboxir::LegalityAnalysis Legality;
-  const auto &Result = Legality.canVectorize({St0, St1});
+  auto Result = Legality.canVectorize({St0, St1});
   EXPECT_TRUE(isa<sandboxir::Widen>(Result));
-
-  {
-    // Check NotInstructions
-    auto &Result = Legality.canVectorize({F, St0});
-    EXPECT_TRUE(isa<sandboxir::Pack>(Result));
-    EXPECT_EQ(cast<sandboxir::Pack>(Result).getReason(),
-              sandboxir::ResultReason::NotInstructions);
-  }
 }
 
 #ifndef NDEBUG
@@ -76,9 +68,6 @@ TEST_F(LegalityTest, LegalityResultDump) {
   sandboxir::LegalityAnalysis Legality;
   EXPECT_TRUE(
       Matches(Legality.createLegalityResult<sandboxir::Widen>(), "Widen"));
-  EXPECT_TRUE(Matches(Legality.createLegalityResult<sandboxir::Pack>(
-                          sandboxir::ResultReason::NotInstructions),
-                      "Pack Reason: NotInstructions"));
   EXPECT_TRUE(Matches(Legality.createLegalityResult<sandboxir::Pack>(
                           sandboxir::ResultReason::DiffOpcodes),
                       "Pack Reason: DiffOpcodes"));

From 1540f772c793b3a29ae5d57e99456ec5d7ef4b39 Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas <vporpodas@google.com>
Date: Fri, 25 Oct 2024 12:53:26 -0700
Subject: [PATCH 37/39] Reapply "[SandboxVec][Legality] Reject non-instructions
 (#113190)"

This reverts commit eb9f4756bc3daaa4b19f4f46521dc05180814de4.
---
 .../Vectorize/SandboxVectorizer/Legality.h    | 10 +++++++++-
 .../Vectorize/SandboxVectorizer/Legality.cpp  | 20 ++++++++++++++++++-
 .../SandboxVectorizer/Passes/BottomUpVec.cpp  |  2 +-
 .../SandboxVectorizer/LegalityTest.cpp        | 13 +++++++++++-
 4 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
index bcfafd75d4caaf5..d4b0b54375b0267 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
@@ -28,6 +28,7 @@ enum class LegalityResultID {
 
 /// The reason for vectorizing or not vectorizing.
 enum class ResultReason {
+  NotInstructions,
   DiffOpcodes,
   DiffTypes,
 };
@@ -46,6 +47,8 @@ struct ToStr {
 
   static const char *getVecReason(ResultReason Reason) {
     switch (Reason) {
+    case ResultReason::NotInstructions:
+      return "NotInstructions";
     case ResultReason::DiffOpcodes:
       return "DiffOpcodes";
     case ResultReason::DiffTypes:
@@ -67,6 +70,10 @@ class LegalityResult {
   LegalityResult(LegalityResultID ID) : ID(ID) {}
   friend class LegalityAnalysis;
 
+  /// We shouldn't need copies.
+  LegalityResult(const LegalityResult &) = delete;
+  LegalityResult &operator=(const LegalityResult &) = delete;
+
 public:
   virtual ~LegalityResult() {}
   LegalityResultID getSubclassID() const { return ID; }
@@ -90,6 +97,7 @@ class LegalityResultWithReason : public LegalityResult {
   friend class Pack; // For constructor.
 
 public:
+  ResultReason getReason() const { return Reason; }
 #ifndef NDEBUG
   void print(raw_ostream &OS) const override {
     LegalityResult::print(OS);
@@ -138,7 +146,7 @@ class LegalityAnalysis {
   }
   /// Checks if it's legal to vectorize the instructions in \p Bndl.
   /// \Returns a LegalityResult object owned by LegalityAnalysis.
-  LegalityResult &canVectorize(ArrayRef<Value *> Bndl);
+  const LegalityResult &canVectorize(ArrayRef<Value *> Bndl);
 };
 
 } // namespace llvm::sandboxir
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
index 0e2cd83c37b0cd0..e4546c2f98113ee 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp
@@ -7,11 +7,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h"
+#include "llvm/SandboxIR/Instruction.h"
+#include "llvm/SandboxIR/Utils.h"
 #include "llvm/SandboxIR/Value.h"
 #include "llvm/Support/Debug.h"
 
 namespace llvm::sandboxir {
 
+#define DEBUG_TYPE "SBVec:Legality"
+
 #ifndef NDEBUG
 void LegalityResult::dump() const {
   print(dbgs());
@@ -26,7 +30,21 @@ LegalityAnalysis::notVectorizableBasedOnOpcodesAndTypes(
   return std::nullopt;
 }
 
-LegalityResult &LegalityAnalysis::canVectorize(ArrayRef<Value *> Bndl) {
+#ifndef NDEBUG
+static void dumpBndl(ArrayRef<Value *> Bndl) {
+  for (auto *V : Bndl)
+    dbgs() << *V << "\n";
+}
+#endif // NDEBUG
+
+const LegalityResult &LegalityAnalysis::canVectorize(ArrayRef<Value *> Bndl) {
+  // If Bndl contains values other than instructions, we need to Pack.
+  if (any_of(Bndl, [](auto *V) { return !isa<Instruction>(V); })) {
+    LLVM_DEBUG(dbgs() << "Not vectorizing: Not Instructions:\n";
+               dumpBndl(Bndl););
+    return createLegalityResult<Pack>(ResultReason::NotInstructions);
+  }
+
   if (auto ReasonOpt = notVectorizableBasedOnOpcodesAndTypes(Bndl))
     return createLegalityResult<Pack>(*ReasonOpt);
 
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
index f11420e47f3e1f9..ede41cd661b559a 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
@@ -40,7 +40,7 @@ static SmallVector<Value *, 4> getOperand(ArrayRef<Value *> Bndl,
 }
 
 void BottomUpVec::vectorizeRec(ArrayRef<Value *> Bndl) {
-  auto LegalityRes = Legality.canVectorize(Bndl);
+  const auto &LegalityRes = Legality.canVectorize(Bndl);
   switch (LegalityRes.getSubclassID()) {
   case LegalityResultID::Widen: {
     auto *I = cast<Instruction>(Bndl[0]);
diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp
index 76e5a5ce5aed920..56c6bf5f1ef1f5c 100644
--- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp
@@ -52,8 +52,16 @@ define void @foo(ptr %ptr) {
   auto *St1 = cast<sandboxir::StoreInst>(&*It++);
 
   sandboxir::LegalityAnalysis Legality;
-  auto Result = Legality.canVectorize({St0, St1});
+  const auto &Result = Legality.canVectorize({St0, St1});
   EXPECT_TRUE(isa<sandboxir::Widen>(Result));
+
+  {
+    // Check NotInstructions
+    auto &Result = Legality.canVectorize({F, St0});
+    EXPECT_TRUE(isa<sandboxir::Pack>(Result));
+    EXPECT_EQ(cast<sandboxir::Pack>(Result).getReason(),
+              sandboxir::ResultReason::NotInstructions);
+  }
 }
 
 #ifndef NDEBUG
@@ -68,6 +76,9 @@ TEST_F(LegalityTest, LegalityResultDump) {
   sandboxir::LegalityAnalysis Legality;
   EXPECT_TRUE(
       Matches(Legality.createLegalityResult<sandboxir::Widen>(), "Widen"));
+  EXPECT_TRUE(Matches(Legality.createLegalityResult<sandboxir::Pack>(
+                          sandboxir::ResultReason::NotInstructions),
+                      "Pack Reason: NotInstructions"));
   EXPECT_TRUE(Matches(Legality.createLegalityResult<sandboxir::Pack>(
                           sandboxir::ResultReason::DiffOpcodes),
                       "Pack Reason: DiffOpcodes"));

From cfde4fbccf5d8d949a8cade0a4f8ef9b0f47ca73 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Fri, 25 Oct 2024 16:46:38 -0400
Subject: [PATCH 38/39] [libc++] Remove obsolete Solaris and Newlib support for
 locales (#113721)

The solaris header file doesn't even exist, so that's definitely dead
code. The newlib header is empty, which means that localization can't
work on that platform. If someone is using libc++ with Newlib, they must
be providing LIBCXX_HAS_NO_LOCALIZATION today for anything to work, so
that header is basically dead code as well.
---
 libcxx/include/CMakeLists.txt                        |  1 -
 libcxx/include/__locale_dir/locale_base_api.h        |  4 ----
 libcxx/include/__locale_dir/locale_base_api/newlib.h | 12 ------------
 libcxx/include/module.modulemap                      |  1 -
 4 files changed, 18 deletions(-)
 delete mode 100644 libcxx/include/__locale_dir/locale_base_api/newlib.h

diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 63aa74e09bb1a27..506ed721d0843ec 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -497,7 +497,6 @@ set(files
   __locale_dir/locale_base_api/fuchsia.h
   __locale_dir/locale_base_api/ibm.h
   __locale_dir/locale_base_api/musl.h
-  __locale_dir/locale_base_api/newlib.h
   __locale_dir/locale_base_api/openbsd.h
   __locale_dir/locale_base_api/win32.h
   __locale_dir/locale_guard.h
diff --git a/libcxx/include/__locale_dir/locale_base_api.h b/libcxx/include/__locale_dir/locale_base_api.h
index 8c000c558c52793..eab7fa8bf62faec 100644
--- a/libcxx/include/__locale_dir/locale_base_api.h
+++ b/libcxx/include/__locale_dir/locale_base_api.h
@@ -15,10 +15,6 @@
 #  include <__locale_dir/locale_base_api/ibm.h>
 #elif defined(__ANDROID__)
 #  include <__locale_dir/locale_base_api/android.h>
-#elif defined(__sun__)
-#  include <__locale_dir/locale_base_api/solaris.h>
-#elif defined(_NEWLIB_VERSION)
-#  include <__locale_dir/locale_base_api/newlib.h>
 #elif defined(__OpenBSD__)
 #  include <__locale_dir/locale_base_api/openbsd.h>
 #elif defined(__Fuchsia__)
diff --git a/libcxx/include/__locale_dir/locale_base_api/newlib.h b/libcxx/include/__locale_dir/locale_base_api/newlib.h
deleted file mode 100644
index 7da10e5889843dd..000000000000000
--- a/libcxx/include/__locale_dir/locale_base_api/newlib.h
+++ /dev/null
@@ -1,12 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_NEWLIB_H
-#define _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_NEWLIB_H
-
-#endif // _LIBCPP___LOCALE_DIR_LOCALE_BASE_API_NEWLIB_H
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index c79070c318759db..f92e8bf5fc9aba5 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -1451,7 +1451,6 @@ module std [system] {
       textual header "__locale_dir/locale_base_api/fuchsia.h"
       textual header "__locale_dir/locale_base_api/ibm.h"
       textual header "__locale_dir/locale_base_api/musl.h"
-      textual header "__locale_dir/locale_base_api/newlib.h"
       textual header "__locale_dir/locale_base_api/openbsd.h"
       textual header "__locale_dir/locale_base_api/win32.h"
     }

From 1bc2cd98c58a1059170dc38697c7a29a8e21160b Mon Sep 17 00:00:00 2001
From: Dan Gohman <dev@sunfishcode.online>
Date: Fri, 25 Oct 2024 13:52:51 -0700
Subject: [PATCH 39/39] [WebAssembly] Enable nontrapping-fptoint and
 bulk-memory by default. (#112049)

We were prepared to enable these features [back in February], but they
got pulled for what appear to be unrelated reasons. So let's have
another try at enabling them!

Another motivation here is that it'd be convenient for the [Lime1
proposal] if "lime1" is close to a subset of "generic" (with "generic"
missing only extended-const).

[back in February]:
https://github.com/WebAssembly/tool-conventions/issues/158#issuecomment-1931119512
[Lime1 proposal]: https://github.com/llvm/llvm-project/pull/112035
---
 clang/docs/ReleaseNotes.rst                   |  9 ++++++
 clang/lib/Basic/Targets/WebAssembly.cpp       |  4 +--
 .../test/Preprocessor/wasm-target-features.c  |  4 +--
 lld/test/wasm/custom-section-name.ll          |  2 +-
 lld/test/wasm/data-segments.ll                |  2 +-
 lld/test/wasm/lto/Inputs/libcall-archive.ll   |  4 ++-
 lld/test/wasm/lto/libcall-archive.ll          |  4 ++-
 lld/test/wasm/lto/stub-library-libcall.s      |  4 +--
 llvm/docs/ReleaseNotes.md                     |  9 ++++++
 llvm/lib/Target/WebAssembly/WebAssembly.td    |  3 +-
 .../WebAssemblyFixFunctionBitcasts.cpp        |  2 ++
 .../WebAssembly/WebAssemblyTargetMachine.cpp  | 29 +++++++++++++++----
 .../WebAssembly/cfg-stackify-eh-legacy.ll     | 10 +++----
 .../WebAssembly/target-features-cpus.ll       |  8 ++++-
 .../WebAssembly/extern-functype-intrinsic.ll  |  4 +--
 llvm/test/MC/WebAssembly/libcall.ll           |  2 +-
 16 files changed, 74 insertions(+), 26 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 170c4cc280537f9..6a95337815174bc 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -690,6 +690,15 @@ NetBSD Support
 WebAssembly Support
 ^^^^^^^^^^^^^^^^^^^
 
+The default target CPU, "generic", now enables the ``-mnontrapping-fptoint``
+and ``-mbulk-memory`` flags, which correspond to the
+`Non-trapping float-to-int Conversions`_ and `Bulk Memory Operations`_
+language features, respectively; both are `widely implemented in engines`_.
+
+.. _Bulk Memory Operations: https://github.com/WebAssembly/bulk-memory-operations/blob/master/proposals/bulk-memory-operations/Overview.md
+.. _Non-trapping float-to-int Conversions: https://github.com/WebAssembly/spec/blob/master/proposals/nontrapping-float-to-int-conversion/Overview.md
+.. _widely implemented in engines: https://webassembly.org/features/
+
 AVR Support
 ^^^^^^^^^^^
 
diff --git a/clang/lib/Basic/Targets/WebAssembly.cpp b/clang/lib/Basic/Targets/WebAssembly.cpp
index 4c9df6007b78231..0b380bdf835ffbd 100644
--- a/clang/lib/Basic/Targets/WebAssembly.cpp
+++ b/clang/lib/Basic/Targets/WebAssembly.cpp
@@ -154,20 +154,20 @@ bool WebAssemblyTargetInfo::initFeatureMap(
     llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU,
     const std::vector<std::string> &FeaturesVec) const {
   auto addGenericFeatures = [&]() {
+    Features["bulk-memory"] = true;
     Features["multivalue"] = true;
     Features["mutable-globals"] = true;
+    Features["nontrapping-fptoint"] = true;
     Features["reference-types"] = true;
     Features["sign-ext"] = true;
   };
   auto addBleedingEdgeFeatures = [&]() {
     addGenericFeatures();
     Features["atomics"] = true;
-    Features["bulk-memory"] = true;
     Features["exception-handling"] = true;
     Features["extended-const"] = true;
     Features["fp16"] = true;
     Features["multimemory"] = true;
-    Features["nontrapping-fptoint"] = true;
     Features["tail-call"] = true;
     Features["wide-arithmetic"] = true;
     setSIMDLevel(Features, RelaxedSIMD, true);
diff --git a/clang/test/Preprocessor/wasm-target-features.c b/clang/test/Preprocessor/wasm-target-features.c
index 14d2fbf4423d32b..71b7cf6a5d43cc1 100644
--- a/clang/test/Preprocessor/wasm-target-features.c
+++ b/clang/test/Preprocessor/wasm-target-features.c
@@ -163,8 +163,10 @@
 // RUN:     -target wasm64-unknown-unknown -mcpu=generic \
 // RUN:   | FileCheck %s -check-prefix=GENERIC-INCLUDE
 //
+// GENERIC-INCLUDE-DAG: #define __wasm_bulk_memory__ 1{{$}}
 // GENERIC-INCLUDE-DAG: #define __wasm_multivalue__ 1{{$}}
 // GENERIC-INCLUDE-DAG: #define __wasm_mutable_globals__ 1{{$}}
+// GENERIC-INCLUDE-DAG: #define __wasm_nontrapping_fptoint__ 1{{$}}
 // GENERIC-INCLUDE-DAG: #define __wasm_reference_types__ 1{{$}}
 // GENERIC-INCLUDE-DAG: #define __wasm_sign_ext__ 1{{$}}
 //
@@ -176,12 +178,10 @@
 // RUN:   | FileCheck %s -check-prefix=GENERIC
 //
 // GENERIC-NOT: #define __wasm_atomics__ 1{{$}}
-// GENERIC-NOT: #define __wasm_bulk_memory__ 1{{$}}
 // GENERIC-NOT: #define __wasm_exception_handling__ 1{{$}}
 // GENERIC-NOT: #define __wasm_extended_const__ 1{{$}}
 // GENERIC-NOT: #define __wasm__fp16__ 1{{$}}
 // GENERIC-NOT: #define __wasm_multimemory__ 1{{$}}
-// GENERIC-NOT: #define __wasm_nontrapping_fptoint__ 1{{$}}
 // GENERIC-NOT: #define __wasm_relaxed_simd__ 1{{$}}
 // GENERIC-NOT: #define __wasm_simd128__ 1{{$}}
 // GENERIC-NOT: #define __wasm_tail_call__ 1{{$}}
diff --git a/lld/test/wasm/custom-section-name.ll b/lld/test/wasm/custom-section-name.ll
index b860ef5a83e8364..8799fbf36056d1d 100644
--- a/lld/test/wasm/custom-section-name.ll
+++ b/lld/test/wasm/custom-section-name.ll
@@ -1,4 +1,4 @@
-; RUN: llc -filetype=obj %s -o %t.o
+; RUN: llc -filetype=obj -mattr=-bulk-memory %s -o %t.o
 ; RUN: wasm-ld -no-gc-sections --no-entry -o %t.wasm %t.o
 ; RUN: obj2yaml %t.wasm | FileCheck %s --check-prefixes=CHECK,NO-BSS
 ; RUN: wasm-ld -no-gc-sections --no-entry --import-memory -o %t.bss.wasm %t.o
diff --git a/lld/test/wasm/data-segments.ll b/lld/test/wasm/data-segments.ll
index 670ac3c1f373faf..41868a0b2b50b6f 100644
--- a/lld/test/wasm/data-segments.ll
+++ b/lld/test/wasm/data-segments.ll
@@ -1,4 +1,4 @@
-; RUN: llc --mtriple=wasm32-unknown-unknown -filetype=obj %s -o %t.atomics.o -mattr=+atomics
+; RUN: llc --mtriple=wasm32-unknown-unknown -filetype=obj %s -o %t.atomics.o -mattr=+atomics,-bulk-memory
 ; RUN: llc --mtriple=wasm32-unknown-unknown -filetype=obj %s -o %t.bulk-mem.o -mattr=+bulk-memory
 ; RUN: llc --mtriple=wasm64-unknown-unknown -filetype=obj %s -o %t.bulk-mem64.o -mattr=+bulk-memory
 ; RUN: llc --mtriple=wasm32-unknown-unknown -filetype=obj %s -o %t.atomics.bulk-mem.o -mattr=+atomics,+bulk-memory
diff --git a/lld/test/wasm/lto/Inputs/libcall-archive.ll b/lld/test/wasm/lto/Inputs/libcall-archive.ll
index 9d05efdeae0806e..7d8c34196dfe49a 100644
--- a/lld/test/wasm/lto/Inputs/libcall-archive.ll
+++ b/lld/test/wasm/lto/Inputs/libcall-archive.ll
@@ -1,6 +1,8 @@
 target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
 target triple = "wasm32-unknown-unknown"
 
-define void @memcpy() {
+define void @memcpy() #0 {
   ret void
 }
+
+attributes #0 = { "target-features"="-bulk-memory" }
diff --git a/lld/test/wasm/lto/libcall-archive.ll b/lld/test/wasm/lto/libcall-archive.ll
index 2f785b98976ec88..5c46d2f7ed78381 100644
--- a/lld/test/wasm/lto/libcall-archive.ll
+++ b/lld/test/wasm/lto/libcall-archive.ll
@@ -8,7 +8,7 @@
 target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
 target triple = "wasm32-unknown-unknown"
 
-define void @_start(ptr %a, ptr %b) {
+define void @_start(ptr %a, ptr %b) #0 {
 entry:
   call void @llvm.memcpy.p0.p0.i64(ptr %a, ptr %b, i64 1024, i1 false)
   ret void
@@ -16,6 +16,8 @@ entry:
 
 declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1)
 
+attributes #0 = { "target-features"="-bulk-memory" }
+
 ; CHECK:       - Type:            CUSTOM
 ; CHECK-NEXT:    Name:            name
 ; CHECK-NEXT:    FunctionNames:
diff --git a/lld/test/wasm/lto/stub-library-libcall.s b/lld/test/wasm/lto/stub-library-libcall.s
index ce88a32dd99dc7b..d65983c0cf5bf52 100644
--- a/lld/test/wasm/lto/stub-library-libcall.s
+++ b/lld/test/wasm/lto/stub-library-libcall.s
@@ -2,7 +2,7 @@
 # RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t_main.o %t/main.s
 # RUN: llvm-as %S/Inputs/foo.ll -o %t_foo.o
 # RUN: llvm-as %S/Inputs/libcall.ll -o %t_libcall.o
-# RUN: wasm-ld %t_main.o %t_libcall.o %t_foo.o %p/Inputs/stub.so -o %t.wasm
+# RUN: wasm-ld -mllvm -mattr=-bulk-memory %t_main.o %t_libcall.o %t_foo.o %p/Inputs/stub.so -o %t.wasm
 # RUN: obj2yaml %t.wasm | FileCheck %s
 
 # The function `func_with_libcall` will generate an undefined reference to
@@ -12,7 +12,7 @@
 # If %t_foo.o is not included in the link we get an undefined symbol reported
 # to the dependency of memcpy on the foo export:
 
-# RUN: not wasm-ld %t_main.o %t_libcall.o %p/Inputs/stub.so -o %t.wasm 2>&1 | FileCheck --check-prefix=MISSING %s
+# RUN: not wasm-ld -mllvm -mattr=-bulk-memory %t_main.o %t_libcall.o %p/Inputs/stub.so -o %t.wasm 2>&1 | FileCheck --check-prefix=MISSING %s
 # MISSING: stub.so: undefined symbol: foo. Required by memcpy
 
 #--- main.s
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index be51b0af56ddbf7..e3d93f0dfd0ec55 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -180,6 +180,15 @@ Changes to the RISC-V Backend
 Changes to the WebAssembly Backend
 ----------------------------------
 
+The default target CPU, "generic", now enables the `-mbulk-memory` and
+`-mnontrapping-fptoint` flags, which correspond to the [Bulk Memory Operations]
+and [Non-trapping float-to-int Conversions] language features, respectively.
+Both features are [widely implemented in engines].
+
+[Bulk Memory Operations]: https://github.com/WebAssembly/bulk-memory-operations/blob/master/proposals/bulk-memory-operations/Overview.md
+[Non-trapping float-to-int Conversions]: https://github.com/WebAssembly/spec/blob/master/proposals/nontrapping-float-to-int-conversion/Overview.md
+[widely implemented in engines]: https://webassembly.org/features/
+
 Changes to the Windows Target
 -----------------------------
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td
index 37d99690c25b1fa..88628f2a7935453 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.td
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.td
@@ -114,7 +114,8 @@ def : ProcessorModel<"mvp", NoSchedModel, []>;
 // consideration given to available support in relevant engines and tools, and
 // the importance of the features.
 def : ProcessorModel<"generic", NoSchedModel,
-                      [FeatureMultivalue, FeatureMutableGlobals,
+                      [FeatureBulkMemory, FeatureMultivalue,
+                       FeatureMutableGlobals, FeatureNontrappingFPToInt,
                        FeatureReferenceTypes, FeatureSignExt]>;
 
 // Latest and greatest experimental version of WebAssembly. Bugs included!
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
index a3cc9bae470859b..7c3e8d18ad276bb 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
@@ -111,6 +111,7 @@ static Function *createWrapper(Function *F, FunctionType *Ty) {
 
   Function *Wrapper = Function::Create(Ty, Function::PrivateLinkage,
                                        F->getName() + "_bitcast", M);
+  Wrapper->setAttributes(F->getAttributes());
   BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper);
   const DataLayout &DL = BB->getDataLayout();
 
@@ -201,6 +202,7 @@ static Function *createWrapper(Function *F, FunctionType *Ty) {
     Wrapper->eraseFromParent();
     Wrapper = Function::Create(Ty, Function::PrivateLinkage,
                                F->getName() + "_bitcast_invalid", M);
+    Wrapper->setAttributes(F->getAttributes());
     BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper);
     new UnreachableInst(M->getContext(), BB);
     Wrapper->setName(F->getName() + "_bitcast_invalid");
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 3fe6ccf1c608e1e..83cd57d0bbdd557 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -233,13 +233,30 @@ class CoalesceFeaturesAndStripAtomics final : public ModulePass {
 
 private:
   FeatureBitset coalesceFeatures(const Module &M) {
-    FeatureBitset Features =
-        WasmTM
-            ->getSubtargetImpl(std::string(WasmTM->getTargetCPU()),
-                               std::string(WasmTM->getTargetFeatureString()))
-            ->getFeatureBits();
-    for (auto &F : M)
+    // Union the features of all defined functions. Start with an empty set, so
+    // that if a feature is disabled in every function, we'll compute it as
+    // disabled. If any function lacks a target-features attribute, it'll
+    // default to the target CPU from the `TargetMachine`.
+    FeatureBitset Features;
+    bool AnyDefinedFuncs = false;
+    for (auto &F : M) {
+      if (F.isDeclaration())
+        continue;
+
       Features |= WasmTM->getSubtargetImpl(F)->getFeatureBits();
+      AnyDefinedFuncs = true;
+    }
+
+    // If we have no defined functions, use the target CPU and feature string
+    // from the `TargetMachine`.
+    if (!AnyDefinedFuncs) {
+      Features =
+          WasmTM
+              ->getSubtargetImpl(std::string(WasmTM->getTargetCPU()),
+                                 std::string(WasmTM->getTargetFeatureString()))
+              ->getFeatureBits();
+    }
+
     return Features;
   }
 
diff --git a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll
index cef92f459e4aa37..24a08267db6fbf7 100644
--- a/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll
+++ b/llvm/test/CodeGen/WebAssembly/cfg-stackify-eh-legacy.ll
@@ -1,9 +1,9 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling | FileCheck %s
-; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling
-; RUN: llc < %s -O0 -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -verify-machineinstrs -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling | FileCheck %s --check-prefix=NOOPT
-; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling -wasm-disable-ehpad-sort -stats 2>&1 | FileCheck %s --check-prefix=NOSORT
-; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling -wasm-disable-ehpad-sort | FileCheck %s --check-prefix=NOSORT-LOCALS
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling,bulk-memory | FileCheck %s
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling,bulk-memory
+; RUN: llc < %s -O0 -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -verify-machineinstrs -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling,-bulk-memory | FileCheck %s --check-prefix=NOOPT
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling,-bulk-memory -wasm-disable-ehpad-sort -stats 2>&1 | FileCheck %s --check-prefix=NOSORT
+; RUN: llc < %s -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 -wasm-enable-eh -exception-model=wasm -mattr=+exception-handling,-bulk-memory -wasm-disable-ehpad-sort | FileCheck %s --check-prefix=NOSORT-LOCALS
 
 target triple = "wasm32-unknown-unknown"
 
diff --git a/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll b/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll
index 77d1564409f78cc..ba10dd94a9838dc 100644
--- a/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll
+++ b/llvm/test/CodeGen/WebAssembly/target-features-cpus.ll
@@ -13,7 +13,10 @@ target triple = "wasm32-unknown-unknown"
 
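-; generic: +multivalue, +mutable-globals, +reference-types, +sign-ext
+; generic: +bulk-memory, +multivalue, +mutable-globals, +nontrapping-fptoint, +reference-types, +sign-ext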
 ; GENERIC-LABEL: .custom_section.target_features,"",@
-; GENERIC-NEXT: .int8  4
+; GENERIC-NEXT: .int8  6
+; GENERIC-NEXT: .int8  43
+; GENERIC-NEXT: .int8  11
+; GENERIC-NEXT: .ascii  "bulk-memory"
 ; GENERIC-NEXT: .int8  43
 ; GENERIC-NEXT: .int8  10
 ; GENERIC-NEXT: .ascii  "multivalue"
@@ -21,6 +24,9 @@ target triple = "wasm32-unknown-unknown"
 ; GENERIC-NEXT: .int8  15
 ; GENERIC-NEXT: .ascii  "mutable-globals"
 ; GENERIC-NEXT: .int8  43
+; GENERIC-NEXT: .int8  19
+; GENERIC-NEXT: .ascii  "nontrapping-fptoint"
+; GENERIC-NEXT: .int8  43
 ; GENERIC-NEXT: .int8  15
 ; GENERIC-NEXT: .ascii  "reference-types"
 ; GENERIC-NEXT: .int8  43
diff --git a/llvm/test/MC/WebAssembly/extern-functype-intrinsic.ll b/llvm/test/MC/WebAssembly/extern-functype-intrinsic.ll
index 320b65356ba9f37..b321c0c82ad4d31 100644
--- a/llvm/test/MC/WebAssembly/extern-functype-intrinsic.ll
+++ b/llvm/test/MC/WebAssembly/extern-functype-intrinsic.ll
@@ -1,5 +1,5 @@
-; RUN: llc %s -o - | FileCheck %s
-; RUN: llc %s -o - | llvm-mc -triple=wasm32-unknown-unknown | FileCheck %s
+; RUN: llc %s -mattr=-bulk-memory -o - | FileCheck %s
+; RUN: llc %s -mattr=-bulk-memory -o - | llvm-mc -triple=wasm32-unknown-unknown | FileCheck %s
 
 ; ModuleID = 'test.c'
 source_filename = "test.c"
diff --git a/llvm/test/MC/WebAssembly/libcall.ll b/llvm/test/MC/WebAssembly/libcall.ll
index 8b81f150da892aa..ffd32abe2345bc7 100644
--- a/llvm/test/MC/WebAssembly/libcall.ll
+++ b/llvm/test/MC/WebAssembly/libcall.ll
@@ -1,4 +1,4 @@
-; RUN: llc -filetype=obj %s -o - | obj2yaml | FileCheck %s
+; RUN: llc -filetype=obj -mattr=-bulk-memory %s -o - | obj2yaml | FileCheck %s
 
 target triple = "wasm32-unknown-unknown"