diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index c6302cc8d..df5cc8c4f 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -14,9 +14,13 @@ material = "1.12.0" jetbrainsKotlinJvm = "1.7.21" oboe = "1.8.1" +activityCompose = "1.9.0" +composeBom = "2023.08.00" +coreKtx = "1.13.1" curl = "7.79.1-beta-1" googletest = "1.11.0-beta-1" jsoncpp = "1.9.5-beta-1" +lifecycleRuntimeKtx = "2.7.0" openssl = "1.1.1q-beta-1" [libraries] @@ -40,6 +44,17 @@ openssl = { group = "com.android.ndk.thirdparty", name = "openssl", version.ref # build-logic dependencies android-gradlePlugin = { group = "com.android.tools.build", name = "gradle", version.ref = "agp" } +androidx-activity-compose = { group = "androidx.activity", name = "activity-compose", version.ref = "activityCompose" } +androidx-compose-bom = { group = "androidx.compose", name = "compose-bom", version.ref = "composeBom" } +androidx-core-ktx = { group = "androidx.core", name = "core-ktx", version.ref = "coreKtx" } +androidx-lifecycle-runtime-ktx = { group = "androidx.lifecycle", name = "lifecycle-runtime-ktx", version.ref = "lifecycleRuntimeKtx" } +androidx-material3 = { group = "androidx.compose.material3", name = "material3" } +androidx-ui = { group = "androidx.compose.ui", name = "ui" } +androidx-ui-graphics = { group = "androidx.compose.ui", name = "ui-graphics" } +androidx-ui-test-junit4 = { group = "androidx.compose.ui", name = "ui-test-junit4" } +androidx-ui-test-manifest = { group = "androidx.compose.ui", name = "ui-test-manifest" } +androidx-ui-tooling = { group = "androidx.compose.ui", name = "ui-tooling" } +androidx-ui-tooling-preview = { group = "androidx.compose.ui", name = "ui-tooling-preview" } [plugins] android-application = { id = "com.android.application", version.ref = "agp" } diff --git a/settings.gradle b/settings.gradle index f0d4b009a..11c73fa0e 100644 --- a/settings.gradle +++ b/settings.gradle @@ -63,3 +63,4 @@ include(":teapots:image-decoder") include(":teapots:more-teapots") include(":teapots:textured-teapot") include(":unit-test:app") +include(":vectorization") diff --git a/vectorization/.gitignore b/vectorization/.gitignore new file mode 100644 index 000000000..796b96d1c --- /dev/null +++ b/vectorization/.gitignore @@ -0,0 +1 @@ +/build diff --git a/vectorization/README.md b/vectorization/README.md new file mode 100644 index 000000000..285757f1c --- /dev/null +++ b/vectorization/README.md @@ -0,0 +1,182 @@ +# Vectorization + +This sample shows how to implement matrix multiplication using various +vectorization approaches. + +Note: You should not reuse this matrix library in your application. It was not +written to be useful beyond the scope of this demo. If you're looking for a +matrix library, you probably want [GLM] for graphics applications, or a linear +algebra library such as BLAS for compute applications. + +The sample app will benchmark each implementation and display the average run +time over 1,000,000 runs. The goal of this sample is to illustrate the trade- +offs of each implementation in terms of flexibility, readability, and +performance. + +Given the relatively small problem size used here (4x4 matrices and vec4s), the +best performing implementations in this sample are the ones that can best +improve over the naive implementation without large set up costs. You should not +take the results of this sample as authoritative: if performance is important to +you, you **must** benchmark your code for workloads realistic for your app. 
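+
+For reference, the measurement is a plain wall-clock average. The sketch below
+shows the idea (the names are illustrative only; the real harness in
+`benchmark.cpp` also takes steps to keep Clang from optimizing the loop itself
+inconsistently across implementations):
+
+```cpp
+#include <chrono>
+#include <cstdint>
+
+// Run `multiply` `runs` times and return the average wall-clock time per call.
+template <typename F>
+std::chrono::nanoseconds AverageRunTime(F multiply, std::uint32_t runs) {
+  const auto begin = std::chrono::steady_clock::now();
+  for (std::uint32_t i = 0; i < runs; i++) {
+    multiply();  // The real harness stores the result so it can't be elided.
+  }
+  const auto end = std::chrono::steady_clock::now();
+  return (end - begin) / runs;
+}
+```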
+
+If you're not familiar with it, [Godbolt] is an invaluable tool for examining
+compiler optimizer behavior. You could also use `$NDK_BIN/clang -S -O2 -o -`
+from the command line for a local workflow.
+
+## Implementations
+
+This sample contains the following implementations. The trade-offs of each are
+discussed briefly, but as mentioned above, you should not rely on the
+performance results measured here to make a decision for your app.
+
+### Auto-vectorization
+
+See [auto_vectorization.h] for the implementation.
+
+This implementation is written in generic C++ and contains no explicit SIMD. The
+only vectorization that will be performed is Clang's auto-vectorization. This
+makes for the most portable and readable code, but at the cost of performance.
+
+See https://llvm.org/docs/Vectorizers.html for Clang's docs about
+auto-vectorization.
+
+### std::simd
+
+This isn't actually available yet. It's an experimental part of the C++ standard
+and is in development in libc++, but NDK r27 happened to catch it right in the
+middle of a rewrite, so it's not currently usable.
+
+See https://en.cppreference.com/w/cpp/experimental/simd/simd.
+
+### Clang vectors
+
+See [clang_vector.h] for the implementation.
+
+This implementation uses Clang's generic vector types. This code is mostly as
+portable as the auto-vectorization implementation, with the only caveat being
+that it is limited by the width of the vector registers for the target hardware.
+To deal with problems that don't fit in the target's vector registers, you would
+need to either alter the algorithm to tile the operations, or use Scalable
+Vector Extensions (AKA [SVE]).
+
+However, the benefit of the portability trade-off is that this does outperform
+the auto-vectorization implementation.
+
+See
+https://clang.llvm.org/docs/LanguageExtensions.html#vectors-and-extended-vectors.
+
+### Clang matrices
+
+See [matrix.h] for the implementation. This is the default implementation for
+`Matrix::operator*`, so unlike the others, that file contains the rest of the
+`Matrix` class as well.
+
+This implementation uses Clang's built-in matrix type. This is an experimental
+feature in Clang, but it has the simplest code (because some kind Clang person
+wrote the hard part) and performs the best by a wide margin. There are
+implementation-defined limits on the size of the matrix, but within those limits
+the code is as portable as the auto-vectorization implementation. The docs say
+the feature is still under development and subject to change, so be wary of
+using this in production, and definitely don't use these types as part of your
+ABI.
+
+See https://clang.llvm.org/docs/LanguageExtensions.html#matrix-types for more
+details.
+
+### OpenMP SIMD
+
+See [omp_simd.h] for the implementation.
+
+This implementation uses OpenMP's SIMD directive. For some reason this
+under-performs even the auto-vectorized implementation. There are a lot of
+additional specifiers that can be added to the simd directive that might
+improve this implementation. Patches welcome :)
+
+See https://www.openmp.org/spec-html/5.0/openmpsu42.html for more information.
+
+## Alternatives not shown here
+
+There are other approaches that could be used that aren't shown here.
+
+### Neon
+
+A Neon implementation would be nearly identical to the one in [clang_vector.h].
+The only difference is how the vector type is specified.
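+For the four-lane `float` case used in this sample, that difference amounts to
+a single type declaration. The snippet below is a sketch (it only compiles when
+targeting Arm, and the names are illustrative, not part of the sample):
+
+```cpp
+#include <arm_neon.h>
+
+// The arch-generic Clang vector type, as used in clang_vector.h.
+typedef float Vec4f __attribute__((__vector_size__(4 * sizeof(float))));
+
+// Element-wise multiply spelled with the Neon type and with the generic type.
+// Clang accepts the same operators for both and does the instruction selection
+// itself, so the generated code should be the same.
+float32x4_t MultiplyNeon(float32x4_t a, float32x4_t b) { return a * b; }
+Vec4f MultiplyGeneric(Vec4f a, Vec4f b) { return a * b; }
+```
+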
+A lot of older Neon sample code looks substantially different because it uses
+the Neon intrinsics defined in `arm_neon.h`, but if you look at how the
+intrinsics in that file are defined, all they actually do (on a little-endian
+system, and Android does not support big-endian, so we can ignore that caveat)
+is use the `*` operator and leave the correct instruction selection up to Clang.
+
+In other words, you should probably never use the Neon-specific approach. The
+generated code should be identical to code written with Clang's arch-generic
+vectors. If you rewrite the [clang_vector.h] implementation to use Neon's
+`float32x4_t` instead of the Clang vector, the results are identical.
+
+### SVE
+
+[SVE] scales SIMD to arbitrarily sized vectors. Its C extensions are more
+verbose than is needed for a fixed vector size like the one used here, but they
+handle windowing the data to fit the hardware vector size for you. For a problem
+like the small matrix multiply in this sample, SVE is overkill, but when you
+need portability across the various vector widths of the Arm CPUs that support
+it, SVE can reduce the difficulty of writing SIMD code.
+
+### GPU acceleration
+
+GPU acceleration is a better fit for large data sets. That approach isn't shown
+here because it's substantially more code to set up the GPU for this
+computation, and our data size is so small that the cost of GPU initialization
+and streaming the data to the GPU is likely to make it a net loss. If you want
+to learn more about GPU compute, see https://vulkan-tutorial.com/Compute_Shader,
+https://www.khronos.org/opengl/wiki/Compute_Shader, and
+https://www.khronos.org/opencl/ (while OpenCL is not guaranteed to be available
+for all Android devices, it is a very common OEM extension).
+
+## Function multi-versioning
+
+There are two compiler attributes that can be helpful for targeting specific
+hardware features when optimizing hot code paths: [target] and [target_clones],
+both of which may be referred to as "function multiversioning" or "FMV". Each
+solves a slightly different but related problem.
+
+The `target` attribute makes it easier to write multiple implementations of a
+function that should be selected based on the runtime hardware. If benchmarking
+shows that one implementation performs better on armv8.2 and a different
+implementation performs better on armv8 (see the docs for more details on
+specific targeting capabilities), you can write the function twice, annotate
+each with the appropriate `__attribute__((target(...)))` tag, and the compiler
+will auto-generate the code to select the best-fitting implementation at runtime
+(it uses ifuncs under the hood, so the branch is resolved once at library load
+time rather than for each call).
+
+The `target_clones` attribute, on the other hand, allows you to write the
+function once but instruct the compiler to generate multiple variants of the
+function, one for each requested target. This means that, for example, if you've
+requested both `default` and `armv8.2`, the compiler will generate a default
+implementation compatible with all Android devices, as well as a second
+implementation that uses instructions available in armv8.2 but not available in
+the base armv8 ABI. As with the `target` attribute, Clang will automatically
+select the best-fitting implementation at runtime. Using `target_clones` is the
+same as using `target` with identical function bodies.
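+
+As a rough sketch of the syntax, the function below is not part of this sample,
+and the `dotprod` feature string is only an example; check the [target_clones]
+docs for the feature names your Clang version accepts for your target:
+
+```cpp
+#include <cstddef>
+#include <cstdint>
+
+// One body, two generated variants: Clang emits a baseline clone and a clone
+// that may use the Armv8.2 dot product instructions, and resolves the call to
+// the best fit once, at library load time.
+__attribute__((target_clones("dotprod", "default")))
+std::int32_t SumOfProducts(const std::int8_t* a, const std::int8_t* b,
+                           std::size_t n) {
+  std::int32_t sum = 0;
+  for (std::size_t i = 0; i < n; i++) sum += a[i] * b[i];
+  return sum;
+}
+```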
+
+Note that with both of these approaches, testing becomes more difficult because
+you will need a greater variety of hardware to test each code path. If you're
+already doing fine-grained targeting like this, that isn't a new problem, and
+using one or both of these attributes may help you simplify your implementation.
+
+Neither of these techniques is shown in this sample. We don't have access to
+enough hardware to benchmark or verify multiple implementations, and (as of NDK
+r27, at least), Clang doesn't support `target_clones` on templated functions.
+
+[auto_vectorization.h]: src/main/cpp/auto_vectorization.h
+[clang_vector.h]: src/main/cpp/clang_vector.h
+[GLM]: https://github.com/g-truc/glm
+[Godbolt]: https://godbolt.org/
+[matrix.h]: src/main/cpp/matrix.h
+[neon.h]: src/main/cpp/neon.h
+[omp_simd.h]: src/main/cpp/omp_simd.h
+[SVE]: https://developer.arm.com/Architectures/Scalable%20Vector%20Extensions
+[target_clones]: https://clang.llvm.org/docs/AttributeReference.html#target-clones
+[target]: https://clang.llvm.org/docs/AttributeReference.html#target
diff --git a/vectorization/build.gradle.kts b/vectorization/build.gradle.kts
new file mode 100644
index 000000000..f6e32824b
--- /dev/null
+++ b/vectorization/build.gradle.kts
@@ -0,0 +1,45 @@
+plugins {
+    id("ndksamples.android.application")
+    id("ndksamples.android.kotlin")
+}
+
+android {
+    namespace = "com.android.ndk.samples.vectorization"
+
+    defaultConfig {
+        applicationId = "com.android.ndk.samples.vectorization"
+
+        vectorDrawables {
+            useSupportLibrary = true
+        }
+    }
+
+    externalNativeBuild {
+        cmake {
+            path = file("src/main/cpp/CMakeLists.txt")
+        }
+    }
+
+    buildFeatures {
+        compose = true
+        prefab = true
+    }
+
+    composeOptions {
+        kotlinCompilerExtensionVersion = "1.5.1"
+    }
+}
+
+dependencies {
+    implementation(project(":base"))
+    implementation(libs.androidx.core.ktx)
+    implementation(libs.androidx.lifecycle.runtime.ktx)
+    implementation(libs.androidx.activity.compose)
+    implementation(platform(libs.androidx.compose.bom))
+    implementation(libs.androidx.ui)
+    implementation(libs.androidx.ui.graphics)
+    implementation(libs.androidx.ui.tooling.preview)
+    implementation(libs.androidx.material3)
+    debugImplementation(libs.androidx.ui.tooling)
+    debugImplementation(libs.androidx.ui.test.manifest)
+}
diff --git a/vectorization/src/main/AndroidManifest.xml b/vectorization/src/main/AndroidManifest.xml
new file mode 100644
index 000000000..f058ea4dc
--- /dev/null
+++ b/vectorization/src/main/AndroidManifest.xml
@@ -0,0 +1,24 @@
+ + + + + + + + + + + + + +
diff --git a/vectorization/src/main/cpp/CMakeLists.txt b/vectorization/src/main/cpp/CMakeLists.txt
new file mode 100644
index 000000000..a51799c64
--- /dev/null
+++ b/vectorization/src/main/cpp/CMakeLists.txt
@@ -0,0 +1,32 @@
+cmake_minimum_required(VERSION 3.22.1)
+project(Vectorization LANGUAGES CXX)
+
+add_compile_options(-Wall -Wextra -Werror)
+
+find_package(base REQUIRED CONFIG)
+
+add_library(app
+  SHARED
+  benchmark.cpp
+  jni.cpp
+)
+
+target_compile_features(app PUBLIC cxx_std_23)
+target_compile_options(app PUBLIC -fenable-matrix -fopenmp)
+
+target_link_libraries(app
+  PRIVATE
+  base::base
+  log
+)
+
+target_link_options(app
+  PRIVATE
+  -flto
+  -Wl,--version-script,${CMAKE_SOURCE_DIR}/libapp.map.txt
+)
+
+set_target_properties(app
+  PROPERTIES
+  LINK_DEPENDS ${CMAKE_SOURCE_DIR}/libapp.map.txt
+)
diff --git a/vectorization/src/main/cpp/auto_vectorization.h b/vectorization/src/main/cpp/auto_vectorization.h
new file mode 100644
index 000000000..fbeeae5bc
--- /dev/null
+++ b/vectorization/src/main/cpp/auto_vectorization.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "matrix.h" + +namespace samples::vectorization { + +/** + * Multiplies two compatible matrices and returns the result. + * + * @tparam T The type of each matrix cell. + * @tparam M The number of rows in the left operand and the result. + * @tparam N The number of columns in the left operand, and the rows in the + * right operand. + * @tparam P The number of columns in the right operand and the result. + * @param lhs The left operand. + * @param rhs The right operand. + * @return The result of lhs * rhs. + */ +template +Matrix MultiplyWithAutoVectorization(const Matrix& lhs, + const Matrix& rhs) { + // This may look like an unfair benchmark because this implementation uses the + // less vector friendly one than the others, however, using the vector + // friendly algorithm here actually made performance worse. + // + // This is a good illustration of why it's important to benchmark your own + // code and not rely on what someone else tells you about which works best: it + // depends. + // + // It's probably also worth mentioning that if what you need is *consistent* + // performance across compiler versions, the only real choice you have is + // writing assembly. Even the instruction intrinsics (at least for Neon) are + // subject to the compiler's instruction selection. That will be overkill for + // most users, since it's substantially more difficult to write and maintain, + // but is how you'll see some code bases deal with this (codecs in particular + // are willing to make that trade-off). + Matrix result; + for (auto i = 0U; i < M; i++) { + for (auto j = 0U; j < P; j++) { + T sum = {}; + for (auto k = 0U; k < N; k++) { + sum += lhs.get(i, k) * rhs[k, j]; + } + result[i, j] = sum; + } + } + return result; +} + +} // namespace samples::vectorization diff --git a/vectorization/src/main/cpp/benchmark.cpp b/vectorization/src/main/cpp/benchmark.cpp new file mode 100644 index 000000000..49944ab83 --- /dev/null +++ b/vectorization/src/main/cpp/benchmark.cpp @@ -0,0 +1,117 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "benchmark.h" + +#include +#include + +#include +#include + +#include "auto_vectorization.h" +#include "clang_vector.h" +#include "matrix.h" +#include "omp_simd.h" + +constexpr uint32_t kNumRuns = 1'000'000; + +namespace samples::vectorization { + +Vec4 result; + +/** + * Benchmarks a given matrix multiply operation. + * + * The multiply is given here as a callback to try to keep Clang from folding, + * unrolling, or inlining inconsistently across each benchmarked implementation. + * We want Clang to do as much as possible to optimize *within* the multiply + * function itself, but inconsistent optimization of the benchmark code itself + * could skew results. + * + * @param position A position vector. + * @param translation A translation vector. + * @param func The multiplication function to use. + * @return The average duration per call in nanoseconds. + */ +[[nodiscard, clang::noinline]] std::chrono::nanoseconds Benchmark( + Vec4& position, Mat4& translation, + std::function func) { + // TODO: Move to a unit test. + auto test = func(position, translation); + auto expected = Vec4{{20, 10, 10, 1}}; + CHECK_EQ(test, expected); + + auto begin = std::chrono::steady_clock::now(); + + // This is another attempt to prevent Clang from optimizing the benchmark + // harness inconsistently. +#pragma clang loop unroll(disable) + for (auto i = 0U; i < kNumRuns; i++) { + result = func(position, translation); + } + + auto end = std::chrono::steady_clock::now(); + + return (end - begin) / kNumRuns; +} + +[[nodiscard]] std::expected +BenchmarkMatrixMultiplication(Backend backend) { + Vec4 position{{10.0f, 10.0f, 10.0f, 1.0f}}; + Mat4 translation{{ + {1.0f, 0.0f, 0.0f, 10.0f}, + {0.0f, 1.0f, 0.0f, 0.0f}, + {0.0f, 0.0f, 1.0f, 0.0f}, + {0.0f, 0.0f, 0.0f, 1.0f}, + }}; + + switch (backend) { + case Backend::kAutoVectorization: + LOG(INFO) << "Benchmarking auto-vectorization"; + return Benchmark(position, translation, [](Vec4 p, Mat4 t) { + return MultiplyWithAutoVectorization(t, p); + }); + case Backend::kCxxSimd: +#if __NDK_MAJOR__ >= 28 +#error check if std::simd works yet +#endif + // The libc++ in NDK r27 has only a skeleton implementation of std::simd. + // Some things we can do without, but it doesn't actually have operator*, + // which is sort of essential :) + LOG(INFO) << "Benchmarking std::simd"; + return std::unexpected{BenchmarkError::kNotImplemented}; + case Backend::kClangVector: + LOG(INFO) << "Benchmarking Clang vectors"; + return Benchmark(position, translation, [](Vec4 p, Mat4 t) { + return MultiplyWithClangVectors(t, p); + }); + case Backend::kClangMatrix: + LOG(INFO) << "Benchmarking Clang matrices"; + return Benchmark(position, translation, [](Vec4 p, Mat4 t) { + // This is the default implementation since it's the fastest. + return t * p; + }); + case Backend::kOpenMp: + LOG(INFO) << "Benchmarking OpenMP SIMD"; + return Benchmark(position, translation, + [](Vec4 p, Mat4 t) { return MultiplyWithOpenMP(t, p); }); + default: + return std::unexpected{BenchmarkError::kUnknownBackend}; + } +} + +} // namespace samples::vectorization diff --git a/vectorization/src/main/cpp/benchmark.h b/vectorization/src/main/cpp/benchmark.h new file mode 100644 index 000000000..ca134d163 --- /dev/null +++ b/vectorization/src/main/cpp/benchmark.h @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace samples::vectorization { + +/** + * The available backends for matrix multiplication shown in this sample. + */ +enum class Backend : uint8_t { + /// Auto-vectorization only. + kAutoVectorization = 0, + + /// C++ std::simd. + kCxxSimd = 1, + + /// Clang's arch-generic vector types. + kClangVector = 2, + + /// Clang's built-in matrix type. + kClangMatrix = 3, + + /// OpenMP SIMD. + kOpenMp = 4, +}; + +/// Errors returned by BenchmarkMatrixMultiplication. +enum class BenchmarkError : int8_t { + /// Indicates that the requested backend has not yet been implemented. + kNotImplemented = -1, + /// Indicates that the requested backend isn't supported for the device. + kNotSupported = -2, + /// Indicates that an unknown backend was requested. + kUnknownBackend = -3, +}; + +/** + * Benchmarks the given matrix multiply backend. + * + * The chosen backend will run a predetermined number of times and return the + * average execution time. + * + * @param backend The backend to benchmark. + * @return The average execution time, or an error code. + */ +[[nodiscard]] std::expected +BenchmarkMatrixMultiplication(Backend backend); + +} // namespace samples::vectorization diff --git a/vectorization/src/main/cpp/clang_vector.h b/vectorization/src/main/cpp/clang_vector.h new file mode 100644 index 000000000..660921aa1 --- /dev/null +++ b/vectorization/src/main/cpp/clang_vector.h @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "matrix.h" + +namespace samples::vectorization { + +/** + * Multiplies two compatible matrices and returns the result. + * + * @tparam T The type of each matrix cell. + * @tparam M The number of rows in the left operand and the result. + * @tparam N The number of columns in the left operand, and the rows in the + * right operand. + * @tparam P The number of columns in the right operand and the result. + * @param lhs The left operand. + * @param rhs The right operand. + * @return The result of lhs * rhs. + */ +template +Matrix MultiplyWithClangVectors(const Matrix& lhs, + const Matrix& rhs) { + // The rearrangement of the matrix multiplication algorithm here allows us to + // avoid reducing vectors to scalar stores. Instead we compute the partial + // result for each (result) column as a vector, accumulate partial results + // there, and then store the resulting row with a single vector store. 
+ // + // This implementation only works if your columns (or rows, if you restructure + // this and the data to work in row-major order) fit within your vector + // registers. If you have larger data, you can tile the algorithm to fit the + // vector size. + // + // See https://mbernste.github.io/posts/matrix_vector_mult/ for a more + // thorough explanation. + typedef T Vec __attribute__((__vector_size__(M * sizeof(T)))); + Matrix result; + for (auto result_column_index = 0U; result_column_index < P; + result_column_index++) { + Vec result_column = {}; + for (auto lhs_column_index = 0U; lhs_column_index < N; lhs_column_index++) { + auto c = lhs.column(lhs_column_index); + Vec lhs_column = *reinterpret_cast(c.data()); + result_column += lhs_column * rhs[lhs_column_index, result_column_index]; + } + *reinterpret_cast(result.column(result_column_index).data()) = + result_column; + } + return result; +} + +} // namespace samples::vectorization diff --git a/vectorization/src/main/cpp/jni.cpp b/vectorization/src/main/cpp/jni.cpp new file mode 100644 index 000000000..9cceb65f3 --- /dev/null +++ b/vectorization/src/main/cpp/jni.cpp @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include "benchmark.h" + +using samples::vectorization::Backend; +using samples::vectorization::BenchmarkMatrixMultiplication; + +static jlong BenchmarkMatrixMultiplyJni(JNIEnv* _Nonnull /* env */, + jobject _Nonnull /* this */, + jint backend) { + auto result = BenchmarkMatrixMultiplication(static_cast(backend)); + if (result.has_value()) { + return result->count(); + } + return static_cast(result.error()); +} + +JNIEXPORT jint JNI_OnLoad(JavaVM* _Nonnull vm, + void* _Nullable reserved __unused) { + JNIEnv* env; + if (vm->GetEnv(reinterpret_cast(&env), JNI_VERSION_1_6) != JNI_OK) { + return JNI_ERR; + } + + jclass c = env->FindClass("com/android/ndk/samples/vectorization/AppJni"); + if (c == nullptr) return JNI_ERR; + + static const JNINativeMethod methods[] = { + {"benchmarkMatrixMultiply", "(I)J", + reinterpret_cast(BenchmarkMatrixMultiplyJni)}, + }; + int rc = env->RegisterNatives(c, methods, arraysize(methods)); + if (rc != JNI_OK) return rc; + + return JNI_VERSION_1_6; +} \ No newline at end of file diff --git a/vectorization/src/main/cpp/libapp.map.txt b/vectorization/src/main/cpp/libapp.map.txt new file mode 100644 index 000000000..c14a3109a --- /dev/null +++ b/vectorization/src/main/cpp/libapp.map.txt @@ -0,0 +1,6 @@ +LIBAPP { + global: + JNI_OnLoad; + local: + *; +}; diff --git a/vectorization/src/main/cpp/matrix.h b/vectorization/src/main/cpp/matrix.h new file mode 100644 index 000000000..14ab5bdea --- /dev/null +++ b/vectorization/src/main/cpp/matrix.h @@ -0,0 +1,149 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include +#include +#include + +namespace samples::vectorization { + +template +class Matrix { + public: + Matrix() = default; + Matrix(T (&&cells)[Rows][Columns]) { + for (size_t row = 0; row < Rows; row++) { + for (size_t column = 0; column < Columns; column++) { + (*this)[row, column] = cells[row][column]; + } + } + } + + // Convenience constructor for vectors so callers don't need to use + // `{{x}, {y}, {z}}`. + Matrix(const std::array cells) + requires(Columns == 1) + : cells_(cells) {} + + [[nodiscard, clang::always_inline]] constexpr const T* _Nonnull data() const { + return cells_.data(); + } + + [[nodiscard, clang::always_inline]] constexpr T* _Nonnull data() { + return cells_.data(); + } + + [[nodiscard, clang::always_inline]] constexpr T& get(size_t row, + size_t column) { + return cells_[column * Rows + row]; + } + + [[nodiscard, clang::always_inline]] constexpr const T& get( + size_t row, size_t column) const { + return cells_[column * Rows + row]; + } + + [[nodiscard, clang::always_inline]] constexpr T& operator[](size_t row, + size_t column) { + return get(row, column); + } + + [[nodiscard, clang::always_inline]] constexpr const T& operator[]( + size_t row, size_t column) const { + return get(row, column); + } + + [[nodiscard, clang::always_inline]] constexpr const std::span column( + size_t column) const { + return std::span{&get(0, column), Rows}; + } + + [[nodiscard, clang::always_inline]] constexpr std::span column( + size_t column) { + return std::span{&get(0, column), Rows}; + } + + bool operator==(const Matrix& rhs) const { + return cells_ == rhs.cells_; + } + + friend std::ostream& operator<<(std::ostream& stream, + const Matrix& m) { + stream << "{" << std::endl; + for (size_t row = 0; row < Rows; row++) { + stream << "\t{"; + for (size_t column = 0; column < Columns; column++) { + stream << m[row, column]; + if (column != Columns - 1) { + stream << ", "; + } + } + stream << "}" << std::endl; + } + stream << "}"; + return stream; + } + + /** + * Multiplies two compatible matrices and returns the result. + * + * @tparam T The type of each matrix cell. + * @tparam M The number of rows in the left operand and the result. + * @tparam N The number of columns in the left operand, and the rows in the + * right operand. + * @tparam P The number of columns in the right operand and the result. + * @param lhs The left operand. + * @param rhs The right operand. + * @return The result of lhs * rhs. + */ + template + Matrix operator*( + const Matrix& rhs) const + requires(OtherRows == Columns) + { + auto m_lhs = + __builtin_matrix_column_major_load(data(), Rows, Columns, Rows); + auto m_rhs = __builtin_matrix_column_major_load(rhs.data(), OtherRows, + OtherColumns, OtherRows); + auto m_result = m_lhs * m_rhs; + + Matrix result; + __builtin_matrix_column_major_store(m_result, result.data(), Rows); + return result; + } + + private: + std::array cells_ = {}; +}; + +// Enables automatic deduction of definitions like `Matrix m{{1, 0}, {0, 1}}` +// without needing to specify `Matrix<2, 2, int>`. 
+template +Matrix(T (&&)[Rows][Columns]) -> Matrix; + +#if __NDK_MAJOR__ >= 28 +#error "TODO: `template ` each of these" +#endif +using Mat4 = Matrix<4, 4>; +using Vec4 = Matrix<4, 1>; + +} // namespace samples::vectorization diff --git a/vectorization/src/main/cpp/omp_simd.h b/vectorization/src/main/cpp/omp_simd.h new file mode 100644 index 000000000..343a71b3e --- /dev/null +++ b/vectorization/src/main/cpp/omp_simd.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2024 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include "matrix.h" + +namespace samples::vectorization { + +/** + * Multiplies two compatible matrices and returns the result. + * + * @tparam T The type of each matrix cell. + * @tparam M The number of rows in the left operand and the result. + * @tparam N The number of columns in the left operand, and the rows in the + * right operand. + * @tparam P The number of columns in the right operand and the result. + * @param lhs The left operand. + * @param rhs The right operand. + * @return The result of lhs * rhs. + */ +template +Matrix MultiplyWithOpenMP(const Matrix& lhs, + const Matrix& rhs) { + Matrix result; +#pragma omp simd + for (auto result_column_index = 0U; result_column_index < P; + result_column_index++) { + for (auto lhs_column_index = 0U; lhs_column_index < N; lhs_column_index++) { + auto lhs_column = lhs.column(lhs_column_index); + const T& scalar = rhs[lhs_column_index, result_column_index]; + for (auto row = 0U; row < lhs_column.size(); row++) { + result[row, result_column_index] += lhs_column[row] * scalar; + } + } + } + return result; +} + +} // namespace samples::vectorization diff --git a/vectorization/src/main/java/com/android/ndk/samples/vectorization/VectorizationActivity.kt b/vectorization/src/main/java/com/android/ndk/samples/vectorization/VectorizationActivity.kt new file mode 100644 index 000000000..56bae964e --- /dev/null +++ b/vectorization/src/main/java/com/android/ndk/samples/vectorization/VectorizationActivity.kt @@ -0,0 +1,132 @@ +package com.android.ndk.samples.vectorization + +import android.os.Bundle +import androidx.activity.ComponentActivity +import androidx.activity.compose.setContent +import androidx.activity.enableEdgeToEdge +import androidx.compose.foundation.layout.Column +import androidx.compose.foundation.layout.Row +import androidx.compose.foundation.layout.Spacer +import androidx.compose.foundation.layout.fillMaxSize +import androidx.compose.foundation.layout.fillMaxWidth +import androidx.compose.foundation.layout.padding +import androidx.compose.material3.Scaffold +import androidx.compose.material3.Text +import androidx.compose.runtime.Composable +import androidx.compose.runtime.LaunchedEffect +import androidx.compose.runtime.mutableStateMapOf +import androidx.compose.runtime.remember +import androidx.compose.runtime.snapshots.SnapshotStateMap +import androidx.compose.ui.Modifier +import androidx.compose.ui.tooling.preview.Preview +import 
com.android.ndk.samples.vectorization.ui.theme.NDKSamplesTheme
+import kotlinx.coroutines.Dispatchers
+import kotlinx.coroutines.withContext
+import kotlin.time.Duration
+import kotlin.time.Duration.Companion.nanoseconds
+
+// Keep in sync with the definition in benchmark.h.
+enum class Backend(val id: Int, val label: String) {
+    AUTO_VECTORIZATION(0, "Auto-vectorization"),
+    CXX_SIMD(1, "std::simd"),
+    CLANG_VECTORS(2, "Clang Vectors"),
+    CLANG_MATRICES(3, "Clang Matrices"),
+    OPEN_MP(4, "OpenMP"),
+}
+
+class VectorizationActivity : ComponentActivity() {
+    override fun onCreate(savedInstanceState: Bundle?) {
+        super.onCreate(savedInstanceState)
+        enableEdgeToEdge()
+        setContent {
+            NDKSamplesTheme {
+                Scaffold(modifier = Modifier.fillMaxSize()) { innerPadding ->
+                    Column(
+                        modifier = Modifier
+                            .padding(innerPadding)
+                            .fillMaxWidth()
+                    ) {
+                        Text(text = "Average times for 1,000,000 runs")
+                        BenchmarkTable()
+                    }
+                }
+            }
+        }
+    }
+
+    init {
+        System.loadLibrary("app")
+    }
+}
+
+sealed interface BenchmarkResult {
+    class Success(private val duration: Duration) : BenchmarkResult {
+        override fun toString(): String = duration.toString()
+    }
+
+    class Failure(private val message: String) : BenchmarkResult {
+        override fun toString(): String = message
+    }
+}
+
+object AppJni {
+    fun benchmarkMatrixMultiply(backend: Backend): BenchmarkResult {
+        val result = benchmarkMatrixMultiply(backend.id)
+        if (result >= 0) {
+            return BenchmarkResult.Success(result.nanoseconds)
+        }
+
+        return BenchmarkResult.Failure(
+            when (result) {
+                -1L -> "Not implemented"
+                -2L -> "Not supported"
+                -3L -> "Invalid backend"
+                else -> "Unknown error"
+            }
+        )
+    }
+
+    private external fun benchmarkMatrixMultiply(backend: Int): Long
+}
+
+@Composable
+fun BenchmarkTable(modifier: Modifier = Modifier) {
+    val status: SnapshotStateMap<Backend, String> = remember {
+        mutableStateMapOf(*Backend.entries.map { it to "Not started" }
+            .toTypedArray())
+    }
+
+    LaunchedEffect(true) {
+        withContext(Dispatchers.Default) {
+            Backend.entries.forEach {
+                status[it] = "Running..."
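+                // The native benchmark blocks this worker thread for the full run count;
+                // it runs on Dispatchers.Default (see withContext above) so the UI thread
+                // stays free, and each write to the snapshot state map recomposes the table.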
+ status[it] = AppJni.benchmarkMatrixMultiply(it).toString() + } + } + } + + Column( + modifier = modifier + ) { + status.toSortedMap().forEach { (backend, status) -> + BenchmarkResult(name = backend.label, duration = status) + } + } +} + +@Composable +fun BenchmarkResult(name: String, duration: String) { + Row(modifier = Modifier.fillMaxWidth()) { + Text(text = name) + Spacer(modifier = Modifier.weight(1f)) + Text(text = duration) + } +} + +@Preview(showBackground = true) +@Composable +fun BenchmarkResultPreview() { + NDKSamplesTheme { + BenchmarkResult("Auto-vectorization", "00m10s") + } +} \ No newline at end of file diff --git a/vectorization/src/main/java/com/android/ndk/samples/vectorization/ui/theme/Color.kt b/vectorization/src/main/java/com/android/ndk/samples/vectorization/ui/theme/Color.kt new file mode 100644 index 000000000..eede0cd09 --- /dev/null +++ b/vectorization/src/main/java/com/android/ndk/samples/vectorization/ui/theme/Color.kt @@ -0,0 +1,11 @@ +package com.android.ndk.samples.vectorization.ui.theme + +import androidx.compose.ui.graphics.Color + +val Purple80 = Color(0xFFD0BCFF) +val PurpleGrey80 = Color(0xFFCCC2DC) +val Pink80 = Color(0xFFEFB8C8) + +val Purple40 = Color(0xFF6650a4) +val PurpleGrey40 = Color(0xFF625b71) +val Pink40 = Color(0xFF7D5260) \ No newline at end of file diff --git a/vectorization/src/main/java/com/android/ndk/samples/vectorization/ui/theme/Theme.kt b/vectorization/src/main/java/com/android/ndk/samples/vectorization/ui/theme/Theme.kt new file mode 100644 index 000000000..449a96f33 --- /dev/null +++ b/vectorization/src/main/java/com/android/ndk/samples/vectorization/ui/theme/Theme.kt @@ -0,0 +1,58 @@ +package com.android.ndk.samples.vectorization.ui.theme + +import android.app.Activity +import android.os.Build +import androidx.compose.foundation.isSystemInDarkTheme +import androidx.compose.material3.MaterialTheme +import androidx.compose.material3.darkColorScheme +import androidx.compose.material3.dynamicDarkColorScheme +import androidx.compose.material3.dynamicLightColorScheme +import androidx.compose.material3.lightColorScheme +import androidx.compose.runtime.Composable +import androidx.compose.ui.platform.LocalContext + +private val DarkColorScheme = darkColorScheme( + primary = Purple80, + secondary = PurpleGrey80, + tertiary = Pink80 +) + +private val LightColorScheme = lightColorScheme( + primary = Purple40, + secondary = PurpleGrey40, + tertiary = Pink40 + + /* Other default colors to override + background = Color(0xFFFFFBFE), + surface = Color(0xFFFFFBFE), + onPrimary = Color.White, + onSecondary = Color.White, + onTertiary = Color.White, + onBackground = Color(0xFF1C1B1F), + onSurface = Color(0xFF1C1B1F), + */ +) + +@Composable +fun NDKSamplesTheme( + darkTheme: Boolean = isSystemInDarkTheme(), + // Dynamic color is available on Android 12+ + dynamicColor: Boolean = true, + content: @Composable () -> Unit +) { + val colorScheme = when { + dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> { + val context = LocalContext.current + if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context) + } + + darkTheme -> DarkColorScheme + else -> LightColorScheme + } + + MaterialTheme( + colorScheme = colorScheme, + typography = Typography, + content = content + ) +} \ No newline at end of file diff --git a/vectorization/src/main/java/com/android/ndk/samples/vectorization/ui/theme/Type.kt b/vectorization/src/main/java/com/android/ndk/samples/vectorization/ui/theme/Type.kt new file mode 100644 index 
000000000..1eec0ccca --- /dev/null +++ b/vectorization/src/main/java/com/android/ndk/samples/vectorization/ui/theme/Type.kt @@ -0,0 +1,34 @@ +package com.android.ndk.samples.vectorization.ui.theme + +import androidx.compose.material3.Typography +import androidx.compose.ui.text.TextStyle +import androidx.compose.ui.text.font.FontFamily +import androidx.compose.ui.text.font.FontWeight +import androidx.compose.ui.unit.sp + +// Set of Material typography styles to start with +val Typography = Typography( + bodyLarge = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Normal, + fontSize = 24.sp, + lineHeight = 24.sp, + letterSpacing = 0.5.sp + ) + /* Other default text styles to override + titleLarge = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Normal, + fontSize = 22.sp, + lineHeight = 28.sp, + letterSpacing = 0.sp + ), + labelSmall = TextStyle( + fontFamily = FontFamily.Default, + fontWeight = FontWeight.Medium, + fontSize = 11.sp, + lineHeight = 16.sp, + letterSpacing = 0.5.sp + ) + */ +) \ No newline at end of file diff --git a/vectorization/src/main/res/drawable-v24/ic_launcher_foreground.xml b/vectorization/src/main/res/drawable-v24/ic_launcher_foreground.xml new file mode 100644 index 000000000..2b068d114 --- /dev/null +++ b/vectorization/src/main/res/drawable-v24/ic_launcher_foreground.xml @@ -0,0 +1,30 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/vectorization/src/main/res/drawable/ic_launcher_background.xml b/vectorization/src/main/res/drawable/ic_launcher_background.xml new file mode 100644 index 000000000..07d5da9cb --- /dev/null +++ b/vectorization/src/main/res/drawable/ic_launcher_background.xml @@ -0,0 +1,170 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/vectorization/src/main/res/mipmap-anydpi-v26/ic_launcher.xml b/vectorization/src/main/res/mipmap-anydpi-v26/ic_launcher.xml new file mode 100644 index 000000000..6f3b755bf --- /dev/null +++ b/vectorization/src/main/res/mipmap-anydpi-v26/ic_launcher.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/vectorization/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml b/vectorization/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml new file mode 100644 index 000000000..6f3b755bf --- /dev/null +++ b/vectorization/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/vectorization/src/main/res/mipmap-hdpi/ic_launcher.webp b/vectorization/src/main/res/mipmap-hdpi/ic_launcher.webp new file mode 100644 index 000000000..c209e78ec Binary files /dev/null and b/vectorization/src/main/res/mipmap-hdpi/ic_launcher.webp differ diff --git a/vectorization/src/main/res/mipmap-hdpi/ic_launcher_round.webp b/vectorization/src/main/res/mipmap-hdpi/ic_launcher_round.webp new file mode 100644 index 000000000..b2dfe3d1b Binary files /dev/null and b/vectorization/src/main/res/mipmap-hdpi/ic_launcher_round.webp differ diff --git a/vectorization/src/main/res/mipmap-mdpi/ic_launcher.webp b/vectorization/src/main/res/mipmap-mdpi/ic_launcher.webp new file mode 100644 index 000000000..4f0f1d64e Binary files /dev/null and b/vectorization/src/main/res/mipmap-mdpi/ic_launcher.webp differ diff --git a/vectorization/src/main/res/mipmap-mdpi/ic_launcher_round.webp b/vectorization/src/main/res/mipmap-mdpi/ic_launcher_round.webp new file mode 100644 index 000000000..62b611da0 Binary files /dev/null and b/vectorization/src/main/res/mipmap-mdpi/ic_launcher_round.webp 
differ diff --git a/vectorization/src/main/res/mipmap-xhdpi/ic_launcher.webp b/vectorization/src/main/res/mipmap-xhdpi/ic_launcher.webp new file mode 100644 index 000000000..948a3070f Binary files /dev/null and b/vectorization/src/main/res/mipmap-xhdpi/ic_launcher.webp differ diff --git a/vectorization/src/main/res/mipmap-xhdpi/ic_launcher_round.webp b/vectorization/src/main/res/mipmap-xhdpi/ic_launcher_round.webp new file mode 100644 index 000000000..1b9a6956b Binary files /dev/null and b/vectorization/src/main/res/mipmap-xhdpi/ic_launcher_round.webp differ diff --git a/vectorization/src/main/res/mipmap-xxhdpi/ic_launcher.webp b/vectorization/src/main/res/mipmap-xxhdpi/ic_launcher.webp new file mode 100644 index 000000000..28d4b77f9 Binary files /dev/null and b/vectorization/src/main/res/mipmap-xxhdpi/ic_launcher.webp differ diff --git a/vectorization/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp b/vectorization/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp new file mode 100644 index 000000000..9287f5083 Binary files /dev/null and b/vectorization/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp differ diff --git a/vectorization/src/main/res/mipmap-xxxhdpi/ic_launcher.webp b/vectorization/src/main/res/mipmap-xxxhdpi/ic_launcher.webp new file mode 100644 index 000000000..aa7d6427e Binary files /dev/null and b/vectorization/src/main/res/mipmap-xxxhdpi/ic_launcher.webp differ diff --git a/vectorization/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp b/vectorization/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp new file mode 100644 index 000000000..9126ae37c Binary files /dev/null and b/vectorization/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp differ diff --git a/vectorization/src/main/res/values/colors.xml b/vectorization/src/main/res/values/colors.xml new file mode 100644 index 000000000..f8c6127d3 --- /dev/null +++ b/vectorization/src/main/res/values/colors.xml @@ -0,0 +1,10 @@ + + + #FFBB86FC + #FF6200EE + #FF3700B3 + #FF03DAC5 + #FF018786 + #FF000000 + #FFFFFFFF + \ No newline at end of file diff --git a/vectorization/src/main/res/values/strings.xml b/vectorization/src/main/res/values/strings.xml new file mode 100644 index 000000000..db54169ec --- /dev/null +++ b/vectorization/src/main/res/values/strings.xml @@ -0,0 +1,3 @@ + + vectorization + \ No newline at end of file diff --git a/vectorization/src/main/res/values/themes.xml b/vectorization/src/main/res/values/themes.xml new file mode 100644 index 000000000..ccb7ed912 --- /dev/null +++ b/vectorization/src/main/res/values/themes.xml @@ -0,0 +1,5 @@ + + + +