From 4d18c318853b7401064bd44c3c9f58027710d46b Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 20 Mar 2023 12:30:21 -0700 Subject: [PATCH 1/7] initial commit for sw4 kernels --- src/apps/CMakeLists.txt | 4 +- src/apps/SW4CK_KERNEL_2.hpp | 491 +++++++++++++++++++++++++++++++++++ src/common/RAJAPerfSuite.cpp | 6 + 3 files changed, 500 insertions(+), 1 deletion(-) create mode 100644 src/apps/SW4CK_KERNEL_2.hpp diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index 6d521d1df..96cb3ed5c 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -80,7 +80,9 @@ blt_add_library( PRESSURE-Hip.cpp PRESSURE-Cuda.cpp PRESSURE-OMP.cpp - PRESSURE-OMPTarget.cpp + PRESSURE-OMPTarget.cpp + SW4CK_KERNEL_2.cpp + SW4CK_KERNEL_2-Seq.cpp VOL3D.cpp VOL3D-Seq.cpp VOL3D-Hip.cpp diff --git a/src/apps/SW4CK_KERNEL_2.hpp b/src/apps/SW4CK_KERNEL_2.hpp new file mode 100644 index 000000000..925a2cf20 --- /dev/null +++ b/src/apps/SW4CK_KERNEL_2.hpp @@ -0,0 +1,491 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details.
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// SW4CK_KERNEL_2 kernel reference implementation: +/// +/// +/// +int main() +{ + for( int k= kstart; k <= klast-2 ; k++ ) + for( int j=jfirst+2; j <= jlast-2 ; j++ ) + for( int i=ifirst+2; i <= ilast-2 ; i++ ) + { + + // 5 ops + float_sw4 ijac = strx(i) * stry(j) / jac(i, j, k); + float_sw4 istry = 1 / (stry(j)); + float_sw4 istrx = 1 / (strx(i)); + float_sw4 istrxy = istry * istrx; + + float_sw4 r1 = 0; + + // pp derivative (u) + // 53 ops, tot=58 + float_sw4 cof1 = (2 * mu(i - 2, j, k) + la(i - 2, j, k)) * + met(1, i - 2, j, k) * met(1, i - 2, j, k) * + strx(i - 2); + float_sw4 cof2 = (2 * mu(i - 1, j, k) + la(i - 1, j, k)) * + met(1, i - 1, j, k) * met(1, i - 1, j, k) * + strx(i - 1); + float_sw4 cof3 = (2 * mu(i, j, k) + la(i, j, k)) * met(1, i, j, k) * + met(1, i, j, k) * strx(i); + float_sw4 cof4 = (2 * mu(i + 1, j, k) + la(i + 1, j, k)) * + met(1, i + 1, j, k) * met(1, i + 1, j, k) * + strx(i + 1); + float_sw4 cof5 = (2 * mu(i + 2, j, k) + la(i + 2, j, k)) * + met(1, i + 2, j, k) * met(1, i + 2, j, k) * + strx(i + 2); + float_sw4 mux1 = cof2 - tf * (cof3 + cof1); + float_sw4 mux2 = cof1 + cof4 + 3 * (cof3 + cof2); + float_sw4 mux3 = cof2 + cof5 + 3 * (cof4 + cof3); + float_sw4 mux4 = cof4 - tf * (cof3 + cof5); + + r1 += i6 * + (mux1 * (u(1, i - 2, j, k) - u(1, i, j, k)) + + mux2 * (u(1, i - 1, j, k) - u(1, i, j, k)) + + mux3 * (u(1, i + 1, j, k) - u(1, i, j, k)) + + mux4 * (u(1, i + 2, j, k) - u(1, i, j, k))) * + istry; + // qq derivative (u) + // 43 ops, tot=101 + { + float_sw4 cof1 = (mu(i, j - 2, k)) * met(1, i, j - 2, k) * + met(1, i, j - 2, k) * stry(j - 2); + float_sw4 cof2 = (mu(i, j - 1, k)) * met(1, i, j - 1, k) * + met(1, i, j - 1, k) * stry(j - 1); + float_sw4 cof3 = + (mu(i, j, k)) * met(1, i, j, k) * met(1, i, j, k) * stry(j); + float_sw4 cof4 = (mu(i, j + 1, k)) * met(1, i, j + 1, k) * + met(1, i, j + 1, k) 
* stry(j + 1); + float_sw4 cof5 = (mu(i, j + 2, k)) * met(1, i, j + 2, k) * + met(1, i, j + 2, k) * stry(j + 2); + float_sw4 mux1 = cof2 - tf * (cof3 + cof1); + float_sw4 mux2 = cof1 + cof4 + 3 * (cof3 + cof2); + float_sw4 mux3 = cof2 + cof5 + 3 * (cof4 + cof3); + float_sw4 mux4 = cof4 - tf * (cof3 + cof5); + + r1 += i6 * + (mux1 * (u(1, i, j - 2, k) - u(1, i, j, k)) + + mux2 * (u(1, i, j - 1, k) - u(1, i, j, k)) + + mux3 * (u(1, i, j + 1, k) - u(1, i, j, k)) + + mux4 * (u(1, i, j + 2, k) - u(1, i, j, k))) * + istrx; + } +#ifdef MAGIC_SYNC + __syncthreads(); +#endif + // rr derivative (u) + // 5*11+14+14=83 ops, tot=184 + { + float_sw4 cof1 = + (2 * mu(i, j, k - 2) + la(i, j, k - 2)) * met(2, i, j, k - 2) * + strx(i) * met(2, i, j, k - 2) * strx(i) + + mu(i, j, k - 2) * (met(3, i, j, k - 2) * stry(j) * + met(3, i, j, k - 2) * stry(j) + + met(4, i, j, k - 2) * met(4, i, j, k - 2)); + float_sw4 cof2 = + (2 * mu(i, j, k - 1) + la(i, j, k - 1)) * met(2, i, j, k - 1) * + strx(i) * met(2, i, j, k - 1) * strx(i) + + mu(i, j, k - 1) * (met(3, i, j, k - 1) * stry(j) * + met(3, i, j, k - 1) * stry(j) + + met(4, i, j, k - 1) * met(4, i, j, k - 1)); + float_sw4 cof3 = (2 * mu(i, j, k) + la(i, j, k)) * met(2, i, j, k) * + strx(i) * met(2, i, j, k) * strx(i) + + mu(i, j, k) * (met(3, i, j, k) * stry(j) * + met(3, i, j, k) * stry(j) + + met(4, i, j, k) * met(4, i, j, k)); + float_sw4 cof4 = + (2 * mu(i, j, k + 1) + la(i, j, k + 1)) * met(2, i, j, k + 1) * + strx(i) * met(2, i, j, k + 1) * strx(i) + + mu(i, j, k + 1) * (met(3, i, j, k + 1) * stry(j) * + met(3, i, j, k + 1) * stry(j) + + met(4, i, j, k + 1) * met(4, i, j, k + 1)); + float_sw4 cof5 = + (2 * mu(i, j, k + 2) + la(i, j, k + 2)) * met(2, i, j, k + 2) * + strx(i) * met(2, i, j, k + 2) * strx(i) + + mu(i, j, k + 2) * (met(3, i, j, k + 2) * stry(j) * + met(3, i, j, k + 2) * stry(j) + + met(4, i, j, k + 2) * met(4, i, j, k + 2)); + + float_sw4 mux1 = cof2 - tf * (cof3 + cof1); + float_sw4 mux2 = cof1 + cof4 + 3 * (cof3 + 
cof2); + float_sw4 mux3 = cof2 + cof5 + 3 * (cof4 + cof3); + float_sw4 mux4 = cof4 - tf * (cof3 + cof5); + + r1 += i6 * + (mux1 * (u(1, i, j, k - 2) - u(1, i, j, k)) + + mux2 * (u(1, i, j, k - 1) - u(1, i, j, k)) + + mux3 * (u(1, i, j, k + 1) - u(1, i, j, k)) + + mux4 * (u(1, i, j, k + 2) - u(1, i, j, k))) * + istrxy; + } + // rr derivative (v) + // 42 ops, tot=226 + cof1 = (mu(i, j, k - 2) + la(i, j, k - 2)) * met(2, i, j, k - 2) * + met(3, i, j, k - 2); + cof2 = (mu(i, j, k - 1) + la(i, j, k - 1)) * met(2, i, j, k - 1) * + met(3, i, j, k - 1); + cof3 = + (mu(i, j, k) + la(i, j, k)) * met(2, i, j, k) * met(3, i, j, k); + cof4 = (mu(i, j, k + 1) + la(i, j, k + 1)) * met(2, i, j, k + 1) * + met(3, i, j, k + 1); + cof5 = (mu(i, j, k + 2) + la(i, j, k + 2)) * met(2, i, j, k + 2) * + met(3, i, j, k + 2); + mux1 = cof2 - tf * (cof3 + cof1); + mux2 = cof1 + cof4 + 3 * (cof3 + cof2); + mux3 = cof2 + cof5 + 3 * (cof4 + cof3); + mux4 = cof4 - tf * (cof3 + cof5); + + r1 += i6 * (mux1 * (u(2, i, j, k - 2) - u(2, i, j, k)) + + mux2 * (u(2, i, j, k - 1) - u(2, i, j, k)) + + mux3 * (u(2, i, j, k + 1) - u(2, i, j, k)) + + mux4 * (u(2, i, j, k + 2) - u(2, i, j, k))); + + // rr derivative (w) + // 43 ops, tot=269 + cof1 = (mu(i, j, k - 2) + la(i, j, k - 2)) * met(2, i, j, k - 2) * + met(4, i, j, k - 2); + cof2 = (mu(i, j, k - 1) + la(i, j, k - 1)) * met(2, i, j, k - 1) * + met(4, i, j, k - 1); + cof3 = + (mu(i, j, k) + la(i, j, k)) * met(2, i, j, k) * met(4, i, j, k); + cof4 = (mu(i, j, k + 1) + la(i, j, k + 1)) * met(2, i, j, k + 1) * + met(4, i, j, k + 1); + cof5 = (mu(i, j, k + 2) + la(i, j, k + 2)) * met(2, i, j, k + 2) * + met(4, i, j, k + 2); + mux1 = cof2 - tf * (cof3 + cof1); + mux2 = cof1 + cof4 + 3 * (cof3 + cof2); + mux3 = cof2 + cof5 + 3 * (cof4 + cof3); + mux4 = cof4 - tf * (cof3 + cof5); + + r1 += i6 * + (mux1 * (u(3, i, j, k - 2) - u(3, i, j, k)) + + mux2 * (u(3, i, j, k - 1) - u(3, i, j, k)) + + mux3 * (u(3, i, j, k + 1) - u(3, i, j, k)) + + mux4 * (u(3, i, j, k + 
2) - u(3, i, j, k))) * + istry; + + // pq-derivatives + // 38 ops, tot=307 + r1 += + c2 * + (mu(i, j + 2, k) * met(1, i, j + 2, k) * met(1, i, j + 2, k) * + (c2 * (u(2, i + 2, j + 2, k) - u(2, i - 2, j + 2, k)) + + c1 * (u(2, i + 1, j + 2, k) - u(2, i - 1, j + 2, k))) - + mu(i, j - 2, k) * met(1, i, j - 2, k) * met(1, i, j - 2, k) * + (c2 * (u(2, i + 2, j - 2, k) - u(2, i - 2, j - 2, k)) + + c1 * (u(2, i + 1, j - 2, k) - u(2, i - 1, j - 2, k)))) + + c1 * + (mu(i, j + 1, k) * met(1, i, j + 1, k) * met(1, i, j + 1, k) * + (c2 * (u(2, i + 2, j + 1, k) - u(2, i - 2, j + 1, k)) + + c1 * (u(2, i + 1, j + 1, k) - u(2, i - 1, j + 1, k))) - + mu(i, j - 1, k) * met(1, i, j - 1, k) * met(1, i, j - 1, k) * + (c2 * (u(2, i + 2, j - 1, k) - u(2, i - 2, j - 1, k)) + + c1 * (u(2, i + 1, j - 1, k) - u(2, i - 1, j - 1, k)))); + + // qp-derivatives + // 38 ops, tot=345 + r1 += + c2 * + (la(i + 2, j, k) * met(1, i + 2, j, k) * met(1, i + 2, j, k) * + (c2 * (u(2, i + 2, j + 2, k) - u(2, i + 2, j - 2, k)) + + c1 * (u(2, i + 2, j + 1, k) - u(2, i + 2, j - 1, k))) - + la(i - 2, j, k) * met(1, i - 2, j, k) * met(1, i - 2, j, k) * + (c2 * (u(2, i - 2, j + 2, k) - u(2, i - 2, j - 2, k)) + + c1 * (u(2, i - 2, j + 1, k) - u(2, i - 2, j - 1, k)))) + + c1 * + (la(i + 1, j, k) * met(1, i + 1, j, k) * met(1, i + 1, j, k) * + (c2 * (u(2, i + 1, j + 2, k) - u(2, i + 1, j - 2, k)) + + c1 * (u(2, i + 1, j + 1, k) - u(2, i + 1, j - 1, k))) - + la(i - 1, j, k) * met(1, i - 1, j, k) * met(1, i - 1, j, k) * + (c2 * (u(2, i - 1, j + 2, k) - u(2, i - 1, j - 2, k)) + + c1 * (u(2, i - 1, j + 1, k) - u(2, i - 1, j - 1, k)))); + + // pr-derivatives + // 130 ops., tot=475 + r1 += + c2 * + ((2 * mu(i, j, k + 2) + la(i, j, k + 2)) * + met(2, i, j, k + 2) * met(1, i, j, k + 2) * + (c2 * (u(1, i + 2, j, k + 2) - u(1, i - 2, j, k + 2)) + + c1 * (u(1, i + 1, j, k + 2) - u(1, i - 1, j, k + 2))) * + strx(i) * istry + + mu(i, j, k + 2) * met(3, i, j, k + 2) * met(1, i, j, k + 2) * + (c2 * (u(2, i + 2, j, k + 2) - u(2, i 
- 2, j, k + 2)) + + c1 * (u(2, i + 1, j, k + 2) - u(2, i - 1, j, k + 2))) + + mu(i, j, k + 2) * met(4, i, j, k + 2) * met(1, i, j, k + 2) * + (c2 * (u(3, i + 2, j, k + 2) - u(3, i - 2, j, k + 2)) + + c1 * (u(3, i + 1, j, k + 2) - u(3, i - 1, j, k + 2))) * + istry - + ((2 * mu(i, j, k - 2) + la(i, j, k - 2)) * + met(2, i, j, k - 2) * met(1, i, j, k - 2) * + (c2 * (u(1, i + 2, j, k - 2) - u(1, i - 2, j, k - 2)) + + c1 * (u(1, i + 1, j, k - 2) - u(1, i - 1, j, k - 2))) * + strx(i) * istry + + mu(i, j, k - 2) * met(3, i, j, k - 2) * + met(1, i, j, k - 2) * + (c2 * (u(2, i + 2, j, k - 2) - u(2, i - 2, j, k - 2)) + + c1 * (u(2, i + 1, j, k - 2) - u(2, i - 1, j, k - 2))) + + mu(i, j, k - 2) * met(4, i, j, k - 2) * + met(1, i, j, k - 2) * + (c2 * (u(3, i + 2, j, k - 2) - u(3, i - 2, j, k - 2)) + + c1 * (u(3, i + 1, j, k - 2) - u(3, i - 1, j, k - 2))) * + istry)) + + c1 * + ((2 * mu(i, j, k + 1) + la(i, j, k + 1)) * + met(2, i, j, k + 1) * met(1, i, j, k + 1) * + (c2 * (u(1, i + 2, j, k + 1) - u(1, i - 2, j, k + 1)) + + c1 * (u(1, i + 1, j, k + 1) - u(1, i - 1, j, k + 1))) * + strx(i) * istry + + mu(i, j, k + 1) * met(3, i, j, k + 1) * met(1, i, j, k + 1) * + (c2 * (u(2, i + 2, j, k + 1) - u(2, i - 2, j, k + 1)) + + c1 * (u(2, i + 1, j, k + 1) - u(2, i - 1, j, k + 1))) + + mu(i, j, k + 1) * met(4, i, j, k + 1) * met(1, i, j, k + 1) * + (c2 * (u(3, i + 2, j, k + 1) - u(3, i - 2, j, k + 1)) + + c1 * (u(3, i + 1, j, k + 1) - u(3, i - 1, j, k + 1))) * + istry - + ((2 * mu(i, j, k - 1) + la(i, j, k - 1)) * + met(2, i, j, k - 1) * met(1, i, j, k - 1) * + (c2 * (u(1, i + 2, j, k - 1) - u(1, i - 2, j, k - 1)) + + c1 * (u(1, i + 1, j, k - 1) - u(1, i - 1, j, k - 1))) * + strx(i) * istry + + mu(i, j, k - 1) * met(3, i, j, k - 1) * + met(1, i, j, k - 1) * + (c2 * (u(2, i + 2, j, k - 1) - u(2, i - 2, j, k - 1)) + + c1 * (u(2, i + 1, j, k - 1) - u(2, i - 1, j, k - 1))) + + mu(i, j, k - 1) * met(4, i, j, k - 1) * + met(1, i, j, k - 1) * + (c2 * (u(3, i + 2, j, k - 1) - u(3, i - 2, j, k - 
1)) + + c1 * (u(3, i + 1, j, k - 1) - u(3, i - 1, j, k - 1))) * + istry)); + + // rp derivatives + // 130 ops, tot=605 + r1 += + (c2 * + ((2 * mu(i + 2, j, k) + la(i + 2, j, k)) * + met(2, i + 2, j, k) * met(1, i + 2, j, k) * + (c2 * (u(1, i + 2, j, k + 2) - u(1, i + 2, j, k - 2)) + + c1 * (u(1, i + 2, j, k + 1) - u(1, i + 2, j, k - 1))) * + strx(i + 2) + + la(i + 2, j, k) * met(3, i + 2, j, k) * + met(1, i + 2, j, k) * + (c2 * (u(2, i + 2, j, k + 2) - u(2, i + 2, j, k - 2)) + + c1 * (u(2, i + 2, j, k + 1) - u(2, i + 2, j, k - 1))) * + stry(j) + + la(i + 2, j, k) * met(4, i + 2, j, k) * + met(1, i + 2, j, k) * + (c2 * (u(3, i + 2, j, k + 2) - u(3, i + 2, j, k - 2)) + + c1 * (u(3, i + 2, j, k + 1) - u(3, i + 2, j, k - 1))) - + ((2 * mu(i - 2, j, k) + la(i - 2, j, k)) * + met(2, i - 2, j, k) * met(1, i - 2, j, k) * + (c2 * (u(1, i - 2, j, k + 2) - u(1, i - 2, j, k - 2)) + + c1 * + (u(1, i - 2, j, k + 1) - u(1, i - 2, j, k - 1))) * + strx(i - 2) + + la(i - 2, j, k) * met(3, i - 2, j, k) * + met(1, i - 2, j, k) * + (c2 * (u(2, i - 2, j, k + 2) - u(2, i - 2, j, k - 2)) + + c1 * + (u(2, i - 2, j, k + 1) - u(2, i - 2, j, k - 1))) * + stry(j) + + la(i - 2, j, k) * met(4, i - 2, j, k) * + met(1, i - 2, j, k) * + (c2 * (u(3, i - 2, j, k + 2) - u(3, i - 2, j, k - 2)) + + c1 * (u(3, i - 2, j, k + 1) - + u(3, i - 2, j, k - 1))))) + + c1 * + ((2 * mu(i + 1, j, k) + la(i + 1, j, k)) * + met(2, i + 1, j, k) * met(1, i + 1, j, k) * + (c2 * (u(1, i + 1, j, k + 2) - u(1, i + 1, j, k - 2)) + + c1 * (u(1, i + 1, j, k + 1) - u(1, i + 1, j, k - 1))) * + strx(i + 1) + + la(i + 1, j, k) * met(3, i + 1, j, k) * + met(1, i + 1, j, k) * + (c2 * (u(2, i + 1, j, k + 2) - u(2, i + 1, j, k - 2)) + + c1 * (u(2, i + 1, j, k + 1) - u(2, i + 1, j, k - 1))) * + stry(j) + + la(i + 1, j, k) * met(4, i + 1, j, k) * + met(1, i + 1, j, k) * + (c2 * (u(3, i + 1, j, k + 2) - u(3, i + 1, j, k - 2)) + + c1 * (u(3, i + 1, j, k + 1) - u(3, i + 1, j, k - 1))) - + ((2 * mu(i - 1, j, k) + la(i - 1, j, k)) * + 
met(2, i - 1, j, k) * met(1, i - 1, j, k) * + (c2 * (u(1, i - 1, j, k + 2) - u(1, i - 1, j, k - 2)) + + c1 * + (u(1, i - 1, j, k + 1) - u(1, i - 1, j, k - 1))) * + strx(i - 1) + + la(i - 1, j, k) * met(3, i - 1, j, k) * + met(1, i - 1, j, k) * + (c2 * (u(2, i - 1, j, k + 2) - u(2, i - 1, j, k - 2)) + + c1 * + (u(2, i - 1, j, k + 1) - u(2, i - 1, j, k - 1))) * + stry(j) + + la(i - 1, j, k) * met(4, i - 1, j, k) * + met(1, i - 1, j, k) * + (c2 * (u(3, i - 1, j, k + 2) - u(3, i - 1, j, k - 2)) + + c1 * (u(3, i - 1, j, k + 1) - + u(3, i - 1, j, k - 1)))))) * + istry; + + // qr derivatives + // 82 ops, tot=687 + r1 += + c2 * + (mu(i, j, k + 2) * met(3, i, j, k + 2) * met(1, i, j, k + 2) * + (c2 * (u(1, i, j + 2, k + 2) - u(1, i, j - 2, k + 2)) + + c1 * (u(1, i, j + 1, k + 2) - u(1, i, j - 1, k + 2))) * + stry(j) * istrx + + la(i, j, k + 2) * met(2, i, j, k + 2) * met(1, i, j, k + 2) * + (c2 * (u(2, i, j + 2, k + 2) - u(2, i, j - 2, k + 2)) + + c1 * (u(2, i, j + 1, k + 2) - u(2, i, j - 1, k + 2))) - + (mu(i, j, k - 2) * met(3, i, j, k - 2) * + met(1, i, j, k - 2) * + (c2 * (u(1, i, j + 2, k - 2) - u(1, i, j - 2, k - 2)) + + c1 * (u(1, i, j + 1, k - 2) - u(1, i, j - 1, k - 2))) * + stry(j) * istrx + + la(i, j, k - 2) * met(2, i, j, k - 2) * + met(1, i, j, k - 2) * + (c2 * (u(2, i, j + 2, k - 2) - u(2, i, j - 2, k - 2)) + + c1 * (u(2, i, j + 1, k - 2) - + u(2, i, j - 1, k - 2))))) + + c1 * + (mu(i, j, k + 1) * met(3, i, j, k + 1) * met(1, i, j, k + 1) * + (c2 * (u(1, i, j + 2, k + 1) - u(1, i, j - 2, k + 1)) + + c1 * (u(1, i, j + 1, k + 1) - u(1, i, j - 1, k + 1))) * + stry(j) * istrx + + la(i, j, k + 1) * met(2, i, j, k + 1) * met(1, i, j, k + 1) * + (c2 * (u(2, i, j + 2, k + 1) - u(2, i, j - 2, k + 1)) + + c1 * (u(2, i, j + 1, k + 1) - u(2, i, j - 1, k + 1))) - + (mu(i, j, k - 1) * met(3, i, j, k - 1) * + met(1, i, j, k - 1) * + (c2 * (u(1, i, j + 2, k - 1) - u(1, i, j - 2, k - 1)) + + c1 * (u(1, i, j + 1, k - 1) - u(1, i, j - 1, k - 1))) * + stry(j) * istrx + + la(i, j, 
k - 1) * met(2, i, j, k - 1) * + met(1, i, j, k - 1) * + (c2 * (u(2, i, j + 2, k - 1) - u(2, i, j - 2, k - 1)) + + c1 * + (u(2, i, j + 1, k - 1) - u(2, i, j - 1, k - 1))))); + + // rq derivatives + // 82 ops, tot=769 + r1 += + c2 * + (mu(i, j + 2, k) * met(3, i, j + 2, k) * met(1, i, j + 2, k) * + (c2 * (u(1, i, j + 2, k + 2) - u(1, i, j + 2, k - 2)) + + c1 * (u(1, i, j + 2, k + 1) - u(1, i, j + 2, k - 1))) * + stry(j + 2) * istrx + + mu(i, j + 2, k) * met(2, i, j + 2, k) * met(1, i, j + 2, k) * + (c2 * (u(2, i, j + 2, k + 2) - u(2, i, j + 2, k - 2)) + + c1 * (u(2, i, j + 2, k + 1) - u(2, i, j + 2, k - 1))) - + (mu(i, j - 2, k) * met(3, i, j - 2, k) * + met(1, i, j - 2, k) * + (c2 * (u(1, i, j - 2, k + 2) - u(1, i, j - 2, k - 2)) + + c1 * (u(1, i, j - 2, k + 1) - u(1, i, j - 2, k - 1))) * + stry(j - 2) * istrx + + mu(i, j - 2, k) * met(2, i, j - 2, k) * + met(1, i, j - 2, k) * + (c2 * (u(2, i, j - 2, k + 2) - u(2, i, j - 2, k - 2)) + + c1 * (u(2, i, j - 2, k + 1) - + u(2, i, j - 2, k - 1))))) + + c1 * + (mu(i, j + 1, k) * met(3, i, j + 1, k) * met(1, i, j + 1, k) * + (c2 * (u(1, i, j + 1, k + 2) - u(1, i, j + 1, k - 2)) + + c1 * (u(1, i, j + 1, k + 1) - u(1, i, j + 1, k - 1))) * + stry(j + 1) * istrx + + mu(i, j + 1, k) * met(2, i, j + 1, k) * met(1, i, j + 1, k) * + (c2 * (u(2, i, j + 1, k + 2) - u(2, i, j + 1, k - 2)) + + c1 * (u(2, i, j + 1, k + 1) - u(2, i, j + 1, k - 1))) - + (mu(i, j - 1, k) * met(3, i, j - 1, k) * + met(1, i, j - 1, k) * + (c2 * (u(1, i, j - 1, k + 2) - u(1, i, j - 1, k - 2)) + + c1 * (u(1, i, j - 1, k + 1) - u(1, i, j - 1, k - 1))) * + stry(j - 1) * istrx + + mu(i, j - 1, k) * met(2, i, j - 1, k) * + met(1, i, j - 1, k) * + (c2 * (u(2, i, j - 1, k + 2) - u(2, i, j - 1, k - 2)) + + c1 * + (u(2, i, j - 1, k + 1) - u(2, i, j - 1, k - 1))))); + + // 4 ops, tot=773 + lu(1, i, j, k) = a1 * lu(1, i, j, k) + sgn * r1 * ijac; + + + + + + } + +/// + +#ifndef RAJAPerf_Apps_SW4CK_KERNEL_2_HPP +#define RAJAPerf_Apps_SW4CK_KERNEL_2_HPP + +#define 
SW4CK_KERNEL_2_DATA_SETUP \ + +#define SW4CK_KERNEL_2_BODY \ + + + +#include "common/KernelBase.hpp" + +namespace rajaperf +{ +class RunParams; + +namespace apps +{ +class ADomain; + +class SW4CK_KERNEL_2 : public KernelBase +{ +public: + + SW4CK_KERNEL_2(const RunParams& params); + + ~SW4CK_KERNEL_2(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template < size_t block_size > + void runCudaVariantImpl(VariantID vid); + template < size_t block_size > + void runHipVariantImpl(VariantID vid); + +private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; + + Real_ptr m_x; + Real_ptr m_y; + Real_ptr m_z; + Real_ptr m_vol; + + Real_type m_vnormq; + + ADomain* m_domain; + Index_type m_array_length; +}; + +} // end namespace apps +} // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 27650cf56..4e9465008 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -91,6 +91,7 @@ #include "apps/MASS3DPA.hpp" #include "apps/NODAL_ACCUMULATION_3D.hpp" #include "apps/PRESSURE.hpp" +#include "apps/SW4CK_KERNEL_2.hpp" #include "apps/VOL3D.hpp" // @@ -227,6 +228,7 @@ static const std::string KernelNames [] = std::string("Apps_MASS3DPA"), std::string("Apps_NODAL_ACCUMULATION_3D"), std::string("Apps_PRESSURE"), + std::string("Apps_SW4CK_KERNEL_2"), std::string("Apps_VOL3D"), // @@ -771,6 +773,10 @@
KernelBase* getKernelObject(KernelID kid, kernel = new apps::PRESSURE(run_params); break; } + case Apps_SW4CK_KERNEL_2 : { + kernel = new apps::SW4CK_KERNEL_2(run_params); + break; + } case Apps_VOL3D : { kernel = new apps::VOL3D(run_params); break; From 49a9d501372aea6d8ad9da79e30070acafdfd89b Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 20 Mar 2023 12:55:40 -0700 Subject: [PATCH 2/7] initial commit for SW4CK kernels --- src/apps/SW4CK_KERNEL_2-Seq.cpp | 81 +++ src/apps/SW4CK_KERNEL_2.cpp | 95 ++++ src/apps/SW4CK_KERNEL_2.hpp | 884 +++++++++++++++----------------- src/common/RAJAPerfSuite.hpp | 1 + 4 files changed, 594 insertions(+), 467 deletions(-) create mode 100644 src/apps/SW4CK_KERNEL_2-Seq.cpp create mode 100644 src/apps/SW4CK_KERNEL_2.cpp diff --git a/src/apps/SW4CK_KERNEL_2-Seq.cpp b/src/apps/SW4CK_KERNEL_2-Seq.cpp new file mode 100644 index 000000000..ca6e21a7c --- /dev/null +++ b/src/apps/SW4CK_KERNEL_2-Seq.cpp @@ -0,0 +1,81 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details.
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "SW4CK_KERNEL_2.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" + +#include <iostream> + +namespace rajaperf +{ +namespace apps +{ + + +void SW4CK_KERNEL_2::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ // Times run_reps repetitions for the selected variant; the loop bodies are still placeholders + const Index_type run_reps = getRunReps(); + + SW4CK_KERNEL_2_DATA_SETUP; + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + //Reference impl + + } + stopTimer(); + + break; + } + +#if defined(RUN_RAJA_SEQ) + case Lambda_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + //Lambda impl + + } + stopTimer(); + + break; + } + + case RAJA_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + //RAJA impl + + } + stopTimer(); + + break; + } +#endif // RUN_RAJA_SEQ + + default : { + getCout() << "\n SW4CK_KERNEL_2 : Unknown variant id = " << vid << std::endl; + } + + } + +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/SW4CK_KERNEL_2.cpp b/src/apps/SW4CK_KERNEL_2.cpp new file mode 100644 index 000000000..d1e7cca1b --- /dev/null +++ b/src/apps/SW4CK_KERNEL_2.cpp @@ -0,0 +1,95 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details.
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "SW4CK_KERNEL_2.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" +#include "common/DataUtils.hpp" + +#include <cmath> + + +namespace rajaperf +{ +namespace apps +{ + + +SW4CK_KERNEL_2::SW4CK_KERNEL_2(const RunParams& params) + : KernelBase(rajaperf::Apps_SW4CK_KERNEL_2, params) +{ + setDefaultProblemSize(100*100*100); // See rzmax in ADomain struct + setDefaultReps(100); + + Index_type rzmax = std::cbrt(getTargetProblemSize())+1; // NOTE(review): unused while the ADomain setup below stays commented out + //m_domain = new ADomain(rzmax, /* ndims = */ 3); + + //m_array_length = m_domain->nnalls; + + //setActualProblemSize( m_domain->lpz+1 - m_domain->fpz ); + + //setItsPerRep( m_domain->lpz+1 - m_domain->fpz ); + setKernelsPerRep(1); + // touched data size, not actual number of stores and loads + // setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() + +//(0*sizeof(Real_type) + 3*sizeof(Real_type)) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); + + //setFLOPsPerRep(72 * (m_domain->lpz+1 - m_domain->fpz)); + + checksum_scale_factor = 0.001 * + ( static_cast<Real_type>(getDefaultProblemSize()) / + getActualProblemSize() ); // NOTE(review): setActualProblemSize() is commented out above - confirm getActualProblemSize() is nonzero before dividing + + setUsesFeature(Teams); + + //Goal is to get the following three variants right first + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + /* + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); + */ +} + +SW4CK_KERNEL_2::~SW4CK_KERNEL_2() +{ + // delete m_domain; +} + +void SW4CK_KERNEL_2::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + +} + +void SW4CK_KERNEL_2::updateChecksum(VariantID vid, size_t tune_idx) +{ +
//checksum[vid][tune_idx] += calcChecksum(m_vol, m_array_length, checksum_scale_factor ); +} + +void SW4CK_KERNEL_2::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/SW4CK_KERNEL_2.hpp b/src/apps/SW4CK_KERNEL_2.hpp index 925a2cf20..b5b1bbdff 100644 --- a/src/apps/SW4CK_KERNEL_2.hpp +++ b/src/apps/SW4CK_KERNEL_2.hpp @@ -1,3 +1,4 @@ + //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Copyright (c) 2017-23, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. @@ -8,484 +9,433 @@ /// /// SW4CK_KERNEL_2 kernel reference implementation: +/// https://github.com/LLNL/SW4CK /// +/// for (int k = kstart; k <= klast - 2; k++) +/// for (int j = jfirst + 2; j <= jlast - 2; j++) +/// for (int i = ifirst + 2; i <= ilast - 2; i++) { /// +/// // 5 ops +/// float_sw4 ijac = strx(i) * stry(j) / jac(i, j, k); +/// float_sw4 istry = 1 / (stry(j)); +/// float_sw4 istrx = 1 / (strx(i)); +/// float_sw4 istrxy = istry * istrx; /// -int main() -{ - for( int k= kstart; k <= klast-2 ; k++ ) - for( int j=jfirst+2; j <= jlast-2 ; j++ ) - for( int i=ifirst+2; i <= ilast-2 ; i++ ) - { - - // 5 ops - float_sw4 ijac = strx(i) * stry(j) / jac(i, j, k); - float_sw4 istry = 1 / (stry(j)); - float_sw4 istrx = 1 / (strx(i)); - float_sw4 istrxy = istry * istrx; - - float_sw4 r1 = 0; - - // pp derivative (u) - // 53 ops, tot=58 - float_sw4 cof1 = (2 * mu(i - 2, j, k) + la(i - 2, j, k)) * - met(1, i - 2, j, k) * met(1, i - 2, j, k) * - strx(i - 2); - float_sw4 cof2 = (2 * mu(i - 1, j, k) + la(i - 1, j, k)) * - met(1, i - 1, j, k) * met(1, i - 1, j, k) * - strx(i - 1); - float_sw4 cof3 = (2 * mu(i, j, k) + la(i, j, k)) * met(1, i, j, k) * - met(1, i, j, k) * strx(i); - float_sw4 cof4 = (2 * mu(i + 1, j, k) + la(i + 1, j, k)) * - met(1, i + 1, j, k) * met(1, i + 1, j, k) * - strx(i + 1); - float_sw4 cof5 = (2 * mu(i + 2, j, 
k) + la(i + 2, j, k)) * - met(1, i + 2, j, k) * met(1, i + 2, j, k) * - strx(i + 2); - float_sw4 mux1 = cof2 - tf * (cof3 + cof1); - float_sw4 mux2 = cof1 + cof4 + 3 * (cof3 + cof2); - float_sw4 mux3 = cof2 + cof5 + 3 * (cof4 + cof3); - float_sw4 mux4 = cof4 - tf * (cof3 + cof5); - - r1 += i6 * - (mux1 * (u(1, i - 2, j, k) - u(1, i, j, k)) + - mux2 * (u(1, i - 1, j, k) - u(1, i, j, k)) + - mux3 * (u(1, i + 1, j, k) - u(1, i, j, k)) + - mux4 * (u(1, i + 2, j, k) - u(1, i, j, k))) * - istry; - // qq derivative (u) - // 43 ops, tot=101 - { - float_sw4 cof1 = (mu(i, j - 2, k)) * met(1, i, j - 2, k) * - met(1, i, j - 2, k) * stry(j - 2); - float_sw4 cof2 = (mu(i, j - 1, k)) * met(1, i, j - 1, k) * - met(1, i, j - 1, k) * stry(j - 1); - float_sw4 cof3 = - (mu(i, j, k)) * met(1, i, j, k) * met(1, i, j, k) * stry(j); - float_sw4 cof4 = (mu(i, j + 1, k)) * met(1, i, j + 1, k) * - met(1, i, j + 1, k) * stry(j + 1); - float_sw4 cof5 = (mu(i, j + 2, k)) * met(1, i, j + 2, k) * - met(1, i, j + 2, k) * stry(j + 2); - float_sw4 mux1 = cof2 - tf * (cof3 + cof1); - float_sw4 mux2 = cof1 + cof4 + 3 * (cof3 + cof2); - float_sw4 mux3 = cof2 + cof5 + 3 * (cof4 + cof3); - float_sw4 mux4 = cof4 - tf * (cof3 + cof5); - - r1 += i6 * - (mux1 * (u(1, i, j - 2, k) - u(1, i, j, k)) + - mux2 * (u(1, i, j - 1, k) - u(1, i, j, k)) + - mux3 * (u(1, i, j + 1, k) - u(1, i, j, k)) + - mux4 * (u(1, i, j + 2, k) - u(1, i, j, k))) * - istrx; - } -#ifdef MAGIC_SYNC - __syncthreads(); -#endif - // rr derivative (u) - // 5*11+14+14=83 ops, tot=184 - { - float_sw4 cof1 = - (2 * mu(i, j, k - 2) + la(i, j, k - 2)) * met(2, i, j, k - 2) * - strx(i) * met(2, i, j, k - 2) * strx(i) + - mu(i, j, k - 2) * (met(3, i, j, k - 2) * stry(j) * - met(3, i, j, k - 2) * stry(j) + - met(4, i, j, k - 2) * met(4, i, j, k - 2)); - float_sw4 cof2 = - (2 * mu(i, j, k - 1) + la(i, j, k - 1)) * met(2, i, j, k - 1) * - strx(i) * met(2, i, j, k - 1) * strx(i) + - mu(i, j, k - 1) * (met(3, i, j, k - 1) * stry(j) * - met(3, i, j, k - 
1) * stry(j) + - met(4, i, j, k - 1) * met(4, i, j, k - 1)); - float_sw4 cof3 = (2 * mu(i, j, k) + la(i, j, k)) * met(2, i, j, k) * - strx(i) * met(2, i, j, k) * strx(i) + - mu(i, j, k) * (met(3, i, j, k) * stry(j) * - met(3, i, j, k) * stry(j) + - met(4, i, j, k) * met(4, i, j, k)); - float_sw4 cof4 = - (2 * mu(i, j, k + 1) + la(i, j, k + 1)) * met(2, i, j, k + 1) * - strx(i) * met(2, i, j, k + 1) * strx(i) + - mu(i, j, k + 1) * (met(3, i, j, k + 1) * stry(j) * - met(3, i, j, k + 1) * stry(j) + - met(4, i, j, k + 1) * met(4, i, j, k + 1)); - float_sw4 cof5 = - (2 * mu(i, j, k + 2) + la(i, j, k + 2)) * met(2, i, j, k + 2) * - strx(i) * met(2, i, j, k + 2) * strx(i) + - mu(i, j, k + 2) * (met(3, i, j, k + 2) * stry(j) * - met(3, i, j, k + 2) * stry(j) + - met(4, i, j, k + 2) * met(4, i, j, k + 2)); - - float_sw4 mux1 = cof2 - tf * (cof3 + cof1); - float_sw4 mux2 = cof1 + cof4 + 3 * (cof3 + cof2); - float_sw4 mux3 = cof2 + cof5 + 3 * (cof4 + cof3); - float_sw4 mux4 = cof4 - tf * (cof3 + cof5); - - r1 += i6 * - (mux1 * (u(1, i, j, k - 2) - u(1, i, j, k)) + - mux2 * (u(1, i, j, k - 1) - u(1, i, j, k)) + - mux3 * (u(1, i, j, k + 1) - u(1, i, j, k)) + - mux4 * (u(1, i, j, k + 2) - u(1, i, j, k))) * - istrxy; - } - // rr derivative (v) - // 42 ops, tot=226 - cof1 = (mu(i, j, k - 2) + la(i, j, k - 2)) * met(2, i, j, k - 2) * - met(3, i, j, k - 2); - cof2 = (mu(i, j, k - 1) + la(i, j, k - 1)) * met(2, i, j, k - 1) * - met(3, i, j, k - 1); - cof3 = - (mu(i, j, k) + la(i, j, k)) * met(2, i, j, k) * met(3, i, j, k); - cof4 = (mu(i, j, k + 1) + la(i, j, k + 1)) * met(2, i, j, k + 1) * - met(3, i, j, k + 1); - cof5 = (mu(i, j, k + 2) + la(i, j, k + 2)) * met(2, i, j, k + 2) * - met(3, i, j, k + 2); - mux1 = cof2 - tf * (cof3 + cof1); - mux2 = cof1 + cof4 + 3 * (cof3 + cof2); - mux3 = cof2 + cof5 + 3 * (cof4 + cof3); - mux4 = cof4 - tf * (cof3 + cof5); - - r1 += i6 * (mux1 * (u(2, i, j, k - 2) - u(2, i, j, k)) + - mux2 * (u(2, i, j, k - 1) - u(2, i, j, k)) + - mux3 * (u(2, i, j, 
k + 1) - u(2, i, j, k)) + - mux4 * (u(2, i, j, k + 2) - u(2, i, j, k))); - - // rr derivative (w) - // 43 ops, tot=269 - cof1 = (mu(i, j, k - 2) + la(i, j, k - 2)) * met(2, i, j, k - 2) * - met(4, i, j, k - 2); - cof2 = (mu(i, j, k - 1) + la(i, j, k - 1)) * met(2, i, j, k - 1) * - met(4, i, j, k - 1); - cof3 = - (mu(i, j, k) + la(i, j, k)) * met(2, i, j, k) * met(4, i, j, k); - cof4 = (mu(i, j, k + 1) + la(i, j, k + 1)) * met(2, i, j, k + 1) * - met(4, i, j, k + 1); - cof5 = (mu(i, j, k + 2) + la(i, j, k + 2)) * met(2, i, j, k + 2) * - met(4, i, j, k + 2); - mux1 = cof2 - tf * (cof3 + cof1); - mux2 = cof1 + cof4 + 3 * (cof3 + cof2); - mux3 = cof2 + cof5 + 3 * (cof4 + cof3); - mux4 = cof4 - tf * (cof3 + cof5); - - r1 += i6 * - (mux1 * (u(3, i, j, k - 2) - u(3, i, j, k)) + - mux2 * (u(3, i, j, k - 1) - u(3, i, j, k)) + - mux3 * (u(3, i, j, k + 1) - u(3, i, j, k)) + - mux4 * (u(3, i, j, k + 2) - u(3, i, j, k))) * - istry; - - // pq-derivatives - // 38 ops, tot=307 - r1 += - c2 * - (mu(i, j + 2, k) * met(1, i, j + 2, k) * met(1, i, j + 2, k) * - (c2 * (u(2, i + 2, j + 2, k) - u(2, i - 2, j + 2, k)) + - c1 * (u(2, i + 1, j + 2, k) - u(2, i - 1, j + 2, k))) - - mu(i, j - 2, k) * met(1, i, j - 2, k) * met(1, i, j - 2, k) * - (c2 * (u(2, i + 2, j - 2, k) - u(2, i - 2, j - 2, k)) + - c1 * (u(2, i + 1, j - 2, k) - u(2, i - 1, j - 2, k)))) + - c1 * - (mu(i, j + 1, k) * met(1, i, j + 1, k) * met(1, i, j + 1, k) * - (c2 * (u(2, i + 2, j + 1, k) - u(2, i - 2, j + 1, k)) + - c1 * (u(2, i + 1, j + 1, k) - u(2, i - 1, j + 1, k))) - - mu(i, j - 1, k) * met(1, i, j - 1, k) * met(1, i, j - 1, k) * - (c2 * (u(2, i + 2, j - 1, k) - u(2, i - 2, j - 1, k)) + - c1 * (u(2, i + 1, j - 1, k) - u(2, i - 1, j - 1, k)))); - - // qp-derivatives - // 38 ops, tot=345 - r1 += - c2 * - (la(i + 2, j, k) * met(1, i + 2, j, k) * met(1, i + 2, j, k) * - (c2 * (u(2, i + 2, j + 2, k) - u(2, i + 2, j - 2, k)) + - c1 * (u(2, i + 2, j + 1, k) - u(2, i + 2, j - 1, k))) - - la(i - 2, j, k) * met(1, i - 2, j, k) 
* met(1, i - 2, j, k) * - (c2 * (u(2, i - 2, j + 2, k) - u(2, i - 2, j - 2, k)) + - c1 * (u(2, i - 2, j + 1, k) - u(2, i - 2, j - 1, k)))) + - c1 * - (la(i + 1, j, k) * met(1, i + 1, j, k) * met(1, i + 1, j, k) * - (c2 * (u(2, i + 1, j + 2, k) - u(2, i + 1, j - 2, k)) + - c1 * (u(2, i + 1, j + 1, k) - u(2, i + 1, j - 1, k))) - - la(i - 1, j, k) * met(1, i - 1, j, k) * met(1, i - 1, j, k) * - (c2 * (u(2, i - 1, j + 2, k) - u(2, i - 1, j - 2, k)) + - c1 * (u(2, i - 1, j + 1, k) - u(2, i - 1, j - 1, k)))); - - // pr-derivatives - // 130 ops., tot=475 - r1 += - c2 * - ((2 * mu(i, j, k + 2) + la(i, j, k + 2)) * - met(2, i, j, k + 2) * met(1, i, j, k + 2) * - (c2 * (u(1, i + 2, j, k + 2) - u(1, i - 2, j, k + 2)) + - c1 * (u(1, i + 1, j, k + 2) - u(1, i - 1, j, k + 2))) * - strx(i) * istry + - mu(i, j, k + 2) * met(3, i, j, k + 2) * met(1, i, j, k + 2) * - (c2 * (u(2, i + 2, j, k + 2) - u(2, i - 2, j, k + 2)) + - c1 * (u(2, i + 1, j, k + 2) - u(2, i - 1, j, k + 2))) + - mu(i, j, k + 2) * met(4, i, j, k + 2) * met(1, i, j, k + 2) * - (c2 * (u(3, i + 2, j, k + 2) - u(3, i - 2, j, k + 2)) + - c1 * (u(3, i + 1, j, k + 2) - u(3, i - 1, j, k + 2))) * - istry - - ((2 * mu(i, j, k - 2) + la(i, j, k - 2)) * - met(2, i, j, k - 2) * met(1, i, j, k - 2) * - (c2 * (u(1, i + 2, j, k - 2) - u(1, i - 2, j, k - 2)) + - c1 * (u(1, i + 1, j, k - 2) - u(1, i - 1, j, k - 2))) * - strx(i) * istry + - mu(i, j, k - 2) * met(3, i, j, k - 2) * - met(1, i, j, k - 2) * - (c2 * (u(2, i + 2, j, k - 2) - u(2, i - 2, j, k - 2)) + - c1 * (u(2, i + 1, j, k - 2) - u(2, i - 1, j, k - 2))) + - mu(i, j, k - 2) * met(4, i, j, k - 2) * - met(1, i, j, k - 2) * - (c2 * (u(3, i + 2, j, k - 2) - u(3, i - 2, j, k - 2)) + - c1 * (u(3, i + 1, j, k - 2) - u(3, i - 1, j, k - 2))) * - istry)) + - c1 * - ((2 * mu(i, j, k + 1) + la(i, j, k + 1)) * - met(2, i, j, k + 1) * met(1, i, j, k + 1) * - (c2 * (u(1, i + 2, j, k + 1) - u(1, i - 2, j, k + 1)) + - c1 * (u(1, i + 1, j, k + 1) - u(1, i - 1, j, k + 1))) * - strx(i) * 
istry + - mu(i, j, k + 1) * met(3, i, j, k + 1) * met(1, i, j, k + 1) * - (c2 * (u(2, i + 2, j, k + 1) - u(2, i - 2, j, k + 1)) + - c1 * (u(2, i + 1, j, k + 1) - u(2, i - 1, j, k + 1))) + - mu(i, j, k + 1) * met(4, i, j, k + 1) * met(1, i, j, k + 1) * - (c2 * (u(3, i + 2, j, k + 1) - u(3, i - 2, j, k + 1)) + - c1 * (u(3, i + 1, j, k + 1) - u(3, i - 1, j, k + 1))) * - istry - - ((2 * mu(i, j, k - 1) + la(i, j, k - 1)) * - met(2, i, j, k - 1) * met(1, i, j, k - 1) * - (c2 * (u(1, i + 2, j, k - 1) - u(1, i - 2, j, k - 1)) + - c1 * (u(1, i + 1, j, k - 1) - u(1, i - 1, j, k - 1))) * - strx(i) * istry + - mu(i, j, k - 1) * met(3, i, j, k - 1) * - met(1, i, j, k - 1) * - (c2 * (u(2, i + 2, j, k - 1) - u(2, i - 2, j, k - 1)) + - c1 * (u(2, i + 1, j, k - 1) - u(2, i - 1, j, k - 1))) + - mu(i, j, k - 1) * met(4, i, j, k - 1) * - met(1, i, j, k - 1) * - (c2 * (u(3, i + 2, j, k - 1) - u(3, i - 2, j, k - 1)) + - c1 * (u(3, i + 1, j, k - 1) - u(3, i - 1, j, k - 1))) * - istry)); - - // rp derivatives - // 130 ops, tot=605 - r1 += - (c2 * - ((2 * mu(i + 2, j, k) + la(i + 2, j, k)) * - met(2, i + 2, j, k) * met(1, i + 2, j, k) * - (c2 * (u(1, i + 2, j, k + 2) - u(1, i + 2, j, k - 2)) + - c1 * (u(1, i + 2, j, k + 1) - u(1, i + 2, j, k - 1))) * - strx(i + 2) + - la(i + 2, j, k) * met(3, i + 2, j, k) * - met(1, i + 2, j, k) * - (c2 * (u(2, i + 2, j, k + 2) - u(2, i + 2, j, k - 2)) + - c1 * (u(2, i + 2, j, k + 1) - u(2, i + 2, j, k - 1))) * - stry(j) + - la(i + 2, j, k) * met(4, i + 2, j, k) * - met(1, i + 2, j, k) * - (c2 * (u(3, i + 2, j, k + 2) - u(3, i + 2, j, k - 2)) + - c1 * (u(3, i + 2, j, k + 1) - u(3, i + 2, j, k - 1))) - - ((2 * mu(i - 2, j, k) + la(i - 2, j, k)) * - met(2, i - 2, j, k) * met(1, i - 2, j, k) * - (c2 * (u(1, i - 2, j, k + 2) - u(1, i - 2, j, k - 2)) + - c1 * - (u(1, i - 2, j, k + 1) - u(1, i - 2, j, k - 1))) * - strx(i - 2) + - la(i - 2, j, k) * met(3, i - 2, j, k) * - met(1, i - 2, j, k) * - (c2 * (u(2, i - 2, j, k + 2) - u(2, i - 2, j, k - 2)) + - c1 * - 
(u(2, i - 2, j, k + 1) - u(2, i - 2, j, k - 1))) * - stry(j) + - la(i - 2, j, k) * met(4, i - 2, j, k) * - met(1, i - 2, j, k) * - (c2 * (u(3, i - 2, j, k + 2) - u(3, i - 2, j, k - 2)) + - c1 * (u(3, i - 2, j, k + 1) - - u(3, i - 2, j, k - 1))))) + - c1 * - ((2 * mu(i + 1, j, k) + la(i + 1, j, k)) * - met(2, i + 1, j, k) * met(1, i + 1, j, k) * - (c2 * (u(1, i + 1, j, k + 2) - u(1, i + 1, j, k - 2)) + - c1 * (u(1, i + 1, j, k + 1) - u(1, i + 1, j, k - 1))) * - strx(i + 1) + - la(i + 1, j, k) * met(3, i + 1, j, k) * - met(1, i + 1, j, k) * - (c2 * (u(2, i + 1, j, k + 2) - u(2, i + 1, j, k - 2)) + - c1 * (u(2, i + 1, j, k + 1) - u(2, i + 1, j, k - 1))) * - stry(j) + - la(i + 1, j, k) * met(4, i + 1, j, k) * - met(1, i + 1, j, k) * - (c2 * (u(3, i + 1, j, k + 2) - u(3, i + 1, j, k - 2)) + - c1 * (u(3, i + 1, j, k + 1) - u(3, i + 1, j, k - 1))) - - ((2 * mu(i - 1, j, k) + la(i - 1, j, k)) * - met(2, i - 1, j, k) * met(1, i - 1, j, k) * - (c2 * (u(1, i - 1, j, k + 2) - u(1, i - 1, j, k - 2)) + - c1 * - (u(1, i - 1, j, k + 1) - u(1, i - 1, j, k - 1))) * - strx(i - 1) + - la(i - 1, j, k) * met(3, i - 1, j, k) * - met(1, i - 1, j, k) * - (c2 * (u(2, i - 1, j, k + 2) - u(2, i - 1, j, k - 2)) + - c1 * - (u(2, i - 1, j, k + 1) - u(2, i - 1, j, k - 1))) * - stry(j) + - la(i - 1, j, k) * met(4, i - 1, j, k) * - met(1, i - 1, j, k) * - (c2 * (u(3, i - 1, j, k + 2) - u(3, i - 1, j, k - 2)) + - c1 * (u(3, i - 1, j, k + 1) - - u(3, i - 1, j, k - 1)))))) * - istry; - - // qr derivatives - // 82 ops, tot=687 - r1 += - c2 * - (mu(i, j, k + 2) * met(3, i, j, k + 2) * met(1, i, j, k + 2) * - (c2 * (u(1, i, j + 2, k + 2) - u(1, i, j - 2, k + 2)) + - c1 * (u(1, i, j + 1, k + 2) - u(1, i, j - 1, k + 2))) * - stry(j) * istrx + - la(i, j, k + 2) * met(2, i, j, k + 2) * met(1, i, j, k + 2) * - (c2 * (u(2, i, j + 2, k + 2) - u(2, i, j - 2, k + 2)) + - c1 * (u(2, i, j + 1, k + 2) - u(2, i, j - 1, k + 2))) - - (mu(i, j, k - 2) * met(3, i, j, k - 2) * - met(1, i, j, k - 2) * - (c2 * (u(1, i, j + 
2, k - 2) - u(1, i, j - 2, k - 2)) + - c1 * (u(1, i, j + 1, k - 2) - u(1, i, j - 1, k - 2))) * - stry(j) * istrx + - la(i, j, k - 2) * met(2, i, j, k - 2) * - met(1, i, j, k - 2) * - (c2 * (u(2, i, j + 2, k - 2) - u(2, i, j - 2, k - 2)) + - c1 * (u(2, i, j + 1, k - 2) - - u(2, i, j - 1, k - 2))))) + - c1 * - (mu(i, j, k + 1) * met(3, i, j, k + 1) * met(1, i, j, k + 1) * - (c2 * (u(1, i, j + 2, k + 1) - u(1, i, j - 2, k + 1)) + - c1 * (u(1, i, j + 1, k + 1) - u(1, i, j - 1, k + 1))) * - stry(j) * istrx + - la(i, j, k + 1) * met(2, i, j, k + 1) * met(1, i, j, k + 1) * - (c2 * (u(2, i, j + 2, k + 1) - u(2, i, j - 2, k + 1)) + - c1 * (u(2, i, j + 1, k + 1) - u(2, i, j - 1, k + 1))) - - (mu(i, j, k - 1) * met(3, i, j, k - 1) * - met(1, i, j, k - 1) * - (c2 * (u(1, i, j + 2, k - 1) - u(1, i, j - 2, k - 1)) + - c1 * (u(1, i, j + 1, k - 1) - u(1, i, j - 1, k - 1))) * - stry(j) * istrx + - la(i, j, k - 1) * met(2, i, j, k - 1) * - met(1, i, j, k - 1) * - (c2 * (u(2, i, j + 2, k - 1) - u(2, i, j - 2, k - 1)) + - c1 * - (u(2, i, j + 1, k - 1) - u(2, i, j - 1, k - 1))))); - - // rq derivatives - // 82 ops, tot=769 - r1 += - c2 * - (mu(i, j + 2, k) * met(3, i, j + 2, k) * met(1, i, j + 2, k) * - (c2 * (u(1, i, j + 2, k + 2) - u(1, i, j + 2, k - 2)) + - c1 * (u(1, i, j + 2, k + 1) - u(1, i, j + 2, k - 1))) * - stry(j + 2) * istrx + - mu(i, j + 2, k) * met(2, i, j + 2, k) * met(1, i, j + 2, k) * - (c2 * (u(2, i, j + 2, k + 2) - u(2, i, j + 2, k - 2)) + - c1 * (u(2, i, j + 2, k + 1) - u(2, i, j + 2, k - 1))) - - (mu(i, j - 2, k) * met(3, i, j - 2, k) * - met(1, i, j - 2, k) * - (c2 * (u(1, i, j - 2, k + 2) - u(1, i, j - 2, k - 2)) + - c1 * (u(1, i, j - 2, k + 1) - u(1, i, j - 2, k - 1))) * - stry(j - 2) * istrx + - mu(i, j - 2, k) * met(2, i, j - 2, k) * - met(1, i, j - 2, k) * - (c2 * (u(2, i, j - 2, k + 2) - u(2, i, j - 2, k - 2)) + - c1 * (u(2, i, j - 2, k + 1) - - u(2, i, j - 2, k - 1))))) + - c1 * - (mu(i, j + 1, k) * met(3, i, j + 1, k) * met(1, i, j + 1, k) * - (c2 * (u(1, 
i, j + 1, k + 2) - u(1, i, j + 1, k - 2)) + - c1 * (u(1, i, j + 1, k + 1) - u(1, i, j + 1, k - 1))) * - stry(j + 1) * istrx + - mu(i, j + 1, k) * met(2, i, j + 1, k) * met(1, i, j + 1, k) * - (c2 * (u(2, i, j + 1, k + 2) - u(2, i, j + 1, k - 2)) + - c1 * (u(2, i, j + 1, k + 1) - u(2, i, j + 1, k - 1))) - - (mu(i, j - 1, k) * met(3, i, j - 1, k) * - met(1, i, j - 1, k) * - (c2 * (u(1, i, j - 1, k + 2) - u(1, i, j - 1, k - 2)) + - c1 * (u(1, i, j - 1, k + 1) - u(1, i, j - 1, k - 1))) * - stry(j - 1) * istrx + - mu(i, j - 1, k) * met(2, i, j - 1, k) * - met(1, i, j - 1, k) * - (c2 * (u(2, i, j - 1, k + 2) - u(2, i, j - 1, k - 2)) + - c1 * - (u(2, i, j - 1, k + 1) - u(2, i, j - 1, k - 1))))); - - // 4 ops, tot=773 - lu(1, i, j, k) = a1 * lu(1, i, j, k) + sgn * r1 * ijac; - - - - - - } - +/// float_sw4 r1 = 0; +/// +/// // pp derivative (u) +/// // 53 ops, tot=58 +/// float_sw4 cof1 = (2 * mu(i - 2, j, k) + la(i - 2, j, k)) * +/// met(1, i - 2, j, k) * met(1, i - 2, j, k) * +/// strx(i - 2); +/// float_sw4 cof2 = (2 * mu(i - 1, j, k) + la(i - 1, j, k)) * +/// met(1, i - 1, j, k) * met(1, i - 1, j, k) * +/// strx(i - 1); +/// float_sw4 cof3 = (2 * mu(i, j, k) + la(i, j, k)) * met(1, i, j, k) * +/// met(1, i, j, k) * strx(i); +/// float_sw4 cof4 = (2 * mu(i + 1, j, k) + la(i + 1, j, k)) * +/// met(1, i + 1, j, k) * met(1, i + 1, j, k) * +/// strx(i + 1); +/// float_sw4 cof5 = (2 * mu(i + 2, j, k) + la(i + 2, j, k)) * +/// met(1, i + 2, j, k) * met(1, i + 2, j, k) * +/// strx(i + 2); +/// float_sw4 mux1 = cof2 - tf * (cof3 + cof1); +/// float_sw4 mux2 = cof1 + cof4 + 3 * (cof3 + cof2); +/// float_sw4 mux3 = cof2 + cof5 + 3 * (cof4 + cof3); +/// float_sw4 mux4 = cof4 - tf * (cof3 + cof5); +/// +/// r1 += i6 * +/// (mux1 * (u(1, i - 2, j, k) - u(1, i, j, k)) + +/// mux2 * (u(1, i - 1, j, k) - u(1, i, j, k)) + +/// mux3 * (u(1, i + 1, j, k) - u(1, i, j, k)) + +/// mux4 * (u(1, i + 2, j, k) - u(1, i, j, k))) * +/// istry; +/// // qq derivative (u) +/// // 43 ops, tot=101 +/// 
{ +/// float_sw4 cof1 = (mu(i, j - 2, k)) * met(1, i, j - 2, k) * +/// met(1, i, j - 2, k) * stry(j - 2); +/// float_sw4 cof2 = (mu(i, j - 1, k)) * met(1, i, j - 1, k) * +/// met(1, i, j - 1, k) * stry(j - 1); +/// float_sw4 cof3 = +/// (mu(i, j, k)) * met(1, i, j, k) * met(1, i, j, k) * stry(j); +/// float_sw4 cof4 = (mu(i, j + 1, k)) * met(1, i, j + 1, k) * +/// met(1, i, j + 1, k) * stry(j + 1); +/// float_sw4 cof5 = (mu(i, j + 2, k)) * met(1, i, j + 2, k) * +/// met(1, i, j + 2, k) * stry(j + 2); +/// float_sw4 mux1 = cof2 - tf * (cof3 + cof1); +/// float_sw4 mux2 = cof1 + cof4 + 3 * (cof3 + cof2); +/// float_sw4 mux3 = cof2 + cof5 + 3 * (cof4 + cof3); +/// float_sw4 mux4 = cof4 - tf * (cof3 + cof5); +/// +/// r1 += i6 * +/// (mux1 * (u(1, i, j - 2, k) - u(1, i, j, k)) + +/// mux2 * (u(1, i, j - 1, k) - u(1, i, j, k)) + +/// mux3 * (u(1, i, j + 1, k) - u(1, i, j, k)) + +/// mux4 * (u(1, i, j + 2, k) - u(1, i, j, k))) * +/// istrx; +/// } +/// #ifdef MAGIC_SYNC +/// __syncthreads(); +/// #endif +/// // rr derivative (u) +/// // 5*11+14+14=83 ops, tot=184 +/// { +/// float_sw4 cof1 = +/// (2 * mu(i, j, k - 2) + la(i, j, k - 2)) * met(2, i, j, k - 2) * +/// strx(i) * met(2, i, j, k - 2) * strx(i) + +/// mu(i, j, k - 2) * (met(3, i, j, k - 2) * stry(j) * +/// met(3, i, j, k - 2) * stry(j) + +/// met(4, i, j, k - 2) * met(4, i, j, k - 2)); +/// float_sw4 cof2 = +/// (2 * mu(i, j, k - 1) + la(i, j, k - 1)) * met(2, i, j, k - 1) * +/// strx(i) * met(2, i, j, k - 1) * strx(i) + +/// mu(i, j, k - 1) * (met(3, i, j, k - 1) * stry(j) * +/// met(3, i, j, k - 1) * stry(j) + +/// met(4, i, j, k - 1) * met(4, i, j, k - 1)); +/// float_sw4 cof3 = (2 * mu(i, j, k) + la(i, j, k)) * met(2, i, j, k) * +/// strx(i) * met(2, i, j, k) * strx(i) + +/// mu(i, j, k) * (met(3, i, j, k) * stry(j) * +/// met(3, i, j, k) * stry(j) + +/// met(4, i, j, k) * met(4, i, j, k)); +/// float_sw4 cof4 = +/// (2 * mu(i, j, k + 1) + la(i, j, k + 1)) * met(2, i, j, k + 1) * +/// strx(i) * met(2, i, j, 
k + 1) * strx(i) + +/// mu(i, j, k + 1) * (met(3, i, j, k + 1) * stry(j) * +/// met(3, i, j, k + 1) * stry(j) + +/// met(4, i, j, k + 1) * met(4, i, j, k + 1)); +/// float_sw4 cof5 = +/// (2 * mu(i, j, k + 2) + la(i, j, k + 2)) * met(2, i, j, k + 2) * +/// strx(i) * met(2, i, j, k + 2) * strx(i) + +/// mu(i, j, k + 2) * (met(3, i, j, k + 2) * stry(j) * +/// met(3, i, j, k + 2) * stry(j) + +/// met(4, i, j, k + 2) * met(4, i, j, k + 2)); /// +/// float_sw4 mux1 = cof2 - tf * (cof3 + cof1); +/// float_sw4 mux2 = cof1 + cof4 + 3 * (cof3 + cof2); +/// float_sw4 mux3 = cof2 + cof5 + 3 * (cof4 + cof3); +/// float_sw4 mux4 = cof4 - tf * (cof3 + cof5); +/// +/// r1 += i6 * +/// (mux1 * (u(1, i, j, k - 2) - u(1, i, j, k)) + +/// mux2 * (u(1, i, j, k - 1) - u(1, i, j, k)) + +/// mux3 * (u(1, i, j, k + 1) - u(1, i, j, k)) + +/// mux4 * (u(1, i, j, k + 2) - u(1, i, j, k))) * +/// istrxy; +/// } +/// // rr derivative (v) +/// // 42 ops, tot=226 +/// cof1 = (mu(i, j, k - 2) + la(i, j, k - 2)) * met(2, i, j, k - 2) * +/// met(3, i, j, k - 2); +/// cof2 = (mu(i, j, k - 1) + la(i, j, k - 1)) * met(2, i, j, k - 1) * +/// met(3, i, j, k - 1); +/// cof3 = (mu(i, j, k) + la(i, j, k)) * met(2, i, j, k) * met(3, i, j, k); +/// cof4 = (mu(i, j, k + 1) + la(i, j, k + 1)) * met(2, i, j, k + 1) * +/// met(3, i, j, k + 1); +/// cof5 = (mu(i, j, k + 2) + la(i, j, k + 2)) * met(2, i, j, k + 2) * +/// met(3, i, j, k + 2); +/// mux1 = cof2 - tf * (cof3 + cof1); +/// mux2 = cof1 + cof4 + 3 * (cof3 + cof2); +/// mux3 = cof2 + cof5 + 3 * (cof4 + cof3); +/// mux4 = cof4 - tf * (cof3 + cof5); +/// +/// r1 += i6 * (mux1 * (u(2, i, j, k - 2) - u(2, i, j, k)) + +/// mux2 * (u(2, i, j, k - 1) - u(2, i, j, k)) + +/// mux3 * (u(2, i, j, k + 1) - u(2, i, j, k)) + +/// mux4 * (u(2, i, j, k + 2) - u(2, i, j, k))); +/// +/// // rr derivative (w) +/// // 43 ops, tot=269 +/// cof1 = (mu(i, j, k - 2) + la(i, j, k - 2)) * met(2, i, j, k - 2) * +/// met(4, i, j, k - 2); +/// cof2 = (mu(i, j, k - 1) + la(i, j, k - 
1)) * met(2, i, j, k - 1) * +/// met(4, i, j, k - 1); +/// cof3 = (mu(i, j, k) + la(i, j, k)) * met(2, i, j, k) * met(4, i, j, k); +/// cof4 = (mu(i, j, k + 1) + la(i, j, k + 1)) * met(2, i, j, k + 1) * +/// met(4, i, j, k + 1); +/// cof5 = (mu(i, j, k + 2) + la(i, j, k + 2)) * met(2, i, j, k + 2) * +/// met(4, i, j, k + 2); +/// mux1 = cof2 - tf * (cof3 + cof1); +/// mux2 = cof1 + cof4 + 3 * (cof3 + cof2); +/// mux3 = cof2 + cof5 + 3 * (cof4 + cof3); +/// mux4 = cof4 - tf * (cof3 + cof5); +/// +/// r1 += i6 * +/// (mux1 * (u(3, i, j, k - 2) - u(3, i, j, k)) + +/// mux2 * (u(3, i, j, k - 1) - u(3, i, j, k)) + +/// mux3 * (u(3, i, j, k + 1) - u(3, i, j, k)) + +/// mux4 * (u(3, i, j, k + 2) - u(3, i, j, k))) * +/// istry; +/// +/// // pq-derivatives +/// // 38 ops, tot=307 +/// r1 += +/// c2 * (mu(i, j + 2, k) * met(1, i, j + 2, k) * met(1, i, j + 2, k) * +/// (c2 * (u(2, i + 2, j + 2, k) - u(2, i - 2, j + 2, k)) + +/// c1 * (u(2, i + 1, j + 2, k) - u(2, i - 1, j + 2, k))) - +/// mu(i, j - 2, k) * met(1, i, j - 2, k) * met(1, i, j - 2, k) * +/// (c2 * (u(2, i + 2, j - 2, k) - u(2, i - 2, j - 2, k)) + +/// c1 * (u(2, i + 1, j - 2, k) - u(2, i - 1, j - 2, k)))) + +/// c1 * (mu(i, j + 1, k) * met(1, i, j + 1, k) * met(1, i, j + 1, k) * +/// (c2 * (u(2, i + 2, j + 1, k) - u(2, i - 2, j + 1, k)) + +/// c1 * (u(2, i + 1, j + 1, k) - u(2, i - 1, j + 1, k))) - +/// mu(i, j - 1, k) * met(1, i, j - 1, k) * met(1, i, j - 1, k) * +/// (c2 * (u(2, i + 2, j - 1, k) - u(2, i - 2, j - 1, k)) + +/// c1 * (u(2, i + 1, j - 1, k) - u(2, i - 1, j - 1, k)))); +/// +/// // qp-derivatives +/// // 38 ops, tot=345 +/// r1 += +/// c2 * (la(i + 2, j, k) * met(1, i + 2, j, k) * met(1, i + 2, j, k) * +/// (c2 * (u(2, i + 2, j + 2, k) - u(2, i + 2, j - 2, k)) + +/// c1 * (u(2, i + 2, j + 1, k) - u(2, i + 2, j - 1, k))) - +/// la(i - 2, j, k) * met(1, i - 2, j, k) * met(1, i - 2, j, k) * +/// (c2 * (u(2, i - 2, j + 2, k) - u(2, i - 2, j - 2, k)) + +/// c1 * (u(2, i - 2, j + 1, k) - u(2, i - 2, j - 
1, k)))) + +/// c1 * (la(i + 1, j, k) * met(1, i + 1, j, k) * met(1, i + 1, j, k) * +/// (c2 * (u(2, i + 1, j + 2, k) - u(2, i + 1, j - 2, k)) + +/// c1 * (u(2, i + 1, j + 1, k) - u(2, i + 1, j - 1, k))) - +/// la(i - 1, j, k) * met(1, i - 1, j, k) * met(1, i - 1, j, k) * +/// (c2 * (u(2, i - 1, j + 2, k) - u(2, i - 1, j - 2, k)) + +/// c1 * (u(2, i - 1, j + 1, k) - u(2, i - 1, j - 1, k)))); +/// +/// // pr-derivatives +/// // 130 ops., tot=475 +/// r1 += +/// c2 * ((2 * mu(i, j, k + 2) + la(i, j, k + 2)) * +/// met(2, i, j, k + 2) * met(1, i, j, k + 2) * +/// (c2 * (u(1, i + 2, j, k + 2) - u(1, i - 2, j, k + 2)) + +/// c1 * (u(1, i + 1, j, k + 2) - u(1, i - 1, j, k + 2))) * +/// strx(i) * istry + +/// mu(i, j, k + 2) * met(3, i, j, k + 2) * met(1, i, j, k + 2) * +/// (c2 * (u(2, i + 2, j, k + 2) - u(2, i - 2, j, k + 2)) + +/// c1 * (u(2, i + 1, j, k + 2) - u(2, i - 1, j, k + 2))) + +/// mu(i, j, k + 2) * met(4, i, j, k + 2) * met(1, i, j, k + 2) * +/// (c2 * (u(3, i + 2, j, k + 2) - u(3, i - 2, j, k + 2)) + +/// c1 * (u(3, i + 1, j, k + 2) - u(3, i - 1, j, k + 2))) * +/// istry - +/// ((2 * mu(i, j, k - 2) + la(i, j, k - 2)) * +/// met(2, i, j, k - 2) * met(1, i, j, k - 2) * +/// (c2 * (u(1, i + 2, j, k - 2) - u(1, i - 2, j, k - 2)) + +/// c1 * (u(1, i + 1, j, k - 2) - u(1, i - 1, j, k - 2))) * +/// strx(i) * istry + +/// mu(i, j, k - 2) * met(3, i, j, k - 2) * met(1, i, j, k - 2) * +/// (c2 * (u(2, i + 2, j, k - 2) - u(2, i - 2, j, k - 2)) + +/// c1 * (u(2, i + 1, j, k - 2) - u(2, i - 1, j, k - 2))) + +/// mu(i, j, k - 2) * met(4, i, j, k - 2) * met(1, i, j, k - 2) * +/// (c2 * (u(3, i + 2, j, k - 2) - u(3, i - 2, j, k - 2)) + +/// c1 * (u(3, i + 1, j, k - 2) - u(3, i - 1, j, k - 2))) * +/// istry)) + +/// c1 * ((2 * mu(i, j, k + 1) + la(i, j, k + 1)) * +/// met(2, i, j, k + 1) * met(1, i, j, k + 1) * +/// (c2 * (u(1, i + 2, j, k + 1) - u(1, i - 2, j, k + 1)) + +/// c1 * (u(1, i + 1, j, k + 1) - u(1, i - 1, j, k + 1))) * +/// strx(i) * istry + +/// mu(i, j, k + 
1) * met(3, i, j, k + 1) * met(1, i, j, k + 1) * +/// (c2 * (u(2, i + 2, j, k + 1) - u(2, i - 2, j, k + 1)) + +/// c1 * (u(2, i + 1, j, k + 1) - u(2, i - 1, j, k + 1))) + +/// mu(i, j, k + 1) * met(4, i, j, k + 1) * met(1, i, j, k + 1) * +/// (c2 * (u(3, i + 2, j, k + 1) - u(3, i - 2, j, k + 1)) + +/// c1 * (u(3, i + 1, j, k + 1) - u(3, i - 1, j, k + 1))) * +/// istry - +/// ((2 * mu(i, j, k - 1) + la(i, j, k - 1)) * +/// met(2, i, j, k - 1) * met(1, i, j, k - 1) * +/// (c2 * (u(1, i + 2, j, k - 1) - u(1, i - 2, j, k - 1)) + +/// c1 * (u(1, i + 1, j, k - 1) - u(1, i - 1, j, k - 1))) * +/// strx(i) * istry + +/// mu(i, j, k - 1) * met(3, i, j, k - 1) * met(1, i, j, k - 1) * +/// (c2 * (u(2, i + 2, j, k - 1) - u(2, i - 2, j, k - 1)) + +/// c1 * (u(2, i + 1, j, k - 1) - u(2, i - 1, j, k - 1))) + +/// mu(i, j, k - 1) * met(4, i, j, k - 1) * met(1, i, j, k - 1) * +/// (c2 * (u(3, i + 2, j, k - 1) - u(3, i - 2, j, k - 1)) + +/// c1 * (u(3, i + 1, j, k - 1) - u(3, i - 1, j, k - 1))) * +/// istry)); +/// +/// // rp derivatives +/// // 130 ops, tot=605 +/// r1 += +/// (c2 * +/// ((2 * mu(i + 2, j, k) + la(i + 2, j, k)) * +/// met(2, i + 2, j, k) * met(1, i + 2, j, k) * +/// (c2 * (u(1, i + 2, j, k + 2) - u(1, i + 2, j, k - 2)) + +/// c1 * (u(1, i + 2, j, k + 1) - u(1, i + 2, j, k - 1))) * +/// strx(i + 2) + +/// la(i + 2, j, k) * met(3, i + 2, j, k) * met(1, i + 2, j, k) * +/// (c2 * (u(2, i + 2, j, k + 2) - u(2, i + 2, j, k - 2)) + +/// c1 * (u(2, i + 2, j, k + 1) - u(2, i + 2, j, k - 1))) * +/// stry(j) + +/// la(i + 2, j, k) * met(4, i + 2, j, k) * met(1, i + 2, j, k) * +/// (c2 * (u(3, i + 2, j, k + 2) - u(3, i + 2, j, k - 2)) + +/// c1 * (u(3, i + 2, j, k + 1) - u(3, i + 2, j, k - 1))) - +/// ((2 * mu(i - 2, j, k) + la(i - 2, j, k)) * +/// met(2, i - 2, j, k) * met(1, i - 2, j, k) * +/// (c2 * (u(1, i - 2, j, k + 2) - u(1, i - 2, j, k - 2)) + +/// c1 * (u(1, i - 2, j, k + 1) - u(1, i - 2, j, k - 1))) * +/// strx(i - 2) + +/// la(i - 2, j, k) * met(3, i - 2, j, k) * 
met(1, i - 2, j, k) * +/// (c2 * (u(2, i - 2, j, k + 2) - u(2, i - 2, j, k - 2)) + +/// c1 * (u(2, i - 2, j, k + 1) - u(2, i - 2, j, k - 1))) * +/// stry(j) + +/// la(i - 2, j, k) * met(4, i - 2, j, k) * met(1, i - 2, j, k) * +/// (c2 * (u(3, i - 2, j, k + 2) - u(3, i - 2, j, k - 2)) + +/// c1 * +/// (u(3, i - 2, j, k + 1) - u(3, i - 2, j, k - 1))))) + +/// c1 * +/// ((2 * mu(i + 1, j, k) + la(i + 1, j, k)) * +/// met(2, i + 1, j, k) * met(1, i + 1, j, k) * +/// (c2 * (u(1, i + 1, j, k + 2) - u(1, i + 1, j, k - 2)) + +/// c1 * (u(1, i + 1, j, k + 1) - u(1, i + 1, j, k - 1))) * +/// strx(i + 1) + +/// la(i + 1, j, k) * met(3, i + 1, j, k) * met(1, i + 1, j, k) * +/// (c2 * (u(2, i + 1, j, k + 2) - u(2, i + 1, j, k - 2)) + +/// c1 * (u(2, i + 1, j, k + 1) - u(2, i + 1, j, k - 1))) * +/// stry(j) + +/// la(i + 1, j, k) * met(4, i + 1, j, k) * met(1, i + 1, j, k) * +/// (c2 * (u(3, i + 1, j, k + 2) - u(3, i + 1, j, k - 2)) + +/// c1 * (u(3, i + 1, j, k + 1) - u(3, i + 1, j, k - 1))) - +/// ((2 * mu(i - 1, j, k) + la(i - 1, j, k)) * +/// met(2, i - 1, j, k) * met(1, i - 1, j, k) * +/// (c2 * (u(1, i - 1, j, k + 2) - u(1, i - 1, j, k - 2)) + +/// c1 * (u(1, i - 1, j, k + 1) - u(1, i - 1, j, k - 1))) * +/// strx(i - 1) + +/// la(i - 1, j, k) * met(3, i - 1, j, k) * met(1, i - 1, j, k) * +/// (c2 * (u(2, i - 1, j, k + 2) - u(2, i - 1, j, k - 2)) + +/// c1 * (u(2, i - 1, j, k + 1) - u(2, i - 1, j, k - 1))) * +/// stry(j) + +/// la(i - 1, j, k) * met(4, i - 1, j, k) * met(1, i - 1, j, k) * +/// (c2 * (u(3, i - 1, j, k + 2) - u(3, i - 1, j, k - 2)) + +/// c1 * (u(3, i - 1, j, k + 1) - +/// u(3, i - 1, j, k - 1)))))) * +/// istry; +/// +/// // qr derivatives +/// // 82 ops, tot=687 +/// r1 += +/// c2 * +/// (mu(i, j, k + 2) * met(3, i, j, k + 2) * met(1, i, j, k + 2) * +/// (c2 * (u(1, i, j + 2, k + 2) - u(1, i, j - 2, k + 2)) + +/// c1 * (u(1, i, j + 1, k + 2) - u(1, i, j - 1, k + 2))) * +/// stry(j) * istrx + +/// la(i, j, k + 2) * met(2, i, j, k + 2) * met(1, i, j, k + 2) * 
+/// (c2 * (u(2, i, j + 2, k + 2) - u(2, i, j - 2, k + 2)) + +/// c1 * (u(2, i, j + 1, k + 2) - u(2, i, j - 1, k + 2))) - +/// (mu(i, j, k - 2) * met(3, i, j, k - 2) * met(1, i, j, k - 2) * +/// (c2 * (u(1, i, j + 2, k - 2) - u(1, i, j - 2, k - 2)) + +/// c1 * (u(1, i, j + 1, k - 2) - u(1, i, j - 1, k - 2))) * +/// stry(j) * istrx + +/// la(i, j, k - 2) * met(2, i, j, k - 2) * met(1, i, j, k - 2) * +/// (c2 * (u(2, i, j + 2, k - 2) - u(2, i, j - 2, k - 2)) + +/// c1 * (u(2, i, j + 1, k - 2) - u(2, i, j - 1, k - 2))))) + +/// c1 * (mu(i, j, k + 1) * met(3, i, j, k + 1) * met(1, i, j, k + 1) * +/// (c2 * (u(1, i, j + 2, k + 1) - u(1, i, j - 2, k + 1)) + +/// c1 * (u(1, i, j + 1, k + 1) - u(1, i, j - 1, k + 1))) * +/// stry(j) * istrx + +/// la(i, j, k + 1) * met(2, i, j, k + 1) * met(1, i, j, k + 1) * +/// (c2 * (u(2, i, j + 2, k + 1) - u(2, i, j - 2, k + 1)) + +/// c1 * (u(2, i, j + 1, k + 1) - u(2, i, j - 1, k + 1))) - +/// (mu(i, j, k - 1) * met(3, i, j, k - 1) * met(1, i, j, k - 1) * +/// (c2 * (u(1, i, j + 2, k - 1) - u(1, i, j - 2, k - 1)) + +/// c1 * (u(1, i, j + 1, k - 1) - u(1, i, j - 1, k - 1))) * +/// stry(j) * istrx + +/// la(i, j, k - 1) * met(2, i, j, k - 1) * met(1, i, j, k - 1) * +/// (c2 * (u(2, i, j + 2, k - 1) - u(2, i, j - 2, k - 1)) + +/// c1 * (u(2, i, j + 1, k - 1) - u(2, i, j - 1, k - 1))))); +/// +/// // rq derivatives +/// // 82 ops, tot=769 +/// r1 += +/// c2 * +/// (mu(i, j + 2, k) * met(3, i, j + 2, k) * met(1, i, j + 2, k) * +/// (c2 * (u(1, i, j + 2, k + 2) - u(1, i, j + 2, k - 2)) + +/// c1 * (u(1, i, j + 2, k + 1) - u(1, i, j + 2, k - 1))) * +/// stry(j + 2) * istrx + +/// mu(i, j + 2, k) * met(2, i, j + 2, k) * met(1, i, j + 2, k) * +/// (c2 * (u(2, i, j + 2, k + 2) - u(2, i, j + 2, k - 2)) + +/// c1 * (u(2, i, j + 2, k + 1) - u(2, i, j + 2, k - 1))) - +/// (mu(i, j - 2, k) * met(3, i, j - 2, k) * met(1, i, j - 2, k) * +/// (c2 * (u(1, i, j - 2, k + 2) - u(1, i, j - 2, k - 2)) + +/// c1 * (u(1, i, j - 2, k + 1) - u(1, i, j - 2, k - 
1))) * +/// stry(j - 2) * istrx + +/// mu(i, j - 2, k) * met(2, i, j - 2, k) * met(1, i, j - 2, k) * +/// (c2 * (u(2, i, j - 2, k + 2) - u(2, i, j - 2, k - 2)) + +/// c1 * (u(2, i, j - 2, k + 1) - u(2, i, j - 2, k - 1))))) + +/// c1 * (mu(i, j + 1, k) * met(3, i, j + 1, k) * met(1, i, j + 1, k) * +/// (c2 * (u(1, i, j + 1, k + 2) - u(1, i, j + 1, k - 2)) + +/// c1 * (u(1, i, j + 1, k + 1) - u(1, i, j + 1, k - 1))) * +/// stry(j + 1) * istrx + +/// mu(i, j + 1, k) * met(2, i, j + 1, k) * met(1, i, j + 1, k) * +/// (c2 * (u(2, i, j + 1, k + 2) - u(2, i, j + 1, k - 2)) + +/// c1 * (u(2, i, j + 1, k + 1) - u(2, i, j + 1, k - 1))) - +/// (mu(i, j - 1, k) * met(3, i, j - 1, k) * met(1, i, j - 1, k) * +/// (c2 * (u(1, i, j - 1, k + 2) - u(1, i, j - 1, k - 2)) + +/// c1 * (u(1, i, j - 1, k + 1) - u(1, i, j - 1, k - 1))) * +/// stry(j - 1) * istrx + +/// mu(i, j - 1, k) * met(2, i, j - 1, k) * met(1, i, j - 1, k) * +/// (c2 * (u(2, i, j - 1, k + 2) - u(2, i, j - 1, k - 2)) + +/// c1 * (u(2, i, j - 1, k + 1) - u(2, i, j - 1, k - 1))))); +/// +/// // 4 ops, tot=773 +/// lu(1, i, j, k) = a1 * lu(1, i, j, k) + sgn * r1 * ijac; +/// } #ifndef RAJAPerf_Apps_SW4CK_KERNEL_2_HPP #define RAJAPerf_Apps_SW4CK_KERNEL_2_HPP -#define SW4CK_KERNEL_2_DATA_SETUP \ - -#define SW4CK_KERNEL_2_BODY \ - +#define SW4CK_KERNEL_2_DATA_SETUP +#define SW4CK_KERNEL_2_BODY #include "common/KernelBase.hpp" -namespace rajaperf -{ -class RunParams; - -namespace apps -{ -class ADomain; - -class SW4CK_KERNEL_2 : public KernelBase -{ -public: - - SW4CK_KERNEL_2(const RunParams& params); - - ~SW4CK_KERNEL_2(); - - void setUp(VariantID vid, size_t tune_idx); - void updateChecksum(VariantID vid, size_t tune_idx); - void tearDown(VariantID vid, size_t tune_idx); - - void runSeqVariant(VariantID vid, size_t tune_idx); - void runOpenMPVariant(VariantID vid, size_t tune_idx); - void runCudaVariant(VariantID vid, size_t tune_idx); - void runHipVariant(VariantID vid, size_t tune_idx); - void 
runOpenMPTargetVariant(VariantID vid, size_t tune_idx); - - void setCudaTuningDefinitions(VariantID vid); - void setHipTuningDefinitions(VariantID vid); - template < size_t block_size > - void runCudaVariantImpl(VariantID vid); - template < size_t block_size > - void runHipVariantImpl(VariantID vid); - -private: - static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = gpu_block_size::make_list_type<default_gpu_block_size>; - - Real_ptr m_x; - Real_ptr m_y; - Real_ptr m_z; - Real_ptr m_vol; - - Real_type m_vnormq; - - ADomain* m_domain; - Index_type m_array_length; -}; - -} // end namespace apps -} // end namespace rajaperf + namespace rajaperf { + class RunParams; + + namespace apps { + class ADomain; + + class SW4CK_KERNEL_2 : public KernelBase { + public: + SW4CK_KERNEL_2(const RunParams &params); + + ~SW4CK_KERNEL_2(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template <size_t block_size> void runCudaVariantImpl(VariantID vid); + template <size_t block_size> void runHipVariantImpl(VariantID vid); + + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = + gpu_block_size::make_list_type<default_gpu_block_size>; + + Real_ptr m_a_mu; + Real_ptr m_a_lambda; + Real_ptr m_a_jac; + Real_ptr m_a_u; + Real_ptr a_lu; + Real_ptr a_met; + Real_ptr a_strx; + Real_ptr a_stry; + Real_ptr a_acof; + Real_ptr a_bope; + Real_ptr a_ghcof; + Real_ptr a_acof_no_gp; + Real_ptr a_ghcof_no_gp; + + }; + + } // end namespace apps + } // end namespace rajaperf #endif // closing endif for header file include guard diff --git 
a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index e73bd9888..15acdf26a 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -144,6 +144,7 @@ enum KernelID { Apps_MASS3DPA, Apps_NODAL_ACCUMULATION_3D, Apps_PRESSURE, + Apps_SW4CK_KERNEL_2, Apps_VOL3D, // From e8273871e91458483c797bc2902d746b7afe29c3 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 29 Mar 2023 13:19:56 -0700 Subject: [PATCH 3/7] add in kernels -- does not compile yet --- src/apps/SW4CK_KERNEL_2-Seq.cpp | 72 +++++- src/apps/SW4CK_KERNEL_2.hpp | 419 +++++++++++++++++++++++++++++++- 2 files changed, 486 insertions(+), 5 deletions(-) diff --git a/src/apps/SW4CK_KERNEL_2-Seq.cpp b/src/apps/SW4CK_KERNEL_2-Seq.cpp index ca6e21a7c..f36011d50 100644 --- a/src/apps/SW4CK_KERNEL_2-Seq.cpp +++ b/src/apps/SW4CK_KERNEL_2-Seq.cpp @@ -26,6 +26,15 @@ void SW4CK_KERNEL_2::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun SW4CK_KERNEL_2_DATA_SETUP; + //To be populated later with + const int ifirst = 0; + const int ilast = 1; + const int jfirst = 0; + const int jlast = 1; + const int kstart = 0; + const int klast = 1; + const int kend = 1; + switch ( vid ) { case Base_Seq : { @@ -34,6 +43,65 @@ void SW4CK_KERNEL_2::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun for (RepIndex_type irep = 0; irep < run_reps; ++irep) { //Reference impl + for(int k=kstart; k Date: Wed, 29 Mar 2023 14:10:13 -0700 Subject: [PATCH 4/7] code builds, need to populate with realistic parameters --- src/apps/SW4CK_KERNEL_2-Seq.cpp | 11 ++++- src/apps/SW4CK_KERNEL_2.hpp | 80 ++++++++++++++++++++++++++++----- 2 files changed, 77 insertions(+), 14 deletions(-) diff --git a/src/apps/SW4CK_KERNEL_2-Seq.cpp b/src/apps/SW4CK_KERNEL_2-Seq.cpp index f36011d50..5aff9be29 100644 --- a/src/apps/SW4CK_KERNEL_2-Seq.cpp +++ b/src/apps/SW4CK_KERNEL_2-Seq.cpp @@ -24,17 +24,24 @@ void SW4CK_KERNEL_2::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun { const Index_type run_reps = 
getRunReps(); - SW4CK_KERNEL_2_DATA_SETUP; - //To be populated later with + const int istart = 0; const int ifirst = 0; const int ilast = 1; + const int jstart = 0; const int jfirst = 0; + const int jend = 0; const int jlast = 1; + const int kfirst = 0; const int kstart = 0; const int klast = 1; const int kend = 1; + char op = '='; + + SW4CK_KERNEL_2_DATA_SETUP; + + switch ( vid ) { case Base_Seq : { diff --git a/src/apps/SW4CK_KERNEL_2.hpp b/src/apps/SW4CK_KERNEL_2.hpp index 04100d0dc..5ec9a9668 100644 --- a/src/apps/SW4CK_KERNEL_2.hpp +++ b/src/apps/SW4CK_KERNEL_2.hpp @@ -381,10 +381,67 @@ #ifndef RAJAPerf_Apps_SW4CK_KERNEL_2_HPP #define RAJAPerf_Apps_SW4CK_KERNEL_2_HPP -#define SW4CK_KERNEL_2_DATA_SETUP - using float_sw4 = double; +#define SW4CK_KERNEL_2_DATA_SETUP \ + float_sw4 a1 = 0; \ + float_sw4 sgn = 1; \ + if (op == '=') { \ + a1 = 0; \ + sgn = 1; \ + } else if (op == '+') { \ + a1 = 1; \ + sgn = 1; \ + } else if (op == '-') { \ + a1 = 1; \ + sgn = -1; \ + } \ + \ + const float_sw4 i6 = 1.0 / 6; \ + const float_sw4 tf = 0.75; \ + const float_sw4 c1 = 2.0 / 3; \ + const float_sw4 c2 = -1.0 / 12; \ + \ + const int ni = ilast - ifirst + 1; \ + const int nij = ni * (jlast - jfirst + 1); \ + const int nijk = nij * (klast - kfirst + 1); \ + const int base = -(ifirst + ni * jfirst + nij * kfirst); \ + const int base3 = base - nijk; \ + const int base4 = base - nijk; \ + const int ifirst0 = ifirst; \ + const int jfirst0 = jfirst; \ + \ + Real_ptr a_mu = m_a_mu; \ + Real_ptr a_lambda = m_a_lambda; \ + Real_ptr a_jac = m_a_jac; \ + Real_ptr a_u = m_a_u; \ + Real_ptr a_lu = m_a_lu; \ + Real_ptr a_met = m_a_met; \ + Real_ptr a_strx = m_a_strx; \ + Real_ptr a_stry = m_a_stry; +/* + Real_ptr a_acof = m_a_acof; \ + Real_ptr a_bope = m_a_bope; \ + Real_ptr a_ghcof = m_a_ghcof; \ + Real_ptr a_acof_no_gp = m_a_acof_no_gp; \ + Real_ptr a_ghcof_no_gp = m_a_ghcof_no_gp; +*/ + +// Direct reuse of fortran code by these macro definitions: +#define mu(i, j, k) a_mu[base + (i) + 
ni * (j) + nij * (k)] +#define la(i, j, k) a_lambda[base + (i) + ni * (j) + nij * (k)] +#define jac(i, j, k) a_jac[base + (i) + ni * (j) + nij * (k)] +#define u(c, i, j, k) a_u[base3 + (i) + ni * (j) + nij * (k) + nijk * (c)] +#define lu(c, i, j, k) a_lu[base3 + (i) + ni * (j) + nij * (k) + nijk * (c)] +#define met(c, i, j, k) a_met[base4 + (i) + ni * (j) + nij * (k) + nijk * (c)] +#define strx(i) a_strx[i - ifirst0] +#define stry(j) a_stry[j - jfirst0] +#define acof(i, j, k) a_acof[(i - 1) + 6 * (j - 1) + 48 * (k - 1)] +#define bope(i, j) a_bope[i - 1 + 6 * (j - 1)] +#define ghcof(i) a_ghcof[i - 1] +#define acof_no_gp(i, j, k) a_acof_no_gp[(i - 1) + 6 * (j - 1) + 48 * (k - 1)] +#define ghcof_no_gp(i) a_ghcof_no_gp[i - 1] + // 5 ops #define SW4CK_KERNEL_2_BODY_1 \ float_sw4 ijac = strx(i) * stry(j) / jac(i, j, k); \ @@ -798,7 +855,6 @@ using float_sw4 = double; lu(1, i, j, k) = a1 * lu(1, i, j, k) + sgn * r1 * ijac; - #include "common/KernelBase.hpp" namespace rajaperf { @@ -836,15 +892,15 @@ using float_sw4 = double; Real_ptr m_a_lambda; Real_ptr m_a_jac; Real_ptr m_a_u; - Real_ptr a_lu; - Real_ptr a_met; - Real_ptr a_strx; - Real_ptr a_stry; - Real_ptr a_acof; - Real_ptr a_bope; - Real_ptr a_ghcof; - Real_ptr a_acof_no_gp; - Real_ptr a_ghcof_no_gp; + Real_ptr m_a_lu; + Real_ptr m_a_met; + Real_ptr m_a_strx; + Real_ptr m_a_stry; + Real_ptr m_a_acof; + Real_ptr m_a_bope; + Real_ptr m_a_ghcof; + Real_ptr m_a_acof_no_gp; + Real_ptr m_a_ghcof_no_gp; }; From 3046342334fa42607d49351de05007ccbd345c9b Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 30 Mar 2023 11:43:13 -0700 Subject: [PATCH 5/7] made macros for SW4CK kernel 5 --- src/apps/CMakeLists.txt | 40 +- src/apps/SW4CK_KERNEL_5-Seq.cpp | 107 +++ src/apps/SW4CK_KERNEL_5.cpp | 95 +++ src/apps/SW4CK_KERNEL_5.hpp | 1368 +++++++++++++++++++++++++++++++ src/common/RAJAPerfSuite.cpp | 12 +- src/common/RAJAPerfSuite.hpp | 1 + 6 files changed, 1601 insertions(+), 22 deletions(-) create mode 100644 
src/apps/SW4CK_KERNEL_5-Seq.cpp create mode 100644 src/apps/SW4CK_KERNEL_5.cpp create mode 100644 src/apps/SW4CK_KERNEL_5.hpp diff --git a/src/apps/CMakeLists.txt b/src/apps/CMakeLists.txt index 96cb3ed5c..5cb71a72e 100644 --- a/src/apps/CMakeLists.txt +++ b/src/apps/CMakeLists.txt @@ -15,12 +15,12 @@ blt_add_library( CONVECTION3DPA-Seq.cpp CONVECTION3DPA-OMP.cpp CONVECTION3DPA-OMPTarget.cpp - DEL_DOT_VEC_2D.cpp - DEL_DOT_VEC_2D-Seq.cpp - DEL_DOT_VEC_2D-Hip.cpp - DEL_DOT_VEC_2D-Cuda.cpp - DEL_DOT_VEC_2D-OMP.cpp - DEL_DOT_VEC_2D-OMPTarget.cpp + DEL_DOT_VEC_2D.cpp + DEL_DOT_VEC_2D-Seq.cpp + DEL_DOT_VEC_2D-Hip.cpp + DEL_DOT_VEC_2D-Cuda.cpp + DEL_DOT_VEC_2D-OMP.cpp + DEL_DOT_VEC_2D-OMPTarget.cpp DIFFUSION3DPA.cpp DIFFUSION3DPA-Cuda.cpp DIFFUSION3DPA-Hip.cpp @@ -29,10 +29,10 @@ blt_add_library( DIFFUSION3DPA-OMPTarget.cpp ENERGY.cpp ENERGY-Seq.cpp - ENERGY-Hip.cpp - ENERGY-Cuda.cpp - ENERGY-OMP.cpp - ENERGY-OMPTarget.cpp + ENERGY-Hip.cpp + ENERGY-Cuda.cpp + ENERGY-OMP.cpp + ENERGY-OMPTarget.cpp FIR.cpp FIR-Seq.cpp FIR-Hip.cpp @@ -75,20 +75,22 @@ blt_add_library( NODAL_ACCUMULATION_3D-Cuda.cpp NODAL_ACCUMULATION_3D-OMP.cpp NODAL_ACCUMULATION_3D-OMPTarget.cpp - PRESSURE.cpp - PRESSURE-Seq.cpp - PRESSURE-Hip.cpp - PRESSURE-Cuda.cpp - PRESSURE-OMP.cpp + PRESSURE.cpp + PRESSURE-Seq.cpp + PRESSURE-Hip.cpp + PRESSURE-Cuda.cpp + PRESSURE-OMP.cpp PRESSURE-OMPTarget.cpp SW4CK_KERNEL_2.cpp SW4CK_KERNEL_2-Seq.cpp + SW4CK_KERNEL_5.cpp + SW4CK_KERNEL_5-Seq.cpp VOL3D.cpp VOL3D-Seq.cpp - VOL3D-Hip.cpp - VOL3D-Cuda.cpp - VOL3D-OMP.cpp - VOL3D-OMPTarget.cpp + VOL3D-Hip.cpp + VOL3D-Cuda.cpp + VOL3D-OMP.cpp + VOL3D-OMPTarget.cpp WIP-COUPLE.cpp DEPENDS_ON common ${RAJA_PERFSUITE_DEPENDS} ) diff --git a/src/apps/SW4CK_KERNEL_5-Seq.cpp b/src/apps/SW4CK_KERNEL_5-Seq.cpp new file mode 100644 index 000000000..0893be43e --- /dev/null +++ b/src/apps/SW4CK_KERNEL_5-Seq.cpp @@ -0,0 +1,107 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, 
Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include "SW4CK_KERNEL_5.hpp" + +#include "RAJA/RAJA.hpp" + +#include "AppsData.hpp" + +#include + +namespace rajaperf +{ +namespace apps +{ + + +void SW4CK_KERNEL_5::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + const Index_type run_reps = getRunReps(); + + //To be populated later with + const int istart = 0; + const int ifirst = 0; + const int ilast = 1; + const int jstart = 0; + const int jfirst = 0; + const int jend = 0; + const int jlast = 1; + const int kfirst = 0; + const int kstart = 0; + const int klast = 1; + const int kend = 1; + + char op = '='; + + SW4CK_KERNEL_5_DATA_SETUP; + + + switch ( vid ) { + + case Base_Seq : { + + startTimer(); + for (RepIndex_type irep = 0; irep < run_reps; ++irep) { + + //Reference impl + for(int k=kstart; k + + +namespace rajaperf +{ +namespace apps +{ + + +SW4CK_KERNEL_5::SW4CK_KERNEL_5(const RunParams& params) + : KernelBase(rajaperf::Apps_SW4CK_KERNEL_5, params) +{ + setDefaultProblemSize(100*100*100); // See rzmax in ADomain struct + setDefaultReps(100); + + Index_type rzmax = std::cbrt(getTargetProblemSize())+1; + //m_domain = new ADomain(rzmax, /* ndims = */ 3); + + //m_array_length = m_domain->nnalls; + + //setActualProblemSize( m_domain->lpz+1 - m_domain->fpz ); + + //setItsPerRep( m_domain->lpz+1 - m_domain->fpz ); + setKernelsPerRep(1); + // touched data size, not actual number of stores and loads + // setBytesPerRep( (1*sizeof(Real_type) + 0*sizeof(Real_type)) * getItsPerRep() + + //(0*sizeof(Real_type) + 3*sizeof(Real_type)) * (getItsPerRep() + 1+m_domain->jp+m_domain->kp) ); + + //setFLOPsPerRep(72 * (m_domain->lpz+1 - m_domain->fpz)); + + checksum_scale_factor = 0.001 * + ( static_cast(getDefaultProblemSize()) / + 
getActualProblemSize() ); + + setUsesFeature(Teams); + + //Goal is to get the following three variants right first + setVariantDefined( Base_Seq ); + setVariantDefined( Lambda_Seq ); + setVariantDefined( RAJA_Seq ); + + /* + setVariantDefined( Base_OpenMP ); + setVariantDefined( Lambda_OpenMP ); + setVariantDefined( RAJA_OpenMP ); + + setVariantDefined( Base_OpenMPTarget ); + setVariantDefined( RAJA_OpenMPTarget ); + + setVariantDefined( Base_CUDA ); + setVariantDefined( RAJA_CUDA ); + + setVariantDefined( Base_HIP ); + setVariantDefined( RAJA_HIP ); + */ +} + +SW4CK_KERNEL_5::~SW4CK_KERNEL_5() +{ + // delete m_domain; +} + +void SW4CK_KERNEL_5::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + +} + +void SW4CK_KERNEL_5::updateChecksum(VariantID vid, size_t tune_idx) +{ + //checksum[vid][tune_idx] += calcChecksum(m_vol, m_array_length, checksum_scale_factor ); +} + +void SW4CK_KERNEL_5::tearDown(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx)) +{ + (void) vid; + +} + +} // end namespace apps +} // end namespace rajaperf diff --git a/src/apps/SW4CK_KERNEL_5.hpp b/src/apps/SW4CK_KERNEL_5.hpp new file mode 100644 index 000000000..72ce22443 --- /dev/null +++ b/src/apps/SW4CK_KERNEL_5.hpp @@ -0,0 +1,1368 @@ + +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2017-23, Lawrence Livermore National Security, LLC +// and RAJA Performance Suite project contributors. +// See the RAJAPerf/LICENSE file for details. 
+// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// SW4CK_KERNEL_5 kernel reference implementation: +/// https://github.com/LLNL/SW4CK +/// +/// for (int k = kstart; k <= klast - 2; k++) +/// for (int j = jfirst + 2; j <= jlast - 2; j++) +/// for (int i = ifirst + 2; i <= ilast - 2; i++) { +/// +/// // 5 ops +/// float_sw4 ijac = strx(i) * stry(j) / jac(i, j, k); +/// float_sw4 istry = 1 / (stry(j)); +/// float_sw4 istrx = 1 / (strx(i)); +/// float_sw4 istrxy = istry * istrx; +/// +/// float_sw4 r1 = 0, r2 = 0, r3 = 0; +/// +/// // pp derivative (u) (u-eq) +/// // 53 ops, tot=58 +/// float_sw4 cof1 = (2 * mu(i - 2, j, k) + la(i - 2, j, k)) * +/// met(1, i - 2, j, k) * met(1, i - 2, j, k) * +/// strx(i - 2); +/// float_sw4 cof2 = (2 * mu(i - 1, j, k) + la(i - 1, j, k)) * +/// met(1, i - 1, j, k) * met(1, i - 1, j, k) * +/// strx(i - 1); +/// float_sw4 cof3 = (2 * mu(i, j, k) + la(i, j, k)) * met(1, i, j, k) * +/// met(1, i, j, k) * strx(i); +/// float_sw4 cof4 = (2 * mu(i + 1, j, k) + la(i + 1, j, k)) * +/// met(1, i + 1, j, k) * met(1, i + 1, j, k) * +/// strx(i + 1); +/// float_sw4 cof5 = (2 * mu(i + 2, j, k) + la(i + 2, j, k)) * +/// met(1, i + 2, j, k) * met(1, i + 2, j, k) * +/// strx(i + 2); +/// +/// float_sw4 mux1 = cof2 - tf * (cof3 + cof1); +/// float_sw4 mux2 = cof1 + cof4 + 3 * (cof3 + cof2); +/// float_sw4 mux3 = cof2 + cof5 + 3 * (cof4 + cof3); +/// float_sw4 mux4 = cof4 - tf * (cof3 + cof5); +/// +/// r1 = r1 + i6 * +/// (mux1 * (u(1, i - 2, j, k) - u(1, i, j, k)) + +/// mux2 * (u(1, i - 1, j, k) - u(1, i, j, k)) + +/// mux3 * (u(1, i + 1, j, k) - u(1, i, j, k)) + +/// mux4 * (u(1, i + 2, j, k) - u(1, i, j, k))) * +/// istry; +/// +/// // qq derivative (u) (u-eq) +/// // 43 ops, tot=101 +/// cof1 = (mu(i, j - 2, k)) * met(1, i, j - 2, k) * met(1, i, j - 2, k) * +/// stry(j - 2); +/// cof2 = (mu(i, j - 1, k)) * met(1, i, j - 1, k) * met(1, i, j - 1, k) * +/// 
stry(j - 1); +/// cof3 = (mu(i, j, k)) * met(1, i, j, k) * met(1, i, j, k) * stry(j); +/// cof4 = (mu(i, j + 1, k)) * met(1, i, j + 1, k) * met(1, i, j + 1, k) * +/// stry(j + 1); +/// cof5 = (mu(i, j + 2, k)) * met(1, i, j + 2, k) * met(1, i, j + 2, k) * +/// stry(j + 2); +/// +/// mux1 = cof2 - tf * (cof3 + cof1); +/// mux2 = cof1 + cof4 + 3 * (cof3 + cof2); +/// mux3 = cof2 + cof5 + 3 * (cof4 + cof3); +/// mux4 = cof4 - tf * (cof3 + cof5); +/// +/// r1 = r1 + i6 * +/// (mux1 * (u(1, i, j - 2, k) - u(1, i, j, k)) + +/// mux2 * (u(1, i, j - 1, k) - u(1, i, j, k)) + +/// mux3 * (u(1, i, j + 1, k) - u(1, i, j, k)) + +/// mux4 * (u(1, i, j + 2, k) - u(1, i, j, k))) * +/// istrx; +/// +/// // pp derivative (v) (v-eq) +/// // 43 ops, tot=144 +/// cof1 = (mu(i - 2, j, k)) * met(1, i - 2, j, k) * met(1, i - 2, j, k) * +/// strx(i - 2); +/// cof2 = (mu(i - 1, j, k)) * met(1, i - 1, j, k) * met(1, i - 1, j, k) * +/// strx(i - 1); +/// cof3 = (mu(i, j, k)) * met(1, i, j, k) * met(1, i, j, k) * strx(i); +/// cof4 = (mu(i + 1, j, k)) * met(1, i + 1, j, k) * met(1, i + 1, j, k) * +/// strx(i + 1); +/// cof5 = (mu(i + 2, j, k)) * met(1, i + 2, j, k) * met(1, i + 2, j, k) * +/// strx(i + 2); +/// +/// mux1 = cof2 - tf * (cof3 + cof1); +/// mux2 = cof1 + cof4 + 3 * (cof3 + cof2); +/// mux3 = cof2 + cof5 + 3 * (cof4 + cof3); +/// mux4 = cof4 - tf * (cof3 + cof5); +/// +/// r2 = r2 + i6 * +/// (mux1 * (u(2, i - 2, j, k) - u(2, i, j, k)) + +/// mux2 * (u(2, i - 1, j, k) - u(2, i, j, k)) + +/// mux3 * (u(2, i + 1, j, k) - u(2, i, j, k)) + +/// mux4 * (u(2, i + 2, j, k) - u(2, i, j, k))) * +/// istry; +/// +/// // qq derivative (v) (v-eq) +/// // 53 ops, tot=197 +/// cof1 = (2 * mu(i, j - 2, k) + la(i, j - 2, k)) * met(1, i, j - 2, k) * +/// met(1, i, j - 2, k) * stry(j - 2); +/// cof2 = (2 * mu(i, j - 1, k) + la(i, j - 1, k)) * met(1, i, j - 1, k) * +/// met(1, i, j - 1, k) * stry(j - 1); +/// cof3 = (2 * mu(i, j, k) + la(i, j, k)) * met(1, i, j, k) * +/// met(1, i, j, k) * stry(j); 
+/// cof4 = (2 * mu(i, j + 1, k) + la(i, j + 1, k)) * met(1, i, j + 1, k) * +/// met(1, i, j + 1, k) * stry(j + 1); +/// cof5 = (2 * mu(i, j + 2, k) + la(i, j + 2, k)) * met(1, i, j + 2, k) * +/// met(1, i, j + 2, k) * stry(j + 2); +/// mux1 = cof2 - tf * (cof3 + cof1); +/// mux2 = cof1 + cof4 + 3 * (cof3 + cof2); +/// mux3 = cof2 + cof5 + 3 * (cof4 + cof3); +/// mux4 = cof4 - tf * (cof3 + cof5); +/// +/// r2 = r2 + i6 * +/// (mux1 * (u(2, i, j - 2, k) - u(2, i, j, k)) + +/// mux2 * (u(2, i, j - 1, k) - u(2, i, j, k)) + +/// mux3 * (u(2, i, j + 1, k) - u(2, i, j, k)) + +/// mux4 * (u(2, i, j + 2, k) - u(2, i, j, k))) * +/// istrx; +/// +/// // pp derivative (w) (w-eq) +/// // 43 ops, tot=240 +/// cof1 = (mu(i - 2, j, k)) * met(1, i - 2, j, k) * met(1, i - 2, j, k) * +/// strx(i - 2); +/// cof2 = (mu(i - 1, j, k)) * met(1, i - 1, j, k) * met(1, i - 1, j, k) * +/// strx(i - 1); +/// cof3 = (mu(i, j, k)) * met(1, i, j, k) * met(1, i, j, k) * strx(i); +/// cof4 = (mu(i + 1, j, k)) * met(1, i + 1, j, k) * met(1, i + 1, j, k) * +/// strx(i + 1); +/// cof5 = (mu(i + 2, j, k)) * met(1, i + 2, j, k) * met(1, i + 2, j, k) * +/// strx(i + 2); +/// +/// mux1 = cof2 - tf * (cof3 + cof1); +/// mux2 = cof1 + cof4 + 3 * (cof3 + cof2); +/// mux3 = cof2 + cof5 + 3 * (cof4 + cof3); +/// mux4 = cof4 - tf * (cof3 + cof5); +/// +/// r3 = r3 + i6 * +/// (mux1 * (u(3, i - 2, j, k) - u(3, i, j, k)) + +/// mux2 * (u(3, i - 1, j, k) - u(3, i, j, k)) + +/// mux3 * (u(3, i + 1, j, k) - u(3, i, j, k)) + +/// mux4 * (u(3, i + 2, j, k) - u(3, i, j, k))) * +/// istry; +/// +/// // qq derivative (w) (w-eq) +/// // 43 ops, tot=283 +/// cof1 = (mu(i, j - 2, k)) * met(1, i, j - 2, k) * met(1, i, j - 2, k) * +/// stry(j - 2); +/// cof2 = (mu(i, j - 1, k)) * met(1, i, j - 1, k) * met(1, i, j - 1, k) * +/// stry(j - 1); +/// cof3 = (mu(i, j, k)) * met(1, i, j, k) * met(1, i, j, k) * stry(j); +/// cof4 = (mu(i, j + 1, k)) * met(1, i, j + 1, k) * met(1, i, j + 1, k) * +/// stry(j + 1); +/// cof5 = (mu(i, j 
+ 2, k)) * met(1, i, j + 2, k) * met(1, i, j + 2, k) * +/// stry(j + 2); +/// mux1 = cof2 - tf * (cof3 + cof1); +/// mux2 = cof1 + cof4 + 3 * (cof3 + cof2); +/// mux3 = cof2 + cof5 + 3 * (cof4 + cof3); +/// mux4 = cof4 - tf * (cof3 + cof5); +/// +/// r3 = r3 + i6 * +/// (mux1 * (u(3, i, j - 2, k) - u(3, i, j, k)) + +/// mux2 * (u(3, i, j - 1, k) - u(3, i, j, k)) + +/// mux3 * (u(3, i, j + 1, k) - u(3, i, j, k)) + +/// mux4 * (u(3, i, j + 2, k) - u(3, i, j, k))) * +/// istrx; +/// +/// // All rr-derivatives at once +/// // averaging the coefficient +/// // 54*8*8+25*8 = 3656 ops, tot=3939 +/// float_sw4 mucofu2, mucofuv, mucofuw, mucofvw, mucofv2, mucofw2; +/// //#pragma unroll 8 +/// #ifdef MAGIC_SYNC +/// __syncthreads(); +/// #endif +/// for (int q = nk - 7; q <= nk; q++) { +/// mucofu2 = 0; +/// mucofuv = 0; +/// mucofuw = 0; +/// mucofvw = 0; +/// mucofv2 = 0; +/// mucofw2 = 0; +/// #ifdef AMD_UNROLL_FIX +/// #pragma unroll 8 +/// #endif +/// for (int m = nk - 7; m <= nk; m++) { +/// mucofu2 += acof_no_gp(nk - k + 1, nk - q + 1, nk - m + 1) * +/// ((2 * mu(i, j, m) + la(i, j, m)) * met(2, i, j, m) * +/// strx(i) * met(2, i, j, m) * strx(i) + +/// mu(i, j, m) * (met(3, i, j, m) * stry(j) * +/// met(3, i, j, m) * stry(j) + +/// met(4, i, j, m) * met(4, i, j, m))); +/// mucofv2 += acof_no_gp(nk - k + 1, nk - q + 1, nk - m + 1) * +/// ((2 * mu(i, j, m) + la(i, j, m)) * met(3, i, j, m) * +/// stry(j) * met(3, i, j, m) * stry(j) + +/// mu(i, j, m) * (met(2, i, j, m) * strx(i) * +/// met(2, i, j, m) * strx(i) + +/// met(4, i, j, m) * met(4, i, j, m))); +/// mucofw2 += +/// acof_no_gp(nk - k + 1, nk - q + 1, nk - m + 1) * +/// ((2 * mu(i, j, m) + la(i, j, m)) * met(4, i, j, m) * +/// met(4, i, j, m) + +/// mu(i, j, m) * +/// (met(2, i, j, m) * strx(i) * met(2, i, j, m) * strx(i) + +/// met(3, i, j, m) * stry(j) * met(3, i, j, m) * stry(j))); +/// mucofuv += acof_no_gp(nk - k + 1, nk - q + 1, nk - m + 1) * +/// (mu(i, j, m) + la(i, j, m)) * met(2, i, j, m) * +/// met(3, 
i, j, m); +/// mucofuw += acof_no_gp(nk - k + 1, nk - q + 1, nk - m + 1) * +/// (mu(i, j, m) + la(i, j, m)) * met(2, i, j, m) * +/// met(4, i, j, m); +/// mucofvw += acof_no_gp(nk - k + 1, nk - q + 1, nk - m + 1) * +/// (mu(i, j, m) + la(i, j, m)) * met(3, i, j, m) * +/// met(4, i, j, m); +/// } +/// +/// // Computing the second derivative, +/// r1 += istrxy * mucofu2 * u(1, i, j, q) + mucofuv * u(2, i, j, q) + +/// istry * mucofuw * u(3, i, j, q); +/// r2 += mucofuv * u(1, i, j, q) + istrxy * mucofv2 * u(2, i, j, q) + +/// istrx * mucofvw * u(3, i, j, q); +/// r3 += istry * mucofuw * u(1, i, j, q) + +/// istrx * mucofvw * u(2, i, j, q) + +/// istrxy * mucofw2 * u(3, i, j, q); +/// } +/// +/// // Ghost point values, only nonzero for k=nk. +/// // 72 ops., tot=4011 +/// mucofu2 = ghcof_no_gp(nk - k + 1) * +/// ((2 * mu(i, j, nk) + la(i, j, nk)) * met(2, i, j, nk) * +/// strx(i) * met(2, i, j, nk) * strx(i) + +/// mu(i, j, nk) * (met(3, i, j, nk) * stry(j) * +/// met(3, i, j, nk) * stry(j) + +/// met(4, i, j, nk) * met(4, i, j, nk))); +/// mucofv2 = ghcof_no_gp(nk - k + 1) * +/// ((2 * mu(i, j, nk) + la(i, j, nk)) * met(3, i, j, nk) * +/// stry(j) * met(3, i, j, nk) * stry(j) + +/// mu(i, j, nk) * (met(2, i, j, nk) * strx(i) * +/// met(2, i, j, nk) * strx(i) + +/// met(4, i, j, nk) * met(4, i, j, nk))); +/// mucofw2 = +/// ghcof_no_gp(nk - k + 1) * +/// ((2 * mu(i, j, nk) + la(i, j, nk)) * met(4, i, j, nk) * +/// met(4, i, j, nk) + +/// mu(i, j, nk) * +/// (met(2, i, j, nk) * strx(i) * met(2, i, j, nk) * strx(i) + +/// met(3, i, j, nk) * stry(j) * met(3, i, j, nk) * stry(j))); +/// mucofuv = ghcof_no_gp(nk - k + 1) * (mu(i, j, nk) + la(i, j, nk)) * +/// met(2, i, j, nk) * met(3, i, j, nk); +/// mucofuw = ghcof_no_gp(nk - k + 1) * (mu(i, j, nk) + la(i, j, nk)) * +/// met(2, i, j, nk) * met(4, i, j, nk); +/// mucofvw = ghcof_no_gp(nk - k + 1) * (mu(i, j, nk) + la(i, j, nk)) * +/// met(3, i, j, nk) * met(4, i, j, nk); +/// r1 += istrxy * mucofu2 * u(1, i, j, nk + 1) + 
+/// mucofuv * u(2, i, j, nk + 1) + +/// istry * mucofuw * u(3, i, j, nk + 1); +/// r2 += mucofuv * u(1, i, j, nk + 1) + +/// istrxy * mucofv2 * u(2, i, j, nk + 1) + +/// istrx * mucofvw * u(3, i, j, nk + 1); +/// r3 += istry * mucofuw * u(1, i, j, nk + 1) + +/// istrx * mucofvw * u(2, i, j, nk + 1) + +/// istrxy * mucofw2 * u(3, i, j, nk + 1); +/// +/// // pq-derivatives (u-eq) +/// // 38 ops., tot=4049 +/// r1 += +/// c2 * +/// (mu(i, j + 2, k) * met(1, i, j + 2, k) * met(1, i, j + 2, k) * +/// (c2 * (u(2, i + 2, j + 2, k) - u(2, i - 2, j + 2, k)) + +/// c1 * (u(2, i + 1, j + 2, k) - u(2, i - 1, j + 2, k))) - +/// mu(i, j - 2, k) * met(1, i, j - 2, k) * met(1, i, j - 2, k) * +/// (c2 * (u(2, i + 2, j - 2, k) - u(2, i - 2, j - 2, k)) + +/// c1 * (u(2, i + 1, j - 2, k) - u(2, i - 1, j - 2, k)))) + +/// c1 * +/// (mu(i, j + 1, k) * met(1, i, j + 1, k) * met(1, i, j + 1, k) * +/// (c2 * (u(2, i + 2, j + 1, k) - u(2, i - 2, j + 1, k)) + +/// c1 * (u(2, i + 1, j + 1, k) - u(2, i - 1, j + 1, k))) - +/// mu(i, j - 1, k) * met(1, i, j - 1, k) * met(1, i, j - 1, k) * +/// (c2 * (u(2, i + 2, j - 1, k) - u(2, i - 2, j - 1, k)) + +/// c1 * (u(2, i + 1, j - 1, k) - u(2, i - 1, j - 1, k)))); +/// +/// // qp-derivatives (u-eq) +/// // 38 ops. 
tot=4087 +/// r1 += +/// c2 * +/// (la(i + 2, j, k) * met(1, i + 2, j, k) * met(1, i + 2, j, k) * +/// (c2 * (u(2, i + 2, j + 2, k) - u(2, i + 2, j - 2, k)) + +/// c1 * (u(2, i + 2, j + 1, k) - u(2, i + 2, j - 1, k))) - +/// la(i - 2, j, k) * met(1, i - 2, j, k) * met(1, i - 2, j, k) * +/// (c2 * (u(2, i - 2, j + 2, k) - u(2, i - 2, j - 2, k)) + +/// c1 * (u(2, i - 2, j + 1, k) - u(2, i - 2, j - 1, k)))) + +/// c1 * +/// (la(i + 1, j, k) * met(1, i + 1, j, k) * met(1, i + 1, j, k) * +/// (c2 * (u(2, i + 1, j + 2, k) - u(2, i + 1, j - 2, k)) + +/// c1 * (u(2, i + 1, j + 1, k) - u(2, i + 1, j - 1, k))) - +/// la(i - 1, j, k) * met(1, i - 1, j, k) * met(1, i - 1, j, k) * +/// (c2 * (u(2, i - 1, j + 2, k) - u(2, i - 1, j - 2, k)) + +/// c1 * (u(2, i - 1, j + 1, k) - u(2, i - 1, j - 1, k)))); +/// +/// // pq-derivatives (v-eq) +/// // 38 ops. , tot=4125 +/// r2 += +/// c2 * +/// (la(i, j + 2, k) * met(1, i, j + 2, k) * met(1, i, j + 2, k) * +/// (c2 * (u(1, i + 2, j + 2, k) - u(1, i - 2, j + 2, k)) + +/// c1 * (u(1, i + 1, j + 2, k) - u(1, i - 1, j + 2, k))) - +/// la(i, j - 2, k) * met(1, i, j - 2, k) * met(1, i, j - 2, k) * +/// (c2 * (u(1, i + 2, j - 2, k) - u(1, i - 2, j - 2, k)) + +/// c1 * (u(1, i + 1, j - 2, k) - u(1, i - 1, j - 2, k)))) + +/// c1 * +/// (la(i, j + 1, k) * met(1, i, j + 1, k) * met(1, i, j + 1, k) * +/// (c2 * (u(1, i + 2, j + 1, k) - u(1, i - 2, j + 1, k)) + +/// c1 * (u(1, i + 1, j + 1, k) - u(1, i - 1, j + 1, k))) - +/// la(i, j - 1, k) * met(1, i, j - 1, k) * met(1, i, j - 1, k) * +/// (c2 * (u(1, i + 2, j - 1, k) - u(1, i - 2, j - 1, k)) + +/// c1 * (u(1, i + 1, j - 1, k) - u(1, i - 1, j - 1, k)))); +/// +/// //* qp-derivatives (v-eq) +/// // 38 ops., tot=4163 +/// r2 += +/// c2 * +/// (mu(i + 2, j, k) * met(1, i + 2, j, k) * met(1, i + 2, j, k) * +/// (c2 * (u(1, i + 2, j + 2, k) - u(1, i + 2, j - 2, k)) + +/// c1 * (u(1, i + 2, j + 1, k) - u(1, i + 2, j - 1, k))) - +/// mu(i - 2, j, k) * met(1, i - 2, j, k) * met(1, i - 2, j, k) * +/// (c2 
* (u(1, i - 2, j + 2, k) - u(1, i - 2, j - 2, k)) + +/// c1 * (u(1, i - 2, j + 1, k) - u(1, i - 2, j - 1, k)))) + +/// c1 * +/// (mu(i + 1, j, k) * met(1, i + 1, j, k) * met(1, i + 1, j, k) * +/// (c2 * (u(1, i + 1, j + 2, k) - u(1, i + 1, j - 2, k)) + +/// c1 * (u(1, i + 1, j + 1, k) - u(1, i + 1, j - 1, k))) - +/// mu(i - 1, j, k) * met(1, i - 1, j, k) * met(1, i - 1, j, k) * +/// (c2 * (u(1, i - 1, j + 2, k) - u(1, i - 1, j - 2, k)) + +/// c1 * (u(1, i - 1, j + 1, k) - u(1, i - 1, j - 1, k)))); +/// +/// // rp - derivatives +/// // 24*8 = 192 ops, tot=4355 +/// float_sw4 dudrm2 = 0, dudrm1 = 0, dudrp1 = 0, dudrp2 = 0; +/// float_sw4 dvdrm2 = 0, dvdrm1 = 0, dvdrp1 = 0, dvdrp2 = 0; +/// float_sw4 dwdrm2 = 0, dwdrm1 = 0, dwdrp1 = 0, dwdrp2 = 0; +/// //#pragma unroll 8 +/// for (int q = nk - 7; q <= nk; q++) { +/// dudrm2 -= bope(nk - k + 1, nk - q + 1) * u(1, i - 2, j, q); +/// dvdrm2 -= bope(nk - k + 1, nk - q + 1) * u(2, i - 2, j, q); +/// dwdrm2 -= bope(nk - k + 1, nk - q + 1) * u(3, i - 2, j, q); +/// dudrm1 -= bope(nk - k + 1, nk - q + 1) * u(1, i - 1, j, q); +/// dvdrm1 -= bope(nk - k + 1, nk - q + 1) * u(2, i - 1, j, q); +/// dwdrm1 -= bope(nk - k + 1, nk - q + 1) * u(3, i - 1, j, q); +/// dudrp2 -= bope(nk - k + 1, nk - q + 1) * u(1, i + 2, j, q); +/// dvdrp2 -= bope(nk - k + 1, nk - q + 1) * u(2, i + 2, j, q); +/// dwdrp2 -= bope(nk - k + 1, nk - q + 1) * u(3, i + 2, j, q); +/// dudrp1 -= bope(nk - k + 1, nk - q + 1) * u(1, i + 1, j, q); +/// dvdrp1 -= bope(nk - k + 1, nk - q + 1) * u(2, i + 1, j, q); +/// dwdrp1 -= bope(nk - k + 1, nk - q + 1) * u(3, i + 1, j, q); +/// } +/// +/// // rp derivatives (u-eq) +/// // 67 ops, tot=4422 +/// r1 += (c2 * ((2 * mu(i + 2, j, k) + la(i + 2, j, k)) * +/// met(2, i + 2, j, k) * met(1, i + 2, j, k) * +/// strx(i + 2) * dudrp2 + +/// la(i + 2, j, k) * met(3, i + 2, j, k) * +/// met(1, i + 2, j, k) * dvdrp2 * stry(j) + +/// la(i + 2, j, k) * met(4, i + 2, j, k) * +/// met(1, i + 2, j, k) * dwdrp2 - +/// ((2 * mu(i - 2, 
j, k) + la(i - 2, j, k)) * +/// met(2, i - 2, j, k) * met(1, i - 2, j, k) * +/// strx(i - 2) * dudrm2 + +/// la(i - 2, j, k) * met(3, i - 2, j, k) * +/// met(1, i - 2, j, k) * dvdrm2 * stry(j) + +/// la(i - 2, j, k) * met(4, i - 2, j, k) * +/// met(1, i - 2, j, k) * dwdrm2)) + +/// c1 * ((2 * mu(i + 1, j, k) + la(i + 1, j, k)) * +/// met(2, i + 1, j, k) * met(1, i + 1, j, k) * +/// strx(i + 1) * dudrp1 + +/// la(i + 1, j, k) * met(3, i + 1, j, k) * +/// met(1, i + 1, j, k) * dvdrp1 * stry(j) + +/// la(i + 1, j, k) * met(4, i + 1, j, k) * +/// met(1, i + 1, j, k) * dwdrp1 - +/// ((2 * mu(i - 1, j, k) + la(i - 1, j, k)) * +/// met(2, i - 1, j, k) * met(1, i - 1, j, k) * +/// strx(i - 1) * dudrm1 + +/// la(i - 1, j, k) * met(3, i - 1, j, k) * +/// met(1, i - 1, j, k) * dvdrm1 * stry(j) + +/// la(i - 1, j, k) * met(4, i - 1, j, k) * +/// met(1, i - 1, j, k) * dwdrm1))) * +/// istry; +/// +/// // rp derivatives (v-eq) +/// // 42 ops, tot=4464 +/// r2 += +/// c2 * (mu(i + 2, j, k) * met(3, i + 2, j, k) * +/// met(1, i + 2, j, k) * dudrp2 + +/// mu(i + 2, j, k) * met(2, i + 2, j, k) * +/// met(1, i + 2, j, k) * dvdrp2 * strx(i + 2) * istry - +/// (mu(i - 2, j, k) * met(3, i - 2, j, k) * +/// met(1, i - 2, j, k) * dudrm2 + +/// mu(i - 2, j, k) * met(2, i - 2, j, k) * +/// met(1, i - 2, j, k) * dvdrm2 * strx(i - 2) * istry)) + +/// c1 * (mu(i + 1, j, k) * met(3, i + 1, j, k) * +/// met(1, i + 1, j, k) * dudrp1 + +/// mu(i + 1, j, k) * met(2, i + 1, j, k) * +/// met(1, i + 1, j, k) * dvdrp1 * strx(i + 1) * istry - +/// (mu(i - 1, j, k) * met(3, i - 1, j, k) * +/// met(1, i - 1, j, k) * dudrm1 + +/// mu(i - 1, j, k) * met(2, i - 1, j, k) * +/// met(1, i - 1, j, k) * dvdrm1 * strx(i - 1) * istry)); +/// +/// // rp derivatives (w-eq) +/// // 38 ops, tot=4502 +/// r3 += +/// istry * (c2 * (mu(i + 2, j, k) * met(4, i + 2, j, k) * +/// met(1, i + 2, j, k) * dudrp2 + +/// mu(i + 2, j, k) * met(2, i + 2, j, k) * +/// met(1, i + 2, j, k) * dwdrp2 * strx(i + 2) - +/// (mu(i - 2, j, k) 
* met(4, i - 2, j, k) * +/// met(1, i - 2, j, k) * dudrm2 + +/// mu(i - 2, j, k) * met(2, i - 2, j, k) * +/// met(1, i - 2, j, k) * dwdrm2 * strx(i - 2))) + +/// c1 * (mu(i + 1, j, k) * met(4, i + 1, j, k) * +/// met(1, i + 1, j, k) * dudrp1 + +/// mu(i + 1, j, k) * met(2, i + 1, j, k) * +/// met(1, i + 1, j, k) * dwdrp1 * strx(i + 1) - +/// (mu(i - 1, j, k) * met(4, i - 1, j, k) * +/// met(1, i - 1, j, k) * dudrm1 + +/// mu(i - 1, j, k) * met(2, i - 1, j, k) * +/// met(1, i - 1, j, k) * dwdrm1 * strx(i - 1)))); +/// +/// // rq - derivatives +/// // 24*8 = 192 ops , tot=4694 +/// +/// dudrm2 = 0; +/// dudrm1 = 0; +/// dudrp1 = 0; +/// dudrp2 = 0; +/// dvdrm2 = 0; +/// dvdrm1 = 0; +/// dvdrp1 = 0; +/// dvdrp2 = 0; +/// dwdrm2 = 0; +/// dwdrm1 = 0; +/// dwdrp1 = 0; +/// dwdrp2 = 0; +/// //#pragma unroll 8 +/// for (int q = nk - 7; q <= nk; q++) { +/// dudrm2 -= bope(nk - k + 1, nk - q + 1) * u(1, i, j - 2, q); +/// dvdrm2 -= bope(nk - k + 1, nk - q + 1) * u(2, i, j - 2, q); +/// dwdrm2 -= bope(nk - k + 1, nk - q + 1) * u(3, i, j - 2, q); +/// dudrm1 -= bope(nk - k + 1, nk - q + 1) * u(1, i, j - 1, q); +/// dvdrm1 -= bope(nk - k + 1, nk - q + 1) * u(2, i, j - 1, q); +/// dwdrm1 -= bope(nk - k + 1, nk - q + 1) * u(3, i, j - 1, q); +/// dudrp2 -= bope(nk - k + 1, nk - q + 1) * u(1, i, j + 2, q); +/// dvdrp2 -= bope(nk - k + 1, nk - q + 1) * u(2, i, j + 2, q); +/// dwdrp2 -= bope(nk - k + 1, nk - q + 1) * u(3, i, j + 2, q); +/// dudrp1 -= bope(nk - k + 1, nk - q + 1) * u(1, i, j + 1, q); +/// dvdrp1 -= bope(nk - k + 1, nk - q + 1) * u(2, i, j + 1, q); +/// dwdrp1 -= bope(nk - k + 1, nk - q + 1) * u(3, i, j + 1, q); +/// } +/// +/// // rq derivatives (u-eq) +/// // 42 ops, tot=4736 +/// r1 += c2 * (mu(i, j + 2, k) * met(3, i, j + 2, k) * +/// met(1, i, j + 2, k) * dudrp2 * stry(j + 2) * istrx + +/// mu(i, j + 2, k) * met(2, i, j + 2, k) * +/// met(1, i, j + 2, k) * dvdrp2 - +/// (mu(i, j - 2, k) * met(3, i, j - 2, k) * +/// met(1, i, j - 2, k) * dudrm2 * stry(j - 2) * 
istrx + +/// mu(i, j - 2, k) * met(2, i, j - 2, k) * +/// met(1, i, j - 2, k) * dvdrm2)) + +/// c1 * (mu(i, j + 1, k) * met(3, i, j + 1, k) * +/// met(1, i, j + 1, k) * dudrp1 * stry(j + 1) * istrx + +/// mu(i, j + 1, k) * met(2, i, j + 1, k) * +/// met(1, i, j + 1, k) * dvdrp1 - +/// (mu(i, j - 1, k) * met(3, i, j - 1, k) * +/// met(1, i, j - 1, k) * dudrm1 * stry(j - 1) * istrx + +/// mu(i, j - 1, k) * met(2, i, j - 1, k) * +/// met(1, i, j - 1, k) * dvdrm1)); +/// +/// // rq derivatives (v-eq) +/// // 70 ops, tot=4806 +/// r2 += c2 * (la(i, j + 2, k) * met(2, i, j + 2, k) * +/// met(1, i, j + 2, k) * dudrp2 + +/// (2 * mu(i, j + 2, k) + la(i, j + 2, k)) * +/// met(3, i, j + 2, k) * met(1, i, j + 2, k) * dvdrp2 * +/// stry(j + 2) * istrx + +/// la(i, j + 2, k) * met(4, i, j + 2, k) * +/// met(1, i, j + 2, k) * dwdrp2 * istrx - +/// (la(i, j - 2, k) * met(2, i, j - 2, k) * +/// met(1, i, j - 2, k) * dudrm2 + +/// (2 * mu(i, j - 2, k) + la(i, j - 2, k)) * +/// met(3, i, j - 2, k) * met(1, i, j - 2, k) * dvdrm2 * +/// stry(j - 2) * istrx + +/// la(i, j - 2, k) * met(4, i, j - 2, k) * +/// met(1, i, j - 2, k) * dwdrm2 * istrx)) + +/// c1 * (la(i, j + 1, k) * met(2, i, j + 1, k) * +/// met(1, i, j + 1, k) * dudrp1 + +/// (2 * mu(i, j + 1, k) + la(i, j + 1, k)) * +/// met(3, i, j + 1, k) * met(1, i, j + 1, k) * dvdrp1 * +/// stry(j + 1) * istrx + +/// la(i, j + 1, k) * met(4, i, j + 1, k) * +/// met(1, i, j + 1, k) * dwdrp1 * istrx - +/// (la(i, j - 1, k) * met(2, i, j - 1, k) * +/// met(1, i, j - 1, k) * dudrm1 + +/// (2 * mu(i, j - 1, k) + la(i, j - 1, k)) * +/// met(3, i, j - 1, k) * met(1, i, j - 1, k) * dvdrm1 * +/// stry(j - 1) * istrx + +/// la(i, j - 1, k) * met(4, i, j - 1, k) * +/// met(1, i, j - 1, k) * dwdrm1 * istrx)); +/// +/// // rq derivatives (w-eq) +/// // 39 ops, tot=4845 +/// r3 += (c2 * (mu(i, j + 2, k) * met(3, i, j + 2, k) * +/// met(1, i, j + 2, k) * dwdrp2 * stry(j + 2) + +/// mu(i, j + 2, k) * met(4, i, j + 2, k) * +/// met(1, i, j + 2, k) * 
dvdrp2 - +/// (mu(i, j - 2, k) * met(3, i, j - 2, k) * +/// met(1, i, j - 2, k) * dwdrm2 * stry(j - 2) + +/// mu(i, j - 2, k) * met(4, i, j - 2, k) * +/// met(1, i, j - 2, k) * dvdrm2)) + +/// c1 * (mu(i, j + 1, k) * met(3, i, j + 1, k) * +/// met(1, i, j + 1, k) * dwdrp1 * stry(j + 1) + +/// mu(i, j + 1, k) * met(4, i, j + 1, k) * +/// met(1, i, j + 1, k) * dvdrp1 - +/// (mu(i, j - 1, k) * met(3, i, j - 1, k) * +/// met(1, i, j - 1, k) * dwdrm1 * stry(j - 1) + +/// mu(i, j - 1, k) * met(4, i, j - 1, k) * +/// met(1, i, j - 1, k) * dvdrm1))) * +/// istrx; +/// +/// // pr and qr derivatives at once +/// // in loop: 8*(53+53+43) = 1192 ops, tot=6037 +/// //#pragma unroll 8 +/// for (int q = nk - 7; q <= nk; q++) { +/// // (u-eq) +/// // 53 ops +/// r1 -= bope(nk - k + 1, nk - q + 1) * +/// ( +/// // pr +/// (2 * mu(i, j, q) + la(i, j, q)) * met(2, i, j, q) * +/// met(1, i, j, q) * +/// (c2 * (u(1, i + 2, j, q) - u(1, i - 2, j, q)) + +/// c1 * (u(1, i + 1, j, q) - u(1, i - 1, j, q))) * +/// strx(i) * istry + +/// mu(i, j, q) * met(3, i, j, q) * met(1, i, j, q) * +/// (c2 * (u(2, i + 2, j, q) - u(2, i - 2, j, q)) + +/// c1 * (u(2, i + 1, j, q) - u(2, i - 1, j, q))) + +/// mu(i, j, q) * met(4, i, j, q) * met(1, i, j, q) * +/// (c2 * (u(3, i + 2, j, q) - u(3, i - 2, j, q)) + +/// c1 * (u(3, i + 1, j, q) - u(3, i - 1, j, q))) * +/// istry +/// // qr +/// + mu(i, j, q) * met(3, i, j, q) * met(1, i, j, q) * +/// (c2 * (u(1, i, j + 2, q) - u(1, i, j - 2, q)) + +/// c1 * (u(1, i, j + 1, q) - u(1, i, j - 1, q))) * +/// stry(j) * istrx + +/// la(i, j, q) * met(2, i, j, q) * met(1, i, j, q) * +/// (c2 * (u(2, i, j + 2, q) - u(2, i, j - 2, q)) + +/// c1 * (u(2, i, j + 1, q) - u(2, i, j - 1, q)))); +/// +/// // (v-eq) +/// // 53 ops +/// r2 -= bope(nk - k + 1, nk - q + 1) * +/// ( +/// // pr +/// la(i, j, q) * met(3, i, j, q) * met(1, i, j, q) * +/// (c2 * (u(1, i + 2, j, q) - u(1, i - 2, j, q)) + +/// c1 * (u(1, i + 1, j, q) - u(1, i - 1, j, q))) + +/// mu(i, j, q) * met(2, i, j, 
q) * met(1, i, j, q) * +/// (c2 * (u(2, i + 2, j, q) - u(2, i - 2, j, q)) + +/// c1 * (u(2, i + 1, j, q) - u(2, i - 1, j, q))) * +/// strx(i) * istry +/// // qr +/// + mu(i, j, q) * met(2, i, j, q) * met(1, i, j, q) * +/// (c2 * (u(1, i, j + 2, q) - u(1, i, j - 2, q)) + +/// c1 * (u(1, i, j + 1, q) - u(1, i, j - 1, q))) + +/// (2 * mu(i, j, q) + la(i, j, q)) * met(3, i, j, q) * +/// met(1, i, j, q) * +/// (c2 * (u(2, i, j + 2, q) - u(2, i, j - 2, q)) + +/// c1 * (u(2, i, j + 1, q) - u(2, i, j - 1, q))) * +/// stry(j) * istrx + +/// mu(i, j, q) * met(4, i, j, q) * met(1, i, j, q) * +/// (c2 * (u(3, i, j + 2, q) - u(3, i, j - 2, q)) + +/// c1 * (u(3, i, j + 1, q) - u(3, i, j - 1, q))) * +/// istrx); +/// +/// // (w-eq) +/// // 43 ops +/// r3 -= bope(nk - k + 1, nk - q + 1) * +/// ( +/// // pr +/// la(i, j, q) * met(4, i, j, q) * met(1, i, j, q) * +/// (c2 * (u(1, i + 2, j, q) - u(1, i - 2, j, q)) + +/// c1 * (u(1, i + 1, j, q) - u(1, i - 1, j, q))) * +/// istry + +/// mu(i, j, q) * met(2, i, j, q) * met(1, i, j, q) * +/// (c2 * (u(3, i + 2, j, q) - u(3, i - 2, j, q)) + +/// c1 * (u(3, i + 1, j, q) - u(3, i - 1, j, q))) * +/// strx(i) * istry +/// // qr +/// + mu(i, j, q) * met(3, i, j, q) * met(1, i, j, q) * +/// (c2 * (u(3, i, j + 2, q) - u(3, i, j - 2, q)) + +/// c1 * (u(3, i, j + 1, q) - u(3, i, j - 1, q))) * +/// stry(j) * istrx + +/// la(i, j, q) * met(4, i, j, q) * met(1, i, j, q) * +/// (c2 * (u(2, i, j + 2, q) - u(2, i, j - 2, q)) + +/// c1 * (u(2, i, j + 1, q) - u(2, i, j - 1, q))) * +/// istrx); +/// } +/// +/// // 12 ops, tot=6049 +/// lu(1, i, j, k) = a1 * lu(1, i, j, k) + sgn * r1 * ijac; +/// lu(2, i, j, k) = a1 * lu(2, i, j, k) + sgn * r2 * ijac; +/// lu(3, i, j, k) = a1 * lu(3, i, j, k) + sgn * r3 * ijac; +/// +/// } + +#ifndef RAJAPerf_Apps_SW4CK_KERNEL_5_HPP +#define RAJAPerf_Apps_SW4CK_KERNEL_5_HPP + +using float_sw4 = double; + +#define SW4CK_KERNEL_5_DATA_SETUP \ + float_sw4 a1 = 0; \ + float_sw4 sgn = 1; \ + if (op == '=') { \ + a1 = 0; \ + sgn 
= 1; \ + } else if (op == '+') { \ + a1 = 1; \ + sgn = 1; \ + } else if (op == '-') { \ + a1 = 1; \ + sgn = -1; \ + } \ + \ + const float_sw4 i6 = 1.0 / 6; \ + const float_sw4 tf = 0.75; \ + const float_sw4 c1 = 2.0 / 3; \ + const float_sw4 c2 = -1.0 / 12; \ + \ + const int ni = ilast - ifirst + 1; \ + const int nij = ni * (jlast - jfirst + 1); \ + const int nijk = nij * (klast - kfirst + 1); \ + const int base = -(ifirst + ni * jfirst + nij * kfirst); \ + const int base3 = base - nijk; \ + const int base4 = base - nijk; \ + const int ifirst0 = ifirst; \ + const int jfirst0 = jfirst; \ + \ + Real_ptr a_mu = m_a_mu; \ + Real_ptr a_lambda = m_a_lambda; \ + Real_ptr a_jac = m_a_jac; \ + Real_ptr a_u = m_a_u; \ + Real_ptr a_lu = m_a_lu; \ + Real_ptr a_met = m_a_met; \ + Real_ptr a_strx = m_a_strx; \ + Real_ptr a_stry = m_a_stry; +/* + Real_ptr a_acof = m_a_acof; \ + Real_ptr a_bope = m_a_bope; \ + Real_ptr a_ghcof = m_a_ghcof; \ + Real_ptr a_acof_no_gp = m_a_acof_no_gp; \ + Real_ptr a_ghcof_no_gp = m_a_ghcof_no_gp; +*/ + +// Direct reuse of fortran code by these macro definitions: +#define mu(i, j, k) a_mu[base + (i) + ni * (j) + nij * (k)] +#define la(i, j, k) a_lambda[base + (i) + ni * (j) + nij * (k)] +#define jac(i, j, k) a_jac[base + (i) + ni * (j) + nij * (k)] +#define u(c, i, j, k) a_u[base3 + (i) + ni * (j) + nij * (k) + nijk * (c)] +#define lu(c, i, j, k) a_lu[base3 + (i) + ni * (j) + nij * (k) + nijk * (c)] +#define met(c, i, j, k) a_met[base4 + (i) + ni * (j) + nij * (k) + nijk * (c)] +#define strx(i) a_strx[i - ifirst0] +#define stry(j) a_stry[j - jfirst0] +#define acof(i, j, k) a_acof[(i - 1) + 6 * (j - 1) + 48 * (k - 1)] +#define bope(i, j) a_bope[i - 1 + 6 * (j - 1)] +#define ghcof(i) a_ghcof[i - 1] +#define acof_no_gp(i, j, k) a_acof_no_gp[(i - 1) + 6 * (j - 1) + 48 * (k - 1)] +#define ghcof_no_gp(i) a_ghcof_no_gp[i - 1] + + +// 5 ops +#define SW4CK_KERNEL_5_BODY_1 \ + float_sw4 ijac = strx(i) * stry(j) / jac(i, j, k); \ + float_sw4 istry = 1 / 
(stry(j)); \ + float_sw4 istrx = 1 / (strx(i)); \ + float_sw4 istrxy = istry * istrx; \ + \ + float_sw4 r1 = 0, r2 = 0, r3 = 0; + +// pp derivative (u) (u-eq) +// 53 ops, tot=58 +#define SW4CK_KERNEL_5_BODY_2 \ + float_sw4 cof1 = (2 * mu(i - 2, j, k) + la(i - 2, j, k)) * \ + met(1, i - 2, j, k) * met(1, i - 2, j, k) * \ + strx(i - 2); \ + float_sw4 cof2 = (2 * mu(i - 1, j, k) + la(i - 1, j, k)) * \ + met(1, i - 1, j, k) * met(1, i - 1, j, k) * \ + strx(i - 1); \ + float_sw4 cof3 = (2 * mu(i, j, k) + la(i, j, k)) * met(1, i, j, k) * \ + met(1, i, j, k) * strx(i); \ + float_sw4 cof4 = (2 * mu(i + 1, j, k) + la(i + 1, j, k)) * \ + met(1, i + 1, j, k) * met(1, i + 1, j, k) * \ + strx(i + 1); \ + float_sw4 cof5 = (2 * mu(i + 2, j, k) + la(i + 2, j, k)) * \ + met(1, i + 2, j, k) * met(1, i + 2, j, k) * \ + strx(i + 2); \ + \ + float_sw4 mux1 = cof2 - tf * (cof3 + cof1); \ + float_sw4 mux2 = cof1 + cof4 + 3 * (cof3 + cof2); \ + float_sw4 mux3 = cof2 + cof5 + 3 * (cof4 + cof3); \ + float_sw4 mux4 = cof4 - tf * (cof3 + cof5); \ + \ + r1 = r1 + i6 * \ + (mux1 * (u(1, i - 2, j, k) - u(1, i, j, k)) + \ + mux2 * (u(1, i - 1, j, k) - u(1, i, j, k)) + \ + mux3 * (u(1, i + 1, j, k) - u(1, i, j, k)) + \ + mux4 * (u(1, i + 2, j, k) - u(1, i, j, k))) * \ + istry; + +// qq derivative (u) (u-eq) +// 43 ops, tot=101 +#define SW4CK_KERNEL_5_BODY_3 \ + cof1 = (mu(i, j - 2, k)) * met(1, i, j - 2, k) * met(1, i, j - 2, k) * \ + stry(j - 2); \ + cof2 = (mu(i, j - 1, k)) * met(1, i, j - 1, k) * met(1, i, j - 1, k) * \ + stry(j - 1); \ + cof3 = (mu(i, j, k)) * met(1, i, j, k) * met(1, i, j, k) * stry(j); \ + cof4 = (mu(i, j + 1, k)) * met(1, i, j + 1, k) * met(1, i, j + 1, k) * \ + stry(j + 1); \ + cof5 = (mu(i, j + 2, k)) * met(1, i, j + 2, k) * met(1, i, j + 2, k) * \ + stry(j + 2); \ + \ + mux1 = cof2 - tf * (cof3 + cof1); \ + mux2 = cof1 + cof4 + 3 * (cof3 + cof2); \ + mux3 = cof2 + cof5 + 3 * (cof4 + cof3); \ + mux4 = cof4 - tf * (cof3 + cof5); \ + \ + r1 = r1 + i6 * \ + (mux1 * (u(1, i, 
j - 2, k) - u(1, i, j, k)) + \ + mux2 * (u(1, i, j - 1, k) - u(1, i, j, k)) + \ + mux3 * (u(1, i, j + 1, k) - u(1, i, j, k)) + \ + mux4 * (u(1, i, j + 2, k) - u(1, i, j, k))) * \ + istrx; + + +// pp derivative (v) (v-eq) +// 43 ops, tot=144 +#define SW4CK_KERNEL_5_BODY_4 \ + cof1 = (mu(i - 2, j, k)) * met(1, i - 2, j, k) * met(1, i - 2, j, k) * \ + strx(i - 2); \ + cof2 = (mu(i - 1, j, k)) * met(1, i - 1, j, k) * met(1, i - 1, j, k) * \ + strx(i - 1); \ + cof3 = (mu(i, j, k)) * met(1, i, j, k) * met(1, i, j, k) * strx(i); \ + cof4 = (mu(i + 1, j, k)) * met(1, i + 1, j, k) * met(1, i + 1, j, k) * \ + strx(i + 1); \ + cof5 = (mu(i + 2, j, k)) * met(1, i + 2, j, k) * met(1, i + 2, j, k) * \ + strx(i + 2); \ + \ + mux1 = cof2 - tf * (cof3 + cof1); \ + mux2 = cof1 + cof4 + 3 * (cof3 + cof2); \ + mux3 = cof2 + cof5 + 3 * (cof4 + cof3); \ + mux4 = cof4 - tf * (cof3 + cof5); \ + \ + r2 = r2 + i6 * \ + (mux1 * (u(2, i - 2, j, k) - u(2, i, j, k)) + \ + mux2 * (u(2, i - 1, j, k) - u(2, i, j, k)) + \ + mux3 * (u(2, i + 1, j, k) - u(2, i, j, k)) + \ + mux4 * (u(2, i + 2, j, k) - u(2, i, j, k))) * \ + istry; + + +// qq derivative (v) (v-eq) +// 53 ops, tot=197 +#define SW4CK_KERNEL_5_BODY_5 \ + cof1 = (2 * mu(i, j - 2, k) + la(i, j - 2, k)) * met(1, i, j - 2, k) * \ + met(1, i, j - 2, k) * stry(j - 2); \ + cof2 = (2 * mu(i, j - 1, k) + la(i, j - 1, k)) * met(1, i, j - 1, k) * \ + met(1, i, j - 1, k) * stry(j - 1); \ + cof3 = (2 * mu(i, j, k) + la(i, j, k)) * met(1, i, j, k) * \ + met(1, i, j, k) * stry(j); \ + cof4 = (2 * mu(i, j + 1, k) + la(i, j + 1, k)) * met(1, i, j + 1, k) * \ + met(1, i, j + 1, k) * stry(j + 1); \ + cof5 = (2 * mu(i, j + 2, k) + la(i, j + 2, k)) * met(1, i, j + 2, k) * \ + met(1, i, j + 2, k) * stry(j + 2); \ + mux1 = cof2 - tf * (cof3 + cof1); \ + mux2 = cof1 + cof4 + 3 * (cof3 + cof2); \ + mux3 = cof2 + cof5 + 3 * (cof4 + cof3); \ + mux4 = cof4 - tf * (cof3 + cof5); \ + \ + r2 = r2 + i6 * \ + (mux1 * (u(2, i, j - 2, k) - u(2, i, j, k)) + \ + mux2 * (u(2, 
i, j - 1, k) - u(2, i, j, k)) + \ + mux3 * (u(2, i, j + 1, k) - u(2, i, j, k)) + \ + mux4 * (u(2, i, j + 2, k) - u(2, i, j, k))) * \ + istrx; + +// pp derivative (w) (w-eq): five-point stencil along i; coefficients are mu * met(1)^2 * strx and the result is scaled by istry (same shape as the pp v-term in BODY_4, applied to u(3, ...)) +// 43 ops, tot=240 +#define SW4CK_KERNEL_5_BODY_6 \ + cof1 = (mu(i - 2, j, k)) * met(1, i - 2, j, k) * met(1, i - 2, j, k) * \ + strx(i - 2); \ + cof2 = (mu(i - 1, j, k)) * met(1, i - 1, j, k) * met(1, i - 1, j, k) * \ + strx(i - 1); \ + cof3 = (mu(i, j, k)) * met(1, i, j, k) * met(1, i, j, k) * strx(i); \ + cof4 = (mu(i + 1, j, k)) * met(1, i + 1, j, k) * met(1, i + 1, j, k) * \ + strx(i + 1); \ + cof5 = (mu(i + 2, j, k)) * met(1, i + 2, j, k) * met(1, i + 2, j, k) * \ + strx(i + 2); \ + mux1 = cof2 - tf * (cof3 + cof1); \ + mux2 = cof1 + cof4 + 3 * (cof3 + cof2); \ + mux3 = cof2 + cof5 + 3 * (cof4 + cof3); \ + mux4 = cof4 - tf * (cof3 + cof5); \ + \ + r3 = r3 + i6 * \ + (mux1 * (u(3, i - 2, j, k) - u(3, i, j, k)) + \ + mux2 * (u(3, i - 1, j, k) - u(3, i, j, k)) + \ + mux3 * (u(3, i + 1, j, k) - u(3, i, j, k)) + \ + mux4 * (u(3, i + 2, j, k) - u(3, i, j, k))) * \ + istry; + +// qq derivative (w) (w-eq) +// 43 ops, tot=283 +#define SW4CK_KERNEL_5_BODY_7 \ + cof1 = (mu(i, j - 2, k)) * met(1, i, j - 2, k) * met(1, i, j - 2, k) * \ + stry(j - 2); \ + cof2 = (mu(i, j - 1, k)) * met(1, i, j - 1, k) * met(1, i, j - 1, k) * \ + stry(j - 1); \ + cof3 = (mu(i, j, k)) * met(1, i, j, k) * met(1, i, j, k) * stry(j); \ + cof4 = (mu(i, j + 1, k)) * met(1, i, j + 1, k) * met(1, i, j + 1, k) * \ + stry(j + 1); \ + cof5 = (mu(i, j + 2, k)) * met(1, i, j + 2, k) * met(1, i, j + 2, k) * \ + stry(j + 2); \ + mux1 = cof2 - tf * (cof3 + cof1); \ + mux2 = cof1 + cof4 + 3 * (cof3 + cof2); \ + mux3 = cof2 + cof5 + 3 * (cof4 + cof3); \ + mux4 = cof4 - tf * (cof3 + cof5); \ + \ + r3 = r3 + i6 * \ + (mux1 * (u(3, i, j - 2, k) - u(3, i, j, k)) + \ + mux2 * (u(3, i, j - 1, k) - u(3, i, j, k)) + \ + mux3 * (u(3, i, j + 1, k) - u(3, i, j, k)) + \ + mux4 * (u(3, i, j + 2, k) - u(3, i, j, k))) * \ + istrx; + +// All
rr-derivatives at once +// averaging the coefficient +// 54*8*8+25*8 = 3656 ops, tot=3939 +// Preprocessor directives cannot appear inside a macro replacement list, so the +// optional __syncthreads() call and the unroll hint are selected by the two helper +// macros below and expanded by name inside SW4CK_KERNEL_5_BODY_8. +#ifdef MAGIC_SYNC +#define SW4CK_KERNEL_5_MAGIC_SYNC __syncthreads(); +#else +#define SW4CK_KERNEL_5_MAGIC_SYNC +#endif +#ifdef AMD_UNROLL_FIX +#define SW4CK_KERNEL_5_UNROLL_8 _Pragma("unroll 8") +#else +#define SW4CK_KERNEL_5_UNROLL_8 +#endif +#define SW4CK_KERNEL_5_BODY_8 \ + float_sw4 mucofu2, mucofuv, mucofuw, mucofvw, mucofv2, mucofw2; \ + /*#pragma unroll 8 */ \ + SW4CK_KERNEL_5_MAGIC_SYNC \ + for (int q = nk - 7; q <= nk; q++) { \ + mucofu2 = 0; \ + mucofuv = 0; \ + mucofuw = 0; \ + mucofvw = 0; \ + mucofv2 = 0; \ + mucofw2 = 0; \ + SW4CK_KERNEL_5_UNROLL_8 \ + for (int m = nk - 7; m <= nk; m++) { \ + mucofu2 += acof_no_gp(nk - k + 1, nk - q + 1, nk - m + 1) * \ + ((2 * mu(i, j, m) + la(i, j, m)) * met(2, i, j, m) * \ + strx(i) * met(2, i, j, m) * strx(i) + \ + mu(i, j, m) * (met(3, i, j, m) * stry(j) * \ + met(3, i, j, m) * stry(j) + \ + met(4, i, j, m) * met(4, i, j, m))); \ + mucofv2 += acof_no_gp(nk - k + 1, nk - q + 1, nk - m + 1) * \ + ((2 * mu(i, j, m) + la(i, j, m)) * met(3, i, j, m) * \ + stry(j) * met(3, i, j, m) * stry(j) + \ + mu(i, j, m) * (met(2, i, j, m) * strx(i) * \ + met(2, i, j, m) * strx(i) + \ + met(4, i, j, m) * met(4, i, j, m))); \ + mucofw2 += \ + acof_no_gp(nk - k + 1, nk - q + 1, nk - m + 1) * \ + ((2 * mu(i, j, m) + la(i, j, m)) * met(4, i, j, m) * \ + met(4, i, j, m) + \ + mu(i, j, m) * \ + (met(2, i, j, m) * strx(i) * met(2, i, j, m) * strx(i) + \ + met(3, i, j, m) * stry(j) * met(3, i, j, m) * stry(j))); \ + mucofuv += acof_no_gp(nk - k + 1, nk - q + 1, nk - m + 1) * \ + (mu(i, j, m) + la(i, j, m)) * met(2, i, j, m) * \ + met(3, i, j, m); \ + mucofuw += acof_no_gp(nk - k + 1, nk - q + 1, nk - m + 1) * \ + (mu(i, j, m) + la(i, j, m)) * met(2, i, j, m) * \ + met(4, i, j, m); \ + mucofvw += acof_no_gp(nk - k + 1, nk - q + 1, nk - m + 1) * \ + (mu(i, j, m) + la(i, j, m)) * met(3, i, j, m) * \ + met(4, i, j, m); \ + } \ + \ + /* Computing the second derivative, */ \ + r1 += istrxy * mucofu2 * u(1, i, j, q) + mucofuv * u(2, i, j, q) + \ + istry * mucofuw * u(3, i, j, q); \ + r2 += mucofuv * u(1, i, j, q) + istrxy * mucofv2 * u(2, i,
j, q) + \ + istrx * mucofvw * u(3, i, j, q); \ + r3 += istry * mucofuw * u(1, i, j, q) + \ + istrx * mucofvw * u(2, i, j, q) + \ + istrxy * mucofw2 * u(3, i, j, q); \ + } + +// Ghost point values, only nonzero for k=nk. +// 72 ops., tot=4011 +#define SW4CK_KERNEL_5_BODY_9 \ + mucofu2 = ghcof_no_gp(nk - k + 1) * \ + ((2 * mu(i, j, nk) + la(i, j, nk)) * met(2, i, j, nk) * \ + strx(i) * met(2, i, j, nk) * strx(i) + \ + mu(i, j, nk) * (met(3, i, j, nk) * stry(j) * \ + met(3, i, j, nk) * stry(j) + \ + met(4, i, j, nk) * met(4, i, j, nk))); \ + mucofv2 = ghcof_no_gp(nk - k + 1) * \ + ((2 * mu(i, j, nk) + la(i, j, nk)) * met(3, i, j, nk) * \ + stry(j) * met(3, i, j, nk) * stry(j) + \ + mu(i, j, nk) * (met(2, i, j, nk) * strx(i) * \ + met(2, i, j, nk) * strx(i) + \ + met(4, i, j, nk) * met(4, i, j, nk))); \ + mucofw2 = \ + ghcof_no_gp(nk - k + 1) * \ + ((2 * mu(i, j, nk) + la(i, j, nk)) * met(4, i, j, nk) * \ + met(4, i, j, nk) + \ + mu(i, j, nk) * \ + (met(2, i, j, nk) * strx(i) * met(2, i, j, nk) * strx(i) + \ + met(3, i, j, nk) * stry(j) * met(3, i, j, nk) * stry(j))); \ + mucofuv = ghcof_no_gp(nk - k + 1) * (mu(i, j, nk) + la(i, j, nk)) * \ + met(2, i, j, nk) * met(3, i, j, nk); \ + mucofuw = ghcof_no_gp(nk - k + 1) * (mu(i, j, nk) + la(i, j, nk)) * \ + met(2, i, j, nk) * met(4, i, j, nk); \ + mucofvw = ghcof_no_gp(nk - k + 1) * (mu(i, j, nk) + la(i, j, nk)) * \ + met(3, i, j, nk) * met(4, i, j, nk); \ + r1 += istrxy * mucofu2 * u(1, i, j, nk + 1) + \ + mucofuv * u(2, i, j, nk + 1) + \ + istry * mucofuw * u(3, i, j, nk + 1); \ + r2 += mucofuv * u(1, i, j, nk + 1) + \ + istrxy * mucofv2 * u(2, i, j, nk + 1) + \ + istrx * mucofvw * u(3, i, j, nk + 1); \ + r3 += istry * mucofuw * u(1, i, j, nk + 1) + \ + istrx * mucofvw * u(2, i, j, nk + 1) + \ + istrxy * mucofw2 * u(3, i, j, nk + 1); + +// pq-derivatives (u-eq) +// 38 ops., tot=4049 +#define SW4CK_KERNEL_5_BODY_10 \ + r1 += \ + c2 * \ + (mu(i, j + 2, k) * met(1, i, j + 2, k) * met(1, i, j + 2, k) * \ + (c2 * (u(2, i + 2, 
j + 2, k) - u(2, i - 2, j + 2, k)) + \ + c1 * (u(2, i + 1, j + 2, k) - u(2, i - 1, j + 2, k))) - \ + mu(i, j - 2, k) * met(1, i, j - 2, k) * met(1, i, j - 2, k) * \ + (c2 * (u(2, i + 2, j - 2, k) - u(2, i - 2, j - 2, k)) + \ + c1 * (u(2, i + 1, j - 2, k) - u(2, i - 1, j - 2, k)))) + \ + c1 * \ + (mu(i, j + 1, k) * met(1, i, j + 1, k) * met(1, i, j + 1, k) * \ + (c2 * (u(2, i + 2, j + 1, k) - u(2, i - 2, j + 1, k)) + \ + c1 * (u(2, i + 1, j + 1, k) - u(2, i - 1, j + 1, k))) - \ + mu(i, j - 1, k) * met(1, i, j - 1, k) * met(1, i, j - 1, k) * \ + (c2 * (u(2, i + 2, j - 1, k) - u(2, i - 2, j - 1, k)) + \ + c1 * (u(2, i + 1, j - 1, k) - u(2, i - 1, j - 1, k)))); + +// qp-derivatives (u-eq): outer difference along i of la * met(1)^2 times the inner j-difference of u(2, ...); counterpart of the v-eq qp term in BODY_13 +// 38 ops. tot=4087 +#define SW4CK_KERNEL_5_BODY_11 \ + r1 += \ + c2 * \ + (la(i + 2, j, k) * met(1, i + 2, j, k) * met(1, i + 2, j, k) * \ + (c2 * (u(2, i + 2, j + 2, k) - u(2, i + 2, j - 2, k)) + \ + c1 * (u(2, i + 2, j + 1, k) - u(2, i + 2, j - 1, k))) - \ + la(i - 2, j, k) * met(1, i - 2, j, k) * met(1, i - 2, j, k) * \ + (c2 * (u(2, i - 2, j + 2, k) - u(2, i - 2, j - 2, k)) + \ + c1 * (u(2, i - 2, j + 1, k) - u(2, i - 2, j - 1, k)))) + \ + c1 * \ + (la(i + 1, j, k) * met(1, i + 1, j, k) * met(1, i + 1, j, k) * \ + (c2 * (u(2, i + 1, j + 2, k) - u(2, i + 1, j - 2, k)) + \ + c1 * (u(2, i + 1, j + 1, k) - u(2, i + 1, j - 1, k))) - \ + la(i - 1, j, k) * met(1, i - 1, j, k) * met(1, i - 1, j, k) * \ + (c2 * (u(2, i - 1, j + 2, k) - u(2, i - 1, j - 2, k)) + \ + c1 * (u(2, i - 1, j + 1, k) - u(2, i - 1, j - 1, k)))); + +// pq-derivatives (v-eq) +// 38 ops.
, tot=4125 +#define SW4CK_KERNEL_5_BODY_12 \ + r2 += \ + c2 * \ + (la(i, j + 2, k) * met(1, i, j + 2, k) * met(1, i, j + 2, k) * \ + (c2 * (u(1, i + 2, j + 2, k) - u(1, i - 2, j + 2, k)) + \ + c1 * (u(1, i + 1, j + 2, k) - u(1, i - 1, j + 2, k))) - \ + la(i, j - 2, k) * met(1, i, j - 2, k) * met(1, i, j - 2, k) * \ + (c2 * (u(1, i + 2, j - 2, k) - u(1, i - 2, j - 2, k)) + \ + c1 * (u(1, i + 1, j - 2, k) - u(1, i - 1, j - 2, k)))) + \ + c1 * \ + (la(i, j + 1, k) * met(1, i, j + 1, k) * met(1, i, j + 1, k) * \ + (c2 * (u(1, i + 2, j + 1, k) - u(1, i - 2, j + 1, k)) + \ + c1 * (u(1, i + 1, j + 1, k) - u(1, i - 1, j + 1, k))) - \ + la(i, j - 1, k) * met(1, i, j - 1, k) * met(1, i, j - 1, k) * \ + (c2 * (u(1, i + 2, j - 1, k) - u(1, i - 2, j - 1, k)) + \ + c1 * (u(1, i + 1, j - 1, k) - u(1, i - 1, j - 1, k)))); + + +//* qp-derivatives (v-eq) +// 38 ops., tot=4163 +#define SW4CK_KERNEL_5_BODY_13 \ + r2 += \ + c2 * \ + (mu(i + 2, j, k) * met(1, i + 2, j, k) * met(1, i + 2, j, k) * \ + (c2 * (u(1, i + 2, j + 2, k) - u(1, i + 2, j - 2, k)) + \ + c1 * (u(1, i + 2, j + 1, k) - u(1, i + 2, j - 1, k))) - \ + mu(i - 2, j, k) * met(1, i - 2, j, k) * met(1, i - 2, j, k) * \ + (c2 * (u(1, i - 2, j + 2, k) - u(1, i - 2, j - 2, k)) + \ + c1 * (u(1, i - 2, j + 1, k) - u(1, i - 2, j - 1, k)))) + \ + c1 * \ + (mu(i + 1, j, k) * met(1, i + 1, j, k) * met(1, i + 1, j, k) * \ + (c2 * (u(1, i + 1, j + 2, k) - u(1, i + 1, j - 2, k)) + \ + c1 * (u(1, i + 1, j + 1, k) - u(1, i + 1, j - 1, k))) - \ + mu(i - 1, j, k) * met(1, i - 1, j, k) * met(1, i - 1, j, k) * \ + (c2 * (u(1, i - 1, j + 2, k) - u(1, i - 1, j - 2, k)) + \ + c1 * (u(1, i - 1, j + 1, k) - u(1, i - 1, j - 1, k)))); + +// rp - derivatives +// 24*8 = 192 ops, tot=4355 +#define SW4CK_KERNEL_5_BODY_14 \ + float_sw4 dudrm2 = 0, dudrm1 = 0, dudrp1 = 0, dudrp2 = 0; \ + float_sw4 dvdrm2 = 0, dvdrm1 = 0, dvdrp1 = 0, dvdrp2 = 0; \ + float_sw4 dwdrm2 = 0, dwdrm1 = 0, dwdrp1 = 0, dwdrp2 = 0; \ + /*#pragma unroll 8 */ \ + for (int q = nk - 7; 
q <= nk; q++) { \ + dudrm2 -= bope(nk - k + 1, nk - q + 1) * u(1, i - 2, j, q); \ + dvdrm2 -= bope(nk - k + 1, nk - q + 1) * u(2, i - 2, j, q); \ + dwdrm2 -= bope(nk - k + 1, nk - q + 1) * u(3, i - 2, j, q); \ + dudrm1 -= bope(nk - k + 1, nk - q + 1) * u(1, i - 1, j, q); \ + dvdrm1 -= bope(nk - k + 1, nk - q + 1) * u(2, i - 1, j, q); \ + dwdrm1 -= bope(nk - k + 1, nk - q + 1) * u(3, i - 1, j, q); \ + dudrp2 -= bope(nk - k + 1, nk - q + 1) * u(1, i + 2, j, q); \ + dvdrp2 -= bope(nk - k + 1, nk - q + 1) * u(2, i + 2, j, q); \ + dwdrp2 -= bope(nk - k + 1, nk - q + 1) * u(3, i + 2, j, q); \ + dudrp1 -= bope(nk - k + 1, nk - q + 1) * u(1, i + 1, j, q); \ + dvdrp1 -= bope(nk - k + 1, nk - q + 1) * u(2, i + 1, j, q); \ + dwdrp1 -= bope(nk - k + 1, nk - q + 1) * u(3, i + 1, j, q); \ + } + +// rp derivatives (u-eq) +// 67 ops, tot=4422 +#define SW4CK_KERNEL_5_BODY_15 \ + r1 += (c2 * ((2 * mu(i + 2, j, k) + la(i + 2, j, k)) * \ + met(2, i + 2, j, k) * met(1, i + 2, j, k) * \ + strx(i + 2) * dudrp2 + \ + la(i + 2, j, k) * met(3, i + 2, j, k) * \ + met(1, i + 2, j, k) * dvdrp2 * stry(j) + \ + la(i + 2, j, k) * met(4, i + 2, j, k) * \ + met(1, i + 2, j, k) * dwdrp2 - \ + ((2 * mu(i - 2, j, k) + la(i - 2, j, k)) * \ + met(2, i - 2, j, k) * met(1, i - 2, j, k) * \ + strx(i - 2) * dudrm2 + \ + la(i - 2, j, k) * met(3, i - 2, j, k) * \ + met(1, i - 2, j, k) * dvdrm2 * stry(j) + \ + la(i - 2, j, k) * met(4, i - 2, j, k) * \ + met(1, i - 2, j, k) * dwdrm2)) + \ + c1 * ((2 * mu(i + 1, j, k) + la(i + 1, j, k)) * \ + met(2, i + 1, j, k) * met(1, i + 1, j, k) * \ + strx(i + 1) * dudrp1 + \ + la(i + 1, j, k) * met(3, i + 1, j, k) * \ + met(1, i + 1, j, k) * dvdrp1 * stry(j) + \ + la(i + 1, j, k) * met(4, i + 1, j, k) * \ + met(1, i + 1, j, k) * dwdrp1 - \ + ((2 * mu(i - 1, j, k) + la(i - 1, j, k)) * \ + met(2, i - 1, j, k) * met(1, i - 1, j, k) * \ + strx(i - 1) * dudrm1 + \ + la(i - 1, j, k) * met(3, i - 1, j, k) * \ + met(1, i - 1, j, k) * dvdrm1 * stry(j) + \ + la(i - 1, j, k) * met(4, 
i - 1, j, k) * \ + met(1, i - 1, j, k) * dwdrm1))) * \ + istry; + +// rp derivatives (v-eq) +// 42 ops, tot=4464 +#define SW4CK_KERNEL_5_BODY_16 \ + r2 += \ + c2 * (mu(i + 2, j, k) * met(3, i + 2, j, k) * \ + met(1, i + 2, j, k) * dudrp2 + \ + mu(i + 2, j, k) * met(2, i + 2, j, k) * \ + met(1, i + 2, j, k) * dvdrp2 * strx(i + 2) * istry - \ + (mu(i - 2, j, k) * met(3, i - 2, j, k) * \ + met(1, i - 2, j, k) * dudrm2 + \ + mu(i - 2, j, k) * met(2, i - 2, j, k) * \ + met(1, i - 2, j, k) * dvdrm2 * strx(i - 2) * istry)) + \ + c1 * (mu(i + 1, j, k) * met(3, i + 1, j, k) * \ + met(1, i + 1, j, k) * dudrp1 + \ + mu(i + 1, j, k) * met(2, i + 1, j, k) * \ + met(1, i + 1, j, k) * dvdrp1 * strx(i + 1) * istry - \ + (mu(i - 1, j, k) * met(3, i - 1, j, k) * \ + met(1, i - 1, j, k) * dudrm1 + \ + mu(i - 1, j, k) * met(2, i - 1, j, k) * \ + met(1, i - 1, j, k) * dvdrm1 * strx(i - 1) * istry)); + +// rp derivatives (w-eq) +// 38 ops, tot=4502 +#define SW4CK_KERNEL_5_BODY_17 \ + r3 += \ + istry * (c2 * (mu(i + 2, j, k) * met(4, i + 2, j, k) * \ + met(1, i + 2, j, k) * dudrp2 + \ + mu(i + 2, j, k) * met(2, i + 2, j, k) * \ + met(1, i + 2, j, k) * dwdrp2 * strx(i + 2) - \ + (mu(i - 2, j, k) * met(4, i - 2, j, k) * \ + met(1, i - 2, j, k) * dudrm2 + \ + mu(i - 2, j, k) * met(2, i - 2, j, k) * \ + met(1, i - 2, j, k) * dwdrm2 * strx(i - 2))) + \ + c1 * (mu(i + 1, j, k) * met(4, i + 1, j, k) * \ + met(1, i + 1, j, k) * dudrp1 + \ + mu(i + 1, j, k) * met(2, i + 1, j, k) * \ + met(1, i + 1, j, k) * dwdrp1 * strx(i + 1) - \ + (mu(i - 1, j, k) * met(4, i - 1, j, k) * \ + met(1, i - 1, j, k) * dudrm1 + \ + mu(i - 1, j, k) * met(2, i - 1, j, k) * \ + met(1, i - 1, j, k) * dwdrm1 * strx(i - 1)))); + + +// rq - derivatives +// 24*8 = 192 ops , tot=4694 +#define SW4CK_KERNEL_5_BODY_18 \ + dudrm2 = 0; \ + dudrm1 = 0; \ + dudrp1 = 0; \ + dudrp2 = 0; \ + dvdrm2 = 0; \ + dvdrm1 = 0; \ + dvdrp1 = 0; \ + dvdrp2 = 0; \ + dwdrm2 = 0; \ + dwdrm1 = 0; \ + dwdrp1 = 0; \ + dwdrp2 = 0; \ + /* #pragma unroll 
8 */ \ + for (int q = nk - 7; q <= nk; q++) { \ + dudrm2 -= bope(nk - k + 1, nk - q + 1) * u(1, i, j - 2, q); \ + dvdrm2 -= bope(nk - k + 1, nk - q + 1) * u(2, i, j - 2, q); \ + dwdrm2 -= bope(nk - k + 1, nk - q + 1) * u(3, i, j - 2, q); \ + dudrm1 -= bope(nk - k + 1, nk - q + 1) * u(1, i, j - 1, q); \ + dvdrm1 -= bope(nk - k + 1, nk - q + 1) * u(2, i, j - 1, q); \ + dwdrm1 -= bope(nk - k + 1, nk - q + 1) * u(3, i, j - 1, q); \ + dudrp2 -= bope(nk - k + 1, nk - q + 1) * u(1, i, j + 2, q); \ + dvdrp2 -= bope(nk - k + 1, nk - q + 1) * u(2, i, j + 2, q); \ + dwdrp2 -= bope(nk - k + 1, nk - q + 1) * u(3, i, j + 2, q); \ + dudrp1 -= bope(nk - k + 1, nk - q + 1) * u(1, i, j + 1, q); \ + dvdrp1 -= bope(nk - k + 1, nk - q + 1) * u(2, i, j + 1, q); \ + dwdrp1 -= bope(nk - k + 1, nk - q + 1) * u(3, i, j + 1, q); \ + } + +// rq derivatives (u-eq) +// 42 ops, tot=4736 +#define SW4CK_KERNEL_5_BODY_19 \ + r1 += c2 * (mu(i, j + 2, k) * met(3, i, j + 2, k) * \ + met(1, i, j + 2, k) * dudrp2 * stry(j + 2) * istrx + \ + mu(i, j + 2, k) * met(2, i, j + 2, k) * \ + met(1, i, j + 2, k) * dvdrp2 - \ + (mu(i, j - 2, k) * met(3, i, j - 2, k) * \ + met(1, i, j - 2, k) * dudrm2 * stry(j - 2) * istrx + \ + mu(i, j - 2, k) * met(2, i, j - 2, k) * \ + met(1, i, j - 2, k) * dvdrm2)) + \ + c1 * (mu(i, j + 1, k) * met(3, i, j + 1, k) * \ + met(1, i, j + 1, k) * dudrp1 * stry(j + 1) * istrx + \ + mu(i, j + 1, k) * met(2, i, j + 1, k) * \ + met(1, i, j + 1, k) * dvdrp1 - \ + (mu(i, j - 1, k) * met(3, i, j - 1, k) * \ + met(1, i, j - 1, k) * dudrm1 * stry(j - 1) * istrx + \ + mu(i, j - 1, k) * met(2, i, j - 1, k) * \ + met(1, i, j - 1, k) * dvdrm1)); + +// rq derivatives (v-eq) +// 70 ops, tot=4806 +#define SW4CK_KERNEL_5_BODY_20 \ + r2 += c2 * (la(i, j + 2, k) * met(2, i, j + 2, k) * \ + met(1, i, j + 2, k) * dudrp2 + \ + (2 * mu(i, j + 2, k) + la(i, j + 2, k)) * \ + met(3, i, j + 2, k) * met(1, i, j + 2, k) * dvdrp2 * \ + stry(j + 2) * istrx + \ + la(i, j + 2, k) * met(4, i, j + 2, k) * \ + met(1, 
i, j + 2, k) * dwdrp2 * istrx - \ + (la(i, j - 2, k) * met(2, i, j - 2, k) * \ + met(1, i, j - 2, k) * dudrm2 + \ + (2 * mu(i, j - 2, k) + la(i, j - 2, k)) * \ + met(3, i, j - 2, k) * met(1, i, j - 2, k) * dvdrm2 * \ + stry(j - 2) * istrx + \ + la(i, j - 2, k) * met(4, i, j - 2, k) * \ + met(1, i, j - 2, k) * dwdrm2 * istrx)) + \ + c1 * (la(i, j + 1, k) * met(2, i, j + 1, k) * \ + met(1, i, j + 1, k) * dudrp1 + \ + (2 * mu(i, j + 1, k) + la(i, j + 1, k)) * \ + met(3, i, j + 1, k) * met(1, i, j + 1, k) * dvdrp1 * \ + stry(j + 1) * istrx + \ + la(i, j + 1, k) * met(4, i, j + 1, k) * \ + met(1, i, j + 1, k) * dwdrp1 * istrx - \ + (la(i, j - 1, k) * met(2, i, j - 1, k) * \ + met(1, i, j - 1, k) * dudrm1 + \ + (2 * mu(i, j - 1, k) + la(i, j - 1, k)) * \ + met(3, i, j - 1, k) * met(1, i, j - 1, k) * dvdrm1 * \ + stry(j - 1) * istrx + \ + la(i, j - 1, k) * met(4, i, j - 1, k) * \ + met(1, i, j - 1, k) * dwdrm1 * istrx)); + +// rq derivatives (w-eq) +// 39 ops, tot=4845 +#define SW4CK_KERNEL_5_BODY_21 \ + r3 += (c2 * (mu(i, j + 2, k) * met(3, i, j + 2, k) * \ + met(1, i, j + 2, k) * dwdrp2 * stry(j + 2) + \ + mu(i, j + 2, k) * met(4, i, j + 2, k) * \ + met(1, i, j + 2, k) * dvdrp2 - \ + (mu(i, j - 2, k) * met(3, i, j - 2, k) * \ + met(1, i, j - 2, k) * dwdrm2 * stry(j - 2) + \ + mu(i, j - 2, k) * met(4, i, j - 2, k) * \ + met(1, i, j - 2, k) * dvdrm2)) + \ + c1 * (mu(i, j + 1, k) * met(3, i, j + 1, k) * \ + met(1, i, j + 1, k) * dwdrp1 * stry(j + 1) + \ + mu(i, j + 1, k) * met(4, i, j + 1, k) * \ + met(1, i, j + 1, k) * dvdrp1 - \ + (mu(i, j - 1, k) * met(3, i, j - 1, k) * \ + met(1, i, j - 1, k) * dwdrm1 * stry(j - 1) + \ + mu(i, j - 1, k) * met(4, i, j - 1, k) * \ + met(1, i, j - 1, k) * dvdrm1))) * \ + istrx; + +// pr and qr derivatives at once +// in loop: 8*(53+53+43) = 1192 ops, tot=6037 +#define SW4CK_KERNEL_5_BODY_22 \ + /* #pragma unroll 8 */ \ + for (int q = nk - 7; q <= nk; q++) { \ + /* (u-eq) */ \ + /* 53 ops */ \ + r1 -= bope(nk - k + 1, nk - q + 1) * \ + ( \ 
+ /* pr */ \ + (2 * mu(i, j, q) + la(i, j, q)) * met(2, i, j, q) * \ + met(1, i, j, q) * \ +(c2 * (u(1, i + 2, j, q) - u(1, i - 2, j, q)) + \ + c1 * (u(1, i + 1, j, q) - u(1, i - 1, j, q))) * \ + strx(i) * istry + \ + mu(i, j, q) * met(3, i, j, q) * met(1, i, j, q) * \ + (c2 * (u(2, i + 2, j, q) - u(2, i - 2, j, q)) + \ + c1 * (u(2, i + 1, j, q) - u(2, i - 1, j, q))) + \ + mu(i, j, q) * met(4, i, j, q) * met(1, i, j, q) * \ + (c2 * (u(3, i + 2, j, q) - u(3, i - 2, j, q)) + \ + c1 * (u(3, i + 1, j, q) - u(3, i - 1, j, q))) * \ + istry \ + /* qr */ \ + + mu(i, j, q) * met(3, i, j, q) * met(1, i, j, q) * \ + (c2 * (u(1, i, j + 2, q) - u(1, i, j - 2, q)) + \ + c1 * (u(1, i, j + 1, q) - u(1, i, j - 1, q))) * \ + stry(j) * istrx + \ + la(i, j, q) * met(2, i, j, q) * met(1, i, j, q) * \ + (c2 * (u(2, i, j + 2, q) - u(2, i, j - 2, q)) + \ + c1 * (u(2, i, j + 1, q) - u(2, i, j - 1, q)))); \ + \ + /* (v-eq) */ \ + /* 53 ops */ \ + r2 -= bope(nk - k + 1, nk - q + 1) * \ + ( \ + /* pr */ \ + la(i, j, q) * met(3, i, j, q) * met(1, i, j, q) * \ + (c2 * (u(1, i + 2, j, q) - u(1, i - 2, j, q)) + \ + c1 * (u(1, i + 1, j, q) - u(1, i - 1, j, q))) + \ + mu(i, j, q) * met(2, i, j, q) * met(1, i, j, q) * \ + (c2 * (u(2, i + 2, j, q) - u(2, i - 2, j, q)) + \ + c1 * (u(2, i + 1, j, q) - u(2, i - 1, j, q))) * \ + strx(i) * istry \ + /* qr */ \ + + mu(i, j, q) * met(2, i, j, q) * met(1, i, j, q) * \ + (c2 * (u(1, i, j + 2, q) - u(1, i, j - 2, q)) + \ + c1 * (u(1, i, j + 1, q) - u(1, i, j - 1, q))) + \ + (2 * mu(i, j, q) + la(i, j, q)) * met(3, i, j, q) * \ + met(1, i, j, q) * \ + (c2 * (u(2, i, j + 2, q) - u(2, i, j - 2, q)) + \ + c1 * (u(2, i, j + 1, q) - u(2, i, j - 1, q))) * \ + stry(j) * istrx + \ + mu(i, j, q) * met(4, i, j, q) * met(1, i, j, q) * \ + (c2 * (u(3, i, j + 2, q) - u(3, i, j - 2, q)) + \ + c1 * (u(3, i, j + 1, q) - u(3, i, j - 1, q))) * \ + istrx); \ + \ + /* (w-eq) */ \ + /* 43 ops */ \ + r3 -= bope(nk - k + 1, nk - q + 1) * \ + ( \ + /* pr */ \ + la(i, j, q) * met(4, i, 
j, q) * met(1, i, j, q) * \ + (c2 * (u(1, i + 2, j, q) - u(1, i - 2, j, q)) + \ + c1 * (u(1, i + 1, j, q) - u(1, i - 1, j, q))) * \ + istry + \ + mu(i, j, q) * met(2, i, j, q) * met(1, i, j, q) * \ + (c2 * (u(3, i + 2, j, q) - u(3, i - 2, j, q)) + \ + c1 * (u(3, i + 1, j, q) - u(3, i - 1, j, q))) * \ + strx(i) * istry \ + /* qr */ \ + + mu(i, j, q) * met(3, i, j, q) * met(1, i, j, q) * \ + (c2 * (u(3, i, j + 2, q) - u(3, i, j - 2, q)) + \ + c1 * (u(3, i, j + 1, q) - u(3, i, j - 1, q))) * \ + stry(j) * istrx + \ + la(i, j, q) * met(4, i, j, q) * met(1, i, j, q) * \ + (c2 * (u(2, i, j + 2, q) - u(2, i, j - 2, q)) + \ + c1 * (u(2, i, j + 1, q) - u(2, i, j - 1, q))) * \ + istrx); \ + } + +// 12 ops, tot=6049 +#define SW4CK_KERNEL_5_BODY_23 \ + lu(1, i, j, k) = a1 * lu(1, i, j, k) + sgn * r1 * ijac; \ + lu(2, i, j, k) = a1 * lu(2, i, j, k) + sgn * r2 * ijac; \ + lu(3, i, j, k) = a1 * lu(3, i, j, k) + sgn * r3 * ijac; + +#include "common/KernelBase.hpp" + + namespace rajaperf { + class RunParams; + + namespace apps { + + /// Kernel class for SW4CK_KERNEL_5: declares the setUp/updateChecksum/tearDown hooks, one run method per backend variant, and the data pointers read by SW4CK_KERNEL_5_DATA_SETUP. + class SW4CK_KERNEL_5 : public KernelBase { + public: + SW4CK_KERNEL_5(const RunParams &params); + + ~SW4CK_KERNEL_5(); + + void setUp(VariantID vid, size_t tune_idx); + void updateChecksum(VariantID vid, size_t tune_idx); + void tearDown(VariantID vid, size_t tune_idx); + + void runSeqVariant(VariantID vid, size_t tune_idx); + void runOpenMPVariant(VariantID vid, size_t tune_idx); + void runCudaVariant(VariantID vid, size_t tune_idx); + void runHipVariant(VariantID vid, size_t tune_idx); + void runOpenMPTargetVariant(VariantID vid, size_t tune_idx); + + void setCudaTuningDefinitions(VariantID vid); + void setHipTuningDefinitions(VariantID vid); + template <size_t block_size> void runCudaVariantImpl(VariantID vid); + template <size_t block_size> void runHipVariantImpl(VariantID vid); + + private: + static const size_t default_gpu_block_size = 256; + using gpu_block_sizes_type = + gpu_block_size::make_list_type<default_gpu_block_size>; + + Real_ptr m_a_mu; + Real_ptr m_a_lambda; + Real_ptr m_a_jac; + Real_ptr m_a_u; +
Real_ptr m_a_lu; + Real_ptr m_a_met; + Real_ptr m_a_strx; + Real_ptr m_a_stry; + Real_ptr m_a_acof; + Real_ptr m_a_bope; + Real_ptr m_a_ghcof; + Real_ptr m_a_acof_no_gp; + Real_ptr m_a_ghcof_no_gp; + + }; + + } // end namespace apps + } // end namespace rajaperf + +#endif // closing endif for header file include guard diff --git a/src/common/RAJAPerfSuite.cpp b/src/common/RAJAPerfSuite.cpp index 4e9465008..027ec74f6 100644 --- a/src/common/RAJAPerfSuite.cpp +++ b/src/common/RAJAPerfSuite.cpp @@ -92,6 +92,7 @@ #include "apps/NODAL_ACCUMULATION_3D.hpp" #include "apps/PRESSURE.hpp" #include "apps/SW4CK_KERNEL_2.hpp" +#include "apps/SW4CK_KERNEL_5.hpp" #include "apps/VOL3D.hpp" // @@ -229,6 +230,7 @@ static const std::string KernelNames [] = std::string("Apps_NODAL_ACCUMULATION_3D"), std::string("Apps_PRESSURE"), std::string("Apps_SW4CK_KERNEL_2"), + std::string("Apps_SW4CK_KERNEL_5"), std::string("Apps_VOL3D"), // @@ -581,10 +583,10 @@ KernelBase* getKernelObject(KernelID kid, kernel = new basic::REDUCE3_INT(run_params); break; } - case Basic_REDUCE_STRUCT : { + case Basic_REDUCE_STRUCT : { kernel = new basic::REDUCE_STRUCT(run_params); break; - } + } case Basic_TRAP_INT : { kernel = new basic::TRAP_INT(run_params); break; @@ -776,7 +778,11 @@ KernelBase* getKernelObject(KernelID kid, case Apps_SW4CK_KERNEL_2 : { kernel = new apps::SW4CK_KERNEL_2(run_params); break; - } + } + case Apps_SW4CK_KERNEL_5 : { + kernel = new apps::SW4CK_KERNEL_5(run_params); + break; + } case Apps_VOL3D : { kernel = new apps::VOL3D(run_params); break; diff --git a/src/common/RAJAPerfSuite.hpp b/src/common/RAJAPerfSuite.hpp index 15acdf26a..701ed156a 100644 --- a/src/common/RAJAPerfSuite.hpp +++ b/src/common/RAJAPerfSuite.hpp @@ -145,6 +145,7 @@ enum KernelID { Apps_NODAL_ACCUMULATION_3D, Apps_PRESSURE, Apps_SW4CK_KERNEL_2, + Apps_SW4CK_KERNEL_5, Apps_VOL3D, // From fd5d702969beeb0f460de6be8f9f4cbf16ca3095 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 30 Mar 2023 13:11:39 -0700 
Subject: [PATCH 6/7] get kernel building --need to check range values --- src/apps/SW4CK_KERNEL_5-Seq.cpp | 107 +++++++++++++++++++++++++++++++- src/apps/SW4CK_KERNEL_5.hpp | 83 +++++++++++++------------ 2 files changed, 148 insertions(+), 42 deletions(-) diff --git a/src/apps/SW4CK_KERNEL_5-Seq.cpp b/src/apps/SW4CK_KERNEL_5-Seq.cpp index 0893be43e..0e7737298 100644 --- a/src/apps/SW4CK_KERNEL_5-Seq.cpp +++ b/src/apps/SW4CK_KERNEL_5-Seq.cpp @@ -30,17 +30,19 @@ void SW4CK_KERNEL_5::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun const int ilast = 1; const int jstart = 0; const int jfirst = 0; - const int jend = 0; + const int jend = 0; const int jlast = 1; const int kfirst = 0; const int kstart = 0; const int klast = 1; const int kend = 1; - char op = '='; + const int nk = 0; + char op = '='; + SW4CK_KERNEL_5_DATA_SETUP; - + switch ( vid ) { @@ -54,7 +56,106 @@ void SW4CK_KERNEL_5::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun for(int j=jstart; j Date: Sun, 21 Apr 2024 16:28:15 -0700 Subject: [PATCH 7/7] partial compilation fixes --- src/apps/SW4CK_KERNEL_2-Seq.cpp | 11 ----------- src/apps/SW4CK_KERNEL_2.hpp | 34 +++++++++++++++++++++++++++------ src/apps/SW4CK_KERNEL_5.hpp | 5 ++--- 3 files changed, 30 insertions(+), 20 deletions(-) diff --git a/src/apps/SW4CK_KERNEL_2-Seq.cpp b/src/apps/SW4CK_KERNEL_2-Seq.cpp index 5aff9be29..244fb59db 100644 --- a/src/apps/SW4CK_KERNEL_2-Seq.cpp +++ b/src/apps/SW4CK_KERNEL_2-Seq.cpp @@ -25,17 +25,6 @@ void SW4CK_KERNEL_2::runSeqVariant(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tun const Index_type run_reps = getRunReps(); //To be populated later with - const int istart = 0; - const int ifirst = 0; - const int ilast = 1; - const int jstart = 0; - const int jfirst = 0; - const int jend = 0; - const int jlast = 1; - const int kfirst = 0; - const int kstart = 0; - const int klast = 1; - const int kend = 1; char op = '='; diff --git a/src/apps/SW4CK_KERNEL_2.hpp b/src/apps/SW4CK_KERNEL_2.hpp index 
5ec9a9668..9fc7f4e28 100644 --- a/src/apps/SW4CK_KERNEL_2.hpp +++ b/src/apps/SW4CK_KERNEL_2.hpp @@ -1,4 +1,3 @@ - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Copyright (c) 2017-23, Lawrence Livermore National Security, LLC // and RAJA Performance Suite project contributors. @@ -397,6 +396,18 @@ using float_sw4 = double; sgn = -1; \ } \ \ + const int istart = m_istart; \ + const int ifirst = m_first; \ + const int ilast = m_ilast; \ + const int jstart = m_jstart; \ + const int jfirst = m_jfirst; \ + const int jend = m_jend; \ + const int jlast = m_jlast; \ + const int kfirst = m_kfirst; \ + const int kstart = m_kstart; \ + const int klast = m_klast; \ + const int kend = m_kend; \ + \ const float_sw4 i6 = 1.0 / 6; \ const float_sw4 tf = 0.75; \ const float_sw4 c1 = 2.0 / 3; \ @@ -421,9 +432,9 @@ using float_sw4 = double; Real_ptr a_stry = m_a_stry; /* Real_ptr a_acof = m_a_acof; \ - Real_ptr a_bope = m_a_bope; \ - Real_ptr a_ghcof = m_a_ghcof; \ - Real_ptr a_acof_no_gp = m_a_acof_no_gp; \ + Real_ptr a_bope = m_a_bope; \ + Real_ptr a_ghcof = m_a_ghcof; \ + Real_ptr a_acof_no_gp = m_a_acof_no_gp; \ Real_ptr a_ghcof_no_gp = m_a_ghcof_no_gp; */ @@ -885,8 +896,7 @@ using float_sw4 = double; private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = - gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::list_type; Real_ptr m_a_mu; Real_ptr m_a_lambda; @@ -902,6 +912,18 @@ using float_sw4 = double; Real_ptr m_a_acof_no_gp; Real_ptr m_a_ghcof_no_gp; + const int m_istart; + const int m_first; + const int m_ilast; + const int m_jstart; + const int m_jfirst; + const int m_jend; + const int m_jlast; + const int m_kfirst; + const int m_kstart; + const int m_klast; + const int m_kend; + }; } // end namespace apps diff --git a/src/apps/SW4CK_KERNEL_5.hpp b/src/apps/SW4CK_KERNEL_5.hpp index b6773c42e..453b7a0c6 100644 --- a/src/apps/SW4CK_KERNEL_5.hpp +++ b/src/apps/SW4CK_KERNEL_5.hpp @@ -673,7 
+673,7 @@ using float_sw4 = double; #define mu(i, j, k) a_mu[base + (i) + ni * (j) + nij * (k)] #define la(i, j, k) a_lambda[base + (i) + ni * (j) + nij * (k)] #define jac(i, j, k) a_jac[base + (i) + ni * (j) + nij * (k)] -#define u(c, i, j, k) a_u[base3 + (i) + ni * (j) + nij * (k) + nijk * (c)] +#define swck4_u(c, i, j, k) a_u[base3 + (i) + ni * (j) + nij * (k) + nijk * (c)] #define lu(c, i, j, k) a_lu[base3 + (i) + ni * (j) + nij * (k) + nijk * (c)] #define met(c, i, j, k) a_met[base4 + (i) + ni * (j) + nij * (k) + nijk * (c)] #define strx(i) a_strx[i - ifirst0] @@ -1348,8 +1348,7 @@ for (int m = nk - 7; m <= nk; m++) { \ private: static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = - gpu_block_size::make_list_type; + using gpu_block_sizes_type = integer::list_type; Real_ptr m_a_mu; Real_ptr m_a_lambda;