Add NEON implementation for armv7 and aarch64 platforms (#615)

* Configure build system to compile arch-specific extension for Arm NEON * Add NEON implementation for `util/simd/vector.h` * Add `tranpose16x16` implementation with NEON * Fix wrong macro used in dynamic dispatch with NEON enabled * Add NEON implementation of score vectors in `score_vector_int8.h` * Enable `benchmark_transpose` when compiled with NEON support * Add NEON implementation in `score_vector_int16.h` * Implement `SwipeProfile` with NEON in `swipe.h` * Add remaining unimplemented code in `score_vector.h` * Enable SIMD swipe and benchmarks when compiling for NEON * Add guards to compile NEON code on Armv7 platforms * Add NEON horizontal sums implementations using `vpadalq` cascades * Add fallback `ScoreVector::cmp_mask` implementation for Armv7 NEON * Add NEON support to `finger_print.h` and `hash_set.h` * Enable NEON architecture in `banded_3frame_swipe.cpp` * Add remaining benchmarks for which there exist a NEON implementation * List NEON in compile-time and runtime feature lists * Update define macros for compilation of NEON architecture library * Fix some define guards in `util/simd` headers * Fix define guards in `ungapped_simd` when compiling for Aarch64 * Fix `cmp_mask` implementations for NEON `ScoreVector` instantiations * Rewrite NEON `transpose` without Aarch64-specific instructions * Use Arm NEON include guards instead of architecture for banded swipe code * Add `expand_from_8bit` implementation for Armv7 NEON * Rewrite 16x16 NEON transpose without `vst1q_s8_x4` for Armv7 compatibility * Benchmark scalar 16x16 transpose to compare to vectorized code * Fix include guards in `swipe_wrapper.cpp` preventing the use of NEON vectors * Remove `Deque::Iterator::operator-` when compiling for Arm for `armv7` compatibility * Fix `benchmark_transpose` not using a valid scalar implementation * Refactor `cmp_mask` implementation into common inline function for NEON * Add Arm NEON implementation to `sse_dist.h` * Add Arm NEON implementation of `BitVector::one_count` using `vcntq_u8` intrinsic * Fix `reduce_seq_aarch64` definition in `sse_dist.h` * Implement `ScoreVector<int16_t>(unsigned, Register)` for NEON * Fix potential overflow in NEON code of `BitVector::one_count` * Fix `CMakeLists.txt` and `simd.h` to only build NEON on `armv7` with build support * Setup runtime detection of `NEON` for `armv7` in `simd.cpp` * Setup dispatch for NEON in `dispatch.h` * Fix scope of NEON helpers in `simd.h` * Change NEON architecture ID in `CMakeLists.txt` to avoid conflict with AVX512 * Fix remaining use of `::SIMD` namespace in NEON code * Fix NEON `letter_mask` not disabling sequence mask based on macros * Revert back commented code in `deque.h` * Fix iteration in `hauser_correction.cpp` * Use compile-time macro to avoid `Deque::Iterator::operator-` overload * Fix benchmarks not running properly for NEON * Fix `armv7l` implementation of `vmaskq_s8` * Fix `table.h` for 32-bit platforms
bbuchfink · Jan 31, 2024 · 56f5d14 · 56f5d14
1 parent dd43519
commit 56f5d14
Show file tree

Hide file tree

Showing 27 changed files with 1,011 additions and 102 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,12 +1,16 @@
 cmake_minimum_required (VERSION 2.6)
 project (DIAMOND)
 include(CheckCXXCompilerFlag)
+include(CheckSymbolExists)
+include(CheckTypeSize)
 
 option(BUILD_STATIC "BUILD_STATIC" OFF)
 option(EXTRA "EXTRA" OFF)
 option(STATIC_LIBGCC "STATIC_LIBGCC" OFF)
 option(STATIC_LIBSTDC++ "STATIC_LIBSTDC++" OFF)
 option(X86 "X86" ON)
+option(ARM "ARM" OFF)
+option(AARCH64 "AARCH64" OFF)
 option(STRICT_BAND "STRICT_BAND" ON)
 option(LEFTMOST_SEED_FILTER "LEFTMOST_SEED_FILTER" ON)
 option(SEQ_MASK "SEQ_MASK" ON)
@@ -26,10 +30,12 @@ set(MAX_SHAPE_LEN 19)
 set(BLAST_INCLUDE_DIR "" CACHE STRING "BLAST_INCLUDE_DIR")
 set(BLAST_LIBRARY_DIR "" CACHE STRING "BLAST_LIBRARY_DIR")
 
-if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*)")
   set(X86 OFF)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
+  set(AARCH64 ON)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
   set(X86 OFF)
+  set(ARM ON)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "PPC64*|ppc64*|powerpc64*")
   set(X86 OFF)
   # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maltivec")
@@ -114,6 +120,9 @@ function(set_cxx_standard std flag)
   endif()
 endfunction(set_cxx_standard)
 
+check_type_size(ptrdiff_t SIZEOF_PTRDIFF_T)
+check_type_size(int SIZEOF_INT)
+
 check_cxx_compiler_flag("-std=gnu++14" HAS_GNUPP14)
 check_cxx_compiler_flag("-std=gnu++17" HAS_GNUPP17)
 if(HAS_GNUPP17 OR ${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC)
@@ -197,6 +206,31 @@ if(X86)
   endif()
 endif(X86)
 
+# NEON is mandatory on Aarch64
+if(AARCH64)
+  add_definitions(-DWITH_NEON)
+  add_library(arch_neon OBJECT ${DISPATCH_OBJECTS})
+  target_include_directories(arch_neon PRIVATE "${CMAKE_SOURCE_DIR}/src/lib")
+  target_compile_options(arch_neon PUBLIC -DDISPATCH_ARCH=ARCH_NEON -DARCH_ID=4 -D__ARM_NEON -D__aarch64__ -DEigen=Eigen_NEON)
+endif(AARCH64)
+
+# NEON is optional on Armv7, so we need to check for compiler support,
+# and for the <sys/auxv.h> header used for runtime detection.
+if(ARM)
+check_symbol_exists(getauxval "sys/auxv.h" HAVE_GETAUXVAL)
+check_cxx_compiler_flag("-mfpu=neon" HAVE_MFPU_NEON)
+if(HAVE_MFPU_NEON)
+    add_definitions(-DWITH_NEON)
+    add_definitions(-DHAVE_MFPU_NEON)
+    add_library(arch_neon OBJECT ${DISPATCH_OBJECTS})
+    target_include_directories(arch_neon PRIVATE "${CMAKE_SOURCE_DIR}/src/lib")
+    target_compile_options(arch_neon PUBLIC -DDISPATCH_ARCH=ARCH_NEON -DARCH_ID=4 -D__ARM_NEON -DEigen=Eigen_NEON -mfpu=neon)
+  endif()
+  if(HAVE_GETAUXVAL)
+    add_definitions(-DHAVE_GETAUXVAL)
+  endif()
+endif(ARM)
+
 set(OBJECTS
         src/run/main.cpp
         src/basic/config.cpp
@@ -394,6 +428,8 @@ if(X86)
   else()
     add_executable(diamond $<TARGET_OBJECTS:arch_generic> $<TARGET_OBJECTS:arch_sse4_1> $<TARGET_OBJECTS:arch_avx2> ${OBJECTS} ${BLAST_OBJ} ${ZSTD_OBJ})
   endif()
+elseif(ARM OR AARCH64)
+  add_executable(diamond $<TARGET_OBJECTS:arch_generic> $<TARGET_OBJECTS:arch_neon> ${OBJECTS} ${BLAST_OBJ} ${ZSTD_OBJ})
 else()
   add_executable(diamond $<TARGET_OBJECTS:arch_generic> ${OBJECTS} ${BLAST_OBJ} ${ZSTD_OBJ})
 endif()
@@ -489,4 +525,4 @@ SET(SP -DTEST_DIR=${CMAKE_SOURCE_DIR}/src/test -P ${CMAKE_SOURCE_DIR}/src/test/t
 add_test(NAME blastp COMMAND ${CMAKE_COMMAND} -DNAME=blastp "-DARGS=blastp -q ${TD}/1.faa -d ${TD}/2.faa -p1" ${SP})
 add_test(NAME blastp-mid-sens COMMAND ${CMAKE_COMMAND} -DNAME=blastp-mid-sens "-DARGS=blastp -q ${TD}/3.faa -d ${TD}/4.faa --mid-sensitive -p1" ${SP})
 add_test(NAME blastp-f0 COMMAND ${CMAKE_COMMAND} -DNAME=blastp-f0 "-DARGS=blastp -q ${TD}/1.faa -d ${TD}/2.faa -f0 -p1" ${SP})
-add_test(NAME diamond COMMAND diamond test)
+add_test(NAME diamond COMMAND diamond test)
diff --git a/src/basic/value.h b/src/basic/value.h
@@ -114,6 +114,16 @@ static inline __m256i letter_mask(__m256i x) {
 }
 #endif
 
+#ifdef __ARM_NEON
+static inline int8x16_t letter_mask(int8x16_t x) {
+#ifdef SEQ_MASK
+	return vandq_s8(x, vdupq_n_s8(LETTER_MASK));
+#else
+	return x;
+#endif
+}
+#endif
+
 extern const ValueTraits amino_acid_traits;
 extern const ValueTraits nucleotide_traits;
 extern ValueTraits value_traits;
@@ -163,4 +173,4 @@ using DictId = int64_t;
 using Score = int32_t;
 using TaxId = int32_t;
 using CentroidId = OId;
-using SuperBlockId = int32_t;
+using SuperBlockId = int32_t;
diff --git a/src/dp/scan_diags.cpp b/src/dp/scan_diags.cpp
@@ -60,7 +60,7 @@ void scan_diags128(const LongScoreProfile<int8_t>& qp, Sequence s, int d_begin,
 	max4.store(scores + 96);
 	for (int i = 0; i < 128; ++i)
 		out[i] = ScoreTraits<Sv>::int_score(scores[i]);
-#elif defined(__SSE4_1__)
+#elif defined(__SSE4_1__) | defined(__ARM_NEON)
 	using Sv = ScoreVector<int8_t, SCHAR_MIN>;
 	const int qlen = (int)qp.length();
 
@@ -150,7 +150,7 @@ void scan_diags64(const LongScoreProfile<int8_t>& qp, Sequence s, int d_begin, i
 	max2.store(scores + 32);
 	for (int i = 0; i < 64; ++i)
 		out[i] = ScoreTraits<Sv>::int_score(scores[i]);
-#elif defined(__SSE4_1__)
+#elif defined(__SSE4_1__) | defined(__ARM_NEON)
 	using Sv = ScoreVector<int8_t, SCHAR_MIN>;
 	const int qlen = (int)qp.length();
 
@@ -225,7 +225,7 @@ void scan_diags(const LongScoreProfile<int8_t>& qp, Sequence s, int d_begin, int
 	max2.store(scores + 32);
 	for (int i = 0; i < 64; ++i)
 		out[i] = ScoreTraits<Sv>::int_score(scores[i]);
-#elif defined(__SSE4_1__)
+#elif defined(__SSE4_1__) | defined(__ARM_NEON)
 	using Sv = ScoreVector<int8_t, SCHAR_MIN>;
 	const int qlen = (int)qp.length();
 

diff --git a/src/dp/score_vector.h b/src/dp/score_vector.h
@@ -180,6 +180,20 @@ static inline void store_sv(const DISPATCH_ARCH::ScoreVector<_t, DELTA> &sv, _p
 
 #endif
 
+#ifdef __ARM_NEON
+template<int DELTA>
+static inline void store_sv(const DISPATCH_ARCH::ScoreVector<int8_t, DELTA> &sv, int8_t *dst)
+{
+	vst1q_s8(dst, sv.data_);
+}
+
+template<int DELTA>
+static inline void store_sv(const DISPATCH_ARCH::ScoreVector<int16_t, DELTA> &sv, int16_t *dst)
+{
+	vst1q_s16(dst, sv.data_);
+}
+#endif
+
 static inline int extract_channel(const int32_t v, const int i) {
 	return v;
 }
@@ -194,4 +208,4 @@ static inline void saturate(Sv& v) {
 
 static inline void saturate(int32_t& v) {
 	v = std::max(v, 0);
-}
+}
diff --git a/src/dp/score_vector_int16.h b/src/dp/score_vector_int16.h
@@ -4,6 +4,7 @@ Copyright (C) 2016-2021 Max Planck Society for the Advancement of Science e.V.
                         Benjamin Buchfink
 						
 Code developed by Benjamin Buchfink <[email protected]>
+Arm NEON port contributed by Martin Larralde <[email protected]>
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -196,6 +197,162 @@ static inline ScoreVector<int16_t, DELTA> blend(const ScoreVector<int16_t, DELTA
 	return ScoreVector<int16_t, DELTA>(_mm256_blendv_epi8(v.data_, w.data_, mask.data_));
 }
 
+#elif defined(__ARM_NEON)
+
+template<int DELTA>
+struct ScoreVector<int16_t, DELTA>
+{
+
+	typedef int16x8_t Register;
+
+	inline ScoreVector() :
+		data_(vdupq_n_s16(DELTA))
+	{}
+
+	explicit ScoreVector(int x)
+	{
+		data_ = vdupq_n_s16(x);
+	}
+
+	explicit ScoreVector(int16_t x)
+	{
+		data_ = vdupq_n_s16(x);
+	}
+
+	explicit ScoreVector(int16x8_t data) :
+		data_(data)
+	{ }
+
+	explicit ScoreVector(const int16_t *x):
+		data_(vld1q_s16(x))
+	{}
+
+	explicit ScoreVector(const uint16_t *x) :
+		data_(vreinterpretq_s16_u16(vld1q_u16(x)))
+	{}
+
+#ifdef __aarch64__
+	ScoreVector(unsigned a, Register seq)
+	{
+		const int8x16_t* row = reinterpret_cast<const int8x16_t*>(&score_matrix.matrix8()[a << 5]);
+
+		int8x16_t high_mask = vreinterpretq_s8_s16(vshlq_n_s16(vreinterpretq_s16_s8(vandq_s8(vreinterpretq_s8_s16(seq), vdupq_n_s8('\x10'))), 3));
+		int8x16_t seq_low   = vorrq_s8(vreinterpretq_s8_s16(seq), high_mask);
+		int8x16_t seq_high  = vorrq_s8(vreinterpretq_s8_s16(seq), veorq_s8(high_mask, vdupq_n_s8('\x80')));
+
+		int8x16_t r1 = vld1q_s8(reinterpret_cast<const int8_t*>(row));
+		int8x16_t r2 = vld1q_s8(reinterpret_cast<const int8_t*>(row + 1));
+
+		int8x16_t s1 = vqtbl1q_s8(r1, vandq_u8(vreinterpretq_u8_s8(seq_low),  vdupq_n_u8(0x8F)));
+		int8x16_t s2 = vqtbl1q_s8(r2, vandq_u8(vreinterpretq_u8_s8(seq_high), vdupq_n_u8(0x8F)));
+
+		data_ = vorrq_s16(vreinterpretq_s16_s8(s1), vreinterpretq_s16_s8(s2));
+		data_ = vandq_s16(data_, vdupq_n_s16(255));
+		data_ = vqsubq_s16(data_, vdupq_n_s16(score_matrix.bias()));
+	}
+#endif
+
+	ScoreVector operator+(const ScoreVector&rhs) const
+	{
+		return ScoreVector(vqaddq_s16(data_, rhs.data_));
+	}
+
+	ScoreVector operator-(const ScoreVector&rhs) const
+	{
+		return ScoreVector(vqsubq_s16(data_, rhs.data_));
+	}
+
+	ScoreVector& operator+=(const ScoreVector& rhs) {
+		data_ = vqaddq_s16(data_, rhs.data_);
+		return *this;
+	}
+
+	ScoreVector& operator-=(const ScoreVector&rhs)
+	{
+		data_ = vqsubq_s16(data_, rhs.data_);
+		return *this;
+	}
+
+	ScoreVector& operator &=(const ScoreVector& rhs) {
+		data_ = vandq_s16(data_, rhs.data_);
+		return *this;
+	}
+
+	ScoreVector& operator++() {
+		data_ = vqaddq_s16(data_, vdupq_n_s16(1));
+		return *this;
+	}
+
+	ScoreVector operator==(const ScoreVector&v) const {
+		return ScoreVector(vreinterpretq_s16_u16(vceqq_s16(data_, v.data_)));
+	}
+
+	friend uint32_t cmp_mask(const ScoreVector&v, const ScoreVector&w) {
+		return vmaskq_s8(vreinterpretq_s8_u16(vceqq_s16(v.data_, w.data_)));
+	}
+
+	ScoreVector& max(const ScoreVector&rhs)
+	{
+		data_ = vmaxq_s16(data_, rhs.data_);
+		return *this;
+	}
+
+	friend ScoreVector max(const ScoreVector& lhs, const ScoreVector&rhs)
+	{
+		return ScoreVector(vmaxq_s16(lhs.data_, rhs.data_));
+	}
+
+	friend ScoreVector blend(const ScoreVector&v, const ScoreVector&w, const ScoreVector&mask) {
+			/* Use a signed shift right to create a mask with the sign bit */
+			uint16x8_t mask_ = vreinterpretq_u16_s16(vshrq_n_s16(mask.data_, 7));
+			return ScoreVector(vbslq_s16(mask_, w.data_, v.data_));
+	}
+
+	void store(int16_t *ptr) const
+	{
+		vst1q_s16(ptr, data_);
+	}
+
+	int16_t operator[](int i) const {
+		// return vgetq_lane_s16(data_, i);
+		int16_t tmp[8];
+		vst1q_s16(tmp, data_);
+		return tmp[i];
+	}
+
+	ScoreVector& set(int i, int16_t x) {
+		// vsetq_lane_s16(x, data_, i);
+		int16_t tmp[8];
+		vst1q_s16(tmp, data_);
+		tmp[i] = x;
+		data_ = vld1q_s16(tmp);
+		return *this;
+	}
+
+	void expand_from_8bit() {
+		int8x16_t mask = vdupq_n_s8(0x80);
+		int8x16_t sign = vreinterpretq_s8_u8(vceqq_s8(vandq_s8(vreinterpretq_s8_s16(data_), mask), mask));
+#ifdef __aarch64__
+		data_ = vreinterpretq_s16_s8(vzip1q_s8(vreinterpretq_s8_s16(data_), sign));
+#else
+		int8x16x2_t tmp = vzipq_s8(vreinterpretq_s8_s16(data_), sign);
+		data_ = vreinterpretq_s16_s8(tmp.val[0]);
+#endif
+	}
+
+	friend std::ostream& operator<<(std::ostream& s, ScoreVector v)
+	{
+		int16_t x[8];
+		v.store(x);
+		for (unsigned i = 0; i < 8; ++i)
+			printf("%3i ", (int)x[i]);
+		return s;
+	}
+
+	int16x8_t data_;
+
+};
+
 #elif defined(__SSE2__)
 
 template<int DELTA>
@@ -374,7 +531,7 @@ static inline int16_t extract(ScoreVector<int16_t, DELTA> sv) {
 
 #endif
 
-#ifdef __SSE2__
+#if defined(__SSE2__) | defined(__ARM_NEON)
 
 template<int DELTA>
 struct ScoreTraits<ScoreVector<int16_t, DELTA>>
@@ -445,7 +602,7 @@ struct ScoreTraits<ScoreVector<int16_t, DELTA>>
 
 }
 
-#ifdef __SSE2__
+#if defined(__SSE2__) | defined(__ARM_NEON)
 
 template<int DELTA>
 static inline int16_t extract_channel(const DISPATCH_ARCH::ScoreVector<int16_t, DELTA>& v, int i) {
@@ -457,4 +614,4 @@ static inline void set_channel(DISPATCH_ARCH::ScoreVector<int16_t, DELTA>& v, co
 	v.set(i, x);
 }
 
-#endif
+#endif