diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
index 3e7c388057..5f363fb6a8 100644
--- a/src/main/cpp/CMakeLists.txt
+++ b/src/main/cpp/CMakeLists.txt
@@ -161,6 +161,7 @@ add_library(
   src/map_utils.cu
   src/murmur_hash.cu
   src/row_conversion.cu
+  src/xxhash64.cu
   src/zorder.cu
 )
diff --git a/src/main/cpp/src/HashJni.cpp b/src/main/cpp/src/HashJni.cpp
index bcf72922d4..d106da2b61 100644
--- a/src/main/cpp/src/HashJni.cpp
+++ b/src/main/cpp/src/HashJni.cpp
@@ -36,4 +36,21 @@ JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_Hash_murmurHash32(
   }
   CATCH_STD(env, 0);
 }
+
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_Hash_xxhash64(JNIEnv* env,
+                                                                       jclass,
+                                                                       jlong seed,
+                                                                       jlongArray column_handles)
+{
+  JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    auto column_views =
+      cudf::jni::native_jpointerArray<cudf::column_view>{env, column_handles}.get_dereferenced();
+    return cudf::jni::release_as_jlong(
+      spark_rapids_jni::xxhash64(cudf::table_view{column_views}, seed));
+  }
+  CATCH_STD(env, 0);
+}
 }
diff --git a/src/main/cpp/src/hash.cuh b/src/main/cpp/src/hash.cuh
index 84a204d08f..a6af264ceb 100644
--- a/src/main/cpp/src/hash.cuh
+++ b/src/main/cpp/src/hash.cuh
@@ -25,6 +25,8 @@
 
 namespace spark_rapids_jni {
 
+constexpr int64_t DEFAULT_XXHASH64_SEED = 42;
+
 /**
  * @brief Converts a cudf decimal128 value to a java bigdecimal value.
  *
@@ -91,4 +93,20 @@ std::unique_ptr<cudf::column> murmur_hash3_32(
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Computes the xxhash64 hash value of each row in the input set of columns.
+ *
+ * @param input The table of columns to hash
+ * @param seed Optional seed value to use for the hash function
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ *
+ * @returns A column where each row is the hash of a row from the input table.
+ */
+std::unique_ptr<cudf::column> xxhash64(
+  cudf::table_view const& input,
+  int64_t seed                        = DEFAULT_XXHASH64_SEED,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 }  // namespace spark_rapids_jni
diff --git a/src/main/cpp/src/xxhash64.cu b/src/main/cpp/src/xxhash64.cu
new file mode 100644
index 0000000000..30a8cc15fc
--- /dev/null
+++ b/src/main/cpp/src/xxhash64.cu
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hash.cuh"
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/utilities/algorithm.cuh>
+#include <cudf/detail/utilities/hash_functions.cuh>
+#include <cudf/table/table_device_view.cuh>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/tabulate.h>
+
+namespace spark_rapids_jni {
+
+namespace {
+
+using hash_value_type = int64_t;
+using half_size_type  = int32_t;
+
+constexpr __device__ inline int64_t rotate_bits_left_signed(hash_value_type h, int8_t r)
+{
+  return (h << r) | (h >> (64 - r)) & ~(-1 << r);
+}
+
+template <typename Key>
+struct XXHash_64 {
+  using result_type = hash_value_type;
+
+  constexpr XXHash_64() = delete;
+  constexpr XXHash_64(hash_value_type seed) : m_seed(seed) {}
+
+  template <typename T>
+  __device__ inline T getblock32(std::byte const* data, cudf::size_type offset) const
+  {
+    // Read a 4-byte value from the data pointer as individual bytes for safe
+    // unaligned access (very likely for string types).
+    auto block      = reinterpret_cast<uint8_t const*>(data + offset);
+    uint32_t result = static_cast<uint32_t>(block[0]) | (static_cast<uint32_t>(block[1]) << 8) |
+                      (static_cast<uint32_t>(block[2]) << 16) |
+                      (static_cast<uint32_t>(block[3]) << 24);
+    return reinterpret_cast<T const*>(&result)[0];
+  }
+
+  __device__ inline hash_value_type getblock64(std::byte const* data, cudf::size_type offset) const
+  {
+    uint64_t result = static_cast<uint64_t>(getblock32<uint32_t>(data, offset)) |
+                      static_cast<uint64_t>(getblock32<uint32_t>(data, offset + 4)) << 32;
+    return reinterpret_cast<hash_value_type const*>(&result)[0];
+  }
+
+  result_type __device__ inline operator()(Key const& key) const { return compute(key); }
+
+  template <typename T>
+  result_type __device__ inline compute(T const& key) const
+  {
+    return compute_bytes(reinterpret_cast<std::byte const*>(&key), sizeof(T));
+  }
+
+  result_type __device__ inline compute_remaining_bytes(std::byte const* data,
+                                                        cudf::size_type const nbytes,
+                                                        cudf::size_type offset,
+                                                        result_type h64) const
+  {
+    // remaining data can be processed in 8-byte chunks
+    if ((nbytes % 32) >= 8) {
+      for (; offset <= nbytes - 8; offset += 8) {
+        hash_value_type k1 = getblock64(data, offset) * prime2;
+        k1                 = rotate_bits_left_signed(k1, 31) * prime1;
+        h64 ^= k1;
+        h64 = rotate_bits_left_signed(h64, 27) * prime1 + prime4;
+      }
+    }
+
+    // remaining data can be processed in 4-byte chunks
+    if (((nbytes % 32) % 8) >= 4) {
+      for (; offset <= nbytes - 4; offset += 4) {
+        h64 ^= (getblock32<half_size_type>(data, offset) & 0xffffffffL) * prime1;
+        h64 = rotate_bits_left_signed(h64, 23) * prime2 + prime3;
+      }
+    }
+
+    // and the rest
+    if (nbytes % 4) {
+      while (offset < nbytes) {
+        h64 ^= (static_cast<uint8_t>(data[offset]) & 0xff) * prime5;
+        h64 = rotate_bits_left_signed(h64, 11) * prime1;
+        ++offset;
+      }
+    }
+    return h64;
+  }
+
+  result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const nbytes) const
+  {
+    uint64_t offset = 0;
+    hash_value_type h64;
+    // data can be processed in 32-byte chunks
+    if (nbytes >= 32) {
+      auto limit         = nbytes - 32;
+      hash_value_type v1 = m_seed + prime1 + prime2;
+      hash_value_type v2 = m_seed + prime2;
+      hash_value_type v3 = m_seed;
+      hash_value_type v4 = m_seed - prime1;
+
+      do {
+        // pipeline 4*8byte computations
+        v1 += getblock64(data, offset) * prime2;
+        v1 = rotate_bits_left_signed(v1, 31);
+        v1 *= prime1;
+        offset += 8;
+        v2 += getblock64(data, offset) * prime2;
+        v2 = rotate_bits_left_signed(v2, 31);
+        v2 *= prime1;
+        offset += 8;
+        v3 += getblock64(data, offset) * prime2;
+        v3 = rotate_bits_left_signed(v3, 31);
+        v3 *= prime1;
+        offset += 8;
+        v4 += getblock64(data, offset) * prime2;
+        v4 = rotate_bits_left_signed(v4, 31);
+        v4 *= prime1;
+        offset += 8;
+      } while (offset <= limit);
+
+      h64 = rotate_bits_left_signed(v1, 1) + rotate_bits_left_signed(v2, 7) +
+            rotate_bits_left_signed(v3, 12) + rotate_bits_left_signed(v4, 18);
+
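+      // Fold each of the four lane accumulators into h64.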
+      v1 *= prime2;
+      v1 = rotate_bits_left_signed(v1, 31);
+      v1 *= prime1;
+      h64 ^= v1;
+      h64 = h64 * prime1 + prime4;
+
+      v2 *= prime2;
+      v2 = rotate_bits_left_signed(v2, 31);
+      v2 *= prime1;
+      h64 ^= v2;
+      h64 = h64 * prime1 + prime4;
+
+      v3 *= prime2;
+      v3 = rotate_bits_left_signed(v3, 31);
+      v3 *= prime1;
+      h64 ^= v3;
+      h64 = h64 * prime1 + prime4;
+
+      v4 *= prime2;
+      v4 = rotate_bits_left_signed(v4, 31);
+      v4 *= prime1;
+      h64 ^= v4;
+      h64 = h64 * prime1 + prime4;
+    } else {
+      h64 = m_seed + prime5;
+    }
+
+    h64 += nbytes;
+    h64 = compute_remaining_bytes(data, nbytes, offset, h64);
+
+    return finalize(h64);
+  }
+
+  constexpr __host__ __device__ hash_value_type finalize(hash_value_type h) const noexcept
+  {
+    h ^= static_cast<hash_value_type>(static_cast<uint64_t>(h) >> 33);
+    h *= prime2;
+    h ^= static_cast<hash_value_type>(static_cast<uint64_t>(h) >> 29);
+    h *= prime3;
+    h ^= static_cast<hash_value_type>(static_cast<uint64_t>(h) >> 32);
+    return h;
+  }
+
+ private:
+  hash_value_type m_seed{};
+
+  static constexpr hash_value_type prime1 = 0x9E3779B185EBCA87L;
+  static constexpr hash_value_type prime2 = 0xC2B2AE3D27D4EB4FL;
+  static constexpr hash_value_type prime3 = 0x165667B19E3779F9L;
+  static constexpr hash_value_type prime4 = 0x85EBCA77C2B2AE63L;
+  static constexpr hash_value_type prime5 = 0x27D4EB2F165667C5L;
+};
+
+// Spark promotes bool and the 1/2-byte integer types to a 4-byte int before hashing.
+template <>
+hash_value_type __device__ inline XXHash_64<bool>::operator()(bool const& key) const
+{
+  return compute<uint32_t>(key);
+}
+
+template <>
+hash_value_type __device__ inline XXHash_64<int8_t>::operator()(int8_t const& key) const
+{
+  return compute<uint32_t>(key);
+}
+
+template <>
+hash_value_type __device__ inline XXHash_64<uint8_t>::operator()(uint8_t const& key) const
+{
+  return compute<uint32_t>(key);
+}
+
+template <>
+hash_value_type __device__ inline XXHash_64<int16_t>::operator()(int16_t const& key) const
+{
+  return compute<uint32_t>(key);
+}
+
+template <>
+hash_value_type __device__ inline XXHash_64<uint16_t>::operator()(uint16_t const& key) const
+{
+  return compute<uint32_t>(key);
+}
+
+template <>
+hash_value_type __device__ inline XXHash_64<float>::operator()(float const& key) const
+{
+  return compute(cudf::detail::normalize_nans_and_zeros(key));
+}
+
+template <>
+hash_value_type __device__ inline XXHash_64<double>::operator()(double const& key) const
+{
+  return compute(cudf::detail::normalize_nans_and_zeros(key));
+}
+
+template <>
+hash_value_type __device__ inline XXHash_64<cudf::string_view>::operator()(
+  cudf::string_view const& key) const
+{
+  auto const data = reinterpret_cast<std::byte const*>(key.data());
+  auto const len  = key.size_bytes();
+  return compute_bytes(data, len);
+}
+
+// Spark hashes small decimals as their unscaled long value.
+template <>
+hash_value_type __device__ inline XXHash_64<numeric::decimal32>::operator()(
+  numeric::decimal32 const& key) const
+{
+  return compute<int64_t>(key.value());
+}
+
+template <>
+hash_value_type __device__ inline XXHash_64<numeric::decimal64>::operator()(
+  numeric::decimal64 const& key) const
+{
+  return compute(key.value());
+}
+
+template <>
+hash_value_type __device__ inline XXHash_64<numeric::decimal128>::operator()(
+  numeric::decimal128 const& key) const
+{
+  auto [java_d, length] = to_java_bigdecimal(key);
+  auto bytes            = reinterpret_cast<std::byte const*>(&java_d);
+  return compute_bytes(bytes, length);
+}
+
+/**
+ * @brief Computes the hash value of a row in the given table.
+ *
+ * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
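+ *
+ * Per-row hashing chains across columns: each column's element hash uses the
+ * running result as its seed, and a null element leaves the running hash
+ * unchanged (so an all-null row hashes to the original seed).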
+ */
+template <typename Nullate>
+class device_row_hasher {
+ public:
+  device_row_hasher(Nullate nulls, cudf::table_device_view const& t, hash_value_type seed)
+    : _check_nulls(nulls), _table(t), _seed(seed)
+  {
+  }
+
+  __device__ auto operator()(cudf::size_type row_index) const noexcept
+  {
+    return cudf::detail::accumulate(
+      _table.begin(),
+      _table.end(),
+      _seed,
+      [row_index, nulls = _check_nulls] __device__(auto hash, auto column) {
+        return cudf::type_dispatcher(
+          column.type(), element_hasher_adapter{}, column, row_index, nulls, hash);
+      });
+  }
+
+  /**
+   * @brief Computes the hash value of an element in the given column.
+   */
+  class element_hasher_adapter {
+   public:
+    template <typename T, CUDF_ENABLE_IF(cudf::column_device_view::has_element_accessor<T>())>
+    __device__ hash_value_type operator()(cudf::column_device_view const& col,
+                                          cudf::size_type row_index,
+                                          Nullate const _check_nulls,
+                                          hash_value_type const _seed) const noexcept
+    {
+      if (_check_nulls && col.is_null(row_index)) { return _seed; }
+      auto const hasher = XXHash_64<T>{_seed};
+      return hasher(col.element<T>(row_index));
+    }
+
+    template <typename T, CUDF_ENABLE_IF(not cudf::column_device_view::has_element_accessor<T>())>
+    __device__ hash_value_type operator()(cudf::column_device_view const&,
+                                          cudf::size_type,
+                                          Nullate const,
+                                          hash_value_type const) const noexcept
+    {
+      CUDF_UNREACHABLE("Unsupported type for xxhash64");
+    }
+  };
+
+  Nullate const _check_nulls;
+  cudf::table_device_view const _table;
+  hash_value_type const _seed;
+};
+
+}  // namespace
+
+std::unique_ptr<cudf::column> xxhash64(cudf::table_view const& input,
+                                       int64_t _seed,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::mr::device_memory_resource* mr)
+{
+  hash_value_type seed = static_cast<hash_value_type>(_seed);
+
+  auto output = cudf::make_numeric_column(cudf::data_type(cudf::type_to_id<hash_value_type>()),
+                                          input.num_rows(),
+                                          cudf::mask_state::UNALLOCATED,
+                                          stream,
+                                          mr);
+
+  // Return early if there's nothing to hash
+  if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }
+
+  bool const nullable   = has_nulls(input);
+  auto const input_view = cudf::table_device_view::create(input, stream);
+  auto output_view      = output->mutable_view();
+
+  // Compute the hash value for each row
+  thrust::tabulate(rmm::exec_policy(stream),
+                   output_view.begin<hash_value_type>(),
+                   output_view.end<hash_value_type>(),
+                   device_row_hasher(nullable, *input_view, seed));
+
+  return output;
+}
+
+}  // namespace spark_rapids_jni
diff --git a/src/main/cpp/tests/hash.cpp b/src/main/cpp/tests/hash.cpp
index 8ac8f2862e..5f6a71ab29 100644
--- a/src/main/cpp/tests/hash.cpp
+++ b/src/main/cpp/tests/hash.cpp
@@ -115,17 +115,30 @@ TEST_F(HashTest, MultiValueNulls)
   auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1, secs_col1});
   auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2, secs_col2});
 
-  auto const output1 = cudf::hash(input1);
-  auto const output2 = cudf::hash(input2);
-
-  EXPECT_EQ(input1.num_rows(), output1->size());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());
-
-  auto const spark_output1 = spark_rapids_jni::murmur_hash3_32(input1, 0);
-  auto const spark_output2 = spark_rapids_jni::murmur_hash3_32(input2);
-
-  EXPECT_EQ(input1.num_rows(), spark_output1->size());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(spark_output1->view(), spark_output2->view());
+  {
+    auto const output1 = cudf::hash(input1);
+    auto const output2 = cudf::hash(input2);
+
+    EXPECT_EQ(input1.num_rows(), output1->size());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());
+  }
+
+  {
+    auto const output1 = spark_rapids_jni::murmur_hash3_32(input1, 0);
+    auto const output2 = spark_rapids_jni::murmur_hash3_32(input2);
+
+    EXPECT_EQ(input1.num_rows(), output1->size());
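+    // The explicit seed 0 above must match the default-seed overload.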
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());
+  }
+
+  {
+    auto const output1 =
+      spark_rapids_jni::xxhash64(input1, spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+    auto const output2 = spark_rapids_jni::xxhash64(input2);
+
+    EXPECT_EQ(input1.num_rows(), output1->size());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());
+  }
 }
 
 TEST_F(HashTest, BasicList)
@@ -354,17 +367,30 @@ TYPED_TEST(HashTestTyped, Equality)
   auto const input = cudf::table_view({col});
 
   // Hash of same input should be equal
-  auto const output1 = cudf::hash(input);
-  auto const output2 = cudf::hash(input);
-  EXPECT_EQ(input.num_rows(), output1->size());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());
+  {
+    auto const output1 = cudf::hash(input);
+    auto const output2 = cudf::hash(input);
+
+    EXPECT_EQ(input.num_rows(), output1->size());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());
+  }
 
-  auto const spark_output1 = spark_rapids_jni::murmur_hash3_32(input, 0);
-  auto const spark_output2 = spark_rapids_jni::murmur_hash3_32(input);
+  {
+    auto const output1 = spark_rapids_jni::murmur_hash3_32(input, 0);
+    auto const output2 = spark_rapids_jni::murmur_hash3_32(input);
 
-  EXPECT_EQ(input.num_rows(), spark_output1->size());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(spark_output1->view(), spark_output2->view());
+    EXPECT_EQ(input.num_rows(), output1->size());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());
+  }
+
+  {
+    auto const output1 = spark_rapids_jni::xxhash64(input, spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+    auto const output2 = spark_rapids_jni::xxhash64(input);
+
+    EXPECT_EQ(input.num_rows(), output1->size());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());
+  }
 }
 
 TYPED_TEST(HashTestTyped, EqualityNulls)
@@ -378,17 +404,30 @@ TYPED_TEST(HashTestTyped, EqualityNulls)
   auto const input1 = cudf::table_view({col1});
   auto const input2 = cudf::table_view({col2});
 
-  auto const output1 = cudf::hash(input1);
-  auto const output2 = cudf::hash(input2);
+  {
+    auto const output1 = cudf::hash(input1);
+    auto const output2 = cudf::hash(input2);
 
-  EXPECT_EQ(input1.num_rows(), output1->size());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());
+    EXPECT_EQ(input1.num_rows(), output1->size());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());
+  }
+
+  {
+    auto const output1 = spark_rapids_jni::murmur_hash3_32(input1, 0);
+    auto const output2 = spark_rapids_jni::murmur_hash3_32(input2);
+
+    EXPECT_EQ(input1.num_rows(), output1->size());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());
+  }
 
-  auto const spark_output1 = spark_rapids_jni::murmur_hash3_32(input1, 0);
-  auto const spark_output2 = spark_rapids_jni::murmur_hash3_32(input2);
+  {
+    auto const output1 =
+      spark_rapids_jni::xxhash64(input1, spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+    auto const output2 = spark_rapids_jni::xxhash64(input2);
 
-  EXPECT_EQ(input1.num_rows(), spark_output1->size());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(spark_output1->view(), spark_output2->view());
+    EXPECT_EQ(input1.num_rows(), output1->size());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(output1->view(), output2->view());
+  }
 }
 
 template <typename T>
@@ -423,10 +462,20 @@ TYPED_TEST(HashTestFloatTyped, TestExtremes)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_col, *hash_col_neg_nan, verbosity);
 
   // Spark hash is sensitive to 0 and -0
-  auto const spark_col = spark_rapids_jni::murmur_hash3_32(table_col, 0);
-  auto const spark_col_neg_nan =
spark_rapids_jni::murmur_hash3_32(table_col_neg_nan); + { + auto const spark_col = spark_rapids_jni::murmur_hash3_32(table_col, 0); + auto const spark_col_neg_nan = spark_rapids_jni::murmur_hash3_32(table_col_neg_nan); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*spark_col, *spark_col_neg_nan); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*spark_col, *spark_col_neg_nan); + } + + { + auto const spark_col = + spark_rapids_jni::xxhash64(table_col, spark_rapids_jni::DEFAULT_XXHASH64_SEED); + auto const spark_col_neg_nan = spark_rapids_jni::xxhash64(table_col_neg_nan); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*spark_col, *spark_col_neg_nan); + } } class SparkMurmurHash3Test : public cudf::test::BaseFixture {}; @@ -843,3 +892,382 @@ TEST_F(SparkMurmurHash3Test, ListOfStructValues) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity); */ } + +class SparkXXHash64Test : public cudf::test::BaseFixture {}; + +TEST_F(SparkXXHash64Test, MultiValueWithSeeds) +{ + // The hash values were determined by running the following Scala code in Apache Spark. + // Note that Spark >= 3.2 normalizes the float/double value of -0. to +0. and both values hash + // to the same result. This is normalized in the calling code (Spark RAPIDS plugin) for Spark + // >= 3.2. However, the reference values for -0. below must be obtained with Spark < 3.2 and + // libcudf will continue to implement the Spark < 3.2 behavior until Spark >= 3.2 is required and + // the workaround in the calling code is removed. This also affects the combined hash values. + + /* + import org.apache.spark.sql.functions._ + import org.apache.spark.sql.types._ + import org.apache.spark.sql.Row + import org.apache.spark.sql.catalyst.util.DateTimeUtils + + val schema = new StructType() + .add("strings", StringType) + .add("doubles", DoubleType) + .add("timestamps", TimestampType) + .add("decimal64", DecimalType(18, 7)) + .add("longs", LongType) + .add("floats", FloatType) + .add("dates", DateType) + .add("decimal32", DecimalType(9, 3)) + .add("ints", IntegerType) + .add("shorts", ShortType) + .add("bytes", ByteType) + .add("bools", BooleanType) + .add("decimal128", DecimalType(38, 11)) + + val data = Seq( + Row("", 0.toDouble, + DateTimeUtils.toJavaTimestamp(0), BigDecimal(0), 0.toLong, 0.toFloat, + DateTimeUtils.toJavaDate(0), BigDecimal(0), 0, 0.toShort, 0.toByte, + false, BigDecimal(0)), + Row("The quick brown fox", -(0.toDouble), + DateTimeUtils.toJavaTimestamp(100), BigDecimal("0.00001"), 100.toLong, -(0.toFloat), + DateTimeUtils.toJavaDate(100), BigDecimal("0.1"), 100, 100.toShort, 100.toByte, + true, BigDecimal("0.000000001")), + Row("jumps over the lazy dog.", -Double.NaN, + DateTimeUtils.toJavaTimestamp(-100), BigDecimal("-0.00001"), -100.toLong, -Float.NaN, + DateTimeUtils.toJavaDate(-100), BigDecimal("-0.1"), -100, -100.toShort, -100.toByte, + true, BigDecimal("-0.00000000001")), + Row("All work and no play makes Jack a dull boy", Double.MinValue, + DateTimeUtils.toJavaTimestamp(Long.MinValue/1000000), BigDecimal("-99999999999.9999999"), + Long.MinValue, Float.MinValue, DateTimeUtils.toJavaDate(Int.MinValue/100), + BigDecimal("-999999.999"), Int.MinValue, Short.MinValue, Byte.MinValue, true, + BigDecimal("-9999999999999999.99999999999")), + Row("!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721", Double.MaxValue, + DateTimeUtils.toJavaTimestamp(Long.MaxValue/1000000), BigDecimal("99999999999.9999999"), + Long.MaxValue, Float.MaxValue, DateTimeUtils.toJavaDate(Int.MaxValue/100), + BigDecimal("999999.999"), Int.MaxValue, Short.MaxValue, Byte.MaxValue, false, + 
BigDecimal("99999999999999999999999999.99999999999")), + Row(null, null, null, null, null, null, null, null, null, null, null, null, null, null), + Row("abcdefgh", 100.toDouble, + DateTimeUtils.toJavaTimestamp(200), BigDecimal("0.0000123"), 0x123456789abcdefL, + Float.PositiveInfinity, DateTimeUtils.toJavaDate(-200), BigDecimal("-0.2"), -200, -200.toShort, + -90.toByte, false, BigDecimal("-9999999999999999.99999999999")), Row("abcdefghi", 200.toDouble, + DateTimeUtils.toJavaTimestamp(300), BigDecimal("0.0000432"), -0x123456789abcdefL, + Float.NegativeInfinity, DateTimeUtils.toJavaDate(-300), BigDecimal("-0.3"), -300, -300.toShort, + -80.toByte, false, BigDecimal("99999999999999999999999999.99999999999"))) + + + val df = spark.createDataFrame(sc.parallelize(data), schema) + df.columns.foreach(c => println(s"$c => ${df.select(xxhash64(col(c))).collect.mkString(",")}")) + println(s"combined => ${df.select(xxhash64(col("*"))).collect.mkString(",")}") + */ + + // cudf::test::fixed_width_column_wrapper const hash_structs_expected( + //{-105406170, 90479889, -678041645, 1667387937, 301478567}); + cudf::test::fixed_width_column_wrapper const hash_strings_expected( + {-7444071767201028348, + -3617261401988713833, + 8198945020833482635, + -5346617152005100141, + 6614298085531227868, + spark_rapids_jni::DEFAULT_XXHASH64_SEED, + 2470326616177429180, + -7093207067522615973}); + cudf::test::fixed_width_column_wrapper const hash_doubles_expected( + {-5252525462095825812, + -5252525462095825812, + -3127944061524951246, + 9065082843545458248, + -4222314252576420879, + spark_rapids_jni::DEFAULT_XXHASH64_SEED, + -7996023612001835843, + -8838535416664833914}); + cudf::test::fixed_width_column_wrapper const hash_timestamps_expected( + {-5252525462095825812, + 8713583529807266080, + 5675770457807661948, + 7123048472642709644, + -5141505295506489983, + spark_rapids_jni::DEFAULT_XXHASH64_SEED, + -1244884446866925109, + 1772389229253425430}); + cudf::test::fixed_width_column_wrapper const hash_decimal64_expected( + {-5252525462095825812, + 8713583529807266080, + 5675770457807661948, + 4265531446127695490, + 2162198894918931945, + spark_rapids_jni::DEFAULT_XXHASH64_SEED, + -3178482946328430151, + 4788666723486520022}); + cudf::test::fixed_width_column_wrapper const hash_longs_expected( + {-5252525462095825812, + 8713583529807266080, + 5675770457807661948, + -8619748838626508300, + -3246596055638297850, + spark_rapids_jni::DEFAULT_XXHASH64_SEED, + 1941233597257011502, + -1318946533059658749}); + cudf::test::fixed_width_column_wrapper const hash_floats_expected( + {3614696996920510707, + 3614696996920510707, + 2692338816207849720, + -8545425418825163117, + -1065250890878313112, + spark_rapids_jni::DEFAULT_XXHASH64_SEED, + -5940311692336719973, + -7580553461823983095}); + cudf::test::fixed_width_column_wrapper const hash_dates_expected( + {3614696996920510707, + -7987742665087449293, + 8990748234399402673, + -8442426365007754391, + -1447590449373190349, + spark_rapids_jni::DEFAULT_XXHASH64_SEED, + -953008374380745918, + 2895908635257747121}); + cudf::test::fixed_width_column_wrapper const hash_decimal32_expected( + {-5252525462095825812, + 8713583529807266080, + 5675770457807661948, + 8670643431269007867, + 6810183316718625826, + spark_rapids_jni::DEFAULT_XXHASH64_SEED, + 7277994511003214036, + 6264187449999859617}); + cudf::test::fixed_width_column_wrapper const hash_ints_expected( + {3614696996920510707, + -7987742665087449293, + 8990748234399402673, + 2073849959933241805, + 1508894993788531228, + 
+     spark_rapids_jni::DEFAULT_XXHASH64_SEED,
+     -953008374380745918,
+     2895908635257747121});
+  cudf::test::fixed_width_column_wrapper<int64_t> const hash_shorts_expected(
+    {3614696996920510707,
+     -7987742665087449293,
+     8990748234399402673,
+     -904511417458573795,
+     8952525448871805501,
+     spark_rapids_jni::DEFAULT_XXHASH64_SEED,
+     -953008374380745918,
+     2895908635257747121});
+  cudf::test::fixed_width_column_wrapper<int64_t> const hash_bytes_expected(
+    {3614696996920510707,
+     -7987742665087449293,
+     8990748234399402673,
+     4160238337661960656,
+     8632298611707923906,
+     spark_rapids_jni::DEFAULT_XXHASH64_SEED,
+     -4008061843281999337,
+     6690883199412647955});
+  cudf::test::fixed_width_column_wrapper<int64_t> const hash_bools_expected(
+    {3614696996920510707,
+     -6698625589789238999,
+     -6698625589789238999,
+     -6698625589789238999,
+     3614696996920510707,
+     spark_rapids_jni::DEFAULT_XXHASH64_SEED,
+     3614696996920510707,
+     3614696996920510707});
+  cudf::test::fixed_width_column_wrapper<int64_t> const hash_decimal128_expected(
+    {-8959994473701255385,
+     4409375254388155230,
+     -4006032525457443936,
+     -5423362182451591024,
+     7041733194569950081,
+     spark_rapids_jni::DEFAULT_XXHASH64_SEED,
+     -5423362182451591024,
+     7041733194569950081});
+  cudf::test::fixed_width_column_wrapper<int64_t> const hash_combined_expected(
+    {541735645035655239,
+     9011982951766246298,
+     3834379147931449211,
+     -5406325166887725795,
+     7797509897614041972,
+     spark_rapids_jni::DEFAULT_XXHASH64_SEED,
+     -9032872913521304524,
+     -604070008711895908});
+
+  using double_limits = std::numeric_limits<double>;
+  using long_limits   = std::numeric_limits<int64_t>;
+  using float_limits  = std::numeric_limits<float>;
+  using int_limits    = std::numeric_limits<int32_t>;
+
+  cudf::test::strings_column_wrapper const strings_col(
+    {"",
+     "The quick brown fox",
+     "jumps over the lazy dog.",
+     "All work and no play makes Jack a dull boy",
+     "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721",
+     "",
+     "abcdefgh",
+     "abcdefghi"},
+    {1, 1, 1, 1, 1, 0, 1, 1});
+  cudf::test::fixed_width_column_wrapper<double> const doubles_col(
+    {0., -0., -double_limits::quiet_NaN(), double_limits::lowest(), double_limits::max(), 0., 100.,
+     200.},
+    {1, 1, 1, 1, 1, 0, 1, 1});
+  cudf::test::fixed_width_column_wrapper<cudf::timestamp_us, cudf::timestamp_us::rep> const
+    timestamps_col(
+      {0L, 100L, -100L, long_limits::min() / 1000000, long_limits::max() / 1000000, 0L, 200L, 300L},
+      {1, 1, 1, 1, 1, 0, 1, 1});
+  cudf::test::fixed_point_column_wrapper<int64_t> const decimal64_col(
+    {0L, 100L, -100L, -999999999999999999L, 999999999999999999L, 0L, 123L, 432L},
+    {1, 1, 1, 1, 1, 0, 1, 1},
+    numeric::scale_type{-7});
+  cudf::test::fixed_width_column_wrapper<int64_t> const longs_col(
+    {0L, 100L, -100L, long_limits::min(), long_limits::max(), 0L, 0x123456789abcdefL,
+     -0x123456789abcdefL},
+    {1, 1, 1, 1, 1, 0, 1, 1});
+  cudf::test::fixed_width_column_wrapper<float> const floats_col(
+    {0.f, -0.f, -float_limits::quiet_NaN(), float_limits::lowest(), float_limits::max(), 0.f,
+     float_limits::infinity(), -float_limits::infinity()},
+    {1, 1, 1, 1, 1, 0, 1, 1});
+  cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep> dates_col(
+    {0, 100, -100, int_limits::min() / 100, int_limits::max() / 100, 0, -200, -300},
+    {1, 1, 1, 1, 1, 0, 1, 1});
+  cudf::test::fixed_point_column_wrapper<int32_t> const decimal32_col(
+    {0, 100, -100, -999999999, 999999999, 0, -200, -300},
+    {1, 1, 1, 1, 1, 0, 1, 1},
+    numeric::scale_type{-3});
+  cudf::test::fixed_width_column_wrapper<int32_t> const ints_col(
+    {0, 100, -100, int_limits::min(), int_limits::max(), 0, -200, -300}, {1, 1, 1, 1, 1, 0, 1, 1});
+  cudf::test::fixed_width_column_wrapper<int16_t> const shorts_col(
+    {0, 100, -100, -32768, 32767, 0, -200, -300}, {1, 1, 1, 1, 1, 0, 1, 1});
+  cudf::test::fixed_width_column_wrapper<int8_t> const bytes_col(
+    {0, 100, -100, -128, 127, 0, -90, -80}, {1, 1, 1, 1, 1, 0, 1, 1});
+  cudf::test::fixed_width_column_wrapper<bool> const bools_col1({0, 1, 1, 1, 0, 0, 0, 0},
+                                                                {1, 1, 1, 1, 1, 0, 1, 1});
+  cudf::test::fixed_width_column_wrapper<bool> const bools_col2({0, 1, 2, 255, 0, 0, 0, 0},
+                                                                {1, 1, 1, 1, 1, 0, 1, 1});
+  cudf::test::fixed_point_column_wrapper<__int128_t> const decimal128_col(
+    {
+      static_cast<__int128>(0),
+      static_cast<__int128>(100),
+      static_cast<__int128>(-1),
+      (static_cast<__int128>(0xFFFF'FFFF'FCC4'D1C3u) << 64 | 0x602F'7FC3'1800'0001u),
+      (static_cast<__int128>(0x0785'EE10'D5DA'46D9u) << 64 | 0x00F4'369F'FFFF'FFFFu),
+      0,
+      (static_cast<__int128>(0xFFFF'FFFF'FCC4'D1C3u) << 64 | 0x602F'7FC3'1800'0001u),
+      (static_cast<__int128>(0x0785'EE10'D5DA'46D9u) << 64 | 0x00F4'369F'FFFF'FFFFu),
+    },
+    {1, 1, 1, 1, 1, 0, 1, 1},
+    numeric::scale_type{-11});
+
+  // auto const hash_structs = spark_rapids_jni::xxhash64(cudf::table_view({structs_col}),
+  //   spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+  auto const hash_strings    = spark_rapids_jni::xxhash64(cudf::table_view({strings_col}),
+                                                          spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+  auto const hash_doubles    = spark_rapids_jni::xxhash64(cudf::table_view({doubles_col}),
+                                                          spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+  auto const hash_timestamps = spark_rapids_jni::xxhash64(cudf::table_view({timestamps_col}),
+                                                          spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+  auto const hash_decimal64  = spark_rapids_jni::xxhash64(cudf::table_view({decimal64_col}),
+                                                          spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+  auto const hash_longs      = spark_rapids_jni::xxhash64(cudf::table_view({longs_col}),
+                                                          spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+  auto const hash_floats     = spark_rapids_jni::xxhash64(cudf::table_view({floats_col}),
+                                                          spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+  auto const hash_dates      = spark_rapids_jni::xxhash64(cudf::table_view({dates_col}),
+                                                          spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+  auto const hash_decimal32  = spark_rapids_jni::xxhash64(cudf::table_view({decimal32_col}),
+                                                          spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+  auto const hash_ints       = spark_rapids_jni::xxhash64(cudf::table_view({ints_col}),
+                                                          spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+  auto const hash_shorts     = spark_rapids_jni::xxhash64(cudf::table_view({shorts_col}),
+                                                          spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+  auto const hash_bytes      = spark_rapids_jni::xxhash64(cudf::table_view({bytes_col}),
+                                                          spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+  auto const hash_bools1     = spark_rapids_jni::xxhash64(cudf::table_view({bools_col1}),
+                                                          spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+  auto const hash_bools2     = spark_rapids_jni::xxhash64(cudf::table_view({bools_col2}),
+                                                          spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+  auto const hash_decimal128 = spark_rapids_jni::xxhash64(cudf::table_view({decimal128_col}),
+                                                          spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+
+  // CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_structs, hash_structs_expected, verbosity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_strings, hash_strings_expected, verbosity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_doubles, hash_doubles_expected, verbosity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_timestamps, hash_timestamps_expected, verbosity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_decimal64, hash_decimal64_expected, verbosity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_longs, hash_longs_expected, verbosity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_floats, hash_floats_expected, verbosity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_dates, hash_dates_expected, verbosity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_decimal32, hash_decimal32_expected, verbosity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_ints, hash_ints_expected, verbosity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_shorts, hash_shorts_expected, verbosity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bytes, hash_bytes_expected, verbosity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools1, hash_bools_expected, verbosity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools2, hash_bools_expected, verbosity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_decimal128, hash_decimal128_expected, verbosity);
+
+  auto const combined_table = cudf::table_view({// structs_col,
+                                                strings_col,
+                                                doubles_col,
+                                                timestamps_col,
+                                                decimal64_col,
+                                                longs_col,
+                                                floats_col,
+                                                dates_col,
+                                                decimal32_col,
+                                                ints_col,
+                                                shorts_col,
+                                                bytes_col,
+                                                bools_col2,
+                                                decimal128_col});
+  auto const hash_combined =
+    spark_rapids_jni::xxhash64(combined_table, spark_rapids_jni::DEFAULT_XXHASH64_SEED);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_combined, hash_combined_expected, verbosity);
+}
+
+TEST_F(SparkXXHash64Test, Strings)
+{
+  // The hash values were determined by running the following Scala code in Apache Spark:
+  // val strs = Seq("",
+  //   null,
+  //   "The quick brown fox",
+  //   "jumps over the lazy dog.",
+  //   "All work and no play makes Jack a dull boy",
+  //   "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721").toDF()
+  // strs.selectExpr("xxhash64(value)")
+
+  cudf::test::fixed_width_column_wrapper<int64_t> const hash_strings_expected(
+    {-7444071767201028348,
+     spark_rapids_jni::DEFAULT_XXHASH64_SEED,
+     -3617261401988713833,
+     8198945020833482635,
+     -5346617152005100141,
+     6614298085531227868});
+
+  cudf::test::strings_column_wrapper const strings_col(
+    {"",
+     "",
+     "The quick brown fox",
+     "jumps over the lazy dog.",
+     "All work and no play makes Jack a dull boy",
+     "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721"},
+    {1, 0, 1, 1, 1, 1});
+
+  auto const hash_strings = spark_rapids_jni::xxhash64(cudf::table_view({strings_col}));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_strings, hash_strings_expected, verbosity);
+}
diff --git a/src/main/java/com/nvidia/spark/rapids/jni/Hash.java b/src/main/java/com/nvidia/spark/rapids/jni/Hash.java
index f182d68d1f..3059c5e785 100644
--- a/src/main/java/com/nvidia/spark/rapids/jni/Hash.java
+++ b/src/main/java/com/nvidia/spark/rapids/jni/Hash.java
@@ -16,9 +16,15 @@
 package com.nvidia.spark.rapids.jni;
 
-import ai.rapids.cudf.*;
+import ai.rapids.cudf.ColumnVector;
+import ai.rapids.cudf.ColumnView;
+import ai.rapids.cudf.CudfException;
+import ai.rapids.cudf.NativeDepsLoader;
 
 public class Hash {
+  // There doesn't appear to be a useful constant in Spark to reference; this could break.
+  static final long DEFAULT_XXHASH64_SEED = 42;
+
   static {
     NativeDepsLoader.loadNativeDeps();
   }
@@ -51,5 +57,35 @@ public static ColumnVector murmurHash32(ColumnView columns[]) {
     return murmurHash32(0, columns);
   }
 
+  /**
+   * Create a new vector containing the xxhash64 hash of each row in the table.
+   *
+   * @param seed integer seed for the xxhash64 hash function
+   * @param columns array of columns to hash, must have identical number of rows.
+   * @return the new ColumnVector of 64-bit values representing each row's hash value.
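+   * <p>
+   * A null value in a column does not update that row's running hash, so a row that
+   * is null in every column hashes to the seed itself.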
+   */
+  public static ColumnVector xxhash64(long seed, ColumnView columns[]) {
+    if (columns.length < 1) {
+      throw new IllegalArgumentException("xxhash64 hashing requires at least 1 column of input");
+    }
+    long[] columnViews = new long[columns.length];
+    long size = columns[0].getRowCount();
+
+    for (int i = 0; i < columns.length; i++) {
+      assert columns[i] != null : "Column vectors passed may not be null";
+      assert columns[i].getRowCount() == size : "Row count mismatch, all columns must be the same size";
+      assert !columns[i].getType().isDurationType() : "Unsupported column type Duration";
+      assert !columns[i].getType().isNestedType() : "Unsupported column type Nested";
+      columnViews[i] = columns[i].getNativeView();
+    }
+    return new ColumnVector(xxhash64(seed, columnViews));
+  }
+
+  public static ColumnVector xxhash64(ColumnView columns[]) {
+    return xxhash64(DEFAULT_XXHASH64_SEED, columns);
+  }
+
   private static native long murmurHash32(int seed, long[] viewHandles) throws CudfException;
+
+  private static native long xxhash64(long seed, long[] viewHandles) throws CudfException;
 }
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/HashTest.java b/src/test/java/com/nvidia/spark/rapids/jni/HashTest.java
index dda03affe2..a124b4849e 100644
--- a/src/test/java/com/nvidia/spark/rapids/jni/HashTest.java
+++ b/src/test/java/com/nvidia/spark/rapids/jni/HashTest.java
@@ -261,4 +261,131 @@ void testSpark32BitMurmur3HashListsAndNestedLists() {
       assertColumnsAreEqual(nestedExpected, nestedResult);
     }
   }
+
+  @Test
+  void testXXHash64Strings() {
+    try (ColumnVector v0 = ColumnVector.fromStrings(
+          "a", "B\nc", "dE\"\u0100\t\u0101 \ud720\ud721\\Fg2\'",
+          "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
+          "in the MD5 hash function. 
This string needed to be longer.A 60 character string to " + + "test MD5's message padding algorithm", + "hiJ\ud720\ud721\ud720\ud721", null); + ColumnVector result = Hash.xxhash64(new ColumnVector[]{v0}); + ColumnVector expected = ColumnVector.fromBoxedLongs(-8582455328737087284L, 2221214721321197934L, 5798966295358745941L, -4834097201550955483L, -3782648123388245694L, Hash.DEFAULT_XXHASH64_SEED)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testXXHash64Ints() { + try (ColumnVector v0 = ColumnVector.fromBoxedInts(0, 100, null, null, Integer.MIN_VALUE, null); + ColumnVector v1 = ColumnVector.fromBoxedInts(0, null, -100, null, null, Integer.MAX_VALUE); + ColumnVector result = Hash.xxhash64(new ColumnVector[]{v0, v1}); + ColumnVector expected = ColumnVector.fromBoxedLongs(1151812168208346021L, -7987742665087449293L, 8990748234399402673L, Hash.DEFAULT_XXHASH64_SEED, 2073849959933241805L, 1508894993788531228L)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testXXHash64Doubles() { + try (ColumnVector v = ColumnVector.fromBoxedDoubles( + 0.0, null, 100.0, -100.0, Double.MIN_NORMAL, Double.MAX_VALUE, + POSITIVE_DOUBLE_NAN_UPPER_RANGE, POSITIVE_DOUBLE_NAN_LOWER_RANGE, + NEGATIVE_DOUBLE_NAN_UPPER_RANGE, NEGATIVE_DOUBLE_NAN_LOWER_RANGE, + Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY); + ColumnVector result = Hash.xxhash64(new ColumnVector[]{v}); + ColumnVector expected = ColumnVector.fromBoxedLongs(-5252525462095825812L, Hash.DEFAULT_XXHASH64_SEED, -7996023612001835843L, 5695175288042369293L, 6181148431538304986L, -4222314252576420879L, -3127944061524951246L, -3127944061524951246L, -3127944061524951246L, -3127944061524951246L, 5810986238603807492L, 5326262080505358431L)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testXXHash64Timestamps() { + // The hash values were derived from Apache Spark in a manner similar to the one documented at + // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307 + try (ColumnVector v = ColumnVector.timestampMicroSecondsFromBoxedLongs( + 0L, null, 100L, -100L, 0x123456789abcdefL, null, -0x123456789abcdefL); + ColumnVector result = Hash.xxhash64(new ColumnVector[]{v}); + ColumnVector expected = ColumnVector.fromBoxedLongs(-5252525462095825812L, Hash.DEFAULT_XXHASH64_SEED, 8713583529807266080L, 5675770457807661948L, 1941233597257011502L, Hash.DEFAULT_XXHASH64_SEED, -1318946533059658749L)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testXXHash64Decimal64() { + // The hash values were derived from Apache Spark in a manner similar to the one documented at + // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307 + try (ColumnVector v = ColumnVector.decimalFromLongs(-7, + 0L, 100L, -100L, 0x123456789abcdefL, -0x123456789abcdefL); + ColumnVector result = Hash.xxhash64(new ColumnVector[]{v}); + ColumnVector expected = ColumnVector.fromBoxedLongs(-5252525462095825812L, 8713583529807266080L, 5675770457807661948L, 1941233597257011502L, -1318946533059658749L)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testXXHash64Decimal32() { + // The hash values were derived from Apache Spark in a manner similar to the one documented at + // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307 + try (ColumnVector v = ColumnVector.decimalFromInts(-3, + 0, 100, -100, 0x12345678, -0x12345678); + ColumnVector result = Hash.xxhash64(new 
ColumnVector[]{v}); + ColumnVector expected = ColumnVector.fromBoxedLongs(-5252525462095825812L, 8713583529807266080L, 5675770457807661948L, -7728554078125612835L, 3142315292375031143L)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testXXHash64Dates() { + // The hash values were derived from Apache Spark in a manner similar to the one documented at + // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307 + try (ColumnVector v = ColumnVector.timestampDaysFromBoxedInts( + 0, null, 100, -100, 0x12345678, null, -0x12345678); + ColumnVector result = Hash.xxhash64(new ColumnVector[]{v}); + ColumnVector expected = ColumnVector.fromBoxedLongs(3614696996920510707L, Hash.DEFAULT_XXHASH64_SEED, -7987742665087449293L, 8990748234399402673L, 6954428822481665164L, Hash.DEFAULT_XXHASH64_SEED, -4294222333805341278L)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testXXHash64Floats() { + try (ColumnVector v = ColumnVector.fromBoxedFloats( + 0f, 100f, -100f, Float.MIN_NORMAL, Float.MAX_VALUE, null, + POSITIVE_FLOAT_NAN_LOWER_RANGE, POSITIVE_FLOAT_NAN_UPPER_RANGE, + NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, + Float.POSITIVE_INFINITY, Float.NEGATIVE_INFINITY); + ColumnVector result = Hash.xxhash64(new ColumnVector[]{v}); + ColumnVector expected = ColumnVector.fromBoxedLongs(3614696996920510707L, -8232251799677946044L, -6625719127870404449L, -6699704595004115126L, -1065250890878313112L, Hash.DEFAULT_XXHASH64_SEED, 2692338816207849720L, 2692338816207849720L, 2692338816207849720L, 2692338816207849720L, -5940311692336719973L, -7580553461823983095L)){ + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testXXHash64Bools() { + try (ColumnVector v0 = ColumnVector.fromBoxedBooleans(null, true, false, true, null, false); + ColumnVector v1 = ColumnVector.fromBoxedBooleans(null, true, false, null, false, true); + ColumnVector result = Hash.xxhash64(new ColumnVector[]{v0, v1}); + ColumnVector expected = ColumnVector.fromBoxedLongs(Hash.DEFAULT_XXHASH64_SEED, 9083826852238114423L, 1151812168208346021L, -6698625589789238999L, 3614696996920510707L, 7945966957015589024L)) { + assertColumnsAreEqual(expected, result); + } + } + + @Test + void testXXHash64Mixed() { + try (ColumnVector strings = ColumnVector.fromStrings( + "a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721", + "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " + + "in the MD5 hash function. This string needed to be longer.", + null, null); + ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null); + ColumnVector doubles = ColumnVector.fromBoxedDoubles( + 0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null); + ColumnVector floats = ColumnVector.fromBoxedFloats( + 0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null); + ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null); + ColumnVector result = Hash.xxhash64(new ColumnVector[]{strings, integers, doubles, floats, bools}); + ColumnVector expected = ColumnVector.fromBoxedLongs(7451748878409563026L, 6024043102550151964L, 3380664624738534402L, 8444697026100086329L, -5888679192448042852L, Hash.DEFAULT_XXHASH64_SEED)) { + assertColumnsAreEqual(expected, result); + } + } }
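For reviewers, a minimal usage sketch of the new Java API, based on the Javadoc and tests in this diff; the XXHash64Usage class and its values are illustrative only, not part of the change:

import ai.rapids.cudf.ColumnVector;
import com.nvidia.spark.rapids.jni.Hash;

class XXHash64Usage {
  static void example() {
    // Two columns with the same row count; the row hash chains across columns.
    try (ColumnVector ints = ColumnVector.fromBoxedInts(0, 100, null);
         ColumnVector strs = ColumnVector.fromStrings("a", "b", "c");
         // Explicit seed; 42 matches the DEFAULT_XXHASH64_SEED used by the no-seed overload.
         ColumnVector hashes = Hash.xxhash64(42, new ColumnVector[]{ints, strs})) {
      // `hashes` is an INT64 column with one Spark-compatible xxhash64 value per row;
      // a null cell leaves that row's running hash unchanged.
      assert hashes.getRowCount() == 3;
    }
  }
}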