diff --git a/.github/workflows/pre-commit-checks.yml b/.github/workflows/pre-commit-checks.yml index d7d4d9c15..60f871646 100644 --- a/.github/workflows/pre-commit-checks.yml +++ b/.github/workflows/pre-commit-checks.yml @@ -58,6 +58,11 @@ jobs: - name: Generate Compile Database run: | make compile_db_all + - name: Generate JIT kernel bitcode header + # Only builds kernels.bc + kernels_bc.h, not the full project (~2s). + # Needed so clang-tidy can resolve #include "kernels_bc.h" in PrebuiltIR.cpp. + run: | + cmake --build _build/Release --target jit_kernels_bc - name: Run pre-commit hooks run: | pre-commit run --all-files diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ce73064c6..f19a7b488 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -85,7 +85,7 @@ repos: PR base / merge queue base stages: [pre-commit] entry: ./scripts/run-clang-tidy.py - args: [--diff, auto] + args: [--diff, auto, --exclude, "bolt/jit/kernels/"] language: python pass_filenames: false always_run: true diff --git a/bolt/jit/CMakeLists.txt b/bolt/jit/CMakeLists.txt index cade0033d..f8ac8bf9a 100644 --- a/bolt/jit/CMakeLists.txt +++ b/bolt/jit/CMakeLists.txt @@ -13,13 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. +add_subdirectory(kernels) + bolt_add_library( bolt_thrustjit CompiledModule.cpp ThrustJITv2.cpp + PrebuiltIR.cpp RowContainer/RowContainerCodeGenerator.cpp RowContainer/RowEqVectorsCodeGenerator.cpp ) +add_dependencies(bolt_thrustjit jit_kernels_bc) + +target_include_directories(bolt_thrustjit PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/kernels) + target_link_libraries(bolt_thrustjit PUBLIC llvm-core::llvm-core date::date fmt::fmt Folly::folly) target_compile_options( diff --git a/bolt/jit/PrebuiltIR.cpp b/bolt/jit/PrebuiltIR.cpp new file mode 100644 index 000000000..0270d8c73 --- /dev/null +++ b/bolt/jit/PrebuiltIR.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) ByteDance Ltd. 
and/or its affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef ENABLE_BOLT_JIT + +#include "bolt/jit/PrebuiltIR.h" + +#include +#include +#include +#include + +// kernels_bc.h is generated at build time by `xxd -i` from kernels.bc. +// The generated header is excluded from clang-tidy. +#include "kernels_bc.h" + +namespace bytedance::bolt::jit { + +void PrebuiltIR::linkInto(llvm::Module& target) { + auto buffer = llvm::MemoryBuffer::getMemBuffer( + llvm::StringRef( + reinterpret_cast(kernels_bc), kernels_bc_len), + "prebuilt_kernels", + /*RequiresNullTerminator=*/false); + + auto moduleOrErr = + llvm::parseBitcodeFile(buffer->getMemBufferRef(), target.getContext()); + if (!moduleOrErr) { + llvm::errs() << "[JIT] Failed to parse prebuilt bitcode: " + << llvm::toString(moduleOrErr.takeError()) << "\n"; + return; + } + + auto prebuilt = std::move(*moduleOrErr); + + llvm::SmallVector prebuiltNames; + for (auto& func : *prebuilt) { + if (!func.isDeclaration()) { + prebuiltNames.push_back(func.getName().str()); + } + } + + prebuilt->setDataLayout(target.getDataLayout()); + llvm::Linker::linkModules(target, std::move(prebuilt)); + + // Internalize pre-built functions so they don't get exported as + // global symbols into the JITDylib (avoiding "duplicate definition" + // errors). Internal functions are still callable within the same + // module — the JIT compiler resolves them during compilation.
+ for (auto& name : prebuiltNames) { + if (auto* fn = target.getFunction(name)) { + fn->setLinkage(llvm::GlobalValue::InternalLinkage); + } + } +} + +} // namespace bytedance::bolt::jit + +#endif // ENABLE_BOLT_JIT diff --git a/bolt/jit/PrebuiltIR.h b/bolt/jit/PrebuiltIR.h new file mode 100644 index 000000000..8f7cb5c43 --- /dev/null +++ b/bolt/jit/PrebuiltIR.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) ByteDance Ltd. and/or its affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#ifdef ENABLE_BOLT_JIT + +#include +#include + +namespace bytedance::bolt::jit { + +/// Pre-built JIT IR: C++ kernel functions compiled to LLVM bitcode at build +/// time (kernels.cpp → clang → .bc → xxd → embedded byte array). +/// +/// Usage with ThrustJITv2: +/// +/// auto irGenerator = [](llvm::Module& m) -> bool { +/// // 1. Link pre-built kernels into this module +/// PrebuiltIR::linkInto(m); +/// +/// // 2. Build outer function with IRBuilder, calling pre-built kernels +/// auto* kernel = m.getFunction("jit_store_i64"); +/// builder.CreateCall(kernel, {row, offset, decoded, index, ...}); +/// +/// // 3. Verify the generated function +/// return llvm::verifyFunction(*func, &llvm::errs()); +/// }; +/// +/// // ThrustJITv2 compiles the module. Its AlwaysInliner pass inlines +/// // the pre-built kernels into the outer function automatically. 
+/// auto mod = ThrustJITv2::getInstance()->CompileModule(irGenerator, name); +/// auto fn = mod->getFuncPtr(name); +/// +/// See kernels.cpp for the list of available pre-built kernels. +class PrebuiltIR { + public: + /// Load pre-built bitcode and link into target module. + /// All pre-built functions are internalized (InternalLinkage) so they + /// don't conflict across modules in the JITDylib. The AlwaysInliner + /// pass in ThrustJITv2's IR transform layer inlines them into callers. + static void linkInto(llvm::Module& target); +}; + +} // namespace bytedance::bolt::jit + +#endif // ENABLE_BOLT_JIT diff --git a/bolt/jit/ThrustJITv2.cpp b/bolt/jit/ThrustJITv2.cpp index 54d8e4a7f..f874d6acc 100644 --- a/bolt/jit/ThrustJITv2.cpp +++ b/bolt/jit/ThrustJITv2.cpp @@ -21,12 +21,22 @@ #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/Support/DynamicLibrary.h" #include "llvm/Support/TargetSelect.h" +#include "llvm/Transforms/IPO/AlwaysInliner.h" +#include "llvm/Transforms/InstCombine/InstCombine.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Utils.h" + +#include #include +#include #include #include +#include #include namespace bytedance::bolt::jit { @@ -81,6 +91,46 @@ llvm::Expected> ThrustJITv2::Create() { } result->jit_ = std::move(*jit); + + // Add IR optimization pass: inline alwaysinline functions (pre-built kernels) + // and run basic optimizations on the inlined code. 
+ result->jit_->getIRTransformLayer().setTransform( + [](llvm::orc::ThreadSafeModule TSM, + const llvm::orc::MaterializationResponsibility&) + -> llvm::Expected { + TSM.withModuleDo([](llvm::Module& M) { + // Module-level: inline alwaysinline functions + { + llvm::legacy::PassManager mpm; + mpm.add(llvm::createAlwaysInlinerLegacyPass()); + mpm.run(M); + } + + // Per-function: clean up inlined code + auto fpm = std::make_unique(&M); + fpm->add(llvm::createPromoteMemoryToRegisterPass()); + fpm->add(llvm::createInstructionCombiningPass()); + fpm->add(llvm::createGVNPass()); + fpm->add(llvm::createCFGSimplificationPass()); + fpm->doInitialization(); + for (auto& func : M) { + if (!func.isDeclaration()) { + fpm->run(func); + } + } + // Dump final IR for debugging (enable with --v=1) + if (VLOG_IS_ON(1)) { + std::string irStr; + llvm::raw_string_ostream os(irStr); + M.print(os, nullptr); + VLOG(1) << "[JIT] Final IR for module '" << M.getModuleIdentifier() + << "':\n" + << irStr; + } + }); + return std::move(TSM); + }); + return result; } @@ -117,6 +167,7 @@ CompiledModuleSP ThrustJITv2::CompileModule( compilingCv_.notify_all(); }; + auto compileStart = std::chrono::steady_clock::now(); auto llvmContext = std::make_unique(); auto llvmModule = std::make_unique(funcName, *llvmContext); llvmModule->setDataLayout(jit_->getDataLayout()); @@ -127,7 +178,7 @@ CompiledModuleSP ThrustJITv2::CompileModule( std::vector funcNames; for (auto& function : *llvmModule) { - if (!function.isDeclaration()) { + if (!function.isDeclaration() && !function.hasInternalLinkage()) { funcNames.emplace_back(function.getName().str()); } } @@ -207,6 +258,12 @@ CompiledModuleSP ThrustJITv2::CompileModule( } compilingCv_.notify_all(); + auto compileMs = std::chrono::duration_cast( + std::chrono::steady_clock::now() - compileStart) + .count(); + LOG(INFO) << "[JIT] Compiled '" << funcName << "' in " << compileMs + << " ms, code size: " << codeSize << " bytes"; + return compiledModule; } diff --git 
a/bolt/jit/kernels/CMakeLists.txt b/bolt/jit/kernels/CMakeLists.txt new file mode 100644 index 000000000..ca32eef0a --- /dev/null +++ b/bolt/jit/kernels/CMakeLists.txt @@ -0,0 +1,86 @@ +# +# Copyright (c) ByteDance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +message(STATUS "LLVM_TOOLS_BINARY_DIR = ${LLVM_TOOLS_BINARY_DIR}") +find_program(LLVM_CLANG clang PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH) +if(NOT LLVM_CLANG) + find_program(LLVM_CLANG clang REQUIRED) +endif() +message(STATUS "Using clang: ${LLVM_CLANG}") + +# Verify clang version matches LLVM to ensure bitcode compatibility. +execute_process( + COMMAND ${LLVM_CLANG} --version + OUTPUT_VARIABLE _CLANG_VERSION_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE +) +string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" _CLANG_VERSION "${_CLANG_VERSION_OUTPUT}") +if(LLVM_PACKAGE_VERSION AND _CLANG_VERSION AND + NOT _CLANG_VERSION VERSION_EQUAL LLVM_PACKAGE_VERSION) + message(WARNING + "Clang version (${_CLANG_VERSION}) does not match " + "LLVM version (${LLVM_PACKAGE_VERSION}). " + "Pre-built bitcode may have compatibility issues.") +endif() + +set(KERNEL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/kernels.cpp) +set(KERNEL_BC ${CMAKE_CURRENT_BINARY_DIR}/kernels.bc) +set(KERNEL_HEADER ${CMAKE_CURRENT_BINARY_DIR}/kernels_bc.h) + +# Generate kernel compile flags from CMake target properties. +# Add more targets if future kernels need additional headers. 
+set(KERNEL_DEP_TARGETS bolt_vector) +set(KERNEL_FLAGS_RSP ${CMAKE_CURRENT_BINARY_DIR}/kernel_flags.rsp) + +# Merge INCLUDE_DIRECTORIES and COMPILE_DEFINITIONS from all dep targets. +set(_INC "") +set(_DEF "") +foreach(_target ${KERNEL_DEP_TARGETS}) + list(APPEND _INC "$") + list(APPEND _DEF "$") +endforeach() + +# Deduplicate, add -I/-D prefixes, filter empty entries, join with newlines +set(_INC_DEDUP "$") +set(_INC_FLAGS "$") + +set(_DEF_DEDUP "$") +set(_DEF_CLEAN "$") +set(_DEF_FLAGS "$") + +file(GENERATE OUTPUT ${KERNEL_FLAGS_RSP} CONTENT + "$\n$\n-DNDEBUG\n" +) + +# Compile kernels.cpp to LLVM bitcode with bolt headers +add_custom_command( + OUTPUT ${KERNEL_BC} + COMMAND ${LLVM_CLANG} -emit-llvm -c -O2 -std=c++17 + @${KERNEL_FLAGS_RSP} + -o ${KERNEL_BC} ${KERNEL_SRC} + DEPENDS ${KERNEL_SRC} ${KERNEL_FLAGS_RSP} + COMMENT "Compiling JIT kernels to LLVM bitcode" +) + +# Embed bitcode as C byte array header +add_custom_command( + OUTPUT ${KERNEL_HEADER} + COMMAND xxd -i kernels.bc > ${KERNEL_HEADER} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${KERNEL_BC} + COMMENT "Embedding JIT kernel bitcode as C header" +) + +add_custom_target(jit_kernels_bc DEPENDS ${KERNEL_HEADER}) diff --git a/bolt/jit/kernels/kernels.cpp b/bolt/jit/kernels/kernels.cpp new file mode 100644 index 000000000..a36a89913 --- /dev/null +++ b/bolt/jit/kernels/kernels.cpp @@ -0,0 +1,124 @@ +/* + * Copyright (c) ByteDance Ltd. and/or its affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Pre-built JIT kernels — compiled to LLVM bitcode at build time. +// Compiled with bolt headers so kernels can use bolt types (DecodedVector, +// etc.) directly. Inline methods from bolt headers get compiled into the +// bitcode, avoiding virtual dispatch at JIT runtime. + +#include +#include +#include + +#include "bolt/type/HugeInt.h" +#include "bolt/type/Timestamp.h" +#include "bolt/vector/DecodedVector.h" + +using bytedance::bolt::DecodedVector; +using bytedance::bolt::HugeInt; +using bytedance::bolt::int128_t; +using bytedance::bolt::Timestamp; + +extern "C" { + +// PoC: simple add function to validate the prebuilt IR pipeline. +__attribute__((always_inline)) int8_t jit_prebuilt_add(int8_t a, int8_t b) { + return a + b; +} + +// ============================================================================ +// Store kernels — store values from DecodedVector into RowContainer rows. +// +// Fixed-width types: fully inlined (DecodedVector methods are inline). +// Variable-width types (StringView, complex): extern call to RowContainer +// (resolved at JIT link time via process symbol table). +// +// Sentinel values on null match RowContainer::storeWithNulls. 
+// ============================================================================ + +// --- Fixed-width arithmetic types (fully inlined) --- + +#define DEFINE_STORE_KERNEL(name, T, sentinel) \ + __attribute__((always_inline)) void name( \ + char* row, \ + int32_t offset, \ + const DecodedVector* decoded, \ + int32_t index, \ + int32_t nullByte, \ + int8_t nullMask) { \ + if (decoded->isNullAt(index)) { \ + row[nullByte] |= nullMask; \ + *reinterpret_cast(row + offset) = sentinel; \ + } else { \ + *reinterpret_cast(row + offset) = decoded->valueAt(index); \ + } \ + } + +DEFINE_STORE_KERNEL(jit_store_i8, int8_t, std::numeric_limits::max()) +DEFINE_STORE_KERNEL(jit_store_i16, int16_t, std::numeric_limits::max()) +DEFINE_STORE_KERNEL(jit_store_i32, int32_t, std::numeric_limits::max()) +DEFINE_STORE_KERNEL(jit_store_i64, int64_t, std::numeric_limits::max()) +DEFINE_STORE_KERNEL(jit_store_f32, float, std::numeric_limits::max()) +DEFINE_STORE_KERNEL(jit_store_f64, double, std::numeric_limits::max()) + +#undef DEFINE_STORE_KERNEL + +// --- HUGEINT (int128_t): memcpy-based, fully inlined --- + +__attribute__((always_inline)) void jit_store_i128( + char* row, + int32_t offset, + const DecodedVector* decoded, + int32_t index, + int32_t nullByte, + int8_t nullMask) { + if (decoded->isNullAt(index)) { + row[nullByte] |= nullMask; + memset(row + offset, 0, sizeof(int128_t)); + } else { + HugeInt::serialize(decoded->valueAt(index), row + offset); + } +} + +// --- TIMESTAMP: struct copy, fully inlined --- + +__attribute__((always_inline)) void jit_store_ts( + char* row, + int32_t offset, + const DecodedVector* decoded, + int32_t index, + int32_t nullByte, + int8_t nullMask) { + if (decoded->isNullAt(index)) { + row[nullByte] |= nullMask; + *reinterpret_cast(row + offset) = Timestamp(); + } else { + *reinterpret_cast(row + offset) = + decoded->valueAt(index); + } +} + +// --- VARCHAR and Complex types (ARRAY/MAP/ROW) --- +// VARCHAR and complex types (ARRAY/MAP/ROW) need 
RowContainer access +// (HashStringAllocator for VARCHAR, ContainerRowSerde for complex types). +// We can't include RowContainer.h here because clang (used for bitcode +// compilation) has incompatibilities with the deep header chain +// (DecimalUtil, folly/hash, type_traits — __int128 make_unsigned, +// ambiguous to_chars, etc.). These types fall back to an extern call +// to RowContainer::store(), resolved at JIT link time via the process +// symbol table. See jit_store_row_column in tests/PrebuiltIRTest.cpp. + +} // extern "C" diff --git a/bolt/jit/tests/CMakeLists.txt b/bolt/jit/tests/CMakeLists.txt index 1e2ed11bf..fca070350 100644 --- a/bolt/jit/tests/CMakeLists.txt +++ b/bolt/jit/tests/CMakeLists.txt @@ -18,6 +18,7 @@ if(${ENABLE_BOLT_JIT}) add_executable(bolt_thrustjit_test RowContainerIRTest.cpp ThrustJITv2Test.cpp + PrebuiltIRTest.cpp ) target_compile_features(bolt_thrustjit_test PUBLIC cxx_std_20) target_link_libraries( @@ -51,6 +52,7 @@ if(${ENABLE_BOLT_JIT}) Folly::follybenchmark ) add_test(bolt_row_vec_cmp_test bolt_row_vec_cmp_test) + endif() # TODO: refactor expression codegen diff --git a/bolt/jit/tests/PrebuiltIRTest.cpp b/bolt/jit/tests/PrebuiltIRTest.cpp new file mode 100644 index 000000000..c2907ed84 --- /dev/null +++ b/bolt/jit/tests/PrebuiltIRTest.cpp @@ -0,0 +1,259 @@ +/* + * Copyright (c) ByteDance Ltd. and/or its affiliates + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#ifdef ENABLE_BOLT_JIT + +#include +#include +#include +#include +#include + +#include "bolt/exec/RowContainer.h" +#include "bolt/exec/tests/utils/OperatorTestBase.h" +#include "bolt/jit/PrebuiltIR.h" +#include "bolt/jit/ThrustJITv2.h" +#include "bolt/vector/fuzzer/VectorFuzzer.h" + +extern "C" int jit_StringViewCompareWrapper(char* l, char* r); + +// Fallback store for VARCHAR/complex types — resolved at JIT link time. +extern "C" void jit_store_row_column( + void* rowContainer, + const bytedance::bolt::DecodedVector* decoded, + int32_t index, + char* row, + int32_t column) { + static_cast(rowContainer) + ->store(*decoded, index, row, column); +} + +namespace bytedance::bolt::jit::test { + +// Build an outer function via IRBuilder that calls jit_prebuilt_add. +// PrebuiltIR::linkInto handles linking and internalization. +// AlwaysInliner (in ThrustJITv2's IR transform) inlines the kernel. +TEST(PrebuiltIRTest, inlinedCall) { + // Force linker to import jit_StringViewCompareWrapper + int32_t sz1{0}, sz2{0}; + ::jit_StringViewCompareWrapper( + reinterpret_cast(&sz1), reinterpret_cast(&sz2)); + + auto* jit = ThrustJITv2::getInstance(); + ASSERT_NE(jit, nullptr); + + const std::string fnName = "prebuilt_inlined_add_test"; + + auto irGenerator = [jit, &fnName](llvm::Module& m) -> bool { + // Link pre-built functions into this module (inlining happens later) + PrebuiltIR::linkInto(m); + + // Build outer function: int8_t fnName(int8_t a, int8_t b) + auto& ctx = m.getContext(); + llvm::IRBuilder<> builder(ctx); + + auto* i8ty = builder.getInt8Ty(); + auto* funcTy = llvm::FunctionType::get(i8ty, {i8ty, i8ty}, false); + auto* func = llvm::Function::Create( + funcTy, llvm::Function::ExternalLinkage, fnName, m); + + func->getArg(0)->setName("a"); + func->getArg(1)->setName("b"); + + auto* bb = llvm::BasicBlock::Create(ctx, "entry", func); + builder.SetInsertPoint(bb); + + // Call pre-built function — linked and internalized by PrebuiltIR::linkInto, + // so its definition is available for us to call.
The inliner will + // inline this call when LLJIT optimizes the module. + auto* callee = m.getFunction("jit_prebuilt_add"); + if (!callee) { + llvm::errs() << "jit_prebuilt_add not found after linking\n"; + return true; + } + + auto* result = + builder.CreateCall(callee, {func->getArg(0), func->getArg(1)}); + builder.CreateRet(result); + + return llvm::verifyFunction(*func, &llvm::errs()); + }; + + auto mod = jit->CompileModule(irGenerator, fnName); + ASSERT_NE(mod, nullptr); + + using AddFunc = int8_t (*)(int8_t, int8_t); + auto fn = reinterpret_cast(mod->getFuncPtr(fnName)); + ASSERT_NE(fn, nullptr); + + EXPECT_EQ(fn(10, 20), 30); + EXPECT_EQ(fn(-50, 50), 0); + EXPECT_EQ(fn(1, -1), 0); + EXPECT_EQ(fn(63, 64), 127); +} + +// Store PoC: JIT-compiled store for fixed-width types (i8–f64, i128, ts). +// +// This is a PoC — only fixed-width types are fully inlined in bitcode. +// StringView and complex types (ARRAY/MAP/ROW) require RowContainer's +// HashStringAllocator and ContainerRowSerde, which pull in RowContainer.h. +// That header can't be compiled to bitcode with clang because of +// clang/g++ incompatibilities in the deep header chain (DecimalUtil's +// ambiguous to_chars, folly/hash's __int128 make_unsigned, etc.). +// These types fall back to an extern call (jit_store_row_column). +// +// When the project moves to clang as the host compiler, RowContainer.h +// will be clang-compatible and all types can be fully inlined. 
+class PrebuiltStoreTest : public exec::test::OperatorTestBase {}; + +TEST_F(PrebuiltStoreTest, storeKeys) { + int32_t sz1{0}, sz2{0}; + ::jit_StringViewCompareWrapper( + reinterpret_cast(&sz1), reinterpret_cast(&sz2)); + + using namespace bytedance::bolt; + using namespace bytedance::bolt::exec; + + auto pool = memory::memoryManager()->addLeafPool(); + std::vector keyTypes = {BIGINT(), DOUBLE(), INTEGER()}; + auto numKeys = keyTypes.size(); + + // Create two RowContainers with identical schema + auto rcExisting = std::make_shared(keyTypes, pool.get()); + auto rcJit = std::make_shared(keyTypes, pool.get()); + + // Generate test data + VectorFuzzer::Options opts; + opts.vectorSize = 100; + opts.nullRatio = 0.1; + VectorFuzzer fuzzer(opts, pool.get(), 42); + + std::vector> decoded; + std::vector decodedPtrs; + for (size_t i = 0; i < numKeys; ++i) { + auto vec = fuzzer.fuzzFlat(keyTypes[i]); + decoded.emplace_back(std::make_shared(*vec)); + decodedPtrs.push_back(decoded.back().get()); + } + + // Compose JIT store function + auto* jit = ThrustJITv2::getInstance(); + const std::string fnName = "prebuilt_store_test"; + + std::vector offsets, nullByteOffsets; + std::vector nullMasks; + for (size_t i = 0; i < numKeys; ++i) { + auto col = rcJit->columnAt(i); + offsets.push_back(col.offset()); + nullByteOffsets.push_back(col.nullByte()); + nullMasks.push_back(col.nullMask()); + } + + auto irGenerator = [&](llvm::Module& m) -> bool { + PrebuiltIR::linkInto(m); + + auto& ctx = m.getContext(); + llvm::IRBuilder<> builder(ctx); + auto* ptrTy = builder.getPtrTy(); + auto* i32Ty = builder.getInt32Ty(); + auto* voidTy = builder.getVoidTy(); + + // void store_keys(void* rc, char* row, DecodedVector** cols, int32_t idx) + auto* funcTy = + llvm::FunctionType::get(voidTy, {ptrTy, ptrTy, ptrTy, i32Ty}, false); + auto* func = llvm::Function::Create( + funcTy, llvm::Function::ExternalLinkage, fnName, m); + + auto* rc = func->getArg(0); + auto* row = func->getArg(1); + auto* cols = 
func->getArg(2); + auto* index = func->getArg(3); + + auto* entry = llvm::BasicBlock::Create(ctx, "entry", func); + builder.SetInsertPoint(entry); + + // Map type to kernel name + auto kernelName = [](TypeKind kind) -> std::string { + switch (kind) { + case TypeKind::INTEGER: + return "jit_store_i32"; + case TypeKind::BIGINT: + return "jit_store_i64"; + case TypeKind::DOUBLE: + return "jit_store_f64"; + default: + return ""; + } + }; + + for (size_t i = 0; i < numKeys; ++i) { + auto* colPtr = builder.CreateGEP(ptrTy, cols, builder.getInt32(i)); + auto* dec = builder.CreateLoad(ptrTy, colPtr); + + auto name = kernelName(keyTypes[i]->kind()); + auto* kernel = m.getFunction(name); + EXPECT_NE(kernel, nullptr) << "kernel not found: " << name; + if (!kernel) + return true; + + builder.CreateCall( + kernel, + {row, + builder.getInt32(offsets[i]), + dec, + index, + builder.getInt32(nullByteOffsets[i]), + builder.getInt8(nullMasks[i])}); + } + + builder.CreateRetVoid(); + return llvm::verifyFunction(*func, &llvm::errs()); + }; + + auto mod = jit->CompileModule(irGenerator, fnName); + ASSERT_NE(mod, nullptr); + + using StoreKeysFunc = void (*)(void*, char*, const DecodedVector**, int32_t); + auto storeFn = reinterpret_cast(mod->getFuncPtr(fnName)); + ASSERT_NE(storeFn, nullptr); + + // Store rows using both methods + auto numRows = opts.vectorSize; + std::vector existingRows(numRows), jitRows(numRows); + for (int i = 0; i < numRows; ++i) { + existingRows[i] = rcExisting->newRow(); + jitRows[i] = rcJit->newRow(); + + // Existing: per-column store + for (size_t col = 0; col < numKeys; ++col) { + rcExisting->store(*decoded[col], i, existingRows[i], col); + } + + // JIT: all columns in one call + storeFn(rcJit.get(), jitRows[i], decodedPtrs.data(), i); + } + + // Correctness: both should produce identical row bytes + for (int i = 0; i < numRows; ++i) { + auto size = rcExisting->fixedRowSize(); + ASSERT_EQ(memcmp(existingRows[i], jitRows[i], size), 0) + << "Row " << i << " 
differs"; + } +} + +} // namespace bytedance::bolt::jit::test + +#endif // ENABLE_BOLT_JIT