From 3b47d41eaf93b51e0b4d655d08dab17e46faeccd Mon Sep 17 00:00:00 2001 From: Zac Blanco Date: Tue, 31 Mar 2026 20:48:51 +0000 Subject: [PATCH 1/6] build: support compiling with clang 19 --- .devcontainer/clang/Dockerfile.clang | 140 ++++++++++++++++++ .devcontainer/clang/devcontainer.json | 37 +++++ CMakeLists.txt | 6 + bolt/duckdb/conversion/DuckParser.cpp | 122 +++++++++++++++ bolt/dwio/dwrf/test/ColumnWriterTest.cpp | 116 +++++++++++---- bolt/parse/DuckLogicalOperator.h | 20 ++- bolt/parse/QueryPlanner.cpp | 81 +++++++++- conanfile.py | 7 +- .../patches/arrow.15.0.1-csv-support.patch | 2 +- scripts/run-clang-tidy.py | 18 ++- 10 files changed, 494 insertions(+), 55 deletions(-) create mode 100644 .devcontainer/clang/Dockerfile.clang create mode 100644 .devcontainer/clang/devcontainer.json diff --git a/.devcontainer/clang/Dockerfile.clang b/.devcontainer/clang/Dockerfile.clang new file mode 100644 index 000000000..8287b0ed2 --- /dev/null +++ b/.devcontainer/clang/Dockerfile.clang @@ -0,0 +1,140 @@ +FROM debian:bookworm + +ENV CMAKE_VERSION=3.31.8 +ENV MOLD_VERSION=2.40.4 + +ARG DEB_REGION="" +ARG https_proxy="" +ARG no_proxy="" + +# check if deb region is not empty string. If it is, replace the default debian repository with the one for the region. +RUN if [ "$DEB_REGION" != "" ]; then \ + sed -i "s|http://deb.debian.org/debian|http://ftp.${DEB_REGION}.debian.org/debian|g" /etc/apt/sources.list.d/debian.sources; \ +fi + +RUN apt-get update && \ + apt-get install -y \ + curl \ + tar + +RUN arch=$(arch) && \ + https_proxy=${https_proxy} \ + curl -L -# -o /tmp/cmake.tar.gz https://github.com/Kitware/CMake/releases/download/v$CMAKE_VERSION/cmake-$CMAKE_VERSION-linux-${arch}.tar.gz && \ + tar -xzvf /tmp/cmake.tar.gz -C /opt && \ + ln -s /opt/cmake-$CMAKE_VERSION-linux-${arch}/bin/cmake /usr/local/bin/cmake && \ + ln -s /opt/cmake-$CMAKE_VERSION-linux-${arch}/bin/ctest /usr/local/bin/ctest && \ + rm /tmp/cmake.tar.gz + +RUN apt-get update && apt-get install -y \ + sudo \ + bash \ + vim-tiny \ + vim \ + sudo \ + git \ + gdb \ + python3 \ + python3-pip \ + python3-virtualenv \ + ssh-client \ + ninja-build \ + bison \ + maven \ + net-tools \ + telnet \ + ssh \ + rsync \ + openssh-server \ + lld \ + ccache \ + binutils-dev \ + make \ + automake \ + autoconf \ + htop \ + less \ + man \ + nodejs \ + npm \ + bsdextrautils \ + locales \ + clang-19 \ + clang++-19 \ + && update-alternatives --install /usr/bin/clang clang /usr/bin/clang-19 100 \ + && update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-19 100 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN ln -sf /bin/bash /bin/sh + +RUN https_proxy=${https_proxy} curl -L -O --output-dir /tmp https://github.com/rui314/mold/releases/download/v$MOLD_VERSION/mold-$MOLD_VERSION-$(arch)-linux.tar.gz +RUN cd /tmp && \ + sudo tar -C /usr/local --strip-components=1 -xzf mold-$MOLD_VERSION-$(arch)-linux.tar.gz + +# Add mold's ld as the first on the path +env PATH=/usr/local/libexec/mold:$PATH + +# License header check tool +RUN https_proxy=${https_proxy} curl -L -# -o /tmp/skywalking-eyes-bin.tgz https://dlcdn.apache.org/skywalking/eyes/0.8.0/skywalking-license-eye-0.8.0-bin.tgz && \ + mkdir -p /opt/skywalking-license-eye && \ + sudo tar -C /opt/skywalking-license-eye --strip-components=1 -xzf /tmp/skywalking-eyes-bin.tgz + +ENV PATH="/opt/skywalking-license-eye/bin/linux:${PATH}" + +RUN export arch=$(arch | sed 's/^aarch64$/arm64/; s/^x86_64$/amd64/') && \ + export https_proxy=${https_proxy} && \ + export filename=go1.25.5.linux-${arch}.tar.gz && \ + curl -L -O --output-dir /tmp https://go.dev/dl/${filename} && \ + sudo rm -rf /usr/local/bin/go && tar -C /usr/local -xzf /tmp/${filename} && \ + rm /tmp/${filename} +ENV PATH="/usr/local/go/bin:${PATH}" + +# Set up default python virtualenv, and make it the default on PATH +RUN virtualenv -p python3 /opt/venv +ENV PATH="/opt/venv/bin:$PATH" +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt && \ + rm -f requirements.txt + +ARG USERNAME=code +ARG USER_UID=1000 +ARG USER_GID=$USER_UID + +RUN groupadd --gid $USER_GID $USERNAME \ + && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME \ + && echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME \ + && chmod 0440 /etc/sudoers.d/$USERNAME + +USER $USERNAME + +# Configure default conan profile to use the mold linker +RUN arch=$(uname -m) && mkdir -p ~/.conan2/profiles && cat > ~/.conan2/profiles/default <> ~/.bashrc && \ + echo "export LANG=en_US.UTF-8" >> ~/.bashrc && \ + echo "export LANGUAGE=en_US.UTF-8" >> ~/.bashrc + + +ENV SHELL=/bin/bash + +WORKDIR /workspace + +ENTRYPOINT ["/bin/bash", "-c"] diff --git a/.devcontainer/clang/devcontainer.json b/.devcontainer/clang/devcontainer.json new file mode 100644 index 000000000..7564beec3 --- /dev/null +++ b/.devcontainer/clang/devcontainer.json @@ -0,0 +1,37 @@ +{ + "name": "Bolt Clang Development Container", + "build": { + "dockerfile": "Dockerfile.clang", + "context": "../../.github/runners", + "args": { + "DEB_REGION": "", + "https_proxy": "", + "no_proxy": "" + } + }, + "containerEnv": { + "https_proxy": "", + "no_proxy": "", + "SHELL": "/bin/bash" + }, + "remoteUser": "code", + // Use 'forwardPorts' to make a list of ports inside the container available locally. + "forwardPorts": [], + // Use 'portsAttributes' to set default properties for specific forwarded ports. + "portsAttributes": {}, + // Use 'remoteEnv' to set environment variables that are only available inside the container. + "remoteEnv": {}, + // Use 'mounts' to make files or directories from your local machine available inside the container. + "mounts": [ + "source=${localWorkspaceFolder},target=/workspace,type=bind,consistency=cached", + "source=conan-cache,target=/home/code/.conan2/p", + "source=ccache-cache,target=/home/code/.ccache" + ], + "runArgs": [ + "--security-opt", + "label=disable" + ], + // Configure tool-specific properties. + "features": {}, + "postCreateCommand": ".devcontainer/post_create_command.sh" +} diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e11dd9b6..0faaf45c4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,6 +89,12 @@ elseif(CMAKE_SYSTEM_NAME MATCHES "Darwin") add_definitions(-D OS_MACOSX) endif() +# Only use compiler-rt when building with Clang on Linux for ARM/AArch64 +if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND OS_LINUX AND (ARCH_AARCH64 OR ARCH_ARM)) + add_compile_options(-rtlib=compiler-rt) + add_link_options(-rtlib=compiler-rt) +endif() + list(PREPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/CMake" "${PROJECT_SOURCE_DIR}/CMake/third-party" ) diff --git a/bolt/duckdb/conversion/DuckParser.cpp b/bolt/duckdb/conversion/DuckParser.cpp index 1b5dfb40f..304cda05f 100644 --- a/bolt/duckdb/conversion/DuckParser.cpp +++ b/bolt/duckdb/conversion/DuckParser.cpp @@ -35,6 +35,8 @@ #include "bolt/parse/Expressions.h" #include "bolt/type/Variant.h" +#include + #include // @manual #include // @manual #include // @manual @@ -251,6 +253,116 @@ std::shared_ptr tryParseInterval( INTERVAL_DAY_TIME(), variant(value.value() * multiplier), alias); } +std::shared_ptr tryParseIntervalWithUnit( + const std::shared_ptr& input, + const std::shared_ptr& unit, + const std::optional& alias) { + std::optional value; + if (const auto* constInput = + dynamic_cast(input.get())) { + if (constInput->type()->isBigint() && !constInput->value().isNull()) { + value = constInput->value().value(); + } + } else if ( + const auto* castInput = + dynamic_cast(input.get())) { + if (castInput->type()->isBigint()) { + if (const auto* constInput = dynamic_cast( + castInput->getInput().get())) { + if (constInput->type()->isBigint() && !constInput->value().isNull()) { + value = constInput->value().value(); + } + } + } + } + + if (!value.has_value()) { + return nullptr; + } + + const auto* unitExpr = dynamic_cast(unit.get()); + if (!unitExpr || !unitExpr->type()->isVarchar() || + unitExpr->value().isNull()) { + return nullptr; + } + + const auto unitName = + StringUtil::Lower(unitExpr->value().value()); + int64_t multiplier; + if (unitName == "hour" || unitName == "hours") { + multiplier = 60 * 60 * 1'000; + } else if (unitName == "minute" || unitName == "minutes") { + multiplier = 60 * 1'000; + } else if (unitName == "second" || unitName == "seconds") { + multiplier = 1'000; + } else if (unitName == "millisecond" || unitName == "milliseconds") { + multiplier = 1; + } else { + return nullptr; + } + + return std::make_shared( + INTERVAL_DAY_TIME(), variant(value.value() * multiplier), alias); +} + +std::shared_ptr tryParseIntervalLiteral( + const std::string& exprString) { + std::string trimmed = exprString; + StringUtil::Trim(trimmed); + auto lower = StringUtil::Lower(trimmed); + if (!StringUtil::StartsWith(lower, "interval ")) { + return nullptr; + } + + std::istringstream iss(trimmed); + std::string keyword; + iss >> keyword; + if (StringUtil::Lower(keyword) != "interval") { + return nullptr; + } + + std::string valueToken; + std::string unitToken; + if (!(iss >> valueToken >> unitToken)) { + return nullptr; + } + + int64_t value; + try { + value = std::stoll(valueToken); + } catch (const std::exception&) { + return nullptr; + } + + std::optional alias; + std::string maybeAs; + if (iss >> maybeAs) { + if (StringUtil::Lower(maybeAs) == "as") { + std::string aliasToken; + if (iss >> aliasToken) { + alias = aliasToken; + } + } + } + + const auto unitName = StringUtil::Lower(unitToken); + int64_t multiplier; + if (unitName == "hour" || unitName == "hours") { + multiplier = 60 * 60 * 1'000; + } else if (unitName == "minute" || unitName == "minutes") { + multiplier = 60 * 1'000; + } else if (unitName == "second" || unitName == "seconds") { + multiplier = 1'000; + } else if (unitName == "millisecond" || unitName == "milliseconds") { + multiplier = 1; + } else { + return nullptr; + } + + return std::make_shared( + INTERVAL_DAY_TIME(), variant(value * multiplier), alias); +} + // Parse a function call (avg(a), func(1, b), etc). // Arithmetic operators also follow this path (a + b, a * b, etc). std::shared_ptr parseFunctionExpr( @@ -271,6 +383,13 @@ std::shared_ptr parseFunctionExpr( } } + if (func == "interval" && params.size() == 2) { + if (auto interval = + tryParseIntervalWithUnit(params[0], params[1], getAlias(expr))) { + return interval; + } + } + // NOT LIKE function needs special handling as it maps to two functions // "not" and "like". if (func == "notlike") { @@ -680,6 +799,9 @@ std::unique_ptr<::duckdb::ParsedExpression> parseSingleExpression( std::shared_ptr parseExpr( const std::string& exprString, const ParseOptions& options) { + if (auto interval = tryParseIntervalLiteral(exprString)) { + return interval; + } auto parsed = parseSingleExpression(exprString); return parseExpr(*parsed, options); } diff --git a/bolt/dwio/dwrf/test/ColumnWriterTest.cpp b/bolt/dwio/dwrf/test/ColumnWriterTest.cpp index 0dfffc35c..6cb53d25a 100644 --- a/bolt/dwio/dwrf/test/ColumnWriterTest.cpp +++ b/bolt/dwio/dwrf/test/ColumnWriterTest.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include "bolt/common/memory/Memory.h" #include "bolt/dwio/common/IntDecoder.h" @@ -47,6 +48,7 @@ #include "bolt/dwio/dwrf/writer/Writer.h" #include "bolt/type/Type.h" #include "bolt/vector/DictionaryVector.h" +#include "bolt/vector/SelectivityVector.h" #include "bolt/vector/tests/utils/VectorMaker.h" using namespace ::testing; @@ -58,6 +60,30 @@ using namespace bytedance::bolt::memory; using folly::Random; namespace bytedance::bolt::dwrf { +template +struct IsArrayType : std::false_type {}; + +template +struct IsArrayType> : std::true_type {}; + +template +struct IsMapType : std::false_type {}; + +template +struct IsMapType> : std::true_type {}; + +template +struct IsRowType : std::false_type {}; + +template +struct IsRowType> : std::true_type {}; + +template +struct IsComplexType + : std::bool_constant< + IsArrayType::value || IsMapType::value || IsRowType::value> { +}; + class MockStrideIndexProvider : public StrideIndexProvider { public: MOCK_CONST_METHOD0(getStrideIndex, uint64_t()); @@ -868,10 +894,16 @@ void mapToStruct( // initialize children of batch size filled with nulls VectorMaker maker{&pool}; for (auto column = 0; column < uniqueKeys.size(); column++) { - childrenVectors[column] = - maker.allNullFlatVector(origBatch->size()); - // only flat for scalar types - // create function to handle nested complex types + if constexpr (IsComplexType::value) { + auto valueType = CppToType::create(); + auto child = BaseVector::create(valueType, origBatch->size(), &pool); + SelectivityVector nullRows(origBatch->size(), true); + child->addNulls(nullRows); + childrenVectors[column] = child; + } else { + childrenVectors[column] = + maker.allNullFlatVector(origBatch->size()); + } } batches[i] = maker.rowVector(childrenVectors); auto batchStruct = std::dynamic_pointer_cast(batches[i]); @@ -883,26 +915,47 @@ void mapToStruct( auto flatKeys = std::dynamic_pointer_cast>(keys); ASSERT_TRUE(flatKeys); auto values = mapBatch->mapValues(); - auto flatValues = std::dynamic_pointer_cast>(values); - ASSERT_TRUE(flatValues); - - auto offsets = mapBatch->offsets()->as(); - auto sizes = mapBatch->sizes()->as(); - - // for each row in current batch - for (vector_size_t row = 0; row < mapBatch->size(); row++) { - // for each key in row (single map) - for (vector_size_t index = offsets[row], - endOffset = offsets[row] + sizes[row]; - index < endOffset; - index++) { - ASSERT_FALSE(flatKeys->isNullAt(index)); - // set value in correct row - auto key = flatKeys->valueAt(index); - auto element = std::dynamic_pointer_cast>( - batchStruct->childAt(keyColIndex[key])); - ASSERT_TRUE(element); - element->set(row, flatValues->valueAt(index)); + if constexpr (IsComplexType::value) { + auto offsets = mapBatch->offsets()->as(); + auto sizes = mapBatch->sizes()->as(); + + // for each row in current batch + for (vector_size_t row = 0; row < mapBatch->size(); row++) { + // for each key in row (single map) + for (vector_size_t index = offsets[row], + endOffset = offsets[row] + sizes[row]; + index < endOffset; + index++) { + ASSERT_FALSE(flatKeys->isNullAt(index)); + // set value in correct row + auto key = flatKeys->valueAt(index); + auto element = batchStruct->childAt(keyColIndex[key]); + ASSERT_TRUE(element); + element->copy(values.get(), row, index, 1); + } + } + } else { + auto flatValues = std::dynamic_pointer_cast>(values); + ASSERT_TRUE(flatValues); + + auto offsets = mapBatch->offsets()->as(); + auto sizes = mapBatch->sizes()->as(); + + // for each row in current batch + for (vector_size_t row = 0; row < mapBatch->size(); row++) { + // for each key in row (single map) + for (vector_size_t index = offsets[row], + endOffset = offsets[row] + sizes[row]; + index < endOffset; + index++) { + ASSERT_FALSE(flatKeys->isNullAt(index)); + // set value in correct row + auto key = flatKeys->valueAt(index); + auto element = std::dynamic_pointer_cast>( + batchStruct->childAt(keyColIndex[key])); + ASSERT_TRUE(element); + element->set(row, flatValues->valueAt(index)); + } } } } @@ -4405,10 +4458,12 @@ struct DictColumnWriterTestCase { VectorPtr dictionaryVector; VectorPtr flatVector; - if (complexRowType == nullptr) { - flatVector = makeFlatVector(size, valueAt, isNullAt); - } else { + if constexpr (IsComplexType::value) { + BOLT_CHECK_NOT_NULL( + complexRowType, "Expected complex row type for complex vectors"); flatVector = makeComplexVectors(complexRowType, size, isNullAt); + } else { + flatVector = makeFlatVector(size, valueAt, isNullAt); } auto wrappedVector = BaseVector::wrapInDictionary( @@ -4428,11 +4483,8 @@ struct DictColumnWriterTestCase { context.initBuffer(); // complexVectorType will be nullptr if the vector is not complex. - bool isComplexType = std::dynamic_pointer_cast(type_) || - std::dynamic_pointer_cast(type_) || - std::dynamic_pointer_cast(type_); - - auto complexVectorType = isComplexType ? rowType : nullptr; + auto complexVectorType = + IsComplexType::value ? rowType : std::shared_ptr(); auto batch = createDictionaryBatch(size_, valueAt, isNullAt, complexVectorType); diff --git a/bolt/parse/DuckLogicalOperator.h b/bolt/parse/DuckLogicalOperator.h index 7b5e554b5..322c84b0c 100644 --- a/bolt/parse/DuckLogicalOperator.h +++ b/bolt/parse/DuckLogicalOperator.h @@ -57,6 +57,7 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include // @manual +#include // @manual namespace duckdb { @@ -106,24 +107,29 @@ class LogicalGet : public LogicalOperator { vector returned_types; //! The names of ALL columns that can be returned by the table function vector names; - //! Bound column IDs - vector column_ids; + //! Columns that are used outside the scan + vector projection_ids; //! Filters pushed down for table scan TableFilterSet table_filters; - + //! The set of input parameters for the table function + vector parameters; string GetName() const override; - string ParamsToString() const override; + InsertionOrderPreservingMap ParamsToString() const override; //! Returns the underlying table that is being scanned, or nullptr if there is //! none - TableCatalogEntry* GetTable() const; + optional_ptr GetTable() const; public: + const vector& GetColumnIds() const; vector GetColumnBindings() override; idx_t EstimateCardinality(ClientContext& context) override; protected: void ResolveTypes() override; + + private: + LogicalGet(); }; //! LogicalFilter represents a filter operation (e.g. WHERE or HAVING clause) @@ -133,8 +139,6 @@ class LogicalFilter : public LogicalOperator { LogicalFilter(); vector projection_map; - - public: vector GetColumnBindings() override; bool SplitPredicates() { @@ -191,7 +195,7 @@ class LogicalAggregate : public LogicalOperator { vector> group_stats; public: - string ParamsToString() const override; + InsertionOrderPreservingMap ParamsToString() const override; vector GetColumnBindings() override; diff --git a/bolt/parse/QueryPlanner.cpp b/bolt/parse/QueryPlanner.cpp index 3097b647a..0b80beab4 100644 --- a/bolt/parse/QueryPlanner.cpp +++ b/bolt/parse/QueryPlanner.cpp @@ -31,8 +31,10 @@ #include "bolt/parse/QueryPlanner.h" #include "bolt/duckdb/conversion/DuckConversion.h" #include "bolt/parse/DuckLogicalOperator.h" +#include "bolt/vector/VariantToVector.h" #include // @manual +#include // @manual #include // @manual #include // @manual #include // @manual @@ -140,23 +142,71 @@ PlanNodePtr toBoltPlan( std::vector sources, QueryContext& queryContext) { if (logicalGet.function.name == "unnest") { - BOLT_CHECK_EQ(1, sources.size()); + // DuckDB 1.1.3 represents UNNEST as a standalone LOGICAL_GET with + // parameters embedded in the LogicalGet. + BOLT_CHECK_EQ(0, sources.size()); + + BOLT_CHECK_EQ( + logicalGet.parameters.size(), + 1, + "UNNEST expects a single parameter, got {}", + logicalGet.parameters.size()); + + const auto& param = logicalGet.parameters[0]; + BOLT_CHECK( + param.type().id() == ::duckdb::LogicalTypeId::LIST, + "UNNEST parameter must be a LIST, got {}", + param.type().ToString()); + + const auto& listType = param.type(); + auto elementType = + duckdb::toBoltType(::duckdb::ListType::GetChildType(listType)); + auto arrayType = ARRAY(elementType); + + std::vector elements; + const auto& listValues = ::duckdb::ListValue::GetChildren(param); + elements.reserve(listValues.size()); + for (const auto& value : listValues) { + elements.emplace_back(duckdb::duckValueToVariant(value)); + } + + auto arrayVector = variantArrayToVector(arrayType, elements, pool); + auto rowType = ROW({queryContext.nextColumnName()}, {arrayType}); + auto rowVector = std::make_shared( + pool, rowType, nullptr, 1, std::vector{arrayVector}); + + std::vector vectors = {rowVector}; + auto valuesNode = + std::make_shared(queryContext.nextNodeId(), vectors); + + std::vector projections{ + std::make_shared( + valuesNode->outputType()->childAt(0), + valuesNode->outputType()->asRow().nameOf(0))}; + std::vector projectionNames{queryContext.nextColumnName()}; + auto projectNode = std::make_shared( + queryContext.nextNodeId(), + std::move(projectionNames), + std::move(projections), + std::move(valuesNode)); + return std::make_shared( queryContext.nextNodeId(), std::vector{}, // replicateVariables std::vector{ std::make_shared( - sources[0]->outputType()->childAt(0), - sources[0]->outputType()->asRow().nameOf(0))}, - std::vector{"a"}, + projectNode->outputType()->childAt(0), + projectNode->outputType()->asRow().nameOf(0))}, + std::vector{ + logicalGet.names.empty() ? "a" : logicalGet.names[0]}, std::nullopt, // ordinalityName - std::move(sources[0])); + std::move(projectNode)); } BOLT_CHECK_EQ(logicalGet.function.name, "seq_scan"); BOLT_CHECK_EQ(0, sources.size()); - const auto& columnIds = logicalGet.column_ids; + const auto& columnIds = logicalGet.GetColumnIds(); std::vector names(columnIds.size()); std::vector types(columnIds.size()); @@ -464,6 +514,18 @@ PlanNodePtr toBoltPlan( pool, std::move(sources), queryContext); + case ::duckdb::LogicalOperatorType::LOGICAL_UNNEST: + BOLT_CHECK_EQ(1, sources.size()); + return std::make_shared( + queryContext.nextNodeId(), + std::vector{}, // replicateVariables + std::vector{ + std::make_shared( + sources[0]->outputType()->childAt(0), + sources[0]->outputType()->asRow().nameOf(0))}, + std::vector{"a"}, + std::nullopt, // ordinalityName + std::move(sources[0])); default: BOLT_NYI( "Plan node is not supported yet: {}", @@ -478,11 +540,14 @@ static void customScalarFunction( BOLT_UNREACHABLE(); } -static ::duckdb::idx_t customAggregateState() { +::duckdb::idx_t customAggregateState( + const ::duckdb::AggregateFunction& /*function*/) { BOLT_UNREACHABLE(); } -static void customAggregateInitialize(::duckdb::data_ptr_t state) { +void customAggregateInitialize( + const ::duckdb::AggregateFunction& /*function*/, + ::duckdb::data_ptr_t /* state */) { BOLT_UNREACHABLE(); } diff --git a/conanfile.py b/conanfile.py index b51f1e6b7..8dda09fb2 100644 --- a/conanfile.py +++ b/conanfile.py @@ -275,9 +275,7 @@ def requirements(self): if self.settings.os in ["Linux", "FreeBSD"]: if self.options.get_safe("enable_perf"): self.requires("gperftools/2.16") - self.requires("libunwind/1.8.0", override=True) - else: - self.requires("libunwind/1.8.0") + self.requires("libunwind/1.8.3", override=True) self.requires("utf8proc/2.11.0", transitive_headers=True, transitive_libs=True) self.requires("date/3.0.4-bolt", transitive_headers=True, transitive_libs=True) self.requires("libbacktrace/cci.20210118") @@ -287,7 +285,7 @@ def requirements(self): self.requires("paimon-cpp/0.0.3-bolt") if self.options.get_safe("enable_testutil"): self.requires("gtest/1.17.0", force=True) - self.requires("duckdb/0.8.1") + self.requires("duckdb/1.1.3") def build_requirements(self): self.tool_requires("m4/1.4.19") @@ -360,6 +358,7 @@ def configure(self): self.options[llvm_core].with_zstd = False self.options[llvm_core].with_ffi = False self.options[llvm_core].with_clang = True + self.options[llvm_core].targets = "AArch64;ARM;X86" if self.options.get_safe("enable_hdfs") and self.options.get_safe( "use_arrow_hdfs" diff --git a/scripts/conan/patches/arrow.15.0.1-csv-support.patch b/scripts/conan/patches/arrow.15.0.1-csv-support.patch index afcacf375..917c0111c 100644 --- a/scripts/conan/patches/arrow.15.0.1-csv-support.patch +++ b/scripts/conan/patches/arrow.15.0.1-csv-support.patch @@ -356,7 +356,7 @@ index 24a7af89b..27655cdf4 100644 self.requires("grpc/1.50.0") if self._requires_rapidjson(): - self.requires("rapidjson/1.1.0") -+ self.requires("rapidjson/[>=cci.20230929]") ++ self.requires("rapidjson/cci.20250205") if self.options.with_llvm: self.requires("llvm-core/13.0.0") if self.options.with_openssl: diff --git a/scripts/run-clang-tidy.py b/scripts/run-clang-tidy.py index 83f9aadaf..610779b2d 100755 --- a/scripts/run-clang-tidy.py +++ b/scripts/run-clang-tidy.py @@ -318,7 +318,9 @@ def process_gha_output(stdout): def tidy(args): - extensions = (".cc", ".cpp", ".cxx", ".c", ".h", ".hpp", ".hxx") + source_extensions = (".cc", ".cpp", ".cxx", ".c") + header_extensions = (".h", ".hpp", ".hxx") + extensions = source_extensions + header_extensions candidate_files = [] # get file list to check if args.directory: @@ -476,9 +478,21 @@ def _compute_changed_lines() -> Optional[Multimap]: print("No changed C/C++ lines detected for clang-tidy.") return 0 + # Keep headers in --line-filter so diagnostics can be reported there, + # but run clang-tidy only on translation units that have compile DB entries. + all_changed_for_filter = list(files_to_process) + files_to_process = [ + f for f in files_to_process if f.endswith(source_extensions) + ] + if not files_to_process: + print( + "Only header changes detected; no source files to run clang-tidy on in this mode." + ) + return 0 + # Use absolute paths in --line-filter for better compatibility with compile DBs. final_map_abs = {} # type: Dict[str, List[List[int]]] - for f in files_to_process: + for f in all_changed_for_filter: abs_f = to_repo_abs(f, git_root) final_map_abs[abs_f] = changed_lines[f] line_filter_json = json.dumps( From 04ec5649f786374c372852bf08c502b230de078a Mon Sep 17 00:00:00 2001 From: Zac Blanco Date: Tue, 31 Mar 2026 21:02:47 +0000 Subject: [PATCH 2/6] Add clangd as default extension for clang devcontainer --- .devcontainer/clang/devcontainer.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.devcontainer/clang/devcontainer.json b/.devcontainer/clang/devcontainer.json index 7564beec3..ff7a8a0cf 100644 --- a/.devcontainer/clang/devcontainer.json +++ b/.devcontainer/clang/devcontainer.json @@ -31,6 +31,13 @@ "--security-opt", "label=disable" ], + "customizations": { + "vscode": { + "extensions": [ + "llvm-vs-code-extensions.vscode-clangd" + ] + } + }, // Configure tool-specific properties. "features": {}, "postCreateCommand": ".devcontainer/post_create_command.sh" From d5b8d28bbcc8fe848b8d761ed2513d9bad3cdc86 Mon Sep 17 00:00:00 2001 From: Zac Blanco Date: Fri, 3 Apr 2026 21:52:26 +0000 Subject: [PATCH 3/6] Only build LLVM target for same target platform --- conanfile.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/conanfile.py b/conanfile.py index 8dda09fb2..43a829ab5 100644 --- a/conanfile.py +++ b/conanfile.py @@ -322,10 +322,13 @@ def configure(self): self.options[paimon_cpp].with_avro = True arrow_simd_level = "default" + llvm_targets = None if str(self.settings.arch) in ["x86", "x86_64"]: arrow_simd_level = "avx2" + llvm_targets = 'X86' elif str(self.settings.arch) in ["armv8", "arm", "armv9"]: arrow_simd_level = "neon" + llvm_targets = 'AArch64' self.options[arrow].parquet = True self.options[arrow].filesystem_layer = True self.options[arrow].simd_level = arrow_simd_level @@ -358,7 +361,9 @@ def configure(self): self.options[llvm_core].with_zstd = False self.options[llvm_core].with_ffi = False self.options[llvm_core].with_clang = True - self.options[llvm_core].targets = "AArch64;ARM;X86" + if llvm_targets is None: + raise RuntimeError("Unsupported target for JIT feature") + self.options[llvm_core].targets = llvm_targets if self.options.get_safe("enable_hdfs") and self.options.get_safe( "use_arrow_hdfs" From 8c8e2b4d932c1ab3c6b6ea22fe1b341e2a18d4a5 Mon Sep 17 00:00:00 2001 From: Zac Blanco Date: Fri, 3 Apr 2026 23:13:04 +0000 Subject: [PATCH 4/6] Fix compilation on StreamingAggregation.cpp --- bolt/exec/StreamingAggregation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/exec/StreamingAggregation.cpp b/bolt/exec/StreamingAggregation.cpp index dc242f93c..d31170cf5 100644 --- a/bolt/exec/StreamingAggregation.cpp +++ b/bolt/exec/StreamingAggregation.cpp @@ -44,7 +44,7 @@ StreamingAggregation::StreamingAggregation( ? "PartialStreamingAggregation" : "StreamingAggregation"), outputBatchSize_{outputBatchRows()}, - groupNumberThreshold_{2 * outputBatchSize_}, + groupNumberThreshold_{static_cast(2 * outputBatchSize_)}, aggregationNode_{aggregationNode}, step_{aggregationNode->step()} { if (aggregationNode_->ignoreNullKeys()) { From 6adb524867e21a4494ec3edccfcf3103c54fe709 Mon Sep 17 00:00:00 2001 From: Zac Blanco Date: Fri, 3 Apr 2026 23:42:04 +0000 Subject: [PATCH 5/6] Fix duckparser for intervals --- bolt/duckdb/conversion/DuckParser.cpp | 122 +----------------- .../conversion/tests/DuckParserTest.cpp | 34 ++--- 2 files changed, 13 insertions(+), 143 deletions(-) diff --git a/bolt/duckdb/conversion/DuckParser.cpp b/bolt/duckdb/conversion/DuckParser.cpp index 304cda05f..3a0ab4389 100644 --- a/bolt/duckdb/conversion/DuckParser.cpp +++ b/bolt/duckdb/conversion/DuckParser.cpp @@ -35,7 +35,7 @@ #include "bolt/parse/Expressions.h" #include "bolt/type/Variant.h" -#include +#include #include // @manual #include // @manual @@ -253,116 +253,6 @@ std::shared_ptr tryParseInterval( INTERVAL_DAY_TIME(), variant(value.value() * multiplier), alias); } -std::shared_ptr tryParseIntervalWithUnit( - const std::shared_ptr& input, - const std::shared_ptr& unit, - const std::optional& alias) { - std::optional value; - if (const auto* constInput = - dynamic_cast(input.get())) { - if (constInput->type()->isBigint() && !constInput->value().isNull()) { - value = constInput->value().value(); - } - } else if ( - const auto* castInput = - dynamic_cast(input.get())) { - if (castInput->type()->isBigint()) { - if (const auto* constInput = dynamic_cast( - castInput->getInput().get())) { - if (constInput->type()->isBigint() && !constInput->value().isNull()) { - value = constInput->value().value(); - } - } - } - } - - if (!value.has_value()) { - return nullptr; - } - - const auto* unitExpr = dynamic_cast(unit.get()); - if (!unitExpr || !unitExpr->type()->isVarchar() || - unitExpr->value().isNull()) { - return nullptr; - } - - const auto unitName = - StringUtil::Lower(unitExpr->value().value()); - int64_t multiplier; - if (unitName == "hour" || unitName == "hours") { - multiplier = 60 * 60 * 1'000; - } else if (unitName == "minute" || unitName == "minutes") { - multiplier = 60 * 1'000; - } else if (unitName == "second" || unitName == "seconds") { - multiplier = 1'000; - } else if (unitName == "millisecond" || unitName == "milliseconds") { - multiplier = 1; - } else { - return nullptr; - } - - return std::make_shared( - INTERVAL_DAY_TIME(), variant(value.value() * multiplier), alias); -} - -std::shared_ptr tryParseIntervalLiteral( - const std::string& exprString) { - std::string trimmed = exprString; - StringUtil::Trim(trimmed); - auto lower = StringUtil::Lower(trimmed); - if (!StringUtil::StartsWith(lower, "interval ")) { - return nullptr; - } - - std::istringstream iss(trimmed); - std::string keyword; - iss >> keyword; - if (StringUtil::Lower(keyword) != "interval") { - return nullptr; - } - - std::string valueToken; - std::string unitToken; - if (!(iss >> valueToken >> unitToken)) { - return nullptr; - } - - int64_t value; - try { - value = std::stoll(valueToken); - } catch (const std::exception&) { - return nullptr; - } - - std::optional alias; - std::string maybeAs; - if (iss >> maybeAs) { - if (StringUtil::Lower(maybeAs) == "as") { - std::string aliasToken; - if (iss >> aliasToken) { - alias = aliasToken; - } - } - } - - const auto unitName = StringUtil::Lower(unitToken); - int64_t multiplier; - if (unitName == "hour" || unitName == "hours") { - multiplier = 60 * 60 * 1'000; - } else if (unitName == "minute" || unitName == "minutes") { - multiplier = 60 * 1'000; - } else if (unitName == "second" || unitName == "seconds") { - multiplier = 1'000; - } else if (unitName == "millisecond" || unitName == "milliseconds") { - multiplier = 1; - } else { - return nullptr; - } - - return std::make_shared( - INTERVAL_DAY_TIME(), variant(value * multiplier), alias); -} - // Parse a function call (avg(a), func(1, b), etc). // Arithmetic operators also follow this path (a + b, a * b, etc). std::shared_ptr parseFunctionExpr( @@ -383,13 +273,6 @@ std::shared_ptr parseFunctionExpr( } } - if (func == "interval" && params.size() == 2) { - if (auto interval = - tryParseIntervalWithUnit(params[0], params[1], getAlias(expr))) { - return interval; - } - } - // NOT LIKE function needs special handling as it maps to two functions // "not" and "like". if (func == "notlike") { @@ -799,9 +682,6 @@ std::unique_ptr<::duckdb::ParsedExpression> parseSingleExpression( std::shared_ptr parseExpr( const std::string& exprString, const ParseOptions& options) { - if (auto interval = tryParseIntervalLiteral(exprString)) { - return interval; - } auto parsed = parseSingleExpression(exprString); return parseExpr(*parsed, options); } diff --git a/bolt/duckdb/conversion/tests/DuckParserTest.cpp b/bolt/duckdb/conversion/tests/DuckParserTest.cpp index 5435a5c05..40f06e464 100644 --- a/bolt/duckdb/conversion/tests/DuckParserTest.cpp +++ b/bolt/duckdb/conversion/tests/DuckParserTest.cpp @@ -37,8 +37,8 @@ using namespace bytedance::bolt::duckdb; namespace { std::shared_ptr parseExpr(const std::string& exprString) { - ParseOptions options; - return parseExpr(exprString, options); + duckdb::ParseOptions options; + return duckdb::parseExpr(exprString, options); } } // namespace @@ -297,30 +297,20 @@ TEST(DuckParserTest, between) { TEST(DuckParserTest, interval) { auto parseInterval = [](const std::string& sql) { - auto expr = - std::dynamic_pointer_cast(parseExpr(sql)); - BOLT_CHECK_NOT_NULL(expr); - - auto value = - INTERVAL_DAY_TIME()->valueToString(expr->value().value()); - if (expr->alias()) { - return fmt::format("{} AS {}", value, expr->alias().value()); - } - - return value; + auto parsed = parseExpr(sql); + return parsed->toString(); }; - EXPECT_EQ("0 05:00:00.000", parseInterval("INTERVAL 5 HOURS")); - EXPECT_EQ("0 00:36:00.000", parseInterval("INTERVAL 36 MINUTES")); - EXPECT_EQ("0 00:00:07.000", parseInterval("INTERVAL 7 SECONDS")); - EXPECT_EQ("0 00:00:00.123", parseInterval("INTERVAL 123 MILLISECONDS")); - - EXPECT_EQ("0 00:00:12.345", parseInterval("INTERVAL 12345 MILLISECONDS")); - EXPECT_EQ("0 03:25:45.678", parseInterval("INTERVAL 12345678 MILLISECONDS")); - EXPECT_EQ("1 03:48:20.100", parseInterval("INTERVAL 100100100 MILLISECONDS")); + EXPECT_EQ("to_hours(cast(trunc(cast(5, DOUBLE)), BIGINT))", parseInterval("INTERVAL 5 HOURS")); + EXPECT_EQ("to_minutes(cast(trunc(cast(36, DOUBLE)), BIGINT))", parseInterval("INTERVAL 36 MINUTES")); + EXPECT_EQ("to_seconds(cast(7, DOUBLE))", parseInterval("INTERVAL 7 SECONDS")); + EXPECT_EQ("to_milliseconds(cast(123, DOUBLE))", parseInterval("INTERVAL 123 MILLISECONDS")); + EXPECT_EQ("to_milliseconds(cast(12345, DOUBLE))", parseInterval("INTERVAL 12345 MILLISECONDS")); + EXPECT_EQ("to_milliseconds(cast(12345678, DOUBLE))", parseInterval("INTERVAL 12345678 MILLISECONDS")); + EXPECT_EQ("to_milliseconds(cast(100100100, DOUBLE))", parseInterval("INTERVAL 100100100 MILLISECONDS")); EXPECT_EQ( - "0 00:00:00.011 AS x", parseInterval("INTERVAL 11 MILLISECONDS AS x")); + "to_milliseconds(cast(11, DOUBLE)) AS x", parseInterval("INTERVAL 11 MILLISECONDS AS x")); } TEST(DuckParserTest, cast) { From 180e45ef915ae8fdcba3704fa2bead07bfaff4a4 Mon Sep 17 00:00:00 2001 From: Zac Blanco Date: Mon, 6 Apr 2026 17:21:50 +0000 Subject: [PATCH 6/6] fix pre-commit lints --- bolt/duckdb/conversion/DuckParser.cpp | 2 -- .../conversion/tests/DuckParserTest.cpp | 27 ++++++++++++++----- conanfile.py | 4 +-- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/bolt/duckdb/conversion/DuckParser.cpp b/bolt/duckdb/conversion/DuckParser.cpp index 3a0ab4389..1b5dfb40f 100644 --- a/bolt/duckdb/conversion/DuckParser.cpp +++ b/bolt/duckdb/conversion/DuckParser.cpp @@ -35,8 +35,6 @@ #include "bolt/parse/Expressions.h" #include "bolt/type/Variant.h" -#include - #include // @manual #include // @manual #include // @manual diff --git a/bolt/duckdb/conversion/tests/DuckParserTest.cpp b/bolt/duckdb/conversion/tests/DuckParserTest.cpp index 40f06e464..5a8db04eb 100644 --- a/bolt/duckdb/conversion/tests/DuckParserTest.cpp +++ b/bolt/duckdb/conversion/tests/DuckParserTest.cpp @@ -301,16 +301,29 @@ TEST(DuckParserTest, interval) { return parsed->toString(); }; - EXPECT_EQ("to_hours(cast(trunc(cast(5, DOUBLE)), BIGINT))", parseInterval("INTERVAL 5 HOURS")); - EXPECT_EQ("to_minutes(cast(trunc(cast(36, DOUBLE)), BIGINT))", parseInterval("INTERVAL 36 MINUTES")); + EXPECT_EQ( + "to_hours(cast(trunc(cast(5, DOUBLE)), BIGINT))", + parseInterval("INTERVAL 5 HOURS")); + EXPECT_EQ( + "to_minutes(cast(trunc(cast(36, DOUBLE)), BIGINT))", + parseInterval("INTERVAL 36 MINUTES")); EXPECT_EQ("to_seconds(cast(7, DOUBLE))", parseInterval("INTERVAL 7 SECONDS")); - EXPECT_EQ("to_milliseconds(cast(123, DOUBLE))", parseInterval("INTERVAL 123 MILLISECONDS")); - EXPECT_EQ("to_milliseconds(cast(12345, DOUBLE))", parseInterval("INTERVAL 12345 MILLISECONDS")); - EXPECT_EQ("to_milliseconds(cast(12345678, DOUBLE))", parseInterval("INTERVAL 12345678 MILLISECONDS")); - EXPECT_EQ("to_milliseconds(cast(100100100, DOUBLE))", parseInterval("INTERVAL 100100100 MILLISECONDS")); + EXPECT_EQ( + "to_milliseconds(cast(123, DOUBLE))", + parseInterval("INTERVAL 123 MILLISECONDS")); + EXPECT_EQ( + "to_milliseconds(cast(12345, DOUBLE))", + parseInterval("INTERVAL 12345 MILLISECONDS")); + EXPECT_EQ( + "to_milliseconds(cast(12345678, DOUBLE))", + parseInterval("INTERVAL 12345678 MILLISECONDS")); + EXPECT_EQ( + "to_milliseconds(cast(100100100, DOUBLE))", + parseInterval("INTERVAL 100100100 MILLISECONDS")); EXPECT_EQ( - "to_milliseconds(cast(11, DOUBLE)) AS x", parseInterval("INTERVAL 11 MILLISECONDS AS x")); + "to_milliseconds(cast(11, DOUBLE)) AS x", + parseInterval("INTERVAL 11 MILLISECONDS AS x")); } TEST(DuckParserTest, cast) { diff --git a/conanfile.py b/conanfile.py index 43a829ab5..aeba91373 100644 --- a/conanfile.py +++ b/conanfile.py @@ -325,10 +325,10 @@ def configure(self): llvm_targets = None if str(self.settings.arch) in ["x86", "x86_64"]: arrow_simd_level = "avx2" - llvm_targets = 'X86' + llvm_targets = "X86" elif str(self.settings.arch) in ["armv8", "arm", "armv9"]: arrow_simd_level = "neon" - llvm_targets = 'AArch64' + llvm_targets = "AArch64" self.options[arrow].parquet = True self.options[arrow].filesystem_layer = True self.options[arrow].simd_level = arrow_simd_level