diff --git a/CMakeLists.txt b/CMakeLists.txt index 944b55090e2..595665386d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -140,6 +140,7 @@ include(cmake/xxhash.cmake) include(cmake/span.cmake) include(cmake/trie.cmake) include(cmake/pegtl.cmake) +include(cmake/rangev3.cmake) if (ENABLE_LUAJIT) include(cmake/luajit.cmake) @@ -173,6 +174,7 @@ list(APPEND EXTERNAL_LIBS xxhash) list(APPEND EXTERNAL_LIBS span-lite) list(APPEND EXTERNAL_LIBS tsl_hat_trie) list(APPEND EXTERNAL_LIBS pegtl) +list(APPEND EXTERNAL_LIBS range-v3) # Add git sha to version.h find_package(Git REQUIRED) diff --git a/NOTICE b/NOTICE index 86c357d8e20..8488d466c93 100644 --- a/NOTICE +++ b/NOTICE @@ -66,7 +66,7 @@ The text of each license is also included in licenses/LICENSE-[project].txt * LuaJIT(https://github.com/KvrocksLabs/LuaJIT) * lua(https://github.com/KvrocksLabs/lua, alternative to LuaJIT) * hat-trie(https://github.com/Tessil/hat-trie) -* pegtl(https://github.com/taocpp/PEGTL, NOTE: changed to BSL-1.0 in main branch) +* pegtl(https://github.com/taocpp/PEGTL, NOTE: changed to Boost Software License Version 1.0 in main branch) ================================================================ Boost Software License Version 1.0 @@ -76,6 +76,7 @@ The text of each license is also included in licenses/LICENSE-[project].txt * jsoncons(https://github.com/danielaparker/jsoncons) * span-lite(https://github.com/martinmoene/span-lite) +* range-v3(https://github.com/ericniebler/range-v3) ================================================================ zlib/libpng licenses diff --git a/cmake/rangev3.cmake b/cmake/rangev3.cmake new file mode 100644 index 00000000000..c30a1468e60 --- /dev/null +++ b/cmake/rangev3.cmake @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +include_guard() + +include(cmake/utils.cmake) + +FetchContent_DeclareGitHubWithMirror(rangev3 + ericniebler/range-v3 0.12.0 + MD5=e220e3f545fdf46241b4f139822d73a1 +) + +if (CMAKE_BUILD_TYPE STREQUAL "Debug") + set(WITH_DEBUG_INFO ON) +elseif(CMAKE_BUILD_TYPE STREQUAL "Release") + set(WITH_DEBUG_INFO OFF) +elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + set(WITH_DEBUG_INFO ON) +elseif (CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") + set(WITH_DEBUG_INFO OFF) +endif() + +if (PORTABLE STREQUAL 0) + set(ARG_RANGES_NATIVE ON) +else() + set(ARG_RANGES_NATIVE OFF) +endif() + +FetchContent_MakeAvailableWithArgs(rangev3 + RANGES_CXX_STD=17 + RANGES_BUILD_CALENDAR_EXAMPLE=OFF + RANGES_DEBUG_INFO=${WITH_DEBUG_INFO} + RANGES_NATIVE=${ARG_RANGES_NATIVE} +) diff --git a/licenses/LICENSE-range-v3.txt b/licenses/LICENSE-range-v3.txt new file mode 100644 index 00000000000..698193e974e --- /dev/null +++ b/licenses/LICENSE-range-v3.txt @@ -0,0 +1,151 @@ +======================================================== +Boost Software License - Version 1.0 - August 17th, 2003 +======================================================== + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + +============================================================================== +libc++ License +============================================================================== + +The libc++ library is dual licensed under both the University of Illinois +"BSD-Like" license and the MIT license. As a user of this code you may choose +to use it under either license. As a contributor, you agree to allow your code +to be used under both. + +Full text of the relevant licenses is included below. + +============================================================================== + +University of Illinois/NCSA +Open Source License + +Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT +http://llvm.org/svn/llvm-project/libcxx/trunk/CREDITS.TXT + +All rights reserved. + +Developed by: + + LLVM Team + + University of Illinois at Urbana-Champaign + + http://llvm.org + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal with +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the distribution. + + * Neither the names of the LLVM Team, University of Illinois at + Urbana-Champaign, nor the names of its contributors may be used to + endorse or promote products derived from this Software without specific + prior written permission. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +SOFTWARE. + +============================================================================== + +Copyright (c) 2009-2014 by the contributors listed in CREDITS.TXT + http://llvm.org/svn/llvm-project/libcxx/trunk/CREDITS.TXT + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +============================================================================== +Stepanov and McJones, "Elements of Programming" license +============================================================================== + +// Copyright (c) 2009 Alexander Stepanov and Paul McJones +// +// Permission to use, copy, modify, distribute and sell this software +// and its documentation for any purpose is hereby granted without +// fee, provided that the above copyright notice appear in all copies +// and that both that copyright notice and this permission notice +// appear in supporting documentation. The authors make no +// representations about the suitability of this software for any +// purpose. It is provided "as is" without express or implied +// warranty. +// +// Algorithms from +// Elements of Programming +// by Alexander Stepanov and Paul McJones +// Addison-Wesley Professional, 2009 + +============================================================================== +SGI C++ Standard Template Library license +============================================================================== + +// Copyright (c) 1994 +// Hewlett-Packard Company +// +// Permission to use, copy, modify, distribute and sell this software +// and its documentation for any purpose is hereby granted without fee, +// provided that the above copyright notice appear in all copies and +// that both that copyright notice and this permission notice appear +// in supporting documentation. Hewlett-Packard Company makes no +// representations about the suitability of this software for any +// purpose. It is provided "as is" without express or implied warranty. +// +// Copyright (c) 1996 +// Silicon Graphics Computer Systems, Inc. +// +// Permission to use, copy, modify, distribute and sell this software +// and its documentation for any purpose is hereby granted without fee, +// provided that the above copyright notice appear in all copies and +// that both that copyright notice and this permission notice appear +// in supporting documentation. Silicon Graphics makes no +// representations about the suitability of this software for any +// purpose. It is provided "as is" without express or implied warranty. +// diff --git a/src/search/index_info.h b/src/search/index_info.h index df059918774..5b0cb7077d7 100644 --- a/src/search/index_info.h +++ b/src/search/index_info.h @@ -39,6 +39,12 @@ struct FieldInfo { : name(std::move(name)), metadata(std::move(metadata)) {} bool IsSortable() const { return dynamic_cast(metadata.get()) != nullptr; } + bool HasIndex() const { return !metadata->noindex; } + + template + const T *MetadataAs() const { + return dynamic_cast(metadata.get()); + } }; struct IndexInfo { diff --git a/src/search/interval.h b/src/search/interval.h index d6ebbf5cebc..5ce90a45282 100644 --- a/src/search/interval.h +++ b/src/search/interval.h @@ -34,9 +34,13 @@ namespace kqir { struct Interval { double l, r; // [l, r) + static inline const double inf = std::numeric_limits::infinity(); + static inline const double minf = -inf; + Interval(double l, double r) : l(l), r(r) {} bool IsEmpty() const { return l >= r; } + static Interval Full() { return {minf, inf}; } bool operator==(const Interval &other) const { return l == other.l && r == other.r; } bool operator!=(const Interval &other) const { return !(*this == other); } @@ -69,8 +73,8 @@ struct IntervalSet { using DataType = std::vector>; DataType intervals; - static inline const double inf = std::numeric_limits::infinity(); - static inline const double minf = -inf; + static inline const double inf = Interval::inf; + static inline const double minf = Interval::minf; static double NextNum(double val) { return std::nextafter(val, inf); } diff --git a/src/search/ir.h b/src/search/ir.h index 3acc5ae8b7e..b841c0fa64d 100644 --- a/src/search/ir.h +++ b/src/search/ir.h @@ -256,6 +256,16 @@ struct AndExpr : QueryExpr { explicit AndExpr(std::vector> &&inners) : inners(std::move(inners)) {} + static std::unique_ptr Create(std::vector> &&exprs) { + CHECK(!exprs.empty()); + + if (exprs.size() == 1) { + return std::move(exprs.front()); + } + + return std::make_unique(std::move(exprs)); + } + std::string_view Name() const override { return "AndExpr"; } std::string Dump() const override { return fmt::format("(and {})", util::StringJoin(inners, [](const auto &v) { return v->Dump(); })); @@ -279,6 +289,16 @@ struct OrExpr : QueryExpr { explicit OrExpr(std::vector> &&inners) : inners(std::move(inners)) {} + static std::unique_ptr Create(std::vector> &&exprs) { + CHECK(!exprs.empty()); + + if (exprs.size() == 1) { + return std::move(exprs.front()); + } + + return std::make_unique(std::move(exprs)); + } + std::string_view Name() const override { return "OrExpr"; } std::string Dump() const override { return fmt::format("(or {})", util::StringJoin(inners, [](const auto &v) { return v->Dump(); })); diff --git a/src/search/ir_plan.h b/src/search/ir_plan.h index 94f801f4338..9a81a94a7d7 100644 --- a/src/search/ir_plan.h +++ b/src/search/ir_plan.h @@ -60,15 +60,19 @@ struct FieldScan : PlanOperator { struct NumericFieldScan : FieldScan { Interval range; + SortByClause::Order order; - NumericFieldScan(std::unique_ptr field, Interval range) : FieldScan(std::move(field)), range(range) {} + NumericFieldScan(std::unique_ptr field, Interval range, SortByClause::Order order = SortByClause::ASC) + : FieldScan(std::move(field)), range(range), order(order) {} std::string_view Name() const override { return "NumericFieldScan"; }; - std::string Content() const override { return fmt::format("{}, {}", field->name, range.ToString()); }; + std::string Content() const override { + return fmt::format("{}, {}, {}", field->name, range.ToString(), SortByClause::OrderToString(order)); + }; std::string Dump() const override { return fmt::format("numeric-scan {}", Content()); } std::unique_ptr Clone() const override { - return std::make_unique(field->CloneAs(), range); + return std::make_unique(field->CloneAs(), range, order); } }; @@ -110,6 +114,16 @@ struct Merge : PlanOperator { explicit Merge(std::vector> &&ops) : ops(std::move(ops)) {} + static std::unique_ptr Create(std::vector> &&ops) { + CHECK(!ops.empty()); + + if (ops.size() == 1) { + return std::move(ops.front()); + } + + return std::make_unique(std::move(ops)); + } + std::string_view Name() const override { return "Merge"; }; std::string Dump() const override { return fmt::format("(merge {})", util::StringJoin(ops, [](const auto &v) { return v->Dump(); })); diff --git a/src/search/ir_sema_checker.h b/src/search/ir_sema_checker.h index 40f316cba0b..d8982e5c6eb 100644 --- a/src/search/ir_sema_checker.h +++ b/src/search/ir_sema_checker.h @@ -75,7 +75,7 @@ struct SemaChecker { } else if (auto v = dynamic_cast(node)) { if (auto iter = current_index->fields.find(v->field->name); iter == current_index->fields.end()) { return {Status::NotOK, fmt::format("field `{}` not found in index `{}`", v->field->name)}; - } else if (auto meta = dynamic_cast(iter->second.metadata.get()); !meta) { + } else if (auto meta = iter->second.MetadataAs(); !meta) { return {Status::NotOK, fmt::format("field `{}` is not a tag field", v->field->name)}; } else { v->field->info = &iter->second; @@ -91,7 +91,7 @@ struct SemaChecker { } else if (auto v = dynamic_cast(node)) { if (auto iter = current_index->fields.find(v->field->name); iter == current_index->fields.end()) { return {Status::NotOK, fmt::format("field `{}` not found in index `{}`", v->field->name, current_index->name)}; - } else if (!dynamic_cast(iter->second.metadata.get())) { + } else if (!iter->second.MetadataAs()) { return {Status::NotOK, fmt::format("field `{}` is not a numeric field", v->field->name)}; } else { v->field->info = &iter->second; diff --git a/src/search/passes/cost_model.h b/src/search/passes/cost_model.h new file mode 100644 index 00000000000..86e0e3a58e5 --- /dev/null +++ b/src/search/passes/cost_model.h @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +#pragma once + +#include +#include + +#include "search/interval.h" +#include "search/ir.h" +#include "search/ir_plan.h" + +namespace kqir { + +// TODO: collect statistical information of index in runtime +// to optimize the cost model +struct CostModel { + static size_t Transform(const PlanOperator *node) { + if (auto v = dynamic_cast(node)) { + return Visit(v); + } + if (auto v = dynamic_cast(node)) { + return Visit(v); + } + if (auto v = dynamic_cast(node)) { + return Visit(v); + } + if (auto v = dynamic_cast(node)) { + return Visit(v); + } + if (auto v = dynamic_cast(node)) { + return Visit(v); + } + + CHECK(false) << "plan operator type not supported"; + } + + static size_t Visit(const FullIndexScan *node) { return 100; } + + static size_t Visit(const NumericFieldScan *node) { + if (node->range.r == IntervalSet::NextNum(node->range.l)) { + return 5; + } + + size_t base = 10; + + if (std::isinf(node->range.l)) { + base += 20; + } + + if (std::isinf(node->range.r)) { + base += 20; + } + + return base; + } + + static size_t Visit(const TagFieldScan *node) { return 10; } + + static size_t Visit(const Filter *node) { return Transform(node->source.get()) + 1; } + + static size_t Visit(const Merge *node) { + return std::accumulate(node->ops.begin(), node->ops.end(), size_t(0), [](size_t res, const auto &v) { + if (dynamic_cast(v.get())) { + res += 9; + } + return res + Transform(v.get()); + }); + } +}; + +} // namespace kqir diff --git a/src/search/passes/index_selection.h b/src/search/passes/index_selection.h new file mode 100644 index 00000000000..4ced79727b2 --- /dev/null +++ b/src/search/passes/index_selection.h @@ -0,0 +1,323 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +#pragma once + +#include +#include +#include +#include + +#include "search/index_info.h" +#include "search/interval.h" +#include "search/ir.h" +#include "search/ir_pass.h" +#include "search/ir_plan.h" +#include "search/passes/cost_model.h" +#include "search/passes/interval_analysis.h" +#include "search/passes/push_down_not_expr.h" +#include "search/passes/simplify_and_or_expr.h" +#include "search/search_encoding.h" + +namespace kqir { + +struct IndexSelection : Visitor { + SortByClause *order = nullptr; + bool sort_removable = false; + IndexRef *index = nullptr; + IntervalAnalysis::Result intervals; + + void Reset() override { + order = nullptr; + sort_removable = false; + index = nullptr; + intervals.clear(); + } + + std::unique_ptr Visit(std::unique_ptr node) override { + IntervalAnalysis analysis(false); + node = Node::MustAs(analysis.Transform(std::move(node))); + intervals = std::move(analysis.result); + + return Visitor::Visit(std::move(node)); + } + + std::unique_ptr Visit(std::unique_ptr node) override { + order = node->order.get(); + + node = Node::MustAs(Visitor::Visit(std::move(node))); + + if (sort_removable) return std::move(node->op); + + return node; + } + + bool HasGoodOrder() const { return order && order->field->info->HasIndex(); } + + std::unique_ptr GenerateScanFromOrder() const { + if (order->field->info->MetadataAs()) { + return std::make_unique(order->field->CloneAs(), Interval::Full(), order->order); + } else { + CHECK(false) << "current only numeric field is supported for ordering"; + } + } + + // if there's no Filter node, enter this method + std::unique_ptr Visit(std::unique_ptr node) override { + if (HasGoodOrder()) { + sort_removable = true; + return GenerateScanFromOrder(); + } + + return node; + } + + std::unique_ptr Visit(std::unique_ptr node) override { + auto index_scan = Node::MustAs(std::move(node->source)); + + if (HasGoodOrder()) { + // TODO: optimize plan with sorting order via the cost model + sort_removable = true; + + auto scan = GenerateScanFromOrder(); + return std::make_unique(std::move(scan), std::move(node->filter_expr)); + } else { + index = index_scan->index.get(); + + return TransformExpr(node->filter_expr.get()); + } + } + + std::unique_ptr TransformExpr(QueryExpr *node) { + if (auto v = dynamic_cast(node)) { + return VisitExpr(v); + } + if (auto v = dynamic_cast(node)) { + return VisitExpr(v); + } + if (auto v = dynamic_cast(node)) { + return VisitExpr(v); + } + if (auto v = dynamic_cast(node)) { + return VisitExpr(v); + } + if (auto v = dynamic_cast(node)) { + return VisitExpr(v); + } + + CHECK(false) << "unreachable"; + } + + std::unique_ptr MakeFullIndexFilter(QueryExpr *node) const { + return std::make_unique(std::make_unique(index->CloneAs()), + node->CloneAs()); + } + + std::unique_ptr VisitExpr(NotExpr *node) const { + // after PushDownNotExpr, `node->inner` should be one of TagContainExpr and NumericCompareExpr + return MakeFullIndexFilter(node); + } + + std::unique_ptr VisitExpr(TagContainExpr *node) const { + if (node->field->info->HasIndex()) { + return std::make_unique(node->field->CloneAs(), node->tag->val); + } + + return MakeFullIndexFilter(node); + } + + // enter only if there's just a single NumericCompareExpr, without and/or expression + std::unique_ptr VisitExpr(NumericCompareExpr *node) const { + if (node->field->info->HasIndex() && node->op != NumericCompareExpr::NE) { + IntervalSet is(node->op, node->num->val); + return PlanFromInterval(is, node->field.get(), SortByClause::ASC); + } + + return MakeFullIndexFilter(node); + } + + template + std::unique_ptr VisitExprImpl(Expr *node) { + struct AggregatedNodes { + std::set nodes; + IntervalSet intervals; + }; + + std::map agg_nodes; + std::vector> rest_nodes; + + for (const auto &n : node->inners) { + IntervalSet is; + const FieldInfo *field = nullptr; + + if (auto iter = intervals.find(n.get()); iter != intervals.end()) { + field = iter->second.field_info; + is = iter->second.intervals; + } else if (auto expr = dynamic_cast(n.get()); expr && expr->op != NumericCompareExpr::NE) { + field = expr->field->info; + is = IntervalSet(expr->op, expr->num->val); + } else { + rest_nodes.emplace_back(n.get()); + continue; + } + + if (!field->HasIndex()) { + rest_nodes.emplace_back(n.get()); + continue; + } + + if (auto jter = agg_nodes.find(field); jter != agg_nodes.end()) { + jter->second.nodes.emplace(n.get()); + if constexpr (std::is_same_v) { + jter->second.intervals = jter->second.intervals & is; + } else { + jter->second.intervals = jter->second.intervals | is; + } + } else { + rest_nodes.emplace_back(field); + agg_nodes.emplace(field, AggregatedNodes{std::set{n.get()}, is}); + } + } + + if constexpr (std::is_same_v) { + struct SelectionInfo { + std::unique_ptr plan; + std::set selected_nodes; + size_t cost; + + SelectionInfo(std::unique_ptr &&plan, std::set nodes) + : plan(std::move(plan)), selected_nodes(std::move(nodes)), cost(CostModel::Transform(this->plan.get())) {} + }; + + std::vector available_plans; + + available_plans.emplace_back(std::make_unique(index->CloneAs()), std::set{}); + + for (auto v : rest_nodes) { + if (std::holds_alternative(v)) { + auto n = std::get(v); + auto op = TransformExpr(n); + + available_plans.emplace_back(std::move(op), std::set{n}); + } else { + auto n = std::get(v); + const auto &agg_info = agg_nodes.at(n); + auto field_ref = std::make_unique(n->name, n); + available_plans.emplace_back(PlanFromInterval(agg_info.intervals, field_ref.get(), SortByClause::ASC), + agg_info.nodes); + } + } + + auto &best_plan = *std::min_element(available_plans.begin(), available_plans.end(), + [](const auto &l, const auto &r) { return l.cost < r.cost; }); + + std::vector> filter_nodes; + for (const auto &n : node->inners) { + if (best_plan.selected_nodes.count(n.get()) == 0) filter_nodes.push_back(n->template CloneAs()); + } + + if (filter_nodes.empty()) { + return std::move(best_plan.plan); + } else if (filter_nodes.size() == 1) { + return std::make_unique(std::move(best_plan.plan), std::move(filter_nodes.front())); + } else { + return std::make_unique(std::move(best_plan.plan), std::make_unique(std::move(filter_nodes))); + } + } else { + auto full_scan_plan = MakeFullIndexFilter(node); + + std::vector> merged_elems; + std::vector> elem_filter; + + auto add_filter = [&elem_filter](std::unique_ptr op) { + if (!elem_filter.empty()) { + std::unique_ptr filter = std::make_unique(OrExpr::Create(CloneExprs(elem_filter))); + + PushDownNotExpr pdne; + filter = Node::MustAs(pdne.Transform(std::move(filter))); + SimplifyAndOrExpr saoe; + filter = Node::MustAs(saoe.Transform(std::move(filter))); + + op = std::make_unique(std::move(op), std::move(filter)); + } + + return op; + }; + + for (auto v : rest_nodes) { + if (std::holds_alternative(v)) { + auto n = std::get(v); + auto op = add_filter(TransformExpr(n)); + + merged_elems.push_back(std::move(op)); + elem_filter.push_back(n->CloneAs()); + } else { + auto n = std::get(v); + const auto &agg_info = agg_nodes.at(n); + auto field_ref = std::make_unique(n->name, n); + auto elem = PlanFromInterval(agg_info.intervals, field_ref.get(), SortByClause::ASC); + elem = add_filter(std::move(elem)); + + merged_elems.push_back(std::move(elem)); + for (auto nn : agg_info.nodes) { + elem_filter.push_back(nn->template CloneAs()); + } + } + } + + auto merge_plan = Merge::Create(std::move(merged_elems)); + auto &best_plan = const_cast &>(std::min( + full_scan_plan, merge_plan, + [](const auto &l, const auto &r) { return CostModel::Transform(l.get()) < CostModel::Transform(r.get()); })); + + return std::move(best_plan); + } + } + + static std::vector> CloneExprs(const std::vector> &exprs) { + std::vector> result; + result.reserve(exprs.size()); + + for (const auto &e : exprs) result.push_back(e->CloneAs()); + return result; + } + + std::unique_ptr VisitExpr(AndExpr *node) { return VisitExprImpl(node); } + + std::unique_ptr VisitExpr(OrExpr *node) { return VisitExprImpl(node); } + + static std::unique_ptr PlanFromInterval(const IntervalSet &intervals, FieldRef *field, + SortByClause::Order order) { + std::vector> result; + if (order == SortByClause::ASC) { + for (const auto &[l, r] : intervals.intervals) { + result.push_back(std::make_unique(field->CloneAs(), Interval(l, r), order)); + } + } else { + for (const auto &[l, r] : ranges::views::reverse(intervals.intervals)) { + result.push_back(std::make_unique(field->CloneAs(), Interval(l, r), order)); + } + } + + return Merge::Create(std::move(result)); + } +}; + +} // namespace kqir diff --git a/src/search/passes/interval_analysis.h b/src/search/passes/interval_analysis.h index 59959979560..5ed56125195 100644 --- a/src/search/passes/interval_analysis.h +++ b/src/search/passes/interval_analysis.h @@ -29,6 +29,7 @@ #include "search/interval.h" #include "search/ir.h" #include "search/ir_pass.h" +#include "search/ir_plan.h" #include "type_util.h" namespace kqir { @@ -40,7 +41,13 @@ struct IntervalAnalysis : Visitor { IntervalSet intervals; }; - std::map result; + using Result = std::map; + + Result result; + const bool simplify_numeric_compare; + + explicit IntervalAnalysis(bool simplify_numeric_compare = false) + : simplify_numeric_compare(simplify_numeric_compare) {} void Reset() override { result.clear(); } @@ -92,13 +99,18 @@ struct IntervalAnalysis : Visitor { result.emplace(node.get(), IntervalInfo{elem.first, elem.second.field, elem.second.intervals}); } - for (const auto &[field, info] : interval_map) { - auto iter = std::remove_if(node->inners.begin(), node->inners.end(), - [&info = info](const auto &n) { return info.nodes.count(n.get()) == 1; }); - node->inners.erase(iter, node->inners.end()); + if (simplify_numeric_compare) { + for (const auto &[field, info] : interval_map) { + auto iter = std::remove_if(node->inners.begin(), node->inners.end(), + [&info = info](const auto &n) { return info.nodes.count(n.get()) == 1; }); + node->inners.erase(iter, node->inners.end()); + for (const auto &n : info.nodes) { + if (auto iter = result.find(n); iter != result.end()) result.erase(iter); + } - auto field_node = std::make_unique(field, info.field); - node->inners.emplace_back(GenerateFromInterval(info.intervals, field_node.get())); + auto field_node = std::make_unique(field, info.field); + node->inners.emplace_back(GenerateFromInterval(info.intervals, field_node.get())); + } } return node; diff --git a/src/search/passes/lower_to_plan.h b/src/search/passes/lower_to_plan.h index dad1db39c23..94828c22b2b 100644 --- a/src/search/passes/lower_to_plan.h +++ b/src/search/passes/lower_to_plan.h @@ -31,17 +31,27 @@ namespace kqir { struct LowerToPlan : Visitor { std::unique_ptr Visit(std::unique_ptr node) override { auto scan = std::make_unique(node->index->CloneAs()); - auto filter = std::make_unique(std::move(scan), std::move(node->query_expr)); - std::unique_ptr op = std::move(filter); - - // order is important here, since limit(sort(op)) is different from sort(limit(op)) - if (node->sort_by) { - op = std::make_unique(std::move(op), std::move(node->sort_by)); + std::unique_ptr op; + if (auto b = Node::As(std::move(node->query_expr))) { + if (b->val) { + op = std::move(scan); + } else { + op = std::make_unique(); + } + } else { + op = std::make_unique(std::move(scan), std::move(node->query_expr)); } - if (node->limit) { - op = std::make_unique(std::move(op), std::move(node->limit)); + if (!dynamic_cast(op.get())) { + // order is important here, since limit(sort(op)) is different from sort(limit(op)) + if (node->sort_by) { + op = std::make_unique(std::move(op), std::move(node->sort_by)); + } + + if (node->limit) { + op = std::make_unique(std::move(op), std::move(node->limit)); + } } return std::make_unique(std::move(op), std::move(node->select)); diff --git a/src/search/passes/manager.h b/src/search/passes/manager.h index 480e27a7ae9..e94a12e940f 100644 --- a/src/search/passes/manager.h +++ b/src/search/passes/manager.h @@ -20,14 +20,21 @@ #pragma once +#include #include +#include #include #include "search/ir.h" #include "search/ir_pass.h" +#include "search/passes/index_selection.h" +#include "search/passes/interval_analysis.h" +#include "search/passes/lower_to_plan.h" #include "search/passes/push_down_not_expr.h" #include "search/passes/simplify_and_or_expr.h" #include "search/passes/simplify_boolean.h" +#include "search/passes/sort_limit_fuse.h" +#include "type_util.h" namespace kqir { @@ -43,13 +50,35 @@ struct PassManager { } template - static PassSequence GeneratePasses() { + static PassSequence Create(Passes &&...passes) { + static_assert(std::conjunction_v>...>); + + PassSequence result; + result.reserve(sizeof...(passes)); + (result.push_back(std::make_unique(std::move(passes))), ...); + + return result; + } + + template + static PassSequence Merge(PassSeqs &&...seqs) { + static_assert(std::conjunction_v>...>); + static_assert(std::conjunction_v>...>); + PassSequence result; - (result.push_back(std::make_unique()), ...); + result.reserve((seqs.size() + ...)); + (result.insert(result.end(), std::make_move_iterator(seqs.begin()), std::make_move_iterator(seqs.end())), ...); + return result; } - static PassSequence ExprPasses() { return GeneratePasses(); } + static PassSequence ExprPasses() { + return Create(SimplifyAndOrExpr{}, PushDownNotExpr{}, SimplifyBoolean{}, SimplifyAndOrExpr{}); + } + static PassSequence NumericPasses() { return Create(IntervalAnalysis{true}, SimplifyAndOrExpr{}, SimplifyBoolean{}); } + static PassSequence PlanPasses() { return Create(LowerToPlan{}, IndexSelection{}, SortLimitFuse{}); } + + static PassSequence Default() { return Merge(ExprPasses(), NumericPasses(), PlanPasses()); } }; } // namespace kqir diff --git a/src/search/passes/sort_limit_fuse.h b/src/search/passes/sort_limit_fuse.h new file mode 100644 index 00000000000..0e857297175 --- /dev/null +++ b/src/search/passes/sort_limit_fuse.h @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +#pragma once + +#include + +#include "search/ir.h" +#include "search/ir_pass.h" +#include "search/ir_plan.h" + +namespace kqir { + +struct SortLimitFuse : Visitor { + std::unique_ptr Visit(std::unique_ptr node) override { + node = Node::MustAs(Visitor::Visit(std::move(node))); + + if (auto sort = Node::As(std::move(node->op))) { + return std::make_unique(std::move(sort->op), std::move(sort->order), std::move(node->limit)); + } + + return node; + } +}; + +} // namespace kqir diff --git a/tests/cppunit/ir_pass_test.cc b/tests/cppunit/ir_pass_test.cc index bfb630a5f2a..a02dc907395 100644 --- a/tests/cppunit/ir_pass_test.cc +++ b/tests/cppunit/ir_pass_test.cc @@ -20,6 +20,7 @@ #include "search/ir_pass.h" +#include "fmt/core.h" #include "gtest/gtest.h" #include "search/interval.h" #include "search/ir_sema_checker.h" @@ -113,7 +114,10 @@ TEST(IRPassTest, Manager) { TEST(IRPassTest, LowerToPlan) { LowerToPlan ltp; - ASSERT_EQ(ltp.Transform(*Parse("select * from a"))->Dump(), "project *: (filter true: full-scan a)"); + ASSERT_EQ(ltp.Transform(*Parse("select * from a"))->Dump(), "project *: full-scan a"); + ASSERT_EQ(ltp.Transform(*Parse("select * from a limit 1"))->Dump(), "project *: (limit 0, 1: full-scan a)"); + ASSERT_EQ(ltp.Transform(*Parse("select * from a where false"))->Dump(), "project *: noop"); + ASSERT_EQ(ltp.Transform(*Parse("select * from a where false limit 1"))->Dump(), "project *: noop"); ASSERT_EQ(ltp.Transform(*Parse("select * from a where b > 1"))->Dump(), "project *: (filter b > 1: full-scan a)"); ASSERT_EQ(ltp.Transform(*Parse("select a from b where c = 1 order by d"))->Dump(), "project a: (sort d, asc: (filter c = 1: full-scan b))"); @@ -124,7 +128,7 @@ TEST(IRPassTest, LowerToPlan) { } TEST(IRPassTest, IntervalAnalysis) { - auto ia_passes = PassManager::GeneratePasses(); + auto ia_passes = PassManager::Create(IntervalAnalysis{true}, SimplifyAndOrExpr{}, SimplifyBoolean{}); ASSERT_EQ(PassManager::Execute(ia_passes, *Parse("select * from a where a > 1 or a < 3"))->Dump(), "select * from a where true"); @@ -163,3 +167,105 @@ TEST(IRPassTest, IntervalAnalysis) { ASSERT_EQ(PassManager::Execute(ia_passes, *Parse("select * from a where a != 1 and b > 1 and b = 2"))->Dump(), "select * from a where (and a != 1, b = 2)"); } + +static IndexMap MakeIndexMap() { + auto f1 = FieldInfo("t1", std::make_unique()); + auto f2 = FieldInfo("t2", std::make_unique()); + f2.metadata->noindex = true; + auto f3 = FieldInfo("n1", std::make_unique()); + auto f4 = FieldInfo("n2", std::make_unique()); + auto f5 = FieldInfo("n3", std::make_unique()); + f5.metadata->noindex = true; + auto ia = IndexInfo("ia", SearchMetadata()); + ia.Add(std::move(f1)); + ia.Add(std::move(f2)); + ia.Add(std::move(f3)); + ia.Add(std::move(f4)); + ia.Add(std::move(f5)); + + auto& name = ia.name; + IndexMap res; + res.emplace(name, std::move(ia)); + return res; +} + +std::unique_ptr ParseS(SemaChecker& sc, const std::string& in) { + auto res = *Parse(in); + EXPECT_EQ(sc.Check(res.get()).Msg(), Status::ok_msg); + return res; +} + +TEST(IRPassTest, IndexSelection) { + auto index_map = MakeIndexMap(); + auto sc = SemaChecker(index_map); + + auto passes = PassManager::Default(); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia"))->Dump(), "project *: full-scan ia"); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia order by n1"))->Dump(), + "project *: numeric-scan n1, [-inf, inf), asc"); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia order by n1 limit 1"))->Dump(), + "project *: (limit 0, 1: numeric-scan n1, [-inf, inf), asc)"); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia order by n3"))->Dump(), + "project *: (sort n3, asc: full-scan ia)"); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia order by n3 limit 1"))->Dump(), + "project *: (top-n sort n3, asc, 0, 1: full-scan ia)"); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia where n2 = 1 order by n1"))->Dump(), + "project *: (filter n2 = 1: numeric-scan n1, [-inf, inf), asc)"); + + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia where n1 = 1"))->Dump(), + fmt::format("project *: numeric-scan n1, [1, {}), asc", IntervalSet::NextNum(1))); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia where n1 != 1"))->Dump(), + "project *: (filter n1 != 1: full-scan ia)"); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia where n1 >= 1"))->Dump(), + "project *: numeric-scan n1, [1, inf), asc"); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia where n1 >= 1 and n1 < 2"))->Dump(), + "project *: numeric-scan n1, [1, 2), asc"); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia where n1 >= 1 and n2 >= 2"))->Dump(), + "project *: (filter n2 >= 2: numeric-scan n1, [1, inf), asc)"); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia where n1 >= 1 and n2 = 2"))->Dump(), + fmt::format("project *: (filter n1 >= 1: numeric-scan n2, [2, {}), asc)", IntervalSet::NextNum(2))); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia where n1 >= 1 and n3 = 2"))->Dump(), + "project *: (filter n3 = 2: numeric-scan n1, [1, inf), asc)"); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia where n3 = 1"))->Dump(), + "project *: (filter n3 = 1: full-scan ia)"); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia where n1 = 1 and n3 = 2"))->Dump(), + fmt::format("project *: (filter n3 = 2: numeric-scan n1, [1, {}), asc)", IntervalSet::NextNum(1))); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia where n1 = 1 and t1 hastag \"a\""))->Dump(), + fmt::format("project *: (filter t1 hastag \"a\": numeric-scan n1, [1, {}), asc)", IntervalSet::NextNum(1))); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia where t1 hastag \"a\""))->Dump(), + "project *: tag-scan t1, a"); + ASSERT_EQ( + PassManager::Execute(passes, ParseS(sc, "select * from ia where t1 hastag \"a\" and t2 hastag \"a\""))->Dump(), + "project *: (filter t2 hastag \"a\": tag-scan t1, a)"); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia where t2 hastag \"a\""))->Dump(), + "project *: (filter t2 hastag \"a\": full-scan ia)"); + + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia where n1 >= 2 or n1 < 1"))->Dump(), + "project *: (merge numeric-scan n1, [-inf, 1), asc, numeric-scan n1, [2, inf), asc)"); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia where n1 >= 1 or n2 >= 2"))->Dump(), + "project *: (merge numeric-scan n1, [1, inf), asc, (filter n1 < 1: numeric-scan n2, [2, inf), asc))"); + ASSERT_EQ( + PassManager::Execute(passes, ParseS(sc, "select * from ia where n1 >= 1 or n2 = 2"))->Dump(), + fmt::format("project *: (merge numeric-scan n1, [1, inf), asc, (filter n1 < 1: numeric-scan n2, [2, {}), asc))", + IntervalSet::NextNum(2))); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia where n1 >= 1 or n3 = 2"))->Dump(), + "project *: (filter (or n1 >= 1, n3 = 2): full-scan ia)"); + ASSERT_EQ(PassManager::Execute(passes, ParseS(sc, "select * from ia where n1 = 1 or n3 = 2"))->Dump(), + "project *: (filter (or n1 = 1, n3 = 2): full-scan ia)"); + ASSERT_EQ( + PassManager::Execute(passes, ParseS(sc, "select * from ia where n1 = 1 or t1 hastag \"a\""))->Dump(), + fmt::format("project *: (merge tag-scan t1, a, (filter not t1 hastag \"a\": numeric-scan n1, [1, {}), asc))", + IntervalSet::NextNum(1))); + ASSERT_EQ( + PassManager::Execute(passes, ParseS(sc, "select * from ia where t1 hastag \"a\" or t2 hastag \"a\""))->Dump(), + "project *: (filter (or t1 hastag \"a\", t2 hastag \"a\"): full-scan ia)"); + ASSERT_EQ( + PassManager::Execute(passes, ParseS(sc, "select * from ia where t1 hastag \"a\" or t1 hastag \"b\""))->Dump(), + "project *: (merge tag-scan t1, a, (filter not t1 hastag \"a\": tag-scan t1, b))"); + + ASSERT_EQ( + PassManager::Execute( + passes, ParseS(sc, "select * from ia where (n1 < 2 or n1 >= 3) and (n1 >= 1 and n1 < 4) and not n3 != 1")) + ->Dump(), + "project *: (filter n3 = 1: (merge numeric-scan n1, [1, 2), asc, numeric-scan n1, [3, 4), asc))"); +} diff --git a/tests/cppunit/ir_sema_checker_test.cc b/tests/cppunit/ir_sema_checker_test.cc index 75beb68693e..9068a38b7c5 100644 --- a/tests/cppunit/ir_sema_checker_test.cc +++ b/tests/cppunit/ir_sema_checker_test.cc @@ -34,7 +34,7 @@ using namespace kqir; static auto Parse(const std::string& in) { return sql::ParseToIR(peg::string_input(in, "test")); } -IndexMap MakeIndexMap() { +static IndexMap MakeIndexMap() { auto f1 = FieldInfo("f1", std::make_unique()); auto f2 = FieldInfo("f2", std::make_unique()); auto f3 = FieldInfo("f3", std::make_unique());