From f00c4de31434884cc7b2bf9e7b295dfe5ce88e64 Mon Sep 17 00:00:00 2001 From: Jin Hai Date: Wed, 25 Dec 2024 11:20:47 +0800 Subject: [PATCH 1/2] Fix killing leader causing follower/learner core dump (#2406) ### What problem does this PR solve? When the leader is killed, the heartbeat of a follower/learner fails with a TTransportException of type NOT_OPEN, which previously fell through to the default branch and raised an unrecoverable error. This PR handles NOT_OPEN by reporting kCantConnectLeader instead. Issue link: #2399 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) Signed-off-by: Jin Hai --- src/network/peer_thrift_client.cpp | 5 +++++ src/storage/io/s3_client_minio.cpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/network/peer_thrift_client.cpp b/src/network/peer_thrift_client.cpp index dfb631a3dd..933dfd8a16 100644 --- a/src/network/peer_thrift_client.cpp +++ b/src/network/peer_thrift_client.cpp @@ -337,6 +337,11 @@ void PeerClient::HeartBeat(HeartBeatPeerTask *peer_task) { peer_task->error_code_ = static_cast(ErrorCode::kCantConnectLeader); return; } + case TTransportExceptionType::NOT_OPEN: { + peer_task->error_message_ = thrift_exception.what(); + peer_task->error_code_ = static_cast(ErrorCode::kCantConnectLeader); + return; + } default: { String error_message = "Heartbeat: error happens when data transfer to leader"; UnrecoverableError(error_message); diff --git a/src/storage/io/s3_client_minio.cpp b/src/storage/io/s3_client_minio.cpp index 17ef0467fb..a512b87aff 100644 --- a/src/storage/io/s3_client_minio.cpp +++ b/src/storage/io/s3_client_minio.cpp @@ -126,7 +126,7 @@ Status S3ClientMinio::BucketExists(const String &bucket_name) { return Status::MinioInvalidAccessKey(resp.Error().String()); } default: { - UnrecoverableError(fmt::format("Unable to do bucket existence check: {}", resp.Error().String())); + UnrecoverableError(fmt::format("Unable to do bucket existence check: {}, Please check if the MINIO connection", resp.Error().String())); return Status::OK(); } } From 31fa6b1e1f676e2ea10c6b14297600ef39b5ed40 Mon Sep 17 00:00:00 2001 From: yangzq50 Date: Wed, 25 Dec 2024 16:19:35 +0800 Subject: [PATCH 2/2] Support customizable whitespace analyzer (#2409) 
### What problem does this PR solve? Support user-provided delimiters for the whitespace analyzer. For example, the analyzer 'whitespace-#$@' treats each character of "#$@" as a delimiter. Resolves #2405 ### Type of change - [x] New Feature (non-breaking change which adds functionality) - [x] Refactoring - [x] Test cases --- src/common/analyzer/analyzer_pool.cpp | 14 ++++-- src/common/analyzer/analyzer_pool.cppm | 2 + src/common/analyzer/whitespace_analyzer.cpp | 40 ++++++++++++++--- src/common/analyzer/whitespace_analyzer.cppm | 3 ++ .../invertedindex/search/search_driver.cpp | 16 ++++--- .../fulltext/fulltext_whitespace_sharp.slt | 45 +++++++++++++++++++ 6 files changed, 103 insertions(+), 17 deletions(-) create mode 100644 test/sql/dql/fulltext/fulltext_whitespace_sharp.slt diff --git a/src/common/analyzer/analyzer_pool.cpp b/src/common/analyzer/analyzer_pool.cpp index 660a1727eb..6a709fcea8 100644 --- a/src/common/analyzer/analyzer_pool.cpp +++ b/src/common/analyzer/analyzer_pool.cpp @@ -46,6 +46,10 @@ constexpr u64 Str2Int(const char *str, u64 last_value = basis) { return (*str != '\0' && *str != '-') ? 
Str2Int(str + 1, (*str ^ last_value) * prime) : last_value; } +u64 AnalyzerPool::AnalyzerNameToInt(const char *str) { + return Str2Int(str); +} + bool IcharEquals(char a, char b) { return ToLower(static_cast(a)) == ToLower(static_cast(b)); } bool IEquals(std::string_view lhs, std::string_view rhs) { return std::ranges::equal(lhs, rhs, IcharEquals); } @@ -321,11 +325,13 @@ Tuple, Status> AnalyzerPool::GetAnalyzer(const std::string_v } return {MakeUnique(ngram), Status::OK()}; } - case Str2Int(KEYWORD.data()): { - return {MakeUnique(), Status::OK()}; - } + case Str2Int(KEYWORD.data()): case Str2Int(WHITESPACE.data()): { - return {MakeUnique(), Status::OK()}; + const auto suffix_pos = name.find_first_of('-'); + if (suffix_pos == std::string_view::npos || suffix_pos + 1 == name.size()) { + return {MakeUnique(), Status::OK()}; + } + return {MakeUnique(name.substr(suffix_pos + 1)), Status::OK()}; } default: { if(std::filesystem::is_regular_file(name)) { diff --git a/src/common/analyzer/analyzer_pool.cppm b/src/common/analyzer/analyzer_pool.cppm index 46a2e0cf08..804783cfe7 100644 --- a/src/common/analyzer/analyzer_pool.cppm +++ b/src/common/analyzer/analyzer_pool.cppm @@ -30,6 +30,8 @@ public: Tuple, Status> GetAnalyzer(const std::string_view &name); + static u64 AnalyzerNameToInt(const char *str); + void Set(const std::string_view &name); public: diff --git a/src/common/analyzer/whitespace_analyzer.cpp b/src/common/analyzer/whitespace_analyzer.cpp index fc76172a30..40927eef8e 100644 --- a/src/common/analyzer/whitespace_analyzer.cpp +++ b/src/common/analyzer/whitespace_analyzer.cpp @@ -23,14 +23,42 @@ import analyzer; namespace infinity { +WhitespaceAnalyzer::WhitespaceAnalyzer(const std::string_view delimiters) { + delimiters_ = delimiters; + std::sort(delimiters_.begin(), delimiters_.end()); + const auto last_unique = std::unique(delimiters_.begin(), delimiters_.end()); + delimiters_.erase(last_unique, delimiters_.end()); +} + int WhitespaceAnalyzer::AnalyzeImpl(const 
Term &input, void *data, HookType func) { - std::istringstream is(input.text_); - std::string t; - u32 offset = 0; - while (is >> t) { - func(data, t.data(), t.size(), offset++, 0, Term::AND, 0, false); + if (delimiters_.empty()) { + // split by std::isspace() + std::istringstream is(input.text_); + std::string t; + u32 offset = 0; + while (is >> t) { + func(data, t.data(), t.size(), offset++, 0, Term::AND, 0, false); + } + return 0; + } else { + // split by delimiters + const std::string_view delimiters = delimiters_; + const std::string_view input_text = input.text_; + u32 search_start = 0; + u32 offset = 0; + while (search_start < input_text.size()) { + const auto found = input_text.find_first_of(delimiters, search_start); + if (found == std::string_view::npos) { + func(data, input_text.data() + search_start, input_text.size() - search_start, offset++, 0, Term::AND, 0, false); + break; + } + if (found > search_start) { + func(data, input_text.data() + search_start, found - search_start, offset++, 0, Term::AND, 0, false); + } + search_start = found + 1; + } + return 0; } - return 0; } } // namespace infinity diff --git a/src/common/analyzer/whitespace_analyzer.cppm b/src/common/analyzer/whitespace_analyzer.cppm index d262b07c7f..be2e881838 100644 --- a/src/common/analyzer/whitespace_analyzer.cppm +++ b/src/common/analyzer/whitespace_analyzer.cppm @@ -22,8 +22,11 @@ import analyzer; namespace infinity { export class WhitespaceAnalyzer : public Analyzer { + String delimiters_{}; + public: WhitespaceAnalyzer() = default; + explicit WhitespaceAnalyzer(std::string_view delimiters); ~WhitespaceAnalyzer() override = default; protected: diff --git a/src/storage/invertedindex/search/search_driver.cpp b/src/storage/invertedindex/search/search_driver.cpp index f346928c23..6fe76da617 100644 --- a/src/storage/invertedindex/search/search_driver.cpp +++ b/src/storage/invertedindex/search/search_driver.cpp @@ -147,15 +147,16 @@ inline TermList GetTermListFromAnalyzer(const 
std::string &analyzer_name, Analyz } inline std::string GetAnalyzerName(const std::string &field, const std::map &field2analyzer) { - std::string analyzer_name = "standard"; if (!field.empty()) { if (const auto it = field2analyzer.find(field); it != field2analyzer.end()) { - analyzer_name = it->second; + return it->second; } } - return analyzer_name; + return "standard"; } +inline static const auto keyword_analyzer_name_int = AnalyzerPool::AnalyzerNameToInt(AnalyzerPool::KEYWORD.data()); + std::unique_ptr SearchDriver::ParseSingle(const std::string &query, const std::string *default_field_ptr) const { std::istringstream iss(query); if (!iss.good()) { @@ -166,7 +167,8 @@ std::unique_ptr SearchDriver::ParseSingle(const std::string &query, c } const auto &default_field = *default_field_ptr; const auto default_analyzer_name = GetAnalyzerName(default_field, field2analyzer_); - if (default_analyzer_name != "keyword" && operator_option_ == FulltextQueryOperatorOption::kInfinitySyntax) { + if (const auto default_analyzer_name_int = AnalyzerPool::AnalyzerNameToInt(default_analyzer_name.c_str()); + default_analyzer_name_int != keyword_analyzer_name_int && operator_option_ == FulltextQueryOperatorOption::kInfinitySyntax) { // use parser std::unique_ptr result; const auto scanner = std::make_unique(&iss); @@ -185,14 +187,14 @@ std::unique_ptr SearchDriver::ParseSingle(const std::string &query, c if (terms.empty()) { return nullptr; } - if (terms.size() == 1 && default_analyzer_name != "keyword") { + if (terms.size() == 1 && default_analyzer_name_int != keyword_analyzer_name_int) { auto q = std::make_unique(); q->term_ = terms.front().text_; q->column_ = default_field; return q; } std::unique_ptr multi_query; - if (default_analyzer_name == "keyword") { + if (default_analyzer_name_int == keyword_analyzer_name_int) { multi_query = std::make_unique(); } else if (operator_option_ == FulltextQueryOperatorOption::kOr) { multi_query = std::make_unique(); @@ -226,7 +228,7 @@ 
SearchDriver::AnalyzeAndBuildQueryNode(const std::string &field, const std::stri if (terms.empty()) { return nullptr; } - if (analyzer_name == "keyword") { + if (AnalyzerPool::AnalyzerNameToInt(analyzer_name.c_str()) == keyword_analyzer_name_int) { auto result = std::make_unique(); for (const auto &term : terms) { auto subquery = std::make_unique(); diff --git a/test/sql/dql/fulltext/fulltext_whitespace_sharp.slt b/test/sql/dql/fulltext/fulltext_whitespace_sharp.slt new file mode 100644 index 0000000000..b888da1d98 --- /dev/null +++ b/test/sql/dql/fulltext/fulltext_whitespace_sharp.slt @@ -0,0 +1,45 @@ + +statement ok +DROP TABLE IF EXISTS ft_whitespace_sharp; + +statement ok +CREATE TABLE ft_whitespace_sharp(num int, doc varchar DEFAULT 'default text'); + +statement ok +INSERT INTO ft_whitespace_sharp VALUES (1, '2020-01-01#2023-01-01'), (2, '2023@01$01'), (3, '01 01#@2023'), (4); + +statement ok +CREATE INDEX ft_index ON ft_whitespace_sharp(doc) USING FULLTEXT WITH (analyzer = 'whitespace-#@$'); + +query I +SELECT * FROM ft_whitespace_sharp; +---- +1 2020-01-01#2023-01-01 +2 2023@01$01 +3 01 01#@2023 +4 default text + +query I +SELECT * FROM ft_whitespace_sharp SEARCH MATCH TEXT ('doc^4.5', '2023-01-01^5.0', 'topn=10'); +---- +1 2020-01-01#2023-01-01 + +query II +SELECT * FROM ft_whitespace_sharp SEARCH MATCH TEXT ('doc^4.5', '"01 01"^3.3', 'topn=10'); +---- +3 01 01#@2023 + +query III +SELECT * FROM ft_whitespace_sharp SEARCH MATCH TEXT ('doc^4.5', '"01#01"^3.3', 'topn=10'); +---- +2 2023@01$01 + +query IV rowsort +SELECT * FROM ft_whitespace_sharp SEARCH MATCH TEXT ('doc^4.5', '2023^3.3', 'topn=10'); +---- +2 2023@01$01 +3 01 01#@2023 + +# Clean up +statement ok +DROP TABLE ft_whitespace_sharp;