From 6b1ef3b3fe100f73904042797d12f33404b52f22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CEricZequan=E2=80=9D?= Date: Mon, 14 Oct 2024 12:15:37 +0800 Subject: [PATCH 01/13] debug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: “EricZequan” --- dbms/src/Functions/FunctionsStringReplace.h | 123 +++++++++++++- dbms/src/Functions/FunctionsStringSearch.cpp | 160 +++++++++++++++++++ 2 files changed, 282 insertions(+), 1 deletion(-) diff --git a/dbms/src/Functions/FunctionsStringReplace.h b/dbms/src/Functions/FunctionsStringReplace.h index 604c2479bb0..1bb6064804a 100644 --- a/dbms/src/Functions/FunctionsStringReplace.h +++ b/dbms/src/Functions/FunctionsStringReplace.h @@ -122,10 +122,40 @@ class FunctionStringReplace : public IFunction ColumnWithTypeAndName & column_result = block.getByPosition(result); + bool src_const = column_src->isColumnConst(); bool needle_const = column_needle->isColumnConst(); bool replacement_const = column_replacement->isColumnConst(); - if (needle_const && replacement_const) + if (src_const){ + if (!needle_const && replacement_const) + { + executeImplConstReplacement( + column_src, + column_needle, + column_replacement, + pos, + occ, + match_type, + column_result + ); + }else if (!needle_const && !replacement_const) + { + executeImplConstFirstParaReplacement( + column_src, + column_needle, + column_replacement, + pos, + occ, + match_type, + column_result + ); + }else + { + throw Exception( + "UnImplement function.", + ErrorCodes::BAD_ARGUMENTS); + } + }else if (needle_const && replacement_const) { executeImpl(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); } @@ -400,6 +430,97 @@ class FunctionStringReplace : public IFunction } } + void executeImplConstFirstParaReplacement( + const ColumnPtr & column_src, + const ColumnPtr & column_needle, + const ColumnPtr & column_replacement, + Int64 pos [[maybe_unused]], + Int64 occ [[maybe_unused]], + const String & match_type, + ColumnWithTypeAndName & column_result) const + { + if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) + { + const auto * col_needle = typeid_cast(column_needle.get()); + const auto * col_replacement = typeid_cast(column_replacement.get()); + if (const auto * col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + Impl::vectorConstFirstParaReplacement( + col->getValue(), + col_needle->getChars(), + col_needle->getOffsets(), + col_replacement->getChars(), + col_replacement->getOffsets(), + pos, + occ, + match_type, + collator, + col_res->getChars(), + col_res->getOffsets() + ); + column_result.column = std::move(col_res); + } + else + throw Exception( + "Illegal column " + column_src->getName() + " of first argument of function " + getName() + + "Illegal column " + column_needle->getName() + " of second argument of function " + getName() + + "Illegal column " + column_replacement->getName() + " of third argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + else + { + throw Exception( + "Argument at index 1 for function replace must be constant", + ErrorCodes::ILLEGAL_COLUMN); + } + } + + void executeImplConstReplacement( + const ColumnPtr & column_src, + const ColumnPtr & column_needle, + const ColumnPtr & column_replacement, + Int64 pos [[maybe_unused]], + Int64 occ [[maybe_unused]], + const String & match_type, + ColumnWithTypeAndName & column_result) const + { + if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) + { + const auto * col_needle = typeid_cast(column_needle.get()); + const auto * col_replacement = typeid_cast(column_replacement.get()); + if (const auto * col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + Impl::vectorConstReplacement( + col->getValue(), + col_needle->getChars(), + col_needle->getOffsets(), + col_replacement->getValue(), + pos, + occ, + match_type, + collator, + col_res->getChars(), + col_res->getOffsets() + ); + column_result.column = std::move(col_res); + } + else + throw Exception( + "Illegal column " + column_src->getName() + " of first argument of function " + getName() + + "Illegal column " + column_needle->getName() + " of second argument of function " + getName() + + "Illegal column " + column_replacement->getName() + " of third argument of function " + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + else + { + throw Exception( + "Argument at index 1 and 3 for function replace must be constant", + ErrorCodes::ILLEGAL_COLUMN); + } + } + TiDB::TiDBCollatorPtr collator{}; }; } // namespace DB diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index 5b5318bcc30..7a2296fce46 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -1063,6 +1063,166 @@ struct ReplaceStringImpl } } + // Handle the case where `column_src` and `replace` are const + static void vectorConstReplacement( + const std::string & data, + const ColumnString::Chars_t & needle_chars, + const ColumnString::Offsets & needle_offsets, + const std::string & replace_chars, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + res_data.reserve(data.size()); + res_offsets.resize(needle_offsets.size()); + ColumnString::Offset res_offset = 0; + + for (size_t i = 0; i < needle_offsets.size(); ++i) + { + auto needle_offset = StringUtil::offsetAt(needle_offsets, i); + auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero + + // Copy the data without changing if `need_size` is null. + if (needle_size == 0) + { + res_data.resize(res_data.size() + data.size()); + memcpy(&res_data[res_offset], data.data(), data.size()); + res_offset += data.size(); + res_offsets[i] = res_offset; + continue; + } + + size_t pos_in_data = 0; // trace the location in `data` + int replace_cnt = 0; + + while (pos_in_data < data.size()) + { + bool match = true; + + // Check if there are enough characters to match + if (pos_in_data + needle_size > data.size() || (replace_one && replace_cnt > 0)) + match = false; + + // Check if characters match + for (size_t j = 0; match && j < needle_size; ++j) + if (data[pos_in_data + j] != needle_chars[needle_offset + j]) + match = false; + + // If it matches, replace `needle` with `replacement` + if (match) + { + ++replace_cnt; + res_data.resize(res_data.size() + replace_chars.size()); + memcpy(&res_data[res_offset], replace_chars.data(), replace_chars.size()); + res_offset += replace_chars.size(); + pos_in_data += needle_size; + } + else + { + // Copy unmatched characters + res_data.resize(res_data.size() + 1); + res_data[res_offset] = data[pos_in_data]; + res_offset += 1; + pos_in_data += 1; + } + + // If `replace_one` is enabled, stop after replacing once. + if (replace_one && replace_cnt > 0) + { + // Process the rest + res_data.resize(res_data.size() + (data.size() - pos_in_data)); + memcpy(&res_data[res_offset], data.data() + pos_in_data, data.size() - pos_in_data); + res_offset += (data.size() - pos_in_data); + break; + } + } + + res_offsets[i] = res_offset; + } + } + + // Handle the case where `column_src` are const + static void vectorConstFirstParaReplacement( + const std::string & data, + const ColumnString::Chars_t & needle_chars, + const ColumnString::Offsets & needle_offsets, + const ColumnString::Chars_t & replacement_chars, + const ColumnString::Offsets & replacement_offsets, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + res_data.reserve(data.size()); + res_offsets.resize(needle_offsets.size()); + ColumnString::Offset res_offset = 0; + + for (size_t i = 0; i < needle_offsets.size(); ++i) + { + // 获取当前 needle 的起始位置和大小 + auto needle_offset = StringUtil::offsetAt(needle_offsets, i); + auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // Ignore trailing zero bytes + + // If needle is empty, copy the entire data directly + if (needle_size == 0) + { + res_data.resize(res_data.size() + data.size()); + memcpy(&res_data[res_offset], data.data(), data.size()); + res_offset += data.size(); + res_offsets[i] = res_offset; + continue; + } + + Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, data.size()); + size_t pos_in_data = 0; // trace the location in `data` + + while (pos_in_data < data.size()) + { + const char* match = searcher.search(data.data() + pos_in_data, data.size() - pos_in_data); + + // Copy unmatched characters + size_t unmatched_len = match - (data.data() + pos_in_data); + res_data.resize(res_data.size() + unmatched_len); + memcpy(&res_data[res_offset], data.data() + pos_in_data, unmatched_len); + res_offset += unmatched_len; + pos_in_data += unmatched_len; + + if (match == data.data() + data.size()) + { + break; + } + + if (pos_in_data + needle_size <= data.size()) + { + auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); + auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // 忽略末尾的零字节 + + res_data.resize(res_data.size() + replacement_size); + memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); + res_offset += replacement_size; + pos_in_data += needle_size; + } + + // If `replace_one` is enabled, stop after replacing once. + if (replace_one) + { + size_t remaining_len = data.size() - pos_in_data; + res_data.resize(res_data.size() + remaining_len); + memcpy(&res_data[res_offset], data.data() + pos_in_data, remaining_len); + res_offset += remaining_len; + break; + } + } + + res_offsets[i] = res_offset; + } + } + static void vectorNonConstNeedleReplacement( const ColumnString::Chars_t & data, const ColumnString::Offsets & offsets, From 42be4d8bb37b0744af1a7857116151837ff2564b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CEricZequan=E2=80=9D?= Date: Mon, 14 Oct 2024 16:35:07 +0800 Subject: [PATCH 02/13] add 2 function to complete replace for constant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: “EricZequan” --- dbms/src/Functions/FunctionsStringReplace.h | 153 ++++++++++++++++--- dbms/src/Functions/FunctionsStringSearch.cpp | 98 +++++++++--- 2 files changed, 211 insertions(+), 40 deletions(-) diff --git a/dbms/src/Functions/FunctionsStringReplace.h b/dbms/src/Functions/FunctionsStringReplace.h index 1bb6064804a..25075f475b3 100644 --- a/dbms/src/Functions/FunctionsStringReplace.h +++ b/dbms/src/Functions/FunctionsStringReplace.h @@ -126,8 +126,20 @@ class FunctionStringReplace : public IFunction bool needle_const = column_needle->isColumnConst(); bool replacement_const = column_replacement->isColumnConst(); - if (src_const){ - if (!needle_const && replacement_const) + if (src_const) + { + if (needle_const && !replacement_const) + { + executeImplConstFirstThireParaReplacement( + column_src, + column_needle, + column_replacement, + pos, + occ, + match_type, + column_result); + } + else if (!needle_const && replacement_const) { executeImplConstReplacement( column_src, @@ -136,9 +148,9 @@ class FunctionStringReplace : public IFunction pos, occ, match_type, - column_result - ); - }else if (!needle_const && !replacement_const) + column_result); + } + else if (!needle_const && !replacement_const) { executeImplConstFirstParaReplacement( column_src, @@ -147,15 +159,21 @@ class FunctionStringReplace : public IFunction pos, occ, match_type, - column_result - ); - }else + column_result); + } + else { - throw Exception( - "UnImplement function.", - ErrorCodes::BAD_ARGUMENTS); + executeImplConstAllParaReplacement( + column_src, + column_needle, + column_replacement, + pos, + occ, + match_type, + column_result); } - }else if (needle_const && replacement_const) + } + else if (needle_const && replacement_const) { executeImpl(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); } @@ -457,21 +475,64 @@ class FunctionStringReplace : public IFunction match_type, collator, col_res->getChars(), - col_res->getOffsets() - ); + col_res->getOffsets()); column_result.column = std::move(col_res); } else throw Exception( - "Illegal column " + column_src->getName() + " of first argument of function " + getName() + - "Illegal column " + column_needle->getName() + " of second argument of function " + getName() + - "Illegal column " + column_replacement->getName() + " of third argument of function " + getName(), + "Illegal column " + column_src->getName() + " of first argument of function " + getName() + + "Illegal column " + column_needle->getName() + " of second argument of function " + getName() + + "Illegal column " + column_replacement->getName() + " of third argument of function " + + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + else + { + throw Exception("Argument at index 1 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); + } + } + + void executeImplConstFirstThireParaReplacement( + const ColumnPtr & column_src, + const ColumnPtr & column_needle, + const ColumnPtr & column_replacement, + Int64 pos [[maybe_unused]], + Int64 occ [[maybe_unused]], + const String & match_type, + ColumnWithTypeAndName & column_result) const + { + if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) + { + const auto * col_needle = typeid_cast(column_needle.get()); + const auto * col_replacement = typeid_cast(column_replacement.get()); + if (const auto * col = checkAndGetColumn(column_src.get())) + { + auto col_res = ColumnString::create(); + Impl::vectorConstFirstThireParaReplacement( + col->getValue(), + col_needle->getValue(), + col_replacement->getChars(), + col_replacement->getOffsets(), + pos, + occ, + match_type, + collator, + col_res->getChars(), + col_res->getOffsets()); + column_result.column = std::move(col_res); + } + else + throw Exception( + "Illegal column " + column_src->getName() + " of first argument of function " + getName() + + "Illegal column " + column_needle->getName() + " of second argument of function " + getName() + + "Illegal column " + column_replacement->getName() + " of third argument of function " + + getName(), ErrorCodes::ILLEGAL_COLUMN); } else { throw Exception( - "Argument at index 1 for function replace must be constant", + "Argument at index 1 and 2 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); } } @@ -502,15 +563,15 @@ class FunctionStringReplace : public IFunction match_type, collator, col_res->getChars(), - col_res->getOffsets() - ); + col_res->getOffsets()); column_result.column = std::move(col_res); } else throw Exception( - "Illegal column " + column_src->getName() + " of first argument of function " + getName() + - "Illegal column " + column_needle->getName() + " of second argument of function " + getName() + - "Illegal column " + column_replacement->getName() + " of third argument of function " + getName(), + "Illegal column " + column_src->getName() + " of first argument of function " + getName() + + "Illegal column " + column_needle->getName() + " of second argument of function " + getName() + + "Illegal column " + column_replacement->getName() + " of third argument of function " + + getName(), ErrorCodes::ILLEGAL_COLUMN); } else @@ -521,6 +582,52 @@ class FunctionStringReplace : public IFunction } } + void executeImplConstAllParaReplacement( + const ColumnPtr & column_src, + const ColumnPtr & column_needle, + const ColumnPtr & column_replacement, + Int64 pos [[maybe_unused]], + Int64 occ [[maybe_unused]], + const String & match_type, + ColumnWithTypeAndName & column_result) const + { + if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) + { + const auto * col_needle = typeid_cast(column_needle.get()); + const auto * col_replacement = typeid_cast(column_replacement.get()); + auto col_resss = ColumnString::create(); + if (const auto * col = checkAndGetColumn(column_src.get())) + { + std::string result_value; + Impl::constant( + col->getValue(), + col_needle->getValue(), + col_replacement->getValue(), + pos, + occ, + match_type, + collator, + result_value); + auto col_res = ColumnString::create(); + col_res->insert(result_value); + column_result.column = std::move(col_res); + } + else + throw Exception( + "Illegal column " + column_src->getName() + " of first argument of function " + getName() + + "Illegal column " + column_needle->getName() + " of second argument of function " + getName() + + "Illegal column " + column_replacement->getName() + " of third argument of function " + + getName(), + ErrorCodes::ILLEGAL_COLUMN); + } + else + { + throw Exception( + "Argument at index 1 and 2 and 3 for function replace must be constant", + ErrorCodes::ILLEGAL_COLUMN); + } + } + TiDB::TiDBCollatorPtr collator{}; }; } // namespace DB diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index 7a2296fce46..d9efe6fe298 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -1101,7 +1101,7 @@ struct ReplaceStringImpl while (pos_in_data < data.size()) { bool match = true; - + // Check if there are enough characters to match if (pos_in_data + needle_size > data.size() || (replace_one && replace_cnt > 0)) match = false; @@ -1146,17 +1146,17 @@ struct ReplaceStringImpl // Handle the case where `column_src` are const static void vectorConstFirstParaReplacement( - const std::string & data, - const ColumnString::Chars_t & needle_chars, - const ColumnString::Offsets & needle_offsets, - const ColumnString::Chars_t & replacement_chars, - const ColumnString::Offsets & replacement_offsets, - const Int64 & /* pos */, - const Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) + const std::string & data, + const ColumnString::Chars_t & needle_chars, + const ColumnString::Offsets & needle_offsets, + const ColumnString::Chars_t & replacement_chars, + const ColumnString::Offsets & replacement_offsets, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) { res_data.reserve(data.size()); res_offsets.resize(needle_offsets.size()); @@ -1166,7 +1166,7 @@ struct ReplaceStringImpl { // 获取当前 needle 的起始位置和大小 auto needle_offset = StringUtil::offsetAt(needle_offsets, i); - auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // Ignore trailing zero bytes + auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // Ignore trailing zero bytes // If needle is empty, copy the entire data directly if (needle_size == 0) @@ -1179,12 +1179,12 @@ struct ReplaceStringImpl } Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, data.size()); - size_t pos_in_data = 0; // trace the location in `data` + size_t pos_in_data = 0; // trace the location in `data` while (pos_in_data < data.size()) { - const char* match = searcher.search(data.data() + pos_in_data, data.size() - pos_in_data); - + const char * match = searcher.search(data.data() + pos_in_data, data.size() - pos_in_data); + // Copy unmatched characters size_t unmatched_len = match - (data.data() + pos_in_data); res_data.resize(res_data.size() + unmatched_len); @@ -1200,7 +1200,7 @@ struct ReplaceStringImpl if (pos_in_data + needle_size <= data.size()) { auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); - auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // 忽略末尾的零字节 + auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // 忽略末尾的零字节 res_data.resize(res_data.size() + replacement_size); memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); @@ -1223,6 +1223,70 @@ struct ReplaceStringImpl } } + // Handle the case where `column_src` and 'needle_chars' are const + static void vectorConstFirstThireParaReplacement( + const std::string & data, + const std::string & needle_chars, + const ColumnString::Chars_t & replacement_chars, + const ColumnString::Offsets & replacement_offsets, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + res_data.reserve(data.size()); + res_offsets.resize(replacement_offsets.size()); + ColumnString::Offset res_offset = 0; + + size_t needle_size = needle_chars.size(); + + for (size_t i = 0; i < replacement_offsets.size(); ++i) + { + size_t pos_in_data = 0; // trace the location in `data` + + while (pos_in_data < data.size()) + { + size_t match_pos = data.find(needle_chars, pos_in_data); + + if (match_pos == std::string::npos) + { + size_t remaining_len = data.size() - pos_in_data; + res_data.resize(res_data.size() + remaining_len); + memcpy(&res_data[res_offset], data.data() + pos_in_data, remaining_len); + res_offset += remaining_len; + break; + } + + size_t unmatched_len = match_pos - pos_in_data; + res_data.resize(res_data.size() + unmatched_len); + memcpy(&res_data[res_offset], data.data() + pos_in_data, unmatched_len); + res_offset += unmatched_len; + + auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); + auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // Ignore trailing zero bytes + res_data.resize(res_data.size() + replacement_size); + memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); + res_offset += replacement_size; + + pos_in_data = match_pos + needle_size; + + if (replace_one) + { + size_t remaining_len = data.size() - pos_in_data; + res_data.resize(res_data.size() + remaining_len); + memcpy(&res_data[res_offset], data.data() + pos_in_data, remaining_len); + res_offset += remaining_len; + break; + } + } + + res_offsets[i] = res_offset; + } + } + + static void vectorNonConstNeedleReplacement( const ColumnString::Chars_t & data, const ColumnString::Offsets & offsets, From f163a320b704d63369bcb2c8aa644e42e9bc542f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CEricZequan=E2=80=9D?= Date: Mon, 14 Oct 2024 16:41:46 +0800 Subject: [PATCH 03/13] remove chinese note MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: “EricZequan” --- dbms/src/Functions/FunctionsStringSearch.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index d9efe6fe298..7a96ba41659 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -1164,7 +1164,6 @@ struct ReplaceStringImpl for (size_t i = 0; i < needle_offsets.size(); ++i) { - // 获取当前 needle 的起始位置和大小 auto needle_offset = StringUtil::offsetAt(needle_offsets, i); auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // Ignore trailing zero bytes @@ -1200,7 +1199,7 @@ struct ReplaceStringImpl if (pos_in_data + needle_size <= data.size()) { auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); - auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // 忽略末尾的零字节 + auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; res_data.resize(res_data.size() + replacement_size); memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); From 92e88330705f1f9145224b3aa9a81a739d50fb83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CEricZequan=E2=80=9D?= Date: Mon, 14 Oct 2024 18:17:29 +0800 Subject: [PATCH 04/13] fix format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: “EricZequan” --- dbms/src/Functions/FunctionsStringSearch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index 7a96ba41659..ff925d2e026 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -1108,7 +1108,7 @@ struct ReplaceStringImpl // Check if characters match for (size_t j = 0; match && j < needle_size; ++j) - if (data[pos_in_data + j] != needle_chars[needle_offset + j]) + if (static_cast(data[pos_in_data + j]) != needle_chars[needle_offset + j]) match = false; // If it matches, replace `needle` with `replacement` @@ -1199,7 +1199,7 @@ struct ReplaceStringImpl if (pos_in_data + needle_size <= data.size()) { auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); - auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; + auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; res_data.resize(res_data.size() + replacement_size); memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); From fd4d75ee7f271d64d0aee08a309a23cc40b427b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CEricZequan=E2=80=9D?= Date: Mon, 14 Oct 2024 19:49:33 +0800 Subject: [PATCH 05/13] shorten code and add unit-test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: “EricZequan” --- dbms/src/Functions/FunctionsStringReplace.h | 280 ++++--------------- dbms/src/Functions/FunctionsStringSearch.cpp | 223 --------------- 2 files changed, 58 insertions(+), 445 deletions(-) diff --git a/dbms/src/Functions/FunctionsStringReplace.h b/dbms/src/Functions/FunctionsStringReplace.h index 25075f475b3..61103dec560 100644 --- a/dbms/src/Functions/FunctionsStringReplace.h +++ b/dbms/src/Functions/FunctionsStringReplace.h @@ -122,58 +122,10 @@ class FunctionStringReplace : public IFunction ColumnWithTypeAndName & column_result = block.getByPosition(result); - bool src_const = column_src->isColumnConst(); bool needle_const = column_needle->isColumnConst(); bool replacement_const = column_replacement->isColumnConst(); - if (src_const) - { - if (needle_const && !replacement_const) - { - executeImplConstFirstThireParaReplacement( - column_src, - column_needle, - column_replacement, - pos, - occ, - match_type, - column_result); - } - else if (!needle_const && replacement_const) - { - executeImplConstReplacement( - column_src, - column_needle, - column_replacement, - pos, - occ, - match_type, - column_result); - } - else if (!needle_const && !replacement_const) - { - executeImplConstFirstParaReplacement( - column_src, - column_needle, - column_replacement, - pos, - occ, - match_type, - column_result); - } - else - { - executeImplConstAllParaReplacement( - column_src, - column_needle, - column_replacement, - pos, - occ, - match_type, - column_result); - } - } - else if (needle_const && replacement_const) + if (needle_const && replacement_const) { executeImpl(column_src, column_needle, column_replacement, pos, occ, match_type, column_result); } @@ -280,7 +232,28 @@ class FunctionStringReplace : public IFunction const auto * col_replacement_const = typeid_cast(column_replacement.get()); auto replacement = col_replacement_const->getValue(); - if (const auto * col = checkAndGetColumn(column_src.get())) + bool col_const = column_src->isColumnConst(); + + if (col_const) + { + auto new_src = column_src->convertToFullColumnIfConst(); + const auto * col = typeid_cast(new_src.get()); + auto col_res = ColumnString::create(); + Impl::vectorNonConstNeedle( + col->getChars(), + col->getOffsets(), + col_needle->getChars(), + col_needle->getOffsets(), + replacement, + pos, + occ, + match_type, + collator, + col_res->getChars(), + col_res->getOffsets()); + column_result.column = std::move(col_res); + } + else if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vectorNonConstNeedle( @@ -340,8 +313,12 @@ class FunctionStringReplace : public IFunction auto needle = col_needle_const->getValue(); const auto * col_replacement = typeid_cast(column_replacement.get()); - if (const auto * col = checkAndGetColumn(column_src.get())) + bool col_const = column_src->isColumnConst(); + + if (col_const) { + auto new_src = column_src->convertToFullColumnIfConst(); + const auto * col = typeid_cast(new_src.get()); auto col_res = ColumnString::create(); Impl::vectorNonConstReplacement( col->getChars(), @@ -357,56 +334,13 @@ class FunctionStringReplace : public IFunction col_res->getOffsets()); column_result.column = std::move(col_res); } - else if (const auto * col = checkAndGetColumn(column_src.get())) - { - auto col_res = ColumnString::create(); - Impl::vectorFixedNonConstReplacement( - col->getChars(), - col->getN(), - needle, - col_replacement->getChars(), - col_replacement->getOffsets(), - pos, - occ, - match_type, - collator, - col_res->getChars(), - col_res->getOffsets()); - column_result.column = std::move(col_res); - } - else - throw Exception( - "Illegal column " + column_src->getName() + " of first argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN); - } - else - { - throw Exception("Argument at index 3 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); - } - } - - void executeImplNonConstNeedleReplacement( - const ColumnPtr & column_src, - const ColumnPtr & column_needle, - const ColumnPtr & column_replacement, - Int64 pos [[maybe_unused]], - Int64 occ [[maybe_unused]], - const String & match_type, - ColumnWithTypeAndName & column_result) const - { - if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) - { - const auto * col_needle = typeid_cast(column_needle.get()); - const auto * col_replacement = typeid_cast(column_replacement.get()); - - if (const auto * col = checkAndGetColumn(column_src.get())) + else if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); - Impl::vectorNonConstNeedleReplacement( + Impl::vectorNonConstReplacement( col->getChars(), col->getOffsets(), - col_needle->getChars(), - col_needle->getOffsets(), + needle, col_replacement->getChars(), col_replacement->getOffsets(), pos, @@ -420,11 +354,10 @@ class FunctionStringReplace : public IFunction else if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); - Impl::vectorFixedNonConstNeedleReplacement( + Impl::vectorFixedNonConstReplacement( col->getChars(), col->getN(), - col_needle->getChars(), - col_needle->getOffsets(), + needle, col_replacement->getChars(), col_replacement->getOffsets(), pos, @@ -442,13 +375,11 @@ class FunctionStringReplace : public IFunction } else { - throw Exception( - "Argument at index 2 and 3 for function replace must be constant", - ErrorCodes::ILLEGAL_COLUMN); + throw Exception("Argument at index 3 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); } } - void executeImplConstFirstParaReplacement( + void executeImplNonConstNeedleReplacement( const ColumnPtr & column_src, const ColumnPtr & column_needle, const ColumnPtr & column_replacement, @@ -461,11 +392,17 @@ class FunctionStringReplace : public IFunction { const auto * col_needle = typeid_cast(column_needle.get()); const auto * col_replacement = typeid_cast(column_replacement.get()); - if (const auto * col = checkAndGetColumn(column_src.get())) + + bool col_const = column_src->isColumnConst(); + + if (col_const) { + auto new_src = column_src->convertToFullColumnIfConst(); + const auto * col = typeid_cast(new_src.get()); auto col_res = ColumnString::create(); - Impl::vectorConstFirstParaReplacement( - col->getValue(), + Impl::vectorNonConstNeedleReplacement( + col->getChars(), + col->getOffsets(), col_needle->getChars(), col_needle->getOffsets(), col_replacement->getChars(), @@ -478,39 +415,14 @@ class FunctionStringReplace : public IFunction col_res->getOffsets()); column_result.column = std::move(col_res); } - else - throw Exception( - "Illegal column " + column_src->getName() + " of first argument of function " + getName() - + "Illegal column " + column_needle->getName() + " of second argument of function " + getName() - + "Illegal column " + column_replacement->getName() + " of third argument of function " - + getName(), - ErrorCodes::ILLEGAL_COLUMN); - } - else - { - throw Exception("Argument at index 1 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); - } - } - - void executeImplConstFirstThireParaReplacement( - const ColumnPtr & column_src, - const ColumnPtr & column_needle, - const ColumnPtr & column_replacement, - Int64 pos [[maybe_unused]], - Int64 occ [[maybe_unused]], - const String & match_type, - ColumnWithTypeAndName & column_result) const - { - if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) - { - const auto * col_needle = typeid_cast(column_needle.get()); - const auto * col_replacement = typeid_cast(column_replacement.get()); - if (const auto * col = checkAndGetColumn(column_src.get())) + else if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); - Impl::vectorConstFirstThireParaReplacement( - col->getValue(), - col_needle->getValue(), + Impl::vectorNonConstNeedleReplacement( + col->getChars(), + col->getOffsets(), + col_needle->getChars(), + col_needle->getOffsets(), col_replacement->getChars(), col_replacement->getOffsets(), pos, @@ -521,43 +433,16 @@ class FunctionStringReplace : public IFunction col_res->getOffsets()); column_result.column = std::move(col_res); } - else - throw Exception( - "Illegal column " + column_src->getName() + " of first argument of function " + getName() - + "Illegal column " + column_needle->getName() + " of second argument of function " + getName() - + "Illegal column " + column_replacement->getName() + " of third argument of function " - + getName(), - ErrorCodes::ILLEGAL_COLUMN); - } - else - { - throw Exception( - "Argument at index 1 and 2 for function replace must be constant", - ErrorCodes::ILLEGAL_COLUMN); - } - } - - void executeImplConstReplacement( - const ColumnPtr & column_src, - const ColumnPtr & column_needle, - const ColumnPtr & column_replacement, - Int64 pos [[maybe_unused]], - Int64 occ [[maybe_unused]], - const String & match_type, - ColumnWithTypeAndName & column_result) const - { - if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) - { - const auto * col_needle = typeid_cast(column_needle.get()); - const auto * col_replacement = typeid_cast(column_replacement.get()); - if (const auto * col = checkAndGetColumn(column_src.get())) + else if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); - Impl::vectorConstReplacement( - col->getValue(), + Impl::vectorFixedNonConstNeedleReplacement( + col->getChars(), + col->getN(), col_needle->getChars(), col_needle->getOffsets(), - col_replacement->getValue(), + col_replacement->getChars(), + col_replacement->getOffsets(), pos, occ, match_type, @@ -568,62 +453,13 @@ class FunctionStringReplace : public IFunction } else throw Exception( - "Illegal column " + column_src->getName() + " of first argument of function " + getName() - + "Illegal column " + column_needle->getName() + " of second argument of function " + getName() - + "Illegal column " + column_replacement->getName() + " of third argument of function " - + getName(), - ErrorCodes::ILLEGAL_COLUMN); - } - else - { - throw Exception( - "Argument at index 1 and 3 for function replace must be constant", - ErrorCodes::ILLEGAL_COLUMN); - } - } - - void executeImplConstAllParaReplacement( - const ColumnPtr & column_src, - const ColumnPtr & column_needle, - const ColumnPtr & column_replacement, - Int64 pos [[maybe_unused]], - Int64 occ [[maybe_unused]], - const String & match_type, - ColumnWithTypeAndName & column_result) const - { - if constexpr (Impl::support_non_const_needle && Impl::support_non_const_replacement) - { - const auto * col_needle = typeid_cast(column_needle.get()); - const auto * col_replacement = typeid_cast(column_replacement.get()); - auto col_resss = ColumnString::create(); - if (const auto * col = checkAndGetColumn(column_src.get())) - { - std::string result_value; - Impl::constant( - col->getValue(), - col_needle->getValue(), - col_replacement->getValue(), - pos, - occ, - match_type, - collator, - result_value); - auto col_res = ColumnString::create(); - col_res->insert(result_value); - column_result.column = std::move(col_res); - } - else - throw Exception( - "Illegal column " + column_src->getName() + " of first argument of function " + getName() - + "Illegal column " + column_needle->getName() + " of second argument of function " + getName() - + "Illegal column " + column_replacement->getName() + " of third argument of function " - + getName(), + "Illegal column " + column_src->getName() + " of first argument of function " + getName(), ErrorCodes::ILLEGAL_COLUMN); } else { throw Exception( - "Argument at index 1 and 2 and 3 for function replace must be constant", + "Argument at index 2 and 3 for function replace must be constant", ErrorCodes::ILLEGAL_COLUMN); } } diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index ff925d2e026..5b5318bcc30 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -1063,229 +1063,6 @@ struct ReplaceStringImpl } } - // Handle the case where `column_src` and `replace` are const - static void vectorConstReplacement( - const std::string & data, - const ColumnString::Chars_t & needle_chars, - const ColumnString::Offsets & needle_offsets, - const std::string & replace_chars, - const Int64 & /* pos */, - const Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - res_data.reserve(data.size()); - res_offsets.resize(needle_offsets.size()); - ColumnString::Offset res_offset = 0; - - for (size_t i = 0; i < needle_offsets.size(); ++i) - { - auto needle_offset = StringUtil::offsetAt(needle_offsets, i); - auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero - - // Copy the data without changing if `need_size` is null. - if (needle_size == 0) - { - res_data.resize(res_data.size() + data.size()); - memcpy(&res_data[res_offset], data.data(), data.size()); - res_offset += data.size(); - res_offsets[i] = res_offset; - continue; - } - - size_t pos_in_data = 0; // trace the location in `data` - int replace_cnt = 0; - - while (pos_in_data < data.size()) - { - bool match = true; - - // Check if there are enough characters to match - if (pos_in_data + needle_size > data.size() || (replace_one && replace_cnt > 0)) - match = false; - - // Check if characters match - for (size_t j = 0; match && j < needle_size; ++j) - if (static_cast(data[pos_in_data + j]) != needle_chars[needle_offset + j]) - match = false; - - // If it matches, replace `needle` with `replacement` - if (match) - { - ++replace_cnt; - res_data.resize(res_data.size() + replace_chars.size()); - memcpy(&res_data[res_offset], replace_chars.data(), replace_chars.size()); - res_offset += replace_chars.size(); - pos_in_data += needle_size; - } - else - { - // Copy unmatched characters - res_data.resize(res_data.size() + 1); - res_data[res_offset] = data[pos_in_data]; - res_offset += 1; - pos_in_data += 1; - } - - // If `replace_one` is enabled, stop after replacing once. - if (replace_one && replace_cnt > 0) - { - // Process the rest - res_data.resize(res_data.size() + (data.size() - pos_in_data)); - memcpy(&res_data[res_offset], data.data() + pos_in_data, data.size() - pos_in_data); - res_offset += (data.size() - pos_in_data); - break; - } - } - - res_offsets[i] = res_offset; - } - } - - // Handle the case where `column_src` are const - static void vectorConstFirstParaReplacement( - const std::string & data, - const ColumnString::Chars_t & needle_chars, - const ColumnString::Offsets & needle_offsets, - const ColumnString::Chars_t & replacement_chars, - const ColumnString::Offsets & replacement_offsets, - const Int64 & /* pos */, - const Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - res_data.reserve(data.size()); - res_offsets.resize(needle_offsets.size()); - ColumnString::Offset res_offset = 0; - - for (size_t i = 0; i < needle_offsets.size(); ++i) - { - auto needle_offset = StringUtil::offsetAt(needle_offsets, i); - auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // Ignore trailing zero bytes - - // If needle is empty, copy the entire data directly - if (needle_size == 0) - { - res_data.resize(res_data.size() + data.size()); - memcpy(&res_data[res_offset], data.data(), data.size()); - res_offset += data.size(); - res_offsets[i] = res_offset; - continue; - } - - Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, data.size()); - size_t pos_in_data = 0; // trace the location in `data` - - while (pos_in_data < data.size()) - { - const char * match = searcher.search(data.data() + pos_in_data, data.size() - pos_in_data); - - // Copy unmatched characters - size_t unmatched_len = match - (data.data() + pos_in_data); - res_data.resize(res_data.size() + unmatched_len); - memcpy(&res_data[res_offset], data.data() + pos_in_data, unmatched_len); - res_offset += unmatched_len; - pos_in_data += unmatched_len; - - if (match == data.data() + data.size()) - { - break; - } - - if (pos_in_data + needle_size <= data.size()) - { - auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); - auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; - - res_data.resize(res_data.size() + replacement_size); - memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); - res_offset += replacement_size; - pos_in_data += needle_size; - } - - // If `replace_one` is enabled, stop after replacing once. - if (replace_one) - { - size_t remaining_len = data.size() - pos_in_data; - res_data.resize(res_data.size() + remaining_len); - memcpy(&res_data[res_offset], data.data() + pos_in_data, remaining_len); - res_offset += remaining_len; - break; - } - } - - res_offsets[i] = res_offset; - } - } - - // Handle the case where `column_src` and 'needle_chars' are const - static void vectorConstFirstThireParaReplacement( - const std::string & data, - const std::string & needle_chars, - const ColumnString::Chars_t & replacement_chars, - const ColumnString::Offsets & replacement_offsets, - const Int64 & /* pos */, - const Int64 & /* occ */, - const std::string & /* match_type */, - TiDB::TiDBCollatorPtr /* collator */, - ColumnString::Chars_t & res_data, - ColumnString::Offsets & res_offsets) - { - res_data.reserve(data.size()); - res_offsets.resize(replacement_offsets.size()); - ColumnString::Offset res_offset = 0; - - size_t needle_size = needle_chars.size(); - - for (size_t i = 0; i < replacement_offsets.size(); ++i) - { - size_t pos_in_data = 0; // trace the location in `data` - - while (pos_in_data < data.size()) - { - size_t match_pos = data.find(needle_chars, pos_in_data); - - if (match_pos == std::string::npos) - { - size_t remaining_len = data.size() - pos_in_data; - res_data.resize(res_data.size() + remaining_len); - memcpy(&res_data[res_offset], data.data() + pos_in_data, remaining_len); - res_offset += remaining_len; - break; - } - - size_t unmatched_len = match_pos - pos_in_data; - res_data.resize(res_data.size() + unmatched_len); - memcpy(&res_data[res_offset], data.data() + pos_in_data, unmatched_len); - res_offset += unmatched_len; - - auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); - auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // Ignore trailing zero bytes - res_data.resize(res_data.size() + replacement_size); - memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); - res_offset += replacement_size; - - pos_in_data = match_pos + needle_size; - - if (replace_one) - { - size_t remaining_len = data.size() - pos_in_data; - res_data.resize(res_data.size() + remaining_len); - memcpy(&res_data[res_offset], data.data() + pos_in_data, remaining_len); - res_offset += remaining_len; - break; - } - } - - res_offsets[i] = res_offset; - } - } - - static void vectorNonConstNeedleReplacement( const ColumnString::Chars_t & data, const ColumnString::Offsets & offsets, From ec18d500aac95faf0425a0cd657f4e7e4739738b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CEricZequan=E2=80=9D?= Date: Mon, 14 Oct 2024 20:20:19 +0800 Subject: [PATCH 06/13] add unit test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: “EricZequan” --- .../Functions/tests/gtest_strings_replace.cpp | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/dbms/src/Functions/tests/gtest_strings_replace.cpp b/dbms/src/Functions/tests/gtest_strings_replace.cpp index 4615d634e5f..e897315ae0d 100644 --- a/dbms/src/Functions/tests/gtest_strings_replace.cpp +++ b/dbms/src/Functions/tests/gtest_strings_replace.cpp @@ -104,6 +104,38 @@ try toVec({" hello ", " h e llo", "hello ", " ", "hello, world"}), toVec({" ", "h", "", "h", ","}), toVec({"", "x", "xx", " ", ","}))); + + /// const src replacement + ASSERT_COLUMN_EQ( + toVec({"Good Night!", "Bad Afternoon", "Good Afterwhile"}), + executeFunction( + "replaceAll", + toVec({"Good Afternoon!"}), + toVec({"Afternoon", "Good", "noon"}), + toVec({"Night", "Bad", "while"}))); + + /// const src and needle replacement + ASSERT_COLUMN_EQ( + toVec({"Good Night!", "Good Bad!", "Good while!"}), + executeFunction( + "replaceAll", + toVec({"Good Afternoon!"}), + toConst({"Afternoon"}), + toVec({"Night", "Bad", "while"}))); + + /// const src and replace replacement + ASSERT_COLUMN_EQ( + toVec({"Good Night!", "Night Afternoon!", "Good AfterNight!"}), + executeFunction( + "replaceAll", + toVec({"Good Afternoon!"}), + toVec({"Afternoon", "Good", "noon"}), + toConst({"Night"}))); + + /// const src and replace replacement + ASSERT_COLUMN_EQ( + toVec({"Good Night!"}), + executeFunction("replaceAll", toVec({"Good Afternoon!"}), toConst({"Afternoon"}), toConst({"Night"}))); } CATCH From 3284676907f856f7c6178487c2806a26c5cac0de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CEricZequan=E2=80=9D?= Date: Mon, 14 Oct 2024 22:26:55 +0800 Subject: [PATCH 07/13] fix unit test fail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: “EricZequan” --- dbms/src/Functions/tests/gtest_strings_replace.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dbms/src/Functions/tests/gtest_strings_replace.cpp b/dbms/src/Functions/tests/gtest_strings_replace.cpp index e897315ae0d..453f6828c35 100644 --- a/dbms/src/Functions/tests/gtest_strings_replace.cpp +++ b/dbms/src/Functions/tests/gtest_strings_replace.cpp @@ -110,7 +110,7 @@ try toVec({"Good Night!", "Bad Afternoon", "Good Afterwhile"}), executeFunction( "replaceAll", - toVec({"Good Afternoon!"}), + toConst({"Good Afternoon!"}), toVec({"Afternoon", "Good", "noon"}), toVec({"Night", "Bad", "while"}))); @@ -119,7 +119,7 @@ try toVec({"Good Night!", "Good Bad!", "Good while!"}), executeFunction( "replaceAll", - toVec({"Good Afternoon!"}), + toConst({"Good Afternoon!"}), toConst({"Afternoon"}), toVec({"Night", "Bad", "while"}))); @@ -128,14 +128,14 @@ try toVec({"Good Night!", "Night Afternoon!", "Good AfterNight!"}), executeFunction( "replaceAll", - toVec({"Good Afternoon!"}), + toConst({"Good Afternoon!"}), toVec({"Afternoon", "Good", "noon"}), toConst({"Night"}))); /// const src and replace replacement ASSERT_COLUMN_EQ( toVec({"Good Night!"}), - executeFunction("replaceAll", toVec({"Good Afternoon!"}), toConst({"Afternoon"}), toConst({"Night"}))); + executeFunction("replaceAll", toConst({"Good Afternoon!"}), toConst({"Afternoon"}), toConst({"Night"}))); } CATCH From b09195baf37cf36ddafa427acd9f49d389b087d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CEricZequan=E2=80=9D?= Date: Mon, 14 Oct 2024 23:49:19 +0800 Subject: [PATCH 08/13] fix ci MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: “EricZequan” --- .../Functions/tests/gtest_strings_replace.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dbms/src/Functions/tests/gtest_strings_replace.cpp b/dbms/src/Functions/tests/gtest_strings_replace.cpp index 453f6828c35..a62e877a652 100644 --- a/dbms/src/Functions/tests/gtest_strings_replace.cpp +++ b/dbms/src/Functions/tests/gtest_strings_replace.cpp @@ -107,35 +107,35 @@ try /// const src replacement ASSERT_COLUMN_EQ( - toVec({"Good Night!", "Bad Afternoon", "Good Afterwhile"}), + toVec({"Good Night", "Bad Afternoon", "Good Afterwhile"}), executeFunction( "replaceAll", - toConst({"Good Afternoon!"}), + toConst({"Good Afternoon"}), toVec({"Afternoon", "Good", "noon"}), toVec({"Night", "Bad", "while"}))); /// const src and needle replacement ASSERT_COLUMN_EQ( - toVec({"Good Night!", "Good Bad!", "Good while!"}), + toVec({"Good Night", "Good Bad", "Good while"}), executeFunction( "replaceAll", - toConst({"Good Afternoon!"}), + toConst({"Good Afternoon"}), toConst({"Afternoon"}), toVec({"Night", "Bad", "while"}))); /// const src and replace replacement ASSERT_COLUMN_EQ( - toVec({"Good Night!", "Night Afternoon!", "Good AfterNight!"}), + toVec({"Good Night", "Night Afternoon", "Good AfterNight"}), executeFunction( "replaceAll", - toConst({"Good Afternoon!"}), + toConst({"Good Afternoon"}), toVec({"Afternoon", "Good", "noon"}), toConst({"Night"}))); /// const src and replace replacement ASSERT_COLUMN_EQ( - toVec({"Good Night!"}), - executeFunction("replaceAll", toConst({"Good Afternoon!"}), toConst({"Afternoon"}), toConst({"Night"}))); + toVec({"Good Night"}), + executeFunction("replaceAll", toConst({"Good Afternoon"}), toConst({"Afternoon"}), toConst({"Night"}))); } CATCH From ec6c7d0264a164e3057ac5374b38fd384f566a91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CEricZequan=E2=80=9D?= Date: Tue, 15 Oct 2024 10:50:01 +0800 Subject: [PATCH 09/13] fix ci MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: “EricZequan” --- dbms/src/Functions/FunctionsStringReplace.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/dbms/src/Functions/FunctionsStringReplace.h b/dbms/src/Functions/FunctionsStringReplace.h index 61103dec560..da0704cbfa4 100644 --- a/dbms/src/Functions/FunctionsStringReplace.h +++ b/dbms/src/Functions/FunctionsStringReplace.h @@ -179,7 +179,19 @@ class FunctionStringReplace : public IFunction auto needle = c1_const->getValue(); auto replacement = c2_const->getValue(); - if (const auto * col = checkAndGetColumn(column_src.get())) + bool col_const = column_src->isColumnConst(); + + if (col_const) + { + std::string result_value; + const auto * src_const = typeid_cast(column_src.get()); + auto src = src_const->getValue(); + Impl::constant(src, needle, replacement, pos, occ, match_type, collator, result_value); + auto col_res = ColumnString::create(); + col_res->insert(result_value); + column_result.column = std::move(col_res); + } + else if (const auto * col = checkAndGetColumn(column_src.get())) { auto col_res = ColumnString::create(); Impl::vector( From 6408ebd76702f3bb2338b8ef8884a1ed7f93be81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CEricZequan=E2=80=9D?= Date: Tue, 15 Oct 2024 15:06:32 +0800 Subject: [PATCH 10/13] fix comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: “EricZequan” --- dbms/src/Functions/FunctionsStringReplace.h | 35 ++++++++++----------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/dbms/src/Functions/FunctionsStringReplace.h b/dbms/src/Functions/FunctionsStringReplace.h index da0704cbfa4..08d0288714d 100644 --- a/dbms/src/Functions/FunctionsStringReplace.h +++ b/dbms/src/Functions/FunctionsStringReplace.h @@ -179,9 +179,7 @@ class FunctionStringReplace : public IFunction auto needle = c1_const->getValue(); auto replacement = c2_const->getValue(); - bool col_const = column_src->isColumnConst(); - - if (col_const) + if (const auto * col_const = checkAndGetColumnConst(column_src.get())) { std::string result_value; const auto * src_const = typeid_cast(column_src.get()); @@ -244,12 +242,12 @@ class FunctionStringReplace : public IFunction const auto * col_replacement_const = typeid_cast(column_replacement.get()); auto replacement = col_replacement_const->getValue(); - bool col_const = column_src->isColumnConst(); - - if (col_const) + if (const auto * col_const = checkAndGetColumnConst(column_src.get())) { - auto new_src = column_src->convertToFullColumnIfConst(); - const auto * col = typeid_cast(new_src.get()); + // using the data directly as a reference. + const auto & const_data = col_const->getDataColumn(); + const auto * col = typeid_cast(&const_data); + auto col_res = ColumnString::create(); Impl::vectorNonConstNeedle( col->getChars(), @@ -263,6 +261,7 @@ class FunctionStringReplace : public IFunction collator, col_res->getChars(), col_res->getOffsets()); + column_result.column = std::move(col_res); } else if (const auto * col = checkAndGetColumn(column_src.get())) @@ -325,12 +324,12 @@ class FunctionStringReplace : public IFunction auto needle = col_needle_const->getValue(); const auto * col_replacement = typeid_cast(column_replacement.get()); - bool col_const = column_src->isColumnConst(); - - if (col_const) + if (const auto * col_const = checkAndGetColumnConst(column_src.get())) { - auto new_src = column_src->convertToFullColumnIfConst(); - const auto * col = typeid_cast(new_src.get()); + // using the data directly as a reference. + const auto & const_data = col_const->getDataColumn(); + const auto * col = typeid_cast(&const_data); + auto col_res = ColumnString::create(); Impl::vectorNonConstReplacement( col->getChars(), @@ -405,12 +404,12 @@ class FunctionStringReplace : public IFunction const auto * col_needle = typeid_cast(column_needle.get()); const auto * col_replacement = typeid_cast(column_replacement.get()); - bool col_const = column_src->isColumnConst(); - - if (col_const) + if (const auto * col_const = checkAndGetColumnConst(column_src.get())) { - auto new_src = column_src->convertToFullColumnIfConst(); - const auto * col = typeid_cast(new_src.get()); + // using the data directly as a reference. + const auto & const_data = col_const->getDataColumn(); + const auto * col = typeid_cast(&const_data); + auto col_res = ColumnString::create(); Impl::vectorNonConstNeedleReplacement( col->getChars(), From b80338a30fcd3495467e9e002515aa2028eb9d7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CEricZequan=E2=80=9D?= Date: Tue, 15 Oct 2024 16:40:01 +0800 Subject: [PATCH 11/13] add function to solve constant case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: “EricZequan” --- dbms/src/Functions/FunctionsStringReplace.h | 6 +- dbms/src/Functions/FunctionsStringSearch.cpp | 219 +++++++++++++++++++ 2 files changed, 222 insertions(+), 3 deletions(-) diff --git a/dbms/src/Functions/FunctionsStringReplace.h b/dbms/src/Functions/FunctionsStringReplace.h index 08d0288714d..be09a70aeed 100644 --- a/dbms/src/Functions/FunctionsStringReplace.h +++ b/dbms/src/Functions/FunctionsStringReplace.h @@ -249,7 +249,7 @@ class FunctionStringReplace : public IFunction const auto * col = typeid_cast(&const_data); auto col_res = ColumnString::create(); - Impl::vectorNonConstNeedle( + Impl::vectorConstSrcAndReplace( col->getChars(), col->getOffsets(), col_needle->getChars(), @@ -331,7 +331,7 @@ class FunctionStringReplace : public IFunction const auto * col = typeid_cast(&const_data); auto col_res = ColumnString::create(); - Impl::vectorNonConstReplacement( + Impl::vectorConstSrcAndNeedle( col->getChars(), col->getOffsets(), needle, @@ -411,7 +411,7 @@ class FunctionStringReplace : public IFunction const auto * col = typeid_cast(&const_data); auto col_res = ColumnString::create(); - Impl::vectorNonConstNeedleReplacement( + Impl::vectorConstSrc( col->getChars(), col->getOffsets(), col_needle->getChars(), diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index 5b5318bcc30..ad00ade036d 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -972,6 +972,225 @@ struct ReplaceStringImpl } } + static void vectorConstSrcAndReplace( + const ColumnString::Chars_t & data, + const ColumnString::Offsets & offsets, + const ColumnString::Chars_t & needle_chars, + const ColumnString::Offsets & needle_offsets, + const std::string & replacement, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + res_data.reserve(data.size() * needle_offsets.size()); // Leave enough space to handle multiple lines + res_offsets.resize(needle_offsets.size()); + + ColumnString::Offset res_offset = 0; + + for (size_t i = 0; i < needle_offsets.size(); ++i) + { + auto data_offset = StringUtil::offsetAt(offsets, 0); // data have 1 rows. + auto data_size = StringUtil::sizeAt(offsets, 0); + + auto needle_offset = StringUtil::offsetAt(needle_offsets, i); + auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero + + const UInt8 * begin = &data[data_offset]; + const UInt8 * pos = begin; + const UInt8 * end = pos + data_size; + + if (needle_size == 0) + { + /// Copy the whole data to res without changing + res_data.resize(res_data.size() + data_size); + memcpy(&res_data[res_offset], begin, data_size); + res_offset += data_size; + res_offsets[i] = res_offset; + continue; + } + + Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, data_size); + while (pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + /// Copy the data without changing. + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + res_offset += match - pos; + + if (match == end) + { + break; + } + + res_data.resize(res_data.size() + replacement.size()); + memcpy(&res_data[res_offset], replacement.data(), replacement.size()); + res_offset += replacement.size(); + pos = match + needle_size; + + res_data.resize(res_data.size() + (end - pos)); + memcpy(&res_data[res_offset], pos, (end - pos)); + res_offset += (end - pos); + break; + } + res_offsets[i] = res_offset; + } + } + + static void vectorConstSrcAndNeedle( + const ColumnString::Chars_t & data, + const ColumnString::Offsets & /* offsets */, + const std::string & needle, + const ColumnString::Chars_t & replacement_chars, + const ColumnString::Offsets & replacement_offsets, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + const UInt8 * begin = &data[0]; + const UInt8 * end = begin + data.size(); + + ColumnString::Offset res_offset = 0; + res_data.reserve(data.size() * replacement_offsets.size()); // Leave enough space to handle multiple lines + res_offsets.resize(replacement_offsets.size()); + + if (needle.empty()) + { + for (size_t i = 0; i < replacement_offsets.size(); ++i) + { + res_data.resize(res_data.size() + data.size()); + memcpy(&res_data[res_offset], begin, data.size()); + res_offset += data.size(); + res_offsets[i] = res_offset; + } + return; + } + + for (size_t i = 0; i < replacement_offsets.size(); ++i) + { + const UInt8 * pos = begin; + Volnitsky searcher(needle.data(), needle.size(), end - pos); + + auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); + auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero + + while (pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + /// Copy the whole data to res without changing + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + res_offset += match - pos; + + if (match == end) + break; + + // Replace the matched part with the current replacement + res_data.resize(res_data.size() + replacement_size); + memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); + res_offset += replacement_size; + + pos = match + needle.size(); + + res_data.resize(res_data.size() + (end - pos)); + memcpy(&res_data[res_offset], pos, end - pos); + res_offset += (end - pos); + res_offsets[i] = res_offset; + break; + } + } + } + + static void vectorConstSrc( + const ColumnString::Chars_t & data, + const ColumnString::Offsets & offsets, + const ColumnString::Chars_t & needle_chars, + const ColumnString::Offsets & needle_offsets, + const ColumnString::Chars_t & replacement_chars, + const ColumnString::Offsets & replacement_offsets, + const Int64 & /* pos */, + const Int64 & /* occ */, + const std::string & /* match_type */, + TiDB::TiDBCollatorPtr /* collator */, + ColumnString::Chars_t & res_data, + ColumnString::Offsets & res_offsets) + { + res_data.reserve(data.size() * offsets.size()); // Reserve space in the result data and offsets + res_offsets.resize(offsets.size()); + + ColumnString::Offset res_offset = 0; + + for (size_t i = 0; i < offsets.size(); ++i) + { + auto data_offset = 0; // data have 1 rows. + auto data_size = data.size(); + + auto needle_offset = StringUtil::offsetAt(needle_offsets, i); + auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // Ignore the trailing zero + + auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); + auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // Ignore the trailing zero + + const UInt8 * begin = &data[data_offset]; + const UInt8 * pos = begin; + const UInt8 * end = pos + data_size; + + // Handle empty needle case + if (needle_size == 0) + { + res_data.resize(res_data.size() + data_size); + memcpy(&res_data[res_offset], begin, data_size); + res_offset += data_size; + res_offsets[i] = res_offset; + continue; + } + + // Search for the needle in the data + Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, data_size); + while (pos < end) + { + const UInt8 * match = searcher.search(pos, end - pos); + + // Copy the data before the match + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); + res_offset += match - pos; + + if (match == end) + { + break; + } + + // Replace the matched needle with the replacement + res_data.resize(res_data.size() + replacement_size); + memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); + res_offset += replacement_size; + + // Move the position pointer to after the matched needle + pos = match + needle_size; + + if (replace_one) + { + res_data.resize(res_data.size() + (end - pos)); + memcpy(&res_data[res_offset], pos, (end - pos)); + res_offset += (end - pos); + break; + } + } + + res_offsets[i] = res_offset; + } + } + + static void vectorNonConstReplacement( const ColumnString::Chars_t & data, const ColumnString::Offsets & offsets, From 5fc3f5ea1ee2d870e889821f57e8e3bc431faa26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CEricZequan=E2=80=9D?= Date: Wed, 16 Oct 2024 00:54:22 +0800 Subject: [PATCH 12/13] fix comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: “EricZequan” --- dbms/src/Functions/FunctionsStringReplace.h | 24 +-- dbms/src/Functions/FunctionsStringSearch.cpp | 160 ++++++++++++------- 2 files changed, 105 insertions(+), 79 deletions(-) diff --git a/dbms/src/Functions/FunctionsStringReplace.h b/dbms/src/Functions/FunctionsStringReplace.h index be09a70aeed..6b58f5c7370 100644 --- a/dbms/src/Functions/FunctionsStringReplace.h +++ b/dbms/src/Functions/FunctionsStringReplace.h @@ -244,14 +244,10 @@ class FunctionStringReplace : public IFunction if (const auto * col_const = checkAndGetColumnConst(column_src.get())) { - // using the data directly as a reference. - const auto & const_data = col_const->getDataColumn(); - const auto * col = typeid_cast(&const_data); - auto col_res = ColumnString::create(); + Impl::vectorConstSrcAndReplace( - col->getChars(), - col->getOffsets(), + col_const->getValue(), col_needle->getChars(), col_needle->getOffsets(), replacement, @@ -326,14 +322,10 @@ class FunctionStringReplace : public IFunction if (const auto * col_const = checkAndGetColumnConst(column_src.get())) { - // using the data directly as a reference. - const auto & const_data = col_const->getDataColumn(); - const auto * col = typeid_cast(&const_data); - auto col_res = ColumnString::create(); + Impl::vectorConstSrcAndNeedle( - col->getChars(), - col->getOffsets(), + col_const->getValue(), needle, col_replacement->getChars(), col_replacement->getOffsets(), @@ -406,14 +398,10 @@ class FunctionStringReplace : public IFunction if (const auto * col_const = checkAndGetColumnConst(column_src.get())) { - // using the data directly as a reference. - const auto & const_data = col_const->getDataColumn(); - const auto * col = typeid_cast(&const_data); - auto col_res = ColumnString::create(); + Impl::vectorConstSrc( - col->getChars(), - col->getOffsets(), + col_const->getValue(), col_needle->getChars(), col_needle->getOffsets(), col_replacement->getChars(), diff --git a/dbms/src/Functions/FunctionsStringSearch.cpp b/dbms/src/Functions/FunctionsStringSearch.cpp index ad00ade036d..e2b33a5f943 100644 --- a/dbms/src/Functions/FunctionsStringSearch.cpp +++ b/dbms/src/Functions/FunctionsStringSearch.cpp @@ -973,8 +973,7 @@ struct ReplaceStringImpl } static void vectorConstSrcAndReplace( - const ColumnString::Chars_t & data, - const ColumnString::Offsets & offsets, + const std::string & data, const ColumnString::Chars_t & needle_chars, const ColumnString::Offsets & needle_offsets, const std::string & replacement, @@ -985,20 +984,25 @@ struct ReplaceStringImpl ColumnString::Chars_t & res_data, ColumnString::Offsets & res_offsets) { - res_data.reserve(data.size() * needle_offsets.size()); // Leave enough space to handle multiple lines - res_offsets.resize(needle_offsets.size()); + auto data_col = ColumnString::create(); + data_col->insert(data); + const ColumnString::Chars_t & search_data = data_col->getChars(); + const ColumnString::Offsets & search_offsets = data_col->getOffsets(); + + res_data.reserve(search_data.size()); + res_offsets.resize(search_offsets.size()); ColumnString::Offset res_offset = 0; - for (size_t i = 0; i < needle_offsets.size(); ++i) + for (size_t i = 0; i < search_offsets.size(); ++i) { - auto data_offset = StringUtil::offsetAt(offsets, 0); // data have 1 rows. - auto data_size = StringUtil::sizeAt(offsets, 0); + auto data_offset = StringUtil::offsetAt(search_offsets, i); + auto data_size = StringUtil::sizeAt(search_offsets, i); auto needle_offset = StringUtil::offsetAt(needle_offsets, i); auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero - const UInt8 * begin = &data[data_offset]; + const UInt8 * begin = &search_data[data_offset]; const UInt8 * pos = begin; const UInt8 * end = pos + data_size; @@ -1024,6 +1028,7 @@ struct ReplaceStringImpl if (match == end) { + /// It's time to stop. break; } @@ -1032,18 +1037,21 @@ struct ReplaceStringImpl res_offset += replacement.size(); pos = match + needle_size; - res_data.resize(res_data.size() + (end - pos)); - memcpy(&res_data[res_offset], pos, (end - pos)); - res_offset += (end - pos); - break; + if (replace_one) + { + /// Copy the rest of data and stop. + res_data.resize(res_data.size() + (end - pos)); + memcpy(&res_data[res_offset], pos, (end - pos)); + res_offset += (end - pos); + break; + } } res_offsets[i] = res_offset; } } static void vectorConstSrcAndNeedle( - const ColumnString::Chars_t & data, - const ColumnString::Offsets & /* offsets */, + const std::string & data, const std::string & needle, const ColumnString::Chars_t & replacement_chars, const ColumnString::Offsets & replacement_offsets, @@ -1054,64 +1062,92 @@ struct ReplaceStringImpl ColumnString::Chars_t & res_data, ColumnString::Offsets & res_offsets) { - const UInt8 * begin = &data[0]; - const UInt8 * end = begin + data.size(); + // create a ColumnString which has 1 rows. + auto data_col = ColumnString::create(); + data_col->insert(data); + const ColumnString::Chars_t & search_data = data_col->getChars(); + const ColumnString::Offsets & search_offsets = data_col->getOffsets(); + + const UInt8 * begin = &search_data[0]; + const UInt8 * pos = begin; + const UInt8 * end = pos + search_data.size(); ColumnString::Offset res_offset = 0; - res_data.reserve(data.size() * replacement_offsets.size()); // Leave enough space to handle multiple lines - res_offsets.resize(replacement_offsets.size()); + res_data.reserve(search_data.size()); + size_t size = search_offsets.size(); + res_offsets.resize(size); if (needle.empty()) { - for (size_t i = 0; i < replacement_offsets.size(); ++i) - { - res_data.resize(res_data.size() + data.size()); - memcpy(&res_data[res_offset], begin, data.size()); - res_offset += data.size(); - res_offsets[i] = res_offset; - } + /// Copy all the data without changing. + res_data.resize(search_data.size()); + memcpy(&res_data[0], begin, search_data.size()); + memcpy(&res_offsets[0], &search_offsets[0], size * sizeof(UInt64)); return; } - for (size_t i = 0; i < replacement_offsets.size(); ++i) + /// The current index in the array of strings. + size_t i = 0; + + Volnitsky searcher(needle.data(), needle.size(), end - pos); + + /// We will search for the next occurrence in all rows at once. + while (pos < end) { - const UInt8 * pos = begin; - Volnitsky searcher(needle.data(), needle.size(), end - pos); + const UInt8 * match = searcher.search(pos, end - pos); - auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); - auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero + /// Copy the data without changing + res_data.resize(res_data.size() + (match - pos)); + memcpy(&res_data[res_offset], pos, match - pos); - while (pos < end) + /// Determine which index it belongs to. + while (i < search_offsets.size() && begin + search_offsets[i] <= match) { - const UInt8 * match = searcher.search(pos, end - pos); + res_offsets[i] = res_offset + ((begin + search_offsets[i]) - pos); + ++i; + } + res_offset += (match - pos); - /// Copy the whole data to res without changing - res_data.resize(res_data.size() + (match - pos)); - memcpy(&res_data[res_offset], pos, match - pos); - res_offset += match - pos; + /// If you have reached the end, it's time to stop + if (i == search_offsets.size()) + break; - if (match == end) - break; + /// Is it true that this line no longer needs to perform transformations. + bool can_finish_current_string = false; - // Replace the matched part with the current replacement + auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); + auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero + + /// We check that the entry does not go through the boundaries of strings. + if (match + needle.size() < begin + search_offsets[i]) + { res_data.resize(res_data.size() + replacement_size); memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); res_offset += replacement_size; - pos = match + needle.size(); + if (replace_one) + can_finish_current_string = true; + } + else + { + pos = match; + can_finish_current_string = true; + } - res_data.resize(res_data.size() + (end - pos)); - memcpy(&res_data[res_offset], pos, end - pos); - res_offset += (end - pos); + if (can_finish_current_string) + { + res_data.resize(res_data.size() + (begin + search_offsets[i] - pos)); + memcpy(&res_data[res_offset], pos, (begin + search_offsets[i] - pos)); + res_offset += (begin + search_offsets[i] - pos); res_offsets[i] = res_offset; - break; + pos = begin + search_offsets[i]; + ++i; } } } static void vectorConstSrc( - const ColumnString::Chars_t & data, - const ColumnString::Offsets & offsets, + const std::string & data, const ColumnString::Chars_t & needle_chars, const ColumnString::Offsets & needle_offsets, const ColumnString::Chars_t & replacement_chars, @@ -1123,27 +1159,32 @@ struct ReplaceStringImpl ColumnString::Chars_t & res_data, ColumnString::Offsets & res_offsets) { - res_data.reserve(data.size() * offsets.size()); // Reserve space in the result data and offsets - res_offsets.resize(offsets.size()); + // create a ColumnString which has 1 rows. + auto data_col = ColumnString::create(); + data_col->insert(data); + const ColumnString::Chars_t & search_data = data_col->getChars(); + const ColumnString::Offsets & search_offsets = data_col->getOffsets(); + + res_data.reserve(search_data.size()); + res_offsets.resize(search_offsets.size()); ColumnString::Offset res_offset = 0; - for (size_t i = 0; i < offsets.size(); ++i) + for (size_t i = 0; i < search_offsets.size(); ++i) { - auto data_offset = 0; // data have 1 rows. - auto data_size = data.size(); + auto data_offset = StringUtil::offsetAt(search_offsets, i); + auto data_size = StringUtil::sizeAt(search_offsets, i); auto needle_offset = StringUtil::offsetAt(needle_offsets, i); - auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // Ignore the trailing zero + auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i); - auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // Ignore the trailing zero + auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero - const UInt8 * begin = &data[data_offset]; + const UInt8 * begin = &search_data[data_offset]; const UInt8 * pos = begin; const UInt8 * end = pos + data_size; - // Handle empty needle case if (needle_size == 0) { res_data.resize(res_data.size() + data_size); @@ -1153,39 +1194,36 @@ struct ReplaceStringImpl continue; } - // Search for the needle in the data Volnitsky searcher(reinterpret_cast(&needle_chars[needle_offset]), needle_size, data_size); while (pos < end) { const UInt8 * match = searcher.search(pos, end - pos); - // Copy the data before the match + /// Copy the data without changing. res_data.resize(res_data.size() + (match - pos)); memcpy(&res_data[res_offset], pos, match - pos); res_offset += match - pos; if (match == end) { + /// It's time to stop. break; } - // Replace the matched needle with the replacement res_data.resize(res_data.size() + replacement_size); memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size); res_offset += replacement_size; - - // Move the position pointer to after the matched needle pos = match + needle_size; if (replace_one) { + /// Copy the rest of data and stop. res_data.resize(res_data.size() + (end - pos)); memcpy(&res_data[res_offset], pos, (end - pos)); res_offset += (end - pos); break; } } - res_offsets[i] = res_offset; } } From 464d9907af3950cf4754a1491a83450f53198c22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CEricZequan=E2=80=9D?= Date: Wed, 16 Oct 2024 10:45:48 +0800 Subject: [PATCH 13/13] fix ci MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: “EricZequan” --- dbms/src/Functions/tests/gtest_strings_replace.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dbms/src/Functions/tests/gtest_strings_replace.cpp b/dbms/src/Functions/tests/gtest_strings_replace.cpp index a62e877a652..dbe26a2ea46 100644 --- a/dbms/src/Functions/tests/gtest_strings_replace.cpp +++ b/dbms/src/Functions/tests/gtest_strings_replace.cpp @@ -110,7 +110,7 @@ try toVec({"Good Night", "Bad Afternoon", "Good Afterwhile"}), executeFunction( "replaceAll", - toConst({"Good Afternoon"}), + toVec({"Good Afternoon"}), toVec({"Afternoon", "Good", "noon"}), toVec({"Night", "Bad", "while"}))); @@ -119,7 +119,7 @@ try toVec({"Good Night", "Good Bad", "Good while"}), executeFunction( "replaceAll", - toConst({"Good Afternoon"}), + toVec({"Good Afternoon"}), toConst({"Afternoon"}), toVec({"Night", "Bad", "while"}))); @@ -128,14 +128,14 @@ try toVec({"Good Night", "Night Afternoon", "Good AfterNight"}), executeFunction( "replaceAll", - toConst({"Good Afternoon"}), + toVec({"Good Afternoon"}), toVec({"Afternoon", "Good", "noon"}), toConst({"Night"}))); /// const src and replace replacement ASSERT_COLUMN_EQ( toVec({"Good Night"}), - executeFunction("replaceAll", toConst({"Good Afternoon"}), toConst({"Afternoon"}), toConst({"Night"}))); + executeFunction("replaceAll", toVec({"Good Afternoon"}), toConst({"Afternoon"}), toConst({"Night"}))); } CATCH