Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix sql failed when using replace function #9524

Draft
wants to merge 14 commits into
base: master
Choose a base branch
from
83 changes: 79 additions & 4 deletions dbms/src/Functions/FunctionsStringReplace.h
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,17 @@ class FunctionStringReplace : public IFunction
auto needle = c1_const->getValue<String>();
auto replacement = c2_const->getValue<String>();

if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
if (const auto * col_const = checkAndGetColumnConst<ColumnString>(column_src.get()))
{
std::string result_value;
const auto * src_const = typeid_cast<const ColumnConst *>(column_src.get());
auto src = src_const->getValue<String>();
Impl::constant(src, needle, replacement, pos, occ, match_type, collator, result_value);
auto col_res = ColumnString::create();
col_res->insert(result_value);
column_result.column = std::move(col_res);
}
else if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
{
auto col_res = ColumnString::create();
Impl::vector(
Expand Down Expand Up @@ -232,7 +242,29 @@ class FunctionStringReplace : public IFunction
const auto * col_replacement_const = typeid_cast<const ColumnConst *>(column_replacement.get());
auto replacement = col_replacement_const->getValue<String>();

if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
if (const auto * col_const = checkAndGetColumnConst<ColumnString>(column_src.get()))
{
// using the data directly as a reference.
const auto & const_data = col_const->getDataColumn();
const auto * col = typeid_cast<const ColumnString *>(&const_data);
Copy link
Contributor

@gengliqi gengliqi Oct 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's wrong. The length of this column is just 1. You will find some errors if you correctly add a test for this case.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right. Now I add vectorConstSrcAndReplacevectorConstSrcAndNeedlevectorConstSrc to handle the corresponding case to ensure the same number of rows.


auto col_res = ColumnString::create();
Impl::vectorConstSrcAndReplace(
col->getChars(),
col->getOffsets(),
col_needle->getChars(),
col_needle->getOffsets(),
replacement,
pos,
occ,
match_type,
collator,
col_res->getChars(),
col_res->getOffsets());

column_result.column = std::move(col_res);
}
else if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
{
auto col_res = ColumnString::create();
Impl::vectorNonConstNeedle(
Expand Down Expand Up @@ -292,7 +324,28 @@ class FunctionStringReplace : public IFunction
auto needle = col_needle_const->getValue<String>();
const auto * col_replacement = typeid_cast<const ColumnString *>(column_replacement.get());

if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
if (const auto * col_const = checkAndGetColumnConst<ColumnString>(column_src.get()))
{
// using the data directly as a reference.
const auto & const_data = col_const->getDataColumn();
const auto * col = typeid_cast<const ColumnString *>(&const_data);

auto col_res = ColumnString::create();
Impl::vectorConstSrcAndNeedle(
col->getChars(),
col->getOffsets(),
needle,
col_replacement->getChars(),
col_replacement->getOffsets(),
pos,
occ,
match_type,
collator,
col_res->getChars(),
col_res->getOffsets());
column_result.column = std::move(col_res);
}
else if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
{
auto col_res = ColumnString::create();
Impl::vectorNonConstReplacement(
Expand Down Expand Up @@ -351,7 +404,29 @@ class FunctionStringReplace : public IFunction
const auto * col_needle = typeid_cast<const ColumnString *>(column_needle.get());
const auto * col_replacement = typeid_cast<const ColumnString *>(column_replacement.get());

if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
if (const auto * col_const = checkAndGetColumnConst<ColumnString>(column_src.get()))
{
// using the data directly as a reference.
const auto & const_data = col_const->getDataColumn();
const auto * col = typeid_cast<const ColumnString *>(&const_data);

auto col_res = ColumnString::create();
Impl::vectorConstSrc(
col->getChars(),
col->getOffsets(),
col_needle->getChars(),
col_needle->getOffsets(),
col_replacement->getChars(),
col_replacement->getOffsets(),
pos,
occ,
match_type,
collator,
col_res->getChars(),
col_res->getOffsets());
column_result.column = std::move(col_res);
}
else if (const auto * col = checkAndGetColumn<ColumnString>(column_src.get()))
{
auto col_res = ColumnString::create();
Impl::vectorNonConstNeedleReplacement(
Expand Down
219 changes: 219 additions & 0 deletions dbms/src/Functions/FunctionsStringSearch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -972,6 +972,225 @@ struct ReplaceStringImpl
}
}

static void vectorConstSrcAndReplace(
const ColumnString::Chars_t & data,
const ColumnString::Offsets & offsets,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the data is const, how about using const std::string &?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated. PTAL~

const ColumnString::Chars_t & needle_chars,
const ColumnString::Offsets & needle_offsets,
const std::string & replacement,
const Int64 & /* pos */,
const Int64 & /* occ */,
const std::string & /* match_type */,
TiDB::TiDBCollatorPtr /* collator */,
ColumnString::Chars_t & res_data,
ColumnString::Offsets & res_offsets)
{
res_data.reserve(data.size() * needle_offsets.size()); // Leave enough space to handle multiple lines
res_offsets.resize(needle_offsets.size());

ColumnString::Offset res_offset = 0;

for (size_t i = 0; i < needle_offsets.size(); ++i)
{
auto data_offset = StringUtil::offsetAt(offsets, 0); // data have 1 rows.
auto data_size = StringUtil::sizeAt(offsets, 0);

auto needle_offset = StringUtil::offsetAt(needle_offsets, i);
auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // ignore the trailing zero

const UInt8 * begin = &data[data_offset];
const UInt8 * pos = begin;
const UInt8 * end = pos + data_size;

if (needle_size == 0)
{
/// Copy the whole data to res without changing
res_data.resize(res_data.size() + data_size);
memcpy(&res_data[res_offset], begin, data_size);
res_offset += data_size;
res_offsets[i] = res_offset;
continue;
}

Volnitsky searcher(reinterpret_cast<const char *>(&needle_chars[needle_offset]), needle_size, data_size);
while (pos < end)
{
const UInt8 * match = searcher.search(pos, end - pos);

/// Copy the data without changing.
res_data.resize(res_data.size() + (match - pos));
memcpy(&res_data[res_offset], pos, match - pos);
res_offset += match - pos;

if (match == end)
{
break;
}

res_data.resize(res_data.size() + replacement.size());
memcpy(&res_data[res_offset], replacement.data(), replacement.size());
res_offset += replacement.size();
pos = match + needle_size;

res_data.resize(res_data.size() + (end - pos));
memcpy(&res_data[res_offset], pos, (end - pos));
res_offset += (end - pos);
break;
}
res_offsets[i] = res_offset;
}
}

static void vectorConstSrcAndNeedle(
const ColumnString::Chars_t & data,
const ColumnString::Offsets & /* offsets */,
const std::string & needle,
const ColumnString::Chars_t & replacement_chars,
const ColumnString::Offsets & replacement_offsets,
const Int64 & /* pos */,
const Int64 & /* occ */,
const std::string & /* match_type */,
TiDB::TiDBCollatorPtr /* collator */,
ColumnString::Chars_t & res_data,
ColumnString::Offsets & res_offsets)
{
const UInt8 * begin = &data[0];
const UInt8 * end = begin + data.size();

ColumnString::Offset res_offset = 0;
res_data.reserve(data.size() * replacement_offsets.size()); // Leave enough space to handle multiple lines
res_offsets.resize(replacement_offsets.size());

if (needle.empty())
{
for (size_t i = 0; i < replacement_offsets.size(); ++i)
{
res_data.resize(res_data.size() + data.size());
memcpy(&res_data[res_offset], begin, data.size());
res_offset += data.size();
res_offsets[i] = res_offset;
}
return;
}

for (size_t i = 0; i < replacement_offsets.size(); ++i)
{
const UInt8 * pos = begin;
Volnitsky searcher(needle.data(), needle.size(), end - pos);

auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i);
auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // ignore the trailing zero

while (pos < end)
{
const UInt8 * match = searcher.search(pos, end - pos);

/// Copy the whole data to res without changing
res_data.resize(res_data.size() + (match - pos));
memcpy(&res_data[res_offset], pos, match - pos);
res_offset += match - pos;

if (match == end)
break;

// Replace the matched part with the current replacement
res_data.resize(res_data.size() + replacement_size);
memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size);
res_offset += replacement_size;

pos = match + needle.size();

res_data.resize(res_data.size() + (end - pos));
memcpy(&res_data[res_offset], pos, end - pos);
res_offset += (end - pos);
res_offsets[i] = res_offset;
break;
}
}
}

static void vectorConstSrc(
const ColumnString::Chars_t & data,
const ColumnString::Offsets & offsets,
const ColumnString::Chars_t & needle_chars,
const ColumnString::Offsets & needle_offsets,
const ColumnString::Chars_t & replacement_chars,
const ColumnString::Offsets & replacement_offsets,
const Int64 & /* pos */,
const Int64 & /* occ */,
const std::string & /* match_type */,
TiDB::TiDBCollatorPtr /* collator */,
ColumnString::Chars_t & res_data,
ColumnString::Offsets & res_offsets)
{
res_data.reserve(data.size() * offsets.size()); // Reserve space in the result data and offsets
res_offsets.resize(offsets.size());

ColumnString::Offset res_offset = 0;

for (size_t i = 0; i < offsets.size(); ++i)
{
auto data_offset = 0; // data have 1 rows.
auto data_size = data.size();

auto needle_offset = StringUtil::offsetAt(needle_offsets, i);
auto needle_size = StringUtil::sizeAt(needle_offsets, i) - 1; // Ignore the trailing zero

auto replacement_offset = StringUtil::offsetAt(replacement_offsets, i);
auto replacement_size = StringUtil::sizeAt(replacement_offsets, i) - 1; // Ignore the trailing zero

const UInt8 * begin = &data[data_offset];
const UInt8 * pos = begin;
const UInt8 * end = pos + data_size;

// Handle empty needle case
if (needle_size == 0)
{
res_data.resize(res_data.size() + data_size);
memcpy(&res_data[res_offset], begin, data_size);
res_offset += data_size;
res_offsets[i] = res_offset;
continue;
}

// Search for the needle in the data
Volnitsky searcher(reinterpret_cast<const char *>(&needle_chars[needle_offset]), needle_size, data_size);
while (pos < end)
{
const UInt8 * match = searcher.search(pos, end - pos);

// Copy the data before the match
res_data.resize(res_data.size() + (match - pos));
memcpy(&res_data[res_offset], pos, match - pos);
res_offset += match - pos;

if (match == end)
{
break;
}

// Replace the matched needle with the replacement
res_data.resize(res_data.size() + replacement_size);
memcpy(&res_data[res_offset], &replacement_chars[replacement_offset], replacement_size);
res_offset += replacement_size;

// Move the position pointer to after the matched needle
pos = match + needle_size;

if (replace_one)
{
res_data.resize(res_data.size() + (end - pos));
memcpy(&res_data[res_offset], pos, (end - pos));
res_offset += (end - pos);
break;
}
}

res_offsets[i] = res_offset;
}
}


static void vectorNonConstReplacement(
const ColumnString::Chars_t & data,
const ColumnString::Offsets & offsets,
Expand Down
32 changes: 32 additions & 0 deletions dbms/src/Functions/tests/gtest_strings_replace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,38 @@ try
toVec({" hello ", " h e llo", "hello ", " ", "hello, world"}),
toVec({" ", "h", "", "h", ","}),
toVec({"", "x", "xx", " ", ","})));

/// const src replacement
ASSERT_COLUMN_EQ(
toVec({"Good Night", "Bad Afternoon", "Good Afterwhile"}),
executeFunction(
"replaceAll",
toConst({"Good Afternoon"}),
toVec({"Afternoon", "Good", "noon"}),
toVec({"Night", "Bad", "while"})));

/// const src and needle replacement
ASSERT_COLUMN_EQ(
toVec({"Good Night", "Good Bad", "Good while"}),
executeFunction(
"replaceAll",
toConst({"Good Afternoon"}),
toConst({"Afternoon"}),
toVec({"Night", "Bad", "while"})));

/// const src and replace replacement
ASSERT_COLUMN_EQ(
toVec({"Good Night", "Night Afternoon", "Good AfterNight"}),
executeFunction(
"replaceAll",
toConst({"Good Afternoon"}),
toVec({"Afternoon", "Good", "noon"}),
toConst({"Night"})));

/// const src and replace replacement
ASSERT_COLUMN_EQ(
toVec({"Good Night"}),
executeFunction("replaceAll", toConst({"Good Afternoon"}), toConst({"Afternoon"}), toConst({"Night"})));
}
CATCH

Expand Down