Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support collation in position function #9601

Open
wants to merge 20 commits into
base: master
Choose a base branch
from
Open
327 changes: 301 additions & 26 deletions dbms/src/Functions/FunctionsString.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <fmt/core.h>

#include <boost/algorithm/string/predicate.hpp>
#include <cstring>
#include <ext/range.h>
#include <magic_enum.hpp>

Expand Down Expand Up @@ -5004,6 +5005,10 @@ class FunctionPosition : public IFunction
/// Returns `name`.
std::string getName() const override
{
    return name;
}
/// This function takes exactly two arguments.
size_t getNumberOfArguments() const override
{
    return 2;
}

/// Always reports true for this function.
bool useDefaultImplementationForConstants() const override
{
    return true;
}

/// Stores the given collator in the `collator` member; the search kernels
/// consult it to decide whether to compare through the collation.
void setCollator(const TiDB::TiDBCollatorPtr & collator_) override
{
    collator = collator_;
}

DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (arguments.size() != 2)
Expand All @@ -5022,52 +5027,322 @@ class FunctionPosition : public IFunction
const IColumn * c0_col = block.getByPosition(arguments[0]).column.get();
const auto * c0_const = checkAndGetColumn<ColumnConst>(c0_col);
const auto * c0_string = checkAndGetColumn<ColumnString>(c0_col);
Field c0_field;

const IColumn * c1_col = block.getByPosition(arguments[1]).column.get();
const auto * c1_const = checkAndGetColumn<ColumnConst>(c1_col);
const auto * c1_string = checkAndGetColumn<ColumnString>(c1_col);
Field c1_field;

if ((c0_const == nullptr && c0_string == nullptr) || (c1_const == nullptr && c1_string == nullptr))
throw Exception(
fmt::format("Illegal argument of function {}", getName()),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
size_t row_num = c0_col->size();

if (c0_col->size() != c1_col->size())
auto col_res = ColumnInt64::create();
PaddedPODArray<Int64> & vec_res = col_res->getData();
vec_res.resize(row_num);
xzhangxian1008 marked this conversation as resolved.
Show resolved Hide resolved

if (c0_const && c1_string)
{
const String & c0_str = c0_const->getValue<String>();
if unlikely (c0_str.empty())
{
for (size_t i = 0; i < row_num; i++)
vec_res[i] = 1;
}
else
{
if (collator != nullptr && collator->isCI())
xzhangxian1008 marked this conversation as resolved.
Show resolved Hide resolved
constVector<true>(
c0_const->getValue<String>(),
c1_string->getChars(),
c1_string->getOffsets(),
vec_res);
else
constVector<false>(
c0_const->getValue<String>(),
c1_string->getChars(),
c1_string->getOffsets(),
vec_res);
}
}
else if (c0_string && c1_string)
{
if (collator != nullptr && collator->isCI())
vectorVector<true>(
c0_string->getChars(),
c0_string->getOffsets(),
c1_string->getChars(),
c1_string->getOffsets(),
vec_res);
else
vectorVector<false>(
c0_string->getChars(),
c0_string->getOffsets(),
c1_string->getChars(),
c1_string->getOffsets(),
vec_res);
}
else if (c0_string && c1_const)
{
if (collator != nullptr && collator->isCI())
vectorConst<true>(
c0_string->getChars(),
c0_string->getOffsets(),
c1_const->getValue<String>(),
vec_res);
else
vectorConst<false>(
c0_string->getChars(),
c0_string->getOffsets(),
c1_const->getValue<String>(),
vec_res);
}
else
throw Exception(
fmt::format("Function {} column number is inconformity", getName()),
ErrorCodes::LOGICAL_ERROR);
"Illegal columns " + block.getByPosition(arguments[0]).column->getName() + " and "
+ block.getByPosition(arguments[1]).column->getName() + " of arguments of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);

auto col_res = ColumnInt64::create();
int val_num = c0_col->size();
col_res->reserve(val_num);
block.getByPosition(result).column = std::move(col_res);
}

/// Row-wise position search where both the needle (column 0) and the haystack
/// (column 1) are non-constant string columns.
///
/// For every row i the needle is looked up inside the haystack and res[i] becomes
///   - 1 if the needle is empty,
///   - 0 if the needle does not occur in the haystack,
///   - otherwise 1 + the number of characters that precede the match.
///
/// @tparam is_ci        true: compare through `collator` (needle via sortKeyNoTrim,
///                      haystack via convert) and locate the match with memmem;
///                      false: plain byte-wise search on the raw strings.
///                      `collator` is dereferenced without a null check when true.
/// @param col0_data     concatenated bytes of the needle column
/// @param col0_offsets  end offset of each needle in col0_data
/// @param col1_data     concatenated bytes of the haystack column
/// @param col1_offsets  end offset of each haystack in col1_data
/// @param res           output; must already hold at least col0_offsets.size() elements
template <bool is_ci>
void vectorVector(
    const ColumnString::Chars_t & col0_data,
    const ColumnString::Offsets & col0_offsets,
    const ColumnString::Chars_t & col1_data,
    const ColumnString::Offsets & col1_offsets,
    PaddedPODArray<Int64> & res) const
{
    size_t row_num = col0_offsets.size();
    ColumnString::Offset prev_col0_str_offset = 0;
    ColumnString::Offset prev_col1_str_offset = 0;

    // Scratch buffers handed to the collator on every row; declared outside the
    // loop so they are not reconstructed per row.
    String col0_container;
    String col1_container;
    std::vector<size_t> lens;

    for (size_t i = 0; i < row_num; i++)
    {
        // The "- 1" drops the one extra byte stored after each string
        // (presumably the terminating zero -- confirm against ColumnString).
        size_t col0_str_len = col0_offsets[i] - prev_col0_str_offset - 1;
        size_t col1_str_len = col1_offsets[i] - prev_col1_str_offset - 1;

        if unlikely (col0_str_len == 0)
        {
            // An empty needle is reported at position 1.
            res[i] = 1;
        }
        else
        {
            if constexpr (is_ci)
            {
                const StringRef & col0_collation_str = collator->sortKeyNoTrim(
                    reinterpret_cast<const char *>(&col0_data[prev_col0_str_offset]),
                    col0_str_len,
                    col0_container);
                // `lens` is filled by convert(); it is later used to map a byte
                // offset in the converted haystack back to a character count.
                const StringRef & col1_collation_str = collator->convert(
                    reinterpret_cast<const char *>(&col1_data[prev_col1_str_offset]),
                    col1_str_len,
                    col1_container,
                    &lens);
                void * res_start = memmem(
                    col1_collation_str.data,
                    col1_collation_str.size,
                    col0_collation_str.data,
                    col0_collation_str.size);

                if (res_start == nullptr)
                    res[i] = 0;
                else
                {
                    size_t pos = reinterpret_cast<const char *>(res_start) - col1_collation_str.data;
                    res[i] = 1 + getPositionWithCollationString(pos, lens);
                }
            }
            else
            {
                // The needle differs per row, so the searcher is rebuilt per row.
                LibCASCIICaseSensitiveStringSearcher searcher = LibCASCIICaseSensitiveStringSearcher(
                    reinterpret_cast<const char *>(&col0_data[prev_col0_str_offset]),
                    col0_str_len);

                // Byte offset of the match; equals col1_str_len when nothing was found.
                size_t pos = searcher.search(&col1_data[prev_col1_str_offset], &col1_data[col1_offsets[i] - 1])
                    - &col1_data[prev_col1_str_offset];

                if (pos != col1_str_len)
                    res[i] = 1
                        + getPositionUTF8(
                                 reinterpret_cast<const char *>(&col1_data[prev_col1_str_offset]),
                                 reinterpret_cast<const char *>(&col1_data[prev_col1_str_offset + pos]));
                else
                    res[i] = 0;
            }
        }

        prev_col0_str_offset = col0_offsets[i];
        prev_col1_str_offset = col1_offsets[i];
    }
}

block.getByPosition(result).column = std::move(col_res);
/// Row-wise position search where the needle (column 0) varies per row and the
/// haystack is one constant string.
///
/// For every row i, res[i] becomes
///   - 1 if the needle is empty,
///   - 0 if the needle does not occur in col1_str,
///   - otherwise 1 + the number of characters that precede the match.
///
/// @tparam is_ci        true: compare through `collator` and locate the match with
///                      memmem; false: plain byte-wise search on the raw strings.
///                      `collator` is dereferenced without a null check when true.
/// @param col0_data     concatenated bytes of the needle column
/// @param col0_offsets  end offset of each needle in col0_data
/// @param col1_str      the constant haystack
/// @param res           output; must already hold at least col0_offsets.size() elements
template <bool is_ci>
void vectorConst(
    const ColumnString::Chars_t & col0_data,
    const ColumnString::Offsets & col0_offsets,
    const String & col1_str,
    PaddedPODArray<Int64> & res) const
{
    size_t row_num = col0_offsets.size();
    ColumnString::Offset prev_col0_str_offset = 0;
    size_t col1_str_len = col1_str.size();

    String col0_container;
    String col1_container;
    std::vector<size_t> lens;
    StringRef col1_collation_str;

    // The haystack is constant, so it is converted once up front instead of per row.
    if constexpr (is_ci)
        col1_collation_str = collator->convert(col1_str.data(), col1_str.size(), col1_container, &lens);

    for (size_t i = 0; i < row_num; i++)
    {
        // The "- 1" drops the one extra byte stored after each string
        // (presumably the terminating zero -- confirm against ColumnString).
        size_t col0_str_len = col0_offsets[i] - prev_col0_str_offset - 1;

        if unlikely (col0_str_len == 0)
        {
            // An empty needle is reported at position 1.
            res[i] = 1;
        }
        else
        {
            if constexpr (is_ci)
            {
                const StringRef & col0_collation_str = collator->sortKeyNoTrim(
                    reinterpret_cast<const char *>(&col0_data[prev_col0_str_offset]),
                    col0_str_len,
                    col0_container);
                void * res_start = memmem(
                    col1_collation_str.data,
                    col1_collation_str.size,
                    col0_collation_str.data,
                    col0_collation_str.size);

                if (res_start == nullptr)
                    res[i] = 0;
                else
                {
                    size_t pos = reinterpret_cast<const char *>(res_start) - col1_collation_str.data;
                    res[i] = 1 + getPositionWithCollationString(pos, lens);
                }
            }
            else
            {
                // The needle differs per row, so the searcher is rebuilt per row.
                LibCASCIICaseSensitiveStringSearcher searcher = LibCASCIICaseSensitiveStringSearcher(
                    reinterpret_cast<const char *>(&col0_data[prev_col0_str_offset]),
                    col0_str_len);

                const char * haystack_begin = col1_str.c_str();
                const auto * haystack_bytes = reinterpret_cast<const UInt8 *>(haystack_begin);

                // Byte offset of the match; equals col1_str_len when nothing was found.
                size_t pos = searcher.search(haystack_bytes, haystack_bytes + col1_str_len) - haystack_bytes;

                if (pos != col1_str_len)
                    res[i] = 1 + getPositionUTF8(haystack_begin, haystack_begin + pos);
                else
                    res[i] = 0;
            }
        }

        prev_col0_str_offset = col0_offsets[i];
    }
}

/// Row-wise position search where the needle is one constant string and the
/// haystack (column 1) varies per row.
///
/// For every row i, res[i] becomes 0 if the needle does not occur in the i-th
/// haystack, otherwise 1 + the number of characters that precede the match.
/// There is no empty-needle shortcut in this function; the visible call site
/// handles an empty constant needle before dispatching here.
///
/// @tparam is_ci        true: compare through `collator` and locate the match with
///                      memmem; false: plain byte-wise search on the raw strings.
///                      `collator` is dereferenced without a null check when true.
/// @param col0_str      the constant needle
/// @param col1_data     concatenated bytes of the haystack column
/// @param col1_offsets  end offset of each haystack in col1_data
/// @param res           output; must already hold at least col1_offsets.size() elements
template <bool is_ci>
void constVector(
    const String & col0_str,
    const ColumnString::Chars_t & col1_data,
    const ColumnString::Offsets & col1_offsets,
    PaddedPODArray<Int64> & res) const
{
    size_t row_num = col1_offsets.size();
    ColumnString::Offset prev_col1_str_offset = 0;

    String col0_container;
    String col1_container;
    std::vector<size_t> lens;
    StringRef col0_collation_str;

    // The needle is constant, so its sort key is computed once up front.
    if constexpr (is_ci)
        col0_collation_str = collator->sortKeyNoTrim(col0_str.c_str(), col0_str.size(), col0_container);

    // This construction will be wasted when is_ci is true, but it's acceptable
    LibCASCIICaseSensitiveStringSearcher searcher_cs
        = LibCASCIICaseSensitiveStringSearcher(col0_str.data(), col0_str.size());

    for (size_t i = 0; i < row_num; i++)
    {
        // The "- 1" drops the one extra byte stored after each string
        // (presumably the terminating zero -- confirm against ColumnString).
        size_t col1_str_len = col1_offsets[i] - prev_col1_str_offset - 1;

        if constexpr (is_ci)
        {
            // `lens` is filled by convert(); it is later used to map a byte
            // offset in the converted haystack back to a character count.
            const StringRef & col1_collation_str = collator->convert(
                reinterpret_cast<const char *>(&col1_data[prev_col1_str_offset]),
                col1_str_len,
                col1_container,
                &lens);
            void * res_start = memmem(
                col1_collation_str.data,
                col1_collation_str.size,
                col0_collation_str.data,
                col0_collation_str.size);

            if (res_start == nullptr)
                res[i] = 0;
            else
            {
                size_t pos = reinterpret_cast<const char *>(res_start) - col1_collation_str.data;
                res[i] = 1 + getPositionWithCollationString(pos, lens);
            }
        }
        else
        {
            // Byte offset of the match; equals col1_str_len when nothing was found.
            size_t pos = searcher_cs.search(&col1_data[prev_col1_str_offset], &col1_data[col1_offsets[i] - 1])
                - &col1_data[prev_col1_str_offset];

            if (pos != col1_str_len)
                res[i] = 1
                    + getPositionUTF8(
                             reinterpret_cast<const char *>(&col1_data[prev_col1_str_offset]),
                             reinterpret_cast<const char *>(&col1_data[prev_col1_str_offset + pos]));
            else
                res[i] = 0;
        }

        prev_col1_str_offset = col1_offsets[i];
    }
}

private:
/// Counts the bytes in [begin, end) that are not UTF-8 continuation octets,
/// i.e. the number of characters that start inside the range.
///
/// @param begin  first byte of the range
/// @param end    one past the last byte of the range; must be reachable from begin
/// @return       the count described above (0 when begin == end)
static Int64 getPositionUTF8(const char * begin, const char * end)
{
    size_t res = 0;
    for (const char * it = begin; it != end; ++it)
        if (!UTF8::isContinuationOctet(static_cast<UInt8>(*it)))
            ++res;
    return res;
}

const auto * data = reinterpret_cast<const UInt8 *>(c1_str.data());
return static_cast<size_t>(UTF8::countCodePoints(data, idx) + 1);
/// Maps a byte offset inside a collation-converted string back to a count of
/// entries of `lens`, i.e. how many entries begin before byte `pos`.
///
/// `lens[k]` is presumably the number of bytes the k-th character occupies in
/// the converted string (it is filled by the collator's convert()) -- confirm.
///
/// @param pos   byte offset of a match inside the converted string
/// @param lens  per-entry byte lengths of that converted string
/// @return      number of entries that start before `pos`; 0 when pos == 0
static Int64 getPositionWithCollationString(size_t pos, const std::vector<size_t> & lens)
{
    Int64 actual_pos = 0;
    // The idx bound keeps the loop inside `lens` even if `pos` lies past the
    // total length described by it.
    for (size_t idx = 0; pos > 0 && idx < lens.size(); ++idx)
    {
        ++actual_pos;
        const size_t step = lens[idx];
        // `pos` falls inside this entry (offset not on an entry boundary).
        // Subtracting would wrap the unsigned value and run off the end of
        // `lens`, so stop here instead.
        if (step > pos)
            break;
        pos -= step;
    }
    return actual_pos;
}

TiDB::TiDBCollatorPtr collator{};
};

class FunctionSubStringIndex : public IFunction
Expand Down
Loading