Skip to content

Commit

Permalink
perf: 解耦unicode_escape
Browse files Browse the repository at this point in the history
  • Loading branch information
MistEO committed May 24, 2024
1 parent b402505 commit ecae640
Showing 1 changed file with 105 additions and 90 deletions.
195 changes: 105 additions & 90 deletions include/parser/parser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ class parser
bool skip_string_literal_with_accel();
bool skip_whitespace() noexcept;
bool skip_digit();
bool skip_unicode_escape(uint16_t& pair_high);

private:
parsing_iter_t _cur;
Expand Down Expand Up @@ -420,113 +421,34 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
}
switch (*_cur) {
case '"':
result.push_back('"');
result.emplace_back('"');
break;
case '\\':
result.push_back('\\');
result.emplace_back('\\');
break;
case '/':
result.push_back('/');
result.emplace_back('/');
break;
case 'b':
result.push_back('\b');
result.emplace_back('\b');
break;
case 'f':
result.push_back('\f');
result.emplace_back('\f');
break;
case 'n':
result.push_back('\n');
result.emplace_back('\n');
break;
case 'r':
result.push_back('\r');
result.emplace_back('\r');
break;
case 't':
result.push_back('\t');
result.emplace_back('\t');
break;
case 'u': {
uint16_t cp = 0;
for (int i = 0; i < 4; i++) {
++_cur;
if (_cur == _end) {
return std::nullopt;
}
if (!std::isxdigit(static_cast<unsigned char>(*_cur))) {
return std::nullopt;
}
cp <<= 4;
if ('0' <= *_cur && *_cur <= '9') {
cp |= *_cur - '0';
}
else if ('a' <= *_cur && *_cur <= 'f') {
cp |= *_cur - 'a' + 10;
}
else if ('A' <= *_cur && *_cur <= 'F') {
cp |= *_cur - 'A' + 10;
}
else {
return std::nullopt;
}
}
uint32_t ext_cp = cp;
uint16_t hi_cp = 0, lo_cp = 0;
if (0xD800 <= cp && cp <= 0xDBFF) {
if (pair_high) {
return std::nullopt;
} else {
pair_high = cp;
break;
}
} else if (0xDC00 <= cp && cp <= 0xDFFF) {
if (!pair_high) {
return std::nullopt;
} else {
ext_cp = (((pair_high - 0xD800) << 10) | (cp - 0xDC00)) + 0x10000;
hi_cp = pair_high;
lo_cp = cp;
pair_high = 0;
}
}
if constexpr (std::is_same_v<typename string_t::value_type, char>) {
// utf8
if (ext_cp <= 0x7F) {
result.push_back(static_cast<char>(ext_cp));
}
else if (ext_cp <= 0x7FF) {
result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00011111) | 0b11000000u));
result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
}
else if (ext_cp <= 0xFFFF) {
result.push_back(
static_cast<char>(((ext_cp >> 12) & 0b00001111) | 0b11100000u));
result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
} else {
result.push_back(
static_cast<char>(((ext_cp >> 18) & 0b00000111) | 0b11110000u));
result.push_back(static_cast<char>(((ext_cp >> 12) & 0b00111111) | 0b10000000u));
result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
}
}
else if constexpr (std::is_same_v<typename string_t::value_type, wchar_t>) {
if constexpr (sizeof (wchar_t) == 4) {
result.push_back(static_cast<wchar_t>(ext_cp));
} else if constexpr (sizeof (wchar_t) == 2) {
if (ext_cp <= 0xFFFF) {
result.push_back(static_cast<wchar_t>(ext_cp));
} else {
result.push_back(static_cast<wchar_t>(hi_cp));
result.push_back(static_cast<wchar_t>(lo_cp));
}
} else {
static_assert(!sizeof(typename string_t::value_type), "Unsupported wchar");
}
}
else {
static_assert(!sizeof(typename string_t::value_type), "Unsupported type");
case 'u':
if (!skip_unicode_escape(pair_high)) {
return std::nullopt;
}
break;
}
default:
// Illegal backslash escape
return std::nullopt;
Expand All @@ -552,6 +474,99 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
return std::nullopt;
}

template <typename string_t, typename parsing_t, typename accel_traits>
inline bool parser<string_t, parsing_t, accel_traits>::skip_unicode_escape(uint16_t& pair_high)
{
uint16_t cp = 0;
for (int i = 0; i < 4; i++) {
++_cur;
if (_cur == _end) {
return false;
}
if (!std::isxdigit(static_cast<unsigned char>(*_cur))) {
return false;
}
cp <<= 4;
if ('0' <= *_cur && *_cur <= '9') {
cp |= *_cur - '0';
}
else if ('a' <= *_cur && *_cur <= 'f') {
cp |= *_cur - 'a' + 10;
}
else if ('A' <= *_cur && *_cur <= 'F') {
cp |= *_cur - 'A' + 10;
}
else {
return false;
}
}

uint32_t ext_cp = cp;
uint16_t hi_cp = 0, lo_cp = 0;

if (0xD800 <= cp && cp <= 0xDBFF) {
if (pair_high) {
return false;
}
pair_high = cp;
return true;
}

if (0xDC00 <= cp && cp <= 0xDFFF) {
if (!pair_high) {
return false;
}
ext_cp = (((pair_high - 0xD800) << 10) | (cp - 0xDC00)) + 0x10000;
hi_cp = pair_high;
lo_cp = cp;
pair_high = 0;
}

if constexpr (std::is_same_v<typename string_t::value_type, char>) {
// utf8
if (ext_cp <= 0x7F) {
result.emplace_back(static_cast<char>(ext_cp));
}
else if (ext_cp <= 0x7FF) {
result.emplace_back(static_cast<char>(((ext_cp >> 6) & 0b00011111) | 0b11000000u));
result.emplace_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
}
else if (ext_cp <= 0xFFFF) {
result.emplace_back(static_cast<char>(((ext_cp >> 12) & 0b00001111) | 0b11100000u));
result.emplace_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
result.emplace_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
}
else {
result.emplace_back(static_cast<char>(((ext_cp >> 18) & 0b00000111) | 0b11110000u));
result.emplace_back(static_cast<char>(((ext_cp >> 12) & 0b00111111) | 0b10000000u));
result.emplace_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
result.emplace_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
}
}
else if constexpr (std::is_same_v<typename string_t::value_type, wchar_t>) {
if constexpr (sizeof(wchar_t) == 4) {
result.emplace_back(static_cast<wchar_t>(ext_cp));
}
else if constexpr (sizeof(wchar_t) == 2) {
if (ext_cp <= 0xFFFF) {
result.emplace_back(static_cast<wchar_t>(ext_cp));
}
else {
result.emplace_back(static_cast<wchar_t>(hi_cp));
result.emplace_back(static_cast<wchar_t>(lo_cp));
}
}
else {
static_assert(!sizeof(typename string_t::value_type), "Unsupported wchar");
}
}
else {
static_assert(!sizeof(typename string_t::value_type), "Unsupported type");
}

return true;
}

template <typename string_t, typename parsing_t, typename accel_traits>
inline bool parser<string_t, parsing_t, accel_traits>::skip_string_literal_with_accel()
{
Expand Down

0 comments on commit ecae640

Please sign in to comment.