From 676cb37d0ee8a392984e3a0ad399380f44d804de Mon Sep 17 00:00:00 2001 From: MistEO Date: Fri, 24 May 2024 15:50:27 +0800 Subject: [PATCH] =?UTF-8?q?perf:=20=E8=A7=A3=E8=80=A6unicode=5Fescape?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/parser/parser.hpp | 183 +++++++++++++++++++++----------------- 1 file changed, 101 insertions(+), 82 deletions(-) diff --git a/include/parser/parser.hpp b/include/parser/parser.hpp index 5556e2d..e6eac2e 100644 --- a/include/parser/parser.hpp +++ b/include/parser/parser.hpp @@ -57,6 +57,7 @@ class parser bool skip_string_literal_with_accel(); bool skip_whitespace() noexcept; bool skip_digit(); + bool skip_unicode_escape(uint16_t& pair_high, string_t& result); private: parsing_iter_t _cur; @@ -443,90 +444,11 @@ inline std::optional parser::parse_ case 't': result.push_back('\t'); break; - case 'u': { - uint16_t cp = 0; - for (int i = 0; i < 4; i++) { - ++_cur; - if (_cur == _end) { - return std::nullopt; - } - if (!std::isxdigit(static_cast(*_cur))) { - return std::nullopt; - } - cp <<= 4; - if ('0' <= *_cur && *_cur <= '9') { - cp |= *_cur - '0'; - } - else if ('a' <= *_cur && *_cur <= 'f') { - cp |= *_cur - 'a' + 10; - } - else if ('A' <= *_cur && *_cur <= 'F') { - cp |= *_cur - 'A' + 10; - } - else { - return std::nullopt; - } - } - uint32_t ext_cp = cp; - uint16_t hi_cp = 0, lo_cp = 0; - if (0xD800 <= cp && cp <= 0xDBFF) { - if (pair_high) { - return std::nullopt; - } else { - pair_high = cp; - break; - } - } else if (0xDC00 <= cp && cp <= 0xDFFF) { - if (!pair_high) { - return std::nullopt; - } else { - ext_cp = (((pair_high - 0xD800) << 10) | (cp - 0xDC00)) + 0x10000; - hi_cp = pair_high; - lo_cp = cp; - pair_high = 0; - } - } - if constexpr (std::is_same_v) { - // utf8 - if (ext_cp <= 0x7F) { - result.push_back(static_cast(ext_cp)); - } - else if (ext_cp <= 0x7FF) { - result.push_back(static_cast(((ext_cp >> 6) & 0b00011111) | 0b11000000u)); - result.push_back(static_cast((ext_cp & 0b00111111) | 0b10000000u)); - } - else if (ext_cp <= 0xFFFF) { - result.push_back( - static_cast(((ext_cp >> 12) & 0b00001111) | 0b11100000u)); - result.push_back(static_cast(((ext_cp >> 6) & 0b00111111) | 0b10000000u)); - result.push_back(static_cast((ext_cp & 0b00111111) | 0b10000000u)); - } else { - result.push_back( - static_cast(((ext_cp >> 18) & 0b00000111) | 0b11110000u)); - result.push_back(static_cast(((ext_cp >> 12) & 0b00111111) | 0b10000000u)); - result.push_back(static_cast(((ext_cp >> 6) & 0b00111111) | 0b10000000u)); - result.push_back(static_cast((ext_cp & 0b00111111) | 0b10000000u)); - } - } - else if constexpr (std::is_same_v) { - if constexpr (sizeof (wchar_t) == 4) { - result.push_back(static_cast(ext_cp)); - } else if constexpr (sizeof (wchar_t) == 2) { - if (ext_cp <= 0xFFFF) { - result.push_back(static_cast(ext_cp)); - } else { - result.push_back(static_cast(hi_cp)); - result.push_back(static_cast(lo_cp)); - } - } else { - static_assert(!sizeof(typename string_t::value_type), "Unsupported wchar"); - } - } - else { - static_assert(!sizeof(typename string_t::value_type), "Unsupported type"); + case 'u': + if (!skip_unicode_escape(pair_high, result)) { + return std::nullopt; } break; - } default: // Illegal backslash escape return std::nullopt; @@ -552,6 +474,103 @@ inline std::optional parser::parse_ return std::nullopt; } +template +inline bool parser::skip_unicode_escape( + uint16_t& pair_high, + string_t& result) +{ + uint16_t cp = 0; + for (int i = 0; i < 4; ++i) { + if (++_cur == _end) { + return false; + } + + if (!std::isxdigit(static_cast(*_cur))) { + return false; + } + + cp <<= 4; + + if ('0' <= *_cur && *_cur <= '9') { + cp |= *_cur - '0'; + } + else if ('a' <= *_cur && *_cur <= 'f') { + cp |= *_cur - 'a' + 10; + } + else if ('A' <= *_cur && *_cur <= 'F') { + cp |= *_cur - 'A' + 10; + } + else { + return false; + } + } + + uint32_t ext_cp = cp; + uint16_t hi_cp = 0, lo_cp = 0; + + if (0xD800 <= cp && cp <= 0xDBFF) { + if (pair_high) { + return false; + } + pair_high = cp; + return true; + } + + if (0xDC00 <= cp && cp <= 0xDFFF) { + if (!pair_high) { + return false; + } + ext_cp = (((pair_high - 0xD800) << 10) | (cp - 0xDC00)) + 0x10000; + hi_cp = pair_high; + lo_cp = cp; + pair_high = 0; + } + + if constexpr (std::is_same_v) { + // utf8 + if (ext_cp <= 0x7F) { + result.push_back(static_cast(ext_cp)); + } + else if (ext_cp <= 0x7FF) { + result.push_back(static_cast(((ext_cp >> 6) & 0b00011111) | 0b11000000u)); + result.push_back(static_cast((ext_cp & 0b00111111) | 0b10000000u)); + } + else if (ext_cp <= 0xFFFF) { + result.push_back(static_cast(((ext_cp >> 12) & 0b00001111) | 0b11100000u)); + result.push_back(static_cast(((ext_cp >> 6) & 0b00111111) | 0b10000000u)); + result.push_back(static_cast((ext_cp & 0b00111111) | 0b10000000u)); + } + else { + result.push_back(static_cast(((ext_cp >> 18) & 0b00000111) | 0b11110000u)); + result.push_back(static_cast(((ext_cp >> 12) & 0b00111111) | 0b10000000u)); + result.push_back(static_cast(((ext_cp >> 6) & 0b00111111) | 0b10000000u)); + result.push_back(static_cast((ext_cp & 0b00111111) | 0b10000000u)); + } + } + else if constexpr (std::is_same_v) { + if constexpr (sizeof(wchar_t) == 4) { + result.push_back(static_cast(ext_cp)); + } + else if constexpr (sizeof(wchar_t) == 2) { + if (ext_cp <= 0xFFFF) { + result.push_back(static_cast(ext_cp)); + } + else { + result.push_back(static_cast(hi_cp)); + result.push_back(static_cast(lo_cp)); + } + } + else { + static_assert(!sizeof(typename string_t::value_type), "Unsupported wchar"); + } + } + else { + static_assert(!sizeof(typename string_t::value_type), "Unsupported type"); + } + + return true; +} + template inline bool parser::skip_string_literal_with_accel() {