diff --git a/include/parser/parser.hpp b/include/parser/parser.hpp
index 5556e2d..c120a08 100644
--- a/include/parser/parser.hpp
+++ b/include/parser/parser.hpp
@@ -57,6 +57,7 @@ class parser
     bool skip_string_literal_with_accel();
     bool skip_whitespace() noexcept;
     bool skip_digit();
+    bool skip_unicode_escape(uint16_t& pair_high, string_t& result);
 
 private:
     parsing_iter_t _cur;
@@ -420,113 +421,34 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
             }
             switch (*_cur) {
             case '"':
                 result.push_back('"');
                 break;
             case '\\':
                 result.push_back('\\');
                 break;
             case '/':
                 result.push_back('/');
                 break;
             case 'b':
                 result.push_back('\b');
                 break;
             case 'f':
                 result.push_back('\f');
                 break;
             case 'n':
                 result.push_back('\n');
                 break;
             case 'r':
                 result.push_back('\r');
                 break;
             case 't':
                 result.push_back('\t');
                 break;
-            case 'u': {
-                uint16_t cp = 0;
-                for (int i = 0; i < 4; i++) {
-                    ++_cur;
-                    if (_cur == _end) {
-                        return std::nullopt;
-                    }
-                    if (!std::isxdigit(static_cast<unsigned char>(*_cur))) {
-                        return std::nullopt;
-                    }
-                    cp <<= 4;
-                    if ('0' <= *_cur && *_cur <= '9') {
-                        cp |= *_cur - '0';
-                    }
-                    else if ('a' <= *_cur && *_cur <= 'f') {
-                        cp |= *_cur - 'a' + 10;
-                    }
-                    else if ('A' <= *_cur && *_cur <= 'F') {
-                        cp |= *_cur - 'A' + 10;
-                    }
-                    else {
-                        return std::nullopt;
-                    }
-                }
-                uint32_t ext_cp = cp;
-                uint16_t hi_cp = 0, lo_cp = 0;
-                if (0xD800 <= cp && cp <= 0xDBFF) {
-                    if (pair_high) {
-                        return std::nullopt;
-                    } else {
-                        pair_high = cp;
-                        break;
-                    }
-                } else if (0xDC00 <= cp && cp <= 0xDFFF) {
-                    if (!pair_high) {
-                        return std::nullopt;
-                    } else {
-                        ext_cp = (((pair_high - 0xD800) << 10) | (cp - 0xDC00)) + 0x10000;
-                        hi_cp = pair_high;
-                        lo_cp = cp;
-                        pair_high = 0;
-                    }
-                }
-                if constexpr (std::is_same_v<typename string_t::value_type, char>) {
-                    // utf8
-                    if (ext_cp <= 0x7F) {
-                        result.push_back(static_cast<char>(ext_cp));
-                    }
-                    else if (ext_cp <= 0x7FF) {
-                        result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00011111) | 0b11000000u));
-                        result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
-                    }
-                    else if (ext_cp <= 0xFFFF) {
-                        result.push_back(
-                            static_cast<char>(((ext_cp >> 12) & 0b00001111) | 0b11100000u));
-                        result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
-                        result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
-                    } else {
-                        result.push_back(
-                            static_cast<char>(((ext_cp >> 18) & 0b00000111) | 0b11110000u));
-                        result.push_back(static_cast<char>(((ext_cp >> 12) & 0b00111111) | 0b10000000u));
-                        result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
-                        result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
-                    }
-                }
-                else if constexpr (std::is_same_v<typename string_t::value_type, wchar_t>) {
-                    if constexpr (sizeof (wchar_t) == 4) {
-                        result.push_back(static_cast<wchar_t>(ext_cp));
-                    } else if constexpr (sizeof (wchar_t) == 2) {
-                        if (ext_cp <= 0xFFFF) {
-                            result.push_back(static_cast<wchar_t>(ext_cp));
-                        } else {
-                            result.push_back(static_cast<wchar_t>(hi_cp));
-                            result.push_back(static_cast<wchar_t>(lo_cp));
-                        }
-                    } else {
-                        static_assert(!sizeof(typename string_t::value_type), "Unsupported wchar");
-                    }
-                }
-                else {
-                    static_assert(!sizeof(typename string_t::value_type), "Unsupported type");
+            case 'u':
+                if (!skip_unicode_escape(pair_high, result)) {
+                    return std::nullopt;
                 }
                 break;
-            }
             default:
                 // Illegal backslash escape
                 return std::nullopt;
@@ -552,6 +474,116 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
     return std::nullopt;
 }
 
+// Decodes the four hex digits of a "\uXXXX" escape and appends the decoded
+// character to `result`, encoded as UTF-8 for char strings or as wide
+// characters for wchar_t strings.
+//
+// On entry `_cur` points at the 'u'; on success it points at the last hex digit.
+// `pair_high` carries a pending UTF-16 high surrogate between calls: a high
+// surrogate is stored there without appending anything, and the following low
+// surrogate is combined with it into a single code point.
+//
+// Returns false if the input ends early, a digit is not hexadecimal, a high
+// surrogate arrives while another is pending, or a low surrogate arrives with
+// none pending.
+template <typename string_t, typename parsing_t, typename accel_traits>
+inline bool parser<string_t, parsing_t, accel_traits>::skip_unicode_escape(uint16_t& pair_high, string_t& result)
+{
+    uint16_t cp = 0;
+    for (int i = 0; i < 4; ++i) {
+        if (++_cur == _end) {
+            return false;
+        }
+
+        if (!std::isxdigit(static_cast<unsigned char>(*_cur))) {
+            return false;
+        }
+
+        cp <<= 4;
+
+        if ('0' <= *_cur && *_cur <= '9') {
+            cp |= *_cur - '0';
+        }
+        else if ('a' <= *_cur && *_cur <= 'f') {
+            cp |= *_cur - 'a' + 10;
+        }
+        else if ('A' <= *_cur && *_cur <= 'F') {
+            cp |= *_cur - 'A' + 10;
+        }
+        else {
+            return false;
+        }
+    }
+
+    uint32_t ext_cp = cp;
+    uint16_t hi_cp = 0, lo_cp = 0;
+
+    // High surrogate: remember it and wait for the low half before emitting.
+    if (0xD800 <= cp && cp <= 0xDBFF) {
+        if (pair_high) {
+            return false;
+        }
+        pair_high = cp;
+        return true;
+    }
+
+    // Low surrogate: combine it with the pending high half into one code point.
+    if (0xDC00 <= cp && cp <= 0xDFFF) {
+        if (!pair_high) {
+            return false;
+        }
+        ext_cp = (((pair_high - 0xD800) << 10) | (cp - 0xDC00)) + 0x10000;
+        hi_cp = pair_high;
+        lo_cp = cp;
+        pair_high = 0;
+    }
+
+    if constexpr (std::is_same_v<typename string_t::value_type, char>) {
+        // utf8
+        if (ext_cp <= 0x7F) {
+            result.push_back(static_cast<char>(ext_cp));
+        }
+        else if (ext_cp <= 0x7FF) {
+            result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00011111) | 0b11000000u));
+            result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
+        }
+        else if (ext_cp <= 0xFFFF) {
+            result.push_back(static_cast<char>(((ext_cp >> 12) & 0b00001111) | 0b11100000u));
+            result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
+            result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
+        }
+        else {
+            result.push_back(static_cast<char>(((ext_cp >> 18) & 0b00000111) | 0b11110000u));
+            result.push_back(static_cast<char>(((ext_cp >> 12) & 0b00111111) | 0b10000000u));
+            result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
+            result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
+        }
+    }
+    else if constexpr (std::is_same_v<typename string_t::value_type, wchar_t>) {
+        if constexpr (sizeof(wchar_t) == 4) {
+            result.push_back(static_cast<wchar_t>(ext_cp));
+        }
+        else if constexpr (sizeof(wchar_t) == 2) {
+            // 16-bit wchar_t: code points above 0xFFFF are kept as the original surrogate pair.
+            if (ext_cp <= 0xFFFF) {
+                result.push_back(static_cast<wchar_t>(ext_cp));
+            }
+            else {
+                result.push_back(static_cast<wchar_t>(hi_cp));
+                result.push_back(static_cast<wchar_t>(lo_cp));
+            }
+        }
+        else {
+            static_assert(!sizeof(typename string_t::value_type), "Unsupported wchar");
+        }
+    }
+    else {
+        static_assert(!sizeof(typename string_t::value_type), "Unsupported type");
+    }
+
+    return true;
+}
+
 template <typename string_t, typename parsing_t, typename accel_traits>
 inline bool parser<string_t, parsing_t, accel_traits>::skip_string_literal_with_accel()
 {