From b402505f5b8e5432ca5d23eae729222528e88ebf Mon Sep 17 00:00:00 2001 From: nekosu Date: Fri, 24 May 2024 15:41:34 +0800 Subject: [PATCH] feat: unicode pair (#64) --- include/parser/parser.hpp | 66 +++++++++++++++++++++++++++++++++------ test/unicode_test.cpp | 7 +++-- 2 files changed, 60 insertions(+), 13 deletions(-) diff --git a/include/parser/parser.hpp b/include/parser/parser.hpp index 9e6d9bc..5556e2d 100644 --- a/include/parser/parser.hpp +++ b/include/parser/parser.hpp @@ -397,6 +397,7 @@ inline std::optional parser::parse_ string_t result; auto no_escape_beg = _cur; + uint16_t pair_high = 0; while (_cur != _end) { if constexpr (sizeof(*_cur) == 1 && accel_traits::available) { @@ -414,6 +415,9 @@ inline std::optional parser::parse_ if (_cur == _end) { return std::nullopt; } + if (pair_high && *_cur != 'u') { + return std::nullopt; + } switch (*_cur) { case '"': result.push_back('"'); @@ -463,24 +467,60 @@ inline std::optional parser::parse_ return std::nullopt; } } + uint32_t ext_cp = cp; + uint16_t hi_cp = 0, lo_cp = 0; + if (0xD800 <= cp && cp <= 0xDBFF) { + if (pair_high) { + return std::nullopt; + } else { + pair_high = cp; + break; + } + } else if (0xDC00 <= cp && cp <= 0xDFFF) { + if (!pair_high) { + return std::nullopt; + } else { + ext_cp = (((pair_high - 0xD800) << 10) | (cp - 0xDC00)) + 0x10000; + hi_cp = pair_high; + lo_cp = cp; + pair_high = 0; + } + } if constexpr (std::is_same_v) { // utf8 - if (cp <= 0x7F) { - result.push_back(static_cast(cp)); + if (ext_cp <= 0x7F) { + result.push_back(static_cast(ext_cp)); } - else if (cp <= 0x7FF) { - result.push_back(static_cast(((cp >> 6) & 0b00011111) | 0b11000000u)); - result.push_back(static_cast((cp & 0b00111111) | 0b10000000u)); + else if (ext_cp <= 0x7FF) { + result.push_back(static_cast(((ext_cp >> 6) & 0b00011111) | 0b11000000u)); + result.push_back(static_cast((ext_cp & 0b00111111) | 0b10000000u)); } - else { + else if (ext_cp <= 0xFFFF) { + result.push_back( + static_cast(((ext_cp >> 12) & 0b00001111) | 0b11100000u)); + result.push_back(static_cast(((ext_cp >> 6) & 0b00111111) | 0b10000000u)); + result.push_back(static_cast((ext_cp & 0b00111111) | 0b10000000u)); + } else { result.push_back( - static_cast(((cp >> 12) & 0b00001111) | 0b11100000u)); - result.push_back(static_cast(((cp >> 6) & 0b00111111) | 0b10000000u)); - result.push_back(static_cast((cp & 0b00111111) | 0b10000000u)); + static_cast(((ext_cp >> 18) & 0b00000111) | 0b11110000u)); + result.push_back(static_cast(((ext_cp >> 12) & 0b00111111) | 0b10000000u)); + result.push_back(static_cast(((ext_cp >> 6) & 0b00111111) | 0b10000000u)); + result.push_back(static_cast((ext_cp & 0b00111111) | 0b10000000u)); } } else if constexpr (std::is_same_v) { - result.push_back(cp); + if constexpr (sizeof (wchar_t) == 4) { + result.push_back(static_cast(ext_cp)); + } else if constexpr (sizeof (wchar_t) == 2) { + if (ext_cp <= 0xFFFF) { + result.push_back(static_cast(ext_cp)); + } else { + result.push_back(static_cast(hi_cp)); + result.push_back(static_cast(lo_cp)); + } + } else { + static_assert(!sizeof(typename string_t::value_type), "Unsupported wchar"); + } } else { static_assert(!sizeof(typename string_t::value_type), "Unsupported type"); @@ -495,10 +535,16 @@ inline std::optional parser::parse_ break; } case '"': { + if (pair_high) { + return std::nullopt; + } result += string_t(no_escape_beg, _cur++); return result; } default: + if (pair_high) { + return std::nullopt; + } ++_cur; break; } diff --git a/test/unicode_test.cpp b/test/unicode_test.cpp index 995a48e..f4235f9 100644 --- a/test/unicode_test.cpp +++ b/test/unicode_test.cpp @@ -5,7 +5,8 @@ bool unicode_test() { - std::string test = R"({ "test": "abc\u9a8fdef" })"; + std::string test = R"({ "test": "abc\u9a8fdef\ud83d\udca9ghi" })"; + std::string target = "abc\u9a8fdef\U0001f4a9ghi"; auto obj = json::parse(test); if (!obj.has_value()) { std::cout << "parse failed" << std::endl; @@ -16,10 +17,10 @@ bool unicode_test() << static_cast(static_cast(ch)) << ' '; } std::cout << std::endl; - for (auto ch : "abc\u9a8fdef") { + for (auto ch : target) { std::cout << std::hex << std::setw(2) << static_cast(static_cast(ch)) << ' '; } std::cout << std::endl; - return obj.value().at("test").as_string() == "abc\u9a8fdef"; + return obj.value().at("test").as_string() == target; }