Skip to content

Commit

Permalink
feat: unicode pair
Browse files Browse the repository at this point in the history
  • Loading branch information
neko-para committed May 17, 2024
1 parent 34b7dd2 commit baab5dc
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 13 deletions.
60 changes: 50 additions & 10 deletions include/parser/parser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,7 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_

string_t result;
auto no_escape_beg = _cur;
uint16_t pair_high = 0;

while (_cur != _end) {
if constexpr (sizeof(*_cur) == 1 && accel_traits::available) {
Expand All @@ -414,6 +415,9 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
if (_cur == _end) {
return std::nullopt;
}
if (pair_high && *_cur != 'u') {
return std::nullopt;
}
switch (*_cur) {
case '"':
result.push_back('"');
Expand Down Expand Up @@ -463,24 +467,54 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
return std::nullopt;
}
}
uint32_t ext_cp = cp;
uint16_t hi_cp = 0, lo_cp = 0;
if (0xD800 <= cp && cp <= 0xDBFF) {
if (pair_high) {
return std::nullopt;
} else {
pair_high = cp;
break;
}
} else if (0xDC00 <= cp && cp <= 0xDFFF) {
if (!pair_high) {
return std::nullopt;
} else {
ext_cp = (((pair_high - 0xD800) << 10) | (cp - 0xDC00)) + 0x10000;
hi_cp = pair_high;
lo_cp = cp;
pair_high = 0;
}
}
if constexpr (std::is_same_v<typename string_t::value_type, char>) {
// utf8
if (cp <= 0x7F) {
result.push_back(static_cast<char>(cp));
if (ext_cp <= 0x7F) {
result.push_back(static_cast<char>(ext_cp));
}
else if (cp <= 0x7FF) {
result.push_back(static_cast<char>(((cp >> 6) & 0b00011111) | 0b11000000u));
result.push_back(static_cast<char>((cp & 0b00111111) | 0b10000000u));
else if (ext_cp <= 0x7FF) {
result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00011111) | 0b11000000u));
result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
}
else {
else if (ext_cp <= 0xFFFF) {
result.push_back(
static_cast<char>(((ext_cp >> 12) & 0b00001111) | 0b11100000u));
result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
} else {
result.push_back(
static_cast<char>(((cp >> 12) & 0b00001111) | 0b11100000u));
result.push_back(static_cast<char>(((cp >> 6) & 0b00111111) | 0b10000000u));
result.push_back(static_cast<char>((cp & 0b00111111) | 0b10000000u));
static_cast<char>(((ext_cp >> 18) & 0b00000111) | 0b11110000u));
result.push_back(static_cast<char>(((ext_cp >> 12) & 0b00111111) | 0b10000000u));
result.push_back(static_cast<char>(((ext_cp >> 6) & 0b00111111) | 0b10000000u));
result.push_back(static_cast<char>((ext_cp & 0b00111111) | 0b10000000u));
}
}
else if constexpr (std::is_same_v<typename string_t::value_type, wchar_t>) {
result.push_back(cp);
if (ext_cp <= 0xFFFF) {
result.push_back(static_cast<uint16_t>(ext_cp));
} else {
result.push_back(hi_cp);
result.push_back(lo_cp);
}
}
else {
static_assert(!sizeof(typename string_t::value_type), "Unsupported type");
Expand All @@ -495,10 +529,16 @@ inline std::optional<string_t> parser<string_t, parsing_t, accel_traits>::parse_
break;
}
case '"': {
if (pair_high) {
return std::nullopt;
}
result += string_t(no_escape_beg, _cur++);
return result;
}
default:
if (pair_high) {
return std::nullopt;
}
++_cur;
break;
}
Expand Down
7 changes: 4 additions & 3 deletions test/unicode_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@

bool unicode_test()
{
std::string test = R"({ "test": "abc\u9a8fdef" })";
std::string test = R"({ "test": "abc\u9a8fdef\ud83d\udca9ghi" })";
std::string target = "abc\u9a8fdef\U0001f4a9ghi";
auto obj = json::parse(test);
if (!obj.has_value()) {
std::cout << "parse failed" << std::endl;
Expand All @@ -16,10 +17,10 @@ bool unicode_test()
<< static_cast<unsigned>(static_cast<unsigned char>(ch)) << ' ';
}
std::cout << std::endl;
for (auto ch : "abc\u9a8fdef") {
for (auto ch : target) {
std::cout << std::hex << std::setw(2)
<< static_cast<unsigned>(static_cast<unsigned char>(ch)) << ' ';
}
std::cout << std::endl;
return obj.value().at("test").as_string() == "abc\u9a8fdef";
return obj.value().at("test").as_string() == target;
}

0 comments on commit baab5dc

Please sign in to comment.