Skip to content

Commit

Permalink
AK: Decode paired UTF-16 surrogates in a JSON string
Browse files Browse the repository at this point in the history
For example, such use is seen on Twitter.

(cherry picked from commit 698a95d2dee0ba7e9a3f1c39af5459ba506c445f)
  • Loading branch information
trflynn89 authored and nico committed Jul 7, 2024
1 parent c56a965 commit 8adcbd7
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 13 deletions.
24 changes: 11 additions & 13 deletions AK/JsonParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,20 +105,18 @@ ErrorOr<ByteString> JsonParser::consume_and_unescape_string()
case 'u': {
ignore(); // 'u'

if (tell_remaining() < 4)
return Error::from_string_literal("JsonParser: EOF while parsing Unicode escape");
auto escaped_string = consume(4);
auto code_point = AK::StringUtils::convert_to_uint_from_hex(escaped_string);
if (!code_point.has_value()) {
dbgln("JsonParser: Error while parsing Unicode escape {}", escaped_string);
// https://ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf
//
// To escape a code point that is not in the Basic Multilingual Plane, the character may be represented as a
// twelve-character sequence, encoding the UTF-16 surrogate pair corresponding to the code point. So for
// example, a string containing only the G clef character (U+1D11E) may be represented as "\uD834\uDD1E".
// However, whether a processor of JSON texts interprets such a surrogate pair as a single code point or as an
// explicit surrogate pair is a semantic decision that is determined by the specific processor.
auto code_point = decode_single_or_paired_surrogate();

if (code_point.is_error())
return Error::from_string_literal("JsonParser: Error while parsing Unicode escape");
}
// Note/FIXME: "To escape a code point that is not in the Basic Multilingual Plane, the character may be represented as a
// twelve-character sequence, encoding the UTF-16 surrogate pair corresponding to the code point. So for
// example, a string containing only the G clef character (U+1D11E) may be represented as "\uD834\uDD1E".
// However, whether a processor of JSON texts interprets such a surrogate pair as a single code point or as an
// explicit surrogate pair is a semantic decision that is determined by the specific processor."
// ~ECMA-404, 2nd Edition Dec. 2017, page 5

final_sb.append_code_point(code_point.value());
break;
}
Expand Down
22 changes: 22 additions & 0 deletions Tests/AK/TestJSON.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,28 @@ TEST_CASE(json_utf8_character)
EXPECT_EQ(json.as_string() == "A", true);
}

// Checks how \uXXXX escapes in the UTF-16 surrogate range are turned into the
// bytes of the resulting JSON string value: once as a proper high+low pair, and
// once each for a lone high and a lone low surrogate.
TEST_CASE(json_encoded_surrogates)
{
{
// A high surrogate (D83E) immediately followed by a low surrogate (DD13) is
// expected to combine into a single code point (U+1F913), whose UTF-8 form
// is 4 bytes long.
auto json = JsonValue::from_string("\"\\uD83E\\uDD13\""sv).value();
EXPECT_EQ(json.type(), JsonValue::Type::String);
EXPECT_EQ(json.as_string().length(), 4u);
EXPECT_EQ(json.as_string(), "🤓"sv);
}
{
// A lone high surrogate is not rejected: parsing is expected to succeed and
// yield the 3-byte sequence ED A0 BE, i.e. U+D83E encoded on its own.
auto json = JsonValue::from_string("\"\\uD83E\""sv).value();
EXPECT_EQ(json.type(), JsonValue::Type::String);
EXPECT_EQ(json.as_string().length(), 3u);
EXPECT_EQ(json.as_string(), "\xED\xA0\xBE"sv);
}
{
// Likewise, a lone low surrogate is expected to parse successfully and yield
// the 3-byte sequence ED B4 93, i.e. U+DD13 encoded on its own.
auto json = JsonValue::from_string("\"\\uDD13\""sv).value();
EXPECT_EQ(json.type(), JsonValue::Type::String);
EXPECT_EQ(json.as_string().length(), 3u);
EXPECT_EQ(json.as_string(), "\xED\xB4\x93"sv);
}
}

/*
FIXME: Parse JSON from a Utf8View
Expand Down

0 comments on commit 8adcbd7

Please sign in to comment.