Skip to content

Commit 35ea705

Browse files
author
Robert Fancsik
committed
Implement String.prototype.normalize
JerryScript-DCO-1.0-Signed-off-by: Robert Fancsik [email protected]
1 parent 42523bd commit 35ea705

14 files changed

+286
-40
lines changed

.github/workflows/gh-actions.yml

+8
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ jobs:
8585
Conformance_Tests_ES2015:
8686
runs-on: ubuntu-latest
8787
steps:
88+
- run: sudo apt update
89+
- run: sudo apt install libicu-dev
8890
- uses: actions/checkout@v2
8991
- run: $RUNNER --test262-es2015=update
9092
- run: $RUNNER --test262-es2015=update --build-debug
@@ -99,6 +101,8 @@ jobs:
99101
Conformance_Tests_ESNext:
100102
runs-on: ubuntu-latest
101103
steps:
104+
- run: sudo apt update
105+
- run: sudo apt install libicu-dev
102106
- uses: actions/checkout@v2
103107
- run: $RUNNER --test262-esnext=update
104108
- uses: actions/upload-artifact@v2
@@ -111,6 +115,8 @@ jobs:
111115
Conformance_Tests_ESNext_Debug_A:
112116
runs-on: ubuntu-latest
113117
steps:
118+
- run: sudo apt update
119+
- run: sudo apt install libicu-dev
114120
- uses: actions/checkout@v2
115121
- run: $RUNNER --test262-esnext=update --build-debug --test262-test-list=built-ins,annexB,harness,intl402
116122
- uses: actions/upload-artifact@v2
@@ -123,6 +129,8 @@ jobs:
123129
Conformance_Tests_ESNext_Debug_B:
124130
runs-on: ubuntu-latest
125131
steps:
132+
- run: sudo apt update
133+
- run: sudo apt install libicu-dev
126134
- uses: actions/checkout@v2
127135
- run: $RUNNER --test262-esnext=update --build-debug --test262-test-list=language
128136
- uses: actions/upload-artifact@v2

jerry-core/CMakeLists.txt

+22
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ set(JERRY_ERROR_MESSAGES OFF CACHE BOOL "Enable error mess
2525
set(JERRY_EXTERNAL_CONTEXT OFF CACHE BOOL "Enable external context?")
2626
set(JERRY_PARSER ON CACHE BOOL "Enable javascript-parser?")
2727
set(JERRY_FUNCTION_TO_STRING OFF CACHE BOOL "Enable function toString operation?")
28+
set(JERRY_ICU OFF CACHE BOOL "Enable ICU support?")
2829
set(JERRY_LINE_INFO OFF CACHE BOOL "Enable line info?")
2930
set(JERRY_LOGGING OFF CACHE BOOL "Enable logging?")
3031
set(JERRY_MEM_STATS OFF CACHE BOOL "Enable memory statistics?")
@@ -78,13 +79,24 @@ if(JERRY_MEM_STATS OR JERRY_PARSER_DUMP_BYTE_CODE OR JERRY_REGEXP_DUMP_BYTE_CODE
7879
set(JERRYRE_LOGGING_MESSAGE " (FORCED BY STATS OR DUMP)")
7980
endif()
8081

82+
# ICU
# Look up ICU's "uc" component only when ICU support was requested.
# The lookup is intentionally not REQUIRED: a REQUIRED find_package() stops the
# configure step on failure, which would make the fallback below unreachable.
# When the library is missing, ICU support is switched off and the reason is
# recorded so the status output can report it.
if(JERRY_ICU)
  find_package(ICU COMPONENTS uc)

  if(NOT ICU_FOUND)
    set(JERRY_ICU OFF)
    set(JERRY_ICU_MESSAGE " (FORCED BY MISSING LIBRARY)")
  endif()
endif()
91+
8192
# Status messages
8293
message(STATUS "JERRY_CPOINTER_32_BIT " ${JERRY_CPOINTER_32_BIT} ${JERRY_CPOINTER_32_BIT_MESSAGE})
8394
message(STATUS "JERRY_DEBUGGER " ${JERRY_DEBUGGER})
8495
message(STATUS "JERRY_ERROR_MESSAGES " ${JERRY_ERROR_MESSAGES})
8596
message(STATUS "JERRY_EXTERNAL_CONTEXT " ${JERRY_EXTERNAL_CONTEXT})
8697
message(STATUS "JERRY_PARSER " ${JERRY_PARSER})
8798
message(STATUS "JERRY_FUNCTION_TO_STRING " ${JERRY_FUNCTION_TO_STRING})
99+
message(STATUS "JERRY_ICU " ${JERRY_ICU} ${JERRY_ICU_MESSAGE})
88100
message(STATUS "JERRY_LINE_INFO " ${JERRY_LINE_INFO})
89101
message(STATUS "JERRY_LOGGING " ${JERRY_LOGGING} ${JERRY_LOGGING_MESSAGE})
90102
message(STATUS "JERRY_MEM_STATS " ${JERRY_MEM_STATS})
@@ -641,6 +653,12 @@ if(JERRY_VALGRIND)
641653
set(INCLUDE_CORE_PRIVATE ${INCLUDE_CORE_PRIVATE} ${INCLUDE_THIRD_PARTY_VALGRIND})
642654
endif()
643655

656+
# ICU
# Export JERRY_ICU to the sources through jerry_add_define01.
jerry_add_define01(JERRY_ICU)

# With ICU enabled, add the ICU header directories to the private include list
# of the core so the ICU headers can be included.
if(JERRY_ICU)
  set(INCLUDE_CORE_PRIVATE ${INCLUDE_CORE_PRIVATE} ${ICU_INCLUDE_DIRS})
endif()
661+
644662
# Enable VM execution stop callback
645663
jerry_add_define01(JERRY_VM_HALT)
646664

@@ -766,6 +784,10 @@ else()
766784
endif()
767785
endif()
768786

787+
# Link the core library against the ICU libraries when ICU support is enabled.
if(JERRY_ICU)
  target_link_libraries(${JERRY_CORE_NAME} ${ICU_LIBRARIES})
endif()
790+
769791
separate_arguments(EXTERNAL_LINK_LIBS)
770792
foreach(EXT_LIB ${EXTERNAL_LINK_LIBS})
771793
target_link_libraries(${JERRY_CORE_NAME} ${EXT_LIB})

jerry-core/ecma/base/ecma-error-messages.inc.h

+6-5
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,9 @@ ECMA_ERROR_DEF (ECMA_ERR_INVALID_REGEXP_FLAGS, "Invalid RegExp flags")
7070
#if JERRY_BUILTIN_JSON
7171
ECMA_ERROR_DEF (ECMA_ERR_JSON_STRINGIFY_ERROR, "JSON stringify error")
7272
#endif /* JERRY_BUILTIN_JSON */
73+
#if JERRY_BUILTIN_STRING && JERRY_ESNEXT
74+
ECMA_ERROR_DEF (ECMA_ERR_NORMALIZATION_FAILED, "Normalization failed")
75+
#endif /* JERRY_BUILTIN_STRING && JERRY_ESNEXT */
7376
#if JERRY_BUILTIN_REGEXP
7477
ECMA_ERROR_DEF (ECMA_ERR_STACK_LIMIT_EXCEEDED, "Stack limit exceeded")
7578
#endif /* JERRY_BUILTIN_REGEXP */
@@ -203,6 +206,9 @@ ECMA_ERROR_DEF (ECMA_ERR_EXPECTED_A_FUNCTION_OBJECT, "Expected a function object
203206
#if JERRY_BUILTIN_TYPEDARRAY
204207
ECMA_ERROR_DEF (ECMA_ERR_INVALID_ARRAYBUFFER_LENGTH, "Invalid ArrayBuffer length")
205208
#endif /* JERRY_BUILTIN_TYPEDARRAY */
209+
#if JERRY_BUILTIN_STRING && JERRY_ESNEXT
210+
ECMA_ERROR_DEF (ECMA_ERR_INVALID_NORMALIZATION_FORM, "Invalid normalization form")
211+
#endif /* JERRY_BUILTIN_STRING && JERRY_ESNEXT */
206212
#if !(JERRY_MODULE_SYSTEM)
207213
ECMA_ERROR_DEF (ECMA_ERR_MODULE_NOT_SUPPORTED, "Module support is disabled")
208214
#endif /* !(JERRY_MODULE_SYSTEM) */
@@ -547,11 +553,6 @@ ECMA_ERROR_DEF (ECMA_ERR_CONSTRUCTOR_UINT32_ARRAY_REQUIRES_NEW, "Constructor Uin
547553
#if JERRY_ESNEXT
548554
ECMA_ERROR_DEF (ECMA_ERR_GENERATOR_IS_CURRENTLY_UNDER_EXECUTION, "Generator is currently under execution")
549555
ECMA_ERROR_DEF (ECMA_ERR_ITERATOR_RETURN_RESULT_IS_NOT_OBJECT, "Iterator 'return' result is not object")
550-
#endif /* JERRY_ESNEXT */
551-
#if JERRY_BUILTIN_TYPEDARRAY
552-
ECMA_ERROR_DEF (ECMA_ERR_RETURNED_ARRAYBUFFER_HAS_BEEN_DETACHED, "Returned ArrayBuffer has been detached")
553-
#endif /* JERRY_BUILTIN_TYPEDARRAY */
554-
#if JERRY_ESNEXT
555556
ECMA_ERROR_DEF (ECMA_ERR_SEARCH_STRING_CANNOT_BE_OF_TYPE_REGEXP, "Search string can't be of type: RegExp")
556557
ECMA_ERROR_DEF (ECMA_ERR_VALUE_RECEIVED_BY_YIELD_IS_NOT_OBJECT, "Value received by yield* is not object")
557558
#endif /* JERRY_ESNEXT */

jerry-core/ecma/base/ecma-error-messages.ini

+2-1
Original file line numberDiff line numberDiff line change
@@ -213,7 +213,6 @@ ECMA_ERR_RESOLVE_MUST_BE_UNDEFINED = "Resolve must be undefined"
213213
ECMA_ERR_RESULT_OF_DEFAULTVALUE_IS_INVALID = "Result of [[DefaultValue]] is invalid"
214214
ECMA_ERR_RETURN_VALUE_IS_NOT_AN_ARRAYBUFFER_OBJECT = "Return value is not an ArrayBuffer object"
215215
ECMA_ERR_RETURN_VALUE_OF_EXEC_MUST_BE_AN_OBJECT_OR_NULL = "Return value of 'exec' must be an object or null"
216-
ECMA_ERR_RETURNED_ARRAYBUFFER_HAS_BEEN_DETACHED = "Returned ArrayBuffer has been detached"
217216
ECMA_ERR_RIGHT_VALUE_OF_IN_MUST_BE_AN_OBJECT = "Right value of 'in' must be an object"
218217
ECMA_ERR_RIGHT_VALUE_OF_INSTANCEOF_MUST_BE_AN_OBJECT = "Right value of 'instanceof' must be an object"
219218
ECMA_ERR_SEARCH_STRING_CANNOT_BE_OF_TYPE_REGEXP = "Search string can't be of type: RegExp"
@@ -333,3 +332,5 @@ ECMA_ERR_PRIVATE_METHOD_IS_NOT_WRITABLE = "Private method is not writable"
333332
ECMA_ERR_PRIVATE_FIELD_WAS_DEFINED_WITHOUT_A_SETTER = "Private field was defined without a setter"
334333
ECMA_ERR_CANNOT_READ_PRIVATE_MEMBER_TO_AN_OBJECT_WHOSE_CLASS_DID_NOT_DECLARE_IT = "Cannot read private member to an object whose class did not declare it"
335334
ECMA_ERR_PRIVATE_FIELD_WAS_DEFINED_WITHOUT_A_GETTER = "Private field was defined without a getter"
335+
ECMA_ERR_INVALID_NORMALIZATION_FORM = "Invalid normalization form"
336+
ECMA_ERR_NORMALIZATION_FAILED = "Normalization failed"

jerry-core/ecma/base/ecma-helpers-string.c

+52
Original file line numberDiff line numberDiff line change
@@ -2805,6 +2805,58 @@ ecma_op_advance_string_index (ecma_string_t *str_p, /**< input string */
28052805
} /* ecma_op_advance_string_index */
28062806
#endif /* JERRY_ESNEXT */
28072807

2808+
#if JERRY_ICU
2809+
/**
2810+
* Copy the string's data into a newly allocated UTF16 encoded buffer
2811+
*
2812+
* @return pointer to the allocated buffer
2813+
*/
2814+
uint16_t *
2815+
ecma_string_cesu8_to_utf16 (ecma_string_t *str_p, /**< input string */
2816+
lit_utf8_size_t *utf16_length_p) /**< [out] utf16 buffer size */
2817+
{
2818+
lit_utf8_size_t utf8_size;
2819+
lit_utf8_size_t utf8_length;
2820+
uint8_t flags = ECMA_STRING_FLAG_EMPTY;
2821+
const lit_utf8_byte_t *utf8_buffer_p = ecma_string_get_chars (str_p, &utf8_size, &utf8_length, NULL, &flags);
2822+
const lit_utf8_byte_t *utf8_buffer_end_p = utf8_buffer_p + utf8_size;
2823+
2824+
*utf16_length_p = utf8_length;
2825+
uint16_t *utf16_buff_p = (uint16_t *) jmem_heap_alloc_block (*utf16_length_p * sizeof (uint16_t));
2826+
uint16_t *utf16_buff_iter_p = utf16_buff_p;
2827+
2828+
while (utf8_buffer_p < utf8_buffer_end_p)
2829+
{
2830+
*utf16_buff_iter_p++ = (uint16_t) lit_cesu8_read_next (&utf8_buffer_p);
2831+
}
2832+
2833+
if (flags & ECMA_STRING_FLAG_MUST_BE_FREED)
2834+
{
2835+
jmem_heap_free_block ((void *) utf8_buffer_p, utf8_size);
2836+
}
2837+
2838+
return utf16_buff_p;
2839+
} /* ecma_string_cesu8_to_utf16 */
2840+
2841+
/**
2842+
* Allocate a new string from UTF16 encoded buffer
2843+
*
2844+
* @return pointer to the allocated string
2845+
*/
2846+
ecma_string_t *
2847+
ecma_new_ecma_string_from_utf16 (uint16_t *utf16_buff_p, lit_utf8_size_t utf16_length)
2848+
{
2849+
ecma_stringbuilder_t builder = ecma_stringbuilder_create ();
2850+
2851+
while (utf16_length--)
2852+
{
2853+
ecma_stringbuilder_append_codepoint (&builder, *utf16_buff_p++);
2854+
}
2855+
2856+
return ecma_stringbuilder_finalize (&builder);
2857+
} /* ecma_new_ecma_string_from_utf16 */
2858+
#endif /* JERRY_ICU */
2859+
28082860
/**
28092861
* @}
28102862
* @}

jerry-core/ecma/base/ecma-helpers.h

+4
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,10 @@ ecma_string_t *ecma_new_symbol_from_descriptor_string (ecma_value_t string_desc)
282282
bool ecma_prop_name_is_symbol (ecma_string_t *string_p);
283283
ecma_length_t ecma_op_advance_string_index (ecma_string_t *str_p, ecma_length_t index_num, bool is_unicode);
284284
#endif /* JERRY_ESNEXT */
285+
#if JERRY_ICU
286+
uint16_t *ecma_string_cesu8_to_utf16 (ecma_string_t *str_p, lit_utf8_size_t *utf16_length_p);
287+
ecma_string_t *ecma_new_ecma_string_from_utf16 (uint16_t *utf16_buff_p, lit_utf8_size_t utf16_length);
288+
#endif /* JERRY_ICU */
285289
#if JERRY_BUILTIN_CONTAINER
286290
ecma_string_t *ecma_new_map_key_string (ecma_value_t value);
287291
bool ecma_prop_name_is_map_key (ecma_string_t *string_p);

jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.c

+154
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@
3838
#include "ecma-regexp-object.h"
3939
#endif /* JERRY_BUILTIN_REGEXP */
4040

41+
#if JERRY_ICU
42+
#include "unicode/unorm2.h"
43+
#endif /* JERRY_ICU */
44+
4145
#if JERRY_BUILTIN_STRING
4246

4347
#define ECMA_BUILTINS_INTERNAL
@@ -80,6 +84,7 @@ enum
8084

8185
ECMA_STRING_PROTOTYPE_SUBSTR,
8286

87+
ECMA_STRING_PROTOTYPE_NORMALIZE,
8388
ECMA_STRING_PROTOTYPE_REPEAT,
8489
ECMA_STRING_PROTOTYPE_CODE_POINT_AT,
8590
ECMA_STRING_PROTOTYPE_PAD_START,
@@ -1225,6 +1230,150 @@ ecma_builtin_string_prototype_object_trim (ecma_string_t *original_string_p) /**
12251230
} /* ecma_builtin_string_prototype_object_trim */
12261231

12271232
#if JERRY_ESNEXT
1233+
#if JERRY_ICU
1234+
/**
1235+
* Helper macro to register form normalizer entries
1236+
*/
1237+
#define FORM_ENTRY(id, instance_cb) \
1238+
{ \
1239+
id, instance_cb \
1240+
}
1241+
1242+
/**
1243+
* ICU string normalizer instance callback
1244+
*/
1245+
typedef const UNormalizer2 *(*icu_string_normalizer_instance_cb_t) (UErrorCode *);
1246+
#else /* !JERRY_ICU */
1247+
/**
1248+
* Helper macro to register form normalizer entries
1249+
*/
1250+
#define FORM_ENTRY(id, instance_cb) \
1251+
{ \
1252+
id \
1253+
}
1254+
#endif /* JERRY_ICU */
1255+
1256+
/**
1257+
* Normalization form descriptor
1258+
*/
1259+
typedef struct
1260+
{
1261+
lit_magic_string_id_t kind; /**< kind */
1262+
#if JERRY_ICU
1263+
icu_string_normalizer_instance_cb_t instance_cb; /**< normalizer instance callback */
1264+
#endif /* JERRY_ICU */
1265+
} icu_string_form_normalizer_t;
1266+
1267+
/**
1268+
* List of normalization forms
1269+
*/
1270+
static const icu_string_form_normalizer_t icu_string_normalize_forms[] = {
1271+
FORM_ENTRY (LIT_MAGIC_STRING_NFC_U, unorm2_getNFCInstance),
1272+
FORM_ENTRY (LIT_MAGIC_STRING_NFD_U, unorm2_getNFDInstance),
1273+
FORM_ENTRY (LIT_MAGIC_STRING_NFKC_U, unorm2_getNFKCInstance),
1274+
FORM_ENTRY (LIT_MAGIC_STRING_NFKD_U, unorm2_getNFKDInstance)
1275+
};
1276+
1277+
#undef FORM_ENTRY
1278+
1279+
/**
1280+
* The String.prototype object's 'normalize' routine
1281+
*
1282+
* See also:
1283+
* ECMA-262 v12, 22.1.3.13
1284+
*
1285+
* @return ecma value
1286+
* Returned value must be freed with ecma_free_value.
1287+
*/
1288+
static ecma_value_t
1289+
ecma_builtin_string_prototype_object_normalize (ecma_string_t *original_string_p, /**< this argument */
1290+
ecma_value_t form_value) /**< normalization from */
1291+
{
1292+
#if JERRY_ICU
1293+
icu_string_normalizer_instance_cb_t normalizer_instance_cb = unorm2_getNFCInstance;
1294+
#endif /* JERRY_ICU */
1295+
1296+
if (!ecma_is_value_undefined (form_value))
1297+
{
1298+
ecma_string_t *form_p = ecma_op_to_string (form_value);
1299+
1300+
if (JERRY_UNLIKELY (form_p == NULL))
1301+
{
1302+
return ECMA_VALUE_ERROR;
1303+
}
1304+
1305+
size_t forms_size = sizeof (icu_string_normalize_forms) / sizeof (icu_string_normalize_forms[0]);
1306+
uint32_t form_idx = 0;
1307+
1308+
for (; form_idx < forms_size; form_idx++)
1309+
{
1310+
if (ecma_compare_ecma_string_to_magic_id (form_p, icu_string_normalize_forms[form_idx].kind))
1311+
{
1312+
#if JERRY_ICU
1313+
normalizer_instance_cb = icu_string_normalize_forms[form_idx].instance_cb;
1314+
#endif /* JERRY_ICU */
1315+
break;
1316+
}
1317+
}
1318+
1319+
ecma_deref_ecma_string (form_p);
1320+
1321+
if (form_idx >= forms_size)
1322+
{
1323+
return ecma_raise_range_error (ECMA_ERR_INVALID_NORMALIZATION_FORM);
1324+
}
1325+
}
1326+
1327+
#if JERRY_ICU
1328+
JERRY_ASSERT (normalizer_instance_cb != NULL);
1329+
size_t string_size = ecma_string_get_size (original_string_p);
1330+
1331+
if (string_size == 0)
1332+
{
1333+
ecma_ref_ecma_string (original_string_p);
1334+
return ecma_make_string_value (original_string_p);
1335+
}
1336+
1337+
UErrorCode status = U_ZERO_ERROR;
1338+
const UNormalizer2 *normalizer_cb = normalizer_instance_cb (&status);
1339+
1340+
if (!U_FAILURE (status))
1341+
{
1342+
ecma_value_t result = ECMA_VALUE_ERROR;
1343+
1344+
lit_utf8_size_t length;
1345+
uint16_t *buffer_p = ecma_string_cesu8_to_utf16 (original_string_p, &length);
1346+
int32_t norm_length = unorm2_normalize (normalizer_cb, buffer_p, (int32_t) length, NULL, 0, &status);
1347+
1348+
if (!U_FAILURE (status) || status == U_BUFFER_OVERFLOW_ERROR)
1349+
{
1350+
uint16_t *norm_buff_p = (uint16_t *) jmem_heap_alloc_block ((uint32_t) norm_length * sizeof (uint16_t));
1351+
1352+
status = U_ZERO_ERROR;
1353+
norm_length = unorm2_normalize (normalizer_cb, buffer_p, (int32_t) length, norm_buff_p, norm_length, &status);
1354+
1355+
if (!U_FAILURE (status))
1356+
{
1357+
result = ecma_make_string_value (ecma_new_ecma_string_from_utf16 (norm_buff_p, (uint32_t) norm_length));
1358+
}
1359+
1360+
jmem_heap_free_block (norm_buff_p, (uint32_t) norm_length * sizeof (uint16_t));
1361+
}
1362+
1363+
jmem_heap_free_block (buffer_p, length * sizeof (uint16_t));
1364+
1365+
if (!ECMA_IS_VALUE_ERROR (result))
1366+
{
1367+
return result;
1368+
}
1369+
}
1370+
1371+
return ecma_raise_type_error (ECMA_ERR_NORMALIZATION_FAILED);
1372+
#else /* !JERRY_ICU */
1373+
ecma_ref_ecma_string (original_string_p);
1374+
return ecma_make_string_value (original_string_p);
1375+
#endif /* JERRY_ICU */
1376+
} /* ecma_builtin_string_prototype_object_normalize */
12281377

12291378
/**
12301379
* The String.prototype object's 'repeat' routine
@@ -1570,6 +1719,11 @@ ecma_builtin_string_prototype_dispatch_routine (uint8_t builtin_routine_id, /**<
15701719
}
15711720
#endif /* JERRY_BUILTIN_ANNEXB */
15721721
#if JERRY_ESNEXT
1722+
case ECMA_STRING_PROTOTYPE_NORMALIZE:
1723+
{
1724+
ret_value = ecma_builtin_string_prototype_object_normalize (string_p, arg1);
1725+
break;
1726+
}
15731727
case ECMA_STRING_PROTOTYPE_REPEAT:
15741728
{
15751729
ret_value = ecma_builtin_string_prototype_object_repeat (string_p, arg1);

jerry-core/ecma/builtin-objects/ecma-builtin-string-prototype.inc.h

+1
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ ROUTINE (LIT_MAGIC_STRING_SUBSTR, ECMA_STRING_PROTOTYPE_SUBSTR, 2, 2)
6868
#endif /* JERRY_BUILTIN_ANNEXB */
6969

7070
#if JERRY_ESNEXT
71+
ROUTINE (LIT_MAGIC_STRING_NORMALIZE, ECMA_STRING_PROTOTYPE_NORMALIZE, 1, 0)
7172
ROUTINE (LIT_MAGIC_STRING_REPEAT, ECMA_STRING_PROTOTYPE_REPEAT, 1, 1)
7273
ROUTINE (LIT_MAGIC_STRING_STARTS_WITH, ECMA_STRING_PROTOTYPE_STARTS_WITH, 2, 1)
7374
ROUTINE (LIT_MAGIC_STRING_INCLUDES, ECMA_STRING_PROTOTYPE_INCLUDES, 2, 1)

0 commit comments

Comments
 (0)