Skip to content

Commit

Permalink
Make the normalizer work with new Unicode 16 normalization behaviors (#…
Browse files Browse the repository at this point in the history
…4860)

Closes #4859

ICU4C PR coming up.

---------

Co-authored-by: Robert Bastian <[email protected]>
  • Loading branch information
hsivonen and robertbastian authored Oct 28, 2024
1 parent 161bb79 commit c3da0ca
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 15 deletions.
21 changes: 17 additions & 4 deletions components/normalizer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,19 @@ const SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16: u16 = 2;

/// Marker that a complex decomposition isn't round-trippable
/// under re-composition.
const NON_ROUND_TRIP_MARKER: u16 = 1;
///
/// TODO: When taking a data format break, swap this and
/// `BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER` around
/// to make backward-combiningness use the same bit in all
/// cases.
const NON_ROUND_TRIP_MARKER: u16 = 0b1;

/// Marker that a complex decomposition starts with a starter
/// that can combine backwards.
///
/// This is bit 1 of the low 16 bits of the trie value; bit 0 is
/// `NON_ROUND_TRIP_MARKER`, so both markers can be set at once.
const BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER: u16 = 0b10;

/// The bitwise OR of the marker bits above (`0b11`).
///
/// Values above this are treated as a BMP character: a low 16-bit
/// half of a trie value that is less than or equal to this constant
/// is a marker value, not a character.
const HIGHEST_MARKER: u16 = NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER;

/// Checks if a trie value carries a (non-zero) canonical
/// combining class.
Expand Down Expand Up @@ -398,6 +410,7 @@ impl CharacterAndTrieValue {
/// Reports whether this character can combine with a character
/// that precedes it.
///
/// The answer is derived from `self.trie_val` alone; the checks run
/// in order and the first match wins.
pub fn can_combine_backwards(&self) -> bool {
    let trie_val = self.trie_val;

    // Case 1: whatever `decomposition_starts_with_non_starter` accepts.
    if decomposition_starts_with_non_starter(trie_val) {
        return true;
    }

    // Case 2: the trie value is exactly the backward-combining starter marker.
    if trie_val == BACKWARD_COMBINING_STARTER_MARKER {
        return true;
    }

    // Case 3: a complex decomposition that starts with a starter that can
    // combine backwards. The low half must equal the marker once bit 0 is
    // masked off, and the high half must be non-zero.
    // TODO: Combine with the previous condition when taking a data format break.
    let low_half = trie_val as u16;
    if (low_half & !1) == BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER
        && (trie_val >> 16) != 0
    {
        return true;
    }

    // Case 4: trie value within 0x1161..=0x11C2.
    // NOTE(review): presumably the conjoining Hangul jamo vowels and
    // trailing consonants — confirm against the data format.
    in_inclusive_range32(trie_val, 0x1161, 0x11C2)
}
#[inline(always)]
Expand Down Expand Up @@ -426,7 +439,7 @@ impl CharacterAndTrieValue {
if lead == 0 {
return true;
}
if lead == NON_ROUND_TRIP_MARKER {
if lead <= HIGHEST_MARKER {
return false;
}
if (trail_or_complex & 0x7F) == 0x3C
Expand Down Expand Up @@ -830,14 +843,14 @@ where
} else {
let trail_or_complex = (decomposition >> 16) as u16;
let lead = decomposition as u16;
if lead > NON_ROUND_TRIP_MARKER && trail_or_complex != 0 {
if lead > HIGHEST_MARKER && trail_or_complex != 0 {
// Decomposition into two BMP characters: starter and non-starter
let starter = char_from_u16(lead);
let combining = char_from_u16(trail_or_complex);
self.buffer
.push(CharacterAndClass::new_with_placeholder(combining));
(starter, 0)
} else if lead > NON_ROUND_TRIP_MARKER {
} else if trail_or_complex == 0 {
if lead != FDFA_MARKER {
debug_assert_ne!(
lead, SPECIAL_NON_STARTER_DECOMPOSITION_MARKER_U16,
Expand Down
12 changes: 1 addition & 11 deletions components/normalizer/src/properties.rs
Original file line number Diff line number Diff line change
Expand Up @@ -396,17 +396,7 @@ impl CanonicalDecompositionBorrowed<'_> {
let offset24 = offset - tables.scalars16.len();
if let Some(first_c) = tables.scalars24.get(offset24) {
if len == 1 {
if c != first_c {
return Decomposed::Singleton(first_c);
} else {
// Singleton representation used to avoid
// NFC passthrough of characters that combine
// with starters that can occur as the first
// character of an expansion decomposition.
// See section 5 of
// https://www.unicode.org/L2/L2024/24009-utc178-properties-recs.pdf
return Decomposed::Default;
}
return Decomposed::Singleton(first_c);
}
if let Some(second_c) = tables.scalars24.get(offset24 + 1) {
return Decomposed::Expansion(first_c, second_c);
Expand Down

0 comments on commit c3da0ca

Please sign in to comment.