Skip to content

Commit beeb8e3

Browse files
committed
Auto merge of #146173 - Kmeakin:km/unicode-data/no-ascii, r=jhpratt
Don't include ASCII characters in Unicode tables. Split off from #145219.
2 parents a09fbe2 + a8c6694 commit beeb8e3

File tree

7 files changed

+320
-247
lines changed

7 files changed

+320
-247
lines changed

library/alloc/src/str.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -418,9 +418,8 @@ impl str {
418418
}
419419

420420
fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
421-
use core::unicode::{Case_Ignorable, Cased};
422-
match iter.skip_while(|&c| Case_Ignorable(c)).next() {
423-
Some(c) => Cased(c),
421+
match iter.skip_while(|&c| c.is_case_ignorable()).next() {
422+
Some(c) => c.is_cased(),
424423
None => false,
425424
}
426425
}

library/core/src/char/methods.rs

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -969,7 +969,43 @@ impl char {
969969
#[must_use]
970970
#[inline]
971971
pub(crate) fn is_grapheme_extended(self) -> bool {
972-
unicode::Grapheme_Extend(self)
972+
!self.is_ascii() && unicode::Grapheme_Extend(self)
973+
}
974+
975+
/// Returns `true` if this `char` has the `Cased` property.
976+
///
977+
/// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
978+
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
979+
///
980+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
981+
/// [ucd]: https://www.unicode.org/reports/tr44/
982+
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
983+
#[must_use]
984+
#[inline]
985+
#[doc(hidden)]
986+
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
987+
pub fn is_cased(self) -> bool {
988+
if self.is_ascii() { self.is_ascii_alphabetic() } else { unicode::Cased(self) }
989+
}
990+
991+
/// Returns `true` if this `char` has the `Case_Ignorable` property.
992+
///
993+
/// `Case_Ignorable` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
994+
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
995+
///
996+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
997+
/// [ucd]: https://www.unicode.org/reports/tr44/
998+
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
999+
#[must_use]
1000+
#[inline]
1001+
#[doc(hidden)]
1002+
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
1003+
pub fn is_case_ignorable(self) -> bool {
1004+
if self.is_ascii() {
1005+
matches!(self, '\'' | '.' | ':' | '^' | '`')
1006+
} else {
1007+
unicode::Case_Ignorable(self)
1008+
}
9731009
}
9741010

9751011
/// Returns `true` if this `char` has one of the general categories for numbers.

library/core/src/unicode/unicode_data.rs

Lines changed: 276 additions & 243 deletions
Large diffs are not rendered by default.

src/tools/unicode-table-generator/src/cascading_map.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ impl RawEmitter {
6464

6565
writeln!(&mut self.file, "#[inline]").unwrap();
6666
writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
67+
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
6768
writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap();
6869
for arm in arms {
6970
writeln!(&mut self.file, " {arm},").unwrap();

src/tools/unicode-table-generator/src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,7 @@ fn load_data() -> UnicodeData {
195195
.into_iter()
196196
.flatten()
197197
.flat_map(|cp| cp.scalar())
198+
.filter(|c| !c.is_ascii())
198199
.map(u32::from)
199200
.collect::<Vec<_>>();
200201
(prop, ranges_from_set(&codepoints))

src/tools/unicode-table-generator/src/raw_emitter.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ impl RawEmitter {
9898
self.blank_line();
9999

100100
writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap();
101+
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
101102
if first_code_point > 0x7f {
102103
writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} &&").unwrap();
103104
}

src/tools/unicode-table-generator/src/skiplist.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ impl RawEmitter {
9999
if first_code_point > 0x7f {
100100
writeln!(&mut self.file, "#[inline]").unwrap();
101101
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
102+
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
102103
writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} && lookup_slow(c)")
103104
.unwrap();
104105
writeln!(&mut self.file, "}}").unwrap();
@@ -107,6 +108,7 @@ impl RawEmitter {
107108
writeln!(&mut self.file, "fn lookup_slow(c: char) -> bool {{").unwrap();
108109
} else {
109110
writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap();
111+
writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap();
110112
}
111113
writeln!(&mut self.file, " const {{").unwrap();
112114
writeln!(

0 commit comments

Comments
 (0)