Skip to content

Commit a765086

Browse files
committed
optimization: Eliminate Cased table
`Cased` is a derived property: it is the union of the `Lowercase` property, the `Uppercase` property, and the `Titlecase_Letter` general category. We already have lookup tables for `Lowercase` and `Uppercase`, and `Titlecase_Letter` is very small. So instead of keeping a separate, largely redundant lookup table for `Cased`, just test each of those properties in turn. This will probably be slower than the old approach, but it is not a public API: it is only used in `str::to_lowercase` when deciding whether a Greek "sigma" should be mapped to `ς` or to `σ`. This is a very rare case, so it should not be performance sensitive.
1 parent fd75a9c commit a765086

File tree

5 files changed

+80
-56
lines changed

5 files changed

+80
-56
lines changed

library/alloc/src/str.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -418,9 +418,8 @@ impl str {
418418
}
419419

420420
fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
421-
use core::unicode::{Case_Ignorable, Cased};
422-
match iter.skip_while(|&c| Case_Ignorable(c)).next() {
423-
Some(c) => Cased(c),
421+
match iter.skip_while(|&c| c.is_case_ignorable()).next() {
422+
Some(c) => c.is_cased(),
424423
None => false,
425424
}
426425
}

library/core/src/char/methods.rs

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use crate::slice;
66
use crate::str::from_utf8_unchecked_mut;
77
use crate::ub_checks::assert_unsafe_precondition;
88
use crate::unicode::printable::is_printable;
9-
use crate::unicode::{self, conversions};
9+
use crate::unicode::{self, Case_Ignorable, conversions};
1010

1111
impl char {
1212
/// The lowest valid code point a `char` can have, `'\0'`.
@@ -969,7 +969,47 @@ impl char {
969969
#[must_use]
970970
#[inline]
971971
pub(crate) fn is_grapheme_extended(self) -> bool {
972-
unicode::Grapheme_Extend(self)
972+
!self.is_ascii() && unicode::Grapheme_Extend(self)
973+
}
974+
975+
/// Returns `true` if this `char` has the `Cased` derived property.
976+
///
977+
/// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
978+
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
979+
///
980+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
981+
/// [ucd]: https://www.unicode.org/reports/tr44/
982+
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
983+
#[must_use]
984+
#[inline]
985+
#[doc(hidden)]
986+
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
987+
pub fn is_cased(self) -> bool {
988+
if self.is_ascii() {
989+
self.is_ascii_alphabetic()
990+
} else {
991+
unicode::Lowercase(self) || unicode::Uppercase(self) || unicode::Lt(self)
992+
}
993+
}
994+
995+
/// Returns `true` if this `char` has the `Case_Ignorable` property.
996+
///
997+
/// `Case_Ignorable` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and
998+
/// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`].
999+
///
1000+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
1001+
/// [ucd]: https://www.unicode.org/reports/tr44/
1002+
/// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt
1003+
#[must_use]
1004+
#[inline]
1005+
#[doc(hidden)]
1006+
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
1007+
pub fn is_case_ignorable(self) -> bool {
1008+
if self.is_ascii() {
1009+
matches!(self, '\'' | '.' | ':' | '^' | '`')
1010+
} else {
1011+
Case_Ignorable(self)
1012+
}
9731013
}
9741014

9751015
/// Returns `true` if this `char` has one of the general categories for numbers.

library/core/src/unicode/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@
55
// for use in alloc, not re-exported in std.
66
#[rustfmt::skip]
77
pub use unicode_data::case_ignorable::lookup as Case_Ignorable;
8-
pub use unicode_data::cased::lookup as Cased;
98
pub use unicode_data::conversions;
109

1110
#[rustfmt::skip]
1211
pub(crate) use unicode_data::alphabetic::lookup as Alphabetic;
1312
pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend;
1413
pub(crate) use unicode_data::lowercase::lookup as Lowercase;
14+
pub(crate) use unicode_data::lt::lookup as Lt;
1515
pub(crate) use unicode_data::n::lookup as N;
1616
pub(crate) use unicode_data::uppercase::lookup as Uppercase;
1717
pub(crate) use unicode_data::white_space::lookup as White_Space;

library/core/src/unicode/unicode_data.rs

Lines changed: 34 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,15 @@
11
///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!
22
// Alphabetic : 1727 bytes, 142759 codepoints in 757 ranges (U+000041 - U+0323B0) using skiplist
33
// Case_Ignorable : 1053 bytes, 2749 codepoints in 452 ranges (U+000027 - U+0E01F0) using skiplist
4-
// Cased : 407 bytes, 4578 codepoints in 159 ranges (U+000041 - U+01F18A) using skiplist
5-
// Cc : 9 bytes, 65 codepoints in 2 ranges (U+000000 - U+0000A0) using skiplist
64
// Grapheme_Extend : 887 bytes, 2193 codepoints in 375 ranges (U+000300 - U+0E01F0) using skiplist
75
// Lowercase : 935 bytes, 2569 codepoints in 675 ranges (U+000061 - U+01E944) using bitset
6+
// Lt : 33 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using skiplist
87
// N : 457 bytes, 1911 codepoints in 144 ranges (U+000030 - U+01FBFA) using skiplist
98
// Uppercase : 799 bytes, 1978 codepoints in 656 ranges (U+000041 - U+01F18A) using bitset
109
// White_Space : 256 bytes, 25 codepoints in 10 ranges (U+000009 - U+003001) using cascading
1110
// to_lower : 11484 bytes
1211
// to_upper : 13432 bytes
13-
// Total : 31446 bytes
12+
// Total : 31063 bytes
1413

1514
#[inline(always)]
1615
const fn bitset_search<
@@ -324,52 +323,6 @@ pub mod case_ignorable {
324323
}
325324
}
326325

327-
#[rustfmt::skip]
328-
pub mod cased {
329-
use super::ShortOffsetRunHeader;
330-
331-
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 22] = [
332-
ShortOffsetRunHeader::new(0, 4256), ShortOffsetRunHeader::new(55, 5024),
333-
ShortOffsetRunHeader::new(65, 7296), ShortOffsetRunHeader::new(69, 7958),
334-
ShortOffsetRunHeader::new(78, 9398), ShortOffsetRunHeader::new(153, 11264),
335-
ShortOffsetRunHeader::new(155, 42560), ShortOffsetRunHeader::new(167, 43824),
336-
ShortOffsetRunHeader::new(187, 64256), ShortOffsetRunHeader::new(193, 65313),
337-
ShortOffsetRunHeader::new(197, 66560), ShortOffsetRunHeader::new(201, 67456),
338-
ShortOffsetRunHeader::new(223, 68736), ShortOffsetRunHeader::new(231, 71840),
339-
ShortOffsetRunHeader::new(239, 93760), ShortOffsetRunHeader::new(241, 119808),
340-
ShortOffsetRunHeader::new(243, 120486), ShortOffsetRunHeader::new(280, 122624),
341-
ShortOffsetRunHeader::new(303, 122928), ShortOffsetRunHeader::new(309, 125184),
342-
ShortOffsetRunHeader::new(311, 127280), ShortOffsetRunHeader::new(313, 1241482),
343-
];
344-
static OFFSETS: [u8; 319] = [
345-
65, 26, 6, 26, 47, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 195, 1, 4, 4, 208, 1, 36, 7, 2, 30, 5,
346-
96, 1, 42, 4, 2, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 9,
347-
41, 0, 38, 1, 1, 5, 1, 2, 43, 1, 4, 0, 86, 2, 6, 0, 11, 5, 43, 2, 3, 64, 192, 64, 0, 2, 6,
348-
2, 38, 2, 6, 2, 8, 1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4,
349-
13, 5, 3, 1, 7, 116, 1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1,
350-
4, 1, 6, 4, 1, 2, 4, 5, 5, 4, 1, 17, 32, 3, 2, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5,
351-
1, 0, 46, 18, 30, 132, 102, 3, 4, 1, 62, 2, 2, 1, 1, 1, 8, 21, 5, 1, 3, 0, 43, 1, 14, 6, 80,
352-
0, 7, 12, 5, 0, 26, 6, 26, 0, 80, 96, 36, 4, 36, 116, 11, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15,
353-
1, 7, 1, 2, 0, 1, 2, 3, 1, 42, 1, 9, 0, 51, 13, 51, 93, 22, 10, 22, 0, 64, 0, 64, 0, 85, 1,
354-
71, 1, 2, 2, 1, 2, 2, 2, 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5,
355-
1, 1, 3, 7, 1, 0, 2, 25, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1,
356-
8, 0, 10, 1, 20, 6, 6, 0, 62, 0, 68, 0, 26, 6, 26, 6, 26, 0,
357-
];
358-
pub fn lookup(c: char) -> bool {
359-
const {
360-
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
361-
let mut i = 0;
362-
while i < SHORT_OFFSET_RUNS.len() {
363-
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
364-
i += 1;
365-
}
366-
}
367-
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
368-
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
369-
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
370-
}
371-
}
372-
373326
#[rustfmt::skip]
374327
pub mod grapheme_extend {
375328
use super::ShortOffsetRunHeader;
@@ -549,6 +502,38 @@ pub mod lowercase {
549502
}
550503
}
551504

505+
#[rustfmt::skip]
506+
pub mod lt {
507+
use super::ShortOffsetRunHeader;
508+
509+
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 3] = [
510+
ShortOffsetRunHeader::new(0, 453), ShortOffsetRunHeader::new(1, 8072),
511+
ShortOffsetRunHeader::new(9, 1122301),
512+
];
513+
static OFFSETS: [u8; 21] = [
514+
0, 1, 2, 1, 2, 1, 38, 1, 0, 8, 8, 8, 8, 8, 12, 1, 15, 1, 47, 1, 0,
515+
];
516+
#[inline]
517+
pub fn lookup(c: char) -> bool {
518+
(c as u32) >= 0x1c5 && lookup_slow(c)
519+
}
520+
521+
#[inline(never)]
522+
fn lookup_slow(c: char) -> bool {
523+
const {
524+
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
525+
let mut i = 0;
526+
while i < SHORT_OFFSET_RUNS.len() {
527+
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
528+
i += 1;
529+
}
530+
}
531+
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
532+
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
533+
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
534+
}
535+
}
536+
552537
#[rustfmt::skip]
553538
pub mod n {
554539
use super::ShortOffsetRunHeader;

src/tools/unicode-table-generator/src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ static PROPERTIES: &[&str] = &[
9090
"Alphabetic",
9191
"Lowercase",
9292
"Uppercase",
93-
"Cased",
93+
"Lt",
9494
"Case_Ignorable",
9595
"Grapheme_Extend",
9696
"White_Space",

0 commit comments

Comments
 (0)