Skip to content

Commit 4ca4c44

Browse files
committed
optimization: Eliminate Cased table
`Cased` is a derived property: it is the union of the `Lowercase` property, the `Uppercase` property, and the `Titlecase_Letter` (`Lt`) general category. We already have lookup tables for `Lowercase` and `Uppercase`, and `Titlecase_Letter` is very small. So instead of keeping a separate, largely redundant lookup table for `Cased`, just test each of those properties in turn. This will probably be slower than the old approach, but `char::is_cased` is not a public API: it is only used in `str::to_lowercase` when deciding whether a Greek capital sigma (`Σ`) should be mapped to `ς` or to `σ`. This is a very rare case, so it should not be performance sensitive.
1 parent 68baa87 commit 4ca4c44

File tree

4 files changed

+42
-58
lines changed

4 files changed

+42
-58
lines changed

library/core/src/char/methods.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -985,7 +985,11 @@ impl char {
985985
#[doc(hidden)]
986986
#[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")]
987987
pub fn is_cased(self) -> bool {
988-
if self.is_ascii() { self.is_ascii_alphabetic() } else { unicode::Cased(self) }
988+
if self.is_ascii() {
989+
self.is_ascii_alphabetic()
990+
} else {
991+
unicode::Lowercase(self) || unicode::Uppercase(self) || unicode::Lt(self)
992+
}
989993
}
990994

991995
/// Returns `true` if this `char` has the `Case_Ignorable` property.

library/core/src/unicode/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@
55
// for use in alloc, not re-exported in std.
66
#[rustfmt::skip]
77
pub use unicode_data::case_ignorable::lookup as Case_Ignorable;
8-
pub use unicode_data::cased::lookup as Cased;
98
pub use unicode_data::conversions;
109

1110
#[rustfmt::skip]
1211
pub(crate) use unicode_data::alphabetic::lookup as Alphabetic;
1312
pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend;
1413
pub(crate) use unicode_data::lowercase::lookup as Lowercase;
14+
pub(crate) use unicode_data::lt::lookup as Lt;
1515
pub(crate) use unicode_data::n::lookup as N;
1616
pub(crate) use unicode_data::uppercase::lookup as Uppercase;
1717
pub(crate) use unicode_data::white_space::lookup as White_Space;

library/core/src/unicode/unicode_data.rs

Lines changed: 35 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,15 @@
11
//! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!
22
// Alphabetic : 1723 bytes, 142707 codepoints in 755 ranges (U+0000AA - U+0323B0) using skiplist
33
// Case_Ignorable : 1043 bytes, 2744 codepoints in 447 ranges (U+0000A8 - U+0E01F0) using skiplist
4-
// Cased : 403 bytes, 4526 codepoints in 157 ranges (U+0000AA - U+01F18A) using skiplist
54
// Grapheme_Extend : 887 bytes, 2193 codepoints in 375 ranges (U+000300 - U+0E01F0) using skiplist
65
// Lowercase : 933 bytes, 2543 codepoints in 674 ranges (U+0000AA - U+01E944) using bitset
6+
// Lt : 33 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using skiplist
77
// N : 455 bytes, 1901 codepoints in 143 ranges (U+0000B2 - U+01FBFA) using skiplist
88
// Uppercase : 797 bytes, 1952 codepoints in 655 ranges (U+0000C0 - U+01F18A) using bitset
99
// White_Space : 256 bytes, 19 codepoints in 8 ranges (U+000085 - U+003001) using cascading
1010
// to_lower : 11484 bytes
1111
// to_upper : 13432 bytes
12-
// Total : 31413 bytes
12+
// Total : 31043 bytes
1313

1414
#[inline(always)]
1515
const fn bitset_search<
@@ -337,59 +337,6 @@ pub mod case_ignorable {
337337
}
338338
}
339339

340-
#[rustfmt::skip]
341-
pub mod cased {
342-
use super::ShortOffsetRunHeader;
343-
344-
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 22] = [
345-
ShortOffsetRunHeader::new(0, 4256), ShortOffsetRunHeader::new(51, 5024),
346-
ShortOffsetRunHeader::new(61, 7296), ShortOffsetRunHeader::new(65, 7958),
347-
ShortOffsetRunHeader::new(74, 9398), ShortOffsetRunHeader::new(149, 11264),
348-
ShortOffsetRunHeader::new(151, 42560), ShortOffsetRunHeader::new(163, 43824),
349-
ShortOffsetRunHeader::new(183, 64256), ShortOffsetRunHeader::new(189, 65313),
350-
ShortOffsetRunHeader::new(193, 66560), ShortOffsetRunHeader::new(197, 67456),
351-
ShortOffsetRunHeader::new(219, 68736), ShortOffsetRunHeader::new(227, 71840),
352-
ShortOffsetRunHeader::new(235, 93760), ShortOffsetRunHeader::new(237, 119808),
353-
ShortOffsetRunHeader::new(239, 120486), ShortOffsetRunHeader::new(276, 122624),
354-
ShortOffsetRunHeader::new(299, 122928), ShortOffsetRunHeader::new(305, 125184),
355-
ShortOffsetRunHeader::new(307, 127280), ShortOffsetRunHeader::new(309, 1241482),
356-
];
357-
static OFFSETS: [u8; 315] = [
358-
170, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 195, 1, 4, 4, 208, 1, 36, 7, 2, 30, 5, 96, 1, 42, 4,
359-
2, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 9, 41, 0, 38, 1, 1,
360-
5, 1, 2, 43, 1, 4, 0, 86, 2, 6, 0, 11, 5, 43, 2, 3, 64, 192, 64, 0, 2, 6, 2, 38, 2, 6, 2, 8,
361-
1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, 5, 3, 1, 7, 116,
362-
1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, 1, 6, 4, 1, 2, 4,
363-
5, 5, 4, 1, 17, 32, 3, 2, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1, 0, 46, 18, 30, 132,
364-
102, 3, 4, 1, 62, 2, 2, 1, 1, 1, 8, 21, 5, 1, 3, 0, 43, 1, 14, 6, 80, 0, 7, 12, 5, 0, 26, 6,
365-
26, 0, 80, 96, 36, 4, 36, 116, 11, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, 1, 7, 1, 2, 0, 1, 2, 3,
366-
1, 42, 1, 9, 0, 51, 13, 51, 93, 22, 10, 22, 0, 64, 0, 64, 0, 85, 1, 71, 1, 2, 2, 1, 2, 2, 2,
367-
4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2, 25,
368-
1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, 10, 1, 20, 6, 6, 0,
369-
62, 0, 68, 0, 26, 6, 26, 6, 26, 0,
370-
];
371-
#[inline]
372-
pub fn lookup(c: char) -> bool {
373-
debug_assert!(!c.is_ascii());
374-
(c as u32) >= 0xaa && lookup_slow(c)
375-
}
376-
377-
#[inline(never)]
378-
fn lookup_slow(c: char) -> bool {
379-
const {
380-
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
381-
let mut i = 0;
382-
while i < SHORT_OFFSET_RUNS.len() {
383-
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
384-
i += 1;
385-
}
386-
}
387-
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
388-
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
389-
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
390-
}
391-
}
392-
393340
#[rustfmt::skip]
394341
pub mod grapheme_extend {
395342
use super::ShortOffsetRunHeader;
@@ -572,6 +519,39 @@ pub mod lowercase {
572519
}
573520
}
574521

522+
#[rustfmt::skip]
523+
pub mod lt {
524+
use super::ShortOffsetRunHeader;
525+
526+
static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 3] = [
527+
ShortOffsetRunHeader::new(0, 453), ShortOffsetRunHeader::new(1, 8072),
528+
ShortOffsetRunHeader::new(9, 1122301),
529+
];
530+
static OFFSETS: [u8; 21] = [
531+
0, 1, 2, 1, 2, 1, 38, 1, 0, 8, 8, 8, 8, 8, 12, 1, 15, 1, 47, 1, 0,
532+
];
533+
#[inline]
534+
pub fn lookup(c: char) -> bool {
535+
debug_assert!(!c.is_ascii());
536+
(c as u32) >= 0x1c5 && lookup_slow(c)
537+
}
538+
539+
#[inline(never)]
540+
fn lookup_slow(c: char) -> bool {
541+
const {
542+
assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32);
543+
let mut i = 0;
544+
while i < SHORT_OFFSET_RUNS.len() {
545+
assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len());
546+
i += 1;
547+
}
548+
}
549+
// SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX`
550+
// and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`.
551+
unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) }
552+
}
553+
}
554+
575555
#[rustfmt::skip]
576556
pub mod n {
577557
use super::ShortOffsetRunHeader;

src/tools/unicode-table-generator/src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ static PROPERTIES: &[&str] = &[
9090
"Alphabetic",
9191
"Lowercase",
9292
"Uppercase",
93-
"Cased",
93+
"Lt",
9494
"Case_Ignorable",
9595
"Grapheme_Extend",
9696
"White_Space",

0 commit comments

Comments
 (0)