Skip to content

Commit bc59654

Browse files
committed
Measure the zerotrie sizes:
locales_only_zerotrie: 3339
regions_only_zerotrie: 1268
sparse_map: 50751
sparse_zerotrie: 272897
hybrid_sparse_map: 2223
num_dense_locales: 207 (122130 B)
hybrid_sparse_zerotrie: 13146
1 parent bf3388f commit bc59654

File tree

1 file changed

+84
-13
lines changed

1 file changed

+84
-13
lines changed

provider/source/tests/dnametest.rs

Lines changed: 84 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -11,12 +11,18 @@ use icu::locale::{
1111
use icu_experimental::displaynames::provider::RegionDisplayNamesV1;
1212
use icu_provider::prelude::*;
1313
use icu_provider_source::SourceDataProvider;
14+
use litemap::LiteMap;
1415
use ndarray::{Array2, Axis};
1516
use tinystr::TinyAsciiStr;
17+
use zerotrie::ZeroTrieSimpleAscii;
1618

1719
#[test]
1820
fn dnametest() {
19-
let provider = SourceDataProvider::new();
21+
let provider = SourceDataProvider::new_custom()
22+
.with_cldr(&std::path::PathBuf::from(
23+
"/home/sffc/lib/cldr-46.0.0-json-full",
24+
))
25+
.unwrap();
2026

2127
let locales: BTreeMap<DataIdentifierCow<'_>, usize> =
2228
IterableDataProvider::<RegionDisplayNamesV1>::iter_ids(&provider)
@@ -40,16 +46,23 @@ fn dnametest() {
4046
})
4147
.collect();
4248

43-
let en_names = payloads
44-
.get(&DataIdentifierCow::from_locale(locale!("en").into()))
45-
.unwrap();
49+
let unique_names: Vec<&str> = payloads
50+
.values()
51+
.flat_map(|v| v.get().names.iter_values())
52+
.collect::<BTreeSet<_>>()
53+
.into_iter()
54+
.collect();
55+
let unique_names_required_bits = (unique_names.len() as f64).log2().ceil() as usize;
56+
println!("unique_names: {} ({unique_names_required_bits})", unique_names.len());
4657

47-
let regions = en_names
58+
let regions: BTreeSet<TinyAsciiStr<3>> = payloads
59+
.get(&DataIdentifierCow::from_locale(locale!("en").into()))
60+
.unwrap()
4861
.get()
4962
.names
5063
.iter_keys()
5164
.map(|s| s.try_into_tinystr().unwrap())
52-
.collect::<BTreeSet<TinyAsciiStr<3>>>();
65+
.collect();
5366

5467
let expander = LocaleExpander::try_new_common_unstable(&provider).unwrap();
5568
let fallbacker = LocaleFallbacker::try_new_unstable(&provider).unwrap();
@@ -80,11 +93,14 @@ fn dnametest() {
8093
.collect();
8194

8295
let mut dense_matrix =
83-
Array2::<Option<&str>>::default((locales.len() + script_locales.len(), regions.len()));
96+
Array2::<Option<usize>>::default((locales.len() + script_locales.len(), regions.len()));
8497

8598
for (i, (_locale, payload)) in payloads.iter().enumerate() {
8699
for (j, region) in regions.iter().enumerate() {
87-
dense_matrix[(i, j)] = payload.get().names.get(&region.to_unvalidated());
100+
if let Some(name) = payload.get().names.get(&region.to_unvalidated()) {
101+
let index = unique_names.binary_search(&name).unwrap();
102+
dense_matrix[(i, j)] = Some(index);
103+
}
88104
}
89105
}
90106

@@ -137,11 +153,66 @@ fn dnametest() {
137153
values.iter().filter(|v| v.is_some()).count()
138154
});
139155

140-
for (i, locale) in locales.keys().enumerate() {
141-
println!("{locale:<3}: {}", large_small[i]);
142-
}
143-
for (i, locale) in script_locales.keys().enumerate() {
144-
let i = i + locales.len();
156+
for (i, locale) in locales.keys().chain(script_locales.keys()).enumerate() {
145157
println!("{locale:<3}: {}", large_small[i]);
146158
}
159+
160+
let locales_only_zerotrie: ZeroTrieSimpleAscii<Vec<u8>> = locales
161+
.keys()
162+
.chain(script_locales.keys())
163+
.enumerate()
164+
.map(|(i, locale)| (locale.to_string(), i))
165+
.collect();
166+
println!("locales_only_zerotrie: {}", locales_only_zerotrie.byte_len());
167+
168+
let regions_only_zerotrie: ZeroTrieSimpleAscii<Vec<u8>> = regions.iter().enumerate()
169+
.map(|(i, locale)| (locale.to_string(), i))
170+
.collect();
171+
172+
println!("regions_only_zerotrie: {}", regions_only_zerotrie.byte_len());
173+
174+
let sparse_map: LiteMap<String, usize> = locales
175+
.keys()
176+
.chain(script_locales.keys())
177+
.enumerate()
178+
.flat_map(|(i, locale)| {
179+
let dense_matrix = &dense_matrix;
180+
regions.iter().enumerate().filter_map(move |(j, region)| {
181+
dense_matrix[(i, j)].map(|index| (format!("{locale}/{region}"), index))
182+
})
183+
})
184+
.collect();
185+
println!("sparse_map: {}", sparse_map.len());
186+
187+
let sparse_zerotrie: ZeroTrieSimpleAscii<Vec<u8>> =
188+
sparse_map.iter().map(|(k, v)| (k, *v)).collect();
189+
println!("sparse_zerotrie: {}", sparse_zerotrie.byte_len());
190+
191+
let dense_row_bit_size = regions.len() * unique_names_required_bits;
192+
193+
let mut num_dense_locales = 0;
194+
let hybrid_sparse_map: LiteMap<String, usize> = locales
195+
.keys()
196+
.chain(script_locales.keys())
197+
.enumerate()
198+
.flat_map(|(i, locale)| {
199+
let dense_matrix = &dense_matrix;
200+
let row: Vec<(String, usize)> = regions.iter().enumerate().filter_map(move |(j, region)| {
201+
dense_matrix[(i, j)].map(|index| (format!("{locale}/{region}"), index))
202+
}).collect();
203+
let inner_zerotrie: ZeroTrieSimpleAscii<_> = row.iter().map(|(k, v)| (k, *v)).collect();
204+
if inner_zerotrie.byte_len() * 8 > dense_row_bit_size {
205+
num_dense_locales += 1;
206+
vec![(locale.to_string(), 0)].into_iter()
207+
} else {
208+
row.into_iter()
209+
}
210+
})
211+
.collect();
212+
println!("hybrid_sparse_map: {}", hybrid_sparse_map.len());
213+
println!("num_dense_locales: {} ({} B)", num_dense_locales, num_dense_locales * dense_row_bit_size / 8);
214+
215+
let hybrid_sparse_zerotrie: ZeroTrieSimpleAscii<Vec<u8>> =
216+
hybrid_sparse_map.iter().map(|(k, v)| (k, *v)).collect();
217+
println!("hybrid_sparse_zerotrie: {}", hybrid_sparse_zerotrie.byte_len());
147218
}

0 commit comments

Comments
 (0)