Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,7 @@ rand = "0.9"
rand_distr = "0.5"
rand_pcg = "0.9"
rayon = "1.3.0"
regex = "1.12.2"
rkyv = "0.7"
rmp-serde = "1.2.0"
serde_json = "1.0.45"
Expand Down
4 changes: 2 additions & 2 deletions provider/baked/tests/data/hello_world_v1.rs.data
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
/// `icu`'s `_unstable` constructors.
///
/// Using this implementation will embed the following data in the binary's data segment:
/// * 176B for the lookup data structure (27 data identifiers)
/// * 234B for the lookup data structure (38 data identifiers)
/// * 1100B[^1] for the actual data (27 unique structs)
///
/// [^1]: these numbers can be smaller in practice due to linker deduplication
Expand All @@ -23,7 +23,7 @@ macro_rules! __impl_hello_world_v1 {
#[clippy::msrv = "1.83"]
impl $provider {
const DATA_HELLO_WORLD_V1: icu_provider::baked::zerotrie::DataForVarULEs<icu_provider::hello_world::HelloWorldV1> = {
const TRIE: icu_provider::baked::zerotrie::ZeroTrieSimpleAscii<&'static [u8]> = icu_provider::baked::zerotrie::ZeroTrieSimpleAscii { store: b"\xCDbcdefijlprsvz\x02\x04\nCIKX[^fpsn\x80s\x81e\x82-AT\x83\xC3lno\x012\x84\x85\xC2\x1E-\treverse\x90\t\xC301G\x0C\x0F\xC201\x06\xC212\x01\x86\x879\x8842\x89B\x8A-u-sd-gbeng\x8B\x8C\xC2ai\x01\x8D\x8Es\x8Fa\x90\0\x1Ereverse\x90\na\x90\x01t\x90\x02\xC2ou\x02\x90\x03\x90\x04r\x90\x05-Latn\x90\x06i\x90\x07h\x90\x08" };
const TRIE: icu_provider::baked::zerotrie::ZeroTrieSimpleAscii<&'static [u8]> = icu_provider::baked::zerotrie::ZeroTrieSimpleAscii { store: b"\xCDbcdefijlprsvz\x02\x04\x17y\x83\x85\x92\x95\x98\xA0\xAA\xADn\x80s\x81e\x82\xC2\x1E-\x03de\x82AT\x83\x1Ede-AT\x83\xC3lno\x01W\x84\x85\xC2\x1E-\x0E\xC2er\x02n\x85everse\x90\t\xC301G\x18\x1F\xC201\x0E\xC212\x05\x86\x1Een\x86\x87\x1Een\x879\x88\x1Een\x8842\x89\x1Een\x89B\x8A\xC2\x1E-\x06en-GB\x8Au-sd-gbeng\x8B\x1Een-GB\x8B\x8C\x1Eeo\x8C\xC2ai\x05\x8D\x1Efa\x8D\x8Es\x8Fa\x90\0\x1Ereverse\x90\na\x90\x01t\x90\x02\xC2ou\x02\x90\x03\x90\x04r\x90\x05-Latn\x90\x06i\x90\x07h\x90\x08" };
const VALUES: &'static zerovec::VarZeroSlice<<<icu_provider::hello_world::HelloWorldV1 as icu_provider::baked::zerotrie::DynamicDataMarker>::DataStruct as icu_provider::ule::MaybeAsVarULE>::EncodedStruct> = unsafe { zerovec::vecs::VarZeroSlice16::from_bytes_unchecked(b"\x1B\0\x19\0$\0.\09\0T\0_\0q\0\x80\0\x8F\0\x9E\0\xB1\0\xD8\0\xE6\0\xFA\0\x05\x01\x13\x01(\x012\x01=\x01H\x01\\\x01u\x01\x82\x01\x98\x01\xA4\x01\xAF\x01\xE0\xA6\x93\xE0\xA6\xB9\xE0\xA7\x87 \xE0\xA6\xAC\xE0\xA6\xBF\xE0\xA6\xB6\xE0\xA7\x8D\xE0\xA6\xACAhoj sv\xC4\x9BteHallo WeltServus Welt\xCE\x9A\xCE\xB1\xCE\xBB\xCE\xB7\xCE\xBC\xCE\xAD\xCF\x81\xCE\xB1 \xCE\xBA\xCF\x8C\xCF\x83\xCE\xBC\xCE\xB5Hello WorldHello from \xF0\x9F\x97\xBA\xEF\xB8\x8FHello from \xF0\x9F\x8C\x8DHello from \xF0\x9F\x8C\x8EHello from \xF0\x9F\x8C\x8FHello from \xF0\x9F\x87\xAC\xF0\x9F\x87\xA7Hello from \xF0\x9F\x8F\xB4\xF3\xA0\x81\xA7\xF3\xA0\x81\xA2\xF3\xA0\x81\xA5\xF3\xA0\x81\xAE\xF3\xA0\x81\xA7\xF3\xA0\x81\xBFSaluton, Mondo\xD8\xB3\xD9\x84\xD8\xA7\xD9\x85 \xD8\xAF\xD9\x86\xDB\x8C\xD8\xA7\xE2\x80\x8Ehei maailmaHall\xC3\xB3, heimur\xE3\x81\x93\xE3\x82\x93\xE3\x81\xAB\xE3\x81\xA1\xE3\x81\xAF\xE4\xB8\x96\xE7\x95\x8CAve, mundeOl\xC3\xA1, mundoSalut, lume\xD0\x9F\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82, \xD0\xBC\xD0\xB8\xD1\x80\xD0\x9F\xD0\xBE\xD0\xB7\xD0\xB4\xD1\x80\xD0\xB0\xD0\xB2 \xD1\x81\xD0\xB2\xD0\xB5\xD1\x82\xD0\xB5Pozdrav sveteXin ch\xC3\xA0o th\xE1\xBA\xBF gi\xE1\xBB\x9Bi\xE4\xBD\xA0\xE5\xA5\xBD\xE4\xB8\x96\xE7\x95\x8COlleh Dlrow\xE7\x95\x8C\xE4\xB8\x96\xE3\x81\xAF\xE3\x81\xA1\xE3\x81\xAB\xE3\x82\x93\xE3\x81\x93") };
unsafe { icu_provider::baked::zerotrie::DataForVarULEs::from_trie_and_values_unchecked(TRIE, VALUES) }
};
Expand Down
Binary file modified provider/blob/tests/data/v3.postcard
Binary file not shown.
62 changes: 62 additions & 0 deletions provider/core/src/hello_world.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ data_marker!(
HelloWorldV1,
HelloWorld<'static>,
has_checksum = true,
#[cfg(feature = "export")]
attributes_domain = "hello",
);

/// A data provider returning Hello World strings in different languages.
Expand Down Expand Up @@ -144,6 +146,22 @@ impl HelloWorldProvider {
("sr-Latn", "", "Pozdrav svete"),
("vi", "", "Xin chào thế giới"),
("zh", "", "你好世界"),
// test data for the attribute filter tests
("de", "de", "Hallo Welt"),
("de-AT", "de-AT", "Servus Welt"),
("en", "en", "Hello World"),
("en-001", "en", "Hello from 🗺️"),
// AFRICA
("en-002", "en", "Hello from 🌍"),
// AMERICAS
("en-019", "en", "Hello from 🌎"),
// ASIA
("en-142", "en", "Hello from 🌏"),
// GREAT BRITAIN
("en-GB", "en-GB", "Hello from 🇬🇧"),
("en-GB-u-sd-gbeng", "en-GB", "Hello from 🏴󠁧󠁢󠁥󠁮󠁧󠁿"),
("eo", "eo", "Saluton, Mondo"),
("fa", "fa", "سلام دنیا‎"),
];

/// Converts this provider into a [`BufferProvider`] that uses JSON serialization.
Expand Down Expand Up @@ -395,6 +413,50 @@ fn test_iter() {
DataIdentifierCow::from_locale(locale!("sr-Latn").into()),
DataIdentifierCow::from_locale(locale!("vi").into()),
DataIdentifierCow::from_locale(locale!("zh").into()),
DataIdentifierCow::from_borrowed_and_owned(
DataMarkerAttributes::from_str_or_panic("de"),
locale!("de").into()
),
DataIdentifierCow::from_borrowed_and_owned(
DataMarkerAttributes::from_str_or_panic("de-AT"),
locale!("de-AT").into()
),
DataIdentifierCow::from_borrowed_and_owned(
DataMarkerAttributes::from_str_or_panic("en"),
locale!("en").into()
),
DataIdentifierCow::from_borrowed_and_owned(
DataMarkerAttributes::from_str_or_panic("en"),
locale!("en-001").into()
),
DataIdentifierCow::from_borrowed_and_owned(
DataMarkerAttributes::from_str_or_panic("en"),
locale!("en-002").into()
),
DataIdentifierCow::from_borrowed_and_owned(
DataMarkerAttributes::from_str_or_panic("en"),
locale!("en-019").into()
),
DataIdentifierCow::from_borrowed_and_owned(
DataMarkerAttributes::from_str_or_panic("en"),
locale!("en-142").into()
),
DataIdentifierCow::from_borrowed_and_owned(
DataMarkerAttributes::from_str_or_panic("en-GB"),
locale!("en-GB").into()
),
DataIdentifierCow::from_borrowed_and_owned(
DataMarkerAttributes::from_str_or_panic("en-GB"),
locale!("en-GB-u-sd-gbeng").into()
),
DataIdentifierCow::from_borrowed_and_owned(
DataMarkerAttributes::from_str_or_panic("eo"),
locale!("eo").into()
),
DataIdentifierCow::from_borrowed_and_owned(
DataMarkerAttributes::from_str_or_panic("fa"),
locale!("fa").into()
),
])
);
}
5 changes: 4 additions & 1 deletion provider/core/src/response.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1111,7 +1111,10 @@ fn test_debug() {
use crate::prelude::*;
let resp = HelloWorldProvider
.load(DataRequest {
id: DataIdentifierBorrowed::for_locale(&icu_locale_core::locale!("en").into()),
id: DataIdentifierBorrowed::for_marker_attributes_and_locale(
DataMarkerAttributes::from_str_or_panic("en"),
&icu_locale_core::locale!("en").into(),
),
..Default::default()
})
.unwrap();
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"Servus Welt"}
1 change: 1 addition & 0 deletions provider/fs/tests/data/json/hello/world/v1/de/de.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"Hallo Welt"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"Hello from 🏴󠁧󠁢󠁥󠁮󠁧󠁿"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"Hello from 🇬🇧"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"Hello from 🗺️"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"Hello from 🌍"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"Hello from 🌎"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"Hello from 🌏"}
1 change: 1 addition & 0 deletions provider/fs/tests/data/json/hello/world/v1/en/en.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"Hello World"}
1 change: 1 addition & 0 deletions provider/fs/tests/data/json/hello/world/v1/eo/eo.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"Saluton, Mondo"}
1 change: 1 addition & 0 deletions provider/fs/tests/data/json/hello/world/v1/fa/fa.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"سلام دنیا‎"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Servus Welt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

Hallo Welt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
'Hello from 🏴󠁧󠁢󠁥󠁮󠁧󠁿
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hello from 🇬🇧
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hello from 🗺️
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hello from 🌍
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hello from 🌎
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hello from 🌏
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hello World
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Saluton, Mondo
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
سلام دنیا‎
1 change: 1 addition & 0 deletions provider/fs/tests/filter/hello/world/v1/.checksum
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1234
1 change: 1 addition & 0 deletions provider/fs/tests/filter/hello/world/v1/de-AT/de-AT.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"Servus Welt"}
1 change: 1 addition & 0 deletions provider/fs/tests/filter/hello/world/v1/en/en-001.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"Hello from 🗺️"}
1 change: 1 addition & 0 deletions provider/fs/tests/filter/hello/world/v1/en/en-002.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"Hello from 🌍"}
1 change: 1 addition & 0 deletions provider/fs/tests/filter/hello/world/v1/en/en-019.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"Hello from 🌎"}
1 change: 1 addition & 0 deletions provider/fs/tests/filter/hello/world/v1/en/en-142.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"Hello from 🌏"}
1 change: 1 addition & 0 deletions provider/fs/tests/filter/hello/world/v1/en/en.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"Hello World"}
1 change: 1 addition & 0 deletions provider/fs/tests/filter/hello/world/v1/eo/eo.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"Saluton, Mondo"}
1 change: 1 addition & 0 deletions provider/fs/tests/filter/hello/world/v1/fa/fa.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"message":"سلام دنیا‎"}
3 changes: 3 additions & 0 deletions provider/fs/tests/filter/manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"syntax": "Json"
}
2 changes: 2 additions & 0 deletions provider/icu4x-datagen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ icu_provider_registry = { workspace = true }
icu_provider_blob = { workspace = true, features = ["alloc"], optional = true }

clap = { workspace = true, features = ["derive"] }
displaydoc = { workspace = true }
eyre = { workspace = true }
log = { workspace = true }
regex = { workspace = true }
simple_logger = { workspace = true }

[features]
Expand Down
76 changes: 76 additions & 0 deletions provider/icu4x-datagen/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
)]

use clap::{Parser, ValueEnum};
use displaydoc::Display;
use eyre::WrapErr;
use icu_provider::export::ExportableProvider;
use icu_provider::hello_world::HelloWorldV1;
Expand All @@ -35,9 +36,63 @@ use icu_provider_export::prelude::*;
use icu_provider_export::ExportMetadata;
#[cfg(feature = "provider")]
use icu_provider_source::SourceDataProvider;
use regex::Regex;
use simple_logger::SimpleLogger;
use std::collections::HashMap;
use std::path::PathBuf;
use std::str::FromStr;

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a test for this new flag, because it is sufficiently novel that it isn't covered by the ExportDriver tests.

I suggest adding a command to export a human-readable JSON file tree into a tests/filter_test/out directory using an interesting attributes filter.

You can add the command line to generate the JSON here:

[tasks.testdata-hello-world]

You can add any additional interesting attributes to the hello world data here. There is already one attribute that is either empty or is the string "reverse":

impl HelloWorldProvider {

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a test that covers the following four cases:

  • normal regex syntax works
  • you can have multiple regexes for the same domain
  • you can invert the regex
  • the regex is a full-string match

The test does not check whether filtering works across multiple different domains, because that would have meant adding a second data_marker! struct, but I can (reluctantly) do that if you want me to.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok nvmd i don't think i can do the tests like that? things are failing and i don't really know how to fix those. i'll look into it again later though.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i duplicated some of the test data to assign data marker attributes to the new ones without eliminating the existing tests, lmk what you think!

/// One parsed `--attribute-filter` argument (see the `FromStr` impl for the accepted syntax).
#[derive(Clone)]
struct Filter {
/// The text before the first `=` of the argument.
domain: String,
/// The compiled pattern, already wrapped in `^(?:...)$` by the parser.
regex: Regex,
/// `true` when the pattern was prefixed with `-`.
inverted: bool,
}

// Reasons why parsing a `Filter` from its string form can fail.
// The user-facing text of each variant is the `displaydoc` attribute on it.
#[derive(Debug, Display)]
enum FilterError {
// The input does not contain a `=`.
#[displaydoc("no filter found. specify one after an =")]
NoFilter,
// The text after `=` (and after an optional `-`) does not start with `/`.
#[displaydoc("opening / delimiter for regex not found")]
NoOpeningSlash,
// The text following the opening `/` does not end with `/`.
#[displaydoc("closing / delimiter for regex not found")]
NoClosingSlash,
// `Regex::new` rejected the pattern; the wrapped error is displayed as-is.
#[displaydoc("{0}")]
Regex(regex::Error),
}

impl From<regex::Error> for FilterError {
fn from(value: regex::Error) -> Self {
FilterError::Regex(value)
}
}

// Empty impl: no `source()` override; the `Regex` variant already displays the wrapped error.
impl std::error::Error for FilterError {}

impl FromStr for Filter {
    type Err = FilterError;

    /// Parses a filter written as `domain=/regex/` or `domain=-/regex/`.
    ///
    /// Everything before the first `=` becomes the domain. A `-` directly after
    /// the `=` marks the filter as inverted. The text between the `/` delimiters
    /// is compiled as a regex wrapped in `^(?:...)$`.
    ///
    /// # Errors
    ///
    /// * [`FilterError::NoFilter`] if `s` contains no `=`.
    /// * [`FilterError::NoOpeningSlash`] if the pattern does not start with `/`.
    /// * [`FilterError::NoClosingSlash`] if the pattern does not end with `/`.
    /// * [`FilterError::Regex`] if the pattern fails to compile.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let Some((domain, spec)) = s.split_once('=') else {
            return Err(FilterError::NoFilter);
        };

        // A leading `-` flips the meaning of the filter.
        let (delimited, inverted) = match spec.strip_prefix('-') {
            Some(rest) => (rest, true),
            None => (spec, false),
        };

        // Peel off the `/` delimiters, opening one first so that the reported
        // error points at the first delimiter that is missing.
        let pattern = delimited
            .strip_prefix('/')
            .ok_or(FilterError::NoOpeningSlash)?
            .strip_suffix('/')
            .ok_or(FilterError::NoClosingSlash)?;

        // Anchor the user's pattern on both ends; the non-capturing group keeps
        // top-level alternations inside the anchors.
        let regex = Regex::new(&format!("^(?:{pattern})$"))?;

        Ok(Self {
            domain: domain.to_owned(),
            regex,
            inverted,
        })
    }
}

#[derive(Parser)]
#[command(name = "icu4x-datagen")]
Expand Down Expand Up @@ -169,6 +224,10 @@ struct Cli {
#[arg(help = "Analyzes the binary and only includes markers that are used by the binary.")]
markers_for_bin: Option<PathBuf>,

#[arg(long, value_name = "FILTER")]
#[arg(help = "Filter attributes on markers for a domain. Accepts form `domain=/regex/`.")]
attribute_filter: Vec<Filter>,

#[arg(long, short, num_args = 0..)]
#[cfg_attr(feature = "provider", arg(default_value = "recommended"))]
#[arg(
Expand Down Expand Up @@ -528,6 +587,23 @@ fn main() -> eyre::Result<()> {
driver.with_segmenter_models(cli.segmenter_models.clone())
};

let attribute_filters = cli
.attribute_filter
.iter()
.fold(HashMap::new(), |mut map, filter| {
map.entry(&filter.domain)
.and_modify(|v: &mut Vec<_>| v.push((filter.regex.clone(), filter.inverted)))
.or_insert_with(|| vec![(filter.regex.clone(), filter.inverted)]);
map
});
for (domain, filters) in attribute_filters {
driver = driver.with_marker_attributes_filter(domain, move |attr| {
filters
.iter()
.all(|(regex, inverted)| regex.is_match(attr) ^ inverted)
})
}

let metadata: Result<ExportMetadata, DataError> = match cli.format {
#[cfg(not(feature = "fs_exporter"))]
Format::Fs => {
Expand Down
3 changes: 3 additions & 0 deletions tools/make/data.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ exec --fail-on-error target/debug/icu4x-datagen --markers HelloWorldV1 --locales

# Baked
exec --fail-on-error target/debug/icu4x-datagen --markers HelloWorldV1 --locales full --format baked --pretty --overwrite --no-internal-fallback --deduplication none --out provider/baked/tests/data

# Filtered
exec --fail-on-error target/debug/icu4x-datagen --markers HelloWorldV1 --locales full --attribute-filter hello=/[d-f].+/ --attribute-filter hello=-/en-GB/ --attribute-filter hello=-/de/ --format fs --syntax json --out provider/fs/tests/filter --overwrite
'''

[tasks.testdata-check]
Expand Down