Document locale preferences

unicode-org · Nov 2, 2024 · 95105d4 · 95105d4
1 parent efa5a9a
commit 95105d4
Show file tree

Hide file tree

Showing 36 changed files with 1,085 additions and 237 deletions.
diff --git a/components/list/README.md b/components/list/README.md
diff --git a/components/list/src/list_formatter.rs b/components/list/src/list_formatter.rs
@@ -74,6 +74,7 @@ macro_rules! constructor {
 }
 
 fn get_data_locale_from_prefs(prefs: ListFormatterPreferences) -> DataLocale {
+    // XXX: This should utilize region source priority.
     DataLocale::from_subtags(
         prefs.language,
         prefs.script,

diff --git a/components/list/src/options.rs b/components/list/src/options.rs
@@ -2,19 +2,29 @@
 // called LICENSE at the top level of the ICU4X source tree
 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
 
-/// TODO
+/// A list of options set by the developer to adjust the behavior of the ListFormatter.
+///
+/// # Examples
+/// ```
+/// use icu::list::{ListFormatterOptions, ListLength};
+///
+/// let options = ListFormatterOptions::new()
+///     .with_length(ListLength::Wide);
+/// ```
 #[derive(Default, Debug, Clone)]
 #[non_exhaustive]
 pub struct ListFormatterOptions {
-    /// Length
+    /// The length variant should reflect available space for the list.
     pub length: Option<ListLength>,
 }
 
 impl ListFormatterOptions {
+    /// Constructs a new [`ListFormatterOptions`] struct.
     pub fn new() -> Self {
         Self::default()
     }
 
+    /// Augments the struct with the given [`ListLength`].
     pub fn with_length(mut self, length: ListLength) -> Self {
         self.length = Some(length);
         self

diff --git a/components/locale_core/README.md b/components/locale_core/README.md
diff --git a/components/locale_core/src/extensions/mod.rs b/components/locale_core/src/extensions/mod.rs
@@ -37,6 +37,14 @@
 //! assert_eq!(loc.extensions.unicode.keywords.get(&key), Some(&value));
 //! ```
 //!
+//! # Syntactic vs Semantic Extension Handling
+//!
+//! This module is useful when you need to work with Locale extensions at a syntactic level,
+//! perhaps for parsing or generating locale identifiers that include any syntactically valid
+//! extensions.
+//! For handling and validating known CLDR values with semantic meaning, see the
+//! [`crate::preferences::extensions`] module.
+//!
 //! [`LanguageIdentifier`]: super::LanguageIdentifier
 //! [`Locale`]: super::Locale
 //! [`subtags`]: super::subtags

diff --git a/components/locale_core/src/extensions/unicode/keywords.rs b/components/locale_core/src/extensions/unicode/keywords.rs
@@ -237,7 +237,7 @@ impl Keywords {
     ///
     /// Returns the old Unicode extension keywords.
     ///
-    /// # Example
+    /// # Examples
     ///
     /// ```
     /// use icu::locale::Locale;

diff --git a/components/locale_core/src/extensions/unicode/subdivision.rs b/components/locale_core/src/extensions/unicode/subdivision.rs
@@ -4,8 +4,10 @@
 
 use core::str::FromStr;
 
+use tinystr::TinyAsciiStr;
+
 use crate::parser::ParseError;
-use crate::subtags::Region;
+use crate::subtags::{Region, Subtag};
 
 impl_tinystr_subtag!(
     /// A subdivision suffix used in [`SubdivisionId`].
@@ -131,6 +133,19 @@ impl SubdivisionId {
         let suffix = SubdivisionSuffix::try_from_utf8(suffix_code_units)?;
         Ok(Self { region, suffix })
     }
+
+    /// Convert to [`Subtag`]
+    pub fn into_subtag(self) -> Subtag {
+        use writeable::Writeable;
+
+        // XXX: This can be optimized to concatenate two TinyAsciiStr.
+        let mut result = alloc::string::String::with_capacity(8);
+        let _ = self.write_to(&mut result);
+        #[allow(clippy::expect_used)]
+        let tinystr = TinyAsciiStr::try_from_str(&result)
+            .expect("Constructing 8 chars TinyAsciiStr from two 4 char ones");
+        Subtag::from_tinystr_unvalidated(tinystr)
+    }
 }
 
 impl writeable::Writeable for SubdivisionId {

diff --git a/components/locale_core/src/lib.rs b/components/locale_core/src/lib.rs
@@ -8,7 +8,9 @@
 //! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
 //!
 //! The module provides algorithms for parsing a string into a well-formed language or locale identifier
-//! as defined by [`UTS #35: Unicode LDML 3. Unicode Language and Locale Identifiers`].
+//! as defined by [`UTS #35: Unicode LDML 3. Unicode Language and Locale Identifiers`]. Additionally,
+//! the module provides the [`preferences`] interface for operations on locale preferences and
+//! conversions to and from Unicode locale extensions.
 //!
 //! [`Locale`] is the most common structure to use for storing information about a language,
 //! script, region, variants and extensions. In almost all cases, this struct should be used as the

diff --git a/components/locale_core/src/preferences/extensions/mod.rs b/components/locale_core/src/preferences/extensions/mod.rs
@@ -2,6 +2,22 @@
 // called LICENSE at the top level of the ICU4X source tree
 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
 
-//! TODO
+//! A set of extensions which correspond to preferences.
+//!
+//! The module provides structures that represent known values for each keyword
+//! in Locale [`extensions`](crate::extensions) with semantic meaning.
+//!
+//! # Syntactic vs Semantic Extension Handling
+//!
+//! This module ensures that only valid, recognized values are used, providing semantic validation.
+//! It would reject invalid values such as `-u-hc-BB` because `BB` is not a known hour cycle. This
+//! is ideal for applications that require strict adherence to standardized values and need to
+//! prevent invalid or unrecognized data.
+//!
+//! If you need to construct syntactically valid Locale extensions without semantic validation,
+//! allowing any valid key-value pair regardless of recognition, consider using the
+//! [`crate::extensions`] module.
+//!
+//! [`Locale`]: crate::Locale
 
 pub mod unicode;
diff --git a/components/locale_core/src/preferences/extensions/unicode/errors.rs b/components/locale_core/src/preferences/extensions/unicode/errors.rs
@@ -2,14 +2,12 @@
 // called LICENSE at the top level of the ICU4X source tree
 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
 
-//! TODO
+//! Errors related to parsing of Preferences.
 
+/// Error returned by parsers of unicode extensions as preferences.
 #[non_exhaustive]
 #[derive(Debug)]
-/// TODO
 pub enum PreferencesParseError {
-    /// TODO
-    UnknownKeyword,
-    /// TODO
+    /// The given keyword value is not a valid preference variant.
     InvalidKeywordValue,
 }
diff --git a/components/locale_core/src/preferences/extensions/unicode/keywords/calendar.rs b/components/locale_core/src/preferences/extensions/unicode/keywords/calendar.rs
@@ -6,36 +6,65 @@
 
 use crate::preferences::extensions::unicode::enum_keyword;
 
-// https://github.com/unicode-org/cldr/blob/main/common/bcp47/calendar.xml
 enum_keyword!(
-    /// TODO
+    /// Islamic Calendar sub-type
+    ///
+    /// The list is based on [`CLDR Calendars`](https://github.com/unicode-org/cldr/blob/main/common/bcp47/calendar.xml)
     IslamicCalendarAlgorithm {
-        "umalqura" => Umalqura,
-        "tbla" => Tbla,
-        "civil" => Civil,
-        "rgsa" => Rgsa
+        /// Islamic calendar, Umm al-Qura
+        ("umalqura" => Umalqura),
+        /// Hijri calendar, tabular (intercalary years \[2,5,7,10,13,16,18,21,24,26,29] - astronomical epoch)
+        ("tbla" => Tbla),
+        /// Islamic calendar, tabular (intercalary years \[2,5,7,10,13,16,18,21,24,26,29] - civil epoch)
+        ("civil" => Civil),
+        /// Hijri calendar, Saudi Arabia sighting
+        ("rgsa" => Rgsa)
 });
 
 enum_keyword!(
-    /// TODO
+    /// A Unicode Calendar Identifier defines a type of calendar.
+    ///
+    /// This selects calendar-specific data within a locale used for formatting and parsing,
+    /// such as date/time symbols and patterns; it also selects supplemental calendarData used
+    /// for calendrical calculations. The value can affect the computation of the first day of the week.
+    ///
+    /// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeCalendarIdentifier).
     CalendarAlgorithm {
-        "buddhist" => Buddhist,
-        "chinese" => Chinese,
-        "coptic" => Coptic,
-        "dangi" => Dangi,
-        "ethioaa" => Ethioaa,
-        "ethiopic" => Ethiopic,
-        "gregory" => Gregory,
-        "hebrew" => Hebrew,
-        "indian" => Indian,
-        "islamic" => Islamic(IslamicCalendarAlgorithm) {
-            "umalqura" => Umalqura,
-            "tbla" => Tbla,
-            "civil" => Civil,
-            "rgsa" => Rgsa
-        },
-        "iso8601" => Iso8601,
-        "japanese" => Japanese,
-        "persian" => Persian,
-        "roc" => Roc
+        /// Thai Buddhist calendar (same as Gregorian except for the year)
+        ("buddhist" => Buddhist),
+        /// Traditional Chinese calendar
+        ("chinese" => Chinese),
+        /// Coptic calendar
+        ("coptic" => Coptic),
+        /// Traditional Korean calendar
+        ("dangi" => Dangi),
+        /// Ethiopic calendar, Amete Alem (epoch approx. 5493 B.C.E.)
+        ("ethioaa" => Ethioaa),
+        /// Ethiopic calendar, Amete Mihret (epoch approx. 8 C.E.)
+        ("ethiopic" => Ethiopic),
+        /// Gregorian calendar
+        ("gregory" => Gregory),
+        /// Traditional Hebrew calendar
+        ("hebrew" => Hebrew),
+        /// Indian calendar
+        ("indian" => Indian),
+        /// Islamic calendar
+        ("islamic" => Islamic(IslamicCalendarAlgorithm) {
+             /// Islamic calendar, Umm al-Qura
+             ("umalqura" => Umalqura),
+             /// Hijri calendar, tabular (intercalary years \[2,5,7,10,13,16,18,21,24,26,29] - astronomical epoch)
+             ("tbla" => Tbla),
+             /// Islamic calendar, tabular (intercalary years \[2,5,7,10,13,16,18,21,24,26,29] - civil epoch)
+             ("civil" => Civil),
+             /// Hijri calendar, Saudi Arabia sighting
+             ("rgsa" => Rgsa)
+        }),
+        /// ISO calendar (Gregorian calendar using the ISO 8601 calendar week rules)
+        ("iso8601" => Iso8601),
+        /// Japanese Imperial calendar
+        ("japanese" => Japanese),
+        /// Persian calendar
+        ("persian" => Persian),
+        /// Republic of China calendar
+        ("roc" => Roc)
 }, "ca");
diff --git a/components/locale_core/src/preferences/extensions/unicode/keywords/collation.rs b/components/locale_core/src/preferences/extensions/unicode/keywords/collation.rs
@@ -5,25 +5,46 @@
 use crate::preferences::extensions::unicode::enum_keyword;
 
 enum_keyword!(
-    /// TODO
+    /// A Unicode Collation Identifier defines a type of collation (sort order).
+    ///
+    /// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeCollationIdentifier).
     CollationType {
-        "big5han" => Big5han,
-        "compat" => Compat,
-        "dict" => Dict,
-        "direct" => Direct,
-        "ducet" => Ducet,
-        "emoji" => Emoji,
-        "eor" => Eor,
-        "gb2312" => Gb2312,
-        "phonebk" => Phonebk,
-        "phonetic" => Phonetic,
-        "pinyin" => Pinyin,
-        "reformed" => Reformed,
-        "search" => Search,
-        "searchjl" => Searchjl,
-        "standard" => Standard,
-        "stroke" => Stroke,
-        "trad" => Trad,
-        "unihan" => Unihan,
-        "zhuyin" => Zhuyin,
+        /// Pinyin ordering for Latin, big5 charset ordering for CJK characters (used in Chinese)
+        ("big5han" => Big5han),
+        /// A previous version of the ordering, for compatibility
+        ("compat" => Compat),
+        /// Dictionary style ordering (such as in Sinhala)
+        ("dict" => Dict),
+        /// Binary code point order (used in Hindi)
+        ("direct" => Direct),
+        /// The default Unicode collation element table order
+        ("ducet" => Ducet),
+        /// Recommended ordering for emoji characters
+        ("emoji" => Emoji),
+        /// European ordering rules
+        ("eor" => Eor),
+        /// Pinyin ordering for Latin, gb2312han charset ordering for CJK characters (used in Chinese)
+        ("gb2312" => Gb2312),
+        /// Phonebook style ordering (such as in German)
+        ("phonebk" => Phonebk),
+        /// Phonetic ordering (sorting based on pronunciation)
+        ("phonetic" => Phonetic),
+        /// Pinyin ordering for Latin and for CJK characters (used in Chinese)
+        ("pinyin" => Pinyin),
+        /// Reformed ordering (such as in Swedish)
+        ("reformed" => Reformed),
+        /// Special collation type for string search
+        ("search" => Search),
+        /// Special collation type for Korean initial consonant search
+        ("searchjl" => Searchjl),
+        /// Default ordering for each language
+        ("standard" => Standard),
+        /// Pinyin ordering for Latin, stroke order for CJK characters (used in Chinese)
+        ("stroke" => Stroke),
+        /// Traditional style ordering (such as in Spanish)
+        ("trad" => Trad),
+        /// Pinyin ordering for Latin, Unihan radical-stroke ordering for CJK characters (used in Chinese)
+        ("unihan" => Unihan),
+        /// Pinyin ordering for Latin, zhuyin order for Bopomofo and CJK characters (used in Chinese)
+        ("zhuyin" => Zhuyin),
 }, "co");
diff --git a/components/locale_core/src/preferences/extensions/unicode/keywords/currency.rs b/components/locale_core/src/preferences/extensions/unicode/keywords/currency.rs
@@ -8,7 +8,9 @@ use crate::{extensions::unicode::Value, subtags::Subtag};
 use tinystr::TinyAsciiStr;
 
 struct_keyword!(
-    /// TODO
+    /// A Unicode Currency Identifier defines a type of currency.
+    ///
+    /// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeCurrencyIdentifier).
     CurrencyType,
     "cu",
     TinyAsciiStr<3>,

diff --git a/components/locale_core/src/preferences/extensions/unicode/keywords/currency_format.rs b/components/locale_core/src/preferences/extensions/unicode/keywords/currency_format.rs
@@ -5,8 +5,12 @@
 use crate::preferences::extensions::unicode::enum_keyword;
 
 enum_keyword!(
-    /// TODO
+    /// A Unicode Currency Format Identifier defines a style for currency formatting.
+    ///
+    /// The valid values are listed in [LDML](https://unicode.org/reports/tr35/#UnicodeCurrencyFormatIdentifier).
     CurrencyFormatStyle {
-        "standard" => Standard,
-        "account" => Account
+        /// Negative numbers use the minusSign symbol (the default)
+        ("standard" => Standard),
+        /// Negative numbers use parentheses or equivalent
+        ("account" => Account)
 }, "cf");
diff --git a/components/locale_core/src/preferences/extensions/unicode/keywords/dictionary_break.rs b/components/locale_core/src/preferences/extensions/unicode/keywords/dictionary_break.rs
@@ -10,7 +10,10 @@ use alloc::vec::Vec;
 use core::str::FromStr;
 
 struct_keyword!(
-    /// TODO
+    /// A Unicode Dictionary Break Exclusion Identifier specifies
+    /// scripts to be excluded from dictionary-based text break (for words and lines).
+    ///
+    /// The valid values consist of one or more items of type [`Script`](crate::subtags::Script).
     DictionaryBreakScriptExclusions,
     "dx",
     Vec<Script>,