Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions gen/src/writer/ucd/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ mod ident;
mod name;
mod name_aliases;
mod normal;
mod numeric;
mod segment;
mod segment_tests;

Expand All @@ -38,6 +39,7 @@ pub fn generate() {
name::generate(&clean_dir("unic/ucd/name/tables"));
name_aliases::generate(&clean_dir("unic/ucd/name_aliases/tables"));
normal::generate(&clean_dir("unic/ucd/normal/tables"));
numeric::generate(&clean_dir("unic/ucd/numeric/tables"));
segment::generate(&clean_dir("unic/ucd/segment/tables"));
segment_tests::generate(&clean_dir("unic/ucd/segment/tests/tables"));
}
57 changes: 57 additions & 0 deletions gen/src/writer/ucd/numeric.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
use std::collections::BTreeMap;
use std::convert::TryInto;
use std::path::Path;

use crate::source::ucd::readme::UNICODE_VERSION;
use crate::source::ucd::unicode_data::UNICODE_DATA;

use crate::writer::common::emit_unicode_version;
use crate::writer::utils::tables::ToDirectCharTable;
use crate::writer::utils::write;

/// Generates every table of the numeric component into `dir`.
///
/// Emits the Unicode version first, then the numeric value tables
/// (`numeric_strs.rsv` and `numeric_values.rsv`).
pub fn generate(dir: &Path) {
emit_unicode_version(dir, &UNICODE_VERSION);
emit_digit_decimal_numeric(dir);
}

fn emit_digit_decimal_numeric(dir: &Path) {
let mut str_reprs = String::new();
// we pack as much info as possible into a u32, since due to char's alignment we get 32 bits of
// info for "free" - even with V=u8, size_of::<[(char, u8); N]>() = 64 * N
let map: BTreeMap<char, u32> = UNICODE_DATA
.entries
.iter()
.filter_map(|x| {
let num_str = x.numeric_numeric_value.as_deref()?;
let i = match x.digit_numeric_value {
Some(d) => {
assert!(matches!(d, 0..=9));
let is_decimal = x.decimal_numeric_value.is_some();
(1u32 << 31) | ((is_decimal as u32) << 30) | (d as u32)
}
None => {
// if we made V something with `&'static str`, it would almost double the size
// of the table in static memory. instead keep the string data separate, which
// allows packing a short idx/len into a u32, and also lets us deduplicate data
let idx = str_reprs.find(num_str).unwrap_or_else(|| {
let i = str_reprs.len();
str_reprs.push_str(num_str);
i
});
let idx: u16 = idx.try_into().unwrap();
let len: u16 = num_str.len().try_into().unwrap();
assert_eq!(len >> 15, 0);
((len as u32) << 16) | (idx as u32)
}
};
Some((x.character, i))
})
.collect();

write(dir, "numeric_strs.rsv", &format!("{:?}", str_reprs));
write(
dir,
"numeric_values.rsv",
&map.to_direct_char_table(|val, f| write!(f, "{:#x?}", val)),
);
}
1 change: 1 addition & 0 deletions unic/ucd/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ unic-ucd-ident = { path = "ident/", version = "0.9.0" }
unic-ucd-name = { path = "name/", version = "0.9.0" }
unic-ucd-name_aliases = { path = "name_aliases/", version = "0.9.0" }
unic-ucd-normal = { path = "normal/", version = "0.9.0", features = ["unic-ucd-category"] }
unic-ucd-numeric = { path = "numeric/", version = "0.9.0" }
unic-ucd-segment = { path = "segment/", version = "0.9.0" }
unic-ucd-version = { path = "version/", version = "0.9.0" }

Expand Down
22 changes: 22 additions & 0 deletions unic/ucd/numeric/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[package]
name = "unic-ucd-numeric"
version = "0.9.0"
edition = "2018"
authors = ["The UNIC Project Developers"]
repository = "https://github.com/open-i18n/rust-unic/"
license = "MIT/Apache-2.0"
description = "UNIC — Unicode Character Database — Numeric"
keywords = ["text", "unicode", "numeric", "digit", "decimal"]
categories = ["internationalization", "text-processing", "parsing", "rendering"]

# No tests/benches that depend on /data/
exclude = []

[dependencies]
unic-char-property = { path = "../../char/property/", version = "0.9.0" }
unic-ucd-version = { path = "../version/", version = "0.9.0" }

[badges]
maintenance = { status = "actively-developed" }
is-it-maintained-issue-resolution = { repository = "open-i18n/rust-unic" }
is-it-maintained-open-issues = { repository = "open-i18n/rust-unic" }
20 changes: 20 additions & 0 deletions unic/ucd/numeric/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! # UNIC — UCD — Numeric
//!
//! Numeric values of characters, exposed through [`NumericValue`].

// Crate metadata (name, version, description), re-exported at the crate root.
mod pkg_info;
pub use crate::pkg_info::{PKG_DESCRIPTION, PKG_NAME, PKG_VERSION};

// The `NumericValue` type together with its generated lookup tables.
mod numeric;
pub use crate::numeric::NumericValue;

use unic_ucd_version::UnicodeVersion;

/// The [Unicode version](https://www.unicode.org/versions/) of the data
/// tables included in this crate.
pub const UNICODE_VERSION: UnicodeVersion = include!("../tables/unicode_version.rsv");
103 changes: 103 additions & 0 deletions unic/ucd/numeric/src/numeric.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::fmt;
use unic_char_property::PartialCharProperty;

/// The numeric value of a character, tagged by its `Numeric_Type`.
///
/// Values are looked up with [`NumericValue::of`], which returns `None` for
/// characters that have no numeric value.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum NumericValue {
/// A decimal digit in the range `0..=9`, corresponding to `Numeric_Type=Decimal`
Decimal(u8),
/// A digit in the range `0..=9`, corresponding to `Numeric_Type=Digit`
Digit(u8),
/// A string representing an integer or a rational number (for example `"12"` or `"1/2"`),
/// corresponding to `Numeric_Type=Numeric`
Numeric(&'static str),
}

/// Generated lookup tables backing `NumericValue`.
mod data {
    use unic_char_property::tables::CharDataTable;

    /// Per-character packed values; decoded by `NumericValue::of`.
    pub const NUMERIC_VALUES: CharDataTable<u32> = include!("../tables/numeric_values.rsv");

    /// String pool that the non-digit entries of `NUMERIC_VALUES` index into.
    pub const NUMERIC_STRS: &str = include!("../tables/numeric_strs.rsv");
}

impl NumericValue {
    /// Returns the numeric value of `ch`, or `None` if `ch` has no entry in
    /// the numeric value table.
    ///
    /// # Panics
    ///
    /// Panics if the generated tables are inconsistent, i.e. a table entry
    /// points outside of the string pool. This cannot happen with tables
    /// produced by `gen/src/writer/ucd/numeric.rs`.
    pub fn of(ch: char) -> Option<NumericValue> {
        data::NUMERIC_VALUES.find(ch).map(|packed| {
            if packed >> 31 == 0 {
                // String entry: bits 0..=15 are the byte offset into
                // `NUMERIC_STRS`, bits 16..=30 the byte length.
                let start = (packed & 0xffff) as usize;
                let len = (packed >> 16) as usize;
                // Checked slicing on purpose: a stale or hand-edited table must
                // produce a panic, never an out-of-bounds read.
                NumericValue::Numeric(&data::NUMERIC_STRS[start..start + len])
            } else {
                // Digit entry: bit 30 distinguishes `Decimal` from `Digit`,
                // the low byte carries the digit itself.
                let digit = (packed & 0xff) as u8;
                if packed & (1 << 30) != 0 {
                    NumericValue::Decimal(digit)
                } else {
                    NumericValue::Digit(digit)
                }
            }
        })
    }

    /// Returns the textual form of the value: a single ASCII digit for
    /// `Decimal` and `Digit`, the stored string for `Numeric`.
    ///
    /// # Panics
    ///
    /// Panics if a `Decimal` or `Digit` holds a value outside `0..=9`. Values
    /// returned by [`NumericValue::of`] are always in range, but the variants
    /// are public and can be constructed with any `u8`, so the range has to be
    /// checked here rather than assumed.
    pub fn as_str(&self) -> &'static str {
        const DIGITS: &str = "0123456789";
        match *self {
            NumericValue::Decimal(d) | NumericValue::Digit(d) => {
                let at = usize::from(d);
                match DIGITS.get(at..at + 1) {
                    Some(digit) => digit,
                    None => panic!("NumericValue digit out of range 0..=9: {}", d),
                }
            }
            NumericValue::Numeric(s) => s,
        }
    }
}

impl PartialCharProperty for NumericValue {
fn of(ch: char) -> Option<NumericValue> {
Self::of(ch)
}
}

/// Formats the value exactly as [`NumericValue::as_str`] returns it.
impl fmt::Display for NumericValue {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `pad` (rather than `write_str`) so that width, fill, alignment and
        // precision given by the caller, e.g. `{:>4}`, are honoured. Output
        // with no formatting flags is unchanged.
        f.pad(self.as_str())
    }
}

#[cfg(test)]
mod tests {
    use super::NumericValue;

    /// Characters with `Numeric_Type=Decimal`.
    #[test]
    fn test_decimal() {
        // DIGIT SEVEN
        assert_eq!(NumericValue::of('7'), Some(NumericValue::Decimal(7)));
        // ARABIC-INDIC DIGIT THREE
        assert_eq!(NumericValue::of('٣'), Some(NumericValue::Decimal(3)));
    }

    /// Characters with `Numeric_Type=Digit`.
    #[test]
    fn test_digit() {
        // CIRCLED DIGIT EIGHT
        assert_eq!(NumericValue::of('⑧'), Some(NumericValue::Digit(8)));
        // DIGIT THREE FULL STOP
        assert_eq!(NumericValue::of('⒊'), Some(NumericValue::Digit(3)));
    }

    /// Characters with `Numeric_Type=Numeric`.
    #[test]
    fn test_numeric() {
        // VULGAR FRACTION ONE HALF
        assert_eq!(NumericValue::of('½'), Some(NumericValue::Numeric("1/2")));
        // ROMAN NUMERAL TWELVE
        assert_eq!(NumericValue::of('Ⅻ'), Some(NumericValue::Numeric("12")));
    }
}
20 changes: 20 additions & 0 deletions unic/ucd/numeric/src/pkg_info.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Package information
//!
//! All values are captured from Cargo's environment variables at compile time.

/// UNIC component version (Cargo's `CARGO_PKG_VERSION`).
pub const PKG_VERSION: &str = env!("CARGO_PKG_VERSION");

/// UNIC component name (Cargo's `CARGO_PKG_NAME`).
pub const PKG_NAME: &str = env!("CARGO_PKG_NAME");

/// UNIC component description (Cargo's `CARGO_PKG_DESCRIPTION`).
pub const PKG_DESCRIPTION: &str = env!("CARGO_PKG_DESCRIPTION");
3 changes: 3 additions & 0 deletions unic/ucd/numeric/tables/numeric_strs.rsv

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading