Skip to content

Commit

Permalink
internal script iterator
Browse files Browse the repository at this point in the history
zero dependencies 🎉
  • Loading branch information
chearon committed Sep 12, 2023
1 parent bc4447c commit cd0149b
Show file tree
Hide file tree
Showing 10 changed files with 380 additions and 52 deletions.
3 changes: 2 additions & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -97,4 +97,5 @@ em++ \
gen/lang-script-database.cc \
gen/grapheme-break-trie.cc \
gen/line-break-trie.cc \
gen/emoji-trie.cc
gen/emoji-trie.cc \
gen/script-trie.cc
37 changes: 36 additions & 1 deletion gen.js
Original file line number Diff line number Diff line change
Expand Up @@ -374,19 +374,54 @@ async function generateEmojiTrie() {
writeTrie(path.join(__dirname, 'gen/emoji-trie.cc'), 'emoji_trie', trie);
}

/**
 * Downloads the Unicode 15.0.0 `Scripts.txt` table and generates two files
 * from it:
 *
 * - `gen/script-trie.cc` (via `writeTrie`): maps code point ranges to a
 *   numeric script id.
 * - `gen/script-names.ts`: a default-exported array mapping that id back to
 *   the script name.
 *
 * Ids are handed out starting at 1, in the order each script name first
 * appears in the table; slot 0 of the names array is `undefined`.
 *
 * @returns {Promise<void>}
 * @throws {Error} If the download does not answer with HTTP 200, or if the
 *   response contains no parseable script ranges (nothing is written then).
 */
async function generateScriptTrie() {
  const url = 'https://www.unicode.org/Public/15.0.0/ucd/Scripts.txt';
  const res = await fetch(url);

  // Fail before doing any work, and say what failed: `new Error(404)` alone
  // produces a message with no context.
  if (res.status !== 200) {
    throw new Error(`Failed to download ${url}: HTTP ${res.status}`);
  }

  const text = await res.text();

  // Matches data rows of the form `XXXX ; Name` or `XXXX..YYYY ; Name`.
  // Rows that do not start with a hex code point (comments, blanks) are
  // skipped because the pattern is anchored with `^`.
  const re = /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)/gm;
  const trie = new UnicodeTrieBuilder();
  /** @type {Map<string, number>} */
  const names = new Map();
  let js = '// generated from gen.js\nexport default [\n undefined,\n';

  // A row with a single code point has no end capture; treat it as a
  // one-element range.
  for (const [, start, end = start, name] of text.matchAll(re)) {
    let id = names.get(name);
    if (id === undefined) {
      // First sighting of this script: the next free id is size + 1 because
      // id 0 is reserved for the `undefined` slot.
      id = names.size + 1;
      names.set(name, id);
      js += ` '${name}',\n`;
    }
    trie.setRange(parseInt(start, 16), parseInt(end, 16), id);
  }

  // A 200 response that is not the expected table (error page, format
  // change) must not silently overwrite the generated files with empty data.
  if (names.size === 0) {
    throw new Error(`No script ranges found in ${url}`);
  }

  writeTrie(path.join(__dirname, 'gen/script-trie.cc'), 'script_trie', trie);
  js += '];';
  fs.writeFileSync(path.join(__dirname, 'gen/script-names.ts'), js);
}

// Translate each command-line argument into the generator function it names.
// Any unrecognized argument prints the usage text and exits with status 1,
// so `fns` only ever holds valid generators.
const fns = process.argv.slice(2).map(command => {
  switch (command) {
    case 'line-break-trie': return generateLineBreakTrie;
    case 'grapheme-break-trie': return generateGraphemeBreakTrie;
    case 'lang-script-database': return generateLangScriptDatabase;
    case 'entity-trie': return generateEntityTrie;
    case 'emoji-trie': return generateEmojiTrie;
    case 'script-trie': return generateScriptTrie;
  }
  // The usage text is a single template literal; the list of commands must
  // stay in sync with the cases above.
  console.error(`Usage: node gen.js (cmd )+
Available commands:
line-break-trie
grapheme-break-trie
lang-script-database
entity-trie
emoji-trie
script-trie`);
  process.exit(1);
});

Expand Down
167 changes: 167 additions & 0 deletions gen/script-names.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
// generated from gen.js
//
// Script names indexed by the numeric id that generateScriptTrie (gen.js)
// stores in gen/script-trie.cc. Ids start at 1 and are assigned in the order
// each script first appears in the Unicode 15.0.0 Scripts.txt table, which is
// why index 0 is an `undefined` placeholder rather than a name.
// Do not edit by hand; regenerate with `node gen.js script-trie`.
export default [
undefined,
'Common',
'Latin',
'Greek',
'Cyrillic',
'Armenian',
'Hebrew',
'Arabic',
'Syriac',
'Thaana',
'Devanagari',
'Bengali',
'Gurmukhi',
'Gujarati',
'Oriya',
'Tamil',
'Telugu',
'Kannada',
'Malayalam',
'Sinhala',
'Thai',
'Lao',
'Tibetan',
'Myanmar',
'Georgian',
'Hangul',
'Ethiopic',
'Cherokee',
'Canadian_Aboriginal',
'Ogham',
'Runic',
'Khmer',
'Mongolian',
'Hiragana',
'Katakana',
'Bopomofo',
'Han',
'Yi',
'Old_Italic',
'Gothic',
'Deseret',
'Inherited',
'Tagalog',
'Hanunoo',
'Buhid',
'Tagbanwa',
'Limbu',
'Tai_Le',
'Linear_B',
'Ugaritic',
'Shavian',
'Osmanya',
'Cypriot',
'Braille',
'Buginese',
'Coptic',
'New_Tai_Lue',
'Glagolitic',
'Tifinagh',
'Syloti_Nagri',
'Old_Persian',
'Kharoshthi',
'Balinese',
'Cuneiform',
'Phoenician',
'Phags_Pa',
'Nko',
'Sundanese',
'Lepcha',
'Ol_Chiki',
'Vai',
'Saurashtra',
'Kayah_Li',
'Rejang',
'Lycian',
'Carian',
'Lydian',
'Cham',
'Tai_Tham',
'Tai_Viet',
'Avestan',
'Egyptian_Hieroglyphs',
'Samaritan',
'Lisu',
'Bamum',
'Javanese',
'Meetei_Mayek',
'Imperial_Aramaic',
'Old_South_Arabian',
'Inscriptional_Parthian',
'Inscriptional_Pahlavi',
'Old_Turkic',
'Kaithi',
'Batak',
'Brahmi',
'Mandaic',
'Chakma',
'Meroitic_Cursive',
'Meroitic_Hieroglyphs',
'Miao',
'Sharada',
'Sora_Sompeng',
'Takri',
'Caucasian_Albanian',
'Bassa_Vah',
'Duployan',
'Elbasan',
'Grantha',
'Pahawh_Hmong',
'Khojki',
'Linear_A',
'Mahajani',
'Manichaean',
'Mende_Kikakui',
'Modi',
'Mro',
'Old_North_Arabian',
'Nabataean',
'Palmyrene',
'Pau_Cin_Hau',
'Old_Permic',
'Psalter_Pahlavi',
'Siddham',
'Khudawadi',
'Tirhuta',
'Warang_Citi',
'Ahom',
'Anatolian_Hieroglyphs',
'Hatran',
'Multani',
'Old_Hungarian',
'SignWriting',
'Adlam',
'Bhaiksuki',
'Marchen',
'Newa',
'Osage',
'Tangut',
'Masaram_Gondi',
'Nushu',
'Soyombo',
'Zanabazar_Square',
'Dogra',
'Gunjala_Gondi',
'Makasar',
'Medefaidrin',
'Hanifi_Rohingya',
'Sogdian',
'Old_Sogdian',
'Elymaic',
'Nandinagari',
'Nyiakeng_Puachue_Hmong',
'Wancho',
'Chorasmian',
'Dives_Akuru',
'Khitan_Small_Script',
'Yezidi',
'Cypro_Minoan',
'Old_Uyghur',
'Tangsa',
'Toto',
'Vithkuqi',
'Kawi',
'Nag_Mundari',
];
4 changes: 4 additions & 0 deletions gen/script-trie.cc

Large diffs are not rendered by default.

Binary file modified overflow.wasm
Binary file not shown.
5 changes: 1 addition & 4 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,5 @@
"ua"
],
"author": "Caleb Hearon <[email protected]>",
"license": "MIT",
"dependencies": {
"itemizer": "^1.0.6"
}
"license": "MIT"
}
3 changes: 0 additions & 3 deletions src/deps.ts

This file was deleted.

Loading

0 comments on commit cd0149b

Please sign in to comment.