Skip to content

Commit

Permalink
turn script-names.ts into an iso15924 db
Browse files Browse the repository at this point in the history
https://www.unicode.org/iso15924/iso15924.txt

It will be useful to have the code conversions (e.g. "Arab") when
poking in fonts as part of the upcoming glyph cache.

The script trie now uses the code numbers that Unicode uses rather
than an arbitrary ID.
  • Loading branch information
chearon committed Oct 1, 2023
1 parent 8199085 commit 6d6381e
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 183 deletions.
Binary file modified dist/overflow.wasm
Binary file not shown.
53 changes: 39 additions & 14 deletions gen.js
Original file line number Diff line number Diff line change
Expand Up @@ -374,14 +374,43 @@ async function generateEmojiTrie() {
writeTrie(path.join(__dirname, 'gen/emoji-trie.cc'), 'emoji_trie', trie);
}

async function getScriptNames() {
const res = await fetch('https://www.unicode.org/iso15924/iso15924.txt');
if (res.status !== 200) throw new Error(res.statusText);
const text = await res.text();
/** @type {Map<string, number>} */
const pvaToNo = new Map();
/** @type {Map<string, string>} */
const pvaToCode = new Map();
/** @type {Map<number, string>} */
const noToPva = new Map();

for (const line of text.split('\n')) {
if (line.startsWith('#') || !line.trim().length) continue;
const [code, no, /*en*/, /*fr*/, pva, /*ver*/, /*date*/] = line.split(';');
pvaToNo.set(pva, +no);
pvaToCode.set(pva, code);
noToPva.set(+no, pva);
}

return {pvaToNo, pvaToCode, noToPva};
}

/**
 * Regenerates gen/script-names.ts: fetches the script tables via
 * getScriptNames() and writes each one out as an exported `Map` built from
 * its JSON-serialized [key, value] entries.
 */
async function generateScriptNames() {
  const tables = await getScriptNames();
  const target = path.join(__dirname, 'gen/script-names.ts');

  // One output line per exported table, preceded by the "generated" banner
  const lines = [
    '// generated from gen.js',
    `export const pvaToNo = new Map(${JSON.stringify(Array.from(tables.pvaToNo))});`,
    `export const pvaToCode = new Map(${JSON.stringify(Array.from(tables.pvaToCode))});`,
    `export const noToPva = new Map(${JSON.stringify(Array.from(tables.noToPva))});`,
  ];

  // The file ends with a trailing newline
  fs.writeFileSync(target, lines.join('\n') + '\n');
}

async function generateScriptTrie() {
const res = await fetch('https://www.unicode.org/Public/15.0.0/ucd/Scripts.txt');
const re = /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)/gm;
const trie = new UnicodeTrieBuilder();
/** @type {Map<string, number>} */
const names = new Map();
let js = '// generated from gen.js\nexport default [\n undefined,\n';
let nextId = 1;
const {pvaToNo} = await getScriptNames();

if (res.status !== 200) throw new Error(res.status);

Expand All @@ -392,19 +421,13 @@ async function generateScriptTrie() {
while ((match = re.exec(text))) {
const start = match[1];
const end = match[2] != null ? match[2] : start;
const name = match[3];
if (!names.has(name)) {
const id = nextId++;
js += ` '${name}',\n`
names.set(name, id);
}
const id = names.get(name);
trie.setRange(parseInt(start, 16), parseInt(end, 16), id);
const pva = match[3];
const no = pvaToNo.get(pva);
if (no === undefined) throw new Error(`PVA ${pva} not found in iso15924.txt`);
trie.setRange(parseInt(start, 16), parseInt(end, 16), no);
}

writeTrie(path.join(__dirname, 'gen/script-trie.cc'), 'script_trie', trie);
js += '];';
fs.writeFileSync(path.join(__dirname, 'gen/script-names.ts'), js);
}

const fns = process.argv.slice(2).map(command => {
Expand All @@ -414,14 +437,16 @@ const fns = process.argv.slice(2).map(command => {
if (command === 'entity-trie') return generateEntityTrie;
if (command === 'emoji-trie') return generateEmojiTrie;
if (command === 'script-trie') return generateScriptTrie;
if (command === 'script-names') return generateScriptNames;
console.error(`Usage: node gen.js (cmd )+
Available commands:
line-break-trie
grapheme-break-trie
lang-script-database
entity-trie
emoji-trie
script-trie`);
script-trie
script-names`);
process.exit(1);
});

Expand Down
169 changes: 3 additions & 166 deletions gen/script-names.ts
Original file line number Diff line number Diff line change
@@ -1,167 +1,4 @@
// generated from gen.js
export default [
undefined,
'Common',
'Latin',
'Greek',
'Cyrillic',
'Armenian',
'Hebrew',
'Arabic',
'Syriac',
'Thaana',
'Devanagari',
'Bengali',
'Gurmukhi',
'Gujarati',
'Oriya',
'Tamil',
'Telugu',
'Kannada',
'Malayalam',
'Sinhala',
'Thai',
'Lao',
'Tibetan',
'Myanmar',
'Georgian',
'Hangul',
'Ethiopic',
'Cherokee',
'Canadian_Aboriginal',
'Ogham',
'Runic',
'Khmer',
'Mongolian',
'Hiragana',
'Katakana',
'Bopomofo',
'Han',
'Yi',
'Old_Italic',
'Gothic',
'Deseret',
'Inherited',
'Tagalog',
'Hanunoo',
'Buhid',
'Tagbanwa',
'Limbu',
'Tai_Le',
'Linear_B',
'Ugaritic',
'Shavian',
'Osmanya',
'Cypriot',
'Braille',
'Buginese',
'Coptic',
'New_Tai_Lue',
'Glagolitic',
'Tifinagh',
'Syloti_Nagri',
'Old_Persian',
'Kharoshthi',
'Balinese',
'Cuneiform',
'Phoenician',
'Phags_Pa',
'Nko',
'Sundanese',
'Lepcha',
'Ol_Chiki',
'Vai',
'Saurashtra',
'Kayah_Li',
'Rejang',
'Lycian',
'Carian',
'Lydian',
'Cham',
'Tai_Tham',
'Tai_Viet',
'Avestan',
'Egyptian_Hieroglyphs',
'Samaritan',
'Lisu',
'Bamum',
'Javanese',
'Meetei_Mayek',
'Imperial_Aramaic',
'Old_South_Arabian',
'Inscriptional_Parthian',
'Inscriptional_Pahlavi',
'Old_Turkic',
'Kaithi',
'Batak',
'Brahmi',
'Mandaic',
'Chakma',
'Meroitic_Cursive',
'Meroitic_Hieroglyphs',
'Miao',
'Sharada',
'Sora_Sompeng',
'Takri',
'Caucasian_Albanian',
'Bassa_Vah',
'Duployan',
'Elbasan',
'Grantha',
'Pahawh_Hmong',
'Khojki',
'Linear_A',
'Mahajani',
'Manichaean',
'Mende_Kikakui',
'Modi',
'Mro',
'Old_North_Arabian',
'Nabataean',
'Palmyrene',
'Pau_Cin_Hau',
'Old_Permic',
'Psalter_Pahlavi',
'Siddham',
'Khudawadi',
'Tirhuta',
'Warang_Citi',
'Ahom',
'Anatolian_Hieroglyphs',
'Hatran',
'Multani',
'Old_Hungarian',
'SignWriting',
'Adlam',
'Bhaiksuki',
'Marchen',
'Newa',
'Osage',
'Tangut',
'Masaram_Gondi',
'Nushu',
'Soyombo',
'Zanabazar_Square',
'Dogra',
'Gunjala_Gondi',
'Makasar',
'Medefaidrin',
'Hanifi_Rohingya',
'Sogdian',
'Old_Sogdian',
'Elymaic',
'Nandinagari',
'Nyiakeng_Puachue_Hmong',
'Wancho',
'Chorasmian',
'Dives_Akuru',
'Khitan_Small_Script',
'Yezidi',
'Cypro_Minoan',
'Old_Uyghur',
'Tangsa',
'Toto',
'Vithkuqi',
'Kawi',
'Nag_Mundari',
];
export const pvaToNo = new Map([["Adlam",166],["",997],["Caucasian_Albanian",239],["Ahom",338],["Arabic",160],["Imperial_Aramaic",124],["Armenian",230],["Avestan",134],["Balinese",360],["Bamum",435],["Bassa_Vah",259],["Batak",365],["Bengali",325],["Bhaiksuki",334],["Bopomofo",285],["Brahmi",300],["Braille",570],["Buginese",367],["Buhid",372],["Chakma",349],["Canadian_Aboriginal",440],["Carian",201],["Cham",358],["Cherokee",445],["Chorasmian",109],["Coptic",204],["Cypro_Minoan",402],["Cypriot",403],["Cyrillic",220],["Devanagari",315],["Dives_Akuru",342],["Dogra",328],["Deseret",250],["Duployan",755],["Egyptian_Hieroglyphs",50],["Elbasan",226],["Elymaic",128],["Ethiopic",430],["Georgian",240],["Glagolitic",225],["Gunjala_Gondi",312],["Masaram_Gondi",313],["Gothic",206],["Grantha",343],["Greek",200],["Gujarati",320],["Gurmukhi",310],["Hangul",286],["Han",500],["Hanunoo",371],["Hatran",127],["Hebrew",125],["Hiragana",410],["Anatolian_Hieroglyphs",80],["Pahawh_Hmong",450],["Nyiakeng_Puachue_Hmong",451],["Katakana_Or_Hiragana",412],["Old_Hungarian",176],["Old_Italic",210],["Javanese",361],["Kayah_Li",357],["Katakana",411],["Kawi",368],["Kharoshthi",305],["Khmer",355],["Khojki",322],["Khitan_Small_Script",288],["Kannada",345],["Kaithi",317],["Tai_Tham",351],["Lao",356],["Latin",215],["Lepcha",335],["Limbu",336],["Linear_A",400],["Linear_B",401],["Lisu",399],["Lycian",202],["Lydian",116],["Mahajani",314],["Makasar",366],["Mandaic",140],["Manichaean",139],["Marchen",332],["Medefaidrin",265],["Mende_Kikakui",438],["Meroitic_Cursive",101],["Meroitic_Hieroglyphs",100],["Malayalam",347],["Modi",324],["Mongolian",145],["Mro",264],["Meetei_Mayek",337],["Multani",323],["Myanmar",350],["Nag_Mundari",295],["Nandinagari",311],["Old_North_Arabian",106],["Nabataean",159],["Newa",333],["Nko",165],["Nushu",499],["Ogham",212],["Ol_Chiki",261],["Old_Turkic",175],["Oriya",327],["Osage",219],["Osmanya",260],["Old_Uyghur",143],["Palmyrene",126],["Pau_Cin_Hau",263],["Old_Permic",227],["Phags_Pa
",331],["Inscriptional_Pahlavi",131],["Psalter_Pahlavi",132],["Phoenician",115],["Miao",282],["Inscriptional_Parthian",130],["Rejang",363],["Hanifi_Rohingya",167],["Runic",211],["Samaritan",123],["Old_South_Arabian",105],["Saurashtra",344],["SignWriting",95],["Shavian",281],["Sharada",319],["Siddham",302],["Khudawadi",318],["Sinhala",348],["Sogdian",141],["Old_Sogdian",142],["Sora_Sompeng",398],["Soyombo",329],["Sundanese",362],["Syloti_Nagri",316],["Syriac",135],["Tagbanwa",373],["Takri",321],["Tai_Le",353],["New_Tai_Lue",354],["Tamil",346],["Tangut",520],["Tai_Viet",359],["Telugu",340],["Tifinagh",120],["Tagalog",370],["Thaana",170],["Thai",352],["Tibetan",330],["Tirhuta",326],["Tangsa",275],["Toto",294],["Ugaritic",40],["Vai",470],["Vithkuqi",228],["Warang_Citi",262],["Wancho",283],["Old_Persian",30],["Cuneiform",20],["Yezidi",192],["Yi",460],["Zanabazar_Square",339],["Inherited",994],["Common",998],["Unknown",999]]);
export const pvaToCode = new Map([["Adlam","Adlm"],["","Zxxx"],["Caucasian_Albanian","Aghb"],["Ahom","Ahom"],["Arabic","Arab"],["Imperial_Aramaic","Armi"],["Armenian","Armn"],["Avestan","Avst"],["Balinese","Bali"],["Bamum","Bamu"],["Bassa_Vah","Bass"],["Batak","Batk"],["Bengali","Beng"],["Bhaiksuki","Bhks"],["Bopomofo","Bopo"],["Brahmi","Brah"],["Braille","Brai"],["Buginese","Bugi"],["Buhid","Buhd"],["Chakma","Cakm"],["Canadian_Aboriginal","Cans"],["Carian","Cari"],["Cham","Cham"],["Cherokee","Cher"],["Chorasmian","Chrs"],["Coptic","Copt"],["Cypro_Minoan","Cpmn"],["Cypriot","Cprt"],["Cyrillic","Cyrl"],["Devanagari","Deva"],["Dives_Akuru","Diak"],["Dogra","Dogr"],["Deseret","Dsrt"],["Duployan","Dupl"],["Egyptian_Hieroglyphs","Egyp"],["Elbasan","Elba"],["Elymaic","Elym"],["Ethiopic","Ethi"],["Georgian","Geor"],["Glagolitic","Glag"],["Gunjala_Gondi","Gong"],["Masaram_Gondi","Gonm"],["Gothic","Goth"],["Grantha","Gran"],["Greek","Grek"],["Gujarati","Gujr"],["Gurmukhi","Guru"],["Hangul","Hang"],["Han","Hani"],["Hanunoo","Hano"],["Hatran","Hatr"],["Hebrew","Hebr"],["Hiragana","Hira"],["Anatolian_Hieroglyphs","Hluw"],["Pahawh_Hmong","Hmng"],["Nyiakeng_Puachue_Hmong","Hmnp"],["Katakana_Or_Hiragana","Hrkt"],["Old_Hungarian","Hung"],["Old_Italic","Ital"],["Javanese","Java"],["Kayah_Li","Kali"],["Katakana","Kana"],["Kawi","Kawi"],["Kharoshthi","Khar"],["Khmer","Khmr"],["Khojki","Khoj"],["Khitan_Small_Script","Kits"],["Kannada","Knda"],["Kaithi","Kthi"],["Tai_Tham","Lana"],["Lao","Laoo"],["Latin","Latn"],["Lepcha","Lepc"],["Limbu","Limb"],["Linear_A","Lina"],["Linear_B","Linb"],["Lisu","Lisu"],["Lycian","Lyci"],["Lydian","Lydi"],["Mahajani","Mahj"],["Makasar","Maka"],["Mandaic","Mand"],["Manichaean","Mani"],["Marchen","Marc"],["Medefaidrin","Medf"],["Mende_Kikakui","Mend"],["Meroitic_Cursive","Merc"],["Meroitic_Hieroglyphs","Mero"],["Malayalam","Mlym"],["Modi","Modi"],["Mongolian","Mong"],["Mro","Mroo"],["Meetei_Mayek","Mtei"],["Multani","Mult"],["Myanmar","Mymr"],["Nag_Mundari"
,"Nagm"],["Nandinagari","Nand"],["Old_North_Arabian","Narb"],["Nabataean","Nbat"],["Newa","Newa"],["Nko","Nkoo"],["Nushu","Nshu"],["Ogham","Ogam"],["Ol_Chiki","Olck"],["Old_Turkic","Orkh"],["Oriya","Orya"],["Osage","Osge"],["Osmanya","Osma"],["Old_Uyghur","Ougr"],["Palmyrene","Palm"],["Pau_Cin_Hau","Pauc"],["Old_Permic","Perm"],["Phags_Pa","Phag"],["Inscriptional_Pahlavi","Phli"],["Psalter_Pahlavi","Phlp"],["Phoenician","Phnx"],["Miao","Plrd"],["Inscriptional_Parthian","Prti"],["Rejang","Rjng"],["Hanifi_Rohingya","Rohg"],["Runic","Runr"],["Samaritan","Samr"],["Old_South_Arabian","Sarb"],["Saurashtra","Saur"],["SignWriting","Sgnw"],["Shavian","Shaw"],["Sharada","Shrd"],["Siddham","Sidd"],["Khudawadi","Sind"],["Sinhala","Sinh"],["Sogdian","Sogd"],["Old_Sogdian","Sogo"],["Sora_Sompeng","Sora"],["Soyombo","Soyo"],["Sundanese","Sund"],["Syloti_Nagri","Sylo"],["Syriac","Syrc"],["Tagbanwa","Tagb"],["Takri","Takr"],["Tai_Le","Tale"],["New_Tai_Lue","Talu"],["Tamil","Taml"],["Tangut","Tang"],["Tai_Viet","Tavt"],["Telugu","Telu"],["Tifinagh","Tfng"],["Tagalog","Tglg"],["Thaana","Thaa"],["Thai","Thai"],["Tibetan","Tibt"],["Tirhuta","Tirh"],["Tangsa","Tnsa"],["Toto","Toto"],["Ugaritic","Ugar"],["Vai","Vaii"],["Vithkuqi","Vith"],["Warang_Citi","Wara"],["Wancho","Wcho"],["Old_Persian","Xpeo"],["Cuneiform","Xsux"],["Yezidi","Yezi"],["Yi","Yiii"],["Zanabazar_Square","Zanb"],["Inherited","Zinh"],["Common","Zyyy"],["Unknown","Zzzz"]]);
export const noToPva = new Map([[166,"Adlam"],[439,""],[239,"Caucasian_Albanian"],[338,"Ahom"],[160,"Arabic"],[161,""],[124,"Imperial_Aramaic"],[230,"Armenian"],[134,"Avestan"],[360,"Balinese"],[435,"Bamum"],[259,"Bassa_Vah"],[365,"Batak"],[325,"Bengali"],[334,"Bhaiksuki"],[550,""],[285,"Bopomofo"],[300,"Brahmi"],[570,"Braille"],[367,"Buginese"],[372,"Buhid"],[349,"Chakma"],[440,"Canadian_Aboriginal"],[201,"Carian"],[358,"Cham"],[445,"Cherokee"],[298,""],[109,"Chorasmian"],[291,""],[204,"Coptic"],[402,"Cypro_Minoan"],[403,"Cypriot"],[220,"Cyrillic"],[221,""],[315,"Devanagari"],[342,"Dives_Akuru"],[328,"Dogra"],[250,"Deseret"],[755,"Duployan"],[70,""],[60,""],[50,"Egyptian_Hieroglyphs"],[226,"Elbasan"],[128,"Elymaic"],[430,"Ethiopic"],[164,""],[241,"Georgian"],[240,"Georgian"],[225,"Glagolitic"],[312,"Gunjala_Gondi"],[313,"Masaram_Gondi"],[206,"Gothic"],[343,"Grantha"],[200,"Greek"],[320,"Gujarati"],[397,""],[310,"Gurmukhi"],[503,""],[286,"Hangul"],[500,"Han"],[371,"Hanunoo"],[501,""],[502,""],[127,"Hatran"],[125,"Hebrew"],[410,"Hiragana"],[80,"Anatolian_Hieroglyphs"],[450,"Pahawh_Hmong"],[451,"Nyiakeng_Puachue_Hmong"],[412,"Katakana_Or_Hiragana"],[176,"Old_Hungarian"],[610,""],[210,"Old_Italic"],[284,""],[361,"Javanese"],[413,""],[510,""],[357,"Kayah_Li"],[411,"Katakana"],[368,"Kawi"],[305,"Kharoshthi"],[355,"Khmer"],[322,"Khojki"],[505,""],[288,"Khitan_Small_Script"],[345,"Kannada"],[287,""],[436,""],[396,""],[317,"Kaithi"],[351,"Tai_Tham"],[356,"Lao"],[217,""],[216,""],[215,"Latin"],[364,""],[335,"Lepcha"],[336,"Limbu"],[400,"Linear_A"],[401,"Linear_B"],[399,"Lisu"],[437,""],[202,"Lycian"],[116,"Lydian"],[314,"Mahajani"],[366,"Makasar"],[140,"Mandaic"],[139,"Manichaean"],[332,"Marchen"],[90,""],[265,"Medefaidrin"],[438,"Mende_Kikakui"],[101,"Meroitic_Cursive"],[100,"Meroitic_Hieroglyphs"],[347,"Malayalam"],[324,"Modi"],[145,"Mongolian"],[218,""],[264,"Mro"],[337,"Meetei_Mayek"],[323,"Multani"],[350,"Myanmar"],[295,"Nag_Mundari"],[311,"Nandinagari"],[106,"Old_North
_Arabian"],[159,"Nabataean"],[333,"Newa"],[85,""],[420,""],[165,"Nko"],[499,"Nushu"],[212,"Ogham"],[261,"Ol_Chiki"],[296,""],[175,"Old_Turkic"],[327,"Oriya"],[219,"Osage"],[260,"Osmanya"],[143,"Old_Uyghur"],[126,"Palmyrene"],[263,"Pau_Cin_Hau"],[15,""],[16,""],[227,"Old_Permic"],[331,"Phags_Pa"],[131,"Inscriptional_Pahlavi"],[132,"Psalter_Pahlavi"],[133,""],[115,"Phoenician"],[282,"Miao"],[293,""],[130,"Inscriptional_Parthian"],[103,""],[900,""],[949,""],[303,""],[363,"Rejang"],[167,"Hanifi_Rohingya"],[620,""],[211,"Runic"],[123,"Samaritan"],[292,""],[105,"Old_South_Arabian"],[344,"Saurashtra"],[95,"SignWriting"],[281,"Shavian"],[319,"Sharada"],[530,""],[302,"Siddham"],[180,""],[318,"Khudawadi"],[348,"Sinhala"],[141,"Sogdian"],[142,"Old_Sogdian"],[398,"Sora_Sompeng"],[329,"Soyombo"],[362,"Sundanese"],[274,""],[316,"Syloti_Nagri"],[135,"Syriac"],[138,""],[137,""],[136,""],[373,"Tagbanwa"],[321,"Takri"],[353,"Tai_Le"],[354,"New_Tai_Lue"],[346,"Tamil"],[520,"Tangut"],[359,"Tai_Viet"],[380,""],[340,"Telugu"],[290,""],[120,"Tifinagh"],[370,"Tagalog"],[170,"Thaana"],[352,"Thai"],[330,"Tibetan"],[326,"Tirhuta"],[275,"Tangsa"],[229,""],[299,""],[294,"Toto"],[341,""],[40,"Ugaritic"],[470,"Vai"],[280,""],[228,"Vithkuqi"],[262,"Warang_Citi"],[283,"Wancho"],[480,""],[30,"Old_Persian"],[20,"Cuneiform"],[192,"Yezidi"],[460,"Yi"],[339,"Zanabazar_Square"],[994,"Inherited"],[995,""],[993,""],[996,""],[997,""],[998,"Common"],[999,"Unknown"]]);
2 changes: 1 addition & 1 deletion gen/script-trie.cc

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions src/itemize.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import {createTrie} from './unicode-trie.js';
import wasm from './wasm.js';
import SCRIPT_NAMES from '../gen/script-names.js';
import {noToPva} from '../gen/script-names.js';

// I don't know why the pointer value is stored directly in the .value here.
// It must be an emscripten weirdness, so watch out in the future
Expand Down Expand Up @@ -283,7 +283,7 @@ export function* scriptIterator(text: string) {
code = ((code - 0xd800) * 0xd400) + (next - 0xdc00) + 0x10000;
}

let script = SCRIPT_NAMES[scriptTrie.get(code)] || 'Common';
let script = noToPva.get(scriptTrie.get(code)) || 'Common';
const pairIndex = script !== 'Common' ? -1 : getPairIndex(code);

// Paired character handling:
Expand Down

0 comments on commit 6d6381e

Please sign in to comment.