Skip to content

Commit

Permalink
move tries to their own file
Browse files Browse the repository at this point in the history
trie values should be stored with the trie to avoid possible naming
conflicts, and this makes them easier to import - soon the grapheme
breaker will have to import 3 of them
  • Loading branch information
chearon committed May 25, 2024
1 parent 75b1b57 commit a23d55f
Show file tree
Hide file tree
Showing 8 changed files with 147 additions and 104 deletions.
24 changes: 12 additions & 12 deletions gen.js
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import fs from 'fs';
import path from 'path';
import * as lbClasses from './dist/src/text-line-break.js';
import * as gbClasses from './dist/src/text-grapheme-break.js';
import * as mjClasses from './dist/src/text-itemize.js';
import * as LineBreakTrie from './dist/src/trie-line-break.js';
import * as GraphemeBreakTrie from './dist/src/trie-grapheme-break.js';
import * as EmojiTrie from './dist/src/trie-emoji.js';
import UnicodeTrieBuilder from './dist/src/text-unicode-trie-builder.js';
import {getTrie, encodeTrie} from './dist/src/string-trie-encode.js';
import {hb_tag} from './dist/src/text-harfbuzz.js';
Expand Down Expand Up @@ -30,7 +30,7 @@ async function generateLineBreakTrie() {
let start = null;
let end = null;
let type = null;
const trie = new UnicodeTrieBuilder(lbClasses.XX, 0);
const trie = new UnicodeTrieBuilder(LineBreakTrie.XX, 0);

// collect entries in the linebreaking table into ranges
// to keep things smaller.
Expand All @@ -48,10 +48,10 @@ async function generateLineBreakTrie() {
}

if ((type != null) && (rangeType !== type)) {
if (typeof lbClasses[type] !== 'number') {
if (typeof LineBreakTrie[type] !== 'number') {
throw new Error(`Class ${type} not found; update text-line-break.ts?`);
}
trie.setRange(parseInt(start, 16), parseInt(end, 16), lbClasses[type], true);
trie.setRange(parseInt(start, 16), parseInt(end, 16), LineBreakTrie[type], true);
type = null;
}

Expand All @@ -63,7 +63,7 @@ async function generateLineBreakTrie() {
end = rangeEnd;
}

trie.setRange(parseInt(start, 16), parseInt(end, 16), lbClasses[type], true);
trie.setRange(parseInt(start, 16), parseInt(end, 16), LineBreakTrie[type], true);

writeTrie(path.join(__dirname, 'gen/line-break-trie.cc'), 'line_break_trie', trie);
}
Expand All @@ -75,19 +75,19 @@ async function generateGraphemeBreakTrie() {
let match;
const re = /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*([A-Za-z_]+)/gm;

const trie = new UnicodeTrieBuilder(gbClasses.Other, 0);
const trie = new UnicodeTrieBuilder(GraphemeBreakTrie.Other, 0);

// collect entries in the table into ranges
// to keep things smaller.
while ((match = re.exec(data))) {
const start = match[1];
const end = match[2] != null ? match[2] : start;
const type = match[3];
if (typeof gbClasses[type] !== 'number') {
if (typeof GraphemeBreakTrie[type] !== 'number') {
throw new Error(`Class ${type} not found; update text-grapheme-break.ts?`);
}

trie.setRange(parseInt(start, 16), parseInt(end, 16), gbClasses[type]);
trie.setRange(parseInt(start, 16), parseInt(end, 16), GraphemeBreakTrie[type]);
}

writeTrie(path.join(__dirname, 'gen/grapheme-break-trie.cc'), 'grapheme_break_trie', trie);
Expand Down Expand Up @@ -368,8 +368,8 @@ async function generateEmojiTrie() {
const start = match[1];
const end = match[2] != null ? match[2] : start;
const type = match[3];
if (typeof mjClasses[type] !== 'number') continue;
trie.setRange(parseInt(start, 16), parseInt(end, 16), mjClasses[type]);
if (typeof EmojiTrie[type] !== 'number') continue;
trie.setRange(parseInt(start, 16), parseInt(end, 16), EmojiTrie[type]);
}

writeTrie(path.join(__dirname, 'gen/emoji-trie.cc'), 'emoji_trie', trie);
Expand Down
34 changes: 14 additions & 20 deletions src/text-grapheme-break.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
// All code based on foliojs/grapheme-breaker at time of writing
import UnicodeTrie from './text-unicode-trie.js';
import wasm from './wasm.js';

// I don't know why the pointer value is stored directly in the .value here.
// It must be an emscripten weirdness, so watch out in the future
const trie = new UnicodeTrie(wasm.instance.exports.grapheme_break_trie.value);
import {
trie,
CR,
LF,
Control,
Extend,
Regional_Indicator,
SpacingMark,
L,
V,
T,
LV,
LVT
} from './trie-grapheme-break.js';

// Gets a code point from a UTF-16 string
// handling surrogate pairs appropriately
Expand Down Expand Up @@ -38,19 +45,6 @@ function codePointAt(str: string, idx: number) {
return code;
};

export const Other = 0;
export const CR = 1;
export const LF = 2;
export const Control = 3;
export const Extend = 4;
export const Regional_Indicator = 5;
export const SpacingMark = 6;
export const L = 7;
export const V = 8;
export const T = 9;
export const LV = 10;
export const LVT = 11;

const GB4 = new Set([Control, CR, LF]);

const GB5 = new Set([Control, CR, LF]);
Expand Down
30 changes: 11 additions & 19 deletions src/text-itemize.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,11 @@
import UnicodeTrie from './text-unicode-trie.js';
import wasm from './wasm.js';
import {onWasmMemoryResized} from './wasm-env.js';
import {codeToName} from '../gen/script-names.js';
import {IfcInline, InlineLevel, Inline} from './layout-flow.js';
import {Style} from './style.js';
import * as hb from './text-harfbuzz.js';

// I don't know why the pointer value is stored directly in the .value here.
// It must be an emscripten weirdness, so watch out in the future
const emojiTrie = new UnicodeTrie(wasm.instance.exports.emoji_trie.value);

const scriptTrie = new UnicodeTrie(wasm.instance.exports.script_trie.value);
import * as EmojiTrie from './trie-emoji.js';
import * as ScriptTrie from './trie-script.js';

const {
// SheenBidi
Expand Down Expand Up @@ -137,12 +132,6 @@ export function bidiIteratorNext(state: BidiIteratorState) {
}
}

// Used for the trie
export const Emoji = 1;
export const Emoji_Presentation = 2;
export const Emoji_Modifier = 3;
export const Emoji_Modifier_Base = 4;

// Some unicode char constants from Pango
const kCombiningEnclosingCircleBackslashCharacter = 0x20E0;
const kCombiningEnclosingKeycapCharacter = 0x20E3;
Expand Down Expand Up @@ -216,19 +205,22 @@ export function createEmojiIteratorState(
types.push(TAG_SEQUENCE);
} else if (code === 0xE007F) {
types.push(TAG_TERM);
} else if (emojiTrie.get(code) === Emoji_Modifier_Base) {
} else if (EmojiTrie.trie.get(code) === EmojiTrie.Emoji_Modifier_Base) {
types.push(EMOJI_MODIFIER_BASE);
} else if (emojiTrie.get(code) === Emoji_Modifier) {
} else if (EmojiTrie.trie.get(code) === EmojiTrie.Emoji_Modifier) {
types.push(EMOJI_MODIFIER);
} else if (code >= 0x1f1e6 && code <= 0x1f1ff) {
types.push(REGIONAL_INDICATOR);
} else if ((code >= 48 && code <= 57) || code === 35 || code === 42) {
types.push(KEYCAP_BASE);
} else if (emojiTrie.get(code) === Emoji_Presentation) {
} else if (EmojiTrie.trie.get(code) === EmojiTrie.Emoji_Presentation) {
types.push(EMOJI_EMOJI_PRESENTATION);
} else if (emojiTrie.get(code) === Emoji && emojiTrie.get(code) !== Emoji_Presentation) {
} else if (
EmojiTrie.trie.get(code) === EmojiTrie.Emoji &&
EmojiTrie.trie.get(code) !== EmojiTrie.Emoji_Presentation
) {
types.push(EMOJI_TEXT_PRESENTATION);
} else if (emojiTrie.get(code) === Emoji) {
} else if (EmojiTrie.trie.get(code) === EmojiTrie.Emoji) {
types.push(EMOJI);
} else {
types.push(kMaxEmojiScannerCategory);
Expand Down Expand Up @@ -403,7 +395,7 @@ export function scriptIteratorNext(state: ScriptIteratorState) {
code = ((code - 0xd800) * 0xd400) + (next - 0xdc00) + 0x10000;
}

let script = codeToName.get(scriptTrie.get(code)) || 'Common';
let script = codeToName.get(ScriptTrie.trie.get(code)) || 'Common';
const pairIndex = script !== 'Common' ? -1 : getPairIndex(code);

// Paired character handling:
Expand Down
74 changes: 21 additions & 53 deletions src/text-line-break.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,25 @@
// All code based on foliojs/linebreak at time of writing
import UnicodeTrie from './text-unicode-trie.js';
import wasm from './wasm.js';
import {
trie,
NS,
AL,
HL,
HY,
BA,
WJ,
RI,
ZWJ,
AI,
BK,
CJ,
CR,
LF,
NL,
SA,
SG,
SP,
XX
} from './trie-line-break.js';

class Break {
public position: number;
Expand Down Expand Up @@ -44,10 +63,6 @@ export class HardBreaker {
}
}

// I don't know why the pointer value is stored directly in the .value here.
// It must be an emscripten weirdness, so watch out in the future
const trie = new UnicodeTrie(wasm.instance.exports.line_break_trie.value);

const DI_BRK = 0; // Direct break opportunity
const IN_BRK = 1; // Indirect break opportunity
const CI_BRK = 2; // Indirect break opportunity for combining marks
Expand Down Expand Up @@ -95,53 +110,6 @@ export const pairTable = [
[DI_BRK, PR_BRK, PR_BRK, IN_BRK, IN_BRK, DI_BRK, PR_BRK, PR_BRK, PR_BRK, DI_BRK, DI_BRK, DI_BRK, DI_BRK, DI_BRK, DI_BRK, DI_BRK, DI_BRK, DI_BRK, DI_BRK, DI_BRK, PR_BRK, CI_BRK, PR_BRK, DI_BRK, DI_BRK, DI_BRK, DI_BRK, DI_BRK, DI_BRK, DI_BRK, DI_BRK, IN_BRK, DI_BRK] // CB
];

// The following break classes are handled by the pair table
export const OP = 0; // Opening punctuation
export const CL = 1; // Closing punctuation
export const CP = 2; // Closing parenthesis
export const QU = 3; // Ambiguous quotation
export const GL = 4; // Glue
export const NS = 5; // Non-starters
export const EX = 6; // Exclamation/Interrogation
export const SY = 7; // Symbols allowing break after
export const IS = 8; // Infix separator
export const PR = 9; // Prefix
export const PO = 10; // Postfix
export const NU = 11; // Numeric
export const AL = 12; // Alphabetic
export const HL = 13; // Hebrew Letter
export const ID = 14; // Ideographic
export const IN = 15; // Inseparable characters
export const HY = 16; // Hyphen
export const BA = 17; // Break after
export const BB = 18; // Break before
export const B2 = 19; // Break on either side (but not pair)
export const ZW = 20; // Zero-width space
export const CM = 21; // Combining marks
export const WJ = 22; // Word joiner
export const H2 = 23; // Hangul LV
export const H3 = 24; // Hangul LVT
export const JL = 25; // Hangul L Jamo
export const JV = 26; // Hangul V Jamo
export const JT = 27; // Hangul T Jamo
export const RI = 28; // Regional Indicator
export const EB = 29; // Emoji Base
export const EM = 30; // Emoji Modifier
export const ZWJ = 31; // Zero Width Joiner
export const CB = 32; // Contingent break

// The following break classes are not handled by the pair table
export const AI = 33; // Ambiguous (Alphabetic or Ideograph)
export const BK = 34; // Break (mandatory)
export const CJ = 35; // Conditional Japanese Starter
export const CR = 36; // Carriage return
export const LF = 37; // Line feed
export const NL = 38; // Next line
export const SA = 39; // South-East Asian
export const SG = 40; // Surrogates
export const SP = 41; // Space
export const XX = 42; // Unknown

function mapClass(c: number) {
switch (c) {
case AI:
Expand Down
11 changes: 11 additions & 0 deletions src/trie-emoji.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import wasm from './wasm.js';
import UnicodeTrie from './text-unicode-trie.js';

// Values stored in the emoji trie, one per emoji property. The trie
// generator (gen.js) looks these exports up by property name and skips any
// property that has no numeric export here, so the names must match the
// property names in the Unicode data file it parses.
// NOTE(review): 0 is not assigned here; presumably it is what the trie
// returns for code points with none of these properties. Confirm in gen.js.
export const Emoji = 1;
export const Emoji_Presentation = 2;
export const Emoji_Modifier = 3;
export const Emoji_Modifier_Base = 4;

// Lookup structure over the emoji_trie data exported by the wasm module.
// I don't know why the pointer value is stored directly in the .value here.
// It must be an emscripten weirdness, so watch out in the future
// NOTE(review): this looks like a WebAssembly.Global export, whose .value
// property holds the global's value (here presumably the address of the trie
// data in wasm memory). Confirm against the wasm build.
export const trie = new UnicodeTrie(wasm.instance.exports.emoji_trie.value);
20 changes: 20 additions & 0 deletions src/trie-grapheme-break.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// All code based on foliojs/grapheme-breaker at time of writing
import UnicodeTrie from './text-unicode-trie.js';
import wasm from './wasm.js';

// Lookup structure over the grapheme_break_trie data exported by the wasm
// module.
// I don't know why the pointer value is stored directly in the .value here.
// It must be an emscripten weirdness, so watch out in the future
// NOTE(review): this looks like a WebAssembly.Global export, whose .value
// property holds the global's value (here presumably the address of the trie
// data in wasm memory). Confirm against the wasm build.
export const trie = new UnicodeTrie(wasm.instance.exports.grapheme_break_trie.value);

// Grapheme break class values stored in the trie. The trie generator (gen.js)
// looks these exports up by class name and throws if a class found in the
// Unicode data has no numeric export here, so keep the names in sync with the
// data file. `Other` is what gen.js passes to the trie builder's constructor.
export const Other = 0;
export const CR = 1;
export const LF = 2;
export const Control = 3;
export const Extend = 4;
export const Regional_Indicator = 5;
export const SpacingMark = 6;
export const L = 7;
export const V = 8;
export const T = 9;
export const LV = 10;
export const LVT = 11;
54 changes: 54 additions & 0 deletions src/trie-line-break.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import UnicodeTrie from './text-unicode-trie.js';
import wasm from './wasm.js';

// Line break class values stored in the trie. The trie generator (gen.js)
// looks these exports up by class name and throws if a class found in the
// Unicode data has no numeric export here.
//
// The following break classes are handled by the pair table
// (do not delete them, they are checked during trie building)
// NOTE(review): the values 0..32 presumably double as row/column indexes into
// pairTable in text-line-break.ts, so they should not be renumbered. Confirm.
export const OP = 0; // Opening punctuation
export const CL = 1; // Closing punctuation
export const CP = 2; // Closing parenthesis
export const QU = 3; // Ambiguous quotation
export const GL = 4; // Glue
export const NS = 5; // Non-starters
export const EX = 6; // Exclamation/Interrogation
export const SY = 7; // Symbols allowing break after
export const IS = 8; // Infix separator
export const PR = 9; // Prefix
export const PO = 10; // Postfix
export const NU = 11; // Numeric
export const AL = 12; // Alphabetic
export const HL = 13; // Hebrew Letter
export const ID = 14; // Ideographic
export const IN = 15; // Inseparable characters
export const HY = 16; // Hyphen
export const BA = 17; // Break after
export const BB = 18; // Break before
export const B2 = 19; // Break on either side (but not pair)
export const ZW = 20; // Zero-width space
export const CM = 21; // Combining marks
export const WJ = 22; // Word joiner
export const H2 = 23; // Hangul LV
export const H3 = 24; // Hangul LVT
export const JL = 25; // Hangul L Jamo
export const JV = 26; // Hangul V Jamo
export const JT = 27; // Hangul T Jamo
export const RI = 28; // Regional Indicator
export const EB = 29; // Emoji Base
export const EM = 30; // Emoji Modifier
export const ZWJ = 31; // Zero Width Joiner
export const CB = 32; // Contingent break

// The following break classes are not handled by the pair table
export const AI = 33; // Ambiguous (Alphabetic or Ideograph)
export const BK = 34; // Break (mandatory)
export const CJ = 35; // Conditional Japanese Starter
export const CR = 36; // Carriage return
export const LF = 37; // Line feed
export const NL = 38; // Next line
export const SA = 39; // South-East Asian
export const SG = 40; // Surrogates
export const SP = 41; // Space
export const XX = 42; // Unknown (gen.js passes this to the trie builder's constructor)

// Lookup structure over the line_break_trie data exported by the wasm module.
// I don't know why the pointer value is stored directly in the .value here.
// It must be an emscripten weirdness, so watch out in the future
// NOTE(review): this looks like a WebAssembly.Global export, whose .value
// property holds the global's value (here presumably the address of the trie
// data in wasm memory). Confirm against the wasm build.
export const trie = new UnicodeTrie(wasm.instance.exports.line_break_trie.value);
4 changes: 4 additions & 0 deletions src/trie-script.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import wasm from './wasm.js';
import UnicodeTrie from './text-unicode-trie.js';

// Lookup structure over the script_trie data exported by the wasm module.
// The values it returns are used as keys into codeToName (gen/script-names.js)
// by the script iterator in text-itemize.ts, so no class constants are
// exported from this file.
// NOTE(review): the export looks like a WebAssembly.Global, whose .value
// property holds the global's value (here presumably the address of the trie
// data in wasm memory). Confirm against the wasm build.
export const trie = new UnicodeTrie(wasm.instance.exports.script_trie.value);

0 comments on commit a23d55f

Please sign in to comment.