|
| 1 | +import _initAsync, { |
| 2 | + type Tokenizer as _Tokenizer, |
| 3 | + TokenizerBuilder as _TokenizerBuilder, |
| 4 | + type InitInput, |
| 5 | +} from "lindera-wasm-ipadic"; |
| 6 | + |
/** Raw token handed to the formatter: a `Map` from field name to an untyped value. */
export type LinderaToken = Map<string, unknown>;

/**
 * Property names assigned to the positional entries of a token's `details`
 * array. The order matters: the name at index `i` labels `details[i]`.
 */
export const IPADIC_DETAILS_KEYS = [
  "partOfSpeech",
  "partOfSpeechSubcategory1",
  "partOfSpeechSubcategory2",
  "partOfSpeechSubcategory3",
  "conjugationForm",
  "conjugationType",
  "baseForm",
  "reading",
  "pronunciation",
] as const;

/** Union of the literal names listed in `IPADIC_DETAILS_KEYS`. */
export type IpadicDetailsKeys = (typeof IPADIC_DETAILS_KEYS)[number];

/** Token details keyed by name: one string for every name in `IPADIC_DETAILS_KEYS`. */
export type IpadicDetailsObject = Record<IpadicDetailsKeys, string>;

/** Plain-object form of a token, with camelCase field names. */
export type FormattedToken = {
  /** Value read from the raw token's `byte_end` entry. */
  byteEnd: number;
  /** Value read from the raw token's `byte_start` entry. */
  byteStart: number;
  /** Value read from the raw token's `text` entry. */
  text: string;
  /** Dictionary word identifier. */
  wordId: {
    id: number;
    isSystem: boolean;
  };
  /** Named details; absent when the raw token carries no `details` entry. */
  details?: IpadicDetailsObject;
};
| 37 | + |
/**
 * `Object.fromEntries` with a result type derived from the entry tuples:
 * each tuple's first element becomes a property name and its second element
 * becomes that property's value type.
 *
 * The type is an assertion layered over `Object.fromEntries`; nothing is
 * validated at runtime.
 *
 * @param entries Key/value tuples to collect into an object.
 * @returns The object built by `Object.fromEntries(entries)`.
 */
function typeSafeObjectFromEntries<const T extends ReadonlyArray<readonly [PropertyKey, unknown]>>(
  entries: T,
): { [Entry in T[number] as Entry[0]]: Entry[1] } {
  const collected = Object.fromEntries(entries);
  return collected as { [Entry in T[number] as Entry[0]]: Entry[1] };
}
| 43 | + |
/**
 * Converts a positional `details` array into an object keyed by the names in
 * `IPADIC_DETAILS_KEYS` (the name at index `i` labels `details[i]`).
 *
 * Entries beyond the last known name are ignored. Positions the array does
 * not provide are filled with `"*"`, the marker IPADIC uses for a feature
 * that does not apply, so every property really is a string as
 * `IpadicDetailsObject` promises instead of silently being `undefined`.
 *
 * @param details Positional detail strings of one token; may be shorter than
 *   the list of names.
 * @returns An object with one string property per name in `IPADIC_DETAILS_KEYS`.
 */
function detailsArrayToObject(details: string[]): IpadicDetailsObject {
  const missingDetail = "*";
  return typeSafeObjectFromEntries(
    IPADIC_DETAILS_KEYS.map((key, index) => [key, details[index] ?? missingDetail]),
  );
}
| 47 | + |
/**
 * Wraps an underlying tokenizer and converts the `Map`-based tokens it
 * returns into plain `FormattedToken` objects with camelCase field names.
 */
export class Tokenizer {
  #superTokenizer: _Tokenizer;

  /**
   * @param tokenizer Underlying tokenizer whose `tokenize` output is reformatted.
   */
  constructor(tokenizer: _Tokenizer) {
    this.#superTokenizer = tokenizer;
  }

  /**
   * Tokenizes `inputText` with the wrapped tokenizer.
   *
   * @param inputText Text to tokenize.
   * @returns One formatted token per token returned by the wrapped tokenizer,
   *   in the same order.
   */
  tokenize(inputText: string): FormattedToken[] {
    return this.#tokensFormatter(this.#superTokenizer.tokenize(inputText));
  }

  /** Converts every raw token, preserving order. */
  #tokensFormatter(tokens: LinderaToken[]): FormattedToken[] {
    return tokens.map((rawToken) => this.#formatToken(rawToken));
  }

  /**
   * Converts one raw token. The values are read with type assertions only;
   * nothing is validated at runtime.
   */
  #formatToken(rawToken: LinderaToken): FormattedToken {
    const rawDetails = rawToken.get("details") as string[] | undefined;
    // A token without details keeps an explicit `details: undefined` property.
    const namedDetails = rawDetails ? detailsArrayToObject(rawDetails) : rawDetails;
    // NOTE(review): `word_id` and `is_system` are both read as top-level keys
    // of the token Map — confirm this matches the layout the wrapped
    // tokenizer actually produces.
    return {
      byteEnd: rawToken.get("byte_end") as number,
      byteStart: rawToken.get("byte_start") as number,
      text: rawToken.get("text") as string,
      wordId: {
        id: rawToken.get("word_id") as number,
        isSystem: rawToken.get("is_system") as boolean,
      },
      details: namedDetails,
    };
  }
}
/**
 * Builds `Tokenizer` instances backed by the embedded IPADIC dictionary in
 * `normal` mode.
 */
export class TokenizerBuilder {
  #superTokenizerBuilder: _TokenizerBuilder;

  /** Creates the underlying builder; configuration is applied in `build`. */
  constructor() {
    this.#superTokenizerBuilder = new _TokenizerBuilder();
  }

  /**
   * Configures the underlying builder and builds a tokenizer from it.
   *
   * @returns A `Tokenizer` wrapping the newly built underlying tokenizer.
   */
  build(): Tokenizer {
    const builder = this.#superTokenizerBuilder;
    // Configure BEFORE building: settings applied after `build()` cannot
    // affect the tokenizer that call already returned.
    builder.setDictionary("embedded://ipadic");
    builder.setMode("normal");
    return new Tokenizer(builder.build());
  }
}
| 86 | + |
/**
 * Initializes the underlying WebAssembly module. Await this before building
 * or using a tokenizer.
 *
 * @param options Optional settings. `moduleOrPath` is the module source passed
 *   to the initializer; when `options` is omitted the initializer is called
 *   without arguments and applies its own default.
 * @returns A promise that resolves once initialization has completed.
 */
export async function initAsync(options?: { moduleOrPath: InitInput }): Promise<void> {
  // The wasm-bindgen generated initializer reads the `module_or_path` property
  // of its options object. Forwarding the camelCase object unchanged would
  // make it ignore the caller's value, so translate the key here.
  await _initAsync(options == null ? undefined : { module_or_path: options.moduleOrPath });
}
0 commit comments