Skip to content

Commit daff86e

Browse files
committed
feat: migrate to lindera-wasm
1 parent bb7c09a commit daff86e

13 files changed

Lines changed: 991 additions & 92 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ stats-*.json
1111
web-ext.config.ts
1212
*.tsbuildinfo
1313
.wrangler
14+
dist
1415

1516
# testing
1617
/coverage

apps/extension/package.json

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,13 @@
1717
"@ctrl/tinycolor": "^4.1.0",
1818
"@floating-ui/react": "^0.27.2",
1919
"@headlessui/react": "catalog:",
20-
"@sglkc/kuromoji": "^1.1.0",
20+
"@lindera/ipadic": "workspace:^",
2121
"@webext-core/messaging": "^2.1.0",
2222
"clsx": "catalog:",
2323
"es-toolkit": "catalog:",
2424
"file-saver": "^2.0.5",
2525
"i18next": "^25.0.0",
2626
"idb": "^8.0.3",
27-
"lindera-wasm-ipadic": "^1.0.0",
2827
"next-themes": "^0.4.4",
2928
"picomatch": "^4.0.3",
3029
"react": "catalog:",

apps/extension/src/commons/toKanjiToken.ts

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1+
import type { FormattedToken } from "@lindera/ipadic";
12
import { head, last } from "es-toolkit";
23
import { isKanji, toKatakana } from "wanakana";
34

45
// It's not just kanji, such as "市ヶ谷" (イチガヤ), "我々" (ワレワレ).
56
export interface KanjiToken {
67
original: string;
78
reading: string;
8-
start: number; // Indexes start from 0
9+
start: number;
910
end: number;
1011
}
1112
/**
@@ -20,17 +21,21 @@ export interface KanjiToken {
2021
* ]
2122
* ```
2223
*/
23-
export const toKanjiToken = (linderaTokens: any[], text: string): KanjiToken[] => {
24+
export const toKanjiToken = (linderaTokens: FormattedToken[], text: string): KanjiToken[] => {
2425
const filteredTokens = linderaTokens
25-
.map((token) => toSimplifiedToken(token, text))
2626
.filter(isPhonetic)
27+
.map((token) => toSimplifiedToken(token, text))
2728
.flatMap(toRubyText);
2829
return filteredTokens;
2930
};
3031

31-
const isPhonetic = (linderaToken: any) => {
32-
const hasKanji = /\p{sc=Han}/v.test(linderaToken.original);
33-
const hasReading = Boolean(linderaToken.reading && linderaToken.reading !== "*");
32+
type LinderaTokenWithDetails = Omit<FormattedToken, "details"> & {
33+
details: NonNullable<FormattedToken["details"]>;
34+
};
35+
36+
const isPhonetic = (linderaToken: FormattedToken): linderaToken is LinderaTokenWithDetails => {
37+
const hasKanji = /\p{sc=Han}/v.test(linderaToken.text);
38+
const hasReading = Boolean(linderaToken.details?.reading && linderaToken.details.reading !== "*");
3439
return hasReading && hasKanji;
3540
};
3641

@@ -41,12 +46,15 @@ interface SimplifiedToken {
4146
end: number;
4247
}
4348

44-
const toSimplifiedToken = (linderaToken: any, text: string): SimplifiedToken => {
49+
const toSimplifiedToken = (
50+
linderaToken: LinderaTokenWithDetails,
51+
text: string,
52+
): SimplifiedToken => {
4553
return {
46-
start: byteIndexToCharIndex(linderaToken.get("byte_start"), text),
47-
end: byteIndexToCharIndex(linderaToken.get("byte_end"), text),
48-
original: linderaToken.get("text"),
49-
reading: linderaToken.get("details")[7],
54+
start: byteIndexToCharIndex(linderaToken.byteStart, text),
55+
end: byteIndexToCharIndex(linderaToken.byteEnd, text),
56+
original: linderaToken.text,
57+
reading: linderaToken.details.reading,
5058
};
5159
};
5260

apps/extension/src/entrypoints/background/listeners/onGetKanjiMarksMessage.ts

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import wasmInit, { type Tokenizer, TokenizerBuilder } from "lindera-wasm-ipadic";
1+
import { initAsync, type Tokenizer, TokenizerBuilder } from "@lindera/ipadic";
22
import { ExtEvent } from "@/commons/constants";
33
import { onMessage } from "@/commons/message";
44
import { type KanjiToken, toKanjiToken } from "@/commons/toKanjiToken";
@@ -24,18 +24,10 @@ const getTokenizer = async () => {
2424
return await deferredTokenizer.promise;
2525
}
2626
try {
27-
await wasmInit({
28-
module_or_path: "/lindera_wasm_bg.wasm",
27+
await initAsync({
28+
moduleOrPath: "lindera_wasm_bg.wasm",
2929
});
3030
const builder = new TokenizerBuilder();
31-
builder.setDictionary("embedded://ipadic");
32-
builder.setMode("normal");
33-
builder.appendTokenFilter("japanese_compound_word", {
34-
kind: "ipadic",
35-
tags: ["名詞,数"],
36-
new_tag: "名詞,数",
37-
});
38-
builder.appendTokenFilter("japanese_number", { tags: ["名詞,数"] });
3931
const tokenizer = builder.build();
4032
deferredTokenizer.resolve(tokenizer);
4133
} catch (error) {

apps/extension/wxt.config.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ export default defineConfig({
5757
"build:publicAssets": async ({ config }, publicFiles) => {
5858
const srcPath = path.resolve(
5959
import.meta.dirname,
60-
"./node_modules/lindera-wasm-ipadic/lindera_wasm_bg.wasm",
60+
"./node_modules/@lindera/ipadic/node_modules/lindera-wasm-ipadic/lindera_wasm_bg.wasm",
6161
);
6262
await fs.mkdir(config.outDir, { recursive: true });
6363
publicFiles.push({

package.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,5 +42,8 @@
4242
"overrides": {
4343
"vite": "catalog:"
4444
}
45+
},
46+
"dependencies": {
47+
"@lindera/ipadic": "link:../lindera.js/packages/ipadic"
4548
}
4649
}

packages/lindera-ipadic/README.md

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Lindera with IPADIC
2+
3+
## Introduction
4+
5+
This package provides a wrapper around `lindera-wasm-ipadic`, a WebAssembly version of Lindera that uses the IPADIC dictionary for Japanese morphological analysis. It simplifies the initialization and configuration process, making it easier to use Lindera in my project.
6+
7+
> [!IMPORTANT]
> This package will not be made public because I don't have the energy to maintain it.
8+
9+
### Key Points
10+
11+
- Convert the Token structure to an `Object`, eliminating `Map` and `Array`.
12+
- Add type annotations, eliminating `any`.
13+
14+
## How to Use
15+
16+
This library cannot be bundled. You must put the WASM file into the `public` folder to ensure that the path is accessible in the browser.
17+
18+
> This file is usually located at `./node_modules/@lindera/ipadic/node_modules/lindera-wasm-ipadic/lindera_wasm_bg.wasm`.
19+
20+
Then you need to call `initAsync`:
21+
22+
```ts
23+
import { initAsync, type Tokenizer, TokenizerBuilder } from "@lindera/ipadic";
24+
25+
await initAsync({ moduleOrPath: "lindera_wasm_bg.wasm" });
26+
const builder = new TokenizerBuilder();
27+
const tokenizer = builder.build();
28+
const tokens = tokenizer.tokenize("すもももももももものうち");
29+
```
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
{
2+
"name": "@lindera/ipadic",
3+
"version": "1.0.0",
4+
"private": false,
5+
"type": "module",
6+
"license": "MIT",
7+
"files": [
8+
"dist"
9+
],
10+
"main": "./dist/index.js",
11+
"module": "./dist/index.js",
12+
"types": "./dist/index.d.ts",
13+
"exports": {
14+
".": "./dist/index.js",
15+
"./package.json": "./package.json"
16+
},
17+
"publishConfig": {
18+
"access": "public"
19+
},
20+
"scripts": {
21+
"build": "tsdown",
22+
"dev": "tsdown --watch",
23+
"test": "vitest",
24+
"typecheck": "tsc --noEmit",
25+
"postinstall": "pnpm build"
26+
},
27+
"dependencies": {
28+
"lindera-wasm-ipadic": "^1.0.0"
29+
},
30+
"devDependencies": {
31+
"@biomejs/biome": "2.1.3",
32+
"@commitlint/cli": "^19.6.1",
33+
"@commitlint/config-conventional": "^19.6.0",
34+
"@commitlint/types": "^19.5.0",
35+
"bumpp": "^10.2.3",
36+
"husky": "^9.1.7",
37+
"tsdown": "^0.14.2",
38+
"vitest": "^3.2.4"
39+
}
40+
}
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
import _initAsync, {
2+
type Tokenizer as _Tokenizer,
3+
TokenizerBuilder as _TokenizerBuilder,
4+
type InitInput,
5+
} from "lindera-wasm-ipadic";
6+
7+
export type LinderaToken = Map<string, unknown>;
8+
9+
export const IPADIC_DETAILS_KEYS = [
10+
"partOfSpeech",
11+
"partOfSpeechSubcategory1",
12+
"partOfSpeechSubcategory2",
13+
"partOfSpeechSubcategory3",
14+
"conjugationForm",
15+
"conjugationType",
16+
"baseForm",
17+
"reading",
18+
"pronunciation",
19+
] as const;
20+
21+
export type IpadicDetailsKeys = (typeof IPADIC_DETAILS_KEYS)[number];
22+
23+
export type IpadicDetailsObject = {
24+
[K in (typeof IPADIC_DETAILS_KEYS)[number]]: string;
25+
};
26+
27+
export type FormattedToken = {
28+
byteEnd: number;
29+
byteStart: number;
30+
text: string;
31+
wordId: {
32+
id: number;
33+
isSystem: boolean;
34+
};
35+
details?: IpadicDetailsObject;
36+
};
37+
38+
const typeSafeObjectFromEntries = <const T extends ReadonlyArray<readonly [PropertyKey, unknown]>>(
39+
entries: T,
40+
): { [K in T[number] as K[0]]: K[1] } => {
41+
return Object.fromEntries(entries) as { [K in T[number] as K[0]]: K[1] };
42+
};
43+
44+
/**
 * Converts the positional details array of a raw token into an object keyed
 * by `IPADIC_DETAILS_KEYS` (index 0 -> `partOfSpeech`, ..., index 8 -> `pronunciation`).
 *
 * When the raw array is shorter than the key list, the missing positions are
 * filled with `"*"` so that every property really is a `string`, as
 * `IpadicDetailsObject` promises, instead of silently becoming `undefined`.
 * NOTE(review): short arrays are believed to occur for unknown words; confirm
 * against the dictionary output.
 *
 * @param details Raw positional details as returned by the underlying tokenizer.
 * @returns An object with one string property per IPADIC details key.
 */
function detailsArrayToObject(details: string[]): IpadicDetailsObject {
  return typeSafeObjectFromEntries(
    // "*" is the placeholder callers already compare against to detect an absent field.
    IPADIC_DETAILS_KEYS.map((key, i) => [key, details[i] ?? "*"]),
  );
}
47+
48+
/**
 * Thin wrapper around the raw `lindera-wasm-ipadic` tokenizer.
 *
 * The wrapped tokenizer yields `Map`-based tokens; this class re-shapes each
 * of them into a plain `FormattedToken` object with camelCase properties.
 */
export class Tokenizer {
  #inner: _Tokenizer;

  /**
   * @param tokenizer An already built tokenizer from `lindera-wasm-ipadic`.
   */
  constructor(tokenizer: _Tokenizer) {
    this.#inner = tokenizer;
  }

  /**
   * Tokenizes `inputText` with the wrapped tokenizer and converts the result.
   *
   * @param inputText Text to analyse.
   * @returns One `FormattedToken` per raw token, in the same order.
   */
  tokenize(inputText: string): FormattedToken[] {
    const rawTokens: LinderaToken[] = this.#inner.tokenize(inputText);
    return rawTokens.map((rawToken) => Tokenizer.#toFormattedToken(rawToken));
  }

  /**
   * Re-shapes a single raw token.
   *
   * NOTE(review): `word_id` and `is_system` are read as top-level keys of the
   * raw token; confirm this matches the shape the underlying tokenizer emits.
   */
  static #toFormattedToken(rawToken: LinderaToken): FormattedToken {
    const rawDetails = rawToken.get("details") as string[] | undefined;
    // Details stay absent when the raw token carries none.
    const details = rawDetails && detailsArrayToObject(rawDetails);
    const byteEnd = rawToken.get("byte_end") as number;
    const byteStart = rawToken.get("byte_start") as number;
    const text = rawToken.get("text") as string;
    const id = rawToken.get("word_id") as number;
    const isSystem = rawToken.get("is_system") as boolean;
    return { byteEnd, byteStart, text, wordId: { id, isSystem }, details };
  }
}
74+
/**
 * Builder that produces a {@link Tokenizer} configured for the embedded
 * IPADIC dictionary in `normal` mode.
 */
export class TokenizerBuilder {
  #superTokenizerBuilder: _TokenizerBuilder;

  constructor() {
    this.#superTokenizerBuilder = new _TokenizerBuilder();
  }

  /**
   * Configures the underlying builder and builds the tokenizer.
   *
   * @returns A wrapper around the freshly built underlying tokenizer.
   */
  build(): Tokenizer {
    // The dictionary and mode must be set BEFORE the underlying build() call;
    // configuring the builder afterwards cannot affect the tokenizer that was
    // already produced.
    this.#superTokenizerBuilder.setDictionary("embedded://ipadic");
    this.#superTokenizerBuilder.setMode("normal");
    const superTokenizer = this.#superTokenizerBuilder.build();
    return new Tokenizer(superTokenizer);
  }
}
86+
87+
/**
 * Loads and instantiates the Lindera WASM module.
 *
 * @param options Optional initialisation input. `moduleOrPath` is the location
 *   (or an already loaded form) of `lindera_wasm_bg.wasm`. When omitted, the
 *   underlying initializer is called without arguments.
 * @returns A promise that settles once the underlying initializer has finished.
 */
export async function initAsync(options?: { moduleOrPath: InitInput }): Promise<void> {
  // The underlying wasm-bindgen initializer takes its input under the
  // snake_case key `module_or_path`. Forwarding the camelCase object unchanged
  // would leave that key undefined, so the caller's path would not be used.
  const initOptions =
    options === undefined ? undefined : { module_or_path: options.moduleOrPath };
  await _initAsync(initOptions);
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
{
2+
"compilerOptions": {
3+
"target": "esnext",
4+
"lib": ["ESNext", "DOM"],
5+
"moduleDetection": "force",
6+
"module": "preserve",
7+
"moduleResolution": "bundler",
8+
"resolveJsonModule": true,
9+
"types": ["node"],
10+
"strict": true,
11+
"noUnusedLocals": true,
12+
"declaration": true,
13+
"emitDeclarationOnly": true,
14+
"esModuleInterop": true,
15+
"isolatedModules": true,
16+
"verbatimModuleSyntax": true,
17+
"skipLibCheck": true,
18+
"declarationMap": true,
19+
"isolatedDeclarations": true
20+
},
21+
"include": ["src"]
22+
}

0 commit comments

Comments
 (0)