Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 118 additions & 0 deletions translation_tool/checkers/color_char_checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
"""translation_tool/checkers/color_char_checker.py 模組。

用途:檢查翻譯檔案中的非法顏色字元(& 後接非法的 Minecraft 顏色代碼字元)。
維護注意:本檔案的函式 docstring 用於維護說明,不代表行為變更。
"""

import json
import os
import re
from dataclasses import dataclass
from typing import Any, Generator

# 核心檢查:& 後只能接 a-v(不含 w)、0-9、空格、\、#
# 合法字元:a-v, 0-9, whitespace, backslash, hash
# 違法:& 後面接了 a-v 與 0-9、空格、\、# 以外的任何字元
COLOR_PATTERN = re.compile(r"&([^a-v0-9\s\\#])")
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 COLOR_PATTERN whitelist is too broad — misses many invalid codes

The character class [^a-v0-9\s\\#] treats every letter av as a legal format code. The actual valid Minecraft format characters are:

Category Characters
Color 09, af
Formatting k, l, m, n, o, r
Hex prefix (Paper/Spigot) x

Characters such as g, h, i, j, p, q, s, t, u, v are all inside the a-v range and will not be flagged as illegal — even though &g, &p, &v, etc. have no defined meaning in vanilla Minecraft and will render incorrectly.

If the intent is strict validation:

# Legal: 0-9, a-f (colour), k-o + r (format), optional x (hex), whitespace, \, #
COLOR_PATTERN = re.compile(r"&([^0-9a-fk-orx\s\\#])", re.IGNORECASE)

If some non-standard mod codes (e.g. gj) must be kept, the allowed set should be explicitly documented.



@dataclass
class ColorCharError:
"""單一顏色字元錯誤。"""

file_path: str
key: str
value: str
illegal_char: str
position: int # 在 value 中的位置
message: str


def check_color_chars(value: str) -> list[ColorCharError] | None:
"""檢查單一字串中的非法顏色字元。

Args:
value: 要檢查的字串。

Returns:
錯誤列表(若無錯誤則回傳 None)。
"""
errors: list[ColorCharError] = []
for match in COLOR_PATTERN.finditer(value):
illegal_char = match.group(1)
pos = match.start()
errors.append(
ColorCharError(
file_path="",
key="",
value=value,
illegal_char=illegal_char,
position=pos,
message=f"在位置 {pos} 發現非法顏色字元 '&{illegal_char}',"
f"& 後只能接 a-v(不含 w)、0-9、空格、\\、#。",
)
)
return errors if errors else None
Comment on lines +31 to +55
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Inconsistent return type (list | None) instead of list

check_color_chars returns None on success and a non-empty list on failure. Returning an empty list on success is more Pythonic and avoids callers needing a None guard — it also makes the declared return type truthful (right now the annotation says list[ColorCharError] | None but an empty list is never actually returned).

Suggested change
def check_color_chars(value: str) -> list[ColorCharError] | None:
"""檢查單一字串中的非法顏色字元
Args:
value: 要檢查的字串
Returns:
錯誤列表若無錯誤則回傳 None)。
"""
errors: list[ColorCharError] = []
for match in COLOR_PATTERN.finditer(value):
illegal_char = match.group(1)
pos = match.start()
errors.append(
ColorCharError(
file_path="",
key="",
value=value,
illegal_char=illegal_char,
position=pos,
message=f"在位置 {pos} 發現非法顏色字元 '&{illegal_char}',"
f"& 後只能接 a-v(不含 w)、0-9、空格、\\、#。",
)
)
return errors if errors else None
def check_color_chars(value: str) -> list[ColorCharError]:
"""檢查單一字串中的非法顏色字元
Args:
value: 要檢查的字串
Returns:
錯誤列表若無錯誤則回傳空列表)。
"""
errors: list[ColorCharError] = []
for match in COLOR_PATTERN.finditer(value):
illegal_char = match.group(1)
pos = match.start()
errors.append(
ColorCharError(
file_path="",
key="",
value=value,
illegal_char=illegal_char,
position=pos,
message=f"在位置 {pos} 發現非法顏色字元 '&{illegal_char}',"
f"& 後只能接 a-v(不含 w)、0-9、空格、\\、#。",
)
)
return errors

The caller _check_value can then simply do if errors := check_color_chars(value):.



def _check_value(
file_path: str,
key: str,
value: Any,
) -> Generator[ColorCharError, None, None]:
"""遞迴檢查單一值,若為字串則檢查顏色字元。"""
if isinstance(value, str):
errors = check_color_chars(value)
if errors:
for err in errors:
# 補足 file_path 與 key(從上層傳入)
yield ColorCharError(
file_path=err.file_path or file_path,
key=err.key or key,
value=err.value,
illegal_char=err.illegal_char,
position=err.position,
message=err.message,
)
elif isinstance(value, dict):
for k, v in value.items():
yield from _check_value(file_path, k, v)
Comment on lines +77 to +79
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Nested-dict path context is lost

When value is a dict, the function recurses with only the child key k, discarding the parent entirely. For a nested structure like {"parent": {"child": "bad&Wvalue"}}, the resulting ColorCharError will report the leaf key alone (child) instead of the full dotted path (parent.child). By contrast, the list branch correctly preserves context with f"{key}[{i}]".

Consider building a dotted path for dict keys to keep errors self-describing:

Suggested change
elif isinstance(value, dict):
for k, v in value.items():
yield from _check_value(file_path, k, v)
elif isinstance(value, dict):
for k, v in value.items():
nested_key = f"{key}.{k}" if key else k
yield from _check_value(file_path, nested_key, v)

elif isinstance(value, list):
for i, item in enumerate(value):
yield from _check_value(file_path, f"{key}[{i}]", item)


def check_json_file(file_path: str) -> Generator[ColorCharError, None, None]:
"""讀取 JSON 檔並遞迴檢查所有 string value。

Args:
file_path: JSON 檔案路徑。

Yields:
找到的 ColorCharError。
"""
try:
with open(file_path, encoding="utf-8") as f:
data = json.load(f)
except Exception:
# 不阻断,继续检查其他文件
return

if isinstance(data, dict):
for key, value in data.items():
yield from _check_value(file_path, key, value)


def check_directory(dir_path: str) -> Generator[ColorCharError, None, None]:
"""遞迴檢查目錄下所有 .json 檔。

Args:
dir_path: 目錄路徑。

Yields:
找到的 ColorCharError。
"""
for root, _, files in os.walk(dir_path):
for file in files:
if file.endswith(".json"):
yield from check_json_file(os.path.join(root, file))
31 changes: 31 additions & 0 deletions translation_tool/core/kubejs_translator_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,37 @@ def clean_kubejs_from_raw_impl(
else:
pending_en = en

# ── 雙軌去重(reverse_index dedup)───────────────────────────────
# 目的:若某英文文字(value)已出現在 final/zh_tw.json(不同 key),
# 表示該英文原文已有翻譯,不需要再送 pending。
# 建立 reverse_index:{英文文字: [key1, key2, ...]}
if pending_en and final_root_p.exists():
# 從 final/zh_tw.json 建立 final_tw_lookup(key → 原文)
final_tw_lookup: dict[str, str] = {}
for tw_file in final_root_p.rglob("zh_tw.json"):
tw_data = read_json_dict_fn(tw_file)
if tw_data:
final_tw_lookup.update(tw_data)

if final_tw_lookup:
# 建立 reverse_index(英文文字 → 對應 key 列表)
reverse_index: dict[str, list[str]] = {}
for k, v in final_tw_lookup.items():
if is_filled_text_impl(v):
reverse_index.setdefault(v, []).append(k)

# 過濾 pending_en:跳過那些「英文文字已存在於 final」的 key
pending_en = {
k: v
for k, v in pending_en.items()
if not (
is_filled_text_impl(v)
and v in reverse_index
and k != reverse_index[v][0]
)
}
Comment on lines +210 to +234
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 final_tw_lookup rebuilt on every group iteration

The entire final_root_p.rglob("zh_tw.json") scan and reverse_index construction happen inside the for group_dir, files_map in groups.items() loop (line 185). For N groups this becomes O(N × number-of-zh_tw-files) — all zh_tw.json files are read and merged from scratch on every group, even though the final directory does not change between iterations.

Move both final_tw_lookup and reverse_index outside the loop so they are built once:

# Build once before the loop
final_tw_lookup: dict[str, str] = {}
if final_root_p.exists():
    for tw_file in final_root_p.rglob("zh_tw.json"):
        tw_data = read_json_dict_fn(tw_file)
        if tw_data:
            final_tw_lookup.update(tw_data)

reverse_index: dict[str, list[str]] = {}
for k, v in final_tw_lookup.items():
    if is_filled_text_impl(v):
        reverse_index.setdefault(v, []).append(k)

for group_dir, files_map in groups.items():
    ...
    if pending_en and reverse_index:
        pending_en = {
            k: v
            for k, v in pending_en.items()
            if not (
                is_filled_text_impl(v)
                and v in reverse_index
                and k != reverse_index[v][0]
            )
        }

Comment on lines +225 to +234
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Cross-namespace key comparison may silently drop entries

reverse_index is keyed on values from final_tw_lookup (zh_tw translations), and its lists contain keys from the final directory (e.g. "item.mymod.foo"). The dedup filter checks k != reverse_index[v][0], where k is a key from the raw/pending source file — a completely different namespace.

If pending_en has {"item.othermod.bar": "Hello World"} and reverse_index["Hello World"] == ["item.mymod.foo"], the condition becomes "item.othermod.bar" != "item.mymod.foo"True, so the entry is filtered out. The intended behaviour — keeping the entry when the same key already exists in the final output — is therefore never triggered for keys that don't coincidentally match a final-directory key name.

In practice this means any pending entry whose English value also appears anywhere in final_tw_lookup (regardless of key) will be dropped, which is likely more aggressive deduplication than intended. You may want to check for key equality within the same key namespace, or reconsider the dedup condition entirely.

# ── 雙軌去重 end ───────────────────────────────────────────────

if pending_en:
dst_en = pending_root_p / rel_group / "en_us.json"
write_json_fn(dst_en, pending_en)
Expand Down
Loading