forked from kha-white/manga-ocr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
54 lines (38 loc) · 1.36 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import pandas as pd
import unicodedata
from manga_ocr_dev.env import ASSETS_PATH, FONTS_ROOT
def get_background_df(background_dir):
    """Build a DataFrame describing every background image in a directory.

    Each entry's file stem must end in four underscore-separated integers,
    read as ``ymin, ymax, xmin, xmax``. Height and width are derived from
    those bounds.

    Args:
        background_dir: Directory object providing ``iterdir()`` (e.g. a
            ``pathlib.Path``).

    Returns:
        A DataFrame with columns ``path`` (str), ``h``, ``w`` and ``ratio``
        (``w / h``), one row per directory entry. The columns are present
        even when the directory is empty.

    Raises:
        ValueError: If a stem does not end in four integer components.
        ZeroDivisionError: If an entry has ``ymax == ymin``.
    """
    columns = ['path', 'h', 'w', 'ratio']
    records = []
    for path in background_dir.iterdir():
        ymin, ymax, xmin, xmax = (int(v) for v in path.stem.split('_')[-4:])
        h = ymax - ymin
        w = xmax - xmin
        records.append({
            'path': str(path),
            'h': h,
            'w': w,
            'ratio': w / h,
        })
    # Passing the columns explicitly keeps the schema stable for an empty
    # directory, so callers can still access e.g. ``df.ratio``.
    return pd.DataFrame(records, columns=columns)
def is_kanji(ch):
    """Return True if the Unicode name of ``ch`` contains 'CJK UNIFIED IDEOGRAPH'.

    Args:
        ch: A single-character string.

    Returns:
        bool. Characters that have no Unicode name (e.g. control characters)
        yield False instead of raising ``ValueError``.
    """
    # The '' default avoids ValueError for unnamed code points.
    return 'CJK UNIFIED IDEOGRAPH' in unicodedata.name(ch, '')
def is_hiragana(ch):
    """Return True if the Unicode name of ``ch`` contains 'HIRAGANA'.

    This is a substring match on the character name, so characters whose
    name merely mentions HIRAGANA (such as the 'KATAKANA-HIRAGANA ...'
    marks) also match.

    Args:
        ch: A single-character string.

    Returns:
        bool. Characters that have no Unicode name (e.g. control characters)
        yield False instead of raising ``ValueError``.
    """
    # The '' default avoids ValueError for unnamed code points.
    return 'HIRAGANA' in unicodedata.name(ch, '')
def is_katakana(ch):
    """Return True if the Unicode name of ``ch`` contains 'KATAKANA'.

    This is a substring match on the character name, so shared
    'KATAKANA-HIRAGANA ...' marks match as well.

    Args:
        ch: A single-character string.

    Returns:
        bool. Characters that have no Unicode name (e.g. control characters)
        yield False instead of raising ``ValueError``.
    """
    # The '' default avoids ValueError for unnamed code points.
    return 'KATAKANA' in unicodedata.name(ch, '')
def is_ascii(ch):
    """Tell whether the single character ``ch`` has a code point below 128."""
    code_point = ord(ch)
    return code_point <= 127
def get_charsets(vocab_path=None):
    """Load the vocabulary and derive its hiragana and katakana subsets.

    Args:
        vocab_path: CSV file with a ``char`` column. When None, falls back
            to ``ASSETS_PATH / 'vocab.csv'``.

    Returns:
        Tuple ``(vocab, hiragana, katakana)``: the full ``char`` column as an
        array, the hiragana matches minus their last six entries, and the
        katakana matches minus their first three entries.
    """
    csv_path = ASSETS_PATH / 'vocab.csv' if vocab_path is None else vocab_path
    vocab = pd.read_csv(csv_path).char.values

    hiragana_mask = [is_hiragana(symbol) for symbol in vocab]
    katakana_mask = [is_katakana(symbol) for symbol in vocab]

    # NOTE(review): the fixed trims below depend on the ordering of the vocab
    # file; presumably they drop marks matched by both predicates — confirm.
    all_hiragana = vocab[hiragana_mask]
    all_katakana = vocab[katakana_mask]
    hiragana = all_hiragana[:-6]
    katakana = all_katakana[3:]
    return vocab, hiragana, katakana
def get_font_meta():
    """Read the font metadata table and build a per-font character lookup.

    Reads ``ASSETS_PATH / 'fonts.csv'`` and rewrites its ``font_path`` column
    to string paths prefixed with ``FONTS_ROOT``.

    Returns:
        Tuple ``(df, font_map)``: the metadata DataFrame, and a dict mapping
        each font path to the set of characters in its ``supported_chars``
        value. Rows containing any missing value are left out of the dict.
    """
    def _resolve(relative):
        return str(FONTS_ROOT / relative)

    fonts = pd.read_csv(ASSETS_PATH / 'fonts.csv')
    fonts.font_path = fonts.font_path.apply(_resolve)

    font_map = {}
    for record in fonts.dropna().itertuples():
        font_map[record.font_path] = set(record.supported_chars)
    return fonts, font_map