-
Notifications
You must be signed in to change notification settings - Fork 40
/
trainer.py
142 lines (120 loc) · 4.7 KB
/
trainer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
from PIL import Image
import subprocess
import os
#Steps to take before running:
#Set TESSDATA_PREFIX to correct directory
#Put image and box files together in the same directory
#Label each corresponding file with the same filenames
CLEANED_DIR = 'cleaned'
BOX_DIR = 'box'
EXP_DIR = 'exp'
class TesseractTrainer():
def __init__(self, languageName, fontName, directory='data'):
self.languageName = languageName
self.fontName = fontName
self.directory = directory
def runAll(self):
os.chdir(self.directory)
self.createDirectories()
self.createFontProperties()
prefixes = self.renameFiles()
self.createTrainingFiles(prefixes)
self.extractUnicode()
self.runShapeClustering()
self.runMfTraining()
self.runCnTraining()
self.createTessData()
def createDirectories(self):
if not os.path.exists(CLEANED_DIR):
os.mkdir(CLEANED_DIR)
if not os.path.exists(EXP_DIR):
os.mkdir(EXP_DIR)
def createFontProperties(self):
with open(f'{EXP_DIR}/font_properties', 'w') as f:
f.write('f{self.fontName} 0 0 0 0 0')
def cleanImages(self):
images_dir = 'images'
print("CLEANING IMAGES...")
for fileName in os.listdir(images_dir):
root, ext = os.path.splitext(fileName)
if ext in ['.jpg', '.jpeg', '.png']:
image = Image.open(f'{images_dir}/{fileName}')
#Set a threshold value for the image, and save
image = image.point(lambda x: 0 if x<250 else 255)
image.save(f'{CLEANED_DIR}/{root}.tiff')
#Looks for box files, uses the box filename to find the corresponding
#.tiff file. Renames all files with the appropriate "<language>.<font>.exp<N>" filename
def renameFiles(self):
file_prefixes = []
for i, boxFile in enumerate([f for f in os.listdir(BOX_DIR) if f.endswith('.box')]):
root, _ = os.path.splitext(boxFile)
os.system(f'cp {CLEANED_DIR}/{root}.tiff {EXP_DIR}/{self.languageName}.{self.fontName}.exp{i}.tiff')
os.system(f'cp {BOX_DIR}/{root}.box {EXP_DIR}/{self.languageName}.{self.fontName}.exp{i}.box')
file_prefixes.append(f'{self.languageName}.{self.fontName}.exp{i}')
return file_prefixes
#Creates a training file for a single tiff/box pair
def createTrainingFiles(self, prefixes):
print("CREATING TRAINING DATA...")
os.chdir(EXP_DIR)
for prefix in prefixes:
p = subprocess.Popen(["tesseract", prefix+".tiff", prefix, "nobatch", "box.train"], stdout=subprocess.PIPE,stderr=subprocess.PIPE)
returnValue = stdout_value = p.communicate()[1]
returnValue = returnValue.decode("utf-8")
print(prefix)
print(returnValue)
if "Empty page!!" in returnValue:
print(returnValue)
subprocess.call(["tesseract", "-psm", "7", prefix+".tiff", prefix, "nobatch", "box.train"])
os.chdir('..')
def extractUnicode(self):
print("EXTRACTING UNICODE...")
extractCommand = ['unicharset_extractor'] + [f for f in os.listdir(EXP_DIR) if f.endswith('.box')]
os.chdir(EXP_DIR)
p = subprocess.Popen(extractCommand)
p.wait()
os.chdir('..')
def runShapeClustering(self):
print("RUNNING SHAPE CLUSTERING...")
#shapeclustering -F font_properties -U unicharset eng.captchaFont.exp0.tr...
shapeCommand = ['shapeclustering', '-F', 'font_properties', '-U', 'unicharset']
shapeCommand = shapeCommand + self.getTrainingFileList()
os.chdir(EXP_DIR)
p = subprocess.Popen(shapeCommand)
p.wait()
os.chdir('..')
def runMfTraining(self):
#mftraining -F font_properties -U unicharset eng.captchaFont.exp0.tr...
print("RUNNING MF CLUSTERING...")
mfCommand = ['mftraining', '-F', 'font_properties', '-U', 'unicharset']
mfCommand = mfCommand + self.getTrainingFileList()
os.chdir(EXP_DIR)
p = subprocess.Popen(mfCommand)
p.wait()
os.chdir('..')
def runCnTraining(self):
#cntraining -F font_properties -U unicharset eng.captchaFont.exp0.tr...
print('RUNNING MF CLUSTERING...')
cnCommand = ['cntraining', '-F', 'font_properties', '-U', 'unicharset']
cnCommand = cnCommand + self.getTrainingFileList()
os.chdir(EXP_DIR)
p = subprocess.Popen(cnCommand)
p.wait()
os.chdir('..')
def createTessData(self):
print("CREATING TESS DATA...")
os.chdir(EXP_DIR)
#Rename all files and run combine_tessdata <language>.
os.rename('unicharset', self.languageName+'.unicharset')
os.rename('shapetable', self.languageName+'.shapetable')
os.rename('inttemp', self.languageName+'.inttemp')
os.rename('normproto', self.languageName+'.normproto')
os.rename('pffmtable', self.languageName+'.pffmtable')
p = subprocess.Popen(['combine_tessdata', self.languageName+'.'])
#mv captcha.traineddata $TESSDATA_PREFIX/captcha.traineddata
p.wait()
os.chdir('..')
#Retrieve a list of created training files
def getTrainingFileList(self):
return [f for f in os.listdir(EXP_DIR) if f.endswith('.tr')]
trainer = TesseractTrainer('captcha', 'captchaFont')
trainer.runAll()