-
Notifications
You must be signed in to change notification settings - Fork 1
/
captcha.py
170 lines (149 loc) · 6.03 KB
/
captcha.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import numpy as np
from itertools import chain
""" Converts an 8-bit grayscale image into a numpy array of booleans (True = white, False = black).
All non-white pixels are interpreted as black. """
def to_bitmap(image):
    """Convert an 8-bit grayscale image into a boolean numpy array.

    Pure white pixels (value 255) become True; every other pixel is treated
    as black and becomes False.

    :param image: integer numpy array with values in 0..255.
    :return: boolean numpy array of the same shape as 'image'.
    """
    # Floor division keeps the "only 255 maps to 1" behaviour on Python 3,
    # where `/` is true division and would turn every non-zero pixel white.
    # The builtin `bool` replaces the `np.bool` alias removed in NumPy 1.24.
    return (image // 255).astype(bool)
def transform_bitmap(bitmap, transform):
    """Build a new boolean bitmap from the output of 'transform'.

    :param bitmap: numpy array; it is passed to 'transform' and its shape is
        used for the result.
    :param transform: callable taking 'bitmap' and returning an iterable of
        iterables of pixel values (one inner iterable per line).
    :return: boolean numpy array with the same shape as 'bitmap'.
    """
    # Flatten the per-line iterables into one stream of pixels.  The builtin
    # `bool` replaces the `np.bool` alias removed in NumPy 1.24.
    flat = np.fromiter(chain.from_iterable(transform(bitmap)), bool)
    # reshape() raises ValueError if 'transform' produced the wrong number of
    # pixels, and avoids assigning to `.shape` in place.
    return flat.reshape(bitmap.shape)
""" Repaints all sequences of black bits shorter than or equal to 'level' to white. """
def flatten_line(line, level):
    """Repaint every run of black pixels no longer than 'level' to white.

    Falsy items of 'line' are black pixels, truthy items are white.

    :param line: iterable of pixel values.
    :param level: longest run of black pixels that is still repainted.
    :return: generator yielding one value per input pixel: 1 for white,
        0 for black.
    """
    run = 0  # length of the run of black pixels currently being collected
    for pixel in line:
        if not pixel:
            run += 1
            continue
        # A white pixel closes the run: it stays black only if long enough.
        fill = 0 if run > level else 1
        for _ in range(run):
            yield fill
        run = 0
        yield 1
    # Flush a run that reaches the end of the line.
    fill = 0 if run > level else 1
    for _ in range(run):
        yield fill
""" Applies 'flatten_line' to all lines in the bitmap. """
def flatten_lines(bitmap, level):
    """Lazily apply 'flatten_line' to every line of 'bitmap'.

    Yields one 'flatten_line' generator per line, in iteration order.
    """
    for row in bitmap:
        flattened_row = flatten_line(row, level)
        yield flattened_row
""" Transforms 'bitmap' by applying 'flatten_lines' to rows. """
def flatten_h(bitmap, level):
    """Return 'bitmap' transformed by applying 'flatten_lines' to its rows."""
    def by_rows(rows):
        return flatten_lines(rows, level)

    return transform_bitmap(bitmap, by_rows)
""" Transforms 'bitmap' by applying 'flatten_lines' to columns. """
def flatten_v(bitmap, level):
    """Return 'bitmap' transformed by applying 'flatten_lines' to its columns."""
    def by_columns(columns):
        return flatten_lines(columns, level)

    # Work on the transpose so that columns are iterated as lines, then
    # transpose the result back to the original orientation.
    transposed = bitmap.T
    flattened = transform_bitmap(transposed, by_columns)
    return flattened.T
""" Extracts all separate contiguous black areas into separate cropped bitmaps. """
def flood_split(bitmap):
    """Extract every separate contiguous black area as its own cropped bitmap.

    Black pixels are the falsy entries of 'bitmap'; areas are connected
    through their up/down/left/right neighbours.  The input array is not
    modified (the search runs on a copy).

    :param bitmap: 2-D numpy array (falsy = black, truthy = white).
    :return: generator yielding one array per area, in row-major order of
        each area's first pixel.  Each array is cropped to the area's
        bounding box; the area's pixels are 0 and everything else is 1.
    """
    remaining = bitmap.copy()  # visited pixels are painted white in here
    rows = remaining.shape[0]
    cols = remaining.shape[1]

    def flood_extract(r0, c0):
        """Flood-fill the area containing (r0, c0) and return it cropped."""
        area = np.ones_like(remaining)
        stack = [(r0, c0)]
        remaining[r0, c0] = 1
        min_r = max_r = r0
        min_c = max_c = c0
        while stack:
            r, c = stack.pop()
            area[r, c] = 0
            # Track the bounding box by value.  Comparing ints with `is`
            # only works for CPython's cached small integers, which
            # truncated the crop for coordinates above 256.
            min_r = min(min_r, r)
            max_r = max(max_r, r)
            min_c = min(min_c, c)
            max_c = max(max_c, c)
            for nr, nc in ((r - 1, c), (r + 1, c), (r, c - 1), (r, c + 1)):
                if 0 <= nr < rows and 0 <= nc < cols and not remaining[nr, nc]:
                    # Mark on push so a pixel is never stacked twice.
                    remaining[nr, nc] = 1
                    stack.append((nr, nc))
        return area[min_r:max_r + 1, min_c:max_c + 1]

    for r in range(rows):
        for c in range(cols):
            if not remaining[r, c]:
                yield flood_extract(r, c)
""" Removes noise smaller than or equal to 'level' from the bitmap. """
def flatten(bitmap, level):
    """Remove noise smaller than or equal to 'level' from the bitmap.

    Combines (bitwise OR) the horizontal and vertical flattening passes and
    repeats until a pass leaves the bitmap unchanged; that stable bitmap is
    returned.
    """
    current = bitmap
    while True:
        candidate = flatten_h(current, level) | flatten_v(current, level)
        unchanged = (candidate == current).all()
        if unchanged:
            return current
        current = candidate
""" Computes absolute difference of two bitmaps. """
def diff(img1, img2):
    """Return the element-wise absolute difference of two bitmaps.

    Both inputs are converted to float64 before subtracting, so the result
    is a float64 array.
    """
    first = img1.astype(np.float64)
    second = img2.astype(np.float64)
    delta = first - second
    return np.absolute(delta)
""" Computes average absolute difference of two bitmaps (their "non-similarity"). """
def diff_value(img1, img2):
    """Return the mean absolute difference of two bitmaps.

    The value is the mean of 'diff(img1, img2)'; larger means less similar.
    """
    difference = diff(img1, img2)
    return difference.mean()
def match(i1, i2, threshold):
    """Find window placements where two bitmaps differ by less than 'threshold'.

    The window size is the smaller height and the smaller width of the two
    inputs.  Every placement of that window inside 'i1' is compared with
    every placement inside 'i2' using 'diff_value'.

    :param i1: first 2-D numpy array.
    :param i2: second 2-D numpy array.
    :param threshold: exclusive upper bound on 'diff_value' for a hit.
    :return: generator of ((r1, c1), (r2, c2), (rs, cs)) tuples: window
        origin in 'i1', window origin in 'i2', and window size.
    """
    rs = min(i1.shape[0], i2.shape[0])
    cs = min(i1.shape[1], i2.shape[1])
    # `range` instead of `xrange` so the function also runs on Python 3.
    for r1 in range(i1.shape[0] - rs + 1):
        for r2 in range(i2.shape[0] - rs + 1):
            for c1 in range(i1.shape[1] - cs + 1):
                # The window of 'i1' does not depend on c2; slice it once.
                s1 = i1[r1:r1 + rs, c1:c1 + cs]
                for c2 in range(i2.shape[1] - cs + 1):
                    s2 = i2[r2:r2 + rs, c2:c2 + cs]
                    if diff_value(s1, s2) < threshold:
                        yield (r1, c1), (r2, c2), (rs, cs)
class CaptchaBreaker(object):
    """Collects letter patterns from captcha images and matches them later.

    'categorize' feeds sample captchas in and groups similar letters into
    categories; 'match' looks the learned categories up in a new captcha.
    """

    class Category(object):
        """A learned pattern: the summed samples and their average."""

        def __init__(self, pattern_accum, sample_count=1):
            """Store the accumulated samples and derive the average pattern.

            :param pattern_accum: numpy array holding the element-wise sum
                of all samples in this category.
            :param sample_count: number of samples summed in 'pattern_accum'.
            """
            # Never assigned inside this class; presumably set by the caller
            # once the category has been identified.
            self.name = None
            self.pattern_accum = pattern_accum.astype(np.int32)
            self.sample_count = sample_count
            # Compare by value: `is 1` relied on CPython's small-int caching
            # and triggers a SyntaxWarning on Python 3.8+.
            if sample_count == 1:
                self.pattern = pattern_accum.astype(np.float64)
            else:
                self.pattern = np.divide(pattern_accum, float(sample_count))

        def __repr__(self):
            return "Category '{0}' [{1} samples]".format(self.name, self.sample_count)

    def __init__(self):
        # Categories learned so far, as Category instances.
        self.categories = []

    def categorize(self, captcha, level, threshold):
        """Learn the letters of 'captcha', merging them into known categories.

        Every area produced by flood_split(flatten(to_bitmap(captcha), level))
        starts as a new single-sample category.  While it matches an existing
        category (first hit of the module-level 'match' under 'threshold'),
        that category is removed and both are replaced by one built from the
        sum of their matching windows and of their sample counts; the merged
        category is then compared against the remaining ones again.  When
        nothing matches any more it is appended to 'self.categories'.
        """
        for letter in flood_split(flatten(to_bitmap(captcha), level)):
            new_cat = CaptchaBreaker.Category(letter)
            while True:
                for i, cat in enumerate(self.categories):
                    m = next(match(new_cat.pattern, cat.pattern, threshold), None)
                    if m is None:
                        continue
                    is1, is2, s = m
                    # Safe while iterating: the loop is left right after.
                    del self.categories[i]
                    new_cat = CaptchaBreaker.Category(
                        new_cat.pattern_accum[is1[0]:is1[0] + s[0], is1[1]:is1[1] + s[1]] +
                        cat.pattern_accum[is2[0]:is2[0] + s[0], is2[1]:is2[1] + s[1]],
                        new_cat.sample_count + cat.sample_count)
                    break
                else:
                    # No existing category matched: keep it as a new one.
                    self.categories.append(new_cat)
                    break

    def match(self, captcha, level, threshold):
        """Yield the learned categories found in 'captcha', left to right.

        Each hit is keyed by the column of its window in the captcha; when
        several categories hit the same column, the one latest in
        'self.categories' wins.  A hit whose column is within 'level' of the
        previously yielded one is skipped.
        """
        sample = flatten(to_bitmap(captcha), level).astype(np.float64)
        res = {}
        for cat in self.categories:
            # Inside a method body 'match' resolves to the module-level
            # function, not to this method.
            for m in match(sample, cat.pattern, threshold):
                res[m[0][1]] = cat
        prev_column = -level - 1
        # sorted(dict) iterates the keys on Python 2 and 3 alike, unlike
        # the Python-2-only dict.iterkeys().
        for column in sorted(res):
            if column - prev_column <= level:
                continue
            yield res[column]
            prev_column = column

    def __repr__(self):
        return repr(self.categories)
# Names exported by a star-import of this module.  Helpers that are not
# listed (for example 'flatten_h', 'flatten_v' and the module-level 'match')
# stay out of a star-import.
__all__ = ('to_bitmap', 'flood_split', 'flatten', 'diff', 'diff_value', 'CaptchaBreaker')