-
Notifications
You must be signed in to change notification settings - Fork 80
/
Copy path11.2jTessBoxEditor-tesseract.py
170 lines (154 loc) · 5.21 KB
/
11.2jTessBoxEditor-tesseract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
"""
验证码训练脚本
Author:caixiaoxin
date:2019/7/23
"""
from PIL import ImageEnhance
from PIL import Image
import pytesseract
from bs4 import BeautifulSoup
import os
import requests
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/75.0.3770.142 Safari/537.36',
}
# 根据训练字库识别验证码
def get_varifyCode()->str:
    """Recognise the CAPTCHA image stored at ``exe_file/11/code.png``.

    The image is desaturated, brightened, contrast-boosted and sharpened,
    converted to greyscale and binarised with a fixed threshold.  The
    binarised image overwrites the original file and is then handed to
    Tesseract with ``lang='gu'`` and ``--psm 7``.

    Returns:
        The recognised text with every whitespace character removed.
    """
    image_path = 'exe_file/11/code.png'

    # Open inside a ``with`` block so the source file handle is released
    # before the same path is overwritten further down.
    with Image.open(image_path) as source:
        img = source.convert('RGB')

    # Drop all colour information.
    img = ImageEnhance.Color(img).enhance(0)
    # Raise brightness.
    img = ImageEnhance.Brightness(img).enhance(2)
    # Raise contrast.
    img = ImageEnhance.Contrast(img).enhance(8)
    # Raise sharpness.
    img = ImageEnhance.Sharpness(img).enhance(20)

    # Greyscale, then binarise: levels below the threshold map to 0,
    # everything else to 1.
    img = img.convert('L')
    threshold = 140
    table = [0 if level < threshold else 1 for level in range(256)]
    out = img.point(table, '1')
    out.save(image_path, 'png')

    code = pytesseract.image_to_string(out, lang='gu', config='--psm 7')
    # Tesseract output typically ends with a line break (and a form feed on
    # newer versions); removing only spaces would leave those characters in
    # the code that is later posted and used as a filename.
    return ''.join(code.split())
# 下载验证码
def download_code(session):
    """Fetch the login page, store its CAPTCHA image and recognise it.

    Args:
        session: Object whose ``get`` method performs both HTTP requests.
            Per the original author's note the CAPTCHA served for this URL
            is generated per cookie, so the page request, the image request
            and the later login must all go through the same session.

    Returns:
        A tuple ``(view_state, view_state_generator, code)``: the values of
        the ``__VIEWSTATE`` and ``__VIEWSTATEGENERATOR`` form inputs and the
        text returned by ``get_varifyCode()``.
    """
    login_page_url = ('https://so.gushiwen.org/user/login.aspx?'
                      'from=http://so.gushiwen.org/user/collect.aspx')
    page = session.get(url=login_page_url, headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')

    # The image ``src`` is site-relative, so prefix it with the host.
    captcha_tag = soup.find('img', id='imgCode')
    captcha_url = 'https://so.gushiwen.org' + captcha_tag['src']
    captcha = session.get(url=captcha_url, headers=headers)
    with open('exe_file/11/code.png', 'wb') as image_file:
        image_file.write(captcha.content)

    # Two hidden inputs that the login form has to send back.
    view_state = soup.find('input', id='__VIEWSTATE')['value']
    view_state_generator = soup.find(
        'input', id='__VIEWSTATEGENERATOR')['value']

    # Run OCR on the image that was just written to disk.
    return view_state, view_state_generator, get_varifyCode()
# post登陆
def login(__VIEWSTATE, __VIEWSTATEGENERATOR, code, session)->bool:
    """Post the login form and report whether the attempt looks successful.

    Args:
        __VIEWSTATE: Value of the ``__VIEWSTATE`` form input.
        __VIEWSTATEGENERATOR: Value of the ``__VIEWSTATEGENERATOR`` input.
        code: CAPTCHA text to submit.
        session: Object whose ``post`` method sends the form.

    Returns:
        False when the response body is exactly 35822 characters long,
        True for any other length.
    """
    post_url = ('https://so.gushiwen.org/user/login.aspx?'
                'from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx')
    # NOTE(review): the account credentials are hard-coded here.
    form_fields = {
        '__VIEWSTATE': __VIEWSTATE,
        '__VIEWSTATEGENERATOR': __VIEWSTATEGENERATOR,
        'from': 'http://so.gushiwen.org/user/collect.aspx',
        'email': '15625266605',
        'pwd': '123456',
        'code': code,
        'denglu': '登录',
    }
    response = session.post(url=post_url, headers=headers, data=form_fields)
    # NOTE(review): 35822 is presumably the length of the page returned after
    # a rejected login - confirm it still matches the live site.
    return len(response.text) != 35822
# 实现模拟登陆,如果验证码识别错误,将有误验证码存入
def test_login()->bool:
    """Run one live login attempt and keep the CAPTCHA image if it fails.

    A fresh session is used to download and recognise a CAPTCHA and to post
    the login form.  When ``login()`` reports failure, the image at
    ``exe_file/11/code.png`` is copied to ``exe_file/11/verify_code/`` under
    a name derived from the recognised text.

    Returns:
        True if ``login()`` reported success, otherwise False.
    """
    # This function is called many times in a row by the script's main
    # block, so close each session instead of leaving its connections open.
    with requests.Session() as session:
        view_state, view_state_generator, code = download_code(session)
        if login(view_state, view_state_generator, code, session):
            return True

    failed_dir = 'exe_file/11/verify_code'
    # OCR output may contain characters that are not valid in a filename
    # (line breaks, path separators); keep only letters and digits.
    safe_name = ''.join(ch for ch in code if ch.isalnum()) or 'unrecognized'
    try:
        # Without the directory every save would fail and no failed sample
        # would ever be collected.
        os.makedirs(failed_dir, exist_ok=True)
        with Image.open('exe_file/11/code.png') as img:
            img.save('{}/{}.png'.format(failed_dir, safe_name), 'png')
    except OSError as exc:
        # Saving the sample is best-effort: report the problem, do not abort.
        print('could not save failed CAPTCHA sample: {}'.format(exc))
    return False
# 批量处理验证码图片
def deal_img():
    """Preprocess every image found in ``exe_file/11/gushiwen_code/``.

    Each file is desaturated, brightened, contrast-boosted, sharpened,
    converted to greyscale and binarised with a fixed threshold.  Results
    are written back into the same directory as ``0.png``, ``1.png``, ...
    following the order returned by ``os.listdir``.
    """
    root = 'exe_file/11/gushiwen_code/'

    # Binarisation lookup: levels below the threshold map to 0, the rest to 1.
    threshold = 140
    table = [0 if level < threshold else 1 for level in range(256)]

    # Enhancement steps applied in order: (enhancer class, factor).
    steps = (
        (ImageEnhance.Color, 0),        # drop all colour
        (ImageEnhance.Brightness, 2),   # raise brightness
        (ImageEnhance.Contrast, 8),     # raise contrast
        (ImageEnhance.Sharpness, 20),   # raise sharpness
    )

    # NOTE(review): outputs land in the directory being read, so a numbered
    # output could replace an input with the same name - confirm intended.
    for ind, image in enumerate(os.listdir(root)):
        img = Image.open(root + image).convert('RGB')
        for enhancer_cls, factor in steps:
            img = enhancer_cls(img).enhance(factor)
        out = img.convert('L').point(table, '1')
        out.save(root + '{}.png'.format(ind), 'png')
if __name__ == '__main__':
    # Estimate recognition accuracy as the share of live login attempts
    # that succeed.
    test_num = 200
    correct_num = sum(1 for _ in range(test_num) if test_login() is True)
    print("准确率{}%".format(correct_num*100/test_num))
    # To batch-preprocess the sample images instead, call deal_img().
    # deal_img()