-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathsubdomainClean.py
142 lines (129 loc) · 6.71 KB
/
subdomainClean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# -*- coding:utf-8 -*-
__author__ = 'leezp'
__date__ = 20191231
import asyncio
import aiohttp
import aiomultiprocess
import aiofiles
import queue
import datetime
import random
from lxml import etree
import urllib3
import UA_Pool as UApool
import re
import argparse
# Turn off urllib3's warning output for the whole process.
# NOTE(review): the HTTP requests visible in this file go through aiohttp, so
# this call may have no effect on them — confirm before relying on it.
urllib3.disable_warnings()
class Gun():
    """Configuration holder for the subdomain cleaner.

    Attributes:
        dic: maps a site key to its list of error-page rules. A rule is
            either a string ``'<xpath> :) <expected text>'`` or a tuple of
            two such strings.
        asyncio_Semaphore: size used for the asyncio semaphore.
        title_regex: captures the contents of the first ``<title>`` element.
        zh_regex: matches runs of characters in the range U+4E00..U+9FA5.
    """

    def __init__(self):
        taobao_rules = [
            (
                '//*[@id="error-notice"]/div[2]/div[1] :) 没有找到相应的店铺信息',
                '//*[@id="error-notice"]/div[2]/div[3]/ol/li[4] :) 五秒后跳转至',
            ),
            # Full page text continues: "! We are temporarily unable to process your request, ..."
            '/html/body/div/div/p[2] :) 很抱歉',
            '/html/body/div/div/p[2] :) 亲,慢慢来,请先坐下来喝口水!',
            '//*[@id="err"]/div[1]/p[1] :) 亲,小二正忙,滑动一下马上回来',
            '//*[@id="dead-page"]/header/div/div/div[2]/p :) 亲,这是个机器人猖狂的时代,请进行验证证明咱是正常人~',
            # Full page text: "Sorry, we are temporarily unable to process your request"
            '//*[@id="dead-page"]/header/div/div/div[2]/div[2]/p[1] :) 抱歉',
            '//*[@id="J_4938560545"]/div/div[1]/h2 :) 很抱歉,您查看的页面找不到了!',
            # Full page text continues: "The page cannot be accessed..."
            '//*[@id="err"]/div[1]/p :) 抱歉!',
            '//*[@id="content"]/div[1]/div/div/div[2]/div[1]/h3/em[2] :) 抱歉,您要访问的页面不存在',
            '//*[@id="App"]/div/div/div/div/div[2]/div[2] :) 内网访问受限,请先登录阿里郎连接内网后再试',
            # Full page text: "The shop does not exist! The shop address entered
            # is incorrect or the shop has been closed."
            '/html/body/div[5]/h4 :) 亲,店铺不存在哟!',
            '/html/body/div[2]/div/dl/dt/p :) 当前页面访问人数过多',
        ]
        self.dic = {
            "None": [],
            "taobao": taobao_rules,
        }
        # Maximum concurrency is 500; per the original author it can be set to
        # 1000 on Linux for roughly double the throughput.
        self.asyncio_Semaphore = 500
        self.title_regex = re.compile(r'<title>([\s\S]*?)</title>')
        self.zh_regex = re.compile(r'[\u4e00-\u9fa5]+')
def parse_args():
    """Read the command line and return the path of the URL list to process.

    Returns:
        str: the value given to ``-f``, or ``'url_full.txt'`` when the flag
        is omitted.
    """
    parser = argparse.ArgumentParser(usage='python36 %(prog)s -f targetUrl_full.txt')
    parser.add_argument(
        '-f',
        dest='input_file',
        type=str,
        default='url_full.txt',
        help='default is url_full.txt',
    )
    return parser.parse_args().input_file
# Module-level setup: fetch() reads List and output_file, main() drains q.
# Example input file name: 'alisports.com_1_full.txt'
input_file = parse_args()
# The text before the first '_' of the file name selects the rule set and
# names the output file.
name = input_file.split('_')[0].strip()
rule_sets = Gun().dic
switch = name in rule_sets
# Unknown names fall back to the empty "None" rule set.
dic_key = name if switch else "None"
# Use the rule set that was just selected; previously the lookup result was
# discarded and the empty "None" list was always used.
List = rule_sets[dic_key]
output_file = name + '_out.txt'
q = queue.Queue()
# The context manager guarantees the input file is closed after loading.
with open(input_file, encoding='utf-8') as file:
    for x in file:
        # The first space-separated field is used as-is; no scheme is prepended.
        url = x.split(' ')[0].strip()
        # Skip blank lines so no empty URL is queued.
        if url:
            q.put(url)
# Taobao: a CAPTCHA shows up after roughly 124 requests.
# Title fragments that make fetch() discard a page outright.
_DISCARD_TITLE_MARKERS = (
    u'理想生活上天猫',
    u'现在暂时无法处理您的请求',
    u'大麦',
    u'全球演出赛事官方购票平台',
    u'亲,访问受限了',
    'security-X5',
    'dopa.com',
    '米聊',
    u'官方旗舰店',
)


def _is_discard_title(title):
    """Return True when *title* contains a marker that rules the page out."""
    # These two fragments only count when they occur together.
    if u'旗舰店' in title and u'天猫' in title:
        return True
    return any(marker in title for marker in _DISCARD_TITLE_MARKERS)


def _split_rule(rule):
    """Split an '<xpath> :) <expected text>' string into (xpath, expected)."""
    pieces = rule.split(':)')
    return pieces[0].strip(), pieces[-1].strip()


def _first_node_text(html, xpath):
    """Return the stripped text of the first node matching *xpath*, or None."""
    nodes = html.xpath(xpath)
    if not nodes or nodes[0].text is None:
        return None
    return nodes[0].text.strip()


def _rule_matches(html, rule):
    """Return True when the parsed page satisfies one error-page rule.

    A tuple rule holds two '<xpath> :) <text>' strings and needs an exact text
    match on both; a plain string rule needs its text to occur inside the
    text of the matched node.
    """
    if isinstance(rule, tuple):
        # Each half is evaluated on its own, so several tuple rules can
        # coexist without overwriting each other's lookup result.
        return all(
            _first_node_text(html, xpath) == expected
            for xpath, expected in (_split_rule(rule[0]), _split_rule(rule[1]))
        )
    xpath, expected = _split_rule(rule)
    found = _first_node_text(html, xpath)
    return bool(found) and expected in found


async def fetch(url):
    """Request *url* and append ``"<url> <status>"`` to ``output_file``.

    Nothing is written when the page title contains a discard marker or when
    the page matches one of the rules in ``List``.

    Args:
        url: address to request; passed to aiohttp unchanged.

    Returns:
        None. Any exception raised while fetching, parsing or writing is
        printed and swallowed, so one bad URL does not abort the whole run.
    """
    gun = Gun()
    # NOTE(review): a new semaphore is created on every call, so it never
    # makes one call wait for another. Any real concurrency cap presumably
    # has to come from whatever schedules fetch() — confirm.
    sem = asyncio.Semaphore(gun.asyncio_Semaphore)
    async with sem:
        async with aiohttp.ClientSession() as session:
            try:
                # A proxy could be supplied here as proxy="http://ip:port".
                async with session.get(
                        url,
                        headers={'User-Agent': random.choice(UApool.data)},
                        ssl=False,
                        timeout=aiohttp.ClientTimeout(total=3)) as resp:
                    status = resp.status
                    # encoding=None lets aiohttp take the charset from the
                    # Content-Type header and fall back to detection when the
                    # server does not provide one.
                    text = await resp.text(encoding=None, errors='ignore')
                    title = gun.title_regex.search(text)
                    if title and title.group(1):
                        if _is_discard_title(title.group(1).strip()):
                            return
                    html = etree.HTML(text)
                    # The parser may yield None for an unparsable body; such a
                    # page cannot match any rule.
                    if html is not None and any(
                            _rule_matches(html, rule) for rule in List):
                        return
                    # The context manager closes the file; no explicit close.
                    async with aiofiles.open(output_file, 'a', encoding='utf-8') as f:
                        await f.write("{} {}".format(url, status) + '\n')
            except Exception as e:
                # Best effort: report the failure and move on to the next URL.
                print(e)
            finally:
                print('test speed')
async def main():
    """Drain the module-level queue ``q`` and run fetch() on every URL.

    The URLs are collected into a list first and then handed to an
    aiomultiprocess pool in a single ``map`` call.
    """
    pending = []
    while True:
        try:
            pending.append(q.get_nowait())
        except queue.Empty:
            break
    async with aiomultiprocess.Pool() as pool:
        # fetch() returns nothing useful, so the mapped results are discarded.
        await pool.map(fetch, pending)
if __name__ == '__main__':
    # Script entry point: run main() to completion and report the elapsed time.
    start_time = datetime.datetime.now()
    print(start_time)
    # Event loop: the program runs a loop on which coroutines are registered
    # and resumed once the events they wait for occur.
    # The loop is created explicitly because asyncio.get_event_loop() with no
    # running loop is deprecated on newer Python versions, and it is closed in
    # ``finally`` so its resources are released even when main() raises.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(main())
    finally:
        loop.close()
    end_time = datetime.datetime.now()
    print('消耗时间:{}'.format(end_time - start_time))