Fofatqv2.0.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# __author__: leezp
# __date__: 2019-07-09
# Local: Win7 (python3)
# Fofatq v2.0
import requests
import urllib.parse
import base64
from bs4 import BeautifulSoup
import time
import random
import datetime
User_Agents = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
    'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
    'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
    'Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)'
]
PROXIES = [
    'http://117.85.48.16:9999',
    'http://182.34.34.213:9999',
    'http://175.42.122.3:9999',
]
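# Note: these are the sample public proxies from the original script and are
# almost certainly stale; replace them with working ones. requests expects its
# proxies argument as a dict mapping scheme to proxy URL, which is why the
# call sites below wrap the chosen entry as {'http': p, 'https': p}.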
class FofaSpider:
    # Referer, X_CSRF_Token and If_None_Match are accepted but not used
    # anywhere yet; only the Cookie (_fofapro_ars_session) is sent.
    def __init__(self, query_str, Cookie, Referer, X_CSRF_Token, If_None_Match):
        self.query_str_urlencode = urllib.parse.quote(query_str)
        query_str_qbase64 = str(base64.b64encode(query_str.encode('utf-8')), 'utf-8')
        self.query_str_qbase64_urlencode = urllib.parse.quote(query_str_qbase64)
        self.headers = {
            'Accept': 'application/ecmascript, application/x-ecmascript, */*;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'fofa.so',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': random.choice(User_Agents),  # pick a random UA per instance
            'Cookie': Cookie
        }
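
    # For reference, FOFA's result URL carries the query in the qbase64
    # parameter: the raw query string is base64-encoded, then URL-quoted.
    # A sketch of the round trip, assuming UTF-8 throughout ('app="nginx"'
    # is just a hypothetical example query):
    #   q = 'app="nginx"'
    #   urllib.parse.quote(base64.b64encode(q.encode('utf-8')).decode('utf-8'))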
    def file_put(self, text):  # parameter renamed from "str" to avoid shadowing the builtin
        with open("ip.txt", "a", encoding='utf-8') as f:
            f.write(text)
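    # Each crawled entry ends up in ip.txt as one line of the form:
    #   http://<host>:<port>, <page title>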
    def _save_results(self, all_t, all_c):
        # Walk the result blocks in index order and append "url, title"
        # lines to ip.txt.
        for count, k in enumerate(all_t):
            num = k.find_all('a')
            href = num[0].get('href')
            # keep plain-http targets only (http:// but not https://)
            if ("http" in href) and ("https" not in href):
                self.file_put(href)  # write the IP/URL
                text = all_c[count].find_all("ul", class_="list_sx1")
                title = text[0].find_all("li")[0].text.strip()  # title, whitespace stripped
                self.file_put(", " + title + "\n")

    def spider_ip(self, url, i):
        try:
            # if i % 15 == 0:  # sleep 20 seconds every 15 pages
            #     time.sleep(20)
            proxy = random.choice(PROXIES)
            response = requests.get(url=url, proxies={'http': proxy, 'https': proxy},
                                    headers=self.headers, timeout=5)
            soup = BeautifulSoup(response.content.decode('utf-8'), 'lxml')
            all_t = soup.find_all("div", class_="list_mod_t")
            all_c = soup.find_all("div", class_="list_mod_c")
            if len(all_t) != 0:  # an empty page means the session got Status Code: 429
                self._save_results(all_t, all_c)
            else:
                # After roughly 16 pages the site replies "Retry later"
                # (Status Code: 429 Too Many Requests).
                print(str(i) + " none")
                time.sleep(20)  # back off for 20 seconds
                # re-crawl the page that was missed
                try:
                    proxy = random.choice(PROXIES)
                    response = requests.get(url=url, proxies={'http': proxy, 'https': proxy},
                                            headers=self.headers, timeout=5)
                    soup = BeautifulSoup(response.content.decode('utf-8'), 'lxml')
                    all_t = soup.find_all("div", class_="list_mod_t")
                    all_c = soup.find_all("div", class_="list_mod_c")
                    if len(all_t) != 0:  # check whether the session is still valid
                        self._save_results(all_t, all_c)
                        print('Page ' + str(i) + ' re-crawled successfully on the second attempt')
                except Exception:
                    # Original author's note: this except never fires; control
                    # reaches the outer except instead (still unresolved).
                    # print('Second retry failed, page ' + str(i))
                    # time.sleep(20)
                    pass
        except requests.exceptions.ReadTimeout as E:
            print(str(i), end=' ')  # end=' ' keeps the output on one line
            print(E)
            # Save the failed page number to ipexcept.txt so those pages can
            # be re-run separately and no data is lost.
            with open("ipexcept.txt", "a", encoding='utf-8') as f:
                f.write(str(i) + "\n")
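
    # Hypothetical helper (not part of the original script): re-crawl the
    # pages recorded in ipexcept.txt. Assumes the file holds one page number
    # per line, exactly as spider_ip writes it above.
    def retry_failed_pages(self):
        try:
            with open("ipexcept.txt", "r", encoding='utf-8') as f:
                pages = [int(line) for line in f if line.strip()]
        except FileNotFoundError:
            return  # nothing recorded, nothing to retry
        for i in pages:
            url = ("https://fofa.so/result?page=" + str(i)
                   + "&qbase64=" + self.query_str_qbase64_urlencode)
            self.spider_ip(url, i)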
if __name__ == "__main__":
    # query_str: the FOFA search expression
    query_str = 'app="用友-致远OA"'
    Cookie = '_fofapro_ars_session=XXX'
    X_CSRF_Token = ''
    Referer = ''
    If_None_Match = ''
    fofaSpider = FofaSpider(query_str, Cookie, Referer, X_CSRF_Token, If_None_Match)
    # Number of pages to crawl: range(1, page) stops at page - 1,
    # so page = 2 crawls only the first result page.
    page = 15
    starttime = datetime.datetime.now()
    for i in range(1, page):
        query_url = ("https://fofa.so/result?page=" + str(i)
                     + "&qbase64=" + fofaSpider.query_str_qbase64_urlencode)  # "&q=" + fofaSpider.query_str_urlencode +
        print("Page " + str(i) + " " + query_url)
        fofaSpider.spider_ip(query_url, i)
    endtime = datetime.datetime.now()
    print('Finished at', endtime)
    print('Elapsed:', (endtime - starttime).total_seconds(), 'seconds')
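    # Usage sketch for the hypothetical retry_failed_pages helper defined
    # above: uncomment to re-crawl pages recorded in ipexcept.txt.
    # fofaSpider.retry_failed_pages()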