diff --git a/fetcher/proxyFetcher.py b/fetcher/proxyFetcher.py index cfc37f928..5eb59bc43 100644 --- a/fetcher/proxyFetcher.py +++ b/fetcher/proxyFetcher.py @@ -13,7 +13,9 @@ __author__ = 'JHao' import re -import json +import urllib +import urllib.parse +from datetime import datetime from time import sleep from util.webRequest import WebRequest @@ -26,49 +28,53 @@ class ProxyFetcher(object): @staticmethod def freeProxy01(): - """ - 站大爷 https://www.zdaye.com/dayProxy.html - """ - start_url = "https://www.zdaye.com/dayProxy.html" - html_tree = WebRequest().get(start_url, verify=False).tree - latest_page_time = html_tree.xpath("//span[@class='thread_time_info']/text()")[0].strip() - from datetime import datetime - interval = datetime.now() - datetime.strptime(latest_page_time, "%Y/%m/%d %H:%M:%S") - if interval.seconds < 300: # 只采集5分钟内的更新 - target_url = "https://www.zdaye.com/" + html_tree.xpath("//h3[@class='thread_title']/a/@href")[0].strip() - while target_url: - _tree = WebRequest().get(target_url, verify=False).tree - for tr in _tree.xpath("//table//tr"): - ip = "".join(tr.xpath("./td[1]/text()")).strip() - port = "".join(tr.xpath("./td[2]/text()")).strip() - yield "%s:%s" % (ip, port) - next_page = _tree.xpath("//div[@class='page']/a[@title='下一页']/@href") - target_url = "https://www.zdaye.com/" + next_page[0].strip() if next_page else False - sleep(5) + ''' + 站大爷 + ''' + url = 'https://www.zdaye.com/free/' + + # 第一页 + r = WebRequest().get(url, timeout=10) + proxies = re.findall(r'