-
Notifications
You must be signed in to change notification settings - Fork 492
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
因为代理失效导致的重试还是使用失效的那个IP #226
Comments
上一个简单的代码看看 |
class S1688(feapder.AirSpider):
    """Spider that searches 1688.com per directory entry and scrapes offer details.

    For every entry returned by ``load_dire_list()`` a search request is
    issued; offers whose title contains the entry name are followed to their
    detail page, where the embedded ``window.__INIT_DATA`` JSON is turned
    into an ``Item`` stored under the name ``s1688``.
    """

    __custom_setting__ = {
        # "USE_SESSION": True,
        "SPIDER_THREAD_COUNT": 12,
        "PROXY_ENABLE": True,
        # "SPIDER_SLEEP_TIME": [2, 5],
        # "LOG_LEVEL": "INFO",
        "PROXY_EXTRACT_API": "http://v2.api.juliangip.com/dynamic/getips",
    }

    @staticmethod
    def _discard_proxy(request: Request) -> None:
        """Mark the proxy used by *request* as bad and detach it from the request.

        Tagging the proxy in the pool alone is not enough: the proxy stays
        cached in ``request.requests_kwargs["proxies"]``, so a retry of the
        same request object would go out through the same dead proxy.
        Removing the cached entry lets the retry obtain a new one.
        NOTE(review): assumes feapder fetches a fresh proxy from the pool
        when no "proxies" kwarg is present on the request - confirm against
        the installed feapder version.
        """
        proxies = request.requests_kwargs.pop("proxies", None)
        if proxies:
            request.proxies_pool.tag_proxy(proxies, -1)

    def download_midware(self, request: Request):
        """Attach a randomised ``cna`` cookie and a fixed user-agent to *request*.

        Returns:
            The same request object with its headers replaced.
        """
        # 24 characters drawn (with replacement) from a fixed alphabet.
        cna = ''.join(random.choices('DwftHIHbiXICAQHA8429Gdvc', k=24))
        request.headers = {
            "Cookie": f"cna={cna};",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42",
        }
        return request

    def exception_request(self, request: Request, response, e):
        """Drop the failing proxy so the retry does not reuse it.

        Args:
            request: The request that failed.
            response: The response, if any was received (unused).
            e: The exception that triggered this hook (unused).
        """
        self._discard_proxy(request)

    def start_requests(self):
        """Yield one 1688 search request per entry from ``load_dire_list()``."""
        for dire in load_dire_list():
            # The search endpoint expects the keyword percent-encoded as GBK.
            keywords = parse.quote(
                '%s %s 中药材' % (dire['dire_name'], dire['dire_location']),
                encoding='gbk',
            )
            url = f"https://s.1688.com/selloffer/offer_search.htm?keywords={keywords}&spm="
            yield feapder.Request(url=url, dire=dire, verify=False)

    def parse(self, request: Request, response: Response):
        """Extract the offer list from a search page and follow matching offers.

        Yields:
            A detail-page request for every offer whose title contains the
            entry's ``dire_name``.

        Raises:
            Exception: When the embedded JSON cannot be parsed or lacks the
                expected keys; the proxy in use is discarded first and the
                original error is chained as the cause.
        """
        dire = request.dire
        try:
            matched = re.search(
                r"window\.data\.offerresultData = successDataCheck\((.*)\)",
                response.text,
            )
            if not matched:
                return
            data = json.loads(matched.group(1))['data']
            for drug in data.get("offerList", []):
                drug_id = drug['id']
                title = drug['information']['subject']
                if dire['dire_name'] not in title:
                    continue
                # Merge the directory fields into the offer before passing it on.
                drug.update(dire)
                yield feapder.Request(
                    url=f"https://detail.1688.com/offer/{drug_id}.html",
                    callback=self.parse_detail,
                    dire=dire,
                    dire_item=drug,
                )
        except Exception as e:
            self._discard_proxy(request)
            raise Exception(f"链接:{request.url}被触发风控,无法正常获取数据,尝试重试!") from e

    def parse_detail(self, request: Request, response: Response):
        """Build an ``s1688`` item from the JSON embedded in an offer detail page.

        Yields nothing when the page has no ``window.__INIT_DATA`` payload or
        no attribute module inside the detail tab container.
        """
        dire_item = request.dire
        drug_info = request.dire_item

        matched = re.search(r"window\.__INIT_DATA=(.*)", response.text)
        if not matched:
            return
        data = json.loads(matched.group(1))
        global_data = data['globalData']

        item = Item()
        item.item_name = "s1688"
        item.table_name = item.item_name
        item.update({"tempModel": global_data['tempModel']})
        item.update({"skuInfoMap": global_data['skuModel']['skuInfoMap']})
        item.update({"skuModel": global_data['skuModel']})
        item.update({"orderParam": global_data['orderParamModel']['orderParam']})

        # The attribute module is a child of the detail tab container module.
        module = next(
            (x for x in data['modules'] if x['name'] == '@ali/tdmod-od-pc-layout-detail-tab-container'),
            None,
        )
        if module is None:
            return
        attr_param = next(
            (i for i in module.get('children', []) if i['name'] == '@ali/tdmod-od-pc-attribute-new'),
            None,
        )
        if attr_param is None:
            return

        item.update({"attrList": data['data'][attr_param['uuid']]['data']})
        item['search_key'] = dire_item['dire_name']
        item['dire_spec'] = dire_item['dire_spec']
        item['ID'] = dire_item['ID']
        item.update(drug_info)
        log.info(f"s1688 {item['ID']}-{item['tempModel']['offerTitle']}-{item['search_key']}-{item['dire_spec']}")
        yield item
能留个QQ?或者其他联系方式? |
1577134779 |
代理模块 是打算废掉重写的,现在用起来比较麻烦。你可以先自己写个代理池,等我这边封装好了你再用我这个 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
需知
升级feapder,保证feapder是最新版,若BUG仍然存在,则详细描述问题
问题
对网站进行爬虫的时候，代理池一次获取20个代理，网站针对IP有访问频率的限制，所以我们的策略是在exception_request里面去掉正在使用的代理，我们以为重试的时候会使用新的代理IP，但在实际使用的时候发现重试的代理IP还是失败的时候使用的代理。这样的话失败重试就没有意义了。所以想能够设置个自定义配置，能够决定在请求失败的时候重新从代理池里面拿IP。
截图
代码
The text was updated successfully, but these errors were encountered: