#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
-------------------------------------------------
Description : This file is only used to test some small features; it is not in use yet.
Author : lichunlin
Date : 2018/12/31
-------------------------------------------------
'''
import json
import time
from urllib import parse
import requests
from crawler import MyOpener as Opener


def craw_movie_id(tag, movie_queue, short_queue, comment_queue, db_queue):
    start = 0
    count = 0
    opener = Opener("[crawling movies under tag %s]" % tag)
    while True:
        # "https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags="
        # "https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=%E7%94%B5%E5%BD%B1&start=20&genres=%E5%96%9C%E5%89%A7"
        url = "http://movie.douban.com/j/search_subjects?type=movie&tag=" + tag + "&page_limit=20&page_start=" + str(start)
        url = parse.quote(url, safe='/:?=&')
        res = opener.open(url)
        if not res["result"]:
            print("Failed to crawl tag <%s>\n" % tag)
            break
        print(res["data"].text)
        movies = json.loads(res["data"].text)['subjects']
        if len(movies) == 0:
            break
        for item in movies:
            # score = float(item['rate'])
            title = item['title']
            cover = item['cover']
            movie_id = int(item["id"])
            print(title)
        count += len(movies)
        start += 20
        time.sleep(1)
    print("Tag <%s> contains %d movies in total\n" % (tag, count))


class ProxyProvider():
    def __init__(self):
        self.ipcache = set()

    def get_addr(self):
        if len(self.ipcache) == 0:
            req = requests.get("https://proxy.horocn.com/api/proxies?order_id=NWRK1621240083969632&num=10&format=json&line_separator=win").text
            req_json = json.loads(req)
            print(req_json)
            for item in req_json:
                ip = item['host'] + ":" + str(item['port'])
                print(ip)
                self.ipcache.add(ip)
            print("Fetched 10 fresh proxies")
        return self.ipcache.pop()

    def delete_addr(self, ip):
        req = requests.get("http://39.108.123.85:8000/delete?ip=%s" % ip).content
        print(req)
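
# A minimal usage sketch for ProxyProvider, assuming the proxy API above is
# still reachable and returns a JSON list of objects with 'host' and 'port':
#
# provider = ProxyProvider()
# addr = provider.get_addr()                 # e.g. "1.2.3.4:8080"
# provider.delete_addr(addr.split(":")[0])   # report a dead proxy by host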


class MyOpener():
    def __init__(self, request_session):
        self.session = request_session
        self.proxy_provider = ProxyProvider()
        self.proxyaddr = self.proxy_provider.get_addr()
        self.proxy = {
            'http': 'http://' + self.proxyaddr,
            'https': 'https://' + self.proxyaddr,
        }

    def open(self, req):
        print("Current proxy: %s" % self.proxyaddr)
        retrynum = 0
        response = None
        while True:
            try:
                response = self.session.get(req, timeout=5)  # proxies=self.proxy,
                break  # request succeeded, stop retrying
            except requests.exceptions.RequestException as e:
                retrynum = retrynum + 1
                print("Request failed <%s>" % str(e))
                self.update_proxy()
                if retrynum > 5:
                    response = None
                    break
        if response is None:
            return False
        if response.status_code != 200:
            print("Status code is not 200")
            self.update_proxy()
        return response

    def update_proxy(self):
        self.proxy_provider.delete_addr(self.proxyaddr.split(":")[0])
        print("Updating proxy... %s" % self.proxyaddr.split(":")[0])
        self.proxyaddr = self.proxy_provider.get_addr()
        self.proxy = {
            'http': 'http://' + self.proxyaddr,
            'https': 'https://' + self.proxyaddr,
        }
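
# A minimal usage sketch for MyOpener, assuming `headers` is a dict of request
# headers defined elsewhere (proxying stays disabled, as in open() above):
#
# s = requests.Session()
# s.headers.update(headers)
# opener = MyOpener(s)
# response = opener.open("https://www.baidu.com")
# if response:
#     print(response.status_code)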
# opener = MyOpener()
#
# crawer = ShortComment_Crawer("大象席地而坐", 27172891, opener, 17892, None)
# crawer.craw()
# s = requests.Session()
# s.headers.update(headers)
# openner = MyOpener(s)
#
#
#
#opener = MyOpener(s)
# crawer = ShortComment_Crawer("大象席地而坐", 27172891, 17892, None, request_session=s)
# crawer.craw()
#craw_comment_list(27605698,"西红柿首富", 3300, None)
#craw_movie_id("犯罪", None, None, None, None)
#response = opener.open("https://www.baidu.com")
# print(response.status_code)
# print(response.text)
#
# import queue
#
# q = queue.Queue(20)
# for i in range(10):
# q.put(20, block=False)
# print(q.qsize())
# q.get(block=False)
# print(q.qsize())
#
# import sys
# from importlib import reload
#
# print(sys.getdefaultencoding())
# reload(sys)
# sys.setdefaultencoding('utf8')
from crawler.movie import Moview_Crawer