forked from phantom-sea-limited/Crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrxs.py
104 lines (90 loc) · 3.43 KB
/
trxs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import requests
from bs4 import BeautifulSoup
from lxml import etree
import re
def get_item(ID):
    """Fetch the page for one item ID and pass it on to ``xpath_parse``.

    The page URL is built from ``ID`` and written to the log file, the page
    is requested with a browser-like user agent, and the HTTP status code is
    printed after the module-level progress prefix ``pre``.

    Args:
        ID: Identifier inserted into the page URL (converted with ``str``).

    Returns:
        ``None`` when the response status is not 200, ``1`` when the page
        text contains "alert" (logged as "404 not found"), otherwise the
        value returned by ``xpath_parse`` for the page text.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36",
        "Host": "www.trxs123.com",
    }
    url = "https://www.trxs123.com/txt/2-" + str(ID) + "-0.html"
    log("=" * 36 + "\n" + url + "\n")
    r = requests.get(url, headers=headers, timeout=10)
    print("{} 响应状态码:{}".format(pre, r.status_code), end="", flush=True)
    if r.status_code != 200:
        # status_code is an int, so it has to be converted before it can be
        # concatenated with the newline.
        log(str(r.status_code) + "\n")
        return None
    # NOTE(review): presumably the site answers an unknown ID with a page
    # containing a JavaScript alert - confirm against the live site.
    if "alert" in r.text:
        log("\t\t\t404 not found\n")
        return 1
    return xpath_parse(r.text)
def xpath_parse(html):
    """Find the download link on a page and save the file it points to.

    Args:
        html: Page markup as text.

    Returns:
        The sanitised file name of the last link that was downloaded, or
        ``1`` (the value ``check_out`` reports as an abnormal file) when the
        page contains no matching link.
    """
    et_html = etree.HTML(html)
    # Absolute path to the second <a> inside the <h2> of the page body; the
    # element's "download" and "href" attributes are read below.
    links = et_html.xpath("/html/body/div[2]/div[2]/div[2]/h2/a[2]")
    if not links:
        # Without this guard ``filename`` would never be assigned and the
        # function would fail with an unrelated-looking UnboundLocalError.
        log("\t\t\tdownload link not found\n")
        return 1
    filename = None
    for link in links:
        attrs = link.attrib
        # Replace characters that cause problems in file names ("/", "|", "?").
        filename = (
            attrs["download"].replace("/", " ").replace("|", " ").replace("?", " ")
        )
        # str.strip treats its argument as a set of characters: this removes
        # any leading/trailing "a" and "." characters from the href.
        href = attrs["href"].strip("aa..")
        href = "https://www.trxs123.com/e/DownSys" + href
        download_file(filename, href)
    return filename
def download_file(filename, href):
    """Download ``href`` and write the response body to ``filename``.

    Args:
        filename: Path of the file to create; an existing file is overwritten.
        href: URL to fetch.

    Returns:
        ``0`` once the file has been written and the success line logged.

    Raises:
        requests.RequestException: If the request times out, the connection
            fails, or the server answers with an HTTP error status.
    """
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        # Do not pick up proxy or other settings from the environment.
        session.trust_env = False
        # Same timeout as the page request in get_item, so a stalled server
        # cannot hang the whole run.
        r = session.get(href, timeout=10)
        # Do not save an HTTP error page to disk and report it as "OK".
        r.raise_for_status()
        payload = r.content
    with open(filename, "wb") as fn:
        fn.write(payload)
    log(filename + "\tOK\n")
    return 0
def log(item):
    """Append ``item`` to ``log.log`` in the current working directory.

    Args:
        item: Text to append; it is written as-is, with no newline added.
    """
    # An explicit UTF-8 encoding keeps non-ASCII text (e.g. downloaded file
    # names) from raising UnicodeEncodeError under a narrower locale default.
    with open("log.log", "a", encoding="utf-8") as handle:
        handle.write(item)
def check_out(thing):
    """Print the outcome of one item after the progress prefix ``pre``.

    Args:
        thing: Result of ``get_item``: ``1`` or ``None`` for an item that
            could not be fetched, otherwise the name of the saved file.
    """
    # get_item returns None for a non-200 response; treating it like the
    # sentinel 1 avoids printing "None OK" for a request that failed.
    if thing is None or thing == 1:
        print(pre + " 文件异常 ", end="", flush=True)
    else:
        print("{} 文件:{} OK\t\t\t".format(pre, thing), end="", flush=True)
# Script entry point: ask for an inclusive ID range, then fetch every ID in
# it while redrawing a one-line progress bar.
if __name__ == "__main__":
    try:
        start = int(input("请输入起始点:"))
        end = int(input("请输入终点:"))
    except ValueError:
        print("请输入数字!")
    else:
        # Number of IDs in the inclusive range. Dividing by this (rather than
        # by end - start) avoids a ZeroDivisionError when start == end and
        # keeps the percentage from exceeding 100 on the last item.
        total = end - start + 1
        for i in range(start, end + 1):
            cent = int((i - start + 1) / total * 100)
            # One "#" per two percent.
            num = cent // 2
            # ``pre`` is a module-level name read by get_item and check_out;
            # the leading "\r" makes each print overwrite the same line.
            pre = "\r{}%:{}".format(cent, "#" * num)
            print("{}: 正在准备中\t\t\t\t\t".format(pre), end="", flush=True)
            try:
                thing = get_item(i)
            except UnicodeEncodeError as error:
                log(str(error) + "\nID:" + str(i) + "\n")
                print("\n文件名编码异常,请手动前往检查:ID:{}\n".format(i))
            except OSError as error:
                log(str(error) + "\nID:" + str(i) + "\n")
                print("\n疑似文件名编码异常,请手动前往检查:ID:{}\n".format(i))
            except Exception as error:
                # Top-level boundary: record the failure and carry on with
                # the next ID instead of aborting the whole run.
                log(str(error) + "\nID:" + str(i) + "\n")
                print("\n未知异常:ID:{}\n".format(i))
            else:
                check_out(thing)