forked from liushaoweihua/SpringerDownloader
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspringer_downloader.py
71 lines (62 loc) · 3.01 KB
/
springer_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import time
import codecs
import requests
import pandas as pd
class SpringerDownloader:
    """Batch-download the Springer textbooks listed in an Excel catalogue.

    Every catalogue row supplies a title, an author, a subject classification
    and an OpenURL.  Each book is saved as ``<title>_<author>.pdf`` inside a
    directory named after the first entry of its subject classification,
    located under the working directory that was current when the instance
    was created.
    """

    # Seconds to wait for the server before giving up.  ``requests`` applies
    # no timeout by default, so one stalled connection would hang the batch.
    REQUEST_TIMEOUT = 30

    # Bytes requested per streamed chunk; each chunk triggers one file write
    # and one progress-line refresh.
    CHUNK_SIZE = 8192

    def __init__(self, excel_path="free_english_textbook.xlsx"):
        """Load the book catalogue.

        Args:
            excel_path: Path of the Excel sheet to read.  It must contain the
                columns "Book Title", "Author", "Subject Classification" and
                "OpenURL".  Defaults to the file name the script has always
                used, resolved against the current working directory.
        """
        columns = ["Book Title", "Author", "Subject Classification", "OpenURL"]
        rows = pd.read_excel(excel_path)[columns].values.tolist()
        # Only the first ";"-separated classification is kept; it becomes the
        # name of the directory the book is stored in.
        self.df = [
            [title, author, subject.split(";")[0], open_url]
            for title, author, subject, open_url in rows
        ]
        self.current_path = os.getcwd()

    def wget(self, url, save_path=None, rename=None):
        """Stream ``url`` to disk while printing a progress bar.

        Nothing is written when the server answers with a status other than
        200, when it does not report a ``content-length``, or when a file of
        at least the reported size already exists at the destination.  Each
        of those cases prints a message explaining why.

        Args:
            url: Address of the file to download.
            save_path: Destination directory; defaults to ``self.current_path``.
            rename: Destination file name; defaults to the last path segment
                of ``url``.
        """
        file_name = url[url.rfind("/") + 1:]
        target_dir = save_path or self.current_path
        target = os.path.abspath(os.path.join(target_dir, rename or file_name))
        print("[wget] downloading from {}".format(url))
        start = time.time()
        # The context manager releases the streamed connection on every exit
        # path, including the early returns below.
        with requests.get(url, stream=True, timeout=self.REQUEST_TIMEOUT) as response:
            if response.status_code != 200:
                print("[wget] failed to download from {}, this book is not free.".format(url))
                return
            length_header = response.headers.get("content-length")
            if length_header is None:
                # Without a size the existing-file check cannot run, so the
                # download is skipped, but no longer silently.
                print("[wget] skipped {}: server sent no content-length.".format(url))
                return
            content_size = int(length_header)
            if self.check_if_exists(target, content_size):
                print("[wget] already downloaded: %s\n" % target)
                return
            print("[wget] file size: %.2f MB" % (content_size / 1024 / 1024))
            size = 0
            with open(target, "wb") as f:
                for data in response.iter_content(chunk_size=self.CHUNK_SIZE):
                    f.write(data)
                    size += len(data)
                    if content_size > 0:  # guard the percentage against /0
                        print("\r" + "[wget] %s%.2f%%"
                              % (">" * int(size * 50 / content_size),
                                 size / content_size * 100), end="")
        end = time.time()
        print("\n" + "[wget] complete! cost: %.2fs." % (end - start))
        print("[wget] save at: %s\n" % target)

    def mkdir(self, file_dir):
        """Create ``file_dir`` and any missing parent directories.

        Args:
            file_dir: Directory to create; a relative path is resolved against
                ``self.current_path``.  An already existing directory is not
                an error.
        """
        file_dir = os.path.abspath(os.path.join(self.current_path, file_dir))
        os.makedirs(file_dir, exist_ok=True)
        print("[mkdir] create directory {}".format(file_dir))

    def check_if_exists(self, file_path, content_size):
        """Tell whether ``file_path`` already holds a complete download.

        Args:
            file_path: Path of the candidate file.
            content_size: Expected size in bytes.

        Returns:
            True when the path exists and is at least ``content_size`` bytes
            long; False when it is missing or smaller (for example a partial
            download left behind by an interrupted run).
        """
        try:
            return os.path.getsize(file_path) >= content_size
        except OSError:
            # Missing or unreadable path: treat as "not downloaded yet".
            return False

    def download(self):
        """Download every book of the catalogue into its subject directory.

        A network error on one book is reported and the loop moves on to the
        next one, so a single failure does not abort the whole batch.
        """
        total = len(self.df)
        for position, (title, author, subject, open_url) in enumerate(self.df, start=1):
            print("Downloading 【{}】. \nCurrent #books: {}, total #books: {}".format(title, position, total))
            # format() also copes with non-string cells (e.g. an empty Author
            # read as NaN), which plain "+" concatenation would reject.
            file_name = "{}_{}.pdf".format(title, author).replace("/", "_")
            classification_dir = os.path.abspath(os.path.join(self.current_path, subject))
            if not os.path.exists(classification_dir):
                self.mkdir(classification_dir)
            try:
                # The OpenURL redirects to the book page; the last segment of
                # the final URL is used to build the PDF address.
                resolved = requests.get(open_url, timeout=self.REQUEST_TIMEOUT).url
                file_url = "https://link.springer.com/content/pdf/" + resolved.split("/")[-1] + ".pdf"
                self.wget(file_url, save_path=classification_dir, rename=file_name)
            except requests.RequestException as error:
                print("[download] failed to fetch 【{}】: {}".format(title, error))
if __name__ == "__main__":
    # Script entry point: load the catalogue, then fetch every listed book.
    downloader = SpringerDownloader()
    downloader.download()