-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbook-downloader.py
87 lines (76 loc) · 2.55 KB
/
book-downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# Import libraries
import requests
import wget
import urllib
import os
import traceback
from bs4 import BeautifulSoup
# Root URL of the mirror that the crawler starts from (crawled recursively).
url = 'https://doc.lagout.org/'
def isDownloadable (response_content_type) :
    """Return True when the Content-Type header is not an HTML page.

    HTML responses are directory listings to crawl further; anything
    else is treated as a downloadable file.
    """
    return response_content_type.find('text/html') == -1
def getExtension(url):
    """Return the substring after the last '.' in *url* (naive extension).

    Deprecated: the result is wrong for URLs without a dot (returns the
    whole URL). Kept for backward compatibility.

    BUG FIX: the original used ``@DeprecationWarning`` as a decorator,
    which replaced the function with a DeprecationWarning *instance* —
    calling getExtension(...) raised TypeError. A deprecation must be
    signalled with warnings.warn while keeping the function callable.
    """
    import warnings
    warnings.warn('getExtension is deprecated', DeprecationWarning, stacklevel=2)
    return url.split('.')[-1]
def generateFolder(dirName):
    """Create directory *dirName* (and parents) if it does not exist.

    Errors are reported to stdout but never raised, so a failed mkdir
    does not abort the crawl (matches the original best-effort style).

    BUG FIX: the original checked os.path.exists() and then called
    os.makedirs() — a TOCTOU race that raises FileExistsError if the
    directory appears between the two calls (e.g. a second instance of
    the script running). exist_ok=True makes the call atomic-safe.
    """
    try:
        os.makedirs(dirName, exist_ok=True)
    except Exception:
        print('\nError creating folder...')
        traceback.print_exc()
def getDirectory(url):
    """Map *url* to a local directory path under 'Books/' and create it.

    The scheme/host parts (first three '/'-separated pieces of an
    http(s) URL) and the trailing file name are dropped; the remaining
    path segments are mirrored under 'Books'. On any parsing error the
    flat fallback 'Books' is used.

    Returns the (created) directory path, relative to the current dir.
    """
    try:
        # url.split('/')[3:-1] == path segments between host and file name.
        dirPath = '/'.join(['Books'] + url.split('/')[3:-1])
    except Exception:
        dirPath = 'Books'
        # BUG FIX: typo "cereation" in the user-facing error message.
        print("Directory name creation error...")
        traceback.print_exc()
    generateFolder(dirPath)
    return dirPath
def isFileExists(fileName, path):
    """Return True if *fileName* already exists inside directory *path*.

    Any unexpected error is printed and treated as "not present",
    so the caller simply re-downloads the file.
    """
    try:
        candidate = path + '/' + fileName
        return os.path.exists(candidate)
    except Exception:
        traceback.print_exc()
        return False
def getFileName(url):
    """Return the last '/'-separated segment of *url* (the file name)."""
    return url.rsplit('/', 1)[-1]
def download(url, path):
    """Download *url* into directory *path* via wget, unless the file
    with the same name is already there (resume-friendly crawl).

    Download errors are printed and swallowed so one bad file does not
    stop the whole crawl.
    """
    target = getFileName(url)
    try:
        if isFileExists(target, path):
            return
        wget.download(url, path)
        print('\n>> ' + path + '/' + target + ' 100% complete')
    except Exception:
        print("\nError downloading file " + url)
        traceback.print_exc()
def downloader(url):
    """Recursively crawl *url* on the mirror.

    HEAD the URL first: non-200 responses are skipped; non-HTML
    responses are files and get downloaded into a mirrored local
    directory; HTML responses are directory listings whose relative
    links are followed recursively.

    NOTE(review): assumes the server always sends a Content-Type
    header — a missing header would raise KeyError (unchanged from
    the original).
    """
    response = requests.head(url)
    if response.status_code != 200:
        return
    if isDownloadable(response.headers['Content-Type']):
        decodedURL = urllib.parse.unquote(url)
        dirPath = getDirectory(decodedURL)
        download(decodedURL, dirPath)
    else:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Parse the anchor list ONCE (the original re-ran findAll('a')
        # on every loop iteration — O(n^2) re-parsing).
        anchors = soup.findAll('a')
        # Skip the first anchor: the listing's parent-directory link.
        for anchor in anchors[1:]:
            href = anchor['href']
            # BUG FIX: the original condition used `or`, which is true
            # for EVERY href (no string contains both markers), so
            # absolute off-site URLs were recursed into. Only relative
            # links belong to this mirror and may be followed.
            if 'http://' not in href and 'https://' not in href:
                downloader(url + href)
# Entry point. Guarded so that importing this module (e.g. to reuse the
# helpers) does not immediately start a ~25 GB recursive crawl — the
# original ran these statements unconditionally at import time.
if __name__ == '__main__':
    print('\n@Author : Kawser Habib\n@Email : [email protected]\n@Book Source : doc.lagout.org\n')
    print('Total size : around 25 GB')
    print('Note : Any exceptional issues, just start the application again...\n')
    print('Downloading...')
    downloader(url)