1
+ from multiprocessing .dummy import Value
1
2
from datetime import datetime
2
3
from pathlib import Path
3
4
from time import sleep
4
5
from typing import Mapping , Union
5
6
from urllib .parse import urlparse , urljoin
7
+ import os
6
8
7
9
import requests
8
10
from dagster import (
@@ -35,6 +37,7 @@ def download_file(
35
37
chunk_size : int = 1024 ,
36
38
parameters : Mapping = None ,
37
39
verify : bool = True ,
40
+ attempt_resume : bool = False ,
38
41
) -> Union [Path , None ]:
39
42
"""Download a large file and save it to disk.
40
43
@@ -59,6 +62,42 @@ def download_file(
59
62
session = requests .Session () # https://stackoverflow.com/a/63417213
60
63
61
64
try :
65
+ head = session .head (url , allow_redirects = True )
66
+ head .raise_for_status ()
67
+ remote_size = int (head .headers .get ("content-length" , 0 ))
68
+ headers = {}
69
+ if attempt_resume and os .path .exists (fpath ):
70
+ local_size = os .path .getsize (fpath )
71
+ headers = {"Range" : f"bytes={ local_size } -" }
72
+ logger .info (f"Resuming download at { local_size } /{ remote_size } bytes" )
73
+ else :
74
+ local_size = 0
75
+ logger .info (f"Starting download of { remote_size } bytes" )
76
+
77
+ with fpath .open ("ab" ) as fd :
78
+ with session .get (
79
+ url , headers = headers , stream = True , verify = verify , params = parameters
80
+ ) as r :
81
+ if local_size and r .status_code != 206 :
82
+ raise ValueError ("Server does not support range requests" )
83
+ elif r .status_code not in (200 , 206 ):
84
+ r .raise_for_status ()
85
+
86
+ bytes_written = local_size
87
+ last_logged_percent = (
88
+ int ((bytes_written / remote_size ) * 100 ) if remote_size else 0
89
+ )
90
+ for data in r .iter_content (chunk_size = chunk_size ):
91
+ fd .write (data )
92
+ bytes_written += len (data )
93
+ if remote_size :
94
+ percent = int ((bytes_written / remote_size ) * 100 )
95
+ if percent > last_logged_percent and percent % 10 == 0 :
96
+ logger .info (
97
+ f"Download progress: { percent } % ({ bytes_written } /{ remote_size } bytes)"
98
+ )
99
+ last_logged_percent = percent
100
+
62
101
r = session .get (url , params = parameters , stream = True , verify = verify )
63
102
if r .ok :
64
103
with fpath .open ("wb" ) as fd :
@@ -69,9 +108,11 @@ def download_file(
69
108
r .raise_for_status ()
70
109
r .close ()
71
110
except (
111
+ requests .RequestException ,
72
112
requests .exceptions .BaseHTTPError ,
73
113
requests .exceptions .HTTPError ,
74
114
requests .exceptions .ChunkedEncodingError ,
115
+ ValueError ,
75
116
) as e : # pragma: no cover
76
117
logger .exception (e )
77
118
return None
0 commit comments