Skip to content

Commit 96de477

Browse files
committed
implement resuming ahn downloads on retry
1 parent d695506 commit 96de477

File tree

2 files changed

+44
-3
lines changed

2 files changed

+44
-3
lines changed

packages/common/src/bag3d/common/utils/requests.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from time import sleep
44
from typing import Mapping, Union
55
from urllib.parse import urlparse, urljoin
6+
import os
67

78
import requests
89
from dagster import (
@@ -35,6 +36,7 @@ def download_file(
3536
chunk_size: int = 1024,
3637
parameters: Mapping = None,
3738
verify: bool = True,
39+
attempt_resume: bool = False,
3840
) -> Union[Path, None]:
3941
"""Download a large file and save it to disk.
4042
@@ -59,6 +61,42 @@ def download_file(
5961
session = requests.Session() # https://stackoverflow.com/a/63417213
6062

6163
try:
64+
head = session.head(url, allow_redirects=True)
65+
head.raise_for_status()
66+
remote_size = int(head.headers.get("content-length", 0))
67+
headers = {}
68+
if attempt_resume and os.path.exists(fpath):
69+
local_size = os.path.getsize(fpath)
70+
headers = {"Range": f"bytes={local_size}-"}
71+
logger.info(f"Resuming download at {local_size}/{remote_size} bytes")
72+
else:
73+
local_size = 0
74+
logger.info(f"Starting download of {remote_size} bytes")
75+
76+
with fpath.open("ab") as fd:
77+
with session.get(
78+
url, headers=headers, stream=True, verify=verify, params=parameters
79+
) as r:
80+
if local_size and r.status_code != 206:
81+
raise ValueError("Server does not support range requests")
82+
elif r.status_code not in (200, 206):
83+
r.raise_for_status()
84+
85+
bytes_written = local_size
86+
last_logged_percent = (
87+
int((bytes_written / remote_size) * 100) if remote_size else 0
88+
)
89+
for data in r.iter_content(chunk_size=chunk_size):
90+
fd.write(data)
91+
bytes_written += len(data)
92+
if remote_size:
93+
percent = int((bytes_written / remote_size) * 100)
94+
if percent > last_logged_percent and percent % 10 == 0:
95+
logger.info(
96+
f"Download progress: {percent}% ({bytes_written}/{remote_size} bytes)"
97+
)
98+
last_logged_percent = percent
99+
62100
r = session.get(url, params=parameters, stream=True, verify=verify)
63101
if r.ok:
64102
with fpath.open("wb") as fd:
@@ -69,9 +107,11 @@ def download_file(
69107
r.raise_for_status()
70108
r.close()
71109
except (
110+
requests.RequestException,
72111
requests.exceptions.BaseHTTPError,
73112
requests.exceptions.HTTPError,
74113
requests.exceptions.ChunkedEncodingError,
114+
ValueError,
75115
) as e: # pragma: no cover
76116
logger.exception(e)
77117
return None

packages/core/src/bag3d/core/assets/ahn/download.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ class LazFilesConfig(Config):
158158
required_resource_keys={"file_store"},
159159
partitions_def=partition_definition_ahn,
160160
tags={"dagster/concurrency_key": "laz_download"},
161-
pool="laz_download"
161+
pool="laz_download",
162162
)
163163
def laz_files_ahn3(context, config: LazFilesConfig, md5_ahn3, tile_index_ahn):
164164
"""AHN3 LAZ files as they are downloaded from PDOK.
@@ -213,7 +213,7 @@ def laz_files_ahn3(context, config: LazFilesConfig, md5_ahn3, tile_index_ahn):
213213
required_resource_keys={"file_store"},
214214
partitions_def=partition_definition_ahn,
215215
tags={"dagster/concurrency_key": "laz_download"},
216-
pool="laz_download"
216+
pool="laz_download",
217217
)
218218
def laz_files_ahn4(context, config: LazFilesConfig, md5_ahn4, tile_index_ahn):
219219
"""AHN4 LAZ files as they are downloaded from PDOK.
@@ -271,7 +271,7 @@ def laz_files_ahn4(context, config: LazFilesConfig, md5_ahn4, tile_index_ahn):
271271
required_resource_keys={"file_store"},
272272
partitions_def=partition_definition_ahn,
273273
tags={"dagster/concurrency_key": "laz_download"},
274-
pool="laz_download"
274+
pool="laz_download",
275275
)
276276
def laz_files_ahn5(context, config: LazFilesConfig, sha256_ahn5, tile_index_ahn):
277277
"""AHN5 LAZ files as they are downloaded from PDOK.
@@ -421,6 +421,7 @@ def download_laz(
421421
target_path=fpath,
422422
chunk_size=1024 * 1024,
423423
verify=verify_ssl,
424+
attempt_resume=True,
424425
)
425426
if fpath_download is None:
426427
# Download failed

0 commit comments

Comments
 (0)