|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
1 | 3 | # Copyright (c) 2022 Saul Pwanson
|
2 | 4 | #
|
3 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
18 | 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
19 | 21 | # SOFTWARE.
|
20 | 22 |
|
| 23 | +""" |
| 24 | +usage: unzip-http [-h] [-l] [-f] [-o] url [files ...] |
| 25 | +
|
| 26 | +Extract individual files from .zip files over http without downloading the |
| 27 | +entire archive. HTTP server must send `Accept-Ranges: bytes` and |
| 28 | +`Content-Length` in headers. |
| 29 | +
|
| 30 | +positional arguments: |
| 31 | + url URL of the remote zip file |
| 32 | + files Files to extract. If no filenames given, displays .zip |
| 33 | + contents (filenames and sizes). Each filename can be a |
| 34 | + wildcard glob. |
| 35 | +
|
| 36 | +options: |
| 37 | + -h, --help show this help message and exit |
| 38 | + -l, --list List files in the remote zip file |
| 39 | + -f, --full-filepaths Recreate folder structure from zip file when extracting |
| 40 | + (instead of extracting the files to the current |
| 41 | + directory) |
| 42 | + -o, --stdout Write files to stdout (if multiple files: concatenate |
| 43 | + them to stdout, in zipfile order) |
| 44 | +""" |
| 45 | + |
21 | 46 | import sys
|
22 | 47 | import os
|
23 | 48 | import io
|
| 49 | +import math |
| 50 | +import time |
24 | 51 | import zlib
|
25 | 52 | import struct
|
26 | 53 | import fnmatch
|
| 54 | +import argparse |
27 | 55 | import pathlib
|
28 | 56 | import urllib.parse
|
29 | 57 |
|
30 | 58 |
|
31 |
| -__version__ = '0.5.1' |
| 59 | +__version__ = '0.6' |
32 | 60 |
|
33 | 61 |
|
34 | 62 | def error(s):
|
@@ -263,3 +291,95 @@ def read(self, n):
|
263 | 291 | self._buffer = self._buffer[n:]
|
264 | 292 |
|
265 | 293 | return ret
|
| 294 | + |
| 295 | + |
| 296 | + ### script start |
| 297 | + |
class StreamProgress:
    """Wrap a readable stream and report read progress on stderr.

    Every ``read()`` is forwarded to the wrapped stream; the number of bytes
    returned is accumulated in ``amtread`` and, at most about ten times per
    second, a one-line status (elapsed seconds, MB read / MB total, MB/s and
    the stream name) is rewritten in place on stderr.

    Attributes:
        name: label shown at the end of the status line.
        fp: the wrapped object; only its ``read(n)`` method is used.
        total: expected total size in bytes, used only for display.
        start_time: ``time.time()`` at construction.
        last_update: ``time.time()`` of the last status line (0 = never).
        amtread: number of bytes returned by ``read()`` so far.
    """

    def __init__(self, fp, name='', total=0):
        self.name = name
        self.fp = fp
        self.total = total
        self.start_time = time.time()
        self.last_update = 0
        self.amtread = 0

    def read(self, n):
        """Read up to *n* bytes from the wrapped stream and update progress.

        Returns whatever ``fp.read(n)`` returned.  When that result is empty
        (end of stream) a newline is written to stderr so that later output
        does not overwrite the final status line.
        """
        r = self.fp.read(n)
        self.amtread += len(r)
        now = time.time()

        # Throttle terminal updates to roughly one every 0.1 s.
        if now - self.last_update > 0.1:
            self.last_update = now

            elapsed_s = now - self.start_time
            mb_read = self.amtread / 10**6
            # The first update can happen within the clock's resolution of
            # construction (or the wall clock may step backwards), so guard
            # the rate computation against a zero or negative elapsed time.
            rate = mb_read / elapsed_s if elapsed_s > 0 else 0.0
            sys.stderr.write(f'\r{elapsed_s:.0f}s {mb_read:.02f}/{self.total/10**6:.02f}MB ({rate:.02f} MB/s) {self.name}')

        if not r:
            sys.stderr.write('\n')

        return r
| 321 | + |
| 322 | + |
def list_files(rzf):
    """Print the archive listing to stderr, one member per line.

    Each line has the form ``<compressed size> -> <file size>\\t<filename>``
    with both size columns right-aligned to the widest value in that column.

    Args:
        rzf: object whose ``infolist()`` returns entries exposing
            ``compress_size``, ``file_size`` and ``filename``.

    An archive with no entries prints nothing.
    """
    infos = list(rzf.infolist())
    if not infos:
        # max() over an empty sequence would raise ValueError.
        return

    # Column width = number of characters in the decimal rendering.  A
    # ceil(log10(x)) estimate is one short for exact powers of ten
    # (10, 100, ...), which breaks the alignment.
    digits_compr = max(len(str(f.compress_size)) for f in infos)
    digits_plain = max(len(str(f.file_size)) for f in infos)

    fmtstr = f'%{digits_compr}d -> %{digits_plain}d\t%s'
    for f in infos:
        print(fmtstr % (f.compress_size, f.file_size, f.filename), file=sys.stderr)
| 332 | + |
| 333 | + |
def extract_one(outfile, rzf, f, ofname):
    """Copy one archive member into *outfile*, reporting progress on stderr.

    Args:
        outfile: destination object; only ``write(bytes)`` is called on it.
        rzf: archive object; ``rzf.open(f)`` supplies the readable stream.
        f: member entry; its ``filename`` and ``compress_size`` are read.
        ofname: destination label used in the announcement line only.
    """
    print(f'Extracting {f.filename} to {ofname}...', file=sys.stderr)

    chunk_size = 2**18  # 256 KiB per read
    # NOTE(review): the progress total is the member's compressed size, while
    # the bytes counted are whatever rzf.open(f) yields -- confirm these are
    # the same unit.
    source = StreamProgress(rzf.open(f), name=f.filename, total=f.compress_size)
    while True:
        chunk = source.read(chunk_size)
        if not chunk:
            break
        outfile.write(chunk)
| 340 | + |
| 341 | + |
def download_file(f, rzf, args):
    """Extract archive member *f* if it matches one of the requested globs.

    Args:
        f: member entry; its ``filename`` is matched and used for the output
            path.
        rzf: archive object, passed through to ``extract_one``.
        args: parsed options; reads ``files`` (glob patterns), ``stdout``
            and ``full_filepaths``.

    With ``args.stdout`` the member is written to ``sys.stdout.buffer``.
    Otherwise it is written below the current directory: under its full
    archive path when ``args.full_filepaths`` is set, else under its final
    path component only.  Members matching no pattern are ignored.
    """
    if not any(fnmatch.fnmatch(f.filename, g) for g in args.files):
        return

    if args.stdout:
        extract_one(sys.stdout.buffer, rzf, f, "stdout")
        return

    # Member names come from the remote archive and are untrusted: drop any
    # root/drive anchor and '..' components so that nothing can be written
    # outside the current directory.
    raw = pathlib.Path(f.filename)
    parts = [p for p in raw.parts if p not in (raw.anchor, '..', '.')]
    if not parts:
        return

    # Zip directory entries are named with a trailing slash and carry no
    # data; opening them as a regular file would create a file where a
    # directory is needed by later members.
    if f.filename.endswith('/'):
        if args.full_filepaths:
            pathlib.Path(*parts).mkdir(parents=True, exist_ok=True)
        return

    if args.full_filepaths:
        path = pathlib.Path(*parts)
        path.parent.mkdir(parents=True, exist_ok=True)
    else:
        path = pathlib.Path(parts[-1])

    with open(str(path), 'wb') as of:
        extract_one(of, rzf, f, str(path))
| 357 | + |
| 358 | + |
def main():
    """Command-line entry point.

    Parses the options and positionals, opens the remote archive at the
    given URL, then either lists its contents (``-l`` given, or no file
    patterns) or passes every archive entry to ``download_file``.
    """
    description = (
        "Extract individual files from .zip files over http without downloading the entire archive. "
        "HTTP server must send `Accept-Ranges: bytes` and `Content-Length` in headers."
    )
    parser = argparse.ArgumentParser(prog='unzip-http', description=description)

    parser.add_argument(
        '-l', '--list', action='store_true', default=False,
        help="List files in the remote zip file",
    )
    parser.add_argument(
        '-f', '--full-filepaths', action='store_true', default=False,
        help="Recreate folder structure from zip file when extracting (instead of extracting the files to the current directory)",
    )
    parser.add_argument(
        '-o', '--stdout', action='store_true', default=False,
        help="Write files to stdout (if multiple files: concatenate them to stdout, in zipfile order)",
    )

    parser.add_argument("url", nargs=1, help="URL of the remote zip file")
    parser.add_argument(
        "files", nargs='*',
        help="Files to extract. If no filenames given, displays .zip contents (filenames and sizes). Each filename can be a wildcard glob.",
    )

    args = parser.parse_args()

    # nargs=1 makes args.url a one-element list.
    remote = RemoteZipFile(args.url[0])

    if args.list or not args.files:
        list_files(remote)
        return

    for entry in remote.infolist():
        download_file(entry, remote, args)


# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()
0 commit comments