@@ -422,7 +422,7 @@ def test_download_wacz_crawls(
422
422
with ZipFile (fh , "r" ) as zip_file :
423
423
contents = zip_file .namelist ()
424
424
425
- assert len (contents ) > = 2
425
+ assert len (contents ) = = 2
426
426
for filename in contents :
427
427
assert filename .endswith (".wacz" ) or filename == "datapackage.json"
428
428
assert zip_file .getinfo (filename ).compress_type == ZIP_STORED
@@ -437,6 +437,68 @@ def test_download_wacz_crawls(
437
437
assert resource ["bytes" ]
438
438
439
439
440
@pytest.mark.parametrize(
    "type_path",
    [
        # crawls endpoint
        ("crawls"),
        # all-crawls endpoint
        ("all-crawls"),
    ],
)
def test_download_wacz_crawls_as_single_wacz(
    admin_auth_headers, default_org_id, admin_crawl_id, type_path
):
    """Download a crawl via ?preferSingleWACZ=true and validate the archive layout.

    Streams the response into a temp file, then checks that the resulting
    single WACZ contains the expected top-level members (datapackage.json,
    datapackage-digest.json) plus at least one entry each under archive/,
    indexes/, pages/, and logs/, and that the datapackage lists exactly one
    resource with name == path, a hash, and a byte count.
    """
    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/{type_path}/{admin_crawl_id}/download?preferSingleWACZ=true",
            headers=admin_auth_headers,
            stream=True,
        ) as r:
            assert r.status_code == 200
            # Stream the body to disk so large WACZs aren't held in memory
            for chunk in r.iter_content():
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            # 2 datapackage files + at least one member in each of the
            # four expected directories
            assert len(contents) >= 6

            assert "datapackage.json" in contents
            assert "datapackage-digest.json" in contents

            archives_found = False
            indexes_found = False
            pages_found = False
            logs_found = False

            for filename in contents:
                if filename.startswith("archive/") and filename.endswith(".warc.gz"):
                    archives_found = True
                if filename.startswith("indexes/"):
                    indexes_found = True
                # Fixed: original called the nonexistent str.endwith(), which
                # raised AttributeError before pages_found could ever be set
                if filename.startswith("pages/") and filename.endswith(".jsonl"):
                    pages_found = True
                if filename.startswith("logs/") and filename.endswith(".log"):
                    logs_found = True

                if filename == "datapackage.json":
                    data = zip_file.read(filename).decode("utf-8")
                    datapackage = json.loads(data)
                    # Single-WACZ download: exactly one resource expected
                    assert len(datapackage["resources"]) == 1
                    for resource in datapackage["resources"]:
                        assert resource["name"] == resource["path"]
                        assert resource["hash"]
                        assert resource["bytes"]

            assert archives_found
            assert indexes_found
            assert pages_found
            assert logs_found
501
+
440
502
def test_update_crawl (
441
503
admin_auth_headers ,
442
504
default_org_id ,
0 commit comments