Commit cdbddd4

Add test for preferSingleWACZ arg
1 parent 9970624
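
The new test exercises the preferSingleWACZ=true query parameter on the crawl download endpoint: instead of the default response checked in test_download_wacz_crawls (a zip holding WACZ files plus a datapackage.json), the backend returns a single WACZ whose internal layout the test then verifies. As a minimal client-side sketch of the same call, assuming a reachable Browsertrix API and valid auth headers (API_PREFIX and the output path are illustrative; the test also covers the equivalent all-crawls path):

    import requests

    API_PREFIX = "http://localhost:30870/api"  # assumption: local deployment

    def download_single_wacz(org_id, crawl_id, headers, out_path="crawl.wacz"):
        # preferSingleWACZ=true asks for one merged WACZ rather than a
        # zip of per-crawl WACZ files
        url = (
            f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}"
            "/download?preferSingleWACZ=true"
        )
        with requests.get(url, headers=headers, stream=True) as r:
            r.raise_for_status()
            with open(out_path, "wb") as fh:
                for chunk in r.iter_content(chunk_size=65536):
                    fh.write(chunk)
        return out_path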

1 file changed (+63 −1)


backend/test/test_run_crawl.py

Lines changed: 63 additions & 1 deletion
@@ -422,7 +422,7 @@ def test_download_wacz_crawls(
         with ZipFile(fh, "r") as zip_file:
             contents = zip_file.namelist()

-            assert len(contents) >= 2
+            assert len(contents) == 2
             for filename in contents:
                 assert filename.endswith(".wacz") or filename == "datapackage.json"
                 assert zip_file.getinfo(filename).compress_type == ZIP_STORED
@@ -437,6 +437,68 @@ def test_download_wacz_crawls(
                         assert resource["bytes"]


+@pytest.mark.parametrize(
+    "type_path",
+    [
+        # crawls endpoint
+        ("crawls"),
+        # all-crawls endpoint
+        ("all-crawls"),
+    ],
+)
+def test_download_wacz_crawls_as_single_wacz(
+    admin_auth_headers, default_org_id, admin_crawl_id, type_path
+):
+    with TemporaryFile() as fh:
+        with requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/{type_path}/{admin_crawl_id}/download?preferSingleWACZ=true",
+            headers=admin_auth_headers,
+            stream=True,
+        ) as r:
+            assert r.status_code == 200
+            for chunk in r.iter_content():
+                fh.write(chunk)
+
+        fh.seek(0)
+        with ZipFile(fh, "r") as zip_file:
+            contents = zip_file.namelist()
+
+            assert len(contents) >= 6
+
+            assert "datapackage.json" in contents
+            assert "datapackage-digest.json" in contents
+
+            archives_found = False
+            indexes_found = False
+            pages_found = False
+            logs_found = False
+
+            for filename in contents:
+                print(filename)
+                if filename.startswith("archive/") and filename.endswith(".warc.gz"):
+                    archives_found = True
+                if filename.startswith("indexes/"):
+                    indexes_found = True
+                if filename.startswith("pages/") and filename.endswith(".jsonl"):
+                    pages_found = True
+                if filename.startswith("logs/") and filename.endswith(".log"):
+                    logs_found = True
+
+                if filename == "datapackage.json":
+                    data = zip_file.read(filename).decode("utf-8")
+                    datapackage = json.loads(data)
+                    assert len(datapackage["resources"]) == 1
+                    for resource in datapackage["resources"]:
+                        assert resource["name"] == resource["path"]
+                        assert resource["hash"]
+                        assert resource["bytes"]
+
+            assert archives_found
+            assert indexes_found
+            assert pages_found
+            assert logs_found
+
+
 def test_update_crawl(
     admin_auth_headers,
     default_org_id,