_download_datasets.py
# coding: utf-8
"""
This script downloads all raw/processed data needed to re-run most of the analyses.
Note that downloading everything serially, as done here, may be impractical,
so treat this script primarily as a reference for which files should be placed where.
You will need to create a new access token on Zenodo
(https://zenodo.org/account/settings/applications/tokens/new/) and add it to a
file "~/.zenodo.auth.json" as a simple key-value pair, e.g.:
{"access_token": "123asd123asd123asd123asd123asd"}
Alternatively, feel free to download the datasets directly from Zenodo:
- IMC dataset, raw MCD files: https://zenodo.org/record/4110560
- IMC dataset, cell masks: https://zenodo.org/record/4139443
- IMC activation panel dataset, MCD and masks: https://zenodo.org/record/4637034
- IHC dataset: https://zenodo.org/record/4633906
- GeoMx dataset: https://zenodo.org/record/4635286
"""
import sys, json, argparse
import hashlib
from typing import Dict, Any

import requests

from imc.types import Path

paths = [
Path("metadata"),
Path("data"),
Path("data") / "ihc",
Path("data") / "geomx",
Path("processed"),
Path("results"),
]
# Ensure the expected directory layout exists.
for path in paths:
    path.mkdir(parents=True, exist_ok=True)

api_root = "https://zenodo.org/api/"
secrets_file = Path("~/.zenodo.auth.json").expanduser()
with open(secrets_file) as handle:
    secrets = json.load(handle)
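# Zenodo authenticates API calls with an `access_token` query parameter,
# so the token is attached to every request below via `params`.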
kws = dict(params=secrets)


def main() -> int:
parser = argparse.ArgumentParser()
parser.add_argument("--overwrite", action="store_true")
args = parser.parse_args()
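    # Typical invocation: `python _download_datasets.py`; passing
    # --overwrite re-downloads files even if they already exist locally.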
    # Zenodo deposit IDs for each dataset, matching the records listed
    # in the module docstring.
    deps = {
        "imc_raw": 4110560,
        "imc_proc": 4139443,
        "imc_activation": 4637034,
        "ihc": 4633906,
        "geomx": 4635286,
    }
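    # Each deposit's file listing provides a relative path, a download link,
    # and a checksum, which is compared below against a locally computed
    # MD5 digest.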
for name, dep_id in deps.items():
print(f"Downloading '{name}' dataset.")
dep = get(dep_id)
for file in dep["files"]:
output_file = Path(file["filename"])
            output_file.parent.mkdir(parents=True, exist_ok=True)
if output_file.exists() and not args.overwrite:
print(f"File '{output_file}' already exists, skipping.")
continue
print(f"Downloading '{output_file}'")
while True:
download_file(file["links"]["download"], output_file)
if get_checksum(output_file) == file["checksum"]:
break
print("Checksum failed to match. Re-trying download.")
return 0


def get(deposit_id: int) -> Dict[str, Any]:
    """Fetch a Zenodo deposit's metadata, including its file listing."""
    return requests.get(
        f"{api_root}deposit/depositions/{deposit_id}", **kws
    ).json()


def download_file(url: str, output_file: Path) -> None:
    """Stream a remote file to disk in 8 KB chunks."""
    with requests.get(url, stream=True, **kws) as r:
        r.raise_for_status()
        with open(output_file, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)


def get_checksum(filename: Path, algo: str = "md5") -> str:
    """Return the hexadecimal checksum of the file's contents."""
    with open(filename, "rb") as f:
        file_hash = getattr(hashlib, algo)()
        # Assignment expression (walrus operator) requires Python 3.8+.
        while chunk := f.read(8192):
            file_hash.update(chunk)
    return file_hash.hexdigest()


if __name__ == "__main__":
    try:
        sys.exit(main())
    except KeyboardInterrupt:
        print("\t - Exiting due to user interruption.")
        sys.exit(1)