#!/usr/bin/env python

"""
This script was used to upload data to Zenodo following the revision of the manuscript.
It updates the metadata of the previous Zenodo submissions and
uploads an IHC dataset, a GeoMx dataset, and the IMC activation panel.
"""
import sys
import json
import hashlib
import time
from typing import Dict, Any

import requests
import pandas as pd
from tqdm import tqdm

from imc.types import Path

# Personal access token for the Zenodo API, read from a local file
secrets_file = Path("~/.zenodo.auth.json").expanduser()
secrets = json.load(open(secrets_file))

# Local records of the Zenodo depositions created by this script
zenodo_ihc_json = Path("zenodo.deposition.ihc.json")
zenodo_geomx_json = Path("zenodo.deposition.geomx.json")
zenodo_imc_activation_json = Path("zenodo.deposition.imc_activation.json")

api_root = "https://zenodo.org/api/"
headers = {"Content-Type": "application/json"}
# The token is passed as a query parameter on every request
kws = dict(params=secrets)
title = "The spatial landscape of lung pathology during COVID-19 progression"
abstract = "Recent studies have provided insights into the pathology and immune response to coronavirus disease 2019 (COVID-19). However thorough interrogation of the interplay between infected cells and the immune system at sites of infection is lacking. We use high parameter imaging mass cytometry9 targeting the expression of 36 proteins, to investigate at single cell resolution, the cellular composition and spatial architecture of human acute lung injury including SARS-CoV-2. This spatially resolved, single-cell data unravels the disordered structure of the infected and injured lung alongside the distribution of extensive immune infiltration. Neutrophil and macrophage infiltration are hallmarks of bacterial pneumonia and COVID-19, respectively. We provide evidence that SARS-CoV-2 infects predominantly alveolar epithelial cells and induces a localized hyper-inflammatory cell state associated with lung damage. By leveraging the temporal range of COVID-19 severe fatal disease in relation to the time of symptom onset, we observe increased macrophage extravasation, mesenchymal cells, and fibroblasts abundance concomitant with increased proximity between these cell types as the disease progresses, possibly as an attempt to repair the damaged lung tissue. This spatially resolved single-cell data allowed us to develop a biologically interpretable landscape of lung pathology from a structural, immunological and clinical standpoint. This spatial single-cell landscape enabled the pathophysiological characterization of the human lung from its macroscopic presentation to the single-cell, providing an important basis for the understanding of COVID-19, and lung pathology in general."


def main() -> int:
    # Test the connection and credentials before doing anything else
    req = requests.get(api_root + "deposit/depositions", **kws)
    assert req.ok

    update_metadata()
    upload_ihc()
    # NOTE: upload_geomx() is defined below but is not called here.
    upload_imc_activation()
    return 0


def update_metadata() -> None:
    """Update title, abstract, and authors of the previously published depositions."""
    global dep  # the helpers `get` and `put` read this module-level state

    # Existing depositions for the raw and processed data
    deps = [("raw data", {"id": 4110560}), ("processed data", {"id": 4139443})]
    for name, dep in deps:
        # Refresh the deposition from the API
        dep = get()

        # Title
        if dep["metadata"]["title"] != f"{title} - {name}":
            dep["metadata"]["title"] = f"{title} - {name}"
            put(dep)

        # Abstract
        if dep["metadata"]["description"] != abstract:
            dep["metadata"]["description"] = abstract
            put(dep)

        # Authors
        authors_meta = pd.read_csv("metadata/authors.csv")
        if len(dep["metadata"]["creators"]) != authors_meta.shape[0]:
            authors = authors_meta[["name", "affiliation", "orcid"]].T.to_dict()
            # Drop missing fields (e.g. authors without an ORCID)
            authors = [
                {k2: v2 for k2, v2 in v.items() if not pd.isnull(v2)}
                for k, v in authors.items()
            ]
            dep["metadata"]["creators"] = authors
            put(dep)
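
# For reference, each entry in `creators` ends up shaped like the following
# (illustrative values, not taken from the actual authors file):
#   {"name": "Doe, Jane", "affiliation": "Some University", "orcid": "0000-0000-0000-0000"}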


def upload_ihc() -> None:
    """Create or resume the IHC deposition and upload its files."""
    global dep, bucket_url

    # Get a new deposition or load the existing one
    if not zenodo_ihc_json.exists():
        req = requests.post(
            api_root + "deposit/depositions",
            json={},
            **kws,
        )
        json.dump(req.json(), open(zenodo_ihc_json, "w"))
    dep = json.load(open(zenodo_ihc_json, "r"))
    # Refresh the deposition from the API
    dep = get()  # {"id": 4633905}

    # Add metadata
    authors_meta = pd.read_csv("metadata/authors.csv")
    dep["metadata"] = json.load(open("metadata/zenodo_metadata.ihc.json"))[
        "metadata"
    ]
    authors = authors_meta[["name", "affiliation", "orcid"]].T.to_dict()
    authors = [
        {k2: v2 for k2, v2 in v.items() if not pd.isnull(v2)}
        for k, v in authors.items()
    ]
    dep["metadata"]["creators"] = authors
    put(dep)

    # Upload files
    bucket_url = dep["links"]["bucket"] + "/"
    # 'https://zenodo.org/api/files/6a4ac068-d7e5-419c-9e05-77bf99ad780a/'

    # Upload OME-TIFF files and masks (only the MPO and CD163 stainings),
    # retrying on dropped connections until every file is in the bucket
    data_dir = Path("data")
    ihc_files = sorted(map(str, (data_dir / "ihc").glob("*/*.tif*")))
    ihc_files = [f for f in ihc_files if "MPO" in f or "cd163" in f]
    finished = False
    while not finished:
        # Skip files already uploaded in a previous attempt
        dep = get()
        uploaded_files = [x["filename"] for x in dep["files"]]
        ihc_files = [x for x in ihc_files if x not in uploaded_files]
        if not ihc_files:
            break
        for file in tqdm(ihc_files):
            try:
                upload(file)
                if file == ihc_files[-1]:
                    finished = True
            except requests.exceptions.ConnectionError:
                pass
            time.sleep(5)

    # Upload metadata
    upload("metadata/ihc_metadata.csv")
    # Upload quantification
    upload("data/ihc/quantification_hdab.gated_by_image.csv")


def upload_geomx() -> None:
    """Create or resume the GeoMx deposition and upload its files."""
    global dep, bucket_url

    # Get a new deposition or load the existing one
    if not zenodo_geomx_json.exists():
        req = requests.post(
            api_root + "deposit/depositions",
            json={},
            **kws,
        )
        json.dump(req.json(), open(zenodo_geomx_json, "w"))
    dep = json.load(open(zenodo_geomx_json, "r"))
    # Refresh the deposition from the API
    dep = get()  # {"id": 4635286}

    # Add metadata
    authors_meta = pd.read_csv("metadata/authors.csv")
    dep["metadata"] = json.load(open("metadata/zenodo_metadata.geomx.json"))[
        "metadata"
    ]
    authors = authors_meta[["name", "affiliation", "orcid"]].T.to_dict()
    authors = [
        {k2: v2 for k2, v2 in v.items() if not pd.isnull(v2)}
        for k, v in authors.items()
    ]
    dep["metadata"]["creators"] = authors
    put(dep)

    # Upload files
    bucket_url = dep["links"]["bucket"] + "/"
    # 'https://zenodo.org/api/files/a41d8925-e4ec-454c-93c2-7cb97f7ee854/'

    # Upload metadata
    upload("data/geomx/metadata_matrix.pq")
    # Upload data
    upload("data/geomx/expression_matrix.pq")


def upload_imc_activation() -> None:
    """Create or resume the IMC activation panel deposition and upload its files."""
    global dep, bucket_url

    # Get a new deposition or load the existing one
    if not zenodo_imc_activation_json.exists():
        req = requests.post(
            api_root + "deposit/depositions",
            json={},
            **kws,
        )
        json.dump(req.json(), open(zenodo_imc_activation_json, "w"))
    dep = json.load(open(zenodo_imc_activation_json, "r"))
    # Refresh the deposition from the API
    dep = get()  # {"id": 4637033}

    # Add metadata
    authors_meta = pd.read_csv("metadata/authors.csv")
    dep["metadata"] = json.load(
        open("metadata/zenodo_metadata.imc_activation.json")
    )["metadata"]
    authors = authors_meta[["name", "affiliation", "orcid"]].T.to_dict()
    authors = [
        {k2: v2 for k2, v2 in v.items() if not pd.isnull(v2)}
        for k, v in authors.items()
    ]
    dep["metadata"]["creators"] = authors
    put(dep)

    # Upload files
    bucket_url = dep["links"]["bucket"] + "/"
    # 'https://zenodo.org/api/files/74567992-623e-4c71-906b-44965daf25da/'

    # Upload MCD files
    data_dir = Path("data")
    mcds = list(data_dir.glob("*_ActivationPanel/*.mcd"))
    for mcd in mcds:
        upload(mcd.as_posix())

    # Upload masks
    processed_dir = Path("processed")
    masks = processed_dir.glob("*_ActivationPanel/tiffs/*_full_mask.tiff")
    for mask in masks:
        upload(mask.as_posix())
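
# The `.mcd` files are the raw imaging mass cytometry acquisitions and the
# `*_full_mask.tiff` files the corresponding cell segmentation masks
# (as suggested by the directory layout; the naming convention is assumed).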


def get() -> Dict[str, Any]:
    """Fetch the current state of the module-level deposition `dep` from the API."""
    return requests.get(
        api_root + f"deposit/depositions/{dep['id']}", **kws
    ).json()


def put(payload: Dict, check: bool = True) -> requests.models.Response:
    """
    Update a deposition with the given payload.

    Raises:
        `AssertionError` if `check` is True and the response is not ok.
    """
    r = requests.put(
        api_root + f"deposit/depositions/{payload['id']}",
        data=json.dumps(payload),
        headers=headers,
        **kws,
    )
    if check:
        assert r.ok
    return r


def get_file_md5sum(filename: str, chunk_size: int = 8192) -> str:
    """Compute the MD5 checksum of a file, reading it in chunks."""
    with open(filename, "rb") as f:
        file_hash = hashlib.md5()
        while chunk := f.read(chunk_size):
            file_hash.update(chunk)
    return file_hash.hexdigest()


def upload(file: str, refresh: bool = False) -> None:
    """Upload a file to the current deposition's bucket, skipping existing files."""
    if refresh:
        exists = [x["filename"] for x in get()["files"]]
    else:
        try:
            exists = dep["existing_files"]
        except KeyError:
            exists = []
    if file in exists:
        print(f"File '{file}' already uploaded.")
        return

    print(f"Uploading '{file}'.")
    with open(file, "rb") as handle:
        r = requests.put(bucket_url + file, data=handle, **kws)
    assert r.ok, f"Error uploading file '{file}': {r.json()['message']}."
    print(f"Successfully uploaded '{file}'.")

    # Verify the upload by comparing Zenodo's reported checksum with a local one
    remote_md5 = r.json()["checksum"].replace("md5:", "")
    local_md5 = get_file_md5sum(file)
    assert remote_md5 == local_md5, f"MD5 checksum does not match for file '{file}'."
    print(f"Checksum match for '{file}'.")


def delete(file: str, refresh: bool = False) -> None:
    """Delete a file from the current deposition by filename."""
    print(f"Deleting '{file}'.")
    if refresh:
        files = get()["files"]
    else:
        files = dep["files"]
    file_ids = [f["id"] for f in files if f["filename"] == file]
    # ^^ this should always be exactly one, but just in case
    for file_id in file_ids:
        r = requests.delete(
            api_root + f"deposit/depositions/{dep['id']}/files/{file_id}", **kws
        )
        assert r.ok, f"Error deleting file '{file}', with id '{file_id}'."


if __name__ == "__main__":
    try:
        sys.exit(main())
    except KeyboardInterrupt:
        sys.exit(1)