Skip to content

Commit 8403488

Browse files
committed
First draft of omat24 download scripts
1 parent bf56f70 commit 8403488

4 files changed

Lines changed: 329 additions & 0 deletions

File tree

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
#!/usr/bin/env bash
# Strict mode: abort on command failure (-e), on use of an unset variable (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Print the command-line help text to stdout.
# The quoted heredoc delimiter keeps the text literal (no expansion).
usage()
{
  cat <<'EOF_USAGE'
Usage:
00_download_omat24.sh --split <train|val|salex> [--all | <subset>...]
00_download_omat24.sh --split <salex> --all

Notes:
- train/val subsets: rattled-1000, rattled-1000-subsampled, rattled-500,
rattled-500-subsampled, rattled-300, rattled-300-subsampled,
aimd-from-PBE-1000-npt, aimd-from-PBE-1000-nvt,
aimd-from-PBE-3000-npt, aimd-from-PBE-3000-nvt, rattled-relax
- salex split: train or val (use --split salex and subset train/val)

Examples:
00_download_omat24.sh --split train rattled-1000 rattled-relax
00_download_omat24.sh --split val --all
00_download_omat24.sh --split salex train
EOF_USAGE
}
23+
24+
# Preflight: all downloads go through curl, so stop early when it is not on PATH.
command -v curl >/dev/null 2>&1 || {
  echo "error: curl is required" >&2
  exit 1
}
28+
29+
# ---- command-line parsing ---------------------------------------------------
# SPLIT   : value given to --split (train | val | salex); validated later.
# SUBSETS : positional arguments, i.e. the subset names to download.
# ALL     : "true" once --all was seen; the dispatch code then replaces SUBSETS
#           with the full list for the chosen split.
SPLIT=""
SUBSETS=()
ALL=false

while [[ $# -gt 0 ]]; do
  case "$1" in
    --split)
      # Guard the look-ahead: under `set -u`, reading a missing "$2" aborts
      # with an opaque "unbound variable" message instead of a usage error.
      if [[ $# -lt 2 ]]; then
        echo "error: --split requires a value" >&2
        usage
        exit 1
      fi
      SPLIT="$2"; shift 2 ;;
    --all)
      ALL=true; shift ;;
    -h|--help)
      usage; exit 0 ;;
    -*)
      # No subset name starts with a dash, so this is a mistyped option.
      # Reject it here rather than letting it fail later as a download of
      # "<base>/--option.tar.gz".
      echo "error: unknown option '$1'" >&2
      usage
      exit 1 ;;
    *)
      SUBSETS+=("$1"); shift ;;
  esac
done

if [[ -z "$SPLIT" ]]; then
  echo "error: --split is required" >&2
  usage
  exit 1
fi
51+
52+
# ---- download locations -----------------------------------------------------
# Base URLs; fetch() appends "/<subset>.tar.gz" to these.
TRAIN_BASE="https://dl.fbaipublicfiles.com/opencatalystproject/data/omat/241018/omat/train"
VAL_BASE="https://dl.fbaipublicfiles.com/opencatalystproject/data/omat/241220/omat/val"
SALEX_BASE="https://dl.fbaipublicfiles.com/opencatalystproject/data/omat/241018/sAlex"

# Subsets selected by `--split train --all`.
TRAIN_SUBSETS=(
  rattled-1000 rattled-1000-subsampled
  rattled-500  rattled-500-subsampled
  rattled-300  rattled-300-subsampled
  aimd-from-PBE-1000-npt aimd-from-PBE-1000-nvt
  aimd-from-PBE-3000-npt aimd-from-PBE-3000-nvt
  rattled-relax
)

# Subsets selected by `--split val --all`: the same names, in the same order,
# as the training list. Give this its own literal list if the two ever diverge.
VAL_SUBSETS=("${TRAIN_SUBSETS[@]}")

# Download target directories, created relative to the current directory.
mkdir -p train val salex
85+
86+
# fetch BASE SUBSET TARGET_DIR
#
# Download "BASE/SUBSET.tar.gz" to "TARGET_DIR/SUBSET.tar.gz".
#
#   - An archive that already exists at the destination is skipped.
#   - The transfer is written to "<destination>.part" and renamed only after
#     curl succeeds. A failed or interrupted download therefore never leaves a
#     truncated ".tar.gz" behind that a later run would skip as "exists".
#
# Returns 0 on success or skip; otherwise curl's exit status.
fetch() {
  local base="$1"
  local subset="$2"
  local target_dir="$3"
  local url="${base}/${subset}.tar.gz"
  local out="${target_dir}/${subset}.tar.gz"
  local partial="${out}.part"
  local status=0

  if [[ -f "$out" ]]; then
    echo "skip: $out exists"
    return 0
  fi

  # Do not depend on the caller having created the directory beforehand.
  mkdir -p "$target_dir"

  echo "downloading: $url"
  # `|| status=$?` keeps `set -e` from aborting before the partial file is
  # cleaned up, while still preserving curl's exit code for the caller.
  curl -L --fail --retry 3 --retry-delay 5 -o "$partial" "$url" || status=$?
  if [[ $status -ne 0 ]]; then
    rm -f "$partial"
    echo "error: download failed (curl exit $status): $url" >&2
    return "$status"
  fi

  mv -f "$partial" "$out"
}
99+
100+
# ---- dispatch on the requested split ----------------------------------------

# Abort with MESSAGE (stderr) plus the usage text when no subset is selected.
require_subsets() {
  local message="$1"
  if (( ${#SUBSETS[@]} == 0 )); then
    echo "$message" >&2
    usage
    exit 1
  fi
}

case "$SPLIT" in
  train|val)
    # Both splits follow the same steps; only the base URL and the `--all`
    # list differ. Archives land in a directory named after the split.
    if [[ "$SPLIT" == "train" ]]; then
      split_base="$TRAIN_BASE"
      split_all=("${TRAIN_SUBSETS[@]}")
    else
      split_base="$VAL_BASE"
      split_all=("${VAL_SUBSETS[@]}")
    fi
    # --all replaces any subsets named on the command line.
    if [[ "$ALL" == true ]]; then
      SUBSETS=("${split_all[@]}")
    fi
    require_subsets "error: no subsets provided"
    for name in "${SUBSETS[@]}"; do
      fetch "$split_base" "$name" "$SPLIT"
    done
    ;;
  salex)
    if [[ "$ALL" == true ]]; then
      SUBSETS=("train" "val")
    fi
    require_subsets "error: no subsets provided (train|val)"
    for name in "${SUBSETS[@]}"; do
      # Each name is checked right before its own download, so earlier valid
      # names have already been fetched when a later one is rejected.
      case "$name" in
        train|val) ;;
        *)
          echo "error: salex subset must be train or val" >&2
          exit 1
          ;;
      esac
      fetch "$SALEX_BASE" "$name" salex
    done
    ;;
  *)
    echo "error: unknown split '$SPLIT'" >&2
    usage
    exit 1
    ;;
esac
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
from __future__ import annotations
2+
3+
import argparse
4+
from pathlib import Path
5+
6+
from equitrain.data.format_hdf5 import HDF5Dataset
7+
from equitrain.data.format_lmdb import iter_lmdb_atoms
8+
9+
10+
def _find_shards(src_dir: Path) -> list[Path]:
11+
return sorted(
12+
path
13+
for path in src_dir.rglob("*.aselmdb")
14+
if path.is_file() and not path.name.endswith("-lock")
15+
)
16+
17+
18+
def _convert_split(src_dir: Path, dst_file: Path) -> int:
    """Copy every structure found in the shards under ``src_dir`` into ``dst_file``.

    Shards are processed in the order returned by ``_find_shards``. Structures
    are stored under consecutive integer keys starting at 0, numbered across
    all shards. The parent directory of ``dst_file`` is created if missing, and
    the dataset is opened with ``mode="w"``.

    Returns the number of structures written.

    Raises:
        FileNotFoundError: if no shard is found under ``src_dir``.
    """
    shard_paths = _find_shards(src_dir)
    if len(shard_paths) == 0:
        raise FileNotFoundError(f"No .aselmdb shards found under {src_dir}")

    dst_file.parent.mkdir(parents=True, exist_ok=True)

    def _structures():
        # Announce each shard just before its structures are read, then
        # stream them, so the caller sees one flat sequence.
        for shard_path in shard_paths:
            print(f"converting {shard_path}")
            yield from iter_lmdb_atoms(shard_path)

    written = 0
    with HDF5Dataset(dst_file, mode="w") as dataset:
        for index, atoms in enumerate(_structures()):
            dataset[index] = atoms
            written = index + 1
    return written
33+
34+
35+
def parse_args() -> argparse.Namespace:
    """Define the converter's command-line options and parse ``sys.argv``.

    Options: ``--train-dir`` (default ``train``), ``--valid-dir`` (default
    ``val``), ``--output-dir`` (default ``omat24``), and the boolean switches
    ``--skip-train`` / ``--skip-valid``.
    """
    parser = argparse.ArgumentParser(
        description="Convert extracted OMAT24 ASE-LMDB shards into EquiTrain HDF5."
    )

    # (flag, default, help) for the path-valued options, in --help order.
    path_options = (
        (
            "--train-dir",
            "train",
            "Directory containing extracted OMAT24 training shards.",
        ),
        (
            "--valid-dir",
            "val",
            "Directory containing extracted OMAT24 validation shards.",
        ),
        (
            "--output-dir",
            "omat24",
            "Output directory for train.h5 / valid.h5.",
        ),
    )
    for flag, default, help_text in path_options:
        parser.add_argument(flag, default=default, help=help_text)

    # (flag, help) for the on/off switches.
    switch_options = (
        ("--skip-train", "Skip conversion of the training split."),
        ("--skip-valid", "Skip conversion of the validation split."),
    )
    for flag, help_text in switch_options:
        parser.add_argument(flag, action="store_true", help=help_text)

    return parser.parse_args()
65+
66+
67+
def main() -> None:
    """Convert the training and validation splits unless they are skipped.

    Each split that is not skipped is converted into ``train.h5`` or
    ``valid.h5`` inside the output directory, and a one-line summary with the
    number of structures written is printed.
    """
    args = parse_args()
    output_dir = Path(args.output_dir)

    # (skip?, source directory, output file name, label used in the summary)
    jobs = (
        (args.skip_train, args.train_dir, "train.h5", "training"),
        (args.skip_valid, args.valid_dir, "valid.h5", "validation"),
    )
    for skip, source_dir, file_name, label in jobs:
        if skip:
            continue
        target = output_dir / file_name
        written = _convert_split(Path(source_dir), target)
        print(f"wrote {written} {label} structures to {target}")


if __name__ == "__main__":
    main()
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
from __future__ import annotations
2+
3+
from pathlib import Path
4+
5+
from equitrain import get_args_parser_preprocess, preprocess
6+
7+
8+
def main():
    """Run EquiTrain preprocessing on the converted OMAT24 files.

    Command-line arguments are parsed with the EquiTrain preprocess parser,
    but the fields listed below are then overwritten unconditionally. Values
    passed for them on the command line therefore have no effect.
    """
    args = get_args_parser_preprocess().parse_args()

    data_dir = Path("omat24")
    forced_settings = {
        "train_file": str(data_dir / "train.h5"),
        "valid_file": str(data_dir / "valid.h5"),
        "output_dir": str(data_dir),
        "compute_statistics": True,
        "atomic_energies": "average",
        "r_max": 6.0,
    }
    for field, value in forced_settings.items():
        setattr(args, field, value)

    preprocess(args)


if __name__ == "__main__":
    main()

resources/data/omat24/makefile

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# OMAT24 data pipeline:
#   download archives -> extract -> convert to HDF5 -> compute statistics
#
# `make` (= `make all`) runs the whole chain and, on later invocations, only
# redoes the steps whose inputs changed.

# Remove a target file whose recipe failed, so a half-written .h5 is never
# mistaken for an up-to-date one.
.DELETE_ON_ERROR:

# ------------------------------------------------------------------------------
# User-tunable knobs, e.g. `make TRAIN_SUBSETS="rattled-1000 rattled-relax"`.

# Arguments passed to 00_download_omat24.sh after `--split <name>`.
TRAIN_SUBSETS ?= --all
VAL_SUBSETS ?= --all
SALEX_SUBSETS ?= train val

# Directory receiving train.h5 / valid.h5.
# NOTE: at the time of writing 02_compute_statistics.py reads and writes a
# hard-coded "omat24" directory; overriding OUTPUT_DIR only redirects the
# conversion step unless that script is changed as well.
OUTPUT_DIR ?= omat24

# Interpreter used for the conversion and statistics scripts.
PYTHON ?= python

# An empty OUTPUT_DIR would turn the targets below into "/train.h5" etc.
ifeq ($(strip $(OUTPUT_DIR)),)
$(error OUTPUT_DIR must not be empty)
endif

.PHONY: all train val salex extract extract-train extract-val extract-salex clean

# ------------------------------------------------------------------------------
# Canned recipe: unpack every $(1)/*.tar.gz into directory $(1).
# `|| exit 1` makes a failing archive abort the recipe; without it only the
# exit status of the last loop iteration would be seen by make.

define extract_archives
@mkdir -p $(1)
@for archive in $(1)/*.tar.gz; do \
	[ -e "$$archive" ] || continue; \
	echo "extracting $$archive"; \
	tar -xzf "$$archive" -C $(1) || exit 1; \
done
endef

# ------------------------------------------------------------------------------

all: $(OUTPUT_DIR)/.stats-complete

# ------------------------------------------------------------------------------
# Manual download targets. They always run; the download script itself skips
# archives that are already present.

train:
	bash 00_download_omat24.sh --split train $(TRAIN_SUBSETS)

val:
	bash 00_download_omat24.sh --split val $(VAL_SUBSETS)

salex:
	bash 00_download_omat24.sh --split salex $(SALEX_SUBSETS)

# ------------------------------------------------------------------------------
# Manual extraction targets. They always run. The train/val variants refresh
# the stamp file so that a following `make all` reconverts the data.

extract: extract-train extract-val

extract-train: train
	$(call extract_archives,train)
	@touch train/.extracted

extract-val: val
	$(call extract_archives,val)
	@touch val/.extracted

extract-salex: salex
	$(call extract_archives,salex)

# ------------------------------------------------------------------------------
# Stamp files recording that a split has been downloaded and extracted.
# The .h5 rules depend on these real files instead of the phony targets above:
# a phony prerequisite is always considered out of date and would force the
# conversion (and everything after it) to rerun on every invocation.

train/.extracted:
	bash 00_download_omat24.sh --split train $(TRAIN_SUBSETS)
	$(call extract_archives,train)
	@touch $@

val/.extracted:
	bash 00_download_omat24.sh --split val $(VAL_SUBSETS)
	$(call extract_archives,val)
	@touch $@

# ------------------------------------------------------------------------------

$(OUTPUT_DIR)/train.h5: train/.extracted 01_aselmdb_to_hdf5.py
	$(PYTHON) 01_aselmdb_to_hdf5.py --train-dir train --output-dir $(OUTPUT_DIR) --skip-valid

$(OUTPUT_DIR)/valid.h5: val/.extracted 01_aselmdb_to_hdf5.py
	$(PYTHON) 01_aselmdb_to_hdf5.py --valid-dir val --output-dir $(OUTPUT_DIR) --skip-train

# Stamp file: touched only after the statistics script succeeded.
$(OUTPUT_DIR)/.stats-complete: $(OUTPUT_DIR)/train.h5 $(OUTPUT_DIR)/valid.h5 02_compute_statistics.py
	$(PYTHON) 02_compute_statistics.py
	@touch $@

# ------------------------------------------------------------------------------

# Removes downloads, extracted data (including the stamp files) and outputs.
clean:
	rm -rf train val salex $(OUTPUT_DIR)

0 commit comments

Comments
 (0)