First draft of omat24 download scripts

pbenner · pbenner · commit 84034887d302 · 2026-04-17T09:25:07.000+02:00
diff --git a/resources/data/omat24/00_download_omat24.sh b/resources/data/omat24/00_download_omat24.sh
@@ -0,0 +1,149 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+usage() {  # Print the command-line help text to stdout.
+  cat <<'USAGE'
+Usage:
+  00_download_omat24.sh --split <train|val> [--all | <subset>...]
+  00_download_omat24.sh --split salex [--all | train | val]
+
+Notes:
+  - train/val subsets: rattled-1000, rattled-1000-subsampled, rattled-500,
+    rattled-500-subsampled, rattled-300, rattled-300-subsampled,
+    aimd-from-PBE-1000-npt, aimd-from-PBE-1000-nvt,
+    aimd-from-PBE-3000-npt, aimd-from-PBE-3000-nvt, rattled-relax
+  - salex subsets: train, val (pass them as the subset, e.g. --split salex val)
+
+Examples:
+  00_download_omat24.sh --split train rattled-1000 rattled-relax
+  00_download_omat24.sh --split val --all
+  00_download_omat24.sh --split salex train
+USAGE
+}
+
+# curl performs every download, so stop early when it is not installed.
+if ! command -v curl >/dev/null 2>&1; then
+  echo "error: curl is required" >&2
+  exit 1
+fi
+
+# Command-line state: --split takes a value, --all is a flag, and every other
+# word is collected as a subset name.
+SPLIT=""; SUBSETS=(); ALL=false
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --split)
+      # Under `set -u`, reading "$2" when --split is the last word aborts with a
+      # bare "unbound variable" message; report the missing value instead.
+      if [[ $# -lt 2 ]]; then echo "error: --split requires a value" >&2; usage; exit 1; fi
+      SPLIT="$2"; shift 2 ;;
+    --all)     ALL=true; shift ;;
+    -h|--help) usage; exit 0 ;;
+    *)         SUBSETS+=("$1"); shift ;;
+  esac
+done
+
+if [[ -z "$SPLIT" ]]; then
+  echo "error: --split is required" >&2
+  usage
+  exit 1
+fi
+
+TRAIN_BASE="https://dl.fbaipublicfiles.com/opencatalystproject/data/omat/241018/omat/train"
+VAL_BASE="https://dl.fbaipublicfiles.com/opencatalystproject/data/omat/241220/omat/val"
+SALEX_BASE="https://dl.fbaipublicfiles.com/opencatalystproject/data/omat/241018/sAlex"
+
+TRAIN_SUBSETS=(
+  rattled-1000
+  rattled-1000-subsampled
+  rattled-500
+  rattled-500-subsampled
+  rattled-300
+  rattled-300-subsampled
+  aimd-from-PBE-1000-npt
+  aimd-from-PBE-1000-nvt
+  aimd-from-PBE-3000-npt
+  aimd-from-PBE-3000-nvt
+  rattled-relax
+)
+
+VAL_SUBSETS=(
+  rattled-1000
+  rattled-1000-subsampled
+  rattled-500
+  rattled-500-subsampled
+  rattled-300
+  rattled-300-subsampled
+  aimd-from-PBE-1000-npt
+  aimd-from-PBE-1000-nvt
+  aimd-from-PBE-3000-npt
+  aimd-from-PBE-3000-nvt
+  rattled-relax
+)
+
+mkdir -p train val salex
+
+# Download <base>/<subset>.tar.gz into <target_dir>, skipping finished archives.
+fetch() {
+  local base="$1" subset="$2" target_dir="$3"
+  local url="${base}/${subset}.tar.gz" out="${target_dir}/${subset}.tar.gz"
+  if [[ -f "$out" ]]; then
+    echo "skip: $out exists"
+    return
+  fi
+  echo "downloading: $url"
+  # Write to a .part file and rename on success: a failed or interrupted transfer
+  # must not leave a truncated "$out" that the -f check above would skip next time.
+  curl -L --fail --retry 3 --retry-delay 5 -o "${out}.part" "$url" && mv -f "${out}.part" "$out"
+}
+
+# Dispatch on the requested split: resolve the subset list, insist that it is
+# non-empty, then fetch each subset into the directory named after the split.
+case "$SPLIT" in
+  train|val)
+    # Both splits share one flow; only the base URL and the --all list differ.
+    if [[ "$SPLIT" == "train" ]]; then
+      split_base="$TRAIN_BASE"
+      split_all=("${TRAIN_SUBSETS[@]}")
+    else
+      split_base="$VAL_BASE"
+      split_all=("${VAL_SUBSETS[@]}")
+    fi
+    # --all replaces any subsets named on the command line.
+    if [[ "$ALL" == "true" ]]; then
+      SUBSETS=("${split_all[@]}")
+    fi
+    if (( ${#SUBSETS[@]} == 0 )); then
+      echo "error: no subsets provided" >&2
+      usage
+      exit 1
+    fi
+    for name in "${SUBSETS[@]}"; do
+      fetch "$split_base" "$name" "$SPLIT"
+    done
+    ;;
+  salex)
+    if [[ "$ALL" == "true" ]]; then
+      SUBSETS=(train val)
+    fi
+    if (( ${#SUBSETS[@]} == 0 )); then
+      echo "error: no subsets provided (train|val)" >&2
+      usage
+      exit 1
+    fi
+    for name in "${SUBSETS[@]}"; do
+      # Names are checked one at a time, so valid names that come before an
+      # invalid one have already been fetched when the run aborts.
+      case "$name" in
+        train|val) ;;
+        *) echo "error: salex subset must be train or val" >&2; exit 1 ;;
+      esac
+      fetch "$SALEX_BASE" "$name" salex
+    done
+    ;;
+  *)
+    echo "error: unknown split '$SPLIT'" >&2
+    usage
+    exit 1
+    ;;
+esac
diff --git a/resources/data/omat24/01_aselmdb_to_hdf5.py b/resources/data/omat24/01_aselmdb_to_hdf5.py
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+
+from equitrain.data.format_hdf5 import HDF5Dataset
+from equitrain.data.format_lmdb import iter_lmdb_atoms
+
+
+def _find_shards(src_dir: Path) -> list[Path]:
+    return sorted(
+        path
+        for path in src_dir.rglob("*.aselmdb")
+        if path.is_file() and not path.name.endswith("-lock")
+    )
+
+
+def _convert_split(src_dir: Path, dst_file: Path) -> int:
+    shards = _find_shards(src_dir)
+    if not shards:
+        raise FileNotFoundError(f"No .aselmdb shards found under {src_dir}")
+
+    dst_file.parent.mkdir(parents=True, exist_ok=True)
+
+    count = 0
+    with HDF5Dataset(dst_file, mode="w") as dataset:
+        for shard in shards:
+            print(f"converting {shard}")
+            for atoms in iter_lmdb_atoms(shard):
+                dataset[count] = atoms
+                count += 1
+    return count
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Convert extracted OMAT24 ASE-LMDB shards into EquiTrain HDF5."
+    )
+    parser.add_argument(
+        "--train-dir",
+        default="train",
+        help="Directory containing extracted OMAT24 training shards.",
+    )
+    parser.add_argument(
+        "--valid-dir",
+        default="val",
+        help="Directory containing extracted OMAT24 validation shards.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        default="omat24",
+        help="Output directory for train.h5 / valid.h5.",
+    )
+    parser.add_argument(
+        "--skip-train",
+        action="store_true",
+        help="Skip conversion of the training split.",
+    )
+    parser.add_argument(
+        "--skip-valid",
+        action="store_true",
+        help="Skip conversion of the validation split.",
+    )
+    return parser.parse_args()
+
+
+def main() -> None:
+    args = parse_args()
+
+    output_dir = Path(args.output_dir)
+    if not args.skip_train:
+        train_count = _convert_split(
+            Path(args.train_dir),
+            output_dir / "train.h5",
+        )
+        print(f"wrote {train_count} training structures to {output_dir / 'train.h5'}")
+
+    if not args.skip_valid:
+        valid_count = _convert_split(
+            Path(args.valid_dir),
+            output_dir / "valid.h5",
+        )
+        print(f"wrote {valid_count} validation structures to {output_dir / 'valid.h5'}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/resources/data/omat24/02_compute_statistics.py b/resources/data/omat24/02_compute_statistics.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from equitrain import get_args_parser_preprocess, preprocess
+
+
+def main():
+    args = get_args_parser_preprocess().parse_args()
+
+    output_dir = Path("omat24")
+    args.train_file = str(output_dir / "train.h5")
+    args.valid_file = str(output_dir / "valid.h5")
+    args.output_dir = str(output_dir)
+    args.compute_statistics = True
+    args.atomic_energies = "average"
+    args.r_max = 6.0
+
+    preprocess(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/resources/data/omat24/makefile b/resources/data/omat24/makefile
@@ -0,0 +1,70 @@
+TRAIN_SUBSETS ?= --all
+VAL_SUBSETS ?= --all
+SALEX_SUBSETS ?= train val
+# NOTE: 02_compute_statistics.py hard-codes the "omat24" directory, so
+# overriding OUTPUT_DIR currently redirects the HDF5 conversion only.
+OUTPUT_DIR ?= omat24
+PYTHON ?= python
+
+.PHONY: all train val salex extract extract-train extract-val extract-salex clean
+
+# Delete a target whose recipe failed, so a partial file never looks up to date.
+.DELETE_ON_ERROR:
+
+# ------------------------------------------------------------------------------
+
+all: $(OUTPUT_DIR)/.stats-complete
+
+# ------------------------------------------------------------------------------
+# Downloads are phony: the script runs on every request and skips archives that
+# are already present.
+
+train:
+	bash 00_download_omat24.sh --split train $(TRAIN_SUBSETS)
+
+val:
+	bash 00_download_omat24.sh --split val $(VAL_SUBSETS)
+
+salex:
+	bash 00_download_omat24.sh --split salex $(SALEX_SUBSETS)
+
+# ------------------------------------------------------------------------------
+# Extraction is recorded in one stamp file per split. File targets depend on the
+# stamp, never on the phony extract-* names: a phony prerequisite is always
+# out of date and would force a full re-conversion on every run. Remove
+# <split>/.extract-complete to unpack archives that were downloaded later.
+
+extract: extract-train extract-val
+extract-train: train/.extract-complete
+extract-val: val/.extract-complete
+extract-salex: salex/.extract-complete
+
+# $* is the split directory. The download target is order-only: it runs first
+# but does not by itself invalidate the stamp. "|| exit 1" aborts on the first
+# archive that fails; otherwise only the last tar's status would reach make.
+%/.extract-complete: | %
+	@mkdir -p $*
+	@for archive in $*/*.tar.gz; do \
+		[ -e "$$archive" ] || continue; \
+		echo "extracting $$archive"; \
+		tar -xzf "$$archive" -C $* || exit 1; \
+	done
+	@touch $@
+
+# ------------------------------------------------------------------------------
+
+$(OUTPUT_DIR)/train.h5: train/.extract-complete 01_aselmdb_to_hdf5.py
+	$(PYTHON) 01_aselmdb_to_hdf5.py --train-dir train --output-dir $(OUTPUT_DIR) --skip-valid
+
+$(OUTPUT_DIR)/valid.h5: val/.extract-complete 01_aselmdb_to_hdf5.py
+	$(PYTHON) 01_aselmdb_to_hdf5.py --valid-dir val --output-dir $(OUTPUT_DIR) --skip-train
+
+$(OUTPUT_DIR)/.stats-complete: $(OUTPUT_DIR)/train.h5 $(OUTPUT_DIR)/valid.h5 02_compute_statistics.py
+	$(PYTHON) 02_compute_statistics.py
+	@touch $@
+
+# ------------------------------------------------------------------------------
+# Removes the downloads, the extracted data (with stamps) and all outputs.
+
+clean:
+	rm -rf train val salex $(OUTPUT_DIR)