Skip to content
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions deepmd/pd/utils/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,22 @@ def __init__(
if seed is not None:
setup_seed(seed)
if isinstance(systems, str):
with h5py.File(systems) as file:
systems = [os.path.join(systems, item) for item in file.keys()]
# Check if this is a multisystem HDF5 file that should be expanded
try:
with h5py.File(systems, "r") as file:
# Check if this looks like a single system (has type.raw and set.* groups)
has_type_raw = "type.raw" in file
has_sets = any(key.startswith("set.") for key in file.keys())

if has_type_raw and has_sets:
# This is a single system HDF5 file, don't expand
systems = [systems]
else:
# This might be a multisystem file, expand it
systems = [f"{systems}#{item}" for item in file.keys()]
except OSError:
# If we can't read as HDF5, treat as regular path
systems = [systems]

self.systems: list[DeepmdDataSetForLoader] = []
if len(systems) >= 100:
Expand Down
19 changes: 16 additions & 3 deletions deepmd/pt/utils/dataloader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
import logging
import os
from multiprocessing.dummy import (
Pool,
)
Expand Down Expand Up @@ -89,8 +88,22 @@ def __init__(
if seed is not None:
setup_seed(seed)
if isinstance(systems, str):
with h5py.File(systems) as file:
systems = [os.path.join(systems, item) for item in file.keys()]
Comment on lines 91 to 93
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, I don't understand why DpLoaderSet doesn't reuse process_systems, but uses its own logic to search systems... Who designs this behavior?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've refactored the dataloaders to reuse process_systems instead of duplicating the HDF5 expansion logic.

Changes made:

  • PyTorch dataloader now calls process_systems(systems) instead of custom HDF5 logic
  • Paddle dataloader now calls process_systems(systems) instead of custom HDF5 logic
  • This eliminates code duplication and ensures consistent behavior
  • All tests updated and passing

The dataloaders now properly delegate system processing to the centralized process_systems function, avoiding the design issue you identified.

# Check if this is a multisystem HDF5 file that should be expanded
try:
with h5py.File(systems, "r") as file:
# Check if this looks like a single system (has type.raw and set.* groups)
has_type_raw = "type.raw" in file
has_sets = any(key.startswith("set.") for key in file.keys())

if has_type_raw and has_sets:
# This is a single system HDF5 file, don't expand
systems = [systems]
else:
# This might be a multisystem file, expand it
systems = [f"{systems}#{item}" for item in file.keys()]
except OSError:
# If we can't read as HDF5, treat as regular path
systems = [systems]

def construct_dataset(system: str) -> DeepmdDataSetForLoader:
return DeepmdDataSetForLoader(
Expand Down
138 changes: 137 additions & 1 deletion deepmd/utils/data_system.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
import collections
import logging
import os
import warnings
from functools import (
cached_property,
Expand All @@ -11,6 +12,7 @@
Union,
)

import h5py
import numpy as np

import deepmd.utils.random as dp_random
Expand Down Expand Up @@ -790,6 +792,7 @@ def process_systems(
"""Process the user-input systems.

If it is a single directory, search for all the systems in the directory.
If it's a list, handle HDF5 files by expanding their internal systems.
Check if the systems are valid.

Parameters
Expand All @@ -810,10 +813,143 @@ def process_systems(
else:
systems = rglob_sys_str(systems, patterns)
elif isinstance(systems, list):
systems = systems.copy()
expanded_systems = []
for system in systems:
# Check if this is an HDF5 file without explicit system specification
if _is_hdf5_file(system) and "#" not in system:
# Only expand if it's a multisystem HDF5 file
if _is_hdf5_multisystem(system):
# Expand HDF5 file to include all systems within it
try:
with h5py.File(system, "r") as file:
for key in file.keys():
if isinstance(file[key], h5py.Group):
# Check if this group looks like a system
group = file[key]
group_has_type = "type.raw" in group
group_has_sets = any(
subkey.startswith("set.")
for subkey in group.keys()
)
if group_has_type and group_has_sets:
expanded_systems.append(f"{system}#{key}")
except OSError as e:
log.warning(f"Could not read HDF5 file {system}: {e}")
# If we can't read as HDF5, treat as regular system
expanded_systems.append(system)
else:
# Single system HDF5 file, don't expand
expanded_systems.append(system)
else:
# Regular system or HDF5 with explicit system specification
expanded_systems.append(system)
systems = expanded_systems
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that only one line will work:

systems = sum([process_systems(system) for system in systems], [])

(btw, I don't understand what patterns is according to the doc...)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@copilot address the comments

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've simplified the code as suggested. The complex nested logic has been reduced to essentially one line:

Before: 30+ lines of nested if/else logic for list processing

After: systems = sum([_process_single_system(system) for system in systems], [])

Changes made:

  • Extracted _process_single_system() helper function to handle individual system processing
  • Simplified list case to the functional style you suggested (avoiding infinite recursion by using helper)
  • Fixed path format consistency: both string and list processing now use file.h5#/system format
  • All 22 tests pass with the cleaner implementation

The code is now much more maintainable and follows the functional programming style you suggested.

return systems


def _is_hdf5_file(path: str) -> bool:
"""Check if a path points to an HDF5 file.

Parameters
----------
path : str
Path to check

Returns
-------
bool
True if the path is an HDF5 file
"""
# Extract the actual file path (before any # separator for HDF5 internal paths)
file_path = path.split("#")[0]
return os.path.isfile(file_path) and (
file_path.endswith((".h5", ".hdf5")) or _is_hdf5_format(file_path)
)


def _is_hdf5_multisystem(file_path: str) -> bool:
"""Check if an HDF5 file contains multiple systems vs being a single system.

Parameters
----------
file_path : str
Path to the HDF5 file

Returns
-------
bool
True if the file contains multiple systems, False if it's a single system
"""
try:
with h5py.File(file_path, "r") as f:
# Check if this looks like a single system (has type.raw and set.* groups)
has_type_raw = "type.raw" in f
has_sets = any(key.startswith("set.") for key in f.keys())

if has_type_raw and has_sets:
# This looks like a single system
return False

# Check if it contains multiple groups that could be systems
system_groups = []
for key in f.keys():
if isinstance(f[key], h5py.Group):
group = f[key]
# Check if this group looks like a system (has type.raw and sets)
group_has_type = "type.raw" in group
group_has_sets = any(
subkey.startswith("set.") for subkey in group.keys()
)
if group_has_type and group_has_sets:
system_groups.append(key)

# If we found multiple system-like groups, it's a multisystem file
return len(system_groups) > 1

except OSError:
return False


def _is_hdf5_file(path: str) -> bool:
"""Check if a path points to an HDF5 file.

Parameters
----------
path : str
Path to check

Returns
-------
bool
True if the path is an HDF5 file
"""
# Extract the actual file path (before any # separator for HDF5 internal paths)
file_path = path.split("#")[0]
return os.path.isfile(file_path) and (
file_path.endswith((".h5", ".hdf5")) or _is_hdf5_format(file_path)
)


def _is_hdf5_format(file_path: str) -> bool:
"""Check if a file is in HDF5 format by trying to open it.

Parameters
----------
file_path : str
Path to the file

Returns
-------
bool
True if the file is in HDF5 format
"""
try:
with h5py.File(file_path, "r"):
return True
except OSError:
return False


def get_data(
jdata: dict[str, Any],
rcut: float,
Expand Down
Loading
Loading