Changes from all commits
30 commits
69aa46b  Add ase_interface with support for optimization and md (JunnHuo, Jul 10, 2025)
deea169  Keep the from_structures func in base_calculator (JunnHuo, Jul 10, 2025)
37ed942  Add predict and tasks modules; update ASE interface and property pred… (JunnHuo, Jul 15, 2025)
1c75f1d  Add example usage (JunnHuo, Jul 15, 2025)
06c3c3e  WIP: save local changes (JunnHuo, Aug 29, 2025)
ee55f16  modify err (JunnHuo, Aug 29, 2025)
0226096  Remove tests from tracking (JunnHuo, Aug 29, 2025)
c32fabd  Save local changes before rebase (JunnHuo, Sep 1, 2025)
6023fe5  Sync missing files from upstream/develop (JunnHuo, Sep 1, 2025)
0d3354f  modify train config (JunnHuo, Sep 4, 2025)
9a2dc78  revise train config (2nd version) (JunnHuo, Sep 5, 2025)
f8b5710  revise train config (3rd version) (JunnHuo, Sep 7, 2025)
24d6b70  Update training config (4th version) (JunnHuo, Sep 8, 2025)
a34fef1  Update training config (5th version) (JunnHuo, Sep 8, 2025)
59614f5  Update training config (JunnHuo, Sep 8, 2025)
6eeafb6  small fix (JunnHuo, Sep 18, 2025)
ab05190  small fix: correct structure generation (JunnHuo, Sep 18, 2025)
52f050a  save local changes before merge upstream (JunnHuo, Sep 28, 2025)
d841f0d  merge upstream develop (JunnHuo, Sep 28, 2025)
77d164f  change configs and README (JunnHuo, Oct 15, 2025)
e8ce0be  resolve config bug in predict.py (JunnHuo, Oct 16, 2025)
3ddeb0c  keep previous content (JunnHuo, Oct 17, 2025)
a83c8c4  keep previous content (JunnHuo, Oct 17, 2025)
e2ea426  keep previous content (JunnHuo, Oct 17, 2025)
578522a  keep previous content (JunnHuo, Oct 17, 2025)
b556ff1  fix: correct commit message for ppmatSim README (JunnHuo, Oct 20, 2025)
623ffc0  revert (JunnHuo, Oct 24, 2025)
70bca7f  remove experiments directory for PR review (JunnHuo, Oct 24, 2025)
d713a86  correct (JunnHuo, Oct 24, 2025)
743d006  correct (JunnHuo, Oct 24, 2025)
452 changes: 452 additions & 0 deletions ppmat/calculator/ase.py

Large diffs are not rendered by default.
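
The 452-line calculator diff is not rendered here, but per the commit history it adds an ASE interface with support for optimization and MD. For orientation, a hypothetical usage sketch of an ASE-style calculator; the class name PPMatCalculator is a placeholder, since the real class inside ppmat/calculator/ase.py is not visible in this view:

    # Hypothetical sketch only: `PPMatCalculator` is a placeholder name;
    # the actual class in ppmat/calculator/ase.py is not rendered above.
    from ase.build import bulk
    from ppmat.calculator.ase import PPMatCalculator  # placeholder name

    atoms = bulk("Cu", "fcc", a=3.6)            # build a simple test structure
    atoms.calc = PPMatCalculator()              # constructor args unknown from this view
    energy = atoms.get_potential_energy()       # standard ASE calculator entry point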

64 changes: 39 additions & 25 deletions ppmat/datasets/matbench_dataset.py
@@ -43,19 +43,21 @@ class MatbenchDataset(Dataset):
"""Matbench Dataset Handler

This class provides utilities for loading and processing the Matbench materials
-science benchmark datasets. The implementation supports loading multiple properties
-from different matbench JSON files and processing them for materials property prediction.
+science benchmark datasets.
+The implementation supports loading multiple properties from different
+matbench JSON files and processing them for materials property prediction.
Collaborator comment: Here?

**Dataset Overview**
Matbench is a benchmark suite for materials property prediction containing multiple
datasets with different properties:
- Formation Energy (mp_e_form): ~132k samples
- Band Gap (mp_gap): ~106k samples
- Shear Modulus G (elasticity_log10(G_VRH)): ~11k samples
- Bulk Modulus K (elasticity_log10(K_VRH)): ~11k samples

**Automatic Download**
-If the data directory doesn't exist, the dataset will be automatically downloaded from:
+If the data directory doesn't exist, the dataset will be
+automatically downloaded from:
https://paddle-org.bj.bcebos.com/paddlematerial/datasets/matbench/matbench.zip

**Data Format**
@@ -79,18 +81,21 @@ class MatbenchDataset(Dataset):
- "log10(K_VRH)": Log10 of bulk modulus (GPa) from elasticity_log10(K_VRH).json

Args:
-data_dir (str): Directory containing matbench JSON files.
+path (str): Directory containing matbench JSON files.
Defaults to "./data/matbench".
property_names (Optional[List[str]]): Property names to load.
-Should be selected from ["e_form", "gap pbe", "log10(G_VRH)", "log10(K_VRH)"].
+Should be selected from
+["e_form", "gap pbe", "log10(G_VRH)", "log10(K_VRH)"].
Defaults to None (loads all available properties).
build_structure_cfg (Dict, optional): Configs for building pymatgen structures.
Defaults to None.
build_graph_cfg (Dict, optional): Configs for building graphs from structures.
Defaults to None.
-transforms (Optional[Callable], optional): Preprocessing transforms for each sample.
+transforms (Optional[Callable], optional):
+Preprocessing transforms for each sample.
Defaults to None.
-cache_path (Optional[str], optional): Path for caching processed structures and graphs.
+cache_path (Optional[str], optional):
+Path for caching processed structures and graphs.
Defaults to None.
overwrite (bool, optional): Whether to overwrite existing cache files.
Defaults to False.
@@ -105,7 +110,8 @@ class MatbenchDataset(Dataset):
url = (
"https://paddle-org.bj.bcebos.com/paddlematerial/datasets/matbench/matbench.zip"
)
md5 = "71e85300825604c2e228cbbf75574906" # TODO: Replace with actual MD5 hash when available
# TODO: Replace with actual MD5 hash when available
md5 = "71e85300825604c2e228cbbf75574906"

# Property file mapping
PROPERTY_FILES = {
@@ -117,7 +123,7 @@ class MatbenchDataset(Dataset):

def __init__(
self,
-data_dir: str = "./data/matbench",
+path: str = "./data/matbench",
property_names: Optional[List[str]] = None,
build_structure_cfg: Dict = None,
build_graph_cfg: Dict = None,
@@ -132,23 +138,24 @@ def __init__(

# Check if data directory and required files exist, if not download the dataset
# This follows the same pattern as MP2018Dataset
-if not osp.exists(data_dir):
+if not osp.exists(path):
logger.message("The matbench dataset is not found. Will download it now.")
root_path = download.get_datasets_path_from_url(self.url, self.md5)
-data_dir = osp.join(root_path, self.name)
+path = osp.join(root_path, self.name)
else:
# Check if required files exist in the directory
required_files = list(self.PROPERTY_FILES.values())
-files_exist = all(osp.exists(osp.join(data_dir, f)) for f in required_files)
+files_exist = all(osp.exists(osp.join(path, f)) for f in required_files)

if not files_exist:
logger.message(
"Some matbench data files are missing. Will download the dataset now."
"Some matbench data files are missing. "
"Will download the dataset now."
)
root_path = download.get_datasets_path_from_url(self.url, self.md5)
-data_dir = osp.join(root_path, self.name)
+path = osp.join(root_path, self.name)

-self.data_dir = data_dir
+self.path = path
if isinstance(property_names, str):
property_names = [property_names]

@@ -160,7 +167,8 @@ def __init__(
for prop in property_names:
if prop not in self.PROPERTY_FILES:
raise ValueError(
f"Unknown property '{prop}'. Available properties: {list(self.PROPERTY_FILES.keys())}"
f"Unknown property '{prop}'. "
f"Available properties: {list(self.PROPERTY_FILES.keys())}"
)

self.property_names = property_names
@@ -187,7 +195,7 @@ def __init__(
else:
# Generate cache path based on data directory and properties
prop_str = "_".join(sorted(property_names))
-self.cache_path = osp.join(data_dir + "_cache", f"matbench_{prop_str}")
+self.cache_path = osp.join(path + "_cache", f"matbench_{prop_str}")
logger.info(f"Cache path: {self.cache_path}")

self.overwrite = overwrite
@@ -215,14 +223,14 @@ def load_matbench_data(self):
else:
# Multiple properties case - need to handle differently
raise NotImplementedError(
"Loading multiple properties from different files is not yet implemented. "
"Loading multiple properties from different files is not implemented. "
"Please specify only one property at a time."
)

def _load_single_property_data(self):
"""Load data for a single property from its matbench JSON file."""
prop_name = self.property_names[0]
-file_path = osp.join(self.data_dir, self.PROPERTY_FILES[prop_name])
+file_path = osp.join(self.path, self.PROPERTY_FILES[prop_name])

if not osp.exists(file_path):
raise FileNotFoundError(f"Matbench file not found: {file_path}")
@@ -232,6 +240,15 @@ def _load_single_property_data(self):
with open(file_path, "r") as f:
data = json.load(f)

# If max_samples is not None, keep only the specified number of samples
if self.max_samples is not None:
data_samples = {
"index": data["index"][: self.max_samples],
"columns": data["columns"],
"data": data["data"][: self.max_samples],
}
data = data_samples

# Validate data format
if not all(key in data for key in ["index", "columns", "data"]):
raise ValueError(f"Invalid matbench file format: {file_path}")
@@ -250,17 +267,14 @@ def _load_single_property_data(self):
structures = []
properties = []

-for i, (structure_dict, prop_value) in enumerate(data["data"]):
-    if self.max_samples is not None and i >= self.max_samples:
-        break
-
+for structure_dict, prop_value in data["data"]:
# Convert structure dict to pymatgen Structure
try:
structure = Structure.from_dict(structure_dict)
structures.append(structure)
properties.append(prop_value)
except Exception as e:
logger.warning(f"Failed to parse structure {i} in {file_path}: {e}")
logger.warning(f"Failed to parse structure in {file_path}: {e}")
if not self.filter_unvalid:
structures.append(None)
properties.append(None)
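
To make the renamed argument concrete, a minimal usage sketch; the import path is inferred from the file location in this diff, and relying on defaults for the remaining configs is an assumption, not part of the PR:

    # Minimal sketch of the renamed `path` argument (formerly `data_dir`);
    # import path assumed from the file location above.
    from ppmat.datasets.matbench_dataset import MatbenchDataset

    dataset = MatbenchDataset(
        path="./data/matbench",     # auto-downloads if the files are missing
        property_names=["e_form"],  # one property at a time; several raise NotImplementedError
    )
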
5 changes: 5 additions & 0 deletions ppmat/datasets/mp2018_dataset.py
@@ -337,6 +337,11 @@ def read_data(self, path: str):
for idx in idxs:
data[key].append(json_data[key][idx])

# Example: Load a small subset of the dataset for quick testing
# num_samples = 1000
# for key in data.keys():
# data[key] = data[key][:num_samples]

return data, num_samples

def filter_unvalid_by_property(self):
8 changes: 7 additions & 1 deletion ppmat/datasets/mp20_dataset.py
@@ -307,6 +307,12 @@ def read_data(self, path: str):
num_samples = 0
for key in data:
num_samples = max(num_samples, len(data[key]))

# Example: Load a small subset of the dataset for quick testing
# num_samples = 100
# for key in data.keys():
# data[key] = data[key][:num_samples]

return data, num_samples

def filter_unvalid_by_property(self):
@@ -440,6 +446,6 @@ class AlexMP20MatterGenDataset(MP20Dataset):
the mp20 dataset used for mattergen.
"""

name = "alex_mp_20_mattergen"
name = "alex_mp_20"
url = "https://paddle-org.bj.bcebos.com/paddlematerial/datasets/alex_mp_20/alex_mp_20.zip"
md5 = "624361c17259cc3af63a00b29fffe9cd"
12 changes: 12 additions & 0 deletions ppmat/datasets/mptrj_dataset.py
@@ -339,6 +339,18 @@ def read_data(self, path: str):
path (str): Path to the data.
"""
json_data = read_json(path)

Collaborator comment: Why was this change made here?

# # Example: Load a small subset of the dataset for quick testing
# num_samples = 1000
# json_data_subset = {}
# count = 0
# for k, v in json_data.items():
# json_data_subset[k] = v
# count += 1
# if count >= num_samples:
# break
# json_data = json_data_subset

return json_data

def filter_unvalid_by_property(self):
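
The commented-out subset loop above can be written more compactly; a sketch assuming json_data is a plain dict, equivalent to the counting loop in the comment:

    import itertools

    # Keep only the first `num_samples` entries of a dict, preserving
    # insertion order; equivalent to the commented-out counting loop above.
    num_samples = 1000
    json_data = {f"frame_{i}": i for i in range(5000)}  # stand-in for read_json(path)
    json_data = dict(itertools.islice(json_data.items(), num_samples))
    assert len(json_data) == num_samples
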
1 change: 1 addition & 0 deletions ppmat/models/__init__.py
@@ -195,6 +195,7 @@ def build_model(
def build_model_from_name(model_name: str, weights_name: Optional[str] = None):
path = download.get_weights_path_from_url(MODEL_REGISTRY[model_name])
path = osp.join(path, model_name)
logger.info(f"Save model and configuration files in path: {path}")
config_path = osp.join(path, f"{model_name}.yaml")
if not osp.exists(config_path):
logger.warning(
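
For context on the added log line, a hedged call sketch; the registry key used here is hypothetical, and treating the return value as a ready-to-use model is an assumption not confirmed by this hunk:

    # Hypothetical sketch: "some_registered_model" must be a key of
    # MODEL_REGISTRY; the hunk only shows that weights and a YAML config
    # are downloaded and their location logged.
    from ppmat.models import build_model_from_name

    model = build_model_from_name("some_registered_model")  # hypothetical key
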
16 changes: 8 additions & 8 deletions ppmat/models/dimenetpp/dimenetpp.py
@@ -310,9 +310,9 @@ class DimeNetPlusPlus(paddle.nn.Layer):
a graph-level feature (“mean” or “sum”). Defaults to "mean".
property_names (Optional[str], optional): A comma-separated list of
target property names to predict. Defaults to "formation_energy_per_atom".
-data_norm_mean (float, optional): The mean used for normalizing target values.
+data_mean (float, optional): The mean used for normalizing target values.
Collaborator comment: Why was this changed?

Defaults to 0.0.
-data_norm_std (float, optional): The standard deviation used for
+data_std (float, optional): The standard deviation used for
normalizing target values. Defaults to 1.0.
loss_type (str, optional): Loss type, can be 'mse_loss' or 'l1_loss'.
Defaults to "l1_loss".
@@ -339,8 +339,8 @@ def __init__(
num_output_layers: int = 3,
readout: str = "mean",
property_names: Optional[str] = "formation_energy_per_atom",
-data_norm_mean: float = 0.0,
-data_norm_std: float = 1.0,
+data_mean: float = 0.0,
+data_std: float = 1.0,
loss_type: str = "l1_loss",
act: str = "swish",
):
@@ -357,10 +357,10 @@ def __init__(
assert isinstance(property_names, str)
self.property_names = property_names
self.register_buffer(
-tensor=paddle.to_tensor(data_norm_mean), name="data_norm_mean"
+tensor=paddle.to_tensor(data_mean), name="data_mean"
)
self.register_buffer(
-tensor=paddle.to_tensor(data_norm_std), name="data_norm_std"
+tensor=paddle.to_tensor(data_std), name="data_std"
)

# basis layers
@@ -445,10 +445,10 @@ def triplets(self, edge_index, num_nodes):
)

def normalize(self, tensor):
-return (tensor - self.data_norm_mean) / self.data_norm_std
+return (tensor - self.data_mean) / self.data_std

def unnormalize(self, tensor):
-return tensor * self.data_norm_std + self.data_norm_mean
+return tensor * self.data_std + self.data_mean

def _forward(self, data):
# The data in data['graph'] is numpy.ndarray, convert it to paddle.Tensor
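
A minimal round-trip sketch of the renamed normalization buffers, using toy values; paddle is the framework already used in this diff:

    import paddle

    data_mean = paddle.to_tensor(0.5)
    data_std = paddle.to_tensor(2.0)

    def normalize(t):
        # (t - mean) / std, as in DimeNetPlusPlus.normalize after the rename
        return (t - data_mean) / data_std

    def unnormalize(t):
        # t * std + mean, the exact inverse
        return t * data_std + data_mean

    y = paddle.to_tensor([1.0, 3.0])
    assert paddle.allclose(unnormalize(normalize(y)), y)
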
21 changes: 21 additions & 0 deletions ppmat/predictor/__init__.py
Collaborator comment: Why does the predictor module expose StructureSampler?

@@ -0,0 +1,21 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ppmat.predictor.base import BasePredictor
from ppmat.predictor.sample import StructureSampler

__all__ = [
"BasePredictor",
"StructureSampler",
]
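
Given the new package init, downstream code can import both names directly; a minimal sketch using only the two symbols exported by this file:

    # Only the two names in __all__ are taken from this diff.
    from ppmat.predictor import BasePredictor, StructureSampler

    print(BasePredictor.__name__, StructureSampler.__name__)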