Changes from all commits
30 commits
69aa46b  Add ase_interface with support for optimization and md (JunnHuo, Jul 10, 2025)
deea169  Keep the from_structures func in base_calculator (JunnHuo, Jul 10, 2025)
37ed942  Add predict and tasks modules; update ASE interface and property pred… (JunnHuo, Jul 15, 2025)
1c75f1d  Add example usage (JunnHuo, Jul 15, 2025)
06c3c3e  WIP: save local changes (JunnHuo, Aug 29, 2025)
ee55f16  modify err (JunnHuo, Aug 29, 2025)
0226096  Remove tests from tracking (JunnHuo, Aug 29, 2025)
c32fabd  Save local changes before rebase (JunnHuo, Sep 1, 2025)
6023fe5  Sync missing files from upstream/develop (JunnHuo, Sep 1, 2025)
0d3354f  modify train config (JunnHuo, Sep 4, 2025)
9a2dc78  revise train config (2nd version) (JunnHuo, Sep 5, 2025)
f8b5710  revise train config (3rd version) (JunnHuo, Sep 7, 2025)
24d6b70  Update training config (4th version) (JunnHuo, Sep 8, 2025)
a34fef1  Update training config (5th version) (JunnHuo, Sep 8, 2025)
59614f5  Update training config (JunnHuo, Sep 8, 2025)
6eeafb6  small fix (JunnHuo, Sep 18, 2025)
ab05190  small fix: correct structure generation (JunnHuo, Sep 18, 2025)
52f050a  save local changes before merge upstream (JunnHuo, Sep 28, 2025)
d841f0d  merge upstream develop (JunnHuo, Sep 28, 2025)
77d164f  change configs and README (JunnHuo, Oct 15, 2025)
e8ce0be  resolve config bug in predict.py (JunnHuo, Oct 16, 2025)
3ddeb0c  keep previous content (JunnHuo, Oct 17, 2025)
a83c8c4  keep previous content (JunnHuo, Oct 17, 2025)
e2ea426  keep previous content (JunnHuo, Oct 17, 2025)
578522a  keep previous content (JunnHuo, Oct 17, 2025)
b556ff1  fix: correct commit message for ppmatSim README (JunnHuo, Oct 20, 2025)
623ffc0  revert (JunnHuo, Oct 24, 2025)
70bca7f  remove experiments directory for PR review (JunnHuo, Oct 24, 2025)
d713a86  correct (JunnHuo, Oct 24, 2025)
743d006  correct (JunnHuo, Oct 24, 2025)
452 changes: 452 additions & 0 deletions ppmat/calculator/ase.py

Large diffs are not rendered by default.
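
The 452-line calculator diff is not rendered here, but per the commit history it adds an ASE interface with support for optimization and MD. For orientation, a hypothetical usage sketch of an ASE-style calculator; the class name PPMatCalculator is a placeholder, since the real class inside ppmat/calculator/ase.py is not visible in this view:

    # Hypothetical sketch only: `PPMatCalculator` is a placeholder name;
    # the actual class in ppmat/calculator/ase.py is not rendered above.
    from ase.build import bulk
    from ppmat.calculator.ase import PPMatCalculator  # placeholder name

    atoms = bulk("Cu", "fcc", a=3.6)            # build a simple test structure
    atoms.calc = PPMatCalculator()              # constructor args unknown from this view
    energy = atoms.get_potential_energy()       # standard ASE calculator entry point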

64 changes: 39 additions & 25 deletions ppmat/datasets/matbench_dataset.py
@@ -43,19 +43,21 @@ class MatbenchDataset(Dataset):
"""Matbench Dataset Handler

This class provides utilities for loading and processing the Matbench materials
-science benchmark datasets. The implementation supports loading multiple properties
-from different matbench JSON files and processing them for materials property prediction.
+science benchmark datasets.
+The implementation supports loading multiple properties from different
+matbench JSON files and processing them for materials property prediction.
Collaborator comment: Here?

**Dataset Overview**
Matbench is a benchmark suite for materials property prediction containing multiple
datasets with different properties:
- Formation Energy (mp_e_form): ~132k samples
- Band Gap (mp_gap): ~106k samples
- Shear Modulus G (elasticity_log10(G_VRH)): ~11k samples
- Bulk Modulus K (elasticity_log10(K_VRH)): ~11k samples

**Automatic Download**
-If the data directory doesn't exist, the dataset will be automatically downloaded from:
+If the data directory doesn't exist, the dataset will be
+automatically downloaded from:
https://paddle-org.bj.bcebos.com/paddlematerial/datasets/matbench/matbench.zip

**Data Format**
@@ -79,18 +81,21 @@ class MatbenchDataset(Dataset):
- "log10(K_VRH)": Log10 of bulk modulus (GPa) from elasticity_log10(K_VRH).json

Args:
-data_dir (str): Directory containing matbench JSON files.
+path (str): Directory containing matbench JSON files.
Defaults to "./data/matbench".
property_names (Optional[List[str]]): Property names to load.
-Should be selected from ["e_form", "gap pbe", "log10(G_VRH)", "log10(K_VRH)"].
+Should be selected from
+["e_form", "gap pbe", "log10(G_VRH)", "log10(K_VRH)"].
Defaults to None (loads all available properties).
build_structure_cfg (Dict, optional): Configs for building pymatgen structures.
Defaults to None.
build_graph_cfg (Dict, optional): Configs for building graphs from structures.
Defaults to None.
-transforms (Optional[Callable], optional): Preprocessing transforms for each sample.
+transforms (Optional[Callable], optional):
+Preprocessing transforms for each sample.
Defaults to None.
-cache_path (Optional[str], optional): Path for caching processed structures and graphs.
+cache_path (Optional[str], optional):
+Path for caching processed structures and graphs.
Defaults to None.
overwrite (bool, optional): Whether to overwrite existing cache files.
Defaults to False.
@@ -105,7 +110,8 @@ class MatbenchDataset(Dataset):
url = (
"https://paddle-org.bj.bcebos.com/paddlematerial/datasets/matbench/matbench.zip"
)
md5 = "71e85300825604c2e228cbbf75574906" # TODO: Replace with actual MD5 hash when available
# TODO: Replace with actual MD5 hash when available
md5 = "71e85300825604c2e228cbbf75574906"

# Property file mapping
PROPERTY_FILES = {
@@ -117,7 +123,7 @@ class MatbenchDataset(Dataset):

def __init__(
self,
-data_dir: str = "./data/matbench",
+path: str = "./data/matbench",
property_names: Optional[List[str]] = None,
build_structure_cfg: Dict = None,
build_graph_cfg: Dict = None,
@@ -132,23 +138,24 @@ def __init__(

# Check if data directory and required files exist, if not download the dataset
# This follows the same pattern as MP2018Dataset
-if not osp.exists(data_dir):
+if not osp.exists(path):
logger.message("The matbench dataset is not found. Will download it now.")
root_path = download.get_datasets_path_from_url(self.url, self.md5)
-data_dir = osp.join(root_path, self.name)
+path = osp.join(root_path, self.name)
else:
# Check if required files exist in the directory
required_files = list(self.PROPERTY_FILES.values())
-files_exist = all(osp.exists(osp.join(data_dir, f)) for f in required_files)
+files_exist = all(osp.exists(osp.join(path, f)) for f in required_files)

if not files_exist:
logger.message(
"Some matbench data files are missing. Will download the dataset now."
"Some matbench data files are missing. "
"Will download the dataset now."
)
root_path = download.get_datasets_path_from_url(self.url, self.md5)
-data_dir = osp.join(root_path, self.name)
+path = osp.join(root_path, self.name)

-self.data_dir = data_dir
+self.path = path
if isinstance(property_names, str):
property_names = [property_names]

@@ -160,7 +167,8 @@ def __init__(
for prop in property_names:
if prop not in self.PROPERTY_FILES:
raise ValueError(
f"Unknown property '{prop}'. Available properties: {list(self.PROPERTY_FILES.keys())}"
f"Unknown property '{prop}'. "
f"Available properties: {list(self.PROPERTY_FILES.keys())}"
)

self.property_names = property_names
@@ -187,7 +195,7 @@ def __init__(
else:
# Generate cache path based on data directory and properties
prop_str = "_".join(sorted(property_names))
-self.cache_path = osp.join(data_dir + "_cache", f"matbench_{prop_str}")
+self.cache_path = osp.join(path + "_cache", f"matbench_{prop_str}")
logger.info(f"Cache path: {self.cache_path}")

self.overwrite = overwrite
@@ -215,14 +223,14 @@ def load_matbench_data(self):
else:
# Multiple properties case - need to handle differently
raise NotImplementedError(
"Loading multiple properties from different files is not yet implemented. "
"Loading multiple properties from different files is not implemented. "
"Please specify only one property at a time."
)

def _load_single_property_data(self):
"""Load data for a single property from its matbench JSON file."""
prop_name = self.property_names[0]
-file_path = osp.join(self.data_dir, self.PROPERTY_FILES[prop_name])
+file_path = osp.join(self.path, self.PROPERTY_FILES[prop_name])

if not osp.exists(file_path):
raise FileNotFoundError(f"Matbench file not found: {file_path}")
@@ -232,6 +240,15 @@ def _load_single_property_data(self):
with open(file_path, "r") as f:
data = json.load(f)

# If max_samples is not None, keep only the specified number of samples
if self.max_samples is not None:
data_samples = {
"index": data["index"][: self.max_samples],
"columns": data["columns"],
"data": data["data"][: self.max_samples],
}
data = data_samples

# Validate data format
if not all(key in data for key in ["index", "columns", "data"]):
raise ValueError(f"Invalid matbench file format: {file_path}")
@@ -250,17 +267,14 @@ def _load_single_property_data(self):
structures = []
properties = []

-for i, (structure_dict, prop_value) in enumerate(data["data"]):
-    if self.max_samples is not None and i >= self.max_samples:
-        break
-
+for structure_dict, prop_value in data["data"]:
# Convert structure dict to pymatgen Structure
try:
structure = Structure.from_dict(structure_dict)
structures.append(structure)
properties.append(prop_value)
except Exception as e:
logger.warning(f"Failed to parse structure {i} in {file_path}: {e}")
logger.warning(f"Failed to parse structure in {file_path}: {e}")
if not self.filter_unvalid:
structures.append(None)
properties.append(None)
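
To make the renamed argument concrete, a minimal usage sketch; the import path is inferred from the file location in this diff, and relying on defaults for the remaining configs is an assumption, not part of the PR:

    # Minimal sketch of the renamed `path` argument (formerly `data_dir`);
    # import path assumed from the file location above.
    from ppmat.datasets.matbench_dataset import MatbenchDataset

    dataset = MatbenchDataset(
        path="./data/matbench",     # auto-downloads if the files are missing
        property_names=["e_form"],  # one property at a time; several raise NotImplementedError
    )
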
5 changes: 5 additions & 0 deletions ppmat/datasets/mp2018_dataset.py
@@ -337,6 +337,11 @@ def read_data(self, path: str):
for idx in idxs:
data[key].append(json_data[key][idx])

# Example: Load a small subset of the dataset for quick testing
# num_samples = 1000
# for key in data.keys():
# data[key] = data[key][:num_samples]

return data, num_samples

def filter_unvalid_by_property(self):
8 changes: 7 additions & 1 deletion ppmat/datasets/mp20_dataset.py
@@ -307,6 +307,12 @@ def read_data(self, path: str):
num_samples = 0
for key in data:
num_samples = max(num_samples, len(data[key]))

# Example: Load a small subset of the dataset for quick testing
# num_samples = 100
# for key in data.keys():
# data[key] = data[key][:num_samples]

return data, num_samples

def filter_unvalid_by_property(self):
@@ -440,6 +446,6 @@ class AlexMP20MatterGenDataset(MP20Dataset):
the mp20 dataset used for mattergen.
"""

name = "alex_mp_20_mattergen"
name = "alex_mp_20"
url = "https://paddle-org.bj.bcebos.com/paddlematerial/datasets/alex_mp_20/alex_mp_20.zip"
md5 = "624361c17259cc3af63a00b29fffe9cd"
12 changes: 12 additions & 0 deletions ppmat/datasets/mptrj_dataset.py
@@ -339,6 +339,18 @@ def read_data(self, path: str):
path (str): Path to the data.
"""
json_data = read_json(path)

Collaborator comment: Why was this change made here?

# # Example: Load a small subset of the dataset for quick testing
# num_samples = 1000
# json_data_subset = {}
# count = 0
# for k, v in json_data.items():
# json_data_subset[k] = v
# count += 1
# if count >= num_samples:
# break
# json_data = json_data_subset

return json_data

def filter_unvalid_by_property(self):
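
The commented-out subset loop above can be written more compactly; a sketch assuming json_data is a plain dict, equivalent to the counting loop in the comment:

    import itertools

    # Keep only the first `num_samples` entries of a dict, preserving
    # insertion order; equivalent to the commented-out counting loop above.
    num_samples = 1000
    json_data = {f"frame_{i}": i for i in range(5000)}  # stand-in for read_json(path)
    json_data = dict(itertools.islice(json_data.items(), num_samples))
    assert len(json_data) == num_samples
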
1 change: 1 addition & 0 deletions ppmat/models/__init__.py
@@ -195,6 +195,7 @@ def build_model(
def build_model_from_name(model_name: str, weights_name: Optional[str] = None):
path = download.get_weights_path_from_url(MODEL_REGISTRY[model_name])
path = osp.join(path, model_name)
logger.info(f"Save model and configuration files in path: {path}")
config_path = osp.join(path, f"{model_name}.yaml")
if not osp.exists(config_path):
logger.warning(
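
For context on the added log line, a hedged call sketch; the registry key used here is hypothetical, and treating the return value as a ready-to-use model is an assumption not confirmed by this hunk:

    # Hypothetical sketch: "some_registered_model" must be a key of
    # MODEL_REGISTRY; the hunk only shows that weights and a YAML config
    # are downloaded and their location logged.
    from ppmat.models import build_model_from_name

    model = build_model_from_name("some_registered_model")  # hypothetical key
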
16 changes: 8 additions & 8 deletions ppmat/models/dimenetpp/dimenetpp.py
@@ -310,9 +310,9 @@ class DimeNetPlusPlus(paddle.nn.Layer):
a graph-level feature (“mean” or “sum”). Defaults to "mean".
property_names (Optional[str], optional): A comma-separated list of
target property names to predict. Defaults to "formation_energy_per_atom".
-data_norm_mean (float, optional): The mean used for normalizing target values.
+data_mean (float, optional): The mean used for normalizing target values.
Collaborator comment: Why was this changed?

Defaults to 0.0.
-data_norm_std (float, optional): The standard deviation used for
+data_std (float, optional): The standard deviation used for
normalizing target values. Defaults to 1.0.
loss_type (str, optional): Loss type, can be 'mse_loss' or 'l1_loss'.
Defaults to "l1_loss".
@@ -339,8 +339,8 @@ def __init__(
num_output_layers: int = 3,
readout: str = "mean",
property_names: Optional[str] = "formation_energy_per_atom",
-data_norm_mean: float = 0.0,
-data_norm_std: float = 1.0,
+data_mean: float = 0.0,
+data_std: float = 1.0,
loss_type: str = "l1_loss",
act: str = "swish",
):
@@ -357,10 +357,10 @@ def __init__(
assert isinstance(property_names, str)
self.property_names = property_names
self.register_buffer(
-tensor=paddle.to_tensor(data_norm_mean), name="data_norm_mean"
+tensor=paddle.to_tensor(data_mean), name="data_mean"
)
self.register_buffer(
-tensor=paddle.to_tensor(data_norm_std), name="data_norm_std"
+tensor=paddle.to_tensor(data_std), name="data_std"
)

# basis layers
@@ -445,10 +445,10 @@ def triplets(self, edge_index, num_nodes):
)

def normalize(self, tensor):
-return (tensor - self.data_norm_mean) / self.data_norm_std
+return (tensor - self.data_mean) / self.data_std

def unnormalize(self, tensor):
-return tensor * self.data_norm_std + self.data_norm_mean
+return tensor * self.data_std + self.data_mean

def _forward(self, data):
# The data in data['graph'] is numpy.ndarray, convert it to paddle.Tensor
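
A minimal round-trip sketch of the renamed normalization buffers, using toy values; paddle is the framework already used in this diff:

    import paddle

    data_mean = paddle.to_tensor(0.5)
    data_std = paddle.to_tensor(2.0)

    def normalize(t):
        # (t - mean) / std, as in DimeNetPlusPlus.normalize after the rename
        return (t - data_mean) / data_std

    def unnormalize(t):
        # t * std + mean, the exact inverse
        return t * data_std + data_mean

    y = paddle.to_tensor([1.0, 3.0])
    assert paddle.allclose(unnormalize(normalize(y)), y)
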
21 changes: 21 additions & 0 deletions ppmat/predictor/__init__.py
Collaborator comment: Why does the predictor module expose StructureSampler?

@@ -0,0 +1,21 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from ppmat.predictor.base import BasePredictor
from ppmat.predictor.sample import StructureSampler

__all__ = [
"BasePredictor",
"StructureSampler",
]
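
Given the new package init, downstream code can import both names directly; a minimal sketch using only the two symbols exported by this file:

    # Only the two names in __all__ are taken from this diff.
    from ppmat.predictor import BasePredictor, StructureSampler

    print(BasePredictor.__name__, StructureSampler.__name__)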