diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 00000000..1bccc1fa
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+*.h5 filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/workflows/pytest.yaml b/.github/workflows/pytest.yaml
index d6dcaec1..17af60f8 100644
--- a/.github/workflows/pytest.yaml
+++ b/.github/workflows/pytest.yaml
@@ -18,10 +18,13 @@ jobs:
         # windows-latest is not supported because pyscf is not supported on windows
         # https://pyscf.org/user/install.html
         os: ["ubuntu-latest", "macos-latest"]
-        py: ["3.9", "3.10", "3.11", "3.12"]
+        py: ["3.10", "3.11", "3.12"]

    steps:
      - uses: "actions/checkout@v4"
+        # Fetch Git LFS files (the HDF5 databases are tracked with LFS)
+        with:
+          lfs: true

      - name: Setup python for test ${{ matrix.py }}
        uses: actions/setup-python@v5
@@ -30,13 +33,14 @@ jobs:

      - name: Install development version
        run: |
-          pip install -v .
+          pip install -e .

      - name: Install extra test dependencies
        run: |
          pip install --upgrade pip
          pip install .[test_extra]

+
      - name: Run pytest default tests
        uses: pavelzw/pytest-action@v2
        with:
@@ -59,3 +63,4 @@ jobs:
          click-to-expand: true
          report-title: 'Dev Test Report'
          pytest-args: '-m dev'
+
diff --git a/atomdb/data/database_beta_1.3.0.h5 b/atomdb/data/database_beta_1.3.0.h5
index 744f7497..b4904224 100644
Binary files a/atomdb/data/database_beta_1.3.0.h5 and b/atomdb/data/database_beta_1.3.0.h5 differ
diff --git a/atomdb/data/elements_data.h5 b/atomdb/data/elements_data.h5
new file mode 100644
index 00000000..f8cb3e66
--- /dev/null
+++ b/atomdb/data/elements_data.h5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fde2a5f5db8c0adb8418016ea8b85d5f12af4e2e40a7f07bef7bcfe474ae3e81
+size 105117616
diff --git a/atomdb/datasets/datasets_data.h5 b/atomdb/datasets/datasets_data.h5
new file mode 100644
index 00000000..b4c77869
--- /dev/null
+++ b/atomdb/datasets/datasets_data.h5
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:afcf1e437f143d5861f8c30cd890cc7996c40ddc12bb112d949f4db64537ed74
+size 922707071
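The two `.h5` entries above are Git LFS pointer stubs rather than the binary payloads themselves. A quick way to confirm that a checkout actually materialized them is the minimal sketch below; it assumes `git lfs pull` has run and the package is importable:

```python
# Minimal sanity check that the LFS-tracked databases were materialized as real
# HDF5 files rather than left as text pointer stubs (assumes `git lfs pull` ran).
from importlib_resources import files
import tables as pt

for pkg, fname in [("atomdb.data", "elements_data.h5"), ("atomdb.datasets", "datasets_data.h5")]:
    h5path = files(pkg).joinpath(fname)
    with pt.open_file(str(h5path), mode="r") as h5file:
        print(fname, "->", h5file.root._v_title)
```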
diff --git a/atomdb/datasets/slater/h5file_creator.py b/atomdb/datasets/slater/h5file_creator.py
new file mode 100644
index 00000000..c5c8033b
--- /dev/null
+++ b/atomdb/datasets/slater/h5file_creator.py
@@ -0,0 +1,420 @@
+import warnings
+import numpy as np
+from importlib_resources import files
+import tables as pt
+from dataclasses import asdict
+from atomdb.datasets.slater.run import NPOINTS
+from atomdb.periodic_test import element_symbol_map, get_scalar_data, ElementAttr
+
+
+# Suppress NaturalNameWarning messages from PyTables.
+warnings.filterwarnings("ignore", category=pt.NaturalNameWarning)
+
+max_norba = 56
+
+SLATER_PROPERTY_CONFIGS = [
+    {"SpeciesInfo": "elem", "type": "string"},
+    {"SpeciesInfo": "nexc", "type": "int"},
+    {"SpeciesInfo": "charge", "type": "int"},
+    {"SpeciesInfo": "mult", "type": "int"},
+    {"SpeciesInfo": "nelec", "type": "int"},
+    {"SpeciesInfo": "nspin", "type": "int"},
+    {"SpeciesInfo": "energy", "type": "float"},
+    {"SpeciesInfo": "ip", "type": "float"},
+    {"SpeciesInfo": "mu", "type": "float"},
+    {"SpeciesInfo": "eta", "type": "float"},
+    {"SpeciesInfo": "nbasis", "type": "int"},
+    {
+        "property": "obasis_name",
+        "table_name": "obasis_name",
+        "description": "Orbital basis name",
+        "type": "string",
+    },
+    {"array_property": "mo_energy_a", "table_name": "mo_energy_a", "description": "Alpha MO Energies"},
+    {"array_property": "mo_energy_b", "table_name": "mo_energy_b", "description": "Beta MO Energies"},
+    {"array_property": "mo_occs_a", "table_name": "mo_occs_a", "description": "Alpha MO Occupations"},
+    {"array_property": "mo_occs_b", "table_name": "mo_occs_b", "description": "Beta MO Occupations"},
+    {"Carray_property": "rs", "table_name": "rs", "folder": "RadialGrid", "spins": "no"},
+    {"Carray_property": "mo_dens_a", "table_name": "mo_dens_a", "folder": "Density", "spins": "yes"},
+    {"Carray_property": "mo_dens_b", "table_name": "mo_dens_b", "folder": "Density", "spins": "yes"},
+    {"Carray_property": "dens_tot", "table_name": "dens_tot", "folder": "Density", "spins": "no"},
+    {"Carray_property": "mo_d_dens_a", "table_name": "mo_d_dens_a", "folder": "DensityGradient", "spins": "yes"},
+    {"Carray_property": "mo_d_dens_b", "table_name": "mo_d_dens_b", "folder": "DensityGradient", "spins": "yes"},
+    {"Carray_property": "d_dens_tot", "table_name": "d_dens_tot", "folder": "DensityGradient", "spins": "no"},
+    {"Carray_property": "mo_dd_dens_a", "table_name": "mo_dd_dens_a", "folder": "DensityLaplacian", "spins": "yes"},
+    {"Carray_property": "mo_dd_dens_b", "table_name": "mo_dd_dens_b", "folder": "DensityLaplacian", "spins": "yes"},
+    {"Carray_property": "dd_dens_tot", "table_name": "dd_dens_tot", "folder": "DensityLaplacian", "spins": "no"},
+    {"Carray_property": "mo_ked_a", "table_name": "mo_ked_a", "folder": "KineticEnergyDensity", "spins": "yes"},
+    {"Carray_property": "mo_ked_b", "table_name": "mo_ked_b", "folder": "KineticEnergyDensity", "spins": "yes"},
+    {"Carray_property": "ked_tot", "table_name": "ked_tot", "folder": "KineticEnergyDensity", "spins": "no"},
+]
+
+
+class IntPropertyDescription(pt.IsDescription):
+    value = pt.Int32Col()
+
+
+class StringPropertyDescription(pt.IsDescription):
+    value = pt.StringCol(25)
+
+
+class FloatPropertyDescription(pt.IsDescription):
+    value = pt.Float64Col()
+
+
+# Static definition: arrays are zero-padded to the largest orbital count (max_norba)
+class ArrayPropertyDescription(pt.IsDescription):
+    value = pt.Float64Col(shape=(max_norba,))
+
+
+class SpeciesInfo(pt.IsDescription):
+    """Schema for the species_info table."""
+
+    elem = pt.StringCol(25)
+    charge = pt.Int32Col()
+    mult = pt.Int32Col()
+    nexc = pt.Int32Col()
+    nelec = pt.Int32Col()
+    nspin = pt.Int32Col()
+    nbasis = pt.Int32Col()
+    energy = pt.Float64Col()
+    ip = pt.Float64Col()
+    mu = pt.Float64Col()
+    eta = pt.Float64Col()
+
+
+def create_species_info_table(species_info_table_row, prop_name, prop_type, value):
+    """Set a property value on a row of the species_info table.
+
+    Args:
+        species_info_table_row (tables.tableextension.Row): The row of the table that holds all the columns.
+        prop_name (str): Name of the property column to set.
+        prop_type (str): Data type of the property ('int', 'string', or 'float').
+        value: The value to store in the column.
+    """
+    if prop_type == "int":
+        value = int(value) if value is not None else 0
+
+    elif prop_type == "string":
+        value = str(value) if value is not None else ""
+
+    elif prop_type == "float":
+        value = float(value) if value is not None else np.nan
+
+    species_info_table_row[prop_name] = value
+
+
+def create_properties_tables(hdf5_file, parent_folder, config, value):
+    """Create a table for storing a single-valued property in the HDF5 file.
+
+    Args:
+        hdf5_file (tables.File): The open HDF5 file where the table will be created.
+        parent_folder (tables.Group): The parent folder in the HDF5 file where the table will be stored.
+        config (dict): Configuration dictionary containing table metadata, including:
+            - 'table_name': Name of the table.
+            - 'description': Description of the table.
+            - 'type': Data type of the property ('int', 'string', or 'float').
+        value: The value to store in the table.
+    """
+    # Extract table metadata from config
+    table_name = config["table_name"]
+    table_description = config["description"]
+    prop_type = config["type"]  # avoid shadowing the builtin `type`
+
+    if prop_type == "int":
+        row_description = IntPropertyDescription
+        value = int(value) if value is not None else 0
+
+    elif prop_type == "string":
+        row_description = StringPropertyDescription
+        value = str(value) if value is not None else ""
+
+    elif prop_type == "float":
+        row_description = FloatPropertyDescription
+        value = float(value) if value is not None else np.nan
+
+    # Create the table and populate the data
+    table = hdf5_file.create_table(parent_folder, table_name, row_description, table_description)
+    row = table.row
+    row["value"] = value
+    row.append()
+    table.flush()
+
+
+def create_properties_arrays(hdf5_file, parent_folder, table_name, description, data):
+    """Create a table for storing an array property in the HDF5 file.
+
+    Args:
+        hdf5_file (tables.File): The open HDF5 file where the array will be created.
+        parent_folder (tables.Group): The parent folder in the HDF5 file where the table will be stored.
+        table_name (str): Name of the table to create.
+        description (str): Description of the table.
+        data (numpy.ndarray): The array data to store in the table.
+    """
+    filters = pt.Filters(complevel=5, complib="blosc2")
+
+    # Create the table and populate the data, zero-padding up to max_norba entries
+    table = hdf5_file.create_table(
+        parent_folder, table_name, ArrayPropertyDescription, description, filters=filters
+    )
+    row = table.row
+    padded_data = np.pad(data, (0, max_norba - len(data)), "constant", constant_values=0)
+    row["value"] = padded_data
+    row.append()
+    table.flush()
+
+
+def create_spins_array(h5file, parent_folder, key, array_data, shape):
+    """Create a CArray for storing spin-dependent array data in the HDF5 file.
+
+    Args:
+        h5file (tables.File): The open HDF5 file where the CArray will be created.
+        parent_folder (tables.Group): The parent folder in the HDF5 file where the CArray will be stored.
+        key (str): Name of the CArray.
+        array_data (numpy.ndarray): The array data to store in the CArray.
+        shape (int): The total size of the CArray.
+    """
+    data_length = len(array_data)
+    filters = pt.Filters(complevel=5, complib="blosc2")
+
+    # Create the CArray and populate the data, zero-filling the unused tail
+    array = h5file.create_carray(
+        parent_folder, key, pt.Float64Atom(), shape=(shape,), filters=filters
+    )
+    array[:data_length] = array_data
+    array[data_length:] = 0
+
+
+def create_tot_array(h5file, parent_folder, key, array_data):
+    """Create a CArray for storing total (non-spin-dependent) array data in the HDF5 file.
+
+    Args:
+        h5file (tables.File): The open HDF5 file where the CArray will be created.
+        parent_folder (tables.Group): The parent folder in the HDF5 file where the CArray will be stored.
+        key (str): Name of the CArray.
+        array_data (numpy.ndarray): The array data to store in the CArray.
+    """
+    data_length = len(array_data)
+    filters = pt.Filters(complevel=5, complib="blosc2")
+
+    # Create the CArray and populate the data
+    tot_gradient_array = h5file.create_carray(
+        parent_folder, key, pt.Float64Atom(), shape=(NPOINTS,), filters=filters
+    )
+    if data_length < NPOINTS:
+        tot_gradient_array[:data_length] = array_data
+        tot_gradient_array[data_length:] = 0
+
+    else:
+        tot_gradient_array[:] = array_data
+
+
+def create_hdf5_file(DATASETS_H5FILE, fields, dataset, mult):
+    """Create an HDF5 folder hierarchy with structured data for a specific dataset and element.
+
+    Args:
+        DATASETS_H5FILE (tables.File): An open PyTables HDF5 file object to store the data.
+        fields (dataclass): A dataclass containing the fields to store in the HDF5 file.
+        dataset (str): Name of the dataset.
+        mult (int): Multiplicity.
+    """
+    fields = asdict(fields)
+    dataset = dataset.lower()
+    shape = NPOINTS * max_norba
+
+    elem = fields["elem"]
+    nexc = fields["nexc"]
+    atnum = element_symbol_map[elem][ElementAttr.atnum]
+    charge = atnum - fields["nelec"]
+
+    # NOTE: charge and mult could be derived from `fields` instead of being passed in.
+    dataset_folder = f"/Datasets/{dataset}"
+    elem_folder = f"{dataset_folder}/{elem}"
+    specific_elem_folder = f"{elem_folder}/{elem}_{charge:03d}_{mult:03d}_{nexc:03d}"
+
+    # Create dataset folder if it doesn't exist
+    if dataset_folder not in DATASETS_H5FILE:
+        DATASETS_H5FILE.create_group("/Datasets", dataset, f"{dataset} Data")
+
+    # Create element folder if it doesn't exist
+    if elem_folder not in DATASETS_H5FILE:
+        DATASETS_H5FILE.create_group(dataset_folder, elem, f"{elem} Data")
+
+    # Create specific element folder (charge/mult/nexc) if it doesn't exist
+    if specific_elem_folder not in DATASETS_H5FILE:
+        DATASETS_H5FILE.create_group(
+            elem_folder,
+            f"{elem}_{charge:03d}_{mult:03d}_{nexc:03d}",
+            f"{elem} {charge} {mult} {nexc} Data",
+        )
+
+    folders = {
+        "Properties": DATASETS_H5FILE.create_group(
+            specific_elem_folder, "Properties", "Properties Data"
+        ),
+        "RadialGrid": DATASETS_H5FILE.create_group(
+            specific_elem_folder, "RadialGrid", "Radial Grid Data"
+        ),
+        "Density": DATASETS_H5FILE.create_group(specific_elem_folder, "Density", "Density Data"),
+        "DensityGradient": DATASETS_H5FILE.create_group(
+            specific_elem_folder, "DensityGradient", "Density Gradient Data"
+        ),
+        "DensityLaplacian": DATASETS_H5FILE.create_group(
+            specific_elem_folder, "DensityLaplacian", "Density Laplacian Data"
+        ),
+        "KineticEnergyDensity": DATASETS_H5FILE.create_group(
+            specific_elem_folder, "KineticEnergyDensity", "Kinetic Energy Density Data"
+        ),
+    }
+
+    # Create the basic species table and its row
+    species_info_table = DATASETS_H5FILE.create_table(
+        folders["Properties"], "species_info", SpeciesInfo, "Species Information"
+    )
+    species_info_table_row = species_info_table.row
+
+    # Populate the basic property tables
+    for config in SLATER_PROPERTY_CONFIGS:
+        if "SpeciesInfo" in config:
+            prop_name = config["SpeciesInfo"]
+            create_species_info_table(
+                species_info_table_row, prop_name, config["type"], fields[prop_name]
+            )
+
+        elif "property" in config:
+            prop_name = config["property"]
+            create_properties_tables(
+                DATASETS_H5FILE, folders["Properties"], config, fields[prop_name]
+            )
+
+        # Create array property tables
+        elif "array_property" in config:
+            prop_name = config["array_property"]
+            create_properties_arrays(
+                DATASETS_H5FILE,
+                folders["Properties"],
+                config["table_name"],
+                config["description"],
+                fields[prop_name],
+            )
+
+        elif "Carray_property" in config:
+            prop_name = config["Carray_property"]
+            parent_folder = folders[config["folder"]]
+            if config["spins"] == "yes":
+                create_spins_array(
+                    DATASETS_H5FILE, parent_folder, config["table_name"], fields[prop_name], shape
+                )
+            elif config["spins"] == "no":
+                create_tot_array(
+                    DATASETS_H5FILE, parent_folder, config["table_name"], fields[prop_name]
+                )
+
+    species_info_table_row.append()
+    species_info_table.flush()
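A hypothetical end-to-end sketch of the writer above: compile one species with the Slater `run` script (shown in the next file) and append it to `datasets_data.h5`. It assumes the raw Slater tables are available under `DEFAULT_DATAPATH`:

```python
# Sketch only: compile one species and write it into the bundled datasets file.
import tables as pt
from importlib_resources import files

from atomdb.datasets.slater.run import run
from atomdb.datasets.slater.h5file_creator import create_hdf5_file
from atomdb.utils import DEFAULT_DATAPATH

fields = run("Be", 0, 1, 0, "slater", DEFAULT_DATAPATH)  # returns a DefinitionClass
h5path = files("atomdb.datasets").joinpath("datasets_data.h5")
with pt.open_file(str(h5path), mode="a") as h5file:
    create_hdf5_file(h5file, fields, "slater", mult=1)
```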
diff --git a/atomdb/datasets/slater/run.py b/atomdb/datasets/slater/run.py
index 391e0c91..a74f10b7 100644
--- a/atomdb/datasets/slater/run.py
+++ b/atomdb/datasets/slater/run.py
@@ -19,14 +19,15 @@
 import re

 import atomdb
-from atomdb.periodic import Element
 from grid.onedgrid import UniformInteger
 from grid.rtransform import ExpRTransform

 # from importlib_resources import files
 from atomdb.utils import DEFAULT_DATAPATH
 from scipy.special import factorial
-
+from dataclasses import dataclass
+from typing import Optional, Dict
+from atomdb.periodic_test import element_symbol_map, get_scalar_data, ElementAttr

 __all__ = ["AtomicDensity", "load_slater_wfn", "run"]

@@ -39,6 +40,65 @@
 # DATAPATH = os.path.abspath(DATAPATH._paths[0])


+@dataclass
+class DefinitionClass:
+    """Data structure for the Slater dataset."""
+
+    # Species info
+    elem: str
+    atnum: int
+    nelec: int
+    nspin: int
+    nexc: int
+    nbasis: int
+    charge: int
+    mult: int
+    obasis_name: str
+
+    # Properties aggregated from multiple sources, keyed by source name
+    atmass: Optional[Dict[str, float]]
+    cov_radius: Optional[Dict[str, float]]
+    vdw_radius: Optional[Dict[str, float]]
+    at_radius: Optional[Dict[str, float]]
+    polarizability: Optional[Dict[str, float]]
+    dispersion: Optional[Dict[str, float]]
+
+    # Scalar energy and CDFT-related properties
+    energy: Optional[float]
+    ip: Optional[float]
+    mu: Optional[float]
+    eta: Optional[float]
+
+    # Orbital energies and occupations
+    mo_energy_a: Optional[np.ndarray]
+    mo_energy_b: Optional[np.ndarray]
+    mo_occs_a: Optional[np.ndarray]
+    mo_occs_b: Optional[np.ndarray]
+
+    # Radial grid
+    rs: Optional[np.ndarray] = None
+
+    # Density
+    mo_dens_a: Optional[np.ndarray] = None
+    mo_dens_b: Optional[np.ndarray] = None
+    dens_tot: Optional[np.ndarray] = None
+
+    # Density gradient
+    mo_d_dens_a: Optional[np.ndarray] = None
+    mo_d_dens_b: Optional[np.ndarray] = None
+    d_dens_tot: Optional[np.ndarray] = None
+
+    # Density Laplacian
+    mo_dd_dens_a: Optional[np.ndarray] = None
+    mo_dd_dens_b: Optional[np.ndarray] = None
+    dd_dens_tot: Optional[np.ndarray] = None
+
+    # Kinetic energy density
+    mo_ked_a: Optional[np.ndarray] = None
+    mo_ked_b: Optional[np.ndarray] = None
+    ked_tot: Optional[np.ndarray] = None
+
+
 class AtomicDensity:
     r"""
     Atomic Density Class.
@@ -1067,7 +1127,7 @@ def run(elem, charge, mult, nexc, dataset, datapath):

     # Set up internal variables
     elem = atomdb.element_symbol(elem)
-    atnum = atomdb.element_number(elem)
+    atnum = element_symbol_map[elem][ElementAttr.atnum]
     nelec = atnum - charge
     nspin = mult - 1

@@ -1088,6 +1148,8 @@ def run(elem, charge, mult, nexc, dataset, datapath):
     # Get electronic structure data
     energy = species.energy[0]  # get energy from list
     norba = len(mo_occ) // 2
+    nbasis = norba
+
     # Get MO energies and occupations
     mo_e_up = species.orbitals_energy.ravel()[:norba]
     mo_e_dn = species.orbitals_energy.ravel()[norba:]
@@ -1121,17 +1183,13 @@ def run(elem, charge, mult, nexc, dataset, datapath):
     mo_ked_a = species.eval_orbs_ked_positive_definite(rs)[:norba, :]
     mo_ked_b = species.eval_orbs_ked_positive_definite(rs)[:norba, :]

-    # Get information about the element
-    atom = Element(elem)
-    atmass = atom.mass
-    cov_radius, vdw_radius, at_radius, polarizability, dispersion = [
-        None,
-    ] * 5
-    # overwrite values for neutral atomic species
-    if charge == 0:
-        cov_radius, vdw_radius, at_radius = (atom.cov_radius, atom.vdw_radius, atom.at_radius)
-        polarizability = atom.pold
-        dispersion = {"C6": atom.c6}
+    # Get periodic data
+    cov_radius = get_scalar_data("cov_radius", atnum, nelec)
+    vdw_radius = get_scalar_data("vdw_radius", atnum, nelec)
+    at_radius = get_scalar_data("at_radius", atnum, nelec)
+    polarizability = get_scalar_data("polarizability", atnum, nelec)
+    dispersion = get_scalar_data("dispersion", atnum, nelec)
+    atmass = get_scalar_data("atmass", atnum, nelec)

     # Conceptual-DFT properties (WIP)
     ip = -mo_e_up[np.sum(occs_up) - 1]  # - energy of HOMO
@@ -1139,10 +1197,13 @@ def run(elem, charge, mult, nexc, dataset, datapath):
     mu = None
     eta = None

-    # Return Species instance
-    fields = dict(
+    # Return fields
+    fields = DefinitionClass(
         elem=elem,
+        charge=charge,
+        mult=mult,
         atnum=atnum,
+        nbasis=norba,
         obasis_name="Slater",
         nelec=nelec,
         nspin=nspin,
@@ -1179,4 +1240,4 @@ def run(elem, charge, mult, nexc, dataset, datapath):
         mo_ked_b=mo_ked_b.flatten(),
         ked_tot=ked_tot,
     )
-    return atomdb.Species(dataset, fields)
+    return fields
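Because `DefinitionClass` is a plain dataclass, the HDF5 writer can flatten it with `dataclasses.asdict`; adding a new per-species quantity only requires a field on the dataclass plus a matching entry in `SLATER_PROPERTY_CONFIGS`. A small sketch (same assumption as above about the raw Slater data being present):

```python
# Sketch: round-trip a compiled species through asdict, as the writer does.
from dataclasses import asdict

from atomdb.datasets.slater.run import run
from atomdb.utils import DEFAULT_DATAPATH

fields = run("H", 0, 2, 0, "slater", DEFAULT_DATAPATH)
record = asdict(fields)  # plain dict of scalars and numpy arrays
print(record["elem"], record["energy"], record["rs"].shape)
```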
diff --git a/atomdb/migration/datasets/datasets_data.py b/atomdb/migration/datasets/datasets_data.py
new file mode 100644
index 00000000..d9557425
--- /dev/null
+++ b/atomdb/migration/datasets/datasets_data.py
@@ -0,0 +1,20 @@
+"""Recreate datasets_data.h5 from scratch, leaving an empty group for each dataset."""
+
+from importlib_resources import files
+import tables as pt
+
+hdf5_file = files("atomdb.datasets").joinpath("datasets_data.h5")
+
+with pt.open_file(hdf5_file, mode="w", title="Datasets Data Files") as h5file:
+    # create the root folder 'Datasets'
+    datasets_folder = h5file.create_group("/", "Datasets", "Datasets Data")
+
+    # create a folder for each dataset to hold its data files
+    h5file.create_group(datasets_folder, "slater", "Slater dataset")
+    h5file.create_group(datasets_folder, "gaussian", "Gaussian dataset")
+    h5file.create_group(datasets_folder, "hci", "HCI dataset")
+    h5file.create_group(datasets_folder, "nist", "NIST dataset")
+    h5file.create_group(datasets_folder, "numeric", "Numeric dataset")
+    h5file.create_group(datasets_folder, "uhf_augccpvdz", "UHF aug-cc-pVDZ dataset")
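After running the migration script above, the file should contain nothing but the empty per-dataset groups. A short inspection sketch:

```python
# Walk the freshly created file and print the group hierarchy under /Datasets.
import tables as pt
from importlib_resources import files

h5path = files("atomdb.datasets").joinpath("datasets_data.h5")
with pt.open_file(str(h5path), mode="r") as h5file:
    for group in h5file.walk_groups("/Datasets"):
        print(group._v_pathname)
```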
diff --git a/atomdb/migration/periodic/elements_data.py b/atomdb/migration/periodic/elements_data.py
new file mode 100644
index 00000000..33a6c636
--- /dev/null
+++ b/atomdb/migration/periodic/elements_data.py
@@ -0,0 +1,374 @@
+import csv
+import tables as pt
+import numpy as np
+from importlib_resources import files
+import warnings
+from atomdb.utils import CONVERTOR_TYPES
+
+# Suppress NaturalNameWarning messages from PyTables.
+warnings.filterwarnings("ignore", category=pt.NaturalNameWarning)
+
+# Set up input/output paths
+elements_data_csv = files("atomdb.data").joinpath("elements_data.csv")
+data_info_csv = files("atomdb.data").joinpath("data_info.csv")
+hdf5_file = files("atomdb.data").joinpath("elements_data.h5")
+
+
+# Properties stored for each element in the HDF5 file.
+PROPERTY_CONFIGS = [
+    {"basic_property": "atnum", "table_name": "atnum", "description": "Atomic Number", "type": "int"},
+    {"basic_property": "symbol", "table_name": "symbol", "description": "Atom Symbol", "type": "string"},
+    {"basic_property": "name", "table_name": "name", "description": "Atom Name", "type": "string"},
+    {"basic_property": "group", "table_name": "group", "description": "Atom Group", "type": "int"},
+    {"basic_property": "period", "table_name": "period", "description": "Atom Period", "type": "int"},
+    {"basic_property": "mult", "table_name": "mult", "description": "Atom Multiplicity", "type": "int"},
+    {"property": "cov_radius", "table_name": "cov_radius", "description": "Covalent Radius"},
+    {"property": "vdw_radius", "table_name": "vdw_radius", "description": "Van der Waals Radius"},
+    {"property": "at_radius", "table_name": "at_radius", "description": "Atomic Radius"},
+    {"property": "mass", "table_name": "atmass", "description": "Atomic Mass"},
+    {"property": "pold", "table_name": "polarizability", "description": "Polarizability"},
+    {"property": "c6", "table_name": "dispersion_c6", "description": "C6 Dispersion Coefficient"},
+    {"property": "eneg", "table_name": "eneg", "description": "Electronegativity"},
+]
+
+
+class NumberElementDescription(pt.IsDescription):
+    value = pt.Int32Col()
+
+
+class StringElementDescription(pt.IsDescription):
+    value = pt.StringCol(25)
+
+
+class PropertyValues(pt.IsDescription):
+    """Schema for property value tables."""
+
+    source = pt.StringCol(30, pos=0)
+    unit = pt.StringCol(20, pos=1)
+    value = pt.Float64Col(pos=2)
+
+
+class ElementsDataInfo(pt.IsDescription):
+    """Schema for the property_info table."""
+
+    property_key = pt.StringCol(20, pos=0)
+    property_name = pt.StringCol(50, pos=1)
+    source_key = pt.StringCol(30, pos=2)
+    property_description = pt.StringCol(250, pos=3)
+    reference = pt.StringCol(250, pos=4)
+    doi = pt.StringCol(150, pos=5)
+    notes = pt.StringCol(500, pos=6)
+
+
+def create_properties_tables(
+    hdf5_file,
+    parent_folder,
+    table_name,
+    table_description,
+    row_description,
+    columns,
+    row_data,
+    sources_data,
+    units_data,
+):
+    """
+    Create a table in the HDF5 file for a specific property.
+
+    Args:
+        hdf5_file: PyTables file object.
+        parent_folder: Group where the table will be created.
+        table_name (str): Name of the table.
+        table_description (str): Description of the table.
+        row_description: PyTables IsDescription class for the table schema.
+        columns (list): List of column names from the CSV to include.
+        row_data (dict): Data for the current element.
+        sources_data (dict): Source of each property.
+        units_data (dict): Unit of each property.
+    """
+    # Create a new table in the HDF5 file.
+    table = hdf5_file.create_table(parent_folder, table_name, row_description, table_description)
+
+    # Iterate over the list of columns relevant to the current table.
+    for col in columns:
+        source = sources_data.get(col, "unknown")  # default to 'unknown' if not found
+        unit = units_data.get(col, "unknown")  # default to 'unknown' if not found
+        value = np.nan
+
+        if col in row_data and row_data[col].strip():
+            try:
+                value = float(row_data[col])
+                value = CONVERTOR_TYPES[unit](value)
+            except (ValueError, TypeError):
+                value = np.nan
+
+        # Create a new row in the table.
+        row = table.row
+        row["source"] = source.encode("utf-8") if source else ""
+        row["unit"] = unit.encode("utf-8") if unit else ""
+        row["value"] = value
+        row.append()
+
+    # Flush the table to ensure all data is written to the HDF5 file.
+    table.flush()
+
+
+def create_basic_properties_tables(
+    hdf5_file, parent_folder, table_name, row_description, table_description, value, prop_type
+):
+    """
+    Create a table for a single basic property.
+
+    Args:
+        hdf5_file: PyTables file object.
+        parent_folder: Group where the table will be created.
+        table_name (str): Name of the table.
+        row_description: PyTables IsDescription class for the table schema.
+        table_description (str): Description of the table.
+        value (int or str): The value to store in the table.
+        prop_type (str): Data type of the property ('int' or 'string').
+    """
+    table = hdf5_file.create_table(parent_folder, table_name, row_description, table_description)
+    row = table.row
+    if prop_type == "int":
+        row["value"] = value
+    if prop_type == "string":
+        row["value"] = value.encode("utf-8") if value else ""
+
+    row.append()
+    table.flush()
+
+
+def read_elements_data_csv(elements_data_csv):
+    """
+    Read the elements_data.csv file.
+
+    Args:
+        elements_data_csv: Path to the elements_data.csv file.
+
+    Returns:
+        - data (list): List of dictionaries containing element data.
+        - unique_headers (list): List of unique column headers.
+        - sources_data (dict): Source of each property.
+        - units_data (dict): Unit of each property.
+    """
+    # Open the csv file, filtering out comment lines (starting with #) and empty lines.
+    with open(elements_data_csv, "r") as f:
+        reader = csv.reader(f)
+        lines = [line for line in reader if not line[0].startswith("#") and any(line)]
+
+    headers = [header.strip() for header in lines[0]]  # first row as column headers
+    sources = [source.strip() for source in lines[1]]  # second row as sources
+    units = [unit.strip() for unit in lines[2]]  # third row as units
+    data_rows = lines[3:]  # remaining rows as data
+
+    # Process headers to make them unique
+    unique_headers = []
+    header_counts = {}
+    for header in headers:
+        if header in header_counts:
+            header_counts[header] += 1
+            unique_headers.append(
+                f"{header}.{header_counts[header]}"
+            )  # suffix duplicate headers (header.1, header.2)
+        else:
+            header_counts[header] = 0
+            unique_headers.append(header)
+
+    # Create data as a list of dictionaries
+    data = []
+    for row in data_rows:
+        data.append(dict(zip(unique_headers, row)))
+
+    sources_data = dict(zip(unique_headers, sources))
+    units_data = dict(zip(unique_headers, units))
+
+    return data, unique_headers, sources_data, units_data
+
+
+def read_data_info_csv(data_info_csv):
+    """
+    Read and parse the data_info.csv file containing metadata.
+
+    Args:
+        data_info_csv: Path to the data_info.csv file.
+
+    Returns:
+        data_info (list): List of dictionaries containing metadata for each property.
+    """
+    # Open the csv file, filtering out comment lines (starting with #) and empty lines.
+    with open(data_info_csv, "r") as f:
+        lines = []
+        for line in f:
+            stripped = line.strip()
+            if stripped and not stripped.startswith("#"):
+                lines.append(stripped)
+
+    # Hardcoded headers for data_info.csv
+    data_info_headers = [
+        "Property key",
+        "Property name",
+        "Source key",
+        "Property description",
+        "Reference",
+        "doi",
+        "Notes",
+    ]
+
+    reader = csv.reader(lines)
+    data_rows = list(reader)
+
+    data_info = []
+    for row in data_rows:
+        data_info.append(dict(zip(data_info_headers, row)))
+
+    return data_info
+
+
+def write_elements_data_to_hdf5(data, unique_headers, sources_data, units_data):
+    """Write element data to an HDF5 file using PyTables.
+
+    Args:
+        data (list of dict): List of dictionaries containing element data.
+        unique_headers (list of str): List of unique column headers from the data, used to identify properties.
+        sources_data (dict): Source of each property.
+        units_data (dict): Unit of each property.
+    """
+    h5file = pt.open_file(hdf5_file, mode="w", title="Periodic Data")
+    elements_group = h5file.create_group("/", "Elements", "Elements Data")
+
+    for row in data:
+        atnum = int(row["atnum"]) if "atnum" in row and row["atnum"].strip() else 0
+        name = row["name"] if "name" in row and row["name"].strip() else ""
+        element_group_name = f"{atnum:03d}"
+        element_group = h5file.create_group(elements_group, element_group_name, f"Data for {name}")
+
+        # Handle basic properties
+        for config in PROPERTY_CONFIGS:
+            if "basic_property" in config:
+                property_name = config["basic_property"]
+                table_name = config["table_name"]
+                description = config["description"]
+                prop_type = config["type"]
+
+                # Check the property type to pick the relevant ElementDescription class
+                if prop_type == "int":
+                    row_description = NumberElementDescription
+                    value = (
+                        int(row[property_name])
+                        if property_name in row and row[property_name].strip()
+                        else 0
+                    )
+                elif prop_type == "string":
+                    row_description = StringElementDescription
+                    value = (
+                        row[property_name]
+                        if property_name in row and row[property_name].strip()
+                        else ""
+                    )
+
+                create_basic_properties_tables(
+                    h5file,
+                    element_group,
+                    table_name,
+                    row_description,
+                    description,
+                    value,
+                    prop_type,
+                )
+
+            # Handle the remaining (multi-source) properties
+            else:
+                columns = [col for col in unique_headers if col.startswith(config["property"])]
+                if columns:
+                    create_properties_tables(
+                        h5file,
+                        element_group,
+                        config["table_name"],
+                        config["description"],
+                        PropertyValues,
+                        columns,
+                        row,
+                        sources_data,
+                        units_data,
+                    )
+
+    h5file.close()
+
+
+def write_data_info_to_hdf5(data_info_list):
+    """
+    Write data from data_info.csv to the HDF5 file.
+
+    Args:
+        data_info_list: List of dictionaries containing metadata.
+    """
+    # Open the HDF5 file in append mode ("a") to add metadata without overwriting existing data.
+    with pt.open_file(hdf5_file, mode="a", title="Periodic Data") as h5file:
+        data_info_group = h5file.create_group("/", "data_info", "Data Info")
+
+        property_info_table = h5file.create_table(
+            data_info_group, "property_info", ElementsDataInfo, "Property Information"
+        )
+
+        for row in data_info_list:
+            table_row = property_info_table.row
+            table_row["property_key"] = row.get("Property key", "").encode("utf-8")
+            table_row["property_name"] = row.get("Property name", "").encode("utf-8")
+            table_row["source_key"] = row.get("Source key", "").encode("utf-8")
+            table_row["property_description"] = row.get("Property description", "").encode("utf-8")
+            table_row["reference"] = row.get("Reference", "").encode("utf-8")
+            table_row["doi"] = row.get("doi", "").encode("utf-8")
+            table_row["notes"] = row.get("Notes", "").encode("utf-8")
+            table_row.append()
+        property_info_table.flush()
+
+
+if __name__ == "__main__":
+    # Read the elements data from the CSV file
+    data, unique_headers, sources_data, units_data = read_elements_data_csv(elements_data_csv)
+
+    # Read the provenance data from the CSV file
+    data_info = read_data_info_csv(data_info_csv)
+
+    # Write the periodic table data to an HDF5 file
+    write_elements_data_to_hdf5(data, unique_headers, sources_data, units_data)
+
+    # Write the provenance data to the HDF5 file
+    write_data_info_to_hdf5(data_info)
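A read-back sketch for the layout this script creates: every multi-source property table under `/Elements/NNN` stores `(source, unit, value)` rows. For example, carbon's covalent radius (the values depend on what `elements_data.csv` provides):

```python
# Print all (source, unit, value) rows stored for carbon's covalent radius.
import tables as pt
from importlib_resources import files

h5path = files("atomdb.data").joinpath("elements_data.h5")
with pt.open_file(str(h5path), mode="r") as h5file:
    table = h5file.get_node("/Elements/006/cov_radius")
    for row in table:
        print(row["source"].decode(), row["unit"].decode(), row["value"])
```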
diff --git a/atomdb/periodic_test.py b/atomdb/periodic_test.py
new file mode 100644
index 00000000..2998f3d1
--- /dev/null
+++ b/atomdb/periodic_test.py
@@ -0,0 +1,108 @@
+from enum import IntEnum
+from numbers import Integral
+import tables as pt
+import numpy as np
+from importlib_resources import files
+
+
+__all__ = [
+    "PROPERTY_NAME_MAP",
+    "get_scalar_data",
+    "element_symbol_map",
+    "ElementAttr",
+]
+
+
+class ElementAttr(IntEnum):
+    """Indices into the (atnum, name) tuples stored in element_symbol_map."""
+
+    atnum = 0
+    name = 1
+
+
+elements_hdf5_file = files("atomdb.data").joinpath("elements_data.h5")
+ELEMENTS_H5FILE = pt.open_file(elements_hdf5_file, mode="r")
+
+PROPERTY_NAME_MAP = {
+    "atmass": "atmass",
+    "cov_radius": "cov_radius",
+    "vdw_radius": "vdw_radius",
+    "at_radius": "at_radius",
+    "polarizability": "polarizability",
+    "dispersion_c6": "dispersion_c6",
+    "dispersion": "dispersion_c6",  # field name used in the dataset run scripts
+    "elem": "symbol",
+    "atnum": "atnum",
+    "name": "name",
+    "mult": "mult",
+}
+
+
+def get_scalar_data(prop_name, atnum, nelec):
+    """
+    Get a scalar property value for a given element.
+
+    Args:
+        prop_name (str): Property name to retrieve.
+        atnum (int): Atomic number of the element.
+        nelec (int): Number of electrons in the species.
+
+    Returns:
+        int | float | str | dict[str, float] | None:
+            - int, float, or str for single-valued properties.
+            - dict mapping source name to value for properties with multiple sources.
+            - None if the property is not available (e.g., most properties of charged species).
+    """
+    charge = atnum - nelec
+
+    # Element-level data is only meaningful for neutral species, except for these identifiers
+    if charge != 0 and prop_name not in ["atmass", "elem", "atnum", "name"]:
+        return None
+
+    # Build the path of the element group
+    element_group = f"/Elements/{atnum:03d}"
+
+    table_name = PROPERTY_NAME_MAP[prop_name]
+    table_path = f"{element_group}/{table_name}"
+
+    # Get the table node from the HDF5 file
+    table = ELEMENTS_H5FILE.get_node(table_path)
+
+    # Handle basic properties (single 'value' column, no sources)
+    if len(table.colnames) == 1 and table.colnames[0] == "value":
+        value = table[0]["value"]
+        # if the value is an int, return it as an int
+        if isinstance(value, Integral):
+            return int(value)
+        # if the value is a string, decode from bytes
+        elif isinstance(value, bytes):
+            return value.decode("utf-8")
+    else:
+        # Handle properties with multiple sources
+        result = {}
+        for row in table:
+            source = row["source"].decode("utf-8")
+            value = row["value"]
+            # exclude NaN values
+            if not np.isnan(value):
+                result[source] = float(value)
+        return result if result else None
+
+
+def map_element_symbol():
+    """
+    Build a mapping of element symbols to their atomic number and name.
+
+    Returns:
+        dict[str, tuple[int, str]]:
+            Dictionary mapping element symbol -> (atomic_number, name).
+    """
+    element_symbol_map = {}
+    for element_group in ELEMENTS_H5FILE.root.Elements:
+        symbol = element_group.symbol[0]["value"].decode("utf-8").strip()
+        atnum = int(element_group.atnum[0]["value"])
+        name = element_group.name[0]["value"].decode("utf-8").strip()
+        element_symbol_map[symbol] = (atnum, name)
+
+    return element_symbol_map
+
+
+element_symbol_map = map_element_symbol()
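Usage sketch for the helpers above (the printed values come from whatever `elements_data.csv` provides):

```python
# Look up carbon's identifiers and a multi-source property through the new API.
from atomdb.periodic_test import ElementAttr, element_symbol_map, get_scalar_data

atnum = element_symbol_map["C"][ElementAttr.atnum]
print(element_symbol_map["C"][ElementAttr.name])      # element name for carbon
print(get_scalar_data("cov_radius", atnum, nelec=6))  # dict of source -> value (neutral C)
print(get_scalar_data("cov_radius", atnum, nelec=5))  # None for the cation
```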
diff --git a/atomdb/species.py b/atomdb/species.py
index 2408eb05..ffe596a2 100644
--- a/atomdb/species.py
+++ b/atomdb/species.py
@@ -19,19 +19,23 @@
 import re

 from dataclasses import asdict, dataclass, field
 from importlib import import_module
-from numbers import Integral
 from os import makedirs, path

 import numpy as np
 import pooch
 import requests
-from msgpack import packb, unpackb
-from msgpack_numpy import decode, encode
 from numpy import ndarray
 from scipy.interpolate import CubicSpline

-from atomdb.periodic import Element, element_symbol
+from atomdb.periodic_test import element_symbol_map, PROPERTY_NAME_MAP, get_scalar_data, ElementAttr
 from atomdb.utils import DEFAULT_DATAPATH, DEFAULT_DATASET, DEFAULT_REMOTE
+from importlib_resources import files
+import tables as pt
+from numbers import Integral
+
+datasets_hdf5_file = files("atomdb.datasets").joinpath("datasets_data.h5")
+DATASETS_H5FILE = pt.open_file(datasets_hdf5_file, mode="a")
+

 __all__ = [
     "Species",
@@ -68,28 +72,12 @@ def scalar(method):

     @property
     def wrapper(self):
+        # Properties not covered by PROPERTY_NAME_MAP are stored directly on the species data
+        if name not in PROPERTY_NAME_MAP:
+            return getattr(self._data, name)

-        # Map the name of the method in the SpeciesData class to the name in the Elements class
-        # This dict can be removed if the Elements csv file uses the same names as the SpeciesData class.
-        namemap = {
-            "cov_radius": "cov_radius",
-            "vdw_radius": "vdw_radius",
-            "at_radius": "at_radius",
-            "polarizability": "pold",
-            "dispersion_c6": "c6",
-            "atmass": "mass",
-        }
-
-        if name == "atmass":
-            return getattr(Element(self._data.elem), namemap[name])
-        if name in namemap:
-            # Only return Element property if neutral, otherwise None
-            charge = self._data.atnum - self._data.nelec
-            return getattr(Element(self._data.elem), namemap[name]) if charge == 0 else None
-
-        return getattr(self._data, name)
+        return get_scalar_data(name, self._data.atnum, self._data.nelec)

-    # conserve the docstring of the method
     wrapper.__doc__ = method.__doc__
     return wrapper
@@ -201,7 +189,7 @@ def __call__(self, x, deriv=0):
         else:
             y = self._obj(x, nu=deriv)
             # Handle errors from the y = exp(log y) operation -- set NaN to zero
-            np.nan_to_num(y, nan=0., copy=False)
+            np.nan_to_num(y, nan=0.0, copy=False)
             # Cutoff value: assume y(x) is zero where x > final given point x_n
             y[x > self._obj.x[-1]] = 0
             return y
@@ -231,66 +219,6 @@ def __init__(self, data) -> None:
         self.nbasis = self.norba  # number of spatial basis functions


-@dataclass(eq=False, order=False)
-class SpeciesData:
-    r"""Database entry fields for atomic and ionic species."""
-
-    # Species info
-    elem: str = field(default_factory=default_required("elem", "str"))
-    atnum: int = field(default_factory=default_required("atnum", "int"))
-    nelec: int = field(default_factory=default_required("nelec", "int"))
-    nspin: int = field(default_factory=default_required("nspin", "int"))
-    nexc: int = field(default_factory=default_required("nexc", "int"))
-
-    # Scalar properties
-    atmass: float = field(default=None)
-    cov_radius: float = field(default=None)
-    vdw_radius: float = field(default=None)
-    at_radius: float = field(default=None)
-    polarizability: float = field(default=None)
-    dispersion: float = field(default=None)
-
-    # Scalar energy and CDFT-related properties
-    energy: float = field(default=None)
-    ip: float = field(default=None)
-    mu: float = field(default=None)
-    eta: float = field(default=None)
-
-    # Basis set name
-    obasis_name: str = field(default=None)
-
-    # Radial grid
-    rs: ndarray = field(default_factory=default_vector)
-
-    # Orbital energies
-    mo_energy_a: ndarray = field(default_factory=default_vector)
-    mo_energy_b: ndarray = field(default_factory=default_vector)
-
-    # Orbital occupations
-    mo_occs_a: ndarray = field(default_factory=default_vector)
-    mo_occs_b: ndarray = field(default_factory=default_vector)
-
-    # Orbital densities
-    mo_dens_a: ndarray = field(default_factory=default_matrix)
-    mo_dens_b: ndarray = field(default_factory=default_matrix)
-    dens_tot: ndarray = field(default_factory=default_matrix)
-
-    # Orbital density gradients
-    mo_d_dens_a: ndarray = field(default_factory=default_matrix)
-    mo_d_dens_b: ndarray = field(default_factory=default_matrix)
-    d_dens_tot: ndarray = field(default_factory=default_matrix)
-
-    # Orbital density Laplacian
-    mo_dd_dens_a: ndarray = field(default_factory=default_matrix)
-    mo_dd_dens_b: ndarray = field(default_factory=default_matrix)
-    dd_dens_tot: ndarray = field(default_factory=default_matrix)
-
-    # Orbital kinetic energy densities
-    mo_ked_a: ndarray = field(default_factory=default_matrix)
-    mo_ked_b: ndarray = field(default_factory=default_matrix)
-    ked_tot: ndarray = field(default_factory=default_matrix)
-
-
 class Species:
     r"""Properties of atomic and ionic species."""

@@ -308,7 +236,11 @@ def __init__(self, dataset, fields, spinpol=1):

         """
         self._dataset = dataset.lower()
-        self._data = SpeciesData(**fields)
+        # Convert the fields dict to the dataset's DefinitionClass
+        submodule = import_module(f"atomdb.datasets.{dataset}.run")
+        fields = submodule.DefinitionClass(**fields)
+
+        self._data = fields
         self.spinpol = spinpol
         self.ao = _AtomicOrbitals(self._data)
@@ -699,7 +631,7 @@ def dd_dens_lapl_func(self, spin="t", index=None, log=False):
         Return the function for the electronic density Laplacian.

         .. math::
-
+
             \nabla^2 \rho(\mathbf{r}) = \frac{d^2 \rho(r)}{dr^2} + \frac{2}{r} \frac{d \rho(r)}{dr}

         Parameters
@@ -714,13 +646,13 @@ def dd_dens_lapl_func(self, spin="t", index=None, log=False):
             By default, all orbitals of the given spin(s) are included.
         log : bool, default=False
             Whether the logarithm of the density is used for interpolation.
-
+
         Returns
         -------
         Callable[np.ndarray(N,) -> np.ndarray(N,)]
             a callable function evaluating the Laplacian of the density given a set of radial
             points (1-D array).
-
+
         Notes
         -----
         When this function is evaluated at a point close to zero, the Laplacian becomes undefined.
@@ -734,11 +666,11 @@ def dd_dens_lapl_func(self, spin="t", index=None, log=False):

         # Define the Laplacian function
         def densityspline_like_func(rs):
             # Avoid division by zero and handle small values of r
-            with np.errstate(divide='ignore'):
+            with np.errstate(divide="ignore"):
                 laplacian = dd_dens_spline(rs) + 2 * d_dens_sp_spline(rs) / rs
             laplacian = np.where(rs < 1e-10, 0.0, laplacian)
             return laplacian
-
+
         return densityspline_like_func

     @spline
@@ -796,33 +728,27 @@ def compile_species(
         Path to the local AtomDB cache, by default DEFAULT_DATAPATH variable value.

     """
-    # Ensure directories exist
-    makedirs(path.join(datapath, dataset.lower(), "db"), exist_ok=True)
-    makedirs(path.join(datapath, dataset.lower(), "raw"), exist_ok=True)

-    # Import the compile script for the appropriate dataset
+    # Import the selected dataset's compile script and get the fields
     submodule = import_module(f"atomdb.datasets.{dataset}.run")

-    # Compile the Species instance and dump the database entry
-    species = submodule.run(elem, charge, mult, nexc, dataset, datapath)
-    dump(species, datapath=datapath)
+    fields = submodule.run(elem, charge, mult, nexc, dataset, datapath)

+    # Dump the data to the HDF5 file
+    dump(fields, dataset, mult)

-def dump(*species, datapath=DEFAULT_DATAPATH):
-    r"""Dump the Species instance(s) to a MessagePack file in the database.
+
+def dump(fields, dataset, mult):
+    r"""Dump the compiled species data to an HDF5 file in the AtomDB database.

     Parameters
     ----------
-    species: Iterable
-        Iterables of objects of class `Species`
-    datapath : str, optional
-        Path to the local AtomDB cache, by default DEFAULT_DATAPATH variable value.
-
+    fields : dataclass
+        A dataclass containing the fields to store in the HDF5 file.
+    dataset : str
+        Name of the dataset.
+    mult : int
+        Multiplicity.
""" - for s in species: - fn = datafile( - s._data.elem, s.charge, s.mult, nexc=s.nexc, dataset=s.dataset, datapath=datapath - ) - with open(fn, "wb") as f: - f.write(packb(asdict(s._data), default=encode)) + + # Save data to the HDF5 file + element_folder_creator = import_module(f"atomdb.datasets.{dataset}.h5file_creator") + element_folder_creator.create_hdf5_file(DATASETS_H5FILE, fields, dataset, mult) def load( @@ -858,24 +784,31 @@ def load( Object of class Species """ - fn = datafile( - elem, - charge, - mult, - nexc=nexc, - dataset=dataset, - datapath=datapath, - remotepath=remotepath, - ) + + # Construct the dataset path + dataset_path = f"/Datasets/{dataset}" + + # import the selected dataset HDF5 file creator to access property configurations + dataset_submodule = import_module(f"atomdb.datasets.{dataset}.h5file_creator") + DATASET_PROPERTY_CONFIGS = getattr(dataset_submodule, f"{dataset.upper()}_PROPERTY_CONFIGS") + + # Handle wildcard case for loading multiple species if Ellipsis in (elem, charge, mult, nexc): + data_paths = datafile(elem, charge, mult, nexc=nexc, dataset=dataset) + # create a list to hold all species objects obj = [] - - for file in fn: - with open(file, "rb") as f: - obj.append(Species(dataset, unpackb(f.read(), object_hook=decode))) + for data_path in data_paths: + elem = data_path.split("/")[-2] + # Construct the specific data path for each species + fields = get_species_data(data_path, elem, DATASET_PROPERTY_CONFIGS) + obj.append(Species(dataset, fields)) else: - with open(fn, "rb") as f: - obj = Species(dataset, unpackb(f.read(), object_hook=decode)) + # Construct the specific data path for a single species + data_path = f"{dataset_path}/{elem}/{elem}_{charge:03d}_{mult:03d}_{nexc:03d}" + # get the species data and then create a species object + fields = get_species_data(data_path, elem, DATASET_PROPERTY_CONFIGS) + obj = Species(dataset, fields) + return obj @@ -888,19 +821,16 @@ def datafile( datapath=DEFAULT_DATAPATH, remotepath=DEFAULT_REMOTE, ): - r"""Return the name of the database file for a species. - - This function returns the local path to the database file of a species in the AtomDB cache. If - the file is not found, it is downloaded from the remote URL. + r"""Return the paths to the database files for a species in AtomDB. Parameters ---------- - elem : str | Ellipsis - Element symbol or Ellipsis for wildcard. - charge : int | Ellipsis - Charge or Ellipsis for wildcard. - mult : int | Ellipsis - Multiplicity or Ellipsis for wildcard. + elem : str + Element symbol. + charge : int + Charge. + mult : int + Multiplicity. nexc : int, optional Excitation level, by default 0. dataset : str, optional @@ -912,59 +842,107 @@ def datafile( Returns ------- - str - Local path to the database file of a species in the AtomDB cache + list + paths to the database file of a species in AtomDB. 
""" - elem = "[^_]" if elem is Ellipsis else element_symbol(elem) - charge = "[^_]" if charge is Ellipsis else f"{charge:03d}" - mult = "[^_]" if mult is Ellipsis else f"{mult:03d}" - nexc = "[^_]" if nexc is Ellipsis else f"{nexc:03d}" - - # Wildcard search for multiple species, use repodata.txt for matching - if "[^_]" in (elem, charge, mult, nexc): - # try to retrieve the repodata file from the remote URL - try: - repodata = pooch.retrieve( - url=f"{remotepath}{dataset.lower()}/db/repodata.txt", - known_hash=None, - path=path.join(datapath, dataset.lower(), "db"), - fname="repo_data.txt", - ) - # if the file is not found or remote was not valid, use the local repodata file - except (requests.exceptions.HTTPError, ValueError): - repodata = path.join(datapath, dataset.lower(), "db", "repo_data.txt") - - with open(repodata) as f: - data = f.read() - files = re.findall(rf"\b{elem}+_{charge}+_{mult}+_{nexc}\.msg\b", data) - species_list = [] - for file in files: - # try to retrieve the file from the remote URL - try: - element = pooch.retrieve( - url=f"{remotepath}{dataset.lower()}/db/{file}", - known_hash=None, - path=path.join(datapath, dataset.lower(), "db"), - fname=f"{file}", - ) - # if the file is not found, use the local file - except (requests.exceptions.HTTPError, ValueError): - element = path.join(datapath, dataset.lower(), "db", file) - species_list.append(element) - return species_list - # try to retrieve the file from the remote URL - try: - species = pooch.retrieve( - url=f"{remotepath}{dataset.lower()}/db/{elem}_{charge}_{mult}_{nexc}.msg", - known_hash=None, - path=path.join(datapath, dataset.lower(), "db"), - fname=f"{elem}_{charge}_{mult}_{nexc}.msg", - ) - # if the file is not found, use the local file - except (requests.exceptions.HTTPError, ValueError): - species = path.join(datapath, dataset.lower(), "db", f"{elem}_{charge}_{mult}_{nexc}.msg") - return species + group_paths = [] + conditions = [] + + # Access the dataset folder in the HDF5 file + dataset_path = f"/Datasets/{dataset}" + dataset_folder = DATASETS_H5FILE.get_node(dataset_path) + + if elem is not Ellipsis: + conditions.append(f'(elem == b"{elem}")') # b for bytes comparison + if charge is not Ellipsis: + conditions.append(f"(charge == {charge})") + if mult is not Ellipsis: + conditions.append(f"(mult == {mult})") + if nexc is not Ellipsis: + conditions.append(f"(nexc == {nexc})") + + if conditions: + query_result = " & ".join(conditions) if conditions else None + + for elem, elem_folder in dataset_folder._v_groups.items(): + for species_folder in elem_folder._v_groups: + properties_folder = DATASETS_H5FILE.get_node( + f"/Datasets/{dataset}/{elem}/{species_folder}/Properties" + ) + species_info_table = properties_folder._f_get_child("species_info") + + matched_species = list(species_info_table.where(query_result)) + if matched_species: + group_paths.append(f"{dataset_path}/{elem}/{species_folder}") + + # if there are no conditions, return all species + else: + for elem, elem_folder in dataset_folder._v_groups.items(): + for species_folder in elem_folder._v_groups: + group_paths.append(f"{dataset_path}/{elem}/{species_folder}") + + return group_paths + + +def get_species_data(folder_path, elem, DATASET_PROPERTY_CONFIGS): + r"""Retrieve species data from the specified HDF5 folder path. + + Parameters + ---------- + folder_path : str + Path to the HDF5 folder containing the species data. + elem : str + Element symbol. + DATASET_PROPERTY_CONFIGS : list + list of configuration dictionaries. 
+
+
+def get_species_data(folder_path, elem, DATASET_PROPERTY_CONFIGS):
+    r"""Retrieve species data from the specified HDF5 folder path.
+
+    Parameters
+    ----------
+    folder_path : str
+        Path to the HDF5 folder containing the species data.
+    elem : str
+        Element symbol.
+    DATASET_PROPERTY_CONFIGS : list
+        List of property configuration dictionaries for the dataset.
+
+    Returns
+    -------
+    dict
+        The extracted species data fields.
+
+    """
+    fields = {}
+    dataset_folder = DATASETS_H5FILE.get_node(folder_path)
+
+    species_info_table = dataset_folder.Properties._f_get_child("species_info")
+    species_info_row = species_info_table[0]
+
+    # Iterate through property configurations to extract data from datasets_data.h5
+    for config in DATASET_PROPERTY_CONFIGS:
+        if "SpeciesInfo" in config:
+            # Extract species info data
+            prop_name = config["SpeciesInfo"]
+            value = species_info_row[prop_name]
+            if config["type"] == "string":
+                value = value.decode("utf-8")
+            fields[prop_name] = value
+
+        elif "property" in config:
+            # Extract single-valued properties
+            table = dataset_folder.Properties._f_get_child(config["table_name"])
+            value = table[0]["value"]
+            if config["type"] == "string":
+                value = value.decode("utf-8")
+            fields[config["property"]] = value
+
+        elif "array_property" in config:
+            # Extract array properties
+            table = dataset_folder.Properties._f_get_child(config["table_name"])
+            fields[config["array_property"]] = table[0]["value"]
+
+        elif "Carray_property" in config:
+            # Extract CArray properties
+            table = dataset_folder._f_get_child(config["folder"])._f_get_child(config["table_name"])
+            fields[config["Carray_property"]] = table[:]
+
+    fields["atnum"] = element_symbol_map[elem][ElementAttr.atnum]
+
+    # Add scalar properties from the periodic data
+    for prop in ("atmass", "cov_radius", "vdw_radius", "at_radius", "polarizability", "dispersion"):
+        fields[prop] = get_scalar_data(prop, fields["atnum"], fields["nelec"])
+
+    return fields


 def raw_datafile(
@@ -1006,7 +984,8 @@ def raw_datafile(
     str
         Path to the raw data file.
     """
-    elem = "*" if elem is Ellipsis else element_symbol(elem)
+    # The element symbol is already normalized, so no element_symbol() lookup is needed here
+    elem = "*" if elem is Ellipsis else elem
     charge = "*" if charge is Ellipsis else f"{charge:03d}"
     mult = "*" if mult is Ellipsis else f"{mult:03d}"
     nexc = "*" if nexc is Ellipsis else f"{nexc:03d}"
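A load sketch against the new HDF5 backend; it assumes the species was previously compiled into `datasets_data.h5` (e.g. with `compile_species` above):

```python
# Load one species, then use Ellipsis as a wildcard to load several at once.
from atomdb.species import load

be = load("Be", 0, 1, nexc=0, dataset="slater")
print(be.energy, be.ao.norba)

# Ellipsis matches any value for that field and yields a list of Species objects.
species_list = load("Be", ..., ..., nexc=0, dataset="slater")
```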
readme = "README.md" license = {text = "GPL-3.0-or-later"} -requires-python = ">=3.9" +requires-python = ">=3.10" classifiers = [ 'Development Status :: 5 - Production/Stable', 'Environment :: Console', @@ -36,19 +36,19 @@ classifiers = [ 'Intended Audience :: Science/Research', "Intended Audience :: Education", "Natural Language :: English", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", ] dependencies = [ - "numpy>=1.16", + "numpy>=1.26.4", "scipy>=1.4", "msgpack>=1.0.0", "msgpack-numpy>=0.4.8", "h5py>=3.6.0", "importlib_resources>=3.0.0", "pooch>=1.8.1", + "tables>=3.9.2", ] dynamic = ["version"] [tool.setuptools_scm] @@ -67,7 +67,7 @@ dev = [ "qc-gbasis", # "qc-grid@git+https://github.com/theochem/grid.git@master", # TODO: uncomment when grid is available on PyPI - # "qc-grid", + "qc-grid", # "qc-iodata@git+https://github.com/theochem/iodata.git@main", "qc-iodata", ] @@ -92,6 +92,9 @@ doc = [ [tool.setuptools] packages = ["atomdb"] +# Adding the package data +package-data = { "atomdb" = ["data/*.h5", "data/*.msg"] } +include-package-data = true [tool.black] line-length = 100