From ff21bbcf45f01e60ec3c2d9ea01eb26fa1bc75f8 Mon Sep 17 00:00:00 2001 From: Axel Huebl Date: Mon, 5 Aug 2024 12:42:44 -0500 Subject: [PATCH] Python: Series to DataFrame (#1506) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Python: Series to DataFrame Add new helpers to create Pandas and cuDF dataframes for a single particle species, over all iterations at once. * Remove hard-coded species name Co-authored-by: Franz Pöschel --------- Co-authored-by: Franz Pöschel --- docs/source/analysis/pandas.rst | 11 +- docs/source/analysis/rapids.rst | 9 +- examples/11_particle_dataframe.py | 21 ++++ src/binding/python/openpmd_api/DataFrame.py | 108 ++++++++++++++++++++ src/binding/python/openpmd_api/__init__.py | 5 +- 5 files changed, 135 insertions(+), 19 deletions(-) diff --git a/docs/source/analysis/pandas.rst b/docs/source/analysis/pandas.rst index dcfe97aae2..a5fee0be07 100644 --- a/docs/source/analysis/pandas.rst +++ b/docs/source/analysis/pandas.rst @@ -45,16 +45,7 @@ One can also combine all iterations in a single dataframe like this: .. code-block:: python - import pandas as pd - - df = pd.concat( - ( - s.iterations[i].particles["electrons"].to_df().assign(iteration=i) - for i in s.iterations - ), - axis=0, - ignore_index=True, - ) + df = s.to_df("electrons") # like before but with a new column "iteration" and all particles print(df) diff --git a/docs/source/analysis/rapids.rst b/docs/source/analysis/rapids.rst index 41acc55308..e3bb011d8d 100644 --- a/docs/source/analysis/rapids.rst +++ b/docs/source/analysis/rapids.rst @@ -51,14 +51,7 @@ One can also combine all iterations in a single dataframe like this: .. 
code-block:: python - cdf = cudf.concat( - ( - cudf.from_pandas(s.iterations[i].particles["electrons"].to_df().assign(iteration=i)) - for i in s.iterations - ), - axis=0, - ignore_index=True, - ) + cdf = s.to_cudf("electrons") # like before but with a new column "iteration" and all particles print(cdf) diff --git a/examples/11_particle_dataframe.py b/examples/11_particle_dataframe.py index 7e0cad065c..defc93dd96 100755 --- a/examples/11_particle_dataframe.py +++ b/examples/11_particle_dataframe.py @@ -16,6 +16,14 @@ except ImportError: print("pandas NOT found. Install pandas to run this example.") sys.exit() + +found_cudf = False +try: + import cudf + found_cudf = True +except ImportError: + print("cudf NOT found. Install RAPIDS for CUDA DataFrame example.") + found_dask = False try: import dask @@ -39,6 +47,19 @@ df = electrons.to_df(np.s_[:100]) print(df) + # all particles over all steps + df = s.to_df("electrons") + print(df) + + if found_cudf: + # all particles - to GPU + cdf = cudf.from_pandas(electrons.to_df()) + print(cdf) + + # all particles over all steps - to GPU + cdf = s.to_cudf("electrons") + print(cdf) + # Particles if found_dask: # the default schedulers are local/threaded, not requiring much. diff --git a/src/binding/python/openpmd_api/DataFrame.py b/src/binding/python/openpmd_api/DataFrame.py index 1248136a5a..55db5d1769 100644 --- a/src/binding/python/openpmd_api/DataFrame.py +++ b/src/binding/python/openpmd_api/DataFrame.py @@ -74,3 +74,111 @@ def particles_to_dataframe(particle_species, slice=None): df.index.name = "row" return df + + +def iterations_to_dataframe(series, species_name): + """ + Load all iterations of a particle species into a Pandas DataFrame. + + Parameters + ---------- + series : openpmd_api.Series + A Series class in openPMD-api. + species_name : string + The name of a particle species. 
+ + Returns + ------- + pandas.DataFrame + A pandas dataframe with particles as index and openPMD record + components of the particle_species as columns. Particles might be + repeated over multiple iterations and an "iteration" column is + added. + + Raises + ------ + ImportError + Raises an exception if pandas is not installed + + See Also + -------- + pandas.DataFrame : the central dataframe object created here + """ + # import pandas here for a lazy import + try: + import pandas as pd + except ImportError: + raise ImportError("pandas NOT found. Install pandas for DataFrame " + "support.") + + df = pd.concat( + ( + series.iterations[i] + .particles[species_name] + .to_df() + .assign(iteration=i) + for i in series.iterations + ), + axis=0, + ignore_index=True, + ) + + return df + + +def iterations_to_cudf(series, species_name): + """ + Load all iterations of a particle species into a cuDF DataFrame. + + Parameters + ---------- + series : openpmd_api.Series + A Series class in openPMD-api. + species_name : string + The name of a particle species. + + Returns + ------- + cudf.DataFrame + A cuDF (RAPIDS) dataframe with particles as index and openPMD record + components of the particle_species as columns. Particles might be + repeated over multiple iterations and an "iteration" column is + added. + + Raises + ------ + ImportError + Raises an exception if pandas or cuDF (RAPIDS) is not installed + + See Also + -------- + cudf.DataFrame : the central dataframe object created here + """ + # import pandas here for a lazy import + try: + import pandas # noqa + except ImportError: + raise ImportError("pandas NOT found. Install pandas for DataFrame " + "support.") + # import cudf here for a lazy import + try: + import cudf + except ImportError: + raise ImportError("cudf NOT found. 
Install RAPIDS for CUDA DataFrame " + "support.") + + cdf = cudf.concat( + ( + cudf.from_pandas( + series.iterations[i] + .particles[species_name] + .to_df() + .assign(iteration=i) + ) + for i in series.iterations + ), + axis=0, + ignore_index=True, + ) + + return cdf diff --git a/src/binding/python/openpmd_api/__init__.py b/src/binding/python/openpmd_api/__init__.py index e1bb49ef7e..09f21026f9 100644 --- a/src/binding/python/openpmd_api/__init__.py +++ b/src/binding/python/openpmd_api/__init__.py @@ -1,7 +1,8 @@ from . import openpmd_api_cxx as cxx from .DaskArray import record_component_to_daskarray from .DaskDataFrame import particles_to_daskdataframe -from .DataFrame import particles_to_dataframe +from .DataFrame import (iterations_to_cudf, iterations_to_dataframe, + particles_to_dataframe) from .openpmd_api_cxx import * # noqa __version__ = cxx.__version__ @@ -13,6 +14,8 @@ ParticleSpecies.to_df = particles_to_dataframe # noqa ParticleSpecies.to_dask = particles_to_daskdataframe # noqa Record_Component.to_dask_array = record_component_to_daskarray # noqa +Series.to_df = iterations_to_dataframe # noqa +Series.to_cudf = iterations_to_cudf # noqa # TODO remove in future versions (deprecated) Access_Type = Access # noqa