Python: Series to DataFrame (#1506)

* Python: Series to DataFrame

  Add new helpers to create Pandas and cuDF dataframes for a single particle species, over all iterations at once.

* Remove hard-coded species name

  Co-authored-by: Franz Pöschel <[email protected]>

---------

Co-authored-by: Franz Pöschel <[email protected]>
openPMD · Aug 5, 2024 · ff21bbc · ff21bbc
1 parent fafdac2
commit ff21bbc
Show file tree

Hide file tree

Showing 5 changed files with 135 additions and 19 deletions.
diff --git a/docs/source/analysis/pandas.rst b/docs/source/analysis/pandas.rst
@@ -45,16 +45,7 @@ One can also combine all iterations in a single dataframe like this:
 
 .. code-block:: python
 
-   import pandas as pd
-
-   df = pd.concat(
-        (
-            s.iterations[i].particles["electrons"].to_df().assign(iteration=i)
-            for i in s.iterations
-        ),
-        axis=0,
-        ignore_index=True,
-   )
+   df = s.to_df("electrons")
 
    # like before but with a new column "iteration" and all particles
    print(df)

diff --git a/docs/source/analysis/rapids.rst b/docs/source/analysis/rapids.rst
@@ -51,14 +51,7 @@ One can also combine all iterations in a single dataframe like this:
 
 .. code-block:: python
 
-   cdf = cudf.concat(
-        (
-            cudf.from_pandas(s.iterations[i].particles["electrons"].to_df().assign(iteration=i))
-            for i in s.iterations
-        ),
-        axis=0,
-        ignore_index=True,
-   )
+   cdf = s.to_cudf("electrons")
 
    # like before but with a new column "iteration" and all particles
    print(cdf)

diff --git a/examples/11_particle_dataframe.py b/examples/11_particle_dataframe.py
@@ -16,6 +16,14 @@
 except ImportError:
     print("pandas NOT found. Install pandas to run this example.")
     sys.exit()
+
+found_cudf = False
+try:
+    import cudf
+    found_cudf = True
+except ImportError:
+    print("cudf NOT found. Install RAPIDS for CUDA DataFrame example.")
+
 found_dask = False
 try:
     import dask
@@ -39,6 +47,19 @@
     df = electrons.to_df(np.s_[:100])
     print(df)
 
+    # all particles over all steps
+    df = s.to_df("electrons")
+    print(df)
+
+    if found_cudf:
+        # all particles - to GPU
+        cdf = cudf.from_pandas(electrons.to_df())
+        print(cdf)
+
+        # all particles over all steps - to GPU
+        cdf = s.to_cudf("electrons")
+        print(cdf)
+
     # Particles
     if found_dask:
         # the default schedulers are local/threaded, not requiring much.

diff --git a/src/binding/python/openpmd_api/DataFrame.py b/src/binding/python/openpmd_api/DataFrame.py
@@ -74,3 +74,111 @@ def particles_to_dataframe(particle_species, slice=None):
     df.index.name = "row"
 
     return df
+
+
+def iterations_to_dataframe(series, species_name):
+    """
+    Load all iterations of a particle species into a Pandas DataFrame.
+
+    Parameters
+    ----------
+    series : openpmd_api.Series
+        A Series class in openPMD-api.
+    species_name : string
+        The name of a particle species.
+
+    Returns
+    -------
+    pandas.DataFrame
+        A pandas dataframe with one row per particle and iteration (the
+        index is renumbered across iterations) and openPMD record components
+        of the selected species as columns. An "iteration" column holding
+        the iteration index of each row is added.
+
+    Raises
+    ------
+    ImportError
+        Raises an exception if pandas is not installed
+
+    See Also
+    --------
+    pandas.DataFrame : the central dataframe object created here
+    """
+    # lazy import: pandas is only required once this helper is called
+    try:
+        import pandas as pd
+    except ImportError:
+        raise ImportError("pandas NOT found. Install pandas for DataFrame "
+                          "support.")
+
+    df = pd.concat(
+        (
+            series.iterations[i]
+            .particles[species_name]
+            .to_df()
+            .assign(iteration=i)
+            for i in series.iterations
+        ),
+        axis=0,
+        ignore_index=True,
+    )
+
+    return df
+
+
+def iterations_to_cudf(series, species_name):
+    """
+    Load all iterations of a particle species into a cuDF DataFrame.
+
+    Parameters
+    ----------
+    series : openpmd_api.Series
+        A Series class in openPMD-api.
+    species_name : string
+        The name of a particle species.
+
+    Returns
+    -------
+    cudf.DataFrame
+        A cuDF (RAPIDS) dataframe with one row per particle and iteration
+        (the index is renumbered across iterations) and openPMD record
+        components of the selected species as columns. An "iteration" column
+        holding the iteration index of each row is added.
+
+    Raises
+    ------
+    ImportError
+        Raises an exception if pandas or cuDF (RAPIDS) is not installed
+
+    See Also
+    --------
+    cudf.DataFrame : the central dataframe object created here
+    """
+    # lazy import; pandas is needed as each iteration goes via from_pandas
+    try:
+        import pandas  # noqa
+    except ImportError:
+        raise ImportError("pandas NOT found. Install pandas for DataFrame "
+                          "support.")
+    # import cudf here for a lazy import
+    try:
+        import cudf
+    except ImportError:
+        raise ImportError("cudf NOT found. Install RAPIDS for CUDA DataFrame "
+                          "support.")
+
+    cdf = cudf.concat(
+        (
+            cudf.from_pandas(
+                series.iterations[i]
+                      .particles[species_name]
+                      .to_df()
+                      .assign(iteration=i)
+            )
+            for i in series.iterations
+        ),
+        axis=0,
+        ignore_index=True,
+    )
+
+    return cdf
diff --git a/src/binding/python/openpmd_api/__init__.py b/src/binding/python/openpmd_api/__init__.py
@@ -1,7 +1,8 @@
 from . import openpmd_api_cxx as cxx
 from .DaskArray import record_component_to_daskarray
 from .DaskDataFrame import particles_to_daskdataframe
-from .DataFrame import particles_to_dataframe
+from .DataFrame import (iterations_to_cudf, iterations_to_dataframe,
+                        particles_to_dataframe)
 from .openpmd_api_cxx import *  # noqa
 
 __version__ = cxx.__version__
@@ -13,6 +14,8 @@
 ParticleSpecies.to_df = particles_to_dataframe  # noqa
 ParticleSpecies.to_dask = particles_to_daskdataframe  # noqa
 Record_Component.to_dask_array = record_component_to_daskarray  # noqa
+Series.to_df = iterations_to_dataframe  # noqa
+Series.to_cudf = iterations_to_cudf  # noqa
 
 # TODO remove in future versions (deprecated)
 Access_Type = Access  # noqa