Skip to content

Commit

Permalink
Python: Series to DataFrame (#1506)
Browse files Browse the repository at this point in the history
* Python: Series to DataFrame

Add new helpers to create Pandas and cuDF dataframes for a single
particle species, over all iterations at once.

* Remove hard-coded species name

Co-authored-by: Franz Pöschel <[email protected]>

---------

Co-authored-by: Franz Pöschel <[email protected]>
  • Loading branch information
ax3l and franzpoeschel committed Aug 5, 2024
1 parent fafdac2 commit ff21bbc
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 19 deletions.
11 changes: 1 addition & 10 deletions docs/source/analysis/pandas.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,7 @@ One can also combine all iterations in a single dataframe like this:

.. code-block:: python
import pandas as pd
df = pd.concat(
(
s.iterations[i].particles["electrons"].to_df().assign(iteration=i)
for i in s.iterations
),
axis=0,
ignore_index=True,
)
df = s.to_df("electrons")
# like before, but holding the particles of all iterations, with an additional column "iteration"
print(df)
Expand Down
9 changes: 1 addition & 8 deletions docs/source/analysis/rapids.rst
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,7 @@ One can also combine all iterations in a single dataframe like this:

.. code-block:: python
cdf = cudf.concat(
(
cudf.from_pandas(s.iterations[i].particles["electrons"].to_df().assign(iteration=i))
for i in s.iterations
),
axis=0,
ignore_index=True,
)
cdf = s.to_cudf("electrons")
# like before, but holding the particles of all iterations, with an additional column "iteration"
print(cdf)
Expand Down
21 changes: 21 additions & 0 deletions examples/11_particle_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@
except ImportError:
print("pandas NOT found. Install pandas to run this example.")
sys.exit()

# cuDF (RAPIDS) is optional for this example: record whether it can be
# imported so the GPU section further down can be guarded by `found_cudf`.
# A missing cuDF only prints a hint; the script keeps running.
found_cudf = False
try:
    import cudf
    found_cudf = True
except ImportError:
    print("cudf NOT found. Install RAPIDS for CUDA DataFrame example.")

found_dask = False
try:
import dask
Expand All @@ -39,6 +47,19 @@
df = electrons.to_df(np.s_[:100])
print(df)

# all particles over all steps
df = s.to_df("electrons")
print(df)

if found_cudf:
# all particles - to GPU
cdf = cudf.from_pandas(electrons.to_df())
print(cdf)

# all particles over all steps - to GPU
cdf = s.to_cudf("electrons")
print(cdf)

# Particles
if found_dask:
# the default schedulers are local/threaded, not requiring much.
Expand Down
108 changes: 108 additions & 0 deletions src/binding/python/openpmd_api/DataFrame.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,111 @@ def particles_to_dataframe(particle_species, slice=None):
df.index.name = "row"

return df


def iterations_to_dataframe(series, species_name):
    """
    Load all iterations of a particle species into a Pandas DataFrame.

    Parameters
    ----------
    series : openpmd_api.Series
        A Series class in openPMD-api.
    species_name : string
        The name of a particle species.

    Returns
    -------
    pandas.DataFrame
        A pandas dataframe with one row per particle and iteration, the
        openPMD record components of the species as columns and an
        additional "iteration" column holding the iteration a row was
        read from. Particles might be repeated over multiple iterations.
        The per-iteration frames are concatenated with
        ``ignore_index=True``, so the result carries a fresh consecutive
        index instead of the per-iteration one.

    Raises
    ------
    ImportError
        Raises an exception if pandas is not installed
    ValueError
        Raised by ``pandas.concat`` if the series contains no iterations
        (there is nothing to concatenate).

    See Also
    --------
    pandas.DataFrame : the central dataframe object created here
    """
    # import pandas here for a lazy import
    try:
        import pandas as pd
    except ImportError as err:
        # chain the original failure so the real import problem (missing
        # package, broken install, ...) stays visible in the traceback
        raise ImportError("pandas NOT found. Install pandas for DataFrame "
                          "support.") from err

    iterations = series.iterations

    # lazily build one dataframe per iteration, each tagged with its
    # iteration key, and let pandas stitch them together row-wise
    frames = (
        iterations[i]
        .particles[species_name]
        .to_df()
        .assign(iteration=i)
        for i in iterations
    )

    return pd.concat(frames, axis=0, ignore_index=True)


def iterations_to_cudf(series, species_name):
    """
    Load all iterations of a particle species into a cuDF DataFrame.

    Parameters
    ----------
    series : openpmd_api.Series
        A Series class in openPMD-api.
    species_name : string
        The name of a particle species.

    Returns
    -------
    cudf.DataFrame
        A cuDF (RAPIDS) dataframe with one row per particle and iteration,
        the openPMD record components of the species as columns and an
        additional "iteration" column holding the iteration a row was
        read from. Particles might be repeated over multiple iterations.
        The per-iteration frames are concatenated with
        ``ignore_index=True``.

    Raises
    ------
    ImportError
        Raises an exception if pandas or cuDF (RAPIDS) is not installed

    See Also
    --------
    cudf.DataFrame : the central dataframe object created here
    """
    # pandas is not used directly below; it is probed up front, presumably
    # because the per-iteration to_df() call relies on it, so that a
    # missing install is reported with a clear message
    try:
        import pandas  # noqa
    except ImportError as err:
        # chain the original failure so the real cause stays visible
        raise ImportError("pandas NOT found. Install pandas for DataFrame "
                          "support.") from err
    # import cudf here for a lazy import
    try:
        import cudf
    except ImportError as err:
        raise ImportError("cudf NOT found. Install RAPIDS for CUDA DataFrame "
                          "support.") from err

    iterations = series.iterations

    # each iteration is read into a pandas dataframe, tagged with its
    # iteration key and converted to cuDF before the frames are
    # concatenated row-wise
    gpu_frames = (
        cudf.from_pandas(
            iterations[i]
            .particles[species_name]
            .to_df()
            .assign(iteration=i)
        )
        for i in iterations
    )

    return cudf.concat(gpu_frames, axis=0, ignore_index=True)
5 changes: 4 additions & 1 deletion src/binding/python/openpmd_api/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from . import openpmd_api_cxx as cxx
from .DaskArray import record_component_to_daskarray
from .DaskDataFrame import particles_to_daskdataframe
from .DataFrame import particles_to_dataframe
from .DataFrame import (iterations_to_cudf, iterations_to_dataframe,
particles_to_dataframe)
from .openpmd_api_cxx import * # noqa

__version__ = cxx.__version__
Expand All @@ -13,6 +14,8 @@
ParticleSpecies.to_df = particles_to_dataframe # noqa
ParticleSpecies.to_dask = particles_to_daskdataframe # noqa
Record_Component.to_dask_array = record_component_to_daskarray # noqa
Series.to_df = iterations_to_dataframe # noqa
Series.to_cudf = iterations_to_cudf # noqa

# TODO remove in future versions (deprecated)
Access_Type = Access # noqa

0 comments on commit ff21bbc

Please sign in to comment.