From 32aa2cb49734a92275ac41b897ed2223ade005d9 Mon Sep 17 00:00:00 2001
From: Axel Huebl <axel.huebl@plasma.ninja>
Date: Sun, 25 Jun 2023 16:12:15 -0700
Subject: [PATCH] Docs: Analysis (#1444)

* Docs: Analysis

Add a data analysis & visualization section.
This is meant to show entry points and workflows to work with
openPMD data in larger frameworks and compatible ecosystems.

* [Draft] DASK, Pandas, ...

* Doc: DASK

* Pandas

* RAPIDS

* Typos
---
 docs/source/analysis/contrib.rst  |  34 ++++++++++
 docs/source/analysis/dask.rst     |  50 +++++++++++++++
 docs/source/analysis/pandas.rst   | 101 ++++++++++++++++++++++++++++++
 docs/source/analysis/paraview.rst |  55 ++++++++++++++++
 docs/source/analysis/rapids.rst   |  99 +++++++++++++++++++++++++++++
 docs/source/analysis/viewer.rst   |  68 ++++++++++++++++++++
 docs/source/index.rst             |  15 +++++
 7 files changed, 422 insertions(+)
 create mode 100644 docs/source/analysis/contrib.rst
 create mode 100644 docs/source/analysis/dask.rst
 create mode 100644 docs/source/analysis/pandas.rst
 create mode 100644 docs/source/analysis/paraview.rst
 create mode 100644 docs/source/analysis/rapids.rst
 create mode 100644 docs/source/analysis/viewer.rst
diff --git a/docs/source/analysis/contrib.rst b/docs/source/analysis/contrib.rst
new file mode 100644
index 0000000000..f1ccf9df67
--- /dev/null
+++ b/docs/source/analysis/contrib.rst
@@ -0,0 +1,34 @@
+.. _analysis-contrib:
+
+Contributed
+===========
+
+This page contains contributed projects and third party integrations to analyze openPMD data.
+See the `openPMD-projects <https://github.com/openPMD/openPMD-projects#data-processing-and-visualization>`__ catalog for more community integrations.
+
+
+.. _analysis-contrib-visualpic:
+
+3D Visualization: VisualPIC
+---------------------------
+
+openPMD data can be visualized with the domain-specific VisualPIC renderer.
+Please see `the WarpX page for details <https://warpx.readthedocs.io/en/latest/dataanalysis/visualpic.html>`__.
+
+
+.. _analysis-contrib-visit:
+
+3D Visualization: VisIt
+-----------------------
+
+openPMD **HDF5** data can be visualized with VisIt 3.1.0+.
+VisIt supports openPMD HDF5 files but requires renaming the files from ``.h5`` to ``.opmd`` so that they are detected automatically.
+
+
+.. _analysis-contrib-yt:
+
+yt-project
+----------
+
+openPMD **HDF5** data can be visualized with `yt-project <https://yt-project.org>`__.
+Please see the `yt documentation <https://yt-project.org/doc/examining/loading_data.html?highlight=openpmd#openpmd-data>`__ for details.
diff --git a/docs/source/analysis/dask.rst b/docs/source/analysis/dask.rst
new file mode 100644
index 0000000000..e00dce0e98
--- /dev/null
+++ b/docs/source/analysis/dask.rst
@@ -0,0 +1,50 @@
+.. _analysis-dask:
+
+DASK
+====
+
+The Python bindings of openPMD-api provide direct methods to load data into the parallel, `DASK data analysis ecosystem <https://www.dask.org>`__.
+
+
+How to Install
+--------------
+
+Among many package managers, `PyPI <https://pypi.org/project/dask/>`__ ships the latest packages of DASK:
+
+.. code-block:: bash
+
+    python3 -m pip install -U dask
+    python3 -m pip install -U pyarrow
+
+
+How to Use
+----------
+
+The central Python API calls to convert to DASK datatypes are the ``ParticleSpecies.to_dask`` and ``Record_Component.to_dask_array`` methods.
+
+.. code-block:: python
+
+   s = io.Series("samples/git-sample/data%T.h5", io.Access.read_only)
+   electrons = s.iterations[400].particles["electrons"]
+
+   # the default schedulers are local/threaded. We can also use local
+   # "processes" or for multi-node "distributed", among others.
+   dask.config.set(scheduler='processes')
+
+   df = electrons.to_dask()
+   type(df)  # ...
+
+   E = s.iterations[400].meshes["E"]
+   E_x = E["x"]
+   darr_x = E_x.to_dask_array()
+   type(darr_x)  # ...
+
+   # note: no series.flush() needed
+
+
+Example
+-------
+
+A detailed example script for particle and field analysis is documented as ``11_particle_dataframe.py`` in our :ref:`examples <usage-examples>`.
+
+See a video of openPMD on DASK in action in `pull request #963 <https://github.com/openPMD/openPMD-api/pull/963#issuecomment-873350174>`__ (part of openPMD-api v0.14.0 and later).
diff --git a/docs/source/analysis/pandas.rst b/docs/source/analysis/pandas.rst
new file mode 100644
index 0000000000..dcfe97aae2
--- /dev/null
+++ b/docs/source/analysis/pandas.rst
@@ -0,0 +1,101 @@
+.. _analysis-pandas:
+
+Pandas
+======
+
+The Python bindings of openPMD-api provide direct methods to load data into the `Pandas data analysis ecosystem <https://pandas.pydata.org>`__.
+
+Pandas computes on the CPU; for GPU-accelerated data analysis, see :ref:`RAPIDS <analysis-rapids>`.
+
+
+.. _analysis-pandas-install:
+
+How to Install
+--------------
+
+Among many package managers, `PyPI <https://pypi.org/project/pandas/>`__ ships the latest packages of pandas:
+
+.. code-block:: bash
+
+    python3 -m pip install -U pandas
+
+
+.. _analysis-pandas-df:
+
+Dataframes
+----------
+
+The central Python API call to convert openPMD particles to a Pandas dataframe is the ``ParticleSpecies.to_df`` method.
+
+.. code-block:: python
+
+   import openpmd_api as io
+
+   s = io.Series("samples/git-sample/data%T.h5", io.Access.read_only)
+   electrons = s.iterations[400].particles["electrons"]
+
+   df = electrons.to_df()
+
+   type(df)  # pd.DataFrame
+   print(df)
+
+   # note: no series.flush() needed
+
+One can also combine all iterations in a single dataframe like this:
+
+.. code-block:: python
+
+   import pandas as pd
+
+   df = pd.concat(
+        (
+            s.iterations[i].particles["electrons"].to_df().assign(iteration=i)
+            for i in s.iterations
+        ),
+        axis=0,
+        ignore_index=True,
+   )
+
+   # like before but with a new column "iteration" and all particles
+   print(df)
+
+
+.. _analysis-pandas-ascii:
+
+openPMD to ASCII
+----------------
+
+Once converted to a Pandas dataframe, export of openPMD data to text is very simple.
+We generally do not recommend this because ASCII processing is slower, uses significantly more space on disk and has less precision than the binary data usually stored in openPMD data series.
+Nonetheless, in some cases and especially for small, human-readable data sets this can be helpful.
+
+The central Pandas call for this is `DataFrame.to_csv <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html>`__.
+
+.. code-block:: python
+
+   # creates an electrons.csv file
+   df.to_csv("electrons.csv", sep=",", header=True)
+
+
+.. _analysis-pandas-sql:
+
+openPMD as SQL Database
+-----------------------
+
+Once converted to a Pandas dataframe, one can query and process openPMD data also with `SQL syntax <https://en.wikipedia.org/wiki/SQL>`__ as provided by many databases.
+
+A project that provides such syntax is for instance `pandasql <https://github.com/yhat/pandasql/>`__.
+
+.. code-block:: python
+
+    python3 -m pip install -U pandasql
+
+or one can `export into an SQL database <https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html>`__.
+
+
+.. _analysis-pandas-example:
+
+Example
+-------
+
+A detailed example script for particle and field analysis is documented as ``11_particle_dataframe.py`` in our :ref:`examples <usage-examples>`.
diff --git a/docs/source/analysis/paraview.rst b/docs/source/analysis/paraview.rst
new file mode 100644
index 0000000000..696f08bbb2
--- /dev/null
+++ b/docs/source/analysis/paraview.rst
@@ -0,0 +1,55 @@
+.. _analysis-paraview:
+
+3D Visualization: ParaView
+==========================
+
+openPMD data can be visualized by ParaView, an open source visualization and analysis software.
+ParaView can be downloaded and installed from https://www.paraview.org.
+Use the latest version for best results.
+
+Tutorials
+---------
+
+ParaView is a powerful, general parallel rendering program.
+If this is your first time using ParaView, consider starting with a tutorial.
+
+* https://www.paraview.org/Wiki/The_ParaView_Tutorial
+* https://www.youtube.com/results?search_query=paraview+introduction
+* https://www.youtube.com/results?search_query=paraview+tutorial
+
+
+openPMD
+-------
+
+openPMD files can be visualized with ParaView 5.9+; using 5.11+ is recommended.
+ParaView supports ADIOS1, ADIOS2 and HDF5 files, as its openPMD support is implemented against the Python bindings of openPMD-api.
+
+For openPMD output to be recognized, create a small textfile with ``.pmd`` ending per data series, which can be opened with ParaView:
+
+.. code-block:: console
+
+   $ cat paraview.pmd
+   openpmd_%06T.bp
+
+The file contains the same string as one would put in an openPMD ``Series("....")`` object.
+
+.. tip::
+
+   When you first open ParaView, adjust its global ``Settings`` (Linux: under menu item ``Edit``).
+   ``General`` -> ``Advanced`` -> Search for ``data`` -> ``Data Processing Options``.
+   Check the box ``Auto Convert Properties``.
+
+   This will simplify application of filters, e.g., contouring of components of vector fields, without first adding a calculator that extracts a single component or magnitude.
+
+.. warning::
+
+   As of ParaView 5.11 and older, the axisLabel is not yet read for fields.
+   See, e.g., `WarpX issue 1803 <https://github.com/ECP-WarpX/WarpX/issues/1803>`__.
+   Please apply rotation of, e.g., ``0 -90 0`` to mesh data where needed.
+
+.. warning::
+
+   `ParaView issue 21837 <https://gitlab.kitware.com/paraview/paraview/-/issues/21837>`__:
+   In order to visualize particle traces with the ``Temporal Particles To Pathlines``, you need to apply the ``Merge Blocks`` filter first.
+
+   If you have multiple species, you may have to extract the species you want with ``Extract Block`` before applying ``Merge Blocks``.
diff --git a/docs/source/analysis/rapids.rst b/docs/source/analysis/rapids.rst
new file mode 100644
index 0000000000..41acc55308
--- /dev/null
+++ b/docs/source/analysis/rapids.rst
@@ -0,0 +1,99 @@
+.. _analysis-rapids:
+
+RAPIDS
+======
+
+The Python bindings of openPMD-api enable easy loading into the GPU-accelerated `RAPIDS.ai datascience & AI/ML ecosystem <https://rapids.ai/>`__.
+
+
+.. _analysis-rapids-install:
+
+How to Install
+--------------
+
+Follow the `official documentation <https://docs.rapids.ai/install>`__ to install RAPIDS.
+
+.. code-block:: bash
+
+   # preparation
+   conda update -n base conda
+   conda install -n base conda-libmamba-solver
+   conda config --set solver libmamba
+
+   # install
+   conda create -n rapids -c rapidsai -c conda-forge -c nvidia rapids python cudatoolkit openpmd-api pandas
+   conda activate rapids
+
+
+.. _analysis-rapids-cudf:
+
+Dataframes
+----------
+
+The central Python API call to convert openPMD particles to a cuDF dataframe is the ``ParticleSpecies.to_df`` method, whose result is passed to ``cudf.from_pandas``.
+
+.. code-block:: python
+
+   import openpmd_api as io
+   import cudf
+
+   s = io.Series("samples/git-sample/data%T.h5", io.Access.read_only)
+   electrons = s.iterations[400].particles["electrons"]
+
+   cdf = cudf.from_pandas(electrons.to_df())
+
+   type(cdf)  # cudf.DataFrame
+   print(cdf)
+
+   # note: no series.flush() needed
+
+One can also combine all iterations in a single dataframe like this:
+
+.. code-block:: python
+
+   cdf = cudf.concat(
+        (
+            cudf.from_pandas(s.iterations[i].particles["electrons"].to_df().assign(iteration=i))
+            for i in s.iterations
+        ),
+        axis=0,
+        ignore_index=True,
+   )
+
+   # like before but with a new column "iteration" and all particles
+   print(cdf)
+
+
+.. _analysis-rapids-sql:
+
+openPMD as SQL Database
+-----------------------
+
+Once converted to a dataframe, one can query and process openPMD data also with `SQL syntax <https://en.wikipedia.org/wiki/SQL>`__ as provided by many databases.
+
+A project that provides such syntax is for instance `BlazingSQL <https://github.com/BlazingDB/blazingsql>`__ (see the `BlazingSQL install documentation <https://github.com/BlazingDB/blazingsql#prerequisites>`__).
+
+.. code-block:: python
+
+   import openpmd_api as io
+   from blazingsql import BlazingContext
+
+   s = io.Series("samples/git-sample/data%T.h5", io.Access.read_only)
+   electrons = s.iterations[400].particles["electrons"]
+
+   bc = BlazingContext(enable_progress_bar=True)
+   bc.create_table('electrons', electrons.to_df())
+
+   # all properties for electrons > 3e11 kg*m/s
+   bc.sql('SELECT * FROM electrons WHERE momentum_z > 3e11')
+
+   # selected properties
+   bc.sql('SELECT momentum_x, momentum_y, momentum_z, weighting FROM electrons WHERE momentum_z > 3e11')
+
+
+.. _analysis-rapids-example:
+
+Example
+-------
+
+A detailed example script for particle and field analysis is documented as ``11_particle_dataframe.py`` in our :ref:`examples <usage-examples>`.
diff --git a/docs/source/analysis/viewer.rst b/docs/source/analysis/viewer.rst
new file mode 100644
index 0000000000..acfbd08a9e
--- /dev/null
+++ b/docs/source/analysis/viewer.rst
@@ -0,0 +1,68 @@
+.. _analysis-viewer:
+
+openPMD-viewer
+==============
+
+`openPMD-viewer <https://github.com/openPMD/openPMD-viewer>`__ (`documentation <https://openpmd-viewer.readthedocs.io>`__) is a Python package to access openPMD data.
+
+It allows you to:
+
+* Quickly browse through the data, with a GUI-type interface in the Jupyter notebook
+* Have access to the data numpy array, for more detailed analysis
+
+Installation
+------------
+
+openPMD-viewer can be installed via ``conda`` or ``pip``:
+
+.. code-block:: bash
+
+    conda install -c conda-forge openpmd-viewer openpmd-api
+
+.. code-block:: bash
+
+    python3 -m pip install openPMD-viewer openPMD-api
+
+Usage
+-----
+
+openPMD-viewer can be used either in simple Python scripts or in `Jupyter <https://jupyter.org>`__.
+For interactive plots in Jupyter lab, add this `"cell magic" <https://ipython.readthedocs.io/en/stable/interactive/magics.html>`__ to the first line of your notebook:
+
+.. code-block:: python
+
+   %matplotlib widget
+
+and for Jupyter notebook use this instead:
+
+.. code-block:: python
+
+   %matplotlib notebook
+
+If none of those work, e.g. because `ipympl <https://github.com/matplotlib/ipympl#installation>`__ is not properly installed, you can as a last resort always try ``%matplotlib inline`` for non-interactive plots.
+
+In both interactive and scripted usage, you can import openPMD-viewer, and load the data with the following commands:
+
+.. code-block:: python
+
+    from openpmd_viewer import OpenPMDTimeSeries
+    ts = OpenPMDTimeSeries('path/to/data/series/')
+
+.. note::
+
+    If you are using the Jupyter notebook, then you can start a pre-filled
+    notebook, which already contains the above lines, by typing in a terminal:
+
+    ::
+
+        openPMD_notebook
+
+When using the Jupyter notebook, you can quickly browse through the data
+by using the command:
+
+::
+
+    ts.slider()
+
+You can also access the particle and field data as numpy arrays with the methods ``ts.get_field`` and ``ts.get_particle``.
+See the openPMD-viewer tutorials `on read-the-docs <https://openpmd-viewer.readthedocs.io>`__ for more info.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 6c59a7997d..7b7f050069 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -21,6 +21,7 @@ Writing & reading through those backends and their associated files is supported
    section#api-details,
    section#utilities,
    section#backends,
+   section#data-analysis,
    section#development,
    section#maintenance {
        display:none;
@@ -128,6 +129,20 @@ Backends
    backends/adios2
    backends/hdf5
 
+Data Analysis
+-------------
+.. toctree::
+   :caption: DATA ANALYSIS
+   :maxdepth: 1
+   :hidden:
+
+   analysis/viewer
+   analysis/paraview
+   analysis/pandas
+   analysis/dask
+   analysis/rapids
+   analysis/contrib
+
 Development
 -----------
 .. toctree::