Docs: add clumerge() examples

clugen · Jun 20, 2023 · 4cecf12 · 4cecf12
1 parent 3fc7fc4
commit 4cecf12
Show file tree

Hide file tree

Showing 3 changed files with 168 additions and 19 deletions.
diff --git a/docs/docs/examples/plot_5_mrg_examples.py b/docs/docs/examples/plot_5_mrg_examples.py
@@ -0,0 +1,128 @@
+"""# Merging and hierarchical cluster examples
+
+This section contains several examples on how to merge cluster data, either
+generated with **pyclugen** or from other sources. To run the examples we first
+need to import the [`clugen()`][pyclugen.main.clugen] and
+[`clumerge()`][pyclugen.main.clumerge] functions:"""
+
+from pyclugen import clugen, clumerge
+
+#%%
+# To make the examples exactly reproducible we'll import a random number
+# generator from NumPy and pass it as a parameter to
+# [`clugen()`][pyclugen.main.clugen]. We'll also create a small helper function
+# for providing us a brand new seeded generator:
+
+import numpy as np
+from numpy.random import PCG64, Generator
+
+def rng(seed):
+    return Generator(PCG64(seed))
+
+#%%
+# Although it is possible to merge data in any dimension, these examples will
+# focus on merging 2D data. Therefore, we'll use the same
+# [`plot_examples_2d`](plot_functions.md#plot_examples_2d) function used for
+# the [2D examples](../plot_2_2d_examples):
+
+from plot_functions import plot_examples_2d
+
+#%%
+# ## Merging two data sets generated with `clugen()`
+
+seed1 = 444
+seed2 = 555
+
+#%%
+
+e088 = clugen(2, 5, 1000, [1, 1], np.pi / 12, [20, 20], 14, 1.2, 1.5, rng=rng(seed1),
+    proj_dist_fn="unif", point_dist_fn="n")
+e089 = clugen(2, 3, 1500, [1, 0], 0.05, [20, 20], 0, 0, 4, rng=rng(seed2),
+    point_dist_fn="n", cluster_offset = [20, 0])
+e090 = clumerge(e088, e089)
+
+#%%
+
+plot_examples_2d(
+    e088, "e088: data set 1",
+    e089, "e089: data set 2",
+    e090, "e090: merged data sets")
+
+#%%
+# In the previous example, clusters from individual data sets remain as separate
+# clusters in the merged data set. It's also possible to maintain the original
+# cluster labels by setting the `clusters_field` parameter to `None`:
+
+#%%
+
+e091 = clumerge(e088, e089, clusters_field=None)
+
+#%%
+
+plot_examples_2d(
+    e088, "e088: data set 1",
+    e089, "e089: data set 2",
+    e091, "e091: merged data sets")
+
+#%%
+# ## Adding noise to a `clugen()`-generated data set
+
+seed = 333
+
+#%%
+
+prng = rng(seed)
+e092 = {"points": 120 * prng.random((500, 2)) - 60, "clusters": np.ones(500, dtype=np.int32)}
+e093 = clumerge(e092, e090) # clumerge(e092, e088, e089) would also work
+
+#%%
+
+plot_examples_2d(
+    e090, "e090: original merged data sets",
+    e092, "e092: random uniform noise",
+    e093, "e093: data sets with noise",
+    pmargin=0)
+
+#%%
+# ## Merging with data not generated with `clugen()`
+#
+# Data generated with [`clugen()`][pyclugen.main.clugen] can be merged with
+# other data sets, for example data created with one of
+# [scikit-learn](https://scikit-learn.org/)'s generators:
+
+seed = 321
+
+#%%
+
+from sklearn.datasets import make_moons
+
+X, y = make_moons(100, noise=0.05, random_state=seed)
+
+e094 = {"points": X, "clusters": y}
+e095 = clugen(2, 4, 200, [1, 1], np.pi / 12, [1, 1], 0.1, 0.01, 0.25, rng=rng(seed),
+    proj_dist_fn = "unif", point_dist_fn = "n")
+e096 = clumerge(e094, e095)
+
+#%%
+
+plt = plot_examples_2d(
+    e094, "e094: generated w/ make_moons()",
+    e095, "e095: generated w/ clugen()",
+    e096, "e096: merged data")
+
+#%%
+# We can also hierarchize clusters from different sources:
+
+#%%
+
+e097 = {**e094, "hclusters": np.ones(100, dtype=np.int32)}
+e098 = {**e095._asdict(), "hclusters": 2 * np.ones(200, np.int32)}
+e099 = clumerge(e097, e098, clusters_field="hclusters")
+
+#%%
+
+plt = plot_examples_2d(
+    e097, "e097: generated w/ make_moons()",
+    e098, "e098: generated w/ clugen()",
+    e099, "e099: merged data",
+    clusters_field="hclusters")
diff --git a/docs/docs/examples/plot_functions.py b/docs/docs/examples/plot_functions.py
@@ -3,39 +3,49 @@
 Several auxiliary functions for plotting the examples in this documentation.
 """
 
-#%%
+# %%
 # ## Import the required libraries
 
 import matplotlib.pyplot as plt  # type: ignore
 import numpy as np
+import numpy.typing as npt
 import pandas as pd
 import seaborn as sns  # type: ignore
 
 from pyclugen import Clusters
 
-#%%
+# %%
 # ## clusters2df
 
 
-def clusters2df(*exs: Clusters) -> pd.DataFrame:
+def clusters2df(
+    *exs: Clusters | dict[str, npt.ArrayLike], clusters_field: str = "clusters"
+) -> pd.DataFrame:
     """Convert a sequence of clusters to a Pandas dataframe."""
 
     dfs = []
     iex = 1
 
     for ex in exs:
+        if isinstance(ex, dict):
+            points = ex["points"]
+            clusters = np.asarray(ex[clusters_field])  # accept any array-like
+        else:
+            points = ex.points
+            clusters = ex.clusters
+
         df = pd.DataFrame(
-            data=ex.points, columns=[f"x{i}" for i in range(np.size(ex.points, 1))]
+            data=points, columns=[f"x{i}" for i in range(np.size(points, 1))]
         )
-        df["cluster"] = ex.clusters.tolist()
-        df["example"] = [iex] * ex.clusters.size
+        df["cluster"] = clusters.tolist()
+        df["example"] = [iex] * clusters.size
         dfs.append(df)
         iex += 1
 
     return pd.concat(dfs, ignore_index=True)
 
 
-#%%
+# %%
 # ## get_plot_lims
 
 
@@ -59,19 +69,19 @@ def get_plot_lims(df: pd.DataFrame, pmargin: float = 0.1):
     return xmaxs, xmins
 
 
-#%%
+# %%
 # ## plot_examples_1d
 
 
-def plot_examples_1d(*ets, ncols: int = 3):
+def plot_examples_1d(*ets, ncols: int = 3, clusters_field: str = "clusters"):
     """Plot the 1D examples given in the ets parameter."""
 
     # Get examples
     ex = ets[0::2]
     # Get titles
     et = ets[1::2]
 
-    df = clusters2df(*ex)
+    df = clusters2df(*ex, clusters_field=clusters_field)
 
     # Set seaborn's dark grid style
     sns.set_theme(style="darkgrid")
@@ -96,19 +106,21 @@ def plot_examples_1d(*ets, ncols: int = 3):
         ax.set_title(t)
 
 
-#%%
+# %%
 # ## plot_examples_2d
 
 
-def plot_examples_2d(*ets, pmargin: float = 0.1, ncols: int = 3):
+def plot_examples_2d(
+    *ets, pmargin: float = 0.1, ncols: int = 3, clusters_field: str = "clusters"
+):
     """Plot the 2D examples given in the ets parameter."""
 
     # Get examples
     ex = ets[0::2]
     # Get titles
     et = ets[1::2]
 
-    df = clusters2df(*ex)
+    df = clusters2df(*ex, clusters_field=clusters_field)
 
     # Get limits in each dimension
     xmaxs, xmins = get_plot_lims(df, pmargin=pmargin)
@@ -136,11 +148,17 @@ def plot_examples_2d(*ets, pmargin: float = 0.1, ncols: int = 3):
         ax.set_ylabel("y")
 
 
-#%%
+# %%
 # ## plot_examples_3d
 
 
-def plot_examples_3d(*ets, pmargin: float = 0.1, ncols: int = 3, side=350):
+def plot_examples_3d(
+    *ets,
+    pmargin: float = 0.1,
+    ncols: int = 3,
+    side=350,
+    clusters_field: str = "clusters",
+):
     """Plot the 3D examples given in the ets parameter."""
 
     # Get examples
@@ -153,7 +171,7 @@ def plot_examples_3d(*ets, pmargin: float = 0.1, ncols: int = 3, side=350):
     nrows = max(1, int(np.ceil(num_plots / ncols)))
     blank_plots = nrows * ncols - num_plots
 
-    df = clusters2df(*ex)
+    df = clusters2df(*ex, clusters_field=clusters_field)
 
     # Get limits in each dimension
     xmaxs, xmins = get_plot_lims(df, pmargin=pmargin)
@@ -198,17 +216,19 @@ def plot_examples_3d(*ets, pmargin: float = 0.1, ncols: int = 3, side=350):
         ax.patch.set_alpha(0)
 
 
-#%%
+# %%
 # ## plot_examples_nd
 
 
-def plot_examples_nd(ex: Clusters, t: str, pmargin: float = 0.1):
+def plot_examples_nd(
+    ex: Clusters, t: str, pmargin: float = 0.1, clusters_field: str = "clusters"
+):
     """Plot the nD example given in the ex parameter."""
 
     # How many dimensions?
     nd = ex.points.shape[1]
 
-    df = clusters2df(ex)
+    df = clusters2df(ex, clusters_field=clusters_field)
 
     # Get limits in each dimension
     xmaxs, xmins = get_plot_lims(df, pmargin=pmargin)

diff --git a/pyproject.toml b/pyproject.toml
@@ -54,6 +54,7 @@ dev = [
     "coverage",
     "pytest-cov >= 3.0.0",
     "pytest-mypy",
+    "scikit-learn",
     "seaborn" ]
 
 [project.urls]