Skip to content

Commit

Permalink
DocS: add clumerge() examples
Browse files Browse the repository at this point in the history
  • Loading branch information
nunofachada committed Jun 20, 2023
1 parent 3fc7fc4 commit 4cecf12
Show file tree
Hide file tree
Showing 3 changed files with 168 additions and 19 deletions.
128 changes: 128 additions & 0 deletions docs/docs/examples/plot_5_mrg_examples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
"""# Merging and hierarchical cluster examples
This section contains several examples on how to merge cluster data, either
generated with **pyclugen** or from other sources. To run the examples we first
need to import the [`clugen()`][pyclugen.main.clugen] and
[`clumerge()`][pyclugen.main.clumerge] functions:"""

from pyclugen import clugen, clumerge

#%%
# To make the examples exactly reproducible we'll import a random number
# generator from NumPy and pass it as a parameter to
# [`clugen()`][pyclugen.main.clugen]. We'll also create a small helper function
# that returns a brand new seeded generator:

import numpy as np
from numpy.random import PCG64, Generator

def rng(seed) -> Generator:
    """Return a brand new NumPy random generator seeded with `seed`.

    A fresh `Generator` backed by a `PCG64` bit generator is built on every
    call, so two calls with the same seed yield independent generator objects
    that produce the same random sequence.
    """
    return Generator(PCG64(seed))

#%%
# Although it is possible to merge data in any dimension, these examples will
# focus on merging 2D data. Therefore, we'll use the same
# [`plot_examples_2d`](plot_functions.md#plot_examples_2d) function used for
# the [2D examples](../plot_2_2d_examples):

from plot_functions import plot_examples_2d

#%%
# ## Merging two data sets generated with `clugen()`

seed1 = 444
seed2 = 555

#%%

e088 = clugen(2, 5, 1000, [1, 1], np.pi / 12, [20, 20], 14, 1.2, 1.5, rng=rng(seed1),
proj_dist_fn="unif", point_dist_fn="n")
e089 = clugen(2, 3, 1500, [1, 0], 0.05, [20, 20], 0, 0, 4, rng=rng(seed2),
point_dist_fn="n", cluster_offset = [20, 0])
e090 = clumerge(e088, e089)

#%%

plot_examples_2d(
e088, "e088: data set 1",
e089, "e089: data set 2",
e090, "e090: merged data sets")

#%%
# In the previous example, clusters from individual data sets remain as separate
# clusters in the merged data set. It's also possible to maintain the original
# cluster labels by setting the `clusters_field` parameter to `None`:

#%%

e091 = clumerge(e088, e089, clusters_field=None)

#%%

plot_examples_2d(
e088, "e088: data set 1",
e089, "e089: data set 2",
e091, "e091: merged data sets")

#%%
# ## Adding noise to a `clugen()`-generated data set

seed = 333

#%%

prng = rng(seed)
e092 = {"points": 120 * prng.random((500, 2)) - 60, "clusters": np.ones(500, dtype=np.int32)}
e093 = clumerge(e092, e090) # clumerge(e092, e088, e089) would also work

#%%

# Each title carries the id of the data set drawn in its panel: e090 (the
# previously merged clusters), e092 (the uniform noise) and e093 (both merged).
plot_examples_2d(
    e090, "e090: original merged data sets",
    e092, "e092: random uniform noise",
    e093, "e093: data sets with noise",
    pmargin=0)

#%%
# ## Merging with data not generated with `clugen()`
#
# Data generated with [`clugen()`][pyclugen.main.clugen] can be merged with
# other data sets, for example data created with one of
# [scikit-learn](https://scikit-learn.org/)'s generators:

seed = 321

#%%

from sklearn.datasets import make_moons

X, y = make_moons(100, noise=0.05, random_state=seed)

e094 = {"points": X, "clusters": y}
e095 = clugen(2, 4, 200, [1, 1], np.pi / 12, [1, 1], 0.1, 0.01, 0.25, rng=rng(seed),
proj_dist_fn = "unif", point_dist_fn = "n")
e096 = clumerge(e094, e095)

#%%

plt = plot_examples_2d(
e094, "e094: generated w/ make_moons()",
e095, "e095: generated w/ clugen()",
e096, "e096: merged data")

#%%
# We can also hierarchize clusters from different sources:

#%%

# Give every point of each source a single top-level label in a new
# "hclusters" field: 1 for the make_moons() data, 2 for the clugen() data.
# The label arrays are sized from the data sets themselves, so they stay in
# sync if the number of generated points is changed above.
e097 = {**e094, "hclusters": np.ones(e094["clusters"].size, dtype=np.int32)}
e098 = {
    **e095._asdict(),
    "hclusters": 2 * np.ones(e095.clusters.size, dtype=np.int32),
}
# Merge using "hclusters" (instead of the default "clusters") as the labels field
e099 = clumerge(e097, e098, clusters_field="hclusters")

#%%

plt = plot_examples_2d(
e097, "e097: generated w/ make_moons()",
e098, "e098: generated w/ clugen()",
e099, "e099: merged data",
clusters_field="hclusters")
58 changes: 39 additions & 19 deletions docs/docs/examples/plot_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,39 +3,49 @@
Several auxiliary functions for plotting the examples in this documentation.
"""

#%%
# %%
# ## Import the required libraries

import matplotlib.pyplot as plt # type: ignore
import numpy as np
import numpy.typing as npt
import pandas as pd
import seaborn as sns # type: ignore

from pyclugen import Clusters

#%%
# %%
# ## clusters2df


def clusters2df(*exs: Clusters) -> pd.DataFrame:
def clusters2df(
*exs: Clusters | dict[str, npt.ArrayLike], clusters_field: str = "clusters"
) -> pd.DataFrame:
"""Convert a sequence of clusters to a Pandas dataframe."""

dfs = []
iex = 1

for ex in exs:
if isinstance(ex, dict):
points = ex["points"]
clusters = ex[clusters_field]
else:
points = ex.points
clusters = ex.clusters

df = pd.DataFrame(
data=ex.points, columns=[f"x{i}" for i in range(np.size(ex.points, 1))]
data=points, columns=[f"x{i}" for i in range(np.size(points, 1))]
)
df["cluster"] = ex.clusters.tolist()
df["example"] = [iex] * ex.clusters.size
df["cluster"] = clusters.tolist()
df["example"] = [iex] * clusters.size
dfs.append(df)
iex += 1

return pd.concat(dfs, ignore_index=True)


#%%
# %%
# ## get_plot_lims


Expand All @@ -59,19 +69,19 @@ def get_plot_lims(df: pd.DataFrame, pmargin: float = 0.1):
return xmaxs, xmins


#%%
# %%
# ## plot_examples_1d


def plot_examples_1d(*ets, ncols: int = 3):
def plot_examples_1d(*ets, ncols: int = 3, clusters_field: str = "clusters"):
"""Plot the 1D examples given in the ets parameter."""

# Get examples
ex = ets[0::2]
# Get titles
et = ets[1::2]

df = clusters2df(*ex)
df = clusters2df(*ex, clusters_field=clusters_field)

# Set seaborn's dark grid style
sns.set_theme(style="darkgrid")
Expand All @@ -96,19 +106,21 @@ def plot_examples_1d(*ets, ncols: int = 3):
ax.set_title(t)


#%%
# %%
# ## plot_examples_2d


def plot_examples_2d(*ets, pmargin: float = 0.1, ncols: int = 3):
def plot_examples_2d(
*ets, pmargin: float = 0.1, ncols: int = 3, clusters_field: str = "clusters"
):
"""Plot the 2D examples given in the ets parameter."""

# Get examples
ex = ets[0::2]
# Get titles
et = ets[1::2]

df = clusters2df(*ex)
df = clusters2df(*ex, clusters_field=clusters_field)

# Get limits in each dimension
xmaxs, xmins = get_plot_lims(df, pmargin=pmargin)
Expand Down Expand Up @@ -136,11 +148,17 @@ def plot_examples_2d(*ets, pmargin: float = 0.1, ncols: int = 3):
ax.set_ylabel("y")


#%%
# %%
# ## plot_examples_3d


def plot_examples_3d(*ets, pmargin: float = 0.1, ncols: int = 3, side=350):
def plot_examples_3d(
*ets,
pmargin: float = 0.1,
ncols: int = 3,
side=350,
clusters_field: str = "clusters",
):
"""Plot the 3D examples given in the ets parameter."""

# Get examples
Expand All @@ -153,7 +171,7 @@ def plot_examples_3d(*ets, pmargin: float = 0.1, ncols: int = 3, side=350):
nrows = max(1, int(np.ceil(num_plots / ncols)))
blank_plots = nrows * ncols - num_plots

df = clusters2df(*ex)
df = clusters2df(*ex, clusters_field=clusters_field)

# Get limits in each dimension
xmaxs, xmins = get_plot_lims(df, pmargin=pmargin)
Expand Down Expand Up @@ -198,17 +216,19 @@ def plot_examples_3d(*ets, pmargin: float = 0.1, ncols: int = 3, side=350):
ax.patch.set_alpha(0)


#%%
# %%
# ## plot_examples_nd


def plot_examples_nd(ex: Clusters, t: str, pmargin: float = 0.1):
def plot_examples_nd(
ex: Clusters, t: str, pmargin: float = 0.1, clusters_field: str = "clusters"
):
"""Plot the nD example given in the ex parameter."""

# How many dimensions?
nd = ex.points.shape[1]

df = clusters2df(ex)
df = clusters2df(ex, clusters_field=clusters_field)

# Get limits in each dimension
xmaxs, xmins = get_plot_lims(df, pmargin=pmargin)
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ dev = [
"coverage",
"pytest-cov >= 3.0.0",
"pytest-mypy",
"scikit-learn",
"seaborn" ]

[project.urls]
Expand Down

0 comments on commit 4cecf12

Please sign in to comment.