replaced all pymc potential with pymc censored (#750)

* replaced all pymc potential with pymc censored * removed gumbel_sf function that is not being used * added rng to samplers * put back seeds for sampling data observations --------- Co-authored-by: Jonathan Dekermanjian <[email protected]>
pymc-devs · Dec 16, 2024 · 1ed0a6d · 1ed0a6d
1 parent 6fdd588
commit 1ed0a6d
Show file tree

Hide file tree

Showing 2 changed files with 377 additions and 322 deletions.
diff --git a/examples/survival_analysis/weibull_aft.ipynb b/examples/survival_analysis/weibull_aft.ipynb
diff --git a/examples/survival_analysis/weibull_aft.myst.md b/examples/survival_analysis/weibull_aft.myst.md
@@ -8,6 +8,9 @@ kernelspec:
   display_name: pymc
   language: python
   name: python3
+myst:
+  substitutions:
+    extra_dependencies: statsmodels
 ---
 
 (weibull_aft)=
@@ -25,15 +28,26 @@ import arviz as az
 import numpy as np
 import pymc as pm
 import pytensor.tensor as pt
-import statsmodels.api as sm
 
 print(f"Running on PyMC v{pm.__version__}")
 ```
 
+:::{include} ../extra_installs.md
+:::
+
+```{code-cell} ipython3
+# These dependencies need to be installed separately from PyMC
+import statsmodels.api as sm
+```
+
 ```{code-cell} ipython3
 %config InlineBackend.figure_format = 'retina'
+# These seeds are for sampling data observations
 RANDOM_SEED = 8927
 np.random.seed(RANDOM_SEED)
+# Set a seed for reproducibility of posterior results
+seed: int = sum(map(ord, "aft_weibull"))
+rng: np.random.Generator = np.random.default_rng(seed=seed)
 az.style.use("arviz-darkgrid")
 ```
 
@@ -71,7 +85,9 @@ censored[:5]
 
 We have an unique problem when modelling censored data. Strictly speaking, we don't have any _data_ for censored values: we only know the _number_ of values that were censored. How can we include this information in our model?
 
-One way do this is by making use of `pm.Potential`. The [PyMC2 docs](https://pymc-devs.github.io/pymc/modelbuilding.html#the-potential-class) explain its usage very well. Essentially, declaring `pm.Potential('x', logp)` will add `logp` to the log-likelihood of the model.
+One way do this is by making use of `pm.Potential`. The [PyMC2 docs](https://pymc-devs.github.io/pymc/modelbuilding.html#the-potential-class) explain its usage very well. Essentially, declaring `pm.Potential('x', logp)` will add `logp` to the log-likelihood of the model. 
+
+However, `pm.Potential` only effect probability based sampling this excludes using `pm.sample_prior_predictice` and `pm.sample_posterior_predictive`. We can overcome these limitations by using `pm.Censored` instead. We can model our right-censored data by defining the `upper` argument of `pm.Censored`.
 
 +++
 
@@ -80,36 +96,40 @@ One way do this is by making use of `pm.Potential`. The [PyMC2 docs](https://pym
 This parameterization is an intuitive, straightforward parameterization of the Weibull survival function. This is probably the first parameterization to come to one's mind.
 
 ```{code-cell} ipython3
-def weibull_lccdf(x, alpha, beta):
-    """Log complementary cdf of Weibull distribution."""
-    return -((x / beta) ** alpha)
+# normalize the event time between 0 and 1
+y_norm = y / np.max(y)
+```
+
+```{code-cell} ipython3
+# If censored then observed event time else maximum time
+right_censored = [x if x > 0 else np.max(y_norm) for x in y_norm * censored]
 ```
 
 ```{code-cell} ipython3
 with pm.Model() as model_1:
-    alpha_sd = 10.0
+    alpha_sd = 1.0
 
-    mu = pm.Normal("mu", mu=0, sigma=100)
+    mu = pm.Normal("mu", mu=0, sigma=1)
     alpha_raw = pm.Normal("a0", mu=0, sigma=0.1)
     alpha = pm.Deterministic("alpha", pt.exp(alpha_sd * alpha_raw))
     beta = pm.Deterministic("beta", pt.exp(mu / alpha))
+    beta_backtransformed = pm.Deterministic("beta_backtransformed", beta * np.max(y))
 
-    y_obs = pm.Weibull("y_obs", alpha=alpha, beta=beta, observed=y[~censored])
-    y_cens = pm.Potential("y_cens", weibull_lccdf(y[censored], alpha, beta))
+    latent = pm.Weibull.dist(alpha=alpha, beta=beta)
+    y_obs = pm.Censored("Censored_likelihood", latent, upper=right_censored, observed=y_norm)
 ```
 
 ```{code-cell} ipython3
 with model_1:
-    # Change init to avoid divergences
-    data_1 = pm.sample(target_accept=0.9, init="adapt_diag")
+    idata_param1 = pm.sample(nuts_sampler="numpyro", random_seed=rng)
 ```
 
 ```{code-cell} ipython3
-az.plot_trace(data_1, var_names=["alpha", "beta"])
+az.plot_trace(idata_param1, var_names=["alpha", "beta"])
 ```
 
 ```{code-cell} ipython3
-az.summary(data_1, var_names=["alpha", "beta"], round_to=2)
+az.summary(idata_param1, var_names=["alpha", "beta", "beta_backtransformed"], round_to=2)
 ```
 
 ## Parameterization 2
@@ -120,26 +140,26 @@ For more information, see [this Stan example model](https://github.com/stan-dev/
 
 ```{code-cell} ipython3
 with pm.Model() as model_2:
-    alpha = pm.Normal("alpha", mu=0, sigma=10)
-    r = pm.Gamma("r", alpha=1, beta=0.001, testval=0.25)
+    alpha = pm.Normal("alpha", mu=0, sigma=1)
+    r = pm.Gamma("r", alpha=2, beta=1)
     beta = pm.Deterministic("beta", pt.exp(-alpha / r))
+    beta_backtransformed = pm.Deterministic("beta_backtransformed", beta * np.max(y))
 
-    y_obs = pm.Weibull("y_obs", alpha=r, beta=beta, observed=y[~censored])
-    y_cens = pm.Potential("y_cens", weibull_lccdf(y[censored], r, beta))
+    latent = pm.Weibull.dist(alpha=r, beta=beta)
+    y_obs = pm.Censored("Censored_likelihood", latent, upper=right_censored, observed=y_norm)
 ```
 
 ```{code-cell} ipython3
 with model_2:
-    # Increase target_accept to avoid divergences
-    data_2 = pm.sample(target_accept=0.9)
+    idata_param2 = pm.sample(nuts_sampler="numpyro", random_seed=rng)
 ```
 
 ```{code-cell} ipython3
-az.plot_trace(data_2, var_names=["r", "beta"])
+az.plot_trace(idata_param2, var_names=["r", "beta"])
 ```
 
 ```{code-cell} ipython3
-az.summary(data_2, var_names=["r", "beta"], round_to=2)
+az.summary(idata_param2, var_names=["r", "beta", "beta_backtransformed"], round_to=2)
 ```
 
 ## Parameterization 3
@@ -148,41 +168,41 @@ In this parameterization, we model the log-linear error distribution with a Gumb
 
 ```{code-cell} ipython3
 logtime = np.log(y)
+```
 
-
-def gumbel_sf(y, mu, sigma):
-    """Gumbel survival function."""
-    return 1.0 - pt.exp(-pt.exp(-(y - mu) / sigma))
+```{code-cell} ipython3
+# If censored then observed event time else maximum time
+right_censored = [x if x > 0 else np.max(logtime) for x in logtime * censored]
 ```
 
 ```{code-cell} ipython3
 with pm.Model() as model_3:
-    s = pm.HalfNormal("s", tau=5.0)
+    s = pm.HalfNormal("s", tau=3.0)
     gamma = pm.Normal("gamma", mu=0, sigma=5)
 
-    y_obs = pm.Gumbel("y_obs", mu=gamma, beta=s, observed=logtime[~censored])
-    y_cens = pm.Potential("y_cens", gumbel_sf(y=logtime[censored], mu=gamma, sigma=s))
+    latent = pm.Gumbel.dist(mu=gamma, beta=s)
+    y_obs = pm.Censored("Censored_likelihood", latent, upper=right_censored, observed=logtime)
 ```
 
 ```{code-cell} ipython3
 with model_3:
-    # Change init to avoid divergences
-    data_3 = pm.sample(init="adapt_diag")
+    idata_param3 = pm.sample(tune=4000, draws=2000, nuts_sampler="numpyro", random_seed=rng)
 ```
 
 ```{code-cell} ipython3
-az.plot_trace(data_3)
+az.plot_trace(idata_param3)
 ```
 
 ```{code-cell} ipython3
-az.summary(data_3, round_to=2)
+az.summary(idata_param3, round_to=2)
 ```
 
 ## Authors
 
 - Originally collated by [Junpeng Lao](https://junpenglao.xyz/) on Apr 21, 2018. See original code [here](https://github.com/junpenglao/Planet_Sakaar_Data_Science/blob/65447fdb431c78b15fbeaef51b8c059f46c9e8d6/PyMC3QnA/discourse_1107.ipynb).
 - Authored and ported to Jupyter notebook by [George Ho](https://eigenfoo.xyz/) on Jul 15, 2018.
 - Updated for compatibility with PyMC v5 by Chris Fonnesbeck on Jan 16, 2023.
+- Updated to replace `pm.Potential` with `pm.Censored` by Jonathan Dekermanjian on Nov 25, 2024.
 
 ```{code-cell} ipython3
 %load_ext watermark