
Commit

paper edits
FlyingWorkshop committed Sep 27, 2024
1 parent f25046b commit 2aa4c81
Showing 5 changed files with 121 additions and 31 deletions.
1 change: 1 addition & 0 deletions docs/make.jl
@@ -21,6 +21,7 @@ makedocs(
"Appendix" => [
"Gamma EPCA and the Itakura-Saito Distance" => "math/appendix/gamma.md",
"Poisson EPCA and Generalized KL-Divergence" => "math/appendix/poisson.md",
"EPCA Generalizes the PCA Objective via a Bregman Divergence" => "math/appendix/gaussian.md",
"Inverse Link Functions" => "math/appendix/inverses.md",
"Link Functions and Expectations" => "math/appendix/expectation.md"

1 change: 1 addition & 0 deletions docs/src/math/intro.md
@@ -12,6 +12,7 @@ This guide is organized into several pages, each building on the last:
4. Appendix
- [The Gamma EPCA Objective is the Itakura-Saito Distance](./appendix/gamma.md)
- [The Poisson EPCA Objective is the Generalized KL-Divergence](./appendix/poisson.md)
- [EPCA Generalizes the PCA Objective with Bregman Divergences](./appendix/gaussian.md)
- [The Inverse of the Link Function is the Gradient of the Convex Conjugate](./appendix/inverses.md)
- [The Link Function and the Expectation Parameter](./appendix/expectation.md)

66 changes: 66 additions & 0 deletions paper.bib
@@ -16,6 +16,32 @@ @article{azoury2001relative
publisher={Springer}
}

@article{clustering,
author = {Arindam Banerjee and Srujana Merugu and Inderjit S. Dhillon and Joydeep Ghosh},
title = {Clustering with Bregman Divergences},
journal = {Journal of Machine Learning Research},
year = {2005},
volume = {6},
number = {58},
pages = {1705--1749},
url = {http://jmlr.org/papers/v6/banerjee05b.html}
}

@article{ultrasound,
title = {Fast reduction of speckle noise in real ultrasound images},
journal = {Signal Processing},
volume = {93},
number = {4},
pages = {684--694},
year = {2013},
issn = {0165-1684},
doi = {10.1016/j.sigpro.2012.09.005},
url = {https://www.sciencedirect.com/science/article/pii/S0165168412003258},
author = {Jie Huang and Xiaoping Yang},
keywords = {Real ultrasound image, Speckle reduction, Generalized Kullback–Leibler distance, Variable splitting, Bregman iterative},
abstract = {In this paper, we concentrate on fast removing speckle noise in real ultrasound images. It is hard to design a fast algorithm to solve a speckle reduction model, since the data fitting term in a speckle reduction model is usually not convex. In this paper, we present a convex variational model to deal with speckle noise in real ultrasound images. The data-fitting term of the proposed model is obtained by using a generalized Kullback–Leibler distance. To fast solve the proposed model, we incorporate variable splitting method and Bregman iterative method to propose a fast ultrasound speckle reduction algorithm. The capability of the proposed method is shown both on synthetic images and real ultrasound images.}
}

@article{spectrum,
title = {Itakura-{S}aito distance based autoencoder for dimensionality reduction of mass spectra},
journal = {Chemometrics and Intelligent Laboratory Systems},
@@ -30,6 +56,30 @@ @article{spectrum
abstract = {Small signals may contain important information. Mass spectra of chemical compounds are usually given in a format of sparse high-dimensional data of large dynamic range. As peaks at high m/z (mass to charge ratio) region of a mass spectrum contribute to sensory information, they should not be ignored during the dimensionality reduction process even if the peak is small. However, in most of dimensionality reduction techniques, large peaks in a dataset are typically more emphasized than tiny peaks when Euclidean space is assessed. Autoencoders are widely used nonlinear dimensionality reduction technique, which is known as one special form of artificial neural networks to gain a compressed, distributed representation after learning. In this paper, we present an autoencoder which uses IS (Itakura-Saito) distance as its cost function to achieve a high capability of approximation of small target inputs in dimensionality reduction. The result of comparative experiments showed that our new autoencoder achieved the higher performance in approximation of small targets than that of the autoencoders with conventional cost functions such as the mean squared error and the cross-entropy.}
}

@article{azoury,
title={Relative Loss Bounds for On-Line Density Estimation with the Exponential Family of Distributions},
author={Azoury, Katy S and Warmuth, Manfred K},
journal={Machine Learning},
volume={43},
pages={211--246},
year={2001},
publisher={Springer}
}

@article{forster,
title = {Relative Expected Instantaneous Loss Bounds},
journal = {Journal of Computer and System Sciences},
volume = {64},
number = {1},
pages = {76--102},
year = {2002},
issn = {0022-0000},
doi = {10.1006/jcss.2001.1798},
url = {https://www.sciencedirect.com/science/article/pii/S0022000001917982},
author = {Jürgen Forster and Manfred K. Warmuth},
abstract = {In the literature a number of relative loss bounds have been shown for on-line learning algorithms. Here the relative loss is the total loss of the on-line algorithm in all trials minus the total loss of the best comparator that is chosen off-line. However, for many applications instantaneous loss bounds are more interesting where the learner first sees a batch of examples and then uses these examples to make a prediction on a new instance. We show relative expected instantaneous loss bounds for the case when the examples are i.i.d. with an unknown distribution. We bound the expected loss of the algorithm on the last example minus the expected loss of the best comparator on a random example. In particular, we study linear regression and density estimation problems and show how the leave-one-out loss can be used to prove instantaneous loss bounds for these cases. For linear regression we use an algorithm that is similar to a new on-line learning algorithm developed by Vovk. Recently a large number of relative total loss bounds have been shown that have the form O(lnT), where T is the number of trials/examples. Standard conversions of on-line algorithms to batch algorithms result in relative expected instantaneous loss bounds of the form O(lnTT). Our methods lead to O(1T) upper bounds. In many cases we give tight lower bounds.}
}

@misc{Julia,
title={Julia: A Fast Dynamic Language for Technical Computing},
author={Jeff Bezanson and Stefan Karpinski and Viral B. Shah and Alan Edelman},
@@ -70,6 +120,22 @@ @article{Bregman
abstract = {IN this paper we consider an iterative method of finding the common point of convex sets. This method can be regarded as a generalization of the methods discussed in [1–4]. Apart from problems which can be reduced to finding some point of the intersection of convex sets, the method considered can be applied to the approximate solution of problems in linear and convex programming.}
}

@misc{logexp,
title = {LogExpFunctions.jl},
year = {2024},
publisher = {GitHub},
url = {https://github.com/JuliaStats/LogExpFunctions.jl},
}

@misc{stable_exp,
author = {Mächler, Martin},
year = {2015},
month = {09},
title = {Accurately Computing {$\log(1 - \exp(-|a|))$} Assessed by the {Rmpfr} package},
doi = {10.13140/RG.2.2.11834.70084}
}

@article{symbolics,
author = {Gowda, Shashi and Ma, Yingbo and Cheli, Alessandro and Gw\'{o}\'{z}zd\'{z}, Maja and Shah, Viral B. and Edelman, Alan and Rackauckas, Christopher},
title = {High-Performance Symbolic-Numerics via Multiple Dispatch},
78 changes: 47 additions & 31 deletions paper.md
@@ -31,52 +31,63 @@ bibliography: paper.bib

# Summary

Principal component analysis (PCA) [@PCA] is a fundamental tool in data science and machine learning for dimensionality reduction and denoising. While PCA is effective for continuous, real-valued data, it may not perform well for binary, count, or discrete distribution data. Exponential family PCA (EPCA) [@EPCA] generalizes PCA to accommodate these data types, making it more suitable for tasks such as belief compression in reinforcement learning [@Roy]. `ExpFamilyPCA.jl` is the first Julia [@Julia] package for EPCA, offering fast implementations for common distributions and a flexible interface for custom distributions.
Principal component analysis (PCA) [@PCA] is an important tool for compression, interpretability, and denoising that works best when data is continuous, real-valued, and normally distributed. Exponential family principal component analysis (EPCA) [@EPCA] generalizes PCA to accommodate observations from any exponential family, making it well-suited for binary, count, and discrete distribution data. `ExpFamilyPCA.jl` is the first Julia package [@Julia] to implement EPCA and the first package in any language to support EPCA with multiple distributions.

# Statement of Need

<!-- REDO -->
`ExpFamilyPCA.jl` is a fast, numerically stable package for compressing, interpreting, and denoising high-dimensional data drawn from exponential family distributions. It supports the most common exponential family distributions (§ [Supported Distributions](#supported-distributions)) and includes multiple constructors for custom distributions (§ [Custom Distributions](#custom-distributions)).

To our knowledge, there are no open-source implementations of EPCA, and the sole proprietary package [@epca-MATLAB] is limited to a single distribution. Modern applications of EPCA in reinforcement learning [@Roy] and mass spectrometry [@spectrum] require multiple distributions, numerical stability, and the ability to handle large datasets. `ExpFamilyPCA.jl` addresses this gap by providing fast implementations for several exponential family distributions and multiple constructors for custom distributions. More implementation and mathematical details are in the [documentation](https://sisl.github.io/ExpFamilyPCA.jl/dev/).
EPCA has been used in reinforcement learning and sequential decision making to efficiently process state uncertainty [@Roy]. Similar techniques appear in mass spectrometry [@spectrum], ultrasound denoising [@ultrasound], text analysis [@LitReview], and robust clustering [@clustering], suggesting that EPCA may have further applications in signal processing and machine learning.

# Problem Formulation

- PCA has a specific geometric objective in terms of projections
- This can also be interpreted as a denoising process using Gaussian MLE
- EPCA generalizes geometric objective using Bregman divergences which are related to exponential families
`ExpFamilyPCA.jl` is written in Julia. Julia uses multiple dispatch, which encourages high code reuse and interoperability between packages [@dispatch]. `ExpFamilyPCA.jl` relies on this interoperability and the language's native support for high-performance scientific computing to provide fast symbolic differentiation [@symbolics], optimization [@optim], and numerically stable exponential operations [@stable_exp; @logexp].
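For example, a naive evaluation of $\log(1 + e^x)$ overflows for large $x$, whereas the stable primitives in LogExpFunctions.jl do not. The snippet below is a minimal illustration of this kind of primitive, not package internals:

```julia
using LogExpFunctions

naive_softplus(x) = log(1 + exp(x))   # overflows once exp(x) exceeds floatmax

x = 800.0
naive_softplus(x)   # Inf (exp(800.0) overflows)
log1pexp(x)         # 800.0, computed without overflow
```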

TODO: read the original GLM paper

PCA has many interpretations (e.g., a variance-maximizing compression, a distance-minimizing projection). The interpretation that is most useful for understanding EPCA is the denoising interpretation. Suppose we have $n$ noisy observations $x_1, \dots, x_n \in \mathbb{R}^{d}$.
# Problem Formulation

## Principal Component Analysis

### Geometric Interpretation




Traditional PCA is a low-rank matrix approximation problem. For a data matrix $X \in \mathbb{R}^{n \times d}$ with $n$ observations, we want to find the low-rank matrix approximation $\Theta \in \mathbb{R}^{n \times d}$ such that $\mathrm{rank}(\Theta) = k \leq d$. Formally,
Given a data matrix $X \in \mathbb{R}^{n \times d}$ of $n$ observations, the geometric goal of PCA is to find the closest rank-$k$ approximation $\Theta \in \mathbb{R}^{n \times d}$. Formally,

$$\begin{aligned}
& \underset{\Theta}{\text{minimize}}
& & \|X - \Theta\|_F^2 \\
& & \frac{1}{2}\|X - \Theta\|_F^2 \\
& \text{subject to}
& & \mathrm{rank}\left(\Theta\right) = k
\end{aligned}$$

where $\| \cdot \|_F$ denotes the Frobenius norm[^1] and $\Theta = AV$ where $A = X_k \in \mathbb{R}^{n \times k}$ and $V = X_k \in \mathbb{R}^{k \times d}$.
where $\| \cdot \|_F$ denotes the Frobenius norm.
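As a minimal sketch (not part of the package API), the rank-$k$ minimizer can be read off a truncated singular value decomposition using only Julia's standard library; centering of $X$ is omitted for brevity:

```julia
using LinearAlgebra

# Best rank-k approximation of X in Frobenius norm (Eckart–Young).
function pca_lowrank(X::AbstractMatrix, k::Integer)
    U, S, V = svd(X)
    return U[:, 1:k] * Diagonal(S[1:k]) * V[:, 1:k]'
end

X = randn(100, 10)
Θ = pca_lowrank(X, 2)   # closest rank-2 matrix to X
```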

### Probabilistic Interpretation

[^1]: The Frobenius norm is a generalization of the Euclidean distance and thus a special case of the Bregman divergence (induced from the log-partition of the normal distribution).
Since the geometric PCA objective is equivalent to maximizing the Gaussian log-likelihood, PCA can be viewed as a denoising procedure that recovers the low-dimensional latent structure of the data from high-dimensional observations corrupted by Gaussian noise. Formally,

$$
x_i \sim \mathcal{N}(\theta_i, I)
$$

where $x_i$ is a row of $X$, $\theta_i$ is a row of $\Theta$, and $i \in \{1, \dots, n\}$.
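Dropping additive terms that do not depend on $\Theta$, the Gaussian log-likelihood reduces to the negated PCA objective:

$$
\sum_{i=1}^n \log \mathcal{N}(x_i \mid \theta_i, I) = -\frac{1}{2} \sum_{i=1}^n \| x_i - \theta_i \|_2^2 + \mathrm{const} = -\frac{1}{2} \| X - \Theta \|_F^2 + \mathrm{const},
$$

so maximizing the likelihood over rank-$k$ matrices $\Theta$ is equivalent to minimizing $\frac{1}{2}\|X - \Theta\|_F^2$.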

## Exponential Family PCA

EPCA is a generalization of PCA that replaces PCA's geometric objective with a more general probabilistic objective that minimizes the generalized Bregman divergence—a measure closely related to the exponential family (see [documentation](https://sisl.github.io/ExpFamilyPCA.jl/dev/math/bregman/))—rather than the squared Frobenius norm. The Bregman divergence $B_F$ associated with $F$ is defined [@Bregman]:
## Bregman Divergences

Bregman divergences [@Bregman] provide a flexible measure of statistical difference. For a strictly convex and continuously differentiable function $F$, the Bregman divergence between $p, q \in \mathrm{dom}(F)$ is

$$
B_F(p, q) = F(p) - F(q) - \nabla F(q) \cdot (p - q).
B_F(p \| q) = F(p) - F(q) - \langle \nabla F(q), p - q \rangle.
$$

The Bregman-based objective makes EPCA particularly versatile for dimensionality reduction when working with non-Gaussian data distributions:
<!-- todo: double check if F is the cumulant or the conjugate of the cumulant or if it even matters -->

When $F$ is chosen to be the convex conjugate of the log-partition of an exponential family distribution, minimizing the Bregman divergence aligns with maximizing the log-likelihood [@azoury; @forster]. This property makes Bregman divergences particularly suitable for extending PCA to the exponential family. A comprehensive discussion can be found in the [documentation](https://sisl.github.io/ExpFamilyPCA.jl/dev/math/bregman/).
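As a minimal illustration (the helper below is not part of the package API), the definition translates directly into code once $F$ and $\nabla F$ are available:

```julia
using LinearAlgebra

# B_F(p ‖ q) = F(p) - F(q) - ⟨∇F(q), p - q⟩
bregman(F, ∇F, p, q) = F(p) - F(q) - dot(∇F(q), p - q)

# F(x) = ½‖x‖² induces the squared Euclidean distance.
F(x)  = 0.5 * sum(abs2, x)
∇F(x) = x

p, q = [1.0, 2.0], [0.5, 1.5]
bregman(F, ∇F, p, q)   # 0.25 == 0.5 * sum(abs2, p - q)
```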

## Exponential Family Principal Component Analysis

EPCA extends classical PCA by replacing its geometric objective with a probabilistic one that minimizes a generalized Bregman divergence instead of the squared Frobenius norm. Formally, EPCA seeks to solve:

<!--
EPCA is a generalization of PCA that replaces PCA's geometric objective with a more general probabilistic objective that minimizes the generalized Bregman divergence rather than the squared Frobenius norm (see [appendix](https://sisl.github.io/ExpFamilyPCA.jl/dev/math/appendix/gaussian/) to see why PCA is a special case of EPCA).The Bregman-based objective makes EPCA particularly versatile for dimensionality reduction when working with non-Gaussian data distributions: -->

$$\begin{aligned}
& \underset{\Theta}{\text{minimize}}
@@ -92,7 +103,20 @@ In this formulation,
* $B_F(p \| q)$ is the **Bregman divergence** induced from $F$,
* and both $\mu_0 \in \mathrm{range}(g)$ and $\epsilon > 0$ are regularization hyperparameters.

EPCA is similar to generalized linear models (GLMs) [@GLM]. Just as GLMs extend linear regression to handle a variety of response distributions, EPCA generalizes PCA to accommodate data with noise drawn from any exponential family distribution, rather than just Gaussian noise. This allows EPCA to address a broader range of real-world data scenarios where the Gaussian assumption may not hold (e.g., binary, count, discrete distribution data).

This objective allows EPCA to effectively perform dimensionality reduction on data drawn from any exponential family distribution, thereby broadening the applicability of PCA beyond Gaussian data. Notably, PCA itself is a special case of EPCA when the data follows a Gaussian distribution, as detailed in the [appendix](https://sisl.github.io/ExpFamilyPCA.jl/dev/math/appendix/gaussian/).

EPCA shares similarities with generalized linear models (GLMs) [@GLM]. Just as GLMs extend linear regression to accommodate various response distributions, EPCA generalizes PCA to handle data with noise from any exponential family distribution, not limited to Gaussian noise. This generalization allows EPCA to better model real-world data scenarios where the Gaussian assumption may not hold, such as binary, count, or other discrete distributions.

### Example: Poisson EPCA

For the Poisson distribution, the associated EPCA objective is the generalized Kullback-Leibler (KL) divergence (see [appendix](https://sisl.github.io/ExpFamilyPCA.jl/dev/math/appendix/poisson/)), making Poisson EPCA particularly suitable for compressing discrete distribution data. This capability is crucial for applications like belief compression in reinforcement learning [@Roy], where high-dimensional belief states can be effectively reduced with minimal information loss.
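As a sketch (the helper name is illustrative, not a function exported by the package), the generalized KL divergence between vectors $p$ and $q$ with strictly positive entries is:

```julia
# Generalized KL divergence; reduces to standard KL when p and q both sum to 1.
# Entries are assumed strictly positive (0 log 0 would need special handling).
generalized_kl(p, q) = sum(@. p * log(p / q) - p + q)

p = [0.2, 0.5, 0.3]
q = [0.25, 0.45, 0.30]
generalized_kl(p, q)   # ≈ 0.008, small because p ≈ q
```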

<!-- todo: smooth out transition -->

A notable example is belief compression in partially observable Markov decision processes (POMDPs). Using Poisson EPCA, the package reduces high-dimensional belief spaces with minimal information loss, as demonstrated by recreating results from @shortRoy: Poisson EPCA achieves a nearly perfect reconstruction of a $41$-dimensional belief profile using just five basis components [CITE `CompressedBeliefMDPs.jl`, PAPER IN PRE-REVIEW]. Poisson EPCA also yields a more interpretable compression than traditional PCA because it minimizes the generalized KL divergence rather than the Frobenius norm.

![](./scripts/kl_divergence_plot.png)

# API

@@ -143,14 +167,6 @@ gamma_epca = EPCA(indim, outdim, G, g, Val((:G, :g)); options = NegativeDomain())

A lengthier discussion of the `EPCA` constructors and math is provided in the [documentation](https://sisl.github.io/ExpFamilyPCA.jl/dev/math/objectives/).
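For context, a hypothetical end-to-end usage sketch, assuming the convenience constructor `PoissonEPCA` and the `fit!`/`compress`/`decompress` interface described in the documentation (names and keyword arguments should be checked against the current API):

```julia
using ExpFamilyPCA

indim, outdim = 41, 5                      # e.g., a 41-dimensional belief profile
X = rand(0:10, 500, indim)                 # toy count data

epca = PoissonEPCA(indim, outdim)          # assumed convenience constructor
X_compressed = fit!(epca, X; maxiter=50)   # fit the basis and compress the training data
X_reconstructed = decompress(epca, X_compressed)
```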

## Applications

The practical applications of `ExpFamilyPCA.jl` span several domains that deal with non-Gaussian data. One notable example is in reinforcement learning, specifically in belief state compression for partially observable Markov decision processes (POMDPs). Using Poisson EPCA, the package effectively reduces high-dimensional belief spaces with minimal information loss, as demonstrated by recreating results from @shortRoy. In this case, Poisson EPCA achieved nearly perfect reconstruction of a $41$-dimensional belief profile using just five basis components [CITE `CompressedBeleifMDPS.jl`, PAPER IN PRE-REVIEW]. Poisson EPCA compression also produces a more interpretable compression than traditional PCA because it minimizes the generalized KL divergence rather than the Frobenius norm.

![](./scripts/kl_divergence_plot.png)

`ExpFamilyPCA.jl` can also be used in fields like mass spectrometry and survival analysis, where specific data distributions like the gamma or Weibull may be more appropriate. By minimizing divergences suited to the distribution, `ExpFamilyPCA.jl` provides more accurate and interpretable dimensionality reduction than standard PCA.

# Acknowledgments

We thank Arec Jamgochian, Robert Moss, and Dylan Asmar for their help and guidance.
6 changes: 6 additions & 0 deletions proposal.md
@@ -1,11 +1,17 @@
# Summary

Principal component analysis (PCA) [@PCA] is a fundamental tool in data science and machine learning for dimensionality reduction and denoising. While PCA is effective for continuous, real-valued data, it may not perform well for binary, count, or discrete distribution data. Exponential family PCA (EPCA) [@EPCA] generalizes PCA to accommodate these data types, making it more suitable for tasks such as belief compression in reinforcement learning [@Roy]. `ExpFamilyPCA.jl` is the first Julia [@Julia] package for EPCA, offering fast implementations for common distributions and a flexible interface for custom distributions.

# Statement of Need

# Problem Formulation

## Principal Component Analysis

## The Exponential Family

## Bregman Divergences

## Exponential Family Principal Component Analysis

### Poisson
