Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
yuyay committed Feb 8, 2022
0 parents commit 4b5dd97
Show file tree
Hide file tree
Showing 13 changed files with 873 additions and 0 deletions.
133 changes: 133 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# vscode
*.code-workspace
.vscode/
Binary file added LICENSE.txt
Binary file not shown.
Empty file added MANIFEST.in
Empty file.
70 changes: 70 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# GPX

![GPX example on California housing dataset](https://raw.githubusercontent.com/yuyay/gpx/image/california_example.png)

GPX is a Gaussian process regression model that can output the feature contributions to the prediction for each sample, which is implemented based on the following paper:
**Yuya Yoshikawa, and Tomoharu Iwata. "[Gaussian Process Regression With Interpretable Sample-Wise Feature Weights.](https://ieeexplore.ieee.org/abstract/document/9646444)" IEEE Transactions on Neural Networks and Learning Systems (2021).**

GPX has the following characteristics:
- High accuracy: GPX can achieve comparable predictive accuracy to standard Gaussian process regression models.
- Explainability: GPX can output feature contributions with uncertainty for each sample. We showed that these feature contributions are qualitatively and quantitatively more appropriate than those of existing explanation methods such as LIME and SHAP.

## Installation
The pytorch-gpx package is on PyPI. Simply run:
```bash
pip install pytorch-gpx
```
Or clone the repository and run:
```bash
pip install .
```

## Usage
The pytorch-gpx package provides a scikit-learn-like API for training, prediction, and evaluation of GPX models.

```python
from sklearn.metrics import mean_squared_error
from gpx import GPXRegressor

'''Training
X_tr: input data (numpy array), with shape of (n_samples, n_X_features)
y_tr: target variables (numpy array), with shape of (n_samples,)
Z_tr: simplified input data (numpy array), with shape of (n_samples, n_Z_features). The same as X_tr is OK.
'''
model = GPXRegressor().fit(X_tr, y_tr, Z_tr)

'''Prediction
y_mean: the posterior mean of target variables
y_cov: the posterior variance of target variables
w_mean: the posterior mean of weights
w_cov: the posterior variance of weights
'''
y_mean, y_cov, w_mean, w_cov = model.predict(X_te, Z_te, return_weights=True)

'''Evaluation'''
mse = mean_squared_error(y_te, y_mean)
print("Test MSE = {}".format(mse))
```

For more usage examples, please see the notebooks below.
- [Regression on California housing price dataset (tabular data)](notebooks/california_regression.ipynb)
- [Label regression on binary-class hand-written digits dataset (image data)](notebooks/digits_visualization.ipynb)

## Citation
If you use this repo, please cite the following paper.

```bibtex
@article{yoshikawa2021gpx,
title={Gaussian Process Regression With Interpretable Sample-Wise Feature Weights},
author={Yoshikawa, Yuya and Iwata, Tomoharu},
journal={IEEE Transactions on Neural Networks and Learning Systems},
year={2021},
publisher={IEEE}
}
```

## License
Please see [LICENSE.txt](./LICENSE.txt).

## Acknowledgment
This work was supported by the Japan Society for the Promotion of Science (JSPS) KAKENHI under Grant 18K18112.
2 changes: 2 additions & 0 deletions gpx/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .gpx_regressor_module import GPXRegressorModule
from ._regressor import GPXRegressor
110 changes: 110 additions & 0 deletions gpx/_regressor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
from typing import Any, Union
import numpy as np
import torch
import gpytorch as gpt
from sklearn.base import RegressorMixin, BaseEstimator
from sklearn.metrics import r2_score

from .gpx_regressor_module import GPXRegressorModule
from .tensor_utils import to_ndarray, to_tensor


class GPXRegressor(RegressorMixin, BaseEstimator):
    """GPX for regression.

    Scikit-learn-style wrapper around ``GPXRegressorModule``. It converts
    numpy inputs to tensors, fits the module with L-BFGS, and exposes
    ``fit`` / ``predict`` / ``score``.

    Parameters
    ----------
    kernel : Any
        Kernel passed as the first argument to ``GPXRegressorModule``
        (default: ``gpytorch.kernels.RBFKernel``).
    kernel_kwargs : dict or None
        Keyword arguments forwarded to ``GPXRegressorModule`` as
        ``kernel_kwargs``. ``None`` means "no extra arguments".
    kernel_init_params : dict or None
        ``{attribute_name: value}`` pairs written into the ``__dict__`` of
        ``model.kernel_obj.base_kernel`` right after construction.
        ``None`` means "nothing to override".
    max_iter : int
        ``max_iter`` passed to ``torch.optim.LBFGS``.
    tol : float
        ``tolerance_change`` passed to ``torch.optim.LBFGS``.
    lr : float
        Learning rate passed to ``torch.optim.LBFGS``.
    dtype : torch.dtype
        Type that the inputs and the module are converted to.
    verbose : bool
        If True, print parameter values and the loss during ``fit``.
    """

    def __init__(
        self,
        kernel: Any = gpt.kernels.RBFKernel,
        kernel_kwargs: Union[dict, None] = None,
        kernel_init_params: Union[dict, None] = None,
        max_iter: int = 150,
        tol: float = 10**-4,
        lr: float = 0.1,
        dtype: torch.dtype = torch.double,
        verbose: bool = False,
    ):
        # A literal ``{}`` default would be one dict shared by every instance;
        # create a fresh dict per instance instead. A dict supplied by the
        # caller is stored as-is (same object).
        kernel_kwargs = {} if kernel_kwargs is None else kernel_kwargs
        kernel_init_params = {} if kernel_init_params is None else kernel_init_params

        self.kernel = kernel
        self.kernel_kwargs = kernel_kwargs
        self.kernel_init_params = kernel_init_params
        self.max_iter = max_iter
        self.tol = tol
        self.lr = lr
        self.dtype = dtype
        self.verbose = verbose

        self.model = GPXRegressorModule(kernel, kernel_kwargs=kernel_kwargs, dtype=dtype)
        # NOTE(review): values are written straight into the kernel's __dict__,
        # which bypasses any attribute-setting hooks on the kernel object --
        # confirm this is the intended way to set initial kernel values.
        for k, v in kernel_init_params.items():
            self.model.kernel_obj.base_kernel.__dict__[k] = v

    def _print_parameters(self) -> None:
        """Print the name and current value of every parameter of the module."""
        for param_name, param in self.model.named_parameters():
            # ``Tensor.item()`` only works for one-element tensors; fall back to
            # a nested list for anything larger so verbose mode cannot crash.
            value = param.item() if param.numel() == 1 else param.detach().tolist()
            print(f'Parameter name: {param_name:42} value = {value}')

    def fit(self, X: np.ndarray, y: np.ndarray, Z: Union[np.ndarray, None] = None):
        """Train model.

        Parameters
        ----------
        X : numpy.ndarray
            Input data.
        y : numpy.ndarray
            Target variables.
        Z : numpy.ndarray or None
            Simplified input data. If None, ``X`` is used.

        Returns
        -------
        self : GPXRegressor
            The fitted estimator.
        """
        Z = X if Z is None else Z
        X, y, Z = map(lambda x: x.type(self.dtype), to_tensor(X, y, Z))

        self.model.type(self.dtype)
        self.model.train_initialize(X, y, Z)
        self.model.train()
        if self.verbose:
            self._print_parameters()

        # use LBFGS as optimizer since we can load the whole data to train
        optimizer = torch.optim.LBFGS(
            self.model.parameters(), lr=self.lr, tolerance_change=self.tol,
            max_iter=self.max_iter)

        n_evals = 0  # number of closure evaluations so far (used for logging)

        def closure():
            nonlocal n_evals
            n_evals += 1
            optimizer.zero_grad()
            loss = self.model(X, y, Z)  # the module's forward returns the loss
            if self.verbose:
                print('Iter {0:3d}: loss ='.format(n_evals), loss.item())
            loss.backward()
            return loss

        # A single LBFGS step re-evaluates the closure internally, bounded by
        # the max_iter given to the optimizer above.
        optimizer.step(closure)

        if self.verbose:
            self._print_parameters()

        # Finalise the module for prediction (semantics are defined by
        # GPXRegressorModule.prepare_eval, which is not visible here).
        self.model.prepare_eval()
        return self

    def predict(
        self, X: np.ndarray, Z: Union[np.ndarray, None] = None, return_weights: bool = False
    ):
        """Prediction.

        Parameters
        ----------
        X : numpy.ndarray
            Input data.
        Z : numpy.ndarray or None
            Simplified input data. If None, ``X`` is used.
        return_weights : bool
            Decide whether to return sample-wise weights.

        Returns
        -------
        tuple
            ``(y_mean, y_cov)`` as returned by the module's
            ``predict_targets``, converted with ``to_ndarray``. When
            ``return_weights`` is True, the result of ``predict_weights`` is
            appended: ``(y_mean, y_cov, w_mean, w_cov)``.
        """
        Z = X if Z is None else Z
        X, Z = map(lambda x: x.type(self.dtype), to_tensor(X, Z))
        y_mean, y_cov = to_ndarray(*self.model.predict_targets(X, Z))
        if not return_weights:
            return y_mean, y_cov
        w_mean, w_cov = to_ndarray(*self.model.predict_weights(X, Z))
        return y_mean, y_cov, w_mean, w_cov

    def score(
        self, X: np.ndarray, y: np.ndarray, Z: Union[np.ndarray, None] = None,
        sample_weight: Union[np.ndarray, None] = None
    ):
        """Return the R^2 score of the predicted mean on the given data.

        Parameters
        ----------
        X : numpy.ndarray
            Input data.
        y : numpy.ndarray
            True target variables.
        Z : numpy.ndarray or None
            Simplified input data. If None, ``X`` is used.
        sample_weight : numpy.ndarray or None
            Forwarded to ``sklearn.metrics.r2_score``.
        """
        y_pred, _ = self.predict(X, Z)
        return r2_score(y, y_pred, sample_weight=sample_weight)
Loading

0 comments on commit 4b5dd97

Please sign in to comment.