diff --git a/setup.cfg b/setup.cfg index 2c6e66bee..b58c477d4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -84,6 +84,8 @@ cobra = [options.extras_require] array = scipy +chrr = + hopsy development = black bumpversion diff --git a/src/cobra/sampling/__init__.py b/src/cobra/sampling/__init__.py index 691735164..d175c9dec 100644 --- a/src/cobra/sampling/__init__.py +++ b/src/cobra/sampling/__init__.py @@ -1,5 +1,11 @@ from .hr_sampler import HRSampler, shared_np_array from .achr import ACHRSampler from .core import step + +from .hopsy import hopsy_is_available + +if hopsy_is_available: + from .hopsy import HopsySampler + from .optgp import OptGPSampler from .sampling import sample diff --git a/src/cobra/sampling/hopsy.py b/src/cobra/sampling/hopsy.py new file mode 100644 index 000000000..6071ce2ce --- /dev/null +++ b/src/cobra/sampling/hopsy.py @@ -0,0 +1,214 @@ +"""Provide hopsy samplers.""" + +from typing import TYPE_CHECKING, Optional + + +if TYPE_CHECKING: + from cobra import Model + +from ..core.configuration import Configuration +from .hr_sampler import HRSampler + + +__all__ = ( + "HopsySampler", + "hopsy_is_available", +) + +configuration = Configuration() + +try: + import hopsy + import numpy as np + import pandas as pd + + hopsy_is_available = True + + # for the time being while the updated hopsy version isn't released + def back_transform(problem, samples): + """ + Transform samples from the sampling space to the original parameter space. + + Parameters + ---------- + problem : hopsy.Problem + A hopsy.Problem with populated `problem.transformation` and `problem.shift` of + shape `(m, d)` and `(d, )`, respectively. If not populated, the identity + operation is performed. + samples : np.array + Parameter samples of shape `(..., d)` which are to be transformed. 
+ + Returns + ------- + np.array + Transformed samples of shape `(..., m)` + """ + S, h = problem.transformation, problem.shift + transformed = samples @ S.T if S is not None else samples + shifted = transformed + h if h is not None else transformed + return shifted + + class HopsySampler(HRSampler): + """ + Generic sampling wrapper for hopsy samplers. + + Parameters + ---------- + model : cobra.Model + The cobra model from which to generate samples. + sampler : hopsy.Proposal, optional + The hopsy sampler for sampling the polytope + (default hopsy.UniformCoordinateHitAndRunProposal). + processes: int, optional + The number of processes used during sampling + (default cobra.Configuration.processes). + thinning : int, optional + The thinning factor of the generated sampling chain. A thinning of + 10 means samples are returned every 10 steps (default 100). + nproj : int > 0, optional + How often to reproject the sampling point into the feasibility + space. Avoids numerical issues at the cost of lower sampling. If + you observe many equality constraint violations with + `sampler.validate` you should lower this number (default None). + rounding : bool, optional + Whether to precondition the sampling problem by applying an offline rounding + transformation. + seed : int > 0, optional + Sets the random number seed. Initialized to the current time stamp + if None (default None). + + Attributes + ---------- + n_samples : int + The total number of samples that have been generated by this + sampler instance. + problem : typing.NamedTuple + A NamedTuple whose attributes define the entire sampling problem in + matrix form. + fwd_idx : numpy.array + A numpy array having one entry for each reaction in the model, + containing the index of the respective forward variable. + rev_idx : numpy.array + A numpy array having one entry for each reaction in the model, + containing the index of the respective reverse variable. + sampler : + The hopsy sampler type. 
mcs : list + List of length `processes` of hopsy.MarkovChain objects. + rngs : list + List of length `processes` of hopsy.RandomNumberGenerator objects. + + """ + + def __init__( + self, + model: "Model", + sampler: hopsy.Proposal = hopsy.UniformCoordinateHitAndRunProposal, + thinning: int = 100, + processes: Optional[int] = None, + nproj: Optional[int] = None, + seed: Optional[int] = None, + rounding: bool = True, + **kwargs, + ) -> None: + """Initialize a new HopsySampler.""" + super().__init__(model, thinning, nproj=nproj, seed=seed, **kwargs) + + if self.problem.inequalities.shape[0] > 0: + A = self.problem.inequalities + b = self.problem.bounds + else: + # empty dummy constraint to set problem dimension + A = np.ones((0, self.problem.equalities.shape[1])) + b = np.ones(0) + + problem = hopsy.Problem(A, b) + problem = hopsy.add_box_constraints( + problem, + self.problem.variable_bounds[0], + self.problem.variable_bounds[1], + simplify=False, + ) + problem = hopsy.add_equality_constraints( + problem, self.problem.equalities, self.problem.b + ) + problem = hopsy.round(problem, simplify=False) if rounding else problem + + self.sampler = sampler + + # batched transformation to better exploit vectorization + self.transformation = problem.transformation.copy() + self.shift = problem.shift.copy() + + problem.transformation = None + problem.shift = None + + self._problem = problem + + if processes is None: + self.processes = configuration.processes + else: + self.processes = processes + + self.mcs = None + self.rngs = None + + self.mcs = [ + hopsy.MarkovChain(self._problem, self.sampler) for i in range(self.processes) + ] + self.rngs = [ + hopsy.RandomNumberGenerator(self._seed, i) for i in range(self.processes) + ] + + def sample(self, n: int, fluxes: bool = True) -> pd.DataFrame: + """Generate a set of samples by calling hopsy. + + Parameters + ---------- + n : int + The minimum number of samples that are generated at once. 
fluxes : bool, optional + Whether to return fluxes or the internal solver variables. If + set to False, will return a variable for each forward and + backward flux as well as all additional variables you might + have defined in the model (default True). + + Returns + ------- + pandas.DataFrame + Returns a pandas DataFrame with `n` rows, each containing a + flux sample. + """ + _, samples = hopsy.sample( + self.mcs, + self.rngs, + n_samples=-(-n // self.processes), # ceil division so at least `n` samples are drawn in total + thinning=self.thinning, + n_procs=self.processes, + ) + + # batched transformation to better exploit vectorization + self._problem.transformation = self.transformation + self._problem.shift = self.shift + + samples = back_transform(self._problem, samples) + + self._problem.transformation = None + self._problem.shift = None + + samples = samples.reshape(-1, samples.shape[-1]) # flatten chain dim + + if fluxes: + names = [r.id for r in self.model.reactions] + + return pd.DataFrame( + samples[:, self.fwd_idx] - samples[:, self.rev_idx], + columns=names, + ) + else: + names = [v.name for v in self.model.variables] + + return pd.DataFrame(samples, columns=names) + +except ModuleNotFoundError: + hopsy_is_available = False diff --git a/src/cobra/sampling/sampling.py b/src/cobra/sampling/sampling.py index 9d0ec0eeb..5c2e38003 100644 --- a/src/cobra/sampling/sampling.py +++ b/src/cobra/sampling/sampling.py @@ -5,6 +5,14 @@ import pandas as pd from .achr import ACHRSampler +from .hopsy import hopsy_is_available + + +if hopsy_is_available: + from .hopsy import HopsySampler +else: + import warnings + from .optgp import OptGPSampler @@ -15,22 +23,33 @@ def sample( model: "Model", n: int, - method: str = "optgp", + method: str = "auto", thinning: int = 100, processes: int = 1, seed: Optional[int] = None, ) -> pd.DataFrame: """Sample valid flux distributions from a cobra model. - Currently, two methods are supported: + Currently, three methods are supported: + + 1. 
'chrr' which uses the coordinate Hit-and-Run algorithm + with rounding ([1]_), which has been shown to often outperform OptGP + and ACHR, refer [2]_, [3]_. The rounding transformation is performed + in an offline manner which inflicts a certain base cost, however + sampling is performed using the C++-based library ``hopsy`` ([4]_) + and is, thus, blazingly fast. Moreover, this method supports + parallel sampling. - 1. 'optgp' (default) which uses the OptGPSampler that supports parallel + 2. 'optgp' which uses the OptGPSampler that supports parallel sampling. Requires large numbers of samples to be performant (`n` > 1000). For smaller samples, 'achr' might be better suited. - For details, refer [1]_ . + For details, refer [5]_ . - 2. 'achr' which uses artificial centering hit-and-run. This is a single - process method with good convergence. For details, refer [2]_ . + 3. 'achr' which uses artificial centering hit-and-run. This is a single + process method with good convergence. For details, refer [6]_ . + + A fourth 'auto' option is available, which tries to choose 'chrr' if ``hopsy`` + is available. This is the default option. Parameters ---------- @@ -40,16 +59,16 @@ The number of samples to obtain. When using 'optgp', this must be a multiple of `processes`, otherwise a larger number of samples will be returned. - method : {"optgp", "achr"}, optional - The sampling algorithm to use (default "optgp"). + method : {"auto", "chrr", "optgp", "achr"}, optional + The sampling algorithm to use (default "auto"). thinning : int, optional The thinning factor of the generated sampling chain. A thinning of 10 means samples are returned every 10 steps. Defaults to 100 which in benchmarks gives approximately uncorrelated samples. If set to 1 will return all iterates (default 100). processes : int, optional - Only used for 'optgp'. The number of processes used to generate - samples (default 1). + Only used for 'optgp' and 'chrr'. 
The number of processes + used to generate samples (default 1). seed : int > 0, optional Sets the random number seed. Initialized to the current time stamp if None (default None). @@ -69,28 +88,58 @@ def sample( References ---------- - .. [1] Megchelenbrink W, Huynen M, Marchiori E (2014) + .. [1] Hulda S Haraldsdóttir, Ben Cousins, Ines Thiele, Ronan M.T Fleming, + Santosh Vempala, + CHRR: coordinate hit-and-run with rounding for uniform sampling of + constraint-based models, + Bioinformatics, Volume 33, Issue 11, June 2017, Pages 1741–1743, + https://doi.org/10.1093/bioinformatics/btx052 + + .. [2] Herrmann, H.A., Dyson, B.C., Vass, L. et al. + Flux sampling is a powerful tool to study metabolism under changing + environmental conditions. + npj Syst Biol Appl 5, 32 (2019). + https://doi.org/10.1038/s41540-019-0109-0 + + .. [3] Fallahi S, Skaug HJ, Alendal G (2020) + A comparison of Monte Carlo sampling methods for + metabolic network models. + PLoS ONE 15(7): e0235393. + https://doi.org/10.1371/journal.pone.0235393 + + .. [4] Richard D Paul, Johann F Jadebeck, Anton Stratmann, Wolfgang Wiechert, + Katharina Nöh, + hopsy — a methods marketplace for convex polytope sampling in Python, + Bioinformatics, Volume 40, Issue 7, July 2024, btae430, + https://doi.org/10.1093/bioinformatics/btae430 + + .. [5] Megchelenbrink W, Huynen M, Marchiori E (2014) optGpSampler: An Improved Tool for Uniformly Sampling the Solution-Space of Genome-Scale Metabolic Networks. PLoS ONE 9(2): e86587. https://doi.org/10.1371/journal.pone.0086587 - .. [2] Direction Choice for Accelerated Convergence in Hit-and-Run Sampling + .. [6] Direction Choice for Accelerated Convergence in Hit-and-Run Sampling David E. Kaufman, Robert L. 
Smith Operations Research 199846:1 , 84-95 https://doi.org/10.1287/opre.46.1.84 """ + if method == "auto": + method = "chrr" if hopsy_is_available else "optgp" + if method == "optgp": sampler = OptGPSampler(model, processes=processes, thinning=thinning, seed=seed) elif method == "achr": sampler = ACHRSampler(model, thinning=thinning, seed=seed) + elif method == "chrr": + sampler = HopsySampler( + model, processes=processes, thinning=thinning, seed=seed, rounding=True + ) else: raise ValueError( f'Invalid value: "{method}" for method used. ' - 'The value must be "optgp" or "achr".' + 'The value must be "optgp", "achr" or "chrr".' ) - return pd.DataFrame( - columns=[rxn.id for rxn in model.reactions], data=sampler.sample(n) - ) + return sampler.sample(n) diff --git a/tests/test_sampling/test_sampling.py b/tests/test_sampling/test_sampling.py index 50ddbe87f..6c32fe454 100644 --- a/tests/test_sampling/test_sampling.py +++ b/tests/test_sampling/test_sampling.py @@ -7,7 +7,7 @@ from cobra.core import Metabolite, Model, Reaction from cobra.flux_analysis.parsimonious import pfba -from cobra.sampling import ACHRSampler, OptGPSampler, sample +from cobra.sampling import ACHRSampler, OptGPSampler, hopsy_is_available, sample def test_single_achr(model: Model) -> None: @@ -16,16 +16,29 @@ def test_single_achr(model: Model) -> None: assert s.shape == (10, len(model.reactions)) +def test_single_uchrr(model: Model) -> None: + """Test UCHRR sampling (one sample).""" + s = sample(model, 10, processes=1) + assert s.shape == (10, len(model.reactions)) + + def test_single_optgp(model: Model) -> None: """Test OptGP sampling (one sample).""" - s = sample(model, 10, processes=1) + s = sample(model, 10, processes=1, method="optgp") + assert s.shape == (10, len(model.reactions)) + + +@pytest.mark.skipif("SKIP_MP" in os.environ, reason="unsafe for parallel execution") +def test_multi_uchrr(model: Model) -> None: # pragma: no cover + """Test UCHRR sampling (multi sample).""" + s = 
sample(model, 10, processes=2) assert s.shape == (10, len(model.reactions)) @pytest.mark.skipif("SKIP_MP" in os.environ, reason="unsafe for parallel execution") def test_multi_optgp(model: Model) -> None: # pragma: no cover """Test OptGP sampling (multi sample).""" - s = sample(model, 10, processes=2) + s = sample(model, 10, processes=2, method="optgp") assert s.shape == (10, len(model.reactions)) @@ -41,6 +54,14 @@ def test_fixed_seed(model: Model) -> None: s2 = sample(model, 1, seed=42) assert np.isclose(s1.TPI[0], s2.TPI[0]) + s1 = sample(model, 1, seed=42, method="achr") + s2 = sample(model, 1, seed=42, method="achr") + assert np.isclose(s1.TPI[0], s2.TPI[0]) + + s1 = sample(model, 1, seed=42, method="optgp") + s2 = sample(model, 1, seed=42, method="optgp") + assert np.isclose(s1.TPI[0], s2.TPI[0]) + def test_equality_constraint(model: Model) -> None: """Test equality constraint.""" @@ -52,6 +73,9 @@ def test_equality_constraint(model: Model) -> None: s = sample(model, 10, method="achr") assert np.allclose(s.ACALD, -1.5, atol=1e-6, rtol=0) + s = sample(model, 10, method="optgp") + assert np.allclose(s.ACALD, -1.5, atol=1e-6, rtol=0) + def test_inequality_constraint(model: Model) -> None: """Test inequality constraint.""" @@ -64,6 +88,9 @@ def test_inequality_constraint(model: Model) -> None: s = sample(model, 10, method="achr") assert all(s.ACALD > -0.5 - 1e-6) + s = sample(model, 10, method="optgp") + assert all(s.ACALD > -0.5 - 1e-6) + def test_inhomogeneous_sanity(model: Model) -> None: """Test standard deviation between inhomogeneous and homogeneous sampling.""" @@ -85,6 +112,15 @@ def test_inhomogeneous_sanity(model: Model) -> None: relative_diff = (s_inhom.std() + 1e-12) / (s_hom.std() + 1e-12) assert 0.5 < relative_diff.abs().mean() < 2 + model.reactions.ACALD.bounds = (-1.5, -1.5) + s_inhom = sample(model, 64, method="optgp") + + model.reactions.ACALD.bounds = (-1.5 - 1e-3, -1.5 + 1e-3) + s_hom = sample(model, 64, method="optgp") + + relative_diff = 
(s_inhom.std() + 1e-12) / (s_hom.std() + 1e-12) + assert 0.5 < relative_diff.abs().mean() < 2 + def test_complicated_model() -> None: """Test a complicated model. @@ -123,6 +159,15 @@ def test_complicated_model() -> None: assert sum(optgp.validate(optgp_samples) == "v") > 95 assert sum(achr.validate(achr_samples) == "v") > 95 + if hopsy_is_available: + from cobra.sampling import HopsySampler + + uchrr = HopsySampler(model, seed=42) + uchrr_samples = uchrr.sample(100) + + assert any(uchrr_samples.corr().abs() < 1.0) + assert sum(uchrr.validate(uchrr_samples) == "v") > 95 + def test_single_point_space(model: Model) -> None: """Test the reduction of the sampling space to one point."""