From 83f2d6c67278dd617cd2498abe6cd057a373b333 Mon Sep 17 00:00:00 2001 From: daikitag <48062118+daikitag@users.noreply.github.com> Date: Thu, 17 Aug 2023 19:01:05 +0900 Subject: [PATCH] Speed up Previous code had 2 tree traversals, so reduce it to be 1. Also modify sections where the code was inefficient and rewrite docstring --- requirements/CI-complete/requirements.txt | 2 +- requirements/CI-tests-pip/requirements.txt | 2 +- setup.cfg | 4 +- tests/test_base.py | 1 + tests/test_genetic_value.py | 355 ++++++++++---- tests/test_sim_trait.py | 96 +--- tests/test_simulate_environment.py | 34 +- tests/test_simulate_phenotype.py | 22 +- tests/test_trait_model.py | 26 +- tstrait/__init__.py | 10 +- tstrait/base.py | 7 +- tstrait/genetic_value.py | 211 +++++++-- tstrait/simulate_effect_size.py | 166 +++---- tstrait/simulate_environment.py | 87 +++- tstrait/simulate_phenotype.py | 134 ++++-- tstrait/trait_model.py | 514 ++++++++++++++++----- 16 files changed, 1143 insertions(+), 528 deletions(-) diff --git a/requirements/CI-complete/requirements.txt b/requirements/CI-complete/requirements.txt index 5c1fe64..44e5dfe 100644 --- a/requirements/CI-complete/requirements.txt +++ b/requirements/CI-complete/requirements.txt @@ -5,4 +5,4 @@ pandas pytest==6.2.5 pytest-cov==3.0.0 scipy -tskit==0.5.3 \ No newline at end of file +tskit>=0.5.5 \ No newline at end of file diff --git a/requirements/CI-tests-pip/requirements.txt b/requirements/CI-tests-pip/requirements.txt index f6c1cf0..4660244 100644 --- a/requirements/CI-tests-pip/requirements.txt +++ b/requirements/CI-tests-pip/requirements.txt @@ -3,4 +3,4 @@ numba numpy pandas scipy -tskit==0.5.3 +tskit>=0.5.5 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index f22da94..4dcf5b6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,9 +47,9 @@ keywords = packages = tstrait python_requires = >=3.8 install_requires = - numpy + numpy>=1.20.3 numba>=0.57.0 - pandas + pandas>=1.0 tskit>=0.5.5 setup_requires = setuptools_scm diff --git a/tests/test_base.py b/tests/test_base.py index 8775d32..a84d537 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -108,3 +108,4 @@ def test_non_decreasing(self): _check_non_decreasing([0, 1, 0], "array") _check_non_decreasing([0, 0, 1, 1, 4], "array") + _check_non_decreasing([0], "array") diff --git a/tests/test_genetic_value.py b/tests/test_genetic_value.py index 828ba16..685adb7 100644 --- a/tests/test_genetic_value.py +++ b/tests/test_genetic_value.py @@ -39,6 +39,10 @@ def sample_trait_model(): return tstrait.trait_model(distribution="normal", mean=0, var=1) +def freqdep(alpha, freq): + return np.sqrt(pow(2 * freq * (1 - freq), alpha)) + + class TestInput: """This test will check that an informative error is raised when the input parameter does not have an appropriate type or value. @@ -48,24 +52,24 @@ def test_input_type(self, sample_ts, sample_df): with pytest.raises( TypeError, match="ts must be a instance" ): - tstrait.genetic_value(ts=1, trait_df=sample_df) + tstrait.sim_genetic(ts=1, trait_df=sample_df, random_seed=1) with pytest.raises( TypeError, match="trait_df must be a instance", ): - tstrait.genetic_value(ts=sample_ts, trait_df=1) + tstrait.sim_genetic(ts=sample_ts, trait_df=1, random_seed=1) def test_bad_input(self, sample_ts, sample_df): with pytest.raises( ValueError, match="columns must be included in trait_df dataframe" ): df = sample_df.drop(columns=["site_id"]) - tstrait.genetic_value(ts=sample_ts, trait_df=df) + tstrait.sim_genetic(ts=sample_ts, trait_df=df, random_seed=1) with pytest.raises(ValueError, match="site_id must be non-decreasing"): df = sample_df.copy() df["site_id"] = [2, 0] - tstrait.genetic_value(ts=sample_ts, trait_df=df) + tstrait.sim_genetic(ts=sample_ts, trait_df=df, random_seed=1) @pytest.mark.parametrize("trait_id", [[2, 3], [0, 2]]) def test_trait_id(self, sample_ts, sample_df, trait_id): @@ -74,27 +78,47 @@ def test_trait_id(self, sample_ts, sample_df, trait_id): ): df = sample_df.copy() df["trait_id"] = trait_id - tstrait.genetic_value(ts=sample_ts, trait_df=df) + tstrait.sim_genetic(ts=sample_ts, trait_df=df, random_seed=1) class TestOutputDim: """Check that the genetic_value function gives the output with correct dimensions""" - def check_dimensions(self, df, nrow): - assert len(df) == nrow - assert df.shape[1] == 3 - assert list(df.columns) == [ + def check_dimensions(self, result, nrow, input_df): + genetic_df = result.genetic + effect_size_df = result.effect_size + + np.testing.assert_array_equal(effect_size_df["site_id"], input_df["site_id"]) + np.testing.assert_array_equal(effect_size_df["trait_id"], input_df["trait_id"]) + + assert list(effect_size_df.columns) == [ + "site_id", + "effect_size", + "trait_id", + "causal_state", + ] + + assert len(input_df) == len(effect_size_df) + + _check_numeric_array(effect_size_df["effect_size"], "effect_size") + + assert len(genetic_df) == nrow + assert genetic_df.shape[1] == 3 + assert list(genetic_df.columns) == [ "trait_id", "individual_id", "genetic_value", ] - _check_numeric_array(df["individual_id"], "individual_id") - _check_numeric_array(df["genetic_value"], "genetic_value") - _check_numeric_array(df["trait_id"], "trait_id") + + _check_numeric_array(genetic_df["individual_id"], "individual_id") + _check_numeric_array(genetic_df["genetic_value"], "genetic_value") + _check_numeric_array(genetic_df["trait_id"], "trait_id") def test_output_simple(self, sample_ts, sample_df): - genetic_df = tstrait.genetic_value(ts=sample_ts, trait_df=sample_df) - self.check_dimensions(genetic_df, sample_ts.num_individuals) + genetic_result = tstrait.sim_genetic( + ts=sample_ts, trait_df=sample_df, random_seed=5 + ) + self.check_dimensions(genetic_result, sample_ts.num_individuals, sample_df) @pytest.mark.parametrize( "trait_model", @@ -111,13 +135,16 @@ def test_output_sim_trait(self, sample_ts, trait_model): trait_df = tstrait.sim_trait( ts=sample_ts, num_causal=num_causal, model=trait_model ) - genetic_df = tstrait.genetic_value(ts=sample_ts, trait_df=trait_df) - self.check_dimensions(genetic_df, sample_ts.num_individuals) + genetic_result = tstrait.sim_genetic( + ts=sample_ts, trait_df=trait_df, random_seed=100 + ) + self.check_dimensions(genetic_result, sample_ts.num_individuals, trait_df) np.testing.assert_equal( - genetic_df["trait_id"], np.zeros(sample_ts.num_individuals) + genetic_result.genetic["trait_id"], np.zeros(sample_ts.num_individuals) ) np.testing.assert_equal( - genetic_df["individual_id"], np.arange(sample_ts.num_individuals) + genetic_result.genetic["individual_id"], + np.arange(sample_ts.num_individuals), ) def test_output_multivariate(self, sample_ts): @@ -127,43 +154,46 @@ def test_output_multivariate(self, sample_ts): num_causal = 10 model = tstrait.trait_model(distribution="multi_normal", mean=mean, cov=cov) trait_df = tstrait.sim_trait(ts=sample_ts, num_causal=num_causal, model=model) - genetic_df = tstrait.genetic_value(ts=sample_ts, trait_df=trait_df) - self.check_dimensions(genetic_df, sample_ts.num_individuals * num_trait) - ind_id = np.arange(sample_ts.num_individuals) - trait_id = np.ones(sample_ts.num_individuals) + genetic_result = tstrait.sim_genetic( + ts=sample_ts, trait_df=trait_df, random_seed=6 + ) + self.check_dimensions( + genetic_result, sample_ts.num_individuals * num_trait, trait_df + ) np.testing.assert_equal( - genetic_df["individual_id"], np.concatenate((ind_id, ind_id, ind_id)) + genetic_result.genetic["individual_id"], + np.tile(np.arange(sample_ts.num_individuals), 3), ) np.testing.assert_equal( - genetic_df["trait_id"], - np.concatenate((trait_id * 0, trait_id * 1, trait_id * 2)), + genetic_result.genetic["trait_id"], + np.repeat(np.arange(3), sample_ts.num_individuals), ) def test_additional_row(self, sample_ts, sample_df): """Check that adding unexpected rows to trait_df won't cause any errors""" df = sample_df.copy() df["height"] = [170, 180] - tstrait.genetic_value(ts=sample_ts, trait_df=df) + tstrait.sim_genetic(ts=sample_ts, trait_df=df, random_seed=1000) class TestGenotype: - """Test the `_individual_genotype` method and check if it can accurately detect - the individual genotype. Afterwards, we will check the output of `sim_trait`. + """Test the `_individual_genotype` and `_obtain_allele_count` method and check they + it can accurately detect the individual genotype. Afterwards, we will check the + output of `sim_trait`. """ def test_binary_tree(self): trait_df = pd.DataFrame( { - "site_id": [0, 1, 2, 3], - "causal_state": ["T", "T", "C", "C"], - "effect_size": [1, 10, 100, 1000], - "trait_id": [0, 0, 0, 0], + "site_id": [0, 2], + "effect_size": [1, 10], + "trait_id": [0, 0], } ) ts = binary_tree() tree = ts.first() - genetic = tstrait.GeneticValue(ts, trait_df) + genetic = tstrait.GeneticValue(ts, trait_df, alpha=-1, random_seed=1) g0 = genetic._individual_genotype(tree, ts.site(0), "T", ts.num_nodes) g1 = genetic._individual_genotype(tree, ts.site(1), "T", ts.num_nodes) g2 = genetic._individual_genotype(tree, ts.site(2), "C", ts.num_nodes) @@ -174,31 +204,56 @@ def test_binary_tree(self): np.testing.assert_equal(g2, np.array([0, 1, 0])) np.testing.assert_equal(g3, np.array([1, 2, 0])) - genetic_df = tstrait.genetic_value(ts=ts, trait_df=trait_df) + c0 = genetic._obtain_allele_count(tree, ts.site(0)) + c1 = genetic._obtain_allele_count(tree, ts.site(1)) + c2 = genetic._obtain_allele_count(tree, ts.site(2)) + c3 = genetic._obtain_allele_count(tree, ts.site(3)) + + assert c0 == {"T": 2} + assert c1 == {"C": 1, "T": 1} + assert c2 == {"C": 1} + assert c3 == {"C": 2, "T": 2} - result_df = pd.DataFrame( + genetic_result = tstrait.sim_genetic( + ts=ts, trait_df=trait_df, alpha=-1, random_seed=2 + ) + first = 1 * freqdep(-1, 1 / 2) + second = 10 * freqdep(-1, 1 / 4) + effect_size_df = pd.DataFrame( + { + "site_id": [0, 2], + "effect_size": [first, second], + "trait_id": [0, 0], + "causal_state": ["T", "C"], + } + ) + genetic_df = pd.DataFrame( { "trait_id": [0, 0, 0], "individual_id": [0, 1, 2], - "genetic_value": [1011, 2110, 2], + "genetic_value": [first, second, first * 2], } ) - pd.testing.assert_frame_equal(genetic_df, result_df, check_dtype=False) + pd.testing.assert_frame_equal( + effect_size_df, genetic_result.effect_size, check_dtype=False + ) + pd.testing.assert_frame_equal( + genetic_df, genetic_result.genetic, check_dtype=False + ) def test_diff_ind_tree(self): trait_df = pd.DataFrame( { - "site_id": [0, 1, 2, 3], - "causal_state": ["T", "T", "C", "C"], - "effect_size": [1, 10, 100, 1000], - "trait_id": [0, 0, 0, 0], + "site_id": [0, 2], + "effect_size": [1, 10], + "trait_id": [0, 0], } ) ts = diff_ind_tree() tree = ts.first() - genetic = tstrait.GeneticValue(ts, trait_df) + genetic = tstrait.GeneticValue(ts, trait_df, alpha=0, random_seed=1) g0 = genetic._individual_genotype(tree, ts.site(0), "T", ts.num_nodes) g1 = genetic._individual_genotype(tree, ts.site(1), "T", ts.num_nodes) g2 = genetic._individual_genotype(tree, ts.site(2), "C", ts.num_nodes) @@ -209,79 +264,178 @@ def test_diff_ind_tree(self): np.testing.assert_equal(g2, np.array([0, 1, 0])) np.testing.assert_equal(g3, np.array([1, 1, 1])) - genetic_df = tstrait.genetic_value(ts=ts, trait_df=trait_df) + c0 = genetic._obtain_allele_count(tree, ts.site(0)) + c1 = genetic._obtain_allele_count(tree, ts.site(1)) + c2 = genetic._obtain_allele_count(tree, ts.site(2)) + c3 = genetic._obtain_allele_count(tree, ts.site(3)) + + assert c0 == {"T": 2} + assert c1 == {"C": 1, "T": 1} + assert c2 == {"C": 1} + assert c3 == {"C": 2, "T": 2} - result_df = pd.DataFrame( + genetic_result = tstrait.sim_genetic( + ts=ts, trait_df=trait_df, alpha=-0.5, random_seed=2 + ) + first = 1 * freqdep(-0.5, 1 / 2) + second = 10 * freqdep(-0.5, 1 / 4) + effect_size_df = pd.DataFrame( + { + "site_id": [0, 2], + "effect_size": [first, second], + "trait_id": [0, 0], + "causal_state": ["T", "C"], + } + ) + genetic_df = pd.DataFrame( { "trait_id": [0, 0, 0], "individual_id": [0, 1, 2], - "genetic_value": [1011, 1101, 1011], + "genetic_value": [first, first + second, first], } ) - pd.testing.assert_frame_equal(genetic_df, result_df, check_dtype=False) + pd.testing.assert_frame_equal( + effect_size_df, genetic_result.effect_size, check_dtype=False + ) + pd.testing.assert_frame_equal( + genetic_df, genetic_result.genetic, check_dtype=False + ) def test_non_binary_tree(self): trait_df = pd.DataFrame( { - "site_id": [0, 1], - "causal_state": ["T", "C"], - "effect_size": [1, 10], - "trait_id": [0, 0], + "site_id": [0], + "effect_size": [1], + "trait_id": [0], } ) ts = non_binary_tree() tree = ts.first() - genetic = tstrait.GeneticValue(ts, trait_df) + genetic = tstrait.GeneticValue(ts, trait_df, alpha=-1, random_seed=0) g0 = genetic._individual_genotype(tree, ts.site(0), "T", ts.num_nodes) g1 = genetic._individual_genotype(tree, ts.site(1), "C", ts.num_nodes) np.testing.assert_equal(g0, np.array([0, 1, 2])) np.testing.assert_equal(g1, np.array([0, 1, 1])) - genetic_df = tstrait.genetic_value(ts=ts, trait_df=trait_df) + c0 = genetic._obtain_allele_count(tree, ts.site(0)) + c1 = genetic._obtain_allele_count(tree, ts.site(1)) - result_df = pd.DataFrame( + assert c0 == {"T": 3} + assert c1 == {"C": 2, "T": 1} + + genetic_result = tstrait.sim_genetic( + ts=ts, trait_df=trait_df, alpha=-1, random_seed=2 + ) + effect_size_df = pd.DataFrame( + { + "site_id": [0], + "effect_size": [freqdep(-1, 1 / 2)], + "trait_id": [0], + "causal_state": ["T"], + } + ) + genetic_df = pd.DataFrame( { "trait_id": [0, 0, 0], "individual_id": [0, 1, 2], - "genetic_value": [0, 11, 12], + "genetic_value": [0, freqdep(-1, 1 / 2), 2 * freqdep(-1, 1 / 2)], } ) - pd.testing.assert_frame_equal(genetic_df, result_df, check_dtype=False) + pd.testing.assert_frame_equal( + effect_size_df, genetic_result.effect_size, check_dtype=False + ) + pd.testing.assert_frame_equal( + genetic_df, genetic_result.genetic, check_dtype=False + ) def test_triploid(self, sample_df): trait_df = pd.DataFrame( { - "trait_id": [0, 0], - "site_id": [0, 1], - "causal_state": ["T", "C"], - "effect_size": [1, 10], + "site_id": [0], + "effect_size": [1], + "trait_id": [0], } ) ts = triploid_tree() tree = ts.first() - genetic = tstrait.GeneticValue(ts, sample_df) + genetic = tstrait.GeneticValue(ts, sample_df, alpha=-1, random_seed=1) g0 = genetic._individual_genotype(tree, ts.site(0), "T", ts.num_nodes) g1 = genetic._individual_genotype(tree, ts.site(1), "C", ts.num_nodes) np.testing.assert_equal(g0, np.array([1, 2])) np.testing.assert_equal(g1, np.array([1, 1])) - genetic_df = tstrait.genetic_value(ts=ts, trait_df=trait_df) + c0 = genetic._obtain_allele_count(tree, ts.site(0)) + c1 = genetic._obtain_allele_count(tree, ts.site(1)) + + assert c0 == {"T": 3} + assert c1 == {"C": 2, "T": 1} - result_df = pd.DataFrame( + genetic_result = tstrait.sim_genetic( + ts=ts, trait_df=trait_df, alpha=-1, random_seed=2 + ) + effect_size_df = pd.DataFrame( + { + "site_id": [0], + "effect_size": [freqdep(-1, 1 / 2)], + "trait_id": [0], + "causal_state": ["T"], + } + ) + genetic_df = pd.DataFrame( { "trait_id": [0, 0], "individual_id": [0, 1], - "genetic_value": [11, 12], + "genetic_value": [freqdep(-1, 1 / 2), 2 * freqdep(-1, 1 / 2)], } ) - pd.testing.assert_frame_equal(genetic_df, result_df, check_dtype=False) + pd.testing.assert_frame_equal( + effect_size_df, genetic_result.effect_size, check_dtype=False + ) + pd.testing.assert_frame_equal( + genetic_df, genetic_result.genetic, check_dtype=False + ) + + def test_allele_freq_one(self, sample_trait_model): + ts = binary_tree() + tables = ts.dump_tables() + tables.sites.add_row(4, "A") + tables.mutations.add_row(site=4, node=0, derived_state="T") + tables.mutations.add_row(site=4, node=0, derived_state="A", parent=9) + ts = tables.tree_sequence() + trait_df = pd.DataFrame( + { + "site_id": [4], + "effect_size": [1], + "trait_id": [0], + } + ) + genetic_result = tstrait.sim_genetic( + ts=ts, trait_df=trait_df, alpha=-1, random_seed=2 + ) + effect_size_df = pd.DataFrame( + {"site_id": [4], "effect_size": [0], "trait_id": [0], "causal_state": ["A"]} + ) + genetic_df = pd.DataFrame( + { + "trait_id": np.zeros(3), + "individual_id": np.arange(3), + "genetic_value": np.zeros(3), + } + ) + + pd.testing.assert_frame_equal( + effect_size_df, genetic_result.effect_size, check_dtype=False + ) + pd.testing.assert_frame_equal( + genetic_df, genetic_result.genetic, check_dtype=False + ) class TestTreeSeq: @@ -291,44 +445,81 @@ def test_tree_seq(self): ts = binary_tree_seq() trait_df = pd.DataFrame( { - "site_id": [0, 1, 2], - "causal_state": ["T", "G", "C"], - "effect_size": [1, 10, 100], - "trait_id": [0, 0, 0], + "site_id": [0, 2], + "effect_size": [1, 10], + "trait_id": [0, 0], } ) - genetic_df = tstrait.genetic_value(ts=ts, trait_df=trait_df) + genetic_result = tstrait.sim_genetic( + ts=ts, trait_df=trait_df, alpha=-1, random_seed=2 + ) - result_df = pd.DataFrame( + first = freqdep(-1, 3 / 4) + second = 10 * freqdep(-1, 1 / 2) + effect_size_df = pd.DataFrame( + { + "site_id": [0, 2], + "effect_size": [first, second], + "trait_id": [0, 0], + "causal_state": ["T", "C"], + } + ) + genetic_df = pd.DataFrame( { "trait_id": [0, 0], "individual_id": [0, 1], - "genetic_value": [101, 122], + "genetic_value": [first + second, 2 * first + second], } ) - pd.testing.assert_frame_equal(genetic_df, result_df, check_dtype=False) + pd.testing.assert_frame_equal( + effect_size_df, genetic_result.effect_size, check_dtype=False + ) + pd.testing.assert_frame_equal( + genetic_df, genetic_result.genetic, check_dtype=False + ) def test_tree_seq_multiple_trait(self): ts = binary_tree_seq() trait_df = pd.DataFrame( { - "trait_id": np.tile([0, 1], 3), - "site_id": np.repeat([0, 1, 2], 2), - "causal_state": np.repeat(["T", "G", "C"], 2), - "effect_size": [1, 2, 10, 20, 100, 200], + "trait_id": np.tile([0, 1], 2), + "site_id": np.repeat([0, 2], 2), + "effect_size": [1, 2, 10, 20], } ) - genetic_df = tstrait.genetic_value(ts=ts, trait_df=trait_df) + genetic_result = tstrait.sim_genetic( + ts=ts, trait_df=trait_df, alpha=-0.5, random_seed=2 + ) - result_df = pd.DataFrame( + first = freqdep(-0.5, 3 / 4) + second = 10 * freqdep(-0.5, 1 / 2) + effect_size_df = pd.DataFrame( { - "trait_id": [0, 0, 1, 1], - "individual_id": [0, 1, 0, 1], - "genetic_value": [101, 122, 202, 244], + "site_id": np.repeat([0, 2], 2), + "effect_size": [first, first * 2, second, second * 2], + "trait_id": np.tile([0, 1], 2), + "causal_state": np.repeat(["T", "C"], 2), + } + ) + genetic_df = pd.DataFrame( + { + "trait_id": np.repeat([0, 1], 2), + "individual_id": np.tile([0, 1], 2), + "genetic_value": [ + first + second, + 2 * first + second, + 2 * (first + second), + 2 * (2 * first + second), + ], } ) - pd.testing.assert_frame_equal(genetic_df, result_df, check_dtype=False) + pd.testing.assert_frame_equal( + effect_size_df, genetic_result.effect_size, check_dtype=False + ) + pd.testing.assert_frame_equal( + genetic_df, genetic_result.genetic, check_dtype=False + ) diff --git a/tests/test_sim_trait.py b/tests/test_sim_trait.py index d83b5ca..3768453 100644 --- a/tests/test_sim_trait.py +++ b/tests/test_sim_trait.py @@ -2,17 +2,9 @@ import numpy as np import pandas as pd import pytest -import tskit import tstrait from scipy import stats from tstrait.base import _check_numeric_array -from .data import ( - binary_tree, - binary_tree_seq, - diff_ind_tree, - non_binary_tree, - triploid_tree, -) # noreorder @pytest.fixture(scope="class") @@ -65,10 +57,9 @@ class TestOutputDim: def check_dimensions(self, df, num_causal): assert len(df) == num_causal - assert df.shape[1] == 4 + assert df.shape[1] == 3 assert list(df.columns) == [ "site_id", - "causal_state", "effect_size", "trait_id", ] @@ -116,7 +107,6 @@ def test_output_multivariate(self, sample_ts): ts=sample_ts, num_causal=num_causal, model=model, - alpha=0, random_seed=1, ) trait_ids = df.trait_id.unique() @@ -132,86 +122,6 @@ def test_output_multivariate(self, sample_ts): assert 0 <= np.min(df["trait_id"]) and np.max(df["trait_id"]) <= 1 -class Test_allele_freq: - @pytest.mark.parametrize("tree_seq", [binary_tree(), diff_ind_tree()]) - def test_binary_tree(self, tree_seq, sample_trait_model): - simulator = tstrait.TraitSimulator( - ts=tree_seq, - num_causal=4, - model=sample_trait_model, - alpha=0.3, - random_seed=1, - ) - tree = tree_seq.first() - c0 = simulator._obtain_allele_count(tree, tree_seq.site(0)) - c1 = simulator._obtain_allele_count(tree, tree_seq.site(1)) - c2 = simulator._obtain_allele_count(tree, tree_seq.site(2)) - c3 = simulator._obtain_allele_count(tree, tree_seq.site(3)) - - assert c0 == {"T": 2} - assert c1 == {"C": 1, "T": 1} - assert c2 == {"C": 1} - assert c3 == {"C": 2, "T": 2} - - @pytest.mark.parametrize("tree_seq", [non_binary_tree(), triploid_tree()]) - def non_binary_tree(self, tree_seq, sample_trait_model): - simulator = tstrait.TraitSimulator( - ts=tree_seq, - num_causal=4, - model=sample_trait_model, - alpha=0.3, - random_seed=1, - ) - tree = tree_seq.first() - c0 = simulator._obtain_allele_count(tree, tree_seq.site(0)) - c1 = simulator._obtain_allele_count(tree, tree_seq.site(1)) - - assert c0 == {"T": 3} - assert c1 == {"C": 2, "T": 1} - - -class TestAlleleFreq: - """Test the allele frequency dependence model by using a fixed trait model. - We will be using a tree sequence to also test that the simulation model works - in a tree sequence data. - """ - - def freqdep(self, alpha, freq): - return np.sqrt(pow(2 * freq * (1 - freq), alpha)) - - @pytest.mark.parametrize("alpha", [0, -1]) - def test_fixed_freq_dependence(self, alpha): - ts = binary_tree_seq() - num_causal = ts.num_sites - value = 2 - model = tstrait.trait_model(distribution="fixed", value=value) - sim_result = tstrait.sim_trait( - ts=ts, num_causal=num_causal, model=model, alpha=alpha - ) - - np.testing.assert_equal(sim_result["site_id"], np.arange(num_causal)) - np.testing.assert_equal(sim_result["trait_id"], np.zeros(num_causal)) - assert np.all(sim_result["causal_state"] != "A") - - df0 = sim_result.loc[sim_result.site_id == 0] - df2 = sim_result.loc[sim_result.site_id == 2] - - assert df0["causal_state"].values[0] == "T" - assert df0["effect_size"].values[0] == value * self.freqdep(alpha, 0.75) - assert df2["causal_state"].values[0] == "C" - assert df2["effect_size"].values[0] == value * self.freqdep(alpha, 0.5) - - def test_allele_freq_one(self, sample_trait_model): - ts = tskit.Tree.generate_comb(6, span=2).tree_sequence - tables = ts.dump_tables() - tables.sites.add_row(0, "A") - tables.mutations.add_row(site=0, node=8, derived_state="T") - tables.mutations.add_row(site=0, node=8, derived_state="A", parent=0) - ts = tables.tree_sequence() - sim_result = tstrait.sim_trait(ts=ts, num_causal=1, model=sample_trait_model) - np.testing.assert_equal(sim_result["effect_size"], np.array([0])) - - def model_list(): num_causal = 1000 loc = 2 @@ -275,7 +185,9 @@ def test_fixed(self, sample_ts): model = tstrait.trait_model(distribution="fixed", value=value) sim_result = tstrait.sim_trait(ts=sample_ts, num_causal=1000, model=model) data = sim_result.loc[sim_result.effect_size != 0] - np.testing.assert_allclose(data["effect_size"], np.ones(len(data)) * value) + np.testing.assert_allclose( + data["effect_size"], np.ones(len(data)) * value / 1000 + ) def test_multivariate(self, sample_ts): """ diff --git a/tests/test_simulate_environment.py b/tests/test_simulate_environment.py index 2d731b7..2d4be81 100644 --- a/tests/test_simulate_environment.py +++ b/tests/test_simulate_environment.py @@ -143,8 +143,10 @@ def test_output_sim_env(self, sample_ts, trait_model): trait_df = tstrait.sim_trait( ts=sample_ts, num_causal=num_causal, model=trait_model ) - genetic_df = tstrait.genetic_value(ts=sample_ts, trait_df=trait_df) - phenotype_df = tstrait.sim_env(genetic_df=genetic_df, h2=0.3) + genetic_result = tstrait.sim_genetic( + ts=sample_ts, trait_df=trait_df, random_seed=10 + ) + phenotype_df = tstrait.sim_env(genetic_df=genetic_result.genetic, h2=0.3) self.check_dimensions(phenotype_df, sample_ts.num_individuals) @@ -162,8 +164,12 @@ def test_output_multivariate(self, sample_ts): num_causal = 10 model = tstrait.trait_model(distribution="multi_normal", mean=mean, cov=cov) trait_df = tstrait.sim_trait(ts=sample_ts, num_causal=num_causal, model=model) - genetic_df = tstrait.genetic_value(ts=sample_ts, trait_df=trait_df) - phenotype_df = tstrait.sim_env(genetic_df=genetic_df, h2=np.ones(3) * 0.3) + genetic_result = tstrait.sim_genetic( + ts=sample_ts, trait_df=trait_df, random_seed=10 + ) + phenotype_df = tstrait.sim_env( + genetic_df=genetic_result.genetic, h2=np.ones(3) * 0.3 + ) self.check_dimensions(phenotype_df, sample_ts.num_individuals * num_trait) @@ -229,9 +235,13 @@ def test_env_dist(self): trait_df = tstrait.sim_trait( ts=ts, num_causal=num_causal, model=model, random_seed=10 ) - genetic_df = tstrait.genetic_value(ts=ts, trait_df=trait_df) - phenotype_df = tstrait.sim_env(genetic_df=genetic_df, h2=h2, random_seed=20) - phenotype_df1 = tstrait.sim_env(genetic_df=genetic_df, h2=h2, random_seed=30) + genetic_result = tstrait.sim_genetic(ts=ts, trait_df=trait_df, random_seed=10) + phenotype_df = tstrait.sim_env( + genetic_df=genetic_result.genetic, h2=h2, random_seed=20 + ) + phenotype_df1 = tstrait.sim_env( + genetic_df=genetic_result.genetic, h2=h2, random_seed=30 + ) phenotype_df = pd.concat([phenotype_df, phenotype_df1]) phenotype_df = phenotype_df.reset_index() @@ -260,9 +270,13 @@ def test_env_multiple(self): trait_df = tstrait.sim_trait( ts=ts, num_causal=num_causal, model=model_multiple, random_seed=11 ) - genetic_df = tstrait.genetic_value(ts=ts, trait_df=trait_df) - phenotype_df = tstrait.sim_env(genetic_df=genetic_df, h2=h2, random_seed=10) - phenotype_df1 = tstrait.sim_env(genetic_df=genetic_df, h2=h2, random_seed=20) + genetic_result = tstrait.sim_genetic(ts=ts, trait_df=trait_df, random_seed=1) + phenotype_df = tstrait.sim_env( + genetic_df=genetic_result.genetic, h2=h2, random_seed=10 + ) + phenotype_df1 = tstrait.sim_env( + genetic_df=genetic_result.genetic, h2=h2, random_seed=20 + ) phenotype_df = pd.concat([phenotype_df, phenotype_df1]) diff --git a/tests/test_simulate_phenotype.py b/tests/test_simulate_phenotype.py index c7ab6f2..f4fcf0a 100644 --- a/tests/test_simulate_phenotype.py +++ b/tests/test_simulate_phenotype.py @@ -102,13 +102,16 @@ def test_output(self, sample_ts, trait_model): ts=sample_ts, num_causal=num_causal, model=trait_model, - alpha=alpha, random_seed=1, ) - genetic_df = tstrait.genetic_value(ts=sample_ts, trait_df=trait_df) - phenotype_df = tstrait.sim_env(genetic_df=genetic_df, h2=h2, random_seed=1) + genetic_result = tstrait.sim_genetic( + ts=sample_ts, trait_df=trait_df, alpha=alpha, random_seed=1 + ) + phenotype_df = tstrait.sim_env( + genetic_df=genetic_result.genetic, h2=h2, random_seed=1 + ) - pd.testing.assert_frame_equal(result.effect_size, trait_df) + pd.testing.assert_frame_equal(result.effect_size, genetic_result.effect_size) pd.testing.assert_frame_equal(result.phenotype, phenotype_df) def test_multivariate(self, sample_ts): @@ -130,11 +133,14 @@ def test_multivariate(self, sample_ts): ts=sample_ts, num_causal=num_causal, model=trait_model, - alpha=alpha, random_seed=1, ) - genetic_df = tstrait.genetic_value(ts=sample_ts, trait_df=trait_df) - phenotype_df = tstrait.sim_env(genetic_df=genetic_df, h2=h2, random_seed=1) + genetic_result = tstrait.sim_genetic( + ts=sample_ts, trait_df=trait_df, alpha=alpha, random_seed=1 + ) + phenotype_df = tstrait.sim_env( + genetic_df=genetic_result.genetic, h2=h2, random_seed=1 + ) - pd.testing.assert_frame_equal(result.effect_size, trait_df) + pd.testing.assert_frame_equal(result.effect_size, genetic_result.effect_size) pd.testing.assert_frame_equal(result.phenotype, phenotype_df) diff --git a/tests/test_trait_model.py b/tests/test_trait_model.py index c246e15..edb0fa3 100644 --- a/tests/test_trait_model.py +++ b/tests/test_trait_model.py @@ -7,7 +7,7 @@ def simulate(model, num_causal): return np.array( [ - model.sim_effect_size(num_causal, np.random.default_rng(i)) + model._sim_effect_size(num_causal, np.random.default_rng(i)) for i in range(num_causal * 2) ] ) @@ -20,7 +20,7 @@ class TestTraitModel: tstrait.TraitModel.__abstractmethods__ = set() model = tstrait.TraitModel("sample") - assert model.sim_effect_size() is None + assert model._sim_effect_size() is None class TestInput: @@ -58,8 +58,8 @@ def test_output_dim(self): model = tstrait.trait_model( distribution="multi_normal", mean=np.zeros(4), cov=np.eye(4) ) - beta = model.sim_effect_size(10, np.random.default_rng(1)) - assert beta.shape == (4,) + beta = model._sim_effect_size(10, np.random.default_rng(1)) + assert beta.shape == (10, 4) assert model.num_trait == 4 def test_large_sample(self): @@ -70,9 +70,9 @@ def test_large_sample(self): M = np.random.randn(n, n) cov = np.dot(M, M.T) model = tstrait.trait_model(distribution="multi_normal", mean=mean, cov=cov) - effect_size = simulate(model, num_causal) + effect_size = model._sim_effect_size(num_causal, np.random.default_rng(1)) np.testing.assert_allclose( - np.cov(effect_size.T) * (num_causal**2), cov, rtol=1e-1 + np.cov(effect_size.T) * (num_causal**2), cov, rtol=2e-1 ) np.testing.assert_allclose(effect_size.mean(0) * num_causal, mean, rtol=1e-1) @@ -93,7 +93,7 @@ def test_normal(self, num_causal): loc = 2 scale = 5 model = tstrait.trait_model(distribution="normal", mean=loc, var=scale**2) - effect_size = simulate(model, num_causal) + effect_size = model._sim_effect_size(num_causal, np.random.default_rng(1)) self.check_distribution( effect_size, "norm", (loc / num_causal, scale / num_causal) ) @@ -101,7 +101,7 @@ def test_normal(self, num_causal): def test_exponential(self, num_causal): scale = 2 model = tstrait.trait_model(distribution="exponential", scale=scale) - effect_size = simulate(model, num_causal) + effect_size = model._sim_effect_size(num_causal, np.random.default_rng(1)) self.check_distribution(effect_size, "expon", (0, scale / num_causal)) assert np.min(effect_size) > 0 @@ -110,7 +110,7 @@ def test_exponential_negative(self, num_causal): model = tstrait.trait_model( distribution="exponential", scale=scale, negative=True ) - effect_size = simulate(model, num_causal) + effect_size = model._sim_effect_size(num_causal, np.random.default_rng(1)) assert np.min(effect_size) < 0 effect_size = np.abs(effect_size) self.check_distribution(effect_size, "expon", (0, scale / num_causal)) @@ -120,7 +120,7 @@ def test_t(self, num_causal): scale = 3 df = 10 model = tstrait.trait_model(distribution="t", mean=loc, var=scale**2, df=df) - effect_size = simulate(model, num_causal) + effect_size = model._sim_effect_size(num_causal, np.random.default_rng(1)) self.check_distribution( effect_size, "t", (df, loc / num_causal, scale / num_causal) ) @@ -129,7 +129,7 @@ def test_gamma(self, num_causal): shape = 3 scale = 2 model = tstrait.trait_model(distribution="gamma", shape=shape, scale=scale) - effect_size = simulate(model, num_causal) + effect_size = model._sim_effect_size(num_causal, np.random.default_rng(1)) self.check_distribution(effect_size, "gamma", (shape, 0, scale / num_causal)) assert np.min(effect_size) > 0 @@ -139,7 +139,7 @@ def test_gamma_negative(self, num_causal): model = tstrait.trait_model( distribution="gamma", shape=shape, scale=scale, negative=True ) - effect_size = simulate(model, num_causal) + effect_size = model._sim_effect_size(num_causal, np.random.default_rng(1)) assert np.min(effect_size) < 0 effect_size = np.abs(effect_size) self.check_distribution(effect_size, "gamma", (shape, 0, scale / num_causal)) @@ -151,7 +151,7 @@ def test_multivariate(self, num_causal): M = np.random.randn(n, n) cov = np.dot(M, M.T) model = tstrait.trait_model(distribution="multi_normal", mean=mean, cov=cov) - effect_size = simulate(model, num_causal) + effect_size = model._sim_effect_size(num_causal, np.random.default_rng(1)) for i in range(n): self.check_distribution( effect_size[:, i], diff --git a/tstrait/__init__.py b/tstrait/__init__.py index 28ffa7c..fb1f3a7 100644 --- a/tstrait/__init__.py +++ b/tstrait/__init__.py @@ -12,7 +12,7 @@ TraitSimulator, ) # noreorder from .simulate_phenotype import ( - Result, + PhenotypeResult, sim_phenotype, ) # noreorder from .trait_model import ( @@ -26,7 +26,8 @@ TraitModelT, ) # noreorder from .genetic_value import ( - genetic_value, + sim_genetic, + GeneticResult, GeneticValue, ) # noreorder from .simulate_environment import ( @@ -38,7 +39,7 @@ "__version__", "sim_trait", "TraitSimulator", - "Result", + "PhenotypeResult", "sim_phenotype", "trait_model", "TraitModel", @@ -48,7 +49,8 @@ "TraitModelMultivariateNormal", "TraitModelNormal", "TraitModelT", - "genetic_value", + "sim_genetic", + "GeneticResult", "GeneticValue", "sim_env", "EnvSimulator", diff --git a/tstrait/base.py b/tstrait/base.py index 2a79510..77d77d6 100644 --- a/tstrait/base.py +++ b/tstrait/base.py @@ -57,6 +57,7 @@ def _check_dataframe(df, column, df_name): def _check_non_decreasing(array, array_name): """Check that the array is non-decreasing.""" - diff = np.diff(array) - if np.min(diff) < 0: - raise ValueError(f"{array_name} must be non-decreasing") + if len(array) > 1: + diff = np.diff(array) + if np.min(diff) < 0: + raise ValueError(f"{array_name} must be non-decreasing") diff --git a/tstrait/genetic_value.py b/tstrait/genetic_value.py index 2e7e963..1c5d021 100644 --- a/tstrait/genetic_value.py +++ b/tstrait/genetic_value.py @@ -1,3 +1,6 @@ +import collections +from dataclasses import dataclass + import numba import numpy as np import pandas as pd @@ -6,6 +9,33 @@ from .base import _check_instance, _check_dataframe, _check_non_decreasing # noreorder +@dataclass +class GeneticResult: + """ + Data class that contains effect size and genetic value dataframe. + + Parameters + ---------- + effect_size : pandas.DataFrame + Dataframe that includes simulated effect sizes. + genetic : pandas.DataFrame + Dataframe that includes simulated genetic values. + + See Also + -------- + sim_genetic : Use this dataclass as a simulation output. + + Examples + -------- + See :ref:`effect_size_genetic` for details on extracting the effect size + dataframe, and :ref:`genetic_value_output` for details on extracting the + genetic value dataframe. + """ + + effect_size: pd.DataFrame + genetic: pd.DataFrame + + @numba.njit def _traversal_genotype( nodes_individual, @@ -43,16 +73,52 @@ def _traversal_genotype( class GeneticValue: """GeneticValue class to compute genetic values of individuals. - :param ts: Tree sequence data with mutation. - :type ts: tskit.TreeSequence - :param trait_df: Pandas dataframe that includes causal site ID, causal allele, - simulated effect size, and trait ID. - :type effect_size_df: pandas.DataFrame + Parameters + ---------- + ts : tskit.TreeSequence + Tree sequence data with mutation + trait_df : pandas.DataFrame + Dataframe that includes causal site ID, causal allele, simulated effect + size, and trait ID. + alpha : float + Parameter that determines the relative weight on rarer variants. """ - def __init__(self, ts, trait_df): - self.trait_df = trait_df[["site_id", "causal_state", "effect_size", "trait_id"]] + def __init__(self, ts, trait_df, alpha, random_seed): + self.trait_df = trait_df[["site_id", "effect_size", "trait_id"]] self.ts = ts + self.alpha = alpha + self.rng = np.random.default_rng(random_seed) + + def _obtain_allele_count(self, tree, site): + """Obtain a dictionary of allele counts, and the ancestral state is not + included in the dictionary. Input is the tree sequence site (`ts.site(ID)`) + instead of site ID, as obtaining `ts.site(ID)` can be time consuming. The + ancestral state is not deleted if the ancestral state is the only allele + at that site. + """ + counts = collections.Counter({site.ancestral_state: self.ts.num_samples}) + for m in site.mutations: + current_state = site.ancestral_state + if m.parent != tskit.NULL: + current_state = self.ts.mutation(m.parent).derived_state + # Silent mutations do nothing + if current_state != m.derived_state: # pragma: no cover + num_samples = tree.num_samples(m.node) + counts[m.derived_state] += num_samples + counts[current_state] -= num_samples + del counts[site.ancestral_state] + counts = {x: y for x, y in counts.items() if y != 0} + if len(counts) == 0: + counts = {site.ancestral_state: self.ts.num_samples} + return counts + + def _frequency_dependence(self, allele_freq): + if allele_freq == 0 or allele_freq == 1: + const = 0 + else: + const = np.sqrt(pow(2 * allele_freq * (1 - allele_freq), self.alpha)) + return const def _individual_genotype(self, tree, site, causal_state, num_nodes): """ @@ -69,7 +135,7 @@ def _individual_genotype(self, tree, site, causal_state, num_nodes): if state == causal_state: stack.append(node) - if len(stack) == 0: + if len(stack) == 0: # pragma: no cover genotype = np.zeros(self.ts.num_individuals) else: genotype = _traversal_genotype( @@ -87,29 +153,38 @@ def _individual_genotype(self, tree, site, causal_state, num_nodes): def compute_genetic_value(self): """Computes genetic values of individuals. - :return: Returns a pandas dataframe with genetic value, individual ID, - and trait ID - :rtype: pandas.DataFrame + Returns + ------- + pandas.DataFrame + Dataframe with simulated genetic value, individual ID, and trait ID. """ num_ind = self.ts.num_individuals + num_samples = self.ts.num_samples num_trait = np.max(self.trait_df.trait_id) + 1 genetic_val_array = np.zeros((num_trait, num_ind)) - + causal_state_array = np.zeros(len(self.trait_df), dtype=object) + freq_dep = np.zeros(len(self.trait_df)) num_nodes = self.ts.num_nodes tree = tskit.Tree(self.ts) - for data in self.trait_df.itertuples(): + for i, data in enumerate(self.trait_df.itertuples()): site = self.ts.site(data.site_id) tree.seek(site.position) + counts = self._obtain_allele_count(tree, site) + causal_state = self.rng.choice(list(counts)) + causal_state_array[i] = causal_state + allele_freq = counts[causal_state] / num_samples + freq_dep[i] = self._frequency_dependence(allele_freq) + individual_genotype = self._individual_genotype( tree=tree, site=site, - causal_state=data.causal_state, + causal_state=causal_state, num_nodes=num_nodes, ) genetic_val_array[data.trait_id, :] += ( - individual_genotype * data.effect_size + individual_genotype * data.effect_size * freq_dep[i] ) df = pd.DataFrame( @@ -120,27 +195,91 @@ def compute_genetic_value(self): } ) - return df - - -def genetic_value(ts, trait_df): - """Calculates genetic values of individuals based on the inputted tree sequence - and the dataframe with simulated effect sizes of causal mutations. - :param ts: The tree sequence data that will be used in the quantitative trait - simulation. The tree sequence data must include a mutation. - :type ts: tskit.TreeSequence - :param trait_df: The dataframe that includes simulated effect sizes of causal - mutations. It must include site_id, causal allele, simulated effect size, - and trait_id, and the data must be aligned based on site_id. - :type trait_df: pandas.DataFrame - :return: Returns a pandas dataframe that includes individual ID and genetic - value. - :rtype: pandas.DataFrame + self.trait_df["effect_size"] = np.multiply(self.trait_df.effect_size, freq_dep) + self.trait_df["causal_state"] = causal_state_array + + genetic_result = GeneticResult(effect_size=self.trait_df, genetic=df) + + return genetic_result + + +def sim_genetic(ts, trait_df, alpha=0, random_seed=None): + """ + Simulates genetic values from a trait dataframe. + + Parameters + ---------- + ts : tskit.TreeSequence + The tree sequence data that will be used in the quantitative trait + simulation. + trait_df : pandas.DataFrame + Trait dataframe. + alpha : float, default 0 + Parameter that determines the degree of the frequency dependence model. Please + see :ref:`frequency_dependence` for details on how this parameter influences + effect size simulation. + random_seed : int, default None + Random seed of simulation. If None, simulation will be conducted randomly. + + Returns + ------- + GeneticResult + Dataclass object that includes effect size and genetic value dataframe. + + See Also + -------- + trait_model : Return a trait model, which can be used as `model` input. + sim_trait : Return a trait dataframe, whch can be used as a `trait_df` input. + GeneticResult : Dataclass object that will be used as an output. + sim_env : Genetic value dataframe output can be used as an input to simulate + environmental noise. + + Notes + ----- + The `trait_df` input has some requirements that will be noted below. + + 1. Columns + + The following columns must be included in `trait_df`: + + * **site_id**: Site IDs that have causal mutation. + * **effect_size**: Simulated effect size of causal mutation. + * **trait_id**: Trait ID. + + 2. Data requirements + + * Site IDs in **site_id** column must be sorted in an ascending order. Please refer + to :func:`pandas.DataFrame.sort_values` for details on sorting values in a + :class:`pandas.DataFrame`. + + * Trait IDs in **trait_id** column must start from zero and be consecutive. + + The simulation outputs of effect sizes and phenotypes are given as a + :py:class:`pandas.DataFrame`. + + The effect size dataframe can be extracted by using ``.effect_size`` in the + resulting object and contains the following columns: + + * **site_id**: ID of sites that have causal mutation. + * **causal_state**: Causal state. + * **effect_size**: Simulated effect size of causal mutation. + * **trait_id**: Trait ID. + + The genetic value dataframe can be extracted by using ``.genetic`` in the resulting + object and contains the following columns: + + * **trait_id**: Trait ID. + * **individual_id**: Individual ID inside the tree sequence input. + * **genetic_value**: Simulated genetic values. + + Examples + -------- + See :ref:`genetic` for worked examples. """ ts = _check_instance(ts, "ts", tskit.TreeSequence) trait_df = _check_dataframe( - trait_df, ["site_id", "causal_state", "effect_size", "trait_id"], "trait_df" + trait_df, ["site_id", "effect_size", "trait_id"], "trait_df" ) _check_non_decreasing(trait_df["site_id"], "site_id") @@ -149,8 +288,10 @@ def genetic_value(ts, trait_df): if np.min(trait_id) != 0 or np.max(trait_id) != len(trait_id) - 1: raise ValueError("trait_id must be consecutive and start from 0") - genetic = GeneticValue(ts=ts, trait_df=trait_df) + genetic = GeneticValue( + ts=ts, trait_df=trait_df, alpha=alpha, random_seed=random_seed + ) - genetic_df = genetic.compute_genetic_value() + genetic_result = genetic.compute_genetic_value() - return genetic_df + return genetic_result diff --git a/tstrait/simulate_effect_size.py b/tstrait/simulate_effect_size.py index 21c18af..fd07cb6 100644 --- a/tstrait/simulate_effect_size.py +++ b/tstrait/simulate_effect_size.py @@ -1,5 +1,3 @@ -import collections - import numpy as np import pandas as pd import tskit @@ -12,24 +10,22 @@ class TraitSimulator: """Simulator class to select causal alleles and simulate effect sizes of causal mutations. - :param ts: Tree sequence data with mutation. - :type ts: tskit.TreeSequence - :param num_causal: Number of causal sites. - :type num_causal: int - :param model: Trait model that will be used to simulate effect sizes. - :type model: TraitModel - :param alpha: Parameter that determines the emphasis placed on rarer variants. - :type alpha: float - :param random_seed: The random seed. If this is not specified or None, simulation - will be done randomly. - :type random_seed: None or int + Parameters + ---------- + ts : tskit.TreeSequence + Tree sequence data with mutation. + num_causal : int + Number of causal sites. + model : TraitModel + Trait model that will be used to simulate effect sizes. + random_seed : int + The random seed. """ - def __init__(self, ts, num_causal, model, alpha, random_seed): + def __init__(self, ts, num_causal, model, random_seed): self.ts = ts self.num_causal = num_causal self.model = model - self.alpha = alpha self.rng = np.random.default_rng(random_seed) def _choose_causal_site(self): @@ -40,78 +36,35 @@ def _choose_causal_site(self): their genomic locations. """ site_id = self.rng.choice( - range(self.ts.num_sites), size=self.num_causal, replace=False + self.ts.num_sites, size=self.num_causal, replace=False ) site_id = np.sort(site_id) return site_id - def _obtain_allele_count(self, tree, site): - """Obtain a dictionary of allele counts, and the ancestral state is not - included in the dictionary. Input is the tree sequence site (`ts.site(ID)`) - instead of site ID, as obtaining `ts.site(ID)` can be time consuming. The - ancestral state is not deleted if the ancestral state is the only allele - at that site. - """ - counts = collections.Counter({site.ancestral_state: self.ts.num_samples}) - for m in site.mutations: - current_state = site.ancestral_state - if m.parent != tskit.NULL: - current_state = self.ts.mutation(m.parent).derived_state - # Silent mutations do nothing - if current_state != m.derived_state: - num_samples = tree.num_samples(m.node) - counts[m.derived_state] += num_samples - counts[current_state] -= num_samples - del counts[site.ancestral_state] - counts = {x: y for x, y in counts.items() if y != 0} - if len(counts) == 0: - counts = {site.ancestral_state: self.ts.num_samples} - return counts - - def _frequency_dependence(self, allele_freq): - if allele_freq == 0 or allele_freq == 1: - const = 0 - else: - const = np.sqrt(pow(2 * allele_freq * (1 - allele_freq), self.alpha)) - return const - def sim_causal_mutation(self): """This method randomly chooses causal sites and the corresponding causal state based on the `num_causal` input. Afterwards, effect size of each causal site is simulated based on the trait model given by the `model` input. - :return: Returns a pandas dataframe that includes causal site ID, causal allele, - simulated effect size, and trait ID. - :rtype: pandas.DataFrame + Returns + ------- + pandas.DataFrame + Trait dataframe that includes causal site ID, causal allele, simulated + effect size, and trait ID. """ - causal_site_array = self._choose_causal_site() - num_samples = self.ts.num_samples num_trait = self.model.num_trait - tree = tskit.Tree(self.ts) + site_id_array = self._choose_causal_site() + site_id_array = np.repeat(site_id_array, num_trait) - causal_state_array = np.zeros(self.num_causal, dtype=object) - beta_array = np.zeros((self.num_causal, num_trait), dtype=float) + beta_array = self.model._sim_effect_size( + num_causal=self.num_causal, rng=self.rng + ) trait_id_array = np.tile(np.arange(num_trait), self.num_causal) - site_id_array = np.repeat(causal_site_array, num_trait) - - for i, single_id in enumerate(causal_site_array): - site = self.ts.site(single_id) - tree.seek(site.position) - counts = self._obtain_allele_count(tree, site) - causal_state = self.rng.choice(list(counts)) - causal_state_array[i] = causal_state - allele_freq = counts[causal_state] / num_samples - beta = self.model.sim_effect_size(num_causal=self.num_causal, rng=self.rng) - beta *= self._frequency_dependence(allele_freq) - beta_array[i, :] = beta - - causal_state_array = np.repeat(causal_state_array, num_trait) df = pd.DataFrame( { "site_id": site_id_array, - "causal_state": causal_state_array, "effect_size": beta_array.flatten(), "trait_id": trait_id_array, } @@ -120,32 +73,50 @@ def sim_causal_mutation(self): return df -def sim_trait(ts, num_causal, model, alpha=0, random_seed=None): - """Randomly selects causal sites from the inputted tree sequence data, and simulates - effect sizes of causal mutations. - - :param ts: The tree sequence data that will be used in the quantitative trait - simulation. The tree sequence data must include a mutation. - :type ts: tskit.TreeSequence - :param num_causal: Number of causal sites that will be chosen randomly. It must be - a positive integer that is less than the number of sites in the tree sequence - data. - :type num_causal: int - :param model: Trait model that will be used to simulate effect sizes of causal - mutations. - :type model: TraitModel - :param alpha: Parameter that determines the relative weight on rarer variants. - A negative `alpha` value can increase the magnitude of effect sizes coming - from rarer variants. The frequency dependent architecture can be ignored - by setting `alpha` to be zero. - :type alpha: float - :param random_seed: The random seed. If this is not specified or None, simulation - will be done randomly. - :type random_seed: None or int - :return: Returns a pandas dataframe that includes causal site ID, causal allele and - simulated effect size. It can be used as an input in :func:`sim_phenotype` - function to simulate phenotypes. - :rtype: pandas.DataFrame +def sim_trait(ts, num_causal, model, random_seed=None): + """ + Randomly selects causal sites and simulates effect sizes. + + Parameters + ---------- + ts : tskit.TreeSequence + The tree sequence data that will be used in the quantitative trait + simulation. + num_causal : int + Number of causal sites. + model : tstrait.TraitModel + Trait model that will be used to simulate effect sizes. + random_seed : int, default None + Random seed of simulation. If None, simulation will be conducted randomly. + + Returns + ------- + pandas.DataFrame + Trait dataframe that includes simulated effect sizes. + + Raises + ------ + ValueError + If the number of mutations in `ts` is smaller than `num_causal`. + + See Also + -------- + trait_model : Return a trait model, which can be used as `model` input. + sim_genetic : The trait dataframe output can be used as an input to simulate + genetic values. + + Note + ---- + The simulation output is given as a :py:class:`pandas.DataFrame` and contains the + following columns: + + * **site_id**: Site IDs that have causal mutation. + * **effect_size**: Simulated effect size of causal mutation. + * **trait_id**: Trait ID. + + Examples + -------- + See :ref:`effect_size_sim` for worked examples. """ ts = _check_instance(ts, "ts", tskit.TreeSequence) model = _check_instance(model, "model", tstrait.TraitModel) @@ -161,9 +132,8 @@ def sim_trait(ts, num_causal, model, alpha=0, random_seed=None): ts=ts, num_causal=num_causal, model=model, - alpha=alpha, random_seed=random_seed, ) - effect_size_df = simulator.sim_causal_mutation() + trait_df = simulator.sim_causal_mutation() - return effect_size_df + return trait_df diff --git a/tstrait/simulate_environment.py b/tstrait/simulate_environment.py index 01a376d..f71c9af 100644 --- a/tstrait/simulate_environment.py +++ b/tstrait/simulate_environment.py @@ -8,15 +8,15 @@ class EnvSimulator: """Simulator class to simulate environmental noise of individuals. - :param genetic_df: Pandas dataframe that includes genetic value of individuals. - It must include individual ID, genetic value and trait ID. - :type genetic_df: pandas.DataFrame - :param h2: Narrow-sense heritability, which will be used to simulate environmental - noise. Narrow-sense heritability must be between 0 and 1. - :type h2: float or list or numpy.ndarray - :param random_seed: The random seed. If this is not specified or None, simulation - will be done randomly. - :type random_seed: int + Parameters + ---------- + genetic_df : pandas.DataFrame + Pandas dataframe that includes genetic value of individuals. It must include + individual ID, genetic value and trait ID. + h2 : float or array-like + Narrow-sense heritability. + random_seed : int + The random seed. """ def __init__(self, genetic_df, h2, random_seed): @@ -51,20 +51,61 @@ def sim_environment(self): def sim_env(genetic_df, h2, random_seed=None): - """Simulates environmental noise and obtain phenotype. - - :param genetic_df: Pandas dataframe that includes genetic value of individuals. - It must include individual ID, genetic value and trait ID. - :type genetic_df: pandas.DataFrame - :param h2: Narrow-sense heritability, which will be used to simulate environmental - noise. Narrow-sense heritability must be between 0 and 1. - :type h2: float or list or numpy.ndarray - :param random_seed: The random seed. If this is not specified or None, simulation - will be done randomly. - :type random_seed: int - :return: Returns a pandas dataframe that includes individual ID, genetic value, - environmental noise, phenotype, and trait ID. - :rtype: pandas.DataFrame + """ + Simulates environmental noise. + + Parameters + ---------- + genetic_df : pandas.DataFrame + Genetic value dataframe. + h2 : float or array-like + Narrow-sense heritability. The dimension of `h2` must match the number of traits + to be simulated. + random_seed : int, default None + Random seed of simulation. If None, simulation will be conducted randomly. + + Returns + ------- + pandas.DataFrame + Dataframe with simulated environmental noise. + + Raises + ------ + ValueError + If `h2` <= 0 or `h2` > 1 + + See Also + -------- + sim_genetic : Return a dataclass with genetic value dataframe, which can be used as + `genetic_df` input. + + Notes + ----- + The `genetic_df` input has some requirements that will be noted below. + + 1. Columns + + The following columns must be included in `genetic_df`: + + * **trait_id**: Trait ID. + * **individual_id**: Individual ID inside the tree sequence input. + * **genetic_value**: Simulated genetic values. + + 2. Data requirement + + Trait IDs in **trait_id** column must start from 0 and be consecutive. + + The dataframe output has the following columns: + + * **trait_id**: Trait ID. + * **individual_id**: Individual ID inside the tree sequence input. + * **genetic_value**: Simulated genetic values. + * **environmental_noise**: Simulated environmental noise. + * **phenotype**: Simulated phenotype. + + Examples + -------- + See :ref:`env_simulation` for worked examples. """ genetic_df = _check_dataframe( genetic_df, ["trait_id", "individual_id", "genetic_value"], "genetic_df" diff --git a/tstrait/simulate_phenotype.py b/tstrait/simulate_phenotype.py index fe835ca..0def1f9 100644 --- a/tstrait/simulate_phenotype.py +++ b/tstrait/simulate_phenotype.py @@ -5,14 +5,26 @@ @dataclass -class Result: - """Data class that contains effect size dataframe and phenotype dataframe. - - :param effect_size: Effect size dataframe that includes site_id, causal_state, - effect_size, and trait_id - :param phenotype: Phenotype dataframe that includes trait_id, individual_id, - genetic_value, environmental_noise, and phenotype - :type phenotype: pandas.DataFrame +class PhenotypeResult: + """ + Dataclass that contains effect size dataframe and phenotype dataframe. + + Parameters + ---------- + effect_size : pandas.DataFrame + Dataframe that includes simulated effect sizes. + phenotype : pandas.DataFrame + Dataframe that includes simulated phenotype. + + See Also + -------- + sim_phenotype : Use this dataclass as a simulation output. + + Examples + -------- + See :ref:`effect_size_output` for details on extracting the effect size + dataframe, and :ref:`phenotype_output` for details on extracting the + phenotype dataframe. """ effect_size: pd.DataFrame @@ -20,41 +32,87 @@ class Result: def sim_phenotype(ts, num_causal, model, h2, alpha=0, random_seed=None): - """Simulates quantitative traits of individuals based on the inputted tree sequence - and the specified trait model, and returns a :class:`Result` object. - - :param ts: The tree sequence data that will be used in the quantitative trait - simulation. The tree sequence data must include a mutation. - :type ts: tskit.TreeSequence - :param num_causal: Number of causal sites that will be chosen randomly. It should - be a positive integer that is greater than the number of sites in the tree - sequence data. - :type num_causal: int - :param model: Trait model that will be used to simulate effect sizes of causal sites. - :type model: tstrait.TraitModel - :param h2: Narrow-sense heritability, which will be used to simulate environmental - noise. Narrow-sense heritability must be between 0 and 1. - :type h2: float - :param alpha: Parameter that determines the relative weight on rarer variants. - A negative `alpha` value can increase the magnitude of effect sizes coming - from rarer variants. The frequency dependent architecture can be ignored - by setting `alpha` to be zero. - :type alpha: float - :param random_seed: The random seed. If this is not specified or None, simulation - will be done randomly. - :type random_seed: None or int - :return: Returns the :class:`Result` object that includes the effect size - dataframe and phenotype dataframe. - :rtype: Result + """ + Simulate quantitative traits. + + Parameters + ---------- + ts : tskit.TreeSequence + The tree sequence data that will be used in the quantitative trait + simulation. + num_causal : int + Number of causal sites. + model : tstrait.TraitModel + Trait model that will be used to simulate effect sizes. + h2 : float or array-like + Narrow-sense heritability. The dimension of `h2` must match the number of + traits to be simulated. + alpha : float, default 0 + Parameter that determines the degree of the frequency dependence model. Please + see :ref:`frequency_dependence` for details on how this parameter influences + effect size simulation. + random_seed : int, default None + Random seed of simulation. If None, simulation will be conducted randomly. + + Returns + ------- + PhenotypeResult + Dataclass object that includes phenotype and effect size dataframe. + + Raises + ------ + ValueError + If the number of mutations in `ts` is smaller than `num_causal`. + ValueError + If `h2` <= 0 or `h2` > 1 + + See Also + -------- + trait_model : Return a trait model, which can be used as `model` input. + PhenotypeResult : Dataclass object that will be used as an output. + + Notes + ----- + The simulation outputs of effect sizes and phenotypes are given as a + :py:class:`pandas.DataFrame`. + + The effect size dataframe can be extracted by using ``.effect_size`` in the + resulting object and contains the following columns: + + * **site_id**: Site IDs that have causal mutation. + * **causal_state**: Causal state. + * **effect_size**: Simulated effect size of causal mutation. + * **trait_id**: Trait ID. + + The phenotype dataframe can be extracted by using ``.phenotype`` in the resulting + object and contains the following columns: + + * **trait_id**: Trait ID. + * **individual_id**: Individual ID inside the tree sequence input. + * **genetic_value**: Simulated genetic values. + * **environmental_noise**: Simulated environmental noise. + * **phenotype**: Simulated phenotype. + + Please refer to :ref:`phenotype_model` for mathematical details of the phenotypic + model. + + Examples + -------- + See :ref:`quickstart` for worked examples. + """ trait_df = tstrait.sim_trait( - ts=ts, num_causal=num_causal, model=model, alpha=alpha, random_seed=random_seed + ts=ts, num_causal=num_causal, model=model, random_seed=random_seed + ) + genetic_result = tstrait.sim_genetic( + ts=ts, trait_df=trait_df, alpha=alpha, random_seed=random_seed ) - genetic_df = tstrait.genetic_value(ts=ts, trait_df=trait_df) phenotype_df = tstrait.sim_env( - genetic_df=genetic_df, h2=h2, random_seed=random_seed + genetic_df=genetic_result.genetic, h2=h2, random_seed=random_seed ) - result = tstrait.Result(effect_size=trait_df, phenotype=phenotype_df) + result = tstrait.PhenotypeResult( + effect_size=genetic_result.effect_size, phenotype=phenotype_df + ) return result diff --git a/tstrait/trait_model.py b/tstrait/trait_model.py index ebf52d3..6669c83 100644 --- a/tstrait/trait_model.py +++ b/tstrait/trait_model.py @@ -11,11 +11,29 @@ class TraitModel(metaclass=ABCMeta): - """Superclass of the trait model. See the :ref:`sec_trait_model` section for - more details on the available models and examples. - - :param model_name: Name of the trait model. - :type model_name: str + """ + Superclass of the trait model. + + Parameters + ---------- + name : str + Name of the trait model. + + See Also + -------- + trait_model : Construct a trait model. + TraitModelNormal : Return a normal distribution trait model. + TraitModelT : Return a Student's t-distribution trait model. + TraitModelFixed : Return a fixed value trait model. + TraitModelExponential : Return an exponential distribution trait model. + TraitModelGamma : Return a gamma distribution trait model. + TraitModelMultivariateNormal : Return a multivariate normal distribution + trait model. + + Note + ---- + This is the base class for all trait models in tstrait. All trait models + should set all parameters in their ``__init__`` as arguments. """ @abstractmethod @@ -29,18 +47,40 @@ def _check_parameter(self, num_causal, rng): return num_causal @abstractmethod - def sim_effect_size(self): + def _sim_effect_size(self): pass class TraitModelNormal(TraitModel): - """Normal distribution trait model class, where the effect sizes are simulated - from a normal distribution. - - :param mean: Mean value of the simulated effect sizes. - :type mean: float - :param var: Variance of the simulated effect sizes. - :type var: float + """ + Normal distribution trait model. + + Parameters + ---------- + mean : float + Mean of the simulated effect size. + var : float + Variance of the simulated effect size. Must be non-negative. + + Returns + ------- + TraitModel + Normal distribution trait model. + + See Also + -------- + trait_model : Construct a trait model. + numpy.random.Generator.normal : Details on the input parameters and distribution. + + Note + ---- + This is a trait model built on top of :py:func:`numpy.random.Generator.normal`, so + please see its documentation for the details of the normal distribution simulation. + + Example + ------- + Please see the docstring example of :func:`trait_model` for constructing a normal + distribution trait model. """ def __init__(self, mean, var): @@ -48,35 +88,64 @@ def __init__(self, mean, var): self.var = var super().__init__("normal") - def sim_effect_size(self, num_causal, rng): + def _sim_effect_size(self, num_causal, rng): """ This method simulates an effect size from a normal distribution. - :param num_causal: Number of causal sites - :type num_causal: int - :param rng: Random generator that will be used to simulate effect size. - :type rng: numpy.random.Generator - :return: Simulated effect size of a causal mutation. - :rtype: float + Parameters + ---------- + num_causal : int + Number of causal sites + rng : numpy.random.Generator + Random generator that will be used to simulate effect size. + + Returns + ------- + float or array-like + Simulated effect size of a causal mutation. """ num_causal = self._check_parameter(num_causal, rng) beta = rng.normal( - loc=self.mean / num_causal, scale=np.sqrt(self.var) / num_causal + loc=self.mean / num_causal, + scale=np.sqrt(self.var) / num_causal, + size=num_causal, ) return beta class TraitModelExponential(TraitModel): - """Exponential distribution trait model class, where the effect sizes - are simulated from an exponential distribution. - - :param scale: The scale parameter of the exponential distribution, and - it must be greater than zero. - :type scale: float - :param negative: Determines if a negative value can be simulated from the - trait model. If it is set to be True, 1 or -1 will be multipled to - the simulated effect size. - :type negative: bool + """Exponential distribution trait model. + + Parameters + ---------- + scale : float + Scale of the exponential distribution. Must be non-negative. + negative : bool, default False + Determine if a negative can be simulated from the trait model. If it is set + to be True, :math:`1` or :math:`-1` will be randomly multiplied to the + simulated effect sizes. + + Returns + ------- + TraitModel + Exponential distribution trait model. + + See Also + -------- + trait_model : Construct a trait model. + numpy.random.Generator.exponential : Details on the input parameters and + distribution. + + Note + ---- + This is a trait model built on top of :py:func:`numpy.random.Generator.exponential`, + so please see its documentation for the details of the exponential distribution + simulation. + + Example + ------- + Please see the docstring example of :func:`trait_model` for constructing an + exponential distribution trait model. """ def __init__(self, scale, negative=False): @@ -84,61 +153,116 @@ def __init__(self, scale, negative=False): self.negative = _check_instance(negative, "negative", bool) super().__init__("exponential") - def sim_effect_size(self, num_causal, rng): + def _sim_effect_size(self, num_causal, rng): """ This method simulates an effect size from an exponential distribution. - :param num_causal: Number of causal sites - :type num_causal: int - :param rng: Random generator that will be used to simulate effect size. - :type rng: numpy.random.Generator - :return: Simulated effect size of a causal mutation. - :rtype: float + Parameters + ---------- + num_causal : int + Number of causal sites + rng : numpy.random.Generator + Random generator that will be used to simulate effect size. + + Returns + ------- + float or array-like + Simulated effect size of a causal mutation. """ num_causal = self._check_parameter(num_causal, rng) - beta = rng.exponential(scale=self.scale / num_causal) + beta = rng.exponential(scale=self.scale / num_causal, size=num_causal) if self.negative: - beta *= rng.choice([-1, 1]) + beta = np.multiply(rng.choice([-1, 1], size=num_causal), beta) return beta class TraitModelFixed(TraitModel): - """Fixed trait model class, where the effect size is a fixed quantity. - - :param value: Effect size of causal mutation. - :type value: float + """ + Fixed value trait model. + + Parameters + ---------- + value : float + Value of the simulated effect size. + + Returns + ------- + TraitModel + Fixed value trait model. + + See Also + -------- + trait_model : Construct a trait model. + + Note + ---- + This is a trait model that only gives the fixed value that is specified in + `value`. + + Example + ------- + Please see the docstring example of :func:`trait_model` for constructing a + fixed value trait model. """ def __init__(self, value): self.value = _check_val(value, "value") super().__init__("fixed") - def sim_effect_size(self, num_causal, rng): + def _sim_effect_size(self, num_causal, rng): """ This method returns an effect size from a fixed trait model. - :param num_causal: Number of causal sites - :type num_causal: int - :param rng: Random generator that will be used to simulate effect size. - :type rng: numpy.random.Generator - :return: Simulated effect size of a causal mutation. - :rtype: float + Parameters + ---------- + num_causal : int + Number of causal sites + rng : numpy.random.Generator + Random generator that will be used to simulate effect size. + + Returns + ------- + float or array-like + Simulated effect size of a causal mutation. """ num_causal = self._check_parameter(num_causal, rng) - beta = self.value - return beta + beta = self.value / num_causal + return np.repeat(beta, num_causal) class TraitModelT(TraitModel): - """T distribution trait model class, where the effect sizes are simulated from - a t distribution. - - :param mean: Mean value of the simulated effect sizes. - :type mean: float - :param var: Variance of the simulated effect sizes. - :type var: float - :param df: Degrees of freedom, and it must be greater than 0. - :type df: float + """ + Student's t distribution trait model. + + Parameters + ---------- + mean : float + Mean of the simulated effect size. + var : float + Variance of the simulated effect size. Must be > 0. + df : float + Degrees of freedom. Must be > 0. + + Returns + ------- + TraitModel + Student's t distribution trait model. + + See Also + -------- + trait_model : Construct a trait model. + numpy.random.Generator.standard_t : Details on the input parameters and distribution. + + Note + ---- + This is a trait model built on top of :py:func:`numpy.random.Generator.standard_t`, + so please see its documentation for the details of the normal distribution + simulation. + + Example + ------- + Please see the docstring example of :func:`trait_model` for constructing a student's + t distribution trait model. """ def __init__(self, mean, var, df): @@ -147,37 +271,63 @@ def __init__(self, mean, var, df): self.df = df super().__init__("t") - def sim_effect_size(self, num_causal, rng): + def _sim_effect_size(self, num_causal, rng): """ This method returns an effect size from a t distribution. - :param num_causal: Number of causal sites - :type num_causal: int - :param rng: Random generator that will be used to simulate effect size. - :type rng: numpy.random.Generator - :return: Simulated effect size of a causal mutation. - :rtype: float + Parameters + ---------- + num_causal : int + Number of causal sites + rng : numpy.random.Generator + Random generator that will be used to simulate effect size. + + Returns + ------- + float or array-like + Simulated effect size of a causal mutation. """ num_causal = self._check_parameter(num_causal, rng) - beta = rng.standard_t(self.df) + beta = rng.standard_t(self.df, size=num_causal) beta = (beta * np.sqrt(self.var) + self.mean) / num_causal return beta class TraitModelGamma(TraitModel): - """Gamma distribution trait model class, where the effect sizes are - simulated from a gamma distribution. - - :param shape: The shape parameter of the gamma distribution, and it must be - greater than zero. - :type shape: float - :param scale: The scale parameter of the gamma distribution, and it must be - greater than zero. - :type scale: float - :param negative: Determines if a negative value can be simulated from the - trait model. If it is set to be True, 1 or -1 will be multipled to - the simulated effect size. - :type negative: bool + """ + Gamma distribution trait model. + + Parameters + ---------- + shape : float + Shape of the gamma distribution. Must be non-negative. + scale : float + Scale of the gamma distribution. Must be non-negative. + negative : bool, default False + Determine if a negative can be simulated from the trait model. If it is set + to be True, :math:`1` or :math:`-1` will be randomly multiplied to the + simulated effect sizes. + + Returns + ------- + TraitModel + Gamma distribution trait model. + + See Also + -------- + trait_model : Construct a trait model. + numpy.random.Generator.gamma : Details on the input parameters and distribution. + + Note + ---- + This is a trait model built on top of :py:func:`numpy.random.Generator.gamma`, + so please see its documentation for the details of the gamma distribution + simulation. + + Example + ------- + Please see the docstring example of :func:`trait_model` for constructing an + gamma distribution trait model. """ def __init__(self, shape, scale, negative=False): @@ -186,55 +336,91 @@ def __init__(self, shape, scale, negative=False): self.negative = _check_instance(negative, "negative", bool) super().__init__("gamma") - def sim_effect_size(self, num_causal, rng): + def _sim_effect_size(self, num_causal, rng): """ This method returns an effect size from a gamma distribution. - :param num_causal: Number of causal sites - :type num_causal: int - :param rng: Random generator that will be used to simulate effect size. - :type rng: numpy.random.Generator - :return: Simulated effect size of a causal mutation. - :rtype: float + Parameters + ---------- + num_causal : int + Number of causal sites + rng : numpy.random.Generator + Random generator that will be used to simulate effect size. + + Returns + ------- + float or array-like + Simulated effect size of a causal mutation. """ num_causal = self._check_parameter(num_causal, rng) - beta = rng.gamma(self.shape, self.scale) / num_causal + beta = rng.gamma(self.shape, self.scale, size=num_causal) / num_causal if self.negative: - beta *= rng.choice([-1, 1]) + beta = np.multiply(rng.choice([-1, 1], size=num_causal), beta) return beta class TraitModelMultivariateNormal(TraitModel): - """Trait model class of multiple traits. - - We assume that the phenotypes are pleiotropic, meaning that the genes are - influencing more than one trait. - - :param mean: Mean vector of the simulated effect sizes. The length of the vector - should match the number of traits. - :type mean: list or numpy.ndarray - :param cov: Covariance matrix of simulated effect sizes. - :type cov: list or numpy.ndarray + """ + Multivariate normal distribution trait model. + + Parameters + ---------- + mean : 1-D array_like, of length N + Mean vector. + cov : 2-D array_like, of shape (N, N) + Covariance matrix. Must be symmetric and positive-semidefinite. + + + Returns + ------- + TraitModel + Multivariate normal distribution trait model. + + See Also + -------- + trait_model : Construct a trait model. + numpy.random.Generator.multivariate_normal : Details on the input parameters + and distribution. + + Note + ---- + Multivariate normal distribution simulation is used in multi-trait simulation, + which is described in :ref:`multi_trait`. + + This is a trait model built on top of + :py:func:`random.Generator.multivariate_normal`, so please see its documentation + for the details of the multivariate normal distribution simulation. + + Example + ------- + Please see the docstring example of :func:`trait_model` for constructing a + multivariate normal distribution trait model. """ def __init__(self, mean, cov): + super().__init__("multi_normal") self.num_trait = len(mean) self.mean = mean self.cov = cov - def sim_effect_size(self, num_causal, rng): + def _sim_effect_size(self, num_causal, rng): """ This method returns an effect size from a multivariate normal distribution. - :param num_causal: Number of causal sites - :type num_causal: int - :param rng: Random generator that will be used to simulate effect size. - :type rng: numpy.random.Generator - :return: Simulated effect size of a causal mutation. - :rtype: float + Parameters + ---------- + num_causal : int + Number of causal sites + rng : numpy.random.Generator + Random generator that will be used to simulate effect size. + + Returns + ------- + float or array-like + Simulated effect size of a causal mutation. """ num_causal = self._check_parameter(num_causal, rng) - beta = rng.multivariate_normal(mean=self.mean, cov=self.cov) + beta = rng.multivariate_normal(mean=self.mean, cov=self.cov, size=num_causal) beta /= num_causal return beta @@ -250,14 +436,106 @@ def sim_effect_size(self, num_causal, rng): def trait_model(distribution, **kwargs): - """Returns a trait model corresponding to the specified model. The arguments - corresponding to the specified distribution must be inputted as arguments into - this function. - - :param distribution: A string describing the trait model. - :type distribution: str - :return: The corresponding trait model. - :rtype: TraitModel + """ + Return a trait model corresponding to the specified model. + + Parameters + ---------- + distribution : str + String describing the trait model. The list of supported distributions + are: + * "normal": Normal distribution + * "t": Student's t distribution + * "fixed": Fixed value + * "exponential": Exponential distribution + * "gamma": Gamma distribution + * "multi_normal": Multivariate normal distribution + **kwargs + These parameters will be used to specify the trait model. + + Returns + ------- + TraitModel + Trait model that specifies the distribution of effect size simulation. + + See Also + -------- + TraitModelNormal : Return a normal distribution trait model. + TraitModelT : Return a Student's t-distribution trait model. + TraitModelFixed : Return a fixed value trait model. + TraitModelExponential : Return an exponential distribution trait model. + TraitModelGamma : Return a gamma distribution trait model. + TraitModelMultivariateNormal : Return a multivariate normal distribution + trait model. + + Notes + ----- + Please reference :ref:`effect_size` for details on the effect size + simulation. Multivariate normal distribution trait model is used in + multi-trait simulation, which is described in :ref:`multi_trait`. + + Examples + -------- + Constructing a normal distribution trait model with mean :math:`0` and + variance :math:`1`. + + >>> model = tstrait.trait_model(distribution="normal", mean=0, var=1) + >>> model.name + "normal" + + Constructing a student's t-distribution trait model with mean :math:`0`, + variance :math:`1` and degrees of freedom :math:`1`. + + >>> model = tstrait.trait_model(distribution="t", mean=0, var=1, df=1) + >>> model.name + "t" + + Constructing a fixed value trait model with value :math:`1`. + + >>> model = tstrait.trait_model(distribution="fixed", value=1) + >>> model.name + "fixed" + + Constructing an exponential distribution trait model with scale + :math:`1`. + + >>> model = tstrait.trait_model(distribution="exponential", scale=1) + >>> model.name + "exponential" + + Constructing an exponential distribution trait model with scale + :math:`1`, and enable simulation of negative values. + + >>> model = tstrait.trait_model(distribution="exponential", scale=1, + >>> ... negative=True) + + Constructing a gamma distribution trait model with shape :math:`1` + and scale :math:`2`. + + >>> model = tstrait.trait_model(distribution="gamma", shape=1, + >>> ... scale=2) + >>> model.name + "gamma" + + Constructing a gamma distribution trait model with shape :math:`1`, + scale :math:`2`, and allow simulation of negative values. + + >>> model = tstrait.trait_model(distribution="gamma", shape=1, + >>> ... scale=2, negative=True) + >>> model.name + "gamma" + + Constructing a multivariate normal distribution trait model with + mean vector :math:`[0, 0]` and covariance matrix being an + identity matrix. + + >>> import numpy as np + >>> model = tstrait.trait_model(distribution="multi_normal", + >>> ... mean=np.zeros(2), cov=np.eye(2)) + >>> model.name + "multi_normal" + >>> model.num_trait + 2 """ distribution = _check_instance(distribution, "distribution", str) lower_model = distribution.lower()