From 76074ceebe803a22be39d19f3dbc77a4919d73af Mon Sep 17 00:00:00 2001 From: Gregor Gorjanc Date: Sat, 28 Sep 2024 10:12:44 +0100 Subject: [PATCH] Unifying stat method descriptions and linking to math equation summary section --- python/tskit/trees.py | 133 +++++++++++++++++++++++++++--------------- 1 file changed, 86 insertions(+), 47 deletions(-) diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 3d4637829d..fbd1ebb204 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -7515,7 +7515,6 @@ def sample_count_stat( as sample sets will give ``f`` an argument of length two, giving the number of samples in ``A`` and ``B`` below the node in question. So, if we define - .. code-block:: python def f(x): @@ -7892,13 +7891,15 @@ def diversity( ): """ Computes mean genetic diversity (also known as "pi") in each of the - sets of nodes from ``sample_sets``. The statistic is also known as + sets of nodes from ``sample_sets``. The statistic is also known as "sample heterozygosity"; a common citation for the definition is `Nei and Li (1979) <https://doi.org/10.1073/pnas.76.10.5269>`_ (equation 22), so it is sometimes called called "Nei's pi" (but also sometimes "Tajima's pi"). + Please see the :ref:`summary functions <sec_stats_summary_functions>` + section for the exact definition of the calculated statistic. - Please see the :ref:`one-way statistics <sec_stats_sample_sets_one_way>` + See the :ref:`one-way statistics <sec_stats_sample_sets_one_way>` section for details on how the ``sample_sets`` argument is interpreted and how it interacts with the dimensions of the output array. See the :ref:`statistics interface <sec_stats_interface>` section for details on @@ -7960,8 +7961,10 @@ def divergence( :math:`\pi_{XY}`. Note that the mean pairwise nucleotide diversity of a sample set to itself (computed by passing an index of the form (j,j)) is its :meth:`diversity <.TreeSequence.diversity>` (see the note below). + Please see the :ref:`summary functions <sec_stats_summary_functions>` + section for the exact definition of the calculated statistic. 
- Operates on ``k = 2`` sample sets at a time; please see the + Operates on ``k = 2`` sample sets at a time; see the :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>` section for details on how the ``sample_sets`` and ``indexes`` arguments are interpreted and how they interact with the dimensions of the output array. @@ -8246,6 +8249,9 @@ def genetic_relatedness( """ Computes genetic relatedness between (and within) pairs of sets of nodes from ``sample_sets``. + Please see the :ref:`summary functions <sec_stats_summary_functions>` + section for the exact definition of the calculated statistic. + Operates on ``k = 2`` sample sets at a time; please see the :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>` section for details on how the ``sample_sets`` and ``indexes`` arguments are @@ -8478,8 +8484,12 @@ def genetic_relatedness_weighted( centre=True, ): r""" - Computes weighted genetic relatedness. If the :math:`k` th pair of indices - is (i, j) then the :math:`k` th column of output will be + Computes weighted genetic relatedness. + Please see the :ref:`summary functions <sec_stats_summary_functions>` + section for the exact definition of the calculated statistic. + + If the :math:`k` th pair of indices is (i, j), + then the :math:`k` th column of output will be :math:`\sum_{a,b} W_{ai} W_{bj} C_{ab}`, where :math:`W` is the matrix of weights, and :math:`C_{ab}` is the :meth:`genetic_relatedness <.TreeSequence.genetic_relatedness>` between sample @@ -8589,19 +8599,21 @@ def trait_covariance(self, W, windows=None, mode="site", span_normalise=True): """ Computes the mean squared covariances between each of the columns of ``W`` (the "phenotypes") and inheritance along the tree sequence. - See the :ref:`statistics interface <sec_stats_interface>` section for details on - :ref:`windows <sec_stats_windows>`, - :ref:`mode <sec_stats_mode>`, - :ref:`span normalise <sec_stats_span_normalise>`, - and :ref:`return value <sec_stats_output_format>`. - Operates on all samples in the tree sequence. 
- Concretely, if `g` is a binary vector that indicates inheritance from an allele, branch, or node and `w` is a column of W, normalised to have mean zero, then the covariance of `g` and `w` is :math:`\\sum_i g_i w_i`, the sum of the weights corresponding to entries of `g` that are `1`. Since weights sum to zero, this is also equal to the sum of weights whose entries of `g` are 0. So, :math:`cov(g,w)^2 = ((\\sum_i g_i w_i)^2 + (\\sum_i (1-g_i) w_i)^2)/2`. + Please see the :ref:`summary functions <sec_stats_summary_functions>` + section for the exact definition of the calculated statistic. + + Operates on all samples in the tree sequence. + See the :ref:`statistics interface <sec_stats_interface>` section for details on + :ref:`windows <sec_stats_windows>`, + :ref:`mode <sec_stats_mode>`, + :ref:`span normalise <sec_stats_span_normalise>`, + and :ref:`return value <sec_stats_output_format>`. What is computed depends on ``mode``: @@ -8653,17 +8665,19 @@ def trait_correlation(self, W, windows=None, mode="site", span_normalise=True): """ Computes the mean squared correlations between each of the columns of ``W`` (the "phenotypes") and inheritance along the tree sequence. + This is computed as squared covariance in + :meth:`trait_covariance <.TreeSequence.trait_covariance>`, + but divided by :math:`p (1-p)`, where `p` is the proportion of samples + inheriting from the allele, branch, or node in question. + Please see the :ref:`summary functions <sec_stats_summary_functions>` + section for the exact definition of the calculated statistic. + + Operates on all samples in the tree sequence. See the :ref:`statistics interface <sec_stats_interface>` section for details on :ref:`windows <sec_stats_windows>`, :ref:`mode <sec_stats_mode>`, :ref:`span normalise <sec_stats_span_normalise>`, and :ref:`return value <sec_stats_output_format>`. - Operates on all samples in the tree sequence. - - This is computed as squared covariance in - :meth:`trait_covariance <.TreeSequence.trait_covariance>`, - but divided by :math:`p (1-p)`, where `p` is the proportion of samples - inheriting from the allele, branch, or node in question. 
What is computed depends on ``mode``: @@ -8737,17 +8751,11 @@ def trait_linear_model( ): """ Finds the relationship between trait and genotype after accounting for - covariates. Concretely, for each trait w (i.e., each column of W), + covariates. Concretely, for each trait w (i.e., each column of W), this does a least-squares fit of the linear model :math:`w \\sim g + Z`, where :math:`g` is inheritance in the tree sequence (e.g., genotype) and the columns of :math:`Z` are covariates, and returns the squared coefficient of :math:`g` in this linear model. - See the :ref:`statistics interface <sec_stats_interface>` section for details on - :ref:`windows <sec_stats_windows>`, - :ref:`mode <sec_stats_mode>`, - :ref:`span normalise <sec_stats_span_normalise>`, - and :ref:`return value <sec_stats_output_format>`. - Operates on all samples in the tree sequence. To do this, if `g` is a binary vector that indicates inheritance from an allele, branch, or node and `w` is a column of W, there are :math:`k` columns of @@ -8756,6 +8764,15 @@ def trait_linear_model( then this returns the number :math:`b_1^2`. If :math:`g` lies in the linear span of the columns of :math:`Z`, then :math:`b_1` is set to 0. To fit the linear model without covariates (only the intercept), set `Z = None`. + Please see the :ref:`summary functions <sec_stats_summary_functions>` + section for the exact definition of the calculated statistic. + + Operates on all samples in the tree sequence. + See the :ref:`statistics interface <sec_stats_interface>` section for details on + :ref:`windows <sec_stats_windows>`, + :ref:`mode <sec_stats_mode>`, + :ref:`span normalise <sec_stats_span_normalise>`, + and :ref:`return value <sec_stats_output_format>`. What is computed depends on ``mode``: @@ -8823,7 +8840,10 @@ def segregating_sites( """ Computes the density of segregating sites for each of the sets of nodes from ``sample_sets``, and related quantities. - Please see the :ref:`one-way statistics <sec_stats_sample_sets_one_way>` + Please see the :ref:`summary functions <sec_stats_summary_functions>` + section for the exact definition of the calculated statistic. 
+ See the :ref:`one-way statistics <sec_stats_sample_sets_one_way>` section for details on how the ``sample_sets`` argument is interpreted and how it interacts with the dimensions of the output array. See the :ref:`statistics interface <sec_stats_interface>` section for details on @@ -8878,6 +8898,7 @@ def allele_frequency_spectrum( """ Computes the allele frequency spectrum (AFS) in windows across the genome for with respect to the specified ``sample_sets``. + See the :ref:`statistics interface <sec_stats_interface>` section for details on :ref:`sample sets <sec_stats_sample_sets>`, :ref:`windows <sec_stats_windows>`, @@ -8977,14 +8998,7 @@ def allele_frequency_spectrum( def Tajimas_D(self, sample_sets=None, windows=None, mode="site"): """ Computes Tajima's D of sets of nodes from ``sample_sets`` in windows. - Please see the :ref:`one-way statistics <sec_stats_sample_sets_one_way>` - section for details on how the ``sample_sets`` argument is interpreted - and how it interacts with the dimensions of the output array. - See the :ref:`statistics interface <sec_stats_interface>` section for details on - :ref:`windows <sec_stats_windows>`, :ref:`mode <sec_stats_mode>`, - and :ref:`return value <sec_stats_output_format>`. - Operates on ``k = 1`` sample sets at a - time. For a sample set ``X`` of ``n`` nodes, if and ``T`` is the mean + For a sample set ``X`` of ``n`` nodes, if ``T`` is the mean number of pairwise differing sites in ``X`` and ``S`` is the number of sites segregating in ``X`` (computed with :meth:`diversity <.TreeSequence.diversity>` and :meth:`segregating sites @@ -9000,6 +9014,14 @@ def Tajimas_D(self, sample_sets=None, windows=None, mode="site"): b = 2 * (n**2 + n + 3) / (9 * n * (n - 1)) - (n + 2) / (h * n) + g / h**2 c = h**2 + g + Operates on ``k = 1`` sample set at a time. + Please see the :ref:`one-way statistics <sec_stats_sample_sets_one_way>` + section for details on how the ``sample_sets`` argument is interpreted + and how it interacts with the dimensions of the output array. + See the :ref:`statistics interface <sec_stats_interface>` section for details on + :ref:`windows <sec_stats_windows>`, :ref:`mode <sec_stats_mode>`, + and :ref:`return value <sec_stats_output_format>`. 
+ What is computed for diversity and divergence depends on ``mode``; see those functions for more details. @@ -9040,16 +9062,6 @@ def Fst( ): """ Computes "windowed" Fst between pairs of sets of nodes from ``sample_sets``. - Operates on ``k = 2`` sample sets at a time; please see the - :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>` - section for details on how the ``sample_sets`` and ``indexes`` arguments are - interpreted and how they interact with the dimensions of the output array. - See the :ref:`statistics interface <sec_stats_interface>` section for details on - :ref:`windows <sec_stats_windows>`, - :ref:`mode <sec_stats_mode>`, - :ref:`span normalise <sec_stats_span_normalise>`, - and :ref:`return value <sec_stats_output_format>`. - For sample sets ``X`` and ``Y``, if ``d(X, Y)`` is the :meth:`divergence <.TreeSequence.divergence>` between ``X`` and ``Y``, and ``d(X)`` is the @@ -9060,6 +9072,16 @@ def Fst( Fst = 1 - 2 * (d(X) + d(Y)) / (d(X) + 2 * d(X, Y) + d(Y)) + Operates on ``k = 2`` sample sets at a time; please see the + :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>` + section for details on how the ``sample_sets`` and ``indexes`` arguments are + interpreted and how they interact with the dimensions of the output array. + See the :ref:`statistics interface <sec_stats_interface>` section for details on + :ref:`windows <sec_stats_windows>`, + :ref:`mode <sec_stats_mode>`, + :ref:`span normalise <sec_stats_span_normalise>`, + and :ref:`return value <sec_stats_output_format>`. + What is computed for diversity and divergence depends on ``mode``; see those functions for more details. @@ -9149,6 +9171,9 @@ def Y3( ): """ Computes the 'Y' statistic between triples of sets of nodes from ``sample_sets``. + Please see the :ref:`summary functions <sec_stats_summary_functions>` + section for the exact definition of the calculated statistic. + Operates on ``k = 3`` sample sets at a time; please see the :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>` section for details on how the ``sample_sets`` and ``indexes`` arguments are @@ -9202,6 +9227,9 @@ def Y2( ): """ Computes the 'Y2' statistic between pairs of sets of nodes from ``sample_sets``. + Please see the :ref:`summary functions <sec_stats_summary_functions>` + section for the exact definition of the calculated statistic. 
+ Operates on ``k = 2`` sample sets at a time; please see the :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>` section for details on how the ``sample_sets`` and ``indexes`` arguments are @@ -9245,14 +9273,17 @@ def Y1(self, sample_sets, windows=None, mode="site", span_normalise=True): """ Computes the 'Y1' statistic within each of the sets of nodes given by ``sample_sets``. - Please see the :ref:`one-way statistics <sec_stats_sample_sets_one_way>` + Please see the :ref:`summary functions <sec_stats_summary_functions>` + section for the exact definition of the calculated statistic. + + Operates on ``k = 1`` sample set at a time. + See the :ref:`one-way statistics <sec_stats_sample_sets_one_way>` section for details on how the ``sample_sets`` argument is interpreted and how it interacts with the dimensions of the output array. See the :ref:`statistics interface <sec_stats_interface>` section for details on :ref:`windows <sec_stats_windows>`, :ref:`mode <sec_stats_mode>`, :ref:`span normalise <sec_stats_span_normalise>`, and :ref:`return value <sec_stats_output_format>`. - Operates on ``k = 1`` sample set at a time. What is computed depends on ``mode``. Each is computed exactly as ``Y3``, except that the average is across every possible trio of samples @@ -9284,6 +9315,9 @@ def f4( """ Computes Patterson's f4 statistic between four groups of nodes from ``sample_sets``. + Please see the :ref:`summary functions <sec_stats_summary_functions>` + section for the exact definition of the calculated statistic. + Operates on ``k = 4`` sample sets at a time; please see the :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>` section for details on how the ``sample_sets`` and ``indexes`` arguments are @@ -9351,6 +9385,8 @@ def f3( is usually placed as population ``A`` (see `Peter (2016) <https://doi.org/10.1534/genetics.115.183913>`_ for more discussion). + Please see the :ref:`summary functions <sec_stats_summary_functions>` + section for the exact definition of the calculated statistic. Operates on ``k = 3`` sample sets at a time; please see the :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>` @@ -9396,6 +9432,9 @@ def f2( """ Computes Patterson's f2 statistic between two groups of nodes from ``sample_sets``. + Please see the :ref:`summary functions <sec_stats_summary_functions>` + section for the exact definition of the calculated statistic. 
+ Operates on ``k = 2`` sample sets at a time; please see the :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>` section for details on how the ``sample_sets`` and ``indexes`` arguments are