From 76074ceebe803a22be39d19f3dbc77a4919d73af Mon Sep 17 00:00:00 2001
From: Gregor Gorjanc <gregor.gorjanc@gmail.com>
Date: Sat, 28 Sep 2024 10:12:44 +0100
Subject: [PATCH] Unifying stat method descriptions and linking to math
 equation summary section

---
 python/tskit/trees.py | 133 +++++++++++++++++++++++++++---------------
 1 file changed, 86 insertions(+), 47 deletions(-)

diff --git a/python/tskit/trees.py b/python/tskit/trees.py
index 3d4637829d..fbd1ebb204 100644
--- a/python/tskit/trees.py
+++ b/python/tskit/trees.py
@@ -7515,7 +7515,6 @@ def sample_count_stat(
         as sample sets will give ``f`` an argument of length two, giving the number
         of samples in ``A`` and ``B`` below the node in question. So, if we define
 
-
         .. code-block:: python
 
             def f(x):
@@ -7892,13 +7891,15 @@ def diversity(
     ):
         """
         Computes mean genetic diversity (also known as "pi") in each of the
-        sets of nodes from ``sample_sets``.  The statistic is also known as
+        sets of nodes from ``sample_sets``. The statistic is also known as
         "sample heterozygosity"; a common citation for the definition is
         `Nei and Li (1979) <https://doi.org/10.1073/pnas.76.10.5269>`_
         (equation 22), so it is sometimes called called "Nei's pi"
         (but also sometimes "Tajima's pi").
+        Please see the :ref:`summary functions <sec_stats_summary_functions>`
+        section for the exact definition of the calculated statistic.
 
-        Please see the :ref:`one-way statistics <sec_stats_sample_sets_one_way>`
+        See the :ref:`one-way statistics <sec_stats_sample_sets_one_way>`
         section for details on how the ``sample_sets`` argument is interpreted
         and how it interacts with the dimensions of the output array.
         See the :ref:`statistics interface <sec_stats_interface>` section for details on
@@ -7960,8 +7961,10 @@ def divergence(
         :math:`\pi_{XY}`. Note that the mean pairwise nucleotide diversity of a
         sample set to itself (computed by passing an index of the form (j,j))
         is its :meth:`diversity <.TreeSequence.diversity>` (see the note below).
+        Please see the :ref:`summary functions <sec_stats_summary_functions>`
+        section for the exact definition of the calculated statistic.
 
-        Operates on ``k = 2`` sample sets at a time; please see the
+        Operates on ``k = 2`` sample sets at a time; see the
         :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>`
         section for details on how the ``sample_sets`` and ``indexes`` arguments are
         interpreted and how they interact with the dimensions of the output array.
@@ -8246,6 +8249,9 @@ def genetic_relatedness(
         """
         Computes genetic relatedness between (and within) pairs of
         sets of nodes from ``sample_sets``.
+        Please see the :ref:`summary functions <sec_stats_summary_functions>`
+        section for the exact definition of the calculated statistic.
+
         Operates on ``k = 2`` sample sets at a time; please see the
         :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>`
         section for details on how the ``sample_sets`` and ``indexes`` arguments are
@@ -8478,8 +8484,12 @@ def genetic_relatedness_weighted(
         centre=True,
     ):
         r"""
-        Computes weighted genetic relatedness. If the :math:`k` th pair of indices
-        is (i, j) then the :math:`k` th column of output will be
+        Computes weighted genetic relatedness.
+        Please see the :ref:`summary functions <sec_stats_summary_functions>`
+        section for the exact definition of the calculated statistic.
+
+        If the :math:`k` th pair of indices is (i, j),
+        then the :math:`k` th column of output will be
         :math:`\sum_{a,b} W_{ai} W_{bj} C_{ab}`,
         where :math:`W` is the matrix of weights, and :math:`C_{ab}` is the
         :meth:`genetic_relatedness <.TreeSequence.genetic_relatedness>` between sample
@@ -8589,19 +8599,21 @@ def trait_covariance(self, W, windows=None, mode="site", span_normalise=True):
         """
         Computes the mean squared covariances between each of the columns of ``W``
         (the "phenotypes") and inheritance along the tree sequence.
-        See the :ref:`statistics interface <sec_stats_interface>` section for details on
-        :ref:`windows <sec_stats_windows>`,
-        :ref:`mode <sec_stats_mode>`,
-        :ref:`span normalise <sec_stats_span_normalise>`,
-        and :ref:`return value <sec_stats_output_format>`.
-        Operates on all samples in the tree sequence.
-
         Concretely, if `g` is a binary vector that indicates inheritance from an allele,
         branch, or node and `w` is a column of W, normalised to have mean zero,
         then the covariance of `g` and `w` is :math:`\\sum_i g_i w_i`, the sum of the
         weights corresponding to entries of `g` that are `1`. Since weights sum to
         zero, this is also equal to the sum of weights whose entries of `g` are 0.
         So, :math:`cov(g,w)^2 = ((\\sum_i g_i w_i)^2 + (\\sum_i (1-g_i) w_i)^2)/2`.
+        Please see the :ref:`summary functions <sec_stats_summary_functions>`
+        section for the exact definition of the calculated statistic.
+
+        Operates on all samples in the tree sequence.
+        See the :ref:`statistics interface <sec_stats_interface>` section for details on
+        :ref:`windows <sec_stats_windows>`,
+        :ref:`mode <sec_stats_mode>`,
+        :ref:`span normalise <sec_stats_span_normalise>`,
+        and :ref:`return value <sec_stats_output_format>`.
 
         What is computed depends on ``mode``:
 
@@ -8653,17 +8665,19 @@ def trait_correlation(self, W, windows=None, mode="site", span_normalise=True):
         """
         Computes the mean squared correlations between each of the columns of ``W``
         (the "phenotypes") and inheritance along the tree sequence.
+        This is computed as the squared covariance in
+        :meth:`trait_covariance <.TreeSequence.trait_covariance>`,
+        but divided by :math:`p (1-p)`, where :math:`p` is the proportion of samples
+        inheriting from the allele, branch, or node in question.
+        Please see the :ref:`summary functions <sec_stats_summary_functions>`
+        section for the exact definition of the calculated statistic.
+
+        Operates on all samples in the tree sequence.
         See the :ref:`statistics interface <sec_stats_interface>` section for details on
         :ref:`windows <sec_stats_windows>`,
         :ref:`mode <sec_stats_mode>`,
         :ref:`span normalise <sec_stats_span_normalise>`,
         and :ref:`return value <sec_stats_output_format>`.
-        Operates on all samples in the tree sequence.
-
-        This is computed as squared covariance in
-        :meth:`trait_covariance <.TreeSequence.trait_covariance>`,
-        but divided by :math:`p (1-p)`, where `p` is the proportion of samples
-        inheriting from the allele, branch, or node in question.
 
         What is computed depends on ``mode``:
 
@@ -8737,17 +8751,11 @@ def trait_linear_model(
     ):
         """
         Finds the relationship between trait and genotype after accounting for
-        covariates.  Concretely, for each trait w (i.e., each column of W),
+        covariates. Concretely, for each trait w (i.e., each column of W),
         this does a least-squares fit of the linear model :math:`w \\sim g + Z`,
         where :math:`g` is inheritance in the tree sequence (e.g., genotype)
         and the columns of :math:`Z` are covariates, and returns the squared
         coefficient of :math:`g` in this linear model.
-        See the :ref:`statistics interface <sec_stats_interface>` section for details on
-        :ref:`windows <sec_stats_windows>`,
-        :ref:`mode <sec_stats_mode>`,
-        :ref:`span normalise <sec_stats_span_normalise>`,
-        and :ref:`return value <sec_stats_output_format>`.
-        Operates on all samples in the tree sequence.
 
         To do this, if `g` is a binary vector that indicates inheritance from an allele,
         branch, or node and `w` is a column of W, there are :math:`k` columns of
@@ -8756,6 +8764,15 @@ def trait_linear_model(
         then this returns the number :math:`b_1^2`. If :math:`g` lies in the linear span
         of the columns of :math:`Z`, then :math:`b_1` is set to 0. To fit the
         linear model without covariates (only the intercept), set `Z = None`.
+        Please see the :ref:`summary functions <sec_stats_summary_functions>`
+        section for the exact definition of the calculated statistic.
+
+        Operates on all samples in the tree sequence.
+        See the :ref:`statistics interface <sec_stats_interface>` section for details on
+        :ref:`windows <sec_stats_windows>`,
+        :ref:`mode <sec_stats_mode>`,
+        :ref:`span normalise <sec_stats_span_normalise>`,
+        and :ref:`return value <sec_stats_output_format>`.
 
         What is computed depends on ``mode``:
 
@@ -8823,7 +8840,10 @@ def segregating_sites(
         """
         Computes the density of segregating sites for each of the sets of nodes
         from ``sample_sets``, and related quantities.
-        Please see the :ref:`one-way statistics <sec_stats_sample_sets_one_way>`
+        Please see the :ref:`summary functions <sec_stats_summary_functions>`
+        section for the exact definition of the calculated statistic.
+
+        See the :ref:`one-way statistics <sec_stats_sample_sets_one_way>`
         section for details on how the ``sample_sets`` argument is interpreted
         and how it interacts with the dimensions of the output array.
         See the :ref:`statistics interface <sec_stats_interface>` section for details on
@@ -8878,6 +8898,7 @@ def allele_frequency_spectrum(
         """
         Computes the allele frequency spectrum (AFS) in windows across the genome for
         with respect to the specified ``sample_sets``.
+
         See the :ref:`statistics interface <sec_stats_interface>` section for details on
         :ref:`sample sets <sec_stats_sample_sets>`,
         :ref:`windows <sec_stats_windows>`,
@@ -8977,14 +8998,7 @@ def allele_frequency_spectrum(
     def Tajimas_D(self, sample_sets=None, windows=None, mode="site"):
         """
         Computes Tajima's D of sets of nodes from ``sample_sets`` in windows.
-        Please see the :ref:`one-way statistics <sec_stats_sample_sets_one_way>`
-        section for details on how the ``sample_sets`` argument is interpreted
-        and how it interacts with the dimensions of the output array.
-        See the :ref:`statistics interface <sec_stats_interface>` section for details on
-        :ref:`windows <sec_stats_windows>`, :ref:`mode <sec_stats_mode>`,
-        and :ref:`return value <sec_stats_output_format>`.
-        Operates on ``k = 1`` sample sets at a
-        time. For a sample set ``X`` of ``n`` nodes, if and ``T`` is the mean
+        For a sample set ``X`` of ``n`` nodes, if ``T`` is the mean
         number of pairwise differing sites in ``X`` and ``S`` is the number of
         sites segregating in ``X`` (computed with :meth:`diversity
         <.TreeSequence.diversity>` and :meth:`segregating sites
@@ -9000,6 +9014,14 @@ def Tajimas_D(self, sample_sets=None, windows=None, mode="site"):
             b = 2 * (n**2 + n + 3) / (9 * n * (n - 1)) - (n + 2) / (h * n) + g / h**2
             c = h**2 + g
 
+        Operates on ``k = 1`` sample set at a time.
+        See the :ref:`one-way statistics <sec_stats_sample_sets_one_way>`
+        section for details on how the ``sample_sets`` argument is interpreted
+        and how it interacts with the dimensions of the output array.
+        See the :ref:`statistics interface <sec_stats_interface>` section for details on
+        :ref:`windows <sec_stats_windows>`, :ref:`mode <sec_stats_mode>`,
+        and :ref:`return value <sec_stats_output_format>`.
+
         What is computed for diversity and divergence depends on ``mode``;
         see those functions for more details.
 
@@ -9040,16 +9062,6 @@ def Fst(
     ):
         """
         Computes "windowed" Fst between pairs of sets of nodes from ``sample_sets``.
-        Operates on ``k = 2`` sample sets at a time; please see the
-        :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>`
-        section for details on how the ``sample_sets`` and ``indexes`` arguments are
-        interpreted and how they interact with the dimensions of the output array.
-        See the :ref:`statistics interface <sec_stats_interface>` section for details on
-        :ref:`windows <sec_stats_windows>`,
-        :ref:`mode <sec_stats_mode>`,
-        :ref:`span normalise <sec_stats_span_normalise>`,
-        and :ref:`return value <sec_stats_output_format>`.
-
         For sample sets ``X`` and ``Y``, if ``d(X, Y)`` is the
         :meth:`divergence <.TreeSequence.divergence>`
         between ``X`` and ``Y``, and ``d(X)`` is the
@@ -9060,6 +9072,16 @@ def Fst(
 
             Fst = 1 - 2 * (d(X) + d(Y)) / (d(X) + 2 * d(X, Y) + d(Y))
 
+        Operates on ``k = 2`` sample sets at a time; please see the
+        :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>`
+        section for details on how the ``sample_sets`` and ``indexes`` arguments are
+        interpreted and how they interact with the dimensions of the output array.
+        See the :ref:`statistics interface <sec_stats_interface>` section for details on
+        :ref:`windows <sec_stats_windows>`,
+        :ref:`mode <sec_stats_mode>`,
+        :ref:`span normalise <sec_stats_span_normalise>`,
+        and :ref:`return value <sec_stats_output_format>`.
+
         What is computed for diversity and divergence depends on ``mode``;
         see those functions for more details.
 
@@ -9149,6 +9171,9 @@ def Y3(
     ):
         """
         Computes the 'Y' statistic between triples of sets of nodes from ``sample_sets``.
+        Please see the :ref:`summary functions <sec_stats_summary_functions>`
+        section for the exact definition of the calculated statistic.
+
         Operates on ``k = 3`` sample sets at a time; please see the
         :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>`
         section for details on how the ``sample_sets`` and ``indexes`` arguments are
@@ -9202,6 +9227,9 @@ def Y2(
     ):
         """
         Computes the 'Y2' statistic between pairs of sets of nodes from ``sample_sets``.
+        Please see the :ref:`summary functions <sec_stats_summary_functions>`
+        section for the exact definition of the calculated statistic.
+
         Operates on ``k = 2`` sample sets at a time; please see the
         :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>`
         section for details on how the ``sample_sets`` and ``indexes`` arguments are
@@ -9245,14 +9273,17 @@ def Y1(self, sample_sets, windows=None, mode="site", span_normalise=True):
         """
         Computes the 'Y1' statistic within each of the sets of nodes given by
         ``sample_sets``.
-        Please see the :ref:`one-way statistics <sec_stats_sample_sets_one_way>`
+        Please see the :ref:`summary functions <sec_stats_summary_functions>`
+        section for the exact definition of the calculated statistic.
+
+        Operates on ``k = 1`` sample set at a time.
+        See the :ref:`one-way statistics <sec_stats_sample_sets_one_way>`
         section for details on how the ``sample_sets`` argument is interpreted
         and how it interacts with the dimensions of the output array.
         See the :ref:`statistics interface <sec_stats_interface>` section for details on
         :ref:`windows <sec_stats_windows>`, :ref:`mode <sec_stats_mode>`,
         :ref:`span normalise <sec_stats_span_normalise>`,
         and :ref:`return value <sec_stats_output_format>`.
-        Operates on ``k = 1`` sample set at a time.
 
         What is computed depends on ``mode``. Each is computed exactly as
         ``Y3``, except that the average is across every possible trio of samples
@@ -9284,6 +9315,9 @@ def f4(
         """
         Computes Patterson's f4 statistic between four groups of nodes from
         ``sample_sets``.
+        Please see the :ref:`summary functions <sec_stats_summary_functions>`
+        section for the exact definition of the calculated statistic.
+
         Operates on ``k = 4`` sample sets at a time; please see the
         :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>`
         section for details on how the ``sample_sets`` and ``indexes`` arguments are
@@ -9351,6 +9385,8 @@ def f3(
         is usually placed as population ``A`` (see
         `Peter (2016) <https://doi.org/10.1534/genetics.115.183913>`_
         for more discussion).
+        Please see the :ref:`summary functions <sec_stats_summary_functions>`
+        section for the exact definition of the calculated statistic.
 
         Operates on ``k = 3`` sample sets at a time; please see the
         :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>`
@@ -9396,6 +9432,9 @@ def f2(
         """
         Computes Patterson's f2 statistic between two groups of nodes from
         ``sample_sets``.
+        Please see the :ref:`summary functions <sec_stats_summary_functions>`
+        section for the exact definition of the calculated statistic.
+
         Operates on ``k = 2`` sample sets at a time; please see the
         :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>`
         section for details on how the ``sample_sets`` and ``indexes`` arguments are