tskit-dev · hanbin973 · Oct 3, 2024 · Oct 4, 2024 · Oct 7, 2024 · Oct 7, 2024
diff --git a/python/tskit/trees.py b/python/tskit/trees.py
@@ -8598,11 +8598,12 @@ def pca(
             n_components: int = 10,
             iterated_power: int = 3,
             n_oversamples: int = 10,
-            indices: np.ndarray = None,
+            samples: np.ndarray = None,
+            individuals: np.ndarray = None,
             centre: bool = True,
             windows = None,
             random_state: np.random.Generator = None,
-            ):
+            ) -> (np.ndarray, np.ndarray):
         """
         Run randomized singular value decomposition (rSVD) to obtain principal components.
         API partially adopted from `scikit-learn`: 
@@ -8611,7 +8612,8 @@ def pca(
         :param int n_components: Number of principal components
         :param int iterated_power: Number of power iteration of range finder
         :param int n_oversamples: Number of additional test vectors
-        :param np.ndarray indices: Indcies of individuals to perform rSVD
+        :param np.ndarray samples: Samples to perform PCA
+        :param np.ndarray individuals: Individuals to perform PCA
         :param bool centre: Centre the genetic relatedness matrix
         :param windows: ???
         :param np.random.Generator random_state: Random number generator
@@ -8663,18 +8665,16 @@ def _rand_svd(
             return U[:,:rank], D[:rank], V[:rank]
 
         def _genetic_relatedness_vector(
-                ts: tskit.Treesequence,
                 arr: np.ndarray,
                 rows: np.ndarray,
                 cols: np.ndarray,
-                centre: bool = False,
+                centre: bool = True,
                 windows = None,
                 ) -> np.ndarray:
             """
             Wrapper around `tskit.TreeSequence.genetic_relatedness_vector` to support centering in respect to individuals.
             Multiplies an array to the genetic relatedness matrix of :class:`tskit.TreeSequence`.
 
-            :param tskit.TreeSequence ts: A tree sequence.
             :param numpy.ndarray arr: The array to multiply. Either a vector or a matrix.
             :param numpy.ndarray rows: Index of rows of the genetic relatedness matrix to be selected.
             :param numpy.ndarray cols: Index of cols of the genetic relatedness matrix to be selected. The size should match the row length of `arr`.
@@ -8684,56 +8684,33 @@ def _genetic_relatedness_vector(
             :rtype: `np.ndarray`
             """
 
-            # maps samples to individuals
-            def sample_individual_sparray(ts: tskit.TreeSequence) -> scipy.sparse.sparray:
-                samples_individual = ts.nodes_individual[ts.samples()]
-                return scipy.sparse.csr_array(
-                        (
-                            np.ones(ts.num_samples),
-                            (np.arange(ts.num_samples), samples_individual)
-                        ),
-                        shape=(ts.num_samples, ts.num_individuals)
-                    )
-
-            # maps values in idx to num_individuals
-            def individual_idx_sparray(n: int, idx: np.ndarray) -> scipy.sparse.sparray:
-                return scipy.sparse.csr_array(
-                        (
-                            np.ones(idx.size),
-                            (idx, np.arange(idx.size))
-                        ),
-                        shape=(n, idx.size)
-                    )
-
             assert cols.size == arr.shape[0], "Dimension mismatch"
-            # centering
+            ij = np.vstack([[n,k] for k, i in enumerate(individuals) for n in self.individual(i).nodes])
+            samples, sample_individuals = ij[:,0], ij[:,1] # sample node index, individual of those nodes
             x = arr - arr.mean(axis=0) if centre else arr # centering within index in rows
-            x = individual_idx_sparray(ts.num_individuals, cols).dot(x)
-            x = sample_individual_sparray(ts).dot(x)
-            x = ts.genetic_relatedness_vector(W=x, windows=windows, mode="branch", centre=False)
-            x = sample_individual_sparray(ts).T.dot(x)
-            x = individual_idx_sparray(ts.num_individuals, rows).T.dot(x)
+            x = self.genetic_relatedness_vector(W=x[sample_individuals], windows=windows, mode="branch", centre=False, nodes=samples)
+            bincount_fn = lambda w: np.bincount(sample_individuals, w)
+            x = np.apply_along_axis(bincount_fn, axis=0, arr=x) # I think it should be axis=1, but axis=0 gives the correct values why?
             x = x - x.mean(axis=0) if centre else x # centering within index in cols
 
             return x
 
 
-        if indices is None: indices = np.array([i.id for i in self.individuals()])
         if random_state is None: random_state = np.random.default_rng()
+        if samples is None and individuals is None: samples = self.samples()
 
-        def _G(x):
-            return _genetic_relatedness_vector(
-                    self.ts,
-                    x,
-                    indices,
-                    indices,
-                    centre,
-                    windows
-                    )
+        if samples is not None and individuals is not None:
+            raise ValueError("samples and individuals cannot be used at the same time")
+        elif samples is not None:
+            _G = lambda x: self.genetic_relatedness_vector(x, windows=windows, mode="branch", centre=centre, nodes=samples)
+            dim = samples.size
+        elif individuals is not None:
+            _G = lambda x: _genetic_relatedness_vector(x, individuals, individuals, centre=centre, windows=windows)
+            dim = individuals.size
 
         U, D, _ = _rand_svd(
                 operator=_G,
-                operator_dim=indices.size,
+                operator_dim=dim,
                 rank=n_components,
                 depth=iterated_power,
                 num_vectors=n_components+n_oversamples,