Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

randomized svd draft #3008

Draft
wants to merge 18 commits into
base: main
Choose a base branch
from
Draft
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 21 additions & 44 deletions python/tskit/trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -8598,11 +8598,12 @@ def pca(
n_components: int = 10,
iterated_power: int = 3,
n_oversamples: int = 10,
indices: np.ndarray = None,
samples: np.ndarray = None,
individuals: np.ndarray = None,
centre: bool = True,
windows = None,
random_state: np.random.Generator = None,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

usually we just pass in a seed, any objections to doing that, instead?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed the option from random_state to random_seed following msprime.

):
) -> (np.ndarray, np.ndarray):
"""
Run randomized singular value decomposition (rSVD) to obtain principal components.
API partially adopted from `scikit-learn`:
Expand All @@ -8611,7 +8612,8 @@ def pca(
:param int n_components: Number of principal components
:param int iterated_power: Number of power iteration of range finder
:param int n_oversamples: Number of additional test vectors
:param np.ndarray indices: Indcies of individuals to perform rSVD
:param np.ndarray samples: Samples to perform PCA
:param np.ndarray individuals: Individuals to perform PCA
:param bool centre: Centre the genetic relatedness matrix
:param windows: ???
:param np.random.Generator random_state: Random number generator
Expand Down Expand Up @@ -8663,18 +8665,16 @@ def _rand_svd(
return U[:,:rank], D[:rank], V[:rank]

def _genetic_relatedness_vector(
ts: tskit.Treesequence,
arr: np.ndarray,
rows: np.ndarray,
cols: np.ndarray,
centre: bool = False,
centre: bool = True,
windows = None,
) -> np.ndarray:
"""
Wrapper around `tskit.TreeSequence.genetic_relatedness_vector` to support centering in respect to individuals.
Multiplies an array to the genetic relatedness matrix of :class:`tskit.TreeSequence`.
:param tskit.TreeSequence ts: A tree sequence.
:param numpy.ndarray arr: The array to multiply. Either a vector or a matrix.
:param numpy.ndarray rows: Index of rows of the genetic relatedness matrix to be selected.
:param numpy.ndarray cols: Index of cols of the genetic relatedness matrix to be selected. The size should match the row length of `arr`.
Expand All @@ -8684,56 +8684,33 @@ def _genetic_relatedness_vector(
:rtype: `np.ndarray`
"""

# maps samples to individuals
def sample_individual_sparray(ts: tskit.TreeSequence) -> scipy.sparse.sparray:
samples_individual = ts.nodes_individual[ts.samples()]
return scipy.sparse.csr_array(
(
np.ones(ts.num_samples),
(np.arange(ts.num_samples), samples_individual)
),
shape=(ts.num_samples, ts.num_individuals)
)

# maps values in idx to num_individuals
def individual_idx_sparray(n: int, idx: np.ndarray) -> scipy.sparse.sparray:
return scipy.sparse.csr_array(
(
np.ones(idx.size),
(idx, np.arange(idx.size))
),
shape=(n, idx.size)
)

assert cols.size == arr.shape[0], "Dimension mismatch"
# centering
ij = np.vstack([[n,k] for k, i in enumerate(individuals) for n in self.individual(i).nodes])
samples, sample_individuals = ij[:,0], ij[:,1] # sample node index, individual of those nodes
x = arr - arr.mean(axis=0) if centre else arr # centering within index in rows
x = individual_idx_sparray(ts.num_individuals, cols).dot(x)
x = sample_individual_sparray(ts).dot(x)
x = ts.genetic_relatedness_vector(W=x, windows=windows, mode="branch", centre=False)
x = sample_individual_sparray(ts).T.dot(x)
x = individual_idx_sparray(ts.num_individuals, rows).T.dot(x)
x = self.genetic_relatedness_vector(W=x[sample_individuals], windows=windows, mode="branch", centre=False, nodes=samples)
bincount_fn = lambda w: np.bincount(sample_individuals, w)
x = np.apply_along_axis(bincount_fn, axis=0, arr=x) # I think it should be axis=1, but axis=0 gives the correct values why?
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The matvec is sometimes GRM * matrix, so x is often a matrix than a vector. np.bincount only works for 1-dimensional weights, so I used np.apply_along_axis and lambda to vectorize np.bincount.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment I left after # looks like mostly a convention issue in that function. When axis=0, the columns are separately retrieved from the array. When axis=1, the rows are retrieved.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree, that seems confusing but maybe makes sense after all?

x = x - x.mean(axis=0) if centre else x # centering within index in cols

return x


if indices is None: indices = np.array([i.id for i in self.individuals()])
if random_state is None: random_state = np.random.default_rng()
if samples is None and individuals is None: samples = self.samples()

def _G(x):
return _genetic_relatedness_vector(
self.ts,
x,
indices,
indices,
centre,
windows
)
if samples is not None and individuals is not None:
raise ValueError("samples and individuals cannot be used at the same time")
elif samples is not None:
_G = lambda x: self.genetic_relatedness_vector(x, windows=windows, mode="branch", centre=centre, nodes=samples)
dim = samples.size
elif individuals is not None:
_G = lambda x: _genetic_relatedness_vector(x, individuals, individuals, centre=centre, windows=windows)
dim = individuals.size

U, D, _ = _rand_svd(
petrelharp marked this conversation as resolved.
Show resolved Hide resolved
operator=_G,
operator_dim=indices.size,
operator_dim=dim,
rank=n_components,
depth=iterated_power,
num_vectors=n_components+n_oversamples,
Expand Down
Loading