-
Notifications
You must be signed in to change notification settings - Fork 0
/
spasne.py
268 lines (218 loc) · 10.1 KB
/
spasne.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
#!/usr/bin/env python
'''
This open-source software is for implementing the SpaSNE algorithm.
Examples:
spasne = spasne.run_spasne(data, pixels = pixels, alpha = alpha, beta = beta)
Parameter List:
Required parameters:
data: The cell-gene matrix
Important default parameters:
# Although these parameters are default, it is recommended to set them.
pixels: The cell-pixel matrix.
alpha: The weight parameter for the global term.
By default, alpha = 8.
beta: The weight parameter for the spatial term.
By default, alpha = 2 when there is the "pixels" input.
Other default parameters:
exaggeration = 4
no_dims=2 # number of dimensions
perplexity=50
theta=0.5
randseed=-1 # When randseed is -1, it means randseed is not set.
verbose = False
initial_dims = 0 # initial dimensions for PCA.
By default, initial_dims = min(data.shape[1], 200).
use_pca = True # Use PCA (True) or not (False).
By default, the use_pca is set to True.
max_iter =1000 # maximum number of iterations
Paper: Dimensionality reduction for visualizing spatially resolved profiling
data using SpaSNE.
Please contact our team if you have any questions:
Yuansheng Zhou ([email protected])
Chen Tang ([email protected])
Xue Xiao ([email protected])
Lin Xu ([email protected])
Please contact Chen Tang for programming questions about the spasne.py and *.cpp
files.
Version: 07/10/2022
Please see the "LICENSE" file for the copyright information.
Notice: This SpaSNE software is adapted from the bhtsne code
(github.com/lvdmaaten/bhtsne).
Please see the "LICENSE" file for copyright details of the bhtsne
software. The implementation of the bhtsne software is described in the
publication "Accelerating t-SNE using Tree-Based Algorithms"
(https://jmlr.org/papers/v15/vandermaaten14a.html).
'''
import pandas as pd
from argparse import ArgumentParser, FileType
from os.path import abspath, dirname, isfile, join as path_join
from shutil import rmtree
from struct import calcsize, pack, unpack
from subprocess import Popen
from sys import stderr, stdin, stdout
from tempfile import mkdtemp
from platform import system
from os import devnull
import numpy as np
import os, sys
import io
### Constants
IS_WINDOWS = True if system() == 'Windows' else False
SPASNE_BIN_PATH = path_join(dirname(__file__), 'windows', 'spasne.exe') if IS_WINDOWS else path_join(dirname(__file__), 'spasne')
assert isfile(SPASNE_BIN_PATH), ('The "{}.py" is unable to find the "spasne" '
'binary in its directory. Please run "gmake" in that directory to generate '
'the "spasne" binary.').format(SPASNE_BIN_PATH)
# Default hyper-parameter values from van der Maaten (2014)
# https://lvdmaaten.github.io/publications/papers/JMLR_2014.pdf (Experimental Setup, page 13)
DEFAULT_NO_DIMS = 2
INITIAL_DIMENSIONS = 50
DEFAULT_PERPLEXITY = 30
DEFAULT_THETA = 0.5
EMPTY_SEED = -1
DEFAULT_USE_PCA = True
DEFAULT_MAX_ITERATIONS = 1000
###
def _argparse():
argparse = ArgumentParser('spasne Python wrapper')
argparse.add_argument('-d', '--no_dims', type=int,
default=DEFAULT_NO_DIMS)
argparse.add_argument('-p', '--perplexity', type=float,
default=DEFAULT_PERPLEXITY)
# 0.0 for theta is equivalent to vanilla t-SNE
argparse.add_argument('-t', '--theta', type=float, default=DEFAULT_THETA)
argparse.add_argument('-r', '--randseed', type=int, default=EMPTY_SEED)
argparse.add_argument('-n', '--initial_dims', type=int, default=INITIAL_DIMENSIONS)
argparse.add_argument('-v', '--verbose', action='store_true')
argparse.add_argument('-i', '--input', type=FileType('r'), default=stdin)
argparse.add_argument('-o', '--output', type=FileType('w'),
default=stdout)
argparse.add_argument('--use_pca', action='store_true')
argparse.add_argument('--no_pca', dest='use_pca', action='store_false')
argparse.set_defaults(use_pca=DEFAULT_USE_PCA)
argparse.add_argument('-m', '--max_iter', type=int, default=DEFAULT_MAX_ITERATIONS)
return argparse
def _read_unpack(fmt, fh):
return unpack(fmt, fh.read(calcsize(fmt)))
def _is_filelike_object(f):
try:
return isinstance(f, (file, io.IOBase))
except NameError:
# 'file' is not a class in python3
return isinstance(f, io.IOBase)
def init_spasne(samples, pixels, alpha, beta, exaggeration, workdir, no_dims=DEFAULT_NO_DIMS, initial_dims=INITIAL_DIMENSIONS, perplexity=DEFAULT_PERPLEXITY,
theta=DEFAULT_THETA, randseed=EMPTY_SEED, verbose=False, use_pca=DEFAULT_USE_PCA, max_iter=DEFAULT_MAX_ITERATIONS):
if use_pca:
samples = samples - np.mean(samples, axis=0)
cov_x = np.dot(np.transpose(samples), samples)
[eig_val, eig_vec] = np.linalg.eig(cov_x)
# sorting the eigen-values in the descending order
eig_vec = eig_vec[:, eig_val.argsort()[::-1]]
if initial_dims > len(eig_vec):
initial_dims = len(eig_vec)
# truncating the eigen-vectors matrix to keep the most important vectors
eig_vec = np.real(eig_vec[:, :initial_dims])
samples = np.dot(samples, eig_vec)
else:
samples = np.dot(samples, 1.0)
# Assume that the dimensionality of the first sample is representative for
# the whole batch
sample_dim = len(samples[0])
sample_count = len(samples)
pixels = np.dot(pixels, 1.0)
pixels_dim = len(pixels[0])
pixels_count = len(pixels)
# Note: The binary format used by spasne is roughly the same as for
# vanilla tsne
with open(path_join(workdir, 'data.dat'), 'wb') as data_file:
# Write the spasne header
data_file.write(pack('iiddii', sample_count, sample_dim, theta, perplexity, no_dims, max_iter))
# Then write the data
for sample in samples:
data_file.write(pack('{}d'.format(len(sample)), *sample))
data_file.write(pack('iii', randseed, pixels_count, pixels_dim))
for pixel in pixels:
data_file.write(pack('{}d'.format(len(pixel)), *pixel))
data_file.write(pack('ddd', alpha, beta, exaggeration))
def load_data(input_file):
# Read the data, using numpy's good judgement
return np.loadtxt(input_file)
def spasne(workdir, verbose=False):
# Call spasne and let it do its thing
with open(devnull, 'w') as dev_null:
spasne_p = Popen((abspath(SPASNE_BIN_PATH), ), cwd=workdir,
# spasne is very noisy on stdout, tell it to use stderr
# if it is to print any output
stdout=stderr if verbose else dev_null)
spasne_p.wait()
assert not spasne_p.returncode, ('ERROR: Call to spasne exited '
'with a non-zero return code exit status, please ' +
('enable verbose mode and ' if not verbose else '') +
'refer to the spasne output for further details')
# Read and pass on the results
with open(path_join(workdir, 'result.dat'), 'rb') as output_file:
# The first two integers are just the number of samples and the
# dimensionality
result_samples, result_dims = _read_unpack('ii', output_file)
# Collect the results, but they may be out of order
results = [_read_unpack('{}d'.format(result_dims), output_file)
for _ in range(result_samples)]
# Now collect the landmark data so that we can return the data in
# the order it arrived
results = [(_read_unpack('i', output_file), e) for e in results]
# Put the results in order and yield it
results.sort()
for _, result in results:
yield result
# The last piece of data is the cost for each sample, we ignore it
#read_unpack('{}d'.format(sample_count), output_file)
def run_spasne(data, pixels = None, alpha = 8.0, beta = 2.0, exaggeration = 4, no_dims=2, perplexity=50, theta=0.5, randseed=-1, verbose=False, initial_dims=0, use_pca=True, max_iter=1000):
if initial_dims == 0:
initial_dims = min(data.shape[1], 200)
if pixels is None:
pixels_count = data.shape[0]
pixels = pd.DataFrame(np.zeros((pixels_count,0)))
pixels.index = data.index
pixels['x_pixel'] = 10000.0
pixels['y_pixel'] = 10000.0
beta = 0.0
# spasne works with fixed input and output paths, give it a temporary
# directory to work in so we don't clutter the filesystem
tmp_dir_path = mkdtemp()
# print(tmp_dir_path)
# Load data in forked process to free memory for actual spasne calculation
child_pid = os.fork()
if child_pid == 0:
if _is_filelike_object(data):
data = load_data(data)
init_spasne(data, pixels, alpha, beta, exaggeration, tmp_dir_path, no_dims=no_dims, perplexity=perplexity, theta=theta, randseed=randseed,verbose=verbose, initial_dims=initial_dims, use_pca=use_pca, max_iter=max_iter)
os._exit(0)
else:
try:
os.waitpid(child_pid, 0)
except KeyboardInterrupt:
print("Please run this program directly from python and not from ipython or jupyter.")
print("This is an issue due to asynchronous error handling.")
res = []
for result in spasne(tmp_dir_path, verbose):
sample_res = []
for r in result:
sample_res.append(r)
res.append(sample_res)
rmtree(tmp_dir_path)
return np.asarray(res, dtype='float64')
def main(args):
parser = _argparse()
if len(args) <= 1:
print(parser.print_help())
return
argp = parser.parse_args(args[1:])
for result in run_spasne(argp.input, no_dims=argp.no_dims, perplexity=argp.perplexity, theta=argp.theta, randseed=argp.randseed,
verbose=argp.verbose, initial_dims=argp.initial_dims, use_pca=argp.use_pca, max_iter=argp.max_iter):
fmt = ''
for i in range(1, len(result)):
fmt = fmt + '{}\t'
fmt = fmt + '{}\n'
argp.output.write(fmt.format(*result))
if __name__ == '__main__':
from sys import argv
exit(main(argv))