-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnature-data-annotation-embedding.py
71 lines (64 loc) · 3.8 KB
/
nature-data-annotation-embedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import jscatter
import numpy as np
import pandas as pd
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from time import time
from cmaps import glasbey_dark
from umap import UMAP
import matplotlib.pyplot as plt
# Load the per-cell table (path is relative to the working directory).
data = pd.read_csv('data/Nature_Tumor_output.csv')

# Markers whose expression columns are used below. For every marker the table
# is also expected to provide a `<marker>_faust_annotation` column.
markers = [
    'CD4', 'CD8', 'CD3', 'CD45RA', 'CD27', 'CD19',
    'CD103', 'CD28', 'CD69', 'PD1', 'HLADR', 'GranzymeB',
    'CD25', 'ICOS', 'TCRgd', 'CD38', 'CD127', 'Tim3',
]
raw_expression = data[markers].values

# Build the complete FAUST label of each row by concatenating, in marker
# order, the marker name followed by that marker's annotation value.
complete_label = None
for marker_name in markers:
    label_piece = marker_name + data[f'{marker_name}_faust_annotation']
    if complete_label is None:
        complete_label = label_piece
    else:
        complete_label = complete_label + label_piece
data['complete_faust_label'] = complete_label

# Notebook-style preview; the result is discarded when run as a script.
data[['faustLabels', 'complete_faust_label']].head(5)

# Distinct annotation values observed for CD3.
expression_levels = list(data['CD3_faust_annotation'].unique())

# Distinct complete labels, in order of first appearance.
faust_labels = data['complete_faust_label'].unique()
num_faust_labels = len(faust_labels)

# Timestamp placeholder used by the progress output of the transform loop.
t = 0
std_scaler = StandardScaler()

# Offset added to the normalized expression values per annotation value.
expression_level_translation = {'-': 0, '+': 1000}
# Transform the raw marker expressions into the feature matrix used for the
# embedding. For every complete FAUST label (cluster) and every marker:
#   1b. winsorize the cluster's values (1% at each tail),
#   1c. normalize them to zero mean and, when possible, unit variance,
#   2b. shift them by the offset of the marker's annotation value,
#   2c. add a tiny random perturbation to avoid exact ties.
#
# Work on a floating-point copy: with an integer input array the normalized
# values would be truncated on assignment. `astype` returns a new array, so
# `raw_expression` is left untouched.
embedding_expression = raw_expression.astype(float)

# Positional row indices of every complete FAUST label, computed in a single
# pass instead of re-querying the whole table once per label. Positions
# (rather than index labels) are what the NumPy array must be indexed with.
label_row_positions = data.groupby('complete_faust_label', sort=False).indices

# Annotation values per marker as plain arrays, hoisted out of the loop.
annotation_columns = [
    data[f'{marker}_faust_annotation'].to_numpy() for marker in markers
]

for i, faust_label in enumerate(faust_labels):
    if i % 1000 == 0:
        t = time()
        print(f'Transform {i}-{i + 999} of {len(faust_labels)} clusters...', end='\n')

    # 1a. Rows belonging to the current complete FAUST label.
    # NOTE(review): a missing (NaN) label has no group and raises KeyError
    # here; the original per-label query also failed for such labels.
    idxs = label_row_positions[faust_label]
    n_cells = len(idxs)
    first_row = idxs[0]

    for m, annotations in enumerate(annotation_columns):
        values = embedding_expression[idxs, m]

        # 1b. Winsorize the cluster's values. A single cell has nothing to
        # clip, so it is skipped (the original used a bare `next`, which is
        # a no-op expression, where a skip was intended).
        if n_cells > 1:
            values = np.asarray(winsorize(values, limits=[0.01, 0.01]))

        # 1c. Center, and scale to unit variance unless the standard
        # deviation is zero or undefined.
        mean_val = np.mean(values)
        sd_val = np.std(values)
        values = values - mean_val
        if sd_val != 0 and not np.isnan(sd_val):
            values = values / sd_val

        # 2a/2b. All rows of a cluster share the same annotation, so the
        # first row decides the offset. Raises KeyError for an annotation
        # value that has no entry in `expression_level_translation`.
        expression_level = annotations[first_row]
        values = values + expression_level_translation[expression_level]

        # 2c. Add perturbation to avoid exact ties.
        values = values + np.random.normal(0, 0.00001, n_cells)

        embedding_expression[idxs, m] = values

    if i % 1000 == 999:
        print(f'done! ({round(time() - t)}s)')
# Project the transformed expression matrix to two dimensions with UMAP.
umap_model = UMAP(n_neighbors=15, min_dist=0.2, metric='euclidean')
embedding = umap_model.fit_transform(embedding_expression)

# Put the FAUST label of every row next to its 2-D coordinates.
umap_coordinates = pd.DataFrame(embedding, columns=['umapX', 'umapY'])
df_embedding = pd.concat([data['faustLabels'], umap_coordinates], axis=1)
df_embedding['faustLabels'] = df_embedding['faustLabels'].astype('category')

# Notebook-style preview; the result is discarded when run as a script.
df_embedding.head()

# Scatter plot of the embedding.
plt.scatter(x='umapX', y='umapY', data=df_embedding)
plt.show()