-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.Rmd
278 lines (210 loc) · 11.3 KB
/
index.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
---
title: "Functional groups"
date: "`r Sys.Date()`"
site: bookdown::bookdown_site
documentclass: book
biblio-style: apalike
link-citations: yes
---
# Functional groups
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE, warning = F, error=F, message = F)
#options(repos = BiocManager::repositories())
pacman::p_load('dplyr', 'tidyr', 'htmltools', 'bbplot', 'scales',
'ggplot2', 'rdrop2', 'shiny', 'BiocManager',
'dendextend', 'data.table', 'Biostrings', 'alakazam', "unikn",
'plotly', "jcolors", 'ggdendro', "RColorBrewer","kmer", install = F)
```
```{r}
source("functions.R")
```
## Sequence pre-processing
Summary statistics for each of the functional groups declared in the app.
The app includes the P1 and P11 naive datasets and the P4 non-naive dataset.
For P1 and P11 the following filtration criteria were applied:
* Functional sequence, no stop codons or frame shifts.
* Sequences which start from position 1 of the V gene.
* Sequences which didn't have gaps open (-) and didn't include any N's
* After changing into group annotations, sequences which had more than a single assignment in naive repertoires were remove.
The groups were created with similarity of 95% based on complete linkage and functional sequences and up to position 318.
## Group devision by 95% similarity
```{r, fig.asp=0.8, out.width="1000%", fig.width=20}
load("data.rda")
mat <- mat_list$IGH$functional$nonsingle$all$`318`
clust <- hclust(as.dist(mat), "complete")
clusters <- cutree(clust, h = 0.05)
names(clusters) <- names(cutree(clust, h = 0.05))
dend <- as.dendrogram(clust)
fam_clusters <- clusters
clusters <- clusters[order.dendrogram(dend)]
clusters_numbers <- unique(clusters) - (0 %in% clusters)
n_clusters <- length(clusters_numbers)
cols <- c("#FAAB18", "#1380A1","#990000", "#588300")
pal <- cols %>%
newpal(names = c("orangy", "bluish", "redish", "greeny"))
mypal = setNames(alpha(pal %>% usecol(n = n_clusters),0.99), 1:n_clusters)
cluster_new <- mypal[clusters]
clusters_imgt <- clusters
clusters_imgt <- getGene(names(clusters_imgt), strip_d = F, omit_nl = F)
imgt_colors <-
setNames(alpha(pal %>% usecol(n = length(unique(clusters_imgt))), 0.99), sort(unique(clusters_imgt)))
clusters_imgt <- imgt_colors[clusters_imgt]
dend2 <-
dend %>% branches_attr_by_clusters(clusters, values = mypal) %>% dendextend::set("labels_cex", 0.9)
par(mar = c(12, 4, 1, 1))
dend2 %>% plot
dend2 %>% rect.dendrogram2(
k = n_clusters,lower_rect = 0,
text = clusters_numbers, xpd = T,
#border = 8,
lty = 5,
lwd = 2,
border = mypal
)
colored_bars(
cbind(clusters_imgt, cluster_new),
dend2,
sort_by_labels_order = F,
rowLabels = c("IMGT", "NEW")
)
```
## Groups Repertoire normalization
The current groups threshold and the possible states are in the table below.
```{r, fig.height=50, fig.width=30, out.height="100%", out.width="100%"}
chain = "IGH"
func <- data.frame(allele = names(vgerms[[chain]]), functionality = !grepl("(ORF|P)",sapply(seqinr::getAnnot(vgerms_full[[chain]]), function(x) unlist(strsplit(x,"[|]"))[4])), sign = sapply(seqinr::getAnnot(vgerms_full[[chain]]), function(x) unlist(strsplit(x,"[|]"))[4]), stringsAsFactors = F)
load("data_frac_new2.rda")
data <- setDT(data_frac$IGH$functional$nonsingle$all$`318`$complete$`95`)
data[,v_call:=paste0(v_gene,"*",v_allele)]
load("alleles_dbs.rda")
allele_db <- alleles_dbs$IGH$functional$nonsingle$all$`318`$complete$`95`
allele_db <- allele_db %>% rowwise() %>% mutate(gene = getGene(or_allele, strip_d = F, omit_nl = F), group = strsplit(gsub(gene, "", new_allele),"[*]")[[1]][1], gene_group = getGene(new_allele, strip_d = F, omit_nl = F))
load("functional_groups.rda")
func_groups <- functional_groups$IGH$functional$nonsingle$all$`318`$complete$`95`
groups <- setNames(allele_db$gene_group, allele_db$or_allele)
func$group <- groups[func$allele]
tmp_allele_db <- allele_db %>%
dplyr::group_by(new_allele) %>%
dplyr::summarise(or_allele = paste0(or_allele, collapse = "/"))
or_allele <- setNames(tmp_allele_db$or_allele, tmp_allele_db$new_allele)
data_cluster <- data[, v_allele_axis := or_allele[v_call]]
data_cluster$group_plot <- ifelse(is.na(data_cluster$j_call), 1, 2)
data_cluster <- data_cluster[mut == 0 & group_plot == 1]
data_cluster <- data_cluster[, .(v_allele_axis = unlist(tstrsplit(v_allele_axis, "/", type.convert = FALSE))), by = setdiff(names(data_cluster), "v_allele_axis")]
data_cluster2 <- data_cluster %>% filter(v_gene %in% names(absolute_thresholds_dict)) %>%
rowwise() %>%
dplyr::mutate(thresh = unique(as.numeric(absolute_thresholds_dict[[v_gene]][gsub("IGH","",v_allele_axis)])))
#data_cluster2 <- data_cluster2 %>% filter(freq2>=thresh)
p_list <- list()
i = 1
counts <- c()
for(g_group in names(absolute_thresholds_dict)){
n_alleles <- func %>% filter(group==g_group)
if(nrow(n_alleles)==0) next()
n_alleles <- n_alleles %>% rowwise() %>% mutate(allele_num = strsplit(allele, "[*]")[[1]][2]) %>% dplyr::arrange(allele_num)
alleles <- n_alleles$allele
threhsolds <- absolute_thresholds_dict[[g_group]]
df <- data.frame(v_allele_axis2 = paste0("IGH", names(threhsolds)), thresh = threhsolds, freq2 = 0)
data_cluster3 <- data_cluster2 %>% filter(v_gene==g_group)
if(nrow(data_cluster3)==0) next()
below_thresh <- data_cluster3 %>% filter(freq2<thresh) %>% dplyr::mutate(project = "below thresh",
zygousity_state = 0)
above_thresh <- data_cluster3 %>% filter(freq2>=thresh) %>% dplyr::arrange(desc(freq2)) %>%
dplyr::group_by(subject) %>% dplyr::mutate(
zygousity_state = dplyr::n()
) %>% arrange(subject)
if(nrow(above_thresh)==0) next()
data_cluster4 <- bind_rows(below_thresh, above_thresh)
data_cluster4$v_allele_axis2 <- factor(data_cluster4$v_allele_axis, n_alleles$allele)
data_cluster4$v_allele_axis3 <- as.numeric(data_cluster4$v_allele_axis2)
dat <- data.frame(group = g_group, allele = "IMGT", v_allele_axis = alleles, alleles = 1, state = "0", novel = F, stringsAsFactors = F)
dat2 <- above_thresh %>% group_by(v_allele_axis) %>% summarise(state = paste0(sort(unique(zygousity_state)), collapse = ","), group = g_group, alleles = 1, allele = "seen") %>% ungroup() %>% dplyr::mutate(novel = grepl("_", v_allele_axis))
counts <- bind_rows(counts, dat, dat2)
# data_cluster4 <- data_cluster4 %>% dplyr::arrange(desc(freq2)) %>%
# dplyr::group_by(subject) %>% dplyr::mutate(
# zygousity_state = dplyr::n()
# ) %>% arrange(subject)
#
p_list[[i]] <- ggplot() + geom_boxplot(data = data_cluster4, mapping = aes(v_allele_axis2, freq2) , outlier.shape = NA) +
geom_point(data = data_cluster4, mapping = aes(v_allele_axis2, freq2, color = factor(zygousity_state), shape = project), size = 6, position=position_jitter(width = 0.1)) +
geom_errorbar(data=df, aes(x = v_allele_axis2, ymin = thresh, ymax = thresh),
size=0.5,col="red", linetype = "dashed") +
labs(x = "", y = "Repertoire\nnormalized", color = "Zygousity", shape = "Project", title = g_group) +
theme_bw(base_size = 40) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
i = i +1
}
```
```{r}
data_thresh <- c()
for(g in names(absolute_thresholds_dict)){
tmp <- absolute_thresholds_dict[[g]]
tmp2 <- counts %>% filter(group==g, allele == "seen") %>% select(v_allele_axis, state) %>% dplyr::rename("alleles" = "v_allele_axis")
tmp3 <- data.frame(group = g, alleles = paste0("IGH",names(tmp)), thresholds = tmp, stringsAsFactors = F, row.names = NULL)
tmp4 <- merge(tmp3, tmp2, by = "alleles")
data_thresh <- bind_rows(data_thresh, tmp4)
}
DT::datatable(data_thresh)
```
Going over the groups showed several thins
1. We saw potential duplication events, such as in the case of IGHV1-69*04 in sample P1_I50 from group G8, where we think this allele might be sitting in the duplicated gene.
2. An interesting event involving a duplicated gene is in the case of IGHV3-43D from group G22. In this group 5 alleles are present, where three alleles are supposedly from the duplicated gene. Allele D\*04 and D\*04_G4A goes in states two with allele 01, in the J6 heterozygous samples we saw a double single chromosome deletion event. This occurred in 7 individuals and raises the question if maybe both alleles are sitting in the same location on the chromosome and are not acutely deleted.
3. Group 19 and genes IGHV3-23 and IGHV3-23D, showed a very low usage for the D gene.
4. From the usage results, we suspect that the gene in IGHV3-35G21 might be a pseudo gene.
5. We saw several signle chromosome deletion events in gene V3-66 from the group G25, which also has the gene V3-53.
6. In the group IGHV3-64G27, we suspect that allele 3-64*02 might be a pseudo allele due to very low usage.
7. In group G34 there are only two states, and include two genes. In state one only an allele from V4-30-2 appears. While in state 2, one allele V4-30-2 and one from V4-30-4 appears.
8. In group G48 there is a novel allele V6-1*01_T91C. This allele only appears in state 2 and is lowly expressed, much more when the max mutation value is 0.
Based on the threshold we compared the alleles present in IMGT versus those that passed the threshold.
```{r, fig.height=80, fig.width=80}
library(ggh4x)
library(ggpattern)
# p_list_bar <- lapply(unique(counts$group), function(g){
# ggplot(counts %>% filter(group == g)) +
# ggpattern::geom_col_pattern(aes(allele, alleles, fill = state, pattern = novel),
# colour = 'white') + theme_bw(18) #+
# #facet_nested(~ group, scales = "free", nest_line = TRUE)
#
# })
counts$group <- gsub("IGH","",counts$group)
ggplot(counts) +
ggpattern::geom_col_pattern(aes(allele, alleles, fill = state, pattern_density = novel),
pattern = 'circle',
colour = 'black') + theme_bw(100) + labs(y = "count", x = "reference") +
#ggpattern::pa +
facet_nested_wrap(.~group, scales = "free", nest_line = TRUE) + theme(legend.position = "top")
```
We plotted the relative repertoire plotted. Each dot is an individual and the colors shows the zygousity state while the shape indicates the project.
```{r, fig.height=50, fig.width=30}
cowplot::plot_grid(plotlist = p_list[1:5], align = "hv", ncol = 1)
```
```{r, fig.height=50, fig.width=30}
cowplot::plot_grid(plotlist = p_list[6:10], align = "hv", ncol = 1)
```
```{r, fig.height=50, fig.width=30}
cowplot::plot_grid(plotlist = p_list[11:15], align = "hv", ncol = 1)
```
```{r, fig.height=50, fig.width=30}
cowplot::plot_grid(plotlist = p_list[16:20], align = "hv", ncol = 1)
```
```{r, fig.height=50, fig.width=30}
cowplot::plot_grid(plotlist = p_list[21:25], align = "hv", ncol = 1)
```
```{r, fig.height=50, fig.width=30}
cowplot::plot_grid(plotlist = p_list[26:30], align = "hv", ncol = 1)
```
```{r, fig.height=50, fig.width=30}
cowplot::plot_grid(plotlist = p_list[31:35], align = "hv", ncol = 1)
```
```{r, fig.height=50, fig.width=30}
cowplot::plot_grid(plotlist = p_list[36:41], align = "hv", ncol = 1)
```
## Group Checklist
```{r eval=knitr::is_html_output(excludes = 'epub'), results = 'asis', echo = F}
# Projects sequence depth
cat(
'<iframe
src=',paste0("https://peresay.shinyapps.io/checklist"),
' border="0" frameborder="0" style="border-style:none;overflow: scroll;box-shadow:0px 0px 2px 2px rgba(0,0,0,0.2); width: 100%;height:300px;" cellspacing="0">
</iframe>', sep = ""
)
```