Skip to content

Commit

Permalink
Final analyses and adjustments
Browse files Browse the repository at this point in the history
  • Loading branch information
Mchicken1988 committed Jun 26, 2023
1 parent c207b34 commit 2ab5dc1
Show file tree
Hide file tree
Showing 22 changed files with 10,441 additions and 688 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
AD/
data/
*/rds_files/
*/RData/
11 changes: 7 additions & 4 deletions 2_MAJIQ_AS_analysis/MAJIQ_AS_analysis.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -488,10 +488,11 @@ colnames(dpsiMatrix) <- strsplit(colnames(dpsiMatrix), ".", fixed=TRUE) %>%
sapply(., "[[", 1)
# Define colors
myCols <- colorRamp2(c(-0.4, -0.15, 0, 0.15, 0.4), inferno(5))
myCols <- colorRamp2(c(-0.3, -0.15, 0, 0.15, 0.3), RColorBrewer::brewer.pal(n = 5, name = "BrBG"))
set.seed(123)
Heatmap(dpsiMatrix, col = myCols, name = "dPSI")
Heatmap(dpsiMatrix, col = myCols, row_km=3, name = "dPSI", show_row_dend = FALSE,
show_column_dend = FALSE, border=TRUE, row_title = NULL)
```

The majority of cassette exons shows increased inclusion in the stronger
Expand All @@ -514,7 +515,8 @@ colnames(dpsiMatrix) <- strsplit(colnames(dpsiMatrix), ".", fixed=TRUE) %>%
sapply(., "[[", 1)
set.seed(123)
Heatmap(dpsiMatrix, col = myCols, name = "dPSI")
Heatmap(dpsiMatrix, col = myCols, row_km=3, name = "dPSI", show_row_dend = FALSE,
show_column_dend = FALSE, border=TRUE, row_title = NULL)
```

Similar to the cassette exons, the inclusion of the first alternative
Expand All @@ -538,7 +540,8 @@ colnames(dpsiMatrix) <- strsplit(colnames(dpsiMatrix), ".", fixed=TRUE) %>%
sapply(., "[[", 1)
set.seed(123)
Heatmap(dpsiMatrix, col = myCols, name = "dPSI")
Heatmap(dpsiMatrix, col = myCols, row_km=3, name = "dPSI", show_row_dend = FALSE,
show_column_dend = FALSE, border=TRUE, row_title = NULL)
```

Similar to the cassette exons, the inclusion of the retained introns is
Expand Down
202 changes: 94 additions & 108 deletions 2_MAJIQ_AS_analysis/MAJIQ_AS_analysis.html

Large diffs are not rendered by default.

9 changes: 7 additions & 2 deletions 3_Dose_response_curve_fitting/Dose_response_curve_fitting.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -471,12 +471,17 @@ rbind(
mutate(event="IR")
) %>%
mutate(event=factor(event, levels = c("CE", "ALE", "IR"))) %>%
mutate(hillCat=factor(hillCat, levels=c("Coop-Enh", "Coop-Rep", "nonCoop-Enh", "nonCoop-Rep"))) %>%
mutate(hillCat=case_when(hillCat == "Coop-Enh" ~ "Enhanced",
hillCat == "Coop-Rep" ~ "Repressed",
TRUE ~ "Non-cooperative")) %>%
group_by(event, hillCat) %>%
dplyr::summarize(Freq = sum(Freq)) %>%
mutate(hillCat=factor(hillCat, levels=c("Enhanced", "Repressed", "Non-cooperative"))) %>%
ggplot(., aes(x=hillCat, y=Freq, fill=hillCat)) +
geom_col(col="black") +
facet_wrap(~event) +
theme_bw() +
scale_fill_manual(values=c("Coop-Enh" = "cornflowerblue", "Coop-Rep" = "salmon2", "nonCoop-Enh" = "#404040", "nonCoop-Rep" = "#404040")) +
scale_fill_manual(values=c("Enhanced" = "cornflowerblue", "Repressed" = "salmon2", "Non-cooperative" = "#404040")) +
theme_bw() +
labs(x="Hill category", y="Number of events") +
myTheme +
Expand Down
194 changes: 91 additions & 103 deletions 3_Dose_response_curve_fitting/Dose_response_curve_fitting.html

Large diffs are not rendered by default.

Binary file modified 4_iCLIP_analysis/.DS_Store
Binary file not shown.
47 changes: 25 additions & 22 deletions 4_iCLIP_analysis/Genomic_location_of_binding_sites.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,10 @@ pieDf <- data.frame(biotype=factor(c("protein_coding", "lncRNA",
# A pie chart is created showing the fraction of binding sites overlapping a
# certain biotype.
ggplot(pieDf, aes(x="", y=perc, fill = biotype)) +
scale_fill_lancet() +
scale_fill_manual(values=c("protein_coding" = "#CCCCCC",
"lncRNA" = "#8C8C8C",
"other" = "#4D4D4D",
"no overlap" = "#262626")) +
geom_col(color = 'black', position = position_stack(reverse = TRUE),
show.legend = TRUE) +
geom_text_repel(aes(x = 1.4, y = lab.ypos, label = paste(perc, "%")),
Expand Down Expand Up @@ -330,7 +333,7 @@ coeff <- max(boundGenesAtCutoff$expressed) /
# The plot is a combination of a line and bar chart using two y-axes.
ggplot() +
geom_col(data=boundGenesAtCutoff, aes(x=cutoff, y=expressed/coeff),
fill="springgreen4", width=1) +
fill="darkgrey", width=1) +
geom_line(data=boundGenesAtCutoff, aes(x=cutoff, y=fractionBound), size=1) +
scale_x_continuous(breaks=seq(0,80,20)) +
scale_y_continuous(name = "Fraction of genes bound",
Expand All @@ -339,9 +342,9 @@ ggplot() +
labs(x="TPM threshold", y="") +
myTheme +
theme(
axis.title.y.right = element_text(color = "springgreen4", size=16),
axis.ticks.y.right = element_line(color="springgreen4"),
axis.text.y.right = element_text(size = 14, colour="springgreen4")
axis.title.y.right = element_text(color = "darkgrey", size=16),
axis.ticks.y.right = element_line(color="darkgrey"),
axis.text.y.right = element_text(size = 14, colour="darkgrey")
)
```
Expand Down Expand Up @@ -382,19 +385,20 @@ fractionBoundGenes <- data.frame(
sum(genesExpressed$gene_type == "other") -
sum(genesExpressedBound$gene_type == "other"))) %>%
pivot_longer(., cols=2:3, values_to="nGenes", names_to = "bound") %>%
mutate(bound = ifelse(bound == "bound", TRUE, FALSE)) %>%
mutate(bound=factor(bound, levels=c(FALSE, TRUE))) %>%
mutate(bound = ifelse(bound == "bound", "Yes", "No")) %>%
mutate(bound=factor(bound, levels=c("No", "Yes"))) %>%
group_by(biotype) %>%
mutate(fraction = nGenes / sum(nGenes)) %>%
ungroup
# A stacked barchart is created for each of the three biotypes. Stacks
# indicate the fraction of genes bound and not bound by HNRNPH. In addition,
# the percentage of genes bound and not bound is added.
ggplot(fractionBoundGenes, aes(x=biotype, y=nGenes, fill=bound)) +
scale_fill_lancet() +
scale_fill_manual(values=c("Yes" = "black", "No" = "darkgrey")) +
geom_col(position = "fill") +
geom_text(aes(label = scales::percent(fraction)), position = 'fill') +
geom_text(aes(label = scales::percent(fraction)), position = 'fill', color="white") +
labs(y="Fraction of genes bound by HNRNPH") +
myTheme +
theme(legend.position = "right")
Expand Down Expand Up @@ -480,7 +484,10 @@ pieDf <- data.frame(feature=factor(c("CDS", "UTR5", "UTR3", "Intron"),
# A pie chart is created showing the fraction of binding sites overlapping a
# certain feature
ggplot(pieDf, aes(x="", y=perc, fill = feature)) +
scale_fill_manual(values=sapply(4,colorRampPalette(c("#003C77", "#81B4E7")))) +
scale_fill_manual(values=c("Intron" = "#CCCCCC",
"UTR3" = "#8C8C8C",
"UTR5" = "#4D4D4D",
"CDS" = "#262626")) +
geom_col(color = 'black', position = position_stack(reverse = TRUE),
show.legend = TRUE) +
geom_text_repel(aes(x = 1.4, y = lab.ypos, label = paste(perc, "%")),
Expand Down Expand Up @@ -527,7 +534,8 @@ genesWithRegulatedASevent <- genesWithRegulatedASevent[
vennList <- list('Genes with regulated AS event' = genesWithRegulatedASevent,
'Bound genes' = genes$gene_id[genes$expressed & genes$bound])
ggvenn(data=vennList, text_size=4, set_name_size=4.5, stroke_size=.5) +
ggvenn(data=vennList, text_size=4, set_name_size=4.5, stroke_size=.5,
fill_color = c("#A09090", "#A8BAA1")) +
scale_y_continuous(limits = c(-1, 1.5))
```
Expand Down Expand Up @@ -589,24 +597,19 @@ miniGenes <- miniGenes %>%
as.data.frame %>%
dplyr::select(hillCat, bound, nBSs) %>%
dplyr::filter(hillCat %in% c("Coop-Enh", "Coop-Rep", "nonReg")) %>%
mutate(hillCat = factor(hillCat, levels=c("Coop-Enh", "Coop-Rep", "nonReg"))) %>%
mutate(hillCat = case_when(hillCat == "Coop-Enh" ~ "Enhanced",
hillCat == "Coop-Rep" ~ "Repressed",
TRUE ~ "Ctrl")) %>%
mutate(hillCat = factor(hillCat, levels=c("Enhanced", "Repressed", "Ctrl"))) %>%
mutate(nBSs=ifelse(nBSs > 5, ">5", nBSs)) %>%
mutate(nBSs = factor(nBSs, levels=c("0", "1", "2", "3", "4", "5", ">5")))
miniGenes %>%
ggplot(., aes(x=hillCat, fill=bound)) +
geom_bar(position="fill") +
geom_text(data=. %>% dplyr::count(hillCat), aes(label = n, x=hillCat, y=1.05), inherit.aes=F) +
scale_fill_manual(values=c("FALSE" = "grey", "TRUE" = "black")) +
labs(x="set", y="Fraction of cassette exon events") +
myTheme +
theme(legend.position = "right")
miniGenes %>%
ggplot(., aes(x=hillCat, fill=nBSs)) +
geom_bar(position="fill") +
geom_text(data=. %>% dplyr::count(hillCat), aes(label = n, x=hillCat, y=1.05), inherit.aes=F) +
scale_fill_lancet() +
scale_fill_manual(values=c("0" = "#CACECB", "1" = "#C6D5C0", "2" = "#A5C1A1", "3" = "#83AD83",
"4" = "#5C9C66", "5" = "#4C9B84", ">5" = "#265043")) +
labs(x="set", y="Fraction of cassette exon events") +
myTheme +
theme(legend.position = "right")
Expand Down
256 changes: 121 additions & 135 deletions 4_iCLIP_analysis/Genomic_location_of_binding_sites.html

Large diffs are not rendered by default.

28 changes: 14 additions & 14 deletions 4_iCLIP_analysis/RNAmaps_of_regulated_events.html

Large diffs are not rendered by default.

171 changes: 161 additions & 10 deletions 5_rG4_analysis/Dependence_of_binding_and_cooperativity_on_rG4s.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -302,14 +302,23 @@ G4freqDf <- rbind(
G4freqDf$rG4freq[G4freqDf$rG4freq > 2] <- ">2"
G4freqDf$rG4freq <- factor(G4freqDf$rG4freq, levels=c("0", "1", "2", ">2"))
# Just check if there is an overlapping G4
G4freqDf <- G4freqDf %>%
mutate(hasG4 = ifelse(rG4freq == "0", "No", "Yes"))
# Create a stacked barchart showing the fraction of CE events with a certain
# number of overlapping rG4s
G4freqDf %>%
ggplot(., aes(x=hillCat, fill=rG4freq)) +
mutate(hillCat = case_when(hillCat == "Coop-Enh" ~ "Enhanced",
hillCat == "Coop-Rep" ~ "Repressed",
TRUE ~ "Ctrl")) %>%
mutate(hillCat = factor(hillCat, levels=c("Enhanced", "Repressed", "Ctrl"))) %>%
ggplot(., aes(x=hillCat, fill=hasG4)) +
geom_bar(position="fill") +
scale_fill_lancet() +
scale_fill_manual(values=c("No"="darkgrey", "Yes"="goldenrod3")) +
facet_wrap(~facet, ncol=2) +
labs(y="Fraction of minigenes") +
labs(y="Fraction of events") +
myTheme +
theme(legend.position = "right",
axis.text.x = element_text(angle=45, vjust = 1, hjust = 1))
Expand Down Expand Up @@ -340,6 +349,97 @@ lowest for the non-regulated events. There is also a substantial fraction
of enhanced CE events with more than two predicted rG4s in the intronic
regions.

Also statistical tests are performed, to check if the differences in the
proportions are significant. The “pairwise.prop.test()” function is used,
which automatically corrects for multiple testing using the “BH” (default would
be “holm”) method. Tests are performed and corrected independently for "AE" and
"intronic".

Here is the result for the left plot (AE):

```{r}
test_mat <- G4freqDf %>%
filter(facet=="AE") %>%
group_by(hillCat) %>%
dplyr::summarize(success=sum(hasG4 == "Yes"), trials=n()) %>%
tibble::column_to_rownames(., var="hillCat") %>%
as.matrix
pairwise.prop.test(test_mat[,1], test_mat[,2], p.adjust.method = "BH")
```

Here is the result for the right plot (Intronic):

```{r}
test_mat <- G4freqDf %>%
filter(facet=="Intronic") %>%
group_by(hillCat) %>%
dplyr::summarize(success=sum(hasG4 == "Yes"), trials=n()) %>%
tibble::column_to_rownames(., var="hillCat") %>%
as.matrix
pairwise.prop.test(test_mat[,1], test_mat[,2], p.adjust.method = "BH")
```
What is also visible, is that the fraction of inronic regions with an overlapping
rG4 is much higher than the fraction of alternative exons with an overlapping rG4.
However, the alternative exons are much shorter than the up to 3x300 nt intronic
regions.

To account for this difference, the number of overlapping predicted rG4s per
nucleotide is computed. This is done by dividing the number of events with $\ge$
overlapping rG4 by the summed up length of alternative exons / intronic regions.

```{r}
nG4s <- countOverlaps(AE_CoopEnh, regulatedMiniGenesG4s[!duplicated(regulatedMiniGenesG4s)])
normalized_G4s_CoopEnh_AE <- sum(nG4s >0) / sum(width(AE_CoopEnh))
nG4s <- countOverlaps(AE_CoopRep, regulatedMiniGenesG4s[!duplicated(regulatedMiniGenesG4s)])
normalized_G4s_CoopRep_AE <- sum(nG4s >0) / sum(width(AE_CoopRep))
nG4s <- countOverlaps(AE_nonReg, nonRegulatedMiniGenesG4s[!duplicated(nonRegulatedMiniGenesG4s)])
normalized_G4s_nonReg_AE <- sum(nG4s >0) / sum(width(AE_nonReg))
I1end_nG4s <- countOverlaps(I1end_CoopEnh, regulatedMiniGenesG4s[!duplicated(regulatedMiniGenesG4s)])
I2start_nG4s <- countOverlaps(I2start_CoopEnh, regulatedMiniGenesG4s[!duplicated(regulatedMiniGenesG4s)])
I2end_nG4s <- countOverlaps(I2end_CoopEnh, regulatedMiniGenesG4s[!duplicated(regulatedMiniGenesG4s)])
normalized_G4s_CoopEnh_intronic <- sum((I1end_nG4s + I2start_nG4s + I2end_nG4s) >0) / sum(width(c(I1end_CoopEnh, I2start_CoopEnh, I2end_CoopEnh)))
I1end_nG4s <- countOverlaps(I1end_CoopRep, regulatedMiniGenesG4s[!duplicated(regulatedMiniGenesG4s)])
I2start_nG4s <- countOverlaps(I2start_CoopRep, regulatedMiniGenesG4s[!duplicated(regulatedMiniGenesG4s)])
I2end_nG4s <- countOverlaps(I2end_CoopRep, regulatedMiniGenesG4s[!duplicated(regulatedMiniGenesG4s)])
normalized_G4s_CoopRep_intronic <- sum((I1end_nG4s + I2start_nG4s + I2end_nG4s) >0) / sum(width(c(I1end_CoopRep, I2start_CoopRep, I2end_CoopRep)))
I1end_nG4s <- countOverlaps(I1end_nonReg, nonRegulatedMiniGenesG4s[!duplicated(nonRegulatedMiniGenesG4s)])
I2start_nG4s <- countOverlaps(I2start_nonReg, nonRegulatedMiniGenesG4s[!duplicated(nonRegulatedMiniGenesG4s)])
I2end_nG4s <- countOverlaps(I2end_nonReg, nonRegulatedMiniGenesG4s[!duplicated(nonRegulatedMiniGenesG4s)])
normalized_G4s_nonReg_intronic <- sum((I1end_nG4s + I2start_nG4s + I2end_nG4s) >0) / sum(width(c(I1end_nonReg, I2start_nonReg, I2end_nonReg)))
data.frame(
set=c("Enhanced", "Enhanced",
"Repressed", "Repressed",
"Ctrl", "Ctrl"),
region=c("AE", "intronic",
"AE", "intronic",
"AE", "intronic"),
norm_G4_overlap=c(normalized_G4s_CoopEnh_AE, normalized_G4s_CoopEnh_intronic,
normalized_G4s_CoopRep_AE, normalized_G4s_CoopRep_intronic,
normalized_G4s_nonReg_AE, normalized_G4s_nonReg_intronic) * 1000
) %>%
mutate(set=factor(set, levels=c("Enhanced", "Repressed", "Ctrl"))) %>%
ggplot(., aes(x=set, y=norm_G4_overlap)) +
geom_col(position="dodge") +
labs(y="Overlapping G4s per kb") +
facet_wrap(~region) +
myTheme +
theme(legend.position = "right")
```

# Dependence of Hill coefficient on the presence of rG4s

Expand All @@ -350,7 +450,7 @@ whether the degree of cooperativity depends on the presence of predicted
rG4s.

To answer this question, repressed and enhanced CE events were each
grouped into five quantiles (0-20%, 20-40%, 40-60%, 60-80% and 80-100%)
put into five groups (0-20%, 20-40%, 40-60%, 60-80% and 80-100%)
based on the fitted Hill coefficient (nH). Afterwards the five groups of
repressed CE events were compared for the presence of predicted rG4s
overlapping the alternative exon and the five groups of enhanced CE
Expand All @@ -368,10 +468,10 @@ plotDf$nHquantile <- cut(abs(plotDf$nH),
labels=c("0-20%","20-40%","40-60%","60-80%","80-100%"))
plotDf %>%
ggplot(., aes(x=nHquantile, fill=rG4freq)) +
scale_fill_lancet() +
ggplot(., aes(x=nHquantile, fill=hasG4)) +
scale_fill_manual(values=c("No"="darkgrey", "Yes"="goldenrod3")) +
geom_bar(position="fill") +
labs(y="Fraction with predicted rG4") +
labs(x="Increasing hill coefficient", y="Fraction with predicted rG4") +
myTheme +
theme(legend.position = "bottom",
aspect.ratio=1/.75,
Expand All @@ -387,6 +487,45 @@ plotDf %>% dplyr::count(nHquantile, rG4freq) %>%
```

Also a statistical test is performed to check if the differences in the
proportions are significant. The “prop.test()” function is applied on the lowest
and highest quantile.

Here is the result:

```{r}
test_mat <- plotDf %>%
group_by(nHquantile) %>%
dplyr::summarize(success=sum(hasG4 == "Yes"), trials=n()) %>%
tibble::column_to_rownames(., var="nHquantile") %>%
as.matrix
prop.test(test_mat[c(1,5),1], test_mat[c(1,5),2])
```

To check for the strength of the relationship, I increase the number of groups
by going in 10% steps and plotting the 10 groups against the fraction of events
with $\ge$ 1 overlapping rG4.

```{r}
plotDf$new_nHquantile <- cut(abs(plotDf$nH), breaks=abs(plotDf$nH) %>% quantile(., seq(0,1,0.1)), include.lowest = TRUE, labels=c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) %>% as.character %>% as.integer()
plotDf %>% dplyr::select(hasG4, new_nHquantile) %>%
dplyr::group_by(new_nHquantile) %>%
dplyr::summarize(fraction=sum(hasG4 == "Yes")/dplyr::n()) %>%
ggplot(., aes(x=new_nHquantile, y=fraction)) +
geom_point(size=3, color="goldenrod3") +
geom_smooth(method="lm", se=FALSE, color="black") +
scale_x_continuous(breaks=1:10) +
stat_cor(method="spearman", cor.coef.name="Spearman_R", label.y=0.5) +
stat_cor(method="pearson", cor.coef.name="Pearson_R", label.y=0.45) +
labs(x="nH quantiles (10% steps)",
y="Fraction with predicted rG4") +
myTheme +
theme(aspect.ratio=1)
```


Here is the plot and the underlying data.frame for the enhanced CE
events:

Expand All @@ -400,10 +539,10 @@ plotDf$nHquantile <- cut(abs(plotDf$nH),
labels=c("0-20%","20-40%","40-60%","60-80%","80-100%"))
plotDf %>%
ggplot(., aes(x=nHquantile, fill=rG4freq)) +
scale_fill_lancet() +
ggplot(., aes(x=nHquantile, fill=hasG4)) +
scale_fill_manual(values=c("No"="darkgrey", "Yes"="goldenrod3")) +
geom_bar(position="fill") +
labs(y="Fraction with predicted rG4") +
labs(x="Increasing hill coefficient", y="Fraction with predicted rG4") +
myTheme +
theme(legend.position = "bottom",
aspect.ratio=1/.75,
Expand All @@ -424,6 +563,18 @@ at least one predicted rG4 overlapping the cassette/alternative exon
increases with the degree of cooperativity suggesting a potential
link between cooperativity and rG4 formation.

Again a statistical test on the lowest and highest quantiles was performed.
Here is the result:

```{r}
test_mat <- plotDf %>%
group_by(nHquantile) %>%
dplyr::summarize(success=sum(hasG4 == "Yes"), trials=n()) %>%
tibble::column_to_rownames(., var="nHquantile") %>%
as.matrix
prop.test(test_mat[c(1,5),1], test_mat[c(1,5),2])
```

# Session Information

```{r}
Expand Down
Loading

0 comments on commit 2ab5dc1

Please sign in to comment.