-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit b00677a
Showing
79 changed files
with
127,179 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
alexandra_genes <- c("KCNQ1", "KCNQ2", "KCNQ3", "KCNQ4", "KCNQ5", | ||
"CFTR", "PTGER4", "SLC12A2") | ||
|
||
|
||
rel_abun <- apply(xx$summarized_expression, 2, function(x) x/sum(x)) | ||
|
||
data_alexandra <- tibble(gene = c(rep(as.character(xx$gene_id[which(xx$gene_id %in% alexandra_genes)]), 4), | ||
rep(as.character(xx$gene_id[which(xx$gene_id %in% alexandra_genes)]), 4)), | ||
group = c(rep("count_data", 4 * length(which(xx$gene_id %in% alexandra_genes))), | ||
rep("rel_abun", 4 * length(which(xx$gene_id %in% alexandra_genes)))), | ||
replicate = c(rep(1, length(which(xx$gene_id %in% alexandra_genes))), | ||
rep(2, length(which(xx$gene_id %in% alexandra_genes))), | ||
rep(3, length(which(xx$gene_id %in% alexandra_genes))), | ||
rep(4, length(which(xx$gene_id %in% alexandra_genes))), | ||
rep(1, length(which(xx$gene_id %in% alexandra_genes))), | ||
rep(2, length(which(xx$gene_id %in% alexandra_genes))), | ||
rep(3, length(which(xx$gene_id %in% alexandra_genes))), | ||
rep(4, length(which(xx$gene_id %in% alexandra_genes)))), | ||
data = c(as.vector(xx$summarized_expression[which(xx$gene_id %in% alexandra_genes), 1:4]), | ||
as.vector(rel_abun[which(xx$gene_id %in% alexandra_genes), 1:4]))) | ||
|
||
data_alexandra %>% | ||
dplyr::filter(., group == "count_data") %>% | ||
ggplot() + | ||
geom_boxplot(aes(x = gene, y = log10(data)), fill = "tan") + | ||
labs(x = "Gene", y = "log10(summarized counts)") + | ||
# facet_grid(. ~ group) + | ||
theme_bw() + | ||
theme(axis.text.y = element_text(size = 24, color = "black"), | ||
axis.text.x = element_text(size = 24, color = "black", angle = 90, hjust = 1, vjust = 0.5), | ||
axis.title = element_text(size = 36, color = "black"), | ||
axis.ticks = element_line(color = "black")) | ||
ggsave(file = "genes_alexandra.pdf", | ||
height = 8, | ||
width = 11, | ||
dpi = 600) | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
x1 <- results(dds, | ||
contrast = c("Group", "H_E", "M_E"), | ||
pAdjustMethod = "fdr", | ||
cooksCutoff = FALSE) | ||
|
||
x2 <- results(dds, | ||
contrast = c("Group", "H_E_AB", "M_E_AB"), | ||
pAdjustMethod = "fdr", | ||
cooksCutoff = FALSE) | ||
|
||
x3 <- results(dds, | ||
contrast = c("Group", "H_E_AB", "H_E"), | ||
pAdjustMethod = "fdr", | ||
cooksCutoff = FALSE) | ||
|
||
|
||
# druid on comparisons | ||
x1 <- data_frame(gene_symbol = xx$gene_id, | ||
logFC = as.vector(as.numeric(x1$log2FoldChange)), | ||
p_val = as.vector(as.numeric(x1$pvalue)), | ||
q_val = as.vector(as.numeric(x1$padj))) | ||
|
||
g1 <- x1 %>% dplyr::filter(., q_val < 0.05) | ||
z <- AnnotationDbi::select(x = org.Hs.eg.db, | ||
keys = as.character(g1$gene_symbol), | ||
keytype = "SYMBOL", | ||
columns = "ENTREZID") | ||
|
||
# d1 <- run_conddr(conddr_data = "/Volumes/HOME/scripts/r/conddr/data/conddr_data_ndrugs=8422_neffects=41412_saveDate=03082018.RData", | ||
# dge_data = cbind(g1$logFC, g1$q_val), | ||
# pvalue_thr = 0.05, | ||
# log2_fold_thr = 1, | ||
# gene_symbols = z$SYMBOL, | ||
# gene_entrez = z$ENTREZID, | ||
# num_random = 10000, | ||
# match_effect = "neg") | ||
# | ||
# | ||
# x2 <- data_frame(gene_symbol = xx$gene_id, | ||
# logFC = as.vector(as.numeric(x2$log2FoldChange)), | ||
# p_val = as.vector(as.numeric(x2$pvalue)), | ||
# q_val = as.vector(as.numeric(x2$padj))) | ||
# | ||
# g2 <- x2 %>% dplyr::filter(., q_val < 0.05) | ||
# z <- AnnotationDbi::select(x = org.Hs.eg.db, | ||
# keys = as.character(g2$gene_symbol), | ||
# keytype = "SYMBOL", | ||
# columns = "ENTREZID") | ||
# | ||
# d2 <- run_conddr(conddr_data = "/Volumes/HOME/scripts/r/conddr/data/conddr_data_ndrugs=8422_neffects=41412_saveDate=03082018.RData", | ||
# dge_data = cbind(g2$logFC, g2$q_val), | ||
# pvalue_thr = 0.05, | ||
# log2_fold_thr = 1, | ||
# gene_symbols = z$SYMBOL, | ||
# gene_entrez = z$ENTREZID, | ||
# num_random = 10000, | ||
# match_effect = "neg") | ||
# | ||
# | ||
# x3 <- data_frame(gene_symbol = xx$gene_id, | ||
# logFC = as.vector(as.numeric(x3$log2FoldChange)), | ||
# p_val = as.vector(as.numeric(x3$pvalue)), | ||
# q_val = as.vector(as.numeric(x3$padj))) | ||
# | ||
# g3 <- x3 %>% dplyr::filter(., q_val < 0.05) | ||
# z <- AnnotationDbi::select(x = org.Hs.eg.db, | ||
# keys = as.character(g3$gene_symbol), | ||
# keytype = "SYMBOL", | ||
# columns = "ENTREZID") | ||
# | ||
# d3 <- run_conddr(conddr_data = "/Volumes/HOME/scripts/r/conddr/data/conddr_data_ndrugs=8422_neffects=41412_saveDate=03082018.RData", | ||
# dge_data = cbind(g3$logFC, g3$q_val), | ||
# pvalue_thr = 0.05, | ||
# log2_fold_thr = 1, | ||
# gene_symbols = z$SYMBOL, | ||
# gene_entrez = z$ENTREZID, | ||
# num_random = 10000, | ||
# match_effect = "neg") | ||
|
||
# PLOTS | ||
x1 %>% | ||
ggplot() + | ||
geom_point(aes(x = logFC, y = -log10(q_val)), color = "gray") + | ||
geom_point(data = subset(x1, q_val < 0.01 & logFC > 1), aes(x = logFC, y = -log10(q_val)), color = "red") + | ||
geom_point(data = subset(x1, q_val < 0.01 & logFC < -1), aes(x = logFC, y = -log10(q_val)), color = "red") + | ||
labs(x = "log2(fold change)", y = "-log10(q)", title = "H_E vs M_E") + | ||
theme_bw() + | ||
theme(panel.grid = element_blank(), | ||
axis.text = element_text(size = 12, color = "black"), | ||
axis.title = element_text(size = 24, color = "black")) | ||
|
||
x2 %>% | ||
ggplot() + | ||
geom_point(aes(x = logFC, y = -log10(q_val)), color = "gray") + | ||
geom_point(data = subset(x2, q_val < 0.05 & logFC > 1), aes(x = logFC, y = -log10(q_val)), color = "red") + | ||
geom_point(data = subset(x2, q_val < 0.05 & logFC < -1), aes(x = logFC, y = -log10(q_val)), color = "red") + | ||
labs(x = "log2(fold change)", y = "-log10(q)", title = "Hmmet + EHEC vs Mmmet + EHEC") + | ||
theme_bw() + | ||
theme(panel.grid = element_blank(), | ||
axis.text = element_text(size = 24, color = "black"), | ||
axis.title = element_text(size = 24, color = "black"), | ||
title = element_text(size = 24)) | ||
ggsave("results/volcano-heab_meab-txp_09122018.pdf", height = 8, width = 11, dpi = 600) | ||
|
||
x3 %>% | ||
ggplot() + | ||
geom_point(aes(x = logFC, y = -log10(q_val)), color = "gray") + | ||
geom_point(data = subset(x3, q_val < 0.05 & logFC > 1), aes(x = logFC, y = -log10(q_val)), color = "red") + | ||
geom_point(data = subset(x3, q_val < 0.05 & logFC < -1), aes(x = logFC, y = -log10(q_val)), color = "red") + | ||
labs(x = "log2(fold change)", y = "-log10(q)", title = "Hmmet + EHEC vs Hmmet") + | ||
theme_bw() + | ||
theme(panel.grid = element_blank(), | ||
axis.text = element_text(size = 24, color = "black"), | ||
axis.title = element_text(size = 24, color = "black"), | ||
title = element_text(size = 24)) | ||
ggsave("results/volcano-heab_he-txp_09122018.pdf", height = 8, width = 11, dpi = 600) | ||
|
||
|
||
# d3 %>% | ||
# dplyr::arrange(., desc(score)) %>% | ||
# dplyr::filter(., probability_random < 0.001) %>% | ||
# ggplot() + | ||
# geom_point(aes(x = score, y = fct_reorder(drug_name, score), size = number_matches, color = drug_group)) + | ||
# labs(x = "score", y = NULL) + | ||
# theme_minimal() + | ||
# theme(axis.text = element_text(size = 12, color = "black"), | ||
# axis.title = element_text(size = 24, color = "black")) | ||
# ggsave("results/druid-heab_he-txp_09122018.pdf", height = 8, width = 11, dpi = 600) | ||
# | ||
# | ||
# d2 %>% | ||
# dplyr::arrange(., desc(score)) %>% | ||
# dplyr::slice(1:20) %>% | ||
# ggplot() + | ||
# geom_point(aes(x = score, y = fct_reorder(drug_name, score), size = number_matches, color = drug_group)) + | ||
# labs(x = "score", y = NULL) + | ||
# theme_minimal() + | ||
# theme(axis.text = element_text(size = 12, color = "black"), | ||
# axis.title = element_text(size = 24, color = "black")) | ||
# ggsave("results/druid-heab_meab-txp_09122018.pdf", height = 8, width = 11, dpi = 600) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
#' Functions for pathway enrichment | ||
#' These functions will do a pathway enrichment based on tf-idf, similar to the rPEGEOS package I have developed (https://github.com/diogocamacho/rpegeos). Since that package is still under some development, the functions here offer more stability. | ||
cosine_similarity <- function(x, y, tfidf_crossprod_mat) | ||
{ | ||
# cs <- crossprod(x,y)/sqrt(crossprod(x) * crossprod(y)) | ||
|
||
# x1 <- apply(x,1,crossprod) | ||
x2 <- tcrossprod(y) # <-- y is a 1xG vector!! | ||
x3 <- sqrt(tfidf_crossprod_mat * as.vector(x2)) | ||
|
||
y1 <- x %*% t(y) | ||
|
||
cs <- y1 / x3 | ||
|
||
return(cs) | ||
} | ||
|
||
crossprod_matrix <- function(tfidf_matrix) | ||
{ | ||
cpm <- apply(tfidf_matrix,1,crossprod) # <-- crossproduct matrix for tfidf | ||
return(cpm) | ||
} | ||
|
||
random_sets <- function(number_sets,universe_size,gene_set_size) | ||
{ | ||
# rnd_set <- matrix(0,nrow=ncol(target_tfidf),ncol=number_sets) | ||
# universe_size = number of columns in tfidf matrix | ||
# gene_set_size = number of genes that we obtained as differntially expressed | ||
|
||
x1 <- matrix(0,nrow=number_sets,ncol=universe_size) | ||
x2 <- t(replicate(number_sets,sample(universe_size,size=gene_set_size,replace=FALSE))) | ||
# yy <- apply(x2,1,function(s) {x1[s] <- 1}) | ||
|
||
for(i in seq(1,number_sets)) | ||
{ | ||
# a1 <- matrix(0,ncol=1,nrow=universe_size) | ||
# a1[sample(universe_size,size = gene_set_size,replace = FALSE)] <- 1 | ||
# x <- cbind(x,a1) | ||
x1[i,x2[i,]] <- 1 | ||
} | ||
|
||
# rnd_set <- replicate(n=number_sets,sample(x=ncol(target_tfidf),size=gene_set_size,replace=FALSE)) | ||
# return(rnd_set) | ||
return(x1) | ||
} | ||
|
||
random_tfidf <- function(random_set,target_tfidf,tfidf_crossprod_mat) | ||
{ | ||
|
||
rnd_res <- apply(random_set,1,function(x) cosine_similarity(target_tfidf,t(as.matrix(x)),tfidf_crossprod_mat)) | ||
|
||
return(rnd_res) | ||
|
||
} | ||
|
||
res_stats <- function(true_results,random_results) | ||
{ | ||
res_stats <- sapply(seq_along(true_results), function(x, y, i) length(which(y[i] > x[i, ]))/length(x[i,]), y = as.matrix(true_results) ,x = random_results) | ||
return(res_stats) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
gene_summarizing <- function(expression_data, gene_id) { | ||
x1 <- unique(gene_id) | ||
|
||
summ_data <- vector(mode = "list", length = length(x1)) | ||
|
||
for (i in seq(1, length(x1))) { | ||
x2 <- which(gene_id == x1[i]) | ||
if (length(x2) == 1) { | ||
summ_data[[i]] <- expression_data[i, ] | ||
} else { | ||
x3 <- expression_data[x2, ] | ||
summ_data[[i]] <- colSums(x3) | ||
} | ||
} | ||
|
||
summ_data <- do.call(what = rbind, args = summ_data) | ||
|
||
return(list(gene_id = x1, summarized_expression = summ_data)) | ||
} | ||
|
||
|
||
# x1 <- unique(human_genes$symbol) | ||
# summ <- vector(mode = "list", length = length(x1)) | ||
# for (i in seq(1, length(x1))) { | ||
# print(i) | ||
# x2 <- which(human_genes$symbol == x1[i]) | ||
# if (length(x2) == 1) { | ||
# summ[[i]] <- human_counts[i, ] | ||
# } else { | ||
# x3 <- human_counts[x2, ] | ||
# summ[[i]] <- colSums(x3) | ||
# } | ||
# } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
# bioconductor libraries | ||
library(GO.db) | ||
library(reactome.db) | ||
library(DESeq2) | ||
# library(limma) | ||
# library(org.EcK12.eg.db) | ||
|
||
# other libraries | ||
library(tidyverse) | ||
library(tidytext) | ||
library(Matrix) | ||
library(gplots) | ||
library(RColorBrewer) | ||
library(readxl) | ||
library(DESeq2) | ||
library(org.Hs.eg.db) | ||
library(DRUID) | ||
|
||
|
||
# some functions | ||
source("R/enrichment_functions.R") | ||
|
||
# rna-seq data | ||
load(file = "data/human-centric_dual_rnaseq_05162018.RData") # <-- human centric data | ||
|
||
samples <- hs_data[[3]] | ||
human_counts <- as.matrix(hs_data[[1]]) | ||
human_genes <- hs_data[[2]] | ||
|
||
# filter out genes for which we have no gene name | ||
nix <- which(is.na(human_genes$symbol)) | ||
if (length(nix) != 0) { | ||
human_genes <- human_genes[-nix, ] | ||
human_counts <- human_counts[-nix, ] | ||
} | ||
|
||
|
||
# remove genes with zero counts | ||
nix <- which(rowSums(human_counts) == 0) | ||
|
||
if(length(nix) != 0) | ||
{ | ||
human_counts <- human_counts[-nix, ] | ||
human_genes <- human_genes[-nix, ] | ||
} | ||
|
||
|
||
# summarize gene ids | ||
xx <- gene_summarizing(expression_data = human_counts, | ||
gene_id = human_genes$symbol) | ||
|
||
|
||
# DESeq2 ---- | ||
# E. coli | ||
dds <- DESeqDataSetFromMatrix(countData = xx$summarized_expression[,c(1:8, 12:19)], | ||
colData = samples[c(1:8, 12:19),], | ||
design = ~ Group) | ||
|
||
dds <- DESeq(object = dds) | ||
|
||
# ec_e_ab_res <- results(dds,contrast=c("Group","H_E_AB","M_E_AB"),alpha=0.05,lfcThreshold = 1) | ||
hc_ab_res <- results(dds, | ||
contrast = c("Group", "H_E_AB", "M_E_AB"), | ||
pAdjustMethod = "fdr", | ||
cooksCutoff = FALSE) | ||
|
||
hc_ab_res <- data_frame(gene_symbol = xx$gene_id, | ||
logFC = as.vector(as.numeric(hc_ab_res$log2FoldChange)), | ||
p_val = as.vector(as.numeric(hc_ab_res$pvalue)), | ||
q_val = as.vector(as.numeric(hc_ab_res$padj))) | ||
|
||
diff_genes <- hc_ab_res %>% dplyr::filter(., q_val < 0.05) | ||
|
||
# get entrez ids for genes | ||
z <- AnnotationDbi::select(x = org.Hs.eg.db, | ||
keys = as.character(diff_genes$gene_symbol), | ||
keytype = "SYMBOL", | ||
columns = "ENTREZID") | ||
|
||
hs_inf <- DRUID::concoct(dge_matrix = cbind(diff_genes$logFC, diff_genes$q_val), | ||
tfidf_matrix = cmap_druid$tfidf, | ||
tfidf_crossproduct = cmap_druid$cpm, | ||
num_random = 10000, | ||
druid_direction = "neg", | ||
fold_thr = 1, | ||
pvalue_thr = 0.05, | ||
entrez = z$ENTREZID) | ||
|
||
hs_inf <- hs_inf %>% | ||
tibble::add_column(., drug_name = cmap_druid$drugs$name, .before = 1) %>% | ||
tibble::add_column(., concentration = cmap_druid$drugs$concentration, .before = 2) %>% | ||
tibble::add_column(., cell_line = cmap_druid$drugs$cell_line, .before = 3) |
Oops, something went wrong.