Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
diogocamacho authored Nov 19, 2019
0 parents commit b00677a
Show file tree
Hide file tree
Showing 79 changed files with 127,179 additions and 0 deletions.
39 changes: 39 additions & 0 deletions R/alexandra_genes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
alexandra_genes <- c("KCNQ1", "KCNQ2", "KCNQ3", "KCNQ4", "KCNQ5",
"CFTR", "PTGER4", "SLC12A2")


rel_abun <- apply(xx$summarized_expression, 2, function(x) x/sum(x))

data_alexandra <- tibble(gene = c(rep(as.character(xx$gene_id[which(xx$gene_id %in% alexandra_genes)]), 4),
rep(as.character(xx$gene_id[which(xx$gene_id %in% alexandra_genes)]), 4)),
group = c(rep("count_data", 4 * length(which(xx$gene_id %in% alexandra_genes))),
rep("rel_abun", 4 * length(which(xx$gene_id %in% alexandra_genes)))),
replicate = c(rep(1, length(which(xx$gene_id %in% alexandra_genes))),
rep(2, length(which(xx$gene_id %in% alexandra_genes))),
rep(3, length(which(xx$gene_id %in% alexandra_genes))),
rep(4, length(which(xx$gene_id %in% alexandra_genes))),
rep(1, length(which(xx$gene_id %in% alexandra_genes))),
rep(2, length(which(xx$gene_id %in% alexandra_genes))),
rep(3, length(which(xx$gene_id %in% alexandra_genes))),
rep(4, length(which(xx$gene_id %in% alexandra_genes)))),
data = c(as.vector(xx$summarized_expression[which(xx$gene_id %in% alexandra_genes), 1:4]),
as.vector(rel_abun[which(xx$gene_id %in% alexandra_genes), 1:4])))

data_alexandra %>%
dplyr::filter(., group == "count_data") %>%
ggplot() +
geom_boxplot(aes(x = gene, y = log10(data)), fill = "tan") +
labs(x = "Gene", y = "log10(summarized counts)") +
# facet_grid(. ~ group) +
theme_bw() +
theme(axis.text.y = element_text(size = 24, color = "black"),
axis.text.x = element_text(size = 24, color = "black", angle = 90, hjust = 1, vjust = 0.5),
axis.title = element_text(size = 36, color = "black"),
axis.ticks = element_line(color = "black"))
ggsave(file = "genes_alexandra.pdf",
height = 8,
width = 11,
dpi = 600)



140 changes: 140 additions & 0 deletions R/druid_etal.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
x1 <- results(dds,
contrast = c("Group", "H_E", "M_E"),
pAdjustMethod = "fdr",
cooksCutoff = FALSE)

x2 <- results(dds,
contrast = c("Group", "H_E_AB", "M_E_AB"),
pAdjustMethod = "fdr",
cooksCutoff = FALSE)

x3 <- results(dds,
contrast = c("Group", "H_E_AB", "H_E"),
pAdjustMethod = "fdr",
cooksCutoff = FALSE)


# druid on comparisons
x1 <- data_frame(gene_symbol = xx$gene_id,
logFC = as.vector(as.numeric(x1$log2FoldChange)),
p_val = as.vector(as.numeric(x1$pvalue)),
q_val = as.vector(as.numeric(x1$padj)))

g1 <- x1 %>% dplyr::filter(., q_val < 0.05)
z <- AnnotationDbi::select(x = org.Hs.eg.db,
keys = as.character(g1$gene_symbol),
keytype = "SYMBOL",
columns = "ENTREZID")

# d1 <- run_conddr(conddr_data = "/Volumes/HOME/scripts/r/conddr/data/conddr_data_ndrugs=8422_neffects=41412_saveDate=03082018.RData",
# dge_data = cbind(g1$logFC, g1$q_val),
# pvalue_thr = 0.05,
# log2_fold_thr = 1,
# gene_symbols = z$SYMBOL,
# gene_entrez = z$ENTREZID,
# num_random = 10000,
# match_effect = "neg")
#
#
# x2 <- data_frame(gene_symbol = xx$gene_id,
# logFC = as.vector(as.numeric(x2$log2FoldChange)),
# p_val = as.vector(as.numeric(x2$pvalue)),
# q_val = as.vector(as.numeric(x2$padj)))
#
# g2 <- x2 %>% dplyr::filter(., q_val < 0.05)
# z <- AnnotationDbi::select(x = org.Hs.eg.db,
# keys = as.character(g2$gene_symbol),
# keytype = "SYMBOL",
# columns = "ENTREZID")
#
# d2 <- run_conddr(conddr_data = "/Volumes/HOME/scripts/r/conddr/data/conddr_data_ndrugs=8422_neffects=41412_saveDate=03082018.RData",
# dge_data = cbind(g2$logFC, g2$q_val),
# pvalue_thr = 0.05,
# log2_fold_thr = 1,
# gene_symbols = z$SYMBOL,
# gene_entrez = z$ENTREZID,
# num_random = 10000,
# match_effect = "neg")
#
#
# x3 <- data_frame(gene_symbol = xx$gene_id,
# logFC = as.vector(as.numeric(x3$log2FoldChange)),
# p_val = as.vector(as.numeric(x3$pvalue)),
# q_val = as.vector(as.numeric(x3$padj)))
#
# g3 <- x3 %>% dplyr::filter(., q_val < 0.05)
# z <- AnnotationDbi::select(x = org.Hs.eg.db,
# keys = as.character(g3$gene_symbol),
# keytype = "SYMBOL",
# columns = "ENTREZID")
#
# d3 <- run_conddr(conddr_data = "/Volumes/HOME/scripts/r/conddr/data/conddr_data_ndrugs=8422_neffects=41412_saveDate=03082018.RData",
# dge_data = cbind(g3$logFC, g3$q_val),
# pvalue_thr = 0.05,
# log2_fold_thr = 1,
# gene_symbols = z$SYMBOL,
# gene_entrez = z$ENTREZID,
# num_random = 10000,
# match_effect = "neg")

# PLOTS
x1 %>%
ggplot() +
geom_point(aes(x = logFC, y = -log10(q_val)), color = "gray") +
geom_point(data = subset(x1, q_val < 0.01 & logFC > 1), aes(x = logFC, y = -log10(q_val)), color = "red") +
geom_point(data = subset(x1, q_val < 0.01 & logFC < -1), aes(x = logFC, y = -log10(q_val)), color = "red") +
labs(x = "log2(fold change)", y = "-log10(q)", title = "H_E vs M_E") +
theme_bw() +
theme(panel.grid = element_blank(),
axis.text = element_text(size = 12, color = "black"),
axis.title = element_text(size = 24, color = "black"))

x2 %>%
ggplot() +
geom_point(aes(x = logFC, y = -log10(q_val)), color = "gray") +
geom_point(data = subset(x2, q_val < 0.05 & logFC > 1), aes(x = logFC, y = -log10(q_val)), color = "red") +
geom_point(data = subset(x2, q_val < 0.05 & logFC < -1), aes(x = logFC, y = -log10(q_val)), color = "red") +
labs(x = "log2(fold change)", y = "-log10(q)", title = "Hmmet + EHEC vs Mmmet + EHEC") +
theme_bw() +
theme(panel.grid = element_blank(),
axis.text = element_text(size = 24, color = "black"),
axis.title = element_text(size = 24, color = "black"),
title = element_text(size = 24))
ggsave("results/volcano-heab_meab-txp_09122018.pdf", height = 8, width = 11, dpi = 600)

x3 %>%
ggplot() +
geom_point(aes(x = logFC, y = -log10(q_val)), color = "gray") +
geom_point(data = subset(x3, q_val < 0.05 & logFC > 1), aes(x = logFC, y = -log10(q_val)), color = "red") +
geom_point(data = subset(x3, q_val < 0.05 & logFC < -1), aes(x = logFC, y = -log10(q_val)), color = "red") +
labs(x = "log2(fold change)", y = "-log10(q)", title = "Hmmet + EHEC vs Hmmet") +
theme_bw() +
theme(panel.grid = element_blank(),
axis.text = element_text(size = 24, color = "black"),
axis.title = element_text(size = 24, color = "black"),
title = element_text(size = 24))
ggsave("results/volcano-heab_he-txp_09122018.pdf", height = 8, width = 11, dpi = 600)


# d3 %>%
# dplyr::arrange(., desc(score)) %>%
# dplyr::filter(., probability_random < 0.001) %>%
# ggplot() +
# geom_point(aes(x = score, y = fct_reorder(drug_name, score), size = number_matches, color = drug_group)) +
# labs(x = "score", y = NULL) +
# theme_minimal() +
# theme(axis.text = element_text(size = 12, color = "black"),
# axis.title = element_text(size = 24, color = "black"))
# ggsave("results/druid-heab_he-txp_09122018.pdf", height = 8, width = 11, dpi = 600)
#
#
# d2 %>%
# dplyr::arrange(., desc(score)) %>%
# dplyr::slice(1:20) %>%
# ggplot() +
# geom_point(aes(x = score, y = fct_reorder(drug_name, score), size = number_matches, color = drug_group)) +
# labs(x = "score", y = NULL) +
# theme_minimal() +
# theme(axis.text = element_text(size = 12, color = "black"),
# axis.title = element_text(size = 24, color = "black"))
# ggsave("results/druid-heab_meab-txp_09122018.pdf", height = 8, width = 11, dpi = 600)
60 changes: 60 additions & 0 deletions R/enrichment_functions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#' Functions for pathway enrichment
#' These functions will do a pathway enrichment based on tf-idf, similar to the rPEGEOS package I have developed (https://github.com/diogocamacho/rpegeos). Since that package is still under some development, the functions here offer more stability.
cosine_similarity <- function(x, y, tfidf_crossprod_mat)
{
# cs <- crossprod(x,y)/sqrt(crossprod(x) * crossprod(y))

# x1 <- apply(x,1,crossprod)
x2 <- tcrossprod(y) # <-- y is a 1xG vector!!
x3 <- sqrt(tfidf_crossprod_mat * as.vector(x2))

y1 <- x %*% t(y)

cs <- y1 / x3

return(cs)
}

crossprod_matrix <- function(tfidf_matrix)
{
cpm <- apply(tfidf_matrix,1,crossprod) # <-- crossproduct matrix for tfidf
return(cpm)
}

random_sets <- function(number_sets,universe_size,gene_set_size)
{
# rnd_set <- matrix(0,nrow=ncol(target_tfidf),ncol=number_sets)
# universe_size = number of columns in tfidf matrix
# gene_set_size = number of genes that we obtained as differntially expressed

x1 <- matrix(0,nrow=number_sets,ncol=universe_size)
x2 <- t(replicate(number_sets,sample(universe_size,size=gene_set_size,replace=FALSE)))
# yy <- apply(x2,1,function(s) {x1[s] <- 1})

for(i in seq(1,number_sets))
{
# a1 <- matrix(0,ncol=1,nrow=universe_size)
# a1[sample(universe_size,size = gene_set_size,replace = FALSE)] <- 1
# x <- cbind(x,a1)
x1[i,x2[i,]] <- 1
}

# rnd_set <- replicate(n=number_sets,sample(x=ncol(target_tfidf),size=gene_set_size,replace=FALSE))
# return(rnd_set)
return(x1)
}

random_tfidf <- function(random_set,target_tfidf,tfidf_crossprod_mat)
{

rnd_res <- apply(random_set,1,function(x) cosine_similarity(target_tfidf,t(as.matrix(x)),tfidf_crossprod_mat))

return(rnd_res)

}

res_stats <- function(true_results,random_results)
{
res_stats <- sapply(seq_along(true_results), function(x, y, i) length(which(y[i] > x[i, ]))/length(x[i,]), y = as.matrix(true_results) ,x = random_results)
return(res_stats)
}
33 changes: 33 additions & 0 deletions R/gene_summarizing.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
gene_summarizing <- function(expression_data, gene_id) {
x1 <- unique(gene_id)

summ_data <- vector(mode = "list", length = length(x1))

for (i in seq(1, length(x1))) {
x2 <- which(gene_id == x1[i])
if (length(x2) == 1) {
summ_data[[i]] <- expression_data[i, ]
} else {
x3 <- expression_data[x2, ]
summ_data[[i]] <- colSums(x3)
}
}

summ_data <- do.call(what = rbind, args = summ_data)

return(list(gene_id = x1, summarized_expression = summ_data))
}


# x1 <- unique(human_genes$symbol)
# summ <- vector(mode = "list", length = length(x1))
# for (i in seq(1, length(x1))) {
# print(i)
# x2 <- which(human_genes$symbol == x1[i])
# if (length(x2) == 1) {
# summ[[i]] <- human_counts[i, ]
# } else {
# x3 <- human_counts[x2, ]
# summ[[i]] <- colSums(x3)
# }
# }
92 changes: 92 additions & 0 deletions R/human_transcriptomics.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# bioconductor libraries
library(GO.db)
library(reactome.db)
library(DESeq2)
# library(limma)
# library(org.EcK12.eg.db)

# other libraries
library(tidyverse)
library(tidytext)
library(Matrix)
library(gplots)
library(RColorBrewer)
library(readxl)
library(DESeq2)
library(org.Hs.eg.db)
library(DRUID)


# some functions
source("R/enrichment_functions.R")

# rna-seq data
load(file = "data/human-centric_dual_rnaseq_05162018.RData") # <-- human centric data

samples <- hs_data[[3]]
human_counts <- as.matrix(hs_data[[1]])
human_genes <- hs_data[[2]]

# filter out genes for which we have no gene name
nix <- which(is.na(human_genes$symbol))
if (length(nix) != 0) {
human_genes <- human_genes[-nix, ]
human_counts <- human_counts[-nix, ]
}


# remove genes with zero counts
nix <- which(rowSums(human_counts) == 0)

if(length(nix) != 0)
{
human_counts <- human_counts[-nix, ]
human_genes <- human_genes[-nix, ]
}


# summarize gene ids
xx <- gene_summarizing(expression_data = human_counts,
gene_id = human_genes$symbol)


# DESeq2 ----
# E. coli
dds <- DESeqDataSetFromMatrix(countData = xx$summarized_expression[,c(1:8, 12:19)],
colData = samples[c(1:8, 12:19),],
design = ~ Group)

dds <- DESeq(object = dds)

# ec_e_ab_res <- results(dds,contrast=c("Group","H_E_AB","M_E_AB"),alpha=0.05,lfcThreshold = 1)
hc_ab_res <- results(dds,
contrast = c("Group", "H_E_AB", "M_E_AB"),
pAdjustMethod = "fdr",
cooksCutoff = FALSE)

hc_ab_res <- data_frame(gene_symbol = xx$gene_id,
logFC = as.vector(as.numeric(hc_ab_res$log2FoldChange)),
p_val = as.vector(as.numeric(hc_ab_res$pvalue)),
q_val = as.vector(as.numeric(hc_ab_res$padj)))

diff_genes <- hc_ab_res %>% dplyr::filter(., q_val < 0.05)

# get entrez ids for genes
z <- AnnotationDbi::select(x = org.Hs.eg.db,
keys = as.character(diff_genes$gene_symbol),
keytype = "SYMBOL",
columns = "ENTREZID")

hs_inf <- DRUID::concoct(dge_matrix = cbind(diff_genes$logFC, diff_genes$q_val),
tfidf_matrix = cmap_druid$tfidf,
tfidf_crossproduct = cmap_druid$cpm,
num_random = 10000,
druid_direction = "neg",
fold_thr = 1,
pvalue_thr = 0.05,
entrez = z$ENTREZID)

hs_inf <- hs_inf %>%
tibble::add_column(., drug_name = cmap_druid$drugs$name, .before = 1) %>%
tibble::add_column(., concentration = cmap_druid$drugs$concentration, .before = 2) %>%
tibble::add_column(., cell_line = cmap_druid$drugs$cell_line, .before = 3)
Loading

0 comments on commit b00677a

Please sign in to comment.