Add files via upload

midas-wyss · Nov 19, 2019 · b00677a · b00677a
commit b00677a
Show file tree

Hide file tree

Showing 79 changed files with 127,179 additions and 0 deletions.
diff --git a/R/alexandra_genes.R b/R/alexandra_genes.R
@@ -0,0 +1,39 @@
+alexandra_genes <- c("KCNQ1", "KCNQ2", "KCNQ3", "KCNQ4", "KCNQ5", 
+                     "CFTR", "PTGER4", "SLC12A2")
+
+
+rel_abun <- apply(xx$summarized_expression, 2, function(x) x/sum(x))
+
+data_alexandra <- tibble(gene = c(rep(as.character(xx$gene_id[which(xx$gene_id %in% alexandra_genes)]), 4), 
+                                  rep(as.character(xx$gene_id[which(xx$gene_id %in% alexandra_genes)]), 4)),
+                         group = c(rep("count_data", 4 * length(which(xx$gene_id %in% alexandra_genes))), 
+                                   rep("rel_abun", 4 * length(which(xx$gene_id %in% alexandra_genes)))),
+                         replicate = c(rep(1, length(which(xx$gene_id %in% alexandra_genes))),
+                                       rep(2, length(which(xx$gene_id %in% alexandra_genes))),
+                                       rep(3, length(which(xx$gene_id %in% alexandra_genes))),
+                                       rep(4, length(which(xx$gene_id %in% alexandra_genes))),
+                                       rep(1, length(which(xx$gene_id %in% alexandra_genes))),
+                                       rep(2, length(which(xx$gene_id %in% alexandra_genes))),
+                                       rep(3, length(which(xx$gene_id %in% alexandra_genes))),
+                                       rep(4, length(which(xx$gene_id %in% alexandra_genes)))),
+                         data = c(as.vector(xx$summarized_expression[which(xx$gene_id %in% alexandra_genes), 1:4]), 
+                                  as.vector(rel_abun[which(xx$gene_id %in% alexandra_genes), 1:4])))
+
+data_alexandra %>% 
+  dplyr::filter(., group == "count_data") %>%
+  ggplot() + 
+  geom_boxplot(aes(x = gene, y = log10(data)), fill = "tan") +
+  labs(x = "Gene", y = "log10(summarized counts)") +
+  # facet_grid(. ~ group) +
+  theme_bw() + 
+  theme(axis.text.y = element_text(size = 24, color = "black"),
+        axis.text.x = element_text(size = 24, color = "black", angle = 90, hjust = 1, vjust = 0.5),
+        axis.title = element_text(size = 36, color = "black"),
+        axis.ticks = element_line(color = "black"))
+ggsave(file = "genes_alexandra.pdf",
+       height = 8,
+       width = 11,
+       dpi = 600)
+
+
+
diff --git a/R/druid_etal.R b/R/druid_etal.R
@@ -0,0 +1,140 @@
+x1 <- results(dds, 
+                     contrast = c("Group", "H_E", "M_E"),
+                     pAdjustMethod = "fdr",
+                     cooksCutoff = FALSE)
+
+x2 <- results(dds, 
+              contrast = c("Group", "H_E_AB", "M_E_AB"),
+              pAdjustMethod = "fdr",
+              cooksCutoff = FALSE)
+
+x3 <- results(dds, 
+              contrast = c("Group", "H_E_AB", "H_E"),
+              pAdjustMethod = "fdr",
+              cooksCutoff = FALSE)
+
+
+# druid on comparisons
+x1 <- data_frame(gene_symbol = xx$gene_id,
+                        logFC = as.vector(as.numeric(x1$log2FoldChange)),
+                        p_val = as.vector(as.numeric(x1$pvalue)),
+                        q_val = as.vector(as.numeric(x1$padj)))
+
+g1 <- x1 %>% dplyr::filter(., q_val < 0.05)
+z <- AnnotationDbi::select(x = org.Hs.eg.db, 
+                           keys = as.character(g1$gene_symbol), 
+                           keytype = "SYMBOL", 
+                           columns = "ENTREZID")
+
+# d1 <- run_conddr(conddr_data = "/Volumes/HOME/scripts/r/conddr/data/conddr_data_ndrugs=8422_neffects=41412_saveDate=03082018.RData",
+#                      dge_data = cbind(g1$logFC, g1$q_val),
+#                      pvalue_thr = 0.05,
+#                      log2_fold_thr = 1,
+#                      gene_symbols = z$SYMBOL,
+#                      gene_entrez = z$ENTREZID,
+#                      num_random = 10000,
+#                      match_effect = "neg")
+# 
+# 
+# x2 <- data_frame(gene_symbol = xx$gene_id,
+#                  logFC = as.vector(as.numeric(x2$log2FoldChange)),
+#                  p_val = as.vector(as.numeric(x2$pvalue)),
+#                  q_val = as.vector(as.numeric(x2$padj)))
+# 
+# g2 <- x2 %>% dplyr::filter(., q_val < 0.05)
+# z <- AnnotationDbi::select(x = org.Hs.eg.db, 
+#                            keys = as.character(g2$gene_symbol), 
+#                            keytype = "SYMBOL", 
+#                            columns = "ENTREZID")
+# 
+# d2 <- run_conddr(conddr_data = "/Volumes/HOME/scripts/r/conddr/data/conddr_data_ndrugs=8422_neffects=41412_saveDate=03082018.RData",
+#                  dge_data = cbind(g2$logFC, g2$q_val),
+#                  pvalue_thr = 0.05,
+#                  log2_fold_thr = 1,
+#                  gene_symbols = z$SYMBOL,
+#                  gene_entrez = z$ENTREZID,
+#                  num_random = 10000,
+#                  match_effect = "neg")
+# 
+# 
+# x3 <- data_frame(gene_symbol = xx$gene_id,
+#                  logFC = as.vector(as.numeric(x3$log2FoldChange)),
+#                  p_val = as.vector(as.numeric(x3$pvalue)),
+#                  q_val = as.vector(as.numeric(x3$padj)))
+# 
+# g3 <- x3 %>% dplyr::filter(., q_val < 0.05)
+# z <- AnnotationDbi::select(x = org.Hs.eg.db, 
+#                            keys = as.character(g3$gene_symbol), 
+#                            keytype = "SYMBOL", 
+#                            columns = "ENTREZID")
+# 
+# d3 <- run_conddr(conddr_data = "/Volumes/HOME/scripts/r/conddr/data/conddr_data_ndrugs=8422_neffects=41412_saveDate=03082018.RData",
+#                  dge_data = cbind(g3$logFC, g3$q_val),
+#                  pvalue_thr = 0.05,
+#                  log2_fold_thr = 1,
+#                  gene_symbols = z$SYMBOL,
+#                  gene_entrez = z$ENTREZID,
+#                  num_random = 10000,
+#                  match_effect = "neg")
+
+# PLOTS
+x1 %>% 
+  ggplot() + 
+  geom_point(aes(x = logFC, y = -log10(q_val)), color = "gray") + 
+  geom_point(data = subset(x1, q_val < 0.01 & logFC > 1), aes(x = logFC, y = -log10(q_val)), color = "red") + 
+  geom_point(data = subset(x1, q_val < 0.01 & logFC < -1), aes(x = logFC, y = -log10(q_val)), color = "red") + 
+  labs(x = "log2(fold change)", y = "-log10(q)", title = "H_E vs M_E") + 
+  theme_bw() + 
+  theme(panel.grid = element_blank(), 
+        axis.text = element_text(size = 12, color = "black"),
+        axis.title = element_text(size = 24, color = "black"))
+
+x2 %>% 
+  ggplot() + 
+  geom_point(aes(x = logFC, y = -log10(q_val)), color = "gray") + 
+  geom_point(data = subset(x2, q_val < 0.05 & logFC > 1), aes(x = logFC, y = -log10(q_val)), color = "red") + 
+  geom_point(data = subset(x2, q_val < 0.05 & logFC < -1), aes(x = logFC, y = -log10(q_val)), color = "red") + 
+  labs(x = "log2(fold change)", y = "-log10(q)", title = "Hmmet + EHEC vs Mmmet + EHEC") + 
+  theme_bw() + 
+  theme(panel.grid = element_blank(), 
+        axis.text = element_text(size = 24, color = "black"),
+        axis.title = element_text(size = 24, color = "black"),
+        title = element_text(size = 24))
+ggsave("results/volcano-heab_meab-txp_09122018.pdf", height = 8, width = 11, dpi = 600)
+
+x3 %>% 
+  ggplot() + 
+  geom_point(aes(x = logFC, y = -log10(q_val)), color = "gray") + 
+  geom_point(data = subset(x3, q_val < 0.05 & logFC > 1), aes(x = logFC, y = -log10(q_val)), color = "red") + 
+  geom_point(data = subset(x3, q_val < 0.05 & logFC < -1), aes(x = logFC, y = -log10(q_val)), color = "red") + 
+  labs(x = "log2(fold change)", y = "-log10(q)", title = "Hmmet + EHEC vs Hmmet") + 
+  theme_bw() + 
+  theme(panel.grid = element_blank(), 
+        axis.text = element_text(size = 24, color = "black"),
+        axis.title = element_text(size = 24, color = "black"),
+        title = element_text(size = 24))
+ggsave("results/volcano-heab_he-txp_09122018.pdf", height = 8, width = 11, dpi = 600)
+
+
+# d3 %>%  
+#   dplyr::arrange(., desc(score)) %>% 
+#   dplyr::filter(., probability_random < 0.001) %>% 
+#   ggplot() + 
+#   geom_point(aes(x = score, y = fct_reorder(drug_name, score), size = number_matches, color = drug_group)) +
+#   labs(x = "score", y = NULL) +
+#   theme_minimal() + 
+#   theme(axis.text = element_text(size = 12, color = "black"),
+#         axis.title = element_text(size = 24, color = "black"))
+# ggsave("results/druid-heab_he-txp_09122018.pdf", height = 8, width = 11, dpi = 600)
+# 
+# 
+# d2 %>%  
+#   dplyr::arrange(., desc(score)) %>% 
+#   dplyr::slice(1:20) %>% 
+#   ggplot() + 
+#   geom_point(aes(x = score, y = fct_reorder(drug_name, score), size = number_matches, color = drug_group)) +
+#   labs(x = "score", y = NULL) +
+#   theme_minimal() + 
+#   theme(axis.text = element_text(size = 12, color = "black"),
+#         axis.title = element_text(size = 24, color = "black"))
+# ggsave("results/druid-heab_meab-txp_09122018.pdf", height = 8, width = 11, dpi = 600)
diff --git a/R/enrichment_functions.R b/R/enrichment_functions.R
@@ -0,0 +1,60 @@
+#' Functions for pathway enrichment
+#' These functions will do a pathway enrichment based on tf-idf, similar to the rPEGEOS package I have developed (https://github.com/diogocamacho/rpegeos).  Since that package is still under some development, the functions here offer more stability.
+cosine_similarity <- function(x, y, tfidf_crossprod_mat)
+{
+  # cs <- crossprod(x,y)/sqrt(crossprod(x) * crossprod(y))
+
+  # x1 <- apply(x,1,crossprod)
+  x2 <- tcrossprod(y) # <-- y is a 1xG vector!!
+  x3 <- sqrt(tfidf_crossprod_mat * as.vector(x2))
+
+  y1 <- x %*% t(y)
+
+  cs <- y1 / x3
+
+  return(cs)
+}
+
+crossprod_matrix <- function(tfidf_matrix)
+{
+  cpm <- apply(tfidf_matrix,1,crossprod) # <-- crossproduct matrix for tfidf
+  return(cpm)
+}
+
+random_sets <- function(number_sets,universe_size,gene_set_size)
+{
+  # rnd_set <- matrix(0,nrow=ncol(target_tfidf),ncol=number_sets)
+  # universe_size = number of columns in tfidf matrix
+  # gene_set_size = number of genes that we obtained as differntially expressed
+
+  x1 <- matrix(0,nrow=number_sets,ncol=universe_size)
+  x2 <- t(replicate(number_sets,sample(universe_size,size=gene_set_size,replace=FALSE)))
+  # yy <- apply(x2,1,function(s) {x1[s] <- 1})
+
+  for(i in seq(1,number_sets))
+  {
+    # a1 <- matrix(0,ncol=1,nrow=universe_size)
+    # a1[sample(universe_size,size = gene_set_size,replace = FALSE)] <- 1
+    # x <- cbind(x,a1)
+    x1[i,x2[i,]] <- 1
+  }
+
+  # rnd_set <- replicate(n=number_sets,sample(x=ncol(target_tfidf),size=gene_set_size,replace=FALSE))
+  # return(rnd_set)
+  return(x1)
+}
+
+random_tfidf <- function(random_set,target_tfidf,tfidf_crossprod_mat)
+{
+
+  rnd_res <- apply(random_set,1,function(x) cosine_similarity(target_tfidf,t(as.matrix(x)),tfidf_crossprod_mat))
+
+  return(rnd_res)
+
+}
+
+res_stats <- function(true_results,random_results)
+{
+  res_stats <- sapply(seq_along(true_results), function(x, y, i) length(which(y[i] > x[i, ]))/length(x[i,]), y = as.matrix(true_results) ,x = random_results)
+  return(res_stats)
+}
diff --git a/R/gene_summarizing.R b/R/gene_summarizing.R
@@ -0,0 +1,33 @@
+gene_summarizing <- function(expression_data, gene_id) {
+  x1 <- unique(gene_id)
+
+  summ_data <- vector(mode = "list", length = length(x1))
+
+  for (i in seq(1, length(x1))) {
+    x2 <- which(gene_id == x1[i])
+    if (length(x2) == 1) {
+      summ_data[[i]] <- expression_data[i, ]
+    } else {
+      x3 <- expression_data[x2, ]
+      summ_data[[i]] <- colSums(x3)
+    }
+  }
+
+  summ_data <- do.call(what = rbind, args = summ_data)
+
+  return(list(gene_id = x1, summarized_expression = summ_data))
+}
+
+
+# x1 <- unique(human_genes$symbol)
+# summ <- vector(mode = "list", length = length(x1))
+# for (i in seq(1, length(x1))) {
+#   print(i)
+#   x2 <- which(human_genes$symbol == x1[i])
+#   if (length(x2) == 1) {
+#     summ[[i]] <- human_counts[i, ]
+#   } else {
+#     x3 <- human_counts[x2, ]
+#     summ[[i]] <- colSums(x3)
+#   }
+# }
diff --git a/R/human_transcriptomics.R b/R/human_transcriptomics.R
@@ -0,0 +1,92 @@
+# bioconductor libraries
+library(GO.db)
+library(reactome.db)
+library(DESeq2)
+# library(limma)
+# library(org.EcK12.eg.db)
+
+# other libraries
+library(tidyverse)
+library(tidytext)
+library(Matrix)
+library(gplots)
+library(RColorBrewer)
+library(readxl)
+library(DESeq2)
+library(org.Hs.eg.db)
+library(DRUID)
+
+
+# some functions
+source("R/enrichment_functions.R")
+
+# rna-seq data 
+load(file = "data/human-centric_dual_rnaseq_05162018.RData") # <-- human centric data
+
+samples <- hs_data[[3]]
+human_counts <- as.matrix(hs_data[[1]])
+human_genes <- hs_data[[2]]
+
+# filter out genes for which we have no gene name
+nix <- which(is.na(human_genes$symbol))
+if (length(nix) != 0) {
+  human_genes <- human_genes[-nix, ]
+  human_counts <- human_counts[-nix, ]
+}
+
+
+# remove genes with zero counts
+nix <- which(rowSums(human_counts) == 0)
+
+if(length(nix) != 0)
+{
+  human_counts <- human_counts[-nix, ]
+  human_genes <- human_genes[-nix, ]
+}
+
+
+# summarize gene ids
+xx <- gene_summarizing(expression_data = human_counts, 
+                       gene_id = human_genes$symbol)
+
+
+# DESeq2 ----
+# E. coli
+dds <- DESeqDataSetFromMatrix(countData = xx$summarized_expression[,c(1:8, 12:19)],
+                              colData = samples[c(1:8, 12:19),],
+                              design = ~ Group)
+
+dds <- DESeq(object = dds)
+
+# ec_e_ab_res <- results(dds,contrast=c("Group","H_E_AB","M_E_AB"),alpha=0.05,lfcThreshold = 1)
+hc_ab_res <- results(dds, 
+                     contrast = c("Group", "H_E_AB", "M_E_AB"),
+                     pAdjustMethod = "fdr",
+                     cooksCutoff = FALSE)
+
+hc_ab_res <- data_frame(gene_symbol = xx$gene_id,
+                        logFC = as.vector(as.numeric(hc_ab_res$log2FoldChange)),
+                        p_val = as.vector(as.numeric(hc_ab_res$pvalue)),
+                        q_val = as.vector(as.numeric(hc_ab_res$padj)))
+
+diff_genes <- hc_ab_res %>% dplyr::filter(., q_val < 0.05)
+
+# get entrez ids for genes
+z <- AnnotationDbi::select(x = org.Hs.eg.db, 
+                           keys = as.character(diff_genes$gene_symbol), 
+                           keytype = "SYMBOL", 
+                           columns = "ENTREZID")
+
+hs_inf <- DRUID::concoct(dge_matrix = cbind(diff_genes$logFC, diff_genes$q_val), 
+                         tfidf_matrix = cmap_druid$tfidf, 
+                         tfidf_crossproduct = cmap_druid$cpm, 
+                         num_random = 10000, 
+                         druid_direction = "neg", 
+                         fold_thr = 1, 
+                         pvalue_thr = 0.05, 
+                         entrez = z$ENTREZID)
+
+hs_inf <- hs_inf %>% 
+  tibble::add_column(., drug_name = cmap_druid$drugs$name, .before = 1) %>%
+  tibble::add_column(., concentration = cmap_druid$drugs$concentration, .before = 2) %>%
+  tibble::add_column(., cell_line = cmap_druid$drugs$cell_line, .before = 3)