AlexsLemonade
diff --git a/‎analyses/mutational-signatures/03-de_novo_range_of_nsignatures.sh
+46-16 b/‎analyses/mutational-signatures/03-de_novo_range_of_nsignatures.sh
+46-16
diff --git a/‎analyses/mutational-signatures/04-analyze_de_novo.Rmd
+213 b/‎analyses/mutational-signatures/04-analyze_de_novo.Rmd
+213
diff --git a/‎analyses/mutational-signatures/04-analyze_de_novo.html
+1,958 b/‎analyses/mutational-signatures/04-analyze_de_novo.html
+1,958
diff --git a/‎analyses/mutational-signatures/plots/denovo/extraction/seed_1_model_multinomial.png
29.1 KB b/‎analyses/mutational-signatures/plots/denovo/extraction/seed_1_model_multinomial.png
29.1 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/extraction/seed_1_model_poisson.png
29.1 KB b/‎analyses/mutational-signatures/plots/denovo/extraction/seed_1_model_poisson.png
29.1 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/extraction/seed_2_model_multinomial.png
29.7 KB b/‎analyses/mutational-signatures/plots/denovo/extraction/seed_2_model_multinomial.png
29.7 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/extraction/seed_2_model_poisson.png
29.8 KB b/‎analyses/mutational-signatures/plots/denovo/extraction/seed_2_model_poisson.png
29.8 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/extraction/seed_3_model_multinomial.png
29.1 KB b/‎analyses/mutational-signatures/plots/denovo/extraction/seed_3_model_multinomial.png
29.1 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/extraction/seed_3_model_poisson.png
29.2 KB b/‎analyses/mutational-signatures/plots/denovo/extraction/seed_3_model_poisson.png
29.2 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/extraction/seed_4_model_multinomial.png
29 KB b/‎analyses/mutational-signatures/plots/denovo/extraction/seed_4_model_multinomial.png
29 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/extraction/seed_4_model_poisson.png
29.1 KB b/‎analyses/mutational-signatures/plots/denovo/extraction/seed_4_model_poisson.png
29.1 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/extraction/seed_5_model_multinomial.png
29.2 KB b/‎analyses/mutational-signatures/plots/denovo/extraction/seed_5_model_multinomial.png
29.2 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/extraction/seed_5_model_poisson.png
29.1 KB b/‎analyses/mutational-signatures/plots/denovo/extraction/seed_5_model_poisson.png
29.1 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_1_model_multinomial.pdf
-4.84 KB b/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_1_model_multinomial.pdf
-4.84 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_1_model_poisson.pdf
-4.84 KB b/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_1_model_poisson.pdf
-4.84 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_2_model_multinomial.pdf
-4.83 KB b/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_2_model_multinomial.pdf
-4.83 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_2_model_poisson.pdf
-4.84 KB b/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_2_model_poisson.pdf
-4.84 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_3_model_multinomial.pdf
-4.84 KB b/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_3_model_multinomial.pdf
-4.84 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_3_model_poisson.pdf
-4.83 KB b/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_3_model_poisson.pdf
-4.83 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_4_model_multinomial.pdf
-4.83 KB b/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_4_model_multinomial.pdf
-4.83 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_4_model_poisson.pdf
-4.83 KB b/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_4_model_poisson.pdf
-4.83 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_5_model_multinomial.pdf
-4.84 KB b/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_5_model_multinomial.pdf
-4.84 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_5_model_poisson.pdf
-4.84 KB b/‎analyses/mutational-signatures/plots/denovo/gof/gof_seed_5_model_poisson.pdf
-4.84 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/seed_1_model_multinomial.png
30.5 KB b/‎analyses/mutational-signatures/plots/denovo/gof/seed_1_model_multinomial.png
30.5 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/seed_1_model_poisson.png
30.6 KB b/‎analyses/mutational-signatures/plots/denovo/gof/seed_1_model_poisson.png
30.6 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/seed_2_model_multinomial.png
30.5 KB b/‎analyses/mutational-signatures/plots/denovo/gof/seed_2_model_multinomial.png
30.5 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/seed_2_model_poisson.png
30.2 KB b/‎analyses/mutational-signatures/plots/denovo/gof/seed_2_model_poisson.png
30.2 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/seed_3_model_multinomial.png
30.5 KB b/‎analyses/mutational-signatures/plots/denovo/gof/seed_3_model_multinomial.png
30.5 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/seed_3_model_poisson.png
30.5 KB b/‎analyses/mutational-signatures/plots/denovo/gof/seed_3_model_poisson.png
30.5 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/seed_4_model_multinomial.png
30.5 KB b/‎analyses/mutational-signatures/plots/denovo/gof/seed_4_model_multinomial.png
30.5 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/seed_4_model_poisson.png
30.6 KB b/‎analyses/mutational-signatures/plots/denovo/gof/seed_4_model_poisson.png
30.6 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/seed_5_model_multinomial.png
30.4 KB b/‎analyses/mutational-signatures/plots/denovo/gof/seed_5_model_multinomial.png
30.4 KB
diff --git a/‎analyses/mutational-signatures/plots/denovo/gof/seed_5_model_poisson.png
30.5 KB b/‎analyses/mutational-signatures/plots/denovo/gof/seed_5_model_poisson.png
30.5 KB
diff --git a/‎analyses/mutational-signatures/results/de_novo_exposures.RDS
5.04 MB b/‎analyses/mutational-signatures/results/de_novo_exposures.RDS
5.04 MB
diff --git a/‎analyses/mutational-signatures/results/de_novo_signatures.RDS
595 KB b/‎analyses/mutational-signatures/results/de_novo_signatures.RDS
595 KB
diff --git a/‎analyses/mutational-signatures/run_mutational_signatures.sh
+16-3 b/‎analyses/mutational-signatures/run_mutational_signatures.sh
+16-3
diff --git a/‎analyses/mutational-signatures/scripts/de_novo_signature_extraction.R
+1-1 b/‎analyses/mutational-signatures/scripts/de_novo_signature_extraction.R
+1-1
@@ -12,13 +12,26 @@ cd "$(dirname "${BASH_SOURCE[0]}")"
 # In CI we'll run an abbreviated version of the de novo signatures extraction
 QUICK_MUTSIGS=${QUICK_MUTSIGS:-0}
 
+# Which analysis are we running?
+ANALYSIS=${ANALYSIS:-1}  # Expect 0 for GOF and 1 for Extraction
+
+
 scratch_dir=../../scratch/mutational-signatures
-denovo_plot_dir=plots/denovo/gof
-denovo_results_dir=${scratch_dir}/gof
 
-# Directories to hold the goodness-of-fit plots and results
-mkdir -p $denovo_plot_dir
-mkdir -p $denovo_results_dir
+# For initial GOF analysis to figure out how many k
+gof_plot_dir=plots/denovo/gof
+gof_result_dir=${scratch_dir}/gof
+
+# For extraction once a limited range of k is assessed with GOF
+extraction_plot_dir=plots/denovo/extraction
+extraction_result_dir=${scratch_dir}/extraction
+
+
+# Directories to hold all plots and results
+mkdir -p $gof_plot_dir
+mkdir -p $gof_result_dir
+mkdir -p $extraction_plot_dir
+mkdir -p $extraction_result_dir
 
 # The MAF file we'll use is going to WGS samples only
 maf_file=${scratch_dir}/pbta-snv-consensus-wgs.tsv.gz
@@ -36,23 +49,40 @@ then
     --num_iterations 10 \
     --seed 42 
 else
-  for model in multinomial poisson; do
-    for seed in {1..5}; do
+  # GOF
+  if [[ $ANALYSIS -eq "0" ]] 
+  then
+    FLOOR=2
+    CEIL=8
+    ITER=1000
+    plot_dir=${gof_plot_dir}
+    result_dir=${gof_result_dir}
+  fi
+  # Extraction
+  if [[ $ANALYSIS -eq "1" ]] 
+  then
+    FLOOR=2
+    CEIL=5
+    ITER=3000
+    plot_dir=${extraction_plot_dir}
+    result_dir=${extraction_result_dir}
+  fi  
 
+  # Run sigfit with params
+  for model in poisson multinomial; do
+    for seed in {1..5}; do
        # De novo signatures extraction
        Rscript --vanilla \
          scripts/de_novo_signature_extraction.R \
          --maf_file ${maf_file} \
-         --nsignatures_floor 2 \
-         --nsignatures_ceiling 8 \
-         --num_iterations 1000 \
+         --nsignatures_floor ${FLOOR} \
+         --nsignatures_ceiling ${CEIL} \
+         --num_iterations ${ITER} \
          --model ${model} \
          --seed ${seed} \
-         --plot_output "${denovo_plot_dir}/gof_seed_${seed}_model_${model}.pdf" \
-         --output_file "${denovo_results_dir}/gof_seed_${seed}_model_${model}.RDS"
-      
-
+         --plot_output "${plot_dir}/seed_${seed}_model_${model}.png" \
+         --output_file "${result_dir}/seed_${seed}_model_${model}.RDS"   
     done
   done
-
-fi
+fi  
+  
@@ -0,0 +1,213 @@
+---
+title: "De novo mutational signature extraction from WGS data"
+author: "S. Spielman"
+date: "2021"
+output:
+  html_document:
+    toc: yes
+    df_print: paged
+params:
+  is_ci: 0
+---
+
+
+<br><br><br><br>
+
+#### Packages and paths
+
+
+```{r}
+# Load sigfit and make the pipe available without attaching dplyr
+library(sigfit)
+`%>%` <- dplyr::`%>%`
+
+# Directory holding the de novo extraction RDS files (in scratch)
+path_to_input_rds <- file.path("..", "..", "scratch", "mutational-signatures", "extraction")
+
+# Directory holding the cosine similarity GOF plots
+path_to_cosine <- file.path("plots", "denovo", "extraction")
+
+# Result directory; created below if it does not exist yet
+path_to_results <- "results"
+if (!dir.exists(path_to_results)) {
+  dir.create(path_to_results, recursive = TRUE)
+}
+
+# Files for the de novo signatures and exposures lists (condensed from sigfit output)
+de_novo_signatures_file <- file.path(path_to_results, "de_novo_signatures.RDS")
+de_novo_exposures_file <- file.path(path_to_results, "de_novo_exposures.RDS")
+
+# Load the COSMIC v3 signatures from sigfit and keep their row names
+data("cosmic_signatures_v3")
+cosmic_names <- row.names(cosmic_signatures_v3)
+```
+
+<br><br>
+First, we need to extract the signatures and exposures from the *de novo* extraction RDS files. We save each into its own list, `de_novo_signatures` and `de_novo_exposures`, and write each list to file.
+
+```{r parse_signatures}
+# Collect the de novo extracted signatures and exposures into a single list each. This takes _a few minutes_ to parse.
+# If running in CI, read in lists directly from RDS files. Otherwise, parse to create and save lists.
+
+if (params$is_ci == 1) {
+  # CI: read the condensed lists directly from the saved result files
+  de_novo_signatures <- readr::read_rds(de_novo_signatures_file)
+  de_novo_exposures <- readr::read_rds(de_novo_exposures_file)
+} else {
+  de_novo_signatures <- list()
+  de_novo_exposures <- list()
+  for (this_model in c("poisson", "multinomial")) {
+    for (this_seed in 1:5) {
+      this_name <- glue::glue("seed_{this_seed}_model_{this_model}")
+      filename <- file.path(path_to_input_rds, glue::glue("{this_name}.RDS"))
+      if (!file.exists(filename)) {
+        # Surface gaps instead of silently building incomplete lists
+        warning("Missing de novo extraction result: ", filename, call. = FALSE)
+        next
+      }
+      fit <- readr::read_rds(filename)
+      for (k in 2:5) {
+        sig_name <- glue::glue("{k}_{this_name}")
+        de_novo_signatures[[sig_name]] <-
+          sigfit::retrieve_pars(fit[[k]], par = "signatures")
+        de_novo_exposures[[sig_name]] <-
+          sigfit::retrieve_pars(fit[[k]], par = "exposures")
+      }
+    }
+  }
+  # We save these lists to result files
+  readr::write_rds(de_novo_signatures, de_novo_signatures_file)
+  readr::write_rds(de_novo_exposures, de_novo_exposures_file)
+}
+```
+
+<br><br>
+
+Goodness-of-fit analysis was implicitly performed by `sigfit` during inference using cosine similarity. Associated elbow plots are in `plots/denovo/extraction/`, named for the model and seed. The red-colored dot in these plots represents the selected *k* for the given seed and inference model. Notably, there is substantial sensitivity to starting conditions, both seed and model. **Plots below are shown with `poisson` on top, `multinomial` on bottom**
+
+```{r gof_plots_function}
+# Include the two GOF plots (one per inference model) for the given seed
+show_gof_plots <- function(seed) {
+  poisson_png <- glue::glue("seed_{seed}_model_poisson.png")
+  multinomial_png <- glue::glue("seed_{seed}_model_multinomial.png")
+  # poisson first, multinomial second
+  knitr::include_graphics(file.path(path_to_cosine, c(poisson_png, multinomial_png)))
+}
+```
+
+**Seed 1**
+```{r gof_seed1, out.width="49%", out.height="20%",fig.show='hold',fig.align='center'}
+show_gof_plots(1)
+```
+<br><br>
+**Seed 2**
+```{r gof_seed2, out.width="49%", out.height="20%",fig.show='hold',fig.align='center'}
+show_gof_plots(2)
+```
+**Seed 3**
+```{r gof_seed3, out.width="49%", out.height="20%",fig.show='hold',fig.align='center'}
+show_gof_plots(3)
+```
+**Seed 4**
+```{r gof_seed4, out.width="49%", out.height="20%",fig.show='hold',fig.align='center'}
+show_gof_plots(4)
+```
+**Seed 5**
+```{r gof_seed5, out.width="49%", out.height="20%",fig.show='hold',fig.align='center'}
+show_gof_plots(5)
+```
+
+
+
+<br><br><br>
+  This tibble collects info from the above images:
+  
+```{r gof_k}
+gof <- tibble::tribble(
+  ~model,     ~seed,      ~best_k,
+  # best_k: the k read off the elbow plot for that model and seed
+  "poisson",1,3,
+  "multinomial",1,3,
+  "poisson",2,4,
+  "multinomial",2,3,
+  "poisson",3,4,
+  "multinomial",3,3,
+  "poisson",4,3,
+  "multinomial",4,3,  
+  "poisson",5,3,
+  "multinomial",5,3  
+)
+```
+
+
+
+<br><br>
+  Now, we can determine which signatures were extracted and map back to COSMIC names using cosine similarity to determine matching signatures:
+  
+```{r determine_sigs}
+
+# Find the COSMIC v3 counterparts of a set of de novo signatures.
+# The sorted result is later used to index the COSMIC signature names.
+map_match <- function(x) {
+  # match_signatures is a sigfit function for cosine similarity
+  matched <- match_signatures(x, cosmic_signatures_v3)
+  # Coerce to a bare numeric vector, then sort: order is irrelevant because
+  # we only want to see which COSMIC signatures were matched
+  sort(as.numeric(matched))
+}
+
+cosmic_matches <- purrr::map(de_novo_signatures, map_match)
+```
+
+<br><br>
+  Now, we create a table of results for which COSMIC SBS signatures were extracted. **Full results for each combination of k, model, and random seed:**
+```{r sigs_full}
+# One row per (run, matched COSMIC signature). Run names look like
+# "{k}_seed_{seed}_model_{model}".
+unnested_sigs <- tibble::tibble(
+  name = names(cosmic_matches),
+  # unname() the whole list; no need for a fragile 1:length() subset
+  cosmic_sig_index = unname(cosmic_matches)
+) %>%
+  tidyr::unnest(cosmic_sig_index) %>%
+  dplyr::mutate(cosmic_sig = cosmic_names[cosmic_sig_index])
+
+# Collapse to one comma-separated string of SBS numbers per run, then split
+# the run name into its seed, k, and model parts
+full_sigs <- unnested_sigs %>%
+  dplyr::group_by(name) %>%
+  dplyr::summarize(cosmic_sigs = toString(cosmic_sig)) %>%
+  dplyr::mutate(
+    cosmic_sigs = stringr::str_replace_all(cosmic_sigs, "SBS", ""),
+    seed = stringr::str_replace_all(stringr::str_extract(name, "_\\d_"), "_", ""),
+    k = stringr::str_extract(name, "^\\d"),
+    model = stringr::str_extract(name, "[a-z]+$")
+  ) %>%
+  dplyr::select(-name)
+
+# Wide table: one column per seed
+full_sigs %>%
+  tidyr::spread(seed, cosmic_sigs) %>%
+  knitr::kable()
+```
+
+<br><br> 
+  Although not very robust to starting conditions, the results are robust to the inference model, so we will just look at `poisson` going forward. **Now, here's the table specifically for the results whose k was selected by cosine similarity goodness-of-fit**, among *k in [2:5]* for each given random seed, using 3000 iterations:
+
+```{r sigs_selected}
+gof %>%
+  dplyr::rename(k = best_k) %>%
+  # full_sigs holds k and seed as character (parsed from run names)
+  dplyr::mutate(k = as.character(k), seed = as.character(seed)) %>%
+  # Spell out the join keys so the match never depends on column-name overlap
+  dplyr::left_join(full_sigs, by = c("model", "seed", "k")) %>%
+  dplyr::filter(model == "poisson") %>%
+  dplyr::select(-model) %>%
+  dplyr::distinct() %>%
+  knitr::kable()
+```
+
+<br><br><br>
+
+Clearly, this analysis is extremely sensitive to starting conditions. Even so, we see a few repeated SBS signatures coming up:
+
+
++ **[SBS14](https://cancer.sanger.ac.uk/signatures/sbs/sbs14/)** "is one of seven mutational signatures associated with defective DNA mismatch repair and microsatellite instability (MSI) and is often found in the same samples as other MSI associated signatures: SBS6, SBS15, SBS20, SBS21, SBS26 and SBS44."
++ **[SBS15](https://cancer.sanger.ac.uk/signatures/sbs/sbs15/)** "is one of seven mutational signatures associated with defective DNA mismatch repair and microsatellite instability (MSI) and is often found in the same samples as other MSI associated signatures: SBS6, SBS14, SBS20, SBS21, SBS26 and SBS44."
++ [SBS23](https://cancer.sanger.ac.uk/signatures/sbs/sbs23/) has unknown aetiology.
++ [SBS40](https://cancer.sanger.ac.uk/signatures/sbs/sbs40/) is correlated with age (not of clear relevance to pediatric tumors).
++ [SBS42](https://cancer.sanger.ac.uk/signatures/sbs/sbs42/) is correlated with occupational exposure to haloalkanes (seems unlikely to be relevant for pediatric tumors?).
+
+
+**The two bolded** signatures are commonly associated with one another due to putative shared mechanisms of "defective DNA mismatch repair and microsatellite instability (MSI)."
@@ -12,12 +12,25 @@ cd "$(dirname "${BASH_SOURCE[0]}")"
 # In CI we'll run an abbreviated version of the de novo signatures extraction
 ABBREVIATED_MUTSIGS=${OPENPBTA_QUICK_MUTSIGS:-0}
 
+
 # Run the mutational signatures analysis using existing signatures
 Rscript -e "rmarkdown::render('01-known_signatures.Rmd', clean = TRUE)"
 
 # Split up the consensus MAF files by experimental strategy (writes to scratch)
 Rscript --vanilla 02-split_experimental_strategy.R
 
-# Run the shell script that is for determining the number of signatures to use
-# with a low number of iterations
-QUICK_MUTSIGS=$ABBREVIATED_MUTSIGS bash 03-de_novo_range_of_nsignatures.sh
+# Run the shell script for determining the number of signatures to use
+# with a low number of iterations if run in CI
+# argument 0 --> run denovo goodness of fit
+QUICK_MUTSIGS=$ABBREVIATED_MUTSIGS ANALYSIS=0 bash 03-de_novo_range_of_nsignatures.sh
+
+# Run the shell script to perform de novo extraction 
+# with a low number of iterations if run in CI
+# argument 1 --> run more robust denovo extraction
+QUICK_MUTSIGS=$ABBREVIATED_MUTSIGS ANALYSIS=1 bash 03-de_novo_range_of_nsignatures.sh
+
+# Process results from de novo extraction 
+Rscript -e "rmarkdown::render('04-analyze_de_novo.Rmd', clean = TRUE, params = list(is_ci = ${ABBREVIATED_MUTSIGS}))"
+
+
+# Next steps: Fitting the 8 known CNS signatures
@@ -127,7 +127,7 @@ extracted_signatures <- sigfit::extract_signatures(
 # If user specifies a plot output, save the goodness-of-fit plot at that 
 # location
 if (!is.null(opt$plot_output)) {
-  pdf(opt$plot_output, width = 7, height = 7)
+  png(opt$plot_output, 640, 480)
   sigfit::plot_gof(extracted_signatures)
   dev.off()
 }
Original file line number	Diff line number	Diff line change
`@@ -127,7 +127,7 @@ extracted_signatures <- sigfit::extract_signatures(`
`127`	`127`	`# If user specifies a plot output, save the goodness-of-fit plot at that`
`128`	`128`	`# location`
`129`	`129`	`if (!is.null(opt$plot_output)) {`
`130`		`- pdf(opt$plot_output, width = 7, height = 7)`
	`130`	`+ png(opt$plot_output, 640, 480)`
`131`	`131`	`sigfit::plot_gof(extracted_signatures)`
`132`	`132`	`dev.off()`
`133`	`133`	`}`