Skip to content
This repository was archived by the owner on Jun 21, 2023. It is now read-only.

Commit 6bf301e

Browse files
Stephaniejaclyn-taroni
Stephanie
andauthored
De novo mutational signature extraction, PR 2/n (#1018)
* Setting up de novo script to benchmark on workstation * Scripts used to benchmark de novo extraction to determine optimal k and robustness to seed and inference model * goodness-of-fit plots produced by de novo benchmarking code * Revived accidentally committed changes to and similarly accidental plot * Try accomplishing the same thing by altering existing code * Apply suggestions from code review * Apply suggestions from code review * Missing EOL backslashes * Also need to make results dir * Spell denovo correctly * Removed previous scripts * Initial scripts for running denovo extraction, committing to run on workstation * script for denovo extraction - may wish to combine 03 and 04 * Combined 03-04 with additional argument for whether running GOF or extraction * Initial script for analysis, need to run on workstation * gt dependency * added 04 to run bash * analysis of denovo filled in * correct denovo rds file * Presenting results without gt as plain table. Could be prettier with kable as needed. * Do not try to install gt * Fixed mkdir error with wrong variable * wrong script name.. * Wrong tidyr version syntax with unnest() * Path typo fixed * Accidental typo: changed 1000 to 3000. This commit reverts back to correct 3000 * straggling exit * De novo result files and kabled table in Rmd * No kableExtra - version not high enough for features I'd want * Update analyses/mutational-signatures/03-de_novo_range_of_nsignatures.sh Co-authored-by: Jaclyn Taroni <[email protected]> * Update analyses/mutational-signatures/04-analyze_de_novo.Rmd Co-authored-by: Jaclyn Taroni <[email protected]> * Added GOF plots from extraction which had not been added. Updated 04 Rmd with review comments * PNG cosine plots for module * Attempting build fix by adding ci parameter to Rmd * Other debugging attempt, split up chunk to see where error is in CI * if statement was backwards 0/1 Co-authored-by: Jaclyn Taroni <[email protected]>
1 parent e6c1452 commit 6bf301e

37 files changed

+2234
-20
lines changed

analyses/mutational-signatures/03-de_novo_range_of_nsignatures.sh

+46-16
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,26 @@ cd "$(dirname "${BASH_SOURCE[0]}")"
1212
# In CI we'll run an abbreviated version of the de novo signatures extraction
1313
QUICK_MUTSIGS=${QUICK_MUTSIGS:-0}
1414

15+
# Which analysis are we running?
16+
ANALYSIS=${ANALYSIS:-1} # Expect 0 for GOF and 1 for Extraction
17+
18+
1519
scratch_dir=../../scratch/mutational-signatures
16-
denovo_plot_dir=plots/denovo/gof
17-
denovo_results_dir=${scratch_dir}/gof
1820

19-
# Directories to hold the goodness-of-fit plots and results
20-
mkdir -p $denovo_plot_dir
21-
mkdir -p $denovo_results_dir
21+
# For initial GOF analysis to figure out how many k
22+
gof_plot_dir=plots/denovo/gof
23+
gof_result_dir=${scratch_dir}/gof
24+
25+
# For extraction once a limited range of k is assessed with GOF
26+
extraction_plot_dir=plots/denovo/extraction
27+
extraction_result_dir=${scratch_dir}/extraction
28+
29+
30+
# Directories to hold all plots and results
31+
mkdir -p $gof_plot_dir
32+
mkdir -p $gof_result_dir
33+
mkdir -p $extraction_plot_dir
34+
mkdir -p $extraction_result_dir
2235

2336
# The MAF file we'll use is going to WGS samples only
2437
maf_file=${scratch_dir}/pbta-snv-consensus-wgs.tsv.gz
@@ -36,23 +49,40 @@ then
3649
--num_iterations 10 \
3750
--seed 42
3851
else
39-
for model in multinomial poisson; do
40-
for seed in {1..5}; do
52+
# GOF
53+
if [[ $ANALYSIS -eq "0" ]]
54+
then
55+
FLOOR=2
56+
CEIL=8
57+
ITER=1000
58+
plot_dir=${gof_plot_dir}
59+
result_dir=${gof_result_dir}
60+
fi
61+
# Extraction
62+
if [[ $ANALYSIS -eq "1" ]]
63+
then
64+
FLOOR=2
65+
CEIL=5
66+
ITER=3000
67+
plot_dir=${extraction_plot_dir}
68+
result_dir=${extraction_result_dir}
69+
fi
4170

71+
# Run sigfit with params
72+
for model in poisson multinomial; do
73+
for seed in {1..5}; do
4274
# De novo signatures extraction
4375
Rscript --vanilla \
4476
scripts/de_novo_signature_extraction.R \
4577
--maf_file ${maf_file} \
46-
--nsignatures_floor 2 \
47-
--nsignatures_ceiling 8 \
48-
--num_iterations 1000 \
78+
--nsignatures_floor ${FLOOR} \
79+
--nsignatures_ceiling ${CEIL} \
80+
--num_iterations ${ITER} \
4981
--model ${model} \
5082
--seed ${seed} \
51-
--plot_output "${denovo_plot_dir}/gof_seed_${seed}_model_${model}.pdf" \
52-
--output_file "${denovo_results_dir}/gof_seed_${seed}_model_${model}.RDS"
53-
54-
83+
--plot_output "${plot_dir}/seed_${seed}_model_${model}.png" \
84+
--output_file "${result_dir}/seed_${seed}_model_${model}.RDS"
5585
done
5686
done
57-
58-
fi
87+
fi
88+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
---
2+
title: "De novo mutational signature extraction from WGS data"
3+
author: "S. Spielman"
4+
date: "2021"
5+
output:
6+
html_document:
7+
toc: yes
8+
df_print: paged
9+
params:
10+
is_ci: 0
11+
---
12+
13+
14+
<br><br><br><br>
15+
16+
#### Packages and paths
17+
18+
19+
```{r}
20+
# Load libraries
21+
library(sigfit)
22+
`%>%` <- dplyr::`%>%`
23+
24+
# Path to input data
25+
path_to_input_rds <- file.path("..", "..", "scratch", "mutational-signatures", "extraction")
26+
27+
# Path to consine similarity GOF plots
28+
path_to_cosine <- file.path("plots", "denovo", "extraction")
29+
30+
# Result path
31+
path_to_results <- "results"
32+
if (!dir.exists(path_to_results)) {
33+
dir.create(path_to_results, recursive = TRUE)
34+
}
35+
36+
# De novo signatures and exposures list (condensed from sigfit output) file
37+
de_novo_signatures_file <- file.path(path_to_results, "de_novo_signatures.RDS")
38+
de_novo_exposures_file <- file.path(path_to_results, "de_novo_exposures.RDS")
39+
40+
# Load cosmic data from sigfit
41+
data("cosmic_signatures_v3")
42+
cosmic_names <- row.names(cosmic_signatures_v3)
43+
```
44+
45+
<br><br>
46+
First, we need to extract the signatures and exposures from the *de novo* extraction RDS files. We save each into a list `de_novo_signatures` and `de_novo_extractions` separately, and write each list to file.
47+
48+
```{r parse_signatures}
49+
# Collect the de novo extracted signatures and exposures into a single list each. This takes _a few minutes_ to parse.
50+
# If running in CI, read in lists directly from RDS files. Otherwise, parse to create and save lists.
51+
52+
if (params$is_ci == 1) {
53+
de_novo_signatures <- readr::read_rds(de_novo_signatures_file)
54+
de_novo_exposures <- readr::read_rds(de_novo_exposures_file)
55+
} else {
56+
57+
de_novo_signatures <- list()
58+
de_novo_exposures <- list()
59+
for (this_model in c("poisson" , "multinomial"))
60+
{
61+
for (this_seed in 1:5){
62+
63+
this_name <- glue::glue("seed_{this_seed}_model_{this_model}")
64+
filename <- file.path(path_to_input_rds,
65+
glue::glue("{this_name}.RDS"))
66+
#cat(filename)
67+
if (file.exists(filename)) {
68+
fit <- readr::read_rds(filename)
69+
for (k in 2:5){
70+
sig_name <- glue::glue("{k}_{this_name}")
71+
de_novo_signatures[[ sig_name ]] <- sigfit::retrieve_pars(fit[[k]], par = "signatures")
72+
de_novo_exposures[[ sig_name ]] <- sigfit::retrieve_pars(fit[[k]], par = "exposures")
73+
}
74+
}
75+
}
76+
77+
}
78+
# We save these lists to result files
79+
readr::write_rds(de_novo_signatures, de_novo_signatures_file)
80+
readr::write_rds(de_novo_exposures, de_novo_exposures_file)
81+
}
82+
```
83+
84+
<br><br>
85+
86+
Goodness-of-fit analysis was implicitly performed by `sigfit` during inference using cosine similarity. Associated elbow plots are in `plots/extraction/`, names for the model and seed. The red-colored dot in these plots represented the selected *k* for the given seed and inference model. Notably, there substantial sensitivity to starting conditions, both seed and model. **Plots below are shown with `poisson` on top, `multinomial` on bottom**
87+
88+
```{r gof_plots_function}
89+
show_gof_plots <- function(seed)
90+
{
91+
seed_plots <- c( file.path(path_to_cosine, glue::glue("seed_{seed}_model_poisson.png")),
92+
file.path(path_to_cosine, glue::glue("seed_{seed}_model_multinomial.png"))
93+
)
94+
knitr::include_graphics(seed_plots)
95+
}
96+
```
97+
98+
**Seed 1**
99+
```{r gof_seed1, out.width="49%", out.height="20%",fig.show='hold',fig.align='center'}
100+
show_gof_plots(1)
101+
```
102+
<br><br>
103+
**Seed 2**
104+
```{r gof_seed2, out.width="49%", out.height="20%",fig.show='hold',fig.align='center'}
105+
show_gof_plots(2)
106+
```
107+
**Seed 3**
108+
```{r gof_seed3, out.width="49%", out.height="20%",fig.show='hold',fig.align='center'}
109+
show_gof_plots(3)
110+
```
111+
**Seed 4**
112+
```{r gof_seed4, out.width="49%", out.height="20%",fig.show='hold',fig.align='center'}
113+
show_gof_plots(4)
114+
```
115+
**Seed 5**
116+
```{r gof_seed5, out.width="49%", out.height="20%",fig.show='hold',fig.align='center'}
117+
show_gof_plots(5)
118+
```
119+
120+
121+
122+
<br><br><br>
123+
This tibble collects info from the above images:
124+
125+
```{r gof_k}
126+
gof <- tibble::tribble(
127+
~model, ~seed, ~best_k,
128+
#-----------------
129+
"poisson",1,3,
130+
"multinomial",1,3,
131+
"poisson",2,4,
132+
"multinomial",2,3,
133+
"poisson",3,4,
134+
"multinomial",3,3,
135+
"poisson",4,3,
136+
"multinomial",4,3,
137+
"poisson",5,3,
138+
"multinomial",5,3
139+
)
140+
```
141+
142+
143+
144+
<br><br>
145+
Now, we can determine which signatures were extracted and map back to COSMIC names using cosine similarity to determine matching signatures:
146+
147+
```{r determine_sigs}
148+
149+
# Will find the COSMIC equivalent using cosine similarity
150+
map_match <- function(x) {
151+
sort( # Can sort since we just want to see cosmic matches
152+
as.numeric( # none of this "optimal assigment ==> 1" business
153+
# match_signatures is a sigfit function for cosine similarity
154+
match_signatures(x, cosmic_signatures_v3)
155+
)
156+
)
157+
}
158+
159+
purrr::map(de_novo_signatures, map_match) -> cosmic_matches
160+
```
161+
162+
<br><br>
163+
Now, we create a table of results for which COSMIC SBS signatures were extracted. **Full results for each combination of k, model, and random seed:**
164+
```{r sigs_full}
165+
tibble::tibble(
166+
name = names(cosmic_matches),
167+
cosmic_sig_index = unname(cosmic_matches[1:length(cosmic_matches)])
168+
) %>%
169+
tidyr::unnest(cosmic_sig_index) %>%
170+
dplyr::mutate(cosmic_sig = cosmic_names[cosmic_sig_index]) -> unnested_sigs
171+
172+
unnested_sigs %>%
173+
dplyr::group_by(name) %>%
174+
dplyr::summarize(cosmic_sigs = toString(cosmic_sig)) %>%
175+
dplyr::mutate(cosmic_sigs = stringr::str_replace_all(cosmic_sigs, "SBS", "")) %>%
176+
dplyr::mutate(seed = stringr::str_extract(name, "_\\d_"),
177+
seed = stringr::str_replace_all(seed, "_", ""),
178+
k = stringr::str_extract(name, "^\\d"),
179+
model = stringr::str_extract(name, "[a-z]+$")) %>%
180+
dplyr::select(-name) -> full_sigs
181+
182+
full_sigs %>%
183+
tidyr::spread(seed, cosmic_sigs) %>%
184+
knitr::kable()
185+
```
186+
187+
<br><br>
188+
Although not very robust to starting conditions, the results are robust to the inference model, so we will just look at `poisson` going forward. **Now, here's the table specifically for the results whose k was selected by cosine similarity goodness-of-fit**, among *k in [2:5]* for each the given random seed, using 3000 iterations:
189+
190+
```{r sigs_selected}
191+
gof %>%
192+
dplyr::rename(k = best_k) %>%
193+
dplyr::mutate(k = as.character(k), seed = as.character(seed)) %>%
194+
dplyr::left_join(full_sigs) %>%
195+
dplyr::filter(model == "poisson") %>%
196+
dplyr::select(-model) %>%
197+
dplyr::distinct() %>%
198+
knitr::kable()
199+
```
200+
201+
<br><br><br>
202+
203+
Clearly, this analysis is extremely sensitive to starting conditions. Even so, we see a few repeated SBS signatures coming up:
204+
205+
206+
+ **[SBS14](https://cancer.sanger.ac.uk/signatures/sbs/sbs14/)** "is one of seven mutational signatures associated with defective DNA mismatch repair and microsatellite instability (MSI) and is often found in the same samples as other MSI associated signatures: SBS6, SBS15, SBS20, SBS21, SBS26 and SBS44."
207+
+ **[SBS15](https://cancer.sanger.ac.uk/signatures/sbs/sbs15/)** "is one of seven mutational signatures associated with defective DNA mismatch repair and microsatellite instability (MSI) and is often found in the same samples as other MSI associated signatures: SBS6, SBS14, SBS20, SBS21, SBS26 and SBS44."
208+
+ [SBS23](https://cancer.sanger.ac.uk/signatures/sbs/sbs23/) has unknown aetiology.
209+
+ [SBS40](https://cancer.sanger.ac.uk/signatures/sbs/sbs40/) is correlated with age (not of clear relevance to pediatric tumors).
210+
+ [SBS42](https://cancer.sanger.ac.uk/signatures/sbs/sbs42/) is correlated with occupational exposure to haloalkanes (seems unlikely to be relevant for pediatric tumors?).
211+
212+
213+
**The two bolded** signatures are commonly associated with one another due to putative shared mechanisms of "defective DNA mismatch repair and microsatellite instability (MSI)."

analyses/mutational-signatures/04-analyze_de_novo.html

+1,958
Large diffs are not rendered by default.
Loading
Loading
Loading
Loading
Loading
Loading
Loading
Loading
Loading
Loading
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading
Loading
Loading
Loading
Loading
Loading
Loading
Loading
Loading
Loading
Binary file not shown.
Binary file not shown.

analyses/mutational-signatures/run_mutational_signatures.sh

+16-3
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,25 @@ cd "$(dirname "${BASH_SOURCE[0]}")"
1212
# In CI we'll run an abbreviated version of the de novo signatures extraction
1313
ABBREVIATED_MUTSIGS=${OPENPBTA_QUICK_MUTSIGS:-0}
1414

15+
1516
# Run the mutational signatures analysis using existing signatures
1617
Rscript -e "rmarkdown::render('01-known_signatures.Rmd', clean = TRUE)"
1718

1819
# Split up the consensus MAF files by experimental strategy (writes to scratch)
1920
Rscript --vanilla 02-split_experimental_strategy.R
2021

21-
# Run the shell script that is for determining the number of signatures to use
22-
# with a low number of iterations
23-
QUICK_MUTSIGS=$ABBREVIATED_MUTSIGS bash 03-de_novo_range_of_nsignatures.sh
22+
# Run the shell script for determining the number of signatures to use
23+
# with a low number of iterations if run in CI
24+
# argument 0 --> run denovo goodness of fit
25+
QUICK_MUTSIGS=$ABBREVIATED_MUTSIGS ANALYSIS=0 bash 03-de_novo_range_of_nsignatures.sh
26+
27+
# Run the shell script to perform de novo extraction
28+
# with a low number of iterations if run in CI
29+
# argument 1 --> run more robust denovo extraction
30+
QUICK_MUTSIGS=$ABBREVIATED_MUTSIGS ANALYSIS=1 bash 03-de_novo_range_of_nsignatures.sh
31+
32+
# Process results from de novo extraction
33+
Rscript -e "rmarkdown::render('04-analyze_de_novo.Rmd', clean = TRUE, params = list(is_ci = ${ABBREVIATED_MUTSIGS}))"
34+
35+
36+
# Next steps: Fitting the 8 known CNS signatures

analyses/mutational-signatures/scripts/de_novo_signature_extraction.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ extracted_signatures <- sigfit::extract_signatures(
127127
# If user specifies a plot output, save the goodness-of-fit plot at that
128128
# location
129129
if (!is.null(opt$plot_output)) {
130-
pdf(opt$plot_output, width = 7, height = 7)
130+
png(opt$plot_output, 640, 480)
131131
sigfit::plot_gof(extracted_signatures)
132132
dev.off()
133133
}

0 commit comments

Comments
 (0)