|
| 1 | +--- |
| 2 | +title: "De novo mutational signature extraction from WGS data" |
| 3 | +author: "S. Spielman" |
| 4 | +date: "2021" |
| 5 | +output: |
| 6 | + html_document: |
| 7 | + toc: yes |
| 8 | + df_print: paged |
| 9 | +params: |
| 10 | + is_ci: 0 |
| 11 | +--- |
| 12 | + |
| 13 | + |
| 14 | +<br><br><br><br> |
| 15 | + |
| 16 | +#### Packages and paths |
| 17 | + |
| 18 | + |
| 19 | +```{r} |
| 20 | +# Load libraries |
| 21 | +library(sigfit) |
| 22 | +`%>%` <- dplyr::`%>%` |
| 23 | +
|
# Directory containing the extraction RDS files (one per seed and model)
path_to_input_rds <- file.path(
  "..", "..", "scratch", "mutational-signatures", "extraction"
)

# Directory containing the cosine similarity goodness-of-fit (GOF) plots
path_to_cosine <- file.path("plots", "denovo", "extraction")

# Results directory; create it if it does not exist yet
path_to_results <- "results"
if (!dir.exists(path_to_results)) {
  dir.create(path_to_results, recursive = TRUE)
}

# Files holding the de novo signatures and exposures lists
# (condensed from the sigfit output)
de_novo_signatures_file <- file.path(path_to_results, "de_novo_signatures.RDS")
de_novo_exposures_file <- file.path(path_to_results, "de_novo_exposures.RDS")

# Load the COSMIC data from sigfit and keep the signature (row) names
data("cosmic_signatures_v3")
cosmic_names <- row.names(cosmic_signatures_v3)
| 43 | +``` |
| 44 | + |
| 45 | +<br><br> |
| 46 | +First, we need to extract the signatures and exposures from the *de novo* extraction RDS files. We save them into two separate lists, `de_novo_signatures` and `de_novo_exposures`, and write each list to file. |
| 47 | + |
| 48 | +```{r parse_signatures} |
# Gather the de novo extracted signatures and exposures into one list each.
# Parsing the raw fits takes _a few minutes_, so in CI the two lists are read
# back from their RDS files; otherwise they are built here and then saved.

if (params$is_ci == 1) {
  de_novo_signatures <- readr::read_rds(de_novo_signatures_file)
  de_novo_exposures <- readr::read_rds(de_novo_exposures_file)
} else {
  de_novo_signatures <- list()
  de_novo_exposures <- list()

  for (this_model in c("poisson", "multinomial")) {
    for (this_seed in 1:5) {
      this_name <- glue::glue("seed_{this_seed}_model_{this_model}")
      filename <- file.path(path_to_input_rds, glue::glue("{this_name}.RDS"))

      # Seed/model combinations without an input file are skipped
      if (!file.exists(filename)) {
        next
      }

      fit <- readr::read_rds(filename)

      # Entries are keyed as "<k>_seed_<seed>_model_<model>"
      for (k in 2:5) {
        sig_name <- glue::glue("{k}_{this_name}")
        de_novo_signatures[[sig_name]] <- sigfit::retrieve_pars(
          fit[[k]],
          par = "signatures"
        )
        de_novo_exposures[[sig_name]] <- sigfit::retrieve_pars(
          fit[[k]],
          par = "exposures"
        )
      }
    }
  }

  # Save both lists to the result files
  readr::write_rds(de_novo_signatures, de_novo_signatures_file)
  readr::write_rds(de_novo_exposures, de_novo_exposures_file)
}
| 82 | +``` |
| 83 | + |
| 84 | +<br><br> |
| 85 | + |
| 86 | +Goodness-of-fit analysis was implicitly performed by `sigfit` during inference using cosine similarity. Associated elbow plots are in `plots/denovo/extraction/`, named for the model and seed. The red-colored dot in these plots represents the selected *k* for the given seed and inference model. Notably, there is substantial sensitivity to starting conditions, both seed and model. **Plots below are shown with `poisson` on top, `multinomial` on bottom** |
| 87 | + |
| 88 | +```{r gof_plots_function} |
# Include the two GOF plots for one seed in the rendered document.
#
# `seed` is interpolated into the plot file names, which are looked up under
# `path_to_cosine`. The poisson plot is passed first, then the multinomial one.
show_gof_plots <- function(seed) {
  poisson_plot <- file.path(
    path_to_cosine,
    glue::glue("seed_{seed}_model_poisson.png")
  )
  multinomial_plot <- file.path(
    path_to_cosine,
    glue::glue("seed_{seed}_model_multinomial.png")
  )
  knitr::include_graphics(c(poisson_plot, multinomial_plot))
}
| 96 | +``` |
| 97 | + |
| 98 | +**Seed 1** |
| 99 | +```{r gof_seed1, out.width="49%", out.height="20%",fig.show='hold',fig.align='center'} |
| 100 | +show_gof_plots(1) |
| 101 | +``` |
| 102 | +<br><br> |
| 103 | +**Seed 2** |
| 104 | +```{r gof_seed2, out.width="49%", out.height="20%",fig.show='hold',fig.align='center'} |
| 105 | +show_gof_plots(2) |
| 106 | +``` |
| 107 | +**Seed 3** |
| 108 | +```{r gof_seed3, out.width="49%", out.height="20%",fig.show='hold',fig.align='center'} |
| 109 | +show_gof_plots(3) |
| 110 | +``` |
| 111 | +**Seed 4** |
| 112 | +```{r gof_seed4, out.width="49%", out.height="20%",fig.show='hold',fig.align='center'} |
| 113 | +show_gof_plots(4) |
| 114 | +``` |
| 115 | +**Seed 5** |
| 116 | +```{r gof_seed5, out.width="49%", out.height="20%",fig.show='hold',fig.align='center'} |
| 117 | +show_gof_plots(5) |
| 118 | +``` |
| 119 | + |
| 120 | + |
| 121 | + |
| 122 | +<br><br><br> |
| 123 | + This tibble collects info from the above images: |
| 124 | + |
| 125 | +```{r gof_k} |
# Selected k for each model and seed, one row per combination
gof <- tibble::tribble(
  ~model,        ~seed, ~best_k,
  # ----------------------------
  "poisson",     1,     3,
  "multinomial", 1,     3,
  "poisson",     2,     4,
  "multinomial", 2,     3,
  "poisson",     3,     4,
  "multinomial", 3,     3,
  "poisson",     4,     3,
  "multinomial", 4,     3,
  "poisson",     5,     3,
  "multinomial", 5,     3
)
| 140 | +``` |
| 141 | + |
| 142 | + |
| 143 | + |
| 144 | +<br><br> |
| 145 | + Now, we can determine which signatures were extracted and map back to COSMIC names using cosine similarity to determine matching signatures: |
| 146 | + |
| 147 | +```{r determine_sigs} |
| 148 | +
|
# Find the COSMIC equivalents of one set of de novo signatures.
#
# `x` is a single element of `de_novo_signatures`. It is matched against
# `cosmic_signatures_v3` with sigfit::match_signatures(), sigfit's function
# for matching signatures by cosine similarity. The result is reduced to a
# bare numeric vector and sorted. Sorting is fine here because we only want
# to see which COSMIC signatures were matched, not which de novo signature
# matched which.
map_match <- function(x) {
  matched <- sigfit::match_signatures(x, cosmic_signatures_v3)
  sort(as.numeric(matched))
}

# One vector of COSMIC matches per extraction run, keeping the run names
cosmic_matches <- purrr::map(de_novo_signatures, map_match)
| 160 | +``` |
| 161 | + |
| 162 | +<br><br> |
| 163 | + Now, we create a table of results for which COSMIC SBS signatures were extracted. **Full results for each combination of k, model, and random seed:** |
| 164 | +```{r sigs_full} |
# One row per (extraction run, matched COSMIC signature). The values stored in
# `cosmic_matches` are used as positions in `cosmic_names`.
unnested_sigs <- tibble::tibble(
  name = names(cosmic_matches),
  cosmic_sig_index = unname(cosmic_matches)
) %>%
  tidyr::unnest(cosmic_sig_index) %>%
  dplyr::mutate(cosmic_sig = cosmic_names[cosmic_sig_index])

# Collapse to one row per run, then recover k, seed and model from the run
# name, which has the form "<k>_seed_<seed>_model_<model>". The digit patterns
# use `\\d+` so that multi-digit seeds or k values are parsed in full.
full_sigs <- unnested_sigs %>%
  dplyr::group_by(name) %>%
  dplyr::summarize(cosmic_sigs = toString(cosmic_sig)) %>%
  dplyr::mutate(
    cosmic_sigs = stringr::str_replace_all(cosmic_sigs, "SBS", ""),
    # The first "_<digits>_" in the name is the seed
    seed = stringr::str_extract(name, "_\\d+_"),
    seed = stringr::str_replace_all(seed, "_", ""),
    k = stringr::str_extract(name, "^\\d+"),
    model = stringr::str_extract(name, "[a-z]+$")
  ) %>%
  dplyr::select(-name)

# Wide table: one row per (k, model), one column per seed
full_sigs %>%
  tidyr::pivot_wider(names_from = seed, values_from = cosmic_sigs) %>%
  knitr::kable()
| 185 | +``` |
| 186 | + |
| 187 | +<br><br> |
| 188 | + Although not very robust to starting conditions, the results are robust to the inference model, so we will just look at `poisson` going forward. **Now, here's the table specifically for the results whose k was selected by cosine similarity goodness-of-fit**, among *k in [2:5]* for each given random seed, using 3000 iterations: |
| 189 | + |
| 190 | +```{r sigs_selected} |
# Keep only the runs whose k was the one selected for that model and seed,
# then show the poisson results.
gof %>%
  dplyr::rename(k = best_k) %>%
  # `full_sigs` stores k and seed as character (extracted from the run name),
  # so convert before joining
  dplyr::mutate(k = as.character(k), seed = as.character(seed)) %>%
  # Explicit keys: these are the three columns the two tables share
  dplyr::left_join(full_sigs, by = c("model", "seed", "k")) %>%
  dplyr::filter(model == "poisson") %>%
  dplyr::select(-model) %>%
  dplyr::distinct() %>%
  knitr::kable()
| 199 | +``` |
| 200 | + |
| 201 | +<br><br><br> |
| 202 | + |
| 203 | +Clearly, this analysis is extremely sensitive to starting conditions. Even so, we see a few repeated SBS signatures coming up: |
| 204 | + |
| 205 | + |
| 206 | ++ **[SBS14](https://cancer.sanger.ac.uk/signatures/sbs/sbs14/)** "is one of seven mutational signatures associated with defective DNA mismatch repair and microsatellite instability (MSI) and is often found in the same samples as other MSI associated signatures: SBS6, SBS15, SBS20, SBS21, SBS26 and SBS44." |
| 207 | ++ **[SBS15](https://cancer.sanger.ac.uk/signatures/sbs/sbs15/)** "is one of seven mutational signatures associated with defective DNA mismatch repair and microsatellite instability (MSI) and is often found in the same samples as other MSI associated signatures: SBS6, SBS14, SBS20, SBS21, SBS26 and SBS44." |
| 208 | ++ [SBS23](https://cancer.sanger.ac.uk/signatures/sbs/sbs23/) has unknown aetiology. |
| 209 | ++ [SBS40](https://cancer.sanger.ac.uk/signatures/sbs/sbs40/) is correlated with age (not of clear relevance to pediatric tumors). |
| 210 | ++ [SBS42](https://cancer.sanger.ac.uk/signatures/sbs/sbs42/) is correlated with occupational exposure to haloalkanes (seems unlikely to be relevant for pediatric tumors?). |
| 211 | + |
| 212 | + |
| 213 | +**The two bolded** signatures are commonly associated with one another due to putative shared mechanisms of "defective DNA mismatch repair and microsatellite instability (MSI)." |
0 commit comments