Merge pull request #4220 from vgteam/eligible-reads

Add notion of eligible reads to GAM comparison
vgteam · Feb 8, 2024 · 8a28902 · 8a28902 · adamnovak · Feb 9, 2024
2 parents 40cc426 + 2e025c2
commit 8a28902
Show file tree

Hide file tree

Showing 6 changed files with 87 additions and 27 deletions.
diff --git a/scripts/plot-pr.R b/scripts/plot-pr.R
@@ -10,11 +10,16 @@ require("tidyverse")
 require("ggrepel")
 
 # Read in the combined toil-vg stats.tsv, listing:
-# correct, mapq, aligner (really graph name), read name, count
-dat <- read.table(commandArgs(TRUE)[1], header=T)
+# correct, mapq, aligner (really graph name), read name, count, eligible
+dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))
+
+if (("eligible" %in% names(dat))) {
+    # If the eligible column is present, remove ineligible reads
+    dat <- dat[dat$eligible == 1, ]
+}
 
 if (! ("count" %in% names(dat))) {
-    # If the count column is not present, add i
+    # If the count column is not present, add it
     dat$count <- rep(1, nrow(dat))
 }
 
@@ -48,8 +53,8 @@ dat$aligner <- factor(dat$aligner, levels=aligner.names)
 name.lists <- name.lists[name.order]
 
 # Determine colors for aligners
-bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b")
-light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee")
+bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b","#6caed1")
+light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee","#b9d9e9")
 # We have to go through both lists together when assigning colors, because pe and non-pe versions of a condition need corresponding colors.
 cursor <- 1
 
@@ -95,6 +100,25 @@ colors <- colors[aligner.names]
 # Add a bin "factor" to each row, binning float MAPQs into bins from 0 to 60 (and inclusing bins for out of range on each end)
 dat$bin <- cut(dat$mq, c(-Inf,seq(0,60,1),Inf))
 
+# We need to work out our scales
+reads.per.condition <- sum(dat$count) / length(aligner.names)
+# Start with small scale
+labels <- c("1e-0","1e-1","1e-2","1e-3","1e-4")
+breaks <- c(0,1,2,3,4)
+limits <- c(0, 4)
+if ( reads.per.condition > 10000 ) {
+    # Use big scale if there are a lot of reads
+    labels <- c(labels, "1e-5","1e-6")
+    breaks <- c(breaks, 5,6)
+    limits <- c(0, 6)
+}
+if ( reads.per.condition > 1000000 ) {
+    # Use big scale if there are a lot of reads
+    labels <- c(labels, "1e-7","1e-8","1e-9")
+    breaks <- c(breaks, 7,8,9)
+    limits <- c(0, 9)
+}
+
 # Now we break out the cool dplyr/magrittr/tidyverse tools like %>% pipe operators.
 dat.roc <- dat %>%
     # Make positive and negative count columns
@@ -127,15 +151,15 @@ dat.plot <- dat.roc %>%
         # There will be points with variable sizes
         geom_point(aes(size=Positive+Negative)) +
         # We manually assign these selected colors
-        scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=2)) +
+        scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=1)) +
         # And we want a size legend
         scale_size_continuous("number", guide=guide_legend(title=NULL, ncol=4)) +
         # And we want a fake log Y axis
-        scale_y_continuous(labels=c("1e-0","1e-1","1e-2","1e-3","1e-4","1e-5","1e-6","1e-7","1e-8","1e-9"), breaks=c(0,1,2,3,4,5,6,7,8,9), limits=c(0, 9)) +
+        scale_y_continuous(labels=labels, breaks=breaks, limits=limits) +
         # Label it
         ylab("1 - Precision") +
         # And we want a fake log X axis
-        scale_x_continuous(labels=c("1e-0","1e-1","1e-2","1e-3","1e-4","1e-5","1e-6","1e-7","1e-8","1e-9"), breaks=c(0,1,2,3,4,5,6,7,8,9), limits=c(0, 9)) +
+        scale_x_continuous(labels=labels, breaks=breaks, limits=limits) +
         # Label it
         xlab("1 - Recall") +
         # And we want this cool theme

diff --git a/scripts/plot-qq.R b/scripts/plot-qq.R
@@ -2,15 +2,21 @@
 
 # plot-qq.R <stats TSV> <destination image file> [<comma-separated "aligner" names to include> [title]]
 
-list.of.packages <- c("tidyverse", "ggrepel", "svglite")
+list.of.packages <- c("tidyverse", "ggrepel", "svglite", "binom")
 new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
 if(length(new.packages)) install.packages(new.packages)
 require("tidyverse")
 require("ggrepel")
+require("binom")
 
 # Read in the combined toil-vg stats.tsv, listing:
-# correct, mapq, aligner (really graph name), read name, count
-dat <- read.table(commandArgs(TRUE)[1], header=T)
+# correct, mapq, aligner (really graph name), read name, count, eligible
+dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))
+
+if (("eligible" %in% names(dat))) {
+    # If the eligible column is present, remove ineligible reads
+    dat <- dat[dat$eligible == 1, ]
+}
 
 if (! ("count" %in% names(dat))) {
     # If the count column is not present, add i
@@ -47,8 +53,8 @@ dat$aligner <- factor(dat$aligner, levels=aligner.names)
 name.lists <- name.lists[name.order]
 
 # Determine colors for aligners
-bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b")
-light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee")
+bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b","#6caed1")
+light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee","#b9d9e9")
 # We have to go through both lists together when assigning colors, because pe and non-pe versions of a condition need corresponding colors.
 cursor <- 1
 
@@ -93,14 +99,20 @@ colors <- colors[aligner.names]
 
 dat$bin <- cut(dat$mq, c(-Inf,seq(0,60,1),Inf))
 
-x <- as.data.frame(summarize(group_by(dat, bin, aligner), N=n(), mapq=mean(mq), mapprob=mean(1-10^(-mapq/10)), observed=weighted.mean(correct, count)))
+x <- as.data.frame(summarize(group_by(dat, bin, aligner), N=n(), mapq=mean(mq), mapprob=mean(1-10^(-mapq/10)), observed=weighted.mean(correct, count), select(binom.confint(sum(correct * count), sum(count), conf.level=0.9, methods="lrt"), c("lower", "upper"))))
+
+print(names(x))
+print(x$ci)
 
-dat.plot <- ggplot(x, aes(1-mapprob+1e-9, 1-observed+1e-9, color=aligner, size=N, weight=N, label=round(mapq,2))) +
-    scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=2)) +
-    scale_y_log10("measured error", limits=c(5e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
-    scale_x_log10("error estimate", limits=c(5e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
+# Now plot the points as different sizes, but the error bar line ranges as a consistent size
+dat.plot <- ggplot(x, aes(1-mapprob+1e-7, 1-observed+1e-7, color=aligner, size=N, weight=N, label=round(mapq,2))) +
+    scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=1)) +
+    scale_y_log10("measured error", limits=c(1e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
+    scale_x_log10("error estimate", limits=c(1e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
     scale_size_continuous("number", guide=guide_legend(title=NULL, ncol=4)) +
     geom_point() +
+    # Only aesthetics that depend on each point need to be in the aes() mapping
+    geom_linerange(aes(x=1-mapprob+1e-7, ymin=1-upper+1e-7, ymax=1-lower+1e-7), linewidth=0.2, position=position_dodge(.05)) +
     geom_smooth() +
     geom_abline(intercept=0, slope=1, linetype=2) +
     theme_bw()

diff --git a/scripts/plot-roc-log.R b/scripts/plot-roc-log.R
@@ -20,8 +20,13 @@ require("tidyverse")
 require("ggrepel")
 
 # Read in the combined toil-vg stats.tsv, listing:
-# correct, mapq, aligner (really graph name), read name, count
-dat <- read.table(commandArgs(TRUE)[1], header=T)
+# correct, mapq, aligner (really graph name), read name, count, eligible
+dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))
+
+if (("eligible" %in% names(dat))) {
+    # If the eligible column is present, remove ineligible reads
+    dat <- dat[dat$eligible == 1, ]
+}
 
 if (! ("count" %in% names(dat))) {
     # If the count column is not present, add i

diff --git a/scripts/plot-roc.R b/scripts/plot-roc.R
@@ -21,8 +21,13 @@ require("ggrepel")
 require("scales") # For squish
 
 # Read in the combined toil-vg stats.tsv, listing:
-# correct, mapq, aligner (really graph name), read name, count
-dat <- read.table(commandArgs(TRUE)[1], header=T)
+# correct, mapq, aligner (really graph name), read name, count, eligible
+dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))
+
+if (("eligible" %in% names(dat))) {
+    # If the eligible column is present, remove ineligible reads
+    dat <- dat[dat$eligible == 1, ]
+}
 
 if (! ("count" %in% names(dat))) {
     # If the count column is not present, add i

diff --git a/src/subcommand/gamcompare_main.cpp b/src/subcommand/gamcompare_main.cpp
@@ -11,6 +11,7 @@
 #include "subcommand.hpp"
 
 #include "../alignment.hpp"
+#include "../annotation.hpp"
 #include "../snarl_distance_index.hpp"
 #include "../vg.hpp"
 #include <vg/io/stream.hpp>
@@ -257,7 +258,7 @@ int main_gamcompare(int argc, char** argv) {
         // Output TSV to standard out in the format plot-qq.R needs.
         if (!header_printed) {
             // It needs a header
-            cout << "correct\tmq\taligner\tread" << endl;
+            cout << "correct\tmq\taligner\tread\teligible" << endl;
             header_printed = true;
         }
 
@@ -266,7 +267,8 @@ int main_gamcompare(int argc, char** argv) {
             cout << (aln.correctly_mapped() ? "1" : "0") << "\t";
             cout << aln.mapping_quality() << "\t";
             cout << aligner_name << "\t";
-            cout << aln.name() << endl;
+            cout << aln.name() << "\t";
+            cout << (has_annotation(aln, "no_truth") ? "0" : "1") << endl;
         }
         text_buffer.clear();
     };
@@ -341,6 +343,8 @@ int main_gamcompare(int argc, char** argv) {
 
             // Annotate it as such
             aln.set_correctly_mapped(correctly_mapped);
+            // And make sure we say it was possible to get
+            clear_annotation(aln, "no_truth");
 
             if (correctly_mapped) {
                 correct_counts.at(omp_get_thread_num()) += 1;
@@ -358,15 +362,25 @@ int main_gamcompare(int argc, char** argv) {
                     correct_count_by_mapq_by_thread.at(omp_get_thread_num()).at(mapq) += 1;
                 }
             }
+        } else if (range != -1) {
+            // We are flagging reads correct/incorrect, but this read has no truth position.
+            // Remember that it was impossible to get.
+            set_annotation(aln, "no_truth", true);
         }
 #pragma omp critical
         {
             if (output_tsv) {
-                text_buffer.emplace_back(std::move(aln));
+                if (emitter) {
+                    // Copy the alignment since we need it twice
+                    text_buffer.emplace_back(aln);
+                } else {
+                    text_buffer.emplace_back(std::move(aln));
+                }
                 if (text_buffer.size() > 1000) {
                     flush_text_buffer();
                 }
-            } else {
+            }
+            if (emitter) {
                 emitter->write(std::move(aln));
             }
         }

diff --git a/vgci/vgci.sh b/vgci/vgci.sh
@@ -30,7 +30,7 @@ KEEP_INTERMEDIATE_FILES=0
 # Should we show stdout and stderr from tests? If so, set to "-s".
 SHOW_OPT=""
 # What toil-vg should we install?
-TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@64774cc76ef117aef705c1b5203450482c6a4b60"
+TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@c9bd6414f935e6095574a41a34addbb8d87b41a6"
 # What toil should we install?
 # Could be something like "toil[aws,mesos]==3.20.0"
 # or "git+https://github.com/DataBiosphere/toil.git@3ab74776a3adebd6db75de16985ce9d734f60743#egg=toil[aws,mesos]"