Skip to content

Commit

Permalink
Merge pull request #4220 from vgteam/eligible-reads
Browse files Browse the repository at this point in the history
Add notion of eligible reads to GAM comparison
  • Loading branch information
adamnovak authored Feb 8, 2024
2 parents 40cc426 + 2e025c2 commit 8a28902
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 27 deletions.
40 changes: 32 additions & 8 deletions scripts/plot-pr.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,16 @@ require("tidyverse")
require("ggrepel")

# Read in the combined toil-vg stats.tsv, listing:
# correct, mapq, aligner (really graph name), read name, count
dat <- read.table(commandArgs(TRUE)[1], header=T)
# correct, mapq, aligner (really graph name), read name, count, eligible
dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))

if (("eligible" %in% names(dat))) {
# If the eligible column is present, remove ineligible reads
dat <- dat[dat$eligible == 1, ]
}

if (! ("count" %in% names(dat))) {
# If the count column is not present, add i
# If the count column is not present, add it
dat$count <- rep(1, nrow(dat))
}

Expand Down Expand Up @@ -48,8 +53,8 @@ dat$aligner <- factor(dat$aligner, levels=aligner.names)
name.lists <- name.lists[name.order]

# Determine colors for aligners
bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b")
light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee")
bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b","#6caed1")
light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee","#b9d9e9")
# We have to go through both lists together when assigning colors, because pe and non-pe versions of a condition need corresponding colors.
cursor <- 1

Expand Down Expand Up @@ -95,6 +100,25 @@ colors <- colors[aligner.names]
# Add a bin "factor" to each row, binning float MAPQs into bins from 0 to 60 (and inclusing bins for out of range on each end)
dat$bin <- cut(dat$mq, c(-Inf,seq(0,60,1),Inf))

# We need to work out our scales
reads.per.condition <- sum(dat$count) / length(aligner.names)
# Start with small scale
labels <- c("1e-0","1e-1","1e-2","1e-3","1e-4")
breaks <- c(0,1,2,3,4)
limits <- c(0, 4)
if ( reads.per.condition > 10000 ) {
# Use big scale if there are a lot of reads
labels <- c(labels, "1e-5","1e-6")
breaks <- c(breaks, 5,6)
limits <- c(0, 6)
}
if ( reads.per.condition > 1000000 ) {
# Use big scale if there are a lot of reads
labels <- c(labels, "1e-7","1e-8","1e-9")
breaks <- c(breaks, 7,8,9)
limits <- c(0, 9)
}

# Now we break out the cool dplyr/magrittr/tidyverse tools like %>% pipe operators.
dat.roc <- dat %>%
# Make positive and negative count columns
Expand Down Expand Up @@ -127,15 +151,15 @@ dat.plot <- dat.roc %>%
# There will be points with variable sizes
geom_point(aes(size=Positive+Negative)) +
# We manually assign these selected colors
scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=2)) +
scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=1)) +
# And we want a size legend
scale_size_continuous("number", guide=guide_legend(title=NULL, ncol=4)) +
# And we want a fake log Y axis
scale_y_continuous(labels=c("1e-0","1e-1","1e-2","1e-3","1e-4","1e-5","1e-6","1e-7","1e-8","1e-9"), breaks=c(0,1,2,3,4,5,6,7,8,9), limits=c(0, 9)) +
scale_y_continuous(labels=labels, breaks=breaks, limits=limits) +
# Label it
ylab("1 - Precision") +
# And we want a fake log X axis
scale_x_continuous(labels=c("1e-0","1e-1","1e-2","1e-3","1e-4","1e-5","1e-6","1e-7","1e-8","1e-9"), breaks=c(0,1,2,3,4,5,6,7,8,9), limits=c(0, 9)) +
scale_x_continuous(labels=labels, breaks=breaks, limits=limits) +
# Label it
xlab("1 - Recall") +
# And we want this cool theme
Expand Down
32 changes: 22 additions & 10 deletions scripts/plot-qq.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,21 @@

# plot-qq.R <stats TSV> <destination image file> [<comma-separated "aligner" names to include> [title]]

list.of.packages <- c("tidyverse", "ggrepel", "svglite")
list.of.packages <- c("tidyverse", "ggrepel", "svglite", "binom")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
require("tidyverse")
require("ggrepel")
require("binom")

# Read in the combined toil-vg stats.tsv, listing:
# correct, mapq, aligner (really graph name), read name, count
dat <- read.table(commandArgs(TRUE)[1], header=T)
# correct, mapq, aligner (really graph name), read name, count, eligible
dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))

if (("eligible" %in% names(dat))) {
# If the eligible column is present, remove ineligible reads
dat <- dat[dat$eligible == 1, ]
}

if (! ("count" %in% names(dat))) {
# If the count column is not present, add i
Expand Down Expand Up @@ -47,8 +53,8 @@ dat$aligner <- factor(dat$aligner, levels=aligner.names)
name.lists <- name.lists[name.order]

# Determine colors for aligners
bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b")
light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee")
bold.colors <- c("#1f78b4","#e31a1c","#33a02c","#6600cc","#ff8000","#5c415d","#458b74","#698b22","#008b8b","#6caed1")
light.colors <- c("#a6cee3","#fb9a99","#b2df8a","#e5ccff","#ffe5cc","#9a7c9b","#76eec6","#b3ee3a","#00eeee","#b9d9e9")
# We have to go through both lists together when assigning colors, because pe and non-pe versions of a condition need corresponding colors.
cursor <- 1

Expand Down Expand Up @@ -93,14 +99,20 @@ colors <- colors[aligner.names]

dat$bin <- cut(dat$mq, c(-Inf,seq(0,60,1),Inf))

x <- as.data.frame(summarize(group_by(dat, bin, aligner), N=n(), mapq=mean(mq), mapprob=mean(1-10^(-mapq/10)), observed=weighted.mean(correct, count)))
x <- as.data.frame(summarize(group_by(dat, bin, aligner), N=n(), mapq=mean(mq), mapprob=mean(1-10^(-mapq/10)), observed=weighted.mean(correct, count), select(binom.confint(sum(correct * count), sum(count), conf.level=0.9, methods="lrt"), c("lower", "upper"))))

print(names(x))
print(x$ci)

dat.plot <- ggplot(x, aes(1-mapprob+1e-9, 1-observed+1e-9, color=aligner, size=N, weight=N, label=round(mapq,2))) +
scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=2)) +
scale_y_log10("measured error", limits=c(5e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
scale_x_log10("error estimate", limits=c(5e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
# Now plot the points as different sizes, but the error bar line ranges as a consistent size
dat.plot <- ggplot(x, aes(1-mapprob+1e-7, 1-observed+1e-7, color=aligner, size=N, weight=N, label=round(mapq,2))) +
scale_color_manual(values=colors, guide=guide_legend(title=NULL, ncol=1)) +
scale_y_log10("measured error", limits=c(1e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
scale_x_log10("error estimate", limits=c(1e-7,2), breaks=c(1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1e0)) +
scale_size_continuous("number", guide=guide_legend(title=NULL, ncol=4)) +
geom_point() +
# Only aesthetics that depend on each point need to be in the aes() mapping
geom_linerange(aes(x=1-mapprob+1e-7, ymin=1-upper+1e-7, ymax=1-lower+1e-7), linewidth=0.2, position=position_dodge(.05)) +
geom_smooth() +
geom_abline(intercept=0, slope=1, linetype=2) +
theme_bw()
Expand Down
9 changes: 7 additions & 2 deletions scripts/plot-roc-log.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,13 @@ require("tidyverse")
require("ggrepel")

# Read in the combined toil-vg stats.tsv, listing:
# correct, mapq, aligner (really graph name), read name, count
dat <- read.table(commandArgs(TRUE)[1], header=T)
# correct, mapq, aligner (really graph name), read name, count, eligible
dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))

if (("eligible" %in% names(dat))) {
# If the eligible column is present, remove ineligible reads
dat <- dat[dat$eligible == 1, ]
}

if (! ("count" %in% names(dat))) {
# If the count column is not present, add i
Expand Down
9 changes: 7 additions & 2 deletions scripts/plot-roc.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,13 @@ require("ggrepel")
require("scales") # For squish

# Read in the combined toil-vg stats.tsv, listing:
# correct, mapq, aligner (really graph name), read name, count
dat <- read.table(commandArgs(TRUE)[1], header=T)
# correct, mapq, aligner (really graph name), read name, count, eligible
dat <- read.table(commandArgs(TRUE)[1], header=T, colClasses=c("aligner"="factor"))

if (("eligible" %in% names(dat))) {
# If the eligible column is present, remove ineligible reads
dat <- dat[dat$eligible == 1, ]
}

if (! ("count" %in% names(dat))) {
# If the count column is not present, add i
Expand Down
22 changes: 18 additions & 4 deletions src/subcommand/gamcompare_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "subcommand.hpp"

#include "../alignment.hpp"
#include "../annotation.hpp"
#include "../snarl_distance_index.hpp"
#include "../vg.hpp"
#include <vg/io/stream.hpp>
Expand Down Expand Up @@ -257,7 +258,7 @@ int main_gamcompare(int argc, char** argv) {
// Output TSV to standard out in the format plot-qq.R needs.
if (!header_printed) {
// It needs a header
cout << "correct\tmq\taligner\tread" << endl;
cout << "correct\tmq\taligner\tread\teligible" << endl;
header_printed = true;
}

Expand All @@ -266,7 +267,8 @@ int main_gamcompare(int argc, char** argv) {
cout << (aln.correctly_mapped() ? "1" : "0") << "\t";
cout << aln.mapping_quality() << "\t";
cout << aligner_name << "\t";
cout << aln.name() << endl;
cout << aln.name() << "\t";
cout << (has_annotation(aln, "no_truth") ? "0" : "1") << endl;
}
text_buffer.clear();
};
Expand Down Expand Up @@ -341,6 +343,8 @@ int main_gamcompare(int argc, char** argv) {

// Annotate it as such
aln.set_correctly_mapped(correctly_mapped);
// And make sure we say it was possible to get
clear_annotation(aln, "no_truth");

if (correctly_mapped) {
correct_counts.at(omp_get_thread_num()) += 1;
Expand All @@ -358,15 +362,25 @@ int main_gamcompare(int argc, char** argv) {
correct_count_by_mapq_by_thread.at(omp_get_thread_num()).at(mapq) += 1;
}
}
} else if (range != -1) {
// We are flagging reads correct/incorrect, but this read has no truth position.
// Remember that it was impossible to get.
set_annotation(aln, "no_truth", true);
}
#pragma omp critical
{
if (output_tsv) {
text_buffer.emplace_back(std::move(aln));
if (emitter) {
// Copy the alignment since we need it twice
text_buffer.emplace_back(aln);
} else {
text_buffer.emplace_back(std::move(aln));
}
if (text_buffer.size() > 1000) {
flush_text_buffer();
}
} else {
}
if (emitter) {
emitter->write(std::move(aln));
}
}
Expand Down
2 changes: 1 addition & 1 deletion vgci/vgci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ KEEP_INTERMEDIATE_FILES=0
# Should we show stdout and stderr from tests? If so, set to "-s".
SHOW_OPT=""
# What toil-vg should we install?
TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@64774cc76ef117aef705c1b5203450482c6a4b60"
TOIL_VG_PACKAGE="git+https://github.com/vgteam/toil-vg.git@c9bd6414f935e6095574a41a34addbb8d87b41a6"
# What toil should we install?
# Could be something like "toil[aws,mesos]==3.20.0"
# or "git+https://github.com/DataBiosphere/toil.git@3ab74776a3adebd6db75de16985ce9d734f60743#egg=toil[aws,mesos]"
Expand Down

1 comment on commit 8a28902

@adamnovak
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17060 seconds

Please sign in to comment.