From 63c68ae29f696ec4dc8a27d3529fcea59bff6efb Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Wed, 25 May 2022 16:16:42 -0700 Subject: [PATCH 1/4] add group mapq to gampcompare --- src/subcommand/gampcompare_main.cpp | 16 +++++++++++----- src/subcommand/mpmap_main.cpp | 4 +++- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/src/subcommand/gampcompare_main.cpp b/src/subcommand/gampcompare_main.cpp index 6a04226c429..fd8b5b4ec3e 100644 --- a/src/subcommand/gampcompare_main.cpp +++ b/src/subcommand/gampcompare_main.cpp @@ -157,10 +157,10 @@ int main_gampcompare(int argc, char** argv) { } // A buffer we use for the TSV output - vector>> buffers(get_thread_count()); + vector>> buffers(get_thread_count()); // We have an output function to dump all the reads in the text buffer in TSV - auto flush_buffer = [&](vector>& buffer) { + auto flush_buffer = [&](vector>& buffer) { // We print exactly one header line. static bool header_printed = false; // Output TSV to standard out in the format plot-qq.R needs. @@ -172,7 +172,7 @@ int main_gampcompare(int argc, char** argv) { else { cout << "correct"; } - cout << "\tmapped\tmq\taligner\tread" << endl; + cout << "\tmapped\tmq\tgroupmq\taligner\tread" << endl; header_printed = true; } for (auto& result : buffer) { @@ -183,7 +183,7 @@ int main_gampcompare(int argc, char** argv) { else { cout << (get<0>(result) <= range); } - cout << '\t' << get<1>(result) << '\t' << get<2>(result) << '\t' << aligner_name << '\t' << get<3>(result) << endl; + cout << '\t' << get<1>(result) << '\t' << get<2>(result) << '\t' << get<3>(result) << '\t' << aligner_name << '\t' << get<4>(result) << endl; } buffer.clear(); }; @@ -230,9 +230,15 @@ int main_gampcompare(int argc, char** argv) { correct_counts[omp_get_thread_num()]++; } + // group mapq defaults to regular mapq + int64_t group_mapq = proto_mp_aln.mapping_quality(); + if (has_annotation(proto_mp_aln, "group_mapq")) { + group_mapq = get_annotation(proto_mp_aln, "group_mapq"); + } + // put the result on the IO buffer auto& buffer = buffers[omp_get_thread_num()]; - buffer.emplace_back(abs_dist, proto_mp_aln.subpath_size() > 0, proto_mp_aln.mapping_quality(), move(*proto_mp_aln.mutable_name())); + buffer.emplace_back(abs_dist, proto_mp_aln.subpath_size() > 0, proto_mp_aln.mapping_quality(), group_mapq, move(*proto_mp_aln.mutable_name())); if (buffer.size() > buffer_size) { #pragma omp critical flush_buffer(buffer); diff --git a/src/subcommand/mpmap_main.cpp b/src/subcommand/mpmap_main.cpp index 638e7e68fef..4c05b4931be 100644 --- a/src/subcommand/mpmap_main.cpp +++ b/src/subcommand/mpmap_main.cpp @@ -130,7 +130,7 @@ void help_mpmap(char** argv) { //<< " --suppress-tail-anchors don't produce extra anchors when aligning to alternate paths in snarls" << endl //<< " -T, --same-strand read pairs are from the same strand of the DNA/RNA molecule" << endl << " -X, --not-spliced do not form spliced alignments, even if aligning with --nt-type 'rna'" << endl - << " -M, --max-multimaps INT report (up to) this many mappings per read [1]" << endl + << " -M, --max-multimaps INT report (up to) this many mappings per read [10 rna / 1 dna]" << endl << " -a, --agglomerate-alns combine separate multipath alignments into one (possibly disconnected) alignment" << endl << " -r, --intron-distr FILE intron length distribution (from scripts/intron_length_distribution.py)" << endl << " -Q, --mq-max INT cap mapping quality estimates at this much [60]" << endl @@ -254,6 +254,7 @@ int main_mpmap(int argc, char** argv) { // TODO: create an option. int localization_max_paths = 5; int max_num_mappings = 1; + int default_rna_num_mappings = 10; int hit_max = 1024; int hit_max_arg = numeric_limits::min(); int hard_hit_max_muliplier = 3; @@ -1019,6 +1020,7 @@ int main_mpmap(int argc, char** argv) { // can be one alignment suppress_multicomponent_splitting = true; } + max_num_mappings = default_rna_num_mappings; } else if (nt_type != "dna") { // DNA is the default From f090e545d40ac94bcd62d079672c5e529d9d8f92 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Wed, 25 May 2022 17:07:05 -0700 Subject: [PATCH 2/4] fix test and option parsing --- src/subcommand/mpmap_main.cpp | 14 +++++++++++--- test/t/33_vg_mpmap.t | 4 ++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/subcommand/mpmap_main.cpp b/src/subcommand/mpmap_main.cpp index 4c05b4931be..9afde4eb65d 100644 --- a/src/subcommand/mpmap_main.cpp +++ b/src/subcommand/mpmap_main.cpp @@ -253,7 +253,8 @@ int main_mpmap(int argc, char** argv) { // How many distinct single path alignments should we look for in a multipath, for MAPQ? // TODO: create an option. int localization_max_paths = 5; - int max_num_mappings = 1; + int max_num_mappings = 0; + int default_dna_num_mappings = 1; int default_rna_num_mappings = 10; int hit_max = 1024; int hit_max_arg = numeric_limits::min(); @@ -1020,9 +1021,16 @@ int main_mpmap(int argc, char** argv) { // can be one alignment suppress_multicomponent_splitting = true; } - max_num_mappings = default_rna_num_mappings; + if (max_num_mappings == 0) { + max_num_mappings = default_rna_num_mappings; + } + } + else if (nt_type == "dna") { + if (max_num_mappings == 0) { + max_num_mappings = default_dna_num_mappings; + } } - else if (nt_type != "dna") { + else { // DNA is the default cerr << "error:[vg mpmap] Cannot identify sequencing type preset (-n): " << nt_type << endl; exit(1); diff --git a/test/t/33_vg_mpmap.t b/test/t/33_vg_mpmap.t index bb3a34072ec..f0ab4190865 100644 --- a/test/t/33_vg_mpmap.t +++ b/test/t/33_vg_mpmap.t @@ -140,7 +140,7 @@ vg snarls -T xy.vg > xy.snarls vg index xy.vg -x xy.xg -g xy.gcsa vg index xy.vg -j xy.dist -s xy.snarls -vg mpmap -x xy.xg -d xy.dist -g xy.gcsa -G x.gam -F SAM -i --frag-mean 50 --frag-stddev 10 >xy.sam +vg mpmap -x xy.xg -d xy.dist -g xy.gcsa -G x.gam -F SAM -i --frag-mean 50 --frag-stddev 10 -M 1 >xy.sam X_HITS="$(cat xy.sam | grep -v "^@" | cut -f3 | grep x | wc -l)" if [ "${X_HITS}" -lt 1200 ] && [ "${X_HITS}" -gt 800 ] ; then IN_RANGE="1" @@ -149,7 +149,7 @@ else fi is "${IN_RANGE}" "1" "paired reads are evenly split between equivalent mappings" -vg mpmap -x xy.xg -d xy.dist -g xy.gcsa -G x.gam -F SAM >xy.sam +vg mpmap -x xy.xg -d xy.dist -g xy.gcsa -G x.gam -F SAM -M 1 >xy.sam X_HITS="$(cat xy.sam | grep -v "^@" | cut -f3 | grep x | wc -l)" if [ "${X_HITS}" -lt 1200 ] && [ "${X_HITS}" -gt 800 ] ; then IN_RANGE="1" From aa143dd840d59519040907681a3b572b3fc9ac10 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Wed, 25 May 2022 17:27:57 -0700 Subject: [PATCH 3/4] also fix surject test --- test/t/15_vg_surject.t | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/t/15_vg_surject.t b/test/t/15_vg_surject.t index 486c8d9f2c8..e10bebdfcc3 100644 --- a/test/t/15_vg_surject.t +++ b/test/t/15_vg_surject.t @@ -130,9 +130,9 @@ is $(vg map -b minigiab/NA12878.chr22.tiny.bam -x m.xg -g m.gcsa | vg surject -p is $(vg map -f minigiab/NA12878.chr22.tiny.fq.gz -x m.xg -g m.gcsa | vg surject -p q -x m.xg -s - | grep chr22.bin8.cram:166:6027 | grep BBBBBFBFI | wc -l) 1 "mapping reproduces qualities from fastq input" is $(vg map -f minigiab/NA12878.chr22.tiny.fq.gz -x m.xg -g m.gcsa --gaf | vg surject -p q -x m.xg -s - -G | grep chr22.bin8.cram:166:6027 | grep BBBBBFBFI | wc -l) 1 "mapping reproduces qualities from GAF input" -is "$(zcat < minigiab/NA12878.chr22.tiny.fq.gz | head -n 4000 | vg mpmap -B -p -x m.xg -g m.gcsa -f - | vg surject -m -x m.xg -p q -s - | samtools view | wc -l)" 1000 "surject works on GAMP input" +is "$(zcat < minigiab/NA12878.chr22.tiny.fq.gz | head -n 4000 | vg mpmap -B -p -x m.xg -g m.gcsa -M 1 -f - | vg surject -m -x m.xg -p q -s - | samtools view | wc -l)" 1000 "surject works on GAMP input" -is "$(vg sim -x m.xg -n 500 -l 150 -a -s 768594 -i 0.01 -e 0.01 -p 250 -v 50 | vg view -aX - | vg mpmap -B -p -b 200 -x m.xg -g m.gcsa -i -f - | vg surject -m -x m.xg -i -p q -s - | samtools view | wc -l)" 1000 "surject works on paired GAMP input" +is "$(vg sim -x m.xg -n 500 -l 150 -a -s 768594 -i 0.01 -e 0.01 -p 250 -v 50 | vg view -aX - | vg mpmap -B -p -b 200 -x m.xg -g m.gcsa -i -M 1 -f - | vg surject -m -x m.xg -i -p q -s - | samtools view | wc -l)" 1000 "surject works on paired GAMP input" rm -rf minigiab.vg* m.xg m.gcsa From 58a65e70836e19e792b7785a9d9b46ab40ea25e9 Mon Sep 17 00:00:00 2001 From: Jordan Eizenga Date: Fri, 27 May 2022 11:45:16 -0700 Subject: [PATCH 4/4] adjust default mpmap params in CI test --- vgci/vgci.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vgci/vgci.py b/vgci/vgci.py index 7cda5b915b1..2fa02210e77 100755 --- a/vgci/vgci.py +++ b/vgci/vgci.py @@ -1328,7 +1328,7 @@ def test_sim_mhc_snp1kg_mpmap(self): acc_threshold=0.02, auc_threshold=0.02, sim_opts='-d 0.01 -p 1000 -v 75.0 -S 5 -I', sim_fastq=self._input('platinum_NA12878_MHC.fq.gz'), - more_mpmap_opts=['-u 8']) + more_mpmap_opts=['-u 8 -M 1']) @timeout_decorator.timeout(4000) def test_sim_yeast_cactus(self):