Skip to content

Commit

Permalink
feat: Remove duplicate marks from consensus records (#247)
Browse files Browse the repository at this point in the history
* feat: remove duplicate tags

* update rocksdb

* fix clippy error
  • Loading branch information
FelixMoelder committed Aug 19, 2022
1 parent 434090a commit 692eb04
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 6 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ serde_derive = "1.0"
serde_json = "1.0"
uuid = { version = "0.7", features = ["v4"] }
tempfile = "3.0"
rocksdb = "0.17"
rocksdb = "0.19"
ordered-float = "0.5"
flate2 = "1.0.5"
streaming-stats = "0.2.2"
Expand Down
1 change: 1 addition & 0 deletions src/bam/anonymize_reads.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ pub fn anonymize_reads<P: AsRef<Path> + std::fmt::Debug>(
let mut fasta_reader = fasta::IndexedReader::from_file(&input_ref)?;
fasta_reader.fetch(&chr, start, end)?;
let mut reference = Vec::new();
reference.resize((end - start) as usize, 0);
fasta_reader.read(&mut reference)?;
let mut rng = rand::thread_rng();
let alphabet = [b'A', b'C', b'G', b'T'];
Expand Down
8 changes: 8 additions & 0 deletions src/bam/collapse_reads_to_fragments/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,11 @@ pub fn call_consensus_reads_from_paths<P: AsRef<Path>>(
)
.call_consensus_reads()
}

pub fn unmark_record(record: &mut bam::record::Record) -> Result<()> {
record.unset_duplicate();
let _ = record.remove_aux(b"PG");
let _ = record.remove_aux(b"DI");
let _ = record.remove_aux(b"DS");
Ok(())
}
35 changes: 30 additions & 5 deletions src/bam/collapse_reads_to_fragments/pipeline.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use super::calc_consensus::{CalcNonOverlappingConsensus, CalcOverlappingConsensus};
use super::unmark_record;
use anyhow::Result;
use bio::io::fastq;
use derive_new::new;
Expand All @@ -9,6 +10,7 @@ use std::cmp::Ordering;
use std::collections::{BTreeMap, HashMap, HashSet};
use std::io;
use std::ops::Deref;
use std::ops::DerefMut;
use uuid::Uuid;

#[derive(new)]
Expand Down Expand Up @@ -129,6 +131,7 @@ impl<W: io::Write> CallConsensusRead<W> {
)?;
}
if record.is_unmapped() || record.is_mate_unmapped() {
unmark_record(&mut record)?;
self.bam_skipped_writer.write(&record)?;
continue;
}
Expand Down Expand Up @@ -161,13 +164,15 @@ impl<W: io::Write> CallConsensusRead<W> {
//For right record save end position and duplicate group ID
let group_id_opt = match storage_entry {
RecordStorage::PairedRecords {
r1_rec,
ref mut r1_rec,
ref mut r2_rec,
} => {
let group_id = if cigar_has_softclips(r1_rec)
|| cigar_has_softclips(&record)
{
unmark_record(r1_rec)?;
self.bam_skipped_writer.write(r1_rec)?;
unmark_record(&mut record)?;
self.bam_skipped_writer.write(&record)?;
None
} else {
Expand All @@ -189,7 +194,9 @@ impl<W: io::Write> CallConsensusRead<W> {
let group_id = if cigar_has_softclips(rec)
|| cigar_has_softclips(&record)
{
unmark_record(rec)?;
self.bam_skipped_writer.write(rec)?;
unmark_record(&mut record)?;
self.bam_skipped_writer.write(&record)?;
None
} else {
Expand Down Expand Up @@ -222,6 +229,7 @@ impl<W: io::Write> CallConsensusRead<W> {
if !record.is_paired() {
//If right or single record save end position and duplicate group ID
if cigar_has_softclips(&record) {
unmark_record(&mut record)?;
self.bam_skipped_writer.write(&record)?;
} else {
duplicate_groups
Expand Down Expand Up @@ -265,6 +273,7 @@ impl<W: io::Write> CallConsensusRead<W> {
//Case: Left record
None => {
if !record.is_paired() || record.tid() != record.mtid() {
unmark_record(&mut record)?;
self.bam_skipped_writer.write(&record)?;
} else {
record_storage.insert(
Expand All @@ -281,7 +290,7 @@ impl<W: io::Write> CallConsensusRead<W> {
}
//Case: Left record already stored
Some(_record_pair) => {
let (rec_id, l_rec) = match record_storage
let (rec_id, mut l_rec) = match record_storage
.remove(&RecordId::Regular(record_name.to_owned()))
.unwrap()
{
Expand All @@ -291,7 +300,9 @@ impl<W: io::Write> CallConsensusRead<W> {
RecordStorage::SingleRecord { .. } => unreachable!(),
};
if cigar_has_softclips(&l_rec) || cigar_has_softclips(&record) {
unmark_record(&mut l_rec)?;
self.bam_skipped_writer.write(&l_rec)?;
unmark_record(&mut record)?;
self.bam_skipped_writer.write(&record)?;
} else {
let alignment_vectors = calc_read_alignments(&l_rec, &record);
Expand All @@ -314,7 +325,9 @@ impl<W: io::Write> CallConsensusRead<W> {
)?;
}
None => {
unmark_record(&mut l_rec)?;
self.bam_skipped_writer.write(&l_rec)?;
unmark_record(&mut record)?;
self.bam_skipped_writer.write(&record)?;
}
};
Expand Down Expand Up @@ -403,8 +416,12 @@ pub fn calc_consensus_complete_groups<'a, W: io::Write>(
.0,
)?;
} else {
bam_skipped_writer.write(&r1_recs[0])?;
bam_skipped_writer.write(&r2_recs[0])?;
let mut r1_rec = r1_recs[0].clone();
unmark_record(&mut r1_rec)?;
bam_skipped_writer.write(&r1_rec)?;
let mut r2_rec = r2_recs[0].clone();
unmark_record(&mut r2_rec)?;
bam_skipped_writer.write(&r2_rec)?;
}
}
};
Expand All @@ -419,7 +436,9 @@ pub fn calc_consensus_complete_groups<'a, W: io::Write>(
)?;
}
_ => {
bam_skipped_writer.write(&recs[0])?;
let mut rec = recs[0].clone();
unmark_record(&mut rec)?;
bam_skipped_writer.write(&rec)?;
}
},
}
Expand Down Expand Up @@ -668,6 +687,12 @@ impl Deref for IndexedRecord {
}
}

impl DerefMut for IndexedRecord {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.rec
}
}

pub enum CigarGroup {
PairedRecords {
r1_recs: Vec<bam::Record>,
Expand Down

0 comments on commit 692eb04

Please sign in to comment.