Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/fix/empty-nodes' into rust
Browse files Browse the repository at this point in the history
  • Loading branch information
ivan-aksamentov committed Jan 13, 2025
2 parents 7d28b1b + 305221e commit c43ac39
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 60 deletions.
41 changes: 41 additions & 0 deletions packages/pangraph/src/pangraph/edits.rs
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,26 @@ impl Edit {
Ok(qry)
}

/// Check if the alignment is empty, i.e. after applying the edits to the
/// consensus sequence, the resulting sequence is empty.
pub fn is_empty_alignment(&self, consensus: impl AsRef<str>) -> bool {
let cons_len = consensus.as_ref().len();
// if there are insertions, the alignment is not empty
let insertions_len = self.inss.iter().map(|i| i.seq.len()).sum::<usize>();
if insertions_len > 0 {
return false;
}
// if the total length of deletions is less than the length of the consensus
// sequence, the alignment is not empty
let deletions_len = self.dels.iter().map(|d| d.len).sum::<usize>();
if deletions_len < cons_len {
return false;
}
// otherwise, apply the edits and check if the resulting sequence is empty
let seq_len = self.apply(consensus).unwrap().len();
return seq_len == 0;
}

#[cfg(any(test, debug_assertions))]
pub fn sanity_check(&self, len: usize) -> Result<(), Report> {
let block_interval = Interval::new(0, len);
Expand Down Expand Up @@ -399,4 +419,25 @@ mod tests {
let actual = edits.apply(r).unwrap();
assert_eq!(q, actual);
}

#[rstest]
fn test_empty_alignment() {
let consensus = "ACGT";
let edits = Edit::empty();
assert!(!edits.is_empty_alignment(consensus));

let edits = Edit {
subs: vec![],
dels: vec![Del::new(0, 4)],
inss: vec![Ins::new(1, "A")],
};
assert!(!edits.is_empty_alignment(consensus));

let edits = Edit {
subs: vec![],
dels: vec![Del::new(0, 4)],
inss: vec![],
};
assert!(edits.is_empty_alignment(consensus));
}
}
7 changes: 5 additions & 2 deletions packages/pangraph/src/pangraph/reweave.rs
Original file line number Diff line number Diff line change
Expand Up @@ -223,8 +223,11 @@ fn split_block(
let b = &graph.blocks[&bid];
for interval in intervals {
let (b_slice, n_dict) = block_slice(b, &interval, graph);
for (old_nid, new_node) in n_dict {
u.n_new.entry(old_nid).or_default().push(new_node);
for (old_nid, new_node_opt) in n_dict {
// push to u.n_new entry if new_node is not empty
if let Some(new_node) = new_node_opt {
u.n_new.get_mut(&old_nid).unwrap().push(new_node);
}
}
if interval.aligned {
h.push(ToMerge {
Expand Down
58 changes: 42 additions & 16 deletions packages/pangraph/src/pangraph/slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,15 +125,27 @@ pub fn interval_node_coords(i: &PangraphInterval, edits: &Edit, block_L: usize)
(s, e)
}

/// given a block, an interval and the graph, it extract the slice of the block
/// defined by the interval.
///
/// It returns the new block, along with a dictionary of updates for the nodes,
/// that maps old node ids to new nodes.
/// In case a slice contains an empty node, the corresponding entry in the dictionary
/// will be None.
pub fn block_slice(
b: &PangraphBlock,
i: &PangraphInterval,
G: &Pangraph,
) -> (PangraphBlock, BTreeMap<NodeId, PangraphNode>) {
) -> (PangraphBlock, BTreeMap<NodeId, Option<PangraphNode>>) {
#[allow(clippy::string_slice)]
// consensus of the new block
let new_consensus = b.consensus()[i.interval.to_range()].to_owned();

// length of the new block
let block_L = b.consensus_len();
debug_assert!(block_L > 0, "Block {} has length 0", b.id());

// containers for new alignment and node updates
let mut node_updates = BTreeMap::new();
let mut new_alignment = BTreeMap::new();

Expand Down Expand Up @@ -164,15 +176,23 @@ pub fn block_slice(
};

let new_node = PangraphNode::new(None, i.new_block_id, old_node.path_id(), new_strand, new_pos);
node_updates.insert(*old_node_id, new_node.clone());

// extract edits for the slice
let new_edits = slice_edits(i, edits, block_L);

#[cfg(any(debug_assertions, test))]
new_edits.sanity_check(new_consensus.len()).unwrap();

let ovw = new_alignment.insert(new_node.id(), new_edits);
debug_assert!(ovw.is_none()); // new node id is not already in new_alignment
if new_edits.is_empty_alignment(&new_consensus) {
// if the node is empty, add `None` to the node updates
// and do not add the alignment to the new block
node_updates.insert(*old_node_id, None);
} else {
// if the node is not empty, add the alignment to the new block
let ovw = new_alignment.insert(new_node.id(), new_edits);
debug_assert!(ovw.is_none(), "Node id was already present! {:?}", ovw);
node_updates.insert(*old_node_id, Some(new_node));
}
}

let new_block = PangraphBlock::new(i.new_block_id, new_consensus, new_alignment);
Expand Down Expand Up @@ -448,20 +468,23 @@ mod tests {
assert_eq!(new_b.consensus(), "TATATTTATC");

let nn1 = PangraphNode::new(None, new_bid, PathId(1), Forward, (111, 120));
assert_eq!(&new_nodes[&NodeId(1)], &nn1);
let nn1_slice = new_nodes[&NodeId(1)].as_ref().unwrap();
assert_eq!(nn1, *nn1_slice);

let nn2 = PangraphNode::new(None, new_bid, PathId(2), Reverse, (1008, 1017));
assert_eq!(&new_nodes[&NodeId(2)], &nn2);
let nn2_slice = new_nodes[&NodeId(2)].as_ref().unwrap();
assert_eq!(nn2, *nn2_slice);

let nn3 = PangraphNode::new(None, new_bid, PathId(3), Reverse, (96, 4));
assert_eq!(&new_nodes[&NodeId(3)], &nn3);
let nn3_slice = new_nodes[&NodeId(3)].as_ref().unwrap();
assert_eq!(nn3, *nn3_slice);

assert_eq!(
new_nodes,
btreemap! {
NodeId(1) => nn1.clone(),
NodeId(2) => nn2.clone(),
NodeId(3) => nn3.clone(),
NodeId(1) => Some(nn1.clone()),
NodeId(2) => Some(nn2.clone()),
NodeId(3) => Some(nn3.clone()),
}
);

Expand Down Expand Up @@ -571,20 +594,23 @@ mod tests {
assert_eq!(new_b.consensus(), "TATATTTATC");

let nn1 = PangraphNode::new(None, new_bid, PathId(1), Reverse, (111, 120));
assert_eq!(&new_nodes[&NodeId(1)], &nn1);
let nn1_slice = new_nodes[&NodeId(1)].as_ref().unwrap();
assert_eq!(nn1, *nn1_slice);

let nn2 = PangraphNode::new(None, new_bid, PathId(2), Forward, (1008, 1017));
assert_eq!(&new_nodes[&NodeId(2)], &nn2);
let nn2_slice = new_nodes[&NodeId(2)].as_ref().unwrap();
assert_eq!(nn2, *nn2_slice);

let nn3 = PangraphNode::new(None, new_bid, PathId(3), Forward, (96, 4));
assert_eq!(&new_nodes[&NodeId(3)], &nn3);
let nn3_slice = new_nodes[&NodeId(3)].as_ref().unwrap();
assert_eq!(nn3, *nn3_slice);

assert_eq!(
new_nodes,
btreemap! {
NodeId(1) => nn1.clone(),
NodeId(2) => nn2.clone(),
NodeId(3) => nn3.clone(),
NodeId(1) => Some(nn1.clone()),
NodeId(2) => Some(nn2.clone()),
NodeId(3) => Some(nn3.clone()),
}
);

Expand Down
11 changes: 8 additions & 3 deletions packages/pangraph/src/reconsensus/reconsensus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ use crate::pangraph::edits::{Ins, Sub};
use crate::pangraph::pangraph::Pangraph;
use crate::pangraph::pangraph_block::{BlockId, PangraphBlock};
use crate::pangraph::pangraph_node::NodeId;
use crate::reconsensus::remove_nodes::remove_emtpy_nodes;
// use crate::reconsensus::remove_nodes::remove_emtpy_nodes;
use crate::reconsensus::remove_nodes::find_empty_nodes;
use crate::utils::collections::insert_at_inplace;
use eyre::Report;
use itertools::Itertools;
Expand All @@ -24,8 +25,12 @@ use std::collections::BTreeMap;
/// - for any in/del present in > M/2 sites, appends it to the consensus
/// - if the consensus has updated indels, then re-aligns all the sequences to the new consensus
pub fn reconsensus_graph(graph: &mut Pangraph, ids_updated_blocks: Vec<BlockId>) -> Result<(), Report> {
// remove selected nodes from graph
remove_emtpy_nodes(graph, &ids_updated_blocks);
// // remove selected nodes from graph
// remove_emtpy_nodes(graph, &ids_updated_blocks);
debug_assert!(
find_empty_nodes(graph, &ids_updated_blocks).is_empty(),
"Empty nodes found in the graph"
);

for block_id in ids_updated_blocks {
let block = graph.blocks.get_mut(&block_id).unwrap();
Expand Down
45 changes: 6 additions & 39 deletions packages/pangraph/src/reconsensus/remove_nodes.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use log::info;

use crate::pangraph::pangraph::Pangraph;
use crate::pangraph::pangraph_block::BlockId;
use crate::pangraph::pangraph_node::NodeId;
Expand All @@ -18,55 +16,24 @@ pub fn remove_emtpy_nodes(graph: &mut Pangraph, block_ids: &[BlockId]) {

/// Finds nodes with empty sequences (full deletion) in the graph.
/// It only checks specific blocks (the ones that were updated by a merger/split).
fn find_empty_nodes(graph: &Pangraph, block_ids: &[BlockId]) -> Vec<NodeId> {
pub fn find_empty_nodes(graph: &Pangraph, block_ids: &[BlockId]) -> Vec<NodeId> {
let mut node_ids_to_delete = Vec::new();
for &block_id in block_ids {
let block = &graph.blocks[&block_id];
let cons_len = block.consensus_len();
let consensus = block.consensus();
let consensus_len = consensus.len();

debug_assert!(
cons_len > 0,
consensus_len > 0,
"Block {block_id} has a consensus length of 0 and should have been removed",
);

for (&node_id, edits) in block.alignments() {
// check that edits are non-overlapping and well-defined
#[cfg(any(test, debug_assertions))]
edits.sanity_check(cons_len).unwrap();

// if the node has insertions or substitutions, or the total deletion size is less than the consensus length
// then it is not empty
if !edits.inss.is_empty() || !edits.subs.is_empty() || edits.dels.is_empty() {
// check that the node is not empty
// first exclude edge-case: circular path with a single node. In this case start == end but the node is not empty
let path_id = graph.nodes[&node_id].path_id();
let path = &graph.paths[&path_id];
let is_circular = path.circular();
let single_node = path.nodes.len() == 1;
if is_circular && single_node {
debug_assert!(
path.tot_len > 0,
"Circular path {path_id} with a single node should not have a length of 0"
);
continue;
}

// if this is not the case, then it is sufficient to check that the node start != end
debug_assert!(
!graph.nodes[&node_id].start_is_end(),
"Node {node_id} with edits {edits:?} and consensus length {cons_len} is empty and should have been removed",
);

continue;
}
edits.sanity_check(consensus_len).unwrap();

// if the node has no insertions or substitutions and the total deletion size is equal to the consensus length
// then it is empty and should be removed
if edits.dels.iter().map(|d| d.len).sum::<usize>() == cons_len {
info!(
"empty node {} with edits {:?} and consensus length {} is empty: to be removed",
node_id, edits, cons_len
);
if edits.is_empty_alignment(&consensus) {
debug_assert!(graph.nodes[&node_id].start_is_end());
node_ids_to_delete.push(node_id);
}
Expand Down

0 comments on commit c43ac39

Please sign in to comment.