diff --git a/packages/pangraph/src/align/alignment_args.rs b/packages/pangraph/src/align/alignment_args.rs index b8929ffd..772c3989 100644 --- a/packages/pangraph/src/align/alignment_args.rs +++ b/packages/pangraph/src/align/alignment_args.rs @@ -10,13 +10,13 @@ pub struct AlignmentArgs { #[clap(value_hint = ValueHint::Other)] pub indel_len_threshold: usize, - /// Energy cost for introducing junction due to alignment merger + /// Energy cost for splitting a block during alignment merger. Controls graph fragmentation, see documentation. #[default = 100.0] #[clap(long, short = 'a', default_value_t = AlignmentArgs::default().alpha)] #[clap(value_hint = ValueHint::Other)] pub alpha: f64, - /// Energy cost for interblock diversity due to alignment merger + /// Energy cost for diversity in the alignment. A high value prevents merging of distantly-related sequences in the same block, see documentation. #[default = 10.0] #[clap(long, short = 'b', default_value_t = AlignmentArgs::default().beta)] #[clap(value_hint = ValueHint::Other)] diff --git a/packages/pangraph/src/commands/build/build_args.rs b/packages/pangraph/src/commands/build/build_args.rs index f5fc0485..9310a933 100644 --- a/packages/pangraph/src/commands/build/build_args.rs +++ b/packages/pangraph/src/commands/build/build_args.rs @@ -31,7 +31,7 @@ pub enum AlignmentBackend { Mmseqs, } -/// Align genomes into a multiple sequence alignment graph +/// Align genomes into a pangenome graph #[derive(Parser, Debug)] pub struct PangraphBuildArgs { /// Path(s) to zero, one or multiple FASTA files with input sequences. Multiple records within one file are treated as separate genomes. @@ -63,10 +63,6 @@ pub struct PangraphBuildArgs { #[clap(long, short = 'c')] pub circular: bool, - /// Transforms all sequences to upper case - #[clap(long, short = 'u')] - pub upper_case: bool, - /// Maximum number of alignment rounds to consider per pairwise graph merger #[clap(long, short = 'x', default_value_t = 100)] #[clap(value_hint = ValueHint::Other)] @@ -82,11 +78,7 @@ pub struct PangraphBuildArgs { #[clap(value_hint = ValueHint::Other)] pub alignment_kernel: AlignmentBackend, - /// Verify that the original sequences can be reconstructed from the resulting pangraph + /// Sanity check: after construction verifies that the original sequences can be reconstructed exactly from the resulting pangraph. Raises an error otherwise. #[clap(long, short = 'f')] pub verify: bool, - - /// Random seed for block id generation - #[clap(long)] - pub seed: Option, } diff --git a/packages/pangraph/src/commands/build/build_run.rs b/packages/pangraph/src/commands/build/build_run.rs index c2507d65..d74b971e 100644 --- a/packages/pangraph/src/commands/build/build_run.rs +++ b/packages/pangraph/src/commands/build/build_run.rs @@ -7,16 +7,13 @@ use crate::pangraph::pangraph::Pangraph; use crate::pangraph::strand::Strand::Forward; use crate::tree::clade::postorder; use crate::tree::neighbor_joining::build_tree_using_neighbor_joining; -use crate::utils::random::get_random_number_generator; use crate::{make_internal_error, make_internal_report}; use eyre::{Report, WrapErr}; use itertools::Itertools; use log::info; pub fn build_run(args: &PangraphBuildArgs) -> Result<(), Report> { - let PangraphBuildArgs { input_fastas, seed, .. } = &args; - - let rng = get_random_number_generator(seed); + let input_fastas = &args.input_fastas; let fastas = read_many_fasta(input_fastas)?; diff --git a/packages/pangraph/src/commands/export/export_args.rs b/packages/pangraph/src/commands/export/export_args.rs index bae55111..e7423651 100644 --- a/packages/pangraph/src/commands/export/export_args.rs +++ b/packages/pangraph/src/commands/export/export_args.rs @@ -17,10 +17,10 @@ pub enum PangraphExportArgs { /// Export block consensus sequences to a fasta file BlockConsensus(PangraphExportBlockConsensusArgs), - /// Export aligned or unaligned sequences for each block. Note that alignments exclude insertions + /// Export aligned or unaligned sequences for each block in separate fasta files. Note that alignments exclude insertions. BlockSequences(PangraphExportBlockSequencesArgs), - /// Export the core-genome alignment + /// Export the core-genome alignment. Note that alignment excludes insertions. CoreGenome(PangraphExportCoreAlignmentArgs), } @@ -85,7 +85,7 @@ pub struct PangraphExportBlockSequencesArgs { #[clap(display_order = 1, value_hint = ValueHint::FilePath)] pub input_json: Option, - /// Path to directory to write output FASTA files to + /// Path to directory to write output FASTA files to. Files are named `block_{block_id}.fa` in the folder. /// /// See: https://en.wikipedia.org/wiki/FASTA_format #[clap(long, short = 'o', value_hint = ValueHint::AnyPath)] diff --git a/packages/pangraph/src/commands/export/export_block_sequences.rs b/packages/pangraph/src/commands/export/export_block_sequences.rs index 55199b4d..f750e9ce 100644 --- a/packages/pangraph/src/commands/export/export_block_sequences.rs +++ b/packages/pangraph/src/commands/export/export_block_sequences.rs @@ -8,7 +8,7 @@ use eyre::{Context, Report}; #[derive(Parser, Debug, Default, Clone)] pub struct ExportBlockSequencesParams { - /// If set, then the full block sequences are exported but not aligned. + /// If set, then the full non-aligned block sequences are exported. #[clap(long)] pub unaligned: bool, } diff --git a/packages/pangraph/src/commands/root_args.rs b/packages/pangraph/src/commands/root_args.rs index 4420ec90..c1430617 100644 --- a/packages/pangraph/src/commands/root_args.rs +++ b/packages/pangraph/src/commands/root_args.rs @@ -36,14 +36,14 @@ fn styles() -> styling::Styles { #[clap(styles = styles())] /// Bioinformatic toolkit to align large sets of closely related genomes into a graph data structure. /// -/// Finds homology amongst large collections of closely related genomes. The core of the algorithm partitions each genome into pancontigs that represent a sequence interval related by vertical descent. Each genome is then an ordered walk along pancontigs; the collection of all genomes form a graph that captures all observed structural diversity. The tool useful to parsimoniously infer horizontal gene transfer events within a community; perform comparative studies of genome gain, loss, and rearrangement dynamics; or simply to compress many related genomes. +/// Finds homology amongst large collections of closely related genomes. The core of the algorithm partitions each genome into pancontigs (also called blocks) that represent a sequence interval related by vertical descent. Each genome is then an ordered walk along pancontigs. The collection of all genomes form a graph that captures all observed structural diversity. The tool useful to study structural variations in the genome, perform comparative studies of genome gain, loss, and rearrangement dynamics; or simply to compress many related genomes. /// /// -/// Publication: "PanGraph: scalable bacterial pan-genome graph construction. Nicholas Noll, Marco Molari, Richard Neher. bioRxiv 2022.02.24.481757; doi: https://doi.org/10.1101/2022.02.24.481757" +/// Publication: "PanGraph: scalable bacterial pan-genome graph construction."" Nicholas Noll, Marco Molari, Richard Neher. Microbial Genomics 9.6 (2023): 001034.; doi: https://doi.org/10.1099/mgen.0.001034" /// /// Documentation: https://pangraph.readthedocs.io/en/stable/ /// -/// Source code:https://github.com/neherlab/pangraph +/// Source code: https://github.com/neherlab/pangraph /// /// Questions, ideas, bug reports: https://github.com/neherlab/pangraph/issues pub struct PangraphArgs { @@ -71,10 +71,10 @@ pub enum PangraphCommands { args: PangraphExportArgs, }, - /// Compute all pairwise marginalizations of a multiple sequence alignment graph + /// Generates a simplified graph that only contains a subset of the input genomes. Simplify(PangraphSimplifyArgs), - /// Reconstruct input fasta sequences from graph + /// Reconstruct all input fasta sequences from graph Reconstruct(PangraphReconstructArgs), /// Generate JSON schema for Pangraph file format diff --git a/packages/pangraph/src/commands/simplify/simplify_args.rs b/packages/pangraph/src/commands/simplify/simplify_args.rs index b719f766..6c34a8de 100644 --- a/packages/pangraph/src/commands/simplify/simplify_args.rs +++ b/packages/pangraph/src/commands/simplify/simplify_args.rs @@ -2,7 +2,7 @@ use clap::{Parser, ValueHint}; use std::fmt::Debug; use std::path::PathBuf; -/// Computes all pairwise marginalizations of a multiple sequence alignment graph +/// Generates a simplified graph that only contains a subset of the input genomes. #[derive(Parser, Debug)] pub struct PangraphSimplifyArgs { /// Path to Pangraph JSON. diff --git a/packages/pangraph/src/io/fasta.rs b/packages/pangraph/src/io/fasta.rs index 73a9721b..97ca1f3c 100644 --- a/packages/pangraph/src/io/fasta.rs +++ b/packages/pangraph/src/io/fasta.rs @@ -141,8 +141,8 @@ impl<'a> FastaReader<'a> { let fragment = self .line .chars() - .filter(|c| is_char_allowed(*c)) - .map(|c| c.to_ascii_uppercase()); + .map(|c| c.to_ascii_uppercase()) + .filter(|c| is_char_allowed(*c)); record.seq.extend(fragment); @@ -157,8 +157,8 @@ impl<'a> FastaReader<'a> { let fragment = self .line .chars() - .filter(|c| is_char_allowed(*c)) - .map(|c| c.to_ascii_uppercase()); + .map(|c| c.to_ascii_uppercase()) + .filter(|c| is_char_allowed(*c)); record.seq.extend(fragment); } diff --git a/packages/pangraph/src/pangraph/graph_merging.rs b/packages/pangraph/src/pangraph/graph_merging.rs index 1187b777..b44adb7a 100644 --- a/packages/pangraph/src/pangraph/graph_merging.rs +++ b/packages/pangraph/src/pangraph/graph_merging.rs @@ -15,7 +15,7 @@ use crate::utils::interval::{have_no_overlap, Interval}; use crate::utils::map_merge::{map_merge, ConflictResolution}; use eyre::{Report, WrapErr}; use itertools::Itertools; -use log::{debug, trace}; +use log::{debug, trace, warn}; use maplit::btreemap; use ordered_float::OrderedFloat; use rayon::prelude::*; @@ -48,9 +48,16 @@ pub fn merge_graphs( graph = graph_new; // stop when no more mergers are possible + // or when the maximum number of iterations is reached if !has_changed { debug!("Graph merge {left_keys} <---> {right_keys} complete."); break; + } else if i >= args.max_self_map { + warn!( + "Reached maximum number of self-merge iterations at graph merging {left_keys} <---> {right_keys}, consider increasing the current limit -x {}", + args.max_self_map + ); + break; } i += 1; }