Merge remote-tracking branch 'origin/cli-help' into rust

neherlab · Jan 13, 2025 · 007411b · 007411b
2 parents c43ac39 + 8c6a781
commit 007411b
Show file tree

Hide file tree

Showing 9 changed files with 27 additions and 31 deletions.
diff --git a/packages/pangraph/src/align/alignment_args.rs b/packages/pangraph/src/align/alignment_args.rs
@@ -10,13 +10,13 @@ pub struct AlignmentArgs {
   #[clap(value_hint = ValueHint::Other)]
   pub indel_len_threshold: usize,
 
-  /// Energy cost for introducing junction due to alignment merger
+  /// Energy cost for splitting a block during alignment merger. Controls graph fragmentation, see documentation.
   #[default = 100.0]
   #[clap(long, short = 'a', default_value_t = AlignmentArgs::default().alpha)]
   #[clap(value_hint = ValueHint::Other)]
   pub alpha: f64,
 
-  /// Energy cost for interblock diversity due to alignment merger
+  /// Energy cost for diversity in the alignment. A high value prevents merging of distantly-related sequences in the same block, see documentation.
   #[default = 10.0]
   #[clap(long, short = 'b', default_value_t = AlignmentArgs::default().beta)]
   #[clap(value_hint = ValueHint::Other)]

diff --git a/packages/pangraph/src/commands/build/build_args.rs b/packages/pangraph/src/commands/build/build_args.rs
@@ -31,7 +31,7 @@ pub enum AlignmentBackend {
   Mmseqs,
 }
 
-/// Align genomes into a multiple sequence alignment graph
+/// Align genomes into a pangenome graph
 #[derive(Parser, Debug)]
 pub struct PangraphBuildArgs {
   /// Path(s) to zero, one or multiple FASTA files with input sequences. Multiple records within one file are treated as separate genomes.
@@ -63,10 +63,6 @@ pub struct PangraphBuildArgs {
   #[clap(long, short = 'c')]
   pub circular: bool,
 
-  /// Transforms all sequences to upper case
-  #[clap(long, short = 'u')]
-  pub upper_case: bool,
-
   /// Maximum number of alignment rounds to consider per pairwise graph merger
   #[clap(long, short = 'x', default_value_t = 100)]
   #[clap(value_hint = ValueHint::Other)]
@@ -82,11 +78,7 @@ pub struct PangraphBuildArgs {
   #[clap(value_hint = ValueHint::Other)]
   pub alignment_kernel: AlignmentBackend,
 
-  /// Verify that the original sequences can be reconstructed from the resulting pangraph
+  /// Sanity check: after construction verifies that the original sequences can be reconstructed exactly from the resulting pangraph. Raises an error otherwise.
   #[clap(long, short = 'f')]
   pub verify: bool,
-
-  /// Random seed for block id generation
-  #[clap(long)]
-  pub seed: Option<u64>,
 }
diff --git a/packages/pangraph/src/commands/build/build_run.rs b/packages/pangraph/src/commands/build/build_run.rs
@@ -7,16 +7,13 @@ use crate::pangraph::pangraph::Pangraph;
 use crate::pangraph::strand::Strand::Forward;
 use crate::tree::clade::postorder;
 use crate::tree::neighbor_joining::build_tree_using_neighbor_joining;
-use crate::utils::random::get_random_number_generator;
 use crate::{make_internal_error, make_internal_report};
 use eyre::{Report, WrapErr};
 use itertools::Itertools;
 use log::info;
 
 pub fn build_run(args: &PangraphBuildArgs) -> Result<(), Report> {
-  let PangraphBuildArgs { input_fastas, seed, .. } = &args;
-
-  let rng = get_random_number_generator(seed);
+  let input_fastas = &args.input_fastas;
 
   let fastas = read_many_fasta(input_fastas)?;
 

diff --git a/packages/pangraph/src/commands/export/export_args.rs b/packages/pangraph/src/commands/export/export_args.rs
@@ -17,10 +17,10 @@ pub enum PangraphExportArgs {
   /// Export block consensus sequences to a fasta file
   BlockConsensus(PangraphExportBlockConsensusArgs),
 
-  /// Export aligned or unaligned sequences for each block. Note that alignments exclude insertions
+  /// Export aligned or unaligned sequences for each block in separate fasta files. Note that alignments exclude insertions.
   BlockSequences(PangraphExportBlockSequencesArgs),
 
-  /// Export the core-genome alignment
+  /// Export the core-genome alignment. Note that alignment excludes insertions.
   CoreGenome(PangraphExportCoreAlignmentArgs),
 }
 
@@ -85,7 +85,7 @@ pub struct PangraphExportBlockSequencesArgs {
   #[clap(display_order = 1, value_hint = ValueHint::FilePath)]
   pub input_json: Option<PathBuf>,
 
-  /// Path to directory to write output FASTA files to
+  /// Path to directory to write output FASTA files to. Files are named `block_{block_id}.fa` in the folder.
   ///
   /// See: https://en.wikipedia.org/wiki/FASTA_format
   #[clap(long, short = 'o', value_hint = ValueHint::AnyPath)]

diff --git a/packages/pangraph/src/commands/export/export_block_sequences.rs b/packages/pangraph/src/commands/export/export_block_sequences.rs
@@ -8,7 +8,7 @@ use eyre::{Context, Report};
 
 #[derive(Parser, Debug, Default, Clone)]
 pub struct ExportBlockSequencesParams {
-  /// If set, then the full block sequences are exported but not aligned.
+  /// If set, then the full non-aligned block sequences are exported.
   #[clap(long)]
   pub unaligned: bool,
 }

diff --git a/packages/pangraph/src/commands/root_args.rs b/packages/pangraph/src/commands/root_args.rs
@@ -36,14 +36,14 @@ fn styles() -> styling::Styles {
 #[clap(styles = styles())]
 /// Bioinformatic toolkit to align large sets of closely related genomes into a graph data structure.
 ///
-/// Finds homology amongst large collections of closely related genomes. The core of the algorithm partitions each genome into pancontigs that represent a sequence interval related by vertical descent. Each genome is then an ordered walk along pancontigs; the collection of all genomes form a graph that captures all observed structural diversity. The tool useful to parsimoniously infer horizontal gene transfer events within a community; perform comparative studies of genome gain, loss, and rearrangement dynamics; or simply to compress many related genomes.
+/// Finds homology amongst large collections of closely related genomes. The core of the algorithm partitions each genome into pancontigs (also called blocks) that represent a sequence interval related by vertical descent. Each genome is then an ordered walk along pancontigs. The collection of all genomes form a graph that captures all observed structural diversity. The tool useful to study structural variations in the genome, perform comparative studies of genome gain, loss, and rearrangement dynamics; or simply to compress many related genomes.
 ///
 ///
-/// Publication: "PanGraph: scalable bacterial pan-genome graph construction. Nicholas Noll, Marco Molari, Richard Neher. bioRxiv 2022.02.24.481757; doi: https://doi.org/10.1101/2022.02.24.481757"
+/// Publication: "PanGraph: scalable bacterial pan-genome graph construction."" Nicholas Noll, Marco Molari, Richard Neher. Microbial Genomics 9.6 (2023): 001034.; doi: https://doi.org/10.1099/mgen.0.001034"
 ///
 /// Documentation: https://pangraph.readthedocs.io/en/stable/
 ///
-/// Source code:https://github.com/neherlab/pangraph
+/// Source code: https://github.com/neherlab/pangraph
 ///
 /// Questions, ideas, bug reports: https://github.com/neherlab/pangraph/issues
 pub struct PangraphArgs {
@@ -71,10 +71,10 @@ pub enum PangraphCommands {
     args: PangraphExportArgs,
   },
 
-  /// Compute all pairwise marginalizations of a multiple sequence alignment graph
+  /// Generates a simplified graph that only contains a subset of the input genomes.
   Simplify(PangraphSimplifyArgs),
 
-  /// Reconstruct input fasta sequences from graph
+  /// Reconstruct all input fasta sequences from graph
   Reconstruct(PangraphReconstructArgs),
 
   /// Generate JSON schema for Pangraph file format

diff --git a/packages/pangraph/src/commands/simplify/simplify_args.rs b/packages/pangraph/src/commands/simplify/simplify_args.rs
@@ -2,7 +2,7 @@ use clap::{Parser, ValueHint};
 use std::fmt::Debug;
 use std::path::PathBuf;
 
-/// Computes all pairwise marginalizations of a multiple sequence alignment graph
+/// Generates a simplified graph that only contains a subset of the input genomes.
 #[derive(Parser, Debug)]
 pub struct PangraphSimplifyArgs {
   /// Path to Pangraph JSON.

diff --git a/packages/pangraph/src/io/fasta.rs b/packages/pangraph/src/io/fasta.rs
@@ -141,8 +141,8 @@ impl<'a> FastaReader<'a> {
     let fragment = self
       .line
       .chars()
-      .filter(|c| is_char_allowed(*c))
-      .map(|c| c.to_ascii_uppercase());
+      .map(|c| c.to_ascii_uppercase())
+      .filter(|c| is_char_allowed(*c));
 
     record.seq.extend(fragment);
 
@@ -157,8 +157,8 @@ impl<'a> FastaReader<'a> {
       let fragment = self
         .line
         .chars()
-        .filter(|c| is_char_allowed(*c))
-        .map(|c| c.to_ascii_uppercase());
+        .map(|c| c.to_ascii_uppercase())
+        .filter(|c| is_char_allowed(*c));
 
       record.seq.extend(fragment);
     }

diff --git a/packages/pangraph/src/pangraph/graph_merging.rs b/packages/pangraph/src/pangraph/graph_merging.rs
@@ -15,7 +15,7 @@ use crate::utils::interval::{have_no_overlap, Interval};
 use crate::utils::map_merge::{map_merge, ConflictResolution};
 use eyre::{Report, WrapErr};
 use itertools::Itertools;
-use log::{debug, trace};
+use log::{debug, trace, warn};
 use maplit::btreemap;
 use ordered_float::OrderedFloat;
 use rayon::prelude::*;
@@ -48,9 +48,16 @@ pub fn merge_graphs(
     graph = graph_new;
 
     // stop when no more mergers are possible
+    // or when the maximum number of iterations is reached
     if !has_changed {
       debug!("Graph merge {left_keys} <---> {right_keys} complete.");
       break;
+    } else if i >= args.max_self_map {
+      warn!(
+        "Reached maximum number of self-merge iterations at graph merging {left_keys} <---> {right_keys}, consider increasing the current limit -x {}",
+        args.max_self_map
+      );
+      break;
     }
     i += 1;
   }