Update to yacrd 0.6

natir · Jan 10, 2020 · f32900e · f32900e
1 parent 08271fd
commit f32900e
Show file tree

Hide file tree

Showing 18 changed files with 307 additions and 359 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -2,6 +2,7 @@
 name = "yacrd"
 version = "0.6.0"
 authors = ["Pierre Marijon <[email protected]>"]
+edition = '2018'
 
 exclude = ["image/*", "tests/*"]
 
@@ -10,30 +11,26 @@ homepage = "https://github.com/natir/yacrd"
 repository = "https://github.com/natir/yacrd"
 readme = "Readme.md"
 license = "MIT"
-keywords = ["bioinformatics", "chimera", "long-read"]
+keywords = ["bioinformatics", "chimera", "long-read", "scrubbing"]
 
 [badges]
 travis-ci = { repository = "natir/yacrd", branch = "master" }
 
 [dependencies]
 bio            = "0.30"
-csv            = "1"
-log 	       = "0.4.0"
+csv            = "1.1"
+log 	       = "0.4"
 anyhow         = "1.0"
 niffler        = {git = "https://github.com/luizirber/niffler/", branch = "api_1.0"}
 thiserror      = "1.0"
 structopt      = "0.3"
 env_logger     = "0.7"
-lazy_static    = "1.0"
-serde_derive   = "1.0"
-enum_primitive = "0.1.1"
-
 
 [dev-dependencies]
-tempfile = "3"
+tempfile = "3.1"
 
 [profile.release]
-debug = true # uncomment for proffiling
+# debug = true # uncomment for profiling
 lto = 'thin'
 opt-level = 3
 overflow-checks = false

diff --git a/Readme.md b/Readme.md
@@ -1,11 +1,5 @@
-# README IN BETA JUMP TO 0.5.1 TAG
-
 # Yet Another Chimeric Read Detector for long reads
 
-[![build-status]][github-actions]
-
-![yacrd pipeline presentation](image/pipeline.svg)
-
 Using all-against-all read mapping, yacrd performs:
 
 1. computation of pile-up coverage for each read
@@ -16,7 +10,7 @@ Chimera detection is done as follows:
 1. for each region where coverage is smaller or equal than `min_coverage` (default 0), yacrd creates a _bad region_.
 2. if there is a _bad region_ that starts at a position strictly after the beginning of the read and ends strictly before the end of the read, the read is marked as `Chimeric`
 3. if total _bad region_ length > 0.8 * read length, the read is marked as `NotCovered`
-4. if read isn't `Chimeric` or `NotCovered` is `NotBad`
+4. if a read is neither `Chimeric` nor `NotCovered`, it is marked as `NotBad`
 
 ## Rationale
 
@@ -84,7 +78,7 @@ yacrd -i overlap.paf -o reads.yacrd
 yacrd can perform some post-detection operation:
 
 - filter: for sequence or overlap file, record with reads marked as Chimeric or NotCovered isn't write in output
-- extract: for sequence or overlap file, record contain reads marked as Chimeric or NotCovered is write in output
+- extract: for sequence or overlap file, records containing reads marked as Chimeric or NotCovered are written in output
 - split: for sequence file bad region in middle of reads are removed, NotCovered read is removed
 - scrubb: for sequence file all bad region are removed, NotCovered read is removed
 
@@ -96,24 +90,24 @@ yacrd -i mapping.paf -o reads.yacrd split -i reads.fasta -o reads.split.fasta
 yacrd -i mapping.paf -o reads.yacrd scrubb -i reads.fasta -o reads.scrubb.fasta
 ```
 
-### Read scrubbing overlapping recommanded parameter
+### Read scrubbing overlapping recommended parameter
 
-For nanopore data, we recommand to use minimap2 with all-vs-all nanopore preset with maximal distance between seeds fixe to 500 (option `-g 500`) to generate overlap. We recommand to run yacrd with minimal coverage fixed to 4 (option `-c`) and minimal coverage of read fixed to 0.4 (option `-n`).
+For nanopore data, we recommend using minimap2 with the all-vs-all nanopore preset and a maximal distance between seeds fixed to 500 (option `-g 500`) to generate overlaps. We recommend running yacrd with minimal coverage fixed to 4 (option `-c`) and minimal coverage of read fixed to 0.4 (option `-n`).
 
 This is an exemple of how run a yacrd scrubbing:
 ```
 minimap2 -x ava-ont -g 500 reads.fasta reads.fasta > overlap.paf
 yacrd -i overlap.paf -o report.yacrd -c 4 -n 0.4 scrubb -i reads.fasta -o reads.scrubb.fasta
 ```
 
-For pacbio P6-C4 data, we recommand to use minimap2 with all-vs-all pacbio preset with maximal distance between seeds fixe to 800 (option `-g 800`) to generate overlap. We recommand to run yacrd with minimal coverage fixed to 4 (option `-c 4`) and minimal coverage of read fixed to 0.4 (option `-n 0.4`).
+For pacbio P6-C4 data, we recommend using minimap2 with the all-vs-all pacbio preset and a maximal distance between seeds fixed to 800 (option `-g 800`) to generate overlaps. We recommend running yacrd with minimal coverage fixed to 4 (option `-c 4`) and minimal coverage of read fixed to 0.4 (option `-n 0.4`).
 
 ```
 minimap2 -x ava-pb -g 800 reads.fasta reads.fasta > overlap.paf
 yacrd -i overlap.paf -o report.yacrd -c 4 -n 0.4 scrubb -i reads.fasta -o reads.scrubb.fasta
 ```
 
-For pacbio Sequel data, we recommand to use minimap2 with all-vs-all pacbio preset with maximal distance between seeds fixe to 5000 (option `-g 5000`) to generate overlap. We recommand to run yacrd with minimal coverage fixed to 3 (option `-c 3`) and minimal coverage of read fixed to 0.4 (option `-n 0.4`).
+For pacbio Sequel data, we recommend using minimap2 with the all-vs-all pacbio preset and a maximal distance between seeds fixed to 5000 (option `-g 5000`) to generate overlaps. We recommend running yacrd with minimal coverage fixed to 3 (option `-c 3`) and minimal coverage of read fixed to 0.4 (option `-n 0.4`).
 
 ```
 minimap2 -x ava-pb -g 5000 reads.fasta reads.fasta > overlap.paf
@@ -133,7 +127,7 @@ yacrd use extension to detect format file if your filename contains (anywhere):
 
 #### Compression
 
-yacrd automaticly detect file if is compress or not (gzip, bzip2 and lzma compression is avaible). For post-detection operation if input is compress output have same compression.
+yacrd automatically detects whether a file is compressed or not (gzip, bzip2 and lzma compression are available). For post-detection operations, if the input is compressed, the output has the same compression format.
 
 #### Use yacrd report as input
 
@@ -142,13 +136,13 @@ You can use yacrd report as input in place of overlap file, `ondisk` option are
 ## Output
 
 ```
-type_of_read	id_in_mapping_file  length_of_read  length_of_gap,begin_pos_of_gap,end_pos_of_gap;length_of_gap,be…
+type_of_read    id_in_mapping_file  length_of_read  length_of_gap,begin_pos_of_gap,end_pos_of_gap;length_of_gap,be…
 ```
 
 ### Example
 
 ```
-NotCovered readA 4599	3782,0,3782
+NotCovered readA 4599    3782,0,3782
 ```
 
 Here, readA doesn't have sufficient coverage, there is a zero-coverage region of length 3782bp between positions 0 and 3782.
@@ -170,13 +164,13 @@ Pierre Marijon, Rayan Chikhi, Jean-Stéphane Varré, yacrd and fpa: upstream too
 bibtex format:
 ```
 @article {Marijon2019,
-	author = {Marijon, Pierre and Chikhi, Rayan and Varr{\'e}, Jean-St{\'e}phane},
-	title = {yacrd and fpa: upstream tools for long-read genome assembly},
-	elocation-id = {674036},
-	year = {2019},
-	doi = {10.1101/674036},
-	URL = {https://www.biorxiv.org/content/early/2019/06/18/674036},
-	eprint = {https://www.biorxiv.org/content/early/2019/06/18/674036.full.pdf},
-	journal = {bioRxiv}
+    author = {Marijon, Pierre and Chikhi, Rayan and Varr{\'e}, Jean-St{\'e}phane},
+    title = {yacrd and fpa: upstream tools for long-read genome assembly},
+    elocation-id = {674036},
+    year = {2019},
+    doi = {10.1101/674036},
+    URL = {https://www.biorxiv.org/content/early/2019/06/18/674036},
+    eprint = {https://www.biorxiv.org/content/early/2019/06/18/674036.full.pdf},
+    journal = {bioRxiv}
 }
 ```
diff --git a/src/cli.rs b/src/cli.rs
@@ -22,45 +22,39 @@ SOFTWARE.
 
 #[derive(StructOpt, Debug)]
 #[structopt(
-    version = "0.6b Mew",
+    version = "0.6.0 Flareon",
     author = "Pierre Marijon <[email protected]>",
     name = "yacrd",
     about = "
 Yacrd use overlap between reads, to detect 'good' and 'bad' region,
-region with coverage over threshold is 'good' other are 'bad'.
-If read have a 'bad' region in middle this reads is mark as 'Chimeric'.
-If ratio of 'bad' region length on total read length is larger than threshold this reads is mark as 'Not_covered'.
+a region with coverage over the threshold is 'good' others are 'bad'.
+If read has a 'bad' region in middle this reads is mark as 'Chimeric'.
+If the ratio of 'bad' region length on total read length is larger than threshold this reads is mark as 'Not_covered'.
 
 Yacrd can make some other actions:
-- filter: for sequence or overlap file, record with reads marked as Chimeric or Not_covered isn't write in output
-- extract: for sequence or overlap file, record contain reads marked as Chimeric or Not_covered is write in output
-- split: for sequence file bad region in middle of reads are removed, Not_covered read is removed
-- scrubb: for sequence file all bad region are removed, Not_covered read is removed
+- filter: for sequence or overlap file, record with reads marked as Chimeric or NotCovered isn't written in the output
+- extract: for sequence or overlap file, record contains reads marked as Chimeric or NotCovered is written in the output
+- split: for sequence file bad region in the middle of reads are removed, NotCovered read is removed
+- scrubb: for sequence file all bad region are removed, NotCovered read is removed
 "
 )]
 pub struct Command {
     #[structopt(
         short = "i",
         long = "input",
         required = true,
-        help = "path to input file overlap (.paf|.m4) or yacrd report (.yacrd) format audetected input-format overide detection"
+        help = "path to input file overlap (.paf|.m4|.mhap) or yacrd report (.yacrd), format is autodetect and compression input is allowed (gz|bzip2|lzma)"
     )]
     pub input: String,
 
     #[structopt(
         short = "o",
         long = "output",
         required = true,
-        help = "path output file, yacrd format by default output-format can overide this value"
+        help = "path output file"
     )]
     pub output: String,
 
-    #[structopt(long = "input-format", possible_values = &["paf", "m4", "yacrd", "json"], help = "set the input-format")]
-    pub input_format: Option<String>,
-
-    #[structopt(long = "output-format", possible_values = &["yacrd", "json"], default_value = "yacrd", help = "set the output-format")]
-    pub output_format: String,
-
     #[structopt(
         short = "c",
         long = "coverage",
@@ -73,21 +67,21 @@ pub struct Command {
         short = "n",
         long = "not-coverage",
         default_value = "0.8",
-        help = "if ratio of bad region length on total lengh is lower that this value, all read is mark as bad"
+        help = "if the ratio of bad region length on total length is larger than this value, read is marked as NotCovered"
     )]
     pub not_coverage: f64,
 
     #[structopt(
         short = "d",
         long = "ondisk",
-        help = "if it set yacrd create tempory file, with value of this parameter as prefix, to reduce memory usage but increase the runtime, warning if prefix contain path separator (`/` for unix or `\\` for windows) directory is delete"
+        help = "yacrd switches to 'ondisk' mode which will reduce memory usage but increase computation time. The value passed as a parameter is used as a prefix for the temporary files created by yacrd. Be careful if the prefix contains path separators (`/` for unix or `\\` for windows) this folder will be deleted"
     )]
     pub ondisk: Option<String>,
 
     #[structopt(
         long = "ondisk-buffer-size",
         default_value = "64000000",
-        help = "with the default value yacrd in ondisk mode use around 800 MBytes, you can increase to reduce runtime but increase memory usage"
+        help = "with the default value yacrd in 'ondisk' mode use around 1 GBytes, you can increase to reduce runtime but increase memory usage"
     )]
     pub ondisk_buffer_size: String,
 
@@ -99,11 +93,11 @@ pub struct Command {
 pub enum SubCommand {
     #[structopt(about = "All bad region of read is removed")]
     Scrubb(Scrubb),
-    #[structopt(about = "Record mark as chimeric or Not_covered is filter")]
+    #[structopt(about = "Record mark as chimeric or NotCovered is filter")]
     Filter(Filter),
-    #[structopt(about = "Record mark as chimeric or Not_covered is extract")]
+    #[structopt(about = "Record mark as chimeric or NotCovered is extract")]
     Extract(Extract),
-    #[structopt(about = "Record mark as chimeric or Not_covered is split")]
+    #[structopt(about = "Record mark as chimeric or NotCovered is split")]
     Split(Split),
 }
 

diff --git a/src/editor/extract.rs b/src/editor/extract.rs
@@ -25,10 +25,10 @@ use anyhow::{anyhow, Context, Result};
 use bio::io::{fasta, fastq};
 
 /* local use */
-use editor;
-use error;
-use stack;
-use util;
+use crate::editor;
+use crate::error;
+use crate::stack;
+use crate::util;
 
 pub fn extract(
     input_path: &str,
@@ -225,8 +225,8 @@ where
 mod tests {
     use super::*;
 
-    use reads2ovl;
-    use reads2ovl::Reads2Ovl;
+    use crate::reads2ovl;
+    use crate::reads2ovl::Reads2Ovl;
 
     const FASTA_FILE: &'static [u8] = b">1
 ACTG

diff --git a/src/editor/filter.rs b/src/editor/filter.rs
@@ -25,10 +25,10 @@ use anyhow::{anyhow, Context, Result};
 use bio::io::{fasta, fastq};
 
 /* local use */
-use editor;
-use error;
-use stack;
-use util;
+use crate::editor;
+use crate::error;
+use crate::stack;
+use crate::util;
 
 pub fn filter(
     input_path: &str,
@@ -225,8 +225,8 @@ where
 mod tests {
     use super::*;
 
-    use reads2ovl;
-    use reads2ovl::Reads2Ovl;
+    use crate::reads2ovl;
+    use crate::reads2ovl::Reads2Ovl;
 
     const FASTA_FILE: &'static [u8] = b">1
 ACTG

diff --git a/src/editor/mod.rs b/src/editor/mod.rs
@@ -36,8 +36,8 @@ pub use self::split::*;
 use anyhow::{Context, Result};
 
 /* local use */
-use error;
-use util;
+use crate::error;
+use crate::util;
 
 #[derive(Debug, PartialEq)]
 pub enum ReadType {

diff --git a/src/editor/scrubbing.rs b/src/editor/scrubbing.rs
@@ -25,10 +25,10 @@ use anyhow::{anyhow, Context, Result};
 use bio::io::{fasta, fastq};
 
 /* local use */
-use editor;
-use error;
-use stack;
-use util;
+use crate::editor;
+use crate::error;
+use crate::stack;
+use crate::util;
 
 pub fn scrubbing(
     input_path: &str,
@@ -212,8 +212,8 @@ where
 mod tests {
     use super::*;
 
-    use reads2ovl;
-    use reads2ovl::Reads2Ovl;
+    use crate::reads2ovl;
+    use crate::reads2ovl::Reads2Ovl;
 
     const FASTA_FILE: &'static [u8] = b">1
 ACTGGGGGGACTGGGGGGACTG

diff --git a/src/editor/split.rs b/src/editor/split.rs
@@ -25,10 +25,10 @@ use anyhow::{Context, Result};
 use bio::io::{fasta, fastq};
 
 /* local use */
-use editor;
-use error;
-use stack;
-use util;
+use crate::editor;
+use crate::error;
+use crate::stack;
+use crate::util;
 
 pub fn split(
     input_path: &str,
@@ -202,8 +202,8 @@ where
 mod tests {
     use super::*;
 
-    use reads2ovl;
-    use reads2ovl::Reads2Ovl;
+    use crate::reads2ovl;
+    use crate::reads2ovl::Reads2Ovl;
 
     const FASTA_FILE: &'static [u8] = b">1
 ACTGGGGGGACTGGGGGGACTG