Skip to content

Commit 68fa664

Browse files
authored
Merge pull request #18 from DLBPointon/dp24_splitby
Dp24 splitby
2 parents c0555f2 + f0f6103 commit 68fa664

File tree

6 files changed

+188
-78
lines changed

6 files changed

+188
-78
lines changed

Cargo.lock

+8-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
11
[package]
22
name = "fasta_manipulation"
3-
version = "0.1.2"
3+
version = "0.1.3"
44
edition = "2021"
55

66
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
77

88
[dependencies]
99
clap = { version = "4.4.4", features = ["cargo"] }
1010
colored = "2.0.4"
11+
compare = "0.1.0"
1112
csv = "1.3.0"
1213
io = "0.0.2"
1314
noodles = { version = "0.52.0", features = ["fasta", "cram", "csi", "core"] }

README.md

+8-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,9 @@ Currently, this program has the following arguments:
2626
This command generates a directory of files, each containing up to a user-specified number of sequences from the input FASTA. This is useful when generating geneset data for TreeVal or when subsetting data in a non-random manner.
2727
The count is an upper limit: the records may not divide evenly, so a file can contain fewer records than requested.
2828

29-
`splitbycount --fasta-file ${PATH TO FASTA} --output-directory ${OUTPUT LOCATION} --count {NUMBER OF FASTA RECORDS PER FILE}`
29+
This will generate files in `{outdir}/{fasta-file.prefix}/{data_type}/{input_file_prefix}_f{file_count}_c{requested_chunk_count}-a{actual_chunk_count}.fa`
30+
31+
`splitbycount --fasta-file ${PATH TO FASTA} --output-directory ${OUTPUT LOCATION} --count {NUMBER OF FASTA RECORDS PER FILE} --data_type ['pep','cdna', 'cds', 'rna', 'other']`
3032

3133
- split_by_size (NOT YET WRITTEN)
3234

@@ -59,5 +61,10 @@ Currently, this program has the following arguments:
5961

6062
- GC percentage per scaffold + counts
6163
- GC percentage whole genome
64+
- N50 and N90
65+
- L50
66+
- GAP count and length (summary with average length)
67+
68+
`profile -f input.fasta -o outdir`
6269

6370
If there are other options that would be useful to other teams, please leave a message or open an issue.

src/generics.rs

+40
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
use noodles::fasta;
2+
use noodles::fasta::record::Definition;
23
use std::error::Error;
34
use std::{collections::HashMap, fmt, io::BufRead, result, str};
45

@@ -41,3 +42,42 @@ pub fn only_keys<K, V>(map: HashMap<K, V>) -> impl Iterator<Item = K> {
4142
// Take a HashMap and return an iterator over its keys only
4243
map.into_iter().map(|(k, _v)| k)
4344
}
45+
46+
/// Extracts the record identifier from a FASTA header line.
///
/// The identifier is the first space-delimited token of `header`, with the
/// leading `>` marker removed when it is present. Everything after the first
/// space (the free-text description) is ignored.
///
/// This function does not panic: an empty header, a bare `>`, or a header
/// that starts with a space all yield an empty identifier, and a token that
/// does not start with `>` is returned unchanged.
///
/// # Errors
///
/// Currently never returns `Err`. The `Result` return type is kept so that
/// callers which already handle an error case keep compiling.
///
/// NOTE: an earlier, commented-out attempt to pull a gene symbol out of
/// `gene=...` / `symbol:...` / `(...) gene` annotations with regexes used to
/// live in this body. It was never enabled; consult version control history
/// if that approach is revisited.
fn get_gene_symbol(header: String) -> Result<String, Box<dyn std::error::Error>> {
    // `split` always yields at least one item, so the fallback is never hit;
    // it only avoids an indexing panic path.
    let first_token = header.split(' ').next().unwrap_or("");

    // Remove the marker by prefix rather than by byte-slicing `[1..]`, which
    // panics on an empty token and on a multi-byte first character.
    let identifier = first_token.strip_prefix('>').unwrap_or(first_token);

    Ok(identifier.to_owned())
}
72+
73+
/// Produces a simplified header string from a FASTA record definition.
///
/// The definition is rendered to text and passed to `get_gene_symbol`. When
/// that call succeeds, its result is returned as-is. When it fails, the
/// returned string is a diagnostic message embedding the error, so callers
/// always receive a `String` and never an error value.
pub fn sanitise_header(old_header: &Definition) -> String {
    let rendered = old_header.to_string();

    // A failure is folded into the returned text rather than propagated.
    get_gene_symbol(rendered).unwrap_or_else(|err| {
        format!("Regex isnt good enough to capture header id: {}", err)
    })
}

0 commit comments

Comments
 (0)