Skip to content

Commit 66ba192

Browse files
authoredAug 15, 2024··
Merge pull request #44 from Rust-Wellcome/dev
0.1.5
2 parents c0c4aa5 + 8edce07 commit 66ba192

28 files changed

+21139
-497
lines changed
 

‎.github/workflows/documentation.yml

+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
name: Documentation
2+
3+
on:
4+
push:
5+
branches:
6+
- master
7+
8+
jobs:
9+
docs:
10+
permissions:
11+
contents: write
12+
name: Documentation
13+
runs-on: ubuntu-latest
14+
steps:
15+
- name: Checkout source code
16+
uses: actions/checkout@v2
17+
with:
18+
persist-credentials: false
19+
20+
- name: Install Rust
21+
uses: actions-rs/toolchain@v1
22+
with:
23+
profile: minimal
24+
toolchain: nightly
25+
override: true
26+
27+
- name: Build documentation
28+
run: RUSTDOCFLAGS="--html-in-header katex-header.html" cargo doc --no-deps
29+
# uses: actions-rs/cargo@v1
30+
# with:
31+
# command: doc
32+
# args: --verbose --no-deps --all-features
33+
34+
- name: Finalize documentation
35+
run: |
36+
CRATE_NAME=$(echo '${{ github.repository }}-lib' | tr '[:upper:]' '[:lower:]' | cut -f2 -d"/")
37+
echo "<meta http-equiv=\"refresh\" content=\"0; url=${CRATE_NAME/-/_}\">" > target/doc/index.html
38+
touch target/doc/.nojekyll
39+
40+
- name: Upload as artifact
41+
uses: actions/upload-artifact@v2
42+
with:
43+
name: Documentation
44+
path: target/doc
45+
46+
- name: Deploy
47+
uses: JamesIves/github-pages-deploy-action@v4
48+
with:
49+
# ACCESS_TOKEN: ${{ secrets.GH_PAT }}
50+
# BRANCH: gh-pages
51+
folder: target/doc

‎.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
/target
2+
.idea

‎Cargo.lock

+423-253
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎Cargo.toml

+5-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "fasta_manipulation"
3-
version = "0.1.4"
3+
version = "0.1.5"
44
edition = "2021"
55

66
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -11,8 +11,11 @@ colored = "2.0.4"
1111
compare = "0.1.0"
1212
csv = "1.3.0"
1313
io = "0.0.2"
14-
noodles = { version = "0.52.0", features = ["fasta", "cram", "csi", "core"] }
14+
noodles = { version = "0.78.0", features = ["fasta", "cram", "csi", "core"] }
1515
regex = "1.9.5"
1616
serde = { version = "1.0.188", features = ["derive"] }
1717
serde_yaml = "0.9.25"
1818
stacker = "0.1.15"
19+
walkdir = "2.5.0"
20+
assert_cmd = "2.0.14"
21+
tempfile = "3.10.1"

‎README.md

+63-6
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,49 @@
1-
# FastaManipulator
1+
# FasMan
2+
3+
## A FastaManipulator script that is slowly doing more...
4+
5+
Originally written by @DLBPointon
6+
Now a collaborative programming project for the Rust@Wellcome group (Sanger).
7+
8+
Collaborators and contributors:
9+
10+
- @figueroakl - Genome Profiling
11+
- @stevieing - Adding tests, optimisations & CI/CD
12+
- @dasunpubudumal - Adding tests, optimisations & CI/CD
13+
14+
---
215

316
This is a re-write of the current fasta manipulation scripts I've written whilst at ToL, as well as adding some functionality needed for future projects.
417

518
Currently, this program has the following arguments:
619

7-
- yaml_validator
20+
- yaml_validator (v2)
21+
22+
THIS FUNCTION IS SPECIFIC TO THE TREEVAL.yaml FILE
23+
24+
Updated for new yaml style and now uses struct methods.
825

926
This validates a given yaml against the TreeVal yaml standard. This is specific to the TreeVal pipeline.
1027
This command will go through the yaml and validate file and directory paths as well as files are in the expected format.
1128

12-
`validateyaml ${PATH TO YAML} --verbose {DEFAULT FALSE} --output ${OUTPUT LOCATION OF LOGS}`
29+
This has been tested by downloading the TreeValTinyTest data set:
30+
31+
```bash
32+
curl https://tolit.cog.sanger.ac.uk/test-data/resources/treeval/TreeValTinyData.tar.gz | tar xzf -
33+
```
34+
35+
`validateyaml ${PATH TO YAML}`
36+
37+
TODO:
38+
39+
- Add CRAM validator to the module
40+
- Check for sorting order
41+
- SO record (added now) or
42+
- Take first 100 records and determine whether they are paired reads
43+
- Find equiv to `samtools quickcheck -vvv` for a report on completeness of cram.
44+
- if not then it will be a secondary process (external to FasMan)
45+
- Better report
46+
- Report should complete and if there are fails then panic! or std::process::exit("FAILED DUE TO: ...") this is so that it can be added to the Nextflow pipelines and cause them to error out at the right place, e.g., not rely on scanning the report.log through functions in NF.
1347

1448
- map_headers
1549

@@ -21,7 +55,7 @@ Currently, this program has the following arguments:
2155

2256
This compliments the above function by using the above generated map file to regenerate the original headers.
2357

24-
- split_by_count (NOT YET WRITTEN)
58+
- split_by_count
2559

2660
This command will generate a directory of files made up of a user given number of sequences from the input fasta. This is useful when generating geneset data for TreeVal use or sub-setting data in a non-random manner.
2761
The count will be the upper limit, as there will be a left over number of records.
@@ -30,13 +64,36 @@ Currently, this program has the following arguments:
3064

3165
`splitbycount --fasta-file ${PATH TO FASTA} --output-directory ${OUTPUT LOCATION} --count {NUMBER OF FASTA RECORDS PER FILE} --data_type ['pep','cdna', 'cds', 'rna', 'other']`
3266

33-
- split_by_size (NOT YET WRITTEN)
67+
- split_by_size
3468

3569
This command will generate a directory of files, of user given size (in MB), generated from the input fasta. This is useful for consistent sizes of files used in geneset alignments.
3670
The mem-size will be approximate as some records may exceed the chosen size, inversely, there will be a final file collecting small sequences which do not meet the limit.
3771

3872
`splitbysize --fasta-file ${PATH TO FASTA} --output-directory ${OUTPUT LOCATION} --mem-size ${SIZE OF OUTPUT FILES IN Mega Bytes}`
3973

74+
- generate_csv
75+
THIS IS SPECIFIC TO TREEVAL AND THE STRUCTURE OF THE GENESET DATA IN USE FOR IT
76+
77+
This function generates CSV files summarising the contents of a directory structure like shown below and saves this in csv_data dir:
78+
79+
```
80+
geneset_data_dir
81+
|
82+
insect
83+
|
84+
csv_data
85+
| |
86+
| ApisMellifera.AMel1-data.csv
87+
|
88+
ApisMellifera
89+
|
90+
ApisMellifera.AMel1
91+
|
92+
{pep, cdna, cds, rna}
93+
|
94+
split.fasta files
95+
```
96+
4097
- curate
4198

4299
Use a tpf and fasta file to generate a curated fasta file.
@@ -55,7 +112,7 @@ Currently, this program has the following arguments:
55112

56113
`mergehaps -p primary.fasta -s secondary.fasta -n PRI/HAP -o merged.fasta`
57114

58-
- profile (NOT YET WRITTEN)
115+
- profile (IN PROGRESS)
59116

60117
Profile a given fasta file:
61118

‎src/exclude_seq.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ pub mod exclude_seq_mod {
2727
let mut binding = fasta;
2828
for result in binding.records() {
2929
let record = result?;
30-
if !exclusions.contains(&record.name()) {
30+
let record_name = str::from_utf8(record.name())?;
31+
if !exclusions.contains(&record_name) {
3132
writer.write_record(&record)?;
3233
} else {
3334
println!("Found record to exclude: {:?}", &record.name());

‎src/generate_csv.rs

+126
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
/// Generate CSV generates a csv file which describes a specific data directory /User/...../geneset_alignment_data/insect/ApisMellifera/ApisMellifera.AMel1_1/{pep,cdna,rna,cds}/files.fa
2+
/// This is for data tracking for TreeVal
3+
/// This may be replaced or enhanced with a function to send this to a Google Sheets so the team has an easier way of tracking it all.
4+
pub mod gencsv_mod {
5+
use crate::generics::get_folder_list;
6+
use clap::ArgMatches;
7+
use csv::Writer;
8+
use std::collections::HashMap;
9+
use std::error::Error;
10+
use std::{fs, path::Path, path::PathBuf};
11+
use walkdir::WalkDir;
12+
13+
fn get_file_list(root: &str) -> Vec<PathBuf> {
14+
WalkDir::new(root)
15+
.into_iter()
16+
.filter_map(|e| e.ok())
17+
.filter(|e| e.file_type().is_file())
18+
.map(|e| e.into_path())
19+
.collect()
20+
}
21+
22+
// Function to convert list to dictionary
23+
fn list_2_dict(file_list: &Vec<PathBuf>) -> (HashMap<String, Vec<String>>, String) {
24+
let mut file_dict = HashMap::new();
25+
let mut org = String::new();
26+
for path in file_list {
27+
let path_str = path.to_str().unwrap();
28+
let path_list: Vec<&str> = path_str.split('/').collect();
29+
let file_name = path_list[path_list.len() - 1];
30+
if file_name.to_lowercase() != "readme.txt" && file_name.to_lowercase() != "readme" {
31+
file_dict.insert(
32+
file_name.to_string(),
33+
vec![
34+
path_list[path_list.len() - 3].to_string(),
35+
path_list[path_list.len() - 2].to_string(),
36+
path_str.to_string(),
37+
],
38+
);
39+
org = path_list[path_list.len() - 3].to_string();
40+
}
41+
}
42+
(file_dict, org)
43+
}
44+
45+
fn save_data(
46+
dict_of_data: HashMap<String, Vec<String>>,
47+
save_loc: &str,
48+
org_accession: &str,
49+
) -> Result<(), Box<dyn Error>> {
50+
let save_dir = format!("{}/csv_data", save_loc);
51+
52+
let save_path = format!("{}/csv_data/{}-data.csv", save_loc, org_accession);
53+
let save_path = Path::new(&save_path);
54+
55+
// Ensure the save directory exists
56+
if !Path::new(&save_dir).exists() {
57+
fs::create_dir_all(&save_dir).unwrap();
58+
}
59+
60+
if save_path.exists() {
61+
fs::remove_file(save_path).unwrap();
62+
}
63+
64+
println!(
65+
"Generating CSV for:\t{}\nSave Path:\t\t{}",
66+
org_accession,
67+
save_path.display()
68+
);
69+
70+
println!("{}", save_dir);
71+
72+
let mut wtr = Writer::from_path(save_path)?;
73+
wtr.write_record(&["org", "type", "data_file"])?;
74+
for (_key, value) in dict_of_data {
75+
wtr.write_record(&value)?;
76+
}
77+
wtr.flush()?;
78+
Ok(())
79+
}
80+
81+
pub fn gencsv(arguments: std::option::Option<&ArgMatches>) {
82+
let geneset_folder: &String = arguments.unwrap().get_one::<String>("geneset_dir").unwrap();
83+
84+
let clade_folder = get_folder_list(&geneset_folder);
85+
86+
for clade in clade_folder {
87+
let save_clade = clade.clone();
88+
let org_folder = get_folder_list(&clade.into_os_string().into_string().unwrap());
89+
90+
// Filter out the folders ending with csv_data as these are output folders
91+
let new_org_folder: Vec<&PathBuf> = org_folder
92+
.iter()
93+
.filter(|x| !x.ends_with("csv_data"))
94+
.collect();
95+
96+
for org in new_org_folder {
97+
let mut master_list = Vec::new();
98+
99+
let accession_folder = get_folder_list(
100+
&<PathBuf as Clone>::clone(&org)
101+
.into_os_string()
102+
.into_string()
103+
.unwrap(),
104+
);
105+
106+
for accession in accession_folder {
107+
let data_list = get_folder_list(accession.to_str().unwrap());
108+
for data in data_list {
109+
master_list.push(get_file_list(data.to_str().unwrap()));
110+
}
111+
112+
let file_dict: HashMap<String, Vec<String>>;
113+
let orgs: String;
114+
(file_dict, orgs) =
115+
list_2_dict(&master_list.iter().flatten().cloned().collect());
116+
let save_loc = format!(
117+
"{}/{}",
118+
geneset_folder,
119+
save_clade.file_name().unwrap().to_str().unwrap()
120+
);
121+
let _ = save_data(file_dict, &save_loc, &orgs);
122+
}
123+
}
124+
}
125+
}
126+
}

‎src/generics.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,10 @@ pub fn validate_fasta(
3131
reader.expect("NO VALID HEADER / SEQUENCE PAIRS");
3232
for result in binding.records() {
3333
let record = result?;
34-
fasta_map.insert(record.name().to_owned(), record.sequence().len());
34+
fasta_map.insert(
35+
str::from_utf8(record.name())?.to_string(),
36+
record.sequence().len(),
37+
);
3538
}
3639
Ok(fasta_map)
3740
}

‎src/main.rs

+27-10
Original file line numberDiff line numberDiff line change
@@ -42,16 +42,24 @@ fn main() -> Result<(), Error> {
4242
.help("Path to the TreeVal yaml file generated by the user")
4343
)
4444
.arg(
45-
Arg::new("verbose")
46-
.short('v')
47-
.value_parser(clap::value_parser!(bool))
48-
.default_value("false")
49-
.help("Print explainers as to why validation fails, if it does fail")
45+
Arg::new("output_to_file")
46+
.short('f')
47+
.value_parser(clap::builder::BoolishValueParser::new())
48+
.default_value(std::ffi::OsStr::new("true"))
49+
.help("Output the log to file")
5050
)
5151
.arg(
52-
Arg::new("output")
53-
.short('o')
54-
.default_value("./")
52+
Arg::new("output_to_stdout")
53+
.short('s')
54+
.value_parser(clap::builder::BoolishValueParser::new())
55+
.default_value(std::ffi::OsStr::new("true"))
56+
.help("Output the log to file")
57+
)
58+
.arg(
59+
Arg::new("output_to_pipeline")
60+
.short('p')
61+
.value_parser(clap::builder::BoolishValueParser::new())
62+
.default_value(std::ffi::OsStr::new("true"))
5563
.help("Output the log to file")
5664
)
5765
)
@@ -320,17 +328,26 @@ fn main() -> Result<(), Error> {
320328
};
321329

322330
match match_result.subcommand_name() {
331+
// Should really be pulled out into its own program
332+
// Validator for YAML file for TreeVal and potentially CurationPretext
333+
Some("validateyaml") => validate_yaml(match_result.subcommand_matches("validateyaml")),
334+
335+
// FASTA Manipulator modules
323336
Some("splitbysize") => split_file_by_size(match_result.subcommand_matches("splitbysize")),
324337
Some("splitbycount") => {
325338
split_file_by_count(match_result.subcommand_matches("splitbycount"))
326339
}
340+
//Some("subset") => subset(match_result.subcommand_matches("subset"))
341+
//Some("profile") => profile(match_result.subcommand_matches("profile"))
327342
Some("mapheaders") => {
328343
_ = map_fasta_head(match_result.subcommand_matches("mapheaders"));
329344
}
330-
Some("validateyaml") => validate_yaml(match_result.subcommand_matches("validateyaml")),
331345
Some("remapheaders") => remapping_head(match_result.subcommand_matches("remapheaders")),
332-
Some("curate") => curate_fasta(match_result.subcommand_matches("curate")),
333346
Some("filterfasta") => filter_fasta(match_result.subcommand_matches("filterfasta")),
347+
348+
// FASTA + TPF = NEW_FASTA
349+
Some("curate") => curate_fasta(match_result.subcommand_matches("curate")),
350+
334351
_ => {
335352
unreachable!()
336353
}

‎src/split_by_size.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -106,10 +106,10 @@ pub mod split_by_size_mod {
106106
let mut record_list: Vec<Record> = Vec::new();
107107
let list: Vec<&String> = only_keys(i.1.to_owned()).collect();
108108
for ii in list {
109-
let results = fasta_repo.get(ii).transpose();
109+
let results = fasta_repo.get(ii.as_bytes()).transpose();
110110
let new_rec = match results {
111111
Ok(data) => {
112-
let definition = Definition::new(ii, None);
112+
let definition = Definition::new(ii.as_bytes(), None);
113113
Record::new(definition, data.unwrap())
114114
}
115115
Err(e) => panic!("{:?}", e),

‎src/tpf_fasta.rs

+32-25
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,12 @@ pub mod tpf_fasta_mod {
1111
use crate::generics::validate_fasta;
1212

1313
#[derive(Debug, Clone, PartialEq, Eq)]
14-
struct Tpf {
15-
ori_scaffold: String,
16-
start_coord: usize,
17-
end_coord: usize,
18-
new_scaffold: String,
19-
orientation: String,
14+
pub struct Tpf {
15+
pub ori_scaffold: String,
16+
pub start_coord: usize,
17+
pub end_coord: usize,
18+
pub new_scaffold: String,
19+
pub orientation: String,
2020
}
2121

2222
impl std::fmt::Display for Tpf {
@@ -31,9 +31,9 @@ pub mod tpf_fasta_mod {
3131
}
3232

3333
#[derive(Debug, PartialEq, Eq)]
34-
struct NewFasta {
35-
tpf: Tpf,
36-
sequence: String,
34+
pub struct NewFasta {
35+
pub tpf: Tpf,
36+
pub sequence: String,
3737
}
3838

3939
#[derive(Debug)]
@@ -42,7 +42,7 @@ pub mod tpf_fasta_mod {
4242
sequence: Vec<String>,
4343
}
4444

45-
fn parse_tpf(path: &String) -> Vec<Tpf> {
45+
pub fn parse_tpf(path: &String) -> Vec<Tpf> {
4646
// Instantiate a List of Tpf objects
4747
let mut all_tpf: Vec<Tpf> = Vec::new();
4848
for line in read_to_string(path).unwrap().lines() {
@@ -67,7 +67,7 @@ pub mod tpf_fasta_mod {
6767
all_tpf
6868
}
6969

70-
fn subset_vec_tpf<'a>(
70+
pub fn subset_vec_tpf<'a>(
7171
tpf: &'a Vec<Tpf>,
7272
fasta: (&std::string::String, &usize),
7373
) -> Vec<&'a Tpf> {
@@ -83,14 +83,14 @@ pub mod tpf_fasta_mod {
8383
subset_tpf
8484
}
8585

86-
fn check_orientation(
86+
// The TPF will contain data in both PLUS (normal) and
87+
// MINUS (inverted), if MINUS then we need to invert again
88+
// and get the complement sequence
89+
// We then return the sequence of the record.
90+
pub fn check_orientation(
8791
parsed: std::option::Option<noodles::fasta::record::Sequence>,
8892
orientation: String,
8993
) -> String {
90-
// The TPF will contain data in both PLUS (normal) and
91-
// MINUS (inverted), if MINUS then we need to invert again
92-
// and get thr complement sequence
93-
// We then return the sequence of the record.
9494
if orientation == "MINUS" {
9595
let start = Position::try_from(1).unwrap();
9696
let parse_orientation = parsed.unwrap();
@@ -108,7 +108,7 @@ pub mod tpf_fasta_mod {
108108
}
109109
}
110110

111-
fn parse_seq(
111+
pub fn parse_seq(
112112
sequence: std::option::Option<noodles::fasta::record::Sequence>,
113113
tpf: Vec<&Tpf>,
114114
) -> Vec<NewFasta> {
@@ -139,7 +139,7 @@ pub mod tpf_fasta_mod {
139139
subset_tpf
140140
}
141141

142-
fn get_uniques(tpf_list: &Vec<Tpf>) -> Vec<String> {
142+
pub fn get_uniques(tpf_list: &Vec<Tpf>) -> Vec<String> {
143143
// Get a Vec of the uniques names in the TPF Vec
144144
let mut uniques: Vec<String> = Vec::new();
145145

@@ -151,17 +151,21 @@ pub mod tpf_fasta_mod {
151151
uniques
152152
}
153153

154-
fn save_to_fasta(
154+
// The function could take in a path where the output files are stored.
155+
pub fn save_to_fasta(
155156
fasta_data: Vec<NewFasta>,
156157
tpf_data: Vec<Tpf>,
157158
output: &String,
158159
n_length: usize,
159160
) {
160161
//
161162
// TPF is in the input TPF order, this will continue to be the case until
162-
// such time that the script starts modifying the TPF in place which
163-
// we don't want to happen. Once this happens the order will no
164-
// longer be guaranteed.
163+
// such time that the script starts modifying the TPF in place.
164+
//
165+
// This now happens but this is ok as the order of the final scaffolds
166+
// isn't essential as long as the data is correct.
167+
//
168+
// In the future an optional sort function should be added
165169
//
166170
let _data_file = File::create(output);
167171
let mut file = OpenOptions::new()
@@ -180,6 +184,7 @@ pub mod tpf_fasta_mod {
180184
// This is inefficient as we are scanning through the fasta_data, uniques
181185
// ( equal to number of scaffolds) number of times
182186
// If uniques is 10 long and fasta is 100, then this is 1000 scans through in total.
187+
// we need to change x to something more descriptive
183188
for x in uniques {
184189
println!("NOW WRITING DATA FOR: {:?}", &x);
185190
// X = "SUPER_1"
@@ -194,12 +199,14 @@ pub mod tpf_fasta_mod {
194199
.expect("Unable to write to file");
195200

196201
let mut data: MyRecord = MyRecord {
202+
// would it be better to use x.clone()
197203
name: "".to_string(),
198204
sequence: Vec::new(),
199205
};
200206

201207
x.clone_into(&mut data.name);
202208
for tpf in &tpf_data {
209+
// x should be data.name and we should probably transfer ownership?
203210
if tpf.new_scaffold == x {
204211
for fasta in &fasta_data {
205212
if fasta.tpf == *tpf {
@@ -270,11 +277,11 @@ pub mod tpf_fasta_mod {
270277
Ok(data) => {
271278
let adapter = IndexedReader::new(data);
272279

273-
// Now read the fasta and return is as a queryable object
280+
// Now read the fasta and return as a queryable object
274281
let repository = fasta::Repository::new(adapter);
275282
repository
276283
}
277-
Err(_) => todo!(), // Probably just panic!
284+
Err(e) => panic!("NOODLES/STD::IO ERROR: {:?}\n Likely a malformatted FAI - Check that the seperators are TABS not spaces!!!", e),
278285
};
279286

280287
//
@@ -290,7 +297,7 @@ pub mod tpf_fasta_mod {
290297
let subset_tpf = subset_vec_tpf(&tpf_data, (&i.0, &i.1));
291298

292299
// Query the fasta for scaffold = header
293-
let sequence = fasta_repo.get(&i.0).transpose();
300+
let sequence = fasta_repo.get(&i.0.as_bytes()).transpose();
294301

295302
// if exists then get the seqeuence, return a tpf object
296303
// containing the trimmed sequence

‎src/yaml_validator.rs

+546-194
Large diffs are not rendered by default.
File renamed without changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
? SCAFFOLD_12:1-900734 RL_3 MINUS
2+
GAP TYPE-2 200
3+
? SCAFFOLD_50:1-61000 RL_3 PLUS
4+
? SCAFFOLD_26:1-201195 RL_3_unloc_1 PLUS
5+
? SCAFFOLD_84:1-2000 SCAFFOLD_84 PLUS

‎test_data/iyAndFlav1/full/iyAndFlav1_subset.fa

+19,422
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
SCAFFOLD_12 900734 13 60 61
2+
SCAFFOLD_26 201195 915773 60 61
3+
SCAFFOLD_50 61000 1120335 60 61
4+
SCAFFOLD_84 2000 1182365 60 61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
? SCAFFOLD_1:1-9 RL_1 MINUS
2+
GAP TYPE-2 200
3+
? SCAFFOLD_2:1-11 RL_1 PLUS
4+
? SCAFFOLD_3:1-5 RL_2 PLUS
5+
? SCAFFOLD_2:12-20 RL_3_unloc_1 PLUS
6+
? SCAFFOLD_3:6-10 SCAFFOLD_3 PLUS
+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
>SCAFFOLD_1
2+
ATGCATGCCGTATAACCAATGTGTGTGATGTGAGTATGCATCGTGCATCGATCGCTAGCA
3+
TGCCAGTCAGTCTA
4+
>SCAFFOLD_2
5+
ATGCATGCCGTATAACCAATGTGTGTGATGTGAGTATGCATCGTGCATCGATCGCTAGCA
6+
TGCCAGTCAGTCTA
7+
>SCAFFOLD_3
8+
AGTGTATTTTTATGCATGCCGTATAACCAATGTGTGTGATGTGAGTATGCATCGTGCATC
9+
GATCGCTAGCATGCCAGTCAGTCTA
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
SCAFFOLD_1 74 12 60 61
2+
SCAFFOLD_2 74 100 60 61
3+
SCAFFOLD_3 85 188 60 61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
>SUPER_1
2+
GGCATGCATNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
3+
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
4+
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
5+
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNATGCATGCCGT
6+
>SUPER_2
7+
AGTGT
8+
>SUPER_3_unloc_1
9+
ATAACCAAT
10+
>SCAFFOLD_3
11+
ATTTT
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
? SCAFFOLD_1:1-9 RL_1 MINUS
2+
GAP TYPE-2 200
3+
? SCAFFOLD_3:1-5 RL_2 PLUS
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
>SUPER_1
2+
SCAFFOLD_1 -- 1 -- 9
3+
>SUPER_2
4+
SCAFFOLD_3 -- 1 -- 5
+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
>SCAFFOLD_1
2+
ATGCATGCCGTATAGA
3+
>SCAFFOLD_3
4+
AGTGTATTTTTATGCA
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
SCAFFOLD_1 16 12 16 17
2+
SCAFFOLD_3 16 41 16 17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
>SUPER_1
2+
GGCATGCAT
3+
>SUPER_2
4+
AGTGT

‎test_data/yaml/test.yaml

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
assembly:
2+
assem_level: scaffold
3+
assem_version: 1
4+
sample_id: pxPlaOval8
5+
latin_name: to_provide_taxonomic_rank
6+
defined_class: fungi
7+
project_id: DTOL
8+
reference_file: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/assembly/draft/grTriPseu1.fa
9+
map_order: unsorted
10+
assem_reads:
11+
read_type: hifi
12+
read_data: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/genomic_data/pacbio2/
13+
supplementary_data: path
14+
hic_data:
15+
hic_cram: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/genomic_data/hic-arima/
16+
hic_aligner: minimap2
17+
kmer_profile:
18+
# kmer_length will act as input for kmer_read_cov fastk and as the name of folder in profile_dir
19+
kmer_length: 31
20+
dir: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/empty/
21+
alignment:
22+
data_dir: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/gene_alignment_data/
23+
common_name: "" # For future implementation (adding bee, wasp, ant etc)
24+
geneset_id: "LaetiporusSulphureus.gfLaeSulp1,Iam.Fail"
25+
#Path should end up looking like "{{data_dir}}{{classT}}/{{common_name}}/csv_data/{{geneset}}-data.csv"
26+
self_comp:
27+
motif_len: 0
28+
mummer_chunk: 20
29+
synteny:
30+
synteny_path: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/synteny/
31+
synteny_genomes: ""
32+
intron:
33+
size: "50k"
34+
telomere:
35+
teloseq: TTCAGGG
36+
busco:
37+
lineages_path: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/busco/subset
38+
lineage: fungi_odb10

‎tests/tpf_fasta.rs

+323-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,326 @@
1-
pub use fasta_manipulation::tpf_fasta::*;
1+
use assert_cmd::Command;
2+
use std::fs;
3+
use std::fs::File;
4+
use std::io::Write;
5+
6+
use noodles::fasta::record::Sequence;
7+
use tempfile::Builder;
8+
9+
use fasta_manipulation::tpf_fasta_mod::{
10+
check_orientation, get_uniques, parse_seq, parse_tpf, save_to_fasta, subset_vec_tpf, NewFasta,
11+
Tpf,
12+
};
13+
14+
mod util;
15+
16+
use util::are_files_identical;
17+
18+
// To test the check orientation function we need to publicly expose it
19+
// Is there a way to test private functions?
20+
#[test]
21+
fn check_orientation_inverts_sequence_if_minus() {
22+
let sequence = Sequence::from(b"ATGC".to_vec());
23+
let orientation = "MINUS".to_string();
24+
let result = check_orientation(Some(sequence), orientation);
25+
assert_eq!(result, "GCAT".to_string());
26+
}
27+
28+
#[test]
29+
fn check_orientation_does_not_invert_sequence_if_plus() {
30+
let sequence = Sequence::from(b"ATGC".to_vec());
31+
let orientation = "PLUS".to_string();
32+
let result = check_orientation(Some(sequence), orientation);
33+
assert_eq!(result, "ATGC".to_string());
34+
}
35+
36+
// Again we need to publicly expose the get_uniques function to test it
37+
// Also we need to publicly expose the Tpf struct attributes
38+
// Do we need a factory function to create Tpf structs?
39+
#[test]
40+
fn get_uniques_returns_unique_scaffold_names() {
41+
let tpf1 = Tpf {
42+
ori_scaffold: "scaffold1".to_string(),
43+
start_coord: 1,
44+
end_coord: 100,
45+
new_scaffold: "newScaffold1".to_string(),
46+
orientation: "PLUS".to_string(),
47+
};
48+
let tpf2 = Tpf {
49+
ori_scaffold: "scaffold2".to_string(),
50+
start_coord: 1,
51+
end_coord: 100,
52+
new_scaffold: "newScaffold2".to_string(),
53+
orientation: "PLUS".to_string(),
54+
};
55+
let tpf3 = Tpf {
56+
ori_scaffold: "scaffold1".to_string(),
57+
start_coord: 1,
58+
end_coord: 100,
59+
new_scaffold: "newScaffold1".to_string(),
60+
orientation: "PLUS".to_string(),
61+
};
62+
let tpfs = vec![tpf1, tpf2, tpf3];
63+
let result = get_uniques(&tpfs);
64+
assert_eq!(
65+
result,
66+
vec!["newScaffold1".to_string(), "newScaffold2".to_string()]
67+
);
68+
}
69+
70+
// Need to add some docs for function
71+
// as we were not entirely sure what it was doing
72+
#[test]
73+
fn get_subset_of_tpfs() {
74+
let tpf1 = Tpf {
75+
ori_scaffold: "scaffold1".to_string(),
76+
start_coord: 1,
77+
end_coord: 100,
78+
new_scaffold: "newScaffold1".to_string(),
79+
orientation: "PLUS".to_string(),
80+
};
81+
let tpf2 = Tpf {
82+
ori_scaffold: "scaffold2".to_string(),
83+
start_coord: 1,
84+
end_coord: 100,
85+
new_scaffold: "newScaffold2".to_string(),
86+
orientation: "PLUS".to_string(),
87+
};
88+
let tpf3 = Tpf {
89+
ori_scaffold: "scaffold1".to_string(),
90+
start_coord: 1,
91+
end_coord: 100,
92+
new_scaffold: "newScaffold1".to_string(),
93+
orientation: "PLUS".to_string(),
94+
};
95+
let tpfs = vec![tpf1, tpf2, tpf3];
96+
let fasta = (&"scaffold1".to_string(), &(1 as usize));
97+
let result = subset_vec_tpf(&tpfs, fasta);
98+
assert_eq!(result.len(), 2);
99+
}
100+
101+
#[test]
102+
fn check_parse_seq() {
103+
let sequence =
104+
Sequence::from(b"AATGGCCGGCGCGTTAAACCCAATGCCCCGGTTAANNGCTCGTCGCTTGCTTCGCAAAA".to_vec());
105+
let tpf1 = Tpf {
106+
ori_scaffold: "scaffold1".to_string(),
107+
start_coord: 3,
108+
end_coord: 5,
109+
new_scaffold: "newScaffold1".to_string(),
110+
orientation: "PLUS".to_string(),
111+
};
112+
let tpf2 = Tpf {
113+
ori_scaffold: "scaffold2".to_string(),
114+
start_coord: 10,
115+
end_coord: 20,
116+
new_scaffold: "newScaffold2".to_string(),
117+
orientation: "MINUS".to_string(),
118+
};
119+
let tpf3 = Tpf {
120+
ori_scaffold: "scaffold1".to_string(),
121+
start_coord: 1,
122+
end_coord: 58,
123+
new_scaffold: "newScaffold1".to_string(),
124+
orientation: "PLUS".to_string(),
125+
};
126+
127+
let tpfs = vec![&tpf1, &tpf2, &tpf3];
128+
let input_sequence = Some(sequence);
129+
130+
let new_fasta = parse_seq(input_sequence, tpfs);
131+
132+
assert_eq!(new_fasta.len(), 3);
133+
assert_eq!(new_fasta.first().unwrap().sequence, "TGG");
134+
assert_eq!(new_fasta.get(1).unwrap().sequence, "GGTTTAACGCG");
135+
assert_eq!(
136+
new_fasta.get(2).unwrap().sequence,
137+
"AATGGCCGGCGCGTTAAACCCAATGCCCCGGTTAANNGCTCGTCGCTTGCTTCGCAAA"
138+
);
139+
}
140+
141+
// This should panic with a end_coord > sequence.length
142+
// Should the exception be handled in a more graceful way?
143+
#[test]
144+
#[should_panic]
145+
fn check_parse_seq_bounds_error() {
146+
let sequence =
147+
Sequence::from(b"AATGGCCGGCGCGTTAAACCCAATGCCCCGGTTAANNGCTCGTCGCTTGCTTCGCAAAA".to_vec());
148+
let tpf = Tpf {
149+
ori_scaffold: "scaffold1".to_string(),
150+
start_coord: 10,
151+
end_coord: 60,
152+
new_scaffold: "newScaffold1".to_string(),
153+
orientation: "PLUS".to_string(),
154+
};
155+
let tpfs = vec![&tpf];
156+
157+
let input_sequence = Some(sequence);
158+
159+
parse_seq(input_sequence, tpfs);
160+
}
2161

3162
#[test]
4-
fn it_works() {
5-
assert_eq!(true, true);
163+
fn check_parse_tpf() {
164+
let path = "test_data/iyAndFlav1/full/iyAndFlav1.curated_subset.tpf".to_string();
165+
let tpfs = parse_tpf(&path);
166+
assert_eq!(tpfs.len(), 4);
167+
168+
// ? SCAFFOLD_12:1-900734 RL_3 MINUS
169+
// GAP TYPE-2 200
170+
// ? SCAFFOLD_50:1-61000 RL_3 PLUS
171+
// ? SCAFFOLD_26:1-201195 RL_3_unloc_1 PLUS
172+
// ? SCAFFOLD_84:1-2000 SCAFFOLD_84 PLUS
173+
174+
let tpf1 = tpfs.first().unwrap();
175+
assert_eq!(tpf1.ori_scaffold, "SCAFFOLD_12".to_string());
176+
assert_eq!(tpf1.start_coord, 1);
177+
assert_eq!(tpf1.end_coord, 900734);
178+
assert_eq!(tpf1.new_scaffold, "SUPER_3".to_string());
179+
assert_eq!(tpf1.orientation, "MINUS".to_string());
180+
181+
let tpf2 = tpfs.last().unwrap();
182+
assert_eq!(tpf2.ori_scaffold, "SCAFFOLD_84".to_string());
183+
assert_eq!(tpf2.start_coord, 1);
184+
assert_eq!(tpf2.end_coord, 2000);
185+
assert_eq!(tpf2.new_scaffold, "SCAFFOLD_84".to_string());
186+
assert_eq!(tpf2.orientation, "PLUS".to_string());
187+
}
188+
189+
#[test]
fn check_save_to_fasta() {
    // `save_to_fasta(new_fasta, tpf, output, n_length)` writes the curated
    // FASTA to `output` and a per-fragment trace to `debug.txt`:
    //   - each unique `new_scaffold` name from the Tpf records becomes a
    //     `>` header line,
    //   - every NewFasta whose tpf matches that scaffold contributes its
    //     sequence, with a run of `N` (`n_length` long) joining fragments,
    //   - the assembled sequence is wrapped at 60 characters per line.
    // This test feeds two single-fragment scaffolds through the function and
    // compares both output files byte-for-byte against known-good fixtures.

    let new_fasta_items = vec![
        NewFasta {
            tpf: Tpf {
                ori_scaffold: "SCAFFOLD_1".to_string(),
                start_coord: 1,
                end_coord: 9,
                new_scaffold: "SUPER_1".to_string(),
                orientation: "MINUS".to_string(),
            },
            sequence: "GGCATGCAT".to_string(),
        },
        NewFasta {
            tpf: Tpf {
                ori_scaffold: "SCAFFOLD_3".to_string(),
                start_coord: 1,
                end_coord: 5,
                new_scaffold: "SUPER_2".to_string(),
                orientation: "PLUS".to_string(),
            },
            sequence: "AGTGT".to_string(),
        },
    ];

    // The Tpf records mirror the `tpf` fields embedded in `new_fasta_items`;
    // save_to_fasta matches them by equality when grouping fragments.
    let tpf_items = vec![
        Tpf {
            ori_scaffold: "SCAFFOLD_1".to_string(),
            start_coord: 1,
            end_coord: 9,
            new_scaffold: "SUPER_1".to_string(),
            orientation: "MINUS".to_string(),
        },
        Tpf {
            ori_scaffold: "SCAFFOLD_3".to_string(),
            start_coord: 1,
            end_coord: 5,
            new_scaffold: "SUPER_2".to_string(),
            orientation: "PLUS".to_string(),
        },
    ];

    let output = &"new.fasta".to_string();
    let n_length: usize = 200;

    save_to_fasta(new_fasta_items, tpf_items, output, n_length);

    assert!(
        are_files_identical(output, "test_data/iyAndFlav1/tiny/tiny_test.output.fasta").unwrap()
    );
    assert!(
        are_files_identical("debug.txt", "test_data/iyAndFlav1/tiny/tiny_test.debug.txt").unwrap()
    );

    // Clean up the files the function under test wrote; a missing file here
    // means save_to_fasta did not produce the expected outputs.
    fs::remove_file(output).expect("File cannot be found!");
    fs::remove_file("debug.txt").expect("File cannot be found!");
}
278+
279+
// Previously marked `#[ignore = "Work in Progress (WIP)"]`; now enabled.
#[test]
fn check_curate_fasta() {
    // End-to-end check of the `curate` subcommand: build a tiny FASTA, its
    // samtools-style .fai index, and a TPF describing a 9 bp MINUS fragment,
    // a 200 bp gap, and a 5 bp PLUS fragment, then assert the binary exits
    // successfully.  Only the exit status is asserted here; output content is
    // covered by check_save_to_fasta.
    let mut cmd = Command::cargo_bin("fasta_manipulation").unwrap();

    // Temp directory for the inputs; it is removed when `dir` is dropped.
    let dir = Builder::new().prefix("local_tests").tempdir().unwrap();

    // The .fai must sit next to the FASTA with the `.fa.fai` suffix so the
    // indexed reader can find it.
    let fasta_path = &dir.path().join("input_fasta.fa");
    let fai_path = &dir.path().join("input_fasta.fa.fai");
    let tpf_path = &dir.path().join("input.tpf");

    let mut fasta = File::create(fasta_path).unwrap();
    let mut fai = File::create(fai_path).unwrap();
    let mut tpf = File::create(tpf_path).unwrap();

    // The curated FASTA is written to the working directory, not the temp dir.
    let output = "./output.fa";

    // .fai columns: name, length, byte offset, bases per line, bytes per line.
    write!(
        fai,
        "SCAFFOLD_1\t16\t12\t16\t17\nSCAFFOLD_3\t16\t41\t16\t17"
    )
    .unwrap();

    write!(
        fasta,
        ">SCAFFOLD_1\nATGCATGCCGTATAGA\n>SCAFFOLD_3\nAGTGTATTTTTATGCA"
    )
    .unwrap();

    write!(
        tpf,
        "?\tSCAFFOLD_1:1-9\tRL_1\tMINUS\nGAP\tTYPE-2\t200\n?\tSCAFFOLD_3:1-5\tRL_2\tPLUS"
    )
    .unwrap();

    cmd.arg("curate")
        .arg("-f")
        .arg(fasta_path)
        .arg("-t")
        .arg(tpf_path)
        .arg("-o")
        .arg(output)
        .assert()
        .success();

    // Best-effort cleanup: the binary writes output.fa (and possibly
    // debug.txt) into the working directory; don't leave artifacts behind
    // or fail the test if cleanup itself cannot run.
    fs::remove_file(output).ok();
    fs::remove_file("debug.txt").ok();
}

‎tests/util/mod.rs

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
use std::{fs, io::ErrorKind};
2+
3+
/// Checks whether the contents of the two files are byte-for-byte identical.
///
/// `file_path1` and `file_path2` are the paths of the files to compare.
///
/// # Errors
///
/// Returns any I/O error raised while reading either file (e.g. a missing
/// file or a permissions problem).  Callers should pattern-match the
/// `Result` to distinguish "files differ" (`Ok(false)`) from "could not
/// read" (`Err(_)`).
pub fn are_files_identical(file_path1: &str, file_path2: &str) -> std::io::Result<bool> {
    // `?` propagates NotFound and every other I/O error alike; the previous
    // version branched on `ErrorKind::NotFound` but returned the same
    // `Err(e)` in both arms, so the branch was dead code.
    Ok(fs::read(file_path1)? == fs::read(file_path2)?)
}

0 commit comments

Comments
 (0)
Please sign in to comment.