Skip to content

Commit 66ba192

Browse files
authoredAug 15, 2024··
Merge pull request #44 from Rust-Wellcome/dev
0.1.5
2 parents c0c4aa5 + 8edce07 commit 66ba192

28 files changed

+21139
-497
lines changed
 

‎.github/workflows/documentation.yml

+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
name: Documentation
2+
3+
on:
4+
push:
5+
branches:
6+
- master
7+
8+
jobs:
9+
docs:
10+
permissions:
11+
contents: write
12+
name: Documentation
13+
runs-on: ubuntu-latest
14+
steps:
15+
- name: Checkout source code
16+
uses: actions/checkout@v2
17+
with:
18+
persist-credentials: false
19+
20+
- name: Install Rust
21+
uses: actions-rs/toolchain@v1
22+
with:
23+
profile: minimal
24+
toolchain: nightly
25+
override: true
26+
27+
- name: Build documentation
28+
run: RUSTDOCFLAGS="--html-in-header katex-header.html" cargo doc --no-deps
29+
# uses: actions-rs/cargo@v1
30+
# with:
31+
# command: doc
32+
# args: --verbose --no-deps --all-features
33+
34+
- name: Finalize documentation
35+
run: |
36+
CRATE_NAME=$(echo '${{ github.repository }}-lib' | tr '[:upper:]' '[:lower:]' | cut -f2 -d"/")
37+
echo "<meta http-equiv=\"refresh\" content=\"0; url=${CRATE_NAME/-/_}\">" > target/doc/index.html
38+
touch target/doc/.nojekyll
39+
40+
- name: Upload as artifact
41+
uses: actions/upload-artifact@v2
42+
with:
43+
name: Documentation
44+
path: target/doc
45+
46+
- name: Deploy
47+
uses: JamesIves/github-pages-deploy-action@v4
48+
with:
49+
# ACCESS_TOKEN: ${{ secrets.GH_PAT }}
50+
# BRANCH: gh-pages
51+
folder: target/doc

‎.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
/target
2+
.idea

‎Cargo.lock

+423-253
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎Cargo.toml

+5-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "fasta_manipulation"
3-
version = "0.1.4"
3+
version = "0.1.5"
44
edition = "2021"
55

66
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -11,8 +11,11 @@ colored = "2.0.4"
1111
compare = "0.1.0"
1212
csv = "1.3.0"
1313
io = "0.0.2"
14-
noodles = { version = "0.52.0", features = ["fasta", "cram", "csi", "core"] }
14+
noodles = { version = "0.78.0", features = ["fasta", "cram", "csi", "core"] }
1515
regex = "1.9.5"
1616
serde = { version = "1.0.188", features = ["derive"] }
1717
serde_yaml = "0.9.25"
1818
stacker = "0.1.15"
19+
walkdir = "2.5.0"
20+
assert_cmd = "2.0.14"
21+
tempfile = "3.10.1"

‎README.md

+63-6
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,49 @@
1-
# FastaManipulator
1+
# FasMan
2+
3+
## A FastaManipulator script that is slowly doing more...
4+
5+
Originally written by @DLBPointon
6+
Now a collaborative programming project for the Rust@Wellcome group (Sanger).
7+
8+
Collaborators and contributors:
9+
10+
- @figueroakl - Genome Profiling
11+
- @stevieing - Adding tests, optimisations & CI/CD
12+
- @dasunpubudumal - Adding tests, optimisations & CI/CD
13+
14+
---
215

316
This is a re-write of the current fasta manipulation scripts I've written whilst at ToL, as well as adding some functionality needed for future projects.
417

518
Currently, this program has the following arguments:
619

7-
- yaml_validator
20+
- yaml_validator (v2)
21+
22+
THIS FUNCTION IS SPECIFIC TO THE TREEVAL.yaml FILE
23+
24+
Updated for new yaml style and now uses struct methods.
825

926
This validates a given yaml against the TreeVal yaml standard. This is specific to the TreeVal pipeline.
1027
This command will go through the yaml and validate file and directory paths as well as files are in the expected format.
1128

12-
`validateyaml ${PATH TO YAML} --verbose {DEFAULT FALSE} --output ${OUTPUT LOCATION OF LOGS}`
29+
This has been tested by downloading the TreeValTinyTest data set:
30+
31+
```bash
32+
curl https://tolit.cog.sanger.ac.uk/test-data/resources/treeval/TreeValTinyData.tar.gz | tar xzf -
33+
```
34+
35+
`validateyaml ${PATH TO YAML}`
36+
37+
TODO:
38+
39+
- Add CRAM validator to the module
40+
- Check for sorting order
41+
- SO record (added now) or
42+
- Take first 100 records and determine whether they are paired reads
43+
- Find equiv to `samtools quickcheck -vvv` for a report on completeness of cram.
44+
- if not then it will be a secondary process (external to FasMan)
45+
- Better report
46+
- Report should complete and if there are fails then panic! or std::process::exit("FAILED DUE TO: ...") this is so that it can be added to the Nextflow pipelines and cause them to error out at the right place, e.g., not rely on scanning the report.log through functions in NF.
1347

1448
- map_headers
1549

@@ -21,7 +55,7 @@ Currently, this program has the following arguments:
2155

2256
This compliments the above function by using the above generated map file to regenerate the original headers.
2357

24-
- split_by_count (NOT YET WRITTEN)
58+
- split_by_count
2559

2660
This command will generate a directory of files made up of a user given number of sequences from the input fasta. This is useful when generating geneset data for TreeVal use or sub-setting data in a non-random manner.
2761
The count will be the upper limit, as there will be a left over number of records.
@@ -30,13 +64,36 @@ Currently, this program has the following arguments:
3064

3165
`splitbycount --fasta-file ${PATH TO FASTA} --output-directory ${OUTPUT LOCATION} --count {NUMBER OF FASTA RECORDS PER FILE} --data_type ['pep','cdna', 'cds', 'rna', 'other']`
3266

33-
- split_by_size (NOT YET WRITTEN)
67+
- split_by_size
3468

3569
This command will generate a directory of files, of user given size (in MB), generated from the input fasta. This is useful for consistent sizes of files used in geneset alignments.
3670
The mem-size will be approximate as some records may exceed the chosen size, inversely, there will be a final file collecting small sequences which do not meet the limit.
3771

3872
`splitbysize --fasta-file ${PATH TO FASTA} --output-directory ${OUTPUT LOCATION} --mem-size ${SIZE OF OUTPUT FILES IN Mega Bytes}`
3973

74+
- generate_csv
75+
THIS IS SPECIFIC TO TREEVAL AND THE STRUCTURE OF THE GENESET DATA IN USE FOR IT
76+
77+
This function generates CSV files summarising the contents of a directory structure like shown below and saves this in csv_data dir:
78+
79+
```
80+
geneset_data_dir
81+
|
82+
insect
83+
|
84+
csv_data
85+
| |
86+
| ApisMellifera.AMel1-data.csv
87+
|
88+
ApisMellifera
89+
|
90+
ApisMellifera.AMel1
91+
|
92+
{pep, cdna, cds, rna}
93+
|
94+
split.fasta files
95+
```
96+
4097
- curate
4198

4299
Use a tpf and fasta file to generate a curated fasta file.
@@ -55,7 +112,7 @@ Currently, this program has the following arguments:
55112

56113
`mergehaps -p primary.fasta -s secondary.fasta -n PRI/HAP -o merged.fasta`
57114

58-
- profile (NOT YET WRITTEN)
115+
- profile (IN PROGRESS)
59116

60117
Profile a given fasta file:
61118

‎src/exclude_seq.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ pub mod exclude_seq_mod {
2727
let mut binding = fasta;
2828
for result in binding.records() {
2929
let record = result?;
30-
if !exclusions.contains(&record.name()) {
30+
let record_name = str::from_utf8(record.name())?;
31+
if !exclusions.contains(&record_name) {
3132
writer.write_record(&record)?;
3233
} else {
3334
println!("Found record to exclude: {:?}", &record.name());

‎src/generate_csv.rs

+126
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
/// Generate CSV generates a csv file which describes a specific data directory /User/...../geneset_alignment_data/insect/ApisMellifera/ApisMellifera.AMel1_1/{pep,cdna,rna,cds}/files.fa
2+
/// This is for data tracking for TreeVal
3+
/// This may be replaced or enhanced with a function to send this to a Google Sheets so the team has an easier way of tracking it all.
4+
pub mod gencsv_mod {
5+
use crate::generics::get_folder_list;
6+
use clap::ArgMatches;
7+
use csv::Writer;
8+
use std::collections::HashMap;
9+
use std::error::Error;
10+
use std::{fs, path::Path, path::PathBuf};
11+
use walkdir::WalkDir;
12+
13+
fn get_file_list(root: &str) -> Vec<PathBuf> {
14+
WalkDir::new(root)
15+
.into_iter()
16+
.filter_map(|e| e.ok())
17+
.filter(|e| e.file_type().is_file())
18+
.map(|e| e.into_path())
19+
.collect()
20+
}
21+
22+
// Function to convert list to dictionary
23+
fn list_2_dict(file_list: &Vec<PathBuf>) -> (HashMap<String, Vec<String>>, String) {
24+
let mut file_dict = HashMap::new();
25+
let mut org = String::new();
26+
for path in file_list {
27+
let path_str = path.to_str().unwrap();
28+
let path_list: Vec<&str> = path_str.split('/').collect();
29+
let file_name = path_list[path_list.len() - 1];
30+
if file_name.to_lowercase() != "readme.txt" && file_name.to_lowercase() != "readme" {
31+
file_dict.insert(
32+
file_name.to_string(),
33+
vec![
34+
path_list[path_list.len() - 3].to_string(),
35+
path_list[path_list.len() - 2].to_string(),
36+
path_str.to_string(),
37+
],
38+
);
39+
org = path_list[path_list.len() - 3].to_string();
40+
}
41+
}
42+
(file_dict, org)
43+
}
44+
45+
fn save_data(
46+
dict_of_data: HashMap<String, Vec<String>>,
47+
save_loc: &str,
48+
org_accession: &str,
49+
) -> Result<(), Box<dyn Error>> {
50+
let save_dir = format!("{}/csv_data", save_loc);
51+
52+
let save_path = format!("{}/csv_data/{}-data.csv", save_loc, org_accession);
53+
let save_path = Path::new(&save_path);
54+
55+
// Ensure the save directory exists
56+
if !Path::new(&save_dir).exists() {
57+
fs::create_dir_all(&save_dir).unwrap();
58+
}
59+
60+
if save_path.exists() {
61+
fs::remove_file(save_path).unwrap();
62+
}
63+
64+
println!(
65+
"Generating CSV for:\t{}\nSave Path:\t\t{}",
66+
org_accession,
67+
save_path.display()
68+
);
69+
70+
println!("{}", save_dir);
71+
72+
let mut wtr = Writer::from_path(save_path)?;
73+
wtr.write_record(&["org", "type", "data_file"])?;
74+
for (_key, value) in dict_of_data {
75+
wtr.write_record(&value)?;
76+
}
77+
wtr.flush()?;
78+
Ok(())
79+
}
80+
81+
pub fn gencsv(arguments: std::option::Option<&ArgMatches>) {
82+
let geneset_folder: &String = arguments.unwrap().get_one::<String>("geneset_dir").unwrap();
83+
84+
let clade_folder = get_folder_list(&geneset_folder);
85+
86+
for clade in clade_folder {
87+
let save_clade = clade.clone();
88+
let org_folder = get_folder_list(&clade.into_os_string().into_string().unwrap());
89+
90+
// Filter out the folders ending with csv_data as these are output folders
91+
let new_org_folder: Vec<&PathBuf> = org_folder
92+
.iter()
93+
.filter(|x| !x.ends_with("csv_data"))
94+
.collect();
95+
96+
for org in new_org_folder {
97+
let mut master_list = Vec::new();
98+
99+
let accession_folder = get_folder_list(
100+
&<PathBuf as Clone>::clone(&org)
101+
.into_os_string()
102+
.into_string()
103+
.unwrap(),
104+
);
105+
106+
for accession in accession_folder {
107+
let data_list = get_folder_list(accession.to_str().unwrap());
108+
for data in data_list {
109+
master_list.push(get_file_list(data.to_str().unwrap()));
110+
}
111+
112+
let file_dict: HashMap<String, Vec<String>>;
113+
let orgs: String;
114+
(file_dict, orgs) =
115+
list_2_dict(&master_list.iter().flatten().cloned().collect());
116+
let save_loc = format!(
117+
"{}/{}",
118+
geneset_folder,
119+
save_clade.file_name().unwrap().to_str().unwrap()
120+
);
121+
let _ = save_data(file_dict, &save_loc, &orgs);
122+
}
123+
}
124+
}
125+
}
126+
}

‎src/generics.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,10 @@ pub fn validate_fasta(
3131
reader.expect("NO VALID HEADER / SEQUENCE PAIRS");
3232
for result in binding.records() {
3333
let record = result?;
34-
fasta_map.insert(record.name().to_owned(), record.sequence().len());
34+
fasta_map.insert(
35+
str::from_utf8(record.name())?.to_string(),
36+
record.sequence().len(),
37+
);
3538
}
3639
Ok(fasta_map)
3740
}

‎src/main.rs

+27-10
Original file line numberDiff line numberDiff line change
@@ -42,16 +42,24 @@ fn main() -> Result<(), Error> {
4242
.help("Path to the TreeVal yaml file generated by the user")
4343
)
4444
.arg(
45-
Arg::new("verbose")
46-
.short('v')
47-
.value_parser(clap::value_parser!(bool))
48-
.default_value("false")
49-
.help("Print explainers as to why validation fails, if it does fail")
45+
Arg::new("output_to_file")
46+
.short('f')
47+
.value_parser(clap::builder::BoolishValueParser::new())
48+
.default_value(std::ffi::OsStr::new("true"))
49+
.help("Output the log to file")
5050
)
5151
.arg(
52-
Arg::new("output")
53-
.short('o')
54-
.default_value("./")
52+
Arg::new("output_to_stdout")
53+
.short('s')
54+
.value_parser(clap::builder::BoolishValueParser::new())
55+
.default_value(std::ffi::OsStr::new("true"))
56+
.help("Output the log to file")
57+
)
58+
.arg(
59+
Arg::new("output_to_pipeline")
60+
.short('p')
61+
.value_parser(clap::builder::BoolishValueParser::new())
62+
.default_value(std::ffi::OsStr::new("true"))
5563
.help("Output the log to file")
5664
)
5765
)
@@ -320,17 +328,26 @@ fn main() -> Result<(), Error> {
320328
};
321329

322330
match match_result.subcommand_name() {
331+
// Should really be pulled out into its own program
332+
// Validator for YAML file for TreeVal and potentially CurationPretext
333+
Some("validateyaml") => validate_yaml(match_result.subcommand_matches("validateyaml")),
334+
335+
// FASTA Manipulator modules
323336
Some("splitbysize") => split_file_by_size(match_result.subcommand_matches("splitbysize")),
324337
Some("splitbycount") => {
325338
split_file_by_count(match_result.subcommand_matches("splitbycount"))
326339
}
340+
//Some("subset") => subset(match_result.subcommand_matches("subset"))
341+
//Some("profile") => profile(match_result.subcommand_matches("profile"))
327342
Some("mapheaders") => {
328343
_ = map_fasta_head(match_result.subcommand_matches("mapheaders"));
329344
}
330-
Some("validateyaml") => validate_yaml(match_result.subcommand_matches("validateyaml")),
331345
Some("remapheaders") => remapping_head(match_result.subcommand_matches("remapheaders")),
332-
Some("curate") => curate_fasta(match_result.subcommand_matches("curate")),
333346
Some("filterfasta") => filter_fasta(match_result.subcommand_matches("filterfasta")),
347+
348+
// FASTA + TPF = NEW_FASTA
349+
Some("curate") => curate_fasta(match_result.subcommand_matches("curate")),
350+
334351
_ => {
335352
unreachable!()
336353
}

‎src/split_by_size.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -106,10 +106,10 @@ pub mod split_by_size_mod {
106106
let mut record_list: Vec<Record> = Vec::new();
107107
let list: Vec<&String> = only_keys(i.1.to_owned()).collect();
108108
for ii in list {
109-
let results = fasta_repo.get(ii).transpose();
109+
let results = fasta_repo.get(ii.as_bytes()).transpose();
110110
let new_rec = match results {
111111
Ok(data) => {
112-
let definition = Definition::new(ii, None);
112+
let definition = Definition::new(ii.as_bytes(), None);
113113
Record::new(definition, data.unwrap())
114114
}
115115
Err(e) => panic!("{:?}", e),

‎src/tpf_fasta.rs

+32-25
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,12 @@ pub mod tpf_fasta_mod {
1111
use crate::generics::validate_fasta;
1212

1313
#[derive(Debug, Clone, PartialEq, Eq)]
14-
struct Tpf {
15-
ori_scaffold: String,
16-
start_coord: usize,
17-
end_coord: usize,
18-
new_scaffold: String,
19-
orientation: String,
14+
pub struct Tpf {
15+
pub ori_scaffold: String,
16+
pub start_coord: usize,
17+
pub end_coord: usize,
18+
pub new_scaffold: String,
19+
pub orientation: String,
2020
}
2121

2222
impl std::fmt::Display for Tpf {
@@ -31,9 +31,9 @@ pub mod tpf_fasta_mod {
3131
}
3232

3333
#[derive(Debug, PartialEq, Eq)]
34-
struct NewFasta {
35-
tpf: Tpf,
36-
sequence: String,
34+
pub struct NewFasta {
35+
pub tpf: Tpf,
36+
pub sequence: String,
3737
}
3838

3939
#[derive(Debug)]
@@ -42,7 +42,7 @@ pub mod tpf_fasta_mod {
4242
sequence: Vec<String>,
4343
}
4444

45-
fn parse_tpf(path: &String) -> Vec<Tpf> {
45+
pub fn parse_tpf(path: &String) -> Vec<Tpf> {
4646
// Instantiate a List of Tpf objects
4747
let mut all_tpf: Vec<Tpf> = Vec::new();
4848
for line in read_to_string(path).unwrap().lines() {
@@ -67,7 +67,7 @@ pub mod tpf_fasta_mod {
6767
all_tpf
6868
}
6969

70-
fn subset_vec_tpf<'a>(
70+
pub fn subset_vec_tpf<'a>(
7171
tpf: &'a Vec<Tpf>,
7272
fasta: (&std::string::String, &usize),
7373
) -> Vec<&'a Tpf> {
@@ -83,14 +83,14 @@ pub mod tpf_fasta_mod {
8383
subset_tpf
8484
}
8585

86-
fn check_orientation(
86+
// The TPF will contain data in both PLUS (normal) and
87+
// MINUS (inverted), if MINUS then we need to invert again
88+
// and get the complement sequence
89+
// We then return the sequence of the record.
90+
pub fn check_orientation(
8791
parsed: std::option::Option<noodles::fasta::record::Sequence>,
8892
orientation: String,
8993
) -> String {
90-
// The TPF will contain data in both PLUS (normal) and
91-
// MINUS (inverted), if MINUS then we need to invert again
92-
// and get thr complement sequence
93-
// We then return the sequence of the record.
9494
if orientation == "MINUS" {
9595
let start = Position::try_from(1).unwrap();
9696
let parse_orientation = parsed.unwrap();
@@ -108,7 +108,7 @@ pub mod tpf_fasta_mod {
108108
}
109109
}
110110

111-
fn parse_seq(
111+
pub fn parse_seq(
112112
sequence: std::option::Option<noodles::fasta::record::Sequence>,
113113
tpf: Vec<&Tpf>,
114114
) -> Vec<NewFasta> {
@@ -139,7 +139,7 @@ pub mod tpf_fasta_mod {
139139
subset_tpf
140140
}
141141

142-
fn get_uniques(tpf_list: &Vec<Tpf>) -> Vec<String> {
142+
pub fn get_uniques(tpf_list: &Vec<Tpf>) -> Vec<String> {
143143
// Get a Vec of the uniques names in the TPF Vec
144144
let mut uniques: Vec<String> = Vec::new();
145145

@@ -151,17 +151,21 @@ pub mod tpf_fasta_mod {
151151
uniques
152152
}
153153

154-
fn save_to_fasta(
154+
// The function could take in a path where the output files are stored.
155+
pub fn save_to_fasta(
155156
fasta_data: Vec<NewFasta>,
156157
tpf_data: Vec<Tpf>,
157158
output: &String,
158159
n_length: usize,
159160
) {
160161
//
161162
// TPF is in the input TPF order, this will continue to be the case until
162-
// such time that the script starts modifying the TPF in place which
163-
// we don't want to happen. Once this happens the order will no
164-
// longer be guaranteed.
163+
// such time that the script starts modifying the TPF in place.
164+
//
165+
// This now happens but this is ok as the order of the final scaffolds
166+
// isn't essential as long as the data is correct.
167+
//
168+
// In the future an optional sort function should be added
165169
//
166170
let _data_file = File::create(output);
167171
let mut file = OpenOptions::new()
@@ -180,6 +184,7 @@ pub mod tpf_fasta_mod {
180184
// This is inefficient as we are scanning through the fasta_data, uniques
181185
// ( equal to number of scaffolds) number of times
182186
// If uniques is 10 long and fasta is 100, then this is 1000 scans through in total.
187+
// we need to change x to something more descriptive
183188
for x in uniques {
184189
println!("NOW WRITING DATA FOR: {:?}", &x);
185190
// X = "SUPER_1"
@@ -194,12 +199,14 @@ pub mod tpf_fasta_mod {
194199
.expect("Unable to write to file");
195200

196201
let mut data: MyRecord = MyRecord {
202+
// would it be better to use x.clone()
197203
name: "".to_string(),
198204
sequence: Vec::new(),
199205
};
200206

201207
x.clone_into(&mut data.name);
202208
for tpf in &tpf_data {
209+
// x should be data.name and we should probably transfer ownership?
203210
if tpf.new_scaffold == x {
204211
for fasta in &fasta_data {
205212
if fasta.tpf == *tpf {
@@ -270,11 +277,11 @@ pub mod tpf_fasta_mod {
270277
Ok(data) => {
271278
let adapter = IndexedReader::new(data);
272279

273-
// Now read the fasta and return is as a queryable object
280+
// Now read the fasta and return as a queryable object
274281
let repository = fasta::Repository::new(adapter);
275282
repository
276283
}
277-
Err(_) => todo!(), // Probably just panic!
284+
Err(e) => panic!("NOODLES/STD::IO ERROR: {:?}\n Likely a malformatted FAI - Check that the seperators are TABS not spaces!!!", e),
278285
};
279286

280287
//
@@ -290,7 +297,7 @@ pub mod tpf_fasta_mod {
290297
let subset_tpf = subset_vec_tpf(&tpf_data, (&i.0, &i.1));
291298

292299
// Query the fasta for scaffold = header
293-
let sequence = fasta_repo.get(&i.0).transpose();
300+
let sequence = fasta_repo.get(&i.0.as_bytes()).transpose();
294301

295302
// if exists then get the seqeuence, return a tpf object
296303
// containing the trimmed sequence

‎src/yaml_validator.rs

+546-194
Large diffs are not rendered by default.
File renamed without changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
? SCAFFOLD_12:1-900734 RL_3 MINUS
2+
GAP TYPE-2 200
3+
? SCAFFOLD_50:1-61000 RL_3 PLUS
4+
? SCAFFOLD_26:1-201195 RL_3_unloc_1 PLUS
5+
? SCAFFOLD_84:1-2000 SCAFFOLD_84 PLUS

‎test_data/iyAndFlav1/full/iyAndFlav1_subset.fa

+19,422
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
SCAFFOLD_12 900734 13 60 61
2+
SCAFFOLD_26 201195 915773 60 61
3+
SCAFFOLD_50 61000 1120335 60 61
4+
SCAFFOLD_84 2000 1182365 60 61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
? SCAFFOLD_1:1-9 RL_1 MINUS
2+
GAP TYPE-2 200
3+
? SCAFFOLD_2:1-11 RL_1 PLUS
4+
? SCAFFOLD_3:1-5 RL_2 PLUS
5+
? SCAFFOLD_2:12-20 RL_3_unloc_1 PLUS
6+
? SCAFFOLD_3:6-10 SCAFFOLD_3 PLUS
+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
>SCAFFOLD_1
2+
ATGCATGCCGTATAACCAATGTGTGTGATGTGAGTATGCATCGTGCATCGATCGCTAGCA
3+
TGCCAGTCAGTCTA
4+
>SCAFFOLD_2
5+
ATGCATGCCGTATAACCAATGTGTGTGATGTGAGTATGCATCGTGCATCGATCGCTAGCA
6+
TGCCAGTCAGTCTA
7+
>SCAFFOLD_3
8+
AGTGTATTTTTATGCATGCCGTATAACCAATGTGTGTGATGTGAGTATGCATCGTGCATC
9+
GATCGCTAGCATGCCAGTCAGTCTA
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
SCAFFOLD_1 74 12 60 61
2+
SCAFFOLD_2 74 100 60 61
3+
SCAFFOLD_3 85 188 60 61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
>SUPER_1
2+
GGCATGCATNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
3+
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
4+
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
5+
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNATGCATGCCGT
6+
>SUPER_2
7+
AGTGT
8+
>SUPER_3_unloc_1
9+
ATAACCAAT
10+
>SCAFFOLD_3
11+
ATTTT
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
? SCAFFOLD_1:1-9 RL_1 MINUS
2+
GAP TYPE-2 200
3+
? SCAFFOLD_3:1-5 RL_2 PLUS
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
>SUPER_1
2+
SCAFFOLD_1 -- 1 -- 9
3+
>SUPER_2
4+
SCAFFOLD_3 -- 1 -- 5
+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
>SCAFFOLD_1
2+
ATGCATGCCGTATAGA
3+
>SCAFFOLD_3
4+
AGTGTATTTTTATGCA
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
SCAFFOLD_1 16 12 16 17
2+
SCAFFOLD_3 16 41 16 17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
>SUPER_1
2+
GGCATGCAT
3+
>SUPER_2
4+
AGTGT

‎test_data/yaml/test.yaml

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
assembly:
2+
assem_level: scaffold
3+
assem_version: 1
4+
sample_id: pxPlaOval8
5+
latin_name: to_provide_taxonomic_rank
6+
defined_class: fungi
7+
project_id: DTOL
8+
reference_file: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/assembly/draft/grTriPseu1.fa
9+
map_order: unsorted
10+
assem_reads:
11+
read_type: hifi
12+
read_data: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/genomic_data/pacbio2/
13+
supplementary_data: path
14+
hic_data:
15+
hic_cram: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/genomic_data/hic-arima/
16+
hic_aligner: minimap2
17+
kmer_profile:
18+
# kmer_length will act as input for kmer_read_cov fastk and as the name of folder in profile_dir
19+
kmer_length: 31
20+
dir: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/empty/
21+
alignment:
22+
data_dir: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/gene_alignment_data/
23+
common_name: "" # For future implementation (adding bee, wasp, ant etc)
24+
geneset_id: "LaetiporusSulphureus.gfLaeSulp1,Iam.Fail"
25+
#Path should end up looking like "{{data_dir}}{{classT}}/{{common_name}}/csv_data/{{geneset}}-data.csv"
26+
self_comp:
27+
motif_len: 0
28+
mummer_chunk: 20
29+
synteny:
30+
synteny_path: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/synteny/
31+
synteny_genomes: ""
32+
intron:
33+
size: "50k"
34+
telomere:
35+
teloseq: TTCAGGG
36+
busco:
37+
lineages_path: /Users/dp24/Documents/FastaManipulator/TreeValTinyData/busco/subset
38+
lineage: fungi_odb10

‎tests/tpf_fasta.rs

+323-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,326 @@
1-
pub use fasta_manipulation::tpf_fasta::*;
1+
use assert_cmd::Command;
2+
use std::fs;
3+
use std::fs::File;
4+
use std::io::Write;
5+
6+
use noodles::fasta::record::Sequence;
7+
use tempfile::Builder;
8+
9+
use fasta_manipulation::tpf_fasta_mod::{
10+
check_orientation, get_uniques, parse_seq, parse_tpf, save_to_fasta, subset_vec_tpf, NewFasta,
11+
Tpf,
12+
};
13+
14+
mod util;
15+
16+
use util::are_files_identical;
17+
18+
// To test the check orientation function we need to publicly expose it
19+
// Is there a way to test private functions?
20+
#[test]
21+
fn check_orientation_inverts_sequence_if_minus() {
22+
let sequence = Sequence::from(b"ATGC".to_vec());
23+
let orientation = "MINUS".to_string();
24+
let result = check_orientation(Some(sequence), orientation);
25+
assert_eq!(result, "GCAT".to_string());
26+
}
27+
28+
#[test]
29+
fn check_orientation_does_not_invert_sequence_if_plus() {
30+
let sequence = Sequence::from(b"ATGC".to_vec());
31+
let orientation = "PLUS".to_string();
32+
let result = check_orientation(Some(sequence), orientation);
33+
assert_eq!(result, "ATGC".to_string());
34+
}
35+
36+
// Again we need to publicly expose the get_uniques function to test it
37+
// Also we need to publicly expose the Tpf struct attributes
38+
// Do we need a factory function to create Tpf structs?
39+
#[test]
40+
fn get_uniques_returns_unique_scaffold_names() {
41+
let tpf1 = Tpf {
42+
ori_scaffold: "scaffold1".to_string(),
43+
start_coord: 1,
44+
end_coord: 100,
45+
new_scaffold: "newScaffold1".to_string(),
46+
orientation: "PLUS".to_string(),
47+
};
48+
let tpf2 = Tpf {
49+
ori_scaffold: "scaffold2".to_string(),
50+
start_coord: 1,
51+
end_coord: 100,
52+
new_scaffold: "newScaffold2".to_string(),
53+
orientation: "PLUS".to_string(),
54+
};
55+
let tpf3 = Tpf {
56+
ori_scaffold: "scaffold1".to_string(),
57+
start_coord: 1,
58+
end_coord: 100,
59+
new_scaffold: "newScaffold1".to_string(),
60+
orientation: "PLUS".to_string(),
61+
};
62+
let tpfs = vec![tpf1, tpf2, tpf3];
63+
let result = get_uniques(&tpfs);
64+
assert_eq!(
65+
result,
66+
vec!["newScaffold1".to_string(), "newScaffold2".to_string()]
67+
);
68+
}
69+
70+
// Need to add some docs for function
71+
// as we were not entirely sure what it was doing
72+
#[test]
73+
fn get_subset_of_tpfs() {
74+
let tpf1 = Tpf {
75+
ori_scaffold: "scaffold1".to_string(),
76+
start_coord: 1,
77+
end_coord: 100,
78+
new_scaffold: "newScaffold1".to_string(),
79+
orientation: "PLUS".to_string(),
80+
};
81+
let tpf2 = Tpf {
82+
ori_scaffold: "scaffold2".to_string(),
83+
start_coord: 1,
84+
end_coord: 100,
85+
new_scaffold: "newScaffold2".to_string(),
86+
orientation: "PLUS".to_string(),
87+
};
88+
let tpf3 = Tpf {
89+
ori_scaffold: "scaffold1".to_string(),
90+
start_coord: 1,
91+
end_coord: 100,
92+
new_scaffold: "newScaffold1".to_string(),
93+
orientation: "PLUS".to_string(),
94+
};
95+
let tpfs = vec![tpf1, tpf2, tpf3];
96+
let fasta = (&"scaffold1".to_string(), &(1 as usize));
97+
let result = subset_vec_tpf(&tpfs, fasta);
98+
assert_eq!(result.len(), 2);
99+
}
100+
101+
#[test]
102+
fn check_parse_seq() {
103+
let sequence =
104+
Sequence::from(b"AATGGCCGGCGCGTTAAACCCAATGCCCCGGTTAANNGCTCGTCGCTTGCTTCGCAAAA".to_vec());
105+
let tpf1 = Tpf {
106+
ori_scaffold: "scaffold1".to_string(),
107+
start_coord: 3,
108+
end_coord: 5,
109+
new_scaffold: "newScaffold1".to_string(),
110+
orientation: "PLUS".to_string(),
111+
};
112+
let tpf2 = Tpf {
113+
ori_scaffold: "scaffold2".to_string(),
114+
start_coord: 10,
115+
end_coord: 20,
116+
new_scaffold: "newScaffold2".to_string(),
117+
orientation: "MINUS".to_string(),
118+
};
119+
let tpf3 = Tpf {
120+
ori_scaffold: "scaffold1".to_string(),
121+
start_coord: 1,
122+
end_coord: 58,
123+
new_scaffold: "newScaffold1".to_string(),
124+
orientation: "PLUS".to_string(),
125+
};
126+
127+
let tpfs = vec![&tpf1, &tpf2, &tpf3];
128+
let input_sequence = Some(sequence);
129+
130+
let new_fasta = parse_seq(input_sequence, tpfs);
131+
132+
assert_eq!(new_fasta.len(), 3);
133+
assert_eq!(new_fasta.first().unwrap().sequence, "TGG");
134+
assert_eq!(new_fasta.get(1).unwrap().sequence, "GGTTTAACGCG");
135+
assert_eq!(
136+
new_fasta.get(2).unwrap().sequence,
137+
"AATGGCCGGCGCGTTAAACCCAATGCCCCGGTTAANNGCTCGTCGCTTGCTTCGCAAA"
138+
);
139+
}
140+
141+
// This should panic with a end_coord > sequence.length
142+
// Should the exception be handled in a more graceful way?
143+
#[test]
144+
#[should_panic]
145+
fn check_parse_seq_bounds_error() {
146+
let sequence =
147+
Sequence::from(b"AATGGCCGGCGCGTTAAACCCAATGCCCCGGTTAANNGCTCGTCGCTTGCTTCGCAAAA".to_vec());
148+
let tpf = Tpf {
149+
ori_scaffold: "scaffold1".to_string(),
150+
start_coord: 10,
151+
end_coord: 60,
152+
new_scaffold: "newScaffold1".to_string(),
153+
orientation: "PLUS".to_string(),
154+
};
155+
let tpfs = vec![&tpf];
156+
157+
let input_sequence = Some(sequence);
158+
159+
parse_seq(input_sequence, tpfs);
160+
}
2161

3162
#[test]
4-
fn it_works() {
5-
assert_eq!(true, true);
163+
fn check_parse_tpf() {
164+
let path = "test_data/iyAndFlav1/full/iyAndFlav1.curated_subset.tpf".to_string();
165+
let tpfs = parse_tpf(&path);
166+
assert_eq!(tpfs.len(), 4);
167+
168+
// ? SCAFFOLD_12:1-900734 RL_3 MINUS
169+
// GAP TYPE-2 200
170+
// ? SCAFFOLD_50:1-61000 RL_3 PLUS
171+
// ? SCAFFOLD_26:1-201195 RL_3_unloc_1 PLUS
172+
// ? SCAFFOLD_84:1-2000 SCAFFOLD_84 PLUS
173+
174+
let tpf1 = tpfs.first().unwrap();
175+
assert_eq!(tpf1.ori_scaffold, "SCAFFOLD_12".to_string());
176+
assert_eq!(tpf1.start_coord, 1);
177+
assert_eq!(tpf1.end_coord, 900734);
178+
assert_eq!(tpf1.new_scaffold, "SUPER_3".to_string());
179+
assert_eq!(tpf1.orientation, "MINUS".to_string());
180+
181+
let tpf2 = tpfs.last().unwrap();
182+
assert_eq!(tpf2.ori_scaffold, "SCAFFOLD_84".to_string());
183+
assert_eq!(tpf2.start_coord, 1);
184+
assert_eq!(tpf2.end_coord, 2000);
185+
assert_eq!(tpf2.new_scaffold, "SCAFFOLD_84".to_string());
186+
assert_eq!(tpf2.orientation, "PLUS".to_string());
187+
}
188+
189+
#[test]
fn check_save_to_fasta() {
    // `save_to_fasta(new_fasta, tpf, output, n_length)` writes the curated
    // FASTA to `output` and a per-fragment trace to `debug.txt`:
    //   - each unique `new_scaffold` name from the Tpf records becomes a
    //     `>` header line,
    //   - every NewFasta whose tpf matches that scaffold contributes its
    //     sequence, with a run of `N` (`n_length` long) joining fragments,
    //   - the assembled sequence is wrapped at 60 characters per line.
    // This test feeds two single-fragment scaffolds through the function and
    // compares both output files byte-for-byte against known-good fixtures.

    let new_fasta_items = vec![
        NewFasta {
            tpf: Tpf {
                ori_scaffold: "SCAFFOLD_1".to_string(),
                start_coord: 1,
                end_coord: 9,
                new_scaffold: "SUPER_1".to_string(),
                orientation: "MINUS".to_string(),
            },
            sequence: "GGCATGCAT".to_string(),
        },
        NewFasta {
            tpf: Tpf {
                ori_scaffold: "SCAFFOLD_3".to_string(),
                start_coord: 1,
                end_coord: 5,
                new_scaffold: "SUPER_2".to_string(),
                orientation: "PLUS".to_string(),
            },
            sequence: "AGTGT".to_string(),
        },
    ];

    // The Tpf records mirror the `tpf` fields embedded in `new_fasta_items`;
    // save_to_fasta matches them by equality when grouping fragments.
    let tpf_items = vec![
        Tpf {
            ori_scaffold: "SCAFFOLD_1".to_string(),
            start_coord: 1,
            end_coord: 9,
            new_scaffold: "SUPER_1".to_string(),
            orientation: "MINUS".to_string(),
        },
        Tpf {
            ori_scaffold: "SCAFFOLD_3".to_string(),
            start_coord: 1,
            end_coord: 5,
            new_scaffold: "SUPER_2".to_string(),
            orientation: "PLUS".to_string(),
        },
    ];

    let output = &"new.fasta".to_string();
    let n_length: usize = 200;

    save_to_fasta(new_fasta_items, tpf_items, output, n_length);

    assert!(
        are_files_identical(output, "test_data/iyAndFlav1/tiny/tiny_test.output.fasta").unwrap()
    );
    assert!(
        are_files_identical("debug.txt", "test_data/iyAndFlav1/tiny/tiny_test.debug.txt").unwrap()
    );

    // Clean up the files the function under test wrote; a missing file here
    // means save_to_fasta did not produce the expected outputs.
    fs::remove_file(output).expect("File cannot be found!");
    fs::remove_file("debug.txt").expect("File cannot be found!");
}
278+
279+
// Previously marked `#[ignore = "Work in Progress (WIP)"]`; now enabled.
#[test]
fn check_curate_fasta() {
    // End-to-end check of the `curate` subcommand: build a tiny FASTA, its
    // samtools-style .fai index, and a TPF describing a 9 bp MINUS fragment,
    // a 200 bp gap, and a 5 bp PLUS fragment, then assert the binary exits
    // successfully.  Only the exit status is asserted here; output content is
    // covered by check_save_to_fasta.
    let mut cmd = Command::cargo_bin("fasta_manipulation").unwrap();

    // Temp directory for the inputs; it is removed when `dir` is dropped.
    let dir = Builder::new().prefix("local_tests").tempdir().unwrap();

    // The .fai must sit next to the FASTA with the `.fa.fai` suffix so the
    // indexed reader can find it.
    let fasta_path = &dir.path().join("input_fasta.fa");
    let fai_path = &dir.path().join("input_fasta.fa.fai");
    let tpf_path = &dir.path().join("input.tpf");

    let mut fasta = File::create(fasta_path).unwrap();
    let mut fai = File::create(fai_path).unwrap();
    let mut tpf = File::create(tpf_path).unwrap();

    // The curated FASTA is written to the working directory, not the temp dir.
    let output = "./output.fa";

    // .fai columns: name, length, byte offset, bases per line, bytes per line.
    write!(
        fai,
        "SCAFFOLD_1\t16\t12\t16\t17\nSCAFFOLD_3\t16\t41\t16\t17"
    )
    .unwrap();

    write!(
        fasta,
        ">SCAFFOLD_1\nATGCATGCCGTATAGA\n>SCAFFOLD_3\nAGTGTATTTTTATGCA"
    )
    .unwrap();

    write!(
        tpf,
        "?\tSCAFFOLD_1:1-9\tRL_1\tMINUS\nGAP\tTYPE-2\t200\n?\tSCAFFOLD_3:1-5\tRL_2\tPLUS"
    )
    .unwrap();

    cmd.arg("curate")
        .arg("-f")
        .arg(fasta_path)
        .arg("-t")
        .arg(tpf_path)
        .arg("-o")
        .arg(output)
        .assert()
        .success();

    // Best-effort cleanup: the binary writes output.fa (and possibly
    // debug.txt) into the working directory; don't leave artifacts behind
    // or fail the test if cleanup itself cannot run.
    fs::remove_file(output).ok();
    fs::remove_file("debug.txt").ok();
}

‎tests/util/mod.rs

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
use std::{fs, io::ErrorKind};
2+
3+
/// Checks whether the contents of the two files are byte-for-byte identical.
///
/// `file_path1` and `file_path2` are the paths of the files to compare.
///
/// # Errors
///
/// Returns any I/O error raised while reading either file (e.g. a missing
/// file or a permissions problem).  Callers should pattern-match the
/// `Result` to distinguish "files differ" (`Ok(false)`) from "could not
/// read" (`Err(_)`).
pub fn are_files_identical(file_path1: &str, file_path2: &str) -> std::io::Result<bool> {
    // `?` propagates NotFound and every other I/O error alike; the previous
    // version branched on `ErrorKind::NotFound` but returned the same
    // `Err(e)` in both arms, so the branch was dead code.
    Ok(fs::read(file_path1)? == fs::read(file_path2)?)
}

0 commit comments

Comments
 (0)
Please sign in to comment.