diff --git a/anglerfish/demux/adaptor.py b/anglerfish/demux/adaptor.py index ac33de8..5b77cd9 100644 --- a/anglerfish/demux/adaptor.py +++ b/anglerfish/demux/adaptor.py @@ -18,16 +18,17 @@ class Adaptor: def __init__( self, name: str, - adaptors: dict, + i7_sequence_token: str, + i5_sequence_token: str, i7_index: str | None = None, i5_index: str | None = None, ): self.name: str = name - self.i5 = AdaptorPart( - sequence_token=adaptors[name]["i5"], name=name, index_seq=i5_index - ) self.i7 = AdaptorPart( - sequence_token=adaptors[name]["i7"], name=name, index_seq=i7_index + sequence_token=i7_sequence_token, name=name, index_seq=i7_index + ) + self.i5 = AdaptorPart( + sequence_token=i5_sequence_token, name=name, index_seq=i5_index ) def get_fastastring(self, insert_Ns: bool = True) -> str: @@ -265,6 +266,12 @@ def load_adaptors(raw: bool = False) -> list[Adaptor] | dict: # By default, return list of Adaptor objects else: adaptors = [] - for adaptor_name in adaptors_dict: - adaptors.append(Adaptor(name=adaptor_name, adaptors=adaptors_dict)) + for adaptor_name, adaptor_parts_dict in adaptors_dict.items(): + adaptors.append( + Adaptor( + name=adaptor_name, + i7_sequence_token=adaptor_parts_dict["i7"], + i5_sequence_token=adaptor_parts_dict["i5"], + ) + ) return adaptors diff --git a/anglerfish/demux/samplesheet.py b/anglerfish/demux/samplesheet.py index 1998f33..4672b22 100644 --- a/anglerfish/demux/samplesheet.py +++ b/anglerfish/demux/samplesheet.py @@ -75,9 +75,10 @@ def __init__(self, input_csv: str, ont_barcodes_enabled: bool): sample_name, Adaptor( name=row["adaptors"], - adaptors=ADAPTORS, - i5_index=i5_index, + i7_sequence_token=ADAPTORS[row["adaptors"]]["i7"], + i5_sequence_token=ADAPTORS[row["adaptors"]]["i5"], i7_index=i7_index, + i5_index=i5_index, ), row["fastq_path"], ont_barcode, diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index e39396a..b4f2470 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -1,3 +1,4 @@ +import json import logging import os import uuid @@ -26,56 +27,101 @@ def run_explore( umi_threshold: float, kmer_length: int, ): - # Setup a run directory - run_uuid = str(uuid.uuid4()) - try: - os.mkdir(outdir) - except FileExistsError: - log.info(f"Output directory {outdir} already exists") - if not use_existing: - log.error( - f"Output directory {outdir} already exists, please use --use_existing to continue" - ) - exit(1) - else: - pass + results, adaptors_included, entries, umi_threshold, kmer_length, outdir = ( + get_explore_results( + fastq, + outdir, + threads, + use_existing, + good_hit_threshold, + insert_thres_low, + insert_thres_high, + minimap_b, + min_hits_per_adaptor, + umi_threshold, + kmer_length, + ) + ) + + results = check_for_umis( + results, adaptors_included, entries, umi_threshold, kmer_length, outdir + ) + + explore_stats_file = os.path.join(outdir, "anglerfish_explore_stats.json") + # Save results to a JSON file + with open(explore_stats_file, "w") as f: + json.dump(results, f, indent=4) + log.info(f"Results saved to {explore_stats_file}") + + +def get_explore_results( + fastq: str, + outdir: str, + threads: int, + use_existing: bool, + good_hit_threshold: float, + insert_thres_low: int, + insert_thres_high: int, + minimap_b: int, + min_hits_per_adaptor: int, + umi_threshold: float, + kmer_length: int, +): + run_uuid = str(uuid.uuid4()) log.info("Running anglerfish explore") log.info(f"Run uuid {run_uuid}") - adaptors = cast(list[Adaptor], load_adaptors()) - adaptors_and_aln_paths: list[tuple[Adaptor, str]] = [] + setup_explore_directory(outdir, use_existing) - # Map all reads against all adaptors - for adaptor in adaptors: - # Align - aln_path = os.path.join(outdir, f"{adaptor.name}.paf") - adaptors_and_aln_paths.append((adaptor, aln_path)) - if os.path.exists(aln_path) and use_existing: - log.info(f"Skipping {adaptor.name} as alignment already exists") - continue - adaptor_path = os.path.join(outdir, f"{adaptor.name}.fasta") - with open(adaptor_path, "w") as f: - f.write(adaptor.get_fastastring(insert_Ns=False)) + # Create a results dictionary which is the skeleton for json output + results = { + "run_uuid": run_uuid, + "timestamp": pd.Timestamp.now().isoformat(), + "parameters": { + "fastq": fastq, + "outdir": outdir, + "threads": threads, + "use_existing": use_existing, + "good_hit_threshold": good_hit_threshold, + "insert_thres_low": insert_thres_low, + "insert_thres_high": insert_thres_high, + "minimap_b": minimap_b, + "min_hits_per_adaptor": min_hits_per_adaptor, + "umi_threshold": umi_threshold, + "kmer_length": kmer_length, + }, + "included_adaptors": {}, + "excluded_adaptors": {}, + } - log.info(f"Aligning {adaptor.name}") - run_minimap2( - fastq_in=fastq, - index_file=adaptor_path, - output_paf=aln_path, - threads=threads, - minimap_b=minimap_b, - ) + adaptors = cast(list[Adaptor], load_adaptors()) + + adaptors_and_aln_paths = run_multiple_alignments( + fastq, + outdir, + threads, + use_existing, + adaptors, + minimap_b, + ) # Parse alignments entries: dict = {} adaptors_included = [] for adaptor, aln_path in adaptors_and_aln_paths: log.info(f"Parsing {adaptor.name}") + adaptor_data = {"name": adaptor.name, "i5": {}, "i7": {}} reads_alns: dict[str, list[Alignment]] = map_reads_to_alns( aln_path, complex_identifier=True ) + if len(reads_alns) == 0: + log.info( + f"No hits for {adaptor.name} in alignment file (perhaps the read file was mis-formatted?)" + ) + continue + # Choose only the highest scoring alignment for each combination of read, adaptor end and strand reads_to_highest_q_aln_dict: dict[str, dict] = {} for aln_name, alns in reads_alns.items(): @@ -130,8 +176,16 @@ def run_explore( ) df_good_hits = df_mim[requirements] - median_insert_length = df_good_hits["insert_len"].median() insert_lengths = df_good_hits["insert_len"].value_counts() + + if len(insert_lengths) == 0: + median_insert_length = None + insert_lengths_histogram = {} + else: + median_insert_length = df_good_hits["insert_len"].median() + insert_lengths_histogram = insert_lengths[ + sorted(insert_lengths.index) + ].to_dict() else: m_re_cs = r"^cs:Z::([1-9][0-9]*)$" df_good_hits = df[df.cg.str.match(m_re_cs)] @@ -148,6 +202,7 @@ def run_explore( df_good_hits = df_good_hits[df_good_hits["match_1_len"] >= thres] median_insert_length = None + insert_lengths_histogram = {} insert_lengths = None # Filter multiple hits per read and adaptor end @@ -164,46 +219,96 @@ def run_explore( f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits." ) + # Collect data for the results dictionary + adaptor_data[adaptor_end_name] = { + "nr_good_hits": len(df_good_hits), + "index_length": median_insert_length, + "insert_lengths_histogram": insert_lengths_histogram, + "relative_entropy": {}, + } + if min(nr_good_hits.values()) >= min_hits_per_adaptor: log.info(f"Adaptor {adaptor.name} is included in the analysis") + results["included_adaptors"][adaptor.name] = adaptor_data adaptors_included.append(adaptor) else: + results["excluded_adaptors"][adaptor.name] = adaptor_data log.info(f"Adaptor {adaptor.name} is excluded from the analysis") - # Print insert length info for adaptor types included in the analysis + return results, adaptors_included, entries, umi_threshold, kmer_length, outdir + + +def setup_explore_directory(outdir: str, use_existing: bool): + # Setup a run directory + + try: + os.mkdir(outdir) + except FileExistsError: + log.info(f"Output directory {outdir} already exists") + if not use_existing: + log.error( + f"Output directory {outdir} already exists, please use --use_existing to continue" + ) + exit(1) + else: + pass + + +def run_multiple_alignments( + fastq: str, + outdir: str, + threads: int, + use_existing: bool, + adaptors: list[Adaptor], + minimap_b: int, +) -> list[tuple[Adaptor, str]]: + adaptors_and_aln_paths: list[tuple[Adaptor, str]] = [] + + # Map all reads against all adaptors + for adaptor in adaptors: + # Align + aln_path = os.path.join(outdir, f"{adaptor.name}.paf") + adaptors_and_aln_paths.append((adaptor, aln_path)) + if os.path.exists(aln_path) and use_existing: + log.info(f"Skipping {adaptor.name} as alignment already exists") + continue + adaptor_path = os.path.join(outdir, f"{adaptor.name}.fasta") + with open(adaptor_path, "w") as f: + f.write(adaptor.get_fastastring(insert_Ns=False)) + + log.info(f"Aligning {adaptor.name}") + run_minimap2( + fastq_in=fastq, + index_file=adaptor_path, + output_paf=aln_path, + threads=threads, + minimap_b=minimap_b, + ) + return adaptors_and_aln_paths + + +def check_for_umis( + results, adaptors_included, entries, umi_threshold, kmer_length, outdir +): + # Traverse the adaptors to include and check whether they need UMI entropy analysis for adaptor in adaptors_included: + adaptor_data = results["included_adaptors"][adaptor.name] + for adaptor_end_name, adaptor_end in zip( ["i5", "i7"], [adaptor.i5, adaptor.i7] ): df_good_hits = entries[adaptor.name][adaptor_end_name] if adaptor_end.has_index: median_insert_length = df_good_hits["insert_len"].median() + if median_insert_length > umi_threshold: # Calculate entropies here entropies = calculate_relative_entropy( df_good_hits, kmer_length, median_insert_length ) - entropy_file = os.path.join( - outdir, f"{adaptor.name}_{adaptor_end_name}.entropy.csv" - ) - pd.Series(entropies).to_csv(entropy_file, float_format="%.2f") - log.info( - f"{adaptor.name}:{adaptor_end_name}, relative entropy for k={kmer_length}, over the index saved to {entropy_file}" - ) - insert_lengths = df_good_hits["insert_len"].value_counts() + adaptor_data[adaptor_end_name]["relative_entropy"] = entropies log.info( f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits with median insert length {median_insert_length}" ) - histogram_file = os.path.join( - outdir, f"{adaptor.name}_{adaptor_end_name}.hist.csv" - ) - insert_lengths[sorted(insert_lengths.index)].to_csv(histogram_file) - log.info( - f"{adaptor.name}:{adaptor_end_name} insert length histogram saved {histogram_file}" - ) - else: - median_insert_length = None - insert_lengths = None - log.info( - f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits (no insert length since no index)" - ) + + return results diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_anglerfish/test_demux/test_demux.py b/tests/conftest.py similarity index 63% rename from tests/test_anglerfish/test_demux/test_demux.py rename to tests/conftest.py index ed8124b..f21c9c1 100644 --- a/tests/test_anglerfish/test_demux/test_demux.py +++ b/tests/conftest.py @@ -4,11 +4,9 @@ import pytest -from anglerfish.demux import demux as to_test - @pytest.fixture(scope="module") -def fixture(): +def demux_fixture(): f"""Fixture for all tests within {__file__}. Creates a temporary directory with... @@ -85,73 +83,3 @@ def fixture(): yield fixture tmp.cleanup() - - -def test_run_minimap2(fixture): - """Check that the function runs successfully, not the output.""" - # Test alignment on single read - to_test.run_minimap2( - fastq_in=fixture["testdata_single"], - index_file=fixture["index_file"], - output_paf=fixture["paf_single"], - threads=1, - minimap_b=1, - ) - - # Create aligntment from multiple reads - to_test.run_minimap2( - fastq_in=fixture["testdata_multiple"], - index_file=fixture["index_file"], - output_paf=fixture["paf_multiple"], - threads=1, - minimap_b=1, - ) - - -def test_map_reads_to_alns(fixture): - reads_alns = to_test.map_reads_to_alns(fixture["paf_single"]) - - for read_name, alns in reads_alns.items(): - assert read_name == "0ad8bdb6-e009-43c5-95b1-d381e699f983" - for aln in alns: - if aln.adapter_name == "truseq_i7": - assert ( - aln.cs - == "cs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17" - ) - else: - assert aln.cs == "cs:Z::15-a*cg:5+tcccgat:3+g:33" - - -def test_parse_cs(fixture): - test_cs_str = ( - "cs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17" - ) - expected_cs_parsed = ("taacttggtc", 0) - - cs_parsed = to_test.parse_cs( - cs_string=test_cs_str, - index_seq=fixture["index"], - umi_before=0, - umi_after=0, - ) - - assert cs_parsed == expected_cs_parsed - - -def test_categorize_matches(fixture): - i5_name = "truseq_i5" - i7_name = "truseq_i7" - reads_alns = to_test.map_reads_to_alns(fixture["paf_multiple"]) - - layout = to_test.categorize_matches( - i5_name=i5_name, i7_name=i7_name, reads_to_alns=reads_alns - ) - fragments, singletons, concats, unknowns = layout - - # Assert reads were categorized as expected - assert len(fragments["fragment_read"]) == 2 - assert len(singletons["singleton_i5_read"]) == 1 - assert len(singletons["singleton_i7_read"]) == 1 - assert len(concats["concat_read"]) == 4 - assert len(unknowns["unknown_read"]) == 2 diff --git a/tests/test_demux/__init__.py b/tests/test_demux/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_anglerfish/test_demux/test_adaptor.py b/tests/test_demux/test_adaptor.py similarity index 97% rename from tests/test_anglerfish/test_demux/test_adaptor.py rename to tests/test_demux/test_adaptor.py index 5b3fb34..62969c3 100644 --- a/tests/test_anglerfish/test_demux/test_adaptor.py +++ b/tests/test_demux/test_adaptor.py @@ -199,13 +199,17 @@ class TestAdaptor: def test_adaptor(self): adaptors = { "simple_and_index_umi": { - "i5": "AAA", "i7": "AAACCC", + "i5": "AAA", } } adaptor = to_test.Adaptor( - "simple_and_index_umi", adaptors, i5_index=None, i7_index="TTT" + "simple_and_index_umi", + i7_sequence_token=adaptors["simple_and_index_umi"]["i7"], + i5_sequence_token=adaptors["simple_and_index_umi"]["i5"], + i7_index="TTT", + i5_index=None, ) assert adaptor.name == "simple_and_index_umi" assert isinstance(adaptor.i5, to_test.AdaptorPart) diff --git a/tests/test_demux/test_demux.py b/tests/test_demux/test_demux.py new file mode 100644 index 0000000..8015950 --- /dev/null +++ b/tests/test_demux/test_demux.py @@ -0,0 +1,71 @@ +from anglerfish.demux import demux as to_test + + +def test_run_minimap2(demux_fixture): + """Check that the function runs successfully, not the output.""" + # Test alignment on single read + to_test.run_minimap2( + fastq_in=demux_fixture["testdata_single"], + index_file=demux_fixture["index_file"], + output_paf=demux_fixture["paf_single"], + threads=1, + minimap_b=1, + ) + + # Create aligntment from multiple reads + to_test.run_minimap2( + fastq_in=demux_fixture["testdata_multiple"], + index_file=demux_fixture["index_file"], + output_paf=demux_fixture["paf_multiple"], + threads=1, + minimap_b=1, + ) + + +def test_map_reads_to_alns(demux_fixture): + reads_alns = to_test.map_reads_to_alns(demux_fixture["paf_single"]) + + for read_name, alns in reads_alns.items(): + assert read_name == "0ad8bdb6-e009-43c5-95b1-d381e699f983" + for aln in alns: + if aln.adapter_name == "truseq_i7": + assert ( + aln.cs + == "cs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17" + ) + else: + assert aln.cs == "cs:Z::15-a*cg:5+tcccgat:3+g:33" + + +def test_parse_cs(demux_fixture): + test_cs_str = ( + "cs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17" + ) + expected_cs_parsed = ("taacttggtc", 0) + + cs_parsed = to_test.parse_cs( + cs_string=test_cs_str, + index_seq=demux_fixture["index"], + umi_before=0, + umi_after=0, + ) + + assert cs_parsed == expected_cs_parsed + + +def test_categorize_matches(demux_fixture): + i5_name = "truseq_i5" + i7_name = "truseq_i7" + reads_alns = to_test.map_reads_to_alns(demux_fixture["paf_multiple"]) + + layout = to_test.categorize_matches( + i5_name=i5_name, i7_name=i7_name, reads_to_alns=reads_alns + ) + fragments, singletons, concats, unknowns = layout + + # Assert reads were categorized as expected + assert len(fragments["fragment_read"]) == 2 + assert len(singletons["singleton_i5_read"]) == 1 + assert len(singletons["singleton_i7_read"]) == 1 + assert len(concats["concat_read"]) == 4 + assert len(unknowns["unknown_read"]) == 2 diff --git a/tests/test_explore/test_explore.py b/tests/test_explore/test_explore.py new file mode 100644 index 0000000..45a4933 --- /dev/null +++ b/tests/test_explore/test_explore.py @@ -0,0 +1,224 @@ +import gzip +import os +import tempfile + +import pytest + +from anglerfish.demux.adaptor import Adaptor +from anglerfish.explore.explore import ( + get_explore_results, + run_multiple_alignments, + setup_explore_directory, +) + + +@pytest.fixture(scope="module") +def explore_fixture(request): + f"""Fixture for all tests within {__file__}. + + Creates a temporary directory with... + - truseq.fasta: A fasta file with the truseq i7 and i5 adaptors. + - illumina_ud.fasta: A fasta file with the illumina unique dual i7 and i5 adaptors. + - testdata_single.fastq.gz: A single read subset from the testdata with a perfect index match (TAACTTGGTC) to the truseq i7 adaptor. + - testdata_multiple.fastq.gz: Fabricated test data intended to test the categorization of alignments. + + Expected files to be created... + """ + + adaptors_dict = { + "illumina_ud": { + "i5": "AATGATACGGCGACCACCGAGATCTACACTCGTCGGCAGCGTC", + "i7": "CAAGCAGAAGACGGCATACGAGATGTCTCGTGGGCTCGG", + }, + "truseq": { + "i5": "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT", + "i7": "GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCTCGTATGCCGTCTTCTGCTTG", + }, + } + + # Read with perfect match to illumina ud i7 and i5 adaptors + matching_read_seq = "".join( + [ + "TTATGCTAACTCATTCAGCGTATTGCTAATGATACGGCGACCACCGAGATCTACAC", # i5 before index + "GCGTAAGA", # i5 index + "TCGTCGGCAGCGTC", # i5 after index + "CGGAAAATGTGGAAGTGACTTTGGAACTGGGTGCAGGAAAGACTGAAACAGTTTAGAGGGCGAAGAGAACGCGAAAAATATGGTAAGTTTGGAAAAGATCGGACGTCGTGAGAAAAGAGTGTAAAAGATCAGATGTGTAGATCTCA", # Real read + "CCGAGCCCACGAGAC", # rev. comp i7 before index + "TCCTGAGC", # rev. comp i7 index + "ATCTCGTATGCCGTCTTCTGCTTGAGCAATACGTT", # rev. comp i7 after index + ] + ) + + matching_read = ( + "\n".join( + [ + "@matching_read", + matching_read_seq, + "+", + "%" * len(matching_read_seq), + ] + ) + + "\n" + ) + + # Read with perfect match to illumina_ud, i5 only, slight mismatch i7 + i5_only_read_seq = "".join( + [ + "TTATGCTAACTCATTCAGCGTATTGCTAATGATACGGCGACCACCGAGATCTACAC", # i5 before index + "GCGTAAGA", # i5 index + "TCGTCGGCAGCGTC", # i5 after index + "CGGAAAATGTGGAAGTGACTTTGGAACTGGGTGCAGGAAAGACTGAAACAGTTTAGAGGGCGAAGAGAACGCGAAAAATATGGTAAGTTTGGAAAAGATCGGACGTCGTGAGAAAAGAGTGTAAAAGATCAGATGTGTAGATCTCA", # Real read + "CAGACCCCACGAGAC", # rev. comp i7 before index with 2 mismatches + "TCCTGAGC", # rev. comp i7 index + "ATCTCGTATGCCGTCTTCTGCTTGAGCAATACGTT", # rev. comp i7 after index + ] + ) + i5_only_read = ( + "\n".join( + [ + "@half_matching_read", + i5_only_read_seq, + "+", + "%" * len(i5_only_read_seq), + ] + ) + + "\n" + ) + demux_fixture = request.getfixturevalue("demux_fixture") + tmp_path = demux_fixture["tmp_path"] + demux_fixture["explore_reads"] = tmp_path / "testdata_explore.fastq.gz" + + with gzip.open(tmp_path / "testdata_explore.fastq.gz", "ab") as f: + f.write(matching_read.encode()) + f.write(i5_only_read.encode()) + + fixture = demux_fixture + fixture["adaptors"] = [ + Adaptor( + name=adaptor_name, + i7_sequence_token=adaptor_parts_dict["i7"], + i5_sequence_token=adaptor_parts_dict["i5"], + ) + for adaptor_name, adaptor_parts_dict in adaptors_dict.items() + ] + return fixture + + +def test_setup_explore_directory(): + tmp_dir = tempfile.TemporaryDirectory() + dir_to_be_created = os.path.join(tmp_dir.name, "test_dir") + setup_explore_directory(dir_to_be_created, False) + assert os.path.exists(dir_to_be_created) + tmp_dir.cleanup() + + +def test_setup_explore_directory_use_existing(): + tmp_dir = tempfile.TemporaryDirectory() + dir_to_be_created = os.path.join(tmp_dir.name, "test_dir") + os.makedirs(dir_to_be_created) + # This method should run exit(1) if the directory already exists + with pytest.raises(SystemExit): + setup_explore_directory(dir_to_be_created, False) + + # save modification time of the directory + dir_time = os.path.getmtime(dir_to_be_created) + setup_explore_directory(dir_to_be_created, True) + # Check that the directory was not recreated + assert dir_time == os.path.getmtime(dir_to_be_created) + + tmp_dir.cleanup() + + +def test_run_multiple_alignments(explore_fixture): + # Test a basic run + tmp_dir = tempfile.TemporaryDirectory() + tmp_path = tmp_dir.name + + return_list = run_multiple_alignments( + fastq=explore_fixture["testdata_single"], + outdir=tmp_path, + threads=1, + use_existing=False, + adaptors=explore_fixture["adaptors"], + minimap_b=1, + ) + + assert return_list + assert len(return_list) == 2 # One for illumina_ud and one for truseq + for adapter, alignment_file in return_list: + # check that tile file ending is .paf + assert alignment_file.endswith(".paf") + # check that alignment file exists + assert os.path.exists(alignment_file) + + # When use_existing is true, the alignment should not be rerun + # Save modification time of the alignment file + alignment_file = return_list[0][1] + alignment_time = os.path.getmtime(alignment_file) + + return_list = run_multiple_alignments( + fastq=explore_fixture["testdata_single"], + outdir=tmp_path, + threads=1, + use_existing=True, + adaptors=explore_fixture["adaptors"], + minimap_b=1, + ) + # Check that the alignment file was not rerun + assert alignment_time == os.path.getmtime(alignment_file) + + # When use_existing is false, the alignment should be rerun + # Save modification time of the alignment file + alignment_file = return_list[0][1] + alignment_time = os.path.getmtime(alignment_file) + + return_list = run_multiple_alignments( + fastq=explore_fixture["testdata_single"], + outdir=tmp_path, + threads=1, + use_existing=False, + adaptors=explore_fixture["adaptors"], + minimap_b=1, + ) + + # Check that the alignment file was rerun + assert alignment_time != os.path.getmtime(alignment_file) + + tmp_dir.cleanup() + + +def test_get_explore_results_functional_test(explore_fixture): + """Test overall function of the explore command.""" + tmp_dir = tempfile.TemporaryDirectory() + tmp_path = os.path.join(tmp_dir.name, "outdir") + + # Running with 2 reads, one perfect match to illumina_ud, one partial match to illumina_ud + results, adaptors_included, entries, umi_threshold, kmer_length, outdir = ( + get_explore_results( + fastq=explore_fixture["explore_reads"], + outdir=tmp_path, + threads=1, + use_existing=False, + good_hit_threshold=0.9, + insert_thres_low=4, + insert_thres_high=30, + minimap_b=4, + min_hits_per_adaptor=1, # Not default + umi_threshold=11, + kmer_length=2, + ) + ) + + assert results + + # Make sure one read matches both and one read matches only i5 + assert list(results["included_adaptors"].keys()) == ["illumina_ud"] + assert results["included_adaptors"]["illumina_ud"]["i5"]["nr_good_hits"] == 2 + assert results["included_adaptors"]["illumina_ud"]["i7"]["nr_good_hits"] == 1 + + # Correct index length found + assert results["included_adaptors"]["illumina_ud"]["i5"]["index_length"] == 8 + assert results["included_adaptors"]["illumina_ud"]["i7"]["index_length"] == 8 + + assert len(adaptors_included) == 1 + assert adaptors_included[0].name == "illumina_ud"