From 54850109ee10adcb854d3c58c6dc83a75cf2eb17 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Mon, 2 Sep 2024 15:18:35 +0200 Subject: [PATCH 1/5] Split up explore to a few more functions --- anglerfish/explore/explore.py | 375 ++++++++++++++++++++-------------- 1 file changed, 225 insertions(+), 150 deletions(-) diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index e39396a..b486168 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -1,3 +1,4 @@ +import json import logging import os import uuid @@ -26,184 +27,258 @@ def run_explore( umi_threshold: float, kmer_length: int, ): - # Setup a run directory - run_uuid = str(uuid.uuid4()) - try: - os.mkdir(outdir) - except FileExistsError: - log.info(f"Output directory {outdir} already exists") - if not use_existing: - log.error( - f"Output directory {outdir} already exists, please use --use_existing to continue" + def _run_explore( + fastq: str, + outdir: str, + threads: int, + use_existing: bool, + good_hit_threshold: float, + insert_thres_low: int, + insert_thres_high: int, + minimap_b: int, + min_hits_per_adaptor: int, + umi_threshold: float, + kmer_length: int, + ): + # Setup a run directory + run_uuid = str(uuid.uuid4()) + try: + os.mkdir(outdir) + except FileExistsError: + log.info(f"Output directory {outdir} already exists") + if not use_existing: + log.error( + f"Output directory {outdir} already exists, please use --use_existing to continue" + ) + exit(1) + else: + pass + + log.info("Running anglerfish explore") + log.info(f"Run uuid {run_uuid}") + + # Create a results dictionary which is the skeleton for json output + results = { + "run_uuid": run_uuid, + "timestamp": pd.Timestamp.now().isoformat(), + "parameters": { + "fastq": fastq, + "outdir": outdir, + "threads": threads, + "use_existing": use_existing, + "good_hit_threshold": good_hit_threshold, + "insert_thres_low": insert_thres_low, + "insert_thres_high": insert_thres_high, + "minimap_b": minimap_b, + "min_hits_per_adaptor": min_hits_per_adaptor, + "umi_threshold": umi_threshold, + "kmer_length": kmer_length, + }, + "included_adapters": {}, + "excluded_adapters": {}, + } + + adaptors = cast(list[Adaptor], load_adaptors()) + adaptors_and_aln_paths: list[tuple[Adaptor, str]] = [] + + # Map all reads against all adaptors + for adaptor in adaptors: + # Align + aln_path = os.path.join(outdir, f"{adaptor.name}.paf") + adaptors_and_aln_paths.append((adaptor, aln_path)) + if os.path.exists(aln_path) and use_existing: + log.info(f"Skipping {adaptor.name} as alignment already exists") + continue + adaptor_path = os.path.join(outdir, f"{adaptor.name}.fasta") + with open(adaptor_path, "w") as f: + f.write(adaptor.get_fastastring(insert_Ns=False)) + + log.info(f"Aligning {adaptor.name}") + run_minimap2( + fastq_in=fastq, + index_file=adaptor_path, + output_paf=aln_path, + threads=threads, + minimap_b=minimap_b, ) - exit(1) - else: - pass - - log.info("Running anglerfish explore") - log.info(f"Run uuid {run_uuid}") - - adaptors = cast(list[Adaptor], load_adaptors()) - adaptors_and_aln_paths: list[tuple[Adaptor, str]] = [] - - # Map all reads against all adaptors - for adaptor in adaptors: - # Align - aln_path = os.path.join(outdir, f"{adaptor.name}.paf") - adaptors_and_aln_paths.append((adaptor, aln_path)) - if os.path.exists(aln_path) and use_existing: - log.info(f"Skipping {adaptor.name} as alignment already exists") - continue - adaptor_path = os.path.join(outdir, f"{adaptor.name}.fasta") - with open(adaptor_path, "w") as f: - 
f.write(adaptor.get_fastastring(insert_Ns=False)) - - log.info(f"Aligning {adaptor.name}") - run_minimap2( - fastq_in=fastq, - index_file=adaptor_path, - output_paf=aln_path, - threads=threads, - minimap_b=minimap_b, - ) - # Parse alignments - entries: dict = {} - adaptors_included = [] - for adaptor, aln_path in adaptors_and_aln_paths: - log.info(f"Parsing {adaptor.name}") - reads_alns: dict[str, list[Alignment]] = map_reads_to_alns( - aln_path, complex_identifier=True - ) + # Parse alignments + entries: dict = {} + adaptors_included = [] + for adaptor, aln_path in adaptors_and_aln_paths: + log.info(f"Parsing {adaptor.name}") + adaptor_data = {"name": adaptor.name, "i5": {}, "i7": {}} + reads_alns: dict[str, list[Alignment]] = map_reads_to_alns( + aln_path, complex_identifier=True + ) - # Choose only the highest scoring alignment for each combination of read, adaptor end and strand - reads_to_highest_q_aln_dict: dict[str, dict] = {} - for aln_name, alns in reads_alns.items(): - highest_q_aln = max(alns, key=lambda aln: aln.q) - reads_to_highest_q_aln_dict[aln_name] = vars(highest_q_aln) - df = pd.DataFrame.from_dict(reads_to_highest_q_aln_dict, orient="index") - nr_good_hits = {} - - # Match Insert Match = mim - # The cs string filter is quite strict, requiring 10+ perfect match before insert and 10+ perfect match after insert - # The cg string allows for mismatches within the matching strings but no insertions or deletions - # All cg string matches are also cs string matches (subset) but not vice versa - mim_re_cs = ( - r"^cs:Z::[1-9][0-9]*\+([a,c,t,g]*):[1-9][0-9]*$" # Match Insert Match = mim - ) - mim_re_cg = r"^cg:Z:([0-9]*)M([0-9]*)I([0-9]*)M$" - df_mim = df[df.cs.str.match(mim_re_cs)] + # Choose only the highest scoring alignment for each combination of read, adaptor end and strand + reads_to_highest_q_aln_dict: dict[str, dict] = {} + for aln_name, alns in reads_alns.items(): + highest_q_aln = max(alns, key=lambda aln: aln.q) + reads_to_highest_q_aln_dict[aln_name] = vars(highest_q_aln) + df = pd.DataFrame.from_dict(reads_to_highest_q_aln_dict, orient="index") + nr_good_hits = {} - # Extract the match lengths - match_col_df = df_mim.cg.str.extract(mim_re_cg).rename( - {0: "match_1_len", 1: "insert_len", 2: "match_2_len"}, axis=1 - ) - match_col_df = match_col_df.astype( - { - "match_1_len": "int32", - "insert_len": "int32", - "match_2_len": "int32", - } - ) + # Match Insert Match = mim + # The cs string filter is quite strict, requiring 10+ perfect match before insert and 10+ perfect match after insert + # The cg string allows for mismatches within the matching strings but no insertions or deletions + # All cg string matches are also cs string matches (subset) but not vice versa + mim_re_cs = r"^cs:Z::[1-9][0-9]*\+([a,c,t,g]*):[1-9][0-9]*$" # Match Insert Match = mim + mim_re_cg = r"^cg:Z:([0-9]*)M([0-9]*)I([0-9]*)M$" + df_mim = df[df.cs.str.match(mim_re_cs)] + + # Extract the match lengths + match_col_df = df_mim.cg.str.extract(mim_re_cg).rename( + {0: "match_1_len", 1: "insert_len", 2: "match_2_len"}, axis=1 + ) + match_col_df = match_col_df.astype( + { + "match_1_len": "int32", + "insert_len": "int32", + "match_2_len": "int32", + } + ) - df_mim.loc[match_col_df.index, match_col_df.columns] = match_col_df + df_mim.loc[match_col_df.index, match_col_df.columns] = match_col_df - for adaptor_end_name, adaptor_end in zip( - ["i5", "i7"], [adaptor.i5, adaptor.i7] - ): - if adaptor_end.has_index: - assert adaptor_end.len_before_index is not None - assert adaptor_end.len_after_index is 
not None - - # Alignment thresholds - before_thres = round(adaptor_end.len_before_index * good_hit_threshold) - after_thres = round(adaptor_end.len_after_index * good_hit_threshold) - insert_thres_low = insert_thres_low - insert_thres_high = insert_thres_high - - requirements = ( - (df_mim["adapter_name"] == f"{adaptor.name}_{adaptor_end_name}") - & (df_mim["match_1_len"] >= (before_thres)) - & (df_mim["insert_len"] >= insert_thres_low) - & (df_mim["insert_len"] <= insert_thres_high) - & (df_mim["match_2_len"] >= (after_thres)) - ) - df_good_hits = df_mim[requirements] + for adaptor_end_name, adaptor_end in zip( + ["i5", "i7"], [adaptor.i5, adaptor.i7] + ): + if adaptor_end.has_index: + assert adaptor_end.len_before_index is not None + assert adaptor_end.len_after_index is not None - median_insert_length = df_good_hits["insert_len"].median() - insert_lengths = df_good_hits["insert_len"].value_counts() - else: - m_re_cs = r"^cs:Z::([1-9][0-9]*)$" - df_good_hits = df[df.cg.str.match(m_re_cs)] - match_col_df = df_good_hits.cg.str.extract(m_re_cs).rename( - {0: "match_1_len"}, axis=1 - ) - match_col_df = match_col_df.astype({"match_1_len": "int32"}) + # Alignment thresholds + before_thres = round( + adaptor_end.len_before_index * good_hit_threshold + ) + after_thres = round( + adaptor_end.len_after_index * good_hit_threshold + ) + insert_thres_low = insert_thres_low + insert_thres_high = insert_thres_high + + requirements = ( + (df_mim["adapter_name"] == f"{adaptor.name}_{adaptor_end_name}") + & (df_mim["match_1_len"] >= (before_thres)) + & (df_mim["insert_len"] >= insert_thres_low) + & (df_mim["insert_len"] <= insert_thres_high) + & (df_mim["match_2_len"] >= (after_thres)) + ) + df_good_hits = df_mim[requirements] - df_good_hits.loc[match_col_df.index, match_col_df.columns] = ( - match_col_df + insert_lengths = df_good_hits["insert_len"].value_counts() + + if len(insert_lengths) == 0: + median_insert_length = None + insert_lengths_histogram = {} + else: + median_insert_length = df_good_hits["insert_len"].median() + insert_lengths_histogram = insert_lengths[ + sorted(insert_lengths.index) + ].to_dict() + else: + m_re_cs = r"^cs:Z::([1-9][0-9]*)$" + df_good_hits = df[df.cg.str.match(m_re_cs)] + match_col_df = df_good_hits.cg.str.extract(m_re_cs).rename( + {0: "match_1_len"}, axis=1 + ) + match_col_df = match_col_df.astype({"match_1_len": "int32"}) + + df_good_hits.loc[match_col_df.index, match_col_df.columns] = ( + match_col_df + ) + + thres = round(adaptor_end.len_constant * good_hit_threshold) + df_good_hits = df_good_hits[df_good_hits["match_1_len"] >= thres] + + median_insert_length = None + insert_lengths_histogram = {} + insert_lengths = None + + # Filter multiple hits per read and adaptor end + df_good_hits = df_good_hits.sort_values( + by=["read_name", "match_1_len"], ascending=False + ).drop_duplicates(subset=["read_name", "adapter_name"], keep="first") + + if adaptor.name not in entries.keys(): + entries[adaptor.name] = {} + entries[adaptor.name][adaptor_end_name] = df_good_hits + + nr_good_hits[adaptor_end_name] = len(df_good_hits) + log.info( + f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits." 
) - thres = round(adaptor_end.len_constant * good_hit_threshold) - df_good_hits = df_good_hits[df_good_hits["match_1_len"] >= thres] + # Collect data for the results dictionary + adaptor_data[adaptor_end_name] = { + "nr_good_hits": len(df_good_hits), + "index_length": median_insert_length, + "insert_lengths_histogram": insert_lengths_histogram, + "relative_entropy": {}, + } - median_insert_length = None - insert_lengths = None + if min(nr_good_hits.values()) >= min_hits_per_adaptor: + log.info(f"Adaptor {adaptor.name} is included in the analysis") + results["included_adapters"][adaptor.name] = adaptor_data + adaptors_included.append(adaptor) + else: + results["excluded_adapters"][adaptor.name] = adaptor_data + log.info(f"Adaptor {adaptor.name} is excluded from the analysis") - # Filter multiple hits per read and adaptor end - df_good_hits = df_good_hits.sort_values( - by=["read_name", "match_1_len"], ascending=False - ).drop_duplicates(subset=["read_name", "adapter_name"], keep="first") + return results, adaptors_included, entries, umi_threshold, kmer_length, outdir - if adaptor.name not in entries.keys(): - entries[adaptor.name] = {} - entries[adaptor.name][adaptor_end_name] = df_good_hits + results, adaptors_included, entries, umi_threshold, kmer_length, outdir = ( + _run_explore( + fastq, + outdir, + threads, + use_existing, + good_hit_threshold, + insert_thres_low, + insert_thres_high, + minimap_b, + min_hits_per_adaptor, + umi_threshold, + kmer_length, + ) + ) - nr_good_hits[adaptor_end_name] = len(df_good_hits) - log.info( - f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits." - ) + results = check_for_umis( + results, adaptors_included, entries, umi_threshold, kmer_length, outdir + ) + + explore_stats_file = os.path.join(outdir, "anglerfish_explore_stats.json") + # Save results to a JSON file + with open(explore_stats_file, "w") as f: + json.dump(results, f, indent=4) - if min(nr_good_hits.values()) >= min_hits_per_adaptor: - log.info(f"Adaptor {adaptor.name} is included in the analysis") - adaptors_included.append(adaptor) - else: - log.info(f"Adaptor {adaptor.name} is excluded from the analysis") + log.info(f"Results saved to {explore_stats_file}") - # Print insert length info for adaptor types included in the analysis + +def check_for_umis( + results, adaptors_included, entries, umi_threshold, kmer_length, outdir +): + # Traverse the adaptors to include and check whether they need UMI entropy analysis for adaptor in adaptors_included: + adaptor_data = results["included_adapters"][adaptor.name] + for adaptor_end_name, adaptor_end in zip( ["i5", "i7"], [adaptor.i5, adaptor.i7] ): df_good_hits = entries[adaptor.name][adaptor_end_name] if adaptor_end.has_index: median_insert_length = df_good_hits["insert_len"].median() + if median_insert_length > umi_threshold: # Calculate entropies here entropies = calculate_relative_entropy( df_good_hits, kmer_length, median_insert_length ) - entropy_file = os.path.join( - outdir, f"{adaptor.name}_{adaptor_end_name}.entropy.csv" - ) - pd.Series(entropies).to_csv(entropy_file, float_format="%.2f") - log.info( - f"{adaptor.name}:{adaptor_end_name}, relative entropy for k={kmer_length}, over the index saved to {entropy_file}" - ) - insert_lengths = df_good_hits["insert_len"].value_counts() + adaptor_data[adaptor_end_name]["relative_entropy"] = entropies log.info( f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits with median insert length {median_insert_length}" ) - histogram_file = os.path.join( - outdir, 
f"{adaptor.name}_{adaptor_end_name}.hist.csv" - ) - insert_lengths[sorted(insert_lengths.index)].to_csv(histogram_file) - log.info( - f"{adaptor.name}:{adaptor_end_name} insert length histogram saved {histogram_file}" - ) - else: - median_insert_length = None - insert_lengths = None - log.info( - f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits (no insert length since no index)" - ) + + return results From 3c3aa51a352d931a23f83eda9e3a86979a78e124 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Tue, 3 Sep 2024 12:43:46 +0200 Subject: [PATCH 2/5] Changed from adaptors argument to i7_sequence_token and i5_sequence_token --- anglerfish/demux/adaptor.py | 21 ++++++++++++------- anglerfish/demux/samplesheet.py | 5 +++-- .../test_demux/test_adaptor.py | 8 +++++-- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/anglerfish/demux/adaptor.py b/anglerfish/demux/adaptor.py index ac33de8..5b77cd9 100644 --- a/anglerfish/demux/adaptor.py +++ b/anglerfish/demux/adaptor.py @@ -18,16 +18,17 @@ class Adaptor: def __init__( self, name: str, - adaptors: dict, + i7_sequence_token: str, + i5_sequence_token: str, i7_index: str | None = None, i5_index: str | None = None, ): self.name: str = name - self.i5 = AdaptorPart( - sequence_token=adaptors[name]["i5"], name=name, index_seq=i5_index - ) self.i7 = AdaptorPart( - sequence_token=adaptors[name]["i7"], name=name, index_seq=i7_index + sequence_token=i7_sequence_token, name=name, index_seq=i7_index + ) + self.i5 = AdaptorPart( + sequence_token=i5_sequence_token, name=name, index_seq=i5_index ) def get_fastastring(self, insert_Ns: bool = True) -> str: @@ -265,6 +266,12 @@ def load_adaptors(raw: bool = False) -> list[Adaptor] | dict: # By default, return list of Adaptor objects else: adaptors = [] - for adaptor_name in adaptors_dict: - adaptors.append(Adaptor(name=adaptor_name, adaptors=adaptors_dict)) + for adaptor_name, adaptor_parts_dict in adaptors_dict.items(): + adaptors.append( + Adaptor( + name=adaptor_name, + i7_sequence_token=adaptor_parts_dict["i7"], + i5_sequence_token=adaptor_parts_dict["i5"], + ) + ) return adaptors diff --git a/anglerfish/demux/samplesheet.py b/anglerfish/demux/samplesheet.py index 1998f33..4672b22 100644 --- a/anglerfish/demux/samplesheet.py +++ b/anglerfish/demux/samplesheet.py @@ -75,9 +75,10 @@ def __init__(self, input_csv: str, ont_barcodes_enabled: bool): sample_name, Adaptor( name=row["adaptors"], - adaptors=ADAPTORS, - i5_index=i5_index, + i7_sequence_token=ADAPTORS[row["adaptors"]]["i7"], + i5_sequence_token=ADAPTORS[row["adaptors"]]["i5"], i7_index=i7_index, + i5_index=i5_index, ), row["fastq_path"], ont_barcode, diff --git a/tests/test_anglerfish/test_demux/test_adaptor.py b/tests/test_anglerfish/test_demux/test_adaptor.py index 5b3fb34..62969c3 100644 --- a/tests/test_anglerfish/test_demux/test_adaptor.py +++ b/tests/test_anglerfish/test_demux/test_adaptor.py @@ -199,13 +199,17 @@ class TestAdaptor: def test_adaptor(self): adaptors = { "simple_and_index_umi": { - "i5": "AAA", "i7": "AAACCC", + "i5": "AAA", } } adaptor = to_test.Adaptor( - "simple_and_index_umi", adaptors, i5_index=None, i7_index="TTT" + "simple_and_index_umi", + i7_sequence_token=adaptors["simple_and_index_umi"]["i7"], + i5_sequence_token=adaptors["simple_and_index_umi"]["i5"], + i7_index="TTT", + i5_index=None, ) assert adaptor.name == "simple_and_index_umi" assert isinstance(adaptor.i5, to_test.AdaptorPart) From 7b62050747458f905cc401cd2d5ac71beadcec4c Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Wed, 4 
Sep 2024 10:43:30 +0200 Subject: [PATCH 3/5] Rearranging the tests and added basic explore tests --- tests/__init__.py | 0 .../test_demux/test_demux.py => conftest.py} | 74 +-------------- tests/test_demux/__init__.py | 0 .../test_demux/test_adaptor.py | 0 tests/test_demux/test_demux.py | 71 ++++++++++++++ tests/test_explore/test_explore.py | 94 +++++++++++++++++++ 6 files changed, 166 insertions(+), 73 deletions(-) create mode 100644 tests/__init__.py rename tests/{test_anglerfish/test_demux/test_demux.py => conftest.py} (63%) create mode 100644 tests/test_demux/__init__.py rename tests/{test_anglerfish => }/test_demux/test_adaptor.py (100%) create mode 100644 tests/test_demux/test_demux.py create mode 100644 tests/test_explore/test_explore.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_anglerfish/test_demux/test_demux.py b/tests/conftest.py similarity index 63% rename from tests/test_anglerfish/test_demux/test_demux.py rename to tests/conftest.py index ed8124b..f21c9c1 100644 --- a/tests/test_anglerfish/test_demux/test_demux.py +++ b/tests/conftest.py @@ -4,11 +4,9 @@ import pytest -from anglerfish.demux import demux as to_test - @pytest.fixture(scope="module") -def fixture(): +def demux_fixture(): f"""Fixture for all tests within {__file__}. Creates a temporary directory with... @@ -85,73 +83,3 @@ def fixture(): yield fixture tmp.cleanup() - - -def test_run_minimap2(fixture): - """Check that the function runs successfully, not the output.""" - # Test alignment on single read - to_test.run_minimap2( - fastq_in=fixture["testdata_single"], - index_file=fixture["index_file"], - output_paf=fixture["paf_single"], - threads=1, - minimap_b=1, - ) - - # Create aligntment from multiple reads - to_test.run_minimap2( - fastq_in=fixture["testdata_multiple"], - index_file=fixture["index_file"], - output_paf=fixture["paf_multiple"], - threads=1, - minimap_b=1, - ) - - -def test_map_reads_to_alns(fixture): - reads_alns = to_test.map_reads_to_alns(fixture["paf_single"]) - - for read_name, alns in reads_alns.items(): - assert read_name == "0ad8bdb6-e009-43c5-95b1-d381e699f983" - for aln in alns: - if aln.adapter_name == "truseq_i7": - assert ( - aln.cs - == "cs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17" - ) - else: - assert aln.cs == "cs:Z::15-a*cg:5+tcccgat:3+g:33" - - -def test_parse_cs(fixture): - test_cs_str = ( - "cs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17" - ) - expected_cs_parsed = ("taacttggtc", 0) - - cs_parsed = to_test.parse_cs( - cs_string=test_cs_str, - index_seq=fixture["index"], - umi_before=0, - umi_after=0, - ) - - assert cs_parsed == expected_cs_parsed - - -def test_categorize_matches(fixture): - i5_name = "truseq_i5" - i7_name = "truseq_i7" - reads_alns = to_test.map_reads_to_alns(fixture["paf_multiple"]) - - layout = to_test.categorize_matches( - i5_name=i5_name, i7_name=i7_name, reads_to_alns=reads_alns - ) - fragments, singletons, concats, unknowns = layout - - # Assert reads were categorized as expected - assert len(fragments["fragment_read"]) == 2 - assert len(singletons["singleton_i5_read"]) == 1 - assert len(singletons["singleton_i7_read"]) == 1 - assert len(concats["concat_read"]) == 4 - assert len(unknowns["unknown_read"]) == 2 diff --git a/tests/test_demux/__init__.py b/tests/test_demux/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_anglerfish/test_demux/test_adaptor.py b/tests/test_demux/test_adaptor.py 
similarity index 100% rename from tests/test_anglerfish/test_demux/test_adaptor.py rename to tests/test_demux/test_adaptor.py diff --git a/tests/test_demux/test_demux.py b/tests/test_demux/test_demux.py new file mode 100644 index 0000000..8015950 --- /dev/null +++ b/tests/test_demux/test_demux.py @@ -0,0 +1,71 @@ +from anglerfish.demux import demux as to_test + + +def test_run_minimap2(demux_fixture): + """Check that the function runs successfully, not the output.""" + # Test alignment on single read + to_test.run_minimap2( + fastq_in=demux_fixture["testdata_single"], + index_file=demux_fixture["index_file"], + output_paf=demux_fixture["paf_single"], + threads=1, + minimap_b=1, + ) + + # Create aligntment from multiple reads + to_test.run_minimap2( + fastq_in=demux_fixture["testdata_multiple"], + index_file=demux_fixture["index_file"], + output_paf=demux_fixture["paf_multiple"], + threads=1, + minimap_b=1, + ) + + +def test_map_reads_to_alns(demux_fixture): + reads_alns = to_test.map_reads_to_alns(demux_fixture["paf_single"]) + + for read_name, alns in reads_alns.items(): + assert read_name == "0ad8bdb6-e009-43c5-95b1-d381e699f983" + for aln in alns: + if aln.adapter_name == "truseq_i7": + assert ( + aln.cs + == "cs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17" + ) + else: + assert aln.cs == "cs:Z::15-a*cg:5+tcccgat:3+g:33" + + +def test_parse_cs(demux_fixture): + test_cs_str = ( + "cs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17" + ) + expected_cs_parsed = ("taacttggtc", 0) + + cs_parsed = to_test.parse_cs( + cs_string=test_cs_str, + index_seq=demux_fixture["index"], + umi_before=0, + umi_after=0, + ) + + assert cs_parsed == expected_cs_parsed + + +def test_categorize_matches(demux_fixture): + i5_name = "truseq_i5" + i7_name = "truseq_i7" + reads_alns = to_test.map_reads_to_alns(demux_fixture["paf_multiple"]) + + layout = to_test.categorize_matches( + i5_name=i5_name, i7_name=i7_name, reads_to_alns=reads_alns + ) + fragments, singletons, concats, unknowns = layout + + # Assert reads were categorized as expected + assert len(fragments["fragment_read"]) == 2 + assert len(singletons["singleton_i5_read"]) == 1 + assert len(singletons["singleton_i7_read"]) == 1 + assert len(concats["concat_read"]) == 4 + assert len(unknowns["unknown_read"]) == 2 diff --git a/tests/test_explore/test_explore.py b/tests/test_explore/test_explore.py new file mode 100644 index 0000000..a5d2e94 --- /dev/null +++ b/tests/test_explore/test_explore.py @@ -0,0 +1,94 @@ +import os + +import pytest + +from anglerfish.demux.adaptor import Adaptor +from anglerfish.explore.explore import run_multiple_alignments + + +@pytest.fixture(scope="module") +def explore_fixture(request): + f"""Fixture for all tests within {__file__}. + + Creates a temporary directory with... + - truseq.fasta: A fasta file with the truseq i7 and i5 adaptors. + - illumina_ud.fasta: A fasta file with the illumina unique dual i7 and i5 adaptors. + - testdata_single.fastq.gz: A single read subset from the testdata with a perfect index match (TAACTTGGTC) to the truseq i7 adaptor. + - testdata_multiple.fastq.gz: Fabricated test data intended to test the categorization of alignments. + + Expected files to be created... 
+ """ + + adaptors_dict = { + "illumina_ud": { + "i5": "AATGATACGGCGACCACCGAGATCTACACTCGTCGGCAGCGTC", + "i7": "CAAGCAGAAGACGGCATACGAGATGTCTCGTGGGCTCGG", + }, + "truseq": { + "i5": "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT", + "i7": "GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCTCGTATGCCGTCTTCTGCTTG", + }, + } + demux_fixture = request.getfixturevalue("demux_fixture") + fixture = demux_fixture + fixture["adaptors"] = [ + Adaptor( + name=adaptor_name, + i7_sequence_token=adaptor_parts_dict["i7"], + i5_sequence_token=adaptor_parts_dict["i5"], + ) + for adaptor_name, adaptor_parts_dict in adaptors_dict.items() + ] + return fixture + + +def test_run_multiple_alignments(explore_fixture): + # Test a basic run + return_list = run_multiple_alignments( + fastq=explore_fixture["testdata_single"], + outdir=explore_fixture["tmp_path"], + threads=1, + use_existing=False, + adaptors=explore_fixture["adaptors"], + minimap_b=1, + ) + + assert return_list + assert len(return_list) == 2 # One for illumina_ud and one for truseq + for adapter, alignment_file in return_list: + # check that tile file ending is .paf + assert alignment_file.endswith(".paf") + # check that alignment file exists + assert os.path.exists(alignment_file) + + # When use_existing is true, the alignment should not be rerun + # Save modification time of the alignment file + alignment_file = return_list[0][1] + alignment_time = os.path.getmtime(alignment_file) + + return_list = run_multiple_alignments( + fastq=explore_fixture["testdata_single"], + outdir=explore_fixture["tmp_path"], + threads=1, + use_existing=True, + adaptors=explore_fixture["adaptors"], + minimap_b=1, + ) + # Check that the alignment file was not rerun + assert alignment_time == os.path.getmtime(alignment_file) + + # When use_existing is false, the alignment should be rerun + # Save modification time of the alignment file + alignment_file = return_list[0][1] + alignment_time = os.path.getmtime(alignment_file) + + return_list = run_multiple_alignments( + fastq=explore_fixture["testdata_single"], + outdir=explore_fixture["tmp_path"], + threads=1, + use_existing=False, + adaptors=explore_fixture["adaptors"], + minimap_b=1, + ) + # Check that the alignment file was rerun + assert alignment_time != os.path.getmtime(alignment_file) From 37d51fd53e411eb5b591a5600e80649fe5e09bae Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Mon, 9 Sep 2024 09:54:02 +0200 Subject: [PATCH 4/5] Splitting up explore a bit and basic testing --- anglerfish/explore/explore.py | 436 +++++++++++++++-------------- tests/test_explore/test_explore.py | 138 ++++++++- 2 files changed, 367 insertions(+), 207 deletions(-) diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index b486168..ae7b684 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -27,208 +27,6 @@ def run_explore( umi_threshold: float, kmer_length: int, ): - def _run_explore( - fastq: str, - outdir: str, - threads: int, - use_existing: bool, - good_hit_threshold: float, - insert_thres_low: int, - insert_thres_high: int, - minimap_b: int, - min_hits_per_adaptor: int, - umi_threshold: float, - kmer_length: int, - ): - # Setup a run directory - run_uuid = str(uuid.uuid4()) - try: - os.mkdir(outdir) - except FileExistsError: - log.info(f"Output directory {outdir} already exists") - if not use_existing: - log.error( - f"Output directory {outdir} already exists, please use --use_existing to continue" - ) - exit(1) - else: - pass - - log.info("Running anglerfish explore") - 
log.info(f"Run uuid {run_uuid}") - - # Create a results dictionary which is the skeleton for json output - results = { - "run_uuid": run_uuid, - "timestamp": pd.Timestamp.now().isoformat(), - "parameters": { - "fastq": fastq, - "outdir": outdir, - "threads": threads, - "use_existing": use_existing, - "good_hit_threshold": good_hit_threshold, - "insert_thres_low": insert_thres_low, - "insert_thres_high": insert_thres_high, - "minimap_b": minimap_b, - "min_hits_per_adaptor": min_hits_per_adaptor, - "umi_threshold": umi_threshold, - "kmer_length": kmer_length, - }, - "included_adapters": {}, - "excluded_adapters": {}, - } - - adaptors = cast(list[Adaptor], load_adaptors()) - adaptors_and_aln_paths: list[tuple[Adaptor, str]] = [] - - # Map all reads against all adaptors - for adaptor in adaptors: - # Align - aln_path = os.path.join(outdir, f"{adaptor.name}.paf") - adaptors_and_aln_paths.append((adaptor, aln_path)) - if os.path.exists(aln_path) and use_existing: - log.info(f"Skipping {adaptor.name} as alignment already exists") - continue - adaptor_path = os.path.join(outdir, f"{adaptor.name}.fasta") - with open(adaptor_path, "w") as f: - f.write(adaptor.get_fastastring(insert_Ns=False)) - - log.info(f"Aligning {adaptor.name}") - run_minimap2( - fastq_in=fastq, - index_file=adaptor_path, - output_paf=aln_path, - threads=threads, - minimap_b=minimap_b, - ) - - # Parse alignments - entries: dict = {} - adaptors_included = [] - for adaptor, aln_path in adaptors_and_aln_paths: - log.info(f"Parsing {adaptor.name}") - adaptor_data = {"name": adaptor.name, "i5": {}, "i7": {}} - reads_alns: dict[str, list[Alignment]] = map_reads_to_alns( - aln_path, complex_identifier=True - ) - - # Choose only the highest scoring alignment for each combination of read, adaptor end and strand - reads_to_highest_q_aln_dict: dict[str, dict] = {} - for aln_name, alns in reads_alns.items(): - highest_q_aln = max(alns, key=lambda aln: aln.q) - reads_to_highest_q_aln_dict[aln_name] = vars(highest_q_aln) - df = pd.DataFrame.from_dict(reads_to_highest_q_aln_dict, orient="index") - nr_good_hits = {} - - # Match Insert Match = mim - # The cs string filter is quite strict, requiring 10+ perfect match before insert and 10+ perfect match after insert - # The cg string allows for mismatches within the matching strings but no insertions or deletions - # All cg string matches are also cs string matches (subset) but not vice versa - mim_re_cs = r"^cs:Z::[1-9][0-9]*\+([a,c,t,g]*):[1-9][0-9]*$" # Match Insert Match = mim - mim_re_cg = r"^cg:Z:([0-9]*)M([0-9]*)I([0-9]*)M$" - df_mim = df[df.cs.str.match(mim_re_cs)] - - # Extract the match lengths - match_col_df = df_mim.cg.str.extract(mim_re_cg).rename( - {0: "match_1_len", 1: "insert_len", 2: "match_2_len"}, axis=1 - ) - match_col_df = match_col_df.astype( - { - "match_1_len": "int32", - "insert_len": "int32", - "match_2_len": "int32", - } - ) - - df_mim.loc[match_col_df.index, match_col_df.columns] = match_col_df - - for adaptor_end_name, adaptor_end in zip( - ["i5", "i7"], [adaptor.i5, adaptor.i7] - ): - if adaptor_end.has_index: - assert adaptor_end.len_before_index is not None - assert adaptor_end.len_after_index is not None - - # Alignment thresholds - before_thres = round( - adaptor_end.len_before_index * good_hit_threshold - ) - after_thres = round( - adaptor_end.len_after_index * good_hit_threshold - ) - insert_thres_low = insert_thres_low - insert_thres_high = insert_thres_high - - requirements = ( - (df_mim["adapter_name"] == f"{adaptor.name}_{adaptor_end_name}") - & 
(df_mim["match_1_len"] >= (before_thres)) - & (df_mim["insert_len"] >= insert_thres_low) - & (df_mim["insert_len"] <= insert_thres_high) - & (df_mim["match_2_len"] >= (after_thres)) - ) - df_good_hits = df_mim[requirements] - - insert_lengths = df_good_hits["insert_len"].value_counts() - - if len(insert_lengths) == 0: - median_insert_length = None - insert_lengths_histogram = {} - else: - median_insert_length = df_good_hits["insert_len"].median() - insert_lengths_histogram = insert_lengths[ - sorted(insert_lengths.index) - ].to_dict() - else: - m_re_cs = r"^cs:Z::([1-9][0-9]*)$" - df_good_hits = df[df.cg.str.match(m_re_cs)] - match_col_df = df_good_hits.cg.str.extract(m_re_cs).rename( - {0: "match_1_len"}, axis=1 - ) - match_col_df = match_col_df.astype({"match_1_len": "int32"}) - - df_good_hits.loc[match_col_df.index, match_col_df.columns] = ( - match_col_df - ) - - thres = round(adaptor_end.len_constant * good_hit_threshold) - df_good_hits = df_good_hits[df_good_hits["match_1_len"] >= thres] - - median_insert_length = None - insert_lengths_histogram = {} - insert_lengths = None - - # Filter multiple hits per read and adaptor end - df_good_hits = df_good_hits.sort_values( - by=["read_name", "match_1_len"], ascending=False - ).drop_duplicates(subset=["read_name", "adapter_name"], keep="first") - - if adaptor.name not in entries.keys(): - entries[adaptor.name] = {} - entries[adaptor.name][adaptor_end_name] = df_good_hits - - nr_good_hits[adaptor_end_name] = len(df_good_hits) - log.info( - f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits." - ) - - # Collect data for the results dictionary - adaptor_data[adaptor_end_name] = { - "nr_good_hits": len(df_good_hits), - "index_length": median_insert_length, - "insert_lengths_histogram": insert_lengths_histogram, - "relative_entropy": {}, - } - - if min(nr_good_hits.values()) >= min_hits_per_adaptor: - log.info(f"Adaptor {adaptor.name} is included in the analysis") - results["included_adapters"][adaptor.name] = adaptor_data - adaptors_included.append(adaptor) - else: - results["excluded_adapters"][adaptor.name] = adaptor_data - log.info(f"Adaptor {adaptor.name} is excluded from the analysis") - - return results, adaptors_included, entries, umi_threshold, kmer_length, outdir - results, adaptors_included, entries, umi_threshold, kmer_length, outdir = ( _run_explore( fastq, @@ -257,12 +55,244 @@ def _run_explore( log.info(f"Results saved to {explore_stats_file}") +def _run_explore( + fastq: str, + outdir: str, + threads: int, + use_existing: bool, + good_hit_threshold: float, + insert_thres_low: int, + insert_thres_high: int, + minimap_b: int, + min_hits_per_adaptor: int, + umi_threshold: float, + kmer_length: int, +): + run_uuid = str(uuid.uuid4()) + log.info("Running anglerfish explore") + log.info(f"Run uuid {run_uuid}") + + setup_explore_directory(outdir, use_existing) + + # Create a results dictionary which is the skeleton for json output + results = { + "run_uuid": run_uuid, + "timestamp": pd.Timestamp.now().isoformat(), + "parameters": { + "fastq": fastq, + "outdir": outdir, + "threads": threads, + "use_existing": use_existing, + "good_hit_threshold": good_hit_threshold, + "insert_thres_low": insert_thres_low, + "insert_thres_high": insert_thres_high, + "minimap_b": minimap_b, + "min_hits_per_adaptor": min_hits_per_adaptor, + "umi_threshold": umi_threshold, + "kmer_length": kmer_length, + }, + "included_adaptors": {}, + "excluded_adaptors": {}, + } + + adaptors = cast(list[Adaptor], load_adaptors()) + + 
adaptors_and_aln_paths = run_multiple_alignments( + fastq, + outdir, + threads, + use_existing, + adaptors, + minimap_b, + ) + + # Parse alignments + entries: dict = {} + adaptors_included = [] + for adaptor, aln_path in adaptors_and_aln_paths: + log.info(f"Parsing {adaptor.name}") + adaptor_data = {"name": adaptor.name, "i5": {}, "i7": {}} + reads_alns: dict[str, list[Alignment]] = map_reads_to_alns( + aln_path, complex_identifier=True + ) + + if len(reads_alns) == 0: + log.info( + f"No hits for {adaptor.name} in alignment file (perhaps the read file was mis-formatted?)" + ) + continue + + # Choose only the highest scoring alignment for each combination of read, adaptor end and strand + reads_to_highest_q_aln_dict: dict[str, dict] = {} + for aln_name, alns in reads_alns.items(): + highest_q_aln = max(alns, key=lambda aln: aln.q) + reads_to_highest_q_aln_dict[aln_name] = vars(highest_q_aln) + df = pd.DataFrame.from_dict(reads_to_highest_q_aln_dict, orient="index") + nr_good_hits = {} + + # Match Insert Match = mim + # The cs string filter is quite strict, requiring 10+ perfect match before insert and 10+ perfect match after insert + # The cg string allows for mismatches within the matching strings but no insertions or deletions + # All cg string matches are also cs string matches (subset) but not vice versa + mim_re_cs = ( + r"^cs:Z::[1-9][0-9]*\+([a,c,t,g]*):[1-9][0-9]*$" # Match Insert Match = mim + ) + mim_re_cg = r"^cg:Z:([0-9]*)M([0-9]*)I([0-9]*)M$" + df_mim = df[df.cs.str.match(mim_re_cs)] + + # Extract the match lengths + match_col_df = df_mim.cg.str.extract(mim_re_cg).rename( + {0: "match_1_len", 1: "insert_len", 2: "match_2_len"}, axis=1 + ) + match_col_df = match_col_df.astype( + { + "match_1_len": "int32", + "insert_len": "int32", + "match_2_len": "int32", + } + ) + + df_mim.loc[match_col_df.index, match_col_df.columns] = match_col_df + + for adaptor_end_name, adaptor_end in zip( + ["i5", "i7"], [adaptor.i5, adaptor.i7] + ): + if adaptor_end.has_index: + assert adaptor_end.len_before_index is not None + assert adaptor_end.len_after_index is not None + + # Alignment thresholds + before_thres = round(adaptor_end.len_before_index * good_hit_threshold) + after_thres = round(adaptor_end.len_after_index * good_hit_threshold) + insert_thres_low = insert_thres_low + insert_thres_high = insert_thres_high + + requirements = ( + (df_mim["adapter_name"] == f"{adaptor.name}_{adaptor_end_name}") + & (df_mim["match_1_len"] >= (before_thres)) + & (df_mim["insert_len"] >= insert_thres_low) + & (df_mim["insert_len"] <= insert_thres_high) + & (df_mim["match_2_len"] >= (after_thres)) + ) + df_good_hits = df_mim[requirements] + + insert_lengths = df_good_hits["insert_len"].value_counts() + + if len(insert_lengths) == 0: + median_insert_length = None + insert_lengths_histogram = {} + else: + median_insert_length = df_good_hits["insert_len"].median() + insert_lengths_histogram = insert_lengths[ + sorted(insert_lengths.index) + ].to_dict() + else: + m_re_cs = r"^cs:Z::([1-9][0-9]*)$" + df_good_hits = df[df.cg.str.match(m_re_cs)] + match_col_df = df_good_hits.cg.str.extract(m_re_cs).rename( + {0: "match_1_len"}, axis=1 + ) + match_col_df = match_col_df.astype({"match_1_len": "int32"}) + + df_good_hits.loc[match_col_df.index, match_col_df.columns] = ( + match_col_df + ) + + thres = round(adaptor_end.len_constant * good_hit_threshold) + df_good_hits = df_good_hits[df_good_hits["match_1_len"] >= thres] + + median_insert_length = None + insert_lengths_histogram = {} + insert_lengths = None + + # Filter 
multiple hits per read and adaptor end + df_good_hits = df_good_hits.sort_values( + by=["read_name", "match_1_len"], ascending=False + ).drop_duplicates(subset=["read_name", "adapter_name"], keep="first") + + if adaptor.name not in entries.keys(): + entries[adaptor.name] = {} + entries[adaptor.name][adaptor_end_name] = df_good_hits + + nr_good_hits[adaptor_end_name] = len(df_good_hits) + log.info( + f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits." + ) + + # Collect data for the results dictionary + adaptor_data[adaptor_end_name] = { + "nr_good_hits": len(df_good_hits), + "index_length": median_insert_length, + "insert_lengths_histogram": insert_lengths_histogram, + "relative_entropy": {}, + } + + if min(nr_good_hits.values()) >= min_hits_per_adaptor: + log.info(f"Adaptor {adaptor.name} is included in the analysis") + results["included_adaptors"][adaptor.name] = adaptor_data + adaptors_included.append(adaptor) + else: + results["excluded_adaptors"][adaptor.name] = adaptor_data + log.info(f"Adaptor {adaptor.name} is excluded from the analysis") + + return results, adaptors_included, entries, umi_threshold, kmer_length, outdir + + +def setup_explore_directory(outdir: str, use_existing: bool): + # Setup a run directory + + try: + os.mkdir(outdir) + except FileExistsError: + log.info(f"Output directory {outdir} already exists") + if not use_existing: + log.error( + f"Output directory {outdir} already exists, please use --use_existing to continue" + ) + exit(1) + else: + pass + + +def run_multiple_alignments( + fastq: str, + outdir: str, + threads: int, + use_existing: bool, + adaptors: list[Adaptor], + minimap_b: int, +) -> list[tuple[Adaptor, str]]: + adaptors_and_aln_paths: list[tuple[Adaptor, str]] = [] + + # Map all reads against all adaptors + for adaptor in adaptors: + # Align + aln_path = os.path.join(outdir, f"{adaptor.name}.paf") + adaptors_and_aln_paths.append((adaptor, aln_path)) + if os.path.exists(aln_path) and use_existing: + log.info(f"Skipping {adaptor.name} as alignment already exists") + continue + adaptor_path = os.path.join(outdir, f"{adaptor.name}.fasta") + with open(adaptor_path, "w") as f: + f.write(adaptor.get_fastastring(insert_Ns=False)) + + log.info(f"Aligning {adaptor.name}") + run_minimap2( + fastq_in=fastq, + index_file=adaptor_path, + output_paf=aln_path, + threads=threads, + minimap_b=minimap_b, + ) + return adaptors_and_aln_paths + + def check_for_umis( results, adaptors_included, entries, umi_threshold, kmer_length, outdir ): # Traverse the adaptors to include and check whether they need UMI entropy analysis for adaptor in adaptors_included: - adaptor_data = results["included_adapters"][adaptor.name] + adaptor_data = results["included_adaptors"][adaptor.name] for adaptor_end_name, adaptor_end in zip( ["i5", "i7"], [adaptor.i5, adaptor.i7] diff --git a/tests/test_explore/test_explore.py b/tests/test_explore/test_explore.py index a5d2e94..5993a67 100644 --- a/tests/test_explore/test_explore.py +++ b/tests/test_explore/test_explore.py @@ -1,9 +1,15 @@ +import gzip import os +import tempfile import pytest from anglerfish.demux.adaptor import Adaptor -from anglerfish.explore.explore import run_multiple_alignments +from anglerfish.explore.explore import ( + _run_explore, + run_multiple_alignments, + setup_explore_directory, +) @pytest.fixture(scope="module") @@ -29,7 +35,63 @@ def explore_fixture(request): "i7": "GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCTCGTATGCCGTCTTCTGCTTG", }, } + + # Read with perfect match to illumina ud i7 and i5 adaptors + 
matching_read_seq = "".join( + [ + "TTATGCTAACTCATTCAGCGTATTGCTAATGATACGGCGACCACCGAGATCTACAC", # i5 before index + "GCGTAAGA", # i5 index + "TCGTCGGCAGCGTC", # i5 after index + "CGGAAAATGTGGAAGTGACTTTGGAACTGGGTGCAGGAAAGACTGAAACAGTTTAGAGGGCGAAGAGAACGCGAAAAATATGGTAAGTTTGGAAAAGATCGGACGTCGTGAGAAAAGAGTGTAAAAGATCAGATGTGTAGATCTCA", # Real read + "CCGAGCCCACGAGAC", # rev. comp i7 before index + "TCCTGAGC", # rev. comp i7 index + "ATCTCGTATGCCGTCTTCTGCTTGAGCAATACGTT", # rev. comp i7 after index + ] + ) + + matching_read = ( + "\n".join( + [ + "@matching_read", + matching_read_seq, + "+", + "%" * len(matching_read_seq), + ] + ) + + "\n" + ) + + # Read with perfect match to illumina_ud, i5 only, slight mismatch i7 + i5_only_read_seq = "".join( + [ + "TTATGCTAACTCATTCAGCGTATTGCTAATGATACGGCGACCACCGAGATCTACAC", # i5 before index + "GCGTAAGA", # i5 index + "TCGTCGGCAGCGTC", # i5 after index + "CGGAAAATGTGGAAGTGACTTTGGAACTGGGTGCAGGAAAGACTGAAACAGTTTAGAGGGCGAAGAGAACGCGAAAAATATGGTAAGTTTGGAAAAGATCGGACGTCGTGAGAAAAGAGTGTAAAAGATCAGATGTGTAGATCTCA", # Real read + "CAGACCCCACGAGAC", # rev. comp i7 before index with 2 mismatches + "TCCTGAGC", # rev. comp i7 index + "ATCTCGTATGCCGTCTTCTGCTTGAGCAATACGTT", # rev. comp i7 after index + ] + ) + i5_only_read = ( + "\n".join( + [ + "@half_matching_read", + i5_only_read_seq, + "+", + "%" * len(i5_only_read_seq), + ] + ) + + "\n" + ) demux_fixture = request.getfixturevalue("demux_fixture") + tmp_path = demux_fixture["tmp_path"] + demux_fixture["explore_reads"] = tmp_path / "testdata_explore.fastq.gz" + + with gzip.open(tmp_path / "testdata_explore.fastq.gz", "ab") as f: + f.write(matching_read.encode()) + f.write(i5_only_read.encode()) + fixture = demux_fixture fixture["adaptors"] = [ Adaptor( @@ -42,11 +104,39 @@ def explore_fixture(request): return fixture +def test_setup_explore_directory(): + tmp_dir = tempfile.TemporaryDirectory() + dir_to_be_created = os.path.join(tmp_dir.name, "test_dir") + setup_explore_directory(dir_to_be_created, False) + assert os.path.exists(dir_to_be_created) + tmp_dir.cleanup() + + +def test_setup_explore_directory_use_existing(): + tmp_dir = tempfile.TemporaryDirectory() + dir_to_be_created = os.path.join(tmp_dir.name, "test_dir") + os.makedirs(dir_to_be_created) + # This method should run exit(1) if the directory already exists + with pytest.raises(SystemExit): + setup_explore_directory(dir_to_be_created, False) + + # save modification time of the directory + dir_time = os.path.getmtime(dir_to_be_created) + setup_explore_directory(dir_to_be_created, True) + # Check that the directory was not recreated + assert dir_time == os.path.getmtime(dir_to_be_created) + + tmp_dir.cleanup() + + def test_run_multiple_alignments(explore_fixture): # Test a basic run + tmp_dir = tempfile.TemporaryDirectory() + tmp_path = tmp_dir.name + return_list = run_multiple_alignments( fastq=explore_fixture["testdata_single"], - outdir=explore_fixture["tmp_path"], + outdir=tmp_path, threads=1, use_existing=False, adaptors=explore_fixture["adaptors"], @@ -68,7 +158,7 @@ def test_run_multiple_alignments(explore_fixture): return_list = run_multiple_alignments( fastq=explore_fixture["testdata_single"], - outdir=explore_fixture["tmp_path"], + outdir=tmp_path, threads=1, use_existing=True, adaptors=explore_fixture["adaptors"], @@ -84,11 +174,51 @@ def test_run_multiple_alignments(explore_fixture): return_list = run_multiple_alignments( fastq=explore_fixture["testdata_single"], - outdir=explore_fixture["tmp_path"], + outdir=tmp_path, threads=1, use_existing=False, 
adaptors=explore_fixture["adaptors"], minimap_b=1, ) + # Check that the alignment file was rerun assert alignment_time != os.path.getmtime(alignment_file) + + tmp_dir.cleanup() + + +def test_run_explore_functional_test(explore_fixture): + """Test overall function of the explore command.""" + tmp_dir = tempfile.TemporaryDirectory() + tmp_path = os.path.join(tmp_dir.name, "outdir") + + # Running with 2 reads, one perfect match to illumina_ud, one partial match to illumina_ud + results, adaptors_included, entries, umi_threshold, kmer_length, outdir = ( + _run_explore( + fastq=explore_fixture["explore_reads"], + outdir=tmp_path, + threads=1, + use_existing=False, + good_hit_threshold=0.9, + insert_thres_low=4, + insert_thres_high=30, + minimap_b=4, + min_hits_per_adaptor=1, # Not default + umi_threshold=11, + kmer_length=2, + ) + ) + + assert results + + # Make sure one read matches both and one read matches only i5 + assert list(results["included_adaptors"].keys()) == ["illumina_ud"] + assert results["included_adaptors"]["illumina_ud"]["i5"]["nr_good_hits"] == 2 + assert results["included_adaptors"]["illumina_ud"]["i7"]["nr_good_hits"] == 1 + + # Correct index length found + assert results["included_adaptors"]["illumina_ud"]["i5"]["index_length"] == 8 + assert results["included_adaptors"]["illumina_ud"]["i7"]["index_length"] == 8 + + assert len(adaptors_included) == 1 + assert adaptors_included[0].name == "illumina_ud" From fe0194351f611a06780a784f9624b61fe1183d86 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Mon, 16 Sep 2024 14:30:27 +0200 Subject: [PATCH 5/5] renamed _run_explore --- anglerfish/explore/explore.py | 4 ++-- tests/test_explore/test_explore.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index ae7b684..b4f2470 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -28,7 +28,7 @@ def run_explore( kmer_length: int, ): results, adaptors_included, entries, umi_threshold, kmer_length, outdir = ( - _run_explore( + get_explore_results( fastq, outdir, threads, @@ -55,7 +55,7 @@ def run_explore( log.info(f"Results saved to {explore_stats_file}") -def _run_explore( +def get_explore_results( fastq: str, outdir: str, threads: int, diff --git a/tests/test_explore/test_explore.py b/tests/test_explore/test_explore.py index 5993a67..45a4933 100644 --- a/tests/test_explore/test_explore.py +++ b/tests/test_explore/test_explore.py @@ -6,7 +6,7 @@ from anglerfish.demux.adaptor import Adaptor from anglerfish.explore.explore import ( - _run_explore, + get_explore_results, run_multiple_alignments, setup_explore_directory, ) @@ -187,14 +187,14 @@ def test_run_multiple_alignments(explore_fixture): tmp_dir.cleanup() -def test_run_explore_functional_test(explore_fixture): +def test_get_explore_results_functional_test(explore_fixture): """Test overall function of the explore command.""" tmp_dir = tempfile.TemporaryDirectory() tmp_path = os.path.join(tmp_dir.name, "outdir") # Running with 2 reads, one perfect match to illumina_ud, one partial match to illumina_ud results, adaptors_included, entries, umi_threshold, kmer_length, outdir = ( - _run_explore( + get_explore_results( fastq=explore_fixture["explore_reads"], outdir=tmp_path, threads=1,
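Taken together, the series leaves anglerfish/explore/explore.py with three reusable pieces (setup_explore_directory, run_multiple_alignments and get_explore_results, the renamed _run_explore) plus check_for_umis for the UMI entropy step, with run_explore acting as the thin wrapper that writes anglerfish_explore_stats.json. Below is a minimal usage sketch of how those pieces compose once the whole series is applied; the input path and parameter values are illustrative assumptions, not defaults taken from the anglerfish CLI.

    import json
    import os

    from anglerfish.explore.explore import check_for_umis, get_explore_results

    # Illustrative parameter values; the real CLI supplies these via its argument parser.
    results, adaptors_included, entries, umi_threshold, kmer_length, outdir = (
        get_explore_results(
            fastq="reads.fastq.gz",  # assumed input file, not part of the patch series
            outdir="explore_out",
            threads=4,
            use_existing=False,
            good_hit_threshold=0.9,
            insert_thres_low=4,
            insert_thres_high=30,
            minimap_b=4,
            min_hits_per_adaptor=50,
            umi_threshold=11,
            kmer_length=2,
        )
    )

    # Append relative-entropy results for the included adaptors, then write the
    # JSON report, mirroring what run_explore does internally.
    results = check_for_umis(
        results, adaptors_included, entries, umi_threshold, kmer_length, outdir
    )
    with open(os.path.join(outdir, "anglerfish_explore_stats.json"), "w") as f:
        json.dump(results, f, indent=4)

    # Inspect which adaptor types survived the min_hits_per_adaptor filter.
    print(list(results["included_adaptors"].keys()))

The sketch follows the same call order as run_explore and test_get_explore_results_functional_test; only the literal values and the fastq path are invented for illustration.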