From 54850109ee10adcb854d3c58c6dc83a75cf2eb17 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Mon, 2 Sep 2024 15:18:35 +0200 Subject: [PATCH 1/5] Split up explore to a few more functions --- anglerfish/explore/explore.py | 375 ++++++++++++++++++++-------------- 1 file changed, 225 insertions(+), 150 deletions(-) diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index e39396a..b486168 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -1,3 +1,4 @@ +import json import logging import os import uuid @@ -26,184 +27,258 @@ def run_explore( umi_threshold: float, kmer_length: int, ): - # Setup a run directory - run_uuid = str(uuid.uuid4()) - try: - os.mkdir(outdir) - except FileExistsError: - log.info(f"Output directory {outdir} already exists") - if not use_existing: - log.error( - f"Output directory {outdir} already exists, please use --use_existing to continue" + def _run_explore( + fastq: str, + outdir: str, + threads: int, + use_existing: bool, + good_hit_threshold: float, + insert_thres_low: int, + insert_thres_high: int, + minimap_b: int, + min_hits_per_adaptor: int, + umi_threshold: float, + kmer_length: int, + ): + # Setup a run directory + run_uuid = str(uuid.uuid4()) + try: + os.mkdir(outdir) + except FileExistsError: + log.info(f"Output directory {outdir} already exists") + if not use_existing: + log.error( + f"Output directory {outdir} already exists, please use --use_existing to continue" + ) + exit(1) + else: + pass + + log.info("Running anglerfish explore") + log.info(f"Run uuid {run_uuid}") + + # Create a results dictionary which is the skeleton for json output + results = { + "run_uuid": run_uuid, + "timestamp": pd.Timestamp.now().isoformat(), + "parameters": { + "fastq": fastq, + "outdir": outdir, + "threads": threads, + "use_existing": use_existing, + "good_hit_threshold": good_hit_threshold, + "insert_thres_low": insert_thres_low, + "insert_thres_high": insert_thres_high, + "minimap_b": minimap_b, + "min_hits_per_adaptor": min_hits_per_adaptor, + "umi_threshold": umi_threshold, + "kmer_length": kmer_length, + }, + "included_adapters": {}, + "excluded_adapters": {}, + } + + adaptors = cast(list[Adaptor], load_adaptors()) + adaptors_and_aln_paths: list[tuple[Adaptor, str]] = [] + + # Map all reads against all adaptors + for adaptor in adaptors: + # Align + aln_path = os.path.join(outdir, f"{adaptor.name}.paf") + adaptors_and_aln_paths.append((adaptor, aln_path)) + if os.path.exists(aln_path) and use_existing: + log.info(f"Skipping {adaptor.name} as alignment already exists") + continue + adaptor_path = os.path.join(outdir, f"{adaptor.name}.fasta") + with open(adaptor_path, "w") as f: + f.write(adaptor.get_fastastring(insert_Ns=False)) + + log.info(f"Aligning {adaptor.name}") + run_minimap2( + fastq_in=fastq, + index_file=adaptor_path, + output_paf=aln_path, + threads=threads, + minimap_b=minimap_b, ) - exit(1) - else: - pass - - log.info("Running anglerfish explore") - log.info(f"Run uuid {run_uuid}") - - adaptors = cast(list[Adaptor], load_adaptors()) - adaptors_and_aln_paths: list[tuple[Adaptor, str]] = [] - - # Map all reads against all adaptors - for adaptor in adaptors: - # Align - aln_path = os.path.join(outdir, f"{adaptor.name}.paf") - adaptors_and_aln_paths.append((adaptor, aln_path)) - if os.path.exists(aln_path) and use_existing: - log.info(f"Skipping {adaptor.name} as alignment already exists") - continue - adaptor_path = os.path.join(outdir, f"{adaptor.name}.fasta") - with open(adaptor_path, "w") as f: - 
f.write(adaptor.get_fastastring(insert_Ns=False)) - - log.info(f"Aligning {adaptor.name}") - run_minimap2( - fastq_in=fastq, - index_file=adaptor_path, - output_paf=aln_path, - threads=threads, - minimap_b=minimap_b, - ) - # Parse alignments - entries: dict = {} - adaptors_included = [] - for adaptor, aln_path in adaptors_and_aln_paths: - log.info(f"Parsing {adaptor.name}") - reads_alns: dict[str, list[Alignment]] = map_reads_to_alns( - aln_path, complex_identifier=True - ) + # Parse alignments + entries: dict = {} + adaptors_included = [] + for adaptor, aln_path in adaptors_and_aln_paths: + log.info(f"Parsing {adaptor.name}") + adaptor_data = {"name": adaptor.name, "i5": {}, "i7": {}} + reads_alns: dict[str, list[Alignment]] = map_reads_to_alns( + aln_path, complex_identifier=True + ) - # Choose only the highest scoring alignment for each combination of read, adaptor end and strand - reads_to_highest_q_aln_dict: dict[str, dict] = {} - for aln_name, alns in reads_alns.items(): - highest_q_aln = max(alns, key=lambda aln: aln.q) - reads_to_highest_q_aln_dict[aln_name] = vars(highest_q_aln) - df = pd.DataFrame.from_dict(reads_to_highest_q_aln_dict, orient="index") - nr_good_hits = {} - - # Match Insert Match = mim - # The cs string filter is quite strict, requiring 10+ perfect match before insert and 10+ perfect match after insert - # The cg string allows for mismatches within the matching strings but no insertions or deletions - # All cg string matches are also cs string matches (subset) but not vice versa - mim_re_cs = ( - r"^cs:Z::[1-9][0-9]*\+([a,c,t,g]*):[1-9][0-9]*$" # Match Insert Match = mim - ) - mim_re_cg = r"^cg:Z:([0-9]*)M([0-9]*)I([0-9]*)M$" - df_mim = df[df.cs.str.match(mim_re_cs)] + # Choose only the highest scoring alignment for each combination of read, adaptor end and strand + reads_to_highest_q_aln_dict: dict[str, dict] = {} + for aln_name, alns in reads_alns.items(): + highest_q_aln = max(alns, key=lambda aln: aln.q) + reads_to_highest_q_aln_dict[aln_name] = vars(highest_q_aln) + df = pd.DataFrame.from_dict(reads_to_highest_q_aln_dict, orient="index") + nr_good_hits = {} - # Extract the match lengths - match_col_df = df_mim.cg.str.extract(mim_re_cg).rename( - {0: "match_1_len", 1: "insert_len", 2: "match_2_len"}, axis=1 - ) - match_col_df = match_col_df.astype( - { - "match_1_len": "int32", - "insert_len": "int32", - "match_2_len": "int32", - } - ) + # Match Insert Match = mim + # The cs string filter is quite strict, requiring 10+ perfect match before insert and 10+ perfect match after insert + # The cg string allows for mismatches within the matching strings but no insertions or deletions + # All cg string matches are also cs string matches (subset) but not vice versa + mim_re_cs = r"^cs:Z::[1-9][0-9]*\+([a,c,t,g]*):[1-9][0-9]*$" # Match Insert Match = mim + mim_re_cg = r"^cg:Z:([0-9]*)M([0-9]*)I([0-9]*)M$" + df_mim = df[df.cs.str.match(mim_re_cs)] + + # Extract the match lengths + match_col_df = df_mim.cg.str.extract(mim_re_cg).rename( + {0: "match_1_len", 1: "insert_len", 2: "match_2_len"}, axis=1 + ) + match_col_df = match_col_df.astype( + { + "match_1_len": "int32", + "insert_len": "int32", + "match_2_len": "int32", + } + ) - df_mim.loc[match_col_df.index, match_col_df.columns] = match_col_df + df_mim.loc[match_col_df.index, match_col_df.columns] = match_col_df - for adaptor_end_name, adaptor_end in zip( - ["i5", "i7"], [adaptor.i5, adaptor.i7] - ): - if adaptor_end.has_index: - assert adaptor_end.len_before_index is not None - assert adaptor_end.len_after_index is 
not None - - # Alignment thresholds - before_thres = round(adaptor_end.len_before_index * good_hit_threshold) - after_thres = round(adaptor_end.len_after_index * good_hit_threshold) - insert_thres_low = insert_thres_low - insert_thres_high = insert_thres_high - - requirements = ( - (df_mim["adapter_name"] == f"{adaptor.name}_{adaptor_end_name}") - & (df_mim["match_1_len"] >= (before_thres)) - & (df_mim["insert_len"] >= insert_thres_low) - & (df_mim["insert_len"] <= insert_thres_high) - & (df_mim["match_2_len"] >= (after_thres)) - ) - df_good_hits = df_mim[requirements] + for adaptor_end_name, adaptor_end in zip( + ["i5", "i7"], [adaptor.i5, adaptor.i7] + ): + if adaptor_end.has_index: + assert adaptor_end.len_before_index is not None + assert adaptor_end.len_after_index is not None - median_insert_length = df_good_hits["insert_len"].median() - insert_lengths = df_good_hits["insert_len"].value_counts() - else: - m_re_cs = r"^cs:Z::([1-9][0-9]*)$" - df_good_hits = df[df.cg.str.match(m_re_cs)] - match_col_df = df_good_hits.cg.str.extract(m_re_cs).rename( - {0: "match_1_len"}, axis=1 - ) - match_col_df = match_col_df.astype({"match_1_len": "int32"}) + # Alignment thresholds + before_thres = round( + adaptor_end.len_before_index * good_hit_threshold + ) + after_thres = round( + adaptor_end.len_after_index * good_hit_threshold + ) + insert_thres_low = insert_thres_low + insert_thres_high = insert_thres_high + + requirements = ( + (df_mim["adapter_name"] == f"{adaptor.name}_{adaptor_end_name}") + & (df_mim["match_1_len"] >= (before_thres)) + & (df_mim["insert_len"] >= insert_thres_low) + & (df_mim["insert_len"] <= insert_thres_high) + & (df_mim["match_2_len"] >= (after_thres)) + ) + df_good_hits = df_mim[requirements] - df_good_hits.loc[match_col_df.index, match_col_df.columns] = ( - match_col_df + insert_lengths = df_good_hits["insert_len"].value_counts() + + if len(insert_lengths) == 0: + median_insert_length = None + insert_lengths_histogram = {} + else: + median_insert_length = df_good_hits["insert_len"].median() + insert_lengths_histogram = insert_lengths[ + sorted(insert_lengths.index) + ].to_dict() + else: + m_re_cs = r"^cs:Z::([1-9][0-9]*)$" + df_good_hits = df[df.cg.str.match(m_re_cs)] + match_col_df = df_good_hits.cg.str.extract(m_re_cs).rename( + {0: "match_1_len"}, axis=1 + ) + match_col_df = match_col_df.astype({"match_1_len": "int32"}) + + df_good_hits.loc[match_col_df.index, match_col_df.columns] = ( + match_col_df + ) + + thres = round(adaptor_end.len_constant * good_hit_threshold) + df_good_hits = df_good_hits[df_good_hits["match_1_len"] >= thres] + + median_insert_length = None + insert_lengths_histogram = {} + insert_lengths = None + + # Filter multiple hits per read and adaptor end + df_good_hits = df_good_hits.sort_values( + by=["read_name", "match_1_len"], ascending=False + ).drop_duplicates(subset=["read_name", "adapter_name"], keep="first") + + if adaptor.name not in entries.keys(): + entries[adaptor.name] = {} + entries[adaptor.name][adaptor_end_name] = df_good_hits + + nr_good_hits[adaptor_end_name] = len(df_good_hits) + log.info( + f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits." 
) - thres = round(adaptor_end.len_constant * good_hit_threshold) - df_good_hits = df_good_hits[df_good_hits["match_1_len"] >= thres] + # Collect data for the results dictionary + adaptor_data[adaptor_end_name] = { + "nr_good_hits": len(df_good_hits), + "index_length": median_insert_length, + "insert_lengths_histogram": insert_lengths_histogram, + "relative_entropy": {}, + } - median_insert_length = None - insert_lengths = None + if min(nr_good_hits.values()) >= min_hits_per_adaptor: + log.info(f"Adaptor {adaptor.name} is included in the analysis") + results["included_adapters"][adaptor.name] = adaptor_data + adaptors_included.append(adaptor) + else: + results["excluded_adapters"][adaptor.name] = adaptor_data + log.info(f"Adaptor {adaptor.name} is excluded from the analysis") - # Filter multiple hits per read and adaptor end - df_good_hits = df_good_hits.sort_values( - by=["read_name", "match_1_len"], ascending=False - ).drop_duplicates(subset=["read_name", "adapter_name"], keep="first") + return results, adaptors_included, entries, umi_threshold, kmer_length, outdir - if adaptor.name not in entries.keys(): - entries[adaptor.name] = {} - entries[adaptor.name][adaptor_end_name] = df_good_hits + results, adaptors_included, entries, umi_threshold, kmer_length, outdir = ( + _run_explore( + fastq, + outdir, + threads, + use_existing, + good_hit_threshold, + insert_thres_low, + insert_thres_high, + minimap_b, + min_hits_per_adaptor, + umi_threshold, + kmer_length, + ) + ) - nr_good_hits[adaptor_end_name] = len(df_good_hits) - log.info( - f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits." - ) + results = check_for_umis( + results, adaptors_included, entries, umi_threshold, kmer_length, outdir + ) + + explore_stats_file = os.path.join(outdir, "anglerfish_explore_stats.json") + # Save results to a JSON file + with open(explore_stats_file, "w") as f: + json.dump(results, f, indent=4) - if min(nr_good_hits.values()) >= min_hits_per_adaptor: - log.info(f"Adaptor {adaptor.name} is included in the analysis") - adaptors_included.append(adaptor) - else: - log.info(f"Adaptor {adaptor.name} is excluded from the analysis") + log.info(f"Results saved to {explore_stats_file}") - # Print insert length info for adaptor types included in the analysis + +def check_for_umis( + results, adaptors_included, entries, umi_threshold, kmer_length, outdir +): + # Traverse the adaptors to include and check whether they need UMI entropy analysis for adaptor in adaptors_included: + adaptor_data = results["included_adapters"][adaptor.name] + for adaptor_end_name, adaptor_end in zip( ["i5", "i7"], [adaptor.i5, adaptor.i7] ): df_good_hits = entries[adaptor.name][adaptor_end_name] if adaptor_end.has_index: median_insert_length = df_good_hits["insert_len"].median() + if median_insert_length > umi_threshold: # Calculate entropies here entropies = calculate_relative_entropy( df_good_hits, kmer_length, median_insert_length ) - entropy_file = os.path.join( - outdir, f"{adaptor.name}_{adaptor_end_name}.entropy.csv" - ) - pd.Series(entropies).to_csv(entropy_file, float_format="%.2f") - log.info( - f"{adaptor.name}:{adaptor_end_name}, relative entropy for k={kmer_length}, over the index saved to {entropy_file}" - ) - insert_lengths = df_good_hits["insert_len"].value_counts() + adaptor_data[adaptor_end_name]["relative_entropy"] = entropies log.info( f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits with median insert length {median_insert_length}" ) - histogram_file = os.path.join( - outdir, 
f"{adaptor.name}_{adaptor_end_name}.hist.csv" - ) - insert_lengths[sorted(insert_lengths.index)].to_csv(histogram_file) - log.info( - f"{adaptor.name}:{adaptor_end_name} insert length histogram saved {histogram_file}" - ) - else: - median_insert_length = None - insert_lengths = None - log.info( - f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits (no insert length since no index)" - ) + + return results From 3c3aa51a352d931a23f83eda9e3a86979a78e124 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Tue, 3 Sep 2024 12:43:46 +0200 Subject: [PATCH 2/5] Changed from adaptors argument to i7_sequence_token and i5_sequence_token --- anglerfish/demux/adaptor.py | 21 ++++++++++++------- anglerfish/demux/samplesheet.py | 5 +++-- .../test_demux/test_adaptor.py | 8 +++++-- 3 files changed, 23 insertions(+), 11 deletions(-) diff --git a/anglerfish/demux/adaptor.py b/anglerfish/demux/adaptor.py index ac33de8..5b77cd9 100644 --- a/anglerfish/demux/adaptor.py +++ b/anglerfish/demux/adaptor.py @@ -18,16 +18,17 @@ class Adaptor: def __init__( self, name: str, - adaptors: dict, + i7_sequence_token: str, + i5_sequence_token: str, i7_index: str | None = None, i5_index: str | None = None, ): self.name: str = name - self.i5 = AdaptorPart( - sequence_token=adaptors[name]["i5"], name=name, index_seq=i5_index - ) self.i7 = AdaptorPart( - sequence_token=adaptors[name]["i7"], name=name, index_seq=i7_index + sequence_token=i7_sequence_token, name=name, index_seq=i7_index + ) + self.i5 = AdaptorPart( + sequence_token=i5_sequence_token, name=name, index_seq=i5_index ) def get_fastastring(self, insert_Ns: bool = True) -> str: @@ -265,6 +266,12 @@ def load_adaptors(raw: bool = False) -> list[Adaptor] | dict: # By default, return list of Adaptor objects else: adaptors = [] - for adaptor_name in adaptors_dict: - adaptors.append(Adaptor(name=adaptor_name, adaptors=adaptors_dict)) + for adaptor_name, adaptor_parts_dict in adaptors_dict.items(): + adaptors.append( + Adaptor( + name=adaptor_name, + i7_sequence_token=adaptor_parts_dict["i7"], + i5_sequence_token=adaptor_parts_dict["i5"], + ) + ) return adaptors diff --git a/anglerfish/demux/samplesheet.py b/anglerfish/demux/samplesheet.py index 1998f33..4672b22 100644 --- a/anglerfish/demux/samplesheet.py +++ b/anglerfish/demux/samplesheet.py @@ -75,9 +75,10 @@ def __init__(self, input_csv: str, ont_barcodes_enabled: bool): sample_name, Adaptor( name=row["adaptors"], - adaptors=ADAPTORS, - i5_index=i5_index, + i7_sequence_token=ADAPTORS[row["adaptors"]]["i7"], + i5_sequence_token=ADAPTORS[row["adaptors"]]["i5"], i7_index=i7_index, + i5_index=i5_index, ), row["fastq_path"], ont_barcode, diff --git a/tests/test_anglerfish/test_demux/test_adaptor.py b/tests/test_anglerfish/test_demux/test_adaptor.py index 5b3fb34..62969c3 100644 --- a/tests/test_anglerfish/test_demux/test_adaptor.py +++ b/tests/test_anglerfish/test_demux/test_adaptor.py @@ -199,13 +199,17 @@ class TestAdaptor: def test_adaptor(self): adaptors = { "simple_and_index_umi": { - "i5": "AAA", "i7": "AAACCC", + "i5": "AAA", } } adaptor = to_test.Adaptor( - "simple_and_index_umi", adaptors, i5_index=None, i7_index="TTT" + "simple_and_index_umi", + i7_sequence_token=adaptors["simple_and_index_umi"]["i7"], + i5_sequence_token=adaptors["simple_and_index_umi"]["i5"], + i7_index="TTT", + i5_index=None, ) assert adaptor.name == "simple_and_index_umi" assert isinstance(adaptor.i5, to_test.AdaptorPart) From 7b62050747458f905cc401cd2d5ac71beadcec4c Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Wed, 4 
Sep 2024 10:43:30 +0200 Subject: [PATCH 3/5] Rearranging the tests and added basic explore tests --- tests/__init__.py | 0 .../test_demux/test_demux.py => conftest.py} | 74 +-------------- tests/test_demux/__init__.py | 0 .../test_demux/test_adaptor.py | 0 tests/test_demux/test_demux.py | 71 ++++++++++++++ tests/test_explore/test_explore.py | 94 +++++++++++++++++++ 6 files changed, 166 insertions(+), 73 deletions(-) create mode 100644 tests/__init__.py rename tests/{test_anglerfish/test_demux/test_demux.py => conftest.py} (63%) create mode 100644 tests/test_demux/__init__.py rename tests/{test_anglerfish => }/test_demux/test_adaptor.py (100%) create mode 100644 tests/test_demux/test_demux.py create mode 100644 tests/test_explore/test_explore.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_anglerfish/test_demux/test_demux.py b/tests/conftest.py similarity index 63% rename from tests/test_anglerfish/test_demux/test_demux.py rename to tests/conftest.py index ed8124b..f21c9c1 100644 --- a/tests/test_anglerfish/test_demux/test_demux.py +++ b/tests/conftest.py @@ -4,11 +4,9 @@ import pytest -from anglerfish.demux import demux as to_test - @pytest.fixture(scope="module") -def fixture(): +def demux_fixture(): f"""Fixture for all tests within {__file__}. Creates a temporary directory with... @@ -85,73 +83,3 @@ def fixture(): yield fixture tmp.cleanup() - - -def test_run_minimap2(fixture): - """Check that the function runs successfully, not the output.""" - # Test alignment on single read - to_test.run_minimap2( - fastq_in=fixture["testdata_single"], - index_file=fixture["index_file"], - output_paf=fixture["paf_single"], - threads=1, - minimap_b=1, - ) - - # Create aligntment from multiple reads - to_test.run_minimap2( - fastq_in=fixture["testdata_multiple"], - index_file=fixture["index_file"], - output_paf=fixture["paf_multiple"], - threads=1, - minimap_b=1, - ) - - -def test_map_reads_to_alns(fixture): - reads_alns = to_test.map_reads_to_alns(fixture["paf_single"]) - - for read_name, alns in reads_alns.items(): - assert read_name == "0ad8bdb6-e009-43c5-95b1-d381e699f983" - for aln in alns: - if aln.adapter_name == "truseq_i7": - assert ( - aln.cs - == "cs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17" - ) - else: - assert aln.cs == "cs:Z::15-a*cg:5+tcccgat:3+g:33" - - -def test_parse_cs(fixture): - test_cs_str = ( - "cs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17" - ) - expected_cs_parsed = ("taacttggtc", 0) - - cs_parsed = to_test.parse_cs( - cs_string=test_cs_str, - index_seq=fixture["index"], - umi_before=0, - umi_after=0, - ) - - assert cs_parsed == expected_cs_parsed - - -def test_categorize_matches(fixture): - i5_name = "truseq_i5" - i7_name = "truseq_i7" - reads_alns = to_test.map_reads_to_alns(fixture["paf_multiple"]) - - layout = to_test.categorize_matches( - i5_name=i5_name, i7_name=i7_name, reads_to_alns=reads_alns - ) - fragments, singletons, concats, unknowns = layout - - # Assert reads were categorized as expected - assert len(fragments["fragment_read"]) == 2 - assert len(singletons["singleton_i5_read"]) == 1 - assert len(singletons["singleton_i7_read"]) == 1 - assert len(concats["concat_read"]) == 4 - assert len(unknowns["unknown_read"]) == 2 diff --git a/tests/test_demux/__init__.py b/tests/test_demux/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_anglerfish/test_demux/test_adaptor.py b/tests/test_demux/test_adaptor.py 
similarity index 100% rename from tests/test_anglerfish/test_demux/test_adaptor.py rename to tests/test_demux/test_adaptor.py diff --git a/tests/test_demux/test_demux.py b/tests/test_demux/test_demux.py new file mode 100644 index 0000000..8015950 --- /dev/null +++ b/tests/test_demux/test_demux.py @@ -0,0 +1,71 @@ +from anglerfish.demux import demux as to_test + + +def test_run_minimap2(demux_fixture): + """Check that the function runs successfully, not the output.""" + # Test alignment on single read + to_test.run_minimap2( + fastq_in=demux_fixture["testdata_single"], + index_file=demux_fixture["index_file"], + output_paf=demux_fixture["paf_single"], + threads=1, + minimap_b=1, + ) + + # Create aligntment from multiple reads + to_test.run_minimap2( + fastq_in=demux_fixture["testdata_multiple"], + index_file=demux_fixture["index_file"], + output_paf=demux_fixture["paf_multiple"], + threads=1, + minimap_b=1, + ) + + +def test_map_reads_to_alns(demux_fixture): + reads_alns = to_test.map_reads_to_alns(demux_fixture["paf_single"]) + + for read_name, alns in reads_alns.items(): + assert read_name == "0ad8bdb6-e009-43c5-95b1-d381e699f983" + for aln in alns: + if aln.adapter_name == "truseq_i7": + assert ( + aln.cs + == "cs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17" + ) + else: + assert aln.cs == "cs:Z::15-a*cg:5+tcccgat:3+g:33" + + +def test_parse_cs(demux_fixture): + test_cs_str = ( + "cs:Z::11-ca:6*tg:13*nt*na*na*nc*nt*nt*ng*ng*nt*nc:1*ta:1+ctagaaa:2*gt*tg:17" + ) + expected_cs_parsed = ("taacttggtc", 0) + + cs_parsed = to_test.parse_cs( + cs_string=test_cs_str, + index_seq=demux_fixture["index"], + umi_before=0, + umi_after=0, + ) + + assert cs_parsed == expected_cs_parsed + + +def test_categorize_matches(demux_fixture): + i5_name = "truseq_i5" + i7_name = "truseq_i7" + reads_alns = to_test.map_reads_to_alns(demux_fixture["paf_multiple"]) + + layout = to_test.categorize_matches( + i5_name=i5_name, i7_name=i7_name, reads_to_alns=reads_alns + ) + fragments, singletons, concats, unknowns = layout + + # Assert reads were categorized as expected + assert len(fragments["fragment_read"]) == 2 + assert len(singletons["singleton_i5_read"]) == 1 + assert len(singletons["singleton_i7_read"]) == 1 + assert len(concats["concat_read"]) == 4 + assert len(unknowns["unknown_read"]) == 2 diff --git a/tests/test_explore/test_explore.py b/tests/test_explore/test_explore.py new file mode 100644 index 0000000..a5d2e94 --- /dev/null +++ b/tests/test_explore/test_explore.py @@ -0,0 +1,94 @@ +import os + +import pytest + +from anglerfish.demux.adaptor import Adaptor +from anglerfish.explore.explore import run_multiple_alignments + + +@pytest.fixture(scope="module") +def explore_fixture(request): + f"""Fixture for all tests within {__file__}. + + Creates a temporary directory with... + - truseq.fasta: A fasta file with the truseq i7 and i5 adaptors. + - illumina_ud.fasta: A fasta file with the illumina unique dual i7 and i5 adaptors. + - testdata_single.fastq.gz: A single read subset from the testdata with a perfect index match (TAACTTGGTC) to the truseq i7 adaptor. + - testdata_multiple.fastq.gz: Fabricated test data intended to test the categorization of alignments. + + Expected files to be created... 
+ """ + + adaptors_dict = { + "illumina_ud": { + "i5": "AATGATACGGCGACCACCGAGATCTACACTCGTCGGCAGCGTC", + "i7": "CAAGCAGAAGACGGCATACGAGATGTCTCGTGGGCTCGG", + }, + "truseq": { + "i5": "AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT", + "i7": "GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCTCGTATGCCGTCTTCTGCTTG", + }, + } + demux_fixture = request.getfixturevalue("demux_fixture") + fixture = demux_fixture + fixture["adaptors"] = [ + Adaptor( + name=adaptor_name, + i7_sequence_token=adaptor_parts_dict["i7"], + i5_sequence_token=adaptor_parts_dict["i5"], + ) + for adaptor_name, adaptor_parts_dict in adaptors_dict.items() + ] + return fixture + + +def test_run_multiple_alignments(explore_fixture): + # Test a basic run + return_list = run_multiple_alignments( + fastq=explore_fixture["testdata_single"], + outdir=explore_fixture["tmp_path"], + threads=1, + use_existing=False, + adaptors=explore_fixture["adaptors"], + minimap_b=1, + ) + + assert return_list + assert len(return_list) == 2 # One for illumina_ud and one for truseq + for adapter, alignment_file in return_list: + # check that tile file ending is .paf + assert alignment_file.endswith(".paf") + # check that alignment file exists + assert os.path.exists(alignment_file) + + # When use_existing is true, the alignment should not be rerun + # Save modification time of the alignment file + alignment_file = return_list[0][1] + alignment_time = os.path.getmtime(alignment_file) + + return_list = run_multiple_alignments( + fastq=explore_fixture["testdata_single"], + outdir=explore_fixture["tmp_path"], + threads=1, + use_existing=True, + adaptors=explore_fixture["adaptors"], + minimap_b=1, + ) + # Check that the alignment file was not rerun + assert alignment_time == os.path.getmtime(alignment_file) + + # When use_existing is false, the alignment should be rerun + # Save modification time of the alignment file + alignment_file = return_list[0][1] + alignment_time = os.path.getmtime(alignment_file) + + return_list = run_multiple_alignments( + fastq=explore_fixture["testdata_single"], + outdir=explore_fixture["tmp_path"], + threads=1, + use_existing=False, + adaptors=explore_fixture["adaptors"], + minimap_b=1, + ) + # Check that the alignment file was rerun + assert alignment_time != os.path.getmtime(alignment_file) From 37d51fd53e411eb5b591a5600e80649fe5e09bae Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Mon, 9 Sep 2024 09:54:02 +0200 Subject: [PATCH 4/5] Splitting up explore a bit and basic testing --- anglerfish/explore/explore.py | 436 +++++++++++++++-------------- tests/test_explore/test_explore.py | 138 ++++++++- 2 files changed, 367 insertions(+), 207 deletions(-) diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index b486168..ae7b684 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -27,208 +27,6 @@ def run_explore( umi_threshold: float, kmer_length: int, ): - def _run_explore( - fastq: str, - outdir: str, - threads: int, - use_existing: bool, - good_hit_threshold: float, - insert_thres_low: int, - insert_thres_high: int, - minimap_b: int, - min_hits_per_adaptor: int, - umi_threshold: float, - kmer_length: int, - ): - # Setup a run directory - run_uuid = str(uuid.uuid4()) - try: - os.mkdir(outdir) - except FileExistsError: - log.info(f"Output directory {outdir} already exists") - if not use_existing: - log.error( - f"Output directory {outdir} already exists, please use --use_existing to continue" - ) - exit(1) - else: - pass - - log.info("Running anglerfish explore") - 
log.info(f"Run uuid {run_uuid}") - - # Create a results dictionary which is the skeleton for json output - results = { - "run_uuid": run_uuid, - "timestamp": pd.Timestamp.now().isoformat(), - "parameters": { - "fastq": fastq, - "outdir": outdir, - "threads": threads, - "use_existing": use_existing, - "good_hit_threshold": good_hit_threshold, - "insert_thres_low": insert_thres_low, - "insert_thres_high": insert_thres_high, - "minimap_b": minimap_b, - "min_hits_per_adaptor": min_hits_per_adaptor, - "umi_threshold": umi_threshold, - "kmer_length": kmer_length, - }, - "included_adapters": {}, - "excluded_adapters": {}, - } - - adaptors = cast(list[Adaptor], load_adaptors()) - adaptors_and_aln_paths: list[tuple[Adaptor, str]] = [] - - # Map all reads against all adaptors - for adaptor in adaptors: - # Align - aln_path = os.path.join(outdir, f"{adaptor.name}.paf") - adaptors_and_aln_paths.append((adaptor, aln_path)) - if os.path.exists(aln_path) and use_existing: - log.info(f"Skipping {adaptor.name} as alignment already exists") - continue - adaptor_path = os.path.join(outdir, f"{adaptor.name}.fasta") - with open(adaptor_path, "w") as f: - f.write(adaptor.get_fastastring(insert_Ns=False)) - - log.info(f"Aligning {adaptor.name}") - run_minimap2( - fastq_in=fastq, - index_file=adaptor_path, - output_paf=aln_path, - threads=threads, - minimap_b=minimap_b, - ) - - # Parse alignments - entries: dict = {} - adaptors_included = [] - for adaptor, aln_path in adaptors_and_aln_paths: - log.info(f"Parsing {adaptor.name}") - adaptor_data = {"name": adaptor.name, "i5": {}, "i7": {}} - reads_alns: dict[str, list[Alignment]] = map_reads_to_alns( - aln_path, complex_identifier=True - ) - - # Choose only the highest scoring alignment for each combination of read, adaptor end and strand - reads_to_highest_q_aln_dict: dict[str, dict] = {} - for aln_name, alns in reads_alns.items(): - highest_q_aln = max(alns, key=lambda aln: aln.q) - reads_to_highest_q_aln_dict[aln_name] = vars(highest_q_aln) - df = pd.DataFrame.from_dict(reads_to_highest_q_aln_dict, orient="index") - nr_good_hits = {} - - # Match Insert Match = mim - # The cs string filter is quite strict, requiring 10+ perfect match before insert and 10+ perfect match after insert - # The cg string allows for mismatches within the matching strings but no insertions or deletions - # All cg string matches are also cs string matches (subset) but not vice versa - mim_re_cs = r"^cs:Z::[1-9][0-9]*\+([a,c,t,g]*):[1-9][0-9]*$" # Match Insert Match = mim - mim_re_cg = r"^cg:Z:([0-9]*)M([0-9]*)I([0-9]*)M$" - df_mim = df[df.cs.str.match(mim_re_cs)] - - # Extract the match lengths - match_col_df = df_mim.cg.str.extract(mim_re_cg).rename( - {0: "match_1_len", 1: "insert_len", 2: "match_2_len"}, axis=1 - ) - match_col_df = match_col_df.astype( - { - "match_1_len": "int32", - "insert_len": "int32", - "match_2_len": "int32", - } - ) - - df_mim.loc[match_col_df.index, match_col_df.columns] = match_col_df - - for adaptor_end_name, adaptor_end in zip( - ["i5", "i7"], [adaptor.i5, adaptor.i7] - ): - if adaptor_end.has_index: - assert adaptor_end.len_before_index is not None - assert adaptor_end.len_after_index is not None - - # Alignment thresholds - before_thres = round( - adaptor_end.len_before_index * good_hit_threshold - ) - after_thres = round( - adaptor_end.len_after_index * good_hit_threshold - ) - insert_thres_low = insert_thres_low - insert_thres_high = insert_thres_high - - requirements = ( - (df_mim["adapter_name"] == f"{adaptor.name}_{adaptor_end_name}") - & 
(df_mim["match_1_len"] >= (before_thres)) - & (df_mim["insert_len"] >= insert_thres_low) - & (df_mim["insert_len"] <= insert_thres_high) - & (df_mim["match_2_len"] >= (after_thres)) - ) - df_good_hits = df_mim[requirements] - - insert_lengths = df_good_hits["insert_len"].value_counts() - - if len(insert_lengths) == 0: - median_insert_length = None - insert_lengths_histogram = {} - else: - median_insert_length = df_good_hits["insert_len"].median() - insert_lengths_histogram = insert_lengths[ - sorted(insert_lengths.index) - ].to_dict() - else: - m_re_cs = r"^cs:Z::([1-9][0-9]*)$" - df_good_hits = df[df.cg.str.match(m_re_cs)] - match_col_df = df_good_hits.cg.str.extract(m_re_cs).rename( - {0: "match_1_len"}, axis=1 - ) - match_col_df = match_col_df.astype({"match_1_len": "int32"}) - - df_good_hits.loc[match_col_df.index, match_col_df.columns] = ( - match_col_df - ) - - thres = round(adaptor_end.len_constant * good_hit_threshold) - df_good_hits = df_good_hits[df_good_hits["match_1_len"] >= thres] - - median_insert_length = None - insert_lengths_histogram = {} - insert_lengths = None - - # Filter multiple hits per read and adaptor end - df_good_hits = df_good_hits.sort_values( - by=["read_name", "match_1_len"], ascending=False - ).drop_duplicates(subset=["read_name", "adapter_name"], keep="first") - - if adaptor.name not in entries.keys(): - entries[adaptor.name] = {} - entries[adaptor.name][adaptor_end_name] = df_good_hits - - nr_good_hits[adaptor_end_name] = len(df_good_hits) - log.info( - f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits." - ) - - # Collect data for the results dictionary - adaptor_data[adaptor_end_name] = { - "nr_good_hits": len(df_good_hits), - "index_length": median_insert_length, - "insert_lengths_histogram": insert_lengths_histogram, - "relative_entropy": {}, - } - - if min(nr_good_hits.values()) >= min_hits_per_adaptor: - log.info(f"Adaptor {adaptor.name} is included in the analysis") - results["included_adapters"][adaptor.name] = adaptor_data - adaptors_included.append(adaptor) - else: - results["excluded_adapters"][adaptor.name] = adaptor_data - log.info(f"Adaptor {adaptor.name} is excluded from the analysis") - - return results, adaptors_included, entries, umi_threshold, kmer_length, outdir - results, adaptors_included, entries, umi_threshold, kmer_length, outdir = ( _run_explore( fastq, @@ -257,12 +55,244 @@ def _run_explore( log.info(f"Results saved to {explore_stats_file}") +def _run_explore( + fastq: str, + outdir: str, + threads: int, + use_existing: bool, + good_hit_threshold: float, + insert_thres_low: int, + insert_thres_high: int, + minimap_b: int, + min_hits_per_adaptor: int, + umi_threshold: float, + kmer_length: int, +): + run_uuid = str(uuid.uuid4()) + log.info("Running anglerfish explore") + log.info(f"Run uuid {run_uuid}") + + setup_explore_directory(outdir, use_existing) + + # Create a results dictionary which is the skeleton for json output + results = { + "run_uuid": run_uuid, + "timestamp": pd.Timestamp.now().isoformat(), + "parameters": { + "fastq": fastq, + "outdir": outdir, + "threads": threads, + "use_existing": use_existing, + "good_hit_threshold": good_hit_threshold, + "insert_thres_low": insert_thres_low, + "insert_thres_high": insert_thres_high, + "minimap_b": minimap_b, + "min_hits_per_adaptor": min_hits_per_adaptor, + "umi_threshold": umi_threshold, + "kmer_length": kmer_length, + }, + "included_adaptors": {}, + "excluded_adaptors": {}, + } + + adaptors = cast(list[Adaptor], load_adaptors()) + + 
adaptors_and_aln_paths = run_multiple_alignments( + fastq, + outdir, + threads, + use_existing, + adaptors, + minimap_b, + ) + + # Parse alignments + entries: dict = {} + adaptors_included = [] + for adaptor, aln_path in adaptors_and_aln_paths: + log.info(f"Parsing {adaptor.name}") + adaptor_data = {"name": adaptor.name, "i5": {}, "i7": {}} + reads_alns: dict[str, list[Alignment]] = map_reads_to_alns( + aln_path, complex_identifier=True + ) + + if len(reads_alns) == 0: + log.info( + f"No hits for {adaptor.name} in alignment file (perhaps the read file was mis-formatted?)" + ) + continue + + # Choose only the highest scoring alignment for each combination of read, adaptor end and strand + reads_to_highest_q_aln_dict: dict[str, dict] = {} + for aln_name, alns in reads_alns.items(): + highest_q_aln = max(alns, key=lambda aln: aln.q) + reads_to_highest_q_aln_dict[aln_name] = vars(highest_q_aln) + df = pd.DataFrame.from_dict(reads_to_highest_q_aln_dict, orient="index") + nr_good_hits = {} + + # Match Insert Match = mim + # The cs string filter is quite strict, requiring 10+ perfect match before insert and 10+ perfect match after insert + # The cg string allows for mismatches within the matching strings but no insertions or deletions + # All cg string matches are also cs string matches (subset) but not vice versa + mim_re_cs = ( + r"^cs:Z::[1-9][0-9]*\+([a,c,t,g]*):[1-9][0-9]*$" # Match Insert Match = mim + ) + mim_re_cg = r"^cg:Z:([0-9]*)M([0-9]*)I([0-9]*)M$" + df_mim = df[df.cs.str.match(mim_re_cs)] + + # Extract the match lengths + match_col_df = df_mim.cg.str.extract(mim_re_cg).rename( + {0: "match_1_len", 1: "insert_len", 2: "match_2_len"}, axis=1 + ) + match_col_df = match_col_df.astype( + { + "match_1_len": "int32", + "insert_len": "int32", + "match_2_len": "int32", + } + ) + + df_mim.loc[match_col_df.index, match_col_df.columns] = match_col_df + + for adaptor_end_name, adaptor_end in zip( + ["i5", "i7"], [adaptor.i5, adaptor.i7] + ): + if adaptor_end.has_index: + assert adaptor_end.len_before_index is not None + assert adaptor_end.len_after_index is not None + + # Alignment thresholds + before_thres = round(adaptor_end.len_before_index * good_hit_threshold) + after_thres = round(adaptor_end.len_after_index * good_hit_threshold) + insert_thres_low = insert_thres_low + insert_thres_high = insert_thres_high + + requirements = ( + (df_mim["adapter_name"] == f"{adaptor.name}_{adaptor_end_name}") + & (df_mim["match_1_len"] >= (before_thres)) + & (df_mim["insert_len"] >= insert_thres_low) + & (df_mim["insert_len"] <= insert_thres_high) + & (df_mim["match_2_len"] >= (after_thres)) + ) + df_good_hits = df_mim[requirements] + + insert_lengths = df_good_hits["insert_len"].value_counts() + + if len(insert_lengths) == 0: + median_insert_length = None + insert_lengths_histogram = {} + else: + median_insert_length = df_good_hits["insert_len"].median() + insert_lengths_histogram = insert_lengths[ + sorted(insert_lengths.index) + ].to_dict() + else: + m_re_cs = r"^cs:Z::([1-9][0-9]*)$" + df_good_hits = df[df.cg.str.match(m_re_cs)] + match_col_df = df_good_hits.cg.str.extract(m_re_cs).rename( + {0: "match_1_len"}, axis=1 + ) + match_col_df = match_col_df.astype({"match_1_len": "int32"}) + + df_good_hits.loc[match_col_df.index, match_col_df.columns] = ( + match_col_df + ) + + thres = round(adaptor_end.len_constant * good_hit_threshold) + df_good_hits = df_good_hits[df_good_hits["match_1_len"] >= thres] + + median_insert_length = None + insert_lengths_histogram = {} + insert_lengths = None + + # Filter 
multiple hits per read and adaptor end + df_good_hits = df_good_hits.sort_values( + by=["read_name", "match_1_len"], ascending=False + ).drop_duplicates(subset=["read_name", "adapter_name"], keep="first") + + if adaptor.name not in entries.keys(): + entries[adaptor.name] = {} + entries[adaptor.name][adaptor_end_name] = df_good_hits + + nr_good_hits[adaptor_end_name] = len(df_good_hits) + log.info( + f"{adaptor.name}:{adaptor_end_name} had {len(df_good_hits)} good hits." + ) + + # Collect data for the results dictionary + adaptor_data[adaptor_end_name] = { + "nr_good_hits": len(df_good_hits), + "index_length": median_insert_length, + "insert_lengths_histogram": insert_lengths_histogram, + "relative_entropy": {}, + } + + if min(nr_good_hits.values()) >= min_hits_per_adaptor: + log.info(f"Adaptor {adaptor.name} is included in the analysis") + results["included_adaptors"][adaptor.name] = adaptor_data + adaptors_included.append(adaptor) + else: + results["excluded_adaptors"][adaptor.name] = adaptor_data + log.info(f"Adaptor {adaptor.name} is excluded from the analysis") + + return results, adaptors_included, entries, umi_threshold, kmer_length, outdir + + +def setup_explore_directory(outdir: str, use_existing: bool): + # Setup a run directory + + try: + os.mkdir(outdir) + except FileExistsError: + log.info(f"Output directory {outdir} already exists") + if not use_existing: + log.error( + f"Output directory {outdir} already exists, please use --use_existing to continue" + ) + exit(1) + else: + pass + + +def run_multiple_alignments( + fastq: str, + outdir: str, + threads: int, + use_existing: bool, + adaptors: list[Adaptor], + minimap_b: int, +) -> list[tuple[Adaptor, str]]: + adaptors_and_aln_paths: list[tuple[Adaptor, str]] = [] + + # Map all reads against all adaptors + for adaptor in adaptors: + # Align + aln_path = os.path.join(outdir, f"{adaptor.name}.paf") + adaptors_and_aln_paths.append((adaptor, aln_path)) + if os.path.exists(aln_path) and use_existing: + log.info(f"Skipping {adaptor.name} as alignment already exists") + continue + adaptor_path = os.path.join(outdir, f"{adaptor.name}.fasta") + with open(adaptor_path, "w") as f: + f.write(adaptor.get_fastastring(insert_Ns=False)) + + log.info(f"Aligning {adaptor.name}") + run_minimap2( + fastq_in=fastq, + index_file=adaptor_path, + output_paf=aln_path, + threads=threads, + minimap_b=minimap_b, + ) + return adaptors_and_aln_paths + + def check_for_umis( results, adaptors_included, entries, umi_threshold, kmer_length, outdir ): # Traverse the adaptors to include and check whether they need UMI entropy analysis for adaptor in adaptors_included: - adaptor_data = results["included_adapters"][adaptor.name] + adaptor_data = results["included_adaptors"][adaptor.name] for adaptor_end_name, adaptor_end in zip( ["i5", "i7"], [adaptor.i5, adaptor.i7] diff --git a/tests/test_explore/test_explore.py b/tests/test_explore/test_explore.py index a5d2e94..5993a67 100644 --- a/tests/test_explore/test_explore.py +++ b/tests/test_explore/test_explore.py @@ -1,9 +1,15 @@ +import gzip import os +import tempfile import pytest from anglerfish.demux.adaptor import Adaptor -from anglerfish.explore.explore import run_multiple_alignments +from anglerfish.explore.explore import ( + _run_explore, + run_multiple_alignments, + setup_explore_directory, +) @pytest.fixture(scope="module") @@ -29,7 +35,63 @@ def explore_fixture(request): "i7": "GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCTCGTATGCCGTCTTCTGCTTG", }, } + + # Read with perfect match to illumina ud i7 and i5 adaptors + 
matching_read_seq = "".join( + [ + "TTATGCTAACTCATTCAGCGTATTGCTAATGATACGGCGACCACCGAGATCTACAC", # i5 before index + "GCGTAAGA", # i5 index + "TCGTCGGCAGCGTC", # i5 after index + "CGGAAAATGTGGAAGTGACTTTGGAACTGGGTGCAGGAAAGACTGAAACAGTTTAGAGGGCGAAGAGAACGCGAAAAATATGGTAAGTTTGGAAAAGATCGGACGTCGTGAGAAAAGAGTGTAAAAGATCAGATGTGTAGATCTCA", # Real read + "CCGAGCCCACGAGAC", # rev. comp i7 before index + "TCCTGAGC", # rev. comp i7 index + "ATCTCGTATGCCGTCTTCTGCTTGAGCAATACGTT", # rev. comp i7 after index + ] + ) + + matching_read = ( + "\n".join( + [ + "@matching_read", + matching_read_seq, + "+", + "%" * len(matching_read_seq), + ] + ) + + "\n" + ) + + # Read with perfect match to illumina_ud, i5 only, slight mismatch i7 + i5_only_read_seq = "".join( + [ + "TTATGCTAACTCATTCAGCGTATTGCTAATGATACGGCGACCACCGAGATCTACAC", # i5 before index + "GCGTAAGA", # i5 index + "TCGTCGGCAGCGTC", # i5 after index + "CGGAAAATGTGGAAGTGACTTTGGAACTGGGTGCAGGAAAGACTGAAACAGTTTAGAGGGCGAAGAGAACGCGAAAAATATGGTAAGTTTGGAAAAGATCGGACGTCGTGAGAAAAGAGTGTAAAAGATCAGATGTGTAGATCTCA", # Real read + "CAGACCCCACGAGAC", # rev. comp i7 before index with 2 mismatches + "TCCTGAGC", # rev. comp i7 index + "ATCTCGTATGCCGTCTTCTGCTTGAGCAATACGTT", # rev. comp i7 after index + ] + ) + i5_only_read = ( + "\n".join( + [ + "@half_matching_read", + i5_only_read_seq, + "+", + "%" * len(i5_only_read_seq), + ] + ) + + "\n" + ) demux_fixture = request.getfixturevalue("demux_fixture") + tmp_path = demux_fixture["tmp_path"] + demux_fixture["explore_reads"] = tmp_path / "testdata_explore.fastq.gz" + + with gzip.open(tmp_path / "testdata_explore.fastq.gz", "ab") as f: + f.write(matching_read.encode()) + f.write(i5_only_read.encode()) + fixture = demux_fixture fixture["adaptors"] = [ Adaptor( @@ -42,11 +104,39 @@ def explore_fixture(request): return fixture +def test_setup_explore_directory(): + tmp_dir = tempfile.TemporaryDirectory() + dir_to_be_created = os.path.join(tmp_dir.name, "test_dir") + setup_explore_directory(dir_to_be_created, False) + assert os.path.exists(dir_to_be_created) + tmp_dir.cleanup() + + +def test_setup_explore_directory_use_existing(): + tmp_dir = tempfile.TemporaryDirectory() + dir_to_be_created = os.path.join(tmp_dir.name, "test_dir") + os.makedirs(dir_to_be_created) + # This method should run exit(1) if the directory already exists + with pytest.raises(SystemExit): + setup_explore_directory(dir_to_be_created, False) + + # save modification time of the directory + dir_time = os.path.getmtime(dir_to_be_created) + setup_explore_directory(dir_to_be_created, True) + # Check that the directory was not recreated + assert dir_time == os.path.getmtime(dir_to_be_created) + + tmp_dir.cleanup() + + def test_run_multiple_alignments(explore_fixture): # Test a basic run + tmp_dir = tempfile.TemporaryDirectory() + tmp_path = tmp_dir.name + return_list = run_multiple_alignments( fastq=explore_fixture["testdata_single"], - outdir=explore_fixture["tmp_path"], + outdir=tmp_path, threads=1, use_existing=False, adaptors=explore_fixture["adaptors"], @@ -68,7 +158,7 @@ def test_run_multiple_alignments(explore_fixture): return_list = run_multiple_alignments( fastq=explore_fixture["testdata_single"], - outdir=explore_fixture["tmp_path"], + outdir=tmp_path, threads=1, use_existing=True, adaptors=explore_fixture["adaptors"], @@ -84,11 +174,51 @@ def test_run_multiple_alignments(explore_fixture): return_list = run_multiple_alignments( fastq=explore_fixture["testdata_single"], - outdir=explore_fixture["tmp_path"], + outdir=tmp_path, threads=1, use_existing=False, 
adaptors=explore_fixture["adaptors"], minimap_b=1, ) + # Check that the alignment file was rerun assert alignment_time != os.path.getmtime(alignment_file) + + tmp_dir.cleanup() + + +def test_run_explore_functional_test(explore_fixture): + """Test overall function of the explore command.""" + tmp_dir = tempfile.TemporaryDirectory() + tmp_path = os.path.join(tmp_dir.name, "outdir") + + # Running with 2 reads, one perfect match to illumina_ud, one partial match to illumina_ud + results, adaptors_included, entries, umi_threshold, kmer_length, outdir = ( + _run_explore( + fastq=explore_fixture["explore_reads"], + outdir=tmp_path, + threads=1, + use_existing=False, + good_hit_threshold=0.9, + insert_thres_low=4, + insert_thres_high=30, + minimap_b=4, + min_hits_per_adaptor=1, # Not default + umi_threshold=11, + kmer_length=2, + ) + ) + + assert results + + # Make sure one read matches both and one read matches only i5 + assert list(results["included_adaptors"].keys()) == ["illumina_ud"] + assert results["included_adaptors"]["illumina_ud"]["i5"]["nr_good_hits"] == 2 + assert results["included_adaptors"]["illumina_ud"]["i7"]["nr_good_hits"] == 1 + + # Correct index length found + assert results["included_adaptors"]["illumina_ud"]["i5"]["index_length"] == 8 + assert results["included_adaptors"]["illumina_ud"]["i7"]["index_length"] == 8 + + assert len(adaptors_included) == 1 + assert adaptors_included[0].name == "illumina_ud" From fe0194351f611a06780a784f9624b61fe1183d86 Mon Sep 17 00:00:00 2001 From: Johannes Alneberg Date: Mon, 16 Sep 2024 14:30:27 +0200 Subject: [PATCH 5/5] renamed _run_explore --- anglerfish/explore/explore.py | 4 ++-- tests/test_explore/test_explore.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/anglerfish/explore/explore.py b/anglerfish/explore/explore.py index ae7b684..b4f2470 100644 --- a/anglerfish/explore/explore.py +++ b/anglerfish/explore/explore.py @@ -28,7 +28,7 @@ def run_explore( kmer_length: int, ): results, adaptors_included, entries, umi_threshold, kmer_length, outdir = ( - _run_explore( + get_explore_results( fastq, outdir, threads, @@ -55,7 +55,7 @@ def run_explore( log.info(f"Results saved to {explore_stats_file}") -def _run_explore( +def get_explore_results( fastq: str, outdir: str, threads: int, diff --git a/tests/test_explore/test_explore.py b/tests/test_explore/test_explore.py index 5993a67..45a4933 100644 --- a/tests/test_explore/test_explore.py +++ b/tests/test_explore/test_explore.py @@ -6,7 +6,7 @@ from anglerfish.demux.adaptor import Adaptor from anglerfish.explore.explore import ( - _run_explore, + get_explore_results, run_multiple_alignments, setup_explore_directory, ) @@ -187,14 +187,14 @@ def test_run_multiple_alignments(explore_fixture): tmp_dir.cleanup() -def test_run_explore_functional_test(explore_fixture): +def test_get_explore_results_functional_test(explore_fixture): """Test overall function of the explore command.""" tmp_dir = tempfile.TemporaryDirectory() tmp_path = os.path.join(tmp_dir.name, "outdir") # Running with 2 reads, one perfect match to illumina_ud, one partial match to illumina_ud results, adaptors_included, entries, umi_threshold, kmer_length, outdir = ( - _run_explore( + get_explore_results( fastq=explore_fixture["explore_reads"], outdir=tmp_path, threads=1,
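Taken together, the series leaves anglerfish/explore/explore.py with three reusable pieces (setup_explore_directory, run_multiple_alignments and get_explore_results, the renamed _run_explore) plus check_for_umis for the UMI entropy step, with run_explore acting as the thin wrapper that writes anglerfish_explore_stats.json. Below is a minimal usage sketch of how those pieces compose once the whole series is applied; the input path and parameter values are illustrative assumptions, not defaults taken from the anglerfish CLI.

    import json
    import os

    from anglerfish.explore.explore import check_for_umis, get_explore_results

    # Illustrative parameter values; the real CLI supplies these via its argument parser.
    results, adaptors_included, entries, umi_threshold, kmer_length, outdir = (
        get_explore_results(
            fastq="reads.fastq.gz",  # assumed input file, not part of the patch series
            outdir="explore_out",
            threads=4,
            use_existing=False,
            good_hit_threshold=0.9,
            insert_thres_low=4,
            insert_thres_high=30,
            minimap_b=4,
            min_hits_per_adaptor=50,
            umi_threshold=11,
            kmer_length=2,
        )
    )

    # Append relative-entropy results for the included adaptors, then write the
    # JSON report, mirroring what run_explore does internally.
    results = check_for_umis(
        results, adaptors_included, entries, umi_threshold, kmer_length, outdir
    )
    with open(os.path.join(outdir, "anglerfish_explore_stats.json"), "w") as f:
        json.dump(results, f, indent=4)

    # Inspect which adaptor types survived the min_hits_per_adaptor filter.
    print(list(results["included_adaptors"].keys()))

The sketch follows the same call order as run_explore and test_get_explore_results_functional_test; only the literal values and the fastq path are invented for illustration.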