From c22c0baef2560cd7d36df65187080c244ba2cff9 Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Tue, 16 Jan 2024 09:11:47 +0100
Subject: [PATCH 01/23] added action for normalization

---
 q2_amr/card/normalization.py | 23 +++++++++++++++++++++++
 q2_amr/plugin_setup.py       | 20 ++++++++++++++++++--
 2 files changed, 41 insertions(+), 2 deletions(-)
 create mode 100644 q2_amr/card/normalization.py

diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py
new file mode 100644
index 0000000..ac6435a
--- /dev/null
+++ b/q2_amr/card/normalization.py
@@ -0,0 +1,23 @@
+from typing import Union
+
+import biom
+import pandas as pd
+from q2_types.per_sample_sequences import (
+    SingleLanePerSamplePairedEndFastqDirFmt,
+    SingleLanePerSampleSingleEndFastqDirFmt,
+)
+
+
+def normalize(
+    reads: Union[
+        SingleLanePerSamplePairedEndFastqDirFmt, SingleLanePerSampleSingleEndFastqDirFmt
+    ],
+    table: biom.Table,
+) -> pd.DataFrame:
+    # manifest = reads.manifest.view(pd.DataFrame)
+    # #paired = isinstance(reads, SingleLanePerSamplePairedEndFastqDirFmt)
+    # for samp in list(manifest.index):
+    #     # fwd = manifest.loc[samp, "forward"]
+    #     # rev = manifest.loc[samp, "reverse"] if paired else None
+    normalized_table = pd.DataFrame(table)
+    return normalized_table
diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py
index 84efb88..ef62a67 100644
--- a/q2_amr/plugin_setup.py
+++ b/q2_amr/plugin_setup.py
@@ -7,7 +7,7 @@
 # ----------------------------------------------------------------------------
 import importlib
 
-from q2_types.feature_table import FeatureTable, PresenceAbsence
+from q2_types.feature_table import FeatureTable, Frequency, PresenceAbsence
 from q2_types.per_sample_sequences import (
     PairedEndSequencesWithQuality,
     SequencesWithQuality,
@@ -21,6 +21,7 @@
 from q2_amr.card.database import fetch_card_db
 from q2_amr.card.heatmap import heatmap
 from q2_amr.card.mags import annotate_mags_card
+from q2_amr.card.normalization import normalize
 from q2_amr.card.reads import annotate_reads_card, visualize_annotation_stats
 from q2_amr.types import (
     CARDAnnotationJSONFormat,
@@ -104,7 +105,6 @@
     citations=[citations["alcock_card_2023"]],
 )
 
-
 plugin.methods.register_function(
     function=annotate_reads_card,
     inputs={
@@ -177,6 +177,22 @@
     citations=[citations["alcock_card_2023"]],
 )
 
+plugin.methods.register_function(
+    function=normalize,
+    inputs={
+        "table": FeatureTable[Frequency],
+        "reads": SampleData[PairedEndSequencesWithQuality | SequencesWithQuality],
+    },
+    parameters={},
+    outputs=[("normalized_table", FeatureTable[Frequency])],
+    input_descriptions={"table": "The ", "reads": "The "},
+    parameter_descriptions={},
+    output_descriptions={"normalized_table": "hello"},
+    name="Shannon's Entropy",
+    description="Compute Shannon's Entropy for each sample in a ",
+    citations=[],
+)
+
 # Registrations
 plugin.register_semantic_types(
     CARDDatabase, CARDAnnotation, CARDAlleleAnnotation, CARDGeneAnnotation

From 926f8761ad6a1cc534d5679776b33900ac122513 Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Thu, 25 Jan 2024 09:59:29 +0100
Subject: [PATCH 02/23] added tpm normalize with cli integration

---
 q2_amr/card/normalization.py            | 90 ++++++++++++++++++++++---
 q2_amr/card/tests/test_normalization.py | 11 +++
 q2_amr/card/utils.py                    |  8 ++-
 q2_amr/plugin_setup.py                  | 25 +++++--
 4 files changed, 117 insertions(+), 17 deletions(-)
 create mode 100644 q2_amr/card/tests/test_normalization.py

diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py
index ac6435a..380171e 100644
--- a/q2_amr/card/normalization.py
+++ b/q2_amr/card/normalization.py
@@ -1,17 +1,16 @@
-from typing import Union
+import os
+import subprocess
+import tempfile
 
 import biom
 import pandas as pd
-from q2_types.per_sample_sequences import (
-    SingleLanePerSamplePairedEndFastqDirFmt,
-    SingleLanePerSampleSingleEndFastqDirFmt,
-)
+from pydeseq2.dds import DeseqDataSet
 
+from q2_amr.card.utils import run_command
+from q2_amr.types import CARDAlleleAnnotationDirectoryFormat
 
-def normalize(
-    reads: Union[
-        SingleLanePerSamplePairedEndFastqDirFmt, SingleLanePerSampleSingleEndFastqDirFmt
-    ],
+
+def normalize_mor(
     table: biom.Table,
 ) -> pd.DataFrame:
     # manifest = reads.manifest.view(pd.DataFrame)
@@ -19,5 +18,76 @@ def normalize(
     # for samp in list(manifest.index):
     #     # fwd = manifest.loc[samp, "forward"]
     #     # rev = manifest.loc[samp, "reverse"] if paired else None
-    normalized_table = pd.DataFrame(table)
+    data = table.matrix_data.toarray()
+    columns = table.ids(axis="sample")
+    index = table.ids(axis="observation")
+    df = pd.DataFrame(data, index=index, columns=columns)
+    df = df.T
+    metadata = df.iloc[:, 0].to_frame()
+    metadata.iloc[0, 0] = "a"
+    original_columns = metadata.columns
+
+    metadata.columns = ["condition"] + list(original_columns[1:])
+    dds = DeseqDataSet(counts=df, metadata=metadata)
+    dds.fit_size_factors()
+
+    # normalizedcounts = dds.obsm["size_factors"]
+    return df
+
+
+def normalize_tpm(
+    table: biom.Table, amr_annotations: CARDAlleleAnnotationDirectoryFormat
+) -> pd.DataFrame:
+    # Initialize an empty list to store individual DataFrames
+    len_list = []
+
+    # Iterate over samples in the specified path
+    for samp in os.listdir(amr_annotations.path):
+        anno_txt = os.path.join(amr_annotations.path, samp, "allele_mapping_data.txt")
+
+        # Read each DataFrame and append it to the list
+        len_sample = pd.read_csv(
+            anno_txt, sep="\t", usecols=["Reference Sequence", "Reference Length"]
+        )
+        # index_col="Reference Sequence")
+
+        len_list.append(len_sample)
+
+    # Concatenate all DataFrames into a single DataFrame and remove duplicate rows
+    len_df = pd.concat(len_list)
+    len_df.drop_duplicates(inplace=True)
+    len_df.columns = ["gene_id", "gene_length"]
+
+    counts_df = table.to_dataframe()
+    counts_df = counts_df.T
+
+    with tempfile.TemporaryDirectory() as tmp:
+        len = os.path.join(tmp, "len.csv")
+        counts = os.path.join(tmp, "counts.csv")
+        len_df.to_csv(len, index=False)
+        counts_df.to_csv(counts, index=True)
+
+        run_rnanorm_tpm(cwd=tmp, len=len, counts=counts)
+        normalized_table = pd.read_csv(
+            os.path.join(tmp, "out.csv"),
+            index_col=0,
+        )
+        normalized_table.index.name = "sample_id"
     return normalized_table
+
+
+def run_rnanorm_tpm(
+    cwd: str,
+    len: str,
+    counts: str,
+):
+    cmd = ["rnanorm", "tpm", counts, "--gene-lengths", len, "--out", f"{cwd}/out.csv"]
+
+    try:
+        run_command(cmd, cwd, verbose=True, shell=True)
+    except subprocess.CalledProcessError as e:
+        raise Exception(
+            "An error was encountered while running RNAnorm, "
+            f"(return code {e.returncode}), please inspect "
+            "stdout and stderr to learn more."
+        )
diff --git a/q2_amr/card/tests/test_normalization.py b/q2_amr/card/tests/test_normalization.py
new file mode 100644
index 0000000..1999eb9
--- /dev/null
+++ b/q2_amr/card/tests/test_normalization.py
@@ -0,0 +1,11 @@
+# from qiime2.plugin.testing import TestPluginBase
+#
+#
+# class TestAnnotateReadsCARD(TestPluginBase):
+#     package = "q2_amr.card.tests"
+#
+#     @classmethod
+#     def setUpClass(cls):
+#
+#     def test_normalize_MOR(self):
+#         biom = self.get_data_path()
diff --git a/q2_amr/card/utils.py b/q2_amr/card/utils.py
index 201f2ab..c91e543 100644
--- a/q2_amr/card/utils.py
+++ b/q2_amr/card/utils.py
@@ -15,7 +15,13 @@
 )
 
 
-def run_command(cmd, cwd, verbose=True):
+def run_command(
+    cmd,
+    cwd,
+    verbose=True,
+    shell=False,
+    stdout=None,
+):
     if verbose:
         print(EXTERNAL_CMD_WARNING)
         print("\nCommand:", end=" ")
diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py
index 43b30cc..671e526 100644
--- a/q2_amr/plugin_setup.py
+++ b/q2_amr/plugin_setup.py
@@ -21,7 +21,7 @@
 from q2_amr.card.database import fetch_card_db
 from q2_amr.card.heatmap import heatmap
 from q2_amr.card.mags import annotate_mags_card
-from q2_amr.card.normalization import normalize
+from q2_amr.card.normalization import normalize_mor, normalize_tpm
 from q2_amr.card.reads import annotate_reads_card
 from q2_amr.types import (
     CARDAnnotationJSONFormat,
@@ -216,18 +216,31 @@
 )
 
 plugin.methods.register_function(
-    function=normalize,
+    function=normalize_mor,
+    inputs={"table": FeatureTable[Frequency]},
+    parameters={},
+    outputs=[("normalized_table", FeatureTable[Frequency])],
+    input_descriptions={"table": "FeatureTable"},
+    parameter_descriptions={},
+    output_descriptions={"normalized_table": "hello"},
+    name="",
+    description="",
+    citations=[],
+)
+
+plugin.methods.register_function(
+    function=normalize_tpm,
     inputs={
         "table": FeatureTable[Frequency],
-        "reads": SampleData[PairedEndSequencesWithQuality | SequencesWithQuality],
+        "amr_annotations": SampleData[CARDAlleleAnnotation],
     },
     parameters={},
     outputs=[("normalized_table", FeatureTable[Frequency])],
-    input_descriptions={"table": "The ", "reads": "The "},
+    input_descriptions={"table": "FeatureTable"},
     parameter_descriptions={},
     output_descriptions={"normalized_table": "hello"},
-    name="Shannon's Entropy",
-    description="Compute Shannon's Entropy for each sample in a ",
+    name="",
+    description="",
     citations=[],
 )
 

From e713e5e095975162d4d7b9e741e3d3cb9acedfce Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Fri, 26 Jan 2024 14:16:33 +0100
Subject: [PATCH 03/23] adding tpm python implementation

---
 q2_amr/card/normalization.py | 63 +++++++++---------------------------
 1 file changed, 15 insertions(+), 48 deletions(-)

diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py
index 380171e..c1c7156 100644
--- a/q2_amr/card/normalization.py
+++ b/q2_amr/card/normalization.py
@@ -1,12 +1,10 @@
 import os
-import subprocess
-import tempfile
 
 import biom
 import pandas as pd
 from pydeseq2.dds import DeseqDataSet
+from rnanorm import TPM
 
-from q2_amr.card.utils import run_command
 from q2_amr.types import CARDAlleleAnnotationDirectoryFormat
 
 
@@ -38,8 +36,8 @@ def normalize_mor(
 def normalize_tpm(
     table: biom.Table, amr_annotations: CARDAlleleAnnotationDirectoryFormat
 ) -> pd.DataFrame:
-    # Initialize an empty list to store individual DataFrames
-    len_list = []
+    # Initialize an empty series for gene lengths
+    len_all = pd.Series()
 
     # Iterate over samples in the specified path
     for samp in os.listdir(amr_annotations.path):
@@ -48,46 +46,15 @@ def normalize_tpm(
         # Read each DataFrame and append it to the list
         len_sample = pd.read_csv(
             anno_txt, sep="\t", usecols=["Reference Sequence", "Reference Length"]
-        )
-        # index_col="Reference Sequence")
-
-        len_list.append(len_sample)
-
-    # Concatenate all DataFrames into a single DataFrame and remove duplicate rows
-    len_df = pd.concat(len_list)
-    len_df.drop_duplicates(inplace=True)
-    len_df.columns = ["gene_id", "gene_length"]
-
-    counts_df = table.to_dataframe()
-    counts_df = counts_df.T
-
-    with tempfile.TemporaryDirectory() as tmp:
-        len = os.path.join(tmp, "len.csv")
-        counts = os.path.join(tmp, "counts.csv")
-        len_df.to_csv(len, index=False)
-        counts_df.to_csv(counts, index=True)
-
-        run_rnanorm_tpm(cwd=tmp, len=len, counts=counts)
-        normalized_table = pd.read_csv(
-            os.path.join(tmp, "out.csv"),
-            index_col=0,
-        )
-        normalized_table.index.name = "sample_id"
-    return normalized_table
-
-
-def run_rnanorm_tpm(
-    cwd: str,
-    len: str,
-    counts: str,
-):
-    cmd = ["rnanorm", "tpm", counts, "--gene-lengths", len, "--out", f"{cwd}/out.csv"]
-
-    try:
-        run_command(cmd, cwd, verbose=True, shell=True)
-    except subprocess.CalledProcessError as e:
-        raise Exception(
-            "An error was encountered while running RNAnorm, "
-            f"(return code {e.returncode}), please inspect "
-            "stdout and stderr to learn more."
-        )
+        ).set_index("Reference Sequence")["Reference Length"]
+
+        len_all = len_all.combine_first(len_sample)
+    len_all = len_all
+    # len_all = len_all.values.reshape(-1, 1)
+    counts_arr = table.matrix_data.toarray().T
+    # counts_arr = counts_arr.T
+    # len_all = len_all.reshape(-1, 1)
+    tpm_normalizer = TPM(gene_lengths=len_all).set_output(transform="pandas")
+    tpm_df = tpm_normalizer.fit_transform(counts_arr)
+    tpm_df.index.name = "sample_id"
+    return tpm_df

From 358ba73fa172f0eeb69649f6112467a3149b0b38 Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Thu, 1 Feb 2024 10:48:50 +0100
Subject: [PATCH 04/23] added tpm python integration

---
 q2_amr/card/normalization.py | 65 +++++++++++++++++++++---------------
 1 file changed, 38 insertions(+), 27 deletions(-)

diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py
index c1c7156..3305c2c 100644
--- a/q2_amr/card/normalization.py
+++ b/q2_amr/card/normalization.py
@@ -1,8 +1,5 @@
-import os
-
 import biom
 import pandas as pd
-from pydeseq2.dds import DeseqDataSet
 from rnanorm import TPM
 
 from q2_amr.types import CARDAlleleAnnotationDirectoryFormat
@@ -26,8 +23,8 @@ def normalize_mor(
     original_columns = metadata.columns
 
     metadata.columns = ["condition"] + list(original_columns[1:])
-    dds = DeseqDataSet(counts=df, metadata=metadata)
-    dds.fit_size_factors()
+    # dds = DeseqDataSet(counts=df, metadata=metadata)
+    # dds.fit_size_factors()
 
     # normalizedcounts = dds.obsm["size_factors"]
     return df
@@ -36,25 +33,39 @@ def normalize_mor(
 def normalize_tpm(
     table: biom.Table, amr_annotations: CARDAlleleAnnotationDirectoryFormat
 ) -> pd.DataFrame:
-    # Initialize an empty series for gene lengths
-    len_all = pd.Series()
-
-    # Iterate over samples in the specified path
-    for samp in os.listdir(amr_annotations.path):
-        anno_txt = os.path.join(amr_annotations.path, samp, "allele_mapping_data.txt")
-
-        # Read each DataFrame and append it to the list
-        len_sample = pd.read_csv(
-            anno_txt, sep="\t", usecols=["Reference Sequence", "Reference Length"]
-        ).set_index("Reference Sequence")["Reference Length"]
-
-        len_all = len_all.combine_first(len_sample)
-    len_all = len_all
-    # len_all = len_all.values.reshape(-1, 1)
-    counts_arr = table.matrix_data.toarray().T
-    # counts_arr = counts_arr.T
-    # len_all = len_all.reshape(-1, 1)
-    tpm_normalizer = TPM(gene_lengths=len_all).set_output(transform="pandas")
-    tpm_df = tpm_normalizer.fit_transform(counts_arr)
-    tpm_df.index.name = "sample_id"
-    return tpm_df
+    # # Initialize an empty series for gene lengths
+    # len_all = pd.Series()
+    #
+    # # Iterate over samples in the specified path
+    # for samp in os.listdir(amr_annotations.path):
+    #     anno_txt = os.path.join(amr_annotations.path, samp, "allele_mapping_data.txt")
+    #
+    #     # Read each DataFrame and append it to the list
+    #     len_sample = pd.read_csv(
+    #         anno_txt, sep="\t", usecols=["Reference Sequence", "Reference Length"]
+    #     ).set_index("Reference Sequence")["Reference Length"]
+    #
+    #     len_all = len_all.combine_first(len_sample)
+    len_all = pd.read_csv(
+        "/Users/rischv/Desktop/genelengts.txt",
+        sep="\t",
+        header=None,
+        names=["index", "values"],
+        index_col="index",
+        squeeze=True,
+    )
+    len_all = len_all.astype(int)
+    df = pd.DataFrame(
+        data=table.matrix_data.toarray(),
+        index=table.ids(axis="observation"),
+        columns=table.ids(axis="sample"),
+    ).T
+    # Work with pandas.
+    # len_all.to_csv("/Users/rischv/Desktop/genelengts.txt", sep='\t', index=True,
+    # header=False)
+
+    transformer = TPM(gene_lengths=len_all).set_output(transform="pandas")
+    exp_normalized = transformer.fit_transform(df)
+
+    exp_normalized.index.name = "sample_id"
+    return exp_normalized

From 315e5a1a2ce36efe37aba8017da3009a5aab16b4 Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Thu, 1 Feb 2024 14:02:28 +0100
Subject: [PATCH 05/23] added GeneLength type

---
 q2_amr/plugin_setup.py                        |  8 ++++++
 q2_amr/types/__init__.py                      |  4 +++
 q2_amr/types/_format.py                       | 22 +++++++++++++++
 q2_amr/types/_type.py                         |  1 +
 q2_amr/types/tests/data/gene_length.txt       |  2 ++
 .../types/tests/data/gene_length_3_cols.txt   |  2 ++
 .../types/tests/data/gene_length_switched.txt |  2 ++
 .../tests/test_types_formats_transformers.py  | 28 +++++++++++++++++++
 8 files changed, 69 insertions(+)
 create mode 100644 q2_amr/types/tests/data/gene_length.txt
 create mode 100644 q2_amr/types/tests/data/gene_length_3_cols.txt
 create mode 100644 q2_amr/types/tests/data/gene_length_switched.txt

diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py
index cb22126..45d0e3c 100644
--- a/q2_amr/plugin_setup.py
+++ b/q2_amr/plugin_setup.py
@@ -41,12 +41,15 @@
     CARDKmerTXTFormat,
     CARDWildcardIndexFormat,
     GapDNAFASTAFormat,
+    GeneLengthDirectoryFormat,
+    GeneLengthFormat,
 )
 from q2_amr.types._type import (
     CARDAlleleAnnotation,
     CARDAnnotation,
     CARDGeneAnnotation,
     CARDKmerDatabase,
+    GeneLength,
 )
 
 citations = Citations.load("citations.bib", package="q2_amr")
@@ -239,6 +242,9 @@
 plugin.register_semantic_type_to_format(
     SampleData[CARDGeneAnnotation], artifact_format=CARDGeneAnnotationDirectoryFormat
 )
+plugin.register_semantic_type_to_format(
+    GeneLength, artifact_format=GeneLengthDirectoryFormat
+)
 
 plugin.register_formats(
     CARDKmerDatabaseDirectoryFormat,
@@ -256,6 +262,8 @@
     CARDAnnotationStatsFormat,
     CARDAlleleAnnotationDirectoryFormat,
     CARDGeneAnnotationDirectoryFormat,
+    GeneLengthFormat,
+    GeneLengthDirectoryFormat,
 )
 
 importlib.import_module("q2_amr.types._transformer")
diff --git a/q2_amr/types/__init__.py b/q2_amr/types/__init__.py
index 7f5d25d..94f3f12 100644
--- a/q2_amr/types/__init__.py
+++ b/q2_amr/types/__init__.py
@@ -22,6 +22,8 @@
     CARDKmerTXTFormat,
     CARDWildcardIndexFormat,
     GapDNAFASTAFormat,
+    GeneLengthDirectoryFormat,
+    GeneLengthFormat,
 )
 from ._type import (
     CARDAlleleAnnotation,
@@ -52,4 +54,6 @@
     "GapDNAFASTAFormat",
     "CARDWildcardIndexFormat",
     "CARDKmerDatabase",
+    "GeneLengthDirectoryFormat",
+    "GeneLengthFormat",
 ]
diff --git a/q2_amr/types/_format.py b/q2_amr/types/_format.py
index de8c44a..7ee2eca 100644
--- a/q2_amr/types/_format.py
+++ b/q2_amr/types/_format.py
@@ -425,3 +425,25 @@ class CARDGeneAnnotationDirectoryFormat(MultiDirValidationMixin, model.Directory
     @gene.set_path_maker
     def gene_path_maker(self, sample_id):
         return "%s/gene_mapping_data.txt" % sample_id
+
+
+class GeneLengthFormat(model.TextFileFormat):
+    def _validate(self, n_records=None):
+        df = pd.read_csv(str(self), sep="\t", header=None)
+        has_two_columns = len(df.columns) == 2
+        lengths_num = df.iloc[:, 1].apply(lambda x: isinstance(x, (float, int))).all()
+        if not (has_two_columns and lengths_num):
+            raise ValidationError(
+                "Format does not match GeneLengthsFormat. Must consist of tab "
+                "separated values in two columns with no header. The first column must "
+                "be the gene names and the second column must be the corresponding "
+                "gene lengths."
+            )
+
+    def _validate_(self, level):
+        self._validate()
+
+
+GeneLengthDirectoryFormat = model.SingleFileDirectoryFormat(
+    "GeneLengthDirectoryFormat", "gene_length.txt", GeneLengthFormat
+)
diff --git a/q2_amr/types/_type.py b/q2_amr/types/_type.py
index aa2f302..bee4195 100644
--- a/q2_amr/types/_type.py
+++ b/q2_amr/types/_type.py
@@ -17,3 +17,4 @@
 CARDGeneAnnotation = SemanticType(
     "CARDGeneAnnotation", variant_of=SampleData.field["type"]
 )
+GeneLength = SemanticType("GeneLength")
diff --git a/q2_amr/types/tests/data/gene_length.txt b/q2_amr/types/tests/data/gene_length.txt
new file mode 100644
index 0000000..2fddb36
--- /dev/null
+++ b/q2_amr/types/tests/data/gene_length.txt
@@ -0,0 +1,2 @@
+ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1	1356.0
+ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1	1173.0
diff --git a/q2_amr/types/tests/data/gene_length_3_cols.txt b/q2_amr/types/tests/data/gene_length_3_cols.txt
new file mode 100644
index 0000000..a1803bb
--- /dev/null
+++ b/q2_amr/types/tests/data/gene_length_3_cols.txt
@@ -0,0 +1,2 @@
+ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1	1356.0	1
+ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1	1173.0	1
diff --git a/q2_amr/types/tests/data/gene_length_switched.txt b/q2_amr/types/tests/data/gene_length_switched.txt
new file mode 100644
index 0000000..cae5b65
--- /dev/null
+++ b/q2_amr/types/tests/data/gene_length_switched.txt
@@ -0,0 +1,2 @@
+1356.0	ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1
+1173.0	ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1
diff --git a/q2_amr/types/tests/test_types_formats_transformers.py b/q2_amr/types/tests/test_types_formats_transformers.py
index a594eaa..55ceb21 100644
--- a/q2_amr/types/tests/test_types_formats_transformers.py
+++ b/q2_amr/types/tests/test_types_formats_transformers.py
@@ -21,6 +21,7 @@
     ProteinIterator,
 )
 from q2_types_genomics.genome_data import GenesDirectoryFormat, ProteinsDirectoryFormat
+from qiime2.plugin import ValidationError
 from qiime2.plugin.testing import TestPluginBase
 from skbio import DNA, Protein
 
@@ -38,6 +39,8 @@
     CARDKmerTXTFormat,
     CARDWildcardIndexFormat,
     GapDNAFASTAFormat,
+    GeneLengthDirectoryFormat,
+    GeneLengthFormat,
 )
 from q2_amr.types._transformer import (
     _read_from_card_file,
@@ -328,3 +331,28 @@ def test_CARDAlleleAnnotationDirectoryFormat_to_qiime2_Metadata_transformer(self
         )
         metadata_obt = transformer(annotation)
         self.assertIsInstance(metadata_obt, qiime2.Metadata)
+
+
+class TestGeneLengthsTypesAndFormats(AMRTypesTestPluginBase):
+    def test_gene_length_format_validate_positive(self):
+        filepath = self.get_data_path("gene_length.txt")
+        format = GeneLengthFormat(filepath, mode="r")
+        format.validate()
+
+    def test_gene_length_format_validate_negative_3_cols(self):
+        filepath = self.get_data_path("gene_length_3_cols.txt")
+        with self.assertRaises(ValidationError):
+            format = GeneLengthFormat(filepath, mode="r")
+            format.validate()
+
+    def test_gene_length_format_validate_negative_switched(self):
+        filepath = self.get_data_path("gene_length_switched.txt")
+        with self.assertRaises(ValidationError):
+            format = GeneLengthFormat(filepath, mode="r")
+            format.validate()
+
+    def test_gene_length_directory_format_validate_positive(self):
+        filepath = self.get_data_path("gene_length.txt")
+        shutil.copy(filepath, os.path.join(self.temp_dir.name, "gene_length.txt"))
+        format = GeneLengthDirectoryFormat(self.temp_dir.name, mode="r")
+        format.validate()

From 8a6c46b3266944578593ff660f41aa77d4f65205 Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Thu, 1 Feb 2024 15:44:59 +0100
Subject: [PATCH 06/23] added transformer for GeneLength

---
 q2_amr/types/_transformer.py                  | 21 +++++++++++++++++++
 .../tests/test_types_formats_transformers.py  | 12 +++++++++++
 2 files changed, 33 insertions(+)

diff --git a/q2_amr/types/_transformer.py b/q2_amr/types/_transformer.py
index 6d08cf1..f6fa159 100644
--- a/q2_amr/types/_transformer.py
+++ b/q2_amr/types/_transformer.py
@@ -26,6 +26,7 @@
     CARDAnnotationTXTFormat,
     CARDDatabaseFormat,
     CARDGeneAnnotationDirectoryFormat,
+    GeneLengthDirectoryFormat,
 )
 
 
@@ -253,3 +254,23 @@ def tabulate_data(data_path, data_type):
     if data_type == "mags":
         df_combined.rename(columns={"ID": "HSP_Identifier"}, inplace=True)
     return qiime2.Metadata(df_combined)
+
+
+@plugin.register_transformer
+def _15(data: CARDAlleleAnnotationDirectoryFormat) -> GeneLengthDirectoryFormat:
+    len_all = pd.Series()
+    directory = GeneLengthDirectoryFormat()
+    # Iterate over samples in the specified path
+    for samp in os.listdir(data.path):
+        anno_txt = os.path.join(data.path, samp, "allele_mapping_data.txt")
+
+        # Read each DataFrame and append it to the list
+        len_sample = pd.read_csv(
+            anno_txt, sep="\t", usecols=["Reference Sequence", "Reference Length"]
+        ).set_index("Reference Sequence")["Reference Length"]
+
+        len_all = len_all.combine_first(len_sample)
+    len_all.to_csv(
+        os.path.join(directory.path, "gene_length.txt"), sep="\t", header=False
+    )
+    return directory
diff --git a/q2_amr/types/tests/test_types_formats_transformers.py b/q2_amr/types/tests/test_types_formats_transformers.py
index 55ceb21..e0de12b 100644
--- a/q2_amr/types/tests/test_types_formats_transformers.py
+++ b/q2_amr/types/tests/test_types_formats_transformers.py
@@ -356,3 +356,15 @@ def test_gene_length_directory_format_validate_positive(self):
         shutil.copy(filepath, os.path.join(self.temp_dir.name, "gene_length.txt"))
         format = GeneLengthDirectoryFormat(self.temp_dir.name, mode="r")
         format.validate()
+
+    def test_card_annotation_format_to_df_transformertt(self):
+        transformer = self.get_transformer(
+            CARDAlleleAnnotationDirectoryFormat, GeneLengthDirectoryFormat
+        )
+
+        annotation = CARDAlleleAnnotationDirectoryFormat(
+            self.get_data_path("annotate_reads_output"), "r"
+        )
+
+        obs = transformer(annotation)
+        self.assertIsInstance(obs, GeneLengthDirectoryFormat)

From 942cd8bced550afdaba03a0544b99bb9065d6207 Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Thu, 1 Feb 2024 17:08:44 +0100
Subject: [PATCH 07/23] changed input of normalize to genelength

---
 q2_amr/card/normalization.py | 27 +++++++--------------------
 q2_amr/plugin_setup.py       |  4 ++--
 2 files changed, 9 insertions(+), 22 deletions(-)

diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py
index 3305c2c..a92c1b1 100644
--- a/q2_amr/card/normalization.py
+++ b/q2_amr/card/normalization.py
@@ -1,8 +1,10 @@
+import os
+
 import biom
 import pandas as pd
 from rnanorm import TPM
 
-from q2_amr.types import CARDAlleleAnnotationDirectoryFormat
+from q2_amr.types import GeneLengthDirectoryFormat
 
 
 def normalize_mor(
@@ -31,40 +33,25 @@ def normalize_mor(
 
 
 def normalize_tpm(
-    table: biom.Table, amr_annotations: CARDAlleleAnnotationDirectoryFormat
+    table: biom.Table, gene_length: GeneLengthDirectoryFormat
 ) -> pd.DataFrame:
-    # # Initialize an empty series for gene lengths
-    # len_all = pd.Series()
-    #
-    # # Iterate over samples in the specified path
-    # for samp in os.listdir(amr_annotations.path):
-    #     anno_txt = os.path.join(amr_annotations.path, samp, "allele_mapping_data.txt")
-    #
-    #     # Read each DataFrame and append it to the list
-    #     len_sample = pd.read_csv(
-    #         anno_txt, sep="\t", usecols=["Reference Sequence", "Reference Length"]
-    #     ).set_index("Reference Sequence")["Reference Length"]
-    #
-    #     len_all = len_all.combine_first(len_sample)
-    len_all = pd.read_csv(
-        "/Users/rischv/Desktop/genelengts.txt",
+    gene_length_series = pd.read_csv(
+        os.path.join(gene_length.path, "gene_length.txt"),
         sep="\t",
         header=None,
         names=["index", "values"],
         index_col="index",
         squeeze=True,
     )
-    len_all = len_all.astype(int)
     df = pd.DataFrame(
         data=table.matrix_data.toarray(),
         index=table.ids(axis="observation"),
         columns=table.ids(axis="sample"),
     ).T
-    # Work with pandas.
     # len_all.to_csv("/Users/rischv/Desktop/genelengts.txt", sep='\t', index=True,
     # header=False)
 
-    transformer = TPM(gene_lengths=len_all).set_output(transform="pandas")
+    transformer = TPM(gene_lengths=gene_length_series).set_output(transform="pandas")
     exp_normalized = transformer.fit_transform(df)
 
     exp_normalized.index.name = "sample_id"
diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py
index 9a739b2..5d06ec9 100644
--- a/q2_amr/plugin_setup.py
+++ b/q2_amr/plugin_setup.py
@@ -235,11 +235,11 @@
     function=normalize_tpm,
     inputs={
         "table": FeatureTable[Frequency],
-        "amr_annotations": SampleData[CARDAlleleAnnotation],
+        "gene_length": SampleData[CARDAlleleAnnotation],
     },
     parameters={},
     outputs=[("normalized_table", FeatureTable[Frequency])],
-    input_descriptions={"table": "FeatureTable"},
+    input_descriptions={"table": "FeatureTable", "gene_length": "gene_length"},
     parameter_descriptions={},
     output_descriptions={"normalized_table": "hello"},
     name="",

From fe1d3a8a58899b3dd071f4ac3be9af3c5ca9a6bc Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Thu, 1 Feb 2024 19:53:18 +0100
Subject: [PATCH 08/23] added transformer for GeneLength

---
 q2_amr/types/_format.py                       | 27 ++++++--
 q2_amr/types/_transformer.py                  | 27 +++++---
 .../tests/data/gene_length_duplicate.txt      |  2 +
 .../tests/test_types_formats_transformers.py  | 69 +++++++++++++------
 4 files changed, 91 insertions(+), 34 deletions(-)
 create mode 100644 q2_amr/types/tests/data/gene_length_duplicate.txt

diff --git a/q2_amr/types/_format.py b/q2_amr/types/_format.py
index 7ee2eca..09b1882 100644
--- a/q2_amr/types/_format.py
+++ b/q2_amr/types/_format.py
@@ -428,16 +428,33 @@ def gene_path_maker(self, sample_id):
 
 
 class GeneLengthFormat(model.TextFileFormat):
+    """
+    Tab-separated format for gene length data.
+
+    The format must consist of tab separated values in two columns with no header.
+    The first column must be the gene names and the second column must be the
+    corresponding gene lengths. The gene names can not be duplicated.
+    """
+
     def _validate(self, n_records=None):
         df = pd.read_csv(str(self), sep="\t", header=None)
         has_two_columns = len(df.columns) == 2
         lengths_num = df.iloc[:, 1].apply(lambda x: isinstance(x, (float, int))).all()
-        if not (has_two_columns and lengths_num):
+        if not has_two_columns:
+            raise ValidationError(
+                "The file must consist of two tab separated columns. The first column "
+                "must be the gene names and the second column must be the "
+                "corresponding gene lengths."
+            )
+        if not lengths_num:
+            raise ValidationError(
+                "The second column (gene lengths) must be of type int or float."
+            )
+        duplicates = df.iloc[:, 0].duplicated().any()
+        if duplicates:
             raise ValidationError(
-                "Format does not match GeneLengthsFormat. Must consist of tab "
-                "separated values in two columns with no header. The first column must "
-                "be the gene names and the second column must be the corresponding "
-                "gene lengths."
+                "There are no duplicate values allowed in the first column "
+                "(gene names)."
             )
 
     def _validate_(self, level):
diff --git a/q2_amr/types/_transformer.py b/q2_amr/types/_transformer.py
index f6fa159..22cb51b 100644
--- a/q2_amr/types/_transformer.py
+++ b/q2_amr/types/_transformer.py
@@ -258,18 +258,29 @@ def tabulate_data(data_path, data_type):
 
 @plugin.register_transformer
 def _15(data: CARDAlleleAnnotationDirectoryFormat) -> GeneLengthDirectoryFormat:
+    directory = get_gene_lengths(map_type="allele", annotations=data.path)
+    return directory
+
+
+@plugin.register_transformer
+def _16(data: CARDGeneAnnotationDirectoryFormat) -> GeneLengthDirectoryFormat:
+    directory = get_gene_lengths(map_type="gene", annotations=data.path)
+    return directory
+
+
+def get_gene_lengths(map_type, annotations):
+    gene_name_col = "Reference Sequence" if map_type == "allele" else "ARO Term"
     len_all = pd.Series()
     directory = GeneLengthDirectoryFormat()
-    # Iterate over samples in the specified path
-    for samp in os.listdir(data.path):
-        anno_txt = os.path.join(data.path, samp, "allele_mapping_data.txt")
-
-        # Read each DataFrame and append it to the list
-        len_sample = pd.read_csv(
-            anno_txt, sep="\t", usecols=["Reference Sequence", "Reference Length"]
-        ).set_index("Reference Sequence")["Reference Length"]
 
+    # Iterate over samples, read in each DataFrame and append it to the series
+    for samp in os.listdir(annotations):
+        anno_txt = os.path.join(annotations, samp, f"{map_type}_mapping_data.txt")
+        cols = [gene_name_col, "Reference Length"]
+        len_sample = pd.read_csv(anno_txt, sep="\t", usecols=cols)
+        len_sample = len_sample.set_index(cols[0])[cols[1]]
         len_all = len_all.combine_first(len_sample)
+
     len_all.to_csv(
         os.path.join(directory.path, "gene_length.txt"), sep="\t", header=False
     )
diff --git a/q2_amr/types/tests/data/gene_length_duplicate.txt b/q2_amr/types/tests/data/gene_length_duplicate.txt
new file mode 100644
index 0000000..60621cb
--- /dev/null
+++ b/q2_amr/types/tests/data/gene_length_duplicate.txt
@@ -0,0 +1,2 @@
+ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1	1356.0
+ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1	1356.0
diff --git a/q2_amr/types/tests/test_types_formats_transformers.py b/q2_amr/types/tests/test_types_formats_transformers.py
index e0de12b..b6be2cc 100644
--- a/q2_amr/types/tests/test_types_formats_transformers.py
+++ b/q2_amr/types/tests/test_types_formats_transformers.py
@@ -339,17 +339,37 @@ def test_gene_length_format_validate_positive(self):
         format = GeneLengthFormat(filepath, mode="r")
         format.validate()
 
-    def test_gene_length_format_validate_negative_3_cols(self):
-        filepath = self.get_data_path("gene_length_3_cols.txt")
-        with self.assertRaises(ValidationError):
-            format = GeneLengthFormat(filepath, mode="r")
-            format.validate()
-
-    def test_gene_length_format_validate_negative_switched(self):
-        filepath = self.get_data_path("gene_length_switched.txt")
-        with self.assertRaises(ValidationError):
-            format = GeneLengthFormat(filepath, mode="r")
-            format.validate()
+    def test_gene_length_format_validate_negative(self):
+        # Test ValidationErrors for wrong columns, wrong data type and duplicate gene
+        # names
+        test_cases = [
+            (
+                "gene_length_3_cols.txt",
+                " is not a(n) GeneLengthFormat file:\n\nThe file must consist of two "
+                "tab separated columns. The first column must be the gene names and "
+                "the second column must be the corresponding gene lengths.",
+            ),
+            (
+                "gene_length_switched.txt",
+                " is not a(n) GeneLengthFormat file:\n\nThe second column "
+                "(gene lengths) must be of type int or float.",
+            ),
+            (
+                "gene_length_duplicate.txt",
+                " is not a(n) GeneLengthFormat file:\n\nThere are no duplicate values "
+                "allowed in the first column (gene names).",
+            ),
+        ]
+
+        for file_name, expected_error_message in test_cases:
+            with self.subTest(file_name=file_name):
+                filepath = self.get_data_path(file_name)
+                with self.assertRaises(ValidationError) as context:
+                    format = GeneLengthFormat(filepath, mode="r")
+                    format.validate()
+                self.assertEqual(
+                    str(context.exception), filepath + expected_error_message
+                )
 
     def test_gene_length_directory_format_validate_positive(self):
         filepath = self.get_data_path("gene_length.txt")
@@ -357,14 +377,21 @@ def test_gene_length_directory_format_validate_positive(self):
         format = GeneLengthDirectoryFormat(self.temp_dir.name, mode="r")
         format.validate()
 
-    def test_card_annotation_format_to_df_transformertt(self):
-        transformer = self.get_transformer(
-            CARDAlleleAnnotationDirectoryFormat, GeneLengthDirectoryFormat
-        )
-
-        annotation = CARDAlleleAnnotationDirectoryFormat(
-            self.get_data_path("annotate_reads_output"), "r"
-        )
+    def test_card_allele_gene_annotation_dir_format_to_gene_length_dir_format(self):
+        transformations = [
+            (CARDAlleleAnnotationDirectoryFormat, GeneLengthDirectoryFormat),
+            (CARDGeneAnnotationDirectoryFormat, GeneLengthDirectoryFormat),
+        ]
 
-        obs = transformer(annotation)
-        self.assertIsInstance(obs, GeneLengthDirectoryFormat)
+        for input_format, output_format in transformations:
+            with self.subTest(input_format=input_format, output_format=output_format):
+                transformer = self.get_transformer(input_format, output_format)
+                annotation = input_format(
+                    self.get_data_path("annotate_reads_output"), "r"
+                )
+                obs = transformer(annotation)
+                self.assertIsInstance(obs, GeneLengthDirectoryFormat)
+                format = GeneLengthFormat(
+                    os.path.join(obs.path, "gene_length.txt"), mode="r"
+                )
+                format.validate()

From 6dce0d8be69231273b7265eabe0ed32e093e706b Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Thu, 1 Feb 2024 20:15:30 +0100
Subject: [PATCH 09/23] added comment in test

---
 q2_amr/types/_transformer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/q2_amr/types/_transformer.py b/q2_amr/types/_transformer.py
index 22cb51b..d85d346 100644
--- a/q2_amr/types/_transformer.py
+++ b/q2_amr/types/_transformer.py
@@ -269,6 +269,7 @@ def _16(data: CARDGeneAnnotationDirectoryFormat) -> GeneLengthDirectoryFormat:
 
 
 def get_gene_lengths(map_type, annotations):
+    # Extracts gene lengths from CARDAlleleAnnotation and CARDGeneAnnotation
     gene_name_col = "Reference Sequence" if map_type == "allele" else "ARO Term"
     len_all = pd.Series()
     directory = GeneLengthDirectoryFormat()

From e2069ca314d3ec7d0d950b4bd9c73ce34a217b19 Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Fri, 2 Feb 2024 09:52:00 +0100
Subject: [PATCH 10/23] added comments

---
 q2_amr/types/_format.py                               | 7 +++++--
 q2_amr/types/tests/test_types_formats_transformers.py | 4 ++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/q2_amr/types/_format.py b/q2_amr/types/_format.py
index 09b1882..9c41528 100644
--- a/q2_amr/types/_format.py
+++ b/q2_amr/types/_format.py
@@ -431,25 +431,28 @@ class GeneLengthFormat(model.TextFileFormat):
     """
     Tab-separated format for gene length data.
 
-    The format must consist of tab separated values in two columns with no header.
+    The file must consist of tab separated values in two columns with no header.
     The first column must be the gene names and the second column must be the
     corresponding gene lengths. The gene names can not be duplicated.
     """
 
     def _validate(self, n_records=None):
         df = pd.read_csv(str(self), sep="\t", header=None)
+        # Check if df has exactly two columns
         has_two_columns = len(df.columns) == 2
-        lengths_num = df.iloc[:, 1].apply(lambda x: isinstance(x, (float, int))).all()
         if not has_two_columns:
             raise ValidationError(
                 "The file must consist of two tab separated columns. The first column "
                 "must be the gene names and the second column must be the "
                 "corresponding gene lengths."
             )
+        # Check if gene length column is numerical (float or int)
+        lengths_num = df.iloc[:, 1].apply(lambda x: isinstance(x, (float, int))).all()
         if not lengths_num:
             raise ValidationError(
                 "The second column (gene lengths) must be of type int or float."
             )
+        # Check if gene names column doesn't have duplicates
         duplicates = df.iloc[:, 0].duplicated().any()
         if duplicates:
             raise ValidationError(
diff --git a/q2_amr/types/tests/test_types_formats_transformers.py b/q2_amr/types/tests/test_types_formats_transformers.py
index b6be2cc..9949130 100644
--- a/q2_amr/types/tests/test_types_formats_transformers.py
+++ b/q2_amr/types/tests/test_types_formats_transformers.py
@@ -340,8 +340,8 @@ def test_gene_length_format_validate_positive(self):
         format.validate()
 
     def test_gene_length_format_validate_negative(self):
-        # Test ValidationErrors for wrong columns, wrong data type and duplicate gene
-        # names
+        # Test ValidationErrors for wrong number of columns, wrong data type and
+        # duplicate gene names
         test_cases = [
             (
                 "gene_length_3_cols.txt",

From b6d2b9478bfe9a9947c9ac5d6e9ebd85ddefc150 Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Thu, 8 Feb 2024 16:21:07 +0100
Subject: [PATCH 11/23] added all normalization methods with tests except for
 CPM

---
 README.md                                    |  20 ++--
 q2_amr/card/heatmap.py                       |   8 +-
 q2_amr/card/normalization.py                 | 105 +++++++++++--------
 q2_amr/card/tests/data/feature-table.biom    | Bin 0 -> 33824 bytes
 q2_amr/card/tests/data/gene_length.txt       |   2 +
 q2_amr/card/tests/data/gene_length_short.txt |   1 +
 q2_amr/card/tests/test_normalization.py      |  87 +++++++++++++--
 q2_amr/card/utils.py                         |   6 ++
 q2_amr/citations.bib                         |   9 ++
 q2_amr/plugin_setup.py                       |  30 ++----
 10 files changed, 178 insertions(+), 90 deletions(-)
 create mode 100644 q2_amr/card/tests/data/feature-table.biom
 create mode 100644 q2_amr/card/tests/data/gene_length.txt
 create mode 100644 q2_amr/card/tests/data/gene_length_short.txt

diff --git a/README.md b/README.md
index b6c8579..2b7a67e 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ To install _q2-amr_, follow the steps described below.
 mamba create -yn q2-amr \
   -c conda-forge -c bioconda -c qiime2 -c defaults \
   -c https://packages.qiime2.org/qiime2/2023.9/shotgun/released/ \
-  qiime2 q2cli q2templates q2-types q2-types-genomics rgi
+  qiime2 q2cli q2templates q2-types q2-types-genomics rgi rnanorm
 
 conda activate q2-amr
 
@@ -38,7 +38,7 @@ qiime info
 CONDA_SUBDIR=osx-64 mamba create -yn q2-amr \
   -c conda-forge -c bioconda -c qiime2 -c defaults \
   -c https://packages.qiime2.org/qiime2/2023.9/shotgun/released/ \
-  qiime2 q2cli q2templates q2-types q2-types-genomics rgi
+  qiime2 q2cli q2templates q2-types q2-types-genomics rgi rnanorm
 
 conda activate q2-amr
 conda config --env --set subdir osx-64
@@ -58,15 +58,17 @@ qiime info
 ## Functionality
 This QIIME 2 plugin contains actions used to annotate short single/paired-end
 sequencing reads and MAGs with antimicrobial resistance genes. Currently, the [CARD](https://card.mcmaster.ca) database is supported  (for details on
-the implementation and usage, please refer to the [rgi](https://github.com/arpcard/rgi) documentation). Below you will
+the implementation and usage, please refer to the [RGI](https://github.com/arpcard/rgi) documentation). Below you will
 find an overview of actions available in the plugin.
 
-| Action                     | Description                                                                          | Underlying tool                       | Used function                        |
-|----------------------------|--------------------------------------------------------------------------------------|---------------------------------------|--------------------------------------|
-| fetch-card-db              | Download and preprocess CARD and WildCARD data.                                      | [rgi](https://github.com/arpcard/rgi) | card_annotation, wildcard_annotation |
-| annotate-mags-card         | Annotate MAGs with antimicrobial resistance gene information from CARD.              | [rgi](https://github.com/arpcard/rgi) | main, load                           |
-| annotate-reads-card        | Annotate metagenomic reads with antimicrobial resistance gene information from CARD. | [rgi](https://github.com/arpcard/rgi) | bwt, load                            |
-| heatmap                    | Create a heatmap from annotate-mags-card output files.                               | [rgi](https://github.com/arpcard/rgi) | heatmap                              |
+| Action              | Description                                                                          | Underlying tool                           | Used function                        |
+|---------------------|--------------------------------------------------------------------------------------|-------------------------------------------|--------------------------------------|
+| fetch-card-db       | Download and preprocess CARD and WildCARD data.                                      | [RGI](https://github.com/arpcard/rgi)     | card_annotation, wildcard_annotation |
+| annotate-mags-card  | Annotate MAGs with antimicrobial resistance gene information from CARD.              | [RGI](https://github.com/arpcard/rgi)     | main, load                           |
+| annotate-reads-card | Annotate metagenomic reads with antimicrobial resistance gene information from CARD. | [RGI](https://github.com/arpcard/rgi)     | bwt, load                            |
+| heatmap             | Create a heatmap from annotate-mags-card output files.                               | [RGI](https://github.com/arpcard/rgi)     | heatmap                              |
+| normalize           | Normalize feature table with [FPKM](https://www.nature.com/articles/nmeth.1226), [TPM](https://link.springer.com/article/10.1007/s12064-012-0162-3), [TMM](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-3-r25), [UQ](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-94), [CUF](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02568-9/), or [CTF](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02568-9/) method.                 | [rnaNORM](https://github.com/genialis/RNAnorm) | fpkm, tpm, tmm, uq, cuf, ctf         |
+
 
 ## Dev environment
 This repository follows the _black_ code style. To make the development slightly easier
diff --git a/q2_amr/card/heatmap.py b/q2_amr/card/heatmap.py
index b6c93ed..bc9e253 100644
--- a/q2_amr/card/heatmap.py
+++ b/q2_amr/card/heatmap.py
@@ -8,7 +8,7 @@
 import pkg_resources
 import q2templates
 
-from q2_amr.card.utils import run_command
+from q2_amr.card.utils import InvalidParameterCombinationError, run_command
 from q2_amr.types import CARDAnnotationDirectoryFormat
 
 
@@ -50,12 +50,6 @@ def heatmap(
     q2templates.render(templates, output_dir, context=context)
 
 
-class InvalidParameterCombinationError(Exception):
-    def __init__(self, message="Invalid parameter combination"):
-        self.message = message
-        super().__init__(self.message)
-
-
 def run_rgi_heatmap(tmp, json_files_dir, clus, cat, display, frequency):
     cmd = [
         "rgi",
diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py
index a92c1b1..98a9109 100644
--- a/q2_amr/card/normalization.py
+++ b/q2_amr/card/normalization.py
@@ -2,57 +2,74 @@
 
 import biom
 import pandas as pd
-from rnanorm import TPM
+from rnanorm import CTF, CUF, FPKM, TMM, TPM, UQ
 
+from q2_amr.card.utils import InvalidParameterCombinationError
 from q2_amr.types import GeneLengthDirectoryFormat
 
 
-def normalize_mor(
+def normalize(
     table: biom.Table,
+    method: str,
+    m_trim: float = 0.3,
+    a_trim: float = 0.05,
+    gene_length: GeneLengthDirectoryFormat = None,
 ) -> pd.DataFrame:
-    # manifest = reads.manifest.view(pd.DataFrame)
-    # #paired = isinstance(reads, SingleLanePerSamplePairedEndFastqDirFmt)
-    # for samp in list(manifest.index):
-    #     # fwd = manifest.loc[samp, "forward"]
-    #     # rev = manifest.loc[samp, "reverse"] if paired else None
-    data = table.matrix_data.toarray()
-    columns = table.ids(axis="sample")
-    index = table.ids(axis="observation")
-    df = pd.DataFrame(data, index=index, columns=columns)
-    df = df.T
-    metadata = df.iloc[:, 0].to_frame()
-    metadata.iloc[0, 0] = "a"
-    original_columns = metadata.columns
-
-    metadata.columns = ["condition"] + list(original_columns[1:])
-    # dds = DeseqDataSet(counts=df, metadata=metadata)
-    # dds.fit_size_factors()
-
-    # normalizedcounts = dds.obsm["size_factors"]
-    return df
-
-
-def normalize_tpm(
-    table: biom.Table, gene_length: GeneLengthDirectoryFormat
-) -> pd.DataFrame:
-    gene_length_series = pd.read_csv(
-        os.path.join(gene_length.path, "gene_length.txt"),
-        sep="\t",
-        header=None,
-        names=["index", "values"],
-        index_col="index",
-        squeeze=True,
-    )
-    df = pd.DataFrame(
+    # Create Dataframe with counts from biom.Table
+    counts = pd.DataFrame(
         data=table.matrix_data.toarray(),
         index=table.ids(axis="observation"),
         columns=table.ids(axis="sample"),
     ).T
-    # len_all.to_csv("/Users/rischv/Desktop/genelengts.txt", sep='\t', index=True,
-    # header=False)
-
-    transformer = TPM(gene_lengths=gene_length_series).set_output(transform="pandas")
-    exp_normalized = transformer.fit_transform(df)
-
-    exp_normalized.index.name = "sample_id"
-    return exp_normalized
+    if method in ["tpm", "fpkm", "uq", "cuf"]:
+        # Raise Error if m or a-trim parameters are given with methods TPM, FPKM, UQ or
+        # CUF
+        if m_trim != 0.3 or a_trim != 0.05:
+            raise InvalidParameterCombinationError(
+                "Parameters m-trim and a-trim can only be used with methods TMM and "
+                "CTF."
+            )
+        if method in ["tpm", "fpkm"]:
+            # Raise Error if gene-length is missing when using methods TPM or FPKM
+            if not gene_length:
+                raise ValueError("gene-length input is missing.")
+            # Create pd.Series from gene_length input
+            gene_length_series = pd.read_csv(
+                os.path.join(gene_length.path, "gene_length.txt"),
+                sep="\t",
+                header=None,
+                names=["index", "values"],
+                index_col="index",
+                squeeze=True,
+            )
+            # Raise Error if there are genes in the counts that are not present in the
+            # gene length
+            if not set(counts.columns).issubset(set(gene_length_series.index)):
+                only_in_counts = set(counts.columns) - set(gene_length_series.index)
+                raise ValueError(
+                    f"There are genes present in the FeatureTable that are not present "
+                    f"in the gene-length input. Missing lengths for genes: "
+                    f"{only_in_counts}"
+                )
+            # Define the methods TPM and FPKM with the gene length series as an input
+            methods = {
+                "tpm": TPM(gene_lengths=gene_length_series),
+                "fpkm": FPKM(gene_lengths=gene_length_series),
+            }
+    if method in ["tmm", "uq", "cuf", "ctf"]:
+        # Raise Error if gene-length is given when using methods TMM, UQ, CUF or CTF
+        if gene_length:
+            raise ValueError(
+                "gene-length input can only be used with FPKM and TPM methods."
+            )
+        # Define the methods TMM and CTF with parameters, also UQ and CUF
+        methods = {
+            "tmm": TMM(m_trim=m_trim, a_trim=a_trim),
+            "ctf": CTF(m_trim=m_trim, a_trim=a_trim),
+            "uq": UQ(),
+            "cuf": CUF(),
+        }
+    # Run normalization method on count dataframe
+    normalized = methods[method].set_output(transform="pandas").fit_transform(counts)
+    normalized.index.name = "sample_id"
+    return normalized
diff --git a/q2_amr/card/tests/data/feature-table.biom b/q2_amr/card/tests/data/feature-table.biom
new file mode 100644
index 0000000000000000000000000000000000000000..3a8fcee13f42e92f42a5463c65bee03c7bbf9168
GIT binary patch
literal 33824
zcmeI4PfS}k7{DC@Z3>iysoH4ONlzP_I4txf{9P^$Ep$QJfVFI6n}(9WD?}nB3u#9;
z4s6<?yY<*3#~pjDc4AT|^@y}fj~pv)2aY>T>TG}C*SxrM12nKCd_FwepY6}~@7a#|
z{LIhE@oQ(>FSHBgLLt#A!a7KgXXu=tKtgd!PhlW{{$2EcX@NjMv<sEigYq!ycY+{T
z$9%KbCnki<N&3mhVqzT?kRFn=e=-OpC&p$}Pv@|^4*HLA{9sh%GizS)cDj_!uWHd+
zI#*cn)W`xiN*VX*cjI_9$91PjXbk}o6z4=pWEVl<`9MIIP_qc?3}HS<?@%6ybb47&
zo7PjhQ;@9{qMDvFK0OLSr${P^)Y+l++{_nq=~CoYaYc)(>U%?F!*;2u*(o$`sqZxz
z?h&Z(wpU!!Xu~ouWKXD)A|U&%9K8<HC;sJjp4IJK@>acKy5ucJGIvz@YW;p)Wx{s0
z%3xVvm)5sX^z_S*EJ~&O(09S7udPDgDc$e2<+Oew2kkHQzf-uX2FbBi>I;e0)$ert
z1$|mt<`<A96f;`yXdG^<Mk<F1)z;0;i;x|WL4==70113}b^0ScH`RhA9nDj{PRaOG
z-kD8jRy=2NT&L5x*6WXyOwWGmlvD<tD&%SiPOm32xLhg~;+HOEviV#D){y>uaY@M8
z08Y<m8N4z%{c+@)UbLK8f3!b31nD%~^*oiLm>V06xUq;kG8-L^4@KkdQ2%Ibz#WJ|
z{!`$I2>N5!r{e>zbg_|*$?<q}WN2h#DxLG<UamN{F?IFIWPEJKbw6+iNBg6)%xNt5
z{tM-YRr!GtES~cURQ!wK;poV~pe{~J({nxW`x6tyi}ZtB)d+60i$d%(0Vco%m;e)C
z0!)AjFaajO1egF5s67GGu1fY%hYU|xWzH+5rAaMKEm$h%ZxteCsWdhgv+F|ILDzH^
zvLiTD!0>NOO^*kpwUF#m_wf&XDrefcs5$pO0Y#MWA;zEFFe*49QJbCc>3IMULi-=s
z35E51<%HnkZdwSK>j2HmJ{&((jOHDj4v_TlKCA<+WX=|k-vV7!&mP9Z4#>fpF2qdo
z<#qO9@?9)P`W|3>=WWP9{oKa+Lix=3gX-$mv#kRrKVApw_7&De!))t->r^(6=1HyA
z0o(XB=WDg|uDM=lC+!z1Tg|a%Ct!|W2Rpon>pk_K_OjX4#q5H&CZt~|70c&eqm*%<
zen6apvoo9+0sKX?or?!$y2{$^T*f_cLJW3@!Do1Y)+VJYy(I{0JA|VHc+4)tDm@?;
zy_+lQq?^vuLRfFGrDf|~XuCSHqn|k$Oo8->pYXdLd|q+#otIaL$B$wM&;4w<X<mmf
z+j2CQT&;KNJ=6D4<P$Hmw>xz3i3xOG>l*-O5sotfCcp%k025#W4NhQvejz1-r$fK*
z%Dv+~@q>zQ%09{6q6$jant9KWUrAjQt?ToDq(w_t__yutu1CKnH-@FWeO!F|UpjO_
z^|@sWs`)V^M6YO*u_e@T=Fwltjc}U?mhhWLSuHyzzyz286JP>N;1~!T+zu6%)&(Kd
zeFU%}Il7fKus`h%Lw}VCg3MDgxD3)`dfMzK^nJV8j9rWU#Mc3{2*;TK6JP>NfC(^x
z1}9K$|3vrO%Knk<pPU4+R8{%5AN8?+>bAV+KaKqpaq@M*Qj*0?fC(@GCcp%kK%)|<
zwtrH_AmzxX@1Sh|_oD`OsMKV+O5bNQDUcrV(`J97@0iT$|1tJY#L3rivk1qT025#W
zOn?b6fd(gFy9Z(3bF6j`BK$dk;O~ALd_Wws`Q|+cy3g>paZk2<gBL1#)qB0R@w+|?
z`tN@RK8#<!k9x@0+w{fE@k{I2LmWQ@Kz+NQ+zfcH{50>Q&dq^Js%I19TVI0Q#Ts|v
z-#S|B@57XyuOLAB5*SZ?4RY$|49*wIXU-pMMRk&3TL(;jybjdK3#-#+*w%q<oPX48
znkThd2W;cloUhgPVNd6wg*JN+wV(awBw&tT2fMk6>pk`Q1l&h;Q_xvu(xjg`8AK_h
z9of^okNU*mZ&pn9*3`0@QW>Xtz4c9{yjf#vy;Ex=-9sgxc$xh^*Iy|id$w-?RA%59
z6JP>NfC(@GCeZK%4*osBk$q2D$O28Z1@V23SY?A(<2PX2KiU5Mf3Ic#<m*6<Tgd8|
z025#WOn?b6f&ZI8wfz&_uPFORwtspc|2>{^sw&_1qrUe~F=PKkoO~V7o#F=*U;<2l
z2`~XB&^!dH?Vsp8iu_}^kFxhY;8j)nj?GfX0^MeRdec&S!q`6%Ctt%YC0Wb_m;e)C
M0!)AjG%A6A0p5#ZcK`qY

literal 0
HcmV?d00001

diff --git a/q2_amr/card/tests/data/gene_length.txt b/q2_amr/card/tests/data/gene_length.txt
new file mode 100644
index 0000000..2fddb36
--- /dev/null
+++ b/q2_amr/card/tests/data/gene_length.txt
@@ -0,0 +1,2 @@
+ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1	1356.0
+ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1	1173.0
diff --git a/q2_amr/card/tests/data/gene_length_short.txt b/q2_amr/card/tests/data/gene_length_short.txt
new file mode 100644
index 0000000..81d043e
--- /dev/null
+++ b/q2_amr/card/tests/data/gene_length_short.txt
@@ -0,0 +1 @@
+ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1	1356.0
diff --git a/q2_amr/card/tests/test_normalization.py b/q2_amr/card/tests/test_normalization.py
index 1999eb9..5ab2a17 100644
--- a/q2_amr/card/tests/test_normalization.py
+++ b/q2_amr/card/tests/test_normalization.py
@@ -1,11 +1,76 @@
-# from qiime2.plugin.testing import TestPluginBase
-#
-#
-# class TestAnnotateReadsCARD(TestPluginBase):
-#     package = "q2_amr.card.tests"
-#
-#     @classmethod
-#     def setUpClass(cls):
-#
-#     def test_normalize_MOR(self):
-#         biom = self.get_data_path()
+import os
+import shutil
+from unittest.mock import MagicMock, patch
+
+import biom
+from qiime2.plugin.testing import TestPluginBase
+
+from q2_amr.card.normalization import normalize
+from q2_amr.card.utils import InvalidParameterCombinationError
+from q2_amr.types import GeneLengthDirectoryFormat
+
+
+class TestNormalize(TestPluginBase):
+    package = "q2_amr.card.tests"
+
+    @classmethod
+    def setUpClass(cls):
+        # Mocking the biom.Table and gene_length class
+        cls.table = MagicMock()
+        cls.gene_length = MagicMock()
+
+    def test_tpm_fpkm_uq_cuf_with_invalid_m_a_trim(self):
+        # Test Error raised if gene-length is given with TMM method
+        expected_message = (
+            "Parameters m-trim and a-trim can only be used with methods TMM and " "CTF."
+        )
+        with self.assertRaises(InvalidParameterCombinationError) as cm:
+            normalize(self.table, "tpm", m_trim=0.2, a_trim=0.05)
+        self.assertEqual(str(cm.exception), expected_message)
+
+    def test_tpm_fpkm_with_missing_gene_length(self):
+        # Test Error raised if gene-length is missing with TPM method
+        expected_message = "gene-length input is missing."
+        with self.assertRaises(ValueError) as cm:
+            normalize(self.table, "tpm")
+        self.assertEqual(str(cm.exception), expected_message)
+
+    def test_tmm_uq_cuf_ctf_with_gene_length(self):
+        # Test Error raised if gene-length is given with TMM method
+        expected_message = (
+            "gene-length input can only be used with FPKM and TPM methods."
+        )
+        with self.assertRaises(ValueError) as cm:
+            normalize(self.table, "tmm", gene_length=self.gene_length)
+        self.assertEqual(str(cm.exception), expected_message)
+
+    def test_tpm_fpkm_with_short_gene_length(self):
+        # Test Error raised if gene-length is missing genes
+        gene_length = GeneLengthDirectoryFormat()
+        shutil.copy(
+            self.get_data_path("gene_length_short.txt"),
+            os.path.join(gene_length.path, "gene_length.txt"),
+        )
+        table = biom.load_table(self.get_data_path("feature-table.biom"))
+        expected_message = (
+            "There are genes present in the FeatureTable that are not present "
+            "in the gene-length input. Missing lengths for genes: "
+            "{'ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1'}"
+        )
+        with self.assertRaises(ValueError) as cm:
+            normalize(table, "tpm", gene_length=gene_length)
+        self.assertEqual(str(cm.exception), expected_message)
+
+    @patch("q2_amr.card.normalization.TPM")
+    def test_tpm_fpkm_with_valid_inputs(self, mock_tpm):
+        # Test valid inputs for TPM method
+        gene_length = GeneLengthDirectoryFormat()
+        shutil.copy(self.get_data_path("gene_length.txt"), gene_length.path)
+        table = biom.load_table(self.get_data_path("feature-table.biom"))
+        normalize(table=table, gene_length=gene_length, method="tpm")
+
+    @patch("q2_amr.card.normalization.TMM")
+    def test_tmm_uq_cuf_ctf_with_valid_inputs(self, mock_tmm):
+        # Test valid inputs for TMM method
+        table = biom.load_table(self.get_data_path("feature-table.biom"))
+        normalize(table=table, method="tmm", a_trim=0.06, m_trim=0.4)
diff --git a/q2_amr/card/utils.py b/q2_amr/card/utils.py
index 1599f8d..69c600c 100644
--- a/q2_amr/card/utils.py
+++ b/q2_amr/card/utils.py
@@ -140,3 +140,9 @@ def create_count_table(df_list: list) -> pd.DataFrame:
     df.columns.name = None
     df.index.name = "sample_id"
     return df
+
+
+class InvalidParameterCombinationError(Exception):
+    def __init__(self, message="Invalid parameter combination"):
+        self.message = message
+        super().__init__(self.message)
diff --git a/q2_amr/citations.bib b/q2_amr/citations.bib
index a748e7a..9858108 100644
--- a/q2_amr/citations.bib
+++ b/q2_amr/citations.bib
@@ -26,3 +26,12 @@ @article{alcock_card_2023
 	keywords = {*Data Curation, *Databases, Factual, *Drug Resistance, Microbial, *Machine Learning, Anti-Bacterial Agents/pharmacology, Genes, Bacterial, Likelihood Functions, Molecular Sequence Annotation, Software},
 	pages = {D690--D699},
 }
+
+@misc{Zmrzlikar_RNAnorm_RNA-seq_data_2023,
+author = {Zmrzlikar, Jure and Žganec, Matjaž and Ausec, Luka and Štajdohar, Miha},
+month = jul,
+title = {{RNAnorm: RNA-seq data normalization in Python}},
+url = {https://github.com/genialis/RNAnorm},
+version = {2.1.0},
+year = {2023}
+}
diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py
index 5d06ec9..0f67ff0 100644
--- a/q2_amr/plugin_setup.py
+++ b/q2_amr/plugin_setup.py
@@ -14,14 +14,14 @@
 )
 from q2_types.sample_data import SampleData
 from q2_types_genomics.per_sample_data import MAGs
-from qiime2.core.type import Bool, Choices, Int, Properties, Range, Str, TypeMap
+from qiime2.core.type import Bool, Choices, Float, Int, Properties, Range, Str, TypeMap
 from qiime2.plugin import Citations, Plugin
 
 from q2_amr import __version__
 from q2_amr.card.database import fetch_card_db
 from q2_amr.card.heatmap import heatmap
 from q2_amr.card.mags import annotate_mags_card
-from q2_amr.card.normalization import normalize_mor, normalize_tpm
+from q2_amr.card.normalization import normalize
 from q2_amr.card.reads import annotate_reads_card
 from q2_amr.types import (
     CARDAnnotationJSONFormat,
@@ -219,32 +219,24 @@
 )
 
 plugin.methods.register_function(
-    function=normalize_mor,
-    inputs={"table": FeatureTable[Frequency]},
-    parameters={},
-    outputs=[("normalized_table", FeatureTable[Frequency])],
-    input_descriptions={"table": "FeatureTable"},
-    parameter_descriptions={},
-    output_descriptions={"normalized_table": "hello"},
-    name="",
-    description="",
-    citations=[],
-)
-
-plugin.methods.register_function(
-    function=normalize_tpm,
+    function=normalize,
     inputs={
         "table": FeatureTable[Frequency],
-        "gene_length": SampleData[CARDAlleleAnnotation],
+        "gene_length": GeneLength
+        | SampleData[CARDAlleleAnnotation | CARDGeneAnnotation],
+    },
+    parameters={
+        "method": Str % Choices(["tpm", "fpkm", "tmm", "uq", "cuf", "ctf"]),
+        "m_trim": Float,
+        "a_trim": Float,
     },
-    parameters={},
     outputs=[("normalized_table", FeatureTable[Frequency])],
     input_descriptions={"table": "FeatureTable", "gene_length": "gene_length"},
     parameter_descriptions={},
     output_descriptions={"normalized_table": "hello"},
     name="",
     description="",
-    citations=[],
+    citations=[citations["Zmrzlikar_RNAnorm_RNA-seq_data_2023"]],
 )
 
 # Registrations

From dd1f9ff7b280aecebc2f0348c69f2c1468bc023b Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Thu, 8 Feb 2024 16:40:58 +0100
Subject: [PATCH 12/23] added cpm method

---
 q2_amr/card/normalization.py | 16 +++++++++-------
 q2_amr/plugin_setup.py       |  2 +-
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py
index 98a9109..ec42e97 100644
--- a/q2_amr/card/normalization.py
+++ b/q2_amr/card/normalization.py
@@ -2,7 +2,7 @@
 
 import biom
 import pandas as pd
-from rnanorm import CTF, CUF, FPKM, TMM, TPM, UQ
+from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ
 
 from q2_amr.card.utils import InvalidParameterCombinationError
 from q2_amr.types import GeneLengthDirectoryFormat
@@ -21,9 +21,9 @@ def normalize(
         index=table.ids(axis="observation"),
         columns=table.ids(axis="sample"),
     ).T
-    if method in ["tpm", "fpkm", "uq", "cuf"]:
-        # Raise Error if m or a-trim parameters are given with methods TPM, FPKM, UQ or
-        # CUF
+    if method in ["tpm", "fpkm", "uq", "cuf", "cpm"]:
+        # Raise Error if m or a-trim parameters are given with methods TPM, FPKM, UQ,
+        # CPM or CUF
         if m_trim != 0.3 or a_trim != 0.05:
             raise InvalidParameterCombinationError(
                 "Parameters m-trim and a-trim can only be used with methods TMM and "
@@ -56,18 +56,20 @@ def normalize(
                 "tpm": TPM(gene_lengths=gene_length_series),
                 "fpkm": FPKM(gene_lengths=gene_length_series),
             }
-    if method in ["tmm", "uq", "cuf", "ctf"]:
-        # Raise Error if gene-length is given when using methods TMM, UQ, CUF or CTF
+    if method in ["tmm", "uq", "cuf", "ctf", "cpm"]:
+        # Raise Error if gene-length is given when using methods TMM, UQ, CUF, CPM or
+        # CTF
         if gene_length:
             raise ValueError(
                 "gene-length input can only be used with FPKM and TPM methods."
             )
-        # Define the methods TMM and CTF with parameters, also UQ and CUF
+        # Define the methods TMM and CTF with parameters, also UQ, CPM and CUF
         methods = {
             "tmm": TMM(m_trim=m_trim, a_trim=a_trim),
             "ctf": CTF(m_trim=m_trim, a_trim=a_trim),
             "uq": UQ(),
             "cuf": CUF(),
+            "cpm": CPM(),
         }
     # Run normalization method on count dataframe
     normalized = methods[method].set_output(transform="pandas").fit_transform(counts)
diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py
index 0f67ff0..9b08ee3 100644
--- a/q2_amr/plugin_setup.py
+++ b/q2_amr/plugin_setup.py
@@ -226,7 +226,7 @@
         | SampleData[CARDAlleleAnnotation | CARDGeneAnnotation],
     },
     parameters={
-        "method": Str % Choices(["tpm", "fpkm", "tmm", "uq", "cuf", "ctf"]),
+        "method": Str % Choices(["tpm", "fpkm", "tmm", "uq", "cuf", "ctf", "cpm"]),
         "m_trim": Float,
         "a_trim": Float,
     },

From 3daa215610b077320792645a5f98b3e1371be452 Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Fri, 9 Feb 2024 10:03:35 +0100
Subject: [PATCH 13/23] updated plugin setup

---
 q2_amr/plugin_setup.py | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py
index 9b08ee3..56a79b0 100644
--- a/q2_amr/plugin_setup.py
+++ b/q2_amr/plugin_setup.py
@@ -231,11 +231,27 @@
         "a_trim": Float,
     },
     outputs=[("normalized_table", FeatureTable[Frequency])],
-    input_descriptions={"table": "FeatureTable", "gene_length": "gene_length"},
-    parameter_descriptions={},
-    output_descriptions={"normalized_table": "hello"},
-    name="",
-    description="",
+    input_descriptions={
+        "table": "Feature table with gene counts.",
+        "gene_length": "Gene lengths of all genes in the feature table.",
+    },
+    parameter_descriptions={
+        "method": "Specify the normalization method to be used. Use FPKM or TPM for "
+        "within comparisons and TMM, UQ, CUF or CTF for between sample "
+        "camparisons. Check https://www.genialis.com/wp-content/uploads/2023"
+        "/12/2023-Normalizing-RNA-seq-data-in-Python-with-RNAnorm.pdf for "
+        "more information on the methods.",
+        "m_trim": "Two sided cutoff for M-values. Can only be used for methods TMM and "
+        "CTF.",
+        "a_trim": "Two sided cutoff for A-values. Can only be used for methods TMM and "
+        "CTF.",
+    },
+    output_descriptions={
+        "normalized_table": "Feature table normalized with specified " "method."
+    },
+    name="Normalize FeatureTable",
+    description="Normalize FeatureTable by gene length, library size and composition "
+    "with common methods for RNA-seq.",
     citations=[citations["Zmrzlikar_RNAnorm_RNA-seq_data_2023"]],
 )
 

From 011abf4d48fef4a500330c9103b194d122e360c4 Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Fri, 9 Feb 2024 12:12:56 +0100
Subject: [PATCH 14/23] added rnanorm to meta.yaml

---
 ci/recipe/meta.yaml    | 1 +
 q2_amr/plugin_setup.py | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/ci/recipe/meta.yaml b/ci/recipe/meta.yaml
index 00abfca..799e8d5 100644
--- a/ci/recipe/meta.yaml
+++ b/ci/recipe/meta.yaml
@@ -24,6 +24,7 @@ requirements:
     - q2templates {{ qiime2_epoch }}.*
     - q2cli {{ qiime2_epoch }}.*
     - rgi
+    - rnanorm
 
 test:
   requires:
diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py
index 56a79b0..1bdaafe 100644
--- a/q2_amr/plugin_setup.py
+++ b/q2_amr/plugin_setup.py
@@ -227,8 +227,8 @@
     },
     parameters={
         "method": Str % Choices(["tpm", "fpkm", "tmm", "uq", "cuf", "ctf", "cpm"]),
-        "m_trim": Float,
-        "a_trim": Float,
+        "m_trim": Float % Range(0, 1, inclusive_start=True),
+        "a_trim": Float % Range(0, 1, inclusive_start=True),
     },
     outputs=[("normalized_table", FeatureTable[Frequency])],
     input_descriptions={

From 907afa5c4392992620c19d1f6770be70322ba366 Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Wed, 29 May 2024 12:43:30 +0200
Subject: [PATCH 15/23] Revert "Merge branch '42_Genelengts_type' into
 23_norm_tpm"

This reverts commit ff80222daf58709be628527dc0e7a85db0806ffd, reversing
changes made to 942cd8bced550afdaba03a0544b99bb9065d6207.
---
 q2_amr/types/_format.py                       | 30 ++------
 q2_amr/types/_transformer.py                  | 28 +++-----
 .../tests/data/gene_length_duplicate.txt      |  2 -
 .../tests/test_types_formats_transformers.py  | 69 ++++++-------------
 4 files changed, 34 insertions(+), 95 deletions(-)
 delete mode 100644 q2_amr/types/tests/data/gene_length_duplicate.txt

diff --git a/q2_amr/types/_format.py b/q2_amr/types/_format.py
index 9c41528..7ee2eca 100644
--- a/q2_amr/types/_format.py
+++ b/q2_amr/types/_format.py
@@ -428,36 +428,16 @@ def gene_path_maker(self, sample_id):
 
 
 class GeneLengthFormat(model.TextFileFormat):
-    """
-    Tab-separated format for gene length data.
-
-    The file must consist of tab separated values in two columns with no header.
-    The first column must be the gene names and the second column must be the
-    corresponding gene lengths. The gene names can not be duplicated.
-    """
-
     def _validate(self, n_records=None):
         df = pd.read_csv(str(self), sep="\t", header=None)
-        # Check if df has exactly two columns
         has_two_columns = len(df.columns) == 2
-        if not has_two_columns:
-            raise ValidationError(
-                "The file must consist of two tab separated columns. The first column "
-                "must be the gene names and the second column must be the "
-                "corresponding gene lengths."
-            )
-        # Check if gene length column is numerical (float or int)
         lengths_num = df.iloc[:, 1].apply(lambda x: isinstance(x, (float, int))).all()
-        if not lengths_num:
-            raise ValidationError(
-                "The second column (gene lengths) must be of type int or float."
-            )
-        # Check if gene names column doesn't have duplicates
-        duplicates = df.iloc[:, 0].duplicated().any()
-        if duplicates:
+        if not (has_two_columns and lengths_num):
             raise ValidationError(
-                "There are no duplicate values allowed in the first column "
-                "(gene names)."
+                "Format does not match GeneLengthsFormat. Must consist of tab "
+                "separated values in two columns with no header. The first column must "
+                "be the gene names and the second column must be the corresponding "
+                "gene lengths."
             )
 
     def _validate_(self, level):
diff --git a/q2_amr/types/_transformer.py b/q2_amr/types/_transformer.py
index d85d346..f6fa159 100644
--- a/q2_amr/types/_transformer.py
+++ b/q2_amr/types/_transformer.py
@@ -258,30 +258,18 @@ def tabulate_data(data_path, data_type):
 
 @plugin.register_transformer
 def _15(data: CARDAlleleAnnotationDirectoryFormat) -> GeneLengthDirectoryFormat:
-    directory = get_gene_lengths(map_type="allele", annotations=data.path)
-    return directory
-
-
-@plugin.register_transformer
-def _16(data: CARDGeneAnnotationDirectoryFormat) -> GeneLengthDirectoryFormat:
-    directory = get_gene_lengths(map_type="gene", annotations=data.path)
-    return directory
-
-
-def get_gene_lengths(map_type, annotations):
-    # Extracts gene lengths from CARDAlleleAnnotation and CARDGeneAnnotation
-    gene_name_col = "Reference Sequence" if map_type == "allele" else "ARO Term"
     len_all = pd.Series()
     directory = GeneLengthDirectoryFormat()
+    # Iterate over samples in the specified path
+    for samp in os.listdir(data.path):
+        anno_txt = os.path.join(data.path, samp, "allele_mapping_data.txt")
 
-    # Iterate over samples, read in each DataFrame and append it to the series
-    for samp in os.listdir(annotations):
-        anno_txt = os.path.join(annotations, samp, f"{map_type}_mapping_data.txt")
-        cols = [gene_name_col, "Reference Length"]
-        len_sample = pd.read_csv(anno_txt, sep="\t", usecols=cols)
-        len_sample = len_sample.set_index(cols[0])[cols[1]]
-        len_all = len_all.combine_first(len_sample)
+        # Read each DataFrame and append it to the list
+        len_sample = pd.read_csv(
+            anno_txt, sep="\t", usecols=["Reference Sequence", "Reference Length"]
+        ).set_index("Reference Sequence")["Reference Length"]
 
+        len_all = len_all.combine_first(len_sample)
     len_all.to_csv(
         os.path.join(directory.path, "gene_length.txt"), sep="\t", header=False
     )
diff --git a/q2_amr/types/tests/data/gene_length_duplicate.txt b/q2_amr/types/tests/data/gene_length_duplicate.txt
deleted file mode 100644
index 60621cb..0000000
--- a/q2_amr/types/tests/data/gene_length_duplicate.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1	1356.0
-ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1	1356.0
diff --git a/q2_amr/types/tests/test_types_formats_transformers.py b/q2_amr/types/tests/test_types_formats_transformers.py
index 9949130..e0de12b 100644
--- a/q2_amr/types/tests/test_types_formats_transformers.py
+++ b/q2_amr/types/tests/test_types_formats_transformers.py
@@ -339,37 +339,17 @@ def test_gene_length_format_validate_positive(self):
         format = GeneLengthFormat(filepath, mode="r")
         format.validate()
 
-    def test_gene_length_format_validate_negative(self):
-        # Test ValidationErrors for wrong number of columns, wrong data type and
-        # duplicate gene names
-        test_cases = [
-            (
-                "gene_length_3_cols.txt",
-                " is not a(n) GeneLengthFormat file:\n\nThe file must consist of two "
-                "tab separated columns. The first column must be the gene names and "
-                "the second column must be the corresponding gene lengths.",
-            ),
-            (
-                "gene_length_switched.txt",
-                " is not a(n) GeneLengthFormat file:\n\nThe second column "
-                "(gene lengths) must be of type int or float.",
-            ),
-            (
-                "gene_length_duplicate.txt",
-                " is not a(n) GeneLengthFormat file:\n\nThere are no duplicate values "
-                "allowed in the first column (gene names).",
-            ),
-        ]
-
-        for file_name, expected_error_message in test_cases:
-            with self.subTest(file_name=file_name):
-                filepath = self.get_data_path(file_name)
-                with self.assertRaises(ValidationError) as context:
-                    format = GeneLengthFormat(filepath, mode="r")
-                    format.validate()
-                self.assertEqual(
-                    str(context.exception), filepath + expected_error_message
-                )
+    def test_gene_length_format_validate_negative_3_cols(self):
+        filepath = self.get_data_path("gene_length_3_cols.txt")
+        with self.assertRaises(ValidationError):
+            format = GeneLengthFormat(filepath, mode="r")
+            format.validate()
+
+    def test_gene_length_format_validate_negative_switched(self):
+        filepath = self.get_data_path("gene_length_switched.txt")
+        with self.assertRaises(ValidationError):
+            format = GeneLengthFormat(filepath, mode="r")
+            format.validate()
 
     def test_gene_length_directory_format_validate_positive(self):
         filepath = self.get_data_path("gene_length.txt")
@@ -377,21 +357,14 @@ def test_gene_length_directory_format_validate_positive(self):
         format = GeneLengthDirectoryFormat(self.temp_dir.name, mode="r")
         format.validate()
 
-    def test_card_allele_gene_annotation_dir_format_to_gene_length_dir_format(self):
-        transformations = [
-            (CARDAlleleAnnotationDirectoryFormat, GeneLengthDirectoryFormat),
-            (CARDGeneAnnotationDirectoryFormat, GeneLengthDirectoryFormat),
-        ]
+    def test_card_annotation_format_to_df_transformertt(self):
+        transformer = self.get_transformer(
+            CARDAlleleAnnotationDirectoryFormat, GeneLengthDirectoryFormat
+        )
+
+        annotation = CARDAlleleAnnotationDirectoryFormat(
+            self.get_data_path("annotate_reads_output"), "r"
+        )
 
-        for input_format, output_format in transformations:
-            with self.subTest(input_format=input_format, output_format=output_format):
-                transformer = self.get_transformer(input_format, output_format)
-                annotation = input_format(
-                    self.get_data_path("annotate_reads_output"), "r"
-                )
-                obs = transformer(annotation)
-                self.assertIsInstance(obs, GeneLengthDirectoryFormat)
-                format = GeneLengthFormat(
-                    os.path.join(obs.path, "gene_length.txt"), mode="r"
-                )
-                format.validate()
+        obs = transformer(annotation)
+        self.assertIsInstance(obs, GeneLengthDirectoryFormat)

From 6865edae21adb30751d19453a78d125ae7e8f826 Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Wed, 29 May 2024 12:44:04 +0200
Subject: [PATCH 16/23] Revert "Merge branch '42_Genelengts_type' into
 23_norm_tpm"

This reverts commit 07fac990b281ffe348b6df4ad9f3b59da279e3e8, reversing
changes made to 358ba73fa172f0eeb69649f6112467a3149b0b38.
---
 q2_amr/card/tests/data/output.mags.txt        |  4 +-
 q2_amr/card/tests/test_utils.py               | 46 +++++++++----------
 q2_amr/card/utils.py                          |  6 +--
 q2_amr/plugin_setup.py                        | 14 ++----
 q2_amr/types/__init__.py                      |  4 --
 q2_amr/types/_format.py                       | 22 ---------
 q2_amr/types/_transformer.py                  | 21 ---------
 q2_amr/types/_type.py                         |  1 -
 q2_amr/types/tests/data/gene_length.txt       |  2 -
 .../types/tests/data/gene_length_3_cols.txt   |  2 -
 .../types/tests/data/gene_length_switched.txt |  2 -
 .../tests/test_types_formats_transformers.py  | 40 ----------------
 12 files changed, 28 insertions(+), 136 deletions(-)
 delete mode 100644 q2_amr/types/tests/data/gene_length.txt
 delete mode 100644 q2_amr/types/tests/data/gene_length_3_cols.txt
 delete mode 100644 q2_amr/types/tests/data/gene_length_switched.txt

diff --git a/q2_amr/card/tests/data/output.mags.txt b/q2_amr/card/tests/data/output.mags.txt
index 55fc417..4102af9 100644
--- a/q2_amr/card/tests/data/output.mags.txt
+++ b/q2_amr/card/tests/data/output.mags.txt
@@ -1,5 +1,5 @@
 ORF_ID	Contig	Start	Stop	Orientation	Cut_Off	Pass_Bitscore	Best_Hit_Bitscore	Best_Hit_ARO	Best_Identities	ARO	Model_type	SNPs_in_Best_Hit_ARO	Other_SNPs	Drug Class	Resistance Mechanism	AMR Gene Family	Predicted_DNA	Predicted_Protein	CARD_Protein_Sequence	Percentage Length of Reference Sequence	ID	Model_ID	Nudged	Note
 "NC_000962.3_689 # 759789 # 763325 # 1 # ID=1_689;partial=00;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.643"	NC_000962.3_689 	759789	763325	+	Strict	2300	2394.77	mdtF	99.91	3000796	protein variant model	D516G, H526T, L511R	n/a	rifamycin antibiotic	"antibiotic target alteration; antibiotic target replacement"	rifamycin-resistant beta-subunit of RNA polymerase (rpoB)	GTGCTGGAAGGATGCATCTTGGCAGATTCCCGCCAGAGCAAAACAGCCGCTAGTCCTAGTCCGAGTCGCCCGCAAAGTTCCTCGAATAACTCCGTACCCGGAGCGCCAAACCGGGTCTCCTTCGCTAAGCTGCGCGAACCACTTGAGGTTCCGGGACTCCTTGACGTCCAGACCGATTCGTTCGAGTGGCTGATCGGTTCGCCGCGCTGGCGCGAATCCGCCGCCGAGCGGGGTGATGTCAACCCAGTGGGTGGCCTGGAAGAGGTGCTCTACGAGCTGTCTCCGATCGAGGACTTCTCCGGGTCGATGTCGTTGTCGTTCTCTGACCCTCGTTTCGACGATGTCAAGGCACCCGTCGACGAGTGCAAAGACAAGGACATGACGTACGCGGCTCCACTGTTCGTCACCGCCGAGTTCATCAACAACAACACCGGTGAGATCAAGAGTCAGACGGTGTTCATGGGTGACTTCCCGATGATGACCGAGAAGGGCACGTTCATCATCAACGGGACCGAGCGTGTGGTGGTCAGCCAGCTGGTGCGGTCGCCCGGGGTGTACTTCGACGAGACCATTGACAAGTCCACCGACAAGACGCTGCACAGCGTCAAGGTGATCCCGAGCCGCGGCGCGTGGCTCGAGTTTGACGTCGACAAGCGCGACACCGTCGGCGTGCGCATCGACCGCAAACGCCGGCAACCGGTCACCGTGCTGCTCAAGGCGCTGGGCTGGACCAGCGAGCAGATTGTCGAGCGGTTCGGGTTCTCCGAGATCATGCGATCGACGCTGGAGAAGGACAACACCGTCGGCACCGACGAGGCGCTGTTGGACATCTACCGCAAGCTGCGTCCGGGCGAGCCCCCGACCAAAGAGTCAGCGCAGACGCTGTTGGAAAACTTGTTCTTCAAGGAGAAGCGCTACGACCTGGCCCGCGTCGGTCGCTATAAGGTCAACAAGAAGCTCGGGCTGCATGTCGGCGAGCCCATCACGTCGTCGACGCTGACCGAAGAAGACGTCGTGGCCACCATCGAATATCTGGTCCGCTTGCACGAGGGTCAGACCACGATGACCGTTCCGGGCGGCGTCGAGGTGCCGGTGGAAACCGACGACATCGACCACTTCGGCAACCGCCGCCTGCGTACGGTCGGCGAGCTGATCCAAAACCAGATCCGGGTCGGCATGTCGCGGATGGAGCGGGTGGTCCGGGAGCGGATGACCACCCAGGACGTGGAGGCGATCACACCGCAGACGTTGATCAACATCCGGCCGGTGGTCGCCGCGATCAAGGAGTTCTTCGGCACCAGCCAGCTGAGCCAATTCATGGACCAGAACAACCCGCTGTCGGGGTTGACCCACAAGCGCCGACTGTCGGCGCTGGGGCCCGGCGGTCTGTCACGTGAGCGTGCCGGGCTGGAGGTCCGCGACGTGCACCCGTCGCACTACGGCCGGATGTGCCCGATCGAAACCCCTGAGGGGCCCAACATCGGTCTGATCGGCTCGCTGTCGGTGTACGCGCGGGTCAACCCGTTCGGGTTCATCGAAACGCCGTACCGCAAGGTGGTCGACGGCGTGGTTAGCGACGAGATCGTGTACCTGACCGCCGACGAGGAGGACCGCCACGTGGTGGCACAGGCCAATTCGCCGATCGATGCGGACGGTCGCTTCGTCGAGCCGCGCGTGCTGGTCCGCCGCAAGGCGGGCGAGGTGGAGTACGTGCCCTCGTCTGAGGTGGACTACATGGACGTCTCGCCCCGCCAGATGGTGTCGGTGGCCACCGCGATGATTCCCTTCCTGGAGCACGACGACGCCAACCGTGCCCTCATGGGGGCAAACATGCAGCGCCAGGCGGTGCCGCTGGTCCGTAGCGAGGCCCCGCTGGTGGGCACCGGGATGGAGCTGCGCGCGGCGATCGACGCCGGCGACGTCGTCGTCGCCGAAGAAAGCGGCGTCATCGAGGAGGTGTCGGCCGACTACATCACTGTGATGCACGACAACGGCACCCGGCGTACCTACCGGATGCGCAAGTTTGCCCGGTCCAACCACGGCACTTGCGCCAACCAGTGCCCCATCGTGGACGCGGGCGACCGAGTCGAGGCCGGTCAGGTGATCGCCGACGGTCCCTGTACTGACGACGGCGAGATGGCGCTGGGCAAGAACCTGCTGGTGGCCATCATGCCGTGGGAGGGCCACAACTACGAGGACGCGATCATCCTGTCCAACCGCCTGGTCGAAGAGGACGTGCTCACCTCGATCCACATCGAGGAGCATGAGATCGATGCTCGCGACACCAAGCTGGGTGCGGAGGAGATCACCCGCGACATCCCGAACATCTCCGACGAGGTGCTCGCCGACCTGGATGAGCGGGGCATCGTGCGCATCGGTGCCGAGGTTCGCGACGGGGACATCCTGGTCGGCAAGGTCACCCCGAAGGGTGAGACCGAGCTGACGCCGGAGGAGCGGCTGCTGCGTGCCATCTTCGGTGAGAAGGCCCGCGAGGTGCGCGACACTTCGCTGAAGGTGCCGCACGGCGAATCCGGCAAGGTGATCGGCATTCGGGTGTTTTCCCGCGAGGACGAGGACGAGTTGCCGGCCGGTGTCAACGAGCTGGTGCGTGTGTATGTGGCTCAGAAACGCAAGATCTCCGACGGTGACAAGCTGGCCGGCCGGCACGGCAACAAGGGCGTGATCGGCAAGATCCTGCCGGTTGAGGACATGCCGTTCCTTGCCGACGGCACCCCGGTGGACATTATTTTGAACACCCACGGCGTGCCGCGACGGATGAACATCGGCCAGATTTTGGAGACCCACCTGGGTTGGTGTGCCCACAGCGGCTGGAAGGTCGACGCCGCCAAGGGGGTTCCGGACTGGGCCGCCAGGCTGCCCGACGAACTGCTCGAGGCGCAGCCGAACGCCATTGTGTCGACGCCGGTGTTCGACGGCGCCCAGGAGGCCGAGCTGCAGGGCCTGTTGTCGTGCACGCTGCCCAACCGCGACGGTGACGTGCTGGTCGACGCCGACGGCAAGGCCATGCTCTTCGACGGGCGCAGCGGCGAGCCGTTCCCGTACCCGGTCACGGTTGGCTACATGTACATCATGAAGCTGCACCACCTGGTGGACGACAAGATCCACGCCCGCTCCACCGGGCCGTACTCGATGATCACCCAGCAGCCGCTGGGCGGTAAGGCGCAGTTCGGTGGCCAGCGGTTCGGGGAGATGGAGTGCTGGGCCATGCAGGCCTACGGTGCTGCCTACACCCTGCAGGAGCTGTTGACCATCAAGTCCGATGACACCGTCGGCCGCGTCAAGGTGTACGAGGCGATCGTCAAGGGTGAGAACATCCCGGAGCCGGGCATCCCCGAGTCGTTCAAGGTGCTGCTCAAAGAACTGCAGTCGCTGTGCCTCAACGTCGAGGTGCTATCGAGTGACGGTGCGGCGATCGAACTGCGCGAAGGTGAGGACGAGGACCTGGAGCGGGCCGCGGCCAACCTGGGAATCAATCTGTCCCGCAACGAATCCGCAAGTGTCGAGGATCTTGCGTAA	MLEGCILADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA	MADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA	100.51	gnl|BL_ORD_ID|2005|hsp_num:0	1237
-"NC_000962.3_689 # 759789 # 763325 # 1 # ID=1_689;partial=00;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.643"	NC_000962.3_689 	759789	763325	+	Strict	2300	2394.77	mdtF	99.91	3000815	protein variant model	D516G, H526T, L511R	n/a	rifamycin antibiotic	"antibiotic target alteration; antibiotic target replacement"	rifamycin-resistant beta-subunit of RNA polymerase (rpoB)	GTGCTGGAAGGATGCATCTTGGCAGATTCCCGCCAGAGCAAAACAGCCGCTAGTCCTAGTCCGAGTCGCCCGCAAAGTTCCTCGAATAACTCCGTACCCGGAGCGCCAAACCGGGTCTCCTTCGCTAAGCTGCGCGAACCACTTGAGGTTCCGGGACTCCTTGACGTCCAGACCGATTCGTTCGAGTGGCTGATCGGTTCGCCGCGCTGGCGCGAATCCGCCGCCGAGCGGGGTGATGTCAACCCAGTGGGTGGCCTGGAAGAGGTGCTCTACGAGCTGTCTCCGATCGAGGACTTCTCCGGGTCGATGTCGTTGTCGTTCTCTGACCCTCGTTTCGACGATGTCAAGGCACCCGTCGACGAGTGCAAAGACAAGGACATGACGTACGCGGCTCCACTGTTCGTCACCGCCGAGTTCATCAACAACAACACCGGTGAGATCAAGAGTCAGACGGTGTTCATGGGTGACTTCCCGATGATGACCGAGAAGGGCACGTTCATCATCAACGGGACCGAGCGTGTGGTGGTCAGCCAGCTGGTGCGGTCGCCCGGGGTGTACTTCGACGAGACCATTGACAAGTCCACCGACAAGACGCTGCACAGCGTCAAGGTGATCCCGAGCCGCGGCGCGTGGCTCGAGTTTGACGTCGACAAGCGCGACACCGTCGGCGTGCGCATCGACCGCAAACGCCGGCAACCGGTCACCGTGCTGCTCAAGGCGCTGGGCTGGACCAGCGAGCAGATTGTCGAGCGGTTCGGGTTCTCCGAGATCATGCGATCGACGCTGGAGAAGGACAACACCGTCGGCACCGACGAGGCGCTGTTGGACATCTACCGCAAGCTGCGTCCGGGCGAGCCCCCGACCAAAGAGTCAGCGCAGACGCTGTTGGAAAACTTGTTCTTCAAGGAGAAGCGCTACGACCTGGCCCGCGTCGGTCGCTATAAGGTCAACAAGAAGCTCGGGCTGCATGTCGGCGAGCCCATCACGTCGTCGACGCTGACCGAAGAAGACGTCGTGGCCACCATCGAATATCTGGTCCGCTTGCACGAGGGTCAGACCACGATGACCGTTCCGGGCGGCGTCGAGGTGCCGGTGGAAACCGACGACATCGACCACTTCGGCAACCGCCGCCTGCGTACGGTCGGCGAGCTGATCCAAAACCAGATCCGGGTCGGCATGTCGCGGATGGAGCGGGTGGTCCGGGAGCGGATGACCACCCAGGACGTGGAGGCGATCACACCGCAGACGTTGATCAACATCCGGCCGGTGGTCGCCGCGATCAAGGAGTTCTTCGGCACCAGCCAGCTGAGCCAATTCATGGACCAGAACAACCCGCTGTCGGGGTTGACCCACAAGCGCCGACTGTCGGCGCTGGGGCCCGGCGGTCTGTCACGTGAGCGTGCCGGGCTGGAGGTCCGCGACGTGCACCCGTCGCACTACGGCCGGATGTGCCCGATCGAAACCCCTGAGGGGCCCAACATCGGTCTGATCGGCTCGCTGTCGGTGTACGCGCGGGTCAACCCGTTCGGGTTCATCGAAACGCCGTACCGCAAGGTGGTCGACGGCGTGGTTAGCGACGAGATCGTGTACCTGACCGCCGACGAGGAGGACCGCCACGTGGTGGCACAGGCCAATTCGCCGATCGATGCGGACGGTCGCTTCGTCGAGCCGCGCGTGCTGGTCCGCCGCAAGGCGGGCGAGGTGGAGTACGTGCCCTCGTCTGAGGTGGACTACATGGACGTCTCGCCCCGCCAGATGGTGTCGGTGGCCACCGCGATGATTCCCTTCCTGGAGCACGACGACGCCAACCGTGCCCTCATGGGGGCAAACATGCAGCGCCAGGCGGTGCCGCTGGTCCGTAGCGAGGCCCCGCTGGTGGGCACCGGGATGGAGCTGCGCGCGGCGATCGACGCCGGCGACGTCGTCGTCGCCGAAGAAAGCGGCGTCATCGAGGAGGTGTCGGCCGACTACATCACTGTGATGCACGACAACGGCACCCGGCGTACCTACCGGATGCGCAAGTTTGCCCGGTCCAACCACGGCACTTGCGCCAACCAGTGCCCCATCGTGGACGCGGGCGACCGAGTCGAGGCCGGTCAGGTGATCGCCGACGGTCCCTGTACTGACGACGGCGAGATGGCGCTGGGCAAGAACCTGCTGGTGGCCATCATGCCGTGGGAGGGCCACAACTACGAGGACGCGATCATCCTGTCCAACCGCCTGGTCGAAGAGGACGTGCTCACCTCGATCCACATCGAGGAGCATGAGATCGATGCTCGCGACACCAAGCTGGGTGCGGAGGAGATCACCCGCGACATCCCGAACATCTCCGACGAGGTGCTCGCCGACCTGGATGAGCGGGGCATCGTGCGCATCGGTGCCGAGGTTCGCGACGGGGACATCCTGGTCGGCAAGGTCACCCCGAAGGGTGAGACCGAGCTGACGCCGGAGGAGCGGCTGCTGCGTGCCATCTTCGGTGAGAAGGCCCGCGAGGTGCGCGACACTTCGCTGAAGGTGCCGCACGGCGAATCCGGCAAGGTGATCGGCATTCGGGTGTTTTCCCGCGAGGACGAGGACGAGTTGCCGGCCGGTGTCAACGAGCTGGTGCGTGTGTATGTGGCTCAGAAACGCAAGATCTCCGACGGTGACAAGCTGGCCGGCCGGCACGGCAACAAGGGCGTGATCGGCAAGATCCTGCCGGTTGAGGACATGCCGTTCCTTGCCGACGGCACCCCGGTGGACATTATTTTGAACACCCACGGCGTGCCGCGACGGATGAACATCGGCCAGATTTTGGAGACCCACCTGGGTTGGTGTGCCCACAGCGGCTGGAAGGTCGACGCCGCCAAGGGGGTTCCGGACTGGGCCGCCAGGCTGCCCGACGAACTGCTCGAGGCGCAGCCGAACGCCATTGTGTCGACGCCGGTGTTCGACGGCGCCCAGGAGGCCGAGCTGCAGGGCCTGTTGTCGTGCACGCTGCCCAACCGCGACGGTGACGTGCTGGTCGACGCCGACGGCAAGGCCATGCTCTTCGACGGGCGCAGCGGCGAGCCGTTCCCGTACCCGGTCACGGTTGGCTACATGTACATCATGAAGCTGCACCACCTGGTGGACGACAAGATCCACGCCCGCTCCACCGGGCCGTACTCGATGATCACCCAGCAGCCGCTGGGCGGTAAGGCGCAGTTCGGTGGCCAGCGGTTCGGGGAGATGGAGTGCTGGGCCATGCAGGCCTACGGTGCTGCCTACACCCTGCAGGAGCTGTTGACCATCAAGTCCGATGACACCGTCGGCCGCGTCAAGGTGTACGAGGCGATCGTCAAGGGTGAGAACATCCCGGAGCCGGGCATCCCCGAGTCGTTCAAGGTGCTGCTCAAAGAACTGCAGTCGCTGTGCCTCAACGTCGAGGTGCTATCGAGTGACGGTGCGGCGATCGAACTGCGCGAAGGTGAGGACGAGGACCTGGAGCGGGCCGCGGCCAACCTGGGAATCAATCTGTCCCGCAACGAATCCGCAAGTGTCGAGGATCTTGCGTAA	MLEGCILADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA	MADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA	100.51	gnl|BL_ORD_ID|2005|hsp_num:0	1237
+"NC_000962.3_689 # 759789 # 763325 # 1 # ID=1_689;partial=00;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.643"	NC_000962.3_689 	759789	763325	+	Strict	2300	2394.77	mgrA	99.91	3000815	protein variant model	D516G, H526T, L511R	n/a	rifamycin antibiotic	"antibiotic target alteration; antibiotic target replacement"	rifamycin-resistant beta-subunit of RNA polymerase (rpoB)	GTGCTGGAAGGATGCATCTTGGCAGATTCCCGCCAGAGCAAAACAGCCGCTAGTCCTAGTCCGAGTCGCCCGCAAAGTTCCTCGAATAACTCCGTACCCGGAGCGCCAAACCGGGTCTCCTTCGCTAAGCTGCGCGAACCACTTGAGGTTCCGGGACTCCTTGACGTCCAGACCGATTCGTTCGAGTGGCTGATCGGTTCGCCGCGCTGGCGCGAATCCGCCGCCGAGCGGGGTGATGTCAACCCAGTGGGTGGCCTGGAAGAGGTGCTCTACGAGCTGTCTCCGATCGAGGACTTCTCCGGGTCGATGTCGTTGTCGTTCTCTGACCCTCGTTTCGACGATGTCAAGGCACCCGTCGACGAGTGCAAAGACAAGGACATGACGTACGCGGCTCCACTGTTCGTCACCGCCGAGTTCATCAACAACAACACCGGTGAGATCAAGAGTCAGACGGTGTTCATGGGTGACTTCCCGATGATGACCGAGAAGGGCACGTTCATCATCAACGGGACCGAGCGTGTGGTGGTCAGCCAGCTGGTGCGGTCGCCCGGGGTGTACTTCGACGAGACCATTGACAAGTCCACCGACAAGACGCTGCACAGCGTCAAGGTGATCCCGAGCCGCGGCGCGTGGCTCGAGTTTGACGTCGACAAGCGCGACACCGTCGGCGTGCGCATCGACCGCAAACGCCGGCAACCGGTCACCGTGCTGCTCAAGGCGCTGGGCTGGACCAGCGAGCAGATTGTCGAGCGGTTCGGGTTCTCCGAGATCATGCGATCGACGCTGGAGAAGGACAACACCGTCGGCACCGACGAGGCGCTGTTGGACATCTACCGCAAGCTGCGTCCGGGCGAGCCCCCGACCAAAGAGTCAGCGCAGACGCTGTTGGAAAACTTGTTCTTCAAGGAGAAGCGCTACGACCTGGCCCGCGTCGGTCGCTATAAGGTCAACAAGAAGCTCGGGCTGCATGTCGGCGAGCCCATCACGTCGTCGACGCTGACCGAAGAAGACGTCGTGGCCACCATCGAATATCTGGTCCGCTTGCACGAGGGTCAGACCACGATGACCGTTCCGGGCGGCGTCGAGGTGCCGGTGGAAACCGACGACATCGACCACTTCGGCAACCGCCGCCTGCGTACGGTCGGCGAGCTGATCCAAAACCAGATCCGGGTCGGCATGTCGCGGATGGAGCGGGTGGTCCGGGAGCGGATGACCACCCAGGACGTGGAGGCGATCACACCGCAGACGTTGATCAACATCCGGCCGGTGGTCGCCGCGATCAAGGAGTTCTTCGGCACCAGCCAGCTGAGCCAATTCATGGACCAGAACAACCCGCTGTCGGGGTTGACCCACAAGCGCCGACTGTCGGCGCTGGGGCCCGGCGGTCTGTCACGTGAGCGTGCCGGGCTGGAGGTCCGCGACGTGCACCCGTCGCACTACGGCCGGATGTGCCCGATCGAAACCCCTGAGGGGCCCAACATCGGTCTGATCGGCTCGCTGTCGGTGTACGCGCGGGTCAACCCGTTCGGGTTCATCGAAACGCCGTACCGCAAGGTGGTCGACGGCGTGGTTAGCGACGAGATCGTGTACCTGACCGCCGACGAGGAGGACCGCCACGTGGTGGCACAGGCCAATTCGCCGATCGATGCGGACGGTCGCTTCGTCGAGCCGCGCGTGCTGGTCCGCCGCAAGGCGGGCGAGGTGGAGTACGTGCCCTCGTCTGAGGTGGACTACATGGACGTCTCGCCCCGCCAGATGGTGTCGGTGGCCACCGCGATGATTCCCTTCCTGGAGCACGACGACGCCAACCGTGCCCTCATGGGGGCAAACATGCAGCGCCAGGCGGTGCCGCTGGTCCGTAGCGAGGCCCCGCTGGTGGGCACCGGGATGGAGCTGCGCGCGGCGATCGACGCCGGCGACGTCGTCGTCGCCGAAGAAAGCGGCGTCATCGAGGAGGTGTCGGCCGACTACATCACTGTGATGCACGACAACGGCACCCGGCGTACCTACCGGATGCGCAAGTTTGCCCGGTCCAACCACGGCACTTGCGCCAACCAGTGCCCCATCGTGGACGCGGGCGACCGAGTCGAGGCCGGTCAGGTGATCGCCGACGGTCCCTGTACTGACGACGGCGAGATGGCGCTGGGCAAGAACCTGCTGGTGGCCATCATGCCGTGGGAGGGCCACAACTACGAGGACGCGATCATCCTGTCCAACCGCCTGGTCGAAGAGGACGTGCTCACCTCGATCCACATCGAGGAGCATGAGATCGATGCTCGCGACACCAAGCTGGGTGCGGAGGAGATCACCCGCGACATCCCGAACATCTCCGACGAGGTGCTCGCCGACCTGGATGAGCGGGGCATCGTGCGCATCGGTGCCGAGGTTCGCGACGGGGACATCCTGGTCGGCAAGGTCACCCCGAAGGGTGAGACCGAGCTGACGCCGGAGGAGCGGCTGCTGCGTGCCATCTTCGGTGAGAAGGCCCGCGAGGTGCGCGACACTTCGCTGAAGGTGCCGCACGGCGAATCCGGCAAGGTGATCGGCATTCGGGTGTTTTCCCGCGAGGACGAGGACGAGTTGCCGGCCGGTGTCAACGAGCTGGTGCGTGTGTATGTGGCTCAGAAACGCAAGATCTCCGACGGTGACAAGCTGGCCGGCCGGCACGGCAACAAGGGCGTGATCGGCAAGATCCTGCCGGTTGAGGACATGCCGTTCCTTGCCGACGGCACCCCGGTGGACATTATTTTGAACACCCACGGCGTGCCGCGACGGATGAACATCGGCCAGATTTTGGAGACCCACCTGGGTTGGTGTGCCCACAGCGGCTGGAAGGTCGACGCCGCCAAGGGGGTTCCGGACTGGGCCGCCAGGCTGCCCGACGAACTGCTCGAGGCGCAGCCGAACGCCATTGTGTCGACGCCGGTGTTCGACGGCGCCCAGGAGGCCGAGCTGCAGGGCCTGTTGTCGTGCACGCTGCCCAACCGCGACGGTGACGTGCTGGTCGACGCCGACGGCAAGGCCATGCTCTTCGACGGGCGCAGCGGCGAGCCGTTCCCGTACCCGGTCACGGTTGGCTACATGTACATCATGAAGCTGCACCACCTGGTGGACGACAAGATCCACGCCCGCTCCACCGGGCCGTACTCGATGATCACCCAGCAGCCGCTGGGCGGTAAGGCGCAGTTCGGTGGCCAGCGGTTCGGGGAGATGGAGTGCTGGGCCATGCAGGCCTACGGTGCTGCCTACACCCTGCAGGAGCTGTTGACCATCAAGTCCGATGACACCGTCGGCCGCGTCAAGGTGTACGAGGCGATCGTCAAGGGTGAGAACATCCCGGAGCCGGGCATCCCCGAGTCGTTCAAGGTGCTGCTCAAAGAACTGCAGTCGCTGTGCCTCAACGTCGAGGTGCTATCGAGTGACGGTGCGGCGATCGAACTGCGCGAAGGTGAGGACGAGGACCTGGAGCGGGCCGCGGCCAACCTGGGAATCAATCTGTCCCGCAACGAATCCGCAAGTGTCGAGGATCTTGCGTAA	MLEGCILADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA	MADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA	100.51	gnl|BL_ORD_ID|2005|hsp_num:0	1237
 "NC_000962.3_689 # 759789 # 763325 # 1 # ID=1_689;partial=00;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.643"	NC_000962.3_689 	759789	763325	+	Strict	2300	2394.77	OprN	99.91	3000805	protein variant model	D516G, H526T, L511R	n/a	rifamycin antibiotic	"antibiotic target alteration; antibiotic target replacement"	rifamycin-resistant beta-subunit of RNA polymerase (rpoB)	GTGCTGGAAGGATGCATCTTGGCAGATTCCCGCCAGAGCAAAACAGCCGCTAGTCCTAGTCCGAGTCGCCCGCAAAGTTCCTCGAATAACTCCGTACCCGGAGCGCCAAACCGGGTCTCCTTCGCTAAGCTGCGCGAACCACTTGAGGTTCCGGGACTCCTTGACGTCCAGACCGATTCGTTCGAGTGGCTGATCGGTTCGCCGCGCTGGCGCGAATCCGCCGCCGAGCGGGGTGATGTCAACCCAGTGGGTGGCCTGGAAGAGGTGCTCTACGAGCTGTCTCCGATCGAGGACTTCTCCGGGTCGATGTCGTTGTCGTTCTCTGACCCTCGTTTCGACGATGTCAAGGCACCCGTCGACGAGTGCAAAGACAAGGACATGACGTACGCGGCTCCACTGTTCGTCACCGCCGAGTTCATCAACAACAACACCGGTGAGATCAAGAGTCAGACGGTGTTCATGGGTGACTTCCCGATGATGACCGAGAAGGGCACGTTCATCATCAACGGGACCGAGCGTGTGGTGGTCAGCCAGCTGGTGCGGTCGCCCGGGGTGTACTTCGACGAGACCATTGACAAGTCCACCGACAAGACGCTGCACAGCGTCAAGGTGATCCCGAGCCGCGGCGCGTGGCTCGAGTTTGACGTCGACAAGCGCGACACCGTCGGCGTGCGCATCGACCGCAAACGCCGGCAACCGGTCACCGTGCTGCTCAAGGCGCTGGGCTGGACCAGCGAGCAGATTGTCGAGCGGTTCGGGTTCTCCGAGATCATGCGATCGACGCTGGAGAAGGACAACACCGTCGGCACCGACGAGGCGCTGTTGGACATCTACCGCAAGCTGCGTCCGGGCGAGCCCCCGACCAAAGAGTCAGCGCAGACGCTGTTGGAAAACTTGTTCTTCAAGGAGAAGCGCTACGACCTGGCCCGCGTCGGTCGCTATAAGGTCAACAAGAAGCTCGGGCTGCATGTCGGCGAGCCCATCACGTCGTCGACGCTGACCGAAGAAGACGTCGTGGCCACCATCGAATATCTGGTCCGCTTGCACGAGGGTCAGACCACGATGACCGTTCCGGGCGGCGTCGAGGTGCCGGTGGAAACCGACGACATCGACCACTTCGGCAACCGCCGCCTGCGTACGGTCGGCGAGCTGATCCAAAACCAGATCCGGGTCGGCATGTCGCGGATGGAGCGGGTGGTCCGGGAGCGGATGACCACCCAGGACGTGGAGGCGATCACACCGCAGACGTTGATCAACATCCGGCCGGTGGTCGCCGCGATCAAGGAGTTCTTCGGCACCAGCCAGCTGAGCCAATTCATGGACCAGAACAACCCGCTGTCGGGGTTGACCCACAAGCGCCGACTGTCGGCGCTGGGGCCCGGCGGTCTGTCACGTGAGCGTGCCGGGCTGGAGGTCCGCGACGTGCACCCGTCGCACTACGGCCGGATGTGCCCGATCGAAACCCCTGAGGGGCCCAACATCGGTCTGATCGGCTCGCTGTCGGTGTACGCGCGGGTCAACCCGTTCGGGTTCATCGAAACGCCGTACCGCAAGGTGGTCGACGGCGTGGTTAGCGACGAGATCGTGTACCTGACCGCCGACGAGGAGGACCGCCACGTGGTGGCACAGGCCAATTCGCCGATCGATGCGGACGGTCGCTTCGTCGAGCCGCGCGTGCTGGTCCGCCGCAAGGCGGGCGAGGTGGAGTACGTGCCCTCGTCTGAGGTGGACTACATGGACGTCTCGCCCCGCCAGATGGTGTCGGTGGCCACCGCGATGATTCCCTTCCTGGAGCACGACGACGCCAACCGTGCCCTCATGGGGGCAAACATGCAGCGCCAGGCGGTGCCGCTGGTCCGTAGCGAGGCCCCGCTGGTGGGCACCGGGATGGAGCTGCGCGCGGCGATCGACGCCGGCGACGTCGTCGTCGCCGAAGAAAGCGGCGTCATCGAGGAGGTGTCGGCCGACTACATCACTGTGATGCACGACAACGGCACCCGGCGTACCTACCGGATGCGCAAGTTTGCCCGGTCCAACCACGGCACTTGCGCCAACCAGTGCCCCATCGTGGACGCGGGCGACCGAGTCGAGGCCGGTCAGGTGATCGCCGACGGTCCCTGTACTGACGACGGCGAGATGGCGCTGGGCAAGAACCTGCTGGTGGCCATCATGCCGTGGGAGGGCCACAACTACGAGGACGCGATCATCCTGTCCAACCGCCTGGTCGAAGAGGACGTGCTCACCTCGATCCACATCGAGGAGCATGAGATCGATGCTCGCGACACCAAGCTGGGTGCGGAGGAGATCACCCGCGACATCCCGAACATCTCCGACGAGGTGCTCGCCGACCTGGATGAGCGGGGCATCGTGCGCATCGGTGCCGAGGTTCGCGACGGGGACATCCTGGTCGGCAAGGTCACCCCGAAGGGTGAGACCGAGCTGACGCCGGAGGAGCGGCTGCTGCGTGCCATCTTCGGTGAGAAGGCCCGCGAGGTGCGCGACACTTCGCTGAAGGTGCCGCACGGCGAATCCGGCAAGGTGATCGGCATTCGGGTGTTTTCCCGCGAGGACGAGGACGAGTTGCCGGCCGGTGTCAACGAGCTGGTGCGTGTGTATGTGGCTCAGAAACGCAAGATCTCCGACGGTGACAAGCTGGCCGGCCGGCACGGCAACAAGGGCGTGATCGGCAAGATCCTGCCGGTTGAGGACATGCCGTTCCTTGCCGACGGCACCCCGGTGGACATTATTTTGAACACCCACGGCGTGCCGCGACGGATGAACATCGGCCAGATTTTGGAGACCCACCTGGGTTGGTGTGCCCACAGCGGCTGGAAGGTCGACGCCGCCAAGGGGGTTCCGGACTGGGCCGCCAGGCTGCCCGACGAACTGCTCGAGGCGCAGCCGAACGCCATTGTGTCGACGCCGGTGTTCGACGGCGCCCAGGAGGCCGAGCTGCAGGGCCTGTTGTCGTGCACGCTGCCCAACCGCGACGGTGACGTGCTGGTCGACGCCGACGGCAAGGCCATGCTCTTCGACGGGCGCAGCGGCGAGCCGTTCCCGTACCCGGTCACGGTTGGCTACATGTACATCATGAAGCTGCACCACCTGGTGGACGACAAGATCCACGCCCGCTCCACCGGGCCGTACTCGATGATCACCCAGCAGCCGCTGGGCGGTAAGGCGCAGTTCGGTGGCCAGCGGTTCGGGGAGATGGAGTGCTGGGCCATGCAGGCCTACGGTGCTGCCTACACCCTGCAGGAGCTGTTGACCATCAAGTCCGATGACACCGTCGGCCGCGTCAAGGTGTACGAGGCGATCGTCAAGGGTGAGAACATCCCGGAGCCGGGCATCCCCGAGTCGTTCAAGGTGCTGCTCAAAGAACTGCAGTCGCTGTGCCTCAACGTCGAGGTGCTATCGAGTGACGGTGCGGCGATCGAACTGCGCGAAGGTGAGGACGAGGACCTGGAGCGGGCCGCGGCCAACCTGGGAATCAATCTGTCCCGCAACGAATCCGCAAGTGTCGAGGATCTTGCGTAA	MLEGCILADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA	MADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA	100.51	gnl|BL_ORD_ID|2005|hsp_num:0	1237
-"NC_000962.3_689 # 759789 # 763325 # 1 # ID=1_689;partial=00;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.643"	NC_000962.3_689 	759789	763325	+	Strict	2300	2394.77	mepA	99.91	3000026	protein variant model	D516G, H526T, L511R	n/a	rifamycin antibiotic	"antibiotic target alteration; antibiotic target replacement"	rifamycin-resistant beta-subunit of RNA polymerase (rpoB)	GTGCTGGAAGGATGCATCTTGGCAGATTCCCGCCAGAGCAAAACAGCCGCTAGTCCTAGTCCGAGTCGCCCGCAAAGTTCCTCGAATAACTCCGTACCCGGAGCGCCAAACCGGGTCTCCTTCGCTAAGCTGCGCGAACCACTTGAGGTTCCGGGACTCCTTGACGTCCAGACCGATTCGTTCGAGTGGCTGATCGGTTCGCCGCGCTGGCGCGAATCCGCCGCCGAGCGGGGTGATGTCAACCCAGTGGGTGGCCTGGAAGAGGTGCTCTACGAGCTGTCTCCGATCGAGGACTTCTCCGGGTCGATGTCGTTGTCGTTCTCTGACCCTCGTTTCGACGATGTCAAGGCACCCGTCGACGAGTGCAAAGACAAGGACATGACGTACGCGGCTCCACTGTTCGTCACCGCCGAGTTCATCAACAACAACACCGGTGAGATCAAGAGTCAGACGGTGTTCATGGGTGACTTCCCGATGATGACCGAGAAGGGCACGTTCATCATCAACGGGACCGAGCGTGTGGTGGTCAGCCAGCTGGTGCGGTCGCCCGGGGTGTACTTCGACGAGACCATTGACAAGTCCACCGACAAGACGCTGCACAGCGTCAAGGTGATCCCGAGCCGCGGCGCGTGGCTCGAGTTTGACGTCGACAAGCGCGACACCGTCGGCGTGCGCATCGACCGCAAACGCCGGCAACCGGTCACCGTGCTGCTCAAGGCGCTGGGCTGGACCAGCGAGCAGATTGTCGAGCGGTTCGGGTTCTCCGAGATCATGCGATCGACGCTGGAGAAGGACAACACCGTCGGCACCGACGAGGCGCTGTTGGACATCTACCGCAAGCTGCGTCCGGGCGAGCCCCCGACCAAAGAGTCAGCGCAGACGCTGTTGGAAAACTTGTTCTTCAAGGAGAAGCGCTACGACCTGGCCCGCGTCGGTCGCTATAAGGTCAACAAGAAGCTCGGGCTGCATGTCGGCGAGCCCATCACGTCGTCGACGCTGACCGAAGAAGACGTCGTGGCCACCATCGAATATCTGGTCCGCTTGCACGAGGGTCAGACCACGATGACCGTTCCGGGCGGCGTCGAGGTGCCGGTGGAAACCGACGACATCGACCACTTCGGCAACCGCCGCCTGCGTACGGTCGGCGAGCTGATCCAAAACCAGATCCGGGTCGGCATGTCGCGGATGGAGCGGGTGGTCCGGGAGCGGATGACCACCCAGGACGTGGAGGCGATCACACCGCAGACGTTGATCAACATCCGGCCGGTGGTCGCCGCGATCAAGGAGTTCTTCGGCACCAGCCAGCTGAGCCAATTCATGGACCAGAACAACCCGCTGTCGGGGTTGACCCACAAGCGCCGACTGTCGGCGCTGGGGCCCGGCGGTCTGTCACGTGAGCGTGCCGGGCTGGAGGTCCGCGACGTGCACCCGTCGCACTACGGCCGGATGTGCCCGATCGAAACCCCTGAGGGGCCCAACATCGGTCTGATCGGCTCGCTGTCGGTGTACGCGCGGGTCAACCCGTTCGGGTTCATCGAAACGCCGTACCGCAAGGTGGTCGACGGCGTGGTTAGCGACGAGATCGTGTACCTGACCGCCGACGAGGAGGACCGCCACGTGGTGGCACAGGCCAATTCGCCGATCGATGCGGACGGTCGCTTCGTCGAGCCGCGCGTGCTGGTCCGCCGCAAGGCGGGCGAGGTGGAGTACGTGCCCTCGTCTGAGGTGGACTACATGGACGTCTCGCCCCGCCAGATGGTGTCGGTGGCCACCGCGATGATTCCCTTCCTGGAGCACGACGACGCCAACCGTGCCCTCATGGGGGCAAACATGCAGCGCCAGGCGGTGCCGCTGGTCCGTAGCGAGGCCCCGCTGGTGGGCACCGGGATGGAGCTGCGCGCGGCGATCGACGCCGGCGACGTCGTCGTCGCCGAAGAAAGCGGCGTCATCGAGGAGGTGTCGGCCGACTACATCACTGTGATGCACGACAACGGCACCCGGCGTACCTACCGGATGCGCAAGTTTGCCCGGTCCAACCACGGCACTTGCGCCAACCAGTGCCCCATCGTGGACGCGGGCGACCGAGTCGAGGCCGGTCAGGTGATCGCCGACGGTCCCTGTACTGACGACGGCGAGATGGCGCTGGGCAAGAACCTGCTGGTGGCCATCATGCCGTGGGAGGGCCACAACTACGAGGACGCGATCATCCTGTCCAACCGCCTGGTCGAAGAGGACGTGCTCACCTCGATCCACATCGAGGAGCATGAGATCGATGCTCGCGACACCAAGCTGGGTGCGGAGGAGATCACCCGCGACATCCCGAACATCTCCGACGAGGTGCTCGCCGACCTGGATGAGCGGGGCATCGTGCGCATCGGTGCCGAGGTTCGCGACGGGGACATCCTGGTCGGCAAGGTCACCCCGAAGGGTGAGACCGAGCTGACGCCGGAGGAGCGGCTGCTGCGTGCCATCTTCGGTGAGAAGGCCCGCGAGGTGCGCGACACTTCGCTGAAGGTGCCGCACGGCGAATCCGGCAAGGTGATCGGCATTCGGGTGTTTTCCCGCGAGGACGAGGACGAGTTGCCGGCCGGTGTCAACGAGCTGGTGCGTGTGTATGTGGCTCAGAAACGCAAGATCTCCGACGGTGACAAGCTGGCCGGCCGGCACGGCAACAAGGGCGTGATCGGCAAGATCCTGCCGGTTGAGGACATGCCGTTCCTTGCCGACGGCACCCCGGTGGACATTATTTTGAACACCCACGGCGTGCCGCGACGGATGAACATCGGCCAGATTTTGGAGACCCACCTGGGTTGGTGTGCCCACAGCGGCTGGAAGGTCGACGCCGCCAAGGGGGTTCCGGACTGGGCCGCCAGGCTGCCCGACGAACTGCTCGAGGCGCAGCCGAACGCCATTGTGTCGACGCCGGTGTTCGACGGCGCCCAGGAGGCCGAGCTGCAGGGCCTGTTGTCGTGCACGCTGCCCAACCGCGACGGTGACGTGCTGGTCGACGCCGACGGCAAGGCCATGCTCTTCGACGGGCGCAGCGGCGAGCCGTTCCCGTACCCGGTCACGGTTGGCTACATGTACATCATGAAGCTGCACCACCTGGTGGACGACAAGATCCACGCCCGCTCCACCGGGCCGTACTCGATGATCACCCAGCAGCCGCTGGGCGGTAAGGCGCAGTTCGGTGGCCAGCGGTTCGGGGAGATGGAGTGCTGGGCCATGCAGGCCTACGGTGCTGCCTACACCCTGCAGGAGCTGTTGACCATCAAGTCCGATGACACCGTCGGCCGCGTCAAGGTGTACGAGGCGATCGTCAAGGGTGAGAACATCCCGGAGCCGGGCATCCCCGAGTCGTTCAAGGTGCTGCTCAAAGAACTGCAGTCGCTGTGCCTCAACGTCGAGGTGCTATCGAGTGACGGTGCGGCGATCGAACTGCGCGAAGGTGAGGACGAGGACCTGGAGCGGGCCGCGGCCAACCTGGGAATCAATCTGTCCCGCAACGAATCCGCAAGTGTCGAGGATCTTGCGTAA	MLEGCILADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA	MADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA	100.51	gnl|BL_ORD_ID|2005|hsp_num:0	1237
+"NC_000962.3_689 # 759789 # 763325 # 1 # ID=1_689;partial=00;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.643"	NC_000962.3_689 	759789	763325	+	Strict	2300	2394.77	mepA	99.91	3000026	protein variant model	D516G, H526T, L511R	n/a	rifamycin antibiotic	"antibiotic target alteration; antibiotic target replacement"	rifamycin-resistant beta-subunit of RNA polymerase (rpoB)	GTGCTGGAAGGATGCATCTTGGCAGATTCCCGCCAGAGCAAAACAGCCGCTAGTCCTAGTCCGAGTCGCCCGCAAAGTTCCTCGAATAACTCCGTACCCGGAGCGCCAAACCGGGTCTCCTTCGCTAAGCTGCGCGAACCACTTGAGGTTCCGGGACTCCTTGACGTCCAGACCGATTCGTTCGAGTGGCTGATCGGTTCGCCGCGCTGGCGCGAATCCGCCGCCGAGCGGGGTGATGTCAACCCAGTGGGTGGCCTGGAAGAGGTGCTCTACGAGCTGTCTCCGATCGAGGACTTCTCCGGGTCGATGTCGTTGTCGTTCTCTGACCCTCGTTTCGACGATGTCAAGGCACCCGTCGACGAGTGCAAAGACAAGGACATGACGTACGCGGCTCCACTGTTCGTCACCGCCGAGTTCATCAACAACAACACCGGTGAGATCAAGAGTCAGACGGTGTTCATGGGTGACTTCCCGATGATGACCGAGAAGGGCACGTTCATCATCAACGGGACCGAGCGTGTGGTGGTCAGCCAGCTGGTGCGGTCGCCCGGGGTGTACTTCGACGAGACCATTGACAAGTCCACCGACAAGACGCTGCACAGCGTCAAGGTGATCCCGAGCCGCGGCGCGTGGCTCGAGTTTGACGTCGACAAGCGCGACACCGTCGGCGTGCGCATCGACCGCAAACGCCGGCAACCGGTCACCGTGCTGCTCAAGGCGCTGGGCTGGACCAGCGAGCAGATTGTCGAGCGGTTCGGGTTCTCCGAGATCATGCGATCGACGCTGGAGAAGGACAACACCGTCGGCACCGACGAGGCGCTGTTGGACATCTACCGCAAGCTGCGTCCGGGCGAGCCCCCGACCAAAGAGTCAGCGCAGACGCTGTTGGAAAACTTGTTCTTCAAGGAGAAGCGCTACGACCTGGCCCGCGTCGGTCGCTATAAGGTCAACAAGAAGCTCGGGCTGCATGTCGGCGAGCCCATCACGTCGTCGACGCTGACCGAAGAAGACGTCGTGGCCACCATCGAATATCTGGTCCGCTTGCACGAGGGTCAGACCACGATGACCGTTCCGGGCGGCGTCGAGGTGCCGGTGGAAACCGACGACATCGACCACTTCGGCAACCGCCGCCTGCGTACGGTCGGCGAGCTGATCCAAAACCAGATCCGGGTCGGCATGTCGCGGATGGAGCGGGTGGTCCGGGAGCGGATGACCACCCAGGACGTGGAGGCGATCACACCGCAGACGTTGATCAACATCCGGCCGGTGGTCGCCGCGATCAAGGAGTTCTTCGGCACCAGCCAGCTGAGCCAATTCATGGACCAGAACAACCCGCTGTCGGGGTTGACCCACAAGCGCCGACTGTCGGCGCTGGGGCCCGGCGGTCTGTCACGTGAGCGTGCCGGGCTGGAGGTCCGCGACGTGCACCCGTCGCACTACGGCCGGATGTGCCCGATCGAAACCCCTGAGGGGCCCAACATCGGTCTGATCGGCTCGCTGTCGGTGTACGCGCGGGTCAACCCGTTCGGGTTCATCGAAACGCCGTACCGCAAGGTGGTCGACGGCGTGGTTAGCGACGAGATCGTGTACCTGACCGCCGACGAGGAGGACCGCCACGTGGTGGCACAGGCCAATTCGCCGATCGATGCGGACGGTCGCTTCGTCGAGCCGCGCGTGCTGGTCCGCCGCAAGGCGGGCGAGGTGGAGTACGTGCCCTCGTCTGAGGTGGACTACATGGACGTCTCGCCCCGCCAGATGGTGTCGGTGGCCACCGCGATGATTCCCTTCCTGGAGCACGACGACGCCAACCGTGCCCTCATGGGGGCAAACATGCAGCGCCAGGCGGTGCCGCTGGTCCGTAGCGAGGCCCCGCTGGTGGGCACCGGGATGGAGCTGCGCGCGGCGATCGACGCCGGCGACGTCGTCGTCGCCGAAGAAAGCGGCGTCATCGAGGAGGTGTCGGCCGACTACATCACTGTGATGCACGACAACGGCACCCGGCGTACCTACCGGATGCGCAAGTTTGCCCGGTCCAACCACGGCACTTGCGCCAACCAGTGCCCCATCGTGGACGCGGGCGACCGAGTCGAGGCCGGTCAGGTGATCGCCGACGGTCCCTGTACTGACGACGGCGAGATGGCGCTGGGCAAGAACCTGCTGGTGGCCATCATGCCGTGGGAGGGCCACAACTACGAGGACGCGATCATCCTGTCCAACCGCCTGGTCGAAGAGGACGTGCTCACCTCGATCCACATCGAGGAGCATGAGATCGATGCTCGCGACACCAAGCTGGGTGCGGAGGAGATCACCCGCGACATCCCGAACATCTCCGACGAGGTGCTCGCCGACCTGGATGAGCGGGGCATCGTGCGCATCGGTGCCGAGGTTCGCGACGGGGACATCCTGGTCGGCAAGGTCACCCCGAAGGGTGAGACCGAGCTGACGCCGGAGGAGCGGCTGCTGCGTGCCATCTTCGGTGAGAAGGCCCGCGAGGTGCGCGACACTTCGCTGAAGGTGCCGCACGGCGAATCCGGCAAGGTGATCGGCATTCGGGTGTTTTCCCGCGAGGACGAGGACGAGTTGCCGGCCGGTGTCAACGAGCTGGTGCGTGTGTATGTGGCTCAGAAACGCAAGATCTCCGACGGTGACAAGCTGGCCGGCCGGCACGGCAACAAGGGCGTGATCGGCAAGATCCTGCCGGTTGAGGACATGCCGTTCCTTGCCGACGGCACCCCGGTGGACATTATTTTGAACACCCACGGCGTGCCGCGACGGATGAACATCGGCCAGATTTTGGAGACCCACCTGGGTTGGTGTGCCCACAGCGGCTGGAAGGTCGACGCCGCCAAGGGGGTTCCGGACTGGGCCGCCAGGCTGCCCGACGAACTGCTCGAGGCGCAGCCGAACGCCATTGTGTCGACGCCGGTGTTCGACGGCGCCCAGGAGGCCGAGCTGCAGGGCCTGTTGTCGTGCACGCTGCCCAACCGCGACGGTGACGTGCTGGTCGACGCCGACGGCAAGGCCATGCTCTTCGACGGGCGCAGCGGCGAGCCGTTCCCGTACCCGGTCACGGTTGGCTACATGTACATCATGAAGCTGCACCACCTGGTGGACGACAAGATCCACGCCCGCTCCACCGGGCCGTACTCGATGATCACCCAGCAGCCGCTGGGCGGTAAGGCGCAGTTCGGTGGCCAGCGGTTCGGGGAGATGGAGTGCTGGGCCATGCAGGCCTACGGTGCTGCCTACACCCTGCAGGAGCTGTTGACCATCAAGTCCGATGACACCGTCGGCCGCGTCAAGGTGTACGAGGCGATCGTCAAGGGTGAGAACATCCCGGAGCCGGGCATCCCCGAGTCGTTCAAGGTGCTGCTCAAAGAACTGCAGTCGCTGTGCCTCAACGTCGAGGTGCTATCGAGTGACGGTGCGGCGATCGAACTGCGCGAAGGTGAGGACGAGGACCTGGAGCGGGCCGCGGCCAACCTGGGAATCAATCTGTCCCGCAACGAATCCGCAAGTGTCGAGGATCTTGCGTAA	MLEGCILADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA	MADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA	100.51	gnl|BL_ORD_ID|2005|hsp_num:0	1237
diff --git a/q2_amr/card/tests/test_utils.py b/q2_amr/card/tests/test_utils.py
index 3e833cb..e6999ce 100644
--- a/q2_amr/card/tests/test_utils.py
+++ b/q2_amr/card/tests/test_utils.py
@@ -15,31 +15,27 @@ class TestAnnotateReadsCARD(TestPluginBase):
 
     @classmethod
     def setUpClass(cls):
-        cls.allele_count_df = pd.DataFrame(
-            {
-                "Reference Sequence": [
+        cls.count_df_list = []
+        for colname, values in zip(
+            ["Reference Sequence", "ARO Term", "Best_Hit_ARO"],
+            [
+                [
                     "ARO:3000796|ID:121|Name:mdtF|NCBI:U00096.1",
                     "ARO:3000815|ID:154|Name:mgrA|NCBI:BA000018.3",
                     "ARO:3000805|ID:172|Name:OprN|NCBI:AE004091.2",
                     "ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1",
                 ],
-                "sample1": ["1", "1", "1", "1"],
-            }
-        )
-
-        cls.gene_count_df = pd.DataFrame(
-            {
-                "ARO Term": ["mdtF", "mgrA", "OprN", "mepA"],
-                "sample1": ["1", "1", "1", "1"],
-            }
-        )
-
-        cls.mag_count_df = pd.DataFrame(
-            {
-                "Best_Hit_ARO": ["mdtF", "OprN", "mepA"],
-                "sample1/bin1": ["2", "1", "1"],
-            }
-        )
+                ["mdtF", "mgrA", "OprN", "mepA"],
+                ["mdtF", "mgrA", "OprN", "mepA"],
+            ],
+        ):
+            df = pd.DataFrame(
+                {
+                    colname: values,
+                    "sample1": ["1", "1", "1", "1"],
+                }
+            )
+            cls.count_df_list.append(df)
 
         cls.frequency_table = pd.DataFrame(
             {
@@ -138,8 +134,8 @@ def test_read_in_txt_mags(self):
         # Test read_in_txt with output data from annotate_mags_card
         self.read_in_txt_test_body(
             filename="output.mags.txt",
-            samp_bin_name="sample1/bin1",
-            exp=self.mag_count_df,
+            samp_bin_name="sample1",
+            exp=self.count_df_list[2],
             data_type="mags",
         )
 
@@ -148,7 +144,7 @@ def test_read_in_txt_reads_allele(self):
         self.read_in_txt_test_body(
             filename="output.allele_mapping_data.txt",
             samp_bin_name="sample1",
-            exp=self.allele_count_df,
+            exp=self.count_df_list[0],
             data_type="reads",
             map_type="allele",
         )
@@ -158,7 +154,7 @@ def test_read_in_txt_reads_gene(self):
         self.read_in_txt_test_body(
             filename="output.gene_mapping_data.txt",
             samp_bin_name="sample1",
-            exp=self.gene_count_df,
+            exp=self.count_df_list[1],
             data_type="reads",
             map_type="gene",
         )
@@ -174,7 +170,7 @@ def read_in_txt_test_body(
 
     def test_create_count_table(self):
         # Create list of dataframes to be used by create_count_table
-        df_list = [self.gene_count_df.copy(), self.gene_count_df.copy()]
+        df_list = [self.count_df_list[1].copy(), self.count_df_list[1].copy()]
         df_list[1].iloc[0, 0] = "mdtE"
         df_list[1].rename(columns={"sample1": "sample2"}, inplace=True)
 
diff --git a/q2_amr/card/utils.py b/q2_amr/card/utils.py
index 69c600c..5b9e526 100644
--- a/q2_amr/card/utils.py
+++ b/q2_amr/card/utils.py
@@ -107,10 +107,8 @@ def read_in_txt(path: str, samp_bin_name: str, data_type: str, map_type=None):
         df = df[[colname, "All Mapped Reads"]]
         df.rename(columns={"All Mapped Reads": samp_bin_name}, inplace=True)
     else:
-        df = df["Best_Hit_ARO"].value_counts().reset_index()
-
-        # Rename the columns
-        df.columns = ["Best_Hit_ARO", samp_bin_name]
+        df = df[["Best_Hit_ARO"]]
+        df[samp_bin_name] = 1
 
     df = df.astype(str)
     return df
diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py
index 1bdaafe..feb15f9 100644
--- a/q2_amr/plugin_setup.py
+++ b/q2_amr/plugin_setup.py
@@ -7,7 +7,7 @@
 # ----------------------------------------------------------------------------
 import importlib
 
-from q2_types.feature_table import FeatureTable, Frequency
+from q2_types.feature_table import FeatureTable, Frequency, PresenceAbsence
 from q2_types.per_sample_sequences import (
     PairedEndSequencesWithQuality,
     SequencesWithQuality,
@@ -42,15 +42,12 @@
     CARDKmerTXTFormat,
     CARDWildcardIndexFormat,
     GapDNAFASTAFormat,
-    GeneLengthDirectoryFormat,
-    GeneLengthFormat,
 )
 from q2_amr.types._type import (
     CARDAlleleAnnotation,
     CARDAnnotation,
     CARDGeneAnnotation,
     CARDKmerDatabase,
-    GeneLength,
 )
 
 citations = Citations.load("citations.bib", package="q2_amr")
@@ -98,7 +95,7 @@
     },
     outputs=[
         ("amr_annotations", SampleData[CARDAnnotation]),
-        ("feature_table", FeatureTable[Frequency]),
+        ("feature_table", FeatureTable[PresenceAbsence]),
     ],
     input_descriptions={
         "mag": "MAGs to be annotated with CARD.",
@@ -115,7 +112,7 @@
     },
     output_descriptions={
         "amr_annotations": "AMR annotation as .txt and .json file.",
-        "feature_table": "Frequency table of ARGs in all samples.",
+        "feature_table": "Presence and absence table of ARGs in all samples.",
     },
     name="Annotate MAGs with antimicrobial resistance genes from CARD.",
     description="Annotate MAGs with antimicrobial resistance genes from CARD.",
@@ -280,9 +277,6 @@
 plugin.register_semantic_type_to_format(
     SampleData[CARDGeneAnnotation], artifact_format=CARDGeneAnnotationDirectoryFormat
 )
-plugin.register_semantic_type_to_format(
-    GeneLength, artifact_format=GeneLengthDirectoryFormat
-)
 
 plugin.register_formats(
     CARDKmerDatabaseDirectoryFormat,
@@ -300,8 +294,6 @@
     CARDAnnotationStatsFormat,
     CARDAlleleAnnotationDirectoryFormat,
     CARDGeneAnnotationDirectoryFormat,
-    GeneLengthFormat,
-    GeneLengthDirectoryFormat,
 )
 
 importlib.import_module("q2_amr.types._transformer")
diff --git a/q2_amr/types/__init__.py b/q2_amr/types/__init__.py
index 94f3f12..7f5d25d 100644
--- a/q2_amr/types/__init__.py
+++ b/q2_amr/types/__init__.py
@@ -22,8 +22,6 @@
     CARDKmerTXTFormat,
     CARDWildcardIndexFormat,
     GapDNAFASTAFormat,
-    GeneLengthDirectoryFormat,
-    GeneLengthFormat,
 )
 from ._type import (
     CARDAlleleAnnotation,
@@ -54,6 +52,4 @@
     "GapDNAFASTAFormat",
     "CARDWildcardIndexFormat",
     "CARDKmerDatabase",
-    "GeneLengthDirectoryFormat",
-    "GeneLengthFormat",
 ]
diff --git a/q2_amr/types/_format.py b/q2_amr/types/_format.py
index 7ee2eca..de8c44a 100644
--- a/q2_amr/types/_format.py
+++ b/q2_amr/types/_format.py
@@ -425,25 +425,3 @@ class CARDGeneAnnotationDirectoryFormat(MultiDirValidationMixin, model.Directory
     @gene.set_path_maker
     def gene_path_maker(self, sample_id):
         return "%s/gene_mapping_data.txt" % sample_id
-
-
-class GeneLengthFormat(model.TextFileFormat):
-    def _validate(self, n_records=None):
-        df = pd.read_csv(str(self), sep="\t", header=None)
-        has_two_columns = len(df.columns) == 2
-        lengths_num = df.iloc[:, 1].apply(lambda x: isinstance(x, (float, int))).all()
-        if not (has_two_columns and lengths_num):
-            raise ValidationError(
-                "Format does not match GeneLengthsFormat. Must consist of tab "
-                "separated values in two columns with no header. The first column must "
-                "be the gene names and the second column must be the corresponding "
-                "gene lengths."
-            )
-
-    def _validate_(self, level):
-        self._validate()
-
-
-GeneLengthDirectoryFormat = model.SingleFileDirectoryFormat(
-    "GeneLengthDirectoryFormat", "gene_length.txt", GeneLengthFormat
-)
diff --git a/q2_amr/types/_transformer.py b/q2_amr/types/_transformer.py
index f6fa159..6d08cf1 100644
--- a/q2_amr/types/_transformer.py
+++ b/q2_amr/types/_transformer.py
@@ -26,7 +26,6 @@
     CARDAnnotationTXTFormat,
     CARDDatabaseFormat,
     CARDGeneAnnotationDirectoryFormat,
-    GeneLengthDirectoryFormat,
 )
 
 
@@ -254,23 +253,3 @@ def tabulate_data(data_path, data_type):
     if data_type == "mags":
         df_combined.rename(columns={"ID": "HSP_Identifier"}, inplace=True)
     return qiime2.Metadata(df_combined)
-
-
-@plugin.register_transformer
-def _15(data: CARDAlleleAnnotationDirectoryFormat) -> GeneLengthDirectoryFormat:
-    len_all = pd.Series()
-    directory = GeneLengthDirectoryFormat()
-    # Iterate over samples in the specified path
-    for samp in os.listdir(data.path):
-        anno_txt = os.path.join(data.path, samp, "allele_mapping_data.txt")
-
-        # Read each DataFrame and append it to the list
-        len_sample = pd.read_csv(
-            anno_txt, sep="\t", usecols=["Reference Sequence", "Reference Length"]
-        ).set_index("Reference Sequence")["Reference Length"]
-
-        len_all = len_all.combine_first(len_sample)
-    len_all.to_csv(
-        os.path.join(directory.path, "gene_length.txt"), sep="\t", header=False
-    )
-    return directory
diff --git a/q2_amr/types/_type.py b/q2_amr/types/_type.py
index bee4195..aa2f302 100644
--- a/q2_amr/types/_type.py
+++ b/q2_amr/types/_type.py
@@ -17,4 +17,3 @@
 CARDGeneAnnotation = SemanticType(
     "CARDGeneAnnotation", variant_of=SampleData.field["type"]
 )
-GeneLength = SemanticType("GeneLength")
diff --git a/q2_amr/types/tests/data/gene_length.txt b/q2_amr/types/tests/data/gene_length.txt
deleted file mode 100644
index 2fddb36..0000000
--- a/q2_amr/types/tests/data/gene_length.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1	1356.0
-ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1	1173.0
diff --git a/q2_amr/types/tests/data/gene_length_3_cols.txt b/q2_amr/types/tests/data/gene_length_3_cols.txt
deleted file mode 100644
index a1803bb..0000000
--- a/q2_amr/types/tests/data/gene_length_3_cols.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1	1356.0	1
-ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1	1173.0	1
diff --git a/q2_amr/types/tests/data/gene_length_switched.txt b/q2_amr/types/tests/data/gene_length_switched.txt
deleted file mode 100644
index cae5b65..0000000
--- a/q2_amr/types/tests/data/gene_length_switched.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-1356.0	ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1
-1173.0	ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1
diff --git a/q2_amr/types/tests/test_types_formats_transformers.py b/q2_amr/types/tests/test_types_formats_transformers.py
index e0de12b..a594eaa 100644
--- a/q2_amr/types/tests/test_types_formats_transformers.py
+++ b/q2_amr/types/tests/test_types_formats_transformers.py
@@ -21,7 +21,6 @@
     ProteinIterator,
 )
 from q2_types_genomics.genome_data import GenesDirectoryFormat, ProteinsDirectoryFormat
-from qiime2.plugin import ValidationError
 from qiime2.plugin.testing import TestPluginBase
 from skbio import DNA, Protein
 
@@ -39,8 +38,6 @@
     CARDKmerTXTFormat,
     CARDWildcardIndexFormat,
     GapDNAFASTAFormat,
-    GeneLengthDirectoryFormat,
-    GeneLengthFormat,
 )
 from q2_amr.types._transformer import (
     _read_from_card_file,
@@ -331,40 +328,3 @@ def test_CARDAlleleAnnotationDirectoryFormat_to_qiime2_Metadata_transformer(self
         )
         metadata_obt = transformer(annotation)
         self.assertIsInstance(metadata_obt, qiime2.Metadata)
-
-
-class TestGeneLengthsTypesAndFormats(AMRTypesTestPluginBase):
-    def test_gene_length_format_validate_positive(self):
-        filepath = self.get_data_path("gene_length.txt")
-        format = GeneLengthFormat(filepath, mode="r")
-        format.validate()
-
-    def test_gene_length_format_validate_negative_3_cols(self):
-        filepath = self.get_data_path("gene_length_3_cols.txt")
-        with self.assertRaises(ValidationError):
-            format = GeneLengthFormat(filepath, mode="r")
-            format.validate()
-
-    def test_gene_length_format_validate_negative_switched(self):
-        filepath = self.get_data_path("gene_length_switched.txt")
-        with self.assertRaises(ValidationError):
-            format = GeneLengthFormat(filepath, mode="r")
-            format.validate()
-
-    def test_gene_length_directory_format_validate_positive(self):
-        filepath = self.get_data_path("gene_length.txt")
-        shutil.copy(filepath, os.path.join(self.temp_dir.name, "gene_length.txt"))
-        format = GeneLengthDirectoryFormat(self.temp_dir.name, mode="r")
-        format.validate()
-
-    def test_card_annotation_format_to_df_transformertt(self):
-        transformer = self.get_transformer(
-            CARDAlleleAnnotationDirectoryFormat, GeneLengthDirectoryFormat
-        )
-
-        annotation = CARDAlleleAnnotationDirectoryFormat(
-            self.get_data_path("annotate_reads_output"), "r"
-        )
-
-        obs = transformer(annotation)
-        self.assertIsInstance(obs, GeneLengthDirectoryFormat)

From 7b58de2b21744fb2b2228e5a2fe1d459d89d4025 Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Wed, 29 May 2024 16:11:45 +0200
Subject: [PATCH 17/23] added transformer for allele annotation to
 sequencecharacteristics

---
 q2_amr/card/normalization.py                  |  3 +-
 q2_amr/types/_transformer.py                  | 46 ++++++++++++++++++-
 .../tests/test_types_formats_transformers.py  | 30 ++++++++++++
 3 files changed, 76 insertions(+), 3 deletions(-)

diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py
index ec42e97..3c70670 100644
--- a/q2_amr/card/normalization.py
+++ b/q2_amr/card/normalization.py
@@ -5,7 +5,6 @@
 from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ
 
 from q2_amr.card.utils import InvalidParameterCombinationError
-from q2_amr.types import GeneLengthDirectoryFormat
 
 
 def normalize(
@@ -13,7 +12,7 @@ def normalize(
     method: str,
     m_trim: float = 0.3,
     a_trim: float = 0.05,
-    gene_length: GeneLengthDirectoryFormat = None,
+    gene_length: pd.DataFrame = None,
 ) -> pd.DataFrame:
     # Create Dataframe with counts from biom.Table
     counts = pd.DataFrame(
diff --git a/q2_amr/types/_transformer.py b/q2_amr/types/_transformer.py
index 1139f7d..c8ae7cc 100644
--- a/q2_amr/types/_transformer.py
+++ b/q2_amr/types/_transformer.py
@@ -13,7 +13,12 @@
 import pandas as pd
 import qiime2
 import skbio
-from q2_types.feature_data import DNAFASTAFormat, DNAIterator, ProteinFASTAFormat
+from q2_types.feature_data import (
+    DNAFASTAFormat,
+    DNAIterator,
+    ProteinFASTAFormat,
+    SequenceCharacteristicsDirectoryFormat,
+)
 from q2_types.feature_data._transformer import ProteinIterator
 from q2_types.genome_data import GenesDirectoryFormat, ProteinsDirectoryFormat
 from skbio import DNA, Protein
@@ -287,3 +292,42 @@ def tabulate_data(data_path, data_type):
         df_combined.rename(columns={"ID": "HSP_Identifier"}, inplace=True)
 
     return qiime2.Metadata(df_combined)
+
+
+@plugin.register_transformer
+def _18(
+    data: CARDAlleleAnnotationDirectoryFormat,
+) -> SequenceCharacteristicsDirectoryFormat:
+    directory = get_gene_lengths(map_type="allele", annotations=data.path)
+    return directory
+
+
+@plugin.register_transformer
+def _19(
+    data: CARDGeneAnnotationDirectoryFormat,
+) -> SequenceCharacteristicsDirectoryFormat:
+    directory = get_gene_lengths(map_type="gene", annotations=data.path)
+    return directory
+
+
+def get_gene_lengths(map_type, annotations):
+    # Extracts gene lengths from CARDAlleleAnnotation and CARDGeneAnnotation
+    gene_name_col = "Reference Sequence" if map_type == "allele" else "ARO Term"
+    len_all = pd.Series()
+    directory_fmt = SequenceCharacteristicsDirectoryFormat()
+
+    # Iterate over samples, read in each DataFrame and append it to the series
+    for samp in os.listdir(annotations):
+        anno_txt = os.path.join(annotations, samp, f"{map_type}_mapping_data.txt")
+        cols = [gene_name_col, "Reference Length"]
+        len_sample = pd.read_csv(anno_txt, sep="\t", usecols=cols)
+        len_sample = len_sample.set_index(cols[0])[cols[1]]
+        len_all = len_all.combine_first(len_sample)
+
+    df = len_all.to_frame()
+    df.index.name = "id"
+    df.columns = ["length"]
+    df.to_csv(
+        os.path.join(directory_fmt.path, "gene_length.tsv"), sep="\t", header=True
+    )
+    return directory_fmt
diff --git a/q2_amr/types/tests/test_types_formats_transformers.py b/q2_amr/types/tests/test_types_formats_transformers.py
index a871e99..a44c996 100644
--- a/q2_amr/types/tests/test_types_formats_transformers.py
+++ b/q2_amr/types/tests/test_types_formats_transformers.py
@@ -19,6 +19,8 @@
     DNAIterator,
     ProteinFASTAFormat,
     ProteinIterator,
+    SequenceCharacteristicsDirectoryFormat,
+    SequenceCharacteristicsFormat,
 )
 from q2_types.genome_data import GenesDirectoryFormat, ProteinsDirectoryFormat
 from qiime2.plugin.testing import TestPluginBase
@@ -561,3 +563,31 @@ def test_tabulate_data_allele(self):
         exp.index = exp.index.astype(str)
         obs = tabulate_data(self.get_data_path("card_allele_annotation"), "allele")
         self.assertEqual(qiime2.Metadata(exp), obs)
+
+    def test_card_allele_annotation_dir_format_to_gene_length_dir_format(self):
+        transformer = self.get_transformer(
+            CARDAlleleAnnotationDirectoryFormat, SequenceCharacteristicsDirectoryFormat
+        )
+        annotation = CARDAlleleAnnotationDirectoryFormat(
+            self.get_data_path("card_allele_annotation"), "r"
+        )
+        obs = transformer(annotation)
+        self.assertIsInstance(obs, SequenceCharacteristicsDirectoryFormat)
+        format = SequenceCharacteristicsFormat(
+            os.path.join(obs.path, "gene_length.tsv"), mode="r"
+        )
+        format.validate()
+
+    def test_card_gene_annotation_dir_format_to_gene_length_dir_format(self):
+        transformer = self.get_transformer(
+            CARDGeneAnnotationDirectoryFormat, SequenceCharacteristicsDirectoryFormat
+        )
+        annotation = CARDGeneAnnotationDirectoryFormat(
+            self.get_data_path("card_gene_annotation"), "r"
+        )
+        obs = transformer(annotation)
+        self.assertIsInstance(obs, SequenceCharacteristicsDirectoryFormat)
+        format = SequenceCharacteristicsFormat(
+            os.path.join(obs.path, "gene_length.tsv"), mode="r"
+        )
+        format.validate()

From f4a20ed4f6fa84b53cf18005776fb42311021872 Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Wed, 29 May 2024 17:48:19 +0200
Subject: [PATCH 18/23] changed normalize to work with characteristics

---
 q2_amr/card/normalization.py                  | 25 +++++++++++++------
 q2_amr/card/tests/test_normalization.py       |  2 +-
 q2_amr/types/_transformer.py                  |  8 +++---
 .../tests/test_types_formats_transformers.py  |  4 +--
 4 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py
index 3c70670..e784593 100644
--- a/q2_amr/card/normalization.py
+++ b/q2_amr/card/normalization.py
@@ -2,6 +2,7 @@
 
 import biom
 import pandas as pd
+from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat
 from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ
 
 from q2_amr.card.utils import InvalidParameterCombinationError
@@ -12,7 +13,7 @@ def normalize(
     method: str,
     m_trim: float = 0.3,
     a_trim: float = 0.05,
-    gene_length: pd.DataFrame = None,
+    gene_length: SequenceCharacteristicsDirectoryFormat = None,
 ) -> pd.DataFrame:
     # Create Dataframe with counts from biom.Table
     counts = pd.DataFrame(
@@ -20,6 +21,7 @@ def normalize(
         index=table.ids(axis="observation"),
         columns=table.ids(axis="sample"),
     ).T
+
     if method in ["tpm", "fpkm", "uq", "cuf", "cpm"]:
         # Raise Error if m or a-trim parameters are given with methods TPM, FPKM, UQ,
         # CPM or CUF
@@ -28,33 +30,39 @@ def normalize(
                 "Parameters m-trim and a-trim can only be used with methods TMM and "
                 "CTF."
             )
+
         if method in ["tpm", "fpkm"]:
             # Raise Error if gene-length is missing when using methods TPM or FPKM
             if not gene_length:
                 raise ValueError("gene-length input is missing.")
+
             # Create pd.Series from gene_length input
-            gene_length_series = pd.read_csv(
-                os.path.join(gene_length.path, "gene_length.txt"),
+            lengths = pd.read_csv(
+                os.path.join(gene_length.path, "sequence_characteristics.tsv"),
                 sep="\t",
                 header=None,
                 names=["index", "values"],
                 index_col="index",
                 squeeze=True,
+                skiprows=1,
             )
+
             # Raise Error if there are genes in the counts that are not present in the
             # gene length
-            if not set(counts.columns).issubset(set(gene_length_series.index)):
-                only_in_counts = set(counts.columns) - set(gene_length_series.index)
+            if not set(counts.columns).issubset(set(lengths.index)):
+                only_in_counts = set(counts.columns) - set(lengths.index)
                 raise ValueError(
                     f"There are genes present in the FeatureTable that are not present "
                     f"in the gene-length input. Missing lengths for genes: "
                     f"{only_in_counts}"
                 )
+
             # Define the methods TPM and FPKM with the gene length series as an input
             methods = {
-                "tpm": TPM(gene_lengths=gene_length_series),
-                "fpkm": FPKM(gene_lengths=gene_length_series),
+                "tpm": TPM(gene_lengths=lengths),
+                "fpkm": FPKM(gene_lengths=lengths),
             }
+
     if method in ["tmm", "uq", "cuf", "ctf", "cpm"]:
         # Raise Error if gene-length is given when using methods TMM, UQ, CUF, CPM or
         # CTF
@@ -62,6 +70,7 @@ def normalize(
             raise ValueError(
                 "gene-length input can only be used with FPKM and TPM methods."
             )
+
         # Define the methods TMM and CTF with parameters, also UQ, CPM and CUF
         methods = {
             "tmm": TMM(m_trim=m_trim, a_trim=a_trim),
@@ -70,7 +79,9 @@ def normalize(
             "cuf": CUF(),
             "cpm": CPM(),
         }
+
     # Run normalization method on count dataframe
     normalized = methods[method].set_output(transform="pandas").fit_transform(counts)
     normalized.index.name = "sample_id"
+
     return normalized
diff --git a/q2_amr/card/tests/test_normalization.py b/q2_amr/card/tests/test_normalization.py
index 3198449..a35d67d 100644
--- a/q2_amr/card/tests/test_normalization.py
+++ b/q2_amr/card/tests/test_normalization.py
@@ -22,7 +22,7 @@ def setUpClass(cls):
     def test_tpm_fpkm_uq_cuf_with_invalid_m_a_trim(self):
         # Test Error raised if gene-length is given with TMM method
         expected_message = (
-            "Parameters m-trim and a-trim can only be used with methods TMM and " "CTF."
+            "Parameters m-trim and a-trim can only be used with methods TMM and CTF."
         )
         with self.assertRaises(InvalidParameterCombinationError) as cm:
             normalize(self.table, "tpm", m_trim=0.2, a_trim=0.05)
diff --git a/q2_amr/types/_transformer.py b/q2_amr/types/_transformer.py
index c8ae7cc..b1565f9 100644
--- a/q2_amr/types/_transformer.py
+++ b/q2_amr/types/_transformer.py
@@ -314,7 +314,7 @@ def get_gene_lengths(map_type, annotations):
     # Extracts gene lengths from CARDAlleleAnnotation and CARDGeneAnnotation
     gene_name_col = "Reference Sequence" if map_type == "allele" else "ARO Term"
     len_all = pd.Series()
-    directory_fmt = SequenceCharacteristicsDirectoryFormat()
+    dir_fmt = SequenceCharacteristicsDirectoryFormat()
 
     # Iterate over samples, read in each DataFrame and append it to the series
     for samp in os.listdir(annotations):
@@ -328,6 +328,8 @@ def get_gene_lengths(map_type, annotations):
     df.index.name = "id"
     df.columns = ["length"]
     df.to_csv(
-        os.path.join(directory_fmt.path, "gene_length.tsv"), sep="\t", header=True
+        os.path.join(dir_fmt.path, "sequence_characteristics.tsv"),
+        sep="\t",
+        header=True,
     )
-    return directory_fmt
+    return dir_fmt
diff --git a/q2_amr/types/tests/test_types_formats_transformers.py b/q2_amr/types/tests/test_types_formats_transformers.py
index a44c996..2a99879 100644
--- a/q2_amr/types/tests/test_types_formats_transformers.py
+++ b/q2_amr/types/tests/test_types_formats_transformers.py
@@ -574,7 +574,7 @@ def test_card_allele_annotation_dir_format_to_gene_length_dir_format(self):
         obs = transformer(annotation)
         self.assertIsInstance(obs, SequenceCharacteristicsDirectoryFormat)
         format = SequenceCharacteristicsFormat(
-            os.path.join(obs.path, "gene_length.tsv"), mode="r"
+            os.path.join(obs.path, "sequence_characteristics.tsv"), mode="r"
         )
         format.validate()
 
@@ -588,6 +588,6 @@ def test_card_gene_annotation_dir_format_to_gene_length_dir_format(self):
         obs = transformer(annotation)
         self.assertIsInstance(obs, SequenceCharacteristicsDirectoryFormat)
         format = SequenceCharacteristicsFormat(
-            os.path.join(obs.path, "gene_length.tsv"), mode="r"
+            os.path.join(obs.path, "sequence_characteristics.tsv"), mode="r"
         )
         format.validate()

From 6d416a3854eec747ffa9790cded5d0bac4faa81a Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Wed, 29 May 2024 17:53:36 +0200
Subject: [PATCH 19/23] changed tests

---
 .../{gene_length.txt => sequence_characteristics.tsv}     | 1 +
 ...ength_short.txt => sequence_characteristics_short.tsv} | 1 +
 q2_amr/card/tests/test_normalization.py                   | 8 +++++---
 3 files changed, 7 insertions(+), 3 deletions(-)
 rename q2_amr/card/tests/data/{gene_length.txt => sequence_characteristics.tsv} (90%)
 rename q2_amr/card/tests/data/{gene_length_short.txt => sequence_characteristics_short.tsv} (82%)

diff --git a/q2_amr/card/tests/data/gene_length.txt b/q2_amr/card/tests/data/sequence_characteristics.tsv
similarity index 90%
rename from q2_amr/card/tests/data/gene_length.txt
rename to q2_amr/card/tests/data/sequence_characteristics.tsv
index 2fddb36..545c51e 100644
--- a/q2_amr/card/tests/data/gene_length.txt
+++ b/q2_amr/card/tests/data/sequence_characteristics.tsv
@@ -1,2 +1,3 @@
+id  length
 ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1	1356.0
 ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1	1173.0
diff --git a/q2_amr/card/tests/data/gene_length_short.txt b/q2_amr/card/tests/data/sequence_characteristics_short.tsv
similarity index 82%
rename from q2_amr/card/tests/data/gene_length_short.txt
rename to q2_amr/card/tests/data/sequence_characteristics_short.tsv
index 81d043e..734296b 100644
--- a/q2_amr/card/tests/data/gene_length_short.txt
+++ b/q2_amr/card/tests/data/sequence_characteristics_short.tsv
@@ -1 +1,2 @@
+id  length
 ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1	1356.0
diff --git a/q2_amr/card/tests/test_normalization.py b/q2_amr/card/tests/test_normalization.py
index a35d67d..e61f9e2 100644
--- a/q2_amr/card/tests/test_normalization.py
+++ b/q2_amr/card/tests/test_normalization.py
@@ -48,8 +48,8 @@ def test_tpm_fpkm_with_short_gene_length(self):
         # Test Error raised if gene-length is missing genes
         gene_length = SequenceCharacteristicsDirectoryFormat()
         shutil.copy(
-            self.get_data_path("gene_length_short.txt"),
-            os.path.join(gene_length.path, "gene_length.txt"),
+            self.get_data_path("sequence_characteristics_short.tsv"),
+            os.path.join(gene_length.path, "sequence_characteristics.tsv"),
         )
         table = biom.load_table(self.get_data_path("feature-table.biom"))
         expected_message = (
@@ -65,7 +65,9 @@ def test_tpm_fpkm_with_short_gene_length(self):
     def test_tpm_fpkm_with_valid_inputs(self, mock_tpm):
         # Test valid inputs for TPM method
         gene_length = SequenceCharacteristicsDirectoryFormat()
-        shutil.copy(self.get_data_path("gene_length.txt"), gene_length.path)
+        shutil.copy(
+            self.get_data_path("sequence_characteristics.tsv"), gene_length.path
+        )
         table = biom.load_table(self.get_data_path("feature-table.biom"))
         normalize(table=table, gene_length=gene_length, method="tpm")
 

From 085769b9afa430c53c024c077630d5772e6ffe48 Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Mon, 3 Jun 2024 12:03:07 +0200
Subject: [PATCH 20/23] changed parameter value error to value error

---
 q2_amr/card/normalization.py            | 4 +---
 q2_amr/card/tests/test_normalization.py | 3 +--
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py
index e784593..9f955d0 100644
--- a/q2_amr/card/normalization.py
+++ b/q2_amr/card/normalization.py
@@ -5,8 +5,6 @@
 from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat
 from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ
 
-from q2_amr.card.utils import InvalidParameterCombinationError
-
 
 def normalize(
     table: biom.Table,
@@ -26,7 +24,7 @@ def normalize(
         # Raise Error if m or a-trim parameters are given with methods TPM, FPKM, UQ,
         # CPM or CUF
         if m_trim != 0.3 or a_trim != 0.05:
-            raise InvalidParameterCombinationError(
+            raise ValueError(
                 "Parameters m-trim and a-trim can only be used with methods TMM and "
                 "CTF."
             )
diff --git a/q2_amr/card/tests/test_normalization.py b/q2_amr/card/tests/test_normalization.py
index e61f9e2..a5e1689 100644
--- a/q2_amr/card/tests/test_normalization.py
+++ b/q2_amr/card/tests/test_normalization.py
@@ -7,7 +7,6 @@
 from qiime2.plugin.testing import TestPluginBase
 
 from q2_amr.card.normalization import normalize
-from q2_amr.card.utils import InvalidParameterCombinationError
 
 
 class TestNormalize(TestPluginBase):
@@ -24,7 +23,7 @@ def test_tpm_fpkm_uq_cuf_with_invalid_m_a_trim(self):
         expected_message = (
             "Parameters m-trim and a-trim can only be used with methods TMM and CTF."
         )
-        with self.assertRaises(InvalidParameterCombinationError) as cm:
+        with self.assertRaises(ValueError) as cm:
             normalize(self.table, "tpm", m_trim=0.2, a_trim=0.05)
         self.assertEqual(str(cm.exception), expected_message)
 

From 7cf82d4f004b6c4005596d4dce10681c8f0347ed Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Wed, 5 Jun 2024 13:26:57 +0200
Subject: [PATCH 21/23] chnages after review

---
 q2_amr/card/normalization.py              | 131 +++++++++++-----------
 q2_amr/card/tests/data/feature-table.biom | Bin 33824 -> 0 bytes
 q2_amr/card/tests/data/feature-table.tsv  |   3 +
 q2_amr/card/tests/test_normalization.py   |  97 ++++++++++------
 q2_amr/plugin_setup.py                    |   8 +-
 5 files changed, 140 insertions(+), 99 deletions(-)
 delete mode 100644 q2_amr/card/tests/data/feature-table.biom
 create mode 100644 q2_amr/card/tests/data/feature-table.tsv

diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py
index 9f955d0..d6dab5a 100644
--- a/q2_amr/card/normalization.py
+++ b/q2_amr/card/normalization.py
@@ -1,75 +1,31 @@
 import os
 
-import biom
 import pandas as pd
 from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat
 from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ
 
 
 def normalize(
-    table: biom.Table,
+    table: pd.DataFrame,
     method: str,
-    m_trim: float = 0.3,
-    a_trim: float = 0.05,
+    m_trim: float = None,
+    a_trim: float = None,
     gene_length: SequenceCharacteristicsDirectoryFormat = None,
 ) -> pd.DataFrame:
-    # Create Dataframe with counts from biom.Table
-    counts = pd.DataFrame(
-        data=table.matrix_data.toarray(),
-        index=table.ids(axis="observation"),
-        columns=table.ids(axis="sample"),
-    ).T
-
-    if method in ["tpm", "fpkm", "uq", "cuf", "cpm"]:
-        # Raise Error if m or a-trim parameters are given with methods TPM, FPKM, UQ,
-        # CPM or CUF
-        if m_trim != 0.3 or a_trim != 0.05:
-            raise ValueError(
-                "Parameters m-trim and a-trim can only be used with methods TMM and "
-                "CTF."
-            )
-
-        if method in ["tpm", "fpkm"]:
-            # Raise Error if gene-length is missing when using methods TPM or FPKM
-            if not gene_length:
-                raise ValueError("gene-length input is missing.")
-
-            # Create pd.Series from gene_length input
-            lengths = pd.read_csv(
-                os.path.join(gene_length.path, "sequence_characteristics.tsv"),
-                sep="\t",
-                header=None,
-                names=["index", "values"],
-                index_col="index",
-                squeeze=True,
-                skiprows=1,
-            )
-
-            # Raise Error if there are genes in the counts that are not present in the
-            # gene length
-            if not set(counts.columns).issubset(set(lengths.index)):
-                only_in_counts = set(counts.columns) - set(lengths.index)
-                raise ValueError(
-                    f"There are genes present in the FeatureTable that are not present "
-                    f"in the gene-length input. Missing lengths for genes: "
-                    f"{only_in_counts}"
-                )
-
-            # Define the methods TPM and FPKM with the gene length series as an input
-            methods = {
-                "tpm": TPM(gene_lengths=lengths),
-                "fpkm": FPKM(gene_lengths=lengths),
-            }
-
-    if method in ["tmm", "uq", "cuf", "ctf", "cpm"]:
-        # Raise Error if gene-length is given when using methods TMM, UQ, CUF, CPM or
-        # CTF
-        if gene_length:
-            raise ValueError(
-                "gene-length input can only be used with FPKM and TPM methods."
-            )
-
-        # Define the methods TMM and CTF with parameters, also UQ, CPM and CUF
+    # Validate parameter combinations and set trim parameters
+    m_trim, a_trim = _validate_parameters(method, m_trim, a_trim, gene_length)
+
+    # Process gene_lengths input and define methods that need gene_lengths input
+    if method in ["tpm", "fpkm"]:
+        lengths = _convert_lengths(table, gene_length)
+
+        methods = {
+            "tpm": TPM(gene_lengths=lengths),
+            "fpkm": FPKM(gene_lengths=lengths),
+        }
+
+    # Define remaining methods that don't need gene_lengths input
+    else:
         methods = {
             "tmm": TMM(m_trim=m_trim, a_trim=a_trim),
             "ctf": CTF(m_trim=m_trim, a_trim=a_trim),
@@ -78,8 +34,57 @@ def normalize(
             "cpm": CPM(),
         }
 
-    # Run normalization method on count dataframe
-    normalized = methods[method].set_output(transform="pandas").fit_transform(counts)
+    # Run normalization method on frequency table
+    normalized = methods[method].set_output(transform="pandas").fit_transform(table)
     normalized.index.name = "sample_id"
 
     return normalized
+
+
+def _validate_parameters(method, m_trim, a_trim, gene_length):
+    # Raise Error if gene-length is missing when using methods TPM or FPKM
+    if method in ["tpm", "fpkm"] and not gene_length:
+        raise ValueError("gene-length input is missing.")
+
+    # Raise Error if gene-length is given when using methods TMM, UQ, CUF, CPM or CTF
+    if method in ["tmm", "uq", "cuf", "ctf", "cpm"] and gene_length:
+        raise ValueError(
+            "gene-length input can only be used with FPKM and TPM " "methods."
+        )
+
+    # Raise Error if m_trim or a_trim are given when not using methods TMM or CTF
+    if (method not in ["tmm", "ctf"]) and (m_trim is not None or a_trim is not None):
+        raise ValueError(
+            "Parameters m-trim and a-trim can only be used with methods TMM and " "CTF."
+        )
+
+    # Set m_trim and a_trim to their default values for methods TMM and CTF
+    if method in ["tmm", "ctf"]:
+        m_trim = 0.3 if m_trim is None else m_trim
+        a_trim = 0.05 if a_trim is None else a_trim
+
+    return m_trim, a_trim
+
+
+def _convert_lengths(table, gene_length):
+    # Read in table from sequence_characteristics.tsv as a pd.Series
+    lengths = pd.read_csv(
+        os.path.join(gene_length.path, "sequence_characteristics.tsv"),
+        sep="\t",
+        header=None,
+        names=["index", "values"],
+        index_col="index",
+        squeeze=True,
+        skiprows=1,
+    )
+
+    # Check if all gene IDs that are present in the table are also present in
+    # the lengths
+    if not set(table.columns).issubset(set(lengths.index)):
+        only_in_counts = set(table.columns) - set(lengths.index)
+        raise ValueError(
+            f"There are genes present in the FeatureTable that are not present "
+            f"in the gene-length input. Missing lengths for genes: "
+            f"{only_in_counts}"
+        )
+    return lengths
diff --git a/q2_amr/card/tests/data/feature-table.biom b/q2_amr/card/tests/data/feature-table.biom
deleted file mode 100644
index 3a8fcee13f42e92f42a5463c65bee03c7bbf9168..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 33824
zcmeI4PfS}k7{DC@Z3>iysoH4ONlzP_I4txf{9P^$Ep$QJfVFI6n}(9WD?}nB3u#9;
z4s6<?yY<*3#~pjDc4AT|^@y}fj~pv)2aY>T>TG}C*SxrM12nKCd_FwepY6}~@7a#|
z{LIhE@oQ(>FSHBgLLt#A!a7KgXXu=tKtgd!PhlW{{$2EcX@NjMv<sEigYq!ycY+{T
z$9%KbCnki<N&3mhVqzT?kRFn=e=-OpC&p$}Pv@|^4*HLA{9sh%GizS)cDj_!uWHd+
zI#*cn)W`xiN*VX*cjI_9$91PjXbk}o6z4=pWEVl<`9MIIP_qc?3}HS<?@%6ybb47&
zo7PjhQ;@9{qMDvFK0OLSr${P^)Y+l++{_nq=~CoYaYc)(>U%?F!*;2u*(o$`sqZxz
z?h&Z(wpU!!Xu~ouWKXD)A|U&%9K8<HC;sJjp4IJK@>acKy5ucJGIvz@YW;p)Wx{s0
z%3xVvm)5sX^z_S*EJ~&O(09S7udPDgDc$e2<+Oew2kkHQzf-uX2FbBi>I;e0)$ert
z1$|mt<`<A96f;`yXdG^<Mk<F1)z;0;i;x|WL4==70113}b^0ScH`RhA9nDj{PRaOG
z-kD8jRy=2NT&L5x*6WXyOwWGmlvD<tD&%SiPOm32xLhg~;+HOEviV#D){y>uaY@M8
z08Y<m8N4z%{c+@)UbLK8f3!b31nD%~^*oiLm>V06xUq;kG8-L^4@KkdQ2%Ibz#WJ|
z{!`$I2>N5!r{e>zbg_|*$?<q}WN2h#DxLG<UamN{F?IFIWPEJKbw6+iNBg6)%xNt5
z{tM-YRr!GtES~cURQ!wK;poV~pe{~J({nxW`x6tyi}ZtB)d+60i$d%(0Vco%m;e)C
z0!)AjFaajO1egF5s67GGu1fY%hYU|xWzH+5rAaMKEm$h%ZxteCsWdhgv+F|ILDzH^
zvLiTD!0>NOO^*kpwUF#m_wf&XDrefcs5$pO0Y#MWA;zEFFe*49QJbCc>3IMULi-=s
z35E51<%HnkZdwSK>j2HmJ{&((jOHDj4v_TlKCA<+WX=|k-vV7!&mP9Z4#>fpF2qdo
z<#qO9@?9)P`W|3>=WWP9{oKa+Lix=3gX-$mv#kRrKVApw_7&De!))t->r^(6=1HyA
z0o(XB=WDg|uDM=lC+!z1Tg|a%Ct!|W2Rpon>pk_K_OjX4#q5H&CZt~|70c&eqm*%<
zen6apvoo9+0sKX?or?!$y2{$^T*f_cLJW3@!Do1Y)+VJYy(I{0JA|VHc+4)tDm@?;
zy_+lQq?^vuLRfFGrDf|~XuCSHqn|k$Oo8->pYXdLd|q+#otIaL$B$wM&;4w<X<mmf
z+j2CQT&;KNJ=6D4<P$Hmw>xz3i3xOG>l*-O5sotfCcp%k025#W4NhQvejz1-r$fK*
z%Dv+~@q>zQ%09{6q6$jant9KWUrAjQt?ToDq(w_t__yutu1CKnH-@FWeO!F|UpjO_
z^|@sWs`)V^M6YO*u_e@T=Fwltjc}U?mhhWLSuHyzzyz286JP>N;1~!T+zu6%)&(Kd
zeFU%}Il7fKus`h%Lw}VCg3MDgxD3)`dfMzK^nJV8j9rWU#Mc3{2*;TK6JP>NfC(^x
z1}9K$|3vrO%Knk<pPU4+R8{%5AN8?+>bAV+KaKqpaq@M*Qj*0?fC(@GCcp%kK%)|<
zwtrH_AmzxX@1Sh|_oD`OsMKV+O5bNQDUcrV(`J97@0iT$|1tJY#L3rivk1qT025#W
zOn?b6fd(gFy9Z(3bF6j`BK$dk;O~ALd_Wws`Q|+cy3g>paZk2<gBL1#)qB0R@w+|?
z`tN@RK8#<!k9x@0+w{fE@k{I2LmWQ@Kz+NQ+zfcH{50>Q&dq^Js%I19TVI0Q#Ts|v
z-#S|B@57XyuOLAB5*SZ?4RY$|49*wIXU-pMMRk&3TL(;jybjdK3#-#+*w%q<oPX48
znkThd2W;cloUhgPVNd6wg*JN+wV(awBw&tT2fMk6>pk`Q1l&h;Q_xvu(xjg`8AK_h
z9of^okNU*mZ&pn9*3`0@QW>Xtz4c9{yjf#vy;Ex=-9sgxc$xh^*Iy|id$w-?RA%59
z6JP>NfC(@GCeZK%4*osBk$q2D$O28Z1@V23SY?A(<2PX2KiU5Mf3Ic#<m*6<Tgd8|
z025#WOn?b6f&ZI8wfz&_uPFORwtspc|2>{^sw&_1qrUe~F=PKkoO~V7o#F=*U;<2l
z2`~XB&^!dH?Vsp8iu_}^kFxhY;8j)nj?GfX0^MeRdec&S!q`6%Ctt%YC0Wb_m;e)C
M0!)AjG%A6A0p5#ZcK`qY

diff --git a/q2_amr/card/tests/data/feature-table.tsv b/q2_amr/card/tests/data/feature-table.tsv
new file mode 100644
index 0000000..712090b
--- /dev/null
+++ b/q2_amr/card/tests/data/feature-table.tsv
@@ -0,0 +1,3 @@
+ID	ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1	ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1
+sample1	2.0	0.0
+sample2	2.0	0.0
diff --git a/q2_amr/card/tests/test_normalization.py b/q2_amr/card/tests/test_normalization.py
index a5e1689..18eb790 100644
--- a/q2_amr/card/tests/test_normalization.py
+++ b/q2_amr/card/tests/test_normalization.py
@@ -2,11 +2,12 @@
 import shutil
 from unittest.mock import MagicMock, patch
 
-import biom
+import pandas as pd
+from pandas._testing import assert_series_equal
 from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat
 from qiime2.plugin.testing import TestPluginBase
 
-from q2_amr.card.normalization import normalize
+from q2_amr.card.normalization import _convert_lengths, _validate_parameters, normalize
 
 
 class TestNormalize(TestPluginBase):
@@ -14,51 +15,79 @@ class TestNormalize(TestPluginBase):
 
     @classmethod
     def setUpClass(cls):
-        # Mocking the biom.Table and gene_length class
-        cls.table = MagicMock()
         cls.gene_length = MagicMock()
+        cls.lengths = pd.Series(
+            {
+                "ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1": 1356.0,
+                "ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1": 1173.0,
+            },
+            name="values",
+        )
+        cls.lengths.index.name = "index"
 
-    def test_tpm_fpkm_uq_cuf_with_invalid_m_a_trim(self):
+    def test_validate_parameters_uq_with_m_a_trim(self):
         # Test Error raised if gene-length is given with TMM method
-        expected_message = (
-            "Parameters m-trim and a-trim can only be used with methods TMM and CTF."
-        )
-        with self.assertRaises(ValueError) as cm:
-            normalize(self.table, "tpm", m_trim=0.2, a_trim=0.05)
-        self.assertEqual(str(cm.exception), expected_message)
+        with self.assertRaisesRegex(
+            ValueError,
+            "Parameters m-trim and a-trim can only "
+            "be used with methods TMM and CTF.",
+        ):
+            _validate_parameters("uq", 0.2, 0.05, None)
 
-    def test_tpm_fpkm_with_missing_gene_length(self):
+    def test_validate_parameters_tpm_missing_gene_length(self):
         # Test Error raised if gene-length is missing with TPM method
-        expected_message = "gene-length input is missing."
-        with self.assertRaises(ValueError) as cm:
-            normalize(self.table, "tpm")
-        self.assertEqual(str(cm.exception), expected_message)
+        with self.assertRaisesRegex(ValueError, "gene-length input is missing."):
+            _validate_parameters("tpm", None, None, None)
 
-    def test_tmm_uq_cuf_ctf_with_gene_length(self):
+    def test_validate_parameters_tmm_gene_length(self):
         # Test Error raised if gene-length is given with TMM method
-        expected_message = (
-            "gene-length input can only be used with FPKM and TPM methods."
+        with self.assertRaisesRegex(
+            ValueError, "gene-length input can only be used with FPKM and TPM methods."
+        ):
+            _validate_parameters("tmm", None, None, gene_length=self.gene_length)
+
+    def test_validate_parameters_default_m_a_trim(self):
+        # Test if m_trim and a_trim get set to default values if None
+        m_trim, a_trim = _validate_parameters("tmm", None, None, None)
+        self.assertEqual(m_trim, 0.3)
+        self.assertEqual(a_trim, 0.05)
+
+    def test_validate_parameters_m_a_trim(self):
+        # Test if m_trim and a_trim are not modified if not None
+        m_trim, a_trim = _validate_parameters("tmm", 0.1, 0.06, None)
+        self.assertEqual(m_trim, 0.1)
+        self.assertEqual(a_trim, 0.06)
+
+    def test_convert_lengths_gene_length(self):
+        # Test Error raised if gene-length is missing genes
+        gene_length = SequenceCharacteristicsDirectoryFormat()
+        shutil.copy(
+            self.get_data_path("sequence_characteristics.tsv"), gene_length.path
+        )
+        table = pd.read_csv(
+            self.get_data_path("feature-table.tsv"), sep="\t", index_col="ID"
         )
-        with self.assertRaises(ValueError) as cm:
-            normalize(self.table, "tmm", gene_length=self.gene_length)
-        self.assertEqual(str(cm.exception), expected_message)
 
-    def test_tpm_fpkm_with_short_gene_length(self):
+        obs = _convert_lengths(table, gene_length=gene_length)
+        assert_series_equal(obs, self.lengths)
+
+    def test_convert_lengths_short_gene_length(self):
         # Test Error raised if gene-length is missing genes
         gene_length = SequenceCharacteristicsDirectoryFormat()
         shutil.copy(
             self.get_data_path("sequence_characteristics_short.tsv"),
             os.path.join(gene_length.path, "sequence_characteristics.tsv"),
         )
-        table = biom.load_table(self.get_data_path("feature-table.biom"))
-        expected_message = (
+        table = pd.read_csv(
+            self.get_data_path("feature-table.tsv"), sep="\t", index_col="ID"
+        )
+        with self.assertRaisesRegex(
+            ValueError,
             "There are genes present in the FeatureTable that are not present "
             "in the gene-length input. Missing lengths for genes: "
-            "{'ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1'}"
-        )
-        with self.assertRaises(ValueError) as cm:
-            normalize(table, "tpm", gene_length=gene_length)
-        self.assertEqual(str(cm.exception), expected_message)
+            "{'ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1'}",
+        ):
+            _convert_lengths(table, gene_length=gene_length)
 
     @patch("q2_amr.card.normalization.TPM")
     def test_tpm_fpkm_with_valid_inputs(self, mock_tpm):
@@ -67,11 +96,15 @@ def test_tpm_fpkm_with_valid_inputs(self, mock_tpm):
         shutil.copy(
             self.get_data_path("sequence_characteristics.tsv"), gene_length.path
         )
-        table = biom.load_table(self.get_data_path("feature-table.biom"))
+        table = pd.read_csv(
+            self.get_data_path("feature-table.tsv"), sep="\t", index_col="ID"
+        )
         normalize(table=table, gene_length=gene_length, method="tpm")
 
     @patch("q2_amr.card.normalization.TMM")
     def test_tmm_uq_cuf_ctf_with_valid_inputs(self, mock_tmm):
         # Test valid inputs for TMM method
-        table = biom.load_table(self.get_data_path("feature-table.biom"))
+        table = pd.read_csv(
+            self.get_data_path("feature-table.tsv"), sep="\t", index_col="ID"
+        )
         normalize(table=table, method="tmm", a_trim=0.06, m_trim=0.4)
diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py
index 456776d..93dcd23 100644
--- a/q2_amr/plugin_setup.py
+++ b/q2_amr/plugin_setup.py
@@ -470,8 +470,8 @@
     },
     parameters={
         "method": Str % Choices(["tpm", "fpkm", "tmm", "uq", "cuf", "ctf", "cpm"]),
-        "m_trim": Float % Range(0, 1, inclusive_start=True),
-        "a_trim": Float % Range(0, 1, inclusive_start=True),
+        "m_trim": Float % Range(0, 1, inclusive_start=True, inclusive_end=True),
+        "a_trim": Float % Range(0, 1, inclusive_start=True, inclusive_end=True),
     },
     outputs=[("normalized_table", FeatureTable[Frequency])],
     input_descriptions={
@@ -485,9 +485,9 @@
         "/12/2023-Normalizing-RNA-seq-data-in-Python-with-RNAnorm.pdf for "
         "more information on the methods.",
         "m_trim": "Two sided cutoff for M-values. Can only be used for methods TMM and "
-        "CTF.",
+        "CTF. (default = 0.3)",
         "a_trim": "Two sided cutoff for A-values. Can only be used for methods TMM and "
-        "CTF.",
+        "CTF. (default = 0.05)",
     },
     output_descriptions={
         "normalized_table": "Feature table normalized with specified " "method."

From f25449df62c90f93c62f9b6e5957444adf30b8ba Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Wed, 5 Jun 2024 13:51:14 +0200
Subject: [PATCH 22/23] typo

---
 q2_amr/card/tests/test_normalization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/q2_amr/card/tests/test_normalization.py b/q2_amr/card/tests/test_normalization.py
index 18eb790..77daf22 100644
--- a/q2_amr/card/tests/test_normalization.py
+++ b/q2_amr/card/tests/test_normalization.py
@@ -26,7 +26,7 @@ def setUpClass(cls):
         cls.lengths.index.name = "index"
 
     def test_validate_parameters_uq_with_m_a_trim(self):
-        # Test Error raised if gene-length is given with TMM method
+        # Test Error raised if gene-length is given with UQ method
         with self.assertRaisesRegex(
             ValueError,
             "Parameters m-trim and a-trim can only "

From 8bafcb2870118b1746741af6c4bd65cb2a884eaa Mon Sep 17 00:00:00 2001
From: VinzentRisch <risch.vinzent@gmail.com>
Date: Wed, 5 Jun 2024 17:04:40 +0200
Subject: [PATCH 23/23] typos

---
 q2_amr/card/normalization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py
index d6dab5a..afb1b1e 100644
--- a/q2_amr/card/normalization.py
+++ b/q2_amr/card/normalization.py
@@ -49,13 +49,13 @@ def _validate_parameters(method, m_trim, a_trim, gene_length):
     # Raise Error if gene-length is given when using methods TMM, UQ, CUF, CPM or CTF
     if method in ["tmm", "uq", "cuf", "ctf", "cpm"] and gene_length:
         raise ValueError(
-            "gene-length input can only be used with FPKM and TPM " "methods."
+            "gene-length input can only be used with FPKM and TPM methods."
         )
 
     # Raise Error if m_trim or a_trim are given when not using methods TMM or CTF
     if (method not in ["tmm", "ctf"]) and (m_trim is not None or a_trim is not None):
         raise ValueError(
-            "Parameters m-trim and a-trim can only be used with methods TMM and " "CTF."
+            "Parameters m-trim and a-trim can only be used with methods TMM and CTF."
         )
 
     # Set m_trim and a_trim to their default values for methods TMM and CTF