From c22c0baef2560cd7d36df65187080c244ba2cff9 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Tue, 16 Jan 2024 09:11:47 +0100 Subject: [PATCH 01/23] added action for normalization --- q2_amr/card/normalization.py | 23 +++++++++++++++++++++++ q2_amr/plugin_setup.py | 20 ++++++++++++++++++-- 2 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 q2_amr/card/normalization.py diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py new file mode 100644 index 0000000..ac6435a --- /dev/null +++ b/q2_amr/card/normalization.py @@ -0,0 +1,23 @@ +from typing import Union + +import biom +import pandas as pd +from q2_types.per_sample_sequences import ( + SingleLanePerSamplePairedEndFastqDirFmt, + SingleLanePerSampleSingleEndFastqDirFmt, +) + + +def normalize( + reads: Union[ + SingleLanePerSamplePairedEndFastqDirFmt, SingleLanePerSampleSingleEndFastqDirFmt + ], + table: biom.Table, +) -> pd.DataFrame: + # manifest = reads.manifest.view(pd.DataFrame) + # #paired = isinstance(reads, SingleLanePerSamplePairedEndFastqDirFmt) + # for samp in list(manifest.index): + # # fwd = manifest.loc[samp, "forward"] + # # rev = manifest.loc[samp, "reverse"] if paired else None + normalized_table = pd.DataFrame(table) + return normalized_table diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py index 84efb88..ef62a67 100644 --- a/q2_amr/plugin_setup.py +++ b/q2_amr/plugin_setup.py @@ -7,7 +7,7 @@ # ---------------------------------------------------------------------------- import importlib -from q2_types.feature_table import FeatureTable, PresenceAbsence +from q2_types.feature_table import FeatureTable, Frequency, PresenceAbsence from q2_types.per_sample_sequences import ( PairedEndSequencesWithQuality, SequencesWithQuality, @@ -21,6 +21,7 @@ from q2_amr.card.database import fetch_card_db from q2_amr.card.heatmap import heatmap from q2_amr.card.mags import annotate_mags_card +from q2_amr.card.normalization import normalize from q2_amr.card.reads import annotate_reads_card, visualize_annotation_stats from q2_amr.types import ( CARDAnnotationJSONFormat, @@ -104,7 +105,6 @@ citations=[citations["alcock_card_2023"]], ) - plugin.methods.register_function( function=annotate_reads_card, inputs={ @@ -177,6 +177,22 @@ citations=[citations["alcock_card_2023"]], ) +plugin.methods.register_function( + function=normalize, + inputs={ + "table": FeatureTable[Frequency], + "reads": SampleData[PairedEndSequencesWithQuality | SequencesWithQuality], + }, + parameters={}, + outputs=[("normalized_table", FeatureTable[Frequency])], + input_descriptions={"table": "The ", "reads": "The "}, + parameter_descriptions={}, + output_descriptions={"normalized_table": "hello"}, + name="Shannon's Entropy", + description="Compute Shannon's Entropy for each sample in a ", + citations=[], +) + # Registrations plugin.register_semantic_types( CARDDatabase, CARDAnnotation, CARDAlleleAnnotation, CARDGeneAnnotation From 926f8761ad6a1cc534d5679776b33900ac122513 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Thu, 25 Jan 2024 09:59:29 +0100 Subject: [PATCH 02/23] added tpm normalize with cli integration --- q2_amr/card/normalization.py | 90 ++++++++++++++++++++++--- q2_amr/card/tests/test_normalization.py | 11 +++ q2_amr/card/utils.py | 8 ++- q2_amr/plugin_setup.py | 25 +++++-- 4 files changed, 117 insertions(+), 17 deletions(-) create mode 100644 q2_amr/card/tests/test_normalization.py diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py index ac6435a..380171e 100644 --- a/q2_amr/card/normalization.py +++ b/q2_amr/card/normalization.py @@ -1,17 +1,16 @@ -from typing import Union +import os +import subprocess +import tempfile import biom import pandas as pd -from q2_types.per_sample_sequences import ( - SingleLanePerSamplePairedEndFastqDirFmt, - SingleLanePerSampleSingleEndFastqDirFmt, -) +from pydeseq2.dds import DeseqDataSet +from q2_amr.card.utils import run_command +from q2_amr.types import CARDAlleleAnnotationDirectoryFormat -def normalize( - reads: Union[ - SingleLanePerSamplePairedEndFastqDirFmt, SingleLanePerSampleSingleEndFastqDirFmt - ], + +def normalize_mor( table: biom.Table, ) -> pd.DataFrame: # manifest = reads.manifest.view(pd.DataFrame) @@ -19,5 +18,76 @@ def normalize( # for samp in list(manifest.index): # # fwd = manifest.loc[samp, "forward"] # # rev = manifest.loc[samp, "reverse"] if paired else None - normalized_table = pd.DataFrame(table) + data = table.matrix_data.toarray() + columns = table.ids(axis="sample") + index = table.ids(axis="observation") + df = pd.DataFrame(data, index=index, columns=columns) + df = df.T + metadata = df.iloc[:, 0].to_frame() + metadata.iloc[0, 0] = "a" + original_columns = metadata.columns + + metadata.columns = ["condition"] + list(original_columns[1:]) + dds = DeseqDataSet(counts=df, metadata=metadata) + dds.fit_size_factors() + + # normalizedcounts = dds.obsm["size_factors"] + return df + + +def normalize_tpm( + table: biom.Table, amr_annotations: CARDAlleleAnnotationDirectoryFormat +) -> pd.DataFrame: + # Initialize an empty list to store individual DataFrames + len_list = [] + + # Iterate over samples in the specified path + for samp in os.listdir(amr_annotations.path): + anno_txt = os.path.join(amr_annotations.path, samp, "allele_mapping_data.txt") + + # Read each DataFrame and append it to the list + len_sample = pd.read_csv( + anno_txt, sep="\t", usecols=["Reference Sequence", "Reference Length"] + ) + # index_col="Reference Sequence") + + len_list.append(len_sample) + + # Concatenate all DataFrames into a single DataFrame and remove duplicate rows + len_df = pd.concat(len_list) + len_df.drop_duplicates(inplace=True) + len_df.columns = ["gene_id", "gene_length"] + + counts_df = table.to_dataframe() + counts_df = counts_df.T + + with tempfile.TemporaryDirectory() as tmp: + len = os.path.join(tmp, "len.csv") + counts = os.path.join(tmp, "counts.csv") + len_df.to_csv(len, index=False) + counts_df.to_csv(counts, index=True) + + run_rnanorm_tpm(cwd=tmp, len=len, counts=counts) + normalized_table = pd.read_csv( + os.path.join(tmp, "out.csv"), + index_col=0, + ) + normalized_table.index.name = "sample_id" return normalized_table + + +def run_rnanorm_tpm( + cwd: str, + len: str, + counts: str, +): + cmd = ["rnanorm", "tpm", counts, "--gene-lengths", len, "--out", f"{cwd}/out.csv"] + + try: + run_command(cmd, cwd, verbose=True, shell=True) + except subprocess.CalledProcessError as e: + raise Exception( + "An error was encountered while running RNAnorm, " + f"(return code {e.returncode}), please inspect " + "stdout and stderr to learn more." + ) diff --git a/q2_amr/card/tests/test_normalization.py b/q2_amr/card/tests/test_normalization.py new file mode 100644 index 0000000..1999eb9 --- /dev/null +++ b/q2_amr/card/tests/test_normalization.py @@ -0,0 +1,11 @@ +# from qiime2.plugin.testing import TestPluginBase +# +# +# class TestAnnotateReadsCARD(TestPluginBase): +# package = "q2_amr.card.tests" +# +# @classmethod +# def setUpClass(cls): +# +# def test_normalize_MOR(self): +# biom = self.get_data_path() diff --git a/q2_amr/card/utils.py b/q2_amr/card/utils.py index 201f2ab..c91e543 100644 --- a/q2_amr/card/utils.py +++ b/q2_amr/card/utils.py @@ -15,7 +15,13 @@ ) -def run_command(cmd, cwd, verbose=True): +def run_command( + cmd, + cwd, + verbose=True, + shell=False, + stdout=None, +): if verbose: print(EXTERNAL_CMD_WARNING) print("\nCommand:", end=" ") diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py index 43b30cc..671e526 100644 --- a/q2_amr/plugin_setup.py +++ b/q2_amr/plugin_setup.py @@ -21,7 +21,7 @@ from q2_amr.card.database import fetch_card_db from q2_amr.card.heatmap import heatmap from q2_amr.card.mags import annotate_mags_card -from q2_amr.card.normalization import normalize +from q2_amr.card.normalization import normalize_mor, normalize_tpm from q2_amr.card.reads import annotate_reads_card from q2_amr.types import ( CARDAnnotationJSONFormat, @@ -216,18 +216,31 @@ ) plugin.methods.register_function( - function=normalize, + function=normalize_mor, + inputs={"table": FeatureTable[Frequency]}, + parameters={}, + outputs=[("normalized_table", FeatureTable[Frequency])], + input_descriptions={"table": "FeatureTable"}, + parameter_descriptions={}, + output_descriptions={"normalized_table": "hello"}, + name="", + description="", + citations=[], +) + +plugin.methods.register_function( + function=normalize_tpm, inputs={ "table": FeatureTable[Frequency], - "reads": SampleData[PairedEndSequencesWithQuality | SequencesWithQuality], + "amr_annotations": SampleData[CARDAlleleAnnotation], }, parameters={}, outputs=[("normalized_table", FeatureTable[Frequency])], - input_descriptions={"table": "The ", "reads": "The "}, + input_descriptions={"table": "FeatureTable"}, parameter_descriptions={}, output_descriptions={"normalized_table": "hello"}, - name="Shannon's Entropy", - description="Compute Shannon's Entropy for each sample in a ", + name="", + description="", citations=[], ) From e713e5e095975162d4d7b9e741e3d3cb9acedfce Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Fri, 26 Jan 2024 14:16:33 +0100 Subject: [PATCH 03/23] adding tpm python implementation --- q2_amr/card/normalization.py | 63 +++++++++--------------------------- 1 file changed, 15 insertions(+), 48 deletions(-) diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py index 380171e..c1c7156 100644 --- a/q2_amr/card/normalization.py +++ b/q2_amr/card/normalization.py @@ -1,12 +1,10 @@ import os -import subprocess -import tempfile import biom import pandas as pd from pydeseq2.dds import DeseqDataSet +from rnanorm import TPM -from q2_amr.card.utils import run_command from q2_amr.types import CARDAlleleAnnotationDirectoryFormat @@ -38,8 +36,8 @@ def normalize_mor( def normalize_tpm( table: biom.Table, amr_annotations: CARDAlleleAnnotationDirectoryFormat ) -> pd.DataFrame: - # Initialize an empty list to store individual DataFrames - len_list = [] + # Initialize an empty series for gene lengths + len_all = pd.Series() # Iterate over samples in the specified path for samp in os.listdir(amr_annotations.path): @@ -48,46 +46,15 @@ def normalize_tpm( # Read each DataFrame and append it to the list len_sample = pd.read_csv( anno_txt, sep="\t", usecols=["Reference Sequence", "Reference Length"] - ) - # index_col="Reference Sequence") - - len_list.append(len_sample) - - # Concatenate all DataFrames into a single DataFrame and remove duplicate rows - len_df = pd.concat(len_list) - len_df.drop_duplicates(inplace=True) - len_df.columns = ["gene_id", "gene_length"] - - counts_df = table.to_dataframe() - counts_df = counts_df.T - - with tempfile.TemporaryDirectory() as tmp: - len = os.path.join(tmp, "len.csv") - counts = os.path.join(tmp, "counts.csv") - len_df.to_csv(len, index=False) - counts_df.to_csv(counts, index=True) - - run_rnanorm_tpm(cwd=tmp, len=len, counts=counts) - normalized_table = pd.read_csv( - os.path.join(tmp, "out.csv"), - index_col=0, - ) - normalized_table.index.name = "sample_id" - return normalized_table - - -def run_rnanorm_tpm( - cwd: str, - len: str, - counts: str, -): - cmd = ["rnanorm", "tpm", counts, "--gene-lengths", len, "--out", f"{cwd}/out.csv"] - - try: - run_command(cmd, cwd, verbose=True, shell=True) - except subprocess.CalledProcessError as e: - raise Exception( - "An error was encountered while running RNAnorm, " - f"(return code {e.returncode}), please inspect " - "stdout and stderr to learn more." - ) + ).set_index("Reference Sequence")["Reference Length"] + + len_all = len_all.combine_first(len_sample) + len_all = len_all + # len_all = len_all.values.reshape(-1, 1) + counts_arr = table.matrix_data.toarray().T + # counts_arr = counts_arr.T + # len_all = len_all.reshape(-1, 1) + tpm_normalizer = TPM(gene_lengths=len_all).set_output(transform="pandas") + tpm_df = tpm_normalizer.fit_transform(counts_arr) + tpm_df.index.name = "sample_id" + return tpm_df From 358ba73fa172f0eeb69649f6112467a3149b0b38 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Thu, 1 Feb 2024 10:48:50 +0100 Subject: [PATCH 04/23] added tpm python integration --- q2_amr/card/normalization.py | 65 +++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py index c1c7156..3305c2c 100644 --- a/q2_amr/card/normalization.py +++ b/q2_amr/card/normalization.py @@ -1,8 +1,5 @@ -import os - import biom import pandas as pd -from pydeseq2.dds import DeseqDataSet from rnanorm import TPM from q2_amr.types import CARDAlleleAnnotationDirectoryFormat @@ -26,8 +23,8 @@ def normalize_mor( original_columns = metadata.columns metadata.columns = ["condition"] + list(original_columns[1:]) - dds = DeseqDataSet(counts=df, metadata=metadata) - dds.fit_size_factors() + # dds = DeseqDataSet(counts=df, metadata=metadata) + # dds.fit_size_factors() # normalizedcounts = dds.obsm["size_factors"] return df @@ -36,25 +33,39 @@ def normalize_mor( def normalize_tpm( table: biom.Table, amr_annotations: CARDAlleleAnnotationDirectoryFormat ) -> pd.DataFrame: - # Initialize an empty series for gene lengths - len_all = pd.Series() - - # Iterate over samples in the specified path - for samp in os.listdir(amr_annotations.path): - anno_txt = os.path.join(amr_annotations.path, samp, "allele_mapping_data.txt") - - # Read each DataFrame and append it to the list - len_sample = pd.read_csv( - anno_txt, sep="\t", usecols=["Reference Sequence", "Reference Length"] - ).set_index("Reference Sequence")["Reference Length"] - - len_all = len_all.combine_first(len_sample) - len_all = len_all - # len_all = len_all.values.reshape(-1, 1) - counts_arr = table.matrix_data.toarray().T - # counts_arr = counts_arr.T - # len_all = len_all.reshape(-1, 1) - tpm_normalizer = TPM(gene_lengths=len_all).set_output(transform="pandas") - tpm_df = tpm_normalizer.fit_transform(counts_arr) - tpm_df.index.name = "sample_id" - return tpm_df + # # Initialize an empty series for gene lengths + # len_all = pd.Series() + # + # # Iterate over samples in the specified path + # for samp in os.listdir(amr_annotations.path): + # anno_txt = os.path.join(amr_annotations.path, samp, "allele_mapping_data.txt") + # + # # Read each DataFrame and append it to the list + # len_sample = pd.read_csv( + # anno_txt, sep="\t", usecols=["Reference Sequence", "Reference Length"] + # ).set_index("Reference Sequence")["Reference Length"] + # + # len_all = len_all.combine_first(len_sample) + len_all = pd.read_csv( + "/Users/rischv/Desktop/genelengts.txt", + sep="\t", + header=None, + names=["index", "values"], + index_col="index", + squeeze=True, + ) + len_all = len_all.astype(int) + df = pd.DataFrame( + data=table.matrix_data.toarray(), + index=table.ids(axis="observation"), + columns=table.ids(axis="sample"), + ).T + # Work with pandas. + # len_all.to_csv("/Users/rischv/Desktop/genelengts.txt", sep='\t', index=True, + # header=False) + + transformer = TPM(gene_lengths=len_all).set_output(transform="pandas") + exp_normalized = transformer.fit_transform(df) + + exp_normalized.index.name = "sample_id" + return exp_normalized From 315e5a1a2ce36efe37aba8017da3009a5aab16b4 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Thu, 1 Feb 2024 14:02:28 +0100 Subject: [PATCH 05/23] added GeneLength type --- q2_amr/plugin_setup.py | 8 ++++++ q2_amr/types/__init__.py | 4 +++ q2_amr/types/_format.py | 22 +++++++++++++++ q2_amr/types/_type.py | 1 + q2_amr/types/tests/data/gene_length.txt | 2 ++ .../types/tests/data/gene_length_3_cols.txt | 2 ++ .../types/tests/data/gene_length_switched.txt | 2 ++ .../tests/test_types_formats_transformers.py | 28 +++++++++++++++++++ 8 files changed, 69 insertions(+) create mode 100644 q2_amr/types/tests/data/gene_length.txt create mode 100644 q2_amr/types/tests/data/gene_length_3_cols.txt create mode 100644 q2_amr/types/tests/data/gene_length_switched.txt diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py index cb22126..45d0e3c 100644 --- a/q2_amr/plugin_setup.py +++ b/q2_amr/plugin_setup.py @@ -41,12 +41,15 @@ CARDKmerTXTFormat, CARDWildcardIndexFormat, GapDNAFASTAFormat, + GeneLengthDirectoryFormat, + GeneLengthFormat, ) from q2_amr.types._type import ( CARDAlleleAnnotation, CARDAnnotation, CARDGeneAnnotation, CARDKmerDatabase, + GeneLength, ) citations = Citations.load("citations.bib", package="q2_amr") @@ -239,6 +242,9 @@ plugin.register_semantic_type_to_format( SampleData[CARDGeneAnnotation], artifact_format=CARDGeneAnnotationDirectoryFormat ) +plugin.register_semantic_type_to_format( + GeneLength, artifact_format=GeneLengthDirectoryFormat +) plugin.register_formats( CARDKmerDatabaseDirectoryFormat, @@ -256,6 +262,8 @@ CARDAnnotationStatsFormat, CARDAlleleAnnotationDirectoryFormat, CARDGeneAnnotationDirectoryFormat, + GeneLengthFormat, + GeneLengthDirectoryFormat, ) importlib.import_module("q2_amr.types._transformer") diff --git a/q2_amr/types/__init__.py b/q2_amr/types/__init__.py index 7f5d25d..94f3f12 100644 --- a/q2_amr/types/__init__.py +++ b/q2_amr/types/__init__.py @@ -22,6 +22,8 @@ CARDKmerTXTFormat, CARDWildcardIndexFormat, GapDNAFASTAFormat, + GeneLengthDirectoryFormat, + GeneLengthFormat, ) from ._type import ( CARDAlleleAnnotation, @@ -52,4 +54,6 @@ "GapDNAFASTAFormat", "CARDWildcardIndexFormat", "CARDKmerDatabase", + "GeneLengthDirectoryFormat", + "GeneLengthFormat", ] diff --git a/q2_amr/types/_format.py b/q2_amr/types/_format.py index de8c44a..7ee2eca 100644 --- a/q2_amr/types/_format.py +++ b/q2_amr/types/_format.py @@ -425,3 +425,25 @@ class CARDGeneAnnotationDirectoryFormat(MultiDirValidationMixin, model.Directory @gene.set_path_maker def gene_path_maker(self, sample_id): return "%s/gene_mapping_data.txt" % sample_id + + +class GeneLengthFormat(model.TextFileFormat): + def _validate(self, n_records=None): + df = pd.read_csv(str(self), sep="\t", header=None) + has_two_columns = len(df.columns) == 2 + lengths_num = df.iloc[:, 1].apply(lambda x: isinstance(x, (float, int))).all() + if not (has_two_columns and lengths_num): + raise ValidationError( + "Format does not match GeneLengthsFormat. Must consist of tab " + "separated values in two columns with no header. The first column must " + "be the gene names and the second column must be the corresponding " + "gene lengths." + ) + + def _validate_(self, level): + self._validate() + + +GeneLengthDirectoryFormat = model.SingleFileDirectoryFormat( + "GeneLengthDirectoryFormat", "gene_length.txt", GeneLengthFormat +) diff --git a/q2_amr/types/_type.py b/q2_amr/types/_type.py index aa2f302..bee4195 100644 --- a/q2_amr/types/_type.py +++ b/q2_amr/types/_type.py @@ -17,3 +17,4 @@ CARDGeneAnnotation = SemanticType( "CARDGeneAnnotation", variant_of=SampleData.field["type"] ) +GeneLength = SemanticType("GeneLength") diff --git a/q2_amr/types/tests/data/gene_length.txt b/q2_amr/types/tests/data/gene_length.txt new file mode 100644 index 0000000..2fddb36 --- /dev/null +++ b/q2_amr/types/tests/data/gene_length.txt @@ -0,0 +1,2 @@ +ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1 1356.0 +ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1 1173.0 diff --git a/q2_amr/types/tests/data/gene_length_3_cols.txt b/q2_amr/types/tests/data/gene_length_3_cols.txt new file mode 100644 index 0000000..a1803bb --- /dev/null +++ b/q2_amr/types/tests/data/gene_length_3_cols.txt @@ -0,0 +1,2 @@ +ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1 1356.0 1 +ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1 1173.0 1 diff --git a/q2_amr/types/tests/data/gene_length_switched.txt b/q2_amr/types/tests/data/gene_length_switched.txt new file mode 100644 index 0000000..cae5b65 --- /dev/null +++ b/q2_amr/types/tests/data/gene_length_switched.txt @@ -0,0 +1,2 @@ +1356.0 ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1 +1173.0 ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1 diff --git a/q2_amr/types/tests/test_types_formats_transformers.py b/q2_amr/types/tests/test_types_formats_transformers.py index a594eaa..55ceb21 100644 --- a/q2_amr/types/tests/test_types_formats_transformers.py +++ b/q2_amr/types/tests/test_types_formats_transformers.py @@ -21,6 +21,7 @@ ProteinIterator, ) from q2_types_genomics.genome_data import GenesDirectoryFormat, ProteinsDirectoryFormat +from qiime2.plugin import ValidationError from qiime2.plugin.testing import TestPluginBase from skbio import DNA, Protein @@ -38,6 +39,8 @@ CARDKmerTXTFormat, CARDWildcardIndexFormat, GapDNAFASTAFormat, + GeneLengthDirectoryFormat, + GeneLengthFormat, ) from q2_amr.types._transformer import ( _read_from_card_file, @@ -328,3 +331,28 @@ def test_CARDAlleleAnnotationDirectoryFormat_to_qiime2_Metadata_transformer(self ) metadata_obt = transformer(annotation) self.assertIsInstance(metadata_obt, qiime2.Metadata) + + +class TestGeneLengthsTypesAndFormats(AMRTypesTestPluginBase): + def test_gene_length_format_validate_positive(self): + filepath = self.get_data_path("gene_length.txt") + format = GeneLengthFormat(filepath, mode="r") + format.validate() + + def test_gene_length_format_validate_negative_3_cols(self): + filepath = self.get_data_path("gene_length_3_cols.txt") + with self.assertRaises(ValidationError): + format = GeneLengthFormat(filepath, mode="r") + format.validate() + + def test_gene_length_format_validate_negative_switched(self): + filepath = self.get_data_path("gene_length_switched.txt") + with self.assertRaises(ValidationError): + format = GeneLengthFormat(filepath, mode="r") + format.validate() + + def test_gene_length_directory_format_validate_positive(self): + filepath = self.get_data_path("gene_length.txt") + shutil.copy(filepath, os.path.join(self.temp_dir.name, "gene_length.txt")) + format = GeneLengthDirectoryFormat(self.temp_dir.name, mode="r") + format.validate() From 8a6c46b3266944578593ff660f41aa77d4f65205 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Thu, 1 Feb 2024 15:44:59 +0100 Subject: [PATCH 06/23] added transformer for GeneLength --- q2_amr/types/_transformer.py | 21 +++++++++++++++++++ .../tests/test_types_formats_transformers.py | 12 +++++++++++ 2 files changed, 33 insertions(+) diff --git a/q2_amr/types/_transformer.py b/q2_amr/types/_transformer.py index 6d08cf1..f6fa159 100644 --- a/q2_amr/types/_transformer.py +++ b/q2_amr/types/_transformer.py @@ -26,6 +26,7 @@ CARDAnnotationTXTFormat, CARDDatabaseFormat, CARDGeneAnnotationDirectoryFormat, + GeneLengthDirectoryFormat, ) @@ -253,3 +254,23 @@ def tabulate_data(data_path, data_type): if data_type == "mags": df_combined.rename(columns={"ID": "HSP_Identifier"}, inplace=True) return qiime2.Metadata(df_combined) + + +@plugin.register_transformer +def _15(data: CARDAlleleAnnotationDirectoryFormat) -> GeneLengthDirectoryFormat: + len_all = pd.Series() + directory = GeneLengthDirectoryFormat() + # Iterate over samples in the specified path + for samp in os.listdir(data.path): + anno_txt = os.path.join(data.path, samp, "allele_mapping_data.txt") + + # Read each DataFrame and append it to the list + len_sample = pd.read_csv( + anno_txt, sep="\t", usecols=["Reference Sequence", "Reference Length"] + ).set_index("Reference Sequence")["Reference Length"] + + len_all = len_all.combine_first(len_sample) + len_all.to_csv( + os.path.join(directory.path, "gene_length.txt"), sep="\t", header=False + ) + return directory diff --git a/q2_amr/types/tests/test_types_formats_transformers.py b/q2_amr/types/tests/test_types_formats_transformers.py index 55ceb21..e0de12b 100644 --- a/q2_amr/types/tests/test_types_formats_transformers.py +++ b/q2_amr/types/tests/test_types_formats_transformers.py @@ -356,3 +356,15 @@ def test_gene_length_directory_format_validate_positive(self): shutil.copy(filepath, os.path.join(self.temp_dir.name, "gene_length.txt")) format = GeneLengthDirectoryFormat(self.temp_dir.name, mode="r") format.validate() + + def test_card_annotation_format_to_df_transformertt(self): + transformer = self.get_transformer( + CARDAlleleAnnotationDirectoryFormat, GeneLengthDirectoryFormat + ) + + annotation = CARDAlleleAnnotationDirectoryFormat( + self.get_data_path("annotate_reads_output"), "r" + ) + + obs = transformer(annotation) + self.assertIsInstance(obs, GeneLengthDirectoryFormat) From 942cd8bced550afdaba03a0544b99bb9065d6207 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Thu, 1 Feb 2024 17:08:44 +0100 Subject: [PATCH 07/23] changed input of normalize to genelength --- q2_amr/card/normalization.py | 27 +++++++-------------------- q2_amr/plugin_setup.py | 4 ++-- 2 files changed, 9 insertions(+), 22 deletions(-) diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py index 3305c2c..a92c1b1 100644 --- a/q2_amr/card/normalization.py +++ b/q2_amr/card/normalization.py @@ -1,8 +1,10 @@ +import os + import biom import pandas as pd from rnanorm import TPM -from q2_amr.types import CARDAlleleAnnotationDirectoryFormat +from q2_amr.types import GeneLengthDirectoryFormat def normalize_mor( @@ -31,40 +33,25 @@ def normalize_mor( def normalize_tpm( - table: biom.Table, amr_annotations: CARDAlleleAnnotationDirectoryFormat + table: biom.Table, gene_length: GeneLengthDirectoryFormat ) -> pd.DataFrame: - # # Initialize an empty series for gene lengths - # len_all = pd.Series() - # - # # Iterate over samples in the specified path - # for samp in os.listdir(amr_annotations.path): - # anno_txt = os.path.join(amr_annotations.path, samp, "allele_mapping_data.txt") - # - # # Read each DataFrame and append it to the list - # len_sample = pd.read_csv( - # anno_txt, sep="\t", usecols=["Reference Sequence", "Reference Length"] - # ).set_index("Reference Sequence")["Reference Length"] - # - # len_all = len_all.combine_first(len_sample) - len_all = pd.read_csv( - "/Users/rischv/Desktop/genelengts.txt", + gene_length_series = pd.read_csv( + os.path.join(gene_length.path, "gene_length.txt"), sep="\t", header=None, names=["index", "values"], index_col="index", squeeze=True, ) - len_all = len_all.astype(int) df = pd.DataFrame( data=table.matrix_data.toarray(), index=table.ids(axis="observation"), columns=table.ids(axis="sample"), ).T - # Work with pandas. # len_all.to_csv("/Users/rischv/Desktop/genelengts.txt", sep='\t', index=True, # header=False) - transformer = TPM(gene_lengths=len_all).set_output(transform="pandas") + transformer = TPM(gene_lengths=gene_length_series).set_output(transform="pandas") exp_normalized = transformer.fit_transform(df) exp_normalized.index.name = "sample_id" diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py index 9a739b2..5d06ec9 100644 --- a/q2_amr/plugin_setup.py +++ b/q2_amr/plugin_setup.py @@ -235,11 +235,11 @@ function=normalize_tpm, inputs={ "table": FeatureTable[Frequency], - "amr_annotations": SampleData[CARDAlleleAnnotation], + "gene_length": SampleData[CARDAlleleAnnotation], }, parameters={}, outputs=[("normalized_table", FeatureTable[Frequency])], - input_descriptions={"table": "FeatureTable"}, + input_descriptions={"table": "FeatureTable", "gene_length": "gene_length"}, parameter_descriptions={}, output_descriptions={"normalized_table": "hello"}, name="", From fe1d3a8a58899b3dd071f4ac3be9af3c5ca9a6bc Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Thu, 1 Feb 2024 19:53:18 +0100 Subject: [PATCH 08/23] added transformer for GeneLength --- q2_amr/types/_format.py | 27 ++++++-- q2_amr/types/_transformer.py | 27 +++++--- .../tests/data/gene_length_duplicate.txt | 2 + .../tests/test_types_formats_transformers.py | 69 +++++++++++++------ 4 files changed, 91 insertions(+), 34 deletions(-) create mode 100644 q2_amr/types/tests/data/gene_length_duplicate.txt diff --git a/q2_amr/types/_format.py b/q2_amr/types/_format.py index 7ee2eca..09b1882 100644 --- a/q2_amr/types/_format.py +++ b/q2_amr/types/_format.py @@ -428,16 +428,33 @@ def gene_path_maker(self, sample_id): class GeneLengthFormat(model.TextFileFormat): + """ + Tab-separated format for gene length data. + + The format must consist of tab separated values in two columns with no header. + The first column must be the gene names and the second column must be the + corresponding gene lengths. The gene names can not be duplicated. + """ + def _validate(self, n_records=None): df = pd.read_csv(str(self), sep="\t", header=None) has_two_columns = len(df.columns) == 2 lengths_num = df.iloc[:, 1].apply(lambda x: isinstance(x, (float, int))).all() - if not (has_two_columns and lengths_num): + if not has_two_columns: + raise ValidationError( + "The file must consist of two tab separated columns. The first column " + "must be the gene names and the second column must be the " + "corresponding gene lengths." + ) + if not lengths_num: + raise ValidationError( + "The second column (gene lengths) must be of type int or float." + ) + duplicates = df.iloc[:, 0].duplicated().any() + if duplicates: raise ValidationError( - "Format does not match GeneLengthsFormat. Must consist of tab " - "separated values in two columns with no header. The first column must " - "be the gene names and the second column must be the corresponding " - "gene lengths." + "There are no duplicate values allowed in the first column " + "(gene names)." ) def _validate_(self, level): diff --git a/q2_amr/types/_transformer.py b/q2_amr/types/_transformer.py index f6fa159..22cb51b 100644 --- a/q2_amr/types/_transformer.py +++ b/q2_amr/types/_transformer.py @@ -258,18 +258,29 @@ def tabulate_data(data_path, data_type): @plugin.register_transformer def _15(data: CARDAlleleAnnotationDirectoryFormat) -> GeneLengthDirectoryFormat: + directory = get_gene_lengths(map_type="allele", annotations=data.path) + return directory + + +@plugin.register_transformer +def _16(data: CARDGeneAnnotationDirectoryFormat) -> GeneLengthDirectoryFormat: + directory = get_gene_lengths(map_type="gene", annotations=data.path) + return directory + + +def get_gene_lengths(map_type, annotations): + gene_name_col = "Reference Sequence" if map_type == "allele" else "ARO Term" len_all = pd.Series() directory = GeneLengthDirectoryFormat() - # Iterate over samples in the specified path - for samp in os.listdir(data.path): - anno_txt = os.path.join(data.path, samp, "allele_mapping_data.txt") - - # Read each DataFrame and append it to the list - len_sample = pd.read_csv( - anno_txt, sep="\t", usecols=["Reference Sequence", "Reference Length"] - ).set_index("Reference Sequence")["Reference Length"] + # Iterate over samples, read in each DataFrame and append it to the series + for samp in os.listdir(annotations): + anno_txt = os.path.join(annotations, samp, f"{map_type}_mapping_data.txt") + cols = [gene_name_col, "Reference Length"] + len_sample = pd.read_csv(anno_txt, sep="\t", usecols=cols) + len_sample = len_sample.set_index(cols[0])[cols[1]] len_all = len_all.combine_first(len_sample) + len_all.to_csv( os.path.join(directory.path, "gene_length.txt"), sep="\t", header=False ) diff --git a/q2_amr/types/tests/data/gene_length_duplicate.txt b/q2_amr/types/tests/data/gene_length_duplicate.txt new file mode 100644 index 0000000..60621cb --- /dev/null +++ b/q2_amr/types/tests/data/gene_length_duplicate.txt @@ -0,0 +1,2 @@ +ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1 1356.0 +ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1 1356.0 diff --git a/q2_amr/types/tests/test_types_formats_transformers.py b/q2_amr/types/tests/test_types_formats_transformers.py index e0de12b..b6be2cc 100644 --- a/q2_amr/types/tests/test_types_formats_transformers.py +++ b/q2_amr/types/tests/test_types_formats_transformers.py @@ -339,17 +339,37 @@ def test_gene_length_format_validate_positive(self): format = GeneLengthFormat(filepath, mode="r") format.validate() - def test_gene_length_format_validate_negative_3_cols(self): - filepath = self.get_data_path("gene_length_3_cols.txt") - with self.assertRaises(ValidationError): - format = GeneLengthFormat(filepath, mode="r") - format.validate() - - def test_gene_length_format_validate_negative_switched(self): - filepath = self.get_data_path("gene_length_switched.txt") - with self.assertRaises(ValidationError): - format = GeneLengthFormat(filepath, mode="r") - format.validate() + def test_gene_length_format_validate_negative(self): + # Test ValidationErrors for wrong columns, wrong data type and duplicate gene + # names + test_cases = [ + ( + "gene_length_3_cols.txt", + " is not a(n) GeneLengthFormat file:\n\nThe file must consist of two " + "tab separated columns. The first column must be the gene names and " + "the second column must be the corresponding gene lengths.", + ), + ( + "gene_length_switched.txt", + " is not a(n) GeneLengthFormat file:\n\nThe second column " + "(gene lengths) must be of type int or float.", + ), + ( + "gene_length_duplicate.txt", + " is not a(n) GeneLengthFormat file:\n\nThere are no duplicate values " + "allowed in the first column (gene names).", + ), + ] + + for file_name, expected_error_message in test_cases: + with self.subTest(file_name=file_name): + filepath = self.get_data_path(file_name) + with self.assertRaises(ValidationError) as context: + format = GeneLengthFormat(filepath, mode="r") + format.validate() + self.assertEqual( + str(context.exception), filepath + expected_error_message + ) def test_gene_length_directory_format_validate_positive(self): filepath = self.get_data_path("gene_length.txt") @@ -357,14 +377,21 @@ def test_gene_length_directory_format_validate_positive(self): format = GeneLengthDirectoryFormat(self.temp_dir.name, mode="r") format.validate() - def test_card_annotation_format_to_df_transformertt(self): - transformer = self.get_transformer( - CARDAlleleAnnotationDirectoryFormat, GeneLengthDirectoryFormat - ) - - annotation = CARDAlleleAnnotationDirectoryFormat( - self.get_data_path("annotate_reads_output"), "r" - ) + def test_card_allele_gene_annotation_dir_format_to_gene_length_dir_format(self): + transformations = [ + (CARDAlleleAnnotationDirectoryFormat, GeneLengthDirectoryFormat), + (CARDGeneAnnotationDirectoryFormat, GeneLengthDirectoryFormat), + ] - obs = transformer(annotation) - self.assertIsInstance(obs, GeneLengthDirectoryFormat) + for input_format, output_format in transformations: + with self.subTest(input_format=input_format, output_format=output_format): + transformer = self.get_transformer(input_format, output_format) + annotation = input_format( + self.get_data_path("annotate_reads_output"), "r" + ) + obs = transformer(annotation) + self.assertIsInstance(obs, GeneLengthDirectoryFormat) + format = GeneLengthFormat( + os.path.join(obs.path, "gene_length.txt"), mode="r" + ) + format.validate() From 6dce0d8be69231273b7265eabe0ed32e093e706b Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Thu, 1 Feb 2024 20:15:30 +0100 Subject: [PATCH 09/23] added comment in test --- q2_amr/types/_transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/q2_amr/types/_transformer.py b/q2_amr/types/_transformer.py index 22cb51b..d85d346 100644 --- a/q2_amr/types/_transformer.py +++ b/q2_amr/types/_transformer.py @@ -269,6 +269,7 @@ def _16(data: CARDGeneAnnotationDirectoryFormat) -> GeneLengthDirectoryFormat: def get_gene_lengths(map_type, annotations): + # Extracts gene lengths from CARDAlleleAnnotation and CARDGeneAnnotation gene_name_col = "Reference Sequence" if map_type == "allele" else "ARO Term" len_all = pd.Series() directory = GeneLengthDirectoryFormat() From e2069ca314d3ec7d0d950b4bd9c73ce34a217b19 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Fri, 2 Feb 2024 09:52:00 +0100 Subject: [PATCH 10/23] added comments --- q2_amr/types/_format.py | 7 +++++-- q2_amr/types/tests/test_types_formats_transformers.py | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/q2_amr/types/_format.py b/q2_amr/types/_format.py index 09b1882..9c41528 100644 --- a/q2_amr/types/_format.py +++ b/q2_amr/types/_format.py @@ -431,25 +431,28 @@ class GeneLengthFormat(model.TextFileFormat): """ Tab-separated format for gene length data. - The format must consist of tab separated values in two columns with no header. + The file must consist of tab separated values in two columns with no header. The first column must be the gene names and the second column must be the corresponding gene lengths. The gene names can not be duplicated. """ def _validate(self, n_records=None): df = pd.read_csv(str(self), sep="\t", header=None) + # Check if df has exactly two columns has_two_columns = len(df.columns) == 2 - lengths_num = df.iloc[:, 1].apply(lambda x: isinstance(x, (float, int))).all() if not has_two_columns: raise ValidationError( "The file must consist of two tab separated columns. The first column " "must be the gene names and the second column must be the " "corresponding gene lengths." ) + # Check if gene length column is numerical (float or int) + lengths_num = df.iloc[:, 1].apply(lambda x: isinstance(x, (float, int))).all() if not lengths_num: raise ValidationError( "The second column (gene lengths) must be of type int or float." ) + # Check if gene names column doesn't have duplicates duplicates = df.iloc[:, 0].duplicated().any() if duplicates: raise ValidationError( diff --git a/q2_amr/types/tests/test_types_formats_transformers.py b/q2_amr/types/tests/test_types_formats_transformers.py index b6be2cc..9949130 100644 --- a/q2_amr/types/tests/test_types_formats_transformers.py +++ b/q2_amr/types/tests/test_types_formats_transformers.py @@ -340,8 +340,8 @@ def test_gene_length_format_validate_positive(self): format.validate() def test_gene_length_format_validate_negative(self): - # Test ValidationErrors for wrong columns, wrong data type and duplicate gene - # names + # Test ValidationErrors for wrong number of columns, wrong data type and + # duplicate gene names test_cases = [ ( "gene_length_3_cols.txt", From b6d2b9478bfe9a9947c9ac5d6e9ebd85ddefc150 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Thu, 8 Feb 2024 16:21:07 +0100 Subject: [PATCH 11/23] added all normalization methods with tests except for CPM --- README.md | 20 ++-- q2_amr/card/heatmap.py | 8 +- q2_amr/card/normalization.py | 105 +++++++++++-------- q2_amr/card/tests/data/feature-table.biom | Bin 0 -> 33824 bytes q2_amr/card/tests/data/gene_length.txt | 2 + q2_amr/card/tests/data/gene_length_short.txt | 1 + q2_amr/card/tests/test_normalization.py | 87 +++++++++++++-- q2_amr/card/utils.py | 6 ++ q2_amr/citations.bib | 9 ++ q2_amr/plugin_setup.py | 30 ++---- 10 files changed, 178 insertions(+), 90 deletions(-) create mode 100644 q2_amr/card/tests/data/feature-table.biom create mode 100644 q2_amr/card/tests/data/gene_length.txt create mode 100644 q2_amr/card/tests/data/gene_length_short.txt diff --git a/README.md b/README.md index b6c8579..2b7a67e 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ To install _q2-amr_, follow the steps described below. mamba create -yn q2-amr \ -c conda-forge -c bioconda -c qiime2 -c defaults \ -c https://packages.qiime2.org/qiime2/2023.9/shotgun/released/ \ - qiime2 q2cli q2templates q2-types q2-types-genomics rgi + qiime2 q2cli q2templates q2-types q2-types-genomics rgi rnanorm conda activate q2-amr @@ -38,7 +38,7 @@ qiime info CONDA_SUBDIR=osx-64 mamba create -yn q2-amr \ -c conda-forge -c bioconda -c qiime2 -c defaults \ -c https://packages.qiime2.org/qiime2/2023.9/shotgun/released/ \ - qiime2 q2cli q2templates q2-types q2-types-genomics rgi + qiime2 q2cli q2templates q2-types q2-types-genomics rgi rnanorm conda activate q2-amr conda config --env --set subdir osx-64 @@ -58,15 +58,17 @@ qiime info ## Functionality This QIIME 2 plugin contains actions used to annotate short single/paired-end sequencing reads and MAGs with antimicrobial resistance genes. Currently, the [CARD](https://card.mcmaster.ca) database is supported (for details on -the implementation and usage, please refer to the [rgi](https://github.com/arpcard/rgi) documentation). Below you will +the implementation and usage, please refer to the [RGI](https://github.com/arpcard/rgi) documentation). Below you will find an overview of actions available in the plugin. -| Action | Description | Underlying tool | Used function | -|----------------------------|--------------------------------------------------------------------------------------|---------------------------------------|--------------------------------------| -| fetch-card-db | Download and preprocess CARD and WildCARD data. | [rgi](https://github.com/arpcard/rgi) | card_annotation, wildcard_annotation | -| annotate-mags-card | Annotate MAGs with antimicrobial resistance gene information from CARD. | [rgi](https://github.com/arpcard/rgi) | main, load | -| annotate-reads-card | Annotate metagenomic reads with antimicrobial resistance gene information from CARD. | [rgi](https://github.com/arpcard/rgi) | bwt, load | -| heatmap | Create a heatmap from annotate-mags-card output files. | [rgi](https://github.com/arpcard/rgi) | heatmap | +| Action | Description | Underlying tool | Used function | +|---------------------|--------------------------------------------------------------------------------------|-------------------------------------------|--------------------------------------| +| fetch-card-db | Download and preprocess CARD and WildCARD data. | [RGI](https://github.com/arpcard/rgi) | card_annotation, wildcard_annotation | +| annotate-mags-card | Annotate MAGs with antimicrobial resistance gene information from CARD. | [RGI](https://github.com/arpcard/rgi) | main, load | +| annotate-reads-card | Annotate metagenomic reads with antimicrobial resistance gene information from CARD. | [RGI](https://github.com/arpcard/rgi) | bwt, load | +| heatmap | Create a heatmap from annotate-mags-card output files. | [RGI](https://github.com/arpcard/rgi) | heatmap | +| normalize | Normalize feature table with [FPKM](https://www.nature.com/articles/nmeth.1226), [TPM](https://link.springer.com/article/10.1007/s12064-012-0162-3), [TMM](https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-3-r25), [UQ](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-11-94), [CUF](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02568-9/), or [CTF](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02568-9/) method. | [rnaNORM](https://github.com/genialis/RNAnorm) | fpkm, tpm, tmm, uq, cuf, ctf | + ## Dev environment This repository follows the _black_ code style. To make the development slightly easier diff --git a/q2_amr/card/heatmap.py b/q2_amr/card/heatmap.py index b6c93ed..bc9e253 100644 --- a/q2_amr/card/heatmap.py +++ b/q2_amr/card/heatmap.py @@ -8,7 +8,7 @@ import pkg_resources import q2templates -from q2_amr.card.utils import run_command +from q2_amr.card.utils import InvalidParameterCombinationError, run_command from q2_amr.types import CARDAnnotationDirectoryFormat @@ -50,12 +50,6 @@ def heatmap( q2templates.render(templates, output_dir, context=context) -class InvalidParameterCombinationError(Exception): - def __init__(self, message="Invalid parameter combination"): - self.message = message - super().__init__(self.message) - - def run_rgi_heatmap(tmp, json_files_dir, clus, cat, display, frequency): cmd = [ "rgi", diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py index a92c1b1..98a9109 100644 --- a/q2_amr/card/normalization.py +++ b/q2_amr/card/normalization.py @@ -2,57 +2,74 @@ import biom import pandas as pd -from rnanorm import TPM +from rnanorm import CTF, CUF, FPKM, TMM, TPM, UQ +from q2_amr.card.utils import InvalidParameterCombinationError from q2_amr.types import GeneLengthDirectoryFormat -def normalize_mor( +def normalize( table: biom.Table, + method: str, + m_trim: float = 0.3, + a_trim: float = 0.05, + gene_length: GeneLengthDirectoryFormat = None, ) -> pd.DataFrame: - # manifest = reads.manifest.view(pd.DataFrame) - # #paired = isinstance(reads, SingleLanePerSamplePairedEndFastqDirFmt) - # for samp in list(manifest.index): - # # fwd = manifest.loc[samp, "forward"] - # # rev = manifest.loc[samp, "reverse"] if paired else None - data = table.matrix_data.toarray() - columns = table.ids(axis="sample") - index = table.ids(axis="observation") - df = pd.DataFrame(data, index=index, columns=columns) - df = df.T - metadata = df.iloc[:, 0].to_frame() - metadata.iloc[0, 0] = "a" - original_columns = metadata.columns - - metadata.columns = ["condition"] + list(original_columns[1:]) - # dds = DeseqDataSet(counts=df, metadata=metadata) - # dds.fit_size_factors() - - # normalizedcounts = dds.obsm["size_factors"] - return df - - -def normalize_tpm( - table: biom.Table, gene_length: GeneLengthDirectoryFormat -) -> pd.DataFrame: - gene_length_series = pd.read_csv( - os.path.join(gene_length.path, "gene_length.txt"), - sep="\t", - header=None, - names=["index", "values"], - index_col="index", - squeeze=True, - ) - df = pd.DataFrame( + # Create Dataframe with counts from biom.Table + counts = pd.DataFrame( data=table.matrix_data.toarray(), index=table.ids(axis="observation"), columns=table.ids(axis="sample"), ).T - # len_all.to_csv("/Users/rischv/Desktop/genelengts.txt", sep='\t', index=True, - # header=False) - - transformer = TPM(gene_lengths=gene_length_series).set_output(transform="pandas") - exp_normalized = transformer.fit_transform(df) - - exp_normalized.index.name = "sample_id" - return exp_normalized + if method in ["tpm", "fpkm", "uq", "cuf"]: + # Raise Error if m or a-trim parameters are given with methods TPM, FPKM, UQ or + # CUF + if m_trim != 0.3 or a_trim != 0.05: + raise InvalidParameterCombinationError( + "Parameters m-trim and a-trim can only be used with methods TMM and " + "CTF." + ) + if method in ["tpm", "fpkm"]: + # Raise Error if gene-length is missing when using methods TPM or FPKM + if not gene_length: + raise ValueError("gene-length input is missing.") + # Create pd.Series from gene_length input + gene_length_series = pd.read_csv( + os.path.join(gene_length.path, "gene_length.txt"), + sep="\t", + header=None, + names=["index", "values"], + index_col="index", + squeeze=True, + ) + # Raise Error if there are genes in the counts that are not present in the + # gene length + if not set(counts.columns).issubset(set(gene_length_series.index)): + only_in_counts = set(counts.columns) - set(gene_length_series.index) + raise ValueError( + f"There are genes present in the FeatureTable that are not present " + f"in the gene-length input. Missing lengths for genes: " + f"{only_in_counts}" + ) + # Define the methods TPM and FPKM with the gene length series as an input + methods = { + "tpm": TPM(gene_lengths=gene_length_series), + "fpkm": FPKM(gene_lengths=gene_length_series), + } + if method in ["tmm", "uq", "cuf", "ctf"]: + # Raise Error if gene-length is given when using methods TMM, UQ, CUF or CTF + if gene_length: + raise ValueError( + "gene-length input can only be used with FPKM and TPM methods." + ) + # Define the methods TMM and CTF with parameters, also UQ and CUF + methods = { + "tmm": TMM(m_trim=m_trim, a_trim=a_trim), + "ctf": CTF(m_trim=m_trim, a_trim=a_trim), + "uq": UQ(), + "cuf": CUF(), + } + # Run normalization method on count dataframe + normalized = methods[method].set_output(transform="pandas").fit_transform(counts) + normalized.index.name = "sample_id" + return normalized diff --git a/q2_amr/card/tests/data/feature-table.biom b/q2_amr/card/tests/data/feature-table.biom new file mode 100644 index 0000000000000000000000000000000000000000..3a8fcee13f42e92f42a5463c65bee03c7bbf9168 GIT binary patch literal 33824 zcmeI4PfS}k7{DC@Z3>iysoH4ONlzP_I4txf{9P^$Ep$QJfVFI6n}(9WD?}nB3u#9; z4s6T>TG}C*SxrM12nKCd_FwepY6}~@7a#| z{LIhE@oQ(>FSHBgLLt#A!a7KgXXu=tKtgd!PhlW{{$2EcX@NjMvU%?F!*;2u*(o$`sqZxz z?h&Z(wpU!!Xu~ouWKXD)A|U&%9K8acKy5ucJGIvz@YW;p)Wx{s0 z%3xVvm)5sX^z_S*EJ~&O(09S7udPDgDc$e2<+Oew2kkHQzf-uX2FbBi>I;e0)$ert z1$|mt<`uaY@M8 z08YV06xUq;kG8-L^4@KkdQ2%Ibz#WJ| z{!`$I2>N5!r{e>zbg_|*$?NOO^*kpwUF#m_wf&XDrefcs5$pO0Y#MWA;zEFFe*49QJbCc>3IMULi-=s z35E51<%HnkZdwSK>j2HmJ{&((jOHDj4v_TlKCA<+WX=|k-vV7!&mP9Z4#>fpF2qdo z<#qO9@?9)P`W|3>=WWP9{oKa+Lix=3gX-$mv#kRrKVApw_7&De!))t->r^(6=1HyA z0o(XB=WDg|uDM=lC+!z1Tg|a%Ct!|W2Rpon>pk_K_OjX4#q5H&CZt~|70c&eqm*%< zen6apvoo9+0sKX?or?!$y2{$^T*f_cLJW3@!Do1Y)+VJYy(I{0JA|VHc+4)tDm@?; zy_+lQq?^vuLRfFGrDf|~XuCSHqn|k$Oo8->pYXdLd|q+#otIaL$B$wM&;4wxz3i3xOG>l*-O5sotfCcp%k025#W4NhQvejz1-r$fK* z%Dv+~@q>zQ%09{6q6$jant9KWUrAjQt?ToDq(w_t__yutu1CKnH-@FWeO!F|UpjO_ z^|@sWs`)V^M6YO*u_e@T=Fwltjc}U?mhhWLSuHyzzyz286JP>N;1~!T+zu6%)&(Kd zeFU%}Il7fKus`h%Lw}VCg3MDgxD3)`dfMzK^nJV8j9rWU#Mc3{2*;TK6JP>NfC(^x z1}9K$|3vrO%KnkbAV+KaKqpaq@M*Qj*0?fC(@GCcp%kK%)|< zwtrH_AmzxX@1Sh|_oD`OsMKV+O5bNQDUcrV(`J97@0iT$|1tJY#L3rivk1qT025#W zOn?b6fd(gFy9Z(3bF6j`BK$dk;O~ALd_Wws`Q|+cy3g>paZk2Q&dq^Js%I19TVI0Q#Ts|v z-#S|B@57XyuOLAB5*SZ?4RY$|49*wIXU-pMMRk&3TL(;jybjdK3#-#+*w%qpk`Q1l&h;Q_xvu(xjg`8AK_h z9of^okNU*mZ&pn9*3`0@QW>Xtz4c9{yjf#vy;Ex=-9sgxc$xh^*Iy|id$w-?RA%59 z6JP>NfC(@GCeZK%4*osBk$q2D$O28Z1@V23SY?A(<2PX2KiU5Mf3Ic#{^sw&_1qrUe~F=PKkoO~V7o#F=*U;<2l z2`~XB&^!dH?Vsp8iu_}^kFxhY;8j)nj?GfX0^MeRdec&S!q`6%Ctt%YC0Wb_m;e)C M0!)AjG%A6A0p5#ZcK`qY literal 0 HcmV?d00001 diff --git a/q2_amr/card/tests/data/gene_length.txt b/q2_amr/card/tests/data/gene_length.txt new file mode 100644 index 0000000..2fddb36 --- /dev/null +++ b/q2_amr/card/tests/data/gene_length.txt @@ -0,0 +1,2 @@ +ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1 1356.0 +ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1 1173.0 diff --git a/q2_amr/card/tests/data/gene_length_short.txt b/q2_amr/card/tests/data/gene_length_short.txt new file mode 100644 index 0000000..81d043e --- /dev/null +++ b/q2_amr/card/tests/data/gene_length_short.txt @@ -0,0 +1 @@ +ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1 1356.0 diff --git a/q2_amr/card/tests/test_normalization.py b/q2_amr/card/tests/test_normalization.py index 1999eb9..5ab2a17 100644 --- a/q2_amr/card/tests/test_normalization.py +++ b/q2_amr/card/tests/test_normalization.py @@ -1,11 +1,76 @@ -# from qiime2.plugin.testing import TestPluginBase -# -# -# class TestAnnotateReadsCARD(TestPluginBase): -# package = "q2_amr.card.tests" -# -# @classmethod -# def setUpClass(cls): -# -# def test_normalize_MOR(self): -# biom = self.get_data_path() +import os +import shutil +from unittest.mock import MagicMock, patch + +import biom +from qiime2.plugin.testing import TestPluginBase + +from q2_amr.card.normalization import normalize +from q2_amr.card.utils import InvalidParameterCombinationError +from q2_amr.types import GeneLengthDirectoryFormat + + +class TestNormalize(TestPluginBase): + package = "q2_amr.card.tests" + + @classmethod + def setUpClass(cls): + # Mocking the biom.Table and gene_length class + cls.table = MagicMock() + cls.gene_length = MagicMock() + + def test_tpm_fpkm_uq_cuf_with_invalid_m_a_trim(self): + # Test Error raised if gene-length is given with TMM method + expected_message = ( + "Parameters m-trim and a-trim can only be used with methods TMM and " "CTF." + ) + with self.assertRaises(InvalidParameterCombinationError) as cm: + normalize(self.table, "tpm", m_trim=0.2, a_trim=0.05) + self.assertEqual(str(cm.exception), expected_message) + + def test_tpm_fpkm_with_missing_gene_length(self): + # Test Error raised if gene-length is missing with TPM method + expected_message = "gene-length input is missing." + with self.assertRaises(ValueError) as cm: + normalize(self.table, "tpm") + self.assertEqual(str(cm.exception), expected_message) + + def test_tmm_uq_cuf_ctf_with_gene_length(self): + # Test Error raised if gene-length is given with TMM method + expected_message = ( + "gene-length input can only be used with FPKM and TPM methods." + ) + with self.assertRaises(ValueError) as cm: + normalize(self.table, "tmm", gene_length=self.gene_length) + self.assertEqual(str(cm.exception), expected_message) + + def test_tpm_fpkm_with_short_gene_length(self): + # Test Error raised if gene-length is missing genes + gene_length = GeneLengthDirectoryFormat() + shutil.copy( + self.get_data_path("gene_length_short.txt"), + os.path.join(gene_length.path, "gene_length.txt"), + ) + table = biom.load_table(self.get_data_path("feature-table.biom")) + expected_message = ( + "There are genes present in the FeatureTable that are not present " + "in the gene-length input. Missing lengths for genes: " + "{'ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1'}" + ) + with self.assertRaises(ValueError) as cm: + normalize(table, "tpm", gene_length=gene_length) + self.assertEqual(str(cm.exception), expected_message) + + @patch("q2_amr.card.normalization.TPM") + def test_tpm_fpkm_with_valid_inputs(self, mock_tpm): + # Test valid inputs for TPM method + gene_length = GeneLengthDirectoryFormat() + shutil.copy(self.get_data_path("gene_length.txt"), gene_length.path) + table = biom.load_table(self.get_data_path("feature-table.biom")) + normalize(table=table, gene_length=gene_length, method="tpm") + + @patch("q2_amr.card.normalization.TMM") + def test_tmm_uq_cuf_ctf_with_valid_inputs(self, mock_tmm): + # Test valid inputs for TMM method + table = biom.load_table(self.get_data_path("feature-table.biom")) + normalize(table=table, method="tmm", a_trim=0.06, m_trim=0.4) diff --git a/q2_amr/card/utils.py b/q2_amr/card/utils.py index 1599f8d..69c600c 100644 --- a/q2_amr/card/utils.py +++ b/q2_amr/card/utils.py @@ -140,3 +140,9 @@ def create_count_table(df_list: list) -> pd.DataFrame: df.columns.name = None df.index.name = "sample_id" return df + + +class InvalidParameterCombinationError(Exception): + def __init__(self, message="Invalid parameter combination"): + self.message = message + super().__init__(self.message) diff --git a/q2_amr/citations.bib b/q2_amr/citations.bib index a748e7a..9858108 100644 --- a/q2_amr/citations.bib +++ b/q2_amr/citations.bib @@ -26,3 +26,12 @@ @article{alcock_card_2023 keywords = {*Data Curation, *Databases, Factual, *Drug Resistance, Microbial, *Machine Learning, Anti-Bacterial Agents/pharmacology, Genes, Bacterial, Likelihood Functions, Molecular Sequence Annotation, Software}, pages = {D690--D699}, } + +@misc{Zmrzlikar_RNAnorm_RNA-seq_data_2023, +author = {Zmrzlikar, Jure and Žganec, Matjaž and Ausec, Luka and Štajdohar, Miha}, +month = jul, +title = {{RNAnorm: RNA-seq data normalization in Python}}, +url = {https://github.com/genialis/RNAnorm}, +version = {2.1.0}, +year = {2023} +} diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py index 5d06ec9..0f67ff0 100644 --- a/q2_amr/plugin_setup.py +++ b/q2_amr/plugin_setup.py @@ -14,14 +14,14 @@ ) from q2_types.sample_data import SampleData from q2_types_genomics.per_sample_data import MAGs -from qiime2.core.type import Bool, Choices, Int, Properties, Range, Str, TypeMap +from qiime2.core.type import Bool, Choices, Float, Int, Properties, Range, Str, TypeMap from qiime2.plugin import Citations, Plugin from q2_amr import __version__ from q2_amr.card.database import fetch_card_db from q2_amr.card.heatmap import heatmap from q2_amr.card.mags import annotate_mags_card -from q2_amr.card.normalization import normalize_mor, normalize_tpm +from q2_amr.card.normalization import normalize from q2_amr.card.reads import annotate_reads_card from q2_amr.types import ( CARDAnnotationJSONFormat, @@ -219,32 +219,24 @@ ) plugin.methods.register_function( - function=normalize_mor, - inputs={"table": FeatureTable[Frequency]}, - parameters={}, - outputs=[("normalized_table", FeatureTable[Frequency])], - input_descriptions={"table": "FeatureTable"}, - parameter_descriptions={}, - output_descriptions={"normalized_table": "hello"}, - name="", - description="", - citations=[], -) - -plugin.methods.register_function( - function=normalize_tpm, + function=normalize, inputs={ "table": FeatureTable[Frequency], - "gene_length": SampleData[CARDAlleleAnnotation], + "gene_length": GeneLength + | SampleData[CARDAlleleAnnotation | CARDGeneAnnotation], + }, + parameters={ + "method": Str % Choices(["tpm", "fpkm", "tmm", "uq", "cuf", "ctf"]), + "m_trim": Float, + "a_trim": Float, }, - parameters={}, outputs=[("normalized_table", FeatureTable[Frequency])], input_descriptions={"table": "FeatureTable", "gene_length": "gene_length"}, parameter_descriptions={}, output_descriptions={"normalized_table": "hello"}, name="", description="", - citations=[], + citations=[citations["Zmrzlikar_RNAnorm_RNA-seq_data_2023"]], ) # Registrations From dd1f9ff7b280aecebc2f0348c69f2c1468bc023b Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Thu, 8 Feb 2024 16:40:58 +0100 Subject: [PATCH 12/23] added cpm method --- q2_amr/card/normalization.py | 16 +++++++++------- q2_amr/plugin_setup.py | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py index 98a9109..ec42e97 100644 --- a/q2_amr/card/normalization.py +++ b/q2_amr/card/normalization.py @@ -2,7 +2,7 @@ import biom import pandas as pd -from rnanorm import CTF, CUF, FPKM, TMM, TPM, UQ +from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ from q2_amr.card.utils import InvalidParameterCombinationError from q2_amr.types import GeneLengthDirectoryFormat @@ -21,9 +21,9 @@ def normalize( index=table.ids(axis="observation"), columns=table.ids(axis="sample"), ).T - if method in ["tpm", "fpkm", "uq", "cuf"]: - # Raise Error if m or a-trim parameters are given with methods TPM, FPKM, UQ or - # CUF + if method in ["tpm", "fpkm", "uq", "cuf", "cpm"]: + # Raise Error if m or a-trim parameters are given with methods TPM, FPKM, UQ, + # CPM or CUF if m_trim != 0.3 or a_trim != 0.05: raise InvalidParameterCombinationError( "Parameters m-trim and a-trim can only be used with methods TMM and " @@ -56,18 +56,20 @@ def normalize( "tpm": TPM(gene_lengths=gene_length_series), "fpkm": FPKM(gene_lengths=gene_length_series), } - if method in ["tmm", "uq", "cuf", "ctf"]: - # Raise Error if gene-length is given when using methods TMM, UQ, CUF or CTF + if method in ["tmm", "uq", "cuf", "ctf", "cpm"]: + # Raise Error if gene-length is given when using methods TMM, UQ, CUF, CPM or + # CTF if gene_length: raise ValueError( "gene-length input can only be used with FPKM and TPM methods." ) - # Define the methods TMM and CTF with parameters, also UQ and CUF + # Define the methods TMM and CTF with parameters, also UQ, CPM and CUF methods = { "tmm": TMM(m_trim=m_trim, a_trim=a_trim), "ctf": CTF(m_trim=m_trim, a_trim=a_trim), "uq": UQ(), "cuf": CUF(), + "cpm": CPM(), } # Run normalization method on count dataframe normalized = methods[method].set_output(transform="pandas").fit_transform(counts) diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py index 0f67ff0..9b08ee3 100644 --- a/q2_amr/plugin_setup.py +++ b/q2_amr/plugin_setup.py @@ -226,7 +226,7 @@ | SampleData[CARDAlleleAnnotation | CARDGeneAnnotation], }, parameters={ - "method": Str % Choices(["tpm", "fpkm", "tmm", "uq", "cuf", "ctf"]), + "method": Str % Choices(["tpm", "fpkm", "tmm", "uq", "cuf", "ctf", "cpm"]), "m_trim": Float, "a_trim": Float, }, From 3daa215610b077320792645a5f98b3e1371be452 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Fri, 9 Feb 2024 10:03:35 +0100 Subject: [PATCH 13/23] updated plugin setup --- q2_amr/plugin_setup.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py index 9b08ee3..56a79b0 100644 --- a/q2_amr/plugin_setup.py +++ b/q2_amr/plugin_setup.py @@ -231,11 +231,27 @@ "a_trim": Float, }, outputs=[("normalized_table", FeatureTable[Frequency])], - input_descriptions={"table": "FeatureTable", "gene_length": "gene_length"}, - parameter_descriptions={}, - output_descriptions={"normalized_table": "hello"}, - name="", - description="", + input_descriptions={ + "table": "Feature table with gene counts.", + "gene_length": "Gene lengths of all genes in the feature table.", + }, + parameter_descriptions={ + "method": "Specify the normalization method to be used. Use FPKM or TPM for " + "within comparisons and TMM, UQ, CUF or CTF for between sample " + "camparisons. Check https://www.genialis.com/wp-content/uploads/2023" + "/12/2023-Normalizing-RNA-seq-data-in-Python-with-RNAnorm.pdf for " + "more information on the methods.", + "m_trim": "Two sided cutoff for M-values. Can only be used for methods TMM and " + "CTF.", + "a_trim": "Two sided cutoff for A-values. Can only be used for methods TMM and " + "CTF.", + }, + output_descriptions={ + "normalized_table": "Feature table normalized with specified " "method." + }, + name="Normalize FeatureTable", + description="Normalize FeatureTable by gene length, library size and composition " + "with common methods for RNA-seq.", citations=[citations["Zmrzlikar_RNAnorm_RNA-seq_data_2023"]], ) From 011abf4d48fef4a500330c9103b194d122e360c4 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Fri, 9 Feb 2024 12:12:56 +0100 Subject: [PATCH 14/23] added rnanorm to meta.yaml --- ci/recipe/meta.yaml | 1 + q2_amr/plugin_setup.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ci/recipe/meta.yaml b/ci/recipe/meta.yaml index 00abfca..799e8d5 100644 --- a/ci/recipe/meta.yaml +++ b/ci/recipe/meta.yaml @@ -24,6 +24,7 @@ requirements: - q2templates {{ qiime2_epoch }}.* - q2cli {{ qiime2_epoch }}.* - rgi + - rnanorm test: requires: diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py index 56a79b0..1bdaafe 100644 --- a/q2_amr/plugin_setup.py +++ b/q2_amr/plugin_setup.py @@ -227,8 +227,8 @@ }, parameters={ "method": Str % Choices(["tpm", "fpkm", "tmm", "uq", "cuf", "ctf", "cpm"]), - "m_trim": Float, - "a_trim": Float, + "m_trim": Float % Range(0, 1, inclusive_start=True), + "a_trim": Float % Range(0, 1, inclusive_start=True), }, outputs=[("normalized_table", FeatureTable[Frequency])], input_descriptions={ From 907afa5c4392992620c19d1f6770be70322ba366 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 29 May 2024 12:43:30 +0200 Subject: [PATCH 15/23] Revert "Merge branch '42_Genelengts_type' into 23_norm_tpm" This reverts commit ff80222daf58709be628527dc0e7a85db0806ffd, reversing changes made to 942cd8bced550afdaba03a0544b99bb9065d6207. --- q2_amr/types/_format.py | 30 ++------ q2_amr/types/_transformer.py | 28 +++----- .../tests/data/gene_length_duplicate.txt | 2 - .../tests/test_types_formats_transformers.py | 69 ++++++------------- 4 files changed, 34 insertions(+), 95 deletions(-) delete mode 100644 q2_amr/types/tests/data/gene_length_duplicate.txt diff --git a/q2_amr/types/_format.py b/q2_amr/types/_format.py index 9c41528..7ee2eca 100644 --- a/q2_amr/types/_format.py +++ b/q2_amr/types/_format.py @@ -428,36 +428,16 @@ def gene_path_maker(self, sample_id): class GeneLengthFormat(model.TextFileFormat): - """ - Tab-separated format for gene length data. - - The file must consist of tab separated values in two columns with no header. - The first column must be the gene names and the second column must be the - corresponding gene lengths. The gene names can not be duplicated. - """ - def _validate(self, n_records=None): df = pd.read_csv(str(self), sep="\t", header=None) - # Check if df has exactly two columns has_two_columns = len(df.columns) == 2 - if not has_two_columns: - raise ValidationError( - "The file must consist of two tab separated columns. The first column " - "must be the gene names and the second column must be the " - "corresponding gene lengths." - ) - # Check if gene length column is numerical (float or int) lengths_num = df.iloc[:, 1].apply(lambda x: isinstance(x, (float, int))).all() - if not lengths_num: - raise ValidationError( - "The second column (gene lengths) must be of type int or float." - ) - # Check if gene names column doesn't have duplicates - duplicates = df.iloc[:, 0].duplicated().any() - if duplicates: + if not (has_two_columns and lengths_num): raise ValidationError( - "There are no duplicate values allowed in the first column " - "(gene names)." + "Format does not match GeneLengthsFormat. Must consist of tab " + "separated values in two columns with no header. The first column must " + "be the gene names and the second column must be the corresponding " + "gene lengths." ) def _validate_(self, level): diff --git a/q2_amr/types/_transformer.py b/q2_amr/types/_transformer.py index d85d346..f6fa159 100644 --- a/q2_amr/types/_transformer.py +++ b/q2_amr/types/_transformer.py @@ -258,30 +258,18 @@ def tabulate_data(data_path, data_type): @plugin.register_transformer def _15(data: CARDAlleleAnnotationDirectoryFormat) -> GeneLengthDirectoryFormat: - directory = get_gene_lengths(map_type="allele", annotations=data.path) - return directory - - -@plugin.register_transformer -def _16(data: CARDGeneAnnotationDirectoryFormat) -> GeneLengthDirectoryFormat: - directory = get_gene_lengths(map_type="gene", annotations=data.path) - return directory - - -def get_gene_lengths(map_type, annotations): - # Extracts gene lengths from CARDAlleleAnnotation and CARDGeneAnnotation - gene_name_col = "Reference Sequence" if map_type == "allele" else "ARO Term" len_all = pd.Series() directory = GeneLengthDirectoryFormat() + # Iterate over samples in the specified path + for samp in os.listdir(data.path): + anno_txt = os.path.join(data.path, samp, "allele_mapping_data.txt") - # Iterate over samples, read in each DataFrame and append it to the series - for samp in os.listdir(annotations): - anno_txt = os.path.join(annotations, samp, f"{map_type}_mapping_data.txt") - cols = [gene_name_col, "Reference Length"] - len_sample = pd.read_csv(anno_txt, sep="\t", usecols=cols) - len_sample = len_sample.set_index(cols[0])[cols[1]] - len_all = len_all.combine_first(len_sample) + # Read each DataFrame and append it to the list + len_sample = pd.read_csv( + anno_txt, sep="\t", usecols=["Reference Sequence", "Reference Length"] + ).set_index("Reference Sequence")["Reference Length"] + len_all = len_all.combine_first(len_sample) len_all.to_csv( os.path.join(directory.path, "gene_length.txt"), sep="\t", header=False ) diff --git a/q2_amr/types/tests/data/gene_length_duplicate.txt b/q2_amr/types/tests/data/gene_length_duplicate.txt deleted file mode 100644 index 60621cb..0000000 --- a/q2_amr/types/tests/data/gene_length_duplicate.txt +++ /dev/null @@ -1,2 +0,0 @@ -ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1 1356.0 -ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1 1356.0 diff --git a/q2_amr/types/tests/test_types_formats_transformers.py b/q2_amr/types/tests/test_types_formats_transformers.py index 9949130..e0de12b 100644 --- a/q2_amr/types/tests/test_types_formats_transformers.py +++ b/q2_amr/types/tests/test_types_formats_transformers.py @@ -339,37 +339,17 @@ def test_gene_length_format_validate_positive(self): format = GeneLengthFormat(filepath, mode="r") format.validate() - def test_gene_length_format_validate_negative(self): - # Test ValidationErrors for wrong number of columns, wrong data type and - # duplicate gene names - test_cases = [ - ( - "gene_length_3_cols.txt", - " is not a(n) GeneLengthFormat file:\n\nThe file must consist of two " - "tab separated columns. The first column must be the gene names and " - "the second column must be the corresponding gene lengths.", - ), - ( - "gene_length_switched.txt", - " is not a(n) GeneLengthFormat file:\n\nThe second column " - "(gene lengths) must be of type int or float.", - ), - ( - "gene_length_duplicate.txt", - " is not a(n) GeneLengthFormat file:\n\nThere are no duplicate values " - "allowed in the first column (gene names).", - ), - ] - - for file_name, expected_error_message in test_cases: - with self.subTest(file_name=file_name): - filepath = self.get_data_path(file_name) - with self.assertRaises(ValidationError) as context: - format = GeneLengthFormat(filepath, mode="r") - format.validate() - self.assertEqual( - str(context.exception), filepath + expected_error_message - ) + def test_gene_length_format_validate_negative_3_cols(self): + filepath = self.get_data_path("gene_length_3_cols.txt") + with self.assertRaises(ValidationError): + format = GeneLengthFormat(filepath, mode="r") + format.validate() + + def test_gene_length_format_validate_negative_switched(self): + filepath = self.get_data_path("gene_length_switched.txt") + with self.assertRaises(ValidationError): + format = GeneLengthFormat(filepath, mode="r") + format.validate() def test_gene_length_directory_format_validate_positive(self): filepath = self.get_data_path("gene_length.txt") @@ -377,21 +357,14 @@ def test_gene_length_directory_format_validate_positive(self): format = GeneLengthDirectoryFormat(self.temp_dir.name, mode="r") format.validate() - def test_card_allele_gene_annotation_dir_format_to_gene_length_dir_format(self): - transformations = [ - (CARDAlleleAnnotationDirectoryFormat, GeneLengthDirectoryFormat), - (CARDGeneAnnotationDirectoryFormat, GeneLengthDirectoryFormat), - ] + def test_card_annotation_format_to_df_transformertt(self): + transformer = self.get_transformer( + CARDAlleleAnnotationDirectoryFormat, GeneLengthDirectoryFormat + ) + + annotation = CARDAlleleAnnotationDirectoryFormat( + self.get_data_path("annotate_reads_output"), "r" + ) - for input_format, output_format in transformations: - with self.subTest(input_format=input_format, output_format=output_format): - transformer = self.get_transformer(input_format, output_format) - annotation = input_format( - self.get_data_path("annotate_reads_output"), "r" - ) - obs = transformer(annotation) - self.assertIsInstance(obs, GeneLengthDirectoryFormat) - format = GeneLengthFormat( - os.path.join(obs.path, "gene_length.txt"), mode="r" - ) - format.validate() + obs = transformer(annotation) + self.assertIsInstance(obs, GeneLengthDirectoryFormat) From 6865edae21adb30751d19453a78d125ae7e8f826 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 29 May 2024 12:44:04 +0200 Subject: [PATCH 16/23] Revert "Merge branch '42_Genelengts_type' into 23_norm_tpm" This reverts commit 07fac990b281ffe348b6df4ad9f3b59da279e3e8, reversing changes made to 358ba73fa172f0eeb69649f6112467a3149b0b38. --- q2_amr/card/tests/data/output.mags.txt | 4 +- q2_amr/card/tests/test_utils.py | 46 +++++++++---------- q2_amr/card/utils.py | 6 +-- q2_amr/plugin_setup.py | 14 ++---- q2_amr/types/__init__.py | 4 -- q2_amr/types/_format.py | 22 --------- q2_amr/types/_transformer.py | 21 --------- q2_amr/types/_type.py | 1 - q2_amr/types/tests/data/gene_length.txt | 2 - .../types/tests/data/gene_length_3_cols.txt | 2 - .../types/tests/data/gene_length_switched.txt | 2 - .../tests/test_types_formats_transformers.py | 40 ---------------- 12 files changed, 28 insertions(+), 136 deletions(-) delete mode 100644 q2_amr/types/tests/data/gene_length.txt delete mode 100644 q2_amr/types/tests/data/gene_length_3_cols.txt delete mode 100644 q2_amr/types/tests/data/gene_length_switched.txt diff --git a/q2_amr/card/tests/data/output.mags.txt b/q2_amr/card/tests/data/output.mags.txt index 55fc417..4102af9 100644 --- a/q2_amr/card/tests/data/output.mags.txt +++ b/q2_amr/card/tests/data/output.mags.txt @@ -1,5 +1,5 @@ ORF_ID Contig Start Stop Orientation Cut_Off Pass_Bitscore Best_Hit_Bitscore Best_Hit_ARO Best_Identities ARO Model_type SNPs_in_Best_Hit_ARO Other_SNPs Drug Class Resistance Mechanism AMR Gene Family Predicted_DNA Predicted_Protein CARD_Protein_Sequence Percentage Length of Reference Sequence ID Model_ID Nudged Note "NC_000962.3_689 # 759789 # 763325 # 1 # ID=1_689;partial=00;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.643" NC_000962.3_689 759789 763325 + Strict 2300 2394.77 mdtF 99.91 3000796 protein variant model D516G, H526T, L511R n/a rifamycin antibiotic "antibiotic target alteration; antibiotic target replacement" rifamycin-resistant beta-subunit of RNA polymerase (rpoB) GTGCTGGAAGGATGCATCTTGGCAGATTCCCGCCAGAGCAAAACAGCCGCTAGTCCTAGTCCGAGTCGCCCGCAAAGTTCCTCGAATAACTCCGTACCCGGAGCGCCAAACCGGGTCTCCTTCGCTAAGCTGCGCGAACCACTTGAGGTTCCGGGACTCCTTGACGTCCAGACCGATTCGTTCGAGTGGCTGATCGGTTCGCCGCGCTGGCGCGAATCCGCCGCCGAGCGGGGTGATGTCAACCCAGTGGGTGGCCTGGAAGAGGTGCTCTACGAGCTGTCTCCGATCGAGGACTTCTCCGGGTCGATGTCGTTGTCGTTCTCTGACCCTCGTTTCGACGATGTCAAGGCACCCGTCGACGAGTGCAAAGACAAGGACATGACGTACGCGGCTCCACTGTTCGTCACCGCCGAGTTCATCAACAACAACACCGGTGAGATCAAGAGTCAGACGGTGTTCATGGGTGACTTCCCGATGATGACCGAGAAGGGCACGTTCATCATCAACGGGACCGAGCGTGTGGTGGTCAGCCAGCTGGTGCGGTCGCCCGGGGTGTACTTCGACGAGACCATTGACAAGTCCACCGACAAGACGCTGCACAGCGTCAAGGTGATCCCGAGCCGCGGCGCGTGGCTCGAGTTTGACGTCGACAAGCGCGACACCGTCGGCGTGCGCATCGACCGCAAACGCCGGCAACCGGTCACCGTGCTGCTCAAGGCGCTGGGCTGGACCAGCGAGCAGATTGTCGAGCGGTTCGGGTTCTCCGAGATCATGCGATCGACGCTGGAGAAGGACAACACCGTCGGCACCGACGAGGCGCTGTTGGACATCTACCGCAAGCTGCGTCCGGGCGAGCCCCCGACCAAAGAGTCAGCGCAGACGCTGTTGGAAAACTTGTTCTTCAAGGAGAAGCGCTACGACCTGGCCCGCGTCGGTCGCTATAAGGTCAACAAGAAGCTCGGGCTGCATGTCGGCGAGCCCATCACGTCGTCGACGCTGACCGAAGAAGACGTCGTGGCCACCATCGAATATCTGGTCCGCTTGCACGAGGGTCAGACCACGATGACCGTTCCGGGCGGCGTCGAGGTGCCGGTGGAAACCGACGACATCGACCACTTCGGCAACCGCCGCCTGCGTACGGTCGGCGAGCTGATCCAAAACCAGATCCGGGTCGGCATGTCGCGGATGGAGCGGGTGGTCCGGGAGCGGATGACCACCCAGGACGTGGAGGCGATCACACCGCAGACGTTGATCAACATCCGGCCGGTGGTCGCCGCGATCAAGGAGTTCTTCGGCACCAGCCAGCTGAGCCAATTCATGGACCAGAACAACCCGCTGTCGGGGTTGACCCACAAGCGCCGACTGTCGGCGCTGGGGCCCGGCGGTCTGTCACGTGAGCGTGCCGGGCTGGAGGTCCGCGACGTGCACCCGTCGCACTACGGCCGGATGTGCCCGATCGAAACCCCTGAGGGGCCCAACATCGGTCTGATCGGCTCGCTGTCGGTGTACGCGCGGGTCAACCCGTTCGGGTTCATCGAAACGCCGTACCGCAAGGTGGTCGACGGCGTGGTTAGCGACGAGATCGTGTACCTGACCGCCGACGAGGAGGACCGCCACGTGGTGGCACAGGCCAATTCGCCGATCGATGCGGACGGTCGCTTCGTCGAGCCGCGCGTGCTGGTCCGCCGCAAGGCGGGCGAGGTGGAGTACGTGCCCTCGTCTGAGGTGGACTACATGGACGTCTCGCCCCGCCAGATGGTGTCGGTGGCCACCGCGATGATTCCCTTCCTGGAGCACGACGACGCCAACCGTGCCCTCATGGGGGCAAACATGCAGCGCCAGGCGGTGCCGCTGGTCCGTAGCGAGGCCCCGCTGGTGGGCACCGGGATGGAGCTGCGCGCGGCGATCGACGCCGGCGACGTCGTCGTCGCCGAAGAAAGCGGCGTCATCGAGGAGGTGTCGGCCGACTACATCACTGTGATGCACGACAACGGCACCCGGCGTACCTACCGGATGCGCAAGTTTGCCCGGTCCAACCACGGCACTTGCGCCAACCAGTGCCCCATCGTGGACGCGGGCGACCGAGTCGAGGCCGGTCAGGTGATCGCCGACGGTCCCTGTACTGACGACGGCGAGATGGCGCTGGGCAAGAACCTGCTGGTGGCCATCATGCCGTGGGAGGGCCACAACTACGAGGACGCGATCATCCTGTCCAACCGCCTGGTCGAAGAGGACGTGCTCACCTCGATCCACATCGAGGAGCATGAGATCGATGCTCGCGACACCAAGCTGGGTGCGGAGGAGATCACCCGCGACATCCCGAACATCTCCGACGAGGTGCTCGCCGACCTGGATGAGCGGGGCATCGTGCGCATCGGTGCCGAGGTTCGCGACGGGGACATCCTGGTCGGCAAGGTCACCCCGAAGGGTGAGACCGAGCTGACGCCGGAGGAGCGGCTGCTGCGTGCCATCTTCGGTGAGAAGGCCCGCGAGGTGCGCGACACTTCGCTGAAGGTGCCGCACGGCGAATCCGGCAAGGTGATCGGCATTCGGGTGTTTTCCCGCGAGGACGAGGACGAGTTGCCGGCCGGTGTCAACGAGCTGGTGCGTGTGTATGTGGCTCAGAAACGCAAGATCTCCGACGGTGACAAGCTGGCCGGCCGGCACGGCAACAAGGGCGTGATCGGCAAGATCCTGCCGGTTGAGGACATGCCGTTCCTTGCCGACGGCACCCCGGTGGACATTATTTTGAACACCCACGGCGTGCCGCGACGGATGAACATCGGCCAGATTTTGGAGACCCACCTGGGTTGGTGTGCCCACAGCGGCTGGAAGGTCGACGCCGCCAAGGGGGTTCCGGACTGGGCCGCCAGGCTGCCCGACGAACTGCTCGAGGCGCAGCCGAACGCCATTGTGTCGACGCCGGTGTTCGACGGCGCCCAGGAGGCCGAGCTGCAGGGCCTGTTGTCGTGCACGCTGCCCAACCGCGACGGTGACGTGCTGGTCGACGCCGACGGCAAGGCCATGCTCTTCGACGGGCGCAGCGGCGAGCCGTTCCCGTACCCGGTCACGGTTGGCTACATGTACATCATGAAGCTGCACCACCTGGTGGACGACAAGATCCACGCCCGCTCCACCGGGCCGTACTCGATGATCACCCAGCAGCCGCTGGGCGGTAAGGCGCAGTTCGGTGGCCAGCGGTTCGGGGAGATGGAGTGCTGGGCCATGCAGGCCTACGGTGCTGCCTACACCCTGCAGGAGCTGTTGACCATCAAGTCCGATGACACCGTCGGCCGCGTCAAGGTGTACGAGGCGATCGTCAAGGGTGAGAACATCCCGGAGCCGGGCATCCCCGAGTCGTTCAAGGTGCTGCTCAAAGAACTGCAGTCGCTGTGCCTCAACGTCGAGGTGCTATCGAGTGACGGTGCGGCGATCGAACTGCGCGAAGGTGAGGACGAGGACCTGGAGCGGGCCGCGGCCAACCTGGGAATCAATCTGTCCCGCAACGAATCCGCAAGTGTCGAGGATCTTGCGTAA MLEGCILADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA MADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA 100.51 gnl|BL_ORD_ID|2005|hsp_num:0 1237 -"NC_000962.3_689 # 759789 # 763325 # 1 # ID=1_689;partial=00;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.643" NC_000962.3_689 759789 763325 + Strict 2300 2394.77 mdtF 99.91 3000815 protein variant model D516G, H526T, L511R n/a rifamycin antibiotic "antibiotic target alteration; antibiotic target replacement" rifamycin-resistant beta-subunit of RNA polymerase (rpoB) GTGCTGGAAGGATGCATCTTGGCAGATTCCCGCCAGAGCAAAACAGCCGCTAGTCCTAGTCCGAGTCGCCCGCAAAGTTCCTCGAATAACTCCGTACCCGGAGCGCCAAACCGGGTCTCCTTCGCTAAGCTGCGCGAACCACTTGAGGTTCCGGGACTCCTTGACGTCCAGACCGATTCGTTCGAGTGGCTGATCGGTTCGCCGCGCTGGCGCGAATCCGCCGCCGAGCGGGGTGATGTCAACCCAGTGGGTGGCCTGGAAGAGGTGCTCTACGAGCTGTCTCCGATCGAGGACTTCTCCGGGTCGATGTCGTTGTCGTTCTCTGACCCTCGTTTCGACGATGTCAAGGCACCCGTCGACGAGTGCAAAGACAAGGACATGACGTACGCGGCTCCACTGTTCGTCACCGCCGAGTTCATCAACAACAACACCGGTGAGATCAAGAGTCAGACGGTGTTCATGGGTGACTTCCCGATGATGACCGAGAAGGGCACGTTCATCATCAACGGGACCGAGCGTGTGGTGGTCAGCCAGCTGGTGCGGTCGCCCGGGGTGTACTTCGACGAGACCATTGACAAGTCCACCGACAAGACGCTGCACAGCGTCAAGGTGATCCCGAGCCGCGGCGCGTGGCTCGAGTTTGACGTCGACAAGCGCGACACCGTCGGCGTGCGCATCGACCGCAAACGCCGGCAACCGGTCACCGTGCTGCTCAAGGCGCTGGGCTGGACCAGCGAGCAGATTGTCGAGCGGTTCGGGTTCTCCGAGATCATGCGATCGACGCTGGAGAAGGACAACACCGTCGGCACCGACGAGGCGCTGTTGGACATCTACCGCAAGCTGCGTCCGGGCGAGCCCCCGACCAAAGAGTCAGCGCAGACGCTGTTGGAAAACTTGTTCTTCAAGGAGAAGCGCTACGACCTGGCCCGCGTCGGTCGCTATAAGGTCAACAAGAAGCTCGGGCTGCATGTCGGCGAGCCCATCACGTCGTCGACGCTGACCGAAGAAGACGTCGTGGCCACCATCGAATATCTGGTCCGCTTGCACGAGGGTCAGACCACGATGACCGTTCCGGGCGGCGTCGAGGTGCCGGTGGAAACCGACGACATCGACCACTTCGGCAACCGCCGCCTGCGTACGGTCGGCGAGCTGATCCAAAACCAGATCCGGGTCGGCATGTCGCGGATGGAGCGGGTGGTCCGGGAGCGGATGACCACCCAGGACGTGGAGGCGATCACACCGCAGACGTTGATCAACATCCGGCCGGTGGTCGCCGCGATCAAGGAGTTCTTCGGCACCAGCCAGCTGAGCCAATTCATGGACCAGAACAACCCGCTGTCGGGGTTGACCCACAAGCGCCGACTGTCGGCGCTGGGGCCCGGCGGTCTGTCACGTGAGCGTGCCGGGCTGGAGGTCCGCGACGTGCACCCGTCGCACTACGGCCGGATGTGCCCGATCGAAACCCCTGAGGGGCCCAACATCGGTCTGATCGGCTCGCTGTCGGTGTACGCGCGGGTCAACCCGTTCGGGTTCATCGAAACGCCGTACCGCAAGGTGGTCGACGGCGTGGTTAGCGACGAGATCGTGTACCTGACCGCCGACGAGGAGGACCGCCACGTGGTGGCACAGGCCAATTCGCCGATCGATGCGGACGGTCGCTTCGTCGAGCCGCGCGTGCTGGTCCGCCGCAAGGCGGGCGAGGTGGAGTACGTGCCCTCGTCTGAGGTGGACTACATGGACGTCTCGCCCCGCCAGATGGTGTCGGTGGCCACCGCGATGATTCCCTTCCTGGAGCACGACGACGCCAACCGTGCCCTCATGGGGGCAAACATGCAGCGCCAGGCGGTGCCGCTGGTCCGTAGCGAGGCCCCGCTGGTGGGCACCGGGATGGAGCTGCGCGCGGCGATCGACGCCGGCGACGTCGTCGTCGCCGAAGAAAGCGGCGTCATCGAGGAGGTGTCGGCCGACTACATCACTGTGATGCACGACAACGGCACCCGGCGTACCTACCGGATGCGCAAGTTTGCCCGGTCCAACCACGGCACTTGCGCCAACCAGTGCCCCATCGTGGACGCGGGCGACCGAGTCGAGGCCGGTCAGGTGATCGCCGACGGTCCCTGTACTGACGACGGCGAGATGGCGCTGGGCAAGAACCTGCTGGTGGCCATCATGCCGTGGGAGGGCCACAACTACGAGGACGCGATCATCCTGTCCAACCGCCTGGTCGAAGAGGACGTGCTCACCTCGATCCACATCGAGGAGCATGAGATCGATGCTCGCGACACCAAGCTGGGTGCGGAGGAGATCACCCGCGACATCCCGAACATCTCCGACGAGGTGCTCGCCGACCTGGATGAGCGGGGCATCGTGCGCATCGGTGCCGAGGTTCGCGACGGGGACATCCTGGTCGGCAAGGTCACCCCGAAGGGTGAGACCGAGCTGACGCCGGAGGAGCGGCTGCTGCGTGCCATCTTCGGTGAGAAGGCCCGCGAGGTGCGCGACACTTCGCTGAAGGTGCCGCACGGCGAATCCGGCAAGGTGATCGGCATTCGGGTGTTTTCCCGCGAGGACGAGGACGAGTTGCCGGCCGGTGTCAACGAGCTGGTGCGTGTGTATGTGGCTCAGAAACGCAAGATCTCCGACGGTGACAAGCTGGCCGGCCGGCACGGCAACAAGGGCGTGATCGGCAAGATCCTGCCGGTTGAGGACATGCCGTTCCTTGCCGACGGCACCCCGGTGGACATTATTTTGAACACCCACGGCGTGCCGCGACGGATGAACATCGGCCAGATTTTGGAGACCCACCTGGGTTGGTGTGCCCACAGCGGCTGGAAGGTCGACGCCGCCAAGGGGGTTCCGGACTGGGCCGCCAGGCTGCCCGACGAACTGCTCGAGGCGCAGCCGAACGCCATTGTGTCGACGCCGGTGTTCGACGGCGCCCAGGAGGCCGAGCTGCAGGGCCTGTTGTCGTGCACGCTGCCCAACCGCGACGGTGACGTGCTGGTCGACGCCGACGGCAAGGCCATGCTCTTCGACGGGCGCAGCGGCGAGCCGTTCCCGTACCCGGTCACGGTTGGCTACATGTACATCATGAAGCTGCACCACCTGGTGGACGACAAGATCCACGCCCGCTCCACCGGGCCGTACTCGATGATCACCCAGCAGCCGCTGGGCGGTAAGGCGCAGTTCGGTGGCCAGCGGTTCGGGGAGATGGAGTGCTGGGCCATGCAGGCCTACGGTGCTGCCTACACCCTGCAGGAGCTGTTGACCATCAAGTCCGATGACACCGTCGGCCGCGTCAAGGTGTACGAGGCGATCGTCAAGGGTGAGAACATCCCGGAGCCGGGCATCCCCGAGTCGTTCAAGGTGCTGCTCAAAGAACTGCAGTCGCTGTGCCTCAACGTCGAGGTGCTATCGAGTGACGGTGCGGCGATCGAACTGCGCGAAGGTGAGGACGAGGACCTGGAGCGGGCCGCGGCCAACCTGGGAATCAATCTGTCCCGCAACGAATCCGCAAGTGTCGAGGATCTTGCGTAA MLEGCILADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA MADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA 100.51 gnl|BL_ORD_ID|2005|hsp_num:0 1237 +"NC_000962.3_689 # 759789 # 763325 # 1 # ID=1_689;partial=00;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.643" NC_000962.3_689 759789 763325 + Strict 2300 2394.77 mgrA 99.91 3000815 protein variant model D516G, H526T, L511R n/a rifamycin antibiotic "antibiotic target alteration; antibiotic target replacement" rifamycin-resistant beta-subunit of RNA polymerase (rpoB) GTGCTGGAAGGATGCATCTTGGCAGATTCCCGCCAGAGCAAAACAGCCGCTAGTCCTAGTCCGAGTCGCCCGCAAAGTTCCTCGAATAACTCCGTACCCGGAGCGCCAAACCGGGTCTCCTTCGCTAAGCTGCGCGAACCACTTGAGGTTCCGGGACTCCTTGACGTCCAGACCGATTCGTTCGAGTGGCTGATCGGTTCGCCGCGCTGGCGCGAATCCGCCGCCGAGCGGGGTGATGTCAACCCAGTGGGTGGCCTGGAAGAGGTGCTCTACGAGCTGTCTCCGATCGAGGACTTCTCCGGGTCGATGTCGTTGTCGTTCTCTGACCCTCGTTTCGACGATGTCAAGGCACCCGTCGACGAGTGCAAAGACAAGGACATGACGTACGCGGCTCCACTGTTCGTCACCGCCGAGTTCATCAACAACAACACCGGTGAGATCAAGAGTCAGACGGTGTTCATGGGTGACTTCCCGATGATGACCGAGAAGGGCACGTTCATCATCAACGGGACCGAGCGTGTGGTGGTCAGCCAGCTGGTGCGGTCGCCCGGGGTGTACTTCGACGAGACCATTGACAAGTCCACCGACAAGACGCTGCACAGCGTCAAGGTGATCCCGAGCCGCGGCGCGTGGCTCGAGTTTGACGTCGACAAGCGCGACACCGTCGGCGTGCGCATCGACCGCAAACGCCGGCAACCGGTCACCGTGCTGCTCAAGGCGCTGGGCTGGACCAGCGAGCAGATTGTCGAGCGGTTCGGGTTCTCCGAGATCATGCGATCGACGCTGGAGAAGGACAACACCGTCGGCACCGACGAGGCGCTGTTGGACATCTACCGCAAGCTGCGTCCGGGCGAGCCCCCGACCAAAGAGTCAGCGCAGACGCTGTTGGAAAACTTGTTCTTCAAGGAGAAGCGCTACGACCTGGCCCGCGTCGGTCGCTATAAGGTCAACAAGAAGCTCGGGCTGCATGTCGGCGAGCCCATCACGTCGTCGACGCTGACCGAAGAAGACGTCGTGGCCACCATCGAATATCTGGTCCGCTTGCACGAGGGTCAGACCACGATGACCGTTCCGGGCGGCGTCGAGGTGCCGGTGGAAACCGACGACATCGACCACTTCGGCAACCGCCGCCTGCGTACGGTCGGCGAGCTGATCCAAAACCAGATCCGGGTCGGCATGTCGCGGATGGAGCGGGTGGTCCGGGAGCGGATGACCACCCAGGACGTGGAGGCGATCACACCGCAGACGTTGATCAACATCCGGCCGGTGGTCGCCGCGATCAAGGAGTTCTTCGGCACCAGCCAGCTGAGCCAATTCATGGACCAGAACAACCCGCTGTCGGGGTTGACCCACAAGCGCCGACTGTCGGCGCTGGGGCCCGGCGGTCTGTCACGTGAGCGTGCCGGGCTGGAGGTCCGCGACGTGCACCCGTCGCACTACGGCCGGATGTGCCCGATCGAAACCCCTGAGGGGCCCAACATCGGTCTGATCGGCTCGCTGTCGGTGTACGCGCGGGTCAACCCGTTCGGGTTCATCGAAACGCCGTACCGCAAGGTGGTCGACGGCGTGGTTAGCGACGAGATCGTGTACCTGACCGCCGACGAGGAGGACCGCCACGTGGTGGCACAGGCCAATTCGCCGATCGATGCGGACGGTCGCTTCGTCGAGCCGCGCGTGCTGGTCCGCCGCAAGGCGGGCGAGGTGGAGTACGTGCCCTCGTCTGAGGTGGACTACATGGACGTCTCGCCCCGCCAGATGGTGTCGGTGGCCACCGCGATGATTCCCTTCCTGGAGCACGACGACGCCAACCGTGCCCTCATGGGGGCAAACATGCAGCGCCAGGCGGTGCCGCTGGTCCGTAGCGAGGCCCCGCTGGTGGGCACCGGGATGGAGCTGCGCGCGGCGATCGACGCCGGCGACGTCGTCGTCGCCGAAGAAAGCGGCGTCATCGAGGAGGTGTCGGCCGACTACATCACTGTGATGCACGACAACGGCACCCGGCGTACCTACCGGATGCGCAAGTTTGCCCGGTCCAACCACGGCACTTGCGCCAACCAGTGCCCCATCGTGGACGCGGGCGACCGAGTCGAGGCCGGTCAGGTGATCGCCGACGGTCCCTGTACTGACGACGGCGAGATGGCGCTGGGCAAGAACCTGCTGGTGGCCATCATGCCGTGGGAGGGCCACAACTACGAGGACGCGATCATCCTGTCCAACCGCCTGGTCGAAGAGGACGTGCTCACCTCGATCCACATCGAGGAGCATGAGATCGATGCTCGCGACACCAAGCTGGGTGCGGAGGAGATCACCCGCGACATCCCGAACATCTCCGACGAGGTGCTCGCCGACCTGGATGAGCGGGGCATCGTGCGCATCGGTGCCGAGGTTCGCGACGGGGACATCCTGGTCGGCAAGGTCACCCCGAAGGGTGAGACCGAGCTGACGCCGGAGGAGCGGCTGCTGCGTGCCATCTTCGGTGAGAAGGCCCGCGAGGTGCGCGACACTTCGCTGAAGGTGCCGCACGGCGAATCCGGCAAGGTGATCGGCATTCGGGTGTTTTCCCGCGAGGACGAGGACGAGTTGCCGGCCGGTGTCAACGAGCTGGTGCGTGTGTATGTGGCTCAGAAACGCAAGATCTCCGACGGTGACAAGCTGGCCGGCCGGCACGGCAACAAGGGCGTGATCGGCAAGATCCTGCCGGTTGAGGACATGCCGTTCCTTGCCGACGGCACCCCGGTGGACATTATTTTGAACACCCACGGCGTGCCGCGACGGATGAACATCGGCCAGATTTTGGAGACCCACCTGGGTTGGTGTGCCCACAGCGGCTGGAAGGTCGACGCCGCCAAGGGGGTTCCGGACTGGGCCGCCAGGCTGCCCGACGAACTGCTCGAGGCGCAGCCGAACGCCATTGTGTCGACGCCGGTGTTCGACGGCGCCCAGGAGGCCGAGCTGCAGGGCCTGTTGTCGTGCACGCTGCCCAACCGCGACGGTGACGTGCTGGTCGACGCCGACGGCAAGGCCATGCTCTTCGACGGGCGCAGCGGCGAGCCGTTCCCGTACCCGGTCACGGTTGGCTACATGTACATCATGAAGCTGCACCACCTGGTGGACGACAAGATCCACGCCCGCTCCACCGGGCCGTACTCGATGATCACCCAGCAGCCGCTGGGCGGTAAGGCGCAGTTCGGTGGCCAGCGGTTCGGGGAGATGGAGTGCTGGGCCATGCAGGCCTACGGTGCTGCCTACACCCTGCAGGAGCTGTTGACCATCAAGTCCGATGACACCGTCGGCCGCGTCAAGGTGTACGAGGCGATCGTCAAGGGTGAGAACATCCCGGAGCCGGGCATCCCCGAGTCGTTCAAGGTGCTGCTCAAAGAACTGCAGTCGCTGTGCCTCAACGTCGAGGTGCTATCGAGTGACGGTGCGGCGATCGAACTGCGCGAAGGTGAGGACGAGGACCTGGAGCGGGCCGCGGCCAACCTGGGAATCAATCTGTCCCGCAACGAATCCGCAAGTGTCGAGGATCTTGCGTAA MLEGCILADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA MADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA 100.51 gnl|BL_ORD_ID|2005|hsp_num:0 1237 "NC_000962.3_689 # 759789 # 763325 # 1 # ID=1_689;partial=00;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.643" NC_000962.3_689 759789 763325 + Strict 2300 2394.77 OprN 99.91 3000805 protein variant model D516G, H526T, L511R n/a rifamycin antibiotic "antibiotic target alteration; antibiotic target replacement" rifamycin-resistant beta-subunit of RNA polymerase (rpoB) GTGCTGGAAGGATGCATCTTGGCAGATTCCCGCCAGAGCAAAACAGCCGCTAGTCCTAGTCCGAGTCGCCCGCAAAGTTCCTCGAATAACTCCGTACCCGGAGCGCCAAACCGGGTCTCCTTCGCTAAGCTGCGCGAACCACTTGAGGTTCCGGGACTCCTTGACGTCCAGACCGATTCGTTCGAGTGGCTGATCGGTTCGCCGCGCTGGCGCGAATCCGCCGCCGAGCGGGGTGATGTCAACCCAGTGGGTGGCCTGGAAGAGGTGCTCTACGAGCTGTCTCCGATCGAGGACTTCTCCGGGTCGATGTCGTTGTCGTTCTCTGACCCTCGTTTCGACGATGTCAAGGCACCCGTCGACGAGTGCAAAGACAAGGACATGACGTACGCGGCTCCACTGTTCGTCACCGCCGAGTTCATCAACAACAACACCGGTGAGATCAAGAGTCAGACGGTGTTCATGGGTGACTTCCCGATGATGACCGAGAAGGGCACGTTCATCATCAACGGGACCGAGCGTGTGGTGGTCAGCCAGCTGGTGCGGTCGCCCGGGGTGTACTTCGACGAGACCATTGACAAGTCCACCGACAAGACGCTGCACAGCGTCAAGGTGATCCCGAGCCGCGGCGCGTGGCTCGAGTTTGACGTCGACAAGCGCGACACCGTCGGCGTGCGCATCGACCGCAAACGCCGGCAACCGGTCACCGTGCTGCTCAAGGCGCTGGGCTGGACCAGCGAGCAGATTGTCGAGCGGTTCGGGTTCTCCGAGATCATGCGATCGACGCTGGAGAAGGACAACACCGTCGGCACCGACGAGGCGCTGTTGGACATCTACCGCAAGCTGCGTCCGGGCGAGCCCCCGACCAAAGAGTCAGCGCAGACGCTGTTGGAAAACTTGTTCTTCAAGGAGAAGCGCTACGACCTGGCCCGCGTCGGTCGCTATAAGGTCAACAAGAAGCTCGGGCTGCATGTCGGCGAGCCCATCACGTCGTCGACGCTGACCGAAGAAGACGTCGTGGCCACCATCGAATATCTGGTCCGCTTGCACGAGGGTCAGACCACGATGACCGTTCCGGGCGGCGTCGAGGTGCCGGTGGAAACCGACGACATCGACCACTTCGGCAACCGCCGCCTGCGTACGGTCGGCGAGCTGATCCAAAACCAGATCCGGGTCGGCATGTCGCGGATGGAGCGGGTGGTCCGGGAGCGGATGACCACCCAGGACGTGGAGGCGATCACACCGCAGACGTTGATCAACATCCGGCCGGTGGTCGCCGCGATCAAGGAGTTCTTCGGCACCAGCCAGCTGAGCCAATTCATGGACCAGAACAACCCGCTGTCGGGGTTGACCCACAAGCGCCGACTGTCGGCGCTGGGGCCCGGCGGTCTGTCACGTGAGCGTGCCGGGCTGGAGGTCCGCGACGTGCACCCGTCGCACTACGGCCGGATGTGCCCGATCGAAACCCCTGAGGGGCCCAACATCGGTCTGATCGGCTCGCTGTCGGTGTACGCGCGGGTCAACCCGTTCGGGTTCATCGAAACGCCGTACCGCAAGGTGGTCGACGGCGTGGTTAGCGACGAGATCGTGTACCTGACCGCCGACGAGGAGGACCGCCACGTGGTGGCACAGGCCAATTCGCCGATCGATGCGGACGGTCGCTTCGTCGAGCCGCGCGTGCTGGTCCGCCGCAAGGCGGGCGAGGTGGAGTACGTGCCCTCGTCTGAGGTGGACTACATGGACGTCTCGCCCCGCCAGATGGTGTCGGTGGCCACCGCGATGATTCCCTTCCTGGAGCACGACGACGCCAACCGTGCCCTCATGGGGGCAAACATGCAGCGCCAGGCGGTGCCGCTGGTCCGTAGCGAGGCCCCGCTGGTGGGCACCGGGATGGAGCTGCGCGCGGCGATCGACGCCGGCGACGTCGTCGTCGCCGAAGAAAGCGGCGTCATCGAGGAGGTGTCGGCCGACTACATCACTGTGATGCACGACAACGGCACCCGGCGTACCTACCGGATGCGCAAGTTTGCCCGGTCCAACCACGGCACTTGCGCCAACCAGTGCCCCATCGTGGACGCGGGCGACCGAGTCGAGGCCGGTCAGGTGATCGCCGACGGTCCCTGTACTGACGACGGCGAGATGGCGCTGGGCAAGAACCTGCTGGTGGCCATCATGCCGTGGGAGGGCCACAACTACGAGGACGCGATCATCCTGTCCAACCGCCTGGTCGAAGAGGACGTGCTCACCTCGATCCACATCGAGGAGCATGAGATCGATGCTCGCGACACCAAGCTGGGTGCGGAGGAGATCACCCGCGACATCCCGAACATCTCCGACGAGGTGCTCGCCGACCTGGATGAGCGGGGCATCGTGCGCATCGGTGCCGAGGTTCGCGACGGGGACATCCTGGTCGGCAAGGTCACCCCGAAGGGTGAGACCGAGCTGACGCCGGAGGAGCGGCTGCTGCGTGCCATCTTCGGTGAGAAGGCCCGCGAGGTGCGCGACACTTCGCTGAAGGTGCCGCACGGCGAATCCGGCAAGGTGATCGGCATTCGGGTGTTTTCCCGCGAGGACGAGGACGAGTTGCCGGCCGGTGTCAACGAGCTGGTGCGTGTGTATGTGGCTCAGAAACGCAAGATCTCCGACGGTGACAAGCTGGCCGGCCGGCACGGCAACAAGGGCGTGATCGGCAAGATCCTGCCGGTTGAGGACATGCCGTTCCTTGCCGACGGCACCCCGGTGGACATTATTTTGAACACCCACGGCGTGCCGCGACGGATGAACATCGGCCAGATTTTGGAGACCCACCTGGGTTGGTGTGCCCACAGCGGCTGGAAGGTCGACGCCGCCAAGGGGGTTCCGGACTGGGCCGCCAGGCTGCCCGACGAACTGCTCGAGGCGCAGCCGAACGCCATTGTGTCGACGCCGGTGTTCGACGGCGCCCAGGAGGCCGAGCTGCAGGGCCTGTTGTCGTGCACGCTGCCCAACCGCGACGGTGACGTGCTGGTCGACGCCGACGGCAAGGCCATGCTCTTCGACGGGCGCAGCGGCGAGCCGTTCCCGTACCCGGTCACGGTTGGCTACATGTACATCATGAAGCTGCACCACCTGGTGGACGACAAGATCCACGCCCGCTCCACCGGGCCGTACTCGATGATCACCCAGCAGCCGCTGGGCGGTAAGGCGCAGTTCGGTGGCCAGCGGTTCGGGGAGATGGAGTGCTGGGCCATGCAGGCCTACGGTGCTGCCTACACCCTGCAGGAGCTGTTGACCATCAAGTCCGATGACACCGTCGGCCGCGTCAAGGTGTACGAGGCGATCGTCAAGGGTGAGAACATCCCGGAGCCGGGCATCCCCGAGTCGTTCAAGGTGCTGCTCAAAGAACTGCAGTCGCTGTGCCTCAACGTCGAGGTGCTATCGAGTGACGGTGCGGCGATCGAACTGCGCGAAGGTGAGGACGAGGACCTGGAGCGGGCCGCGGCCAACCTGGGAATCAATCTGTCCCGCAACGAATCCGCAAGTGTCGAGGATCTTGCGTAA MLEGCILADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA MADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA 100.51 gnl|BL_ORD_ID|2005|hsp_num:0 1237 -"NC_000962.3_689 # 759789 # 763325 # 1 # ID=1_689;partial=00;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.643" NC_000962.3_689 759789 763325 + Strict 2300 2394.77 mepA 99.91 3000026 protein variant model D516G, H526T, L511R n/a rifamycin antibiotic "antibiotic target alteration; antibiotic target replacement" rifamycin-resistant beta-subunit of RNA polymerase (rpoB) GTGCTGGAAGGATGCATCTTGGCAGATTCCCGCCAGAGCAAAACAGCCGCTAGTCCTAGTCCGAGTCGCCCGCAAAGTTCCTCGAATAACTCCGTACCCGGAGCGCCAAACCGGGTCTCCTTCGCTAAGCTGCGCGAACCACTTGAGGTTCCGGGACTCCTTGACGTCCAGACCGATTCGTTCGAGTGGCTGATCGGTTCGCCGCGCTGGCGCGAATCCGCCGCCGAGCGGGGTGATGTCAACCCAGTGGGTGGCCTGGAAGAGGTGCTCTACGAGCTGTCTCCGATCGAGGACTTCTCCGGGTCGATGTCGTTGTCGTTCTCTGACCCTCGTTTCGACGATGTCAAGGCACCCGTCGACGAGTGCAAAGACAAGGACATGACGTACGCGGCTCCACTGTTCGTCACCGCCGAGTTCATCAACAACAACACCGGTGAGATCAAGAGTCAGACGGTGTTCATGGGTGACTTCCCGATGATGACCGAGAAGGGCACGTTCATCATCAACGGGACCGAGCGTGTGGTGGTCAGCCAGCTGGTGCGGTCGCCCGGGGTGTACTTCGACGAGACCATTGACAAGTCCACCGACAAGACGCTGCACAGCGTCAAGGTGATCCCGAGCCGCGGCGCGTGGCTCGAGTTTGACGTCGACAAGCGCGACACCGTCGGCGTGCGCATCGACCGCAAACGCCGGCAACCGGTCACCGTGCTGCTCAAGGCGCTGGGCTGGACCAGCGAGCAGATTGTCGAGCGGTTCGGGTTCTCCGAGATCATGCGATCGACGCTGGAGAAGGACAACACCGTCGGCACCGACGAGGCGCTGTTGGACATCTACCGCAAGCTGCGTCCGGGCGAGCCCCCGACCAAAGAGTCAGCGCAGACGCTGTTGGAAAACTTGTTCTTCAAGGAGAAGCGCTACGACCTGGCCCGCGTCGGTCGCTATAAGGTCAACAAGAAGCTCGGGCTGCATGTCGGCGAGCCCATCACGTCGTCGACGCTGACCGAAGAAGACGTCGTGGCCACCATCGAATATCTGGTCCGCTTGCACGAGGGTCAGACCACGATGACCGTTCCGGGCGGCGTCGAGGTGCCGGTGGAAACCGACGACATCGACCACTTCGGCAACCGCCGCCTGCGTACGGTCGGCGAGCTGATCCAAAACCAGATCCGGGTCGGCATGTCGCGGATGGAGCGGGTGGTCCGGGAGCGGATGACCACCCAGGACGTGGAGGCGATCACACCGCAGACGTTGATCAACATCCGGCCGGTGGTCGCCGCGATCAAGGAGTTCTTCGGCACCAGCCAGCTGAGCCAATTCATGGACCAGAACAACCCGCTGTCGGGGTTGACCCACAAGCGCCGACTGTCGGCGCTGGGGCCCGGCGGTCTGTCACGTGAGCGTGCCGGGCTGGAGGTCCGCGACGTGCACCCGTCGCACTACGGCCGGATGTGCCCGATCGAAACCCCTGAGGGGCCCAACATCGGTCTGATCGGCTCGCTGTCGGTGTACGCGCGGGTCAACCCGTTCGGGTTCATCGAAACGCCGTACCGCAAGGTGGTCGACGGCGTGGTTAGCGACGAGATCGTGTACCTGACCGCCGACGAGGAGGACCGCCACGTGGTGGCACAGGCCAATTCGCCGATCGATGCGGACGGTCGCTTCGTCGAGCCGCGCGTGCTGGTCCGCCGCAAGGCGGGCGAGGTGGAGTACGTGCCCTCGTCTGAGGTGGACTACATGGACGTCTCGCCCCGCCAGATGGTGTCGGTGGCCACCGCGATGATTCCCTTCCTGGAGCACGACGACGCCAACCGTGCCCTCATGGGGGCAAACATGCAGCGCCAGGCGGTGCCGCTGGTCCGTAGCGAGGCCCCGCTGGTGGGCACCGGGATGGAGCTGCGCGCGGCGATCGACGCCGGCGACGTCGTCGTCGCCGAAGAAAGCGGCGTCATCGAGGAGGTGTCGGCCGACTACATCACTGTGATGCACGACAACGGCACCCGGCGTACCTACCGGATGCGCAAGTTTGCCCGGTCCAACCACGGCACTTGCGCCAACCAGTGCCCCATCGTGGACGCGGGCGACCGAGTCGAGGCCGGTCAGGTGATCGCCGACGGTCCCTGTACTGACGACGGCGAGATGGCGCTGGGCAAGAACCTGCTGGTGGCCATCATGCCGTGGGAGGGCCACAACTACGAGGACGCGATCATCCTGTCCAACCGCCTGGTCGAAGAGGACGTGCTCACCTCGATCCACATCGAGGAGCATGAGATCGATGCTCGCGACACCAAGCTGGGTGCGGAGGAGATCACCCGCGACATCCCGAACATCTCCGACGAGGTGCTCGCCGACCTGGATGAGCGGGGCATCGTGCGCATCGGTGCCGAGGTTCGCGACGGGGACATCCTGGTCGGCAAGGTCACCCCGAAGGGTGAGACCGAGCTGACGCCGGAGGAGCGGCTGCTGCGTGCCATCTTCGGTGAGAAGGCCCGCGAGGTGCGCGACACTTCGCTGAAGGTGCCGCACGGCGAATCCGGCAAGGTGATCGGCATTCGGGTGTTTTCCCGCGAGGACGAGGACGAGTTGCCGGCCGGTGTCAACGAGCTGGTGCGTGTGTATGTGGCTCAGAAACGCAAGATCTCCGACGGTGACAAGCTGGCCGGCCGGCACGGCAACAAGGGCGTGATCGGCAAGATCCTGCCGGTTGAGGACATGCCGTTCCTTGCCGACGGCACCCCGGTGGACATTATTTTGAACACCCACGGCGTGCCGCGACGGATGAACATCGGCCAGATTTTGGAGACCCACCTGGGTTGGTGTGCCCACAGCGGCTGGAAGGTCGACGCCGCCAAGGGGGTTCCGGACTGGGCCGCCAGGCTGCCCGACGAACTGCTCGAGGCGCAGCCGAACGCCATTGTGTCGACGCCGGTGTTCGACGGCGCCCAGGAGGCCGAGCTGCAGGGCCTGTTGTCGTGCACGCTGCCCAACCGCGACGGTGACGTGCTGGTCGACGCCGACGGCAAGGCCATGCTCTTCGACGGGCGCAGCGGCGAGCCGTTCCCGTACCCGGTCACGGTTGGCTACATGTACATCATGAAGCTGCACCACCTGGTGGACGACAAGATCCACGCCCGCTCCACCGGGCCGTACTCGATGATCACCCAGCAGCCGCTGGGCGGTAAGGCGCAGTTCGGTGGCCAGCGGTTCGGGGAGATGGAGTGCTGGGCCATGCAGGCCTACGGTGCTGCCTACACCCTGCAGGAGCTGTTGACCATCAAGTCCGATGACACCGTCGGCCGCGTCAAGGTGTACGAGGCGATCGTCAAGGGTGAGAACATCCCGGAGCCGGGCATCCCCGAGTCGTTCAAGGTGCTGCTCAAAGAACTGCAGTCGCTGTGCCTCAACGTCGAGGTGCTATCGAGTGACGGTGCGGCGATCGAACTGCGCGAAGGTGAGGACGAGGACCTGGAGCGGGCCGCGGCCAACCTGGGAATCAATCTGTCCCGCAACGAATCCGCAAGTGTCGAGGATCTTGCGTAA MLEGCILADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA MADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA 100.51 gnl|BL_ORD_ID|2005|hsp_num:0 1237 +"NC_000962.3_689 # 759789 # 763325 # 1 # ID=1_689;partial=00;start_type=GTG;rbs_motif=None;rbs_spacer=None;gc_cont=0.643" NC_000962.3_689 759789 763325 + Strict 2300 2394.77 mepA 99.91 3000026 protein variant model D516G, H526T, L511R n/a rifamycin antibiotic "antibiotic target alteration; antibiotic target replacement" rifamycin-resistant beta-subunit of RNA polymerase (rpoB) GTGCTGGAAGGATGCATCTTGGCAGATTCCCGCCAGAGCAAAACAGCCGCTAGTCCTAGTCCGAGTCGCCCGCAAAGTTCCTCGAATAACTCCGTACCCGGAGCGCCAAACCGGGTCTCCTTCGCTAAGCTGCGCGAACCACTTGAGGTTCCGGGACTCCTTGACGTCCAGACCGATTCGTTCGAGTGGCTGATCGGTTCGCCGCGCTGGCGCGAATCCGCCGCCGAGCGGGGTGATGTCAACCCAGTGGGTGGCCTGGAAGAGGTGCTCTACGAGCTGTCTCCGATCGAGGACTTCTCCGGGTCGATGTCGTTGTCGTTCTCTGACCCTCGTTTCGACGATGTCAAGGCACCCGTCGACGAGTGCAAAGACAAGGACATGACGTACGCGGCTCCACTGTTCGTCACCGCCGAGTTCATCAACAACAACACCGGTGAGATCAAGAGTCAGACGGTGTTCATGGGTGACTTCCCGATGATGACCGAGAAGGGCACGTTCATCATCAACGGGACCGAGCGTGTGGTGGTCAGCCAGCTGGTGCGGTCGCCCGGGGTGTACTTCGACGAGACCATTGACAAGTCCACCGACAAGACGCTGCACAGCGTCAAGGTGATCCCGAGCCGCGGCGCGTGGCTCGAGTTTGACGTCGACAAGCGCGACACCGTCGGCGTGCGCATCGACCGCAAACGCCGGCAACCGGTCACCGTGCTGCTCAAGGCGCTGGGCTGGACCAGCGAGCAGATTGTCGAGCGGTTCGGGTTCTCCGAGATCATGCGATCGACGCTGGAGAAGGACAACACCGTCGGCACCGACGAGGCGCTGTTGGACATCTACCGCAAGCTGCGTCCGGGCGAGCCCCCGACCAAAGAGTCAGCGCAGACGCTGTTGGAAAACTTGTTCTTCAAGGAGAAGCGCTACGACCTGGCCCGCGTCGGTCGCTATAAGGTCAACAAGAAGCTCGGGCTGCATGTCGGCGAGCCCATCACGTCGTCGACGCTGACCGAAGAAGACGTCGTGGCCACCATCGAATATCTGGTCCGCTTGCACGAGGGTCAGACCACGATGACCGTTCCGGGCGGCGTCGAGGTGCCGGTGGAAACCGACGACATCGACCACTTCGGCAACCGCCGCCTGCGTACGGTCGGCGAGCTGATCCAAAACCAGATCCGGGTCGGCATGTCGCGGATGGAGCGGGTGGTCCGGGAGCGGATGACCACCCAGGACGTGGAGGCGATCACACCGCAGACGTTGATCAACATCCGGCCGGTGGTCGCCGCGATCAAGGAGTTCTTCGGCACCAGCCAGCTGAGCCAATTCATGGACCAGAACAACCCGCTGTCGGGGTTGACCCACAAGCGCCGACTGTCGGCGCTGGGGCCCGGCGGTCTGTCACGTGAGCGTGCCGGGCTGGAGGTCCGCGACGTGCACCCGTCGCACTACGGCCGGATGTGCCCGATCGAAACCCCTGAGGGGCCCAACATCGGTCTGATCGGCTCGCTGTCGGTGTACGCGCGGGTCAACCCGTTCGGGTTCATCGAAACGCCGTACCGCAAGGTGGTCGACGGCGTGGTTAGCGACGAGATCGTGTACCTGACCGCCGACGAGGAGGACCGCCACGTGGTGGCACAGGCCAATTCGCCGATCGATGCGGACGGTCGCTTCGTCGAGCCGCGCGTGCTGGTCCGCCGCAAGGCGGGCGAGGTGGAGTACGTGCCCTCGTCTGAGGTGGACTACATGGACGTCTCGCCCCGCCAGATGGTGTCGGTGGCCACCGCGATGATTCCCTTCCTGGAGCACGACGACGCCAACCGTGCCCTCATGGGGGCAAACATGCAGCGCCAGGCGGTGCCGCTGGTCCGTAGCGAGGCCCCGCTGGTGGGCACCGGGATGGAGCTGCGCGCGGCGATCGACGCCGGCGACGTCGTCGTCGCCGAAGAAAGCGGCGTCATCGAGGAGGTGTCGGCCGACTACATCACTGTGATGCACGACAACGGCACCCGGCGTACCTACCGGATGCGCAAGTTTGCCCGGTCCAACCACGGCACTTGCGCCAACCAGTGCCCCATCGTGGACGCGGGCGACCGAGTCGAGGCCGGTCAGGTGATCGCCGACGGTCCCTGTACTGACGACGGCGAGATGGCGCTGGGCAAGAACCTGCTGGTGGCCATCATGCCGTGGGAGGGCCACAACTACGAGGACGCGATCATCCTGTCCAACCGCCTGGTCGAAGAGGACGTGCTCACCTCGATCCACATCGAGGAGCATGAGATCGATGCTCGCGACACCAAGCTGGGTGCGGAGGAGATCACCCGCGACATCCCGAACATCTCCGACGAGGTGCTCGCCGACCTGGATGAGCGGGGCATCGTGCGCATCGGTGCCGAGGTTCGCGACGGGGACATCCTGGTCGGCAAGGTCACCCCGAAGGGTGAGACCGAGCTGACGCCGGAGGAGCGGCTGCTGCGTGCCATCTTCGGTGAGAAGGCCCGCGAGGTGCGCGACACTTCGCTGAAGGTGCCGCACGGCGAATCCGGCAAGGTGATCGGCATTCGGGTGTTTTCCCGCGAGGACGAGGACGAGTTGCCGGCCGGTGTCAACGAGCTGGTGCGTGTGTATGTGGCTCAGAAACGCAAGATCTCCGACGGTGACAAGCTGGCCGGCCGGCACGGCAACAAGGGCGTGATCGGCAAGATCCTGCCGGTTGAGGACATGCCGTTCCTTGCCGACGGCACCCCGGTGGACATTATTTTGAACACCCACGGCGTGCCGCGACGGATGAACATCGGCCAGATTTTGGAGACCCACCTGGGTTGGTGTGCCCACAGCGGCTGGAAGGTCGACGCCGCCAAGGGGGTTCCGGACTGGGCCGCCAGGCTGCCCGACGAACTGCTCGAGGCGCAGCCGAACGCCATTGTGTCGACGCCGGTGTTCGACGGCGCCCAGGAGGCCGAGCTGCAGGGCCTGTTGTCGTGCACGCTGCCCAACCGCGACGGTGACGTGCTGGTCGACGCCGACGGCAAGGCCATGCTCTTCGACGGGCGCAGCGGCGAGCCGTTCCCGTACCCGGTCACGGTTGGCTACATGTACATCATGAAGCTGCACCACCTGGTGGACGACAAGATCCACGCCCGCTCCACCGGGCCGTACTCGATGATCACCCAGCAGCCGCTGGGCGGTAAGGCGCAGTTCGGTGGCCAGCGGTTCGGGGAGATGGAGTGCTGGGCCATGCAGGCCTACGGTGCTGCCTACACCCTGCAGGAGCTGTTGACCATCAAGTCCGATGACACCGTCGGCCGCGTCAAGGTGTACGAGGCGATCGTCAAGGGTGAGAACATCCCGGAGCCGGGCATCCCCGAGTCGTTCAAGGTGCTGCTCAAAGAACTGCAGTCGCTGTGCCTCAACGTCGAGGTGCTATCGAGTGACGGTGCGGCGATCGAACTGCGCGAAGGTGAGGACGAGGACCTGGAGCGGGCCGCGGCCAACCTGGGAATCAATCTGTCCCGCAACGAATCCGCAAGTGTCGAGGATCTTGCGTAA MLEGCILADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA MADSRQSKTAASPSPSRPQSSSNNSVPGAPNRVSFAKLREPLEVPGLLDVQTDSFEWLIGSPRWRESAAERGDVNPVGGLEEVLYELSPIEDFSGSMSLSFSDPRFDDVKAPVDECKDKDMTYAAPLFVTAEFINNNTGEIKSQTVFMGDFPMMTEKGTFIINGTERVVVSQLVRSPGVYFDETIDKSTDKTLHSVKVIPSRGAWLEFDVDKRDTVGVRIDRKRRQPVTVLLKALGWTSEQIVERFGFSEIMRSTLEKDNTVGTDEALLDIYRKLRPGEPPTKESAQTLLENLFFKEKRYDLARVGRYKVNKKLGLHVGEPITSSTLTEEDVVATIEYLVRLHEGQTTMTVPGGVEVPVETDDIDHFGNRRLRTVGELIQNQIRVGMSRMERVVRERMTTQDVEAITPQTLINIRPVVAAIKEFFGTSQLSQFMDQNNPLSGLTHKRRLSALGPGGLSRERAGLEVRDVHPSHYGRMCPIETPEGPNIGLIGSLSVYARVNPFGFIETPYRKVVDGVVSDEIVYLTADEEDRHVVAQANSPIDADGRFVEPRVLVRRKAGEVEYVPSSEVDYMDVSPRQMVSVATAMIPFLEHDDANRALMGANMQRQAVPLVRSEAPLVGTGMELRAAIDAGDVVVAEESGVIEEVSADYITVMHDNGTRRTYRMRKFARSNHGTCANQCPIVDAGDRVEAGQVIADGPCTDDGEMALGKNLLVAIMPWEGHNYEDAIILSNRLVEEDVLTSIHIEEHEIDARDTKLGAEEITRDIPNISDEVLADLDERGIVRIGAEVRDGDILVGKVTPKGETELTPEERLLRAIFGEKAREVRDTSLKVPHGESGKVIGIRVFSREDEDELPAGVNELVRVYVAQKRKISDGDKLAGRHGNKGVIGKILPVEDMPFLADGTPVDIILNTHGVPRRMNIGQILETHLGWCAHSGWKVDAAKGVPDWAARLPDELLEAQPNAIVSTPVFDGAQEAELQGLLSCTLPNRDGDVLVDADGKAMLFDGRSGEPFPYPVTVGYMYIMKLHHLVDDKIHARSTGPYSMITQQPLGGKAQFGGQRFGEMECWAMQAYGAAYTLQELLTIKSDDTVGRVKVYEAIVKGENIPEPGIPESFKVLLKELQSLCLNVEVLSSDGAAIELREGEDEDLERAAANLGINLSRNESASVEDLA 100.51 gnl|BL_ORD_ID|2005|hsp_num:0 1237 diff --git a/q2_amr/card/tests/test_utils.py b/q2_amr/card/tests/test_utils.py index 3e833cb..e6999ce 100644 --- a/q2_amr/card/tests/test_utils.py +++ b/q2_amr/card/tests/test_utils.py @@ -15,31 +15,27 @@ class TestAnnotateReadsCARD(TestPluginBase): @classmethod def setUpClass(cls): - cls.allele_count_df = pd.DataFrame( - { - "Reference Sequence": [ + cls.count_df_list = [] + for colname, values in zip( + ["Reference Sequence", "ARO Term", "Best_Hit_ARO"], + [ + [ "ARO:3000796|ID:121|Name:mdtF|NCBI:U00096.1", "ARO:3000815|ID:154|Name:mgrA|NCBI:BA000018.3", "ARO:3000805|ID:172|Name:OprN|NCBI:AE004091.2", "ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1", ], - "sample1": ["1", "1", "1", "1"], - } - ) - - cls.gene_count_df = pd.DataFrame( - { - "ARO Term": ["mdtF", "mgrA", "OprN", "mepA"], - "sample1": ["1", "1", "1", "1"], - } - ) - - cls.mag_count_df = pd.DataFrame( - { - "Best_Hit_ARO": ["mdtF", "OprN", "mepA"], - "sample1/bin1": ["2", "1", "1"], - } - ) + ["mdtF", "mgrA", "OprN", "mepA"], + ["mdtF", "mgrA", "OprN", "mepA"], + ], + ): + df = pd.DataFrame( + { + colname: values, + "sample1": ["1", "1", "1", "1"], + } + ) + cls.count_df_list.append(df) cls.frequency_table = pd.DataFrame( { @@ -138,8 +134,8 @@ def test_read_in_txt_mags(self): # Test read_in_txt with output data from annotate_mags_card self.read_in_txt_test_body( filename="output.mags.txt", - samp_bin_name="sample1/bin1", - exp=self.mag_count_df, + samp_bin_name="sample1", + exp=self.count_df_list[2], data_type="mags", ) @@ -148,7 +144,7 @@ def test_read_in_txt_reads_allele(self): self.read_in_txt_test_body( filename="output.allele_mapping_data.txt", samp_bin_name="sample1", - exp=self.allele_count_df, + exp=self.count_df_list[0], data_type="reads", map_type="allele", ) @@ -158,7 +154,7 @@ def test_read_in_txt_reads_gene(self): self.read_in_txt_test_body( filename="output.gene_mapping_data.txt", samp_bin_name="sample1", - exp=self.gene_count_df, + exp=self.count_df_list[1], data_type="reads", map_type="gene", ) @@ -174,7 +170,7 @@ def read_in_txt_test_body( def test_create_count_table(self): # Create list of dataframes to be used by create_count_table - df_list = [self.gene_count_df.copy(), self.gene_count_df.copy()] + df_list = [self.count_df_list[1].copy(), self.count_df_list[1].copy()] df_list[1].iloc[0, 0] = "mdtE" df_list[1].rename(columns={"sample1": "sample2"}, inplace=True) diff --git a/q2_amr/card/utils.py b/q2_amr/card/utils.py index 69c600c..5b9e526 100644 --- a/q2_amr/card/utils.py +++ b/q2_amr/card/utils.py @@ -107,10 +107,8 @@ def read_in_txt(path: str, samp_bin_name: str, data_type: str, map_type=None): df = df[[colname, "All Mapped Reads"]] df.rename(columns={"All Mapped Reads": samp_bin_name}, inplace=True) else: - df = df["Best_Hit_ARO"].value_counts().reset_index() - - # Rename the columns - df.columns = ["Best_Hit_ARO", samp_bin_name] + df = df[["Best_Hit_ARO"]] + df[samp_bin_name] = 1 df = df.astype(str) return df diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py index 1bdaafe..feb15f9 100644 --- a/q2_amr/plugin_setup.py +++ b/q2_amr/plugin_setup.py @@ -7,7 +7,7 @@ # ---------------------------------------------------------------------------- import importlib -from q2_types.feature_table import FeatureTable, Frequency +from q2_types.feature_table import FeatureTable, Frequency, PresenceAbsence from q2_types.per_sample_sequences import ( PairedEndSequencesWithQuality, SequencesWithQuality, @@ -42,15 +42,12 @@ CARDKmerTXTFormat, CARDWildcardIndexFormat, GapDNAFASTAFormat, - GeneLengthDirectoryFormat, - GeneLengthFormat, ) from q2_amr.types._type import ( CARDAlleleAnnotation, CARDAnnotation, CARDGeneAnnotation, CARDKmerDatabase, - GeneLength, ) citations = Citations.load("citations.bib", package="q2_amr") @@ -98,7 +95,7 @@ }, outputs=[ ("amr_annotations", SampleData[CARDAnnotation]), - ("feature_table", FeatureTable[Frequency]), + ("feature_table", FeatureTable[PresenceAbsence]), ], input_descriptions={ "mag": "MAGs to be annotated with CARD.", @@ -115,7 +112,7 @@ }, output_descriptions={ "amr_annotations": "AMR annotation as .txt and .json file.", - "feature_table": "Frequency table of ARGs in all samples.", + "feature_table": "Presence and absence table of ARGs in all samples.", }, name="Annotate MAGs with antimicrobial resistance genes from CARD.", description="Annotate MAGs with antimicrobial resistance genes from CARD.", @@ -280,9 +277,6 @@ plugin.register_semantic_type_to_format( SampleData[CARDGeneAnnotation], artifact_format=CARDGeneAnnotationDirectoryFormat ) -plugin.register_semantic_type_to_format( - GeneLength, artifact_format=GeneLengthDirectoryFormat -) plugin.register_formats( CARDKmerDatabaseDirectoryFormat, @@ -300,8 +294,6 @@ CARDAnnotationStatsFormat, CARDAlleleAnnotationDirectoryFormat, CARDGeneAnnotationDirectoryFormat, - GeneLengthFormat, - GeneLengthDirectoryFormat, ) importlib.import_module("q2_amr.types._transformer") diff --git a/q2_amr/types/__init__.py b/q2_amr/types/__init__.py index 94f3f12..7f5d25d 100644 --- a/q2_amr/types/__init__.py +++ b/q2_amr/types/__init__.py @@ -22,8 +22,6 @@ CARDKmerTXTFormat, CARDWildcardIndexFormat, GapDNAFASTAFormat, - GeneLengthDirectoryFormat, - GeneLengthFormat, ) from ._type import ( CARDAlleleAnnotation, @@ -54,6 +52,4 @@ "GapDNAFASTAFormat", "CARDWildcardIndexFormat", "CARDKmerDatabase", - "GeneLengthDirectoryFormat", - "GeneLengthFormat", ] diff --git a/q2_amr/types/_format.py b/q2_amr/types/_format.py index 7ee2eca..de8c44a 100644 --- a/q2_amr/types/_format.py +++ b/q2_amr/types/_format.py @@ -425,25 +425,3 @@ class CARDGeneAnnotationDirectoryFormat(MultiDirValidationMixin, model.Directory @gene.set_path_maker def gene_path_maker(self, sample_id): return "%s/gene_mapping_data.txt" % sample_id - - -class GeneLengthFormat(model.TextFileFormat): - def _validate(self, n_records=None): - df = pd.read_csv(str(self), sep="\t", header=None) - has_two_columns = len(df.columns) == 2 - lengths_num = df.iloc[:, 1].apply(lambda x: isinstance(x, (float, int))).all() - if not (has_two_columns and lengths_num): - raise ValidationError( - "Format does not match GeneLengthsFormat. Must consist of tab " - "separated values in two columns with no header. The first column must " - "be the gene names and the second column must be the corresponding " - "gene lengths." - ) - - def _validate_(self, level): - self._validate() - - -GeneLengthDirectoryFormat = model.SingleFileDirectoryFormat( - "GeneLengthDirectoryFormat", "gene_length.txt", GeneLengthFormat -) diff --git a/q2_amr/types/_transformer.py b/q2_amr/types/_transformer.py index f6fa159..6d08cf1 100644 --- a/q2_amr/types/_transformer.py +++ b/q2_amr/types/_transformer.py @@ -26,7 +26,6 @@ CARDAnnotationTXTFormat, CARDDatabaseFormat, CARDGeneAnnotationDirectoryFormat, - GeneLengthDirectoryFormat, ) @@ -254,23 +253,3 @@ def tabulate_data(data_path, data_type): if data_type == "mags": df_combined.rename(columns={"ID": "HSP_Identifier"}, inplace=True) return qiime2.Metadata(df_combined) - - -@plugin.register_transformer -def _15(data: CARDAlleleAnnotationDirectoryFormat) -> GeneLengthDirectoryFormat: - len_all = pd.Series() - directory = GeneLengthDirectoryFormat() - # Iterate over samples in the specified path - for samp in os.listdir(data.path): - anno_txt = os.path.join(data.path, samp, "allele_mapping_data.txt") - - # Read each DataFrame and append it to the list - len_sample = pd.read_csv( - anno_txt, sep="\t", usecols=["Reference Sequence", "Reference Length"] - ).set_index("Reference Sequence")["Reference Length"] - - len_all = len_all.combine_first(len_sample) - len_all.to_csv( - os.path.join(directory.path, "gene_length.txt"), sep="\t", header=False - ) - return directory diff --git a/q2_amr/types/_type.py b/q2_amr/types/_type.py index bee4195..aa2f302 100644 --- a/q2_amr/types/_type.py +++ b/q2_amr/types/_type.py @@ -17,4 +17,3 @@ CARDGeneAnnotation = SemanticType( "CARDGeneAnnotation", variant_of=SampleData.field["type"] ) -GeneLength = SemanticType("GeneLength") diff --git a/q2_amr/types/tests/data/gene_length.txt b/q2_amr/types/tests/data/gene_length.txt deleted file mode 100644 index 2fddb36..0000000 --- a/q2_amr/types/tests/data/gene_length.txt +++ /dev/null @@ -1,2 +0,0 @@ -ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1 1356.0 -ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1 1173.0 diff --git a/q2_amr/types/tests/data/gene_length_3_cols.txt b/q2_amr/types/tests/data/gene_length_3_cols.txt deleted file mode 100644 index a1803bb..0000000 --- a/q2_amr/types/tests/data/gene_length_3_cols.txt +++ /dev/null @@ -1,2 +0,0 @@ -ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1 1356.0 1 -ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1 1173.0 1 diff --git a/q2_amr/types/tests/data/gene_length_switched.txt b/q2_amr/types/tests/data/gene_length_switched.txt deleted file mode 100644 index cae5b65..0000000 --- a/q2_amr/types/tests/data/gene_length_switched.txt +++ /dev/null @@ -1,2 +0,0 @@ -1356.0 ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1 -1173.0 ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1 diff --git a/q2_amr/types/tests/test_types_formats_transformers.py b/q2_amr/types/tests/test_types_formats_transformers.py index e0de12b..a594eaa 100644 --- a/q2_amr/types/tests/test_types_formats_transformers.py +++ b/q2_amr/types/tests/test_types_formats_transformers.py @@ -21,7 +21,6 @@ ProteinIterator, ) from q2_types_genomics.genome_data import GenesDirectoryFormat, ProteinsDirectoryFormat -from qiime2.plugin import ValidationError from qiime2.plugin.testing import TestPluginBase from skbio import DNA, Protein @@ -39,8 +38,6 @@ CARDKmerTXTFormat, CARDWildcardIndexFormat, GapDNAFASTAFormat, - GeneLengthDirectoryFormat, - GeneLengthFormat, ) from q2_amr.types._transformer import ( _read_from_card_file, @@ -331,40 +328,3 @@ def test_CARDAlleleAnnotationDirectoryFormat_to_qiime2_Metadata_transformer(self ) metadata_obt = transformer(annotation) self.assertIsInstance(metadata_obt, qiime2.Metadata) - - -class TestGeneLengthsTypesAndFormats(AMRTypesTestPluginBase): - def test_gene_length_format_validate_positive(self): - filepath = self.get_data_path("gene_length.txt") - format = GeneLengthFormat(filepath, mode="r") - format.validate() - - def test_gene_length_format_validate_negative_3_cols(self): - filepath = self.get_data_path("gene_length_3_cols.txt") - with self.assertRaises(ValidationError): - format = GeneLengthFormat(filepath, mode="r") - format.validate() - - def test_gene_length_format_validate_negative_switched(self): - filepath = self.get_data_path("gene_length_switched.txt") - with self.assertRaises(ValidationError): - format = GeneLengthFormat(filepath, mode="r") - format.validate() - - def test_gene_length_directory_format_validate_positive(self): - filepath = self.get_data_path("gene_length.txt") - shutil.copy(filepath, os.path.join(self.temp_dir.name, "gene_length.txt")) - format = GeneLengthDirectoryFormat(self.temp_dir.name, mode="r") - format.validate() - - def test_card_annotation_format_to_df_transformertt(self): - transformer = self.get_transformer( - CARDAlleleAnnotationDirectoryFormat, GeneLengthDirectoryFormat - ) - - annotation = CARDAlleleAnnotationDirectoryFormat( - self.get_data_path("annotate_reads_output"), "r" - ) - - obs = transformer(annotation) - self.assertIsInstance(obs, GeneLengthDirectoryFormat) From 7b58de2b21744fb2b2228e5a2fe1d459d89d4025 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 29 May 2024 16:11:45 +0200 Subject: [PATCH 17/23] added transformer for allele annotation to sequencecharacteristics --- q2_amr/card/normalization.py | 3 +- q2_amr/types/_transformer.py | 46 ++++++++++++++++++- .../tests/test_types_formats_transformers.py | 30 ++++++++++++ 3 files changed, 76 insertions(+), 3 deletions(-) diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py index ec42e97..3c70670 100644 --- a/q2_amr/card/normalization.py +++ b/q2_amr/card/normalization.py @@ -5,7 +5,6 @@ from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ from q2_amr.card.utils import InvalidParameterCombinationError -from q2_amr.types import GeneLengthDirectoryFormat def normalize( @@ -13,7 +12,7 @@ def normalize( method: str, m_trim: float = 0.3, a_trim: float = 0.05, - gene_length: GeneLengthDirectoryFormat = None, + gene_length: pd.DataFrame = None, ) -> pd.DataFrame: # Create Dataframe with counts from biom.Table counts = pd.DataFrame( diff --git a/q2_amr/types/_transformer.py b/q2_amr/types/_transformer.py index 1139f7d..c8ae7cc 100644 --- a/q2_amr/types/_transformer.py +++ b/q2_amr/types/_transformer.py @@ -13,7 +13,12 @@ import pandas as pd import qiime2 import skbio -from q2_types.feature_data import DNAFASTAFormat, DNAIterator, ProteinFASTAFormat +from q2_types.feature_data import ( + DNAFASTAFormat, + DNAIterator, + ProteinFASTAFormat, + SequenceCharacteristicsDirectoryFormat, +) from q2_types.feature_data._transformer import ProteinIterator from q2_types.genome_data import GenesDirectoryFormat, ProteinsDirectoryFormat from skbio import DNA, Protein @@ -287,3 +292,42 @@ def tabulate_data(data_path, data_type): df_combined.rename(columns={"ID": "HSP_Identifier"}, inplace=True) return qiime2.Metadata(df_combined) + + +@plugin.register_transformer +def _18( + data: CARDAlleleAnnotationDirectoryFormat, +) -> SequenceCharacteristicsDirectoryFormat: + directory = get_gene_lengths(map_type="allele", annotations=data.path) + return directory + + +@plugin.register_transformer +def _19( + data: CARDGeneAnnotationDirectoryFormat, +) -> SequenceCharacteristicsDirectoryFormat: + directory = get_gene_lengths(map_type="gene", annotations=data.path) + return directory + + +def get_gene_lengths(map_type, annotations): + # Extracts gene lengths from CARDAlleleAnnotation and CARDGeneAnnotation + gene_name_col = "Reference Sequence" if map_type == "allele" else "ARO Term" + len_all = pd.Series() + directory_fmt = SequenceCharacteristicsDirectoryFormat() + + # Iterate over samples, read in each DataFrame and append it to the series + for samp in os.listdir(annotations): + anno_txt = os.path.join(annotations, samp, f"{map_type}_mapping_data.txt") + cols = [gene_name_col, "Reference Length"] + len_sample = pd.read_csv(anno_txt, sep="\t", usecols=cols) + len_sample = len_sample.set_index(cols[0])[cols[1]] + len_all = len_all.combine_first(len_sample) + + df = len_all.to_frame() + df.index.name = "id" + df.columns = ["length"] + df.to_csv( + os.path.join(directory_fmt.path, "gene_length.tsv"), sep="\t", header=True + ) + return directory_fmt diff --git a/q2_amr/types/tests/test_types_formats_transformers.py b/q2_amr/types/tests/test_types_formats_transformers.py index a871e99..a44c996 100644 --- a/q2_amr/types/tests/test_types_formats_transformers.py +++ b/q2_amr/types/tests/test_types_formats_transformers.py @@ -19,6 +19,8 @@ DNAIterator, ProteinFASTAFormat, ProteinIterator, + SequenceCharacteristicsDirectoryFormat, + SequenceCharacteristicsFormat, ) from q2_types.genome_data import GenesDirectoryFormat, ProteinsDirectoryFormat from qiime2.plugin.testing import TestPluginBase @@ -561,3 +563,31 @@ def test_tabulate_data_allele(self): exp.index = exp.index.astype(str) obs = tabulate_data(self.get_data_path("card_allele_annotation"), "allele") self.assertEqual(qiime2.Metadata(exp), obs) + + def test_card_allele_annotation_dir_format_to_gene_length_dir_format(self): + transformer = self.get_transformer( + CARDAlleleAnnotationDirectoryFormat, SequenceCharacteristicsDirectoryFormat + ) + annotation = CARDAlleleAnnotationDirectoryFormat( + self.get_data_path("card_allele_annotation"), "r" + ) + obs = transformer(annotation) + self.assertIsInstance(obs, SequenceCharacteristicsDirectoryFormat) + format = SequenceCharacteristicsFormat( + os.path.join(obs.path, "gene_length.tsv"), mode="r" + ) + format.validate() + + def test_card_gene_annotation_dir_format_to_gene_length_dir_format(self): + transformer = self.get_transformer( + CARDGeneAnnotationDirectoryFormat, SequenceCharacteristicsDirectoryFormat + ) + annotation = CARDGeneAnnotationDirectoryFormat( + self.get_data_path("card_gene_annotation"), "r" + ) + obs = transformer(annotation) + self.assertIsInstance(obs, SequenceCharacteristicsDirectoryFormat) + format = SequenceCharacteristicsFormat( + os.path.join(obs.path, "gene_length.tsv"), mode="r" + ) + format.validate() From f4a20ed4f6fa84b53cf18005776fb42311021872 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 29 May 2024 17:48:19 +0200 Subject: [PATCH 18/23] changed normalize to work with characteristics --- q2_amr/card/normalization.py | 25 +++++++++++++------ q2_amr/card/tests/test_normalization.py | 2 +- q2_amr/types/_transformer.py | 8 +++--- .../tests/test_types_formats_transformers.py | 4 +-- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py index 3c70670..e784593 100644 --- a/q2_amr/card/normalization.py +++ b/q2_amr/card/normalization.py @@ -2,6 +2,7 @@ import biom import pandas as pd +from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ from q2_amr.card.utils import InvalidParameterCombinationError @@ -12,7 +13,7 @@ def normalize( method: str, m_trim: float = 0.3, a_trim: float = 0.05, - gene_length: pd.DataFrame = None, + gene_length: SequenceCharacteristicsDirectoryFormat = None, ) -> pd.DataFrame: # Create Dataframe with counts from biom.Table counts = pd.DataFrame( @@ -20,6 +21,7 @@ def normalize( index=table.ids(axis="observation"), columns=table.ids(axis="sample"), ).T + if method in ["tpm", "fpkm", "uq", "cuf", "cpm"]: # Raise Error if m or a-trim parameters are given with methods TPM, FPKM, UQ, # CPM or CUF @@ -28,33 +30,39 @@ def normalize( "Parameters m-trim and a-trim can only be used with methods TMM and " "CTF." ) + if method in ["tpm", "fpkm"]: # Raise Error if gene-length is missing when using methods TPM or FPKM if not gene_length: raise ValueError("gene-length input is missing.") + # Create pd.Series from gene_length input - gene_length_series = pd.read_csv( - os.path.join(gene_length.path, "gene_length.txt"), + lengths = pd.read_csv( + os.path.join(gene_length.path, "sequence_characteristics.tsv"), sep="\t", header=None, names=["index", "values"], index_col="index", squeeze=True, + skiprows=1, ) + # Raise Error if there are genes in the counts that are not present in the # gene length - if not set(counts.columns).issubset(set(gene_length_series.index)): - only_in_counts = set(counts.columns) - set(gene_length_series.index) + if not set(counts.columns).issubset(set(lengths.index)): + only_in_counts = set(counts.columns) - set(lengths.index) raise ValueError( f"There are genes present in the FeatureTable that are not present " f"in the gene-length input. Missing lengths for genes: " f"{only_in_counts}" ) + # Define the methods TPM and FPKM with the gene length series as an input methods = { - "tpm": TPM(gene_lengths=gene_length_series), - "fpkm": FPKM(gene_lengths=gene_length_series), + "tpm": TPM(gene_lengths=lengths), + "fpkm": FPKM(gene_lengths=lengths), } + if method in ["tmm", "uq", "cuf", "ctf", "cpm"]: # Raise Error if gene-length is given when using methods TMM, UQ, CUF, CPM or # CTF @@ -62,6 +70,7 @@ def normalize( raise ValueError( "gene-length input can only be used with FPKM and TPM methods." ) + # Define the methods TMM and CTF with parameters, also UQ, CPM and CUF methods = { "tmm": TMM(m_trim=m_trim, a_trim=a_trim), @@ -70,7 +79,9 @@ def normalize( "cuf": CUF(), "cpm": CPM(), } + # Run normalization method on count dataframe normalized = methods[method].set_output(transform="pandas").fit_transform(counts) normalized.index.name = "sample_id" + return normalized diff --git a/q2_amr/card/tests/test_normalization.py b/q2_amr/card/tests/test_normalization.py index 3198449..a35d67d 100644 --- a/q2_amr/card/tests/test_normalization.py +++ b/q2_amr/card/tests/test_normalization.py @@ -22,7 +22,7 @@ def setUpClass(cls): def test_tpm_fpkm_uq_cuf_with_invalid_m_a_trim(self): # Test Error raised if gene-length is given with TMM method expected_message = ( - "Parameters m-trim and a-trim can only be used with methods TMM and " "CTF." + "Parameters m-trim and a-trim can only be used with methods TMM and CTF." ) with self.assertRaises(InvalidParameterCombinationError) as cm: normalize(self.table, "tpm", m_trim=0.2, a_trim=0.05) diff --git a/q2_amr/types/_transformer.py b/q2_amr/types/_transformer.py index c8ae7cc..b1565f9 100644 --- a/q2_amr/types/_transformer.py +++ b/q2_amr/types/_transformer.py @@ -314,7 +314,7 @@ def get_gene_lengths(map_type, annotations): # Extracts gene lengths from CARDAlleleAnnotation and CARDGeneAnnotation gene_name_col = "Reference Sequence" if map_type == "allele" else "ARO Term" len_all = pd.Series() - directory_fmt = SequenceCharacteristicsDirectoryFormat() + dir_fmt = SequenceCharacteristicsDirectoryFormat() # Iterate over samples, read in each DataFrame and append it to the series for samp in os.listdir(annotations): @@ -328,6 +328,8 @@ def get_gene_lengths(map_type, annotations): df.index.name = "id" df.columns = ["length"] df.to_csv( - os.path.join(directory_fmt.path, "gene_length.tsv"), sep="\t", header=True + os.path.join(dir_fmt.path, "sequence_characteristics.tsv"), + sep="\t", + header=True, ) - return directory_fmt + return dir_fmt diff --git a/q2_amr/types/tests/test_types_formats_transformers.py b/q2_amr/types/tests/test_types_formats_transformers.py index a44c996..2a99879 100644 --- a/q2_amr/types/tests/test_types_formats_transformers.py +++ b/q2_amr/types/tests/test_types_formats_transformers.py @@ -574,7 +574,7 @@ def test_card_allele_annotation_dir_format_to_gene_length_dir_format(self): obs = transformer(annotation) self.assertIsInstance(obs, SequenceCharacteristicsDirectoryFormat) format = SequenceCharacteristicsFormat( - os.path.join(obs.path, "gene_length.tsv"), mode="r" + os.path.join(obs.path, "sequence_characteristics.tsv"), mode="r" ) format.validate() @@ -588,6 +588,6 @@ def test_card_gene_annotation_dir_format_to_gene_length_dir_format(self): obs = transformer(annotation) self.assertIsInstance(obs, SequenceCharacteristicsDirectoryFormat) format = SequenceCharacteristicsFormat( - os.path.join(obs.path, "gene_length.tsv"), mode="r" + os.path.join(obs.path, "sequence_characteristics.tsv"), mode="r" ) format.validate() From 6d416a3854eec747ffa9790cded5d0bac4faa81a Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 29 May 2024 17:53:36 +0200 Subject: [PATCH 19/23] changed tests --- .../{gene_length.txt => sequence_characteristics.tsv} | 1 + ...ength_short.txt => sequence_characteristics_short.tsv} | 1 + q2_amr/card/tests/test_normalization.py | 8 +++++--- 3 files changed, 7 insertions(+), 3 deletions(-) rename q2_amr/card/tests/data/{gene_length.txt => sequence_characteristics.tsv} (90%) rename q2_amr/card/tests/data/{gene_length_short.txt => sequence_characteristics_short.tsv} (82%) diff --git a/q2_amr/card/tests/data/gene_length.txt b/q2_amr/card/tests/data/sequence_characteristics.tsv similarity index 90% rename from q2_amr/card/tests/data/gene_length.txt rename to q2_amr/card/tests/data/sequence_characteristics.tsv index 2fddb36..545c51e 100644 --- a/q2_amr/card/tests/data/gene_length.txt +++ b/q2_amr/card/tests/data/sequence_characteristics.tsv @@ -1,2 +1,3 @@ +id length ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1 1356.0 ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1 1173.0 diff --git a/q2_amr/card/tests/data/gene_length_short.txt b/q2_amr/card/tests/data/sequence_characteristics_short.tsv similarity index 82% rename from q2_amr/card/tests/data/gene_length_short.txt rename to q2_amr/card/tests/data/sequence_characteristics_short.tsv index 81d043e..734296b 100644 --- a/q2_amr/card/tests/data/gene_length_short.txt +++ b/q2_amr/card/tests/data/sequence_characteristics_short.tsv @@ -1 +1,2 @@ +id length ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1 1356.0 diff --git a/q2_amr/card/tests/test_normalization.py b/q2_amr/card/tests/test_normalization.py index a35d67d..e61f9e2 100644 --- a/q2_amr/card/tests/test_normalization.py +++ b/q2_amr/card/tests/test_normalization.py @@ -48,8 +48,8 @@ def test_tpm_fpkm_with_short_gene_length(self): # Test Error raised if gene-length is missing genes gene_length = SequenceCharacteristicsDirectoryFormat() shutil.copy( - self.get_data_path("gene_length_short.txt"), - os.path.join(gene_length.path, "gene_length.txt"), + self.get_data_path("sequence_characteristics_short.tsv"), + os.path.join(gene_length.path, "sequence_characteristics.tsv"), ) table = biom.load_table(self.get_data_path("feature-table.biom")) expected_message = ( @@ -65,7 +65,9 @@ def test_tpm_fpkm_with_short_gene_length(self): def test_tpm_fpkm_with_valid_inputs(self, mock_tpm): # Test valid inputs for TPM method gene_length = SequenceCharacteristicsDirectoryFormat() - shutil.copy(self.get_data_path("gene_length.txt"), gene_length.path) + shutil.copy( + self.get_data_path("sequence_characteristics.tsv"), gene_length.path + ) table = biom.load_table(self.get_data_path("feature-table.biom")) normalize(table=table, gene_length=gene_length, method="tpm") From 085769b9afa430c53c024c077630d5772e6ffe48 Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Mon, 3 Jun 2024 12:03:07 +0200 Subject: [PATCH 20/23] changed parameter value error to value error --- q2_amr/card/normalization.py | 4 +--- q2_amr/card/tests/test_normalization.py | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py index e784593..9f955d0 100644 --- a/q2_amr/card/normalization.py +++ b/q2_amr/card/normalization.py @@ -5,8 +5,6 @@ from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ -from q2_amr.card.utils import InvalidParameterCombinationError - def normalize( table: biom.Table, @@ -26,7 +24,7 @@ def normalize( # Raise Error if m or a-trim parameters are given with methods TPM, FPKM, UQ, # CPM or CUF if m_trim != 0.3 or a_trim != 0.05: - raise InvalidParameterCombinationError( + raise ValueError( "Parameters m-trim and a-trim can only be used with methods TMM and " "CTF." ) diff --git a/q2_amr/card/tests/test_normalization.py b/q2_amr/card/tests/test_normalization.py index e61f9e2..a5e1689 100644 --- a/q2_amr/card/tests/test_normalization.py +++ b/q2_amr/card/tests/test_normalization.py @@ -7,7 +7,6 @@ from qiime2.plugin.testing import TestPluginBase from q2_amr.card.normalization import normalize -from q2_amr.card.utils import InvalidParameterCombinationError class TestNormalize(TestPluginBase): @@ -24,7 +23,7 @@ def test_tpm_fpkm_uq_cuf_with_invalid_m_a_trim(self): expected_message = ( "Parameters m-trim and a-trim can only be used with methods TMM and CTF." ) - with self.assertRaises(InvalidParameterCombinationError) as cm: + with self.assertRaises(ValueError) as cm: normalize(self.table, "tpm", m_trim=0.2, a_trim=0.05) self.assertEqual(str(cm.exception), expected_message) From 7cf82d4f004b6c4005596d4dce10681c8f0347ed Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 5 Jun 2024 13:26:57 +0200 Subject: [PATCH 21/23] chnages after review --- q2_amr/card/normalization.py | 131 +++++++++++----------- q2_amr/card/tests/data/feature-table.biom | Bin 33824 -> 0 bytes q2_amr/card/tests/data/feature-table.tsv | 3 + q2_amr/card/tests/test_normalization.py | 97 ++++++++++------ q2_amr/plugin_setup.py | 8 +- 5 files changed, 140 insertions(+), 99 deletions(-) delete mode 100644 q2_amr/card/tests/data/feature-table.biom create mode 100644 q2_amr/card/tests/data/feature-table.tsv diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py index 9f955d0..d6dab5a 100644 --- a/q2_amr/card/normalization.py +++ b/q2_amr/card/normalization.py @@ -1,75 +1,31 @@ import os -import biom import pandas as pd from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ def normalize( - table: biom.Table, + table: pd.DataFrame, method: str, - m_trim: float = 0.3, - a_trim: float = 0.05, + m_trim: float = None, + a_trim: float = None, gene_length: SequenceCharacteristicsDirectoryFormat = None, ) -> pd.DataFrame: - # Create Dataframe with counts from biom.Table - counts = pd.DataFrame( - data=table.matrix_data.toarray(), - index=table.ids(axis="observation"), - columns=table.ids(axis="sample"), - ).T - - if method in ["tpm", "fpkm", "uq", "cuf", "cpm"]: - # Raise Error if m or a-trim parameters are given with methods TPM, FPKM, UQ, - # CPM or CUF - if m_trim != 0.3 or a_trim != 0.05: - raise ValueError( - "Parameters m-trim and a-trim can only be used with methods TMM and " - "CTF." - ) - - if method in ["tpm", "fpkm"]: - # Raise Error if gene-length is missing when using methods TPM or FPKM - if not gene_length: - raise ValueError("gene-length input is missing.") - - # Create pd.Series from gene_length input - lengths = pd.read_csv( - os.path.join(gene_length.path, "sequence_characteristics.tsv"), - sep="\t", - header=None, - names=["index", "values"], - index_col="index", - squeeze=True, - skiprows=1, - ) - - # Raise Error if there are genes in the counts that are not present in the - # gene length - if not set(counts.columns).issubset(set(lengths.index)): - only_in_counts = set(counts.columns) - set(lengths.index) - raise ValueError( - f"There are genes present in the FeatureTable that are not present " - f"in the gene-length input. Missing lengths for genes: " - f"{only_in_counts}" - ) - - # Define the methods TPM and FPKM with the gene length series as an input - methods = { - "tpm": TPM(gene_lengths=lengths), - "fpkm": FPKM(gene_lengths=lengths), - } - - if method in ["tmm", "uq", "cuf", "ctf", "cpm"]: - # Raise Error if gene-length is given when using methods TMM, UQ, CUF, CPM or - # CTF - if gene_length: - raise ValueError( - "gene-length input can only be used with FPKM and TPM methods." - ) - - # Define the methods TMM and CTF with parameters, also UQ, CPM and CUF + # Validate parameter combinations and set trim parameters + m_trim, a_trim = _validate_parameters(method, m_trim, a_trim, gene_length) + + # Process gene_lengths input and define methods that need gene_lengths input + if method in ["tpm", "fpkm"]: + lengths = _convert_lengths(table, gene_length) + + methods = { + "tpm": TPM(gene_lengths=lengths), + "fpkm": FPKM(gene_lengths=lengths), + } + + # Define remaining methods that don't need gene_lengths input + else: methods = { "tmm": TMM(m_trim=m_trim, a_trim=a_trim), "ctf": CTF(m_trim=m_trim, a_trim=a_trim), @@ -78,8 +34,57 @@ def normalize( "cpm": CPM(), } - # Run normalization method on count dataframe - normalized = methods[method].set_output(transform="pandas").fit_transform(counts) + # Run normalization method on frequency table + normalized = methods[method].set_output(transform="pandas").fit_transform(table) normalized.index.name = "sample_id" return normalized + + +def _validate_parameters(method, m_trim, a_trim, gene_length): + # Raise Error if gene-length is missing when using methods TPM or FPKM + if method in ["tpm", "fpkm"] and not gene_length: + raise ValueError("gene-length input is missing.") + + # Raise Error if gene-length is given when using methods TMM, UQ, CUF, CPM or CTF + if method in ["tmm", "uq", "cuf", "ctf", "cpm"] and gene_length: + raise ValueError( + "gene-length input can only be used with FPKM and TPM " "methods." + ) + + # Raise Error if m_trim or a_trim are given when not using methods TMM or CTF + if (method not in ["tmm", "ctf"]) and (m_trim is not None or a_trim is not None): + raise ValueError( + "Parameters m-trim and a-trim can only be used with methods TMM and " "CTF." + ) + + # Set m_trim and a_trim to their default values for methods TMM and CTF + if method in ["tmm", "ctf"]: + m_trim = 0.3 if m_trim is None else m_trim + a_trim = 0.05 if a_trim is None else a_trim + + return m_trim, a_trim + + +def _convert_lengths(table, gene_length): + # Read in table from sequence_characteristics.tsv as a pd.Series + lengths = pd.read_csv( + os.path.join(gene_length.path, "sequence_characteristics.tsv"), + sep="\t", + header=None, + names=["index", "values"], + index_col="index", + squeeze=True, + skiprows=1, + ) + + # Check if all gene IDs that are present in the table are also present in + # the lengths + if not set(table.columns).issubset(set(lengths.index)): + only_in_counts = set(table.columns) - set(lengths.index) + raise ValueError( + f"There are genes present in the FeatureTable that are not present " + f"in the gene-length input. Missing lengths for genes: " + f"{only_in_counts}" + ) + return lengths diff --git a/q2_amr/card/tests/data/feature-table.biom b/q2_amr/card/tests/data/feature-table.biom deleted file mode 100644 index 3a8fcee13f42e92f42a5463c65bee03c7bbf9168..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 33824 zcmeI4PfS}k7{DC@Z3>iysoH4ONlzP_I4txf{9P^$Ep$QJfVFI6n}(9WD?}nB3u#9; z4s6T>TG}C*SxrM12nKCd_FwepY6}~@7a#| z{LIhE@oQ(>FSHBgLLt#A!a7KgXXu=tKtgd!PhlW{{$2EcX@NjMvU%?F!*;2u*(o$`sqZxz z?h&Z(wpU!!Xu~ouWKXD)A|U&%9K8acKy5ucJGIvz@YW;p)Wx{s0 z%3xVvm)5sX^z_S*EJ~&O(09S7udPDgDc$e2<+Oew2kkHQzf-uX2FbBi>I;e0)$ert z1$|mt<`uaY@M8 z08YV06xUq;kG8-L^4@KkdQ2%Ibz#WJ| z{!`$I2>N5!r{e>zbg_|*$?NOO^*kpwUF#m_wf&XDrefcs5$pO0Y#MWA;zEFFe*49QJbCc>3IMULi-=s z35E51<%HnkZdwSK>j2HmJ{&((jOHDj4v_TlKCA<+WX=|k-vV7!&mP9Z4#>fpF2qdo z<#qO9@?9)P`W|3>=WWP9{oKa+Lix=3gX-$mv#kRrKVApw_7&De!))t->r^(6=1HyA z0o(XB=WDg|uDM=lC+!z1Tg|a%Ct!|W2Rpon>pk_K_OjX4#q5H&CZt~|70c&eqm*%< zen6apvoo9+0sKX?or?!$y2{$^T*f_cLJW3@!Do1Y)+VJYy(I{0JA|VHc+4)tDm@?; zy_+lQq?^vuLRfFGrDf|~XuCSHqn|k$Oo8->pYXdLd|q+#otIaL$B$wM&;4wxz3i3xOG>l*-O5sotfCcp%k025#W4NhQvejz1-r$fK* z%Dv+~@q>zQ%09{6q6$jant9KWUrAjQt?ToDq(w_t__yutu1CKnH-@FWeO!F|UpjO_ z^|@sWs`)V^M6YO*u_e@T=Fwltjc}U?mhhWLSuHyzzyz286JP>N;1~!T+zu6%)&(Kd zeFU%}Il7fKus`h%Lw}VCg3MDgxD3)`dfMzK^nJV8j9rWU#Mc3{2*;TK6JP>NfC(^x z1}9K$|3vrO%KnkbAV+KaKqpaq@M*Qj*0?fC(@GCcp%kK%)|< zwtrH_AmzxX@1Sh|_oD`OsMKV+O5bNQDUcrV(`J97@0iT$|1tJY#L3rivk1qT025#W zOn?b6fd(gFy9Z(3bF6j`BK$dk;O~ALd_Wws`Q|+cy3g>paZk2Q&dq^Js%I19TVI0Q#Ts|v z-#S|B@57XyuOLAB5*SZ?4RY$|49*wIXU-pMMRk&3TL(;jybjdK3#-#+*w%qpk`Q1l&h;Q_xvu(xjg`8AK_h z9of^okNU*mZ&pn9*3`0@QW>Xtz4c9{yjf#vy;Ex=-9sgxc$xh^*Iy|id$w-?RA%59 z6JP>NfC(@GCeZK%4*osBk$q2D$O28Z1@V23SY?A(<2PX2KiU5Mf3Ic#{^sw&_1qrUe~F=PKkoO~V7o#F=*U;<2l z2`~XB&^!dH?Vsp8iu_}^kFxhY;8j)nj?GfX0^MeRdec&S!q`6%Ctt%YC0Wb_m;e)C M0!)AjG%A6A0p5#ZcK`qY diff --git a/q2_amr/card/tests/data/feature-table.tsv b/q2_amr/card/tests/data/feature-table.tsv new file mode 100644 index 0000000..712090b --- /dev/null +++ b/q2_amr/card/tests/data/feature-table.tsv @@ -0,0 +1,3 @@ +ID ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1 ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1 +sample1 2.0 0.0 +sample2 2.0 0.0 diff --git a/q2_amr/card/tests/test_normalization.py b/q2_amr/card/tests/test_normalization.py index a5e1689..18eb790 100644 --- a/q2_amr/card/tests/test_normalization.py +++ b/q2_amr/card/tests/test_normalization.py @@ -2,11 +2,12 @@ import shutil from unittest.mock import MagicMock, patch -import biom +import pandas as pd +from pandas._testing import assert_series_equal from q2_types.feature_data import SequenceCharacteristicsDirectoryFormat from qiime2.plugin.testing import TestPluginBase -from q2_amr.card.normalization import normalize +from q2_amr.card.normalization import _convert_lengths, _validate_parameters, normalize class TestNormalize(TestPluginBase): @@ -14,51 +15,79 @@ class TestNormalize(TestPluginBase): @classmethod def setUpClass(cls): - # Mocking the biom.Table and gene_length class - cls.table = MagicMock() cls.gene_length = MagicMock() + cls.lengths = pd.Series( + { + "ARO:3000026|ID:377|Name:mepA|NCBI:AY661734.1": 1356.0, + "ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1": 1173.0, + }, + name="values", + ) + cls.lengths.index.name = "index" - def test_tpm_fpkm_uq_cuf_with_invalid_m_a_trim(self): + def test_validate_parameters_uq_with_m_a_trim(self): # Test Error raised if gene-length is given with TMM method - expected_message = ( - "Parameters m-trim and a-trim can only be used with methods TMM and CTF." - ) - with self.assertRaises(ValueError) as cm: - normalize(self.table, "tpm", m_trim=0.2, a_trim=0.05) - self.assertEqual(str(cm.exception), expected_message) + with self.assertRaisesRegex( + ValueError, + "Parameters m-trim and a-trim can only " + "be used with methods TMM and CTF.", + ): + _validate_parameters("uq", 0.2, 0.05, None) - def test_tpm_fpkm_with_missing_gene_length(self): + def test_validate_parameters_tpm_missing_gene_length(self): # Test Error raised if gene-length is missing with TPM method - expected_message = "gene-length input is missing." - with self.assertRaises(ValueError) as cm: - normalize(self.table, "tpm") - self.assertEqual(str(cm.exception), expected_message) + with self.assertRaisesRegex(ValueError, "gene-length input is missing."): + _validate_parameters("tpm", None, None, None) - def test_tmm_uq_cuf_ctf_with_gene_length(self): + def test_validate_parameters_tmm_gene_length(self): # Test Error raised if gene-length is given with TMM method - expected_message = ( - "gene-length input can only be used with FPKM and TPM methods." + with self.assertRaisesRegex( + ValueError, "gene-length input can only be used with FPKM and TPM methods." + ): + _validate_parameters("tmm", None, None, gene_length=self.gene_length) + + def test_validate_parameters_default_m_a_trim(self): + # Test if m_trim and a_trim get set to default values if None + m_trim, a_trim = _validate_parameters("tmm", None, None, None) + self.assertEqual(m_trim, 0.3) + self.assertEqual(a_trim, 0.05) + + def test_validate_parameters_m_a_trim(self): + # Test if m_trim and a_trim are not modified if not None + m_trim, a_trim = _validate_parameters("tmm", 0.1, 0.06, None) + self.assertEqual(m_trim, 0.1) + self.assertEqual(a_trim, 0.06) + + def test_convert_lengths_gene_length(self): + # Test Error raised if gene-length is missing genes + gene_length = SequenceCharacteristicsDirectoryFormat() + shutil.copy( + self.get_data_path("sequence_characteristics.tsv"), gene_length.path + ) + table = pd.read_csv( + self.get_data_path("feature-table.tsv"), sep="\t", index_col="ID" ) - with self.assertRaises(ValueError) as cm: - normalize(self.table, "tmm", gene_length=self.gene_length) - self.assertEqual(str(cm.exception), expected_message) - def test_tpm_fpkm_with_short_gene_length(self): + obs = _convert_lengths(table, gene_length=gene_length) + assert_series_equal(obs, self.lengths) + + def test_convert_lengths_short_gene_length(self): # Test Error raised if gene-length is missing genes gene_length = SequenceCharacteristicsDirectoryFormat() shutil.copy( self.get_data_path("sequence_characteristics_short.tsv"), os.path.join(gene_length.path, "sequence_characteristics.tsv"), ) - table = biom.load_table(self.get_data_path("feature-table.biom")) - expected_message = ( + table = pd.read_csv( + self.get_data_path("feature-table.tsv"), sep="\t", index_col="ID" + ) + with self.assertRaisesRegex( + ValueError, "There are genes present in the FeatureTable that are not present " "in the gene-length input. Missing lengths for genes: " - "{'ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1'}" - ) - with self.assertRaises(ValueError) as cm: - normalize(table, "tpm", gene_length=gene_length) - self.assertEqual(str(cm.exception), expected_message) + "{'ARO:3000027|ID:1757|Name:emrA|NCBI:AP009048.1'}", + ): + _convert_lengths(table, gene_length=gene_length) @patch("q2_amr.card.normalization.TPM") def test_tpm_fpkm_with_valid_inputs(self, mock_tpm): @@ -67,11 +96,15 @@ def test_tpm_fpkm_with_valid_inputs(self, mock_tpm): shutil.copy( self.get_data_path("sequence_characteristics.tsv"), gene_length.path ) - table = biom.load_table(self.get_data_path("feature-table.biom")) + table = pd.read_csv( + self.get_data_path("feature-table.tsv"), sep="\t", index_col="ID" + ) normalize(table=table, gene_length=gene_length, method="tpm") @patch("q2_amr.card.normalization.TMM") def test_tmm_uq_cuf_ctf_with_valid_inputs(self, mock_tmm): # Test valid inputs for TMM method - table = biom.load_table(self.get_data_path("feature-table.biom")) + table = pd.read_csv( + self.get_data_path("feature-table.tsv"), sep="\t", index_col="ID" + ) normalize(table=table, method="tmm", a_trim=0.06, m_trim=0.4) diff --git a/q2_amr/plugin_setup.py b/q2_amr/plugin_setup.py index 456776d..93dcd23 100644 --- a/q2_amr/plugin_setup.py +++ b/q2_amr/plugin_setup.py @@ -470,8 +470,8 @@ }, parameters={ "method": Str % Choices(["tpm", "fpkm", "tmm", "uq", "cuf", "ctf", "cpm"]), - "m_trim": Float % Range(0, 1, inclusive_start=True), - "a_trim": Float % Range(0, 1, inclusive_start=True), + "m_trim": Float % Range(0, 1, inclusive_start=True, inclusive_end=True), + "a_trim": Float % Range(0, 1, inclusive_start=True, inclusive_end=True), }, outputs=[("normalized_table", FeatureTable[Frequency])], input_descriptions={ @@ -485,9 +485,9 @@ "/12/2023-Normalizing-RNA-seq-data-in-Python-with-RNAnorm.pdf for " "more information on the methods.", "m_trim": "Two sided cutoff for M-values. Can only be used for methods TMM and " - "CTF.", + "CTF. (default = 0.3)", "a_trim": "Two sided cutoff for A-values. Can only be used for methods TMM and " - "CTF.", + "CTF. (default = 0.05)", }, output_descriptions={ "normalized_table": "Feature table normalized with specified " "method." From f25449df62c90f93c62f9b6e5957444adf30b8ba Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 5 Jun 2024 13:51:14 +0200 Subject: [PATCH 22/23] typo --- q2_amr/card/tests/test_normalization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/q2_amr/card/tests/test_normalization.py b/q2_amr/card/tests/test_normalization.py index 18eb790..77daf22 100644 --- a/q2_amr/card/tests/test_normalization.py +++ b/q2_amr/card/tests/test_normalization.py @@ -26,7 +26,7 @@ def setUpClass(cls): cls.lengths.index.name = "index" def test_validate_parameters_uq_with_m_a_trim(self): - # Test Error raised if gene-length is given with TMM method + # Test Error raised if gene-length is given with UQ method with self.assertRaisesRegex( ValueError, "Parameters m-trim and a-trim can only " From 8bafcb2870118b1746741af6c4bd65cb2a884eaa Mon Sep 17 00:00:00 2001 From: VinzentRisch Date: Wed, 5 Jun 2024 17:04:40 +0200 Subject: [PATCH 23/23] typos --- q2_amr/card/normalization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/q2_amr/card/normalization.py b/q2_amr/card/normalization.py index d6dab5a..afb1b1e 100644 --- a/q2_amr/card/normalization.py +++ b/q2_amr/card/normalization.py @@ -49,13 +49,13 @@ def _validate_parameters(method, m_trim, a_trim, gene_length): # Raise Error if gene-length is given when using methods TMM, UQ, CUF, CPM or CTF if method in ["tmm", "uq", "cuf", "ctf", "cpm"] and gene_length: raise ValueError( - "gene-length input can only be used with FPKM and TPM " "methods." + "gene-length input can only be used with FPKM and TPM methods." ) # Raise Error if m_trim or a_trim are given when not using methods TMM or CTF if (method not in ["tmm", "ctf"]) and (m_trim is not None or a_trim is not None): raise ValueError( - "Parameters m-trim and a-trim can only be used with methods TMM and " "CTF." + "Parameters m-trim and a-trim can only be used with methods TMM and CTF." ) # Set m_trim and a_trim to their default values for methods TMM and CTF