amrfinder files

LEBC · LEBC · commit f0e1f476bfa7 · 2025-01-16T14:16:16.000+01:00
diff --git a/bifrost_bridge/amrfinderplus.py b/bifrost_bridge/amrfinderplus.py
@@ -0,0 +1,81 @@
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/06_amrfinderplus.ipynb.
+
+# %% auto 0
+__all__ = ['process_amrfinderplus_data', 'process_amrfinderplus_data_from_cli']
+
+# %% ../nbs/06_amrfinderplus.ipynb 2
+# That export there, it makes sure this code goes into the module.
+
+# standard libs
+import os
+import re
+
+# Common to template
+# add into settings.ini, requirements, package name is python-dotenv, for conda build ensure `conda config --add channels conda-forge`
+import dotenv  # for loading config from .env files, https://pypi.org/project/python-dotenv/
+import envyaml  # Allows to loads env vars into a yaml file, https://github.com/thesimj/envyaml
+import fastcore  # To add functionality related to nbdev development, https://github.com/fastai/fastcore/
+from fastcore import (
+    test,
+)
+from fastcore.script import (
+    call_parse,
+)  # for @call_parse, https://fastcore.fast.ai/script
+import json  # for nicely printing json and yaml
+from fastcore import test
+from . import core
+
+# %% ../nbs/06_amrfinderplus.ipynb 6
+def process_amrfinderplus_data(
+    input_path: str,
+    output_path: str = "./output.tsv",
+    replace_header: str = None,
+    filter_columns: str = None,
+    add_header: str = None,
+):
+    """
+    Command-line interface for processing amrfinderplus data.
+
+    This function sets up an argument parser to handle command-line arguments for processing amrfinderplus data files.
+    It supports specifying input and output file paths, replacing headers, filtering columns, and handling the presence or absence of headers in the input file.
+
+    Arguments:
+        input_path (str): Path to the input file.
+        output_path (str): Path to the output file (default: './output.tsv').
+        replace_header (str): Header to replace the existing header (default: None).
+        filter_columns (str): Columns to filter from the header (default: None).
+        header_exists (int): Indicates if the header exists in the input file (default: 1).
+        add_header (str): Header to add if the header does not exist in the input file (default: None).
+    """
+
+    df = core.DataFrame()
+    df.import_data(input_path, file_type="tsv", add_header=add_header)
+
+    # print(df.df.columns)
+    def concatenate_vector(x, sep=","):
+        return ",".join([str(i) for i in x])
+
+    df_agg = df.df.apply(concatenate_vector, axis=0)
+    df.df = df_agg.to_frame().T
+    if replace_header:
+        df.rename_header(replace_header)
+
+    if filter_columns:
+        df.filter_columns(filter_columns)
+
+    # df.show()
+
+    df.export_data(output_path, file_type="tsv")
+
+
+@call_parse
+def process_amrfinderplus_data_from_cli(
+    input_path: str,
+    output_path: str = "./output.tsv",
+    replace_header: str = None,
+    filter_columns: str = None,
+    add_header: str = None,
+):
+    process_amrfinderplus_data(
+        input_path, output_path, replace_header, filter_columns, add_header
+    )
diff --git a/nbs/06_amrfinderplus.ipynb b/nbs/06_amrfinderplus.ipynb
@@ -0,0 +1,234 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# |default_exp amrfinderplus"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# |hide\n",
+    "# See above? this hides these blocks, meaning these blocks aren't in the module and aren't in the documentation\n",
+    "import nbdev\n",
+    "from nbdev.showdoc import *  # ignore this Pylance warning in favor of following nbdev docs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# |export\n",
+    "# That export there, it makes sure this code goes into the module.\n",
+    "\n",
+    "# standard libs\n",
+    "import os\n",
+    "import re\n",
+    "\n",
+    "# Common to template\n",
+    "# add into settings.ini, requirements, package name is python-dotenv, for conda build ensure `conda config --add channels conda-forge`\n",
+    "import dotenv  # for loading config from .env files, https://pypi.org/project/python-dotenv/\n",
+    "import envyaml  # Allows to loads env vars into a yaml file, https://github.com/thesimj/envyaml\n",
+    "import fastcore  # To add functionality related to nbdev development, https://github.com/fastai/fastcore/\n",
+    "from fastcore import (\n",
+    "    test,\n",
+    ")\n",
+    "from fastcore.script import (\n",
+    "    call_parse,\n",
+    ")  # for @call_parse, https://fastcore.fast.ai/script\n",
+    "import json  # for nicely printing json and yaml\n",
+    "from fastcore import test\n",
+    "from bifrost_bridge import core\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Because the notebooks now are located in the `nbs` folder, we need to change the python `wd` for the notebook to the project folder. Keep this included in all notebooks but don't export it to the package. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# This block should never be exported. It is to have python running in the project (and not the nbs) dir, and to initiate the package using pip.\n",
+    "os.chdir(core.PROJECT_DIR)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##################################################CODE_SEGMENT###########################################"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# |export\n",
+    "\n",
+    "def process_amrfinderplus_data(\n",
+    "    input_path:str,\n",
+    "    output_path:str = './output.tsv',\n",
+    "    replace_header:str = None,\n",
+    "    filter_columns:str = None,\n",
+    "    add_header:str = None):\n",
+    "\n",
+    "    \"\"\"\n",
+    "    Command-line interface for processing amrfinderplus data.\n",
+    "\n",
+    "    This function sets up an argument parser to handle command-line arguments for processing amrfinderplus data files.\n",
+    "    It supports specifying input and output file paths, replacing headers, filtering columns, and handling the presence or absence of headers in the input file.\n",
+    "\n",
+    "    Arguments:\n",
+    "        input_path (str): Path to the input file.\n",
+    "        output_path (str): Path to the output file (default: './output.tsv').\n",
+    "        replace_header (str): Header to replace the existing header (default: None).\n",
+    "        filter_columns (str): Columns to filter from the header (default: None).\n",
+    "        header_exists (int): Indicates if the header exists in the input file (default: 1).\n",
+    "        add_header (str): Header to add if the header does not exist in the input file (default: None).\n",
+    "    \"\"\"\n",
+    "\n",
+    "\n",
+    "    df = core.DataFrame()\n",
+    "    df.import_data(input_path, file_type='tsv', add_header=add_header)\n",
+    "    #print(df.df.columns)\n",
+    "    def concatenate_vector(x, sep=','):\n",
+    "        return ','.join([str(i) for i in x])\n",
+    "    \n",
+    "    df_agg = df.df.apply(concatenate_vector, axis=0)\n",
+    "    df.df = df_agg.to_frame().T\n",
+    "    if replace_header:\n",
+    "        df.rename_header(replace_header)\n",
+    "\n",
+    "    if filter_columns:\n",
+    "        df.filter_columns(filter_columns)\n",
+    "    \n",
+    "    #df.show()\n",
+    "\n",
+    "    df.export_data(output_path, file_type='tsv')\n",
+    "\n",
+    "@call_parse\n",
+    "def process_amrfinderplus_data_from_cli(\n",
+    "    input_path:str,\n",
+    "    output_path:str = './output.tsv',\n",
+    "    replace_header:str = None,\n",
+    "    filter_columns:str = None,\n",
+    "    add_header:str = None):\n",
+    "    process_amrfinderplus_data(input_path, output_path, replace_header, filter_columns, add_header)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#|hide\n",
+    "#Example usage of the function\n",
+    "process_amrfinderplus_data(\n",
+    "   input_path='test_data/amrfinderplus.tsv', \n",
+    "   output_path='test_data/amrfinderplus_testout.tsv',\n",
+    "   #filter_columns=\"Query / Template length\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#|hide\n",
+    "#Example usage of the function\n",
+    "process_amrfinderplus_data(\n",
+    "   input_path='test_data/amrfinderplus_long_example.tsv', \n",
+    "   output_path='test_data/amrfinderplus_long_example_testout.tsv',\n",
+    "   #filter_columns=\"Query / Template length\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Database Plasmid Identity Query / Template length Contig Position in contig  \\\n",
+      "0                                                                               \n",
+      "\n",
+      "  Note Accession number  \n",
+      "0                        \n"
+     ]
+    }
+   ],
+   "source": [
+    "#|hide\n",
+    "#Example usage of the function\n",
+    "process_amrfinderplus_data(\n",
+    "   input_path='test_data/amrfinderplus_empty.tsv', \n",
+    "   output_path='test_data/amrfinderplus_empty_testout.tsv',\n",
+    "   #filter_columns=\"Query / Template length\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "##################################################CODE_SEGMENT###########################################"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| hide\n",
+    "# This is included at the end to ensure when you run through your notebook the code is also transferred to the associated python package\n",
+    "\n",
+    "nbdev.nbdev_export()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/test_data/amrfinderplus.tsv b/test_data/amrfinderplus.tsv
@@ -0,0 +1,2 @@
+Protein identifier	Contig id	Start	Stop	Strand	Gene symbol	Sequence name	Scope	Element type	Element subtype	Class	Subclass	Method	Target length	Reference sequence length	% Coverage of reference sequence	% Identity to reference sequence	Alignment length	Accession of closest sequence	Name of closest sequence	HMM id	HMM description
+NA	contig00004	33786	34556	+	blaOXA-193	OXA-61 family class D beta-lactamase OXA-193	core	AMR	AMR	BETA-LACTAM	BETA-LACTAM	ALLELEX	257	257	100.00	100.00	257	WP_002783228.1	OXA-61 family class D beta-lactamase OXA-193	NA	NA
diff --git a/test_data/amrfinderplus_empty.tsv b/test_data/amrfinderplus_empty.tsv
@@ -0,0 +1 @@
+Protein identifier	Contig id	Start	Stop	Strand	Gene symbol	Sequence name	Scope	Element type	Element subtype	Class	Subclass	Method	Target length	Reference sequence length	% Coverage of reference sequence	% Identity to reference sequence	Alignment length	Accession of closest sequence	Name of closest sequence	HMM id	HMM description
diff --git a/test_data/amrfinderplus_long_example.tsv b/test_data/amrfinderplus_long_example.tsv

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+Protein identifier Contig id Start Stop Strand Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description`
	`2`	`+NA contig00004 33786 34556 + blaOXA-193 OXA-61 family class D beta-lactamase OXA-193 core AMR AMR BETA-LACTAM BETA-LACTAM ALLELEX 257 257 100.00 100.00 257 WP_002783228.1 OXA-61 family class D beta-lactamase OXA-193 NA NA`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Protein identifier Contig id Start Stop Strand Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description`