Skip to content

Commit f0e1f47

Browse files
LEBCLEBC
LEBC
authored and
LEBC
committed
amrfinder files
1 parent 8e07af9 commit f0e1f47

File tree

5 files changed

+365
-0
lines changed

5 files changed

+365
-0
lines changed

bifrost_bridge/amrfinderplus.py

+81
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/06_amrfinderplus.ipynb.
2+
3+
# %% auto 0
4+
__all__ = ['process_amrfinderplus_data', 'process_amrfinderplus_data_from_cli']
5+
6+
# %% ../nbs/06_amrfinderplus.ipynb 2
7+
# That export there, it makes sure this code goes into the module.
8+
9+
# standard libs
10+
import os
11+
import re
12+
13+
# Common to template
14+
# add into settings.ini, requirements, package name is python-dotenv, for conda build ensure `conda config --add channels conda-forge`
15+
import dotenv # for loading config from .env files, https://pypi.org/project/python-dotenv/
16+
import envyaml # Allows to loads env vars into a yaml file, https://github.com/thesimj/envyaml
17+
import fastcore # To add functionality related to nbdev development, https://github.com/fastai/fastcore/
18+
from fastcore import (
19+
test,
20+
)
21+
from fastcore.script import (
22+
call_parse,
23+
) # for @call_parse, https://fastcore.fast.ai/script
24+
import json # for nicely printing json and yaml
25+
from fastcore import test
26+
from . import core
27+
28+
# %% ../nbs/06_amrfinderplus.ipynb 6
29+
def process_amrfinderplus_data(
    input_path: str,
    output_path: str = "./output.tsv",
    replace_header: str = None,
    filter_columns: str = None,
    add_header: str = None,
):
    """
    Collapse an amrfinderplus TSV file into a single-row TSV file.

    The input is loaded through `core.DataFrame`. Every column is then reduced to one string holding all of
    its values, in row order, joined by commas, so the result is a table with exactly one row. The header can
    optionally be replaced and the columns filtered before the table is written to `output_path`.

    Arguments:
        input_path (str): Path to the input file.
        output_path (str): Path to the output file (default: './output.tsv').
        replace_header (str): Header to replace the existing header (default: None).
        filter_columns (str): Columns to filter from the header (default: None).
        add_header (str): Header to add if the header does not exist in the input file (default: None).
    """

    df = core.DataFrame()
    df.import_data(input_path, file_type="tsv", add_header=add_header)

    def concatenate_vector(x, sep=","):
        # Stringify each value first so non-string cells can be joined as well.
        # `sep` is honoured here; it was previously accepted but ignored.
        return sep.join(str(i) for i in x)

    # axis=0 hands each column to the helper, giving one joined string per column.
    df_agg = df.df.apply(concatenate_vector, axis=0)
    # Turn the per-column result back into a one-row frame (columns stay columns).
    df.df = df_agg.to_frame().T

    # Header replacement and column filtering act on the collapsed one-row frame.
    if replace_header:
        df.rename_header(replace_header)

    if filter_columns:
        df.filter_columns(filter_columns)

    df.export_data(output_path, file_type="tsv")
69+
70+
71+
@call_parse
def process_amrfinderplus_data_from_cli(
    input_path: str,
    output_path: str = "./output.tsv",
    replace_header: str = None,
    filter_columns: str = None,
    add_header: str = None,
):
    """Command-line entry point that forwards every argument to `process_amrfinderplus_data`."""
    # Forward by keyword so each value is visibly tied to its target parameter;
    # nothing is transformed on the way through.
    process_amrfinderplus_data(
        input_path=input_path,
        output_path=output_path,
        replace_header=replace_header,
        filter_columns=filter_columns,
        add_header=add_header,
    )

nbs/06_amrfinderplus.ipynb

+234
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"# |default_exp amrfinderplus"
10+
]
11+
},
12+
{
13+
"cell_type": "code",
14+
"execution_count": 1,
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"# |hide\n",
19+
"# See above? this hides these blocks, meaning these blocks aren't in the module and aren't in the documentation\n",
20+
"import nbdev\n",
21+
"from nbdev.showdoc import * # ignore this Pylance warning in favor of following nbdev docs"
22+
]
23+
},
24+
{
25+
"cell_type": "code",
26+
"execution_count": 2,
27+
"metadata": {},
28+
"outputs": [],
29+
"source": [
30+
"# |export\n",
31+
"# That export there, it makes sure this code goes into the module.\n",
32+
"\n",
33+
"# standard libs\n",
34+
"import os\n",
35+
"import re\n",
36+
"\n",
37+
"# Common to template\n",
38+
"# add into settings.ini, requirements, package name is python-dotenv, for conda build ensure `conda config --add channels conda-forge`\n",
39+
"import dotenv # for loading config from .env files, https://pypi.org/project/python-dotenv/\n",
40+
"import envyaml # Allows to loads env vars into a yaml file, https://github.com/thesimj/envyaml\n",
41+
"import fastcore # To add functionality related to nbdev development, https://github.com/fastai/fastcore/\n",
42+
"from fastcore import (\n",
43+
" test,\n",
44+
")\n",
45+
"from fastcore.script import (\n",
46+
" call_parse,\n",
47+
") # for @call_parse, https://fastcore.fast.ai/script\n",
48+
"import json # for nicely printing json and yaml\n",
49+
"from fastcore import test\n",
50+
"from bifrost_bridge import core\n"
51+
]
52+
},
53+
{
54+
"cell_type": "markdown",
55+
"metadata": {},
56+
"source": [
57+
"Because the notebooks are now located in the `nbs` folder, we need to change the Python working directory (`wd`) of the notebook to the project folder. Keep this cell in all notebooks, but don't export it to the package."
58+
]
59+
},
60+
{
61+
"cell_type": "code",
62+
"execution_count": 3,
63+
"metadata": {},
64+
"outputs": [],
65+
"source": [
66+
"# This block should never be exported. It is to have python running in the project (and not the nbs) dir, and to initiate the package using pip.\n",
67+
"os.chdir(core.PROJECT_DIR)"
68+
]
69+
},
70+
{
71+
"cell_type": "markdown",
72+
"metadata": {},
73+
"source": [
74+
"##################################################CODE_SEGMENT###########################################"
75+
]
76+
},
77+
{
78+
"cell_type": "code",
79+
"execution_count": 4,
80+
"metadata": {},
81+
"outputs": [],
82+
"source": [
83+
"# |export\n",
84+
"\n",
85+
"def process_amrfinderplus_data(\n",
86+
" input_path:str,\n",
87+
" output_path:str = './output.tsv',\n",
88+
" replace_header:str = None,\n",
89+
" filter_columns:str = None,\n",
90+
" add_header:str = None):\n",
91+
"\n",
92+
" \"\"\"\n",
93+
" Command-line interface for processing amrfinderplus data.\n",
94+
"\n",
95+
" This function sets up an argument parser to handle command-line arguments for processing amrfinderplus data files.\n",
96+
" It supports specifying input and output file paths, replacing headers, filtering columns, and handling the presence or absence of headers in the input file.\n",
97+
"\n",
98+
" Arguments:\n",
99+
" input_path (str): Path to the input file.\n",
100+
" output_path (str): Path to the output file (default: './output.tsv').\n",
101+
" replace_header (str): Header to replace the existing header (default: None).\n",
102+
" filter_columns (str): Columns to filter from the header (default: None).\n",
103+
" header_exists (int): Indicates if the header exists in the input file (default: 1).\n",
104+
" add_header (str): Header to add if the header does not exist in the input file (default: None).\n",
105+
" \"\"\"\n",
106+
"\n",
107+
"\n",
108+
" df = core.DataFrame()\n",
109+
" df.import_data(input_path, file_type='tsv', add_header=add_header)\n",
110+
" #print(df.df.columns)\n",
111+
" def concatenate_vector(x, sep=','):\n",
112+
" return ','.join([str(i) for i in x])\n",
113+
" \n",
114+
" df_agg = df.df.apply(concatenate_vector, axis=0)\n",
115+
" df.df = df_agg.to_frame().T\n",
116+
" if replace_header:\n",
117+
" df.rename_header(replace_header)\n",
118+
"\n",
119+
" if filter_columns:\n",
120+
" df.filter_columns(filter_columns)\n",
121+
" \n",
122+
" #df.show()\n",
123+
"\n",
124+
" df.export_data(output_path, file_type='tsv')\n",
125+
"\n",
126+
"@call_parse\n",
127+
"def process_amrfinderplus_data_from_cli(\n",
128+
" input_path:str,\n",
129+
" output_path:str = './output.tsv',\n",
130+
" replace_header:str = None,\n",
131+
" filter_columns:str = None,\n",
132+
" add_header:str = None):\n",
133+
" process_amrfinderplus_data(input_path, output_path, replace_header, filter_columns, add_header)"
134+
]
135+
},
136+
{
137+
"cell_type": "code",
138+
"execution_count": null,
139+
"metadata": {},
140+
"outputs": [],
141+
"source": [
142+
"#|hide\n",
143+
"#Example usage of the function\n",
144+
"process_amrfinderplus_data(\n",
145+
" input_path='test_data/amrfinderplus.tsv', \n",
146+
" output_path='test_data/amrfinderplus_testout.tsv',\n",
147+
" #filter_columns=\"Query / Template length\"\n",
148+
")"
149+
]
150+
},
151+
{
152+
"cell_type": "code",
153+
"execution_count": null,
154+
"metadata": {},
155+
"outputs": [],
156+
"source": [
157+
"#|hide\n",
158+
"#Example usage of the function\n",
159+
"process_amrfinderplus_data(\n",
160+
" input_path='test_data/amrfinderplus_long_example.tsv', \n",
161+
" output_path='test_data/amrfinderplus_long_example_testout.tsv',\n",
162+
" #filter_columns=\"Query / Template length\"\n",
163+
")"
164+
]
165+
},
166+
{
167+
"cell_type": "code",
168+
"execution_count": 62,
169+
"metadata": {},
170+
"outputs": [
171+
{
172+
"name": "stdout",
173+
"output_type": "stream",
174+
"text": [
175+
" Database Plasmid Identity Query / Template length Contig Position in contig \\\n",
176+
"0 \n",
177+
"\n",
178+
" Note Accession number \n",
179+
"0 \n"
180+
]
181+
}
182+
],
183+
"source": [
184+
"#|hide\n",
185+
"#Example usage of the function\n",
186+
"process_amrfinderplus_data(\n",
187+
" input_path='test_data/amrfinderplus_empty.tsv', \n",
188+
" output_path='test_data/amrfinderplus_empty_testout.tsv',\n",
189+
" #filter_columns=\"Query / Template length\"\n",
190+
")"
191+
]
192+
},
193+
{
194+
"cell_type": "markdown",
195+
"metadata": {},
196+
"source": [
197+
"##################################################CODE_SEGMENT###########################################"
198+
]
199+
},
200+
{
201+
"cell_type": "code",
202+
"execution_count": null,
203+
"metadata": {},
204+
"outputs": [],
205+
"source": [
206+
"#| hide\n",
207+
"# This is included at the end to ensure when you run through your notebook the code is also transferred to the associated python package\n",
208+
"\n",
209+
"nbdev.nbdev_export()"
210+
]
211+
}
212+
],
213+
"metadata": {
214+
"kernelspec": {
215+
"display_name": "base",
216+
"language": "python",
217+
"name": "python3"
218+
},
219+
"language_info": {
220+
"codemirror_mode": {
221+
"name": "ipython",
222+
"version": 3
223+
},
224+
"file_extension": ".py",
225+
"mimetype": "text/x-python",
226+
"name": "python",
227+
"nbconvert_exporter": "python",
228+
"pygments_lexer": "ipython3",
229+
"version": "3.11.10"
230+
}
231+
},
232+
"nbformat": 4,
233+
"nbformat_minor": 2
234+
}

test_data/amrfinderplus.tsv

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Protein identifier Contig id Start Stop Strand Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description
2+
NA contig00004 33786 34556 + blaOXA-193 OXA-61 family class D beta-lactamase OXA-193 core AMR AMR BETA-LACTAM BETA-LACTAM ALLELEX 257 257 100.00 100.00 257 WP_002783228.1 OXA-61 family class D beta-lactamase OXA-193 NA NA

test_data/amrfinderplus_empty.tsv

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Protein identifier Contig id Start Stop Strand Gene symbol Sequence name Scope Element type Element subtype Class Subclass Method Target length Reference sequence length % Coverage of reference sequence % Identity to reference sequence Alignment length Accession of closest sequence Name of closest sequence HMM id HMM description

0 commit comments

Comments
 (0)