Skip to content

Commit 23d09b5

Browse files
committed Jan 15, 2025
bifrost update
1 parent 5468b9f commit 23d09b5

9 files changed: 187 additions and 26 deletions.
 

‎bifrost_bridge/bifrost.py

+82-9
Original file line numberDiff line numberDiff line change
@@ -31,35 +31,50 @@
3131
from .mlst import process_mlst_data
3232
from .fastp import process_fastp_data
3333
from .quast import process_quast_data
34+
from .plasmidfinder import process_plasmidfinder_data
35+
from .bracken import process_bracken_data
36+
import pandas as pd
37+
38+
# from bifrost_bridge.amrfinder import process_amrfinder_data
39+
# from bifrost_bridge.pmlst import process_pmlst_data
3440

3541

3642
@call_parse
3743
def process_qc_data(
3844
mlst_path: str = None,
3945
fastp_path: str = None,
4046
quast_path: str = None,
47+
plasmidfinder_path: str = None,
48+
bracken_path: str = None,
49+
amrfinder_path: str = None,
50+
pmlst_path: str = None,
51+
combine_output: bool = True,
4152
output_path: str = "./output.tsv",
4253
):
4354
"""
4455
Command-line interface for processing QC data.
4556
46-
This function processes MLST and FASTP data files based on the provided command-line arguments.
47-
It supports specifying input file paths for MLST and FASTP data, and outputs the processed data to specified paths.
57+
This function processes MLST, FASTP, QUAST, PlasmidFinder, and Bracken data files based on the provided command-line arguments.
58+
It supports specifying input file paths for MLST, FASTP, QUAST, PlasmidFinder, and Bracken data, and outputs the processed data to specified paths.
4859
4960
Arguments:
50-
mlst (str): Path to the MLST input file.
51-
fastp (str): Path to the FASTP input file.
52-
output (str): Path to the output file (default: './output.tsv').
61+
mlst_path (str): Path to the MLST input file.
62+
fastp_path (str): Path to the FASTP input file.
63+
quast_path (str): Path to the QUAST input file.
64+
plasmidfinder_path (str): Path to the PlasmidFinder input file.
65+
bracken_path (str): Path to the Bracken input file.
66+
amrfinder_path (str): Path to the AMRFinder input file.
67+
pmlst_path (str): Path to the PMLST input file.
68+
output_path (str): Path to the output file (default: './output.tsv').
5369
"""
5470
if mlst_path is not None:
5571
if not os.path.exists(mlst_path):
5672
raise FileNotFoundError(f"File not found: {mlst_path}")
5773
process_mlst_data(
5874
input_path=str(mlst_path),
59-
output_path="./parsed_mlst.tsv",
75+
output_path="test_data/bifrost/parsed_mlst.tsv",
6076
replace_header=None,
6177
filter_columns="SampleID, Species, ST",
62-
header_exists=0,
6378
add_header="SampleID, Species, ST, 1, 2, 3, 4, 5, 6, 7",
6479
)
6580

@@ -68,7 +83,7 @@ def process_qc_data(
6883
raise FileNotFoundError(f"File not found: {fastp_path}")
6984
process_fastp_data(
7085
input_path=fastp_path,
71-
output_path="./parsed_fastp.tsv",
86+
output_path="test_data/bifrost/parsed_fastp.tsv",
7287
filter_columns="summary£fastp_version, summary£sequencing, summary£before_filtering£total_reads",
7388
replace_header="fastp_version, sequencing, total_reads",
7489
)
@@ -78,7 +93,65 @@ def process_qc_data(
7893
raise FileNotFoundError(f"File not found: {quast_path}")
7994
process_quast_data(
8095
input_path=quast_path,
81-
output_path="./parsed_quast.tsv",
96+
output_path="test_data/bifrost/parsed_quast.tsv",
8297
filter_columns="Assembly,# contigs (>= 0 bp), N50",
8398
transpose=True,
8499
)
100+
101+
if plasmidfinder_path is not None:
102+
if not os.path.exists(plasmidfinder_path):
103+
raise FileNotFoundError(f"File not found: {plasmidfinder_path}")
104+
process_plasmidfinder_data(
105+
input_path=plasmidfinder_path,
106+
output_path="test_data/bifrost/parsed_plasmidfinder.tsv",
107+
)
108+
109+
if bracken_path is not None:
110+
if not os.path.exists(bracken_path):
111+
raise FileNotFoundError(f"File not found: {bracken_path}")
112+
process_bracken_data(
113+
input_path=bracken_path,
114+
output_path="test_data/bifrost/parsed_bracken.tsv",
115+
)
116+
117+
if amrfinder_path is not None:
118+
if not os.path.exists(amrfinder_path):
119+
raise FileNotFoundError(f"File not found: {amrfinder_path}")
120+
# process_amrfinder_data(
121+
# input_path=amrfinder_path,
122+
# output_path='test_data/bifrost/parsed_amrfinder.tsv',
123+
# filter_columns='Assembly,# contigs (>= 0 bp), N50',
124+
# transpose=True
125+
# )
126+
127+
if pmlst_path is not None:
128+
if not os.path.exists(pmlst_path):
129+
raise FileNotFoundError(f"File not found: {pmlst_path}")
130+
# process_pmlst_data(
131+
# input_path=pmlst_path,
132+
# output_path='test_data/bifrost/parsed_pmlst.tsv',
133+
# filter_columns='Assembly,# contigs (>= 0 bp), N50',
134+
# transpose=True
135+
# )
136+
137+
if combine_output:
138+
# List of output files that were actually created
139+
output_files = []
140+
if mlst_path is not None:
141+
output_files.append("test_data/bifrost/parsed_mlst.tsv")
142+
if fastp_path is not None:
143+
output_files.append("test_data/bifrost/parsed_fastp.tsv")
144+
if quast_path is not None:
145+
output_files.append("test_data/bifrost/parsed_quast.tsv")
146+
if plasmidfinder_path is not None:
147+
output_files.append("test_data/bifrost/parsed_plasmidfinder.tsv")
148+
if bracken_path is not None:
149+
output_files.append("test_data/bifrost/parsed_bracken.tsv")
150+
151+
# Read and concatenate all output files
152+
combined_df = pd.concat(
153+
[pd.read_csv(file, sep="\t") for file in output_files], axis=1
154+
)
155+
156+
# Save the combined dataframe to the specified output path
157+
combined_df.to_csv(output_path, sep="\t", index=False)

‎nbs/06_bracken.ipynb

+2-2
Original file line numberDiff line numberDiff line change
@@ -154,8 +154,8 @@
154154
"# |hide\n",
155155
"# Example usage of the function\n",
156156
"process_bracken_data(\n",
157-
" input_path='/Users/B246654/vscode_storage/ssi-dk/bifrost_bridge/test_data/bracken_krakenreport.txt', \n",
158-
" output_path='/Users/B246654/vscode_storage/ssi-dk/bifrost_bridge/test_data/bracken_test_out.tabular'\n",
157+
" input_path='test_data/bracken_krakenreport.txt', \n",
158+
" output_path='test_data/bracken_test_out.tabular'\n",
159159
")"
160160
]
161161
},

‎nbs/99_bifrost.ipynb

+95-15
Original file line numberDiff line numberDiff line change
@@ -80,34 +80,48 @@
8080
"from bifrost_bridge.mlst import process_mlst_data\n",
8181
"from bifrost_bridge.fastp import process_fastp_data\n",
8282
"from bifrost_bridge.quast import process_quast_data\n",
83+
"from bifrost_bridge.plasmidfinder import process_plasmidfinder_data\n",
84+
"from bifrost_bridge.bracken import process_bracken_data\n",
85+
"import pandas as pd\n",
86+
"#from bifrost_bridge.amrfinder import process_amrfinder_data\n",
87+
"#from bifrost_bridge.pmlst import process_pmlst_data\n",
8388
"\n",
8489
"@call_parse\n",
8590
"def process_qc_data(\n",
8691
" mlst_path:str = None,\n",
8792
" fastp_path:str = None,\n",
8893
" quast_path:str = None,\n",
94+
" plasmidfinder_path:str = None,\n",
95+
" bracken_path:str = None,\n",
96+
" amrfinder_path:str = None,\n",
97+
" pmlst_path:str = None,\n",
98+
" combine_output:bool = True,\n",
8999
" output_path:str = './output.tsv'):\n",
90100
"\n",
91101
" \"\"\"\n",
92102
" Command-line interface for processing QC data.\n",
93103
"\n",
94-
" This function processes MLST and FASTP data files based on the provided command-line arguments.\n",
95-
" It supports specifying input file paths for MLST and FASTP data, and outputs the processed data to specified paths.\n",
104+
" This function processes MLST, FASTP, QUAST, PlasmidFinder, and Bracken data files based on the provided command-line arguments.\n",
105+
" It supports specifying input file paths for MLST, FASTP, QUAST, PlasmidFinder, and Bracken data, and outputs the processed data to specified paths.\n",
96106
"\n",
97107
" Arguments:\n",
98-
" mlst (str): Path to the MLST input file.\n",
99-
" fastp (str): Path to the FASTP input file.\n",
100-
" output (str): Path to the output file (default: './output.tsv').\n",
108+
" mlst_path (str): Path to the MLST input file.\n",
109+
" fastp_path (str): Path to the FASTP input file.\n",
110+
" quast_path (str): Path to the QUAST input file.\n",
111+
" plasmidfinder_path (str): Path to the PlasmidFinder input file.\n",
112+
" bracken_path (str): Path to the Bracken input file.\n",
113+
" amrfinder_path (str): Path to the AMRFinder input file.\n",
114+
" pmlst_path (str): Path to the PMLST input file.\n",
115+
" output_path (str): Path to the output file (default: './output.tsv').\n",
101116
" \"\"\"\n",
102117
" if mlst_path is not None:\n",
103118
" if not os.path.exists(mlst_path):\n",
104119
" raise FileNotFoundError(f\"File not found: {mlst_path}\")\n",
105120
" process_mlst_data(\n",
106121
" input_path=str(mlst_path), \n",
107-
" output_path='./parsed_mlst.tsv',\n",
122+
" output_path='test_data/bifrost/parsed_mlst.tsv',\n",
108123
" replace_header=None, \n",
109124
" filter_columns=\"SampleID, Species, ST\",\n",
110-
" header_exists=0,\n",
111125
" add_header=\"SampleID, Species, ST, 1, 2, 3, 4, 5, 6, 7\"\n",
112126
" )\n",
113127
"\n",
@@ -116,7 +130,7 @@
116130
" raise FileNotFoundError(f\"File not found: {fastp_path}\")\n",
117131
" process_fastp_data(\n",
118132
" input_path=fastp_path, \n",
119-
" output_path='./parsed_fastp.tsv',\n",
133+
" output_path='test_data/bifrost/parsed_fastp.tsv',\n",
120134
" filter_columns=\"summary£fastp_version, summary£sequencing, summary£before_filtering£total_reads\",\n",
121135
" replace_header=\"fastp_version, sequencing, total_reads\"\n",
122136
" )\n",
@@ -126,10 +140,66 @@
126140
" raise FileNotFoundError(f\"File not found: {quast_path}\")\n",
127141
" process_quast_data(\n",
128142
" input_path=quast_path, \n",
129-
" output_path='./parsed_quast.tsv',\n",
143+
" output_path='test_data/bifrost/parsed_quast.tsv',\n",
130144
" filter_columns='Assembly,# contigs (>= 0 bp), N50',\n",
131145
" transpose=True\n",
132-
" )"
146+
" )\n",
147+
" \n",
148+
" if plasmidfinder_path is not None:\n",
149+
" if not os.path.exists(plasmidfinder_path):\n",
150+
" raise FileNotFoundError(f\"File not found: {plasmidfinder_path}\")\n",
151+
" process_plasmidfinder_data(\n",
152+
" input_path=plasmidfinder_path, \n",
153+
" output_path='test_data/bifrost/parsed_plasmidfinder.tsv',\n",
154+
" )\n",
155+
"\n",
156+
" if bracken_path is not None:\n",
157+
" if not os.path.exists(bracken_path):\n",
158+
" raise FileNotFoundError(f\"File not found: {bracken_path}\")\n",
159+
" process_bracken_data(\n",
160+
" input_path=bracken_path, \n",
161+
" output_path='test_data/bifrost/parsed_bracken.tsv',\n",
162+
" )\n",
163+
"\n",
164+
" if amrfinder_path is not None:\n",
165+
" if not os.path.exists(amrfinder_path):\n",
166+
" raise FileNotFoundError(f\"File not found: {amrfinder_path}\")\n",
167+
" #process_amrfinder_data(\n",
168+
" # input_path=amrfinder_path, \n",
169+
" # output_path='test_data/bifrost/parsed_amrfinder.tsv',\n",
170+
" # filter_columns='Assembly,# contigs (>= 0 bp), N50',\n",
171+
" # transpose=True\n",
172+
" #)\n",
173+
"\n",
174+
" if pmlst_path is not None:\n",
175+
" if not os.path.exists(pmlst_path):\n",
176+
" raise FileNotFoundError(f\"File not found: {pmlst_path}\")\n",
177+
" #process_pmlst_data(\n",
178+
" # input_path=pmlst_path, \n",
179+
" # output_path='test_data/bifrost/parsed_pmlst.tsv',\n",
180+
" # filter_columns='Assembly,# contigs (>= 0 bp), N50',\n",
181+
" # transpose=True\n",
182+
" #)\n",
183+
" \n",
184+
" if combine_output:\n",
185+
" # List of output files that were actually created\n",
186+
" output_files = []\n",
187+
" if mlst_path is not None:\n",
188+
" output_files.append('test_data/bifrost/parsed_mlst.tsv')\n",
189+
" if fastp_path is not None:\n",
190+
" output_files.append('test_data/bifrost/parsed_fastp.tsv')\n",
191+
" if quast_path is not None:\n",
192+
" output_files.append('test_data/bifrost/parsed_quast.tsv')\n",
193+
" if plasmidfinder_path is not None:\n",
194+
" output_files.append('test_data/bifrost/parsed_plasmidfinder.tsv')\n",
195+
" if bracken_path is not None:\n",
196+
" output_files.append('test_data/bifrost/parsed_bracken.tsv')\n",
197+
"\n",
198+
" # Read and concatenate all output files\n",
199+
" combined_df = pd.concat([pd.read_csv(file, sep='\\t') for file in output_files], axis=1)\n",
200+
"\n",
201+
" # Save the combined dataframe to the specified output path\n",
202+
" combined_df.to_csv(output_path, sep='\\t', index=False)"
133203
]
134204
},
135205
{
@@ -144,17 +214,27 @@
144214
" SampleID Species ST\n",
145215
"0 TestSample2 campylobacter 22\n",
146216
" fastp_version sequencing total_reads\n",
147-
"0 0.23.4 paired end (151 cycles + 151 cycles) 4369610\n"
217+
"0 0.23.4 paired end (151 cycles + 151 cycles) 4369610\n",
218+
"\n",
219+
" Assembly # contigs (>= 0 bp) N50\n",
220+
"values Shovill_on_data_39_and_data_38__Contigs 942 56837\n"
148221
]
149222
}
150223
],
151224
"source": [
152225
"# |hide\n",
153226
"# Example usage of the function\n",
154-
"#process_qc_data(\n",
155-
"# mlst_path='/Users/B246654/vscode_storage/ssi-dk/bifrost_bridge/test_data/mlst_report.tabular', \n",
156-
"# fastp_path='/Users/B246654/vscode_storage/ssi-dk/bifrost_bridge/test_data/TestSample2.json'\n",
157-
"#)"
227+
"process_qc_data(\n",
228+
" mlst_path='test_data/mlst_report.tabular', \n",
229+
" fastp_path='test_data/TestSample2.json',\n",
230+
" quast_path='test_data/quast.tsv',\n",
231+
" plasmidfinder_path='test_data/plasmidfinder.tsv',\n",
232+
" bracken_path='test_data/bracken_krakenreport.txt',\n",
233+
" amrfinder_path='test_data/TestSample2.json',\n",
234+
" pmlst_path='test_data/TestSample2.json',\n",
235+
" combine_output = True,\n",
236+
" output_path = 'test_data/bifrost/output.tsv'\n",
237+
")"
158238
]
159239
},
160240
{

‎test_data/bifrost/output.tsv

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
SampleID Species ST fastp_version sequencing total_reads Assembly # contigs (>= 0 bp) N50 Database Plasmid Identity Query / Template length Contig Position in contig Note Accession number species1_unclassified_name species1_unclassified_pct species1_name species1_pct species2_name species2_pct unclassified_name unclassified_pct
2+
TestSample2 campylobacter 22 0.23.4 paired end (151 cycles + 151 cycles) 4369610 Shovill_on_data_39_and_data_38__Contigs 942 56837 enterobacteriales,enterobacteriales IncB/O/K/Z,IncFIB(AP001918) 100.0,96.92 152 / 152,682 / 682 contig00135 len=6350 cov=57.2 corr=0 origname=NODE_135_length_6350_cov_57.224477_pilon sw=shovill-spades/1.1.0 date=20240829,contig00195 len=2394 cov=63.9 corr=0 origname=NODE_195_length_2394_cov_63.941381_pilon sw=shovill-spades/1.1.0 date=20240829 3607..3758,558..1239 nan,nan GQ259888,AP001918 Actinobacillus pleuropneumoniae + unclassified 98.2 Actinobacillus pleuropneumoniae 92.05 Actinobacillus equuli 0.83 unclassified 6.15

‎test_data/bifrost/parsed_bracken.tsv

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
species1_unclassified_name species1_unclassified_pct species1_name species1_pct species2_name species2_pct unclassified_name unclassified_pct
2+
Actinobacillus pleuropneumoniae + unclassified 98.2 Actinobacillus pleuropneumoniae 92.05 Actinobacillus equuli 0.83 unclassified 6.15
File renamed without changes.
File renamed without changes.
test_data/bifrost/parsed_plasmidfinder.tsv

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Database Plasmid Identity Query / Template length Contig Position in contig Note Accession number
2+
enterobacteriales,enterobacteriales IncB/O/K/Z,IncFIB(AP001918) 100.0,96.92 152 / 152,682 / 682 contig00135 len=6350 cov=57.2 corr=0 origname=NODE_135_length_6350_cov_57.224477_pilon sw=shovill-spades/1.1.0 date=20240829,contig00195 len=2394 cov=63.9 corr=0 origname=NODE_195_length_2394_cov_63.941381_pilon sw=shovill-spades/1.1.0 date=20240829 3607..3758,558..1239 nan,nan GQ259888,AP001918

‎test_data/bifrost/parsed_quast.tsv

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Assembly # contigs (>= 0 bp) N50
2+
Shovill_on_data_39_and_data_38__Contigs 942 56837

0 commit comments

Comments
 (0)
Please sign in to comment.