Skip to content

Commit eff6db9

Browse files
committed
Add --fasta; print low quality bases as lowercase
1 parent aea7c8c commit eff6db9

File tree

1 file changed

+84
-18
lines changed

1 file changed

+84
-18
lines changed

src/abi2fq.nim

Lines changed: 84 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
import std/[os, strformat, strutils, parseopt]
22
import ./abif
33

4-
## This module provides a command-line tool for converting ABIF files to FASTQ format
4+
## This module provides a command-line tool for converting ABIF files to FASTQ or FASTA format
55
## with optional quality trimming.
66
##
77
## The abi2fq tool extracts sequence and quality data from ABIF files,
88
## applies quality trimming to remove low-quality regions, and outputs
9-
## in the standard FASTQ format.
9+
## in the standard FASTQ format or FASTA format (if --fasta is specified).
1010
##
1111
## Command-line usage:
1212
##
@@ -20,6 +20,7 @@ import ./abif
2020
## -n, --no-trim Disable quality trimming
2121
## -v, --verbose Print additional information
2222
## --version Show version information
23+
## --fasta Output in FASTA format instead of FASTQ
2324
##
2425
## Examples:
2526
##
@@ -32,6 +33,9 @@ import ./abif
3233
##
3334
## # Convert with custom quality parameters
3435
## abi2fq -w 20 -q 30 input.ab1 output.fastq
36+
##
37+
## # Convert to FASTA format
38+
## abi2fq --fasta input.ab1 output.fasta
3539

3640
type
3741
Config* = object
@@ -44,6 +48,7 @@ type
4448
noTrim*: bool ## Whether to disable quality trimming
4549
verbose*: bool ## Whether to show verbose output
4650
showVersion*: bool ## Whether to show version information
51+
fasta*: bool ## Whether to output in FASTA format instead of FASTQ
4752

4853
proc printHelp*() =
4954
## Displays the help message for the abi2fq tool.
@@ -61,6 +66,7 @@ Options:
6166
-n, --no-trim Disable quality trimming
6267
-v, --verbose Print additional information
6368
--version Show version information
69+
--fasta Output in FASTA format instead of FASTQ
6470
6571
If output file is not specified, FASTQ will be written to STDOUT.
6672
"""
@@ -83,7 +89,8 @@ proc parseCommandLine*(): Config =
8389
qualityThreshold: 20,
8490
noTrim: false,
8591
verbose: false,
86-
showVersion: false
92+
showVersion: false,
93+
fasta: false
8794
)
8895

8996
var fileArgs: seq[string] = @[]
@@ -116,6 +123,8 @@ proc parseCommandLine*(): Config =
116123
result.verbose = true
117124
of "version":
118125
result.showVersion = true
126+
of "fasta":
127+
result.fasta = true
119128
else:
120129
echo "Unknown option: ", key
121130
printHelp()
@@ -183,43 +192,54 @@ proc trimSequence*(sequence: string, qualities: seq[int],
183192
result.seq = sequence[startPos ..< endPos]
184193
result.qual = qualities[startPos ..< endPos]
185194

186-
proc writeFastq*(sequence: string, qualities: seq[int], name: string, outFile: string = "") =
187-
## Writes sequence and quality data to a FASTQ file.
195+
proc writeFastq*(sequence: string, qualities: seq[int], name: string, outFile: string = "", fasta: bool = false) =
196+
## Writes sequence and quality data to a FASTQ or FASTA file.
188197
##
189-
## If outFile is empty, the FASTQ data is written to stdout.
198+
## If outFile is empty, the data is written to stdout.
199+
## If fasta is true, the output will be in FASTA format instead of FASTQ.
190200
##
191201
## Parameters:
192202
## sequence: The DNA sequence to write
193203
## qualities: Quality scores for each base in the sequence
194-
## name: The sample name for the FASTQ header
204+
## name: The sample name for the header
195205
## outFile: Path to the output file (empty string for stdout)
196-
# Convert quality values to Phred+33 format
197-
var qualityString = ""
198-
for qv in qualities:
199-
qualityString.add(chr(qv + 33))
206+
## fasta: Whether to output in FASTA format instead of FASTQ
200207

201-
let fastqContent = &"@{name}\n{sequence}\n+\n{qualityString}"
208+
var content: string
209+
if fasta:
210+
# Create FASTA format
211+
content = &">{name}\n{sequence}"
212+
else:
213+
# Create FASTQ format
214+
var qualityString = ""
215+
for qv in qualities:
216+
qualityString.add(chr(qv + 33))
217+
content = &"@{name}\n{sequence}\n+\n{qualityString}"
202218

203219
if outFile == "":
204220
# Write to stdout
205-
stdout.write(fastqContent & "\n")
221+
stdout.write(content & "\n")
206222
else:
207223
# Write to file
208-
writeFile(outFile, fastqContent & "\n")
224+
writeFile(outFile, content & "\n")
209225

210226
proc main*() =
211227
## Main entry point for the abi2fq program.
212228
##
213229
## Handles command-line parsing, reads the input ABIF file,
214230
## performs quality trimming if enabled, and outputs the result
215-
## in FASTQ format.
231+
## in FASTQ or FASTA format (depending on the --fasta option).
216232
let config = parseCommandLine()
217233

218234
if config.verbose:
219235
echo &"Processing file: {config.inFile}"
220236
echo &"Window size: {config.windowSize}"
221237
echo &"Quality threshold: {config.qualityThreshold}"
222238
echo &"Trimming: {not config.noTrim}"
239+
if config.fasta:
240+
echo "Output format: FASTA"
241+
else:
242+
echo "Output format: FASTQ"
223243

224244
try:
225245
let trace = newABIFTrace(config.inFile)
@@ -236,8 +256,54 @@ proc main*() =
236256
quit(1)
237257

238258
if config.noTrim:
239-
# No trimming, use original sequence
240-
writeFastq(sequence, qualities, sampleName, config.outFile)
259+
# No trimming, but identify sections that would be trimmed and make them lowercase
260+
let trimmed = trimSequence(sequence, qualities, config.windowSize, config.qualityThreshold)
261+
262+
if config.verbose:
263+
if trimmed.seq.len == 0:
264+
echo "Warning: Entire sequence was below quality threshold"
265+
elif trimmed.seq.len < sequence.len:
266+
echo &"Sections that would be trimmed: {sequence.len - trimmed.seq.len} bases"
267+
268+
# Get indices for low quality regions
269+
var modifiedSeq = ""
270+
if trimmed.seq.len == 0: # All sequence is below threshold
271+
modifiedSeq = sequence.toLowerAscii()
272+
else:
273+
# Find start position (same logic as in trimSequence)
274+
var startPos = 0
275+
for i in 0 .. (sequence.len - config.windowSize):
276+
var windowSum = 0
277+
for j in 0 ..< config.windowSize:
278+
windowSum += qualities[i + j]
279+
280+
let windowAvg = windowSum / config.windowSize
281+
if windowAvg >= config.qualityThreshold.float:
282+
startPos = i
283+
break
284+
285+
# Find end position (same logic as in trimSequence)
286+
var endPos = sequence.len
287+
for i in countdown(sequence.len - config.windowSize, 0):
288+
var windowSum = 0
289+
for j in 0 ..< config.windowSize:
290+
windowSum += qualities[i + j]
291+
292+
let windowAvg = windowSum / config.windowSize
293+
if windowAvg >= config.qualityThreshold.float:
294+
endPos = i + config.windowSize
295+
break
296+
297+
# Make trimmed regions lowercase
298+
if startPos > 0:
299+
modifiedSeq.add(sequence[0 ..< startPos].toLowerAscii())
300+
301+
modifiedSeq.add(sequence[startPos ..< endPos])
302+
303+
if endPos < sequence.len:
304+
modifiedSeq.add(sequence[endPos ..< sequence.len].toLowerAscii())
305+
306+
writeFastq(modifiedSeq, qualities, sampleName, config.outFile, config.fasta)
241307
else:
242308
# Trim low quality ends
243309
let trimmed = trimSequence(sequence, qualities, config.windowSize, config.qualityThreshold)
@@ -247,7 +313,7 @@ proc main*() =
247313
if trimmed.seq.len == 0:
248314
echo "Warning: Entire sequence was below quality threshold"
249315

250-
writeFastq(trimmed.seq, trimmed.qual, sampleName, config.outFile)
316+
writeFastq(trimmed.seq, trimmed.qual, sampleName, config.outFile, config.fasta)
251317

252318
trace.close()
253319
except:

0 commit comments

Comments
 (0)