-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcoverage.sh
51 lines (39 loc) · 1.79 KB
/
coverage.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/bin/bash
# Check if the correct number of arguments is provided
if [ "$#" -ne 3 ]; then
echo "Usage: $0 <input_directory> <output_directory> <region_of_interest_bases>"
exit 1
fi
# Define the input directory, output directory, and region of interest bases
INPUT_DIR="$1"
OUTPUT_DIR="$2"
ROI_BASES="$3"
# Create the output directory if it does not exist
mkdir -p "$OUTPUT_DIR"
# Output file
OUTPUT_FILE="$OUTPUT_DIR/read_counts_with_paths_bases_coverage.txt"
# Initialize the output file
echo -e "File_Path\tRead_Count\tBase_Count\tCoverage" > $OUTPUT_FILE
# Check if the ROI_BASES is a valid number
if ! [[ "$ROI_BASES" =~ ^[0-9]+$ ]]; then
echo "Error: Region of interest bases must be a number."
exit 1
fi
# Loop through all fastq files in the directory and its subdirectories
find "$INPUT_DIR" -type f \( -name "*.fastq.gz" -o -name "*.fastq" \) | while read -r file; do
# Check if the file is a .fastq file and compress it to .fastq.gz
if [[ "$file" == *.fastq ]]; then
pigz -p 128 "$file"
file="$file.gz" # Update the file path to the new .fastq.gz file
fi
# Count the number of reads (each read is represented by 4 lines in fastq format)
count=$(zcat "$file" | wc -l)
reads=$((count / 4))
# Calculate the number of bases by summing the lengths of sequence lines
bases=$(zcat "$file" | awk 'NR % 4 == 2 {total += length($0)} END {print total}')
# Calculate coverage (bases divided by user-defined region of interest bases)
coverage=$(awk "BEGIN {printf \"%.6f\", $bases / $ROI_BASES}")
# Write the full file path, read count, base count, and coverage to the output file
echo -e "$file\t$reads\t$bases\t$coverage" >> $OUTPUT_FILE
done
echo "Read counts, base counts, and coverage with file paths have been saved to $OUTPUT_FILE."