-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
7d23cae
commit 1e7450c
Showing
7 changed files
with
600 additions
and
0 deletions.
There are no files selected for viewing
96 changes: 96 additions & 0 deletions
96
pipelines/nextflow/modules/star_alignment/download_paired_fastqs.nf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
#!/usr/bin/env nextflow | ||
/* | ||
See the NOTICE file distributed with this work for additional information | ||
regarding copyright ownership. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
import groovy.json.JsonSlurper | ||
import org.apache.commons.codec.digest.DigestUtils | ||
import java.nio.file.Files | ||
import java.nio.file.Path | ||
|
||
|
||
process DOWNLOAD_PAIRED_FASTQS { | ||
label "default" | ||
tag "${taxon_id}:${run_accession}" | ||
maxForks 25 | ||
//storeDir "${params.outDir}/$taxon_id/$run_accession" | ||
afterScript "sleep $params.files_latency" // Needed because of file system latency | ||
|
||
input: | ||
tuple val(taxon_id), val(gca), val(run_accession), val(url1), val(url2), val(md5_1), val(md5_2), val(genomeDir) | ||
|
||
output: | ||
tuple val(taxon_id), val(gca), val(run_accession), val("${params.outDir}/$taxon_id/$run_accession/${run_accession}_1.fastq.gz"), val("${params.outDir}/$taxon_id/$run_accession/${run_accession}_2.fastq.gz" ), val(genomeDir) | ||
|
||
|
||
script: | ||
if(!url1 || !url2 || !md5_1 || !md5_2){ | ||
log.error("Metadata corrupted") | ||
}else{ | ||
def pair1Path = "${params.outDir}/$taxon_id/$run_accession/${run_accession}_1.fastq.gz" | ||
def pair2Path = "${params.outDir}/$taxon_id/$run_accession/${run_accession}_2.fastq.gz" | ||
def retryCount = 0 | ||
def maxRetries = 3 | ||
def md5Match = false | ||
def filesValid = false | ||
def dirPath = file("${params.outDir}/${taxon_id}/${run_accession}") | ||
// List and filter .gz files | ||
def gzFiles = dirPath.listFiles().findAll { it.name.endsWith('.gz') } | ||
|
||
log.info("gzFiles ${gzFiles}") | ||
if (gzFiles.size() !=2 ){ | ||
filesValid = true | ||
|
||
while (!md5Match && retryCount < maxRetries) { | ||
""" | ||
wget -qq -c -O ${pair1Path} ftp://${url1} | ||
""".execute().waitFor() | ||
|
||
""" | ||
wget -qq -c -O ${pair2Path} ftp://${url2} | ||
""".execute().waitFor() | ||
|
||
// Calculate MD5 checksums of downloaded files | ||
md5Pair1 = DigestUtils.md5Hex(Files.newInputStream(file(pair1Path))) | ||
md5Pair2 = DigestUtils.md5Hex(Files.newInputStream(file(pair2Path))) | ||
|
||
// Check if both MD5 checksums are present in stored MD5 checksums | ||
if ([md5_1, md5_2].containsAll([md5Pair1, md5Pair2])) { | ||
md5Match = true | ||
log.info("MD5 checksums match!") | ||
} else { | ||
log.info("MD5 checksums do not match! Retrying...") | ||
""" | ||
rm ${pair1Path} | ||
""".execute().waitFor() | ||
""" | ||
rm ${pair2Path} | ||
""".execute().waitFor() | ||
retryCount++ | ||
Thread.sleep(1000) // Wait for 1 second before retrying | ||
} | ||
} | ||
|
||
if (!md5Match) { | ||
log.info("MD5 checksums do not match after $maxRetries retries!") | ||
} | ||
} | ||
} | ||
|
||
""" | ||
echo "Download completed" | ||
""" | ||
} |
44 changes: 44 additions & 0 deletions
44
pipelines/nextflow/modules/star_alignment/extract_uniquely_mapped_reads_percentage.nf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
#!/usr/bin/env nextflow | ||
/* | ||
See the NOTICE file distributed with this work for additional information | ||
regarding copyright ownership. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
include { getDataFromTable } from '../utils.nf' | ||
|
||
process EXTRACT_UNIQUELY_MAPPED_READS_PERCENTAGE { | ||
label 'python' | ||
tag "$run_accession" | ||
publishDir "${params.outDir}/$taxon_id/$run_accession/star/", mode: 'copy' | ||
afterScript "sleep $params.files_latency" // Needed because of file system latency | ||
|
||
input: | ||
tuple val(taxon_id), val(gca), val(run_accession), val(starOutputFile) | ||
|
||
|
||
output: | ||
tuple val(taxon_id), val(gca), val(run_accession), path("insert_into_align.json") | ||
|
||
script: | ||
def run_id = getDataFromTable("run_id", "run", "run_accession", run_accession)[0].run_id | ||
def star_dir = new File(starOutputFile).parent | ||
""" | ||
chmod +x $projectDir/bin/parse_star_output.py | ||
parse_star_output.py --file_path ${starOutputFile} \ | ||
--extra_parameters "{'run_id': '${run_id}', 'assembly_accession': '${gca}'}" \ | ||
--uniquely_mapped_reads_percentage --percentage_reads_mapped_to_multiple_loci --percentage_reads_unmapped_too_short | ||
cp insert_into_align.json ${params.outDir}/${taxon_id}/${run_accession}/star/ | ||
""" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#!/usr/bin/env nextflow | ||
/* | ||
See the NOTICE file distributed with this work for additional information | ||
regarding copyright ownership. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
process FETCH_GENOME { | ||
tag "$taxon_id:$gca" | ||
label 'fetch_file' | ||
|
||
//storeDir "${params.outDir}/$taxon_id/$gca/ncbi_dataset/" | ||
afterScript "sleep $params.files_latency" // Needed because of file system latency | ||
maxForks 10 | ||
|
||
input: | ||
tuple val(taxon_id), val(gca), val(run_accession), val(url1), val(url2), val(md5_1), val(md5_2) | ||
|
||
output: | ||
tuple val(taxon_id), val(gca), val(run_accession), val(url1), val(url2), val(md5_1), val(md5_2), val("${params.outDir}/$taxon_id/$gca/ncbi_dataset/") | ||
|
||
script: | ||
""" | ||
if [ ! -d "${params.outDir}/${taxon_id}/${gca}/ncbi_dataset/" ]; then | ||
echo "Directory ncbi_dataset does not exist. Proceeding with download..." | ||
curl --retry 3 -X GET "${params.ncbiBaseUrl}/${gca}/download?include_annotation_type=GENOME_FASTA&hydrated=FULLY_HYDRATED" -H "Accept: application/zip" --output genome_file.zip | ||
unzip -j genome_file.zip | ||
mkdir -p ${params.outDir}/${taxon_id}/${gca}/ncbi_dataset && cp *.fna ${params.outDir}/${taxon_id}/${gca}/ncbi_dataset | ||
else | ||
echo "Directory ncbi_dataset already exists. Skipping download." | ||
fi | ||
""" | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#!/usr/bin/env nextflow | ||
/* | ||
See the NOTICE file distributed with this work for additional information | ||
regarding copyright ownership. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
process RUN_STAR { | ||
tag "$run_accession" | ||
label 'samtools' | ||
publishDir "${params.outDir}/$taxon_id/$run_accession/star/", mode: 'copy' | ||
afterScript "sleep $params.files_latency" // Needed because of file system latency | ||
|
||
input: | ||
tuple val(taxon_id), val(gca), val(run_accession), val(bamFile) | ||
|
||
output: | ||
tuple val(taxon_id), val(gca), val(run_accession), val(bamFile) | ||
|
||
script: | ||
|
||
""" | ||
rm -rf ${starTmpDir} | ||
STAR --limitSjdbInsertNsj 2000000 \ | ||
--outFilterIntronMotifs RemoveNoncanonicalUnannotated \ | ||
--outSAMstrandField intronMotif --runThreadN ${task.cpus} \ | ||
--twopassMode Basic --runMode alignReads --genomeDir ${file(genomeDir)} \ | ||
--readFilesIn ${pair1} ${pair2} --outFileNamePrefix ${outFileNamePrefix} \ | ||
--outSAMattrRGline "ID:${run_accession}" --outTmpDir ${starTmpDir} --outSAMtype BAM \ | ||
SortedByCoordinate | ||
""" | ||
|
||
} | ||
/* | ||
linuxbrew/bin/STAR --limitBAMsortRAM 2006305390 --outBAMsortingThreadN 30 | ||
--limitSjdbInsertNsj 2000000 --outFilterIntronMotifs RemoveNoncanonicalUnannotated | ||
--outSAMstrandField intronMotif --runThreadN 30 --twopassMode Basic | ||
--runMode alignReads --genomeDir /hps/nobackup/flicek/ensembl/genebuild/ | ||
ftricomi/fish/mummichog_annotation/fundulus_heteroclitus/GCA_011125445.2 | ||
/genome_dumps --readFilesIn /hps/nobackup/flicek/ensembl/genebuild/ftricomi | ||
/fish/mummichog_annotation/fundulus_heteroclitus/GCA_011125445.2//rnaseq/input/ | ||
SRR12475462_1.fastq.gz /hps/nobackup/flicek/ensembl/genebuild/ftricomi/fish/ | ||
mummichog_annotation/fundulus_heteroclitus/GCA_011125445.2//rnaseq/input/S | ||
RR12475462_2.fastq.gz --outFileNamePrefix /hps/nobackup/flicek/ensembl/genebuild/ | ||
ftricomi/fish/mummichog_annotation/fundulus_heteroclitus/GCA_011125445.2/rnaseq/ | ||
output/SRR12475462_ --outSAMattrRGline "ID:SRR12475462" | ||
--outTmpDir /hps/nobackup/flicek/ensembl/genebuild/ftricomi/fish/ | ||
mummichog_annotation/fundulus_heteroclitus/GCA_011125445.2/rnaseq/output | ||
/SRR12475462_tmp --outSAMtype BAM SortedByCoordinate --outBAMsortingBinsN 200 | ||
*/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
#!/usr/bin/env nextflow | ||
/* | ||
See the NOTICE file distributed with this work for additional information | ||
regarding copyright ownership. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
import java.nio.file.* | ||
|
||
process INDEX_GENOME { | ||
label 'star' | ||
tag "$taxon_id:$gca" | ||
publishDir "${params.outDir}/$taxon_id/$gca/ncbi_dataset/", mode: 'copy' | ||
afterScript "sleep $params.files_latency" // Needed because of file system latency | ||
maxForks 10 | ||
|
||
input: | ||
tuple val(taxon_id), val(gca), val(run_accession), val(url1), val(url2), val(md5_1), val(md5_2), val(genomeDir) | ||
|
||
output: | ||
tuple val(taxon_id), val(gca), val(run_accession), val(url1), val(url2), val(md5_1), val(md5_2), val(genomeDir) | ||
|
||
script: | ||
def genomeDirPath= new File(genomeDir) | ||
def genomeIndexFile = genomeDirPath.listFiles().find { it.name.endsWith('Genome') } | ||
log.info("${genomeIndexFile}") | ||
if (!genomeIndexFile || genomeIndexFile.length() == 0) { | ||
def genomefilePath=d.listFiles().find { it.name.endsWith('.fna') } | ||
def genomeFile=genomefilePath.absolutePath | ||
// Function to calculate the min value | ||
def min = { a, b -> a < b ? a : b } | ||
|
||
// Initialize variables | ||
def numberOfReferences = 0 | ||
def genomeLength = 0 | ||
// Read the FASTA file line by line | ||
genomefilePath.eachLine { line -> | ||
if (line.startsWith('>')) { | ||
numberOfReferences++ | ||
} else { | ||
genomeLength += line.trim().length() | ||
} | ||
} | ||
log.info("numberOfReferences: ${numberOfReferences}") | ||
log.info("genomeLength: ${genomeLength.abs()}") | ||
// Read the FASTA file | ||
//def fastaContent = genomefilePath.text | ||
|
||
// Calculate the number of references | ||
//def numberOfReferences = fastaContent.count('>') | ||
|
||
// Calculate the genome length (excluding header lines) | ||
//def genomeLength = fastaContent.split('\n').findAll { !it.startsWith('>') }.join('').length() | ||
|
||
// Define read length (you may need to adjust this based on your data) | ||
def readLength = 100 | ||
|
||
// Calculate genomeSAindexNbases | ||
def genomeSAindexNbases = min(14, (Math.log(genomeLength.abs() as Double) / Math.log(2) / 2 - 1) as int) | ||
|
||
// Calculate genomeChrBinNbits | ||
//def genomeChrBinNbits = min(18, (Math.log(Math.max(genomeLength as Double/ numberOfReferences as Double, readLength as Double)) / Math.log(2)) as int) | ||
def genomeChrBinNbits = min(18, (Math.log(Math.max((genomeLength.abs() / numberOfReferences) as Double, readLength as Double)) / Math.log(2)) as int) | ||
|
||
// Print the calculated values for debugging | ||
log.info("genomeSAindexNbases: ${genomeSAindexNbases}") | ||
log.info("genomeChrBinNbits: ${genomeChrBinNbits}") | ||
|
||
// Execute STAR command with calculated parameters #if [ ! -s "${genomeDir}/Genome" ]; then \ | ||
""" | ||
if [ ! -s "${genomeDir}/Genome" ]; \ | ||
then | ||
rm -rf ${genomeDir}/_STARtmp ; | ||
STAR --runThreadN ${task.cpus} --runMode genomeGenerate \ | ||
--outFileNamePrefix ${genomeDir} --genomeDir ${genomeDir} \ | ||
--genomeSAindexNbases ${genomeSAindexNbases} \ | ||
--genomeChrBinNbits ${genomeChrBinNbits} \ | ||
--genomeFastaFiles ${genomeFile} --outTmpDir _STARtmp;fi | ||
""" | ||
} else { | ||
""" | ||
echo "Genome index already exists, skipping STAR genomeGenerate step." | ||
""" | ||
} | ||
|
||
} | ||
|
||
|
Oops, something went wrong.