Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
ens-ftricomi committed Aug 14, 2024
1 parent 7d23cae commit 1e7450c
Show file tree
Hide file tree
Showing 7 changed files with 600 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/usr/bin/env nextflow
/*
See the NOTICE file distributed with this work for additional information
regarding copyright ownership.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

import groovy.json.JsonSlurper
import org.apache.commons.codec.digest.DigestUtils
import java.nio.file.Files
import java.nio.file.Path


process DOWNLOAD_PAIRED_FASTQS {
label "default"
tag "${taxon_id}:${run_accession}"
maxForks 25
//storeDir "${params.outDir}/$taxon_id/$run_accession"
afterScript "sleep $params.files_latency" // Needed because of file system latency

input:
tuple val(taxon_id), val(gca), val(run_accession), val(url1), val(url2), val(md5_1), val(md5_2), val(genomeDir)

output:
tuple val(taxon_id), val(gca), val(run_accession), val("${params.outDir}/$taxon_id/$run_accession/${run_accession}_1.fastq.gz"), val("${params.outDir}/$taxon_id/$run_accession/${run_accession}_2.fastq.gz" ), val(genomeDir)


script:
if(!url1 || !url2 || !md5_1 || !md5_2){
log.error("Metadata corrupted")
}else{
def pair1Path = "${params.outDir}/$taxon_id/$run_accession/${run_accession}_1.fastq.gz"
def pair2Path = "${params.outDir}/$taxon_id/$run_accession/${run_accession}_2.fastq.gz"
def retryCount = 0
def maxRetries = 3
def md5Match = false
def filesValid = false
def dirPath = file("${params.outDir}/${taxon_id}/${run_accession}")
// List and filter .gz files
def gzFiles = dirPath.listFiles().findAll { it.name.endsWith('.gz') }

log.info("gzFiles ${gzFiles}")
if (gzFiles.size() !=2 ){
filesValid = true

while (!md5Match && retryCount < maxRetries) {
"""
wget -qq -c -O ${pair1Path} ftp://${url1}
""".execute().waitFor()

"""
wget -qq -c -O ${pair2Path} ftp://${url2}
""".execute().waitFor()

// Calculate MD5 checksums of downloaded files
md5Pair1 = DigestUtils.md5Hex(Files.newInputStream(file(pair1Path)))
md5Pair2 = DigestUtils.md5Hex(Files.newInputStream(file(pair2Path)))

// Check if both MD5 checksums are present in stored MD5 checksums
if ([md5_1, md5_2].containsAll([md5Pair1, md5Pair2])) {
md5Match = true
log.info("MD5 checksums match!")
} else {
log.info("MD5 checksums do not match! Retrying...")
"""
rm ${pair1Path}
""".execute().waitFor()
"""
rm ${pair2Path}
""".execute().waitFor()
retryCount++
Thread.sleep(1000) // Wait for 1 second before retrying
}
}

if (!md5Match) {
log.info("MD5 checksums do not match after $maxRetries retries!")
}
}
}

"""
echo "Download completed"
"""
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env nextflow
/*
See the NOTICE file distributed with this work for additional information
regarding copyright ownership.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

include { getDataFromTable } from '../utils.nf'

process EXTRACT_UNIQUELY_MAPPED_READS_PERCENTAGE {
label 'python'
tag "$run_accession"
publishDir "${params.outDir}/$taxon_id/$run_accession/star/", mode: 'copy'
afterScript "sleep $params.files_latency" // Needed because of file system latency

input:
tuple val(taxon_id), val(gca), val(run_accession), val(starOutputFile)


output:
tuple val(taxon_id), val(gca), val(run_accession), path("insert_into_align.json")

script:
def run_id = getDataFromTable("run_id", "run", "run_accession", run_accession)[0].run_id
def star_dir = new File(starOutputFile).parent
"""
chmod +x $projectDir/bin/parse_star_output.py
parse_star_output.py --file_path ${starOutputFile} \
--extra_parameters "{'run_id': '${run_id}', 'assembly_accession': '${gca}'}" \
--uniquely_mapped_reads_percentage --percentage_reads_mapped_to_multiple_loci --percentage_reads_unmapped_too_short
cp insert_into_align.json ${params.outDir}/${taxon_id}/${run_accession}/star/
"""
}
45 changes: 45 additions & 0 deletions pipelines/nextflow/modules/star_alignment/fetch_genome.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env nextflow
/*
See the NOTICE file distributed with this work for additional information
regarding copyright ownership.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

process FETCH_GENOME {
tag "$taxon_id:$gca"
label 'fetch_file'

//storeDir "${params.outDir}/$taxon_id/$gca/ncbi_dataset/"
afterScript "sleep $params.files_latency" // Needed because of file system latency
maxForks 10

input:
tuple val(taxon_id), val(gca), val(run_accession), val(url1), val(url2), val(md5_1), val(md5_2)

output:
tuple val(taxon_id), val(gca), val(run_accession), val(url1), val(url2), val(md5_1), val(md5_2), val("${params.outDir}/$taxon_id/$gca/ncbi_dataset/")

script:
"""
if [ ! -d "${params.outDir}/${taxon_id}/${gca}/ncbi_dataset/" ]; then
echo "Directory ncbi_dataset does not exist. Proceeding with download..."
curl --retry 3 -X GET "${params.ncbiBaseUrl}/${gca}/download?include_annotation_type=GENOME_FASTA&hydrated=FULLY_HYDRATED" -H "Accept: application/zip" --output genome_file.zip
unzip -j genome_file.zip
mkdir -p ${params.outDir}/${taxon_id}/${gca}/ncbi_dataset && cp *.fna ${params.outDir}/${taxon_id}/${gca}/ncbi_dataset
else
echo "Directory ncbi_dataset already exists. Skipping download."
fi
"""

}
62 changes: 62 additions & 0 deletions pipelines/nextflow/modules/star_alignment/index_bam.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env nextflow
/*
See the NOTICE file distributed with this work for additional information
regarding copyright ownership.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

process RUN_STAR {
tag "$run_accession"
label 'samtools'
publishDir "${params.outDir}/$taxon_id/$run_accession/star/", mode: 'copy'
afterScript "sleep $params.files_latency" // Needed because of file system latency

input:
tuple val(taxon_id), val(gca), val(run_accession), val(bamFile)

output:
tuple val(taxon_id), val(gca), val(run_accession), val(bamFile)

script:

"""
rm -rf ${starTmpDir}
STAR --limitSjdbInsertNsj 2000000 \
--outFilterIntronMotifs RemoveNoncanonicalUnannotated \
--outSAMstrandField intronMotif --runThreadN ${task.cpus} \
--twopassMode Basic --runMode alignReads --genomeDir ${file(genomeDir)} \
--readFilesIn ${pair1} ${pair2} --outFileNamePrefix ${outFileNamePrefix} \
--outSAMattrRGline "ID:${run_accession}" --outTmpDir ${starTmpDir} --outSAMtype BAM \
SortedByCoordinate
"""

}
/*
linuxbrew/bin/STAR --limitBAMsortRAM 2006305390 --outBAMsortingThreadN 30
--limitSjdbInsertNsj 2000000 --outFilterIntronMotifs RemoveNoncanonicalUnannotated
--outSAMstrandField intronMotif --runThreadN 30 --twopassMode Basic
--runMode alignReads --genomeDir /hps/nobackup/flicek/ensembl/genebuild/
ftricomi/fish/mummichog_annotation/fundulus_heteroclitus/GCA_011125445.2
/genome_dumps --readFilesIn /hps/nobackup/flicek/ensembl/genebuild/ftricomi
/fish/mummichog_annotation/fundulus_heteroclitus/GCA_011125445.2//rnaseq/input/
SRR12475462_1.fastq.gz /hps/nobackup/flicek/ensembl/genebuild/ftricomi/fish/
mummichog_annotation/fundulus_heteroclitus/GCA_011125445.2//rnaseq/input/S
RR12475462_2.fastq.gz --outFileNamePrefix /hps/nobackup/flicek/ensembl/genebuild/
ftricomi/fish/mummichog_annotation/fundulus_heteroclitus/GCA_011125445.2/rnaseq/
output/SRR12475462_ --outSAMattrRGline "ID:SRR12475462"
--outTmpDir /hps/nobackup/flicek/ensembl/genebuild/ftricomi/fish/
mummichog_annotation/fundulus_heteroclitus/GCA_011125445.2/rnaseq/output
/SRR12475462_tmp --outSAMtype BAM SortedByCoordinate --outBAMsortingBinsN 200
*/
98 changes: 98 additions & 0 deletions pipelines/nextflow/modules/star_alignment/index_genome.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/usr/bin/env nextflow
/*
See the NOTICE file distributed with this work for additional information
regarding copyright ownership.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
import java.nio.file.*

process INDEX_GENOME {
label 'star'
tag "$taxon_id:$gca"
publishDir "${params.outDir}/$taxon_id/$gca/ncbi_dataset/", mode: 'copy'
afterScript "sleep $params.files_latency" // Needed because of file system latency
maxForks 10

input:
tuple val(taxon_id), val(gca), val(run_accession), val(url1), val(url2), val(md5_1), val(md5_2), val(genomeDir)

output:
tuple val(taxon_id), val(gca), val(run_accession), val(url1), val(url2), val(md5_1), val(md5_2), val(genomeDir)

script:
def genomeDirPath= new File(genomeDir)
def genomeIndexFile = genomeDirPath.listFiles().find { it.name.endsWith('Genome') }
log.info("${genomeIndexFile}")
if (!genomeIndexFile || genomeIndexFile.length() == 0) {
def genomefilePath=d.listFiles().find { it.name.endsWith('.fna') }
def genomeFile=genomefilePath.absolutePath
// Function to calculate the min value
def min = { a, b -> a < b ? a : b }

// Initialize variables
def numberOfReferences = 0
def genomeLength = 0
// Read the FASTA file line by line
genomefilePath.eachLine { line ->
if (line.startsWith('>')) {
numberOfReferences++
} else {
genomeLength += line.trim().length()
}
}
log.info("numberOfReferences: ${numberOfReferences}")
log.info("genomeLength: ${genomeLength.abs()}")
// Read the FASTA file
//def fastaContent = genomefilePath.text

// Calculate the number of references
//def numberOfReferences = fastaContent.count('>')

// Calculate the genome length (excluding header lines)
//def genomeLength = fastaContent.split('\n').findAll { !it.startsWith('>') }.join('').length()

// Define read length (you may need to adjust this based on your data)
def readLength = 100

// Calculate genomeSAindexNbases
def genomeSAindexNbases = min(14, (Math.log(genomeLength.abs() as Double) / Math.log(2) / 2 - 1) as int)

// Calculate genomeChrBinNbits
//def genomeChrBinNbits = min(18, (Math.log(Math.max(genomeLength as Double/ numberOfReferences as Double, readLength as Double)) / Math.log(2)) as int)
def genomeChrBinNbits = min(18, (Math.log(Math.max((genomeLength.abs() / numberOfReferences) as Double, readLength as Double)) / Math.log(2)) as int)

// Print the calculated values for debugging
log.info("genomeSAindexNbases: ${genomeSAindexNbases}")
log.info("genomeChrBinNbits: ${genomeChrBinNbits}")

// Execute STAR command with calculated parameters #if [ ! -s "${genomeDir}/Genome" ]; then \
"""
if [ ! -s "${genomeDir}/Genome" ]; \
then
rm -rf ${genomeDir}/_STARtmp ;
STAR --runThreadN ${task.cpus} --runMode genomeGenerate \
--outFileNamePrefix ${genomeDir} --genomeDir ${genomeDir} \
--genomeSAindexNbases ${genomeSAindexNbases} \
--genomeChrBinNbits ${genomeChrBinNbits} \
--genomeFastaFiles ${genomeFile} --outTmpDir _STARtmp;fi
"""
} else {
"""
echo "Genome index already exists, skipping STAR genomeGenerate step."
"""
}

}


Loading

0 comments on commit 1e7450c

Please sign in to comment.