first commit

Ensembl · Aug 14, 2024 · 1e7450c · 1e7450c
1 parent 7d23cae
commit 1e7450c
Show file tree

Hide file tree

Showing 7 changed files with 600 additions and 0 deletions.
diff --git a/pipelines/nextflow/modules/star_alignment/download_paired_fastqs.nf b/pipelines/nextflow/modules/star_alignment/download_paired_fastqs.nf
@@ -0,0 +1,96 @@
+#!/usr/bin/env nextflow
+/*
+See the NOTICE file distributed with this work for additional information
+regarding copyright ownership.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+import groovy.json.JsonSlurper
+import org.apache.commons.codec.digest.DigestUtils
+import java.nio.file.Files
+import java.nio.file.Path
+
+
+process DOWNLOAD_PAIRED_FASTQS {
+    label "default"
+    tag "${taxon_id}:${run_accession}"
+    maxForks 25
+    //storeDir "${params.outDir}/$taxon_id/$run_accession"
+    afterScript "sleep $params.files_latency"  // Needed because of file system latency
+
+    input:
+    tuple val(taxon_id), val(gca), val(run_accession), val(url1), val(url2), val(md5_1), val(md5_2), val(genomeDir)
+
+    output:
+    tuple val(taxon_id), val(gca), val(run_accession), val("${params.outDir}/$taxon_id/$run_accession/${run_accession}_1.fastq.gz"), val("${params.outDir}/$taxon_id/$run_accession/${run_accession}_2.fastq.gz" ), val(genomeDir)
+
+
+    script: 
+    if(!url1 || !url2 || !md5_1 || !md5_2){
+            log.error("Metadata corrupted")
+    }else{
+        def pair1Path = "${params.outDir}/$taxon_id/$run_accession/${run_accession}_1.fastq.gz"
+        def pair2Path = "${params.outDir}/$taxon_id/$run_accession/${run_accession}_2.fastq.gz"
+        def retryCount = 0
+        def maxRetries = 3
+        def md5Match = false
+        def filesValid = false
+        def dirPath = file("${params.outDir}/${taxon_id}/${run_accession}")
+        // List and filter .gz files
+        def gzFiles = dirPath.listFiles().findAll { it.name.endsWith('.gz') }
+
+        log.info("gzFiles ${gzFiles}")
+        if (gzFiles.size() !=2 ){
+            filesValid = true
+
+            while (!md5Match && retryCount < maxRetries) {
+                """
+                wget -qq -c -O ${pair1Path} ftp://${url1}
+                """.execute().waitFor()
+
+                """
+                wget -qq -c -O ${pair2Path} ftp://${url2}
+                """.execute().waitFor()
+
+                // Calculate MD5 checksums of downloaded files
+                md5Pair1 = DigestUtils.md5Hex(Files.newInputStream(file(pair1Path)))
+                md5Pair2 = DigestUtils.md5Hex(Files.newInputStream(file(pair2Path)))
+
+                // Check if both MD5 checksums are present in stored MD5 checksums
+                if ([md5_1, md5_2].containsAll([md5Pair1, md5Pair2])) {
+                    md5Match = true
+                    log.info("MD5 checksums match!")
+                } else {
+                    log.info("MD5 checksums do not match! Retrying...")
+                    """
+                    rm ${pair1Path}
+                    """.execute().waitFor()
+                    """
+                    rm ${pair2Path}
+                    """.execute().waitFor()
+                    retryCount++
+                    Thread.sleep(1000) // Wait for 1 second before retrying
+                }
+            }
+
+        if (!md5Match) {
+            log.info("MD5 checksums do not match after $maxRetries retries!")
+        } 
+        }
+    }
+
+    """
+    echo "Download completed"
+    """
+}
diff --git a/pipelines/nextflow/modules/star_alignment/extract_uniquely_mapped_reads_percentage.nf b/pipelines/nextflow/modules/star_alignment/extract_uniquely_mapped_reads_percentage.nf
@@ -0,0 +1,44 @@
+#!/usr/bin/env nextflow
+/*
+See the NOTICE file distributed with this work for additional information
+regarding copyright ownership.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+include { getDataFromTable } from '../utils.nf'
+
+process EXTRACT_UNIQUELY_MAPPED_READS_PERCENTAGE {
+    label 'python'
+    tag "$run_accession"
+    publishDir "${params.outDir}/$taxon_id/$run_accession/star/", mode: 'copy'
+    afterScript "sleep $params.files_latency"  // Needed because of file system latency
+
+    input:
+    tuple val(taxon_id), val(gca), val(run_accession), val(starOutputFile)
+
+
+    output:
+    tuple val(taxon_id), val(gca), val(run_accession), path("insert_into_align.json")
+
+    script:
+    def run_id = getDataFromTable("run_id", "run", "run_accession", run_accession)[0].run_id
+    def star_dir = new File(starOutputFile).parent
+    """
+    chmod +x $projectDir/bin/parse_star_output.py
+    parse_star_output.py --file_path ${starOutputFile} \
+    --extra_parameters "{'run_id': '${run_id}',  'assembly_accession': '${gca}'}" \
+    --uniquely_mapped_reads_percentage --percentage_reads_mapped_to_multiple_loci --percentage_reads_unmapped_too_short
+    cp insert_into_align.json ${params.outDir}/${taxon_id}/${run_accession}/star/
+    """
+}
diff --git a/pipelines/nextflow/modules/star_alignment/fetch_genome.nf b/pipelines/nextflow/modules/star_alignment/fetch_genome.nf
@@ -0,0 +1,45 @@
+#!/usr/bin/env nextflow
+/*
+See the NOTICE file distributed with this work for additional information
+regarding copyright ownership.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+process FETCH_GENOME {
+  tag "$taxon_id:$gca"
+  label 'fetch_file'
+
+  //storeDir "${params.outDir}/$taxon_id/$gca/ncbi_dataset/"
+  afterScript "sleep $params.files_latency"  // Needed because of file system latency
+  maxForks 10
+
+  input:
+  tuple val(taxon_id), val(gca), val(run_accession), val(url1), val(url2), val(md5_1), val(md5_2) 
+
+  output:
+  tuple val(taxon_id), val(gca), val(run_accession), val(url1), val(url2), val(md5_1), val(md5_2), val("${params.outDir}/$taxon_id/$gca/ncbi_dataset/")
+
+  script:
+  """
+  if [ ! -d "${params.outDir}/${taxon_id}/${gca}/ncbi_dataset/" ]; then
+    echo "Directory ncbi_dataset does not exist. Proceeding with download..."
+    curl --retry 3  -X GET "${params.ncbiBaseUrl}/${gca}/download?include_annotation_type=GENOME_FASTA&hydrated=FULLY_HYDRATED" -H "Accept: application/zip" --output genome_file.zip
+    unzip -j genome_file.zip
+    mkdir -p ${params.outDir}/${taxon_id}/${gca}/ncbi_dataset && cp *.fna ${params.outDir}/${taxon_id}/${gca}/ncbi_dataset
+  else
+    echo "Directory ncbi_dataset already exists. Skipping download."
+  fi
+  """
+
+}
diff --git a/pipelines/nextflow/modules/star_alignment/index_bam.nf b/pipelines/nextflow/modules/star_alignment/index_bam.nf
@@ -0,0 +1,62 @@
+#!/usr/bin/env nextflow
+/*
+See the NOTICE file distributed with this work for additional information
+regarding copyright ownership.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+process RUN_STAR {
+    tag "$run_accession"
+    label 'samtools'
+    publishDir "${params.outDir}/$taxon_id/$run_accession/star/", mode: 'copy'
+    afterScript "sleep $params.files_latency"  // Needed because of file system latency
+
+    input:
+    tuple val(taxon_id), val(gca), val(run_accession), val(bamFile)
+
+    output:
+    tuple val(taxon_id), val(gca), val(run_accession), val(bamFile)
+
+    script:
+
+    """
+    
+    rm -rf ${starTmpDir}
+    STAR --limitSjdbInsertNsj 2000000 \
+    --outFilterIntronMotifs RemoveNoncanonicalUnannotated \
+    --outSAMstrandField intronMotif --runThreadN ${task.cpus} \
+    --twopassMode Basic --runMode alignReads --genomeDir ${file(genomeDir)} \
+    --readFilesIn ${pair1} ${pair2} --outFileNamePrefix ${outFileNamePrefix} \
+    --outSAMattrRGline "ID:${run_accession}" --outTmpDir ${starTmpDir} --outSAMtype BAM \
+    SortedByCoordinate  
+    """
+
+}
+/*
+linuxbrew/bin/STAR  --limitBAMsortRAM 2006305390 --outBAMsortingThreadN 30
+--limitSjdbInsertNsj 2000000 --outFilterIntronMotifs RemoveNoncanonicalUnannotated
+--outSAMstrandField intronMotif --runThreadN 30 --twopassMode Basic 
+--runMode alignReads --genomeDir /hps/nobackup/flicek/ensembl/genebuild/
+ftricomi/fish/mummichog_annotation/fundulus_heteroclitus/GCA_011125445.2
+/genome_dumps --readFilesIn /hps/nobackup/flicek/ensembl/genebuild/ftricomi
+/fish/mummichog_annotation/fundulus_heteroclitus/GCA_011125445.2//rnaseq/input/
+SRR12475462_1.fastq.gz /hps/nobackup/flicek/ensembl/genebuild/ftricomi/fish/
+mummichog_annotation/fundulus_heteroclitus/GCA_011125445.2//rnaseq/input/S
+RR12475462_2.fastq.gz --outFileNamePrefix /hps/nobackup/flicek/ensembl/genebuild/
+ftricomi/fish/mummichog_annotation/fundulus_heteroclitus/GCA_011125445.2/rnaseq/
+output/SRR12475462_   --outSAMattrRGline "ID:SRR12475462" 
+--outTmpDir /hps/nobackup/flicek/ensembl/genebuild/ftricomi/fish/
+mummichog_annotation/fundulus_heteroclitus/GCA_011125445.2/rnaseq/output
+/SRR12475462_tmp --outSAMtype BAM SortedByCoordinate --outBAMsortingBinsN 200
+*/
diff --git a/pipelines/nextflow/modules/star_alignment/index_genome.nf b/pipelines/nextflow/modules/star_alignment/index_genome.nf
@@ -0,0 +1,98 @@
+#!/usr/bin/env nextflow
+/*
+See the NOTICE file distributed with this work for additional information
+regarding copyright ownership.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+import java.nio.file.*
+
+process INDEX_GENOME {
+    label 'star'
+    tag "$taxon_id:$gca"
+    publishDir "${params.outDir}/$taxon_id/$gca/ncbi_dataset/", mode: 'copy'
+    afterScript "sleep $params.files_latency"  // Needed because of file system latency
+    maxForks 10
+
+    input:
+    tuple val(taxon_id), val(gca), val(run_accession), val(url1), val(url2), val(md5_1), val(md5_2), val(genomeDir)
+
+    output:
+    tuple val(taxon_id), val(gca), val(run_accession), val(url1), val(url2), val(md5_1), val(md5_2), val(genomeDir)
+
+    script:
+    def genomeDirPath= new File(genomeDir)
+    def genomeIndexFile = genomeDirPath.listFiles().find { it.name.endsWith('Genome') }
+    log.info("${genomeIndexFile}")
+    if (!genomeIndexFile || genomeIndexFile.length() == 0) {
+    def genomefilePath=d.listFiles().find { it.name.endsWith('.fna') }
+    def genomeFile=genomefilePath.absolutePath
+    // Function to calculate the min value
+    def min = { a, b -> a < b ? a : b }
+
+    // Initialize variables
+    def numberOfReferences = 0
+    def genomeLength = 0
+    // Read the FASTA file line by line
+    genomefilePath.eachLine { line ->
+        if (line.startsWith('>')) {
+            numberOfReferences++
+        } else {
+            genomeLength += line.trim().length()
+        }
+    }
+    log.info("numberOfReferences: ${numberOfReferences}")
+    log.info("genomeLength: ${genomeLength.abs()}")
+    // Read the FASTA file
+    //def fastaContent = genomefilePath.text
+
+    // Calculate the number of references
+    //def numberOfReferences = fastaContent.count('>')
+
+    // Calculate the genome length (excluding header lines)
+    //def genomeLength = fastaContent.split('\n').findAll { !it.startsWith('>') }.join('').length()
+
+    // Define read length (you may need to adjust this based on your data)
+    def readLength = 100
+
+    // Calculate genomeSAindexNbases
+    def genomeSAindexNbases = min(14, (Math.log(genomeLength.abs() as Double) / Math.log(2) / 2 - 1) as int)
+
+    // Calculate genomeChrBinNbits
+    //def genomeChrBinNbits = min(18, (Math.log(Math.max(genomeLength as Double/ numberOfReferences as Double, readLength as Double)) / Math.log(2)) as int)
+    def genomeChrBinNbits = min(18, (Math.log(Math.max((genomeLength.abs() / numberOfReferences) as Double, readLength as Double)) / Math.log(2)) as int)
+
+    // Print the calculated values for debugging
+    log.info("genomeSAindexNbases: ${genomeSAindexNbases}")
+    log.info("genomeChrBinNbits: ${genomeChrBinNbits}")
+
+    // Execute STAR command with calculated parameters #if [ ! -s "${genomeDir}/Genome" ]; then \
+    """
+    if [ ! -s "${genomeDir}/Genome" ]; \
+    then
+    rm -rf ${genomeDir}/_STARtmp ; 
+    STAR  --runThreadN ${task.cpus} --runMode genomeGenerate \
+    --outFileNamePrefix ${genomeDir} --genomeDir ${genomeDir} \
+    --genomeSAindexNbases ${genomeSAindexNbases} \
+    --genomeChrBinNbits ${genomeChrBinNbits} \
+    --genomeFastaFiles  ${genomeFile} --outTmpDir _STARtmp;fi
+    """
+    } else {
+    """
+    echo "Genome index already exists, skipping STAR genomeGenerate step."
+    """
+    }
+
+    }
+
+