Skip to content

Commit

Permalink
added batch logic
Browse files Browse the repository at this point in the history
  • Loading branch information
ens-ftricomi committed Sep 3, 2024
1 parent cf982a7 commit a39c8cf
Show file tree
Hide file tree
Showing 6 changed files with 140 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,25 @@ process GET_RUN_ACCESSIONS {
afterScript "sleep $params.files_latency" // Needed because of file system latency

input:
tuple val(taxon_id), val(gca), val(lastCheckedDate)
tuple val(taxon_id), val(gca), val(run_accession_batch), val(lastCheckedDate)

output:
val(runAccessionList)
path("run_accession_list.txt")

script:
//def runAccessionBatch = run_accession_batch ? run_accession_batch : null
runAccessionList = []
runAccessionToFile='run_accession_list.txt'
def fileBatch=new File(run_accession_batch)
if (fileBatch.exists()){
fileBatch.eachLine { line ->
runAccessionList.add([taxon_id: taxon_id, gca: gca, run_accession: line])
}
"""
cat ${run_accession_batch.toString()} > $runAccessionToFile
"""
} else {
def taxonQuery = "tax_eq(${taxon_id})"
def instrumentQuery = "instrument_platform=ILLUMINA"
def layoutQuery = "library_layout=PAIRED"
Expand All @@ -61,7 +73,7 @@ process GET_RUN_ACCESSIONS {
BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream()))

// Create a list to hold the run accession data
runAccessionList = []
//runAccessionList = []
// Create a string to accumulate the response content
StringBuilder responseContent = new StringBuilder()

Expand All @@ -77,8 +89,11 @@ process GET_RUN_ACCESSIONS {
}
reader.close()

runAccessionToFile='run_accession_list.txt'
// runAccessionToFile='run_accession_list.txt'
//cp $run_accession_batch $runAccessionToFile
log.info("BATCH FILE ${run_accession_batch}")
"""
echo '${responseContent.toString()}' > $runAccessionToFile
"""
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@ process PROCESS_TAXON_ID {
tag "$taxon_id"

input:
tuple val(taxon_id), val(gca)
tuple val(taxon_id), val(gca), val(run_accession_batch)

output:
tuple(val(taxon_id), val(gca), stdout)
tuple(val(taxon_id), val(gca), val(run_accession_batch), stdout)

script:
def taxonomyExists = getDataFromTable('taxon_id', 'meta', 'taxon_id', taxon_id)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ process INDEX_GENOME {
tag "$taxon_id:$gca"
publishDir "${params.outDir}/$taxon_id/$gca/ncbi_dataset/", mode: 'copy'
afterScript "sleep $params.files_latency" // Needed because of file system latency
maxForks 10
maxForks 1

input:
tuple val(taxon_id), val(gca), val(run_accession), val(pair1), val(pair2), val(genomeDir)
Expand Down
2 changes: 1 addition & 1 deletion pipelines/nextflow/subworkflows/process_taxonomy_info.nf
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ workflow PROCESS_TAXONOMY_INFO {

main:
def data1=input_data
data1.flatten().view { d -> "Taxon ID: ${d.taxon_id}, GCA: ${d.gca}"}
data1.flatten().view { d -> "Taxon ID: ${d.taxon_id}, GCA: ${d.gca} batch: ${d.run_accession_batch}"}
def taxonomyInfo = PROCESS_TAXON_ID(input_data.flatten())
def (accessionList, runAccessionFile) = GET_RUN_ACCESSIONS(taxonomyInfo)
runAccessionList1= accessionList
Expand Down
2 changes: 1 addition & 1 deletion pipelines/nextflow/workflows/short_read.nf
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ include { RUN_ALIGNMENT } from '../subworkflows/run_alignment.nf'
workflow SHORT_READ {
def data = Channel.fromPath(params.csvFile, type: 'file', checkIfExists: true)
.splitCsv(sep:',', header:true)
.map { row -> [taxon_id:row.get('taxon_id'), gca:row.get('gca')]}
.map { row -> [taxon_id:row.get('taxon_id'), gca:row.get('gca'),run_accession_batch:row.get('runs_file')]}
data.each { dataRow -> dataRow.view() }
//taxon id present or not? if yes get all new short read data after this date if not add it for the first time
def taxonomyResults= PROCESS_TAXONOMY_INFO(data)
Expand Down
117 changes: 117 additions & 0 deletions src/python/ensembl/genes/metadata/check_for_transcriptomic_batch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# See the NOTICE file distributed with this work for additional information #pylint: disable=missing-module-docstring
# regarding copyright ownership.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Check the availability for short and long read data from ENA website given a taxon id"""
import os.path
from pathlib import Path
from typing import List
import argparse
import requests

def ena_rest_api(query: str, taxon_id: int, batching_option: bool, batch_size: int, output_dir_path: Path) -> int:
    """Query the ENA portal API for run accessions matching *query*.

    Args:
        query: Full ENA search expression (e.g. ``tax_eq(9606) AND ...``).
        taxon_id: Taxon id; used to name the per-taxon batch directory.
        batching_option: When true, split the accession list into batch files.
        batch_size: Number of accessions per batch file (must be > 0 to batch).
        output_dir_path: Base directory under which batch files are written.

    Returns:
        The number of run accessions found.
    """
    search_url = (
        "https://www.ebi.ac.uk/ena/portal/api/search?display=report"
        f"&query={query}&domain=read&result=read_run&fields=run_accession"
    )
    # Timeout prevents the pipeline hanging forever on a stalled connection.
    search_result = requests.get(search_url, timeout=300)
    # First line of the report is the column header; keep only the accessions.
    results = search_result.text.strip().split("\n")[1:]
    # If batching is enabled, split the results based on batch_size
    if batching_option and batch_size > 0:
        batches = [results[i:i + batch_size] for i in range(0, len(results), batch_size)]
        # str() guards against an int taxon_id: Path.__truediv__ only accepts
        # str segments and would raise TypeError otherwise.
        output_dir = output_dir_path / str(taxon_id) / "batch"
        # Create the directory structure if it doesn't exist
        output_dir.mkdir(parents=True, exist_ok=True)
        # Save each batch into a separate, 1-indexed file
        for idx, batch in enumerate(batches):
            batch_file = output_dir / f"batch_{idx + 1}.txt"
            batch_file.write_text("\n".join(batch), encoding="utf-8")
    return len(results)

def check_data_from_ena(  # pylint: disable=too-many-locals
    taxon_id: int,
    tree: bool,
    batching_option: bool,
    batch_size: int,
    output_dir: Path,
) -> None:
    """Query ENA API to get short or long read data.

    Counts transcriptomic runs available for *taxon_id* on four platforms
    (Illumina paired-end, Illumina single-end, PacBio SMRT, Oxford Nanopore)
    and prints a single semicolon-separated summary line.

    Args:
        taxon_id: NCBI taxon id to search for.
        tree: When true, include subordinate taxa (``tax_tree`` vs ``tax_eq``).
        batching_option: Forwarded to ``ena_rest_api``; writes batch files.
        batch_size: Number of accessions per batch file.
        output_dir: Base directory for batch files.
    """
    if tree:
        query = f"tax_tree({taxon_id})"
    else:
        query = f"tax_eq({taxon_id})"

    # Shared filter suffix: transcriptomic libraries first created from 2019
    # onwards. (Plain strings: the originals were f-strings without
    # placeholders, pylint W1309.)
    common = " AND library_source=TRANSCRIPTOMIC AND first_created>=2019-01-01"
    query_short_paired = query + " AND instrument_platform=ILLUMINA AND library_layout=PAIRED" + common
    query_short_single = query + " AND instrument_platform=ILLUMINA AND library_layout=SINGLE" + common
    query_pacbio = query + " AND instrument_platform=PACBIO_SMRT" + common
    query_onp = query + " AND instrument_platform=OXFORD_NANOPORE" + common

    short_paired_runs = ena_rest_api(query_short_paired, taxon_id, batching_option, batch_size, output_dir)
    short_single_runs = ena_rest_api(query_short_single, taxon_id, batching_option, batch_size, output_dir)
    pacbio_read_runs = ena_rest_api(query_pacbio, taxon_id, batching_option, batch_size, output_dir)
    onp_read_runs = ena_rest_api(query_onp, taxon_id, batching_option, batch_size, output_dir)

    print(f"{taxon_id};Short-read paired-end illumina;{short_paired_runs};Short-read single-end illumina;{short_single_runs};Long-read PacBio;{pacbio_read_runs};Long_read ONP;{onp_read_runs}")

class text:
    """ANSI escape sequences for emphasised terminal output."""

    # \x1b is ESC — the same character the original spelt as octal \033.
    BOLD = "\x1b[1m"
    UNDERLINE = "\x1b[4m"
    END = "\x1b[0m"


class InputSchema(argparse.ArgumentParser):
    """Command-line argument definitions for the transcriptomic data check."""

    def __init__(self):
        super().__init__()

        self.add_argument(
            "-t", "--taxon_id", type=str, required=False, help="Taxon id"
        )
        self.add_argument(
            "--tree", action='store_true', required=False,
            help="Turn on the 'Include subordinate taxa' option in your query to ENA"
        )
        self.add_argument(
            "--output_dir", required=False, help="Output directory path"
        )
        # BUG FIX: the previous type=bool converted ANY non-empty string —
        # including "False" — to True; a store_true flag gives the intended
        # on/off semantics (absent -> False, present -> True).
        self.add_argument(
            "--batching_option", action='store_true', required=False,
            help="Batch run accession"
        )
        self.add_argument(
            "--batch_size", type=int, required=False, help="Batch size"
        )
        self.add_argument(
            "-f", "--file", type=str, required=False,
            help="Path to the file containing a list of taxon ids"
        )

def main() -> None:
    """Entrypoint: check transcriptomic data for one taxon id or a file of ids."""
    parser = InputSchema()
    args = parser.parse_args()
    # Fall back to the current directory when --output_dir is omitted
    # (it is optional with no default), instead of crashing on Path(None).
    output_dir = Path(args.output_dir) if args.output_dir else Path.cwd()
    # Fail fast: batching without a positive batch size would otherwise
    # raise "'>' not supported between NoneType and int" deep in the query.
    if args.batching_option and not args.batch_size:
        parser.error("--batch_size must be a positive integer when --batching_option is set")
    if args.file:
        with open(args.file, "r", encoding="utf-8") as input_file:
            taxon_ids = input_file.read().splitlines()
        for taxon_id in taxon_ids:
            taxon_id = taxon_id.strip()
            if taxon_id:  # skip blank lines in the input file
                check_data_from_ena(taxon_id, args.tree, args.batching_option, args.batch_size, output_dir)
    else:
        check_data_from_ena(args.taxon_id, args.tree, args.batching_option, args.batch_size, output_dir)

if __name__ == "__main__":
    main()

0 comments on commit a39c8cf

Please sign in to comment.