From d32a65324257cbaea019a4a1342f6bcc8967954f Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Wed, 9 Oct 2024 12:24:30 +0100
Subject: [PATCH 01/20] Initial commit of db cleanup pipeline

---
 nextflow/workflows/main.nf | 279 +++++++++++++++++++++++++++++++++++++
 1 file changed, 279 insertions(+)
 create mode 100644 nextflow/workflows/main.nf

diff --git a/nextflow/workflows/main.nf b/nextflow/workflows/main.nf
new file mode 100644
index 000000000..04f02ac54
--- /dev/null
+++ b/nextflow/workflows/main.nf
@@ -0,0 +1,279 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+nextflow.enable.dsl=2
+
+// default params
+params.source_host = ""
+params.source_port = ""
+// params.db_list = params.db_list?.split(',') as List // https://github.com/nextflow-io/nextflow/discussions/2821
+params.db_list = []
+params.target_host = "mysql-ens-core-prod-1"
+params.target_port = "4524"
+params.target_path = ""
+params.drop_db = false
+params.email = ""
+params.user = "ensro"
+
+log.info """\
+
+    INFO ON PARAMETERS CURRENTLY SET:
+
+    General parameters
+    ==================
+    workDir                : ${workDir}
+    launchDir              : ${launchDir}
+    projectDir             : ${projectDir}
+    db list                : ${params.db_list}
+    email address for HPC  : ${params.email}
+    user                   : ${params.user}
+
+    """
+    .stripIndent(true)
+
+
+process DB_COPY_SUBMIT {
+
+    input:
+    val db_name
+
+    output:
+    tuple path('job_id.txt'), val(db_name), emit: job_info_ch
+
+    script:
+    api_url="https://services.ensembl-production.ebi.ac.uk/api/dbcopy/requestjob"
+    source_db="${params.source_host}:${params.source_port}"
+    target_db="${params.target_host}:${params.target_port}"
+
+    println "Submitting dbcopy job for $db_name"
+
+    """
+    # Submit the job via dbcopy-client
+    dbcopy-client -u $api_url -a submit -s $source_db -t $target_db -i $db_name -n ${db_name}_tmp -e $params.email -r $params.user --wipe_target 1 --skip-check &> out.log
+
+    # sleep to allow file to be created
+    sleep 10
+
+    # Extract the job id
+    job_id=\$(grep -o '[0-9a-f\\-]\\{36\\}' out.log)
+    echo \$job_id > job_id.txt
+    """
+}
+
+// process EXTRACT_JOB_ID {
+
+//     input:
+//     val db_name
+//     path job_id_file
+
+//     output:
+//     tuple val(job_id_value), val(db_name), emit: job_info_ch
+
+//     script:
+//     // """
+//     // # Read the job ID from the job_id_file
+//     // #job_id_value=\$(cat $job_id_file)
+//     // #echo "Job ID: \$job_id_value"
+//     // """
+//     // Define a Groovy variable to hold the job ID
+//     // Convert job_id_file (which is a path) to a Groovy File object and read its contents
+//     def job_id_value = job_id_file.newReader().text.trim() // New unique variable name
+//     println "Job ID: $job_id_value" // Print the job ID for debugging
+// }
+
+process MONITOR_DB_COPY {
+
+    input:
+    tuple val(job_id), val(db_name) // Get job ID and db name from the previous process
+
+    output:
+    tuple val(job_id), val(db_name), emit: monitored_job
+
+    script:
+    """
+    # Define the API endpoint to check the status of the job
+    api_url="https://services.ensembl-production.ebi.ac.uk/api/dbcopy/requestjob/${job_id}"
+
+    # Set the interval for checking the status (e.g., every so many seconds)
+    interval=60
+
+    # Polling loop
+    while true; do
+        # Fetch job status
+        status=\$(curl -s -X GET \$api_url | jq -r '.overall_status')
+
+        # Print the status to the Nextflow log
+        echo "Job ID: ${job_id} - Status: \$status"
+
+        # Check if the job is completed or failed
+        if [ "\$status" = "Complete" ]; then
+            echo "Job ID: ${job_id} has completed."
+            break
+        elif [ "\$status" = "Failed" ]; then
+            echo "Job ID: ${job_id} has failed."
+            exit 1
+        fi
+
+        # Wait for the next interval before checking the status again
+        sleep \$interval
+    done
+    """
+}
+
+
+process GENERATE_SQL {
+
+    publishDir "sql/${db_name}", mode: 'copy', overwrite: true // Publish SQL files to 'sql' directory
+
+    input:
+    tuple val(job_id), val(db_name) // Get job ID and db name from job_info_ch
+
+    output:
+    path "${db_name}.sql", emit: sql_output_file // Output pattern to capture SQL files
+
+    script:
+    println "Generating SQL for db: ${db_name}"
+    """
+    # add max packet size to account for dna db table size
+    mysqldump --max-allowed-packet=2048M --opt --quick --skip-column-statistics -h ${params.target_host} -P ${params.target_port} -u ensro ${db_name}_tmp > ${db_name}.sql
+    sleep 180
+    """
+    // # For each table found in the db, dump it out to file
+    // for table in \$(mysql -h ${params.target_host} -P ${params.target_port} -u ensro -N -e 'SHOW TABLES' ${db_name}_tmp); do
+    //     mysqldump --max-allowed-packet=1G --column-statistics=0 -h ${params.target_host} -P ${params.target_port} -u ensro --routines --triggers --add-drop-table ${db_name}_tmp \${table} > \${table}.sql
+    //     #mysqldump --max-allowed-packet=1G --column-statistics=0 -h ${params.target_host} -P ${params.target_port} -u ensro --no-create-info ${db_name}_tmp \${table} > \${table}.sql
+    // done
+    // """
+}
+
+process COMPRESS_FILES {
+
+    // get working and then check which compression method to use
+
+    publishDir "zip/", mode: 'copy', overwrite: true
+
+    input:
+    path sql_file
+
+    output:
+    path "${sql_file}.bz2", emit: compressed_sql_ch // Output compressed table-named file into a channel
+
+    script:
+    println "Compressing file: ${sql_file}"
+
+    """
+    # Ensure the file is copied to the current work dir, not linked
+    cp ${sql_file} ./temp_file.sql
+
+    # Compress the file
+    #bzip2 \$(realpath temp_file.sql)
+    bzip2 temp_file.sql
+
+    # Rename file
+    mv temp_file.sql.bz2 ${sql_file}.bz2
+    """
+}
+
+process TAR_COMPRESSED_SQL {
+
+    input:
+    path compressed_sql_list // The list of compressed SQL files
+    tuple val(job_id), val(db_name) // Get job ID and db name
+
+
+    output:
+    path "${db_name}.tar.bz2" // The final tar.bz2 archive
+
+    script:
+    // Print a message to inform the user about the archiving
+    println "Archiving SQL files for database: ${db_name}"
+    println "Compressed files: ${compressed_sql_list.join(', ')}"
+    println "Creating archive: ${db_name}_archive.tar.bz2"
+    """
+    # Create a tar archive with all the compressed SQL files
+    tar -cjf ${db_name}.tar.bz2 ${compressed_sql_list.join(' ')}
+    """
+}
+
+
+workflow {
+
+    main:
+
+    // Print the raw db_list to ensure it's being passed properly
+    println "Raw params.db_list: ${params.db_list}"
+
+    // Check if params.db_list is null or empty
+    if (params.db_list == null || params.db_list == '') {
+        println "ERROR: params.db_list is null or empty"
+        exit 1
+    }
+
+    // Split the string into a list and print it
+    db_list = params.db_list.split(',')
+    println "Split db_list: ${db_list}"
+
+    // Check if the split resulted in an empty list
+    if (db_list.size() == 0) {
+        println "ERROR: db_list is empty after split"
+        exit 1
+    }
+
+    // Create channel of dbs to copy from user list
+    // Set the channel to a variable for use in DB_COPY
+    Channel
+        .from(db_list)
+        .view()
+        .set { db_names_ch }
+
+    // Submit the db copy job(s)
+    result = DB_COPY_SUBMIT(db_names_ch)
+
+    // Extract the job id and map to db name
+    DB_COPY_SUBMIT.out.job_info_ch
+        .map { job_id_file, db_name ->
+            def job_id = job_id_file.text.trim() // Read and trim the contents of job_id.txt
+            tuple(job_id, db_name) // Return the tuple (job_id, db_name)
+        }
+        .set { job_info_mapped_ch }
+
+    // View the mapped channel contents
+    job_info_mapped_ch.view()
+
+    // Monitor the db copy job
+    MONITOR_DB_COPY(job_info_mapped_ch)
+
+    // Generate SQL files
+    GENERATE_SQL(MONITOR_DB_COPY.out.monitored_job)
+
+    // sql_file_ch = Channel.of("sql/${db_name}/${db_name}.sql")
+
+    GENERATE_SQL.out.sql_output_file.view()
+
+    // Compress the SQL files
+    compressed_sql_ch = COMPRESS_FILES(GENERATE_SQL.out.sql_output_file)
+    // compressed_sql_ch = COMPRESS_FILES(sql_file_ch)
+
+    // Collect the compressed SQL files into a list
+    // compressed_sql_list = compressed_sql_ch.collect()
+
+    // compressed_sql_list.view()
+
+    // archive the SQL files
+    // TAR_COMPRESSED_SQL(compressed_sql_list, job_info_mapped_ch)
+
+    // move archives to final storage path
+    // use the datamover queue for copying things over?
+}

From 6ee16cd1043035ec60895cd2ab34078ca32667a2 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Wed, 9 Oct 2024 12:34:21 +0100
Subject: [PATCH 02/20] Rename nf file

---
 nextflow/workflows/{main.nf => dbCleanup.nf} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename nextflow/workflows/{main.nf => dbCleanup.nf} (100%)

diff --git a/nextflow/workflows/main.nf b/nextflow/workflows/dbCleanup.nf
similarity index 100%
rename from nextflow/workflows/main.nf
rename to nextflow/workflows/dbCleanup.nf

From 5884bb970533db3311424c94463a5172f8b1cbc7 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Wed, 9 Oct 2024 12:54:46 +0100
Subject: [PATCH 03/20] Update print outs in log info

---
 nextflow/workflows/dbCleanup.nf | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/nextflow/workflows/dbCleanup.nf b/nextflow/workflows/dbCleanup.nf
index 04f02ac54..536c2fbef 100644
--- a/nextflow/workflows/dbCleanup.nf
+++ b/nextflow/workflows/dbCleanup.nf
@@ -18,7 +18,6 @@ nextflow.enable.dsl=2
 // default params
 params.source_host = ""
 params.source_port = ""
-// params.db_list = params.db_list?.split(',') as List // https://github.com/nextflow-io/nextflow/discussions/2821
 params.db_list = []
 params.target_host = "mysql-ens-core-prod-1"
 params.target_port = "4524"
@@ -36,9 +35,18 @@ log.info """\
     workDir                : ${workDir}
     launchDir              : ${launchDir}
     projectDir             : ${projectDir}
-    db list                : ${params.db_list}
     email address for HPC  : ${params.email}
     user                   : ${params.user}
+    target path for output : ${params.target_path}
+
+    Database parameters
+    ===================
+    db list                : ${params.db_list}
+    source db host         : ${params.source_host}
+    source db port         : ${params.source_port}
+    target db host         : ${params.target_host}
+    target db port         : ${params.target_port}
+    drop source db at end  : ${params.drop_db}
 
     """
     .stripIndent(true)
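Note on patches 01-03 (a sketch, not part of the series): DB_COPY_SUBMIT trusts a fixed "sleep 10" to cover the gap between submitting the copy job and the UUID appearing in out.log. A bounded retry is a less timing-sensitive version of the same step; it assumes only what the existing grep already assumes, namely that dbcopy-client writes the 36-character job UUID to its log. Shown as it would appear inside the process script block:

    # Poll out.log for the job id instead of sleeping a fixed 10 seconds
    # (up to 12 x 5s = 60s)
    for attempt in \$(seq 1 12); do
        job_id=\$(grep -o '[0-9a-f\\-]\\{36\\}' out.log || true)
        [ -n "\$job_id" ] && break
        sleep 5
    done
    [ -n "\$job_id" ] || { echo "ERROR: no job id found in out.log" >&2; exit 1; }
    echo \$job_id > job_id.txt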
From 7126e513ac4836f56a971845d13510d18d32356b Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Wed, 9 Oct 2024 13:12:53 +0100
Subject: [PATCH 04/20] Move DB_COPY_SUBMIT process to module file

---
 nextflow/modules/db_cleanup/db_copy_submit.nf | 44 +++++++++++++++++++
 nextflow/workflows/dbCleanup.nf               | 28 +-----------
 2 files changed, 45 insertions(+), 27 deletions(-)
 create mode 100644 nextflow/modules/db_cleanup/db_copy_submit.nf

diff --git a/nextflow/modules/db_cleanup/db_copy_submit.nf b/nextflow/modules/db_cleanup/db_copy_submit.nf
new file mode 100644
index 000000000..2b5cc464b
--- /dev/null
+++ b/nextflow/modules/db_cleanup/db_copy_submit.nf
@@ -0,0 +1,44 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+nextflow.enable.dsl=2
+
+process DB_COPY_SUBMIT {
+
+    input:
+    val db_name
+
+    output:
+    tuple path('job_id.txt'), val(db_name), emit: job_info_ch
+
+    script:
+    api_url="https://services.ensembl-production.ebi.ac.uk/api/dbcopy/requestjob"
+    source_db="${params.source_host}:${params.source_port}"
+    target_db="${params.target_host}:${params.target_port}"
+
+    println "Submitting dbcopy job for $db_name"
+
+    """
+    # Submit the job via dbcopy-client
+    dbcopy-client -u $api_url -a submit -s $source_db -t $target_db -i $db_name -n ${db_name}_tmp -e $params.email -r $params.user --wipe_target 1 --skip-check &> out.log
+
+    # sleep to allow file to be created
+    sleep 10
+
+    # Extract the job id
+    job_id=\$(grep -o '[0-9a-f\\-]\\{36\\}' out.log)
+    echo \$job_id > job_id.txt
+    """
+}
\ No newline at end of file
diff --git a/nextflow/workflows/dbCleanup.nf b/nextflow/workflows/dbCleanup.nf
index 536c2fbef..86d064478 100644
--- a/nextflow/workflows/dbCleanup.nf
+++ b/nextflow/workflows/dbCleanup.nf
@@ -51,34 +51,8 @@ log.info """\
     """
     .stripIndent(true)
 
+include { DB_COPY_SUBMIT } from '../modules/db_cleanup/db_copy_submit.nf'
 

From 4f791e981ad63df2f4372734f06f286a405427 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Wed, 9 Oct 2024 13:45:52 +0100
Subject: [PATCH 05/20] Move MONITOR_DB_COPY process to module file

---
 .../modules/db_cleanup/monitor_db_copy.nf | 56 +++++++++++++++++
 nextflow/workflows/dbCleanup.nf           | 63 +------------------
 2 files changed, 57 insertions(+), 62 deletions(-)
 create mode 100644 nextflow/modules/db_cleanup/monitor_db_copy.nf

diff --git a/nextflow/modules/db_cleanup/monitor_db_copy.nf b/nextflow/modules/db_cleanup/monitor_db_copy.nf
new file mode 100644
index 000000000..dd7b10e19
--- /dev/null
+++ b/nextflow/modules/db_cleanup/monitor_db_copy.nf
@@ -0,0 +1,56 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+nextflow.enable.dsl=2
+
+process MONITOR_DB_COPY {
+
+    // Get job ID and db name from the previous process
+    input:
+    tuple val(job_id), val(db_name)
+
+    output:
+    tuple val(job_id), val(db_name), emit: monitored_job
+
+    script:
+    """
+    # Define the API endpoint to check the status of the job
+    api_url="https://services.ensembl-production.ebi.ac.uk/api/dbcopy/requestjob/${job_id}"
+
+    # Set the interval for checking the status (e.g., every so many seconds)
+    interval=60
+
+    # Polling loop
+    while true; do
+        # Fetch job status
+        status=\$(curl -s -X GET \$api_url | jq -r '.overall_status')
+
+        # Print the status to the Nextflow log
+        echo "Job ID: ${job_id} - Status: \$status"
+
+        # Check if the job is completed or failed
+        if [ "\$status" = "Complete" ]; then
+            echo "Job ID: ${job_id} has completed."
+            break
+        elif [ "\$status" = "Failed" ]; then
+            echo "Job ID: ${job_id} has failed."
+            exit 1
+        fi
+
+        # Wait for the next interval before checking the status again
+        sleep \$interval
+    done
+    """
+}
\ No newline at end of file
diff --git a/nextflow/workflows/dbCleanup.nf b/nextflow/workflows/dbCleanup.nf
index 86d064478..5306d7912 100644
--- a/nextflow/workflows/dbCleanup.nf
+++ b/nextflow/workflows/dbCleanup.nf
@@ -52,68 +52,7 @@ log.info """\
     .stripIndent(true)
 
 include { DB_COPY_SUBMIT } from '../modules/db_cleanup/db_copy_submit.nf'
+include { MONITOR_DB_COPY } from '../modules/db_cleanup/monitor_db_copy.nf'
 
 process GENERATE_SQL {
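Note on patch 05 (a sketch, not part of the series): the polling loop in MONITOR_DB_COPY runs forever if the copy service keeps reporting something other than "Complete" or "Failed". A maximum-attempts guard is one possible hardening, written against the same endpoint and jq field the process already uses; the API's status vocabulary is carried over from the patch, not independently verified:

    # Bounded polling: give up after max_checks iterations
    max_checks=720   # e.g. 12 hours at 60-second intervals
    for check in \$(seq 1 \$max_checks); do
        status=\$(curl -s -X GET \$api_url | jq -r '.overall_status')
        echo "Job ID: ${job_id} - Status: \$status"
        [ "\$status" = "Complete" ] && exit 0
        [ "\$status" = "Failed" ] && exit 1
        sleep \$interval
    done
    echo "Job ID: ${job_id} did not finish within \$max_checks checks" >&2
    exit 1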
From 7c7e1f9d622430861b768cfb313cebd0d66b0469 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Wed, 9 Oct 2024 14:05:48 +0100
Subject: [PATCH 06/20] Move GENERATE_SQL process to module file

---
 nextflow/modules/db_cleanup/generate_sql.nf | 44 +++++++++++++++++++++
 nextflow/workflows/dbCleanup.nf             | 26 +-----------
 2 files changed, 45 insertions(+), 25 deletions(-)
 create mode 100644 nextflow/modules/db_cleanup/generate_sql.nf

diff --git a/nextflow/modules/db_cleanup/generate_sql.nf b/nextflow/modules/db_cleanup/generate_sql.nf
new file mode 100644
index 000000000..5df291640
--- /dev/null
+++ b/nextflow/modules/db_cleanup/generate_sql.nf
@@ -0,0 +1,44 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+nextflow.enable.dsl=2
+
+process GENERATE_SQL {
+
+    publishDir "sql/${db_name}", mode: 'copy', overwrite: true // Publish SQL files to 'sql' directory
+
+    input:
+    tuple val(job_id), val(db_name) // Get job ID and db name from job_info_ch
+
+    output:
+    path "${db_name}.sql", emit: sql_output_file // Output pattern to capture SQL files
+
+    script:
+    println "Generating SQL for db: ${db_name}"
+    """
+    # add max packet size to account for dna db table size
+    mysqldump --max-allowed-packet=2048M --opt --quick --skip-column-statistics -h ${params.target_host} -P ${params.target_port} -u ensro ${db_name}_tmp > ${db_name}.sql
+
+    # add sleep to let file system finish file dump
+    sleep 180
+    """
+    // Keep this table-based code in case we want to bring this back at later date
+    // # For each table found in the db, dump it out to file
+    // for table in \$(mysql -h ${params.target_host} -P ${params.target_port} -u ensro -N -e 'SHOW TABLES' ${db_name}_tmp); do
+    //     mysqldump --max-allowed-packet=1G --column-statistics=0 -h ${params.target_host} -P ${params.target_port} -u ensro --routines --triggers --add-drop-table ${db_name}_tmp \${table} > \${table}.sql
+    //     #mysqldump --max-allowed-packet=1G --column-statistics=0 -h ${params.target_host} -P ${params.target_port} -u ensro --no-create-info ${db_name}_tmp \${table} > \${table}.sql
+    // done
+    // """
+}
\ No newline at end of file
diff --git a/nextflow/workflows/dbCleanup.nf b/nextflow/workflows/dbCleanup.nf
index 5306d7912..d5ec11cbf 100644
--- a/nextflow/workflows/dbCleanup.nf
+++ b/nextflow/workflows/dbCleanup.nf
@@ -53,31 +53,7 @@ log.info """\
 include { DB_COPY_SUBMIT } from '../modules/db_cleanup/db_copy_submit.nf'
 include { MONITOR_DB_COPY } from '../modules/db_cleanup/monitor_db_copy.nf'
+include { GENERATE_SQL } from '../modules/db_cleanup/generate_sql.nf'
 
 process COMPRESS_FILES {
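Note on patch 06 (a sketch, not part of the series): the 180-second sleep stands in for "wait until the dump file is fully written". Since mysqldump appends a completion marker when it finishes cleanly, checking for that trailer is a more direct test. This assumes a mysqldump build that writes the standard "-- Dump completed" comment (it does not with --skip-comments or --compact); shown with Nextflow-style escaping as it would sit inside the script block:

    # Verify the dump ran to completion instead of sleeping a fixed time
    mysqldump --max-allowed-packet=2048M --opt --quick \\
        -h ${params.target_host} -P ${params.target_port} -u ensro \\
        ${db_name}_tmp > ${db_name}.sql
    tail -n 3 ${db_name}.sql | grep -q -- '-- Dump completed' || {
        echo "ERROR: ${db_name}.sql looks truncated" >&2
        exit 1
    }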
From 09bb5e07a15b84e13d4e7664509bcf09a6af8 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Wed, 9 Oct 2024 14:29:26 +0100
Subject: [PATCH 07/20] Move COMPRESS_FILE process to module file. Update path
 for resulting output.

---
 nextflow/modules/db_cleanup/compress_file.nf | 45 +++++++++++++++++
 nextflow/workflows/dbCleanup.nf              | 52 ++++----------------
 2 files changed, 54 insertions(+), 43 deletions(-)
 create mode 100644 nextflow/modules/db_cleanup/compress_file.nf

diff --git a/nextflow/modules/db_cleanup/compress_file.nf b/nextflow/modules/db_cleanup/compress_file.nf
new file mode 100644
index 000000000..d54684c60
--- /dev/null
+++ b/nextflow/modules/db_cleanup/compress_file.nf
@@ -0,0 +1,45 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+nextflow.enable.dsl=2
+
+process COMPRESS_FILE {
+
+    // get working and then check which compression method to use
+
+    // copies compressed file to user-provided target path
+    publishDir params.target_path, mode: 'copy', overwrite: true
+
+    input:
+    path sql_file
+
+    output:
+    path "${sql_file}.bz2", emit: compressed_sql_ch // Output compressed table-named file into a channel
+
+    script:
+    println "Compressing file: ${sql_file}"
+
+    """
+    # Ensure the file is copied to the current work dir, not linked
+    cp ${sql_file} ./temp_file.sql
+
+    # Compress the file
+    #bzip2 \$(realpath temp_file.sql)
+    bzip2 temp_file.sql
+
+    # Rename file
+    mv temp_file.sql.bz2 ${sql_file}.bz2
+    """
+}
\ No newline at end of file
diff --git a/nextflow/workflows/dbCleanup.nf b/nextflow/workflows/dbCleanup.nf
index d5ec11cbf..bcea1a375 100644
--- a/nextflow/workflows/dbCleanup.nf
+++ b/nextflow/workflows/dbCleanup.nf
@@ -54,35 +54,12 @@ log.info """\
 include { DB_COPY_SUBMIT } from '../modules/db_cleanup/db_copy_submit.nf'
 include { MONITOR_DB_COPY } from '../modules/db_cleanup/monitor_db_copy.nf'
 include { GENERATE_SQL } from '../modules/db_cleanup/generate_sql.nf'
+include { COMPRESS_FILE } from '../modules/db_cleanup/compress_file.nf'
 
+// Process not currently in use as changed to using single file
+// for whole db, so no longer archiving a group of table sql files.
+// Leaving here in case need it in future - needs testing before
+// moving to own module file.
 process TAR_COMPRESSED_SQL {
 
     input:
     path compressed_sql_list // The list of compressed SQL files
     tuple val(job_id), val(db_name) // Get job ID and db name
@@ -155,22 +132,11 @@ workflow {
     // Generate SQL files
     GENERATE_SQL(MONITOR_DB_COPY.out.monitored_job)
 
-    // sql_file_ch = Channel.of("sql/${db_name}/${db_name}.sql")
-
+    // View the generated files
     GENERATE_SQL.out.sql_output_file.view()
 
-    // Compress the SQL files
-    compressed_sql_ch = COMPRESS_FILES(GENERATE_SQL.out.sql_output_file)
-    // compressed_sql_ch = COMPRESS_FILES(sql_file_ch)
-
-    // Collect the compressed SQL files into a list
-    // compressed_sql_list = compressed_sql_ch.collect()
-
-    // compressed_sql_list.view()
-
-    // archive the SQL files
-    // TAR_COMPRESSED_SQL(compressed_sql_list, job_info_mapped_ch)
+    // Compress the SQL file
+    // also outputs compressed file to final storage path
+    compressed_sql_ch = COMPRESS_FILE(GENERATE_SQL.out.sql_output_file)
 
-    // move archives to final storage path
-    // use the datamover queue for copying things over?
 }
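Note on patch 07 (a sketch, not part of the series): the copy-compress-rename dance in COMPRESS_FILE works around bzip2 refusing to compress a staged symlink in place. Streaming through stdout avoids the temporary file while leaving the input untouched, with the same output name:

    # bzip2 -c reads the (possibly symlinked) input and writes a new file
    bzip2 -c ${sql_file} > ${sql_file}.bz2

If wall-clock time matters for large core databases, a parallel compressor such as pbzip2 (its availability on the execution hosts is an assumption) could be dropped in behind the same output name.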
From 707231ded9260c4cac9eba6eeab295d0e434f996 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Fri, 11 Oct 2024 11:44:14 +0100
Subject: [PATCH 08/20] No longer publish intermediate sql files

---
 nextflow/modules/db_cleanup/generate_sql.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow/modules/db_cleanup/generate_sql.nf b/nextflow/modules/db_cleanup/generate_sql.nf
index 5df291640..106f3134f 100644
--- a/nextflow/modules/db_cleanup/generate_sql.nf
+++ b/nextflow/modules/db_cleanup/generate_sql.nf
@@ -17,7 +17,7 @@ nextflow.enable.dsl=2
 
 process GENERATE_SQL {
 
-    publishDir "sql/${db_name}", mode: 'copy', overwrite: true // Publish SQL files to 'sql' directory
+    // publishDir "sql/${db_name}", mode: 'copy', overwrite: true // Publish SQL files to 'sql' directory
 
     input:
     tuple val(job_id), val(db_name) // Get job ID and db name from job_info_ch

From f680105a95d3dc0a56020c89c8a45408794feaaf Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Fri, 11 Oct 2024 11:45:11 +0100
Subject: [PATCH 09/20] Rename main nextflow script. Follow Ensembl nextflow
 dir structure protocol.

---
 nextflow/workflows/{dbCleanup.nf => db_cleanup/main.nf} | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
 rename nextflow/workflows/{dbCleanup.nf => db_cleanup/main.nf} (93%)

diff --git a/nextflow/workflows/dbCleanup.nf b/nextflow/workflows/db_cleanup/main.nf
similarity index 93%
rename from nextflow/workflows/dbCleanup.nf
rename to nextflow/workflows/db_cleanup/main.nf
index bcea1a375..f912504b9 100644
--- a/nextflow/workflows/dbCleanup.nf
+++ b/nextflow/workflows/db_cleanup/main.nf
@@ -51,10 +51,10 @@ log.info """\
     """
     .stripIndent(true)
 
-include { DB_COPY_SUBMIT } from '../modules/db_cleanup/db_copy_submit.nf'
-include { MONITOR_DB_COPY } from '../modules/db_cleanup/monitor_db_copy.nf'
-include { GENERATE_SQL } from '../modules/db_cleanup/generate_sql.nf'
-include { COMPRESS_FILE } from '../modules/db_cleanup/compress_file.nf'
+include { DB_COPY_SUBMIT } from '../../modules/db_cleanup/db_copy_submit.nf'
+include { MONITOR_DB_COPY } from '../../modules/db_cleanup/monitor_db_copy.nf'
+include { GENERATE_SQL } from '../../modules/db_cleanup/generate_sql.nf'
+include { COMPRESS_FILE } from '../../modules/db_cleanup/compress_file.nf'
 
 // Process not currently in use as changed to using single file
 // for whole db, so no longer archiving a group of table sql files.
From 5db2b86d64947c42494ba8a566099726b5284ca6 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Fri, 11 Oct 2024 16:44:35 +0100
Subject: [PATCH 10/20] Use nf-schema for parameter validation and help info

---
 nextflow/workflows/db_cleanup/main.nf         | 39 ++++++++-----
 nextflow/workflows/db_cleanup/nextflow.config | 27 +++++++++
 .../workflows/db_cleanup/nextflow_schema.json | 56 +++++++++++++++++++
 3 files changed, 109 insertions(+), 13 deletions(-)
 create mode 100644 nextflow/workflows/db_cleanup/nextflow.config
 create mode 100644 nextflow/workflows/db_cleanup/nextflow_schema.json

diff --git a/nextflow/workflows/db_cleanup/main.nf b/nextflow/workflows/db_cleanup/main.nf
index f912504b9..c39e7e80f 100644
--- a/nextflow/workflows/db_cleanup/main.nf
+++ b/nextflow/workflows/db_cleanup/main.nf
@@ -15,16 +15,32 @@
 
 nextflow.enable.dsl=2
 
+// modules to include
+include { DB_COPY_SUBMIT } from '../../modules/db_cleanup/db_copy_submit.nf'
+include { MONITOR_DB_COPY } from '../../modules/db_cleanup/monitor_db_copy.nf'
+include { GENERATE_SQL } from '../../modules/db_cleanup/generate_sql.nf'
+include { COMPRESS_FILE } from '../../modules/db_cleanup/compress_file.nf'
+
+// nf-schema-related modules
+include { validateParameters; paramsSummaryLog; samplesheetToList } from 'plugin/nf-schema'
+
+// Validate input parameters
+validateParameters()
+
+// Print summary of supplied parameters
+// via nf-schema plugin
+log.info paramsSummaryLog(workflow)
+
 // default params
-params.source_host = ""
-params.source_port = ""
-params.db_list = []
-params.target_host = "mysql-ens-core-prod-1"
-params.target_port = "4524"
-params.target_path = ""
-params.drop_db = false
-params.email = ""
-params.user = "ensro"
+// params.source_host = ""
+// params.source_port = ""
+// params.db_list = []
+// params.target_host = "mysql-ens-core-prod-1"
+// params.target_port = "4524"
+// params.target_path = ""
+// params.drop_db = false
+// params.email = ""
+// params.user = ""
 
 log.info """\
 
@@ -51,10 +67,7 @@ log.info """\
     """
     .stripIndent(true)
 
-include { DB_COPY_SUBMIT } from '../../modules/db_cleanup/db_copy_submit.nf'
-include { MONITOR_DB_COPY } from '../../modules/db_cleanup/monitor_db_copy.nf'
-include { GENERATE_SQL } from '../../modules/db_cleanup/generate_sql.nf'
-include { COMPRESS_FILE } from '../../modules/db_cleanup/compress_file.nf'
+
 
 // Process not currently in use as changed to using single file
 // for whole db, so no longer archiving a group of table sql files.
diff --git a/nextflow/workflows/db_cleanup/nextflow.config b/nextflow/workflows/db_cleanup/nextflow.config
new file mode 100644
index 000000000..9e722eca2
--- /dev/null
+++ b/nextflow/workflows/db_cleanup/nextflow.config
@@ -0,0 +1,27 @@
+// required plugins
+// https://nextflow-io.github.io/nf-schema/latest/#quick-start
+plugins {
+    id 'nf-schema@2.1.1'
+}
+
+// define parameters
+params.source_host = "mysql-ens-sta-6"
+params.source_port = "4695"
+params.target_host = "mysql-ens-core-prod-1"
+params.target_port = "4524"
+params.db_list = "capra_hircus_core_110_1"
+params.user = "nwillhoft"
+params.email = "nwillhoft@ebi.ac.uk"
+params.target_path = "${launchDir}/zip"
+params.drop_db = false
+
+// https://nextflow-io.github.io/nf-schema/latest/configuration/configuration/#help
+validation.help.enabled = true // default: false
+
+// enable the creation of the help message
+// https://nextflow-io.github.io/nf-schema/latest/#quick-start
+validation {
+    help {
+        enabled = true
+    }
+}
\ No newline at end of file
diff --git a/nextflow/workflows/db_cleanup/nextflow_schema.json b/nextflow/workflows/db_cleanup/nextflow_schema.json
new file mode 100644
index 000000000..00248143f
--- /dev/null
+++ b/nextflow/workflows/db_cleanup/nextflow_schema.json
@@ -0,0 +1,56 @@
+{
+    "$schema": "https://json-schema.org/draft/2020-12/schema",
+    "$id": "https://json-schema.org/draft/2020-12/schema",
+    "title": "Database Cleanup Pipeline",
+    "description": "Pipeline to clean up databases by generating SQL dumps for long-term storage",
+    "type": "object",
+    "properties": {
+        "source_host": {
+            "type": "string",
+            "description": "The host for connecting to the source database",
+            "required": true
+        },
+        "source_port": {
+            "type": "string",
+            "description": "The port for connecting to the source database",
+            "required": true
+        },
+        "db_list": {
+            "type": "string",
+            "description": "Comma-separated list of database names to process",
+            "required": true
+        },
+        "target_host": {
+            "type": "string",
+            "description": "The host for connecting to the target database",
+            "required": true,
+            "default": "mysql-ens-core-prod-1"
+        },
+        "target_port": {
+            "type": "string",
+            "description": "The port for connecting to the target database",
+            "required": true,
+            "default": "4524"
+        },
+        "target_path": {
+            "type": "string",
+            "description": "Path to store final compressed SQL file",
+            "required": true
+        },
+        "drop_db": {
+            "type": "string",
+            "description": "Boolean to drop the source database after put into storage",
+            "default": false
+        },
+        "email": {
+            "type": "string",
+            "description": "Email address of user. Required for dbcopy-client service to run",
+            "required": true
+        },
+        "user": {
+            "type": "string",
+            "description": "User name to use for dbcopy-client service",
+            "required": true
+        }
+    }
+}
\ No newline at end of file
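Note on patch 10 (not part of the series): per-property "required": true in nextflow_schema.json is draft-03 style JSON Schema; under the 2020-12 dialect declared in $schema, required is a top-level array of property names, so nf-schema may not enforce these flags as written. For reference, a launch against this schema with an explicit params file might look like the following hypothetical sketch (all names match the schema, all values illustrative):

    # params.json (hypothetical)
    {
        "source_host": "mysql-ens-sta-6",
        "source_port": "4695",
        "db_list": "capra_hircus_core_110_1",
        "target_path": "./zip",
        "email": "user@ebi.ac.uk",
        "user": "user"
    }

    nextflow run nextflow/workflows/db_cleanup/main.nf -params-file params.json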
From 30c01ffb8f174d00725fbd2642a7834338c9c7c3 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Mon, 14 Oct 2024 21:55:02 +0100
Subject: [PATCH 11/20] Add process to clear up temp database

---
 nextflow/workflows/db_cleanup/main.nf         | 25 +++++++++++++++++++
 .../workflows/db_cleanup/nextflow_schema.json |  5 ++++
 2 files changed, 30 insertions(+)

diff --git a/nextflow/workflows/db_cleanup/main.nf b/nextflow/workflows/db_cleanup/main.nf
index c39e7e80f..78ac40d10 100644
--- a/nextflow/workflows/db_cleanup/main.nf
+++ b/nextflow/workflows/db_cleanup/main.nf
@@ -94,6 +94,29 @@ process TAR_COMPRESSED_SQL {
     """
 }
 
+process CLEANUP_TMP_DB {
+
+    input:
+    path compressed_file
+    tuple val(job_id), val(db_name)
+
+    script:
+    """
+    tmp_db_name="${db_name}_tmp"
+    echo "Checking if target database \${tmp_db_name} exists for removal..."
+
+    db_exists=\$(mysql -h $params.target_host -P $params.target_port -u ensadmin -p $params.db_pwd -e "SHOW DATABASES LIKE '\${tmp_db_name}';")
+
+    if [ ! -z "\${db_exists}" ]; then
+        echo "Database \${tmp_db_name} exists. Dropping the database."
+        mysqladmin -h $params.target_host -P $params.target_port -u $params.user -f drop \${tmp_db_name}
+    else
+        echo "Database \${tmp_db_name} does not exist, skipping drop."
+    fi
+    """
+}
+
+
 workflow {
 
@@ -152,4 +175,6 @@ workflow {
     // also outputs compressed file to final storage path
     compressed_sql_ch = COMPRESS_FILE(GENERATE_SQL.out.sql_output_file)
 
+    // Cleanup the temp db created by this pipeline
+    CLEANUP_TMP_DB(compressed_sql_ch, MONITOR_DB_COPY.out.monitored_job)
 }
diff --git a/nextflow/workflows/db_cleanup/nextflow_schema.json b/nextflow/workflows/db_cleanup/nextflow_schema.json
index 00248143f..a7dc9c265 100644
--- a/nextflow/workflows/db_cleanup/nextflow_schema.json
+++ b/nextflow/workflows/db_cleanup/nextflow_schema.json
@@ -42,6 +42,11 @@
             "description": "Boolean to drop the source database after put into storage",
             "default": false
         },
+        "db_pwd": {
+            "type": "string",
+            "description": "Password for db admin user",
+            "required": true
+        },
         "email": {
             "type": "string",
             "description": "Email address of user. Required for dbcopy-client service to run",
             "required": true

From 398dd4cecfb3d57e3b2c3d016116ef780750415e Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Tue, 15 Oct 2024 11:49:49 +0100
Subject: [PATCH 12/20] Remove column stats option as no longer needed

---
 nextflow/modules/db_cleanup/generate_sql.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow/modules/db_cleanup/generate_sql.nf b/nextflow/modules/db_cleanup/generate_sql.nf
index 106f3134f..14eff6321 100644
--- a/nextflow/modules/db_cleanup/generate_sql.nf
+++ b/nextflow/modules/db_cleanup/generate_sql.nf
@@ -29,7 +29,7 @@ process GENERATE_SQL {
     println "Generating SQL for db: ${db_name}"
     """
     # add max packet size to account for dna db table size
-    mysqldump --max-allowed-packet=2048M --opt --quick --skip-column-statistics -h ${params.target_host} -P ${params.target_port} -u ensro ${db_name}_tmp > ${db_name}.sql
+    mysqldump --max-allowed-packet=2048M --opt --quick -h ${params.target_host} -P ${params.target_port} -u ensro ${db_name}_tmp > ${db_name}.sql
 
     # add sleep to let file system finish file dump
     sleep 180

From 1e369bb1e9535037bb77201ae7539a1ec66d3e84 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Tue, 15 Oct 2024 13:49:46 +0100
Subject: [PATCH 13/20] Require db admin user and password for dropping a db

---
 nextflow/workflows/db_cleanup/nextflow_schema.json | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/nextflow/workflows/db_cleanup/nextflow_schema.json b/nextflow/workflows/db_cleanup/nextflow_schema.json
index a7dc9c265..2d8128c4b 100644
--- a/nextflow/workflows/db_cleanup/nextflow_schema.json
+++ b/nextflow/workflows/db_cleanup/nextflow_schema.json
@@ -42,9 +42,14 @@
             "description": "Boolean to drop the source database after put into storage",
             "default": false
         },
-        "db_pwd": {
+        "dba_user": {
             "type": "string",
-            "description": "Password for db admin user",
+            "description": "Username for db admin user. Used to drop a database",
+            "required": true
+        },
+        "dba_pwd": {
+            "type": "string",
+            "description": "Password for db admin user. Used to drop a database",
             "required": true
         },
         "email": {

From 24b334d93c511c9890ab08e51185310994ee8263 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Tue, 15 Oct 2024 13:50:25 +0100
Subject: [PATCH 14/20] Fix drop db process with extra space and simplify drop
 statement

---
 nextflow/workflows/db_cleanup/main.nf | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/nextflow/workflows/db_cleanup/main.nf b/nextflow/workflows/db_cleanup/main.nf
index 78ac40d10..8e0b869c6 100644
--- a/nextflow/workflows/db_cleanup/main.nf
+++ b/nextflow/workflows/db_cleanup/main.nf
@@ -103,16 +103,11 @@ process CLEANUP_TMP_DB {
     script:
     """
     tmp_db_name="${db_name}_tmp"
-    echo "Checking if target database \${tmp_db_name} exists for removal..."
+    echo "Attempting to drop database \${tmp_db_name} if it exists..."
 
-    db_exists=\$(mysql -h $params.target_host -P $params.target_port -u ensadmin -p $params.db_pwd -e "SHOW DATABASES LIKE '\${tmp_db_name}';")
+    mysql -h $params.target_host -P $params.target_port -u $params.dba_user -p$params.dba_pwd -e "DROP DATABASE IF EXISTS \${tmp_db_name};"
 
-    if [ ! -z "\${db_exists}" ]; then
-        echo "Database \${tmp_db_name} exists. Dropping the database."
-        mysqladmin -h $params.target_host -P $params.target_port -u $params.user -f drop \${tmp_db_name}
-    else
-        echo "Database \${tmp_db_name} does not exist, skipping drop."
-    fi
+    echo "Drop operation complete."
     """
 }
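Note on patches 11-14 (a sketch, not part of the series): with -p$params.dba_pwd the admin password is interpolated into the rendered .command.sh and shows up in process listings on the execution host. Handing it over via the MYSQL_PWD environment variable at least keeps it off the mysql argument list; shown as it would sit inside the process script block (the secret still lands in .command.sh, so Nextflow's secrets support, where the installed version provides it, is the fuller fix):

    # Keep the password out of the mysql argument list / ps output
    export MYSQL_PWD='$params.dba_pwd'
    mysql -h $params.target_host -P $params.target_port -u $params.dba_user \\
        -e "DROP DATABASE IF EXISTS \${tmp_db_name};"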
""" } From 4904f6bf098795b7394e121f3dd2eed7b0b4ff10 Mon Sep 17 00:00:00 2001 From: nwillhoft Date: Tue, 15 Oct 2024 14:32:35 +0100 Subject: [PATCH 15/20] Move CLEANUP_TMP_DB process to module file --- nextflow/modules/db_cleanup/cleanup_tmp_db.nf | 33 +++++++++++++++++++ nextflow/workflows/db_cleanup/main.nf | 19 +---------- 2 files changed, 34 insertions(+), 18 deletions(-) create mode 100644 nextflow/modules/db_cleanup/cleanup_tmp_db.nf diff --git a/nextflow/modules/db_cleanup/cleanup_tmp_db.nf b/nextflow/modules/db_cleanup/cleanup_tmp_db.nf new file mode 100644 index 000000000..4c27266e8 --- /dev/null +++ b/nextflow/modules/db_cleanup/cleanup_tmp_db.nf @@ -0,0 +1,33 @@ +// See the NOTICE file distributed with this work for additional information +// regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +nextflow.enable.dsl=2 + +process CLEANUP_TMP_DB { + + input: + path compressed_file + tuple val(job_id), val(db_name) + + script: + """ + tmp_db_name="${db_name}_tmp" + echo "Attempting to drop database \${tmp_db_name} if it exists..." + + mysql -h $params.target_host -P $params.target_port -u $params.dba_user -p$params.dba_pwd -e "DROP DATABASE IF EXISTS \${tmp_db_name};" + + echo "Drop operation complete." + """ +} \ No newline at end of file diff --git a/nextflow/workflows/db_cleanup/main.nf b/nextflow/workflows/db_cleanup/main.nf index 8e0b869c6..4c92201e5 100644 --- a/nextflow/workflows/db_cleanup/main.nf +++ b/nextflow/workflows/db_cleanup/main.nf @@ -20,6 +20,7 @@ include { DB_COPY_SUBMIT } from '../../modules/db_cleanup/db_copy_submit.nf' include { MONITOR_DB_COPY } from '../../modules/db_cleanup/monitor_db_copy.nf' include { GENERATE_SQL } from '../../modules/db_cleanup/generate_sql.nf' include { COMPRESS_FILE } from '../../modules/db_cleanup/compress_file.nf' +include { CLEANUP_TMP_DB } from '../../modules/db_cleanup/cleanup_tmp_db.nf' // nf-schema-related modules include { validateParameters; paramsSummaryLog; samplesheetToList } from 'plugin/nf-schema' @@ -94,24 +95,6 @@ process TAR_COMPRESSED_SQL { """ } -process CLEANUP_TMP_DB { - - input: - path compressed_file - tuple val(job_id), val(db_name) - - script: - """ - tmp_db_name="${db_name}_tmp" - echo "Attempting to drop database \${tmp_db_name} if it exists..." - - mysql -h $params.target_host -P $params.target_port -u $params.dba_user -p$params.dba_pwd -e "DROP DATABASE IF EXISTS \${tmp_db_name};" - - echo "Drop operation complete." 
- """ -} - - workflow { From e6fe3afc78aca716ce390e0ce129ac1955a7ad7e Mon Sep 17 00:00:00 2001 From: nwillhoft Date: Wed, 16 Oct 2024 15:06:51 +0100 Subject: [PATCH 16/20] Add process to drop source db when requested --- nextflow/workflows/db_cleanup/main.nf | 22 +++++++++++++++++++ .../workflows/db_cleanup/nextflow_schema.json | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/nextflow/workflows/db_cleanup/main.nf b/nextflow/workflows/db_cleanup/main.nf index 4c92201e5..30ba3e00d 100644 --- a/nextflow/workflows/db_cleanup/main.nf +++ b/nextflow/workflows/db_cleanup/main.nf @@ -95,6 +95,22 @@ process TAR_COMPRESSED_SQL { """ } +process DROP_SOURCE_DB { + + input: + tuple val(job_id), val(db_name) + + script: + """ + echo "Attempting to drop database ${db_name} if it exists..." + + mysql -h $params.target_host -P $params.target_port -u $params.dba_user -p$params.dba_pwd -e "DROP DATABASE IF EXISTS ${db_name};" + + echo "Drop operation complete." + """ + +} + workflow { @@ -155,4 +171,10 @@ workflow { // Cleanup the temp db created by this pipeline CLEANUP_TMP_DB(compressed_sql_ch, MONITOR_DB_COPY.out.monitored_job) + + // Cleanup source db (if flag set to true) + if (params.drop_source_db == true) { + DROP_SOURCE_DB(CLEANUP_TMP_DB.out.cleaned_up) + } + } diff --git a/nextflow/workflows/db_cleanup/nextflow_schema.json b/nextflow/workflows/db_cleanup/nextflow_schema.json index 2d8128c4b..2c8b7fcc4 100644 --- a/nextflow/workflows/db_cleanup/nextflow_schema.json +++ b/nextflow/workflows/db_cleanup/nextflow_schema.json @@ -37,7 +37,7 @@ "description": "Path to store final compressed SQL file", "required": true }, - "drop_db": { + "drop_source_db": { "type": "string", "description": "Boolean to drop the source database after put into storage", "default": false From b316a8113488bde9aa3f6e6d8ee5e50851e41564 Mon Sep 17 00:00:00 2001 From: nwillhoft Date: Wed, 16 Oct 2024 15:07:59 +0100 Subject: [PATCH 17/20] Remove redundant check --- nextflow/workflows/db_cleanup/main.nf | 6 ------ 1 file changed, 6 deletions(-) diff --git a/nextflow/workflows/db_cleanup/main.nf b/nextflow/workflows/db_cleanup/main.nf index 30ba3e00d..a151e0bee 100644 --- a/nextflow/workflows/db_cleanup/main.nf +++ b/nextflow/workflows/db_cleanup/main.nf @@ -119,12 +119,6 @@ workflow { // Print the raw db_list to ensure it's being passed properly println "Raw params.db_list: ${params.db_list}" - // Check if params.db_list is null or empty - if (params.db_list == null || params.db_list == '') { - println "ERROR: params.db_list is null or empty" - exit 1 - } - // Split the string into a list and print it db_list = params.db_list.split(',') println "Split db_list: ${db_list}" From 16f3969359e0c9ece3053df7c4b17b3172f3d7f9 Mon Sep 17 00:00:00 2001 From: nwillhoft Date: Wed, 16 Oct 2024 15:08:28 +0100 Subject: [PATCH 18/20] Add prints for info --- nextflow/modules/db_cleanup/cleanup_tmp_db.nf | 5 +++++ nextflow/modules/db_cleanup/monitor_db_copy.nf | 2 ++ 2 files changed, 7 insertions(+) diff --git a/nextflow/modules/db_cleanup/cleanup_tmp_db.nf b/nextflow/modules/db_cleanup/cleanup_tmp_db.nf index 4c27266e8..ce961b850 100644 --- a/nextflow/modules/db_cleanup/cleanup_tmp_db.nf +++ b/nextflow/modules/db_cleanup/cleanup_tmp_db.nf @@ -21,7 +21,12 @@ process CLEANUP_TMP_DB { path compressed_file tuple val(job_id), val(db_name) + output: + tuple val(job_id), val(db_name), emit: cleaned_up + script: + println "Cleaning up temporary db: ${db_name}_tmp" + """ tmp_db_name="${db_name}_tmp" echo "Attempting to drop database 
From 16f3969359e0c9ece3053df7c4b17b3172f3d7f9 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Wed, 16 Oct 2024 15:08:28 +0100
Subject: [PATCH 18/20] Add prints for info

---
 nextflow/modules/db_cleanup/cleanup_tmp_db.nf  | 5 +++++
 nextflow/modules/db_cleanup/monitor_db_copy.nf | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/nextflow/modules/db_cleanup/cleanup_tmp_db.nf b/nextflow/modules/db_cleanup/cleanup_tmp_db.nf
index 4c27266e8..ce961b850 100644
--- a/nextflow/modules/db_cleanup/cleanup_tmp_db.nf
+++ b/nextflow/modules/db_cleanup/cleanup_tmp_db.nf
@@ -21,7 +21,12 @@ process CLEANUP_TMP_DB {
     path compressed_file
     tuple val(job_id), val(db_name)
 
+    output:
+    tuple val(job_id), val(db_name), emit: cleaned_up
+
     script:
+    println "Cleaning up temporary db: ${db_name}_tmp"
+
     """
     tmp_db_name="${db_name}_tmp"
     echo "Attempting to drop database \${tmp_db_name} if it exists..."
diff --git a/nextflow/modules/db_cleanup/monitor_db_copy.nf b/nextflow/modules/db_cleanup/monitor_db_copy.nf
index dd7b10e19..336bf6eef 100644
--- a/nextflow/modules/db_cleanup/monitor_db_copy.nf
+++ b/nextflow/modules/db_cleanup/monitor_db_copy.nf
@@ -25,6 +25,8 @@ process MONITOR_DB_COPY {
     tuple val(job_id), val(db_name), emit: monitored_job
 
     script:
+    println "Monitoring job id: ${job_id}"
+
     """
     # Define the API endpoint to check the status of the job
     api_url="https://services.ensembl-production.ebi.ac.uk/api/dbcopy/requestjob/${job_id}"

From 323fcf9525fb3a5cb405c0e79eb4077139d79f17 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Fri, 18 Oct 2024 12:18:53 +0100
Subject: [PATCH 19/20] Correct string type to boolean

---
 nextflow/workflows/db_cleanup/nextflow_schema.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow/workflows/db_cleanup/nextflow_schema.json b/nextflow/workflows/db_cleanup/nextflow_schema.json
index 2c8b7fcc4..a3f4d816e 100644
--- a/nextflow/workflows/db_cleanup/nextflow_schema.json
+++ b/nextflow/workflows/db_cleanup/nextflow_schema.json
@@ -38,7 +38,7 @@
             "required": true
         },
         "drop_source_db": {
-            "type": "string",
+            "type": "boolean",
             "description": "Boolean to drop the source database after put into storage",
             "default": false
         },
+ """ + +} \ No newline at end of file diff --git a/nextflow/workflows/db_cleanup/main.nf b/nextflow/workflows/db_cleanup/main.nf index a151e0bee..030606d36 100644 --- a/nextflow/workflows/db_cleanup/main.nf +++ b/nextflow/workflows/db_cleanup/main.nf @@ -21,6 +21,7 @@ include { MONITOR_DB_COPY } from '../../modules/db_cleanup/monitor_db_copy.nf' include { GENERATE_SQL } from '../../modules/db_cleanup/generate_sql.nf' include { COMPRESS_FILE } from '../../modules/db_cleanup/compress_file.nf' include { CLEANUP_TMP_DB } from '../../modules/db_cleanup/cleanup_tmp_db.nf' +include { DROP_SOURCE_DB } from '../../modules/db_cleanup/drop_source_db.nf' // nf-schema-related modules include { validateParameters; paramsSummaryLog; samplesheetToList } from 'plugin/nf-schema' @@ -95,23 +96,6 @@ process TAR_COMPRESSED_SQL { """ } -process DROP_SOURCE_DB { - - input: - tuple val(job_id), val(db_name) - - script: - """ - echo "Attempting to drop database ${db_name} if it exists..." - - mysql -h $params.target_host -P $params.target_port -u $params.dba_user -p$params.dba_pwd -e "DROP DATABASE IF EXISTS ${db_name};" - - echo "Drop operation complete." - """ - -} - - workflow { main: