diff --git a/nextflow/modules/db_cleanup/cleanup_tmp_db.nf b/nextflow/modules/db_cleanup/cleanup_tmp_db.nf
new file mode 100644
index 000000000..ce961b850
--- /dev/null
+++ b/nextflow/modules/db_cleanup/cleanup_tmp_db.nf
@@ -0,0 +1,38 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+nextflow.enable.dsl=2
+
+process CLEANUP_TMP_DB {
+
+    input:
+    path compressed_file
+    tuple val(job_id), val(db_name)
+
+    output:
+    tuple val(job_id), val(db_name), emit: cleaned_up
+
+    script:
+    println "Cleaning up temporary db: ${db_name}_tmp"
+
+    """
+    tmp_db_name="${db_name}_tmp"
+    echo "Attempting to drop database \${tmp_db_name} if it exists..."
+
+    mysql -h $params.target_host -P $params.target_port -u $params.dba_user -p$params.dba_pwd -e "DROP DATABASE IF EXISTS \${tmp_db_name};"
+
+    echo "Drop operation complete."
+    """
+}
\ No newline at end of file
diff --git a/nextflow/modules/db_cleanup/compress_file.nf b/nextflow/modules/db_cleanup/compress_file.nf
new file mode 100644
index 000000000..d54684c60
--- /dev/null
+++ b/nextflow/modules/db_cleanup/compress_file.nf
@@ -0,0 +1,45 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+nextflow.enable.dsl=2
+
+process COMPRESS_FILE {
+
+    // get working and then check which compression method to use
+
+    // copies compressed file to user-provided target path
+    publishDir params.target_path, mode: 'copy', overwrite: true
+
+    input:
+    path sql_file
+
+    output:
+    path "${sql_file}.bz2", emit: compressed_sql_ch // Output compressed table-named file into a channel
+
+    script:
+    println "Compressing file: ${sql_file}"
+
+    """
+    # Ensure the file is copied to the current work dir, not linked
+    cp ${sql_file} ./temp_file.sql
+
+    # Compress the file
+    #bzip2 \$(realpath temp_file.sql)
+    bzip2 temp_file.sql
+
+    # Rename file
+    mv temp_file.sql.bz2 ${sql_file}.bz2
+    """
+}
\ No newline at end of file
diff --git a/nextflow/modules/db_cleanup/db_copy_submit.nf b/nextflow/modules/db_cleanup/db_copy_submit.nf
new file mode 100644
index 000000000..2b5cc464b
--- /dev/null
+++ b/nextflow/modules/db_cleanup/db_copy_submit.nf
@@ -0,0 +1,44 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+nextflow.enable.dsl=2
+
+process DB_COPY_SUBMIT {
+
+    input:
+    val db_name
+
+    output:
+    tuple path('job_id.txt'), val(db_name), emit: job_info_ch
+
+    script:
+    api_url="https://services.ensembl-production.ebi.ac.uk/api/dbcopy/requestjob"
+    source_db="${params.source_host}:${params.source_port}"
+    target_db="${params.target_host}:${params.target_port}"
+
+    println "Submitting dbcopy job for $db_name"
+
+    """
+    # Submit the job via dbcopy-client
+    dbcopy-client -u $api_url -a submit -s $source_db -t $target_db -i $db_name -n ${db_name}_tmp -e $params.email -r $params.user --wipe_target 1 --skip-check &> out.log
+
+    # sleep to allow file to be created
+    sleep 10
+
+    # Extract the job id
+    job_id=\$(grep -o '[0-9a-f\\-]\\{36\\}' out.log)
+    echo \$job_id > job_id.txt
+    """
+}
\ No newline at end of file
diff --git a/nextflow/modules/db_cleanup/drop_source_db.nf b/nextflow/modules/db_cleanup/drop_source_db.nf
new file mode 100644
index 000000000..fb3e99e49
--- /dev/null
+++ b/nextflow/modules/db_cleanup/drop_source_db.nf
@@ -0,0 +1,34 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+nextflow.enable.dsl=2
+
+process DROP_SOURCE_DB {
+
+    input:
+    tuple val(job_id), val(db_name)
+
+    script:
+    println "Dropping source database: ${db_name}"
+
+    """
+    echo "Attempting to drop database ${db_name} if it exists..."
+
+    mysql -h $params.source_host -P $params.source_port -u $params.dba_user -p$params.dba_pwd -e "DROP DATABASE IF EXISTS ${db_name};"
+
+    echo "Drop operation complete."
+    """
+
+}
\ No newline at end of file
diff --git a/nextflow/modules/db_cleanup/generate_sql.nf b/nextflow/modules/db_cleanup/generate_sql.nf
new file mode 100644
index 000000000..14eff6321
--- /dev/null
+++ b/nextflow/modules/db_cleanup/generate_sql.nf
@@ -0,0 +1,44 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+nextflow.enable.dsl=2
+
+process GENERATE_SQL {
+
+    // publishDir "sql/${db_name}", mode: 'copy', overwrite: true // Publish SQL files to 'sql' directory
+
+    input:
+    tuple val(job_id), val(db_name) // Get job ID and db name from job_info_ch
+
+    output:
+    path "${db_name}.sql", emit: sql_output_file // Output pattern to capture SQL files
+
+    script:
+    println "Generating SQL for db: ${db_name}"
+    """
+    # add max package size to account for dna db table size
+    mysqldump --max-allowed-packet=2048M --opt --quick -h ${params.target_host} -P ${params.target_port} -u ensro ${db_name}_tmp > ${db_name}.sql
+
+    # add sleep to let file system finish file dump
+    sleep 180
+    """
+    // Keep this table-based code in case we want to bring this back at later date
+    // # For each table found in the db, dump it out to file
+    // for table in \$(mysql -h ${params.target_host} -P ${params.target_port} -u ensro -N -e 'SHOW TABLES' ${db_name}_tmp); do
+    //     mysqldump --max-allowed-packet=1G --column-statistics=0 -h ${params.target_host} -P ${params.target_port} -u ensro --routines --triggers --add-drop-table ${db_name}_tmp \${table} > \${table}.sql
+    //     #mysqldump --max-allowed-packet=1G --column-statistics=0 -h ${params.target_host} -P ${params.target_port} -u ensro --no-create-info ${db_name}_tmp \${table} > \${table}.sql
+    // done
+    // """
+}
\ No newline at end of file
diff --git a/nextflow/modules/db_cleanup/monitor_db_copy.nf b/nextflow/modules/db_cleanup/monitor_db_copy.nf
new file mode 100644
index 000000000..336bf6eef
--- /dev/null
+++ b/nextflow/modules/db_cleanup/monitor_db_copy.nf
@@ -0,0 +1,58 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+nextflow.enable.dsl=2
+
+process MONITOR_DB_COPY {
+
+    // Get job ID and db name from the previous process
+    input:
+    tuple val(job_id), val(db_name)
+
+    output:
+    tuple val(job_id), val(db_name), emit: monitored_job
+
+    script:
+    println "Monitoring job id: ${job_id}"
+
+    """
+    # Define the API endpoint to check the status of the job
+    api_url="https://services.ensembl-production.ebi.ac.uk/api/dbcopy/requestjob/${job_id}"
+
+    # Set the interval for checking the status (e.g., every so many seconds)
+    interval=60
+
+    # Polling loop
+    while true; do
+        # Fetch job status
+        status=\$(curl -s -X GET \$api_url | jq -r '.overall_status')
+
+        # Print the status to the Nextflow log
+        echo "Job ID: ${job_id} - Status: \$status"
+
+        # Check if the job is completed or failed
+        if [ "\$status" = "Complete" ]; then
+            echo "Job ID: ${job_id} has completed."
+            break
+        elif [ "\$status" = "Failed" ]; then
+            echo "Job ID: ${job_id} has failed."
+            exit 1
+        fi
+
+        # Wait for the next interval before checking the status again
+        sleep \$interval
+    done
+    """
+}
\ No newline at end of file
diff --git a/nextflow/workflows/db_cleanup/main.nf b/nextflow/workflows/db_cleanup/main.nf
new file mode 100644
index 000000000..030606d36
--- /dev/null
+++ b/nextflow/workflows/db_cleanup/main.nf
@@ -0,0 +1,158 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+nextflow.enable.dsl=2
+
+// modules to include
+include { DB_COPY_SUBMIT } from '../../modules/db_cleanup/db_copy_submit.nf'
+include { MONITOR_DB_COPY } from '../../modules/db_cleanup/monitor_db_copy.nf'
+include { GENERATE_SQL } from '../../modules/db_cleanup/generate_sql.nf'
+include { COMPRESS_FILE } from '../../modules/db_cleanup/compress_file.nf'
+include { CLEANUP_TMP_DB } from '../../modules/db_cleanup/cleanup_tmp_db.nf'
+include { DROP_SOURCE_DB } from '../../modules/db_cleanup/drop_source_db.nf'
+
+// nf-schema-related modules
+include { validateParameters; paramsSummaryLog; samplesheetToList } from 'plugin/nf-schema'
+
+// Validate input parameters
+validateParameters()
+
+// Print summary of supplied parameters
+// via nf-schema plugin
+log.info paramsSummaryLog(workflow)
+
+// default params
+// params.source_host = ""
+// params.source_port = ""
+// params.db_list = []
+// params.target_host = "mysql-ens-core-prod-1"
+// params.target_port = "4524"
+// params.target_path = ""
+// params.drop_db = false
+// params.email = ""
+// params.user = ""
+
+log.info """\
+
+    INFO ON PARAMETERS CURRENTLY SET:
+
+    General parameters
+    ==================
+    workDir                : ${workDir}
+    launchDir              : ${launchDir}
+    projectDir             : ${projectDir}
+    email address for HPC  : ${params.email}
+    user                   : ${params.user}
+    target path for output : ${params.target_path}
+
+    Database parameters
+    ===================
+    db list                : ${params.db_list}
+    source db host         : ${params.source_host}
+    source db port         : ${params.source_port}
+    target db host         : ${params.target_host}
+    target db port         : ${params.target_port}
+    drop source db at end  : ${params.drop_source_db}
+
+    """
+    .stripIndent(true)
+
+
+
+// Process not currently in use as changed to using single file
+// for whole db, so no longer archiving a group of table sql files.
+// Leaving here in case need it in future - needs testing before
+// moving to own module file.
+process TAR_COMPRESSED_SQL {
+
+    input:
+    path compressed_sql_list // The list of compressed SQL files
+    tuple val(job_id), val(db_name) // Get job ID and db name
+
+
+    output:
+    path "${db_name}.tar.bz2" // The final tar.bz2 archive
+
+    script:
+    // Print a message to inform the user about the archiving
+    println "Archiving SQL files for database: ${db_name}"
+    println "Compressed files: ${compressed_sql_list.join(', ')}"
+    println "Creating archive: ${db_name}_archive.tar.bz2"
+    """
+    # Create a tar archive with all the compressed SQL files
+    tar -cjf ${db_name}.tar.bz2 ${compressed_sql_list.join(' ')}
+    """
+}
+
+workflow {
+
+    main:
+
+    // Print the raw db_list to ensure it's being passed properly
+    println "Raw params.db_list: ${params.db_list}"
+
+    // Split the string into a list and print it
+    db_list = params.db_list.split(',')
+    println "Split db_list: ${db_list}"
+
+    // Check if the split resulted in an empty list
+    if (db_list.size() == 0) {
+        println "ERROR: db_list is empty after split"
+        exit 1
+    }
+
+    // Create channel of dbs to copy from user list
+    // Set the channel to a variable for use in DB_COPY
+    Channel
+        .from(db_list)
+        .view()
+        .set { db_names_ch }
+
+    // Submit the db copy job(s)
+    result = DB_COPY_SUBMIT(db_names_ch)
+
+    // Extract the job id and map to db name
+    DB_COPY_SUBMIT.out.job_info_ch
+        .map { job_id_file, db_name ->
+            def job_id = job_id_file.text.trim() // Read and trim the contents of job_id.txt
+            tuple(job_id, db_name) // Return the tuple (job_id, db_name)
+        }
+        .set { job_info_mapped_ch }
+
+    // View the mapped channel contents
+    job_info_mapped_ch.view()
+
+    // Monitor the db copy job
+    MONITOR_DB_COPY(job_info_mapped_ch)
+
+    // Generate SQL files
+    GENERATE_SQL(MONITOR_DB_COPY.out.monitored_job)
+
+    // View the generated files
+    GENERATE_SQL.out.sql_output_file.view()
+
+    // Compress the SQL file
+    // also outputs compressed file to final storage path
+    compressed_sql_ch = COMPRESS_FILE(GENERATE_SQL.out.sql_output_file)
+
+    // Cleanup the temp db created by this pipeline
+    CLEANUP_TMP_DB(compressed_sql_ch, MONITOR_DB_COPY.out.monitored_job)
+
+    // Cleanup source db (if flag set to true)
+    if (params.drop_source_db == true) {
+        DROP_SOURCE_DB(CLEANUP_TMP_DB.out.cleaned_up)
+    }
+
+}
diff --git a/nextflow/workflows/db_cleanup/nextflow.config b/nextflow/workflows/db_cleanup/nextflow.config
new file mode 100644
index 000000000..9e722eca2
--- /dev/null
+++ b/nextflow/workflows/db_cleanup/nextflow.config
@@ -0,0 +1,27 @@
+// required plugins
+// https://nextflow-io.github.io/nf-schema/latest/#quick-start
+plugins {
+    id 'nf-schema@2.1.1'
+}
+
+// define parameters
+params.source_host = "mysql-ens-sta-6"
+params.source_port = "4695"
+params.target_host = "mysql-ens-core-prod-1"
+params.target_port = "4524"
+params.db_list = "capra_hircus_core_110_1"
+params.user = "nwillhoft"
+params.email = "nwillhoft@ebi.ac.uk"
+params.target_path = "${launchDir}/zip"
+params.drop_source_db = false
+
+// https://nextflow-io.github.io/nf-schema/latest/configuration/configuration/#help
+validation.help.enabled = true // default: false
+
+// enable the creation of the help message
+// https://nextflow-io.github.io/nf-schema/latest/#quick-start
+validation {
+    help {
+        enabled = true
+    }
+}
\ No newline at end of file
diff --git a/nextflow/workflows/db_cleanup/nextflow_schema.json b/nextflow/workflows/db_cleanup/nextflow_schema.json
new file mode 100644
index 000000000..a3f4d816e
--- /dev/null
+++ b/nextflow/workflows/db_cleanup/nextflow_schema.json
@@ -0,0 +1,66 @@
+{
+    "$schema": "https://json-schema.org/draft/2020-12/schema",
+    "$id": "https://raw.githubusercontent.com/Ensembl/ensembl-production/master/nextflow/workflows/db_cleanup/nextflow_schema.json",
+    "title": "Database Cleanup Pipeline",
+    "description": "Pipeline to clean up databases by generating SQL dumps for long-term storage",
+    "type": "object",
+    "properties": {
+        "source_host": {
+            "type": "string",
+            "description": "The host for connecting to the source database"
+        },
+        "source_port": {
+            "type": "string",
+            "description": "The port for connecting to the source database"
+        },
+        "db_list": {
+            "type": "string",
+            "description": "Comma-separated list of database names to process"
+        },
+        "target_host": {
+            "type": "string",
+            "description": "The host for connecting to the target database",
+            "default": "mysql-ens-core-prod-1"
+        },
+        "target_port": {
+            "type": "string",
+            "description": "The port for connecting to the target database",
+            "default": "4524"
+        },
+        "target_path": {
+            "type": "string",
+            "description": "Path to store final compressed SQL file"
+        },
+        "drop_source_db": {
+            "type": "boolean",
+            "description": "Boolean to drop the source database after put into storage",
+            "default": false
+        },
+        "dba_user": {
+            "type": "string",
+            "description": "Username for db admin user. Used to drop a database"
+        },
+        "dba_pwd": {
+            "type": "string",
+            "description": "Password for db admin user. Used to drop a database"
+        },
+        "email": {
+            "type": "string",
+            "description": "Email address of user. Required for dbcopy-client service to run"
+        },
+        "user": {
+            "type": "string",
+            "description": "User name to use for dbcopy-client service"
+        }
+    },
+    "required": [
+        "source_host",
+        "source_port",
+        "db_list",
+        "target_host",
+        "target_port",
+        "target_path",
+        "dba_user", "dba_pwd",
+        "email", "user"
+    ]
+}
\ No newline at end of file