From d32a65324257cbaea019a4a1342f6bcc8967954f Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Wed, 9 Oct 2024 12:24:30 +0100
Subject: [PATCH 01/20] Initial commit of db cleanup pipeline

---
 nextflow/workflows/main.nf | 279 +++++++++++++++++++++++++++++++++++++
 1 file changed, 279 insertions(+)
 create mode 100644 nextflow/workflows/main.nf

diff --git a/nextflow/workflows/main.nf b/nextflow/workflows/main.nf
new file mode 100644
index 000000000..04f02ac54
--- /dev/null
+++ b/nextflow/workflows/main.nf
@@ -0,0 +1,279 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+nextflow.enable.dsl=2
+
+// default params
+params.source_host = ""
+params.source_port = ""
+// params.db_list = params.db_list?.split(',') as List // https://github.com/nextflow-io/nextflow/discussions/2821
+params.db_list = []
+params.target_host = "mysql-ens-core-prod-1"
+params.target_port = "4524"
+params.target_path = ""
+params.drop_db = false
+params.email = ""
+params.user = "ensro"
+
+log.info """\
+
+    INFO ON PARAMETERS CURRENTLY SET:
+
+    General parameters
+    ==================
+    workDir                : ${workDir}
+    launchDir              : ${launchDir}
+    projectDir             : ${projectDir}
+    db list                : ${params.db_list}
+    email address for HPC  : ${params.email}
+    user                   : ${params.user}
+
+    """
+    .stripIndent(true)
+
+
+process DB_COPY_SUBMIT {
+
+    input:
+    val db_name
+
+    output:
+    tuple path('job_id.txt'), val(db_name), emit: job_info_ch
+
+    script:
+    api_url="https://services.ensembl-production.ebi.ac.uk/api/dbcopy/requestjob"
+    source_db="${params.source_host}:${params.source_port}"
+    target_db="${params.target_host}:${params.target_port}"
+
+    println "Submitting dbcopy job for $db_name"
+
+    """
+    # Submit the job via dbcopy-client
+    dbcopy-client -u $api_url -a submit -s $source_db -t $target_db -i $db_name -n ${db_name}_tmp -e $params.email -r $params.user --wipe_target 1 --skip-check &> out.log
+
+    # sleep to allow file to be created
+    sleep 10
+
+    # Extract the job id
+    job_id=\$(grep -o '[0-9a-f\\-]\\{36\\}' out.log)
+    echo \$job_id > job_id.txt
+    """
+}
+
+// process EXTRACT_JOB_ID {
+
+//     input:
+//     val db_name
+//     path job_id_file
+
+//     output:
+//     tuple val(job_id_value), val(db_name), emit: job_info_ch
+
+//     script:
+//     // """
+//     // # Read the job ID from the job_id_file
+//     // #job_id_value=\$(cat $job_id_file)
+//     // #echo "Job ID: \$job_id_value"
+//     // """
+//     // Define a Groovy variable to hold the job ID
+//     // Convert job_id_file (which is a path) to a Groovy File object and read its contents
+//     def job_id_value = job_id_file.newReader().text.trim() // New unique variable name
+//     println "Job ID: $job_id_value" // Print the job ID for debugging
+// }
+
+process MONITOR_DB_COPY {
+
+    input:
+    tuple val(job_id), val(db_name) // Get job ID and db name from the previous process
+
+    output:
+    tuple val(job_id), val(db_name), emit: monitored_job
+
+    script:
+    """
+    # Define the API endpoint to check the status of the job
+    api_url="https://services.ensembl-production.ebi.ac.uk/api/dbcopy/requestjob/${job_id}"
+
+    # Set the interval for checking the status (e.g., every so many seconds)
+    interval=60
+
+    # Polling loop
+    while true; do
+        # Fetch job status
+        status=\$(curl -s -X GET \$api_url | jq -r '.overall_status')
+
+        # Print the status to the Nextflow log
+        echo "Job ID: ${job_id} - Status: \$status"
+
+        # Check if the job is completed or failed
+        if [ "\$status" = "Complete" ]; then
+            echo "Job ID: ${job_id} has completed."
+            break
+        elif [ "\$status" = "Failed" ]; then
+            echo "Job ID: ${job_id} has failed."
+            exit 1
+        fi
+
+        # Wait for the next interval before checking the status again
+        sleep \$interval
+    done
+    """
+}
+
+
+process GENERATE_SQL {
+
+    publishDir "sql/${db_name}", mode: 'copy', overwrite: true // Publish SQL files to 'sql' directory
+
+    input:
+    tuple val(job_id), val(db_name) // Get job ID and db name from job_info_ch
+
+    output:
+    path "${db_name}.sql", emit: sql_output_file // Output pattern to capture SQL files
+
+    script:
+    println "Generating SQL for db: ${db_name}"
+    """
+    # add max packet size to account for dna db table size
+    mysqldump --max-allowed-packet=2048M --opt --quick --skip-column-statistics -h ${params.target_host} -P ${params.target_port} -u ensro ${db_name}_tmp > ${db_name}.sql
+    sleep 180
+    """
+    // # For each table found in the db, dump it out to file
+    // for table in \$(mysql -h ${params.target_host} -P ${params.target_port} -u ensro -N -e 'SHOW TABLES' ${db_name}_tmp); do
+    //     mysqldump --max-allowed-packet=1G --column-statistics=0 -h ${params.target_host} -P ${params.target_port} -u ensro --routines --triggers --add-drop-table ${db_name}_tmp \${table} > \${table}.sql
+    //     #mysqldump --max-allowed-packet=1G --column-statistics=0 -h ${params.target_host} -P ${params.target_port} -u ensro --no-create-info ${db_name}_tmp \${table} > \${table}.sql
+    // done
+    // """
+}
+
+process COMPRESS_FILES {
+
+    // get working and then check which compression method to use
+
+    publishDir "zip/", mode: 'copy', overwrite: true
+
+    input:
+    path sql_file
+
+    output:
+    path "${sql_file}.bz2", emit: compressed_sql_ch // Output compressed table-named file into a channel
+
+    script:
+    println "Compressing file: ${sql_file}"
+
+    """
+    # Ensure the file is copied to the current work dir, not linked
+    cp ${sql_file} ./temp_file.sql
+
+    # Compress the file
+    #bzip2 \$(realpath temp_file.sql)
+    bzip2 temp_file.sql
+
+    # Rename file
+    mv temp_file.sql.bz2 ${sql_file}.bz2
+    """
+}
+
+process TAR_COMPRESSED_SQL {
+
+    input:
+    path compressed_sql_list // The list of compressed SQL files
+    tuple val(job_id), val(db_name) // Get job ID and db name
+
+
+    output:
+    path "${db_name}.tar.bz2" // The final tar.bz2 archive
+
+    script:
+    // Print a message to inform the user about the archiving
+    println "Archiving SQL files for database: ${db_name}"
+    println "Compressed files: ${compressed_sql_list.join(', ')}"
+    println "Creating archive: ${db_name}_archive.tar.bz2"
+    """
+    # Create a tar archive with all the compressed SQL files
+    tar -cjf ${db_name}.tar.bz2 ${compressed_sql_list.join(' ')}
+    """
+}
+
+
+workflow {
+
+    main:
+
+    // Print the raw db_list to ensure it's being passed properly
+    println "Raw params.db_list: ${params.db_list}"
+
+    // Check if params.db_list is null or empty
+    if (params.db_list == null || params.db_list == '') {
+        println "ERROR: params.db_list is null or empty"
+        exit 1
+    }
+
+    // Split the string into a list and print it
+    db_list = params.db_list.split(',')
+    println "Split db_list: ${db_list}"
+
+    // Check if the split resulted in an empty list
+    if (db_list.size() == 0) {
+        println "ERROR: db_list is empty after split"
+        exit 1
+    }
+
+    // Create channel of dbs to copy from user list
+    // Set the channel to a variable for use in DB_COPY
+    Channel
+        .from(db_list)
+        .view()
+        .set { db_names_ch }
+
+    // Submit the db copy job(s)
+    result = DB_COPY_SUBMIT(db_names_ch)
+
+    // Extract the job id and map to db name
+    DB_COPY_SUBMIT.out.job_info_ch
+        .map { job_id_file, db_name ->
+            def job_id = job_id_file.text.trim() // Read and trim the contents of job_id.txt
+            tuple(job_id, db_name) // Return the tuple (job_id, db_name)
+        }
+        .set { job_info_mapped_ch }
+
+    // View the mapped channel contents
+    job_info_mapped_ch.view()
+
+    // Monitor the db copy job
+    MONITOR_DB_COPY(job_info_mapped_ch)
+
+    // Generate SQL files
+    GENERATE_SQL(MONITOR_DB_COPY.out.monitored_job)
+
+    // sql_file_ch = Channel.of("sql/${db_name}/${db_name}.sql")
+
+    GENERATE_SQL.out.sql_output_file.view()
+
+    // Compress the SQL files
+    compressed_sql_ch = COMPRESS_FILES(GENERATE_SQL.out.sql_output_file)
+    // compressed_sql_ch = COMPRESS_FILES(sql_file_ch)
+
+    // Collect the compressed SQL files into a list
+    // compressed_sql_list = compressed_sql_ch.collect()
+
+    // compressed_sql_list.view()
+
+    // archive the SQL files
+    // TAR_COMPRESSED_SQL(compressed_sql_list, job_info_mapped_ch)
+
+    // move archives to final storage path
+    // use the datamover queue for copying things over?
+}

From 6ee16cd1043035ec60895cd2ab34078ca32667a2 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Wed, 9 Oct 2024 12:34:21 +0100
Subject: [PATCH 02/20] Rename nf file

---
 nextflow/workflows/{main.nf => dbCleanup.nf} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename nextflow/workflows/{main.nf => dbCleanup.nf} (100%)

diff --git a/nextflow/workflows/main.nf b/nextflow/workflows/dbCleanup.nf
similarity index 100%
rename from nextflow/workflows/main.nf
rename to nextflow/workflows/dbCleanup.nf

From 5884bb970533db3311424c94463a5172f8b1cbc7 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Wed, 9 Oct 2024 12:54:46 +0100
Subject: [PATCH 03/20] Update print outs in log info

---
 nextflow/workflows/dbCleanup.nf | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/nextflow/workflows/dbCleanup.nf b/nextflow/workflows/dbCleanup.nf
index 04f02ac54..536c2fbef 100644
--- a/nextflow/workflows/dbCleanup.nf
+++ b/nextflow/workflows/dbCleanup.nf
@@ -18,7 +18,6 @@ nextflow.enable.dsl=2
 // default params
 params.source_host = ""
 params.source_port = ""
-// params.db_list = params.db_list?.split(',') as List // https://github.com/nextflow-io/nextflow/discussions/2821
 params.db_list = []
 params.target_host = "mysql-ens-core-prod-1"
 params.target_port = "4524"
@@ -36,9 +35,18 @@ log.info """\
     workDir                : ${workDir}
     launchDir              : ${launchDir}
     projectDir             : ${projectDir}
-    db list                : ${params.db_list}
     email address for HPC  : ${params.email}
     user                   : ${params.user}
+    target path for output : ${params.target_path}
+
+    Database parameters
+    ===================
+    db list                : ${params.db_list}
+    source db host         : ${params.source_host}
+    source db port         : ${params.source_port}
+    target db host         : ${params.target_host}
+    target db port         : ${params.target_port}
+    drop source db at end  : ${params.drop_db}
 
     """
     .stripIndent(true)
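Note on patches 01-03 (a sketch, not part of the series): DB_COPY_SUBMIT trusts a fixed "sleep 10" to cover the gap between submitting the copy job and the UUID appearing in out.log. A bounded retry is a less timing-sensitive version of the same step; it assumes only what the existing grep already assumes, namely that dbcopy-client writes the 36-character job UUID to its log. Shown as it would appear inside the process script block:

    # Poll out.log for the job id instead of sleeping a fixed 10 seconds
    # (up to 12 x 5s = 60s)
    for attempt in \$(seq 1 12); do
        job_id=\$(grep -o '[0-9a-f\\-]\\{36\\}' out.log || true)
        [ -n "\$job_id" ] && break
        sleep 5
    done
    [ -n "\$job_id" ] || { echo "ERROR: no job id found in out.log" >&2; exit 1; }
    echo \$job_id > job_id.txt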
From 7126e513ac4836f56a971845d13510d18d32356b Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Wed, 9 Oct 2024 13:12:53 +0100
Subject: [PATCH 04/20] Move DB_COPY_SUBMIT process to module file

---
 nextflow/modules/db_cleanup/db_copy_submit.nf | 44 +++++++++++++++++++
 nextflow/workflows/dbCleanup.nf               | 28 +-----------
 2 files changed, 45 insertions(+), 27 deletions(-)
 create mode 100644 nextflow/modules/db_cleanup/db_copy_submit.nf

diff --git a/nextflow/modules/db_cleanup/db_copy_submit.nf b/nextflow/modules/db_cleanup/db_copy_submit.nf
new file mode 100644
index 000000000..2b5cc464b
--- /dev/null
+++ b/nextflow/modules/db_cleanup/db_copy_submit.nf
@@ -0,0 +1,44 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+nextflow.enable.dsl=2
+
+process DB_COPY_SUBMIT {
+
+    input:
+    val db_name
+
+    output:
+    tuple path('job_id.txt'), val(db_name), emit: job_info_ch
+
+    script:
+    api_url="https://services.ensembl-production.ebi.ac.uk/api/dbcopy/requestjob"
+    source_db="${params.source_host}:${params.source_port}"
+    target_db="${params.target_host}:${params.target_port}"
+
+    println "Submitting dbcopy job for $db_name"
+
+    """
+    # Submit the job via dbcopy-client
+    dbcopy-client -u $api_url -a submit -s $source_db -t $target_db -i $db_name -n ${db_name}_tmp -e $params.email -r $params.user --wipe_target 1 --skip-check &> out.log
+
+    # sleep to allow file to be created
+    sleep 10
+
+    # Extract the job id
+    job_id=\$(grep -o '[0-9a-f\\-]\\{36\\}' out.log)
+    echo \$job_id > job_id.txt
+    """
+}
\ No newline at end of file
diff --git a/nextflow/workflows/dbCleanup.nf b/nextflow/workflows/dbCleanup.nf
index 536c2fbef..86d064478 100644
--- a/nextflow/workflows/dbCleanup.nf
+++ b/nextflow/workflows/dbCleanup.nf
@@ -51,34 +51,8 @@ log.info """\
     """
     .stripIndent(true)
 
+include { DB_COPY_SUBMIT } from '../modules/db_cleanup/db_copy_submit.nf'
 

From 4f791e981ad63df2f4372734f06f286a405427 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Wed, 9 Oct 2024 13:45:52 +0100
Subject: [PATCH 05/20] Move MONITOR_DB_COPY process to module file

---
 .../modules/db_cleanup/monitor_db_copy.nf | 56 +++++++++++++++++
 nextflow/workflows/dbCleanup.nf           | 63 +------------------
 2 files changed, 57 insertions(+), 62 deletions(-)
 create mode 100644 nextflow/modules/db_cleanup/monitor_db_copy.nf

diff --git a/nextflow/modules/db_cleanup/monitor_db_copy.nf b/nextflow/modules/db_cleanup/monitor_db_copy.nf
new file mode 100644
index 000000000..dd7b10e19
--- /dev/null
+++ b/nextflow/modules/db_cleanup/monitor_db_copy.nf
@@ -0,0 +1,56 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+nextflow.enable.dsl=2
+
+process MONITOR_DB_COPY {
+
+    // Get job ID and db name from the previous process
+    input:
+    tuple val(job_id), val(db_name)
+
+    output:
+    tuple val(job_id), val(db_name), emit: monitored_job
+
+    script:
+    """
+    # Define the API endpoint to check the status of the job
+    api_url="https://services.ensembl-production.ebi.ac.uk/api/dbcopy/requestjob/${job_id}"
+
+    # Set the interval for checking the status (e.g., every so many seconds)
+    interval=60
+
+    # Polling loop
+    while true; do
+        # Fetch job status
+        status=\$(curl -s -X GET \$api_url | jq -r '.overall_status')
+
+        # Print the status to the Nextflow log
+        echo "Job ID: ${job_id} - Status: \$status"
+
+        # Check if the job is completed or failed
+        if [ "\$status" = "Complete" ]; then
+            echo "Job ID: ${job_id} has completed."
+            break
+        elif [ "\$status" = "Failed" ]; then
+            echo "Job ID: ${job_id} has failed."
+            exit 1
+        fi
+
+        # Wait for the next interval before checking the status again
+        sleep \$interval
+    done
+    """
+}
\ No newline at end of file
diff --git a/nextflow/workflows/dbCleanup.nf b/nextflow/workflows/dbCleanup.nf
index 86d064478..5306d7912 100644
--- a/nextflow/workflows/dbCleanup.nf
+++ b/nextflow/workflows/dbCleanup.nf
@@ -52,68 +52,7 @@ log.info """\
     .stripIndent(true)
 
 include { DB_COPY_SUBMIT } from '../modules/db_cleanup/db_copy_submit.nf'
+include { MONITOR_DB_COPY } from '../modules/db_cleanup/monitor_db_copy.nf'
 
 process GENERATE_SQL {
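Note on patch 05 (a sketch, not part of the series): the polling loop in MONITOR_DB_COPY runs forever if the copy service keeps reporting something other than "Complete" or "Failed". A maximum-attempts guard is one possible hardening, written against the same endpoint and jq field the process already uses; the API's status vocabulary is carried over from the patch, not independently verified:

    # Bounded polling: give up after max_checks iterations
    max_checks=720   # e.g. 12 hours at 60-second intervals
    for check in \$(seq 1 \$max_checks); do
        status=\$(curl -s -X GET \$api_url | jq -r '.overall_status')
        echo "Job ID: ${job_id} - Status: \$status"
        [ "\$status" = "Complete" ] && exit 0
        [ "\$status" = "Failed" ] && exit 1
        sleep \$interval
    done
    echo "Job ID: ${job_id} did not finish within \$max_checks checks" >&2
    exit 1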
From 7c7e1f9d622430861b768cfb313cebd0d66b0469 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Wed, 9 Oct 2024 14:05:48 +0100
Subject: [PATCH 06/20] Move GENERATE_SQL process to module file

---
 nextflow/modules/db_cleanup/generate_sql.nf | 44 +++++++++++++++++++++
 nextflow/workflows/dbCleanup.nf             | 26 +-----------
 2 files changed, 45 insertions(+), 25 deletions(-)
 create mode 100644 nextflow/modules/db_cleanup/generate_sql.nf

diff --git a/nextflow/modules/db_cleanup/generate_sql.nf b/nextflow/modules/db_cleanup/generate_sql.nf
new file mode 100644
index 000000000..5df291640
--- /dev/null
+++ b/nextflow/modules/db_cleanup/generate_sql.nf
@@ -0,0 +1,44 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+nextflow.enable.dsl=2
+
+process GENERATE_SQL {
+
+    publishDir "sql/${db_name}", mode: 'copy', overwrite: true // Publish SQL files to 'sql' directory
+
+    input:
+    tuple val(job_id), val(db_name) // Get job ID and db name from job_info_ch
+
+    output:
+    path "${db_name}.sql", emit: sql_output_file // Output pattern to capture SQL files
+
+    script:
+    println "Generating SQL for db: ${db_name}"
+    """
+    # add max packet size to account for dna db table size
+    mysqldump --max-allowed-packet=2048M --opt --quick --skip-column-statistics -h ${params.target_host} -P ${params.target_port} -u ensro ${db_name}_tmp > ${db_name}.sql
+
+    # add sleep to let file system finish file dump
+    sleep 180
+    """
+    // Keep this table-based code in case we want to bring this back at later date
+    // # For each table found in the db, dump it out to file
+    // for table in \$(mysql -h ${params.target_host} -P ${params.target_port} -u ensro -N -e 'SHOW TABLES' ${db_name}_tmp); do
+    //     mysqldump --max-allowed-packet=1G --column-statistics=0 -h ${params.target_host} -P ${params.target_port} -u ensro --routines --triggers --add-drop-table ${db_name}_tmp \${table} > \${table}.sql
+    //     #mysqldump --max-allowed-packet=1G --column-statistics=0 -h ${params.target_host} -P ${params.target_port} -u ensro --no-create-info ${db_name}_tmp \${table} > \${table}.sql
+    // done
+    // """
+}
\ No newline at end of file
diff --git a/nextflow/workflows/dbCleanup.nf b/nextflow/workflows/dbCleanup.nf
index 5306d7912..d5ec11cbf 100644
--- a/nextflow/workflows/dbCleanup.nf
+++ b/nextflow/workflows/dbCleanup.nf
@@ -53,31 +53,7 @@ log.info """\
 include { DB_COPY_SUBMIT } from '../modules/db_cleanup/db_copy_submit.nf'
 include { MONITOR_DB_COPY } from '../modules/db_cleanup/monitor_db_copy.nf'
+include { GENERATE_SQL } from '../modules/db_cleanup/generate_sql.nf'
 
 process COMPRESS_FILES {
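Note on patch 06 (a sketch, not part of the series): the 180-second sleep stands in for "wait until the dump file is fully written". Since mysqldump appends a completion marker when it finishes cleanly, checking for that trailer is a more direct test. This assumes a mysqldump build that writes the standard "-- Dump completed" comment (it does not with --skip-comments or --compact); shown with Nextflow-style escaping as it would sit inside the script block:

    # Verify the dump ran to completion instead of sleeping a fixed time
    mysqldump --max-allowed-packet=2048M --opt --quick \\
        -h ${params.target_host} -P ${params.target_port} -u ensro \\
        ${db_name}_tmp > ${db_name}.sql
    tail -n 3 ${db_name}.sql | grep -q -- '-- Dump completed' || {
        echo "ERROR: ${db_name}.sql looks truncated" >&2
        exit 1
    }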
From 09bb5e07a15b84e13d4e7664509bcf09a6af8 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Wed, 9 Oct 2024 14:29:26 +0100
Subject: [PATCH 07/20] Move COMPRESS_FILE process to module file. Update path
 for resulting output.

---
 nextflow/modules/db_cleanup/compress_file.nf | 45 +++++++++++++++++
 nextflow/workflows/dbCleanup.nf              | 52 ++++----------------
 2 files changed, 54 insertions(+), 43 deletions(-)
 create mode 100644 nextflow/modules/db_cleanup/compress_file.nf

diff --git a/nextflow/modules/db_cleanup/compress_file.nf b/nextflow/modules/db_cleanup/compress_file.nf
new file mode 100644
index 000000000..d54684c60
--- /dev/null
+++ b/nextflow/modules/db_cleanup/compress_file.nf
@@ -0,0 +1,45 @@
+// See the NOTICE file distributed with this work for additional information
+// regarding copyright ownership.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+nextflow.enable.dsl=2
+
+process COMPRESS_FILE {
+
+    // get working and then check which compression method to use
+
+    // copies compressed file to user-provided target path
+    publishDir params.target_path, mode: 'copy', overwrite: true
+
+    input:
+    path sql_file
+
+    output:
+    path "${sql_file}.bz2", emit: compressed_sql_ch // Output compressed table-named file into a channel
+
+    script:
+    println "Compressing file: ${sql_file}"
+
+    """
+    # Ensure the file is copied to the current work dir, not linked
+    cp ${sql_file} ./temp_file.sql
+
+    # Compress the file
+    #bzip2 \$(realpath temp_file.sql)
+    bzip2 temp_file.sql
+
+    # Rename file
+    mv temp_file.sql.bz2 ${sql_file}.bz2
+    """
+}
\ No newline at end of file
diff --git a/nextflow/workflows/dbCleanup.nf b/nextflow/workflows/dbCleanup.nf
index d5ec11cbf..bcea1a375 100644
--- a/nextflow/workflows/dbCleanup.nf
+++ b/nextflow/workflows/dbCleanup.nf
@@ -54,35 +54,12 @@ log.info """\
 include { DB_COPY_SUBMIT } from '../modules/db_cleanup/db_copy_submit.nf'
 include { MONITOR_DB_COPY } from '../modules/db_cleanup/monitor_db_copy.nf'
 include { GENERATE_SQL } from '../modules/db_cleanup/generate_sql.nf'
+include { COMPRESS_FILE } from '../modules/db_cleanup/compress_file.nf'
 
+// Process not currently in use as changed to using single file
+// for whole db, so no longer archiving a group of table sql files.
+// Leaving here in case need it in future - needs testing before
+// moving to own module file.
 process TAR_COMPRESSED_SQL {
 
     input:
     path compressed_sql_list // The list of compressed SQL files
     tuple val(job_id), val(db_name) // Get job ID and db name
@@ -155,22 +132,11 @@ workflow {
     // Generate SQL files
     GENERATE_SQL(MONITOR_DB_COPY.out.monitored_job)
 
-    // sql_file_ch = Channel.of("sql/${db_name}/${db_name}.sql")
-
+    // View the generated files
     GENERATE_SQL.out.sql_output_file.view()
 
-    // Compress the SQL files
-    compressed_sql_ch = COMPRESS_FILES(GENERATE_SQL.out.sql_output_file)
-    // compressed_sql_ch = COMPRESS_FILES(sql_file_ch)
-
-    // Collect the compressed SQL files into a list
-    // compressed_sql_list = compressed_sql_ch.collect()
-
-    // compressed_sql_list.view()
-
-    // archive the SQL files
-    // TAR_COMPRESSED_SQL(compressed_sql_list, job_info_mapped_ch)
+    // Compress the SQL file
+    // also outputs compressed file to final storage path
+    compressed_sql_ch = COMPRESS_FILE(GENERATE_SQL.out.sql_output_file)
 
-    // move archives to final storage path
-    // use the datamover queue for copying things over?
 }
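Note on patch 07 (a sketch, not part of the series): the copy-compress-rename dance in COMPRESS_FILE works around bzip2 refusing to compress a staged symlink in place. Streaming through stdout avoids the temporary file while leaving the input untouched, with the same output name:

    # bzip2 -c reads the (possibly symlinked) input and writes a new file
    bzip2 -c ${sql_file} > ${sql_file}.bz2

If wall-clock time matters for large core databases, a parallel compressor such as pbzip2 (its availability on the execution hosts is an assumption) could be dropped in behind the same output name.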
From 707231ded9260c4cac9eba6eeab295d0e434f996 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Fri, 11 Oct 2024 11:44:14 +0100
Subject: [PATCH 08/20] No longer publish intermediate sql files

---
 nextflow/modules/db_cleanup/generate_sql.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow/modules/db_cleanup/generate_sql.nf b/nextflow/modules/db_cleanup/generate_sql.nf
index 5df291640..106f3134f 100644
--- a/nextflow/modules/db_cleanup/generate_sql.nf
+++ b/nextflow/modules/db_cleanup/generate_sql.nf
@@ -17,7 +17,7 @@ nextflow.enable.dsl=2
 
 process GENERATE_SQL {
 
-    publishDir "sql/${db_name}", mode: 'copy', overwrite: true // Publish SQL files to 'sql' directory
+    // publishDir "sql/${db_name}", mode: 'copy', overwrite: true // Publish SQL files to 'sql' directory
 
     input:
     tuple val(job_id), val(db_name) // Get job ID and db name from job_info_ch

From f680105a95d3dc0a56020c89c8a45408794feaaf Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Fri, 11 Oct 2024 11:45:11 +0100
Subject: [PATCH 09/20] Rename main nextflow script. Follow Ensembl nextflow
 dir structure protocol.

---
 nextflow/workflows/{dbCleanup.nf => db_cleanup/main.nf} | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
 rename nextflow/workflows/{dbCleanup.nf => db_cleanup/main.nf} (93%)

diff --git a/nextflow/workflows/dbCleanup.nf b/nextflow/workflows/db_cleanup/main.nf
similarity index 93%
rename from nextflow/workflows/dbCleanup.nf
rename to nextflow/workflows/db_cleanup/main.nf
index bcea1a375..f912504b9 100644
--- a/nextflow/workflows/dbCleanup.nf
+++ b/nextflow/workflows/db_cleanup/main.nf
@@ -51,10 +51,10 @@ log.info """\
     """
     .stripIndent(true)
 
-include { DB_COPY_SUBMIT } from '../modules/db_cleanup/db_copy_submit.nf'
-include { MONITOR_DB_COPY } from '../modules/db_cleanup/monitor_db_copy.nf'
-include { GENERATE_SQL } from '../modules/db_cleanup/generate_sql.nf'
-include { COMPRESS_FILE } from '../modules/db_cleanup/compress_file.nf'
+include { DB_COPY_SUBMIT } from '../../modules/db_cleanup/db_copy_submit.nf'
+include { MONITOR_DB_COPY } from '../../modules/db_cleanup/monitor_db_copy.nf'
+include { GENERATE_SQL } from '../../modules/db_cleanup/generate_sql.nf'
+include { COMPRESS_FILE } from '../../modules/db_cleanup/compress_file.nf'
 
 // Process not currently in use as changed to using single file
 // for whole db, so no longer archiving a group of table sql files.
From 5db2b86d64947c42494ba8a566099726b5284ca6 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Fri, 11 Oct 2024 16:44:35 +0100
Subject: [PATCH 10/20] Use nf-schema for parameter validation and help info

---
 nextflow/workflows/db_cleanup/main.nf         | 39 ++++++++-----
 nextflow/workflows/db_cleanup/nextflow.config | 27 +++++++++
 .../workflows/db_cleanup/nextflow_schema.json | 56 +++++++++++++++++++
 3 files changed, 109 insertions(+), 13 deletions(-)
 create mode 100644 nextflow/workflows/db_cleanup/nextflow.config
 create mode 100644 nextflow/workflows/db_cleanup/nextflow_schema.json

diff --git a/nextflow/workflows/db_cleanup/main.nf b/nextflow/workflows/db_cleanup/main.nf
index f912504b9..c39e7e80f 100644
--- a/nextflow/workflows/db_cleanup/main.nf
+++ b/nextflow/workflows/db_cleanup/main.nf
@@ -15,16 +15,32 @@
 
 nextflow.enable.dsl=2
 
+// modules to include
+include { DB_COPY_SUBMIT } from '../../modules/db_cleanup/db_copy_submit.nf'
+include { MONITOR_DB_COPY } from '../../modules/db_cleanup/monitor_db_copy.nf'
+include { GENERATE_SQL } from '../../modules/db_cleanup/generate_sql.nf'
+include { COMPRESS_FILE } from '../../modules/db_cleanup/compress_file.nf'
+
+// nf-schema-related modules
+include { validateParameters; paramsSummaryLog; samplesheetToList } from 'plugin/nf-schema'
+
+// Validate input parameters
+validateParameters()
+
+// Print summary of supplied parameters
+// via nf-schema plugin
+log.info paramsSummaryLog(workflow)
+
 // default params
-params.source_host = ""
-params.source_port = ""
-params.db_list = []
-params.target_host = "mysql-ens-core-prod-1"
-params.target_port = "4524"
-params.target_path = ""
-params.drop_db = false
-params.email = ""
-params.user = "ensro"
+// params.source_host = ""
+// params.source_port = ""
+// params.db_list = []
+// params.target_host = "mysql-ens-core-prod-1"
+// params.target_port = "4524"
+// params.target_path = ""
+// params.drop_db = false
+// params.email = ""
+// params.user = ""
 
 log.info """\
 
@@ -51,10 +67,7 @@ log.info """\
     """
     .stripIndent(true)
 
-include { DB_COPY_SUBMIT } from '../../modules/db_cleanup/db_copy_submit.nf'
-include { MONITOR_DB_COPY } from '../../modules/db_cleanup/monitor_db_copy.nf'
-include { GENERATE_SQL } from '../../modules/db_cleanup/generate_sql.nf'
-include { COMPRESS_FILE } from '../../modules/db_cleanup/compress_file.nf'
+
 
 // Process not currently in use as changed to using single file
 // for whole db, so no longer archiving a group of table sql files.
diff --git a/nextflow/workflows/db_cleanup/nextflow.config b/nextflow/workflows/db_cleanup/nextflow.config
new file mode 100644
index 000000000..9e722eca2
--- /dev/null
+++ b/nextflow/workflows/db_cleanup/nextflow.config
@@ -0,0 +1,27 @@
+// required plugins
+// https://nextflow-io.github.io/nf-schema/latest/#quick-start
+plugins {
+    id 'nf-schema@2.1.1'
+}
+
+// define parameters
+params.source_host = "mysql-ens-sta-6"
+params.source_port = "4695"
+params.target_host = "mysql-ens-core-prod-1"
+params.target_port = "4524"
+params.db_list = "capra_hircus_core_110_1"
+params.user = "nwillhoft"
+params.email = "nwillhoft@ebi.ac.uk"
+params.target_path = "${launchDir}/zip"
+params.drop_db = false
+
+// https://nextflow-io.github.io/nf-schema/latest/configuration/configuration/#help
+validation.help.enabled = true // default: false
+
+// enable the creation of the help message
+// https://nextflow-io.github.io/nf-schema/latest/#quick-start
+validation {
+    help {
+        enabled = true
+    }
+}
\ No newline at end of file
diff --git a/nextflow/workflows/db_cleanup/nextflow_schema.json b/nextflow/workflows/db_cleanup/nextflow_schema.json
new file mode 100644
index 000000000..00248143f
--- /dev/null
+++ b/nextflow/workflows/db_cleanup/nextflow_schema.json
@@ -0,0 +1,56 @@
+{
+    "$schema": "https://json-schema.org/draft/2020-12/schema",
+    "$id": "https://json-schema.org/draft/2020-12/schema",
+    "title": "Database Cleanup Pipeline",
+    "description": "Pipeline to clean up databases by generating SQL dumps for long-term storage",
+    "type": "object",
+    "properties": {
+        "source_host": {
+            "type": "string",
+            "description": "The host for connecting to the source database",
+            "required": true
+        },
+        "source_port": {
+            "type": "string",
+            "description": "The port for connecting to the source database",
+            "required": true
+        },
+        "db_list": {
+            "type": "string",
+            "description": "Comma-separated list of database names to process",
+            "required": true
+        },
+        "target_host": {
+            "type": "string",
+            "description": "The host for connecting to the target database",
+            "required": true,
+            "default": "mysql-ens-core-prod-1"
+        },
+        "target_port": {
+            "type": "string",
+            "description": "The port for connecting to the target database",
+            "required": true,
+            "default": "4524"
+        },
+        "target_path": {
+            "type": "string",
+            "description": "Path to store final compressed SQL file",
+            "required": true
+        },
+        "drop_db": {
+            "type": "string",
+            "description": "Boolean to drop the source database after put into storage",
+            "default": false
+        },
+        "email": {
+            "type": "string",
+            "description": "Email address of user. Required for dbcopy-client service to run",
+            "required": true
+        },
+        "user": {
+            "type": "string",
+            "description": "User name to use for dbcopy-client service",
+            "required": true
+        }
+    }
+}
\ No newline at end of file
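Note on patch 10 (not part of the series): per-property "required": true in nextflow_schema.json is draft-03 style JSON Schema; under the 2020-12 dialect declared in $schema, required is a top-level array of property names, so nf-schema may not enforce these flags as written. For reference, a launch against this schema with an explicit params file might look like the following hypothetical sketch (all names match the schema, all values illustrative):

    # params.json (hypothetical)
    {
        "source_host": "mysql-ens-sta-6",
        "source_port": "4695",
        "db_list": "capra_hircus_core_110_1",
        "target_path": "./zip",
        "email": "user@ebi.ac.uk",
        "user": "user"
    }

    nextflow run nextflow/workflows/db_cleanup/main.nf -params-file params.json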
From 30c01ffb8f174d00725fbd2642a7834338c9c7c3 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Mon, 14 Oct 2024 21:55:02 +0100
Subject: [PATCH 11/20] Add process to clear up temp database

---
 nextflow/workflows/db_cleanup/main.nf         | 25 +++++++++++++++++++
 .../workflows/db_cleanup/nextflow_schema.json |  5 ++++
 2 files changed, 30 insertions(+)

diff --git a/nextflow/workflows/db_cleanup/main.nf b/nextflow/workflows/db_cleanup/main.nf
index c39e7e80f..78ac40d10 100644
--- a/nextflow/workflows/db_cleanup/main.nf
+++ b/nextflow/workflows/db_cleanup/main.nf
@@ -94,6 +94,29 @@ process TAR_COMPRESSED_SQL {
     """
 }
 
+process CLEANUP_TMP_DB {
+
+    input:
+    path compressed_file
+    tuple val(job_id), val(db_name)
+
+    script:
+    """
+    tmp_db_name="${db_name}_tmp"
+    echo "Checking if target database \${tmp_db_name} exists for removal..."
+
+    db_exists=\$(mysql -h $params.target_host -P $params.target_port -u ensadmin -p $params.db_pwd -e "SHOW DATABASES LIKE '\${tmp_db_name}';")
+
+    if [ ! -z "\${db_exists}" ]; then
+        echo "Database \${tmp_db_name} exists. Dropping the database."
+        mysqladmin -h $params.target_host -P $params.target_port -u $params.user -f drop \${tmp_db_name}
+    else
+        echo "Database \${tmp_db_name} does not exist, skipping drop."
+    fi
+    """
+}
+
+
 workflow {
 
@@ -152,4 +175,6 @@ workflow {
     // also outputs compressed file to final storage path
     compressed_sql_ch = COMPRESS_FILE(GENERATE_SQL.out.sql_output_file)
 
+    // Cleanup the temp db created by this pipeline
+    CLEANUP_TMP_DB(compressed_sql_ch, MONITOR_DB_COPY.out.monitored_job)
 }
diff --git a/nextflow/workflows/db_cleanup/nextflow_schema.json b/nextflow/workflows/db_cleanup/nextflow_schema.json
index 00248143f..a7dc9c265 100644
--- a/nextflow/workflows/db_cleanup/nextflow_schema.json
+++ b/nextflow/workflows/db_cleanup/nextflow_schema.json
@@ -42,6 +42,11 @@
             "description": "Boolean to drop the source database after put into storage",
             "default": false
         },
+        "db_pwd": {
+            "type": "string",
+            "description": "Password for db admin user",
+            "required": true
+        },
         "email": {
             "type": "string",
             "description": "Email address of user. Required for dbcopy-client service to run",
             "required": true

From 398dd4cecfb3d57e3b2c3d016116ef780750415e Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Tue, 15 Oct 2024 11:49:49 +0100
Subject: [PATCH 12/20] Remove column stats option as no longer needed

---
 nextflow/modules/db_cleanup/generate_sql.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow/modules/db_cleanup/generate_sql.nf b/nextflow/modules/db_cleanup/generate_sql.nf
index 106f3134f..14eff6321 100644
--- a/nextflow/modules/db_cleanup/generate_sql.nf
+++ b/nextflow/modules/db_cleanup/generate_sql.nf
@@ -29,7 +29,7 @@ process GENERATE_SQL {
     println "Generating SQL for db: ${db_name}"
     """
     # add max packet size to account for dna db table size
-    mysqldump --max-allowed-packet=2048M --opt --quick --skip-column-statistics -h ${params.target_host} -P ${params.target_port} -u ensro ${db_name}_tmp > ${db_name}.sql
+    mysqldump --max-allowed-packet=2048M --opt --quick -h ${params.target_host} -P ${params.target_port} -u ensro ${db_name}_tmp > ${db_name}.sql
 
     # add sleep to let file system finish file dump
     sleep 180

From 1e369bb1e9535037bb77201ae7539a1ec66d3e84 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Tue, 15 Oct 2024 13:49:46 +0100
Subject: [PATCH 13/20] Require db admin user and password for dropping a db

---
 nextflow/workflows/db_cleanup/nextflow_schema.json | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/nextflow/workflows/db_cleanup/nextflow_schema.json b/nextflow/workflows/db_cleanup/nextflow_schema.json
index a7dc9c265..2d8128c4b 100644
--- a/nextflow/workflows/db_cleanup/nextflow_schema.json
+++ b/nextflow/workflows/db_cleanup/nextflow_schema.json
@@ -42,9 +42,14 @@
             "description": "Boolean to drop the source database after put into storage",
             "default": false
         },
-        "db_pwd": {
+        "dba_user": {
             "type": "string",
-            "description": "Password for db admin user",
+            "description": "Username for db admin user. Used to drop a database",
+            "required": true
+        },
+        "dba_pwd": {
+            "type": "string",
+            "description": "Password for db admin user. Used to drop a database",
             "required": true
         },
         "email": {

From 24b334d93c511c9890ab08e51185310994ee8263 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Tue, 15 Oct 2024 13:50:25 +0100
Subject: [PATCH 14/20] Fix drop db process with extra space and simplify drop
 statement

---
 nextflow/workflows/db_cleanup/main.nf | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/nextflow/workflows/db_cleanup/main.nf b/nextflow/workflows/db_cleanup/main.nf
index 78ac40d10..8e0b869c6 100644
--- a/nextflow/workflows/db_cleanup/main.nf
+++ b/nextflow/workflows/db_cleanup/main.nf
@@ -103,16 +103,11 @@ process CLEANUP_TMP_DB {
     script:
     """
     tmp_db_name="${db_name}_tmp"
-    echo "Checking if target database \${tmp_db_name} exists for removal..."
+    echo "Attempting to drop database \${tmp_db_name} if it exists..."
 
-    db_exists=\$(mysql -h $params.target_host -P $params.target_port -u ensadmin -p $params.db_pwd -e "SHOW DATABASES LIKE '\${tmp_db_name}';")
+    mysql -h $params.target_host -P $params.target_port -u $params.dba_user -p$params.dba_pwd -e "DROP DATABASE IF EXISTS \${tmp_db_name};"
 
-    if [ ! -z "\${db_exists}" ]; then
-        echo "Database \${tmp_db_name} exists. Dropping the database."
-        mysqladmin -h $params.target_host -P $params.target_port -u $params.user -f drop \${tmp_db_name}
-    else
-        echo "Database \${tmp_db_name} does not exist, skipping drop."
-    fi
+    echo "Drop operation complete."
     """
 }
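Note on patches 11-14 (a sketch, not part of the series): with -p$params.dba_pwd the admin password is interpolated into the rendered .command.sh and shows up in process listings on the execution host. Handing it over via the MYSQL_PWD environment variable at least keeps it off the mysql argument list; shown as it would sit inside the process script block (the secret still lands in .command.sh, so Nextflow's secrets support, where the installed version provides it, is the fuller fix):

    # Keep the password out of the mysql argument list / ps output
    export MYSQL_PWD='$params.dba_pwd'
    mysql -h $params.target_host -P $params.target_port -u $params.dba_user \\
        -e "DROP DATABASE IF EXISTS \${tmp_db_name};"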
""" } From 4904f6bf098795b7394e121f3dd2eed7b0b4ff10 Mon Sep 17 00:00:00 2001 From: nwillhoft Date: Tue, 15 Oct 2024 14:32:35 +0100 Subject: [PATCH 15/20] Move CLEANUP_TMP_DB process to module file --- nextflow/modules/db_cleanup/cleanup_tmp_db.nf | 33 +++++++++++++++++++ nextflow/workflows/db_cleanup/main.nf | 19 +---------- 2 files changed, 34 insertions(+), 18 deletions(-) create mode 100644 nextflow/modules/db_cleanup/cleanup_tmp_db.nf diff --git a/nextflow/modules/db_cleanup/cleanup_tmp_db.nf b/nextflow/modules/db_cleanup/cleanup_tmp_db.nf new file mode 100644 index 000000000..4c27266e8 --- /dev/null +++ b/nextflow/modules/db_cleanup/cleanup_tmp_db.nf @@ -0,0 +1,33 @@ +// See the NOTICE file distributed with this work for additional information +// regarding copyright ownership. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +nextflow.enable.dsl=2 + +process CLEANUP_TMP_DB { + + input: + path compressed_file + tuple val(job_id), val(db_name) + + script: + """ + tmp_db_name="${db_name}_tmp" + echo "Attempting to drop database \${tmp_db_name} if it exists..." + + mysql -h $params.target_host -P $params.target_port -u $params.dba_user -p$params.dba_pwd -e "DROP DATABASE IF EXISTS \${tmp_db_name};" + + echo "Drop operation complete." + """ +} \ No newline at end of file diff --git a/nextflow/workflows/db_cleanup/main.nf b/nextflow/workflows/db_cleanup/main.nf index 8e0b869c6..4c92201e5 100644 --- a/nextflow/workflows/db_cleanup/main.nf +++ b/nextflow/workflows/db_cleanup/main.nf @@ -20,6 +20,7 @@ include { DB_COPY_SUBMIT } from '../../modules/db_cleanup/db_copy_submit.nf' include { MONITOR_DB_COPY } from '../../modules/db_cleanup/monitor_db_copy.nf' include { GENERATE_SQL } from '../../modules/db_cleanup/generate_sql.nf' include { COMPRESS_FILE } from '../../modules/db_cleanup/compress_file.nf' +include { CLEANUP_TMP_DB } from '../../modules/db_cleanup/cleanup_tmp_db.nf' // nf-schema-related modules include { validateParameters; paramsSummaryLog; samplesheetToList } from 'plugin/nf-schema' @@ -94,24 +95,6 @@ process TAR_COMPRESSED_SQL { """ } -process CLEANUP_TMP_DB { - - input: - path compressed_file - tuple val(job_id), val(db_name) - - script: - """ - tmp_db_name="${db_name}_tmp" - echo "Attempting to drop database \${tmp_db_name} if it exists..." - - mysql -h $params.target_host -P $params.target_port -u $params.dba_user -p$params.dba_pwd -e "DROP DATABASE IF EXISTS \${tmp_db_name};" - - echo "Drop operation complete." 
- """ -} - - workflow { From e6fe3afc78aca716ce390e0ce129ac1955a7ad7e Mon Sep 17 00:00:00 2001 From: nwillhoft Date: Wed, 16 Oct 2024 15:06:51 +0100 Subject: [PATCH 16/20] Add process to drop source db when requested --- nextflow/workflows/db_cleanup/main.nf | 22 +++++++++++++++++++ .../workflows/db_cleanup/nextflow_schema.json | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/nextflow/workflows/db_cleanup/main.nf b/nextflow/workflows/db_cleanup/main.nf index 4c92201e5..30ba3e00d 100644 --- a/nextflow/workflows/db_cleanup/main.nf +++ b/nextflow/workflows/db_cleanup/main.nf @@ -95,6 +95,22 @@ process TAR_COMPRESSED_SQL { """ } +process DROP_SOURCE_DB { + + input: + tuple val(job_id), val(db_name) + + script: + """ + echo "Attempting to drop database ${db_name} if it exists..." + + mysql -h $params.target_host -P $params.target_port -u $params.dba_user -p$params.dba_pwd -e "DROP DATABASE IF EXISTS ${db_name};" + + echo "Drop operation complete." + """ + +} + workflow { @@ -155,4 +171,10 @@ workflow { // Cleanup the temp db created by this pipeline CLEANUP_TMP_DB(compressed_sql_ch, MONITOR_DB_COPY.out.monitored_job) + + // Cleanup source db (if flag set to true) + if (params.drop_source_db == true) { + DROP_SOURCE_DB(CLEANUP_TMP_DB.out.cleaned_up) + } + } diff --git a/nextflow/workflows/db_cleanup/nextflow_schema.json b/nextflow/workflows/db_cleanup/nextflow_schema.json index 2d8128c4b..2c8b7fcc4 100644 --- a/nextflow/workflows/db_cleanup/nextflow_schema.json +++ b/nextflow/workflows/db_cleanup/nextflow_schema.json @@ -37,7 +37,7 @@ "description": "Path to store final compressed SQL file", "required": true }, - "drop_db": { + "drop_source_db": { "type": "string", "description": "Boolean to drop the source database after put into storage", "default": false From b316a8113488bde9aa3f6e6d8ee5e50851e41564 Mon Sep 17 00:00:00 2001 From: nwillhoft Date: Wed, 16 Oct 2024 15:07:59 +0100 Subject: [PATCH 17/20] Remove redundant check --- nextflow/workflows/db_cleanup/main.nf | 6 ------ 1 file changed, 6 deletions(-) diff --git a/nextflow/workflows/db_cleanup/main.nf b/nextflow/workflows/db_cleanup/main.nf index 30ba3e00d..a151e0bee 100644 --- a/nextflow/workflows/db_cleanup/main.nf +++ b/nextflow/workflows/db_cleanup/main.nf @@ -119,12 +119,6 @@ workflow { // Print the raw db_list to ensure it's being passed properly println "Raw params.db_list: ${params.db_list}" - // Check if params.db_list is null or empty - if (params.db_list == null || params.db_list == '') { - println "ERROR: params.db_list is null or empty" - exit 1 - } - // Split the string into a list and print it db_list = params.db_list.split(',') println "Split db_list: ${db_list}" From 16f3969359e0c9ece3053df7c4b17b3172f3d7f9 Mon Sep 17 00:00:00 2001 From: nwillhoft Date: Wed, 16 Oct 2024 15:08:28 +0100 Subject: [PATCH 18/20] Add prints for info --- nextflow/modules/db_cleanup/cleanup_tmp_db.nf | 5 +++++ nextflow/modules/db_cleanup/monitor_db_copy.nf | 2 ++ 2 files changed, 7 insertions(+) diff --git a/nextflow/modules/db_cleanup/cleanup_tmp_db.nf b/nextflow/modules/db_cleanup/cleanup_tmp_db.nf index 4c27266e8..ce961b850 100644 --- a/nextflow/modules/db_cleanup/cleanup_tmp_db.nf +++ b/nextflow/modules/db_cleanup/cleanup_tmp_db.nf @@ -21,7 +21,12 @@ process CLEANUP_TMP_DB { path compressed_file tuple val(job_id), val(db_name) + output: + tuple val(job_id), val(db_name), emit: cleaned_up + script: + println "Cleaning up temporary db: ${db_name}_tmp" + """ tmp_db_name="${db_name}_tmp" echo "Attempting to drop database 
From 16f3969359e0c9ece3053df7c4b17b3172f3d7f9 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Wed, 16 Oct 2024 15:08:28 +0100
Subject: [PATCH 18/20] Add prints for info

---
 nextflow/modules/db_cleanup/cleanup_tmp_db.nf  | 5 +++++
 nextflow/modules/db_cleanup/monitor_db_copy.nf | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/nextflow/modules/db_cleanup/cleanup_tmp_db.nf b/nextflow/modules/db_cleanup/cleanup_tmp_db.nf
index 4c27266e8..ce961b850 100644
--- a/nextflow/modules/db_cleanup/cleanup_tmp_db.nf
+++ b/nextflow/modules/db_cleanup/cleanup_tmp_db.nf
@@ -21,7 +21,12 @@ process CLEANUP_TMP_DB {
     path compressed_file
     tuple val(job_id), val(db_name)
 
+    output:
+    tuple val(job_id), val(db_name), emit: cleaned_up
+
     script:
+    println "Cleaning up temporary db: ${db_name}_tmp"
+
     """
     tmp_db_name="${db_name}_tmp"
     echo "Attempting to drop database \${tmp_db_name} if it exists..."
diff --git a/nextflow/modules/db_cleanup/monitor_db_copy.nf b/nextflow/modules/db_cleanup/monitor_db_copy.nf
index dd7b10e19..336bf6eef 100644
--- a/nextflow/modules/db_cleanup/monitor_db_copy.nf
+++ b/nextflow/modules/db_cleanup/monitor_db_copy.nf
@@ -25,6 +25,8 @@ process MONITOR_DB_COPY {
     tuple val(job_id), val(db_name), emit: monitored_job
 
     script:
+    println "Monitoring job id: ${job_id}"
+
     """
     # Define the API endpoint to check the status of the job
     api_url="https://services.ensembl-production.ebi.ac.uk/api/dbcopy/requestjob/${job_id}"

From 323fcf9525fb3a5cb405c0e79eb4077139d79f17 Mon Sep 17 00:00:00 2001
From: nwillhoft
Date: Fri, 18 Oct 2024 12:18:53 +0100
Subject: [PATCH 19/20] Correct string type to boolean

---
 nextflow/workflows/db_cleanup/nextflow_schema.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nextflow/workflows/db_cleanup/nextflow_schema.json b/nextflow/workflows/db_cleanup/nextflow_schema.json
index 2c8b7fcc4..a3f4d816e 100644
--- a/nextflow/workflows/db_cleanup/nextflow_schema.json
+++ b/nextflow/workflows/db_cleanup/nextflow_schema.json
@@ -38,7 +38,7 @@
             "required": true
         },
         "drop_source_db": {
-            "type": "string",
+            "type": "boolean",
             "description": "Boolean to drop the source database after put into storage",
             "default": false
         },
+ """ + +} \ No newline at end of file diff --git a/nextflow/workflows/db_cleanup/main.nf b/nextflow/workflows/db_cleanup/main.nf index a151e0bee..030606d36 100644 --- a/nextflow/workflows/db_cleanup/main.nf +++ b/nextflow/workflows/db_cleanup/main.nf @@ -21,6 +21,7 @@ include { MONITOR_DB_COPY } from '../../modules/db_cleanup/monitor_db_copy.nf' include { GENERATE_SQL } from '../../modules/db_cleanup/generate_sql.nf' include { COMPRESS_FILE } from '../../modules/db_cleanup/compress_file.nf' include { CLEANUP_TMP_DB } from '../../modules/db_cleanup/cleanup_tmp_db.nf' +include { DROP_SOURCE_DB } from '../../modules/db_cleanup/drop_source_db.nf' // nf-schema-related modules include { validateParameters; paramsSummaryLog; samplesheetToList } from 'plugin/nf-schema' @@ -95,23 +96,6 @@ process TAR_COMPRESSED_SQL { """ } -process DROP_SOURCE_DB { - - input: - tuple val(job_id), val(db_name) - - script: - """ - echo "Attempting to drop database ${db_name} if it exists..." - - mysql -h $params.target_host -P $params.target_port -u $params.dba_user -p$params.dba_pwd -e "DROP DATABASE IF EXISTS ${db_name};" - - echo "Drop operation complete." - """ - -} - - workflow { main: