Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Database cleanup pipeline #964

Draft
wants to merge 20 commits into
base: release/mvp
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions nextflow/modules/db_cleanup/cleanup_tmp_db.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// See the NOTICE file distributed with this work for additional information
// regarding copyright ownership.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

nextflow.enable.dsl=2

process CLEANUP_TMP_DB {

    // Drops the temporary working copy (<db_name>_tmp) from the target
    // server once its dump has been compressed.
    //
    // Inputs:
    //   compressed_file   - the compressed dump; unused in the script but
    //                       declared so this process only runs after compression
    //   (job_id, db_name) - copy-job id and the original database name
    // Output:
    //   (job_id, db_name) re-emitted on `cleaned_up` so downstream steps
    //   can sequence after the cleanup.

    input:
    path compressed_file
    tuple val(job_id), val(db_name)

    output:
    tuple val(job_id), val(db_name), emit: cleaned_up

    script:
    println "Cleaning up temporary db: ${db_name}_tmp"

    """
    tmp_db_name="${db_name}_tmp"
    echo "Attempting to drop database \${tmp_db_name} if it exists..."

    # Pass the password via MYSQL_PWD instead of -p<pwd> so it does not
    # leak into the host's process list (ps/top).
    export MYSQL_PWD='${params.dba_pwd}'
    mysql -h ${params.target_host} -P ${params.target_port} -u ${params.dba_user} -e "DROP DATABASE IF EXISTS \${tmp_db_name};"

    echo "Drop operation complete."
    """
}
45 changes: 45 additions & 0 deletions nextflow/modules/db_cleanup/compress_file.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// See the NOTICE file distributed with this work for additional information
// regarding copyright ownership.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

nextflow.enable.dsl=2

process COMPRESS_FILE {

    // Compresses a SQL dump with bzip2 and publishes the result to the
    // user-provided target path.
    //
    // Input:  sql_file          - plain-text SQL dump (staged into the work dir)
    // Output: "${sql_file}.bz2" - bzip2-compressed dump, emitted on
    //                             `compressed_sql_ch` and copied to params.target_path

    // get working and then check which compression method to use

    // copies compressed file to user-provided target path
    publishDir params.target_path, mode: 'copy', overwrite: true

    input:
    path sql_file

    output:
    path "${sql_file}.bz2", emit: compressed_sql_ch // Output compressed table-named file into a channel

    script:
    println "Compressing file: ${sql_file}"

    """
    # Stream-compress straight to the output name. Using -c avoids the
    # previous copy-then-compress-then-rename dance: no duplicate of a
    # potentially huge dump on disk, and the staged input (often a
    # symlink) is left untouched because bzip2 only writes to stdout
    # in this mode.
    bzip2 -c '${sql_file}' > '${sql_file}.bz2'
    """
}
44 changes: 44 additions & 0 deletions nextflow/modules/db_cleanup/db_copy_submit.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// See the NOTICE file distributed with this work for additional information
// regarding copyright ownership.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

nextflow.enable.dsl=2

process DB_COPY_SUBMIT {

    // Submits an asynchronous database-copy job via the Ensembl dbcopy
    // service, copying <db_name> from the source server to <db_name>_tmp
    // on the target server (wiping any existing target of that name).
    //
    // Input:  db_name - name of the database to copy
    // Output: (job_id.txt, db_name) on `job_info_ch`, where job_id.txt
    //         holds the UUID of the submitted copy job.

    input:
    val db_name

    output:
    tuple path('job_id.txt'), val(db_name), emit: job_info_ch

    script:
    api_url="https://services.ensembl-production.ebi.ac.uk/api/dbcopy/requestjob"
    source_db="${params.source_host}:${params.source_port}"
    target_db="${params.target_host}:${params.target_port}"

    println "Submitting dbcopy job for $db_name"

    """
    # Submit the job via dbcopy-client; capture stdout+stderr so the job
    # UUID can be parsed out afterwards.
    dbcopy-client -u $api_url -a submit -s $source_db -t $target_db -i $db_name -n ${db_name}_tmp -e $params.email -r $params.user --wipe_target 1 --skip-check &> out.log

    # No sleep needed here: the &> redirection above completes before the
    # next statement runs, so out.log is already fully written.

    # Extract the job id (a 36-character UUID) and fail loudly if the
    # client output did not contain exactly one - an empty job_id.txt
    # would otherwise break the monitoring step silently.
    job_id=\$(grep -o '[0-9a-f\\-]\\{36\\}' out.log | head -n 1)
    if [ -z "\$job_id" ]; then
        echo "ERROR: could not find a job id in dbcopy-client output:" >&2
        cat out.log >&2
        exit 1
    fi
    echo \$job_id > job_id.txt
    """
}
34 changes: 34 additions & 0 deletions nextflow/modules/db_cleanup/drop_source_db.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// See the NOTICE file distributed with this work for additional information
// regarding copyright ownership.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

nextflow.enable.dsl=2

process DROP_SOURCE_DB {

    // Drops the original database once its dump has been archived.
    //
    // Input: (job_id, db_name) - emitted by the cleanup step so this drop
    //        only runs after the tmp copy has been removed.
    //
    // NOTE(review): despite the process name, this connects to
    // params.target_host, not params.source_host - confirm this is
    // intentional (e.g. source and target are the same server).

    input:
    tuple val(job_id), val(db_name)

    script:
    println "Dropping source database: ${db_name}"

    """
    echo "Attempting to drop database ${db_name} if it exists..."

    # Use MYSQL_PWD so the DBA password is not visible in the host's
    # process list, as it would be with -p<password> on the command line.
    export MYSQL_PWD='${params.dba_pwd}'
    mysql -h ${params.target_host} -P ${params.target_port} -u ${params.dba_user} -e "DROP DATABASE IF EXISTS ${db_name};"

    echo "Drop operation complete."
    """

}
44 changes: 44 additions & 0 deletions nextflow/modules/db_cleanup/generate_sql.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// See the NOTICE file distributed with this work for additional information
// regarding copyright ownership.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

nextflow.enable.dsl=2

process GENERATE_SQL {

    // Dumps the temporary copy (<db_name>_tmp) of a database to a single
    // SQL file with mysqldump, then verifies the dump is complete.
    //
    // Input:  (job_id, db_name) from the monitored copy job
    // Output: "${db_name}.sql" on `sql_output_file`

    // publishDir "sql/${db_name}", mode: 'copy', overwrite: true // Publish SQL files to 'sql' directory

    input:
    tuple val(job_id), val(db_name) // Get job ID and db name from job_info_ch

    output:
    path "${db_name}.sql", emit: sql_output_file // Output pattern to capture SQL files

    script:
    println "Generating SQL for db: ${db_name}"
    """
    # --max-allowed-packet raised to cope with very large rows in dna db tables
    mysqldump --max-allowed-packet=2048M --opt --quick -h ${params.target_host} -P ${params.target_port} -u ensro ${db_name}_tmp > ${db_name}.sql

    # mysqldump has exited and the redirection is complete at this point,
    # so instead of the previous unconditional 180 s sleep, flush to
    # storage and verify the dump actually finished: mysqldump appends a
    # "Dump completed" trailer by default, so its absence means the dump
    # was truncated.
    sync
    if ! tail -n 5 ${db_name}.sql | grep -q 'Dump completed'; then
        echo "ERROR: ${db_name}.sql looks truncated (no 'Dump completed' trailer)" >&2
        exit 1
    fi
    """
    // Keep this table-based code in case we want to bring this back at later date
    // # For each table found in the db, dump it out to file
    // for table in \$(mysql -h ${params.target_host} -P ${params.target_port} -u ensro -N -e 'SHOW TABLES' ${db_name}_tmp); do
    //     mysqldump --max-allowed-packet=1G --column-statistics=0 -h ${params.target_host} -P ${params.target_port} -u ensro --routines --triggers --add-drop-table ${db_name}_tmp \${table} > \${table}.sql
    //     #mysqldump --max-allowed-packet=1G --column-statistics=0 -h ${params.target_host} -P ${params.target_port} -u ensro --no-create-info ${db_name}_tmp \${table} > \${table}.sql
    // done
    // """
}
58 changes: 58 additions & 0 deletions nextflow/modules/db_cleanup/monitor_db_copy.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
// See the NOTICE file distributed with this work for additional information
// regarding copyright ownership.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

nextflow.enable.dsl=2

process MONITOR_DB_COPY {

    // Polls the dbcopy REST API until the copy job completes or fails.
    //
    // Input:  (job_id, db_name) from the submission step
    // Output: (job_id, db_name) on `monitored_job`, emitted only once the
    //         job reaches the "Complete" state.

    input:
    tuple val(job_id), val(db_name)

    output:
    tuple val(job_id), val(db_name), emit: monitored_job

    script:
    println "Monitoring job id: ${job_id}"

    """
    # API endpoint reporting the status of this copy job
    api_url="https://services.ensembl-production.ebi.ac.uk/api/dbcopy/requestjob/${job_id}"

    # Seconds between polls
    interval=60

    # Abort after this many consecutive polls that return no usable
    # status (network error, bad JSON, ...) instead of looping forever,
    # which is what the previous version did when curl failed.
    max_bad_polls=10
    bad_polls=0

    while true; do
        # -f makes curl fail on HTTP errors; || true keeps the pipeline
        # from aborting the script so the failure can be counted below.
        status=\$(curl -sf "\$api_url" | jq -r '.overall_status' 2>/dev/null || true)

        echo "Job ID: ${job_id} - Status: \$status"

        if [ "\$status" = "Complete" ]; then
            echo "Job ID: ${job_id} has completed."
            break
        elif [ "\$status" = "Failed" ]; then
            echo "Job ID: ${job_id} has failed."
            exit 1
        elif [ -z "\$status" ] || [ "\$status" = "null" ]; then
            bad_polls=\$((bad_polls + 1))
            if [ "\$bad_polls" -ge "\$max_bad_polls" ]; then
                echo "ERROR: no valid status for job ${job_id} after \$bad_polls consecutive polls" >&2
                exit 1
            fi
        else
            # Job is still queued/running - reset the failure counter
            bad_polls=0
        fi

        # Wait for the next interval before checking the status again
        sleep \$interval
    done
    """
}
Loading