diff --git a/scripts/cubic/bootstrap-c-pac.sh b/scripts/cubic/bootstrap-c-pac.sh index f60cb14..d18dcca 100644 --- a/scripts/cubic/bootstrap-c-pac.sh +++ b/scripts/cubic/bootstrap-c-pac.sh @@ -1,6 +1,12 @@ ## NOTE ## # This workflow is derived from the Datalad Handbook +# In addition to the positional arguments described in https://pennlinc.github.io/docs/TheWay/RunningDataLadPipelines/#preparing-the-analysis-dataset , +# this bootstrap script also takes a /full/path/to/callback.log i.e., +# `bash bootstrap-c-pac.sh /full/path/to/BIDS /full/path/to/cpac-container /full/path/to/callback.log` +# for optimizing memory (see https://fcp-indi.github.io/docs/nightly/user/tutorials/observed_usage for C-PAC optimization tutorial, and see +# sections marked "C-PAC-specific memory optimization" in this script for details). + ## Ensure the environment is ready to bootstrap the analysis workspace # Check that we have conda installed #conda activate @@ -23,7 +29,7 @@ set -e -u ## Set up the directory that will contain the necessary directories -PROJECTROOT=${PWD}/fmriprep +PROJECTROOT=${PWD}/c-pac-1.8.5 if [[ -d ${PROJECTROOT} ]] then echo ${PROJECTROOT} already exists @@ -37,6 +43,11 @@ then fi +# C-PAC-specific memory optimization +CALLBACK_LOG=${3:-} +# ---------------------------------- + + ## Check the BIDS input BIDSINPUT=$1 if [[ -z ${BIDSINPUT} ]] @@ -92,41 +103,55 @@ else datalad save -r -m "added input data" fi -SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 3 ) +SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 3 | sort) if [ -z "${SUBJECTS}" ] then echo "No subjects found in input data" # exit 1 fi -set +u -CONTAINERDS=$2 -set -u -#if [[ ! 
-z "${CONTAINERDS}" ]]; then -cd ${PROJECTROOT} -datalad clone ${CONTAINERDS} pennlinc-containers + ## Add the containers as a subdataset -#datalad clone ria+ssh://sciget.pmacs.upenn.edu:/project/bbl_projects/containers#~pennlinc-containers pennlinc-containers -# download the image so we don't ddos pmacs -cd pennlinc-containers -datalad get -r . -# get rid of the references to pmacs -#set +e -#datalad siblings remove -s pmacs-ria-storage -#datalad siblings remove -s origin -#set -e +cd ${PROJECTROOT} + +# Clone the containers dataset. If specified on the command, use that path +CONTAINERDS=${2:-} +if [[ ! -z "${CONTAINERDS}" ]]; then + datalad clone ${CONTAINERDS} pennlinc-containers +else + echo "No containers dataset specified, attempting to clone from pmacs" + datalad clone \ + ria+ssh://sciget.pmacs.upenn.edu:/project/bbl_projects/containers#~pennlinc-containers \ + pennlinc-containers + cd pennlinc-containers + datalad get -r . + # get rid of the references to pmacs + set +e + datalad siblings remove -s pmacs-ria-storage + git annex dead pmacs-ria-storage + datalad siblings remove -s origin + git annex dead origin + set -e +fi cd ${PROJECTROOT}/analysis datalad install -d . --source ${PROJECTROOT}/pennlinc-containers + +# C-PAC-specific memory optimization --------- +if [[ ! -z "${CALLBACK_LOG}" ]]; then + ln $CALLBACK_LOG code/runtime_callback.log +fi +# -------------------------------------------- + + ## the actual compute job specification cat > code/participant_job.sh << "EOT" #!/bin/bash #$ -S /bin/bash -#$ -l h_vmem=25G +#$ -l h_vmem=32G +#$ -l s_vmem=32G #$ -l tmpfree=200G -#$ -R y -#$ -l h_rt=24:00:00 # Set up the correct conda environment source ${CONDA_PREFIX}/bin/activate base echo I\'m in $PWD using `which python` @@ -138,6 +163,7 @@ set -e -u -x dssource="$1" pushgitremote="$2" subid="$3" +sesid="$4" # change into the cluster-assigned temp directory. 
Not done by default in SGE cd ${CBICA_TMPDIR} @@ -145,7 +171,7 @@ cd ${CBICA_TMPDIR} # cd /cbica/comp_space/$(basename $HOME) # Used for the branch names and the temp dir -BRANCH="job-${JOB_ID}-${subid}" +BRANCH="job-${JOB_ID}-${subid}-${sesid}" mkdir ${BRANCH} cd ${BRANCH} @@ -184,22 +210,38 @@ datalad get -n "inputs/data/${subid}" # ------------------------------------------------------------------------------ # Do the run! -datalad run \ - -i code/fmriprep_zip.sh \ - -i inputs/data/${subid} \ - -i inputs/data/*json \ - -i pennlinc-containers/.datalad/environments/fmriprep-20-2-3/image \ - --explicit \ - -o ${subid}_fmriprep-20.2.3.zip \ - -o ${subid}_freesurfer-20.2.3.zip \ - -m "fmriprep:20.2.3 ${subid}" \ - "bash ./code/fmriprep_zip.sh ${subid}" +# C-PAC-specific memory optimization -------------------------------- +if [[ -f code/runtime_callback.log ]] +then + datalad run \ + -i code/c-pac_zip.sh \ + -i code/runtime_callback.log \ + -i inputs/data/${subid}/${sesid} \ + -i inputs/data/*json \ + -i pennlinc-containers/.datalad/environments/cpac-1-8-5/image \ + --explicit \ + -o ${subid}_${sesid}_c-pac-1.8.5.zip \ + -m "C-PAC:1.8.5 ${subid} ${sesid}" \ + "bash ./code/c-pac_zip.sh ${subid} ${sesid}" +# ------------------------------------------------------------------- +else + datalad run \ + -i code/c-pac_zip.sh \ + -i inputs/data/${subid} \ + -i inputs/data/*json \ + -i pennlinc-containers/.datalad/environments/cpac-1-8-5/image \ + --explicit \ + -o ${subid}_${sesid}_c-pac-1.8.5.zip \ + -m "C-PAC:1.8.5 ${subid}" \ + "bash ./code/c-pac_zip.sh ${subid} ${sesid}" +fi # file content first -- does not need a lock, no interaction with Git datalad push --to output-storage # and the output branch flock $DSLOCKFILE git push outputstore +# remove tempdir echo TMPDIR TO DELETE echo ${BRANCH} @@ -215,36 +257,71 @@ EOT chmod +x code/participant_job.sh -cat > code/fmriprep_zip.sh << "EOT" +cat > code/c-pac_zip.sh << "EOT" #!/bin/bash set -e -u -x subid="$1" -mkdir -p 
${PWD}/.git/tmp/wdir -singularity run --cleanenv -B ${PWD} \ - pennlinc-containers/.datalad/environments/fmriprep-20-2-3/image \ - inputs/data \ - prep \ - participant \ - -w ${PWD}/.git/tmp/wkdir \ - --n_cpus 1 \ - --stop-on-first-crash \ - --fs-license-file code/license.txt \ - --skip-bids-validation \ - --output-spaces MNI152NLin6Asym:res-2 \ - --participant-label "$subid" \ - --force-bbr \ - --cifti-output 91k -v -v - -cd prep -7z a ../${subid}_fmriprep-20.2.3.zip fmriprep -7z a ../${subid}_freesurfer-20.2.3.zip freesurfer -rm -rf prep .git/tmp/wkdir +sesid="$2" + +# Create a filter file that only allows this session +filterfile=${PWD}/${sesid}_filter.json +echo "{" > ${filterfile} +echo "'fmap': {'datatype': 'fmap'}," >> ${filterfile} +echo "'bold': {'datatype': 'func', 'session': '$sesid', 'suffix': 'bold'}," >> ${filterfile} +echo "'sbref': {'datatype': 'func', 'session': '$sesid', 'suffix': 'sbref'}," >> ${filterfile} +echo "'flair': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'FLAIR'}," >> ${filterfile} +echo "'t2w': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'T2w'}," >> ${filterfile} +echo "'t1w': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'T1w'}," >> ${filterfile} +echo "'roi': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'roi'}" >> ${filterfile} +echo "}" >> ${filterfile} + +# remove ses and get valid json +sed -i "s/'/\"/g" ${filterfile} +sed -i "s/ses-//g" ${filterfile} + +mkdir -p ${subid}_${sesid}_outputs +# C-PAC-specific memory optimization ----------------------------- +if [[ -f code/runtime_callback.log ]] +then + singularity run --cleanenv \ + -B ${PWD} \ + -B ${PWD}/${subid}_${sesid}_outputs:/outputs \ + pennlinc-containers/.datalad/environments/cpac-1-8-5/image \ + inputs/data \ + /outputs \ + participant \ + --preconfig rbc-options \ + --skip_bids_validator \ + --n_cpus 4 \ + --mem_gb 32 \ + --participant_label "$subid" \ + --runtime_usage=code/runtime_callback.log \ + --runtime_buffer=30 +# 
---------------------------------------------------------------- +else + singularity run --cleanenv \ + -B ${PWD} \ + -B ${PWD}/${subid}_${sesid}_outputs:/outputs \ + pennlinc-containers/.datalad/environments/cpac-1-8-5/image \ + inputs/data \ + /outputs \ + participant \ + --preconfig rbc-options \ + --skip_bids_validator \ + --n_cpus 4 \ + --mem_gb 32 \ + --participant_label "$subid" +fi + +rm -rf ${subid}_${sesid}_outputs/working +7z a ${subid}_${sesid}_c-pac-1.8.5.zip ${subid}_${sesid}_outputs +rm -rf ${subid}_${sesid}_outputs +rm ${filterfile} EOT -chmod +x code/fmriprep_zip.sh -cp ${FREESURFER_HOME}/license.txt code/license.txt +chmod +x code/c-pac_zip.sh mkdir logs echo .SGE_datalad_lock >> .gitignore @@ -252,8 +329,9 @@ echo logs >> .gitignore datalad save -m "Participant compute job implementation" -# Add a script for merging outputs -MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ cat > code/merge_outputs.sh << "EOT" #!/bin/bash set -e -u -x @@ -261,21 +339,86 @@ EOT echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ >> code/merge_outputs.sh echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh -wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + +cat >> code/merge_outputs.sh << "EOT" + +datalad clone ${outputsource} merge_ds +cd merge_ds +NBRANCHES=$(git branch -a | grep job- | sort | wc -l) +echo "Found $NBRANCHES branches to merge" + +gitref=$(git show-ref master | cut -d ' ' -f1 | head -n 1) + +# query all branches for the most recent commit and check if it is identical. +# Write all branch identifiers for jobs without outputs into a file. 
+for i in $(git branch -a | grep job- | sort); do [ x"$(git show-ref $i \ + | cut -d ' ' -f1)" = x"${gitref}" ] && \ + echo $i; done | tee code/noresults.txt | wc -l + + +for i in $(git branch -a | grep job- | sort); \ + do [ x"$(git show-ref $i \ + | cut -d ' ' -f1)" != x"${gitref}" ] && \ + echo $i; \ +done | tee code/has_results.txt + +mkdir -p code/merge_batches +num_branches=$(wc -l < code/has_results.txt) +CHUNKSIZE=5000 +set +e +num_chunks=$(expr ${num_branches} / ${CHUNKSIZE}) +if [[ $num_chunks == 0 ]]; then + num_chunks=1 +fi +set -e +for chunknum in $(seq 1 $num_chunks) +do + startnum=$(expr $(expr ${chunknum} - 1) \* ${CHUNKSIZE} + 1) + endnum=$(expr ${chunknum} \* ${CHUNKSIZE}) + batch_file=code/merge_branches_$(printf %04d ${chunknum}).txt + [[ ${num_branches} -lt ${endnum} ]] && endnum=${num_branches} + branches=$(sed -n "${startnum},${endnum}p;$(expr ${endnum} + 1)q" code/has_results.txt) + echo ${branches} > ${batch_file} + git merge -m "C-PAC results batch ${chunknum}/${num_chunks}" $(cat ${batch_file}) + +done + +# Push the merge back +git push + +# Get the file availability info +git annex fsck --fast -f output-storage + +# This should not print anything +MISSING=$(git annex find --not --in output-storage) + +if [[ ! 
-z "$MISSING" ]] +then + echo Unable to find data for $MISSING + exit 1 +fi + +# stop tracking this branch +git annex dead here + +datalad push --data nothing +echo SUCCESS + +EOT -################################################################################ -# SGE SETUP START - remove or adjust to your needs -################################################################################ env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" echo '#!/bin/bash' > code/qsub_calls.sh dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" pushgitremote=$(git remote get-url --push output) eo_args="-e ${PWD}/logs -o ${PWD}/logs" for subject in ${SUBJECTS}; do - echo "qsub -cwd ${env_flags} -N fp${subject} ${eo_args} \ - ${PWD}/code/participant_job.sh \ - ${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh + SESSIONS=$(ls inputs/data/$subject | grep ses- | cut -d '/' -f 1) + for session in ${SESSIONS}; do + echo "qsub -cwd ${env_flags} -N c-pac_${subject}_${session} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} ${session}" >> code/qsub_calls.sh + done done datalad save -m "SGE submission setup" code/ .gitignore