Skip to content

Commit

Permalink
🔀 Merge multises setup from fmriprep-multises bootstrap into c-pac bootstrap
Browse files Browse the repository at this point in the history
  • Loading branch information
shnizzedy committed Jul 7, 2022
1 parent a5d9037 commit b1583e9
Showing 1 changed file with 206 additions and 63 deletions.
269 changes: 206 additions & 63 deletions scripts/cubic/bootstrap-c-pac.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
## NOTE ##
# This workflow is derived from the Datalad Handbook

# In addition to the positional arguments described in https://pennlinc.github.io/docs/TheWay/RunningDataLadPipelines/#preparing-the-analysis-dataset ,
# this bootstrap script also takes a /full/path/to/callback.log i.e.,
# `bash bootstrap-c-pac.sh /full/path/to/BIDS /full/path/to/cpac-container /full/path/to/callback.log`
# for optimizing memory (see https://fcp-indi.github.io/docs/nightly/user/tutorials/observed_usage for C-PAC optimization tutorial, and see
# sections marked "C-PAC-specific memory optimization" in this script for details).

## Ensure the environment is ready to bootstrap the analysis workspace
# Check that we have conda installed
#conda activate
Expand All @@ -23,7 +29,7 @@ set -e -u


## Set up the directory that will contain the necessary directories
PROJECTROOT=${PWD}/fmriprep
PROJECTROOT=${PWD}/c-pac-1.8.5
if [[ -d ${PROJECTROOT} ]]
then
echo ${PROJECTROOT} already exists
Expand All @@ -37,6 +43,11 @@ then
fi


# C-PAC-specific memory optimization
CALLBACK_LOG=$3
# ----------------------------------


## Check the BIDS input
BIDSINPUT=$1
if [[ -z ${BIDSINPUT} ]]
Expand Down Expand Up @@ -92,41 +103,55 @@ else
datalad save -r -m "added input data"
fi

SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 3 )
SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 3 | sort)
if [ -z "${SUBJECTS}" ]
then
echo "No subjects found in input data"
# exit 1
fi

set +u
CONTAINERDS=$2
set -u
#if [[ ! -z "${CONTAINERDS}" ]]; then
cd ${PROJECTROOT}
datalad clone ${CONTAINERDS} pennlinc-containers

## Add the containers as a subdataset
#datalad clone ria+ssh://sciget.pmacs.upenn.edu:/project/bbl_projects/containers#~pennlinc-containers pennlinc-containers
# download the image so we don't ddos pmacs
cd pennlinc-containers
datalad get -r .
# get rid of the references to pmacs
#set +e
#datalad siblings remove -s pmacs-ria-storage
#datalad siblings remove -s origin
#set -e
cd ${PROJECTROOT}

# Clone the containers dataset. If specified on the command, use that path
CONTAINERDS=$2
if [[ ! -z "${CONTAINERDS}" ]]; then
datalad clone ${CONTAINERDS} pennlinc-containers
else
echo "No containers dataset specified, attempting to clone from pmacs"
datalad clone \
ria+ssh://sciget.pmacs.upenn.edu:/project/bbl_projects/containers#~pennlinc-containers \
pennlinc-containers
cd pennlinc-containers
datalad get -r .
# get rid of the references to pmacs
set +e
datalad siblings remove -s pmacs-ria-storage
git annex dead pmacs-ria-storage
datalad siblings remove -s origin
git annex dead origin
set -e
fi

cd ${PROJECTROOT}/analysis
datalad install -d . --source ${PROJECTROOT}/pennlinc-containers


# C-PAC-specific memory optimization ---------
if [[ ! -z "${CALLBACK_LOG}" ]]; then
ln $CALLBACK_LOG code/runtime_callback.log
fi
# --------------------------------------------


## the actual compute job specification
cat > code/participant_job.sh << "EOT"
#!/bin/bash
#$ -S /bin/bash
#$ -l h_vmem=25G
#$ -l h_vmem=32G
#$ -l s_vmem=32G
#$ -l tmpfree=200G
#$ -R y
#$ -l h_rt=24:00:00
# Set up the correct conda environment
source ${CONDA_PREFIX}/bin/activate base
echo I\'m in $PWD using `which python`
Expand All @@ -138,14 +163,15 @@ set -e -u -x
dssource="$1"
pushgitremote="$2"
subid="$3"
sesid="$4"
# change into the cluster-assigned temp directory. Not done by default in SGE
cd ${CBICA_TMPDIR}
# OR Run it on a shared network drive
# cd /cbica/comp_space/$(basename $HOME)
# Used for the branch names and the temp dir
BRANCH="job-${JOB_ID}-${subid}"
BRANCH="job-${JOB_ID}-${subid}-${sesid}"
mkdir ${BRANCH}
cd ${BRANCH}
Expand Down Expand Up @@ -184,22 +210,38 @@ datalad get -n "inputs/data/${subid}"
# ------------------------------------------------------------------------------
# Do the run!
datalad run \
-i code/fmriprep_zip.sh \
-i inputs/data/${subid} \
-i inputs/data/*json \
-i pennlinc-containers/.datalad/environments/fmriprep-20-2-3/image \
--explicit \
-o ${subid}_fmriprep-20.2.3.zip \
-o ${subid}_freesurfer-20.2.3.zip \
-m "fmriprep:20.2.3 ${subid}" \
"bash ./code/fmriprep_zip.sh ${subid}"
# C-PAC-specific memory optimization --------------------------------
if [[ -f code/runtime_callback.log ]]
then
datalad run \
-i code/c-pac_zip.sh \
-i code/runtime_callback.log \
-i inputs/data/${subid}/${sesid} \
-i inputs/data/*json \
-i pennlinc-containers/.datalad/environments/cpac-1-8-5/image \
--explicit \
-o ${subid}_${sesid}_c-pac-1.8.5.zip \
-m "C-PAC:1.8.5 ${subid} ${sesid}" \
"bash ./code/c-pac_zip.sh ${subid} ${sesid}"
# -------------------------------------------------------------------
else
datalad run \
-i code/c-pac_zip.sh \
-i inputs/data/${subid} \
-i inputs/data/*json \
-i pennlinc-containers/.datalad/environments/cpac-1-8-5/image \
--explicit \
-o ${subid}_${sesid}_c-pac-1.8.5.zip \
-m "C-PAC:1.8.5 ${subid}" \
"bash ./code/c-pac_zip.sh ${subid}"
fi
# file content first -- does not need a lock, no interaction with Git
datalad push --to output-storage
# and the output branch
flock $DSLOCKFILE git push outputstore
# remove tempdir
echo TMPDIR TO DELETE
echo ${BRANCH}
Expand All @@ -215,67 +257,168 @@ EOT

chmod +x code/participant_job.sh

cat > code/fmriprep_zip.sh << "EOT"
cat > code/c-pac_zip.sh << "EOT"
#!/bin/bash
set -e -u -x
subid="$1"
mkdir -p ${PWD}/.git/tmp/wdir
singularity run --cleanenv -B ${PWD} \
pennlinc-containers/.datalad/environments/fmriprep-20-2-3/image \
inputs/data \
prep \
participant \
-w ${PWD}/.git/tmp/wkdir \
--n_cpus 1 \
--stop-on-first-crash \
--fs-license-file code/license.txt \
--skip-bids-validation \
--output-spaces MNI152NLin6Asym:res-2 \
--participant-label "$subid" \
--force-bbr \
--cifti-output 91k -v -v
cd prep
7z a ../${subid}_fmriprep-20.2.3.zip fmriprep
7z a ../${subid}_freesurfer-20.2.3.zip freesurfer
rm -rf prep .git/tmp/wkdir
sesid="$2"
# Create a filter file that only allows this session
filterfile=${PWD}/${sesid}_filter.json
echo "{" > ${filterfile}
echo "'fmap': {'datatype': 'fmap'}," >> ${filterfile}
echo "'bold': {'datatype': 'func', 'session': '$sesid', 'suffix': 'bold'}," >> ${filterfile}
echo "'sbref': {'datatype': 'func', 'session': '$sesid', 'suffix': 'sbref'}," >> ${filterfile}
echo "'flair': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'FLAIR'}," >> ${filterfile}
echo "'t2w': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'T2w'}," >> ${filterfile}
echo "'t1w': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'T1w'}," >> ${filterfile}
echo "'roi': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'roi'}" >> ${filterfile}
echo "}" >> ${filterfile}
# remove ses and get valid json
sed -i "s/'/\"/g" ${filterfile}
sed -i "s/ses-//g" ${filterfile}
mkdir -p ${subid}_${sesid}_outputs
# C-PAC-specific memory optimization -----------------------------
if [[ -f code/runtime_callback.log ]]
then
singularity run --cleanenv \
-B ${PWD} \
-B ${PWD}/${subid}_${sesid}_outputs:/outputs \
pennlinc-containers/.datalad/environments/cpac-1-8-5/image \
inputs/data \
/outputs \
participant \
--preconfig rbc-options \
--skip_bids_validator \
--n_cpus 4 \
--mem_gb 32 \
--participant_label "$subid" \
--runtime_usage=code/runtime_callback.log \
--runtime_buffer=30
# ----------------------------------------------------------------
else
singularity run --cleanenv \
-B ${PWD} \
-B ${PWD}/${subid}_${sesid}_outputs:/outputs \
pennlinc-containers/.datalad/environments/cpac-1-8-5/image \
inputs/data \
/outputs \
participant \
--preconfig rbc-options \
--skip_bids_validator \
--n_cpus 4 \
--mem_gb 32 \
--participant_label "$subid"
fi
rm -rf ${subid}_${sesid}_outputs/working
7z a ${subid}_${sesid}_c-pac-1.8.5.zip ${subid}_${sesid}_outputs
rm -rf ${subid}_${sesid}_outputs
rm ${filterfile}
EOT

chmod +x code/fmriprep_zip.sh
cp ${FREESURFER_HOME}/license.txt code/license.txt
chmod +x code/c-pac_zip.sh

mkdir logs
echo .SGE_datalad_lock >> .gitignore
echo logs >> .gitignore

datalad save -m "Participant compute job implementation"

# Add a script for merging outputs
MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh
################################################################################
# SGE SETUP START - remove or adjust to your needs
################################################################################
cat > code/merge_outputs.sh << "EOT"
#!/bin/bash
set -e -u -x
EOT
echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \
>> code/merge_outputs.sh
echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh
wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh

cat >> code/merge_outputs.sh << "EOT"
# Clone the output store locally so per-job result branches can be merged.
# NOTE(review): ${outputsource} is expanded at merge time, not here — the
# quoted "EOT" delimiter keeps this heredoc literal.
datalad clone ${outputsource} merge_ds
cd merge_ds
NBRANCHES=$(git branch -a | grep job- | sort | wc -l)
echo "Found $NBRANCHES branches to merge"
gitref=$(git show-ref master | cut -d ' ' -f1 | head -n 1)
# query all branches for the most recent commit and check if it is identical.
# Write all branch identifiers for jobs without outputs into a file.
for i in $(git branch -a | grep job- | sort); do [ x"$(git show-ref $i \
| cut -d ' ' -f1)" = x"${gitref}" ] && \
echo $i; done | tee code/noresults.txt | wc -l
# Conversely, branches whose tip differs from master carry results;
# collect those for merging.
for i in $(git branch -a | grep job- | sort); \
do [ x"$(git show-ref $i \
| cut -d ' ' -f1)" != x"${gitref}" ] && \
echo $i; \
done | tee code/has_results.txt
mkdir -p code/merge_batches
num_branches=$(wc -l < code/has_results.txt)
CHUNKSIZE=5000
# expr returns a non-zero status when its result is 0, which would trip
# `set -e` for fewer than CHUNKSIZE branches — relax -e around it.
set +e
num_chunks=$(expr ${num_branches} / ${CHUNKSIZE})
if [[ $num_chunks == 0 ]]; then
num_chunks=1
fi
set -e
# Octopus-merge the result branches in chunks of CHUNKSIZE so a single
# `git merge` command line does not grow unboundedly with the branch count.
for chunknum in $(seq 1 $num_chunks)
do
startnum=$(expr $(expr ${chunknum} - 1) \* ${CHUNKSIZE} + 1)
endnum=$(expr ${chunknum} \* ${CHUNKSIZE})
batch_file=code/merge_branches_$(printf %04d ${chunknum}).txt
# Clamp the final chunk to the actual number of branches.
[[ ${num_branches} -lt ${endnum} ]] && endnum=${num_branches}
branches=$(sed -n "${startnum},${endnum}p;$(expr ${endnum} + 1)q" code/has_results.txt)
echo ${branches} > ${batch_file}
git merge -m "C-PAC results batch ${chunknum}/${num_chunks}" $(cat ${batch_file})
done
# Push the merge back
git push
# Get the file availability info
git annex fsck --fast -f output-storage
# This should not print anything
MISSING=$(git annex find --not --in output-storage)
if [[ ! -z "$MISSING" ]]
then
echo Unable to find data for $MISSING
exit 1
fi
# stop tracking this branch
git annex dead here
datalad push --data nothing
echo SUCCESS
EOT


################################################################################
# SGE SETUP START - remove or adjust to your needs
################################################################################
env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock"
echo '#!/bin/bash' > code/qsub_calls.sh
dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)"
pushgitremote=$(git remote get-url --push output)
eo_args="-e ${PWD}/logs -o ${PWD}/logs"
for subject in ${SUBJECTS}; do
echo "qsub -cwd ${env_flags} -N fp${subject} ${eo_args} \
${PWD}/code/participant_job.sh \
${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh
SESSIONS=$(ls inputs/data/$subject | grep ses- | cut -d '/' -f 1)
for session in ${SESSIONS}; do
echo "qsub -cwd ${env_flags} -N c-pac_${subject}_${session} ${eo_args} \
${PWD}/code/participant_job.sh \
${dssource} ${pushgitremote} ${subject} ${session}" >> code/qsub_calls.sh
done
done
datalad save -m "SGE submission setup" code/ .gitignore

Expand Down

0 comments on commit b1583e9

Please sign in to comment.