Skip to content

Commit

Permalink
🔀 Merge multises setup from fmriprep-multises bootstrap into c-pac bootstrap
Browse files Browse the repository at this point in the history
  • Loading branch information
shnizzedy committed Jul 7, 2022
1 parent a5d9037 commit b1583e9
Showing 1 changed file with 206 additions and 63 deletions.
269 changes: 206 additions & 63 deletions scripts/cubic/bootstrap-c-pac.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
## NOTE ##
# This workflow is derived from the Datalad Handbook

# In addition to the positional arguments described in https://pennlinc.github.io/docs/TheWay/RunningDataLadPipelines/#preparing-the-analysis-dataset ,
# this bootstrap script also takes a /full/path/to/callback.log i.e.,
# `bash bootstrap-c-pac.sh /full/path/to/BIDS /full/path/to/cpac-container /full/path/to/callback.log`
# for optimizing memory (see https://fcp-indi.github.io/docs/nightly/user/tutorials/observed_usage for C-PAC optimization tutorial, and see
# sections marked "C-PAC-specific memory optimization" in this script for details).

## Ensure the environment is ready to bootstrap the analysis workspace
# Check that we have conda installed
#conda activate
Expand All @@ -23,7 +29,7 @@ set -e -u


## Set up the directory that will contain the necessary directories
PROJECTROOT=${PWD}/fmriprep
PROJECTROOT=${PWD}/c-pac-1.8.5
if [[ -d ${PROJECTROOT} ]]
then
echo ${PROJECTROOT} already exists
Expand All @@ -37,6 +43,11 @@ then
fi


# C-PAC-specific memory optimization
CALLBACK_LOG=$3
# ----------------------------------


## Check the BIDS input
BIDSINPUT=$1
if [[ -z ${BIDSINPUT} ]]
Expand Down Expand Up @@ -92,41 +103,55 @@ else
datalad save -r -m "added input data"
fi

SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 3 )
SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 3 | sort)
if [ -z "${SUBJECTS}" ]
then
echo "No subjects found in input data"
# exit 1
fi

set +u
CONTAINERDS=$2
set -u
#if [[ ! -z "${CONTAINERDS}" ]]; then
cd ${PROJECTROOT}
datalad clone ${CONTAINERDS} pennlinc-containers

## Add the containers as a subdataset
#datalad clone ria+ssh://sciget.pmacs.upenn.edu:/project/bbl_projects/containers#~pennlinc-containers pennlinc-containers
# download the image so we don't ddos pmacs
cd pennlinc-containers
datalad get -r .
# get rid of the references to pmacs
#set +e
#datalad siblings remove -s pmacs-ria-storage
#datalad siblings remove -s origin
#set -e
cd ${PROJECTROOT}

# Clone the containers dataset. If specified on the command, use that path
CONTAINERDS=$2
if [[ ! -z "${CONTAINERDS}" ]]; then
datalad clone ${CONTAINERDS} pennlinc-containers
else
echo "No containers dataset specified, attempting to clone from pmacs"
datalad clone \
ria+ssh://sciget.pmacs.upenn.edu:/project/bbl_projects/containers#~pennlinc-containers \
pennlinc-containers
cd pennlinc-containers
datalad get -r .
# get rid of the references to pmacs
set +e
datalad siblings remove -s pmacs-ria-storage
git annex dead pmacs-ria-storage
datalad siblings remove -s origin
git annex dead origin
set -e
fi

cd ${PROJECTROOT}/analysis
datalad install -d . --source ${PROJECTROOT}/pennlinc-containers


# C-PAC-specific memory optimization ---------
if [[ ! -z "${CALLBACK_LOG}" ]]; then
ln $CALLBACK_LOG code/runtime_callback.log
fi
# --------------------------------------------


## the actual compute job specification
cat > code/participant_job.sh << "EOT"
#!/bin/bash
#$ -S /bin/bash
#$ -l h_vmem=25G
#$ -l h_vmem=32G
#$ -l s_vmem=32G
#$ -l tmpfree=200G
#$ -R y
#$ -l h_rt=24:00:00
# Set up the correct conda environment
source ${CONDA_PREFIX}/bin/activate base
echo I\'m in $PWD using `which python`
Expand All @@ -138,14 +163,15 @@ set -e -u -x
dssource="$1"
pushgitremote="$2"
subid="$3"
sesid="$4"
# change into the cluster-assigned temp directory. Not done by default in SGE
cd ${CBICA_TMPDIR}
# OR Run it on a shared network drive
# cd /cbica/comp_space/$(basename $HOME)
# Used for the branch names and the temp dir
BRANCH="job-${JOB_ID}-${subid}"
BRANCH="job-${JOB_ID}-${subid}-${sesid}"
mkdir ${BRANCH}
cd ${BRANCH}
Expand Down Expand Up @@ -184,22 +210,38 @@ datalad get -n "inputs/data/${subid}"
# ------------------------------------------------------------------------------
# Do the run!
datalad run \
-i code/fmriprep_zip.sh \
-i inputs/data/${subid} \
-i inputs/data/*json \
-i pennlinc-containers/.datalad/environments/fmriprep-20-2-3/image \
--explicit \
-o ${subid}_fmriprep-20.2.3.zip \
-o ${subid}_freesurfer-20.2.3.zip \
-m "fmriprep:20.2.3 ${subid}" \
"bash ./code/fmriprep_zip.sh ${subid}"
# C-PAC-specific memory optimization --------------------------------
if [[ -f code/runtime_callback.log ]]
then
datalad run \
-i code/c-pac_zip.sh \
-i code/runtime_callback.log \
-i inputs/data/${subid}/${sesid} \
-i inputs/data/*json \
-i pennlinc-containers/.datalad/environments/cpac-1-8-5/image \
--explicit \
-o ${subid}_${sesid}_c-pac-1.8.5.zip \
-m "C-PAC:1.8.5 ${subid} ${sesid}" \
"bash ./code/c-pac_zip.sh ${subid} ${sesid}"
# -------------------------------------------------------------------
else
datalad run \
-i code/c-pac_zip.sh \
-i inputs/data/${subid} \
-i inputs/data/*json \
-i pennlinc-containers/.datalad/environments/cpac-1-8-5/image \
--explicit \
-o ${subid}_${sesid}_c-pac-1.8.5.zip \
-m "C-PAC:1.8.5 ${subid}" \
"bash ./code/c-pac_zip.sh ${subid}"
fi
# file content first -- does not need a lock, no interaction with Git
datalad push --to output-storage
# and the output branch
flock $DSLOCKFILE git push outputstore
# remove tempdir
echo TMPDIR TO DELETE
echo ${BRANCH}
Expand All @@ -215,67 +257,168 @@ EOT

chmod +x code/participant_job.sh

cat > code/fmriprep_zip.sh << "EOT"
cat > code/c-pac_zip.sh << "EOT"
#!/bin/bash
set -e -u -x
subid="$1"
mkdir -p ${PWD}/.git/tmp/wdir
singularity run --cleanenv -B ${PWD} \
pennlinc-containers/.datalad/environments/fmriprep-20-2-3/image \
inputs/data \
prep \
participant \
-w ${PWD}/.git/tmp/wkdir \
--n_cpus 1 \
--stop-on-first-crash \
--fs-license-file code/license.txt \
--skip-bids-validation \
--output-spaces MNI152NLin6Asym:res-2 \
--participant-label "$subid" \
--force-bbr \
--cifti-output 91k -v -v
cd prep
7z a ../${subid}_fmriprep-20.2.3.zip fmriprep
7z a ../${subid}_freesurfer-20.2.3.zip freesurfer
rm -rf prep .git/tmp/wkdir
sesid="$2"
# Create a filter file that only allows this session
filterfile=${PWD}/${sesid}_filter.json
echo "{" > ${filterfile}
echo "'fmap': {'datatype': 'fmap'}," >> ${filterfile}
echo "'bold': {'datatype': 'func', 'session': '$sesid', 'suffix': 'bold'}," >> ${filterfile}
echo "'sbref': {'datatype': 'func', 'session': '$sesid', 'suffix': 'sbref'}," >> ${filterfile}
echo "'flair': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'FLAIR'}," >> ${filterfile}
echo "'t2w': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'T2w'}," >> ${filterfile}
echo "'t1w': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'T1w'}," >> ${filterfile}
echo "'roi': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'roi'}" >> ${filterfile}
echo "}" >> ${filterfile}
# remove ses and get valid json
sed -i "s/'/\"/g" ${filterfile}
sed -i "s/ses-//g" ${filterfile}
mkdir -p ${subid}_${sesid}_outputs
# C-PAC-specific memory optimization -----------------------------
if [[ -f code/runtime_callback.log ]]
then
singularity run --cleanenv \
-B ${PWD} \
-B ${PWD}/${subid}_${sesid}_outputs:/outputs \
pennlinc-containers/.datalad/environments/cpac-1-8-5/image \
inputs/data \
/outputs \
participant \
--preconfig rbc-options \
--skip_bids_validator \
--n_cpus 4 \
--mem_gb 32 \
--participant_label "$subid" \
--runtime_usage=code/runtime_callback.log \
--runtime_buffer=30
# ----------------------------------------------------------------
else
singularity run --cleanenv \
-B ${PWD} \
-B ${PWD}/${subid}_${sesid}_outputs:/outputs \
pennlinc-containers/.datalad/environments/cpac-1-8-5/image \
inputs/data \
/outputs \
participant \
--preconfig rbc-options \
--skip_bids_validator \
--n_cpus 4 \
--mem_gb 32 \
--participant_label "$subid"
fi
rm -rf ${subid}_${sesid}_outputs/working
7z a ${subid}_${sesid}_c-pac-1.8.5.zip ${subid}_${sesid}_outputs
rm -rf ${subid}_${sesid}_outputs
rm ${filterfile}
EOT

chmod +x code/fmriprep_zip.sh
cp ${FREESURFER_HOME}/license.txt code/license.txt
chmod +x code/c-pac_zip.sh

mkdir logs
echo .SGE_datalad_lock >> .gitignore
echo logs >> .gitignore

datalad save -m "Participant compute job implementation"

# Add a script for merging outputs
MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh
################################################################################
# SGE SETUP START - remove or adjust to your needs
################################################################################
cat > code/merge_outputs.sh << "EOT"
#!/bin/bash
set -e -u -x
EOT
echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \
>> code/merge_outputs.sh
echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh
wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh

cat >> code/merge_outputs.sh << "EOT"
# Clone the output store locally so per-job result branches can be merged.
# NOTE(review): ${outputsource} is expanded at merge time, not here — the
# quoted "EOT" delimiter keeps this heredoc literal.
datalad clone ${outputsource} merge_ds
cd merge_ds
NBRANCHES=$(git branch -a | grep job- | sort | wc -l)
echo "Found $NBRANCHES branches to merge"
gitref=$(git show-ref master | cut -d ' ' -f1 | head -n 1)
# query all branches for the most recent commit and check if it is identical.
# Write all branch identifiers for jobs without outputs into a file.
for i in $(git branch -a | grep job- | sort); do [ x"$(git show-ref $i \
| cut -d ' ' -f1)" = x"${gitref}" ] && \
echo $i; done | tee code/noresults.txt | wc -l
# Conversely, branches whose tip differs from master carry results;
# collect those for merging.
for i in $(git branch -a | grep job- | sort); \
do [ x"$(git show-ref $i \
| cut -d ' ' -f1)" != x"${gitref}" ] && \
echo $i; \
done | tee code/has_results.txt
mkdir -p code/merge_batches
num_branches=$(wc -l < code/has_results.txt)
CHUNKSIZE=5000
# expr returns a non-zero status when its result is 0, which would trip
# `set -e` for fewer than CHUNKSIZE branches — relax -e around it.
set +e
num_chunks=$(expr ${num_branches} / ${CHUNKSIZE})
if [[ $num_chunks == 0 ]]; then
num_chunks=1
fi
set -e
# Octopus-merge the result branches in chunks of CHUNKSIZE so a single
# `git merge` command line does not grow unboundedly with the branch count.
for chunknum in $(seq 1 $num_chunks)
do
startnum=$(expr $(expr ${chunknum} - 1) \* ${CHUNKSIZE} + 1)
endnum=$(expr ${chunknum} \* ${CHUNKSIZE})
batch_file=code/merge_branches_$(printf %04d ${chunknum}).txt
# Clamp the final chunk to the actual number of branches.
[[ ${num_branches} -lt ${endnum} ]] && endnum=${num_branches}
branches=$(sed -n "${startnum},${endnum}p;$(expr ${endnum} + 1)q" code/has_results.txt)
echo ${branches} > ${batch_file}
git merge -m "C-PAC results batch ${chunknum}/${num_chunks}" $(cat ${batch_file})
done
# Push the merge back
git push
# Get the file availability info
git annex fsck --fast -f output-storage
# This should not print anything
MISSING=$(git annex find --not --in output-storage)
if [[ ! -z "$MISSING" ]]
then
echo Unable to find data for $MISSING
exit 1
fi
# stop tracking this branch
git annex dead here
datalad push --data nothing
echo SUCCESS
EOT


################################################################################
# SGE SETUP START - remove or adjust to your needs
################################################################################
env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock"
echo '#!/bin/bash' > code/qsub_calls.sh
dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)"
pushgitremote=$(git remote get-url --push output)
eo_args="-e ${PWD}/logs -o ${PWD}/logs"
for subject in ${SUBJECTS}; do
echo "qsub -cwd ${env_flags} -N fp${subject} ${eo_args} \
${PWD}/code/participant_job.sh \
${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh
SESSIONS=$(ls inputs/data/$subject | grep ses- | cut -d '/' -f 1)
for session in ${SESSIONS}; do
echo "qsub -cwd ${env_flags} -N c-pac_${subject}_${session} ${eo_args} \
${PWD}/code/participant_job.sh \
${dssource} ${pushgitremote} ${subject} ${session}" >> code/qsub_calls.sh
done
done
datalad save -m "SGE submission setup" code/ .gitignore

Expand Down

0 comments on commit b1583e9

Please sign in to comment.