diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..496ee2c --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.DS_Store \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..b1b99db --- /dev/null +++ b/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2021, Lifespan Informatics and Neuroimaging Center +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
diff --git a/README.md b/README.md new file mode 100644 index 0000000..9fc3566 --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# TheWay +Boilerplate scripts for running workflows on Penn clusters diff --git a/Untitled Diagram.drawio b/Untitled Diagram.drawio new file mode 100644 index 0000000..ceb1dde --- /dev/null +++ b/Untitled Diagram.drawio @@ -0,0 +1,97 @@ +(drawio XML diagram content not rendered here) diff --git a/scripts/cubic/add_archive_content.sh b/scripts/cubic/add_archive_content.sh new file mode 100644 index 0000000..b8283dd --- /dev/null +++ b/scripts/cubic/add_archive_content.sh @@ -0,0 +1,27 @@ +#!/bin/bash +#$ -l h_vmem=25G +#$ -R y + +# Adds output files to the archive so their file stubs are accessible by users + +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` + +cd ${PWD} + +BOOTSTRAP_DIR=$1 + +# clone output ria of bootstrap dir +datalad clone ria+file:///${BOOTSTRAP_DIR}/output_ria#~data archive_clone + +# cd into clone +cd archive_clone + +for zip in *.zip; do + echo Adding archive content for ${zip} + datalad get ${zip} + datalad add-archive-content -e 'logs/.*' -e '.bidsignore' -e 'dataset_description.json' -e 'dwiqc.json' --drop-after ${zip} +done + +datalad push diff --git a/scripts/cubic/add_urls_hcpd.sh b/scripts/cubic/add_urls_hcpd.sh new file mode 100644 index 0000000..b852ab7 --- /dev/null +++ b/scripts/cubic/add_urls_hcpd.sh @@ -0,0 +1,58 @@ +#!/bin/bash +#$ -l h_vmem=25G +#$ -R y +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` + +SUBJECTIDCSV=/cbica/projects/RBC/testing/hcpd/hcpd_subject_ids.csv +S3CSV=/cbica/projects/RBC/testing/hcpd/hcpd_s3.csv + +# create HCPD datalad dataset +datalad create -c text2git HCPD + +# cd into HCPD datalad dataset +cd HCPD + +# initialize git annex remote +git annex initremote datalad type=external externaltype=datalad encryption=none autoenable=true + +# get the dataset config file from github +wget https://raw.githubusercontent.com/TobiasKadelka/build_hcp/master/code/cfg_hcp_dataset.sh + +# get list of subject IDs and run addurls for each subject +SUBJECTS=$(cut -d, -f1 ${SUBJECTIDCSV}) + +# heredoc for subject csv creator +cat > get_subject_csv.py << "EOT" +#!/usr/bin/env python +""" +USAGE: +python get_subject_csv.py hcpd_s3.csv subid +Run this from add_urls_hcpd.sh +Creates a csv for a single participant +""" +import pandas as pd +import sys +hcpdcsv = sys.argv[1] +subid = sys.argv[2] +df = pd.read_csv(hcpdcsv) +df2 = df[df.filename.str.startswith(subid)] +df3 = df2.drop_duplicates(subset="filename", keep='first', ignore_index=True) +df3.to_csv("/cbica/projects/RBC/testing/hcpd/subject_csvs/" + subid + ".csv", index=False) +EOT + + + +chmod +x get_subject_csv.py + +datalad save -m "Added python file to create subject CSVs and HCPD dataset config" + +for subject in ${SUBJECTS}; do + echo Creating subject csv for ${subject} + python get_subject_csv.py ${S3CSV} ${subject} + + echo Adding URLs for ${subject} + datalad addurls -c hcp_dataset -d ${subject} ~/testing/hcpd/subject_csvs/${subject}.csv '{associated_file}' '{filename}' + datalad save -m "Added URLs for ${subject}" +done diff --git a/scripts/cubic/bootstrap-aslprep-anatomical.sh b/scripts/cubic/bootstrap-aslprep-anatomical.sh new file mode 100644 index 0000000..b7e2e5e --- /dev/null +++
b/scripts/cubic/bootstrap-aslprep-anatomical.sh @@ -0,0 +1,279 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/aslprep +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + + +## Check the BIDS input +BIDSINPUT=$1 +if [[ -z ${BIDSINPUT} ]] +then + echo "Required argument is an identifier of the BIDS source" + # exit 1 +fi + +# Is it a directory on the filesystem? +BIDS_INPUT_METHOD=clone +if [[ -d "${BIDSINPUT}" ]] +then + # Check if it's datalad + BIDS_DATALAD_ID=$(datalad -f '{infos[dataset][id]}' wtf -S \ + dataset -d ${BIDSINPUT} 2> /dev/null || true) + [ "${BIDS_DATALAD_ID}" = 'N/A' ] && BIDS_INPUT_METHOD=copy +fi + + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +# register the input dataset +if [[ "${BIDS_INPUT_METHOD}" == "clone" ]] +then + echo "Cloning input dataset into analysis dataset" + datalad clone -d . ${BIDSINPUT} inputs/data + # amend the previous commit with a nicer commit message + git commit --amend -m 'Register input data dataset as a subdataset' +else + echo "WARNING: copying input data into repository" + mkdir -p inputs/data + cp -r ${BIDSINPUT}/* inputs/data + datalad save -r -m "added input data" +fi + +SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 3 | sort) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + + +## Add the containers as a subdataset +cd ${PROJECTROOT} + +# Clone the containers dataset. If specified on the command, use that path +CONTAINERDS=$2 +if [[ ! -z "${CONTAINERDS}" ]]; then + datalad clone ${CONTAINERDS} pennlinc-containers +else + echo ERROR: requires a container dataset + exit 1 +fi + +cd ${PROJECTROOT}/analysis +datalad install -d . 
--source ${PROJECTROOT}/pennlinc-containers + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=32G +#$ -l tmpfree=200G +#$ -pe threaded 6 +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +# change into the cluster-assigned temp directory. Not done by default in SGE +cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +# cd /cbica/comp_space/$(basename $HOME) +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the location that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds +# all following actions are performed in the context of the superdataset +cd ds +# in order to avoid accumulating temporary git-annex availability information +# and to avoid a synchronization bottleneck by having to consolidate the +# git-annex branch across jobs, we will only push the main tracking branch +# back to the output store (plus the actual file content). Final availability +# information can be established via an eventual `git-annex fsck -f output-storage`. +# this remote is never fetched, it accumulates a larger number of branches +# and we want to avoid progressive slowdown. Instead we only ever push +# a unique branch for each job (subject AND process specific name) +git remote add outputstore "$pushgitremote" +# all results of this job will be put into a dedicated branch +git checkout -b "${BRANCH}" +# we pull down the input subject manually in order to discover relevant +# files. We do this outside the recorded call, because on a potential +# re-run we want to be able to do fine-grained recomputing of individual +# outputs. The recorded calls will have specific paths that will enable +# recomputation outside the scope of the original setup +datalad get -n "inputs/data/${subid}" +# Remove all subjects we're not working on +(cd inputs/data && rm -rf `find . -type d -name 'sub*' | grep -v $subid`) +# ------------------------------------------------------------------------------ +# Do the run! +datalad run \ + -i code/aslprep_zip.sh \ + -i inputs/data/${subid} \ + -i inputs/data/*json \ + -i pennlinc-containers/.datalad/environments/aslprep-0-2-7/image \ + --explicit \ + -o ${subid}_aslprep-0.2.7.zip \ + -m "aslprep:0.2.7 ${subid}" \ + "bash ./code/aslprep_zip.sh ${subid}" +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore +# remove tempdir +echo TMPDIR TO DELETE +echo ${BRANCH} +datalad uninstall -r --nocheck --if-dirty ignore inputs/data +datalad drop -r . --nocheck +git annex dead here +cd ../..
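+# at this point the results and the job branch have been pushed and this clone's +# annex has been marked dead, so the entire per-job workspace can be deleted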
+rm -rf $BRANCH +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +cat > code/aslprep_zip.sh << "EOT" +#!/bin/bash +set -e -u -x +subid="$1" +mkdir -p ${PWD}/.git/tmp/wdir +singularity run --cleanenv -B ${PWD} \ + pennlinc-containers/.datalad/environments/aslprep-0-2-7/image \ + inputs/data \ + prep \ + participant \ + -w ${PWD}/.git/tmp/wkdir \ + --n_cpus $NSLOTS \ + --stop-on-first-crash \ + --skip-bids-validation \ + --fs-license-file code/license.txt \ + --output-spaces MNI152NLin6Asym:res-2 \ + --participant-label "$subid" \ + --force-bbr -v -v +cd prep +7z a ../${subid}_aslprep-0.2.7.zip aslprep +rm -rf prep .git/tmp/wkdir +EOT + +chmod +x code/aslprep_zip.sh +cp ${FREESURFER_HOME}/license.txt code/license.txt + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N qp${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject}" >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +if [ "${BIDS_INPUT_METHOD}" = "clone" ] +then + datalad uninstall -r --nocheck inputs/data +fi + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-aslprep-qc.sh b/scripts/cubic/bootstrap-aslprep-qc.sh new file mode 100644 index 0000000..7d5effe --- /dev/null +++ b/scripts/cubic/bootstrap-aslprep-qc.sh @@ -0,0 +1,244 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." 
+ echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/ASLPREP_QC +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +mkdir -p $PROJECTROOT + +## DERIVATIVE_BOOTSTRAP_DIR will be the path to the bootstrap directory containing your derivatives +DERIVATIVE_BOOTSTRAP_DIR=$1 +DERIVATIVE_INPUT=ria+file://${DERIVATIVE_BOOTSTRAP_DIR}"/output_ria#~data" +if [[ -z ${DERIVATIVE_BOOTSTRAP_DIR} ]] +then + echo "Required argument is the path to the aslprep bootstrap directory." + echo "This directory should contain analysis/, input_ria/ and output_ria/." + # exit 1 +fi + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +cd $PROJECTROOT +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +datalad install -d . -r --source ${DERIVATIVE_INPUT} inputs/data + +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + +ZIPS=$(find inputs/data -name 'sub-*aslprep*' | cut -d '/' -f 3 | sort) +if [ -z "${ZIPS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=25G +#$ -l tmpfree=200G +#$ -R y +#$ -l h_rt=24:00:00 +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +# change into the cluster-assigned temp directory. Not done by default in SGE +cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +# cd /cbica/comp_space/$(basename $HOME) +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds +# all following actions are performed in the context of the superdataset +cd ds +git remote add outputstore "$pushgitremote" +git checkout -b "${BRANCH}" +# ------------------------------------------------------------------------------ +# Do the run! 
+ +datalad run \ + -i code/get_files.sh \ + -i inputs/data/${subid}_aslprep*.zip \ + --explicit \ + -o ${subid}*quality*.csv \ + -m "unzipped ${subid}" \ + "bash code/get_files.sh inputs/data/${subid}_aslprep*.zip" +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore +echo TMPDIR TO DELETE +echo ${BRANCH} +datalad drop -r . --nocheck +datalad uninstall -r inputs/data +git annex dead here +cd ../.. +rm -rf $BRANCH +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +## the actual compute job specification +echo writing script to file... +cat > code/get_files.sh << "EOT" +#!/bin/bash +set -e -u -x +ZIP_FILE=$1 +subid=$(basename $ZIP_FILE | cut -d '_' -f 1) +# unzip outputs +unzip -n $ZIP_FILE 'aslprep/*' -d . + +cp aslprep/${subid}/*/perf/*quality*.csv . + +# remove unzip dir +rm -rf aslprep +EOT + +chmod +x code/get_files.sh + +##### concat_outputs.sh START #### + +cat > code/concat_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT + +echo "PROJECT_ROOT=${PROJECTROOT}" >> code/concat_outputs.sh +echo "cd ${PROJECTROOT}" >> code/concat_outputs.sh + +cat >> code/concat_outputs.sh << "EOT" +# set up concat_ds and run concatenator on it +cd ${CBICA_TMPDIR} +datalad clone ria+file://${PROJECT_ROOT}/output_ria#~data concat_ds +cd concat_ds/code +wget https://raw.githubusercontent.com/PennLINC/RBC/master/PennLINC/Generic/concatenator.py +cd .. +datalad save -m "added concatenator script" +datalad run -i 'sub-*quality*.csv' -o '${PROJECT_ROOT}/ASLPREP_QC.csv' --expand inputs --explicit "python code/concatenator.py $PWD ${PROJECT_ROOT}/ASLPREP_QC.csv" +datalad save -m "generated report" +# push changes +datalad push +# remove concat_ds +git annex dead here +cd .. 
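+# git-annex write-protects annexed files, so the clone must be made writable +# before it can be deleted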
+chmod +w -R concat_ds +rm -rf concat_ds +echo SUCCESS +EOT + +#### concat_output.sh END #### + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" + +for zip in ${ZIPS}; do + subject=`echo ${zip} | cut -d '_' -f 1` + echo "qsub -cwd ${env_flags} -N UNZIP${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject}" >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-aslprep.sh b/scripts/cubic/bootstrap-aslprep.sh new file mode 100644 index 0000000..b7e2e5e --- /dev/null +++ b/scripts/cubic/bootstrap-aslprep.sh @@ -0,0 +1,279 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/aslprep +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + + +## Check the BIDS input +BIDSINPUT=$1 +if [[ -z ${BIDSINPUT} ]] +then + echo "Required argument is an identifier of the BIDS source" + # exit 1 +fi + +# Is it a directory on the filesystem? 
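+# default to cloning the BIDS input; if the path exists on disk but datalad +# reports no dataset id ('N/A') below, fall back to copying the raw files instead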
+BIDS_INPUT_METHOD=clone +if [[ -d "${BIDSINPUT}" ]] +then + # Check if it's datalad + BIDS_DATALAD_ID=$(datalad -f '{infos[dataset][id]}' wtf -S \ + dataset -d ${BIDSINPUT} 2> /dev/null || true) + [ "${BIDS_DATALAD_ID}" = 'N/A' ] && BIDS_INPUT_METHOD=copy +fi + + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +# register the input dataset +if [[ "${BIDS_INPUT_METHOD}" == "clone" ]] +then + echo "Cloning input dataset into analysis dataset" + datalad clone -d . ${BIDSINPUT} inputs/data + # amend the previous commit with a nicer commit message + git commit --amend -m 'Register input data dataset as a subdataset' +else + echo "WARNING: copying input data into repository" + mkdir -p inputs/data + cp -r ${BIDSINPUT}/* inputs/data + datalad save -r -m "added input data" +fi + +SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 3 | sort) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + + +## Add the containers as a subdataset +cd ${PROJECTROOT} + +# Clone the containers dataset. If specified on the command, use that path +CONTAINERDS=$2 +if [[ ! -z "${CONTAINERDS}" ]]; then + datalad clone ${CONTAINERDS} pennlinc-containers +else + echo ERROR: requires a container dataset + exit 1 +fi + +cd ${PROJECTROOT}/analysis +datalad install -d . --source ${PROJECTROOT}/pennlinc-containers + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=32G +#$ -l tmpfree=200G +#$ -pe threaded 6 +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +# change into the cluster-assigned temp directory. 
Not done by default in SGE +cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +# cd /cbica/comp_space/$(basename $HOME) +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the location that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds +# all following actions are performed in the context of the superdataset +cd ds +# in order to avoid accumulating temporary git-annex availability information +# and to avoid a synchronization bottleneck by having to consolidate the +# git-annex branch across jobs, we will only push the main tracking branch +# back to the output store (plus the actual file content). Final availability +# information can be established via an eventual `git-annex fsck -f output-storage`. +# this remote is never fetched, it accumulates a larger number of branches +# and we want to avoid progressive slowdown. Instead we only ever push +# a unique branch for each job (subject AND process specific name) +git remote add outputstore "$pushgitremote" +# all results of this job will be put into a dedicated branch +git checkout -b "${BRANCH}" +# we pull down the input subject manually in order to discover relevant +# files. We do this outside the recorded call, because on a potential +# re-run we want to be able to do fine-grained recomputing of individual +# outputs. The recorded calls will have specific paths that will enable +# recomputation outside the scope of the original setup +datalad get -n "inputs/data/${subid}" +# Remove all subjects we're not working on +(cd inputs/data && rm -rf `find . -type d -name 'sub*' | grep -v $subid`) +# ------------------------------------------------------------------------------ +# Do the run! +datalad run \ + -i code/aslprep_zip.sh \ + -i inputs/data/${subid} \ + -i inputs/data/*json \ + -i pennlinc-containers/.datalad/environments/aslprep-0-2-7/image \ + --explicit \ + -o ${subid}_aslprep-0.2.7.zip \ + -m "aslprep:0.2.7 ${subid}" \ + "bash ./code/aslprep_zip.sh ${subid}" +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore +# remove tempdir +echo TMPDIR TO DELETE +echo ${BRANCH} +datalad uninstall -r --nocheck --if-dirty ignore inputs/data +datalad drop -r . --nocheck +git annex dead here +cd ../..
+rm -rf $BRANCH +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +cat > code/aslprep_zip.sh << "EOT" +#!/bin/bash +set -e -u -x +subid="$1" +mkdir -p ${PWD}/.git/tmp/wdir +singularity run --cleanenv -B ${PWD} \ + pennlinc-containers/.datalad/environments/aslprep-0-2-7/image \ + inputs/data \ + prep \ + participant \ + -w ${PWD}/.git/tmp/wkdir \ + --n_cpus $NSLOTS \ + --stop-on-first-crash \ + --skip-bids-validation \ + --fs-license-file code/license.txt \ + --output-spaces MNI152NLin6Asym:res-2 \ + --participant-label "$subid" \ + --force-bbr -v -v +cd prep +7z a ../${subid}_aslprep-0.2.7.zip aslprep +rm -rf prep .git/tmp/wkdir +EOT + +chmod +x code/aslprep_zip.sh +cp ${FREESURFER_HOME}/license.txt code/license.txt + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N qp${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject}" >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +if [ "${BIDS_INPUT_METHOD}" = "clone" ] +then + datalad uninstall -r --nocheck inputs/data +fi + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-fmriprep-anatonly.sh b/scripts/cubic/bootstrap-fmriprep-anatonly.sh new file mode 100644 index 0000000..2157cc4 --- /dev/null +++ b/scripts/cubic/bootstrap-fmriprep-anatonly.sh @@ -0,0 +1,304 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? 
+#fi + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/fmriprep +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + + +## Check the BIDS input +BIDSINPUT=$1 +if [[ -z ${BIDSINPUT} ]] +then + echo "Required argument is an identifier of the BIDS source" + # exit 1 +fi + +# Is it a directory on the filesystem? +BIDS_INPUT_METHOD=clone +if [[ -d "${BIDSINPUT}" ]] +then + # Check if it's datalad + BIDS_DATALAD_ID=$(datalad -f '{infos[dataset][id]}' wtf -S \ + dataset -d ${BIDSINPUT} 2> /dev/null || true) + [ "${BIDS_DATALAD_ID}" = 'N/A' ] && BIDS_INPUT_METHOD=copy +fi + + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +# register the input dataset +if [[ "${BIDS_INPUT_METHOD}" == "clone" ]] +then + echo "Cloning input dataset into analysis dataset" + datalad clone -d . ${BIDSINPUT} inputs/data + # amend the previous commit with a nicer commit message + git commit --amend -m 'Register input data dataset as a subdataset' +else + echo "WARNING: copying input data into repository" + mkdir -p inputs/data + cp -r ${BIDSINPUT}/* inputs/data + datalad save -r -m "added input data" +fi + +SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 3 ) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +set +u +CONTAINERDS=$2 +set -u +#if [[ ! -z "${CONTAINERDS}" ]]; then +cd ${PROJECTROOT} +datalad clone ${CONTAINERDS} pennlinc-containers +## Add the containers as a subdataset +#datalad clone ria+ssh://sciget.pmacs.upenn.edu:/project/bbl_projects/containers#~pennlinc-containers pennlinc-containers +# download the image so we don't ddos pmacs +cd pennlinc-containers +datalad get -r . +# get rid of the references to pmacs +#set +e +#datalad siblings remove -s pmacs-ria-storage +#datalad siblings remove -s origin +#set -e + +cd ${PROJECTROOT}/analysis +datalad install -d . 
--source ${PROJECTROOT}/pennlinc-containers + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=25G +#$ -l tmpfree=200G +#$ -R y +#$ -l h_rt=24:00:00 +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` + +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x + +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" + +# change into the cluster-assigned temp directory. Not done by default in SGE +cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +# cd /cbica/comp_space/$(basename $HOME) + +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} + +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the location that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds + +# all following actions are performed in the context of the superdataset +cd ds + +# in order to avoid accumulating temporary git-annex availability information +# and to avoid a synchronization bottleneck by having to consolidate the +# git-annex branch across jobs, we will only push the main tracking branch +# back to the output store (plus the actual file content). Final availability +# information can be established via an eventual `git-annex fsck -f output-storage`. +# this remote is never fetched, it accumulates a larger number of branches +# and we want to avoid progressive slowdown. Instead we only ever push +# a unique branch for each job (subject AND process specific name) +git remote add outputstore "$pushgitremote" + +# all results of this job will be put into a dedicated branch +git checkout -b "${BRANCH}" + +# we pull down the input subject manually in order to discover relevant +# files. We do this outside the recorded call, because on a potential +# re-run we want to be able to do fine-grained recomputing of individual +# outputs. The recorded calls will have specific paths that will enable +# recomputation outside the scope of the original setup +datalad get -n "inputs/data/${subid}" + +# Remove all subjects we're not working on +(cd inputs/data && rm -rf `find . -type d -name 'sub*' | grep -v $subid`) + +# ------------------------------------------------------------------------------ +# Do the run! + +datalad run \ + -i code/fmriprep_zip.sh \ + -i inputs/data/${subid} \ + -i inputs/data/*json \ + -i pennlinc-containers/.datalad/environments/fmriprep-20-2-3/image \ + --explicit \ + -o ${subid}_freesurfer-20.2.3.zip \ + -m "fmriprep:20.2.3 ${subid}" \ + "bash ./code/fmriprep_zip.sh ${subid}" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore + +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad uninstall -r --nocheck --if-dirty ignore inputs/data +datalad drop -r . --nocheck +git annex dead here +cd ../..
+rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +cat > code/fmriprep_zip.sh << "EOT" +#!/bin/bash +set -e -u -x + +subid="$1" +mkdir -p ${PWD}/.git/tmp/wdir +singularity run --cleanenv -B ${PWD} \ + pennlinc-containers/.datalad/environments/fmriprep-20-2-3/image \ + inputs/data \ + prep \ + participant \ + -w ${PWD}/.git/tmp/wkdir \ + --n_cpus 1 \ + --stop-on-first-crash \ + --fs-license-file code/license.txt \ + --skip-bids-validation \ + --output-spaces MNI152NLin6Asym:res-2 \ + --anat-only \ + --participant-label "$subid" \ + --force-bbr \ + --cifti-output 91k -v -v + +cd prep +7z a ../${subid}_freesurfer-20.2.3.zip freesurfer +rm -rf prep .git/tmp/wkdir + +EOT + +chmod +x code/fmriprep_zip.sh +cp ${FREESURFER_HOME}/license.txt code/license.txt + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N fp${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +if [ "${BIDS_INPUT_METHOD}" = "clone" ] +then + datalad uninstall -r --nocheck inputs/data +fi + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-fmriprep-audit.sh b/scripts/cubic/bootstrap-fmriprep-audit.sh new file mode 100644 index 0000000..19bda55 --- /dev/null +++ b/scripts/cubic/bootstrap-fmriprep-audit.sh @@ -0,0 +1,345 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed + +DATALAD_VERSION=$(datalad --version) + +if [ $? 
-gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/fmriprep-audit +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +FMRIPREP_BOOTSTRAP_DIR=$1 +FMRIPREP_INPUT=ria+file://${FMRIPREP_BOOTSTRAP_DIR}"/output_ria#~data" +if [[ -z ${FMRIPREP_BOOTSTRAP_DIR} ]] +then + echo "Required argument is the path to the fmriprep bootstrap directory." + echo "This directory should contain analysis/, input_ria/ and output_ria/." + # exit 1 +fi + +# Is it a directory on the filesystem? +FMRIPREP_INPUT_METHOD=clone +if [[ ! -d "${FMRIPREP_BOOTSTRAP_DIR}/output_ria/alias/data" ]] +then + echo "There must be alias in the output ria store that points to the" + echo "fmriprep output dataset" + # exit 1 +fi + +# Check that there are some fmriprep zip files present in the input +# If you only need freesurfer, comment this out +# FMRIPREP_ZIPS=$(cd ${FMRIPREP_INPUT} && ls *fmriprep*.zip) +# if [[ -z "${FMRIPREP_ZIPS}" ]]; then +# echo No fmriprep zip files found in ${FMRIPREP_INPUT} +# exit 1 +# fi + +# Check that freesurfer data exists. If you only need fmriprep zips, comment +# this out +# FREESURFER_ZIPS=$(cd ${FMRIPREP_INPUT} && ls *freesurfer*.zip) +# if [[ -z "${FREESURFER_ZIPS}" ]]; then +# echo No freesurfer zip files found in ${FMRIPREP_INPUT} +# exit 1 +# fi + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + + +# Create a dataset with the logs in it +mkdir fmriprep_logs +cd fmriprep_logs +datalad create -D "Logs from the fmriprep runs" +cp ${FMRIPREP_BOOTSTRAP_DIR}/analysis/logs/* . +datalad save -m "add logs" + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +cd $PROJECTROOT +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +datalad install -d . -r --source ${FMRIPREP_INPUT} inputs/data +datalad install -d . 
-r --source ${PROJECTROOT}/fmriprep_logs inputs/fmriprep_logs + +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + +SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 5 ) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=5G +#$ -l s_vmem=3.5G +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` + +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x + +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" + +# change into the cluster-assigned temp directory. Not done by default in SGE +cd ${CBICA_TMPDIR} + +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} +datalad clone "${dssource}" ds +cd ds +git remote add outputstore "$pushgitremote" +git checkout -b "${BRANCH}" + +# ------------------------------------------------------------------------------ +# Do the run! +BIDS_DIR=${PWD}/inputs/data/inputs/data +ZIPS_DIR=${PWD}/inputs/data +ERROR_DIR=${PWD}/inputs/fmriprep_logs +CSV_DIR=csvs +mkdir ${CSV_DIR} +output_file=${CSV_DIR}/${subid}_fmriprep_audit.csv + +datalad get -n inputs/data + +INPUT_ZIP=$(ls inputs/data/${subid}_fmriprep*.zip | cut -d '@' -f 1 || true) +if [ ! -z "${INPUT_ZIP}" ]; then + INPUT_ZIP="-i ${INPUT_ZIP}" +fi + +echo DATALAD RUN INPUT +echo ${INPUT_ZIP} + +datalad run \ + -i code/bootstrap_zip_audit.py \ + ${INPUT_ZIP} \ + -i inputs/data/inputs/data/${subid} \ + -i inputs/fmriprep_logs/*${subid}* \ + --explicit \ + -o ${output_file} \ + -m "fmriprep-audit ${subid}" \ + "python code/bootstrap_zip_audit.py ${subid} ${BIDS_DIR} ${ZIPS_DIR} ${ERROR_DIR} ${output_file} fmriprep" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore + +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad drop -r . --nocheck +datalad uninstall -r inputs/data +git annex dead here +cd ../.. +rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +# Sydney, please wget your audit script here! +wget https://raw.githubusercontent.com/PennLINC/RBC/master/PennLINC/Generic/bootstrap_zip_audit.py +mv bootstrap_zip_audit.py code/ +chmod +x code/bootstrap_zip_audit.py + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Add logs from fmriprep runs" + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT + +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh + +cat >> code/merge_outputs.sh << "EOT" +datalad clone ${outputsource} merge_ds +cd merge_ds +NBRANCHES=$(git branch -a | grep job- | sort | wc -l) +echo "Found $NBRANCHES branches to merge" + +gitref=$(git show-ref master | cut -d ' ' -f1 | head -n 1) + +# query all branches for the most recent commit and check if it is identical. 
+# Write all branch identifiers for jobs without outputs into a file. +for i in $(git branch -a | grep job- | sort); do [ x"$(git show-ref $i \ + | cut -d ' ' -f1)" = x"${gitref}" ] && \ + echo $i; done | tee code/noresults.txt | wc -l + + +for i in $(git branch -a | grep job- | sort); \ + do [ x"$(git show-ref $i \ + | cut -d ' ' -f1)" != x"${gitref}" ] && \ + echo $i; \ +done | tee code/has_results.txt + +mkdir -p code/merge_batches +num_branches=$(wc -l < code/has_results.txt) +CHUNKSIZE=5000 + +set +e +num_chunks=$(expr ${num_branches} / ${CHUNKSIZE}) +[[ $num_chunks == 0 ]] && num_chunks=1 +set -e -x + +for chunknum in $(seq 1 $num_chunks) +do + startnum=$(expr $(expr ${chunknum} - 1) \* ${CHUNKSIZE} + 1) + endnum=$(expr ${chunknum} \* ${CHUNKSIZE}) + batch_file=code/merge_branches_$(printf %04d ${chunknum}).txt + [[ ${num_branches} -lt ${endnum} ]] && endnum=${num_branches} + branches=$(sed -n "${startnum},${endnum}p;$(expr ${endnum} + 1)q" code/has_results.txt) + echo ${branches} > ${batch_file} + git merge -m "fmriprep results batch ${chunknum}/${num_chunks}" $(cat ${batch_file}) + +done + +# Push the merge back +git push + +# Get the file availability info +git annex fsck --fast -f output-storage + +# This should not print anything +MISSING=$(git annex find --not --in output-storage) + +if [[ ! -z "$MISSING" ]] +then + echo Unable to find data for $MISSING + exit 1 +fi + +# stop tracking this branch +git annex dead here +datalad push --data nothing +echo SUCCESS + +EOT + +##### concat_outputs.sh START #### + +cat > code/concat_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT + +echo "PROJECT_ROOT=${PROJECTROOT}" >> code/concat_outputs.sh +echo "cd ${PROJECTROOT}" >> code/concat_outputs.sh + +cat >> code/concat_outputs.sh << "EOT" +# set up concat_ds and run concatenator on it +cd ${CBICA_TMPDIR} +datalad clone ria+file://${PROJECT_ROOT}/output_ria#~data concat_ds +cd concat_ds/code +wget https://raw.githubusercontent.com/PennLINC/RBC/master/PennLINC/Generic/concatenator.py +cd .. +datalad save -m "added concatenator script" +datalad run -i 'csvs/*' -o '${PROJECT_ROOT}/FMRIPREP_AUDIT.csv' --expand inputs --explicit "python code/concatenator.py csvs ${PROJECT_ROOT}/FMRIPREP_AUDIT.csv" +datalad save -m "generated report" +# push changes +datalad push +# remove concat_ds +git annex dead here +cd .. +chmod +w -R concat_ds +rm -rf concat_ds +echo SUCCESS + +EOT + +#### concat_output.sh END #### + + +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N fp${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. 
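+# the input RIA store is what the compute jobs clone from; the output RIA store +# receives the per-job branches and the annexed file content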
+datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-fmriprep-bugcheck.sh b/scripts/cubic/bootstrap-fmriprep-bugcheck.sh new file mode 100755 index 0000000..4c63a0a --- /dev/null +++ b/scripts/cubic/bootstrap-fmriprep-bugcheck.sh @@ -0,0 +1,288 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +# USAGE: $0 bids-dir fmriprep-version + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + +VERSION=$2 + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/fmriprep-${VERSION} +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +FMRIPREP_BOOTSTRAP_DIR=$1 +FMRIPREP_INPUT=ria+file://${FMRIPREP_BOOTSTRAP_DIR}"/output_ria#~data" +if [[ -z ${FMRIPREP_BOOTSTRAP_DIR} ]] +then + echo "Required argument is the path to the freesurfer bootstrap directory." + echo "This directory should contain analysis/, input_ria/ and output_ria/." + # exit 1 +fi + +# Is it a directory on the filesystem? +FMRIPREP_INPUT_METHOD=clone +if [[ ! -d "${FMRIPREP_BOOTSTRAP_DIR}/output_ria/alias/data" ]] +then + echo "There must be alias in the output ria store that points to the" + echo "freesurfer output dataset" + # exit 1 +fi + +# Check that there are some freesurfer zip files present in the input +# If you only need freesurfer, comment this out +# FREESURFER_ZIPS=$(cd ${FMRIPREP_INPUT} && ls *freesurfer*.zip) +# if [[ -z "${FREESURFER_ZIPS}" ]]; then +# echo No freesurfer zip files found in ${FMRIPREP_INPUT} +# exit 1 +# fi + +# Check that freesurfer data exists. If you only need freesurfer zips, comment +# this out +# FREESURFER_ZIPS=$(cd ${FMRIPREP_INPUT} && ls *freesurfer*.zip) +# if [[ -z "${FREESURFER_ZIPS}" ]]; then +# echo No freesurfer zip files found in ${FMRIPREP_INPUT} +# exit 1 +# fi + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +cd $PROJECTROOT +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +datalad install -d . 
-r --source ${FMRIPREP_INPUT} inputs/data + +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + +SUBJECTS=$(cat ~/fmriprep-bug-check/pnc_exemplars.txt) + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=5G +#$ -l s_vmem=3.5G +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +# change into the cluster-assigned temp directory. Not done by default in SGE +cd ${CBICA_TMPDIR} +#cd /cbica/comp_space/$(basename $HOME) + +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} +datalad clone "${dssource}" ds +cd ds +git remote add outputstore "$pushgitremote" +git checkout -b "${BRANCH}" +# ------------------------------------------------------------------------------ +# Do the run! + +BIDS_DIR=${PWD}/inputs/data/inputs/data +ZIPS_DIR=${PWD}/inputs/data +ERROR_DIR=${PWD}/inputs/freesurfer_logs +CSV_DIR=csvs +mkdir ${CSV_DIR} +datalad get -n inputs/data +INPUT_ZIP=$(ls inputs/data/${subid}_fmriprep*.zip | cut -d '@' -f 1 || true) + +echo DATALAD RUN INPUT +echo ${INPUT_ZIP} +datalad get ${INPUT_ZIP} +datalad unlock ${INPUT_ZIP} + +BOLDREFS=$(bsdtar -tf ${INPUT_ZIP} | grep 152NLin6Asym_res-2_boldref) +MNI_DSEG=$(bsdtar -tf ${INPUT_ZIP} | grep func | grep aparc) +datalad save +set +u +OUTPUTS="" +OUTPUT_FILES="" +for fname in $BOLDREFS $MNI_DSEG +do + OUTPUTS="-o $fname $OUTPUTS" + OUTPUT_FILES="$fname $OUTPUT_FILES" +done + +set -u + +datalad run \ + -i ${INPUT_ZIP} \ + ${OUTPUTS} \ + --explicit \ + -m "get warped files ${subid}" \ + "echo ${OUTPUT_FILES} | xargs 7z x -aoa ${INPUT_ZIP}" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore + +# remove tempdir +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad drop -r . --nocheck +datalad uninstall -r inputs/data +git annex dead here +cd ../.. +rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace + + +EOT + +chmod +x code/participant_job.sh + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Add code for extracting data" + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT + +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh + +cat >> code/merge_outputs.sh << "EOT" +datalad clone ${outputsource} merge_ds +cd merge_ds +NBRANCHES=$(git branch -a | grep job- | sort | wc -l) +echo "Found $NBRANCHES branches to merge" +gitref=$(git show-ref master | cut -d ' ' -f1 | head -n 1) +# query all branches for the most recent commit and check if it is identical. +# Write all branch identifiers for jobs without outputs into a file.
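+# (a job branch whose tip still matches the pre-merge master commit recorded in +# gitref above added no new commits, i.e. that job produced no outputs)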
+for i in $(git branch -a | grep job- | sort); do [ x"$(git show-ref $i \ + | cut -d ' ' -f1)" = x"${gitref}" ] && \ + echo $i; done | tee code/noresults.txt | wc -l +for i in $(git branch -a | grep job- | sort); \ + do [ x"$(git show-ref $i \ + | cut -d ' ' -f1)" != x"${gitref}" ] && \ + echo $i; \ +done | tee code/has_results.txt +mkdir -p code/merge_batches +num_branches=$(wc -l < code/has_results.txt) +CHUNKSIZE=5000 +set +e +num_chunks=$(expr ${num_branches} / ${CHUNKSIZE}) +[[ $num_chunks == 0 ]] && num_chunks=1 +set -e -x +for chunknum in $(seq 1 $num_chunks) +do + startnum=$(expr $(expr ${chunknum} - 1) \* ${CHUNKSIZE} + 1) + endnum=$(expr ${chunknum} \* ${CHUNKSIZE}) + batch_file=code/merge_branches_$(printf %04d ${chunknum}).txt + [[ ${num_branches} -lt ${endnum} ]] && endnum=${num_branches} + branches=$(sed -n "${startnum},${endnum}p;$(expr ${endnum} + 1)q" code/has_results.txt) + echo ${branches} > ${batch_file} + git merge -m "freesurfer results batch ${chunknum}/${num_chunks}" $(cat ${batch_file}) +done +# Push the merge back +git push +# Get the file availability info +git annex fsck --fast -f output-storage +# This should not print anything +MISSING=$(git annex find --not --in output-storage) +if [[ ! -z "$MISSING" ]] +then + echo Unable to find data for $MISSING + exit 1 +fi +# stop tracking this branch +git annex dead here +datalad push --data nothing +echo SUCCESS +EOT + + +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N fp${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-fmriprep-multises-audit.sh b/scripts/cubic/bootstrap-fmriprep-multises-audit.sh new file mode 100644 index 0000000..d0722b8 --- /dev/null +++ b/scripts/cubic/bootstrap-fmriprep-multises-audit.sh @@ -0,0 +1,349 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/fmriprep-multises-audit +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! 
-w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +FMRIPREP_BOOTSTRAP_DIR=$1 +FMRIPREP_INPUT=ria+file://${FMRIPREP_BOOTSTRAP_DIR}"/output_ria#~data" +if [[ -z ${FMRIPREP_BOOTSTRAP_DIR} ]] +then + echo "Required argument is the path to the fmriprep bootstrap directory." + echo "This directory should contain analysis/, input_ria/ and output_ria/." + # exit 1 +fi + +# Is it a directory on the filesystem? +FMRIPREP_INPUT_METHOD=clone +if [[ ! -d "${FMRIPREP_BOOTSTRAP_DIR}/output_ria/alias/data" ]] +then + echo "There must be alias in the output ria store that points to the" + echo "fmriprep output dataset" + # exit 1 +fi + +# Check that there are some fmriprep zip files present in the input +# If you only need freesurfer, comment this out +# FMRIPREP_ZIPS=$(cd ${FMRIPREP_INPUT} && ls *fmriprep*.zip) +# if [[ -z "${FMRIPREP_ZIPS}" ]]; then +# echo No fmriprep zip files found in ${FMRIPREP_INPUT} +# exit 1 +# fi + +# Check that freesurfer data exists. If you only need fmriprep zips, comment +# this out +# FREESURFER_ZIPS=$(cd ${FMRIPREP_INPUT} && ls *freesurfer*.zip) +# if [[ -z "${FREESURFER_ZIPS}" ]]; then +# echo No freesurfer zip files found in ${FMRIPREP_INPUT} +# exit 1 +# fi + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + + +# Create a dataset with the logs in it +mkdir fmriprep_logs +cd fmriprep_logs +datalad create -D "Logs from the fmriprep runs" +cp ${FMRIPREP_BOOTSTRAP_DIR}/analysis/logs/* . +datalad save -m "add logs" + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +cd $PROJECTROOT +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +datalad install -d . -r --source ${FMRIPREP_INPUT} inputs/data +datalad install -d . -r --source ${PROJECTROOT}/fmriprep_logs inputs/fmriprep_logs + +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + +SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 5 ) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=5G +#$ -l s_vmem=3.5G +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` + +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x + +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +sesid="$4" + +# change into the cluster-assigned temp directory. 
Not done by default in SGE +cd ${CBICA_TMPDIR} + +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}-${sesid}" +mkdir ${BRANCH} +cd ${BRANCH} +datalad clone "${dssource}" ds +cd ds +git remote add outputstore "$pushgitremote" +git checkout -b "${BRANCH}" + +# ------------------------------------------------------------------------------ +# Do the run! +BIDS_DIR=${PWD}/inputs/data/inputs/data +ZIPS_DIR=${PWD}/inputs/data +ERROR_DIR=${PWD}/inputs/fmriprep_logs +CSV_DIR=csvs +mkdir ${CSV_DIR} +output_file=${CSV_DIR}/${subid}_${sesid}_fmriprep_audit.csv + +datalad get -n inputs/data + +INPUT_ZIP=$(ls inputs/data/${subid}_${sesid}_fmriprep*.zip | cut -d '@' -f 1 || true) +if [ ! -z "${INPUT_ZIP}" ]; then + INPUT_ZIP="-i ${INPUT_ZIP}" +fi + +echo DATALAD RUN INPUT +echo ${INPUT_ZIP} + +datalad run \ + -i code/bootstrap-fmriprep-multises-audit.py \ + ${INPUT_ZIP} \ + -i inputs/data/inputs/data/${subid} \ + -i inputs/fmriprep_logs/*${subid}*${sesid}* \ + --explicit \ + -o ${output_file} \ + -m "fmriprep-audit ${subid} ${sesid}" \ + "python code/bootstrap-fmriprep-multises-audit.py ${subid}_${sesid} ${BIDS_DIR} ${ZIPS_DIR} ${ERROR_DIR} ${output_file}" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore + +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad uninstall -r --nocheck --if-dirty ignore inputs/data +datalad drop -r . --nocheck +git annex dead here +cd ../.. +rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +# Sydney, please wget your audit script here! +wget https://raw.githubusercontent.com/PennLINC/RBC/master/PennLINC/Generic/bootstrap-fmriprep-multises-audit.py +mv bootstrap-fmriprep-multises-audit.py code/ +chmod +x code/bootstrap-fmriprep-multises-audit.py + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Add logs from fmriprep runs" + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT + +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh + +cat >> code/merge_outputs.sh << "EOT" +datalad clone ${outputsource} merge_ds +cd merge_ds +NBRANCHES=$(git branch -a | grep job- | sort | wc -l) +echo "Found $NBRANCHES branches to merge" + +gitref=$(git show-ref master | cut -d ' ' -f1 | head -n 1) + +# query all branches for the most recent commit and check if it is identical. +# Write all branch identifiers for jobs without outputs into a file. 
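+# Below, job branches are first split into those without and with results, and
+# the result-bearing branches are then merged in batches of CHUNKSIZE (one
+# octopus merge per batch) so that no single merge commit has to take
+# thousands of parents at once.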
+for i in $(git branch -a | grep job- | sort); do [ x"$(git show-ref $i \ + | cut -d ' ' -f1)" = x"${gitref}" ] && \ + echo $i; done | tee code/noresults.txt | wc -l + + +for i in $(git branch -a | grep job- | sort); \ + do [ x"$(git show-ref $i \ + | cut -d ' ' -f1)" != x"${gitref}" ] && \ + echo $i; \ +done | tee code/has_results.txt + +mkdir -p code/merge_batches +num_branches=$(wc -l < code/has_results.txt) +CHUNKSIZE=5000 + +set +e +num_chunks=$(expr ${num_branches} / ${CHUNKSIZE}) +[[ $num_chunks == 0 ]] && num_chunks=1 +set -e -x + +for chunknum in $(seq 1 $num_chunks) +do + startnum=$(expr $(expr ${chunknum} - 1) \* ${CHUNKSIZE} + 1) + endnum=$(expr ${chunknum} \* ${CHUNKSIZE}) + batch_file=code/merge_branches_$(printf %04d ${chunknum}).txt + [[ ${num_branches} -lt ${endnum} ]] && endnum=${num_branches} + branches=$(sed -n "${startnum},${endnum}p;$(expr ${endnum} + 1)q" code/has_results.txt) + echo ${branches} > ${batch_file} + git merge -m "fmriprep results batch ${chunknum}/${num_chunks}" $(cat ${batch_file}) + +done + +# Push the merge back +git push + +# Get the file availability info +git annex fsck --fast -f output-storage + +# This should not print anything +set +u +MISSING=$(git annex find --not --in output-storage) + +if [[ ! -z "$MISSING" ]] +then + echo Unable to find data for $MISSING + exit 1 +fi + +# stop tracking this branch +git annex dead here +datalad push --data nothing +echo SUCCESS + +EOT + +##### concat_outputs.sh START #### + +cat > code/concat_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT + +echo "PROJECT_ROOT=${PROJECTROOT}" >> code/concat_outputs.sh +echo "cd ${PROJECTROOT}" >> code/concat_outputs.sh + +cat >> code/concat_outputs.sh << "EOT" +# set up concat_ds and run concatenator on it +cd ${CBICA_TMPDIR} +datalad clone ria+file://${PROJECT_ROOT}/output_ria#~data concat_ds +cd concat_ds/code +wget https://raw.githubusercontent.com/PennLINC/RBC/master/PennLINC/Generic/concatenator.py +cd .. +datalad save -m "added concatenator script" +datalad run -i 'csvs/*' -o '${PROJECT_ROOT}/FMRIPREP_AUDIT.csv' --expand inputs --explicit "python code/concatenator.py csvs ${PROJECT_ROOT}/FMRIPREP_AUDIT.csv" +datalad save -m "generated report" +# push changes +datalad push +# remove concat_ds +git annex dead here +cd .. +chmod +w -R concat_ds +rm -rf concat_ds +echo SUCCESS + +EOT + +#### concat_output.sh END #### + +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + SESSIONS=$(ls inputs/data/inputs/data/$subject | grep ses- | cut -d '/' -f 1) + for session in ${SESSIONS}; do + echo "qsub -cwd ${env_flags} -N audit${subject}_${session} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} ${session}" >> code/qsub_calls.sh + done +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. 
Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-fmriprep-multises.sh b/scripts/cubic/bootstrap-fmriprep-multises.sh new file mode 100644 index 0000000..3614bf1 --- /dev/null +++ b/scripts/cubic/bootstrap-fmriprep-multises.sh @@ -0,0 +1,401 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/fmriprep-multises +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + + +## Check the BIDS input +BIDSINPUT=$1 +if [[ -z ${BIDSINPUT} ]] +then + echo "Required argument is an identifier of the BIDS source" + # exit 1 +fi + +# Is it a directory on the filesystem? +BIDS_INPUT_METHOD=clone +if [[ -d "${BIDSINPUT}" ]] +then + # Check if it's datalad + BIDS_DATALAD_ID=$(datalad -f '{infos[dataset][id]}' wtf -S \ + dataset -d ${BIDSINPUT} 2> /dev/null || true) + [ "${BIDS_DATALAD_ID}" = 'N/A' ] && BIDS_INPUT_METHOD=copy +fi + + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +# register the input dataset +if [[ "${BIDS_INPUT_METHOD}" == "clone" ]] +then + echo "Cloning input dataset into analysis dataset" + datalad clone -d . 
${BIDSINPUT} inputs/data + # amend the previous commit with a nicer commit message + git commit --amend -m 'Register input data dataset as a subdataset' +else + echo "WARNING: copying input data into repository" + mkdir -p inputs/data + cp -r ${BIDSINPUT}/* inputs/data + datalad save -r -m "added input data" +fi + +SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 3 | sort) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + + +## Add the containers as a subdataset +cd ${PROJECTROOT} + +# Clone the containers dataset. If specified on the command, use that path +CONTAINERDS=$2 +if [[ ! -z "${CONTAINERDS}" ]]; then + datalad clone ${CONTAINERDS} pennlinc-containers +else + echo "No containers dataset specified, attempting to clone from pmacs" + datalad clone \ + ria+ssh://sciget.pmacs.upenn.edu:/project/bbl_projects/containers#~pennlinc-containers \ + pennlinc-containers + cd pennlinc-containers + datalad get -r . + # get rid of the references to pmacs + set +e + datalad siblings remove -s pmacs-ria-storage + git annex dead pmacs-ria-storage + datalad siblings remove -s origin + git annex dead origin + set -e +fi + +cd ${PROJECTROOT}/analysis +datalad install -d . --source ${PROJECTROOT}/pennlinc-containers + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=25G +#$ -l s_vmem=23.5G +#$ -l tmpfree=200G +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` + +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x + +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +sesid="$4" + +# change into the cluster-assigned temp directory. Not done by default in SGE +cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +# cd /cbica/comp_space/$(basename $HOME) + +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}-${sesid}" +mkdir ${BRANCH} +cd ${BRANCH} + +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds + +# all following actions are performed in the context of the superdataset +cd ds + +# in order to avoid accumulation temporary git-annex availability information +# and to avoid a syncronization bottleneck by having to consolidate the +# git-annex branch across jobs, we will only push the main tracking branch +# back to the output store (plus the actual file content). Final availability +# information can be establish via an eventual `git-annex fsck -f joc-storage`. +# this remote is never fetched, it accumulates a larger number of branches +# and we want to avoid progressive slowdown. Instead we only ever push +# a unique branch per each job (subject AND process specific name) +git remote add outputstore "$pushgitremote" + +# all results of this job will be put into a dedicated branch +git checkout -b "${BRANCH}" + +# we pull down the input subject manually in order to discover relevant +# files. We do this outside the recorded call, because on a potential +# re-run we want to be able to do fine-grained recomputing of individual +# outputs. 
The recorded calls will have specific paths that will enable +# recomputation outside the scope of the original setup +datalad get -n "inputs/data/${subid}" + +# Reomve all subjects we're not working on +(cd inputs/data && rm -rf `find . -type d -name 'sub*' | grep -v $subid`) + +# ------------------------------------------------------------------------------ +# Do the run! + +datalad run \ + -i code/fmriprep_zip.sh \ + -i inputs/data/${subid}/${sesid}\ + -i inputs/data/*json \ + -i pennlinc-containers/.datalad/environments/fmriprep-20-2-3/image \ + --explicit \ + -o ${subid}_${sesid}_fmriprep-20.2.3.zip \ + -o ${subid}_${sesid}_freesurfer-20.2.3.zip \ + -m "fmriprep:20.2.3 ${subid} ${sesid}" \ + "bash ./code/fmriprep_zip.sh ${subid} ${sesid}" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore + +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad uninstall -r --nocheck --if-dirty ignore inputs/data +datalad drop -r . --nocheck +git annex dead here +cd ../.. +rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +cat > code/fmriprep_zip.sh << "EOT" +#!/bin/bash +set -e -u -x + +subid="$1" +sesid="$2" + +# Create a filter file that only allows this session +filterfile=${PWD}/${sesid}_filter.json +echo "{" > ${filterfile} +echo "'fmap': {'datatype': 'fmap'}," >> ${filterfile} +echo "'bold': {'datatype': 'func', 'session': '$sesid', 'suffix': 'bold'}," >> ${filterfile} +echo "'sbref': {'datatype': 'func', 'session': '$sesid', 'suffix': 'sbref'}," >> ${filterfile} +echo "'flair': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'FLAIR'}," >> ${filterfile} +echo "'t2w': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'T2w'}," >> ${filterfile} +echo "'t1w': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'T1w'}," >> ${filterfile} +echo "'roi': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'roi'}" >> ${filterfile} +echo "}" >> ${filterfile} + +# remove ses and get valid json +sed -i "s/'/\"/g" ${filterfile} +sed -i "s/ses-//g" ${filterfile} + +mkdir -p ${PWD}/.git/tmp/wdir +singularity run --cleanenv -B ${PWD} \ + pennlinc-containers/.datalad/environments/fmriprep-20-2-3/image \ + inputs/data \ + prep \ + participant \ + -w ${PWD}/.git/tmp/wkdir \ + --n_cpus 1 \ + --stop-on-first-crash \ + --fs-license-file code/license.txt \ + --skip-bids-validation \ + --bids-filter-file "${filterfile}" \ + --output-spaces MNI152NLin6Asym:res-2 \ + --participant-label "$subid" \ + --force-bbr \ + --cifti-output 91k -v -v + +cd prep +7z a ../${subid}_${sesid}_fmriprep-20.2.3.zip fmriprep +7z a ../${subid}_${sesid}_freesurfer-20.2.3.zip freesurfer +rm -rf prep .git/tmp/wkdir +rm ${filterfile} + +EOT + +chmod +x code/fmriprep_zip.sh +cp ${FREESURFER_HOME}/license.txt code/license.txt + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT + +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh + +cat >> code/merge_outputs.sh << "EOT" + +datalad 
clone ${outputsource} merge_ds +cd merge_ds +NBRANCHES=$(git branch -a | grep job- | sort | wc -l) +echo "Found $NBRANCHES branches to merge" + +gitref=$(git show-ref master | cut -d ' ' -f1 | head -n 1) + +# query all branches for the most recent commit and check if it is identical. +# Write all branch identifiers for jobs without outputs into a file. +for i in $(git branch -a | grep job- | sort); do [ x"$(git show-ref $i \ + | cut -d ' ' -f1)" = x"${gitref}" ] && \ + echo $i; done | tee code/noresults.txt | wc -l + + +for i in $(git branch -a | grep job- | sort); \ + do [ x"$(git show-ref $i \ + | cut -d ' ' -f1)" != x"${gitref}" ] && \ + echo $i; \ +done | tee code/has_results.txt + +mkdir -p code/merge_batches +num_branches=$(wc -l < code/has_results.txt) +CHUNKSIZE=5000 +set +e +num_chunks=$(expr ${num_branches} / ${CHUNKSIZE}) +if [[ $num_chunks == 0 ]]; then + num_chunks=1 +fi +set -e +for chunknum in $(seq 1 $num_chunks) +do + startnum=$(expr $(expr ${chunknum} - 1) \* ${CHUNKSIZE} + 1) + endnum=$(expr ${chunknum} \* ${CHUNKSIZE}) + batch_file=code/merge_branches_$(printf %04d ${chunknum}).txt + [[ ${num_branches} -lt ${endnum} ]] && endnum=${num_branches} + branches=$(sed -n "${startnum},${endnum}p;$(expr ${endnum} + 1)q" code/has_results.txt) + echo ${branches} > ${batch_file} + git merge -m "fmriprep results batch ${chunknum}/${num_chunks}" $(cat ${batch_file}) + +done + +# Push the merge back +git push + +# Get the file availability info +git annex fsck --fast -f output-storage + +# This should not print anything +MISSING=$(git annex find --not --in output-storage) + +if [[ ! -z "$MISSING" ]] +then + echo Unable to find data for $MISSING + exit 1 +fi + +# stop tracking this branch +git annex dead here + +datalad push --data nothing +echo SUCCESS + +EOT + + +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + SESSIONS=$(ls inputs/data/$subject | grep ses- | cut -d '/' -f 1) + for session in ${SESSIONS}; do + echo "qsub -cwd ${env_flags} -N fp${subject}_${session} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} ${session}" >> code/qsub_calls.sh + done +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +if [ "${BIDS_INPUT_METHOD}" = "clone" ] +then + datalad uninstall -r --nocheck inputs/data +fi + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. 
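+# The pushes below publish the analysis dataset to both RIA stores, and the
+# alias added afterwards lets downstream bootstraps (such as the audit scripts)
+# clone the results with a human-readable URL instead of the dataset UUID.
+# A sketch of such a clone ("downstream_clone" is just an illustrative name):
+#   datalad clone ria+file://${PROJECTROOT}/output_ria#~data downstream_clone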
+datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-fmriprep.sh b/scripts/cubic/bootstrap-fmriprep.sh new file mode 100644 index 0000000..f60cb14 --- /dev/null +++ b/scripts/cubic/bootstrap-fmriprep.sh @@ -0,0 +1,305 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/fmriprep +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + + +## Check the BIDS input +BIDSINPUT=$1 +if [[ -z ${BIDSINPUT} ]] +then + echo "Required argument is an identifier of the BIDS source" + # exit 1 +fi + +# Is it a directory on the filesystem? +BIDS_INPUT_METHOD=clone +if [[ -d "${BIDSINPUT}" ]] +then + # Check if it's datalad + BIDS_DATALAD_ID=$(datalad -f '{infos[dataset][id]}' wtf -S \ + dataset -d ${BIDSINPUT} 2> /dev/null || true) + [ "${BIDS_DATALAD_ID}" = 'N/A' ] && BIDS_INPUT_METHOD=copy +fi + + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +# register the input dataset +if [[ "${BIDS_INPUT_METHOD}" == "clone" ]] +then + echo "Cloning input dataset into analysis dataset" + datalad clone -d . ${BIDSINPUT} inputs/data + # amend the previous commit with a nicer commit message + git commit --amend -m 'Register input data dataset as a subdataset' +else + echo "WARNING: copying input data into repository" + mkdir -p inputs/data + cp -r ${BIDSINPUT}/* inputs/data + datalad save -r -m "added input data" +fi + +SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 3 ) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +set +u +CONTAINERDS=$2 +set -u +#if [[ ! 
-z "${CONTAINERDS}" ]]; then +cd ${PROJECTROOT} +datalad clone ${CONTAINERDS} pennlinc-containers +## Add the containers as a subdataset +#datalad clone ria+ssh://sciget.pmacs.upenn.edu:/project/bbl_projects/containers#~pennlinc-containers pennlinc-containers +# download the image so we don't ddos pmacs +cd pennlinc-containers +datalad get -r . +# get rid of the references to pmacs +#set +e +#datalad siblings remove -s pmacs-ria-storage +#datalad siblings remove -s origin +#set -e + +cd ${PROJECTROOT}/analysis +datalad install -d . --source ${PROJECTROOT}/pennlinc-containers + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=25G +#$ -l tmpfree=200G +#$ -R y +#$ -l h_rt=24:00:00 +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` + +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x + +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" + +# change into the cluster-assigned temp directory. Not done by default in SGE +cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +# cd /cbica/comp_space/$(basename $HOME) + +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} + +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds + +# all following actions are performed in the context of the superdataset +cd ds + +# in order to avoid accumulation temporary git-annex availability information +# and to avoid a syncronization bottleneck by having to consolidate the +# git-annex branch across jobs, we will only push the main tracking branch +# back to the output store (plus the actual file content). Final availability +# information can be establish via an eventual `git-annex fsck -f joc-storage`. +# this remote is never fetched, it accumulates a larger number of branches +# and we want to avoid progressive slowdown. Instead we only ever push +# a unique branch per each job (subject AND process specific name) +git remote add outputstore "$pushgitremote" + +# all results of this job will be put into a dedicated branch +git checkout -b "${BRANCH}" + +# we pull down the input subject manually in order to discover relevant +# files. We do this outside the recorded call, because on a potential +# re-run we want to be able to do fine-grained recomputing of individual +# outputs. The recorded calls will have specific paths that will enable +# recomputation outside the scope of the original setup +datalad get -n "inputs/data/${subid}" + +# Reomve all subjects we're not working on +(cd inputs/data && rm -rf `find . -type d -name 'sub*' | grep -v $subid`) + +# ------------------------------------------------------------------------------ +# Do the run! 
+ +datalad run \ + -i code/fmriprep_zip.sh \ + -i inputs/data/${subid} \ + -i inputs/data/*json \ + -i pennlinc-containers/.datalad/environments/fmriprep-20-2-3/image \ + --explicit \ + -o ${subid}_fmriprep-20.2.3.zip \ + -o ${subid}_freesurfer-20.2.3.zip \ + -m "fmriprep:20.2.3 ${subid}" \ + "bash ./code/fmriprep_zip.sh ${subid}" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore + +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad uninstall -r --nocheck --if-dirty ignore inputs/data +datalad drop -r . --nocheck +git annex dead here +cd ../.. +rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +cat > code/fmriprep_zip.sh << "EOT" +#!/bin/bash +set -e -u -x + +subid="$1" +mkdir -p ${PWD}/.git/tmp/wdir +singularity run --cleanenv -B ${PWD} \ + pennlinc-containers/.datalad/environments/fmriprep-20-2-3/image \ + inputs/data \ + prep \ + participant \ + -w ${PWD}/.git/tmp/wkdir \ + --n_cpus 1 \ + --stop-on-first-crash \ + --fs-license-file code/license.txt \ + --skip-bids-validation \ + --output-spaces MNI152NLin6Asym:res-2 \ + --participant-label "$subid" \ + --force-bbr \ + --cifti-output 91k -v -v + +cd prep +7z a ../${subid}_fmriprep-20.2.3.zip fmriprep +7z a ../${subid}_freesurfer-20.2.3.zip freesurfer +rm -rf prep .git/tmp/wkdir + +EOT + +chmod +x code/fmriprep_zip.sh +cp ${FREESURFER_HOME}/license.txt code/license.txt + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N fp${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +if [ "${BIDS_INPUT_METHOD}" = "clone" ] +then + datalad uninstall -r --nocheck inputs/data +fi + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. 
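+# Jobs clone the analysis dataset from the input store (that is what dssource
+# in code/qsub_calls.sh points at) and push their result branches to the output
+# store, so both stores need the fully configured dataset before any job is
+# submitted.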
+datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-freesurfer-audit.sh b/scripts/cubic/bootstrap-freesurfer-audit.sh new file mode 100644 index 0000000..5efde91 --- /dev/null +++ b/scripts/cubic/bootstrap-freesurfer-audit.sh @@ -0,0 +1,242 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/freesurfer-audit +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +FMRIPREP_BOOTSTRAP_DIR=$1 +FREESURFER_INPUT=ria+file://${FMRIPREP_BOOTSTRAP_DIR}"/output_ria#~data" +if [[ -z ${FMRIPREP_BOOTSTRAP_DIR} ]] +then + echo "Required argument is the path to the freesurfer bootstrap directory." + echo "This directory should contain analysis/, input_ria/ and output_ria/." + # exit 1 +fi + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +cd $PROJECTROOT +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +datalad install -d . -r --source ${FREESURFER_INPUT} inputs/data +datalad uninstall inputs/data/inputs/data + +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + +SUBJECTS=$(find inputs/data -name '*.zip' | cut -d '/' -f 3 | cut -d '_' -f 1 | sort | uniq) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=8G +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +# change into the cluster-assigned temp directory. 
Not done by default in SGE +# cd ${CBICA_TMPDIR} + +TMPDIR=${CBICA_TMPDIR} +cd $TMPDIR + +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} +datalad clone "${dssource}" ds +cd ds +git remote add outputstore "$pushgitremote" +git checkout -b "${BRANCH}" +# ------------------------------------------------------------------------------ +# Do the run! +ZIPS_DIR=${PWD}/inputs/data + +datalad get -n inputs/data +FS_INPUT_ZIP=$(ls inputs/data/${subid}_freesurfer*.zip | cut -d '@' -f 1 || true) +if [ ! -z "${FS_INPUT_ZIP}" ]; then + FS_INPUT_ZIP="-i ${FS_INPUT_ZIP}" +fi + +FMRI_INPUT_ZIP=$(ls inputs/data/${subid}_fmriprep*.zip | cut -d '@' -f 1 || true) +if [ ! -z "${FMRI_INPUT_ZIP}" ]; then + FMRI_INPUT_ZIP="-i ${FMRI_INPUT_ZIP}" +fi + +echo DATALAD RUN INPUT +echo ${FS_INPUT_ZIP} +echo ${FMRI_INPUT_ZIP} +datalad run \ + -i code/fs_euler_checker_and_plots_simplified.py \ + ${FS_INPUT_ZIP} \ + ${FMRI_INPUT_ZIP} \ + --explicit \ + -o csvs \ + -o svg \ + -m "freesurfer-audit ${subid}" \ + "python code/fs_euler_checker_and_plots_simplified.py ${subid} ${ZIPS_DIR}" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore + +# remove tempdir +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad uninstall --nocheck --if-dirty ignore -r inputs/data +datalad drop -r . --nocheck +git annex dead here +cd ../.. +rm -rf $BRANCH + +echo SUCCESS + +EOT + +chmod +x code/participant_job.sh + +# Sydney, please wget your audit script here! +wget https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/generic/fs_euler_checker_and_plots_simplified.py +mv fs_euler_checker_and_plots_simplified.py code/ +chmod +x code/fs_euler_checker_and_plots_simplified.py + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT + + +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + +##### concat_outputs.sh START #### + +cat > code/concat_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT + +echo "PROJECT_ROOT=${PROJECTROOT}" >> code/concat_outputs.sh +echo "cd ${PROJECTROOT}" >> code/concat_outputs.sh + +cat >> code/concat_outputs.sh << "EOT" +# set up concat_ds and run concatenator on it +cd ${CBICA_TMPDIR} +datalad clone ria+file://${PROJECT_ROOT}/output_ria#~data concat_ds +cd concat_ds/code +wget https://raw.githubusercontent.com/PennLINC/RBC/master/PennLINC/Generic/concatenator.py +cd .. +datalad save -m "added concatenator script" +datalad run -i 'csvs/*' -o 'concat_ds/group_report.csv' --expand inputs --explicit "python code/concatenator.py concat_ds/csvs ${PROJECT_ROOT}/XCP_AUDIT.csv" +datalad save -m "generated report" +# push changes +datalad push +# remove concat_ds +git annex dead here +cd .. 
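+# annexed content is read-only by default, so the clone has to be made
+# writable before it can be removed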
+chmod +w -R concat_ds
+rm -rf concat_ds
+echo SUCCESS
+
+EOT
+
+#### concat_output.sh END ####
+
+env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock"
+
+echo '#!/bin/bash' > code/qsub_calls.sh
+dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)"
+pushgitremote=$(git remote get-url --push output)
+eo_args="-e ${PWD}/logs -o ${PWD}/logs"
+for subject in ${SUBJECTS}; do
+  echo "qsub -cwd ${env_flags} -N fp${subject} ${eo_args} \
+      ${PWD}/code/participant_job.sh \
+      ${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh
+done
+datalad save -m "SGE submission setup" code/ .gitignore
+
+################################################################################
+# SGE SETUP END
+################################################################################
+
+# cleanup - we have generated the job definitions, we do not need to keep a
+# massive input dataset around. Having it around wastes resources and makes many
+# git operations needlessly slow
+datalad uninstall -r --nocheck inputs/data
+
+# make sure the fully configured output dataset is available from the designated
+# store for initial cloning and pushing the results.
+datalad push --to input
+datalad push --to output
+
+# Add an alias to the data in the RIA store
+RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1)
+mkdir -p ${PROJECTROOT}/output_ria/alias
+ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data
+
+# if we get here, we are happy
+echo SUCCESS
diff --git a/scripts/cubic/bootstrap-hcpd-bidsify.sh b/scripts/cubic/bootstrap-hcpd-bidsify.sh
new file mode 100644
index 0000000..5298751
--- /dev/null
+++ b/scripts/cubic/bootstrap-hcpd-bidsify.sh
@@ -0,0 +1,217 @@
+## NOTE ##
+# This workflow is derived from the Datalad Handbook
+
+## Ensure the environment is ready to bootstrap the analysis workspace
+# Check that we have conda installed
+
+DATALAD_VERSION=$(datalad --version)
+
+if [ $? -gt 0 ]; then
+    echo "No datalad available in your conda environment."
+    echo "Try pip install datalad"
+    # exit 1
+fi
+
+echo USING DATALAD VERSION ${DATALAD_VERSION}
+
+set -e -u
+
+
+## Set up the directory that will contain the necessary directories
+PROJECTROOT=${PWD}/BIDSIFY_HCPD
+if [[ -d ${PROJECTROOT} ]]
+then
+    echo ${PROJECTROOT} already exists
+    # exit 1
+fi
+
+if [[ ! -w $(dirname ${PROJECTROOT}) ]]
+then
+    echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry
+    # exit 1
+fi
+
+mkdir -p $PROJECTROOT
+
+## S3 downloaded dir will be the path to the data downloaded from S3
+BIDSINPUT=$1
+if [[ -z ${BIDSINPUT} ]]
+then
+    echo "Required argument is an identifier of the BIDS source"
+    # exit 1
+fi
+
+# Is it a directory on the filesystem?
+BIDS_INPUT_METHOD=clone
+if [[ -d "${BIDSINPUT}" ]]
+then
+    # Check if it's datalad
+    BIDS_DATALAD_ID=$(datalad -f '{infos[dataset][id]}' wtf -S \
+        dataset -d ${BIDSINPUT} 2> /dev/null || true)
+    [ "${BIDS_DATALAD_ID}" = 'N/A' ] && BIDS_INPUT_METHOD=copy
+fi
+
+cd ${PROJECTROOT}
+# Jobs are set up to not require a shared filesystem (except for the lockfile)
+# ------------------------------------------------------------------------------
+# RIA-URL to a different RIA store from which the dataset will be cloned from.
+# Both RIA stores will be created
+input_store="ria+file://${PROJECTROOT}/input_ria"
+output_store="ria+file://${PROJECTROOT}/output_ria"
+
+# Create a source dataset with all analysis components as an analysis access
+# point.
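+# The yoda procedure used below pre-populates the new dataset with a code/
+# directory and README templates, and configures code/ so that scripts are
+# tracked directly in git rather than in the annex.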
+cd $PROJECTROOT +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +# register the input dataset +if [[ "${BIDS_INPUT_METHOD}" == "clone" ]] +then + echo "Cloning input dataset into analysis dataset" + datalad clone -d . ${BIDSINPUT} inputs/data + # amend the previous commit with a nicer commit message + git commit --amend -m 'Register input data dataset as a subdataset' +else + echo "WARNING: copying input data into repository" + mkdir -p inputs/data + cp -r ${BIDSINPUT}/* inputs/data + datalad save -r -m "added input data" +fi + +SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 3 ) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=25G +#$ -l tmpfree=200G +#$ -R y +#$ -l h_rt=24:00:00 +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +# change into the cluster-assigned temp directory. Not done by default in SGE +cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +# cd /cbica/comp_space/$(basename $HOME) +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds +# all following actions are performed in the context of the superdataset +cd ds +git remote add outputstore "$pushgitremote" +git checkout -b "${BRANCH}" +# ------------------------------------------------------------------------------ +# Do the run! + +datalad run \ + -i ${subid}_V1_MR \ + --explicit \ + -o ${subid}_V1_MR \ + -o ${rename_subid} \ + -m "rename for ${subid}" \ + "python bidsify_hcpd.py ${subid}_V1_MR" +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore +echo TMPDIR TO DELETE +echo ${BRANCH} +datalad drop -r . --nocheck +datalad uninstall -r inputs/data +git annex dead here +cd ../.. 
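+# the ephemeral clone was marked dead and its content dropped above, so
+# deleting the job directory does not orphan any annexed data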
+rm -rf $BRANCH +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +## the actual compute job specification +wget https://raw.githubusercontent.com/PennLINC/RBC/master/PennLINC/HBN_BIDS_Fix/hcpd_bidsify.py +mv hcpd_bidsify.py code/ +chmod +x code/hcpd_bidsify.py + + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" + +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N fp${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-hcpya-xcp.sh b/scripts/cubic/bootstrap-hcpya-xcp.sh new file mode 100644 index 0000000..f1b265e --- /dev/null +++ b/scripts/cubic/bootstrap-hcpya-xcp.sh @@ -0,0 +1,294 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/xcp +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. 
Change permissions and retry + # exit 1 +fi + +## hcp input +BIDSINPUT=$1 +if [[ -z ${BIDSINPUT} ]] +then + echo "Required argument is an identifier of the BIDS source" + # exit 1 +fi + +# Is it a directory on the filesystem? +BIDS_INPUT_METHOD=clone +if [[ -d "${BIDSINPUT}" ]] +then + # Check if it's datalad + BIDS_DATALAD_ID=$(datalad -f '{infos[dataset][id]}' wtf -S \ + dataset -d ${BIDSINPUT} 2> /dev/null || true) + [ "${BIDS_DATALAD_ID}" = 'N/A' ] && BIDS_INPUT_METHOD=copy +fi + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +#get the workhorse script +wget -O code/xcp-hcpya-bootstrap.py https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/xcp-hcpya-bootstrap.py + +cat >> code/dataset_description.json << "EOT" +{ + "Name": "fMRIPrep - fMRI PREProcessing workflow", + "BIDSVersion": "1.4.0", + "DatasetType": "derivative", + "GeneratedBy": [ + { + "Name": "fMRIPrep", + "Version": "20.2.1", + "CodeURL": "https://github.com/nipreps/fmriprep/archive/20.2.1.tar.gz" + } + ], + "HowToAcknowledge": "Please cite our paper (https://doi.org/10.1038/s41592-018-0235-4), and include the generated citation boilerplate within the Methods section of the text." +} +EOT + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + + +echo "Cloning input dataset into analysis dataset" +datalad clone -d . ${BIDSINPUT} inputs/data +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + + +SUBJECTS=$(find inputs/data/HCP1200/ -maxdepth 1 | cut -d '/' -f 4 | cut -d '_' -f 1 | sort | uniq) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +cd ${PROJECTROOT} + +# Clone the containers dataset. If specified on the command, use that path + +#MUST BE AS NOT RBC USER +# build the container in /cbica/projects/hcpya/dropbox +# singularity build xcp-abcd-0.0.4.sif docker://pennlinc/xcp_abcd:0.0.4 + +#AS RBC +# then copy to /cbica/projects/hcpya/xcp-abcd-container +# datalad create -D "xcp-abcd container". +# do that actual copy +# datalad containers-add --url ~/dropbox/xcp-abcd-0.0.4.sif xcp-abcd-0.0.4 --update + +#can delete +#rm /cbica/projects/hcpya/dropbox/xcp-abcd-0.0.4.sif + +CONTAINERDS=~/xcp-abcd-container +datalad clone ${CONTAINERDS} pennlinc-containers + +cd ${PROJECTROOT}/analysis +datalad install -d . 
--source ${PROJECTROOT}/pennlinc-containers + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=12G + +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +# change into the cluster-assigned temp directory. Not done by default in SGE +#cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +cd /cbica/comp_space/$(basename $HOME) +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds +# all following actions are performed in the context of the superdataset +cd ds +# in order to avoid accumulation temporary git-annex availability information +# and to avoid a syncronization bottleneck by having to consolidate the +# git-annex branch across jobs, we will only push the main tracking branch +# back to the output store (plus the actual file content). Final availability +# information can be establish via an eventual `git-annex fsck -f joc-storage`. +# this remote is never fetched, it accumulates a larger number of branches +# and we want to avoid progressive slowdown. Instead we only ever push +# a unique branch per each job (subject AND process specific name) +git remote add outputstore "$pushgitremote" +# all results of this job will be put into a dedicated branch +git checkout -b "${BRANCH}" +echo GIT CHECKOUT FINISHED +# we pull down the input subject manually in order to discover relevant +# files. We do this outside the recorded call, because on a potential +# re-run we want to be able to do fine-grained recomputing of individual +# outputs. The recorded calls will have specific paths that will enable +# recomputation outside the scope of the original setup +# ------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ +# Do the run! 
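+# The container subdataset is fetched first; the datalad run call then pulls
+# down only the HCP-style inputs listed below (CIFTI dense time series,
+# volumetric runs, motion and nuisance regressor text files, ROI and brainmask
+# images) and records the single xcp zip as its output.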
+datalad get -r pennlinc-containers +echo GET CONTAINERS FINISHED +datalad run \ + -i code/xcp-hcpya-bootstrap.py \ + -i code/dataset_description.json \ + -i inputs/data/HCP1200/${subid}/MNINonLinear/Results/**/*Atlas_MSMAll.dtseries.nii \ + -i inputs/data/HCP1200/${subid}/MNINonLinear/Results/**/*LR.nii.gz* \ + -i inputs/data/HCP1200/${subid}/MNINonLinear/Results/**/*RL.nii.gz* \ + -i inputs/data/HCP1200/${subid}/MNINonLinear/Results/**/Movement_AbsoluteRMS.txt \ + -i inputs/data/HCP1200/${subid}/MNINonLinear/Results/**/Movement_Regressors.txt \ + -i inputs/data/HCP1200/${subid}/MNINonLinear/Results/**/SBRef_dc.nii.gz \ + -i inputs/data/HCP1200/${subid}/MNINonLinear/Results/**/*SBRef.nii.gz* \ + -i inputs/data/HCP1200/${subid}/MNINonLinear/Results/**/*CSF.txt* \ + -i inputs/data/HCP1200/${subid}/MNINonLinear/Results/**/*WM.txt* \ + -i inputs/data/HCP1200/${subid}/MNINonLinear/ROIs/*2.nii.gz* \ + -i inputs/data/HCP1200/${subid}/MNINonLinear/Results/**/brainmask_fs.2.nii.gz \ + --explicit \ + -o ${subid}_xcp-0-0-4.zip \ + -m "xcp-abcd-run ${subid}" \ + "python code/xcp-hcpya-bootstrap.py ${subid}" +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore + +# remove tempdir +echo TMPDIR TO DELETE +echo ${BRANCH} +datalad uninstall --nocheck --if-dirty ignore -r inputs/data +datalad drop -r . --nocheck +git annex dead here +cd ../.. +rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + + +cp ${FREESURFER_HOME}/license.txt code/license.txt + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + + + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N xcp${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh +done +chmod a+x code/qsub_calls.sh +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. 
Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + + + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS + +#run last sge call to test +#$(tail -n 1 code/qsub_calls.sh) + +#submit the jobs as a job +#chmod a+x code/qsub_calls.sh +#qsub -l h_vmem=4G,s_vmem=4G -V -j y -b y -o /cbica/projects/hcpya/xcp/analysis/logs code/qsub_calls.sh diff --git a/scripts/cubic/bootstrap-qsiprep-audit.sh b/scripts/cubic/bootstrap-qsiprep-audit.sh new file mode 100644 index 0000000..87a6af1 --- /dev/null +++ b/scripts/cubic/bootstrap-qsiprep-audit.sh @@ -0,0 +1,319 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/qsiprep-audit +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +QSIPREP_BOOTSTRAP_DIR=$1 +QSIPREP_INPUT=ria+file://${QSIPREP_BOOTSTRAP_DIR}"/output_ria#~data" +if [[ -z ${QSIPREP_BOOTSTRAP_DIR} ]] +then + echo "Required argument is the path to the qsiprep bootstrap directory." + echo "This directory should contain analysis/, input_ria/ and output_ria/." + # exit 1 +fi + +# Is it a directory on the filesystem? +QSIPREP_INPUT_METHOD=clone +if [[ ! -d "${QSIPREP_BOOTSTRAP_DIR}/output_ria/alias/data" ]] +then + echo "There must be alias in the output ria store that points to the" + echo "qsiprep output dataset" + # exit 1 +fi + +# Check that there are some qsiprep zip files present in the input +# If you only need freesurfer, comment this out +# QSIPREP_ZIPS=$(cd ${QSIPREP_INPUT} && ls *qsiprep*.zip) +# if [[ -z "${QSIPREP_ZIPS}" ]]; then +# echo No qsiprep zip files found in ${QSIPREP_INPUT} +# exit 1 +# fi + +# Check that freesurfer data exists. If you only need qsiprep zips, comment +# this out +# FREESURFER_ZIPS=$(cd ${QSIPREP_INPUT} && ls *freesurfer*.zip) +# if [[ -z "${FREESURFER_ZIPS}" ]]; then +# echo No freesurfer zip files found in ${QSIPREP_INPUT} +# exit 1 +# fi + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + + +# Create a dataset with the logs in it +mkdir qsiprep_logs +cd qsiprep_logs +datalad create -D "Logs from the qsiprep runs" +cp ${QSIPREP_BOOTSTRAP_DIR}/analysis/logs/* . +datalad save -m "add logs" + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. 
+# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +cd $PROJECTROOT +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +datalad install -d . -r --source ${QSIPREP_INPUT} inputs/data +datalad install -d . -r --source ${PROJECTROOT}/qsiprep_logs inputs/qsiprep_logs + +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + +SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 5 ) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=5G +#$ -l s_vmem=3.5G +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +# change into the cluster-assigned temp directory. Not done by default in SGE +cd ${CBICA_TMPDIR} +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} +datalad clone "${dssource}" ds +cd ds +git remote add outputstore "$pushgitremote" +git checkout -b "${BRANCH}" +# ------------------------------------------------------------------------------ +# Do the run! +BIDS_DIR=${PWD}/inputs/data/inputs/data +ZIPS_DIR=${PWD}/inputs/data +ERROR_DIR=${PWD}/inputs/qsiprep_logs +CSV_DIR=csvs +mkdir ${CSV_DIR} +output_file=${CSV_DIR}/${subid}_qsiprep_audit.csv +datalad get -n inputs/data +INPUT_ZIP=$(ls inputs/data/${subid}_qsiprep*.zip | cut -d '@' -f 1 || true) +if [ ! -z "${INPUT_ZIP}" ]; then + INPUT_ZIP="-i ${INPUT_ZIP}" +fi +echo DATALAD RUN INPUT +echo ${INPUT_ZIP} +datalad run \ + -i code/bootstrap_zip_audit.py \ + ${INPUT_ZIP} \ + -i inputs/qsiprep_logs/*${subid}* \ + --explicit \ + -o ${output_file} \ + -m "qsiprep-audit ${subid}" \ + "python code/bootstrap_zip_audit.py ${subid} ${BIDS_DIR} ${ZIPS_DIR} ${ERROR_DIR} ${output_file} qsiprep" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore + +# remove tempdir +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad uninstall -r --nocheck --if-dirty ignore inputs/data +datalad drop -r . --nocheck +git annex dead here +cd ../.. +rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +# Sydney, please wget your audit script here! 
+wget https://raw.githubusercontent.com/PennLINC/RBC/master/PennLINC/Generic/bootstrap_zip_audit.py +mv bootstrap_zip_audit.py code/ +chmod +x code/bootstrap_zip_audit.py + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Add logs from qsiprep runs" + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT + +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh + +cat >> code/merge_outputs.sh << "EOT" +datalad clone ${outputsource} merge_ds +cd merge_ds +NBRANCHES=$(git branch -a | grep job- | sort | wc -l) +echo "Found $NBRANCHES branches to merge" +gitref=$(git show-ref master | cut -d ' ' -f1 | head -n 1) +# query all branches for the most recent commit and check if it is identical. +# Write all branch identifiers for jobs without outputs into a file. +for i in $(git branch -a | grep job- | sort); do [ x"$(git show-ref $i \ + | cut -d ' ' -f1)" = x"${gitref}" ] && \ + echo $i; done | tee code/noresults.txt | wc -l +for i in $(git branch -a | grep job- | sort); \ + do [ x"$(git show-ref $i \ + | cut -d ' ' -f1)" != x"${gitref}" ] && \ + echo $i; \ +done | tee code/has_results.txt +mkdir -p code/merge_batches +num_branches=$(wc -l < code/has_results.txt) +CHUNKSIZE=5000 +set +e +num_chunks=$(expr ${num_branches} / ${CHUNKSIZE}) +[[ $num_chunks == 0 ]] && num_chunks=1 +set -e -x +for chunknum in $(seq 1 $num_chunks) +do + startnum=$(expr $(expr ${chunknum} - 1) \* ${CHUNKSIZE} + 1) + endnum=$(expr ${chunknum} \* ${CHUNKSIZE}) + batch_file=code/merge_branches_$(printf %04d ${chunknum}).txt + [[ ${num_branches} -lt ${endnum} ]] && endnum=${num_branches} + branches=$(sed -n "${startnum},${endnum}p;$(expr ${endnum} + 1)q" code/has_results.txt) + echo ${branches} > ${batch_file} + git merge -m "qsiprep results batch ${chunknum}/${num_chunks}" $(cat ${batch_file}) +done +# Push the merge back +git push +# Get the file availability info +git annex fsck --fast -f output-storage +# This should not print anything +MISSING=$(git annex find --not --in output-storage) +if [[ ! -z "$MISSING" ]] +then + echo Unable to find data for $MISSING + exit 1 +fi +# stop tracking this branch +git annex dead here +datalad push --data nothing +echo SUCCESS +EOT + +##### concat_outputs.sh START #### + +cat > code/concat_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT + +echo "PROJECT_ROOT=${PROJECTROOT}" >> code/concat_outputs.sh +echo "cd ${PROJECTROOT}" >> code/concat_outputs.sh + +cat >> code/concat_outputs.sh << "EOT" +# set up concat_ds and run concatenator on it +cd ${CBICA_TMPDIR} +datalad clone ria+file://${PROJECT_ROOT}/output_ria#~data concat_ds +cd concat_ds/code +wget https://raw.githubusercontent.com/PennLINC/RBC/master/PennLINC/Generic/concatenator.py +cd .. +datalad save -m "added concatenator script" +datalad run -i 'csvs/*' -o '${PROJECT_ROOT}/QSIPREP_AUDIT.csv' --expand inputs --explicit "python code/concatenator.py csvs ${PROJECT_ROOT}/QSIPREP_AUDIT.csv" +datalad save -m "generated report" +# push changes +datalad push +# remove concat_ds +git annex dead here +cd .. 
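# Optional check -- a small sketch, not part of the original script: confirm that
# the concatenated audit CSV produced by the datalad run above actually exists
# before the temporary clone is removed below.
if [ ! -s "${PROJECT_ROOT}/QSIPREP_AUDIT.csv" ]; then
    echo "WARNING: ${PROJECT_ROOT}/QSIPREP_AUDIT.csv was not created" >&2
fi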
+chmod +w -R concat_ds +rm -rf concat_ds +echo SUCCESS + +EOT + +#### concat_output.sh END #### + + + +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N fp${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# if we get here, we are happy +echo SUCCESS + diff --git a/scripts/cubic/bootstrap-qsiprep-multises.sh b/scripts/cubic/bootstrap-qsiprep-multises.sh new file mode 100644 index 0000000..951c318 --- /dev/null +++ b/scripts/cubic/bootstrap-qsiprep-multises.sh @@ -0,0 +1,302 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/qsiprep-multises +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + + +## Check the BIDS input +BIDSINPUT=$1 +if [[ -z ${BIDSINPUT} ]] +then + echo "Required argument is an identifier of the BIDS source" + # exit 1 +fi + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. 
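# Small addition (not in the original): record the RIA URLs in the bootstrap
# log so it is obvious which stores this analysis dataset is wired to.
echo "Input RIA store:  ${input_store}"
echo "Output RIA store: ${output_store}"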
+datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +# register the input dataset +echo "Cloning input dataset into analysis dataset" +datalad clone -d . ${BIDSINPUT} inputs/data +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + +SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 3 | sort) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +# Clone the containers dataset. If specified on the command, use that path +CONTAINERDS=$2 +## Add the containers as a subdataset +cd ${PROJECTROOT} + +if [[ ! -z "${CONTAINERDS}" ]]; then + datalad clone ${CONTAINERDS} pennlinc-containers +else + echo ERROR: requires a container dataset + exit 1 +fi + +cd ${PROJECTROOT}/analysis +datalad install -d . --source ${PROJECTROOT}/pennlinc-containers + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=32G +#$ -l tmpfree=200G +#$ -pe threaded 6 + +# Set up the correct conda environment +echo I\'m in $PWD using `which python` + +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x + +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +sesid="$4" + +# change into the cluster-assigned temp directory. Not done by default in SGE +cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +# cd /cbica/comp_space/$(basename $HOME) + +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}-${sesid}" +mkdir ${BRANCH} +cd ${BRANCH} + +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds + +# all following actions are performed in the context of the superdataset +cd ds + +# in order to avoid accumulation temporary git-annex availability information +# and to avoid a syncronization bottleneck by having to consolidate the +# git-annex branch across jobs, we will only push the main tracking branch +# back to the output store (plus the actual file content). Final availability +# information can be establish via an eventual `git-annex fsck -f joc-storage`. +# this remote is never fetched, it accumulates a larger number of branches +# and we want to avoid progressive slowdown. Instead we only ever push +# a unique branch per each job (subject AND process specific name) +git remote add outputstore "$pushgitremote" + +# all results of this job will be put into a dedicated branch +git checkout -b "${BRANCH}" + +# we pull down the input subject manually in order to discover relevant +# files. We do this outside the recorded call, because on a potential +# re-run we want to be able to do fine-grained recomputing of individual +# outputs. The recorded calls will have specific paths that will enable +# recomputation outside the scope of the original setup +datalad get -n "inputs/data/${subid}" + +# Reomve all subjects we're not working on +(cd inputs/data && rm -rf `find . -type d -name 'sub*' | grep -v $subid`) + +# ------------------------------------------------------------------------------ +# Do the run! 
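# Optional pre-flight check -- a sketch, not part of the original workflow:
# after the lightweight get of the subject above, warn if the requested session
# directory is missing so a typo in the qsub call is caught before the long
# qsiprep run starts.
if [ ! -d "inputs/data/${subid}/${sesid}" ]; then
    echo "WARNING: inputs/data/${subid}/${sesid} not found" >&2
fi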
+ +datalad run \ + -i code/qsiprep_zip.sh \ + -i inputs/data/${subid}/${sesid} \ + -i "inputs/data/*json" \ + -i pennlinc-containers/.datalad/environments/qsiprep-0-14-2/image \ + --expand inputs \ + --explicit \ + -o ${subid}_${sesid}_qsiprep-0.14.2.zip \ + -m "qsiprep:0.14.2 ${subid} ${sesid}" \ + "bash ./code/qsiprep_zip.sh ${subid} ${sesid}" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore + +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad uninstall -r --nocheck --if-dirty ignore inputs/data +datalad drop -r . --nocheck +git annex dead here +cd ../.. +rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +cat > code/qsiprep_zip.sh << "EOT" +#!/bin/bash +set -e -u -x + +subid="$1" +sesid="$2" + +# Create a filter file that only allows this session +filterfile=${PWD}/${sesid}_filter.json +echo "{" > ${filterfile} +echo "'fmap': {'datatype': 'fmap'}," >> ${filterfile} +echo "'dwi': {'datatype': 'dwi', 'session': '$sesid', 'suffix': 'dwi'}," >> ${filterfile} +echo "'sbref': {'datatype': 'func', 'session': '$sesid', 'suffix': 'sbref'}," >> ${filterfile} +echo "'flair': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'FLAIR'}," >> ${filterfile} +echo "'t2w': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'T2w'}," >> ${filterfile} +echo "'t1w': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'T1w'}," >> ${filterfile} +echo "'roi': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'roi'}" >> ${filterfile} +echo "}" >> ${filterfile} + +# remove ses and get valid json +sed -i "s/'/\"/g" ${filterfile} +sed -i "s/ses-//g" ${filterfile} + +mkdir -p ${PWD}/.git/tmp/wdir +singularity run --cleanenv -B ${PWD} \ + pennlinc-containers/.datalad/environments/qsiprep-0-14-2/image \ + inputs/data \ + prep \ + participant \ + -v -v \ + -w ${PWD}/.git/tmp/wdir \ + --n_cpus $NSLOTS \ + --stop-on-first-crash \ + --fs-license-file code/license.txt \ + --skip-bids-validation \ + --bids-filter-file "${filterfile}" \ + --participant-label "$subid" \ + --unringing-method mrdegibbs \ + --output-resolution 2.0 + +cd prep +7z a ../${subid}_${sesid}_qsiprep-0.14.2.zip qsiprep +rm -rf prep .git/tmp/wdir +rm ${filterfile} + +EOT + +chmod +x code/qsiprep_zip.sh +cp ${FREESURFER_HOME}/license.txt code/license.txt + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + SESSIONS=$(ls 
inputs/data/$subject | grep ses- | cut -d '/' -f 1) + for session in ${SESSIONS}; do + echo "qsub -cwd ${env_flags} -N qp${subject}_${session} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} ${session}" >> code/qsub_calls.sh + done +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-qsiprep.sh b/scripts/cubic/bootstrap-qsiprep.sh new file mode 100644 index 0000000..274bf38 --- /dev/null +++ b/scripts/cubic/bootstrap-qsiprep.sh @@ -0,0 +1,296 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/qsiprep +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + + +## Check the BIDS input +BIDSINPUT=$1 +if [[ -z ${BIDSINPUT} ]] +then + echo "Required argument is an identifier of the BIDS source" + # exit 1 +fi + +# Is it a directory on the filesystem? +BIDS_INPUT_METHOD=clone +if [[ -d "${BIDSINPUT}" ]] +then + # Check if it's datalad + BIDS_DATALAD_ID=$(datalad -f '{infos[dataset][id]}' wtf -S \ + dataset -d ${BIDSINPUT} 2> /dev/null || true) + [ "${BIDS_DATALAD_ID}" = 'N/A' ] && BIDS_INPUT_METHOD=copy +fi + + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. 
+datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +# register the input dataset +if [[ "${BIDS_INPUT_METHOD}" == "clone" ]] +then + echo "Cloning input dataset into analysis dataset" + datalad clone -d . ${BIDSINPUT} inputs/data + # amend the previous commit with a nicer commit message + git commit --amend -m 'Register input data dataset as a subdataset' +else + echo "WARNING: copying input data into repository" + mkdir -p inputs/data + cp -r ${BIDSINPUT}/* inputs/data + datalad save -r -m "added input data" +fi + +SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 3 | sort) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + + +## Add the containers as a subdataset +cd ${PROJECTROOT} + +# Clone the containers dataset. If specified on the command, use that path +CONTAINERDS=$2 +if [[ ! -z "${CONTAINERDS}" ]]; then + datalad clone ${CONTAINERDS} pennlinc-containers +else + echo ERROR: requires a container dataset + exit 1 +fi + +cd ${PROJECTROOT}/analysis +datalad install -d . --source ${PROJECTROOT}/pennlinc-containers + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=32G +#$ -l s_vmem=30.5G +#$ -l tmpfree=300G +#$ -pe threaded 6 +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` + +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x + +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" + +# change into the cluster-assigned temp directory. Not done by default in SGE +cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +# cd /cbica/comp_space/$(basename $HOME) + +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} + +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds + +# all following actions are performed in the context of the superdataset +cd ds + +# in order to avoid accumulation temporary git-annex availability information +# and to avoid a syncronization bottleneck by having to consolidate the +# git-annex branch across jobs, we will only push the main tracking branch +# back to the output store (plus the actual file content). Final availability +# information can be establish via an eventual `git-annex fsck -f joc-storage`. +# this remote is never fetched, it accumulates a larger number of branches +# and we want to avoid progressive slowdown. Instead we only ever push +# a unique branch per each job (subject AND process specific name) +git remote add outputstore "$pushgitremote" + +# all results of this job will be put into a dedicated branch +git checkout -b "${BRANCH}" + +# we pull down the input subject manually in order to discover relevant +# files. We do this outside the recorded call, because on a potential +# re-run we want to be able to do fine-grained recomputing of individual +# outputs. 
The recorded calls will have specific paths that will enable +# recomputation outside the scope of the original setup +datalad get -n "inputs/data/${subid}" + +# Reomve all subjects we're not working on +(cd inputs/data && rm -rf `find . -type d -name 'sub*' | grep -v $subid`) + +# ------------------------------------------------------------------------------ +# Do the run! + +datalad run \ + -i code/qsiprep_zip.sh \ + -i inputs/data/${subid} \ + -i inputs/data/*json \ + -i pennlinc-containers/.datalad/environments/qsiprep-0-14-3/image \ + --explicit \ + -o ${subid}_qsiprep-0.14.3.zip \ + -m "qsiprep:0.14.3 ${subid}" \ + "bash ./code/qsiprep_zip.sh ${subid}" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore + +# remove tempdir +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad uninstall -r --nocheck --if-dirty ignore inputs/data +datalad drop -r . --nocheck +git annex dead here +cd ../.. +rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +cat > code/qsiprep_zip.sh << "EOT" +#!/bin/bash +set -e -u -x + +subid="$1" + +mkdir -p ${PWD}/.git/tmp/wdir +singularity run --cleanenv -B ${PWD} \ + pennlinc-containers/.datalad/environments/qsiprep-0-14-3/image \ + inputs/data \ + prep \ + participant \ + -v -v \ + -w ${PWD}/.git/wkdir \ + --n_cpus $NSLOTS \ + --stop-on-first-crash \ + --fs-license-file code/license.txt \ + --skip-bids-validation \ + --participant-label "$subid" \ + --unringing-method mrdegibbs \ + --output-resolution 1.5 + +cd prep +7z a ../${subid}_qsiprep-0.14.3.zip qsiprep +rm -rf prep .git/tmp/wkdir + +EOT + +chmod +x code/qsiprep_zip.sh +cp ${FREESURFER_HOME}/license.txt code/license.txt + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N qp${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject}" >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. 
Having it around wastes resources and makes many +# git operations needlessly slow +if [ "${BIDS_INPUT_METHOD}" = "clone" ] +then + datalad uninstall -r --nocheck inputs/data +fi + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-qsirecon-fmriprep.sh b/scripts/cubic/bootstrap-qsirecon-fmriprep.sh new file mode 100755 index 0000000..58d3bd4 --- /dev/null +++ b/scripts/cubic/bootstrap-qsirecon-fmriprep.sh @@ -0,0 +1,333 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +# Arguments: +# 1. qsiprep bootstrap directory +# 2. fmriprep bootstrap directory +# 3. qsiprep container dataset directory +# 4. Path to the MNI template +# 5. suffix for this bootstrap directory + + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +# The fifth argument can be a suffix for the bootstrap directory +suffix="" +if [[ $# -gt 4 ]]; then + suffix='_'${5} +fi + +## qsiprep input +QSIPREPINPUT=$1 +if [[ -z ${QSIPREPINPUT} ]] +then + echo "Required argument is an identifier of the QSIPrep output zips" + # exit 1 +fi + +if [[ ! -d "${QSIPREPINPUT}/output_ria/alias/data" ]] +then + echo "There must be alias in the output ria store that points to the" + echo "QSIPrep output dataset" + # exit 1 +fi + +## qsirecon input +QSIRECONINPUT=$2 +if [[ -z ${QSIPREPINPUT} ]] +then + echo "Required argument is an identifier of the QSIPrep output zips" + # exit 1 +fi + +if [[ ! -d "${QSIRECONINPUT}/output_ria/alias/data" ]] +then + echo "There must be alias in the output ria store that points to the" + echo "QSIPrep output dataset" + # exit 1 +fi + +set -e -u + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/scalarmap${suffix} +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +echo "Cloning input dataset into analysis dataset" +datalad clone -d . 
ria+file://${QSIPREPINPUT}/output_ria#~data inputs/data/prep +git commit --amend -m 'Register preprocessed dataset as a subdataset' +datalad clone -d . ria+file://${QSIRECONINPUT}/output_ria#~data inputs/data/recon +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register recon dataset as a subdataset' + +SUBJECTS=$(find inputs/data/recon -name '*.zip' | cut -d '/' -f 4 | cut -d '_' -f 1 | sort | uniq) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +cd ${PROJECTROOT} + +CONTAINERDS=$3 +if [[ ! -z "${CONTAINERDS}" ]]; then + datalad clone ${CONTAINERDS} pennlinc-containers +fi + +cd ${PROJECTROOT}/analysis +datalad install -d . --source ${PROJECTROOT}/pennlinc-containers + +mkdir MNI +cp $4 MNI/template.nii.gz +# Force this into git because it's small +git annex add --force-small MNI/template.nii.gz +datalad save -m "Added $4 as the template" MNI/template.nii.gz + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=24G +#$ -l s_vmem=16G +#$ -l tmpfree=50G +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` + +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x + +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" + +# change into the cluster-assigned temp directory. Not done by default in SGE +# cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +cd /cbica/comp_space/$(basename $HOME) + +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} + +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds + +# all following actions are performed in the context of the superdataset +cd ds + +# in order to avoid accumulation temporary git-annex availability information +# and to avoid a syncronization bottleneck by having to consolidate the +# git-annex branch across jobs, we will only push the main tracking branch +# back to the output store (plus the actual file content). Final availability +# information can be establish via an eventual `git-annex fsck -f joc-storage`. +# this remote is never fetched, it accumulates a larger number of branches +# and we want to avoid progressive slowdown. Instead we only ever push +# a unique branch per each job (subject AND process specific name) +git remote add outputstore "$pushgitremote" + +# all results of this job will be put into a dedicated branch +git checkout -b "${BRANCH}" + +# we pull down the input subject manually in order to discover relevant +# files. We do this outside the recorded call, because on a potential +# re-run we want to be able to do fine-grained recomputing of individual +# outputs. The recorded calls will have specific paths that will enable +# recomputation outside the scope of the original setup + +# ------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ +# Do the run! 
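# Optional sanity check -- a sketch, not part of the original workflow: the warp
# step below resamples scalars onto MNI/template.nii.gz, which was committed to
# git at bootstrap time; warn early if it is missing from the clone.
if [ ! -s MNI/template.nii.gz ]; then
    echo "WARNING: MNI/template.nii.gz is missing from the cloned dataset" >&2
fi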
+ +datalad get -r pennlinc-containers +datalad get -n -r inputs/data +PREP_ZIP=$(ls inputs/data/prep/${subid}_qsiprep*.zip | cut -d '@' -f 1 || true) +RECON_ZIP=$(ls inputs/data/recon/${subid}_qsi*.zip | cut -d '@' -f 1 || true) + +datalad run \ + -i code/warp_scalars.sh \ + -i MNI/template.nii.gz \ + -i inputs/data/prep/${subid}*qsiprep*.zip \ + -i inputs/data/recon/${subid}*qsirecon*.zip \ + --explicit \ + -o MNI \ + -m "transform ${subid}" \ + "bash ./code/warp_scalars.sh ${subid}" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore + +# remove tempdir +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad drop -r . --nocheck +datalad uninstall -r inputs/data +git annex dead here +cd ../.. +rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace + +EOT + +chmod +x code/participant_job.sh + + +cat > code/warp_scalars.sh << "EOT" +#!/bin/bash +set -e -u -x + +subid="$1" +wd=${PWD} + +cd inputs/data/prep +7z x ${subid}_qsiprep-0.14.2.zip +cd ../recon +7z x ${subid}_qsirecon-*.zip +cd $wd + +to_warp=$(find inputs/data/recon/qsirecon -name '*scalar.nii.gz' \ + -o -name '*OD_*' -o -name '*ICVF_*' -o -name '*ISOVF_*'\ + -o -name '*MAPMRI.nii.gz') +trf=$(find inputs/data/prep/qsiprep -name '*from-T1w_to-MNI152NLin2009cAsym_mode-image_xfm.h5') + +for scalar in ${to_warp} +do + + outfile=MNI/$(basename $scalar | sed 's/T1w/MNI/g') + singularity exec \ + --cleanenv -B ${PWD} \ + pennlinc-containers/.datalad/environments/qsiprep-0-14-2/image \ + antsApplyTransforms \ + -d 3 \ + -t ${trf} \ + -i ${scalar} \ + -o ${outfile} \ + -r MNI/template.nii.gz \ + --interpolation NearestNeighbor +done + +EOT + +chmod +x code/warp_scalars.sh + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + + + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N qsirecon${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. 
Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + + + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS + +#run last sge call to test +#$(tail -n 1 code/qsub_calls.sh) + diff --git a/scripts/cubic/bootstrap-qsirecon-hsvs.sh b/scripts/cubic/bootstrap-qsirecon-hsvs.sh new file mode 100755 index 0000000..100c461 --- /dev/null +++ b/scripts/cubic/bootstrap-qsirecon-hsvs.sh @@ -0,0 +1,327 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +# Arguments: +# 1. qsiprep bootstrap directory +# 2. fmriprep bootstrap directory +# 3. qsiprep container dataset directory +# 4. Path to the MNI template +# 5. suffix for this bootstrap directory + + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +# The fifth argument can be a suffix for the bootstrap directory +suffix="" +if [[ $# -gt 4 ]]; then + suffix='_'${5} +fi + +## qsiprep input +QSIPREPINPUT=$1 +if [[ -z ${QSIPREPINPUT} ]] +then + echo "Required argument is an identifier of the QSIPrep output zips" + # exit 1 +fi + +if [[ ! -d "${QSIPREPINPUT}/output_ria/alias/data" ]] +then + echo "There must be alias in the output ria store that points to the" + echo "QSIPrep output dataset" + # exit 1 +fi + +## qsirecon input +FREESURFERINPUT=$2 +if [[ -z ${FREESURFERINPUT} ]] +then + echo "Required argument is an identifier of the FreeSurfer output zips" + # exit 1 +fi + +if [[ ! -d "${FREESURFERINPUT}/output_ria/alias/data" ]] +then + echo "There must be alias in the output ria store that points to the" + echo "QSIPrep output dataset" + # exit 1 +fi + +set -e -u + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/sifthsvs +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. 
+datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +echo "Cloning input dataset into analysis dataset" +datalad clone -d . ria+file://${QSIPREPINPUT}/output_ria#~data inputs/data/qsiprep +git commit --amend -m 'Register qsiprep results dataset as a subdataset' +datalad clone -d . ria+file://${FREESURFERINPUT}/output_ria#~data inputs/data/fmriprep +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register freesurfer/fmriprep dataset as a subdataset' + +SUBJECTS=$(find inputs/data/qsiprep -name '*.zip' | cut -d '/' -f 4 | cut -d '_' -f 1 | sort | uniq) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +cd ${PROJECTROOT} + +CONTAINERDS=$3 +if [[ ! -z "${CONTAINERDS}" ]]; then + datalad clone ${CONTAINERDS} pennlinc-containers +fi + +cd ${PROJECTROOT}/analysis +datalad install -d . --source ${PROJECTROOT}/pennlinc-containers + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=24G +#$ -l s_vmem=16G +#$ -l tmpfree=50G +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` + +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x + +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" + +# change into the cluster-assigned temp directory. Not done by default in SGE +# cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +cd /cbica/comp_space/$(basename $HOME) + +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} + +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds + +# all following actions are performed in the context of the superdataset +cd ds + +# in order to avoid accumulation temporary git-annex availability information +# and to avoid a syncronization bottleneck by having to consolidate the +# git-annex branch across jobs, we will only push the main tracking branch +# back to the output store (plus the actual file content). Final availability +# information can be establish via an eventual `git-annex fsck -f joc-storage`. +# this remote is never fetched, it accumulates a larger number of branches +# and we want to avoid progressive slowdown. Instead we only ever push +# a unique branch per each job (subject AND process specific name) +git remote add outputstore "$pushgitremote" + +# all results of this job will be put into a dedicated branch +git checkout -b "${BRANCH}" + +# we pull down the input subject manually in order to discover relevant +# files. We do this outside the recorded call, because on a potential +# re-run we want to be able to do fine-grained recomputing of individual +# outputs. The recorded calls will have specific paths that will enable +# recomputation outside the scope of the original setup + +# ------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ +# Do the run! 
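# Optional sanity check -- a sketch, not part of the original workflow: the
# qsirecon call needs the FreeSurfer license copied into code/ at bootstrap
# time; warn early if it does not appear to be present in the clone.
if [ ! -s code/license.txt ]; then
    echo "WARNING: code/license.txt is missing or empty in the cloned dataset" >&2
fi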
+ +datalad get -r pennlinc-containers +datalad get -n -r inputs/data +QSIPREP_ZIP=$(ls inputs/data/qsiprep/${subid}_qsiprep*.zip | cut -d '@' -f 1 || true) +FREESURFER_ZIP=$(ls inputs/data/fmriprep/${subid}_free*.zip | cut -d '@' -f 1 || true) + +datalad run \ + -i code/qsirecon_zip.sh \ + -i ${QSIPREP_ZIP} \ + -i ${FREESURFER_ZIP} \ + --explicit \ + -o MNI \ + -m "transform ${subid}" \ + "bash ./code/qsirecon_zip.sh ${subid} ${QSIPREP_ZIP} ${FREESURFER_ZIP}" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore + +# remove tempdir +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad drop -r . --nocheck +datalad uninstall -r inputs/data +git annex dead here +cd ../.. +rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace + +EOT + +chmod +x code/participant_job.sh + + +cat > code/qsirecon_zip.sh << "EOT" +#!/bin/bash +set -e -u -x + +subid="$1" +qsiprep_zip="$2" +freesurfer_zip="$3" +wd=${PWD} + +cd inputs/data/qsiprep +7z x `basename ${qsiprep_zip}` +cd ../fmriprep +7z x `basename ${freesurfer_zip}` +cd $wd + +mkdir -p ${PWD}/.git/tmp/wkdir +singularity run \ + --cleanenv -B ${PWD} \ + pennlinc-containers/.datalad/environments/qsiprep-0-15-0a/image \ + inputs/data/qsiprep/qsiprep qsirecon participant \ + --participant_label $subid \ + --recon-input inputs/data/qsiprep/qsiprep \ + --fs-license-file code/license.txt \ + --nthreads ${NSLOTS} \ + --stop-on-first-crash \ + --recon-only \ + --freesurfer-input inputs/data/fmriprep/freesurfer \ + --recon-spec mrtrix_singleshell_ss3t_ATC-hsvs \ + -w ${PWD}/.git/tmp/wkdir + +cd qsirecon +7z a ../${subid}_qsirecon-0-15-0a.zip qsirecon +rm -rf prep .git/tmp/wkdir + +EOT + +chmod +x code/qsirecon_zip.sh +cp ${FREESURFER_HOME}/license.txt code/license.txt + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + + + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N qsirecon${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. 
Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + + + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS + +#run last sge call to test +#$(tail -n 1 code/qsub_calls.sh) + diff --git a/scripts/cubic/bootstrap-qsirecon-scalarfest.sh b/scripts/cubic/bootstrap-qsirecon-scalarfest.sh new file mode 100644 index 0000000..acc1006 --- /dev/null +++ b/scripts/cubic/bootstrap-qsirecon-scalarfest.sh @@ -0,0 +1,328 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +# The third argument can be a suffix for the bootstrap directory +suffix="" +if [[ $# -gt 2 ]]; then + suffix='_'${3} +fi + +## qsiprep input +QSIPREPINPUT=$1 +if [[ -z ${QSIPREPINPUT} ]] +then + echo "Required argument is an identifier of the QSIPrep output zips" + # exit 1 +fi + +if [[ ! -d "${QSIPREPINPUT}/output_ria/alias/data" ]] +then + echo "There must be alias in the output ria store that points to the" + echo "QSIPrep output dataset" + # exit 1 +fi + +## freesurfer input +FREESURFERINPUT=$2 +if [[ -z ${FREESURFERINPUT} ]] +then + echo "Required argument is an identifier of the FreeSurfer output zips" + # exit 1 +fi + +if [[ ! -d "${FREESURFERINPUT}/output_ria/alias/data" ]] +then + echo "There must be alias in the output ria store that points to the" + echo "QSIPrep output dataset" + # exit 1 +fi + +set -e -u + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/qsirecon${suffix} +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. 
+datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +echo "Cloning input dataset into analysis dataset" +datalad clone -d . ria+file://${QSIPREPINPUT}/output_ria#~data inputs/data +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' +datalad clone -d . ria+file://${FREESURFERINPUT}/output_ria#~data inputs/data/fmriprep +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register freesurfer/fmriprep dataset as a subdataset' + + +SUBJECTS=$(find inputs/data -name '*.zip' | cut -d '/' -f 3 | cut -d '_' -f 1 | sort | uniq) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +cd ${PROJECTROOT} + +CONTAINERDS=$3 +if [[ ! -z "${CONTAINERDS}" ]]; then + datalad clone ${CONTAINERDS} pennlinc-containers +fi + +cd ${PROJECTROOT}/analysis +datalad install -d . --source ${PROJECTROOT}/pennlinc-containers + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=64G +#$ -l tmpfree=300G +#$ -pe threaded 3 + +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` + +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x + +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" + +# change into the cluster-assigned temp directory. Not done by default in SGE +# cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +cd /cbica/comp_space/$(basename $HOME) + +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} + +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds + +# all following actions are performed in the context of the superdataset +cd ds + +# in order to avoid accumulation temporary git-annex availability information +# and to avoid a syncronization bottleneck by having to consolidate the +# git-annex branch across jobs, we will only push the main tracking branch +# back to the output store (plus the actual file content). Final availability +# information can be establish via an eventual `git-annex fsck -f joc-storage`. +# this remote is never fetched, it accumulates a larger number of branches +# and we want to avoid progressive slowdown. Instead we only ever push +# a unique branch per each job (subject AND process specific name) +git remote add outputstore "$pushgitremote" + +# all results of this job will be put into a dedicated branch +git checkout -b "${BRANCH}" + +# we pull down the input subject manually in order to discover relevant +# files. We do this outside the recorded call, because on a potential +# re-run we want to be able to do fine-grained recomputing of individual +# outputs. The recorded calls will have specific paths that will enable +# recomputation outside the scope of the original setup + +# ------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ +# Do the run! 
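# The next commands fetch the container dataset, install (without fetching)
# the input subdatasets, and locate this subject's qsiprep and freesurfer
# zips. The `cut -d '@' -f 1` strips a trailing "@" that a classify-style ls
# may append to annexed symlinks, and `|| true` keeps `set -e` from aborting
# when no zip matches; the FREESURFER_ZIP check below handles that case.
# (A similar guard for the qsiprep zip could be added after the ls calls,
#  e.g.:  [ -n "${QSIPREP_ZIP}" ] || { echo NO qsiprep zip.; exit 1; } )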
+ +datalad get -r pennlinc-containers +datalad get -n -r inputs/data +QSIPREP_ZIP=$(ls inputs/data/qsiprep/${subid}_qsiprep*.zip | cut -d '@' -f 1 || true) +FREESURFER_ZIP=$(ls inputs/data/fmriprep/${subid}_free*.zip | cut -d '@' -f 1 || true) + +if [ -z "${FREESURFER_ZIP}" ]; then + echo NO freesurfer zip. + exit 1 +fi + +datalad run \ + -i code/qsirecon_zip.sh \ + -i inputs/data/${subid}*qsiprep*.zip \ + -i code/license.txt \ + --explicit \ + -o ${subid}_qsirecon-0-15-2.zip \ + -m "qsirecon-0.15.2 ${subid}" \ + "bash ./code/qsirecon_zip.sh ${subid} ${QSIPREP_ZIP} ${FREESURFER_ZIP}" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore +git annex dead here + +# remove tempdir +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad drop -r . --nocheck +datalad uninstall -r inputs/data +git annex dead here +cd ../.. +rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + + +cat > code/qsirecon_zip.sh << "EOT" +#!/bin/bash +set -e -u -x + +subid="$1" +qsiprep_zip="$2" +freesurfer_zip="$3" +wd=${PWD} + +cd inputs/data/qsiprep +7z x `basename ${qsiprep_zip}` +cd ../fmriprep +7z x `basename ${freesurfer_zip}` +cd $wd + +mkdir -p ${PWD}/.git/tmp/wkdir +singularity run \ + --cleanenv --env DIPY_HOME=/home/qsiprep/.dipy -B ${PWD} \ + pennlinc-containers/.datalad/environments/qsiprep-0-15-2/image \ + inputs/data/qsiprep qsirecon participant \ + --participant_label $subid \ + --recon-input inputs/data/qsiprep \ + --fs-license-file code/license.txt \ + --nthreads ${NSLOTS} \ + --omp-nthreads $(expr ${NSLOTS} - 1) \ + --stop-on-first-crash \ + --recon-only \ + --skip-recon-reports \ + --freesurfer-input inputs/data/fmriprep/freesurfer \ + --recon-spec ${PWD}/code/multishell_gauntlet.json \ + -w ${PWD}/.git/tmp/wkdir + +cd qsirecon +7z a ../${subid}_qsirecon-0-15-2.zip qsirecon +rm -rf prep .git/tmp/wkdir + +EOT + +chmod +x code/qsirecon_zip.sh +cp ${FREESURFER_HOME}/license.txt code/license.txt +# Get the recon spec +RECON_SPEC=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/multishell_gauntlet.json +wget -qO- ${RECON_SPEC} >> code/multishell_gauntlet.json + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N qsirecon${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh +done 
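# Each appended line is a complete qsub call; a generated entry looks roughly
# like this (placeholders stand in for the expanded paths and dataset id):
#   qsub -cwd -v DSLOCKFILE=<analysis>/.SGE_datalad_lock -N qsireconsub-XXXX \
#        -e <analysis>/logs -o <analysis>/logs \
#        <analysis>/code/participant_job.sh \
#        ria+file://<projectroot>/input_ria#<dataset-id> <output-push-url> sub-XXXX
# Submitting the whole batch is then simply:
#   bash code/qsub_calls.sh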
+datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS + +#run last sge call to test +#$(tail -n 1 code/qsub_calls.sh) diff --git a/scripts/cubic/bootstrap-qsirecon.sh b/scripts/cubic/bootstrap-qsirecon.sh new file mode 100644 index 0000000..ab20d7d --- /dev/null +++ b/scripts/cubic/bootstrap-qsirecon.sh @@ -0,0 +1,294 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +# The third argument can be a suffix for the bootstrap directory +suffix="" +if [[ $# -gt 2 ]]; then + suffix='_'${3} +fi + +## fmriprep input +QSIPREPINPUT=$1 +if [[ -z ${QSIPREPINPUT} ]] +then + echo "Required argument is an identifier of the QSIPrep output zips" + # exit 1 +fi + +if [[ ! -d "${QSIPREPINPUT}/output_ria/alias/data" ]] +then + echo "There must be alias in the output ria store that points to the" + echo "QSIPrep output dataset" + # exit 1 +fi + +set -e -u + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/qsirecon${suffix} +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +echo "Cloning input dataset into analysis dataset" +datalad clone -d . 
ria+file://${QSIPREPINPUT}/output_ria#~data inputs/data +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + +SUBJECTS=$(find inputs/data -name '*.zip' | cut -d '/' -f 3 | cut -d '_' -f 1 | sort | uniq) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +cd ${PROJECTROOT} + +CONTAINERDS=$2 +if [[ ! -z "${CONTAINERDS}" ]]; then + datalad clone ${CONTAINERDS} pennlinc-containers +fi + +cd ${PROJECTROOT}/analysis +datalad install -d . --source ${PROJECTROOT}/pennlinc-containers + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=32G +#$ -l s_vmem=32G +#$ -l tmpfree=100G +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` + +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x + +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" + +# change into the cluster-assigned temp directory. Not done by default in SGE +cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +# cd /cbica/comp_space/$(basename $HOME) + +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} + +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds + +# all following actions are performed in the context of the superdataset +cd ds + +# in order to avoid accumulation temporary git-annex availability information +# and to avoid a syncronization bottleneck by having to consolidate the +# git-annex branch across jobs, we will only push the main tracking branch +# back to the output store (plus the actual file content). Final availability +# information can be establish via an eventual `git-annex fsck -f joc-storage`. +# this remote is never fetched, it accumulates a larger number of branches +# and we want to avoid progressive slowdown. Instead we only ever push +# a unique branch per each job (subject AND process specific name) +git remote add outputstore "$pushgitremote" + +# all results of this job will be put into a dedicated branch +git checkout -b "${BRANCH}" + +# we pull down the input subject manually in order to discover relevant +# files. We do this outside the recorded call, because on a potential +# re-run we want to be able to do fine-grained recomputing of individual +# outputs. The recorded calls will have specific paths that will enable +# recomputation outside the scope of the original setup + +# ------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ +# Do the run! 
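# Only the container dataset is fetched eagerly here; the subject's qsiprep
# zip is declared as an input (-i) of the recorded call, so `datalad run`
# retrieves it on demand, and --explicit restricts saving to the declared
# output zip. Because the call is recorded, a single subject can later be
# recomputed from its commit, e.g. (commit ref is a placeholder):
#   datalad rerun <commit-of-this-run>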
+ +datalad get -r pennlinc-containers + +datalad run \ + -i code/qsirecon_zip.sh \ + -i inputs/data/${subid}*qsiprep*.zip \ + -i code/license.txt \ + --explicit \ + -o ${subid}_qsirecon-0-14-2.zip \ + -m "qsirecon-0.14.2 ${subid}" \ + "bash ./code/qsirecon_zip.sh ${subid}" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore +git annex dead here + +# remove tempdir +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad drop -r . --nocheck +datalad uninstall -r inputs/data +git annex dead here +cd ../.. +rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + + +cat > code/qsirecon_zip.sh << "EOT" +#!/bin/bash +set -e -u -x + +subid="$1" +wd=${PWD} + +cd inputs/data +7z x ${subid}_qsiprep-0.13.1.zip +cd $wd + +mkdir -p ${PWD}/.git/tmp/wkdir +singularity run \ + --cleanenv --env DIPY_HOME=/home/qsiprep/.dipy -B ${PWD} \ + pennlinc-containers/.datalad/environments/qsiprep-0-14-2/image \ + inputs/data/qsiprep qsirecon participant \ + --participant_label $subid \ + --recon-input inputs/data/qsiprep \ + --fs-license-file code/license.txt \ + --nthreads ${NSLOTS} \ + --stop-on-first-crash \ + --recon-only \ + --recon-spec mrtrix_singleshell_ss3t_noACT \ + --sloppy \ + -w ${PWD}/.git/tmp/wkdir + +cd qsirecon +7z a ../${subid}_qsirecon-0-14-2.zip qsirecon +rm -rf prep .git/tmp/wkdir + +EOT + +chmod +x code/qsirecon_zip.sh +cp ${FREESURFER_HOME}/license.txt code/license.txt + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + + + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N qsirecon${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. 
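# (Note) `push --to input` publishes the git history that the jobs will clone,
# and `push --to output` seeds the output store that results are later merged
# back into; no annexed content goes to input_ria because that sibling was
# created with --storage-sibling off.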
+datalad push --to input +datalad push --to output + + + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS + +#run last sge call to test +#$(tail -n 1 code/qsub_calls.sh) diff --git a/scripts/cubic/bootstrap-scalarnorm.sh b/scripts/cubic/bootstrap-scalarnorm.sh new file mode 100755 index 0000000..eadaeef --- /dev/null +++ b/scripts/cubic/bootstrap-scalarnorm.sh @@ -0,0 +1,334 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +# Arguments: +# 1. qsiprep bootstrap directory +# 2. qsirecon bootstrap directory +# 3. qsiprep container dataset directory +# 4. Path to the MNI template +# 5. suffix for this bootstrap directory + + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +# The fifth argument can be a suffix for the bootstrap directory +suffix="" +if [[ $# -gt 4 ]]; then + suffix='_'${5} +fi + +## qsiprep input +QSIPREPINPUT=$1 +if [[ -z ${QSIPREPINPUT} ]] +then + echo "Required argument is an identifier of the QSIPrep output zips" + # exit 1 +fi + +if [[ ! -d "${QSIPREPINPUT}/output_ria/alias/data" ]] +then + echo "There must be alias in the output ria store that points to the" + echo "QSIPrep output dataset" + # exit 1 +fi + +## qsirecon input +QSIRECONINPUT=$2 +if [[ -z ${QSIPREPINPUT} ]] +then + echo "Required argument is an identifier of the QSIPrep output zips" + # exit 1 +fi + +if [[ ! -d "${QSIRECONINPUT}/output_ria/alias/data" ]] +then + echo "There must be alias in the output ria store that points to the" + echo "QSIPrep output dataset" + # exit 1 +fi + +set -e -u + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/scalarmap${suffix} +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +echo "Cloning input dataset into analysis dataset" +datalad clone -d . 
ria+file://${QSIPREPINPUT}/output_ria#~data inputs/data/prep +git commit --amend -m 'Register preprocessed dataset as a subdataset' +datalad clone -d . ria+file://${QSIRECONINPUT}/output_ria#~data inputs/data/recon +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register recon dataset as a subdataset' + +SUBJECTS=$(find inputs/data/recon -name '*.zip' | cut -d '/' -f 4 | cut -d '_' -f 1 | sort | uniq) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +cd ${PROJECTROOT} + +CONTAINERDS=$3 +if [[ ! -z "${CONTAINERDS}" ]]; then + datalad clone ${CONTAINERDS} pennlinc-containers +fi + +cd ${PROJECTROOT}/analysis +datalad install -d . --source ${PROJECTROOT}/pennlinc-containers + +mkdir MNI +cp $4 MNI/template.nii.gz +# Force this into git because it's small +git annex add --force-small MNI/template.nii.gz +datalad save -m "Added $4 as the template" MNI/template.nii.gz + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=24G +#$ -l s_vmem=16G +#$ -l tmpfree=50G +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` + +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x + +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" + +# change into the cluster-assigned temp directory. Not done by default in SGE +# cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +cd /cbica/comp_space/$(basename $HOME) + +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} + +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds + +# all following actions are performed in the context of the superdataset +cd ds + +# in order to avoid accumulation temporary git-annex availability information +# and to avoid a syncronization bottleneck by having to consolidate the +# git-annex branch across jobs, we will only push the main tracking branch +# back to the output store (plus the actual file content). Final availability +# information can be establish via an eventual `git-annex fsck -f joc-storage`. +# this remote is never fetched, it accumulates a larger number of branches +# and we want to avoid progressive slowdown. Instead we only ever push +# a unique branch per each job (subject AND process specific name) +git remote add outputstore "$pushgitremote" + +# all results of this job will be put into a dedicated branch +git checkout -b "${BRANCH}" + +# we pull down the input subject manually in order to discover relevant +# files. We do this outside the recorded call, because on a potential +# re-run we want to be able to do fine-grained recomputing of individual +# outputs. The recorded calls will have specific paths that will enable +# recomputation outside the scope of the original setup + +# ------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ +# Do the run! 
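# Locate this subject's preprocessed (qsiprep) and recon zips; both are
# unpacked by code/warp_scalars.sh (defined below), which finds the scalar
# maps and the T1w->MNI transform and resamples them into MNI/ with
# antsApplyTransforms from the qsiprep container.
# Optional guard (commented out; not part of the original workflow):
#   [ -n "${PREP_ZIP}" ] && [ -n "${RECON_ZIP}" ] || \
#     { echo "Missing prep or recon zip for ${subid}"; exit 1; }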
+ +datalad get -r pennlinc-containers +datalad get -n -r inputs/data +PREP_ZIP=$(ls inputs/data/prep/${subid}_qsiprep*.zip | cut -d '@' -f 1 || true) +RECON_ZIP=$(ls inputs/data/recon/${subid}_qsi*.zip | cut -d '@' -f 1 || true) + +datalad run \ + -i code/warp_scalars.sh \ + -i MNI/template.nii.gz \ + -i inputs/data/prep/${subid}*qsiprep*.zip \ + -i inputs/data/recon/${subid}*qsirecon*.zip \ + --explicit \ + -o MNI \ + -m "transform ${subid}" \ + "bash ./code/warp_scalars.sh ${subid}" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore +git annex dead here + +# remove tempdir +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad drop -r . --nocheck +datalad uninstall -r inputs/data +git annex dead here +cd ../.. +rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace + +EOT + +chmod +x code/participant_job.sh + + +cat > code/warp_scalars.sh << "EOT" +#!/bin/bash +set -e -u -x + +subid="$1" +wd=${PWD} + +cd inputs/data/prep +7z x ${subid}_qsiprep-0.14.2.zip +cd ../recon +7z x ${subid}_qsirecon-*.zip +cd $wd + +to_warp=$(find inputs/data/recon/qsirecon -name '*scalar.nii.gz' \ + -o -name '*OD_*' -o -name '*ICVF_*' -o -name '*ISOVF_*'\ + -o -name '*MAPMRI.nii.gz') +trf=$(find inputs/data/prep/qsiprep -name '*from-T1w_to-MNI152NLin2009cAsym_mode-image_xfm.h5') + +for scalar in ${to_warp} +do + + outfile=MNI/$(basename $scalar | sed 's/T1w/MNI/g') + singularity exec \ + --cleanenv -B ${PWD} \ + pennlinc-containers/.datalad/environments/qsiprep-0-14-2/image \ + antsApplyTransforms \ + -d 3 \ + -t ${trf} \ + -i ${scalar} \ + -o ${outfile} \ + -r MNI/template.nii.gz \ + --interpolation NearestNeighbor +done + +EOT + +chmod +x code/warp_scalars.sh + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + + + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N qsirecon${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. 
Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + + + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS + +#run last sge call to test +#$(tail -n 1 code/qsub_calls.sh) + diff --git a/scripts/cubic/bootstrap-shoreline-benchmark.sh b/scripts/cubic/bootstrap-shoreline-benchmark.sh new file mode 100755 index 0000000..6bd0976 --- /dev/null +++ b/scripts/cubic/bootstrap-shoreline-benchmark.sh @@ -0,0 +1,583 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/shoreline-benchmark +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + + + +# Install the input data and the containers at the same level as analysis/ +cd ${PROJECTROOT} +# register the input dataset +echo "Cloning input dataset into analysis dataset" +datalad clone osf://38vce/ phantoms +(cd phantoms && datalad get -r .) + +echo "Cloning the containers dataset" +datalad clone ///repronim/containers containers +(cd containers && datalad get images/bids/bids-qsiprep--0.14.3.sing) + + +cd analysis +datalad clone -d . ../phantoms inputs/data +git commit --amend -m 'Register phantom dataset as a subdataset' + +datalad clone -d . 
../containers containers +git commit --amend -m 'Register containers dataset as a subdataset' + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` + +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x + +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +scheme="$3" +noise="$4" +PERCENT_MOTION="$5" +simnum="$6" +method="$7" +transform="$8" +denoise="$9" +motion_severity="${10}" + +# change into the cluster-assigned temp directory. Not done by default in SGE +# cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +cd /cbica/comp_space/$(basename $HOME) + +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${method}-${scheme}-${noise}-${PERCENT_MOTION}-${transform}-${denoise}-${simnum}" +mkdir ${BRANCH} +cd ${BRANCH} + +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds + +# all following actions are performed in the context of the superdataset +cd ds + +# in order to avoid accumulation temporary git-annex availability information +# and to avoid a syncronization bottleneck by having to consolidate the +# git-annex branch across jobs, we will only push the main tracking branch +# back to the output store (plus the actual file content). Final availability +# information can be establish via an eventual `git-annex fsck -f joc-storage`. +# this remote is never fetched, it accumulates a larger number of branches +# and we want to avoid progressive slowdown. Instead we only ever push +# a unique branch per each job (subject AND process specific name) +git remote add outputstore "$pushgitremote" + +# all results of this job will be put into a dedicated branch +git checkout -b "${BRANCH}" + +# we pull down the input subject manually in order to discover relevant +# files. We do this outside the recorded call, because on a potential +# re-run we want to be able to do fine-grained recomputing of individual +# outputs. The recorded calls will have specific paths that will enable +# recomputation outside the scope of the original setup +datalad get -n "inputs/data" + +# ------------------------------------------------------------------------------ +# Do the run! +outputname="${scheme}_${noise}_${PERCENT_MOTION}_${simnum}_${method}-${transform}-${denoise}-${motion_severity}-qsiprep-0.14.3.zip" +run_args="${scheme} ${noise} ${PERCENT_MOTION} ${simnum} ${method} ${transform} ${denoise} ${motion_severity} ${outputname}" + +datalad run \ + -i "inputs/data/${noise}/"'*/sub-'"${scheme}" \ + -i "inputs/data/realistic/nomotion/dataset_description.json" \ + -i "inputs/data/ground_truth_motion" \ + -i "containers/images/bids/bids-qsiprep--0.14.3.sing" \ + --explicit \ + --expand inputs \ + -o ${outputname} \ + -m "${run_args}" \ + "bash ./code/qsiprep_zip.sh ${run_args}" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore + +# remove tempdir +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad uninstall --nocheck --if-dirty ignore -r inputs/data +datalad drop -r --nocheck . +git annex dead here +cd ../.. 
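# At this point the zipped results are on output-storage, the job branch has
# been pushed to the output store, and this throwaway clone has been marked
# dead to git-annex, so the per-job temp directory can be removed safely.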
+rm -rf $BRANCH + +echo SUCCESS +EOT + +chmod +x code/participant_job.sh + +cat > code/qsiprep_zip.sh << "EOT" +#!/bin/bash +set -e -u -x +sequence="$1" +noise="$2" +percent_motion="$3" +permutation_number="$4" +hmc_method="$5" +transform="$6" +denoise="$7" +motion_severity="$9" +outputname="$8" + +CONTAINER=containers/images/bids/bids-qsiprep--0.14.3.sing +SOURCEBIND='-B /cbica/projects/Shoreline/code/qsiprep/qsiprep:/usr/local/miniconda/lib/python3.7/site-packages/qsiprep' + +# Create the subset in bids_subset/ +singularity exec --cleanenv -B ${PWD} \ + $CONTAINER python code/create_motion_subset.py \ + ${sequence} \ + ${noise} \ + ${percent_motion} \ + ${permutation_number} \ + ${motion_severity} + + +workdir=${PWD}/.git/tmp/wdir +mkdir -p ${workdir} + +if [[ "${hmc_method}" == "eddy" ]]; +then + + if [[ "${transform}" == "quadratic" ]]; + then + singularity run --cleanenv -B ${PWD} \ + ${SOURCEBIND} \ + ${CONTAINER} \ + bids_subset \ + prep \ + participant \ + -v -v \ + -w ${workdir} \ + --n_cpus $NSLOTS \ + --stop-on-first-crash \ + --fs-license-file code/license.txt \ + --skip-bids-validation \ + --denoise-method ${denoise} \ + --eddy-config code/quadratic.json \ + --output-resolution 2.0 + else + singularity run --cleanenv -B ${PWD} \ + ${SOURCEBIND} \ + ${CONTAINER} \ + bids_subset \ + prep \ + participant \ + -v -v \ + -w ${workdir} \ + --n_cpus $NSLOTS \ + --stop-on-first-crash \ + --fs-license-file code/license.txt \ + --skip-bids-validation \ + --denoise-method ${denoise} \ + --output-resolution 2.0 + fi + +else + # Run SHORELine + singularity run --cleanenv -B ${PWD} \ + ${SOURCEBIND} \ + ${CONTAINER} \ + bids_subset \ + prep \ + participant \ + -v -v \ + -w ${workdir} \ + --n_cpus $NSLOTS \ + --stop-on-first-crash \ + --fs-license-file code/license.txt \ + --skip-bids-validation \ + --hmc-model 3dSHORE \ + --hmc_transform ${transform} \ + --shoreline-iters 2 \ + --b0-motion-corr-to first \ + --denoise-method ${denoise} \ + --output-resolution 2.0 +fi + +# Copy the ground-truth motion file into the results zip +cp bids_subset/sub-${sequence}/dwi/*_dwi_motion.txt prep/qsiprep/ + +cd prep +7z a ../${outputname} qsiprep +cd .. +rm -rf prep ${workdir} + +EOT + +cat > code/create_motion_subset.py << "EOT" +#!/usr/bin/env python +""" + +USAGE: + +python create_motion_subset.py sequence noise percent_motion permutation_number severity + +Where + sequence is HASC55, HCP, ABCD, PN, DSIQ5 + noise is "realistic" or "noisefree" + percent_motion is 1-100 + permutation_number is an integer + severity is "low" or "high" + +Creates new version of the data with random volumes (determined by permutation_number) +are replaced with low|high motion versions of the same gradient direction. The percent +of volumes to be replaced is determined by percent_motion. 
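Example invocation (the permutation number is arbitrary, for illustration):
    python create_motion_subset.py HASC55 realistic 15 3 low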
+""" + + +import sys +import shutil +import os +import nibabel as nb +import numpy as np + + +def simulate_motion( + seq='HASC55', noise='noisefree', percent_motion=10, + permutation_number=999, severity=''): + + args = dict(seq=seq, noise=noise, percent_motion=percent_motion, severity=severity) + + dataset_description = \ + 'inputs/data/{noise}/nomotion/' \ + 'dataset_description.json'.format(**args) + + # No motion simulation + nonmotion_dwi = \ + 'inputs/data/{noise}/nomotion/sub-{seq}/' \ + 'dwi/sub-{seq}_acq-{noise}Xnomotion_dwi.nii.gz'.format(**args) + nonmotion_img = nb.load(nonmotion_dwi) + nonmotion_data = nonmotion_img.get_fdata(dtype=np.float32) + json = nonmotion_dwi[:-7] + '.json' + bval = nonmotion_dwi[:-7] + '.bval' + bvec = nonmotion_dwi[:-7] + '.bvec' + + # All motion simulation uses the low motion examples + motion_file = 'inputs/data/ground_truth_motion/' \ + 'sub-{seq}_acq-{noise}_run-{severity}motion_dwi_motion.txt'.format(**args) + all_motion = np.loadtxt(motion_file) + motion_dwi = \ + 'inputs/data/{noise}/{severity}motion/sub-{seq}/' \ + 'dwi/sub-{seq}_acq-{noise}X{severity}motion_dwi.nii.gz'.format(**args) + motion_img = nb.load(motion_dwi) + motion_data = motion_img.get_fdata(dtype=np.float32) + + out_dir = 'bids_subset/sub-{seq}/dwi'.format(**args) + os.makedirs(out_dir, exist_ok=True) + shutil.copyfile(dataset_description, + "bids_subset/dataset_description.json", + follow_symlinks=True) + + np.random.seed(permutation_number) + args['permnum'] = permutation_number + prefix = out_dir + '/sub-{seq}_acq-mot{percent_motion}perm' \ + '{permnum:03d}_dwi'.format(**args) + shutil.copyfile(json, prefix + '.json', follow_symlinks=True) + shutil.copyfile(bval, prefix + '.bval', follow_symlinks=True) + shutil.copyfile(bvec, prefix + '.bvec', follow_symlinks=True) + + # Determine which volumes should get swapped with their motion version + num_vols = nonmotion_img.shape[3] + num_to_replace = int(num_vols * float(percent_motion) / 100) + replace_vols = np.random.choice(num_vols - 1, size=num_to_replace, + replace=False) + 1 + # create the new 4D image with the moved images mixed in + nonmotion_data[..., replace_vols] = motion_data[..., replace_vols] + nb.Nifti1Image( + nonmotion_data, nonmotion_img.affine, + nonmotion_img.header).to_filename( + prefix + '.nii.gz') + + motion_params = np.zeros_like(all_motion) + motion_params[replace_vols] = all_motion[replace_vols] + np.savetxt(prefix + '_motion.txt', motion_params) + + +if __name__ == "__main__": + sequence = sys.argv[1] + noise = sys.argv[2] + percent_motion = int(sys.argv[3]) + permutation_number = int(sys.argv[4]) + severity = sys.argv[5] + simulate_motion( + seq=sequence, noise=noise, percent_motion=percent_motion, + permutation_number=permutation_number, severity=severity + ) + +EOT + + +chmod +x code/create_motion_subset.py +cp ${FREESURFER_HOME}/license.txt code/license.txt + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + 
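# After the appends above, code/merge_outputs.sh starts roughly like this
# (project path and dataset id shown as placeholders):
#   #!/bin/bash
#   set -e -u -x
#   outputsource=ria+file://<projectroot>/output_ria#<dataset-id>
#   cd <projectroot>
#   ...followed by the downloaded merge_outputs_postscript.sh...
# Once the jobs have finished, the merge is typically run with:
#   bash code/merge_outputs.sh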
+################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" +echo '#!/bin/bash' > code/qsub_rerun.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +echo 'dssource='${dssource} >> code/qsub_rerun.sh +echo 'pushgitremote='${pushgitremote} >> code/qsub_rerun.sh +echo 'PROJECTROOT='${PROJECTROOT} >> code/qsub_rerun.sh +echo 'LOGDIR='${PROJECTROOT}/analysis/logs >> code/qsub_rerun.sh +echo 'DSLOCKFILE='${PROJECTROOT}/analysis/.SGE_datalad_lock >> code/qsub_rerun.sh + + +cat >> code/qsub_rerun.sh << "EOT" + +# USAGE bash code/qsub_rerun.sh [do run] +# With no arguments, print whether the branch exists in +# the output_ria (the job has completed successfully) +# + +QSIPREP_SCHEMES="ABCD DSIQ5 HCP HASC55" +EDDY_SCHEMES="ABCD HCP PNC" +NOISES="realistic" +PERCENT_MOTION=15 +NUM_PERMS=10 +#DENOISERS="dwidenoise none patch2self" +DENOISERS="dwidenoise" + +getreq(){ + case $1 in + + HCP | DSIQ5) + memreq="80G" + threadreq="4-6" + ;; + ABCD) + memreq="48G" + threadreq="2-4" + ;; + PNC | HASC55) + memreq="36G" + threadreq="2-4" + ;; + *) + memreq="54G" + threadreq="2-4" + ;; + + esac +} + +dorun=0 +if [ $# -gt 0 ]; then + dorun=1 + echo Submitting jobs to SGE +fi + +# Discover which branches have completed +cd ${PROJECTROOT}/output_ria/alias/data/ +branches=$(git branch -a | grep job- | tr '\n' ' ' | sed 's/ */,/g') +running_branches=$(qstat -r | grep "Full jobname" | tr -s ' ' | cut -d ' ' -f 4 | tr '\n' ',') + +submit_unfinished(){ + + BRANCH="${method}-${scheme}-${noise}-${PERCENT_MOTION}-${transform}-${denoise}-${simnum}" + branch_ok=$(echo $branches | grep "${BRANCH}," | wc -c) + branch_submitted=$(echo $running_branches | grep "${BRANCH}," | wc -c) + + # check status of this branch + if [ ${branch_ok} -gt 0 ]; then + echo FINISHED: $BRANCH + + elif [ "${branch_submitted}" -gt 0 ]; then + echo WAITING FOR: ${BRANCH} + + else + echo INCOMPLETE: $BRANCH + + # Run it if we got an extra argument + if [ ${dorun} -gt 0 ]; then + + # Set variables for resource requirements + getreq + + # Do the qsub call + set +x + qsub \ + -e ${LOGDIR} -o ${LOGDIR} \ + -cwd \ + -l "h_vmem=${memreq}" \ + -pe threaded ${threadreq} \ + -N x${BRANCH} \ + -v DSLOCKFILE=$DSLOCKFILE \ + code/participant_job.sh \ + ${dssource} \ + ${pushgitremote} \ + ${scheme} \ + ${noise} \ + ${PERCENT_MOTION} \ + ${simnum} \ + ${method} \ + ${transform} \ + ${denoise} + set -x + fi + + fi +} + +cd $PROJECTROOT/analysis +for denoise in ${DENOISERS} +do + for noise in ${NOISES} + do + for simnum in `seq ${NUM_PERMS}` + do + method=3dSHORE + for scheme in ${QSIPREP_SCHEMES} + do + transform=Rigid + submit_unfinished + + transform=Affine + submit_unfinished + done + + method=eddy + for scheme in ${EDDY_SCHEMES} + do + # One for linear + transform=Linear + submit_unfinished + + # One for quadratic + transform=Quadratic + submit_unfinished + done + done + done +done + +EOT + +# Eddy config for using quadratic +cat > code/quadratic.json << "EOT" +{ + "flm": "quadratic", + "slm": "quadratic", + "fep": false, + "interp": "spline", + "nvoxhp": 1000, + "fudge_factor": 10, + "dont_sep_offs_move": false, + "dont_peas": false, + "niter": 5, + "method": "jac", + "repol": true, + "num_threads": 1, + "is_shelled": true, + "use_cuda": false, + "cnr_maps": true, + 
"residuals": false, + "output_type": "NIFTI_GZ", + "args": "" +} + +EOT + +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-shoreline-quickunzip.sh b/scripts/cubic/bootstrap-shoreline-quickunzip.sh new file mode 100644 index 0000000..951dcb4 --- /dev/null +++ b/scripts/cubic/bootstrap-shoreline-quickunzip.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# Used for the shoreline project: Edit this for other kinds of unzipping +PROJECTROOT=/cbica/projects/Shoreline/shoreline-benchmark +cd ${PROJECTROOT} +RIA=${PROJECTROOT}/output_ria +datalad create -c yoda -D "extract shoreline results" results +cd results +datalad clone -d . --reckless ephemeral "ria+file://${RIA}#~data" inputs/data + + +## the actual compute job specification +cat > code/get_files.sh << "EOT" +#!/bin/bash +set -e -u -x + +ZIP_FILE=$1 + +# Create a qsiprep/ directory +unzip $ZIP_FILE -x "*.nii.gz" -x "*.gif" -x "*.svg" + +outdir=moco_results/$(basename $ZIP_FILE | sed 's/\.zip//') + +mkdir -p ${outdir}/groundtruth + +# Send the memory profile to the group csv +cat qsiprep/profiled.csv >> moco_results/memprof.csv + +cp qsiprep/*_motion.txt ${outdir}/groundtruth/ +cp qsiprep/sub-*/dwi/*confounds.tsv ${outdir}/ +cp qsiprep/sub-*/dwi/*SliceQC* ${outdir}/ +cp qsiprep/sub-*/dwi/*ImageQC* ${outdir}/ + +rm -rf qsiprep + +EOT + +datalad save -m "Add data extraction code" code + +zip_files=$(find inputs/data/ -name '*.zip') +for input_zip in ${zip_files} +do + + outdir=moco_results/$(basename $input_zip | sed 's/\.zip//') + + datalad run \ + -i ${input_zip} \ + -o moco_results/memprof.csv \ + -o ${outdir} \ + --explicit \ + "bash code/get_files.sh ${input_zip}" +done + + + +# CRITICAL!!! Don't uninstall, just rm -rf the inputs +rm -rf inputs diff --git a/scripts/cubic/bootstrap-unzip-aslprep-custom.sh b/scripts/cubic/bootstrap-unzip-aslprep-custom.sh new file mode 100644 index 0000000..fcb4a58 --- /dev/null +++ b/scripts/cubic/bootstrap-unzip-aslprep-custom.sh @@ -0,0 +1,215 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure your environment is ready to bootstrap the analysis workspace +# Check that we have conda installed + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/ASLPREP +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +mkdir -p $PROJECTROOT + +## DERIVATIVE_BOOTSTRAP_DIR will be the path to the bootstrap directory containing your derivatives +DERIVATIVE_BOOTSTRAP_DIR=$1 +DERIVATIVE_INPUT=ria+file://${DERIVATIVE_BOOTSTRAP_DIR}"/output_ria#~data" +if [[ -z ${DERIVATIVE_BOOTSTRAP_DIR} ]] +then + echo "Required argument is the path to the aslprep bootstrap directory." 
+ echo "This directory should contain analysis/, input_ria/ and output_ria/." + # exit 1 +fi + +cd ${PROJECTROOT} +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +cd $PROJECTROOT +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +datalad install -d . -r --source ${DERIVATIVE_INPUT} inputs/data + +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + +ZIPS=$(find inputs/data -name 'sub-*aslprep*' | cut -d '/' -f 3 | sort) +if [ -z "${ZIPS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=25G +#$ -l tmpfree=200G +#$ -R y +#$ -l h_rt=24:00:00 +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +# change into the cluster-assigned temp directory. Not done by default in SGE +cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +# cd /cbica/comp_space/$(basename $HOME) +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds +# all following actions are performed in the context of the superdataset +cd ds +git remote add outputstore "$pushgitremote" +git checkout -b "${BRANCH}" +# ------------------------------------------------------------------------------ +# Do the run! +html=${subid}.html +datalad run \ + -i code/get_files.sh \ + -i inputs/data/${subid}_aslprep*.zip \ + --explicit \ + -o ${subid}*space-MNI152NLin6Asym*cbf* \ + -o ${subid}*space-MNI152NLin6Asym*desc-brain*_mask* \ + -m "unzipped ${subid}" \ + "bash code/get_files.sh inputs/data/${subid}_aslprep*.zip" +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore +echo TMPDIR TO DELETE +echo ${BRANCH} +datalad drop -r . --nocheck +datalad uninstall -r inputs/data +git annex dead here +cd ../.. +rm -rf $BRANCH +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +## the actual compute job specification +echo writing script to file... 
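# code/get_files.sh (written below) extracts only the aslprep/ tree from a
# subject zip, copies the MNI152NLin6Asym CBF maps and brain masks into the
# dataset root (the files declared as outputs in the recorded call above),
# and then deletes the unzipped tree. It can also be run by hand from the
# dataset root, e.g. (zip name is a placeholder):
#   bash code/get_files.sh inputs/data/sub-XXXX_aslprep-x.y.z.zip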
+cat > code/get_files.sh << "EOT" +#!/bin/bash +set -e -u -x +ZIP_FILE=$1 +subid=$(basename $ZIP_FILE | cut -d '_' -f 1) +# unzip outputs +unzip -n $ZIP_FILE 'aslprep/*' -d . + +cp aslprep/${subid}/*/perf/*space-MNI152NLin6Asym*cbf* . +cp aslprep/${subid}/*/perf/*space-MNI152NLin6Asym*desc-brain*_mask* . + +# remove unzip dir +rm -rf aslprep +EOT + +chmod +x code/get_files.sh + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" + +for zip in ${ZIPS}; do + subject=`echo ${zip} | cut -d '_' -f 1` + echo "qsub -cwd ${env_flags} -N UNZIP${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject}" >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-unzip-fmriprep-custom.sh b/scripts/cubic/bootstrap-unzip-fmriprep-custom.sh new file mode 100644 index 0000000..19b425c --- /dev/null +++ b/scripts/cubic/bootstrap-unzip-fmriprep-custom.sh @@ -0,0 +1,220 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/FMRIPREP +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! 
-w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +mkdir -p $PROJECTROOT + +## DERIVATIVE_BOOTSTRAP_DIR will be the path to the bootstrap directory containing your derivatives +DERIVATIVE_BOOTSTRAP_DIR=$1 +DERIVATIVE_INPUT=ria+file://${DERIVATIVE_BOOTSTRAP_DIR}"/output_ria#~data" +if [[ -z ${DERIVATIVE_BOOTSTRAP_DIR} ]] +then + echo "Required argument is the path to the fmriprep bootstrap directory." + echo "This directory should contain analysis/, input_ria/ and output_ria/." + # exit 1 +fi + +cd ${PROJECTROOT} +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +cd $PROJECTROOT +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +datalad install -d . -r --source ${DERIVATIVE_INPUT} inputs/data + +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + +ZIPS=$(find inputs/data -name 'sub-*fmriprep*' | cut -d '/' -f 3 | sort) +if [ -z "${ZIPS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=25G +#$ -l tmpfree=200G +#$ -R y +#$ -l h_rt=24:00:00 +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +# change into the cluster-assigned temp directory. Not done by default in SGE +cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +# cd /cbica/comp_space/$(basename $HOME) +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds +# all following actions are performed in the context of the superdataset +cd ds +git remote add outputstore "$pushgitremote" +git checkout -b "${BRANCH}" +# ------------------------------------------------------------------------------ +# Do the run! 
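# The recorded call below extracts a single file per subject: the task-rest
# singleband fsLR den-91k dtseries, which is the only declared output, so
# --explicit saves just that file. code/get_files.sh only copies the file if
# the zip actually contains it, so subjects without that acquisition simply
# produce no output.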
+html=${subid}.html +datalad run \ + -i code/get_files.sh \ + -i inputs/data/${subid}_fmriprep*.zip \ + --explicit \ + -o ${subid}_ses-PNC1_task-rest_acq-singleband_space-fsLR_den-91k_bold.dtseries.nii \ + -m "unzipped ${subid}" \ + "bash code/get_files.sh inputs/data/${subid}_fmriprep*.zip" +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore +echo TMPDIR TO DELETE +echo ${BRANCH} +datalad drop -r . --nocheck +datalad uninstall -r inputs/data +git annex dead here +cd ../.. +rm -rf $BRANCH +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +## the actual compute job specification +echo writing script to file... +cat > code/get_files.sh << "EOT" +#!/bin/bash +set -e -u -x +ZIP_FILE=$1 +subid=$(basename $ZIP_FILE | cut -d '_' -f 1) + +# unzip outputs +unzip -n $ZIP_FILE 'fmriprep/*' -d . + +desired_file=fmriprep/${subid}/*/func/*task-rest_acq-singleband_space-fsLR_den-91k_bold.dtseries.nii + +# check if the desired file exists +if [ -e ${desired_file} ]; then + # copy only the file we need out of fmriprep + cp ${desired_file} . +fi + +# remove unzip dir +rm -rf fmriprep +EOT + +chmod +x code/get_files.sh + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" + +for zip in ${ZIPS}; do + subject=`echo ${zip} | cut -d '_' -f 1` + echo "qsub -cwd ${env_flags} -N UNZIP${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject}" >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. 
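+# The "input" sibling stores only the Git history (its storage sibling is off),
+# which is all the compute jobs need to clone; the "output" sibling also receives
+# the annexed file content. Once pushed, the assembled results can be cloned back
+# out of the store, e.g. (a sketch; the clone target name is arbitrary):
+# datalad clone ria+file://${PROJECTROOT}/output_ria#~data fmriprep_unzipped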
+datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-unzip-fmriprep.sh b/scripts/cubic/bootstrap-unzip-fmriprep.sh new file mode 100644 index 0000000..9ea4894 --- /dev/null +++ b/scripts/cubic/bootstrap-unzip-fmriprep.sh @@ -0,0 +1,223 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/FMRIPREP +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +mkdir -p $PROJECTROOT + +## DERIVATIVE_BOOTSTRAP_DIR will be the path to the bootstrap directory containing your derivatives +DERIVATIVE_BOOTSTRAP_DIR=$1 +DERIVATIVE_INPUT=ria+file://${DERIVATIVE_BOOTSTRAP_DIR}"/output_ria#~data" +if [[ -z ${DERIVATIVE_BOOTSTRAP_DIR} ]] +then + echo "Required argument is the path to the fmriprep bootstrap directory." + echo "This directory should contain analysis/, input_ria/ and output_ria/." + # exit 1 +fi + +cd ${PROJECTROOT} +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +cd $PROJECTROOT +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +datalad install -d . -r --source ${DERIVATIVE_INPUT} inputs/data + +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + +ZIPS=$(find inputs/data -name 'sub-*fmriprep*' | cut -d '/' -f 3 | sort) +if [ -z "${ZIPS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=25G +#$ -l tmpfree=200G +#$ -R y +#$ -l h_rt=24:00:00 +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" + +# change into the cluster-assigned temp directory. 
Not done by default in SGE +cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +# cd /cbica/comp_space/$(basename $HOME) +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds +# all following actions are performed in the context of the superdataset +cd ds +git remote add outputstore "$pushgitremote" +git checkout -b "${BRANCH}" +# ------------------------------------------------------------------------------ +# Do the run! +html=${subid}.html +datalad run \ + -i code/get_files.sh \ + -i inputs/data/${subid}_fmriprep*.zip \ + --explicit \ + -o ${subid} \ + -o ${html} \ + -m "unzipped ${subid}" \ + "bash code/get_files.sh inputs/data/${subid}_fmriprep*.zip" +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore +echo TMPDIR TO DELETE +echo ${BRANCH} +datalad drop -r . --nocheck +datalad uninstall -r inputs/data +git annex dead here +cd ../.. +rm -rf $BRANCH +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +## the actual compute job specification +echo writing script to file... +cat > code/get_files.sh << "EOT" +#!/bin/bash +set -e -u -x +ZIP_FILE=$1 +subid=$(basename $ZIP_FILE | cut -d '_' -f 1) + +# unzip outputs +unzip -n $ZIP_FILE 'fmriprep/*' -d . + +# remove files we don't need +rm fmriprep/func/*from-scanner_to-T1w_mode-image_xfm.txt +rm fmriprep/func/*from-T1w_to-scanner_mode-image_xfm.txt +rm fmriprep/func/*space-MNI152NLin6Asym_res-2_boldref.nii.gz +rm fmriprep/func/*space-MNI152NLin6Asym_res-2_desc-aparcaseg_dseg.nii.gz +rm fmriprep/func/*space-MNI152NLin6Asym_res-2_desc-aseg_dseg.nii.gz + +# copy outputs out of fmriprep +cp -r fmriprep/func/* . 
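+# NOTE: these globs assume func/ sits directly under fmriprep/ inside the zip.
+# If the outputs are nested under the subject (and session) directory instead,
+# as the other unzip scripts in this repo assume, the paths would need that
+# extra level, e.g. (hypothetical, depending on how the zips were built):
+# cp -r fmriprep/${subid}/*/func/* .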
+# remove unzip dir +rm -rf fmriprep +EOT + +chmod +x code/get_files.sh + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" + +for zip in ${ZIPS}; do + subject=`echo ${zip} | cut -d '_' -f 1` + echo "qsub -cwd ${env_flags} -N UNZIP${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject}" >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-unzip-qsirecon.sh b/scripts/cubic/bootstrap-unzip-qsirecon.sh new file mode 100644 index 0000000..c6397d6 --- /dev/null +++ b/scripts/cubic/bootstrap-unzip-qsirecon.sh @@ -0,0 +1,212 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/QSIRECON +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. 
Change permissions and retry + # exit 1 +fi + +mkdir -p $PROJECTROOT + +## DERIVATIVE_BOOTSTRAP_DIR will be the path to the bootstrap directory containing your derivatives +DERIVATIVE_BOOTSTRAP_DIR=$1 +DERIVATIVE_INPUT=ria+file://${DERIVATIVE_BOOTSTRAP_DIR}"/output_ria#~data" +if [[ -z ${DERIVATIVE_BOOTSTRAP_DIR} ]] +then + echo "Required argument is the path to the qsirecon bootstrap directory." + echo "This directory should contain analysis/, input_ria/ and output_ria/." + # exit 1 +fi + +cd ${PROJECTROOT} +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +cd $PROJECTROOT +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +datalad install -d . -r --source ${DERIVATIVE_INPUT} inputs/data + +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + +ZIPS=$(find inputs/data -name 'sub-*qsirecon*' | cut -d '/' -f 3 | sort) +if [ -z "${ZIPS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=25G +#$ -l tmpfree=200G +#$ -R y +#$ -l h_rt=24:00:00 +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +# change into the cluster-assigned temp directory. Not done by default in SGE +cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +# cd /cbica/comp_space/$(basename $HOME) +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds +# all following actions are performed in the context of the superdataset +cd ds +git remote add outputstore "$pushgitremote" +git checkout -b "${BRANCH}" +# ------------------------------------------------------------------------------ +# Do the run! +html=${subid}.html +datalad run \ + -i code/get_files.sh \ + -i inputs/data/${subid}_qsirecon*.zip \ + --explicit \ + -o ${subid} \ + -m "unzipped ${subid}" \ + "bash code/get_files.sh inputs/data/${subid}_qsirecon*.zip" +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore +echo TMPDIR TO DELETE +echo ${BRANCH} +datalad drop -r . 
--nocheck +datalad uninstall -r inputs/data +git annex dead here +cd ../.. +rm -rf $BRANCH +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +## the actual compute job specification +echo writing script to file... +cat > code/get_files.sh << "EOT" +#!/bin/bash +set -e -u -x +ZIP_FILE=$1 +subid=$(basename $ZIP_FILE | cut -d '_' -f 1) +# unzip outputs +unzip -n $ZIP_FILE 'qsirecon/*' -d . +# copy outputs out of qsirecon +cp -r qsirecon/* . +# remove unzip dir +rm -rf qsirecon +EOT + +chmod +x code/get_files.sh + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" + +for zip in ${ZIPS}; do + subject=`echo ${zip} | cut -d '_' -f 1` + echo "qsub -cwd ${env_flags} -N UNZIP${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject}" >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-unzip-xcp-multises.sh b/scripts/cubic/bootstrap-unzip-xcp-multises.sh new file mode 100644 index 0000000..11a444f --- /dev/null +++ b/scripts/cubic/bootstrap-unzip-xcp-multises.sh @@ -0,0 +1,223 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." 
+ echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/DERIVATIVES +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +mkdir -p $PROJECTROOT + +## DERIVATIVE_BOOTSTRAP_DIR will be the path to the bootstrap directory containing your derivatives +DERIVATIVE_BOOTSTRAP_DIR=$1 +DERIVATIVE_INPUT=ria+file://${DERIVATIVE_BOOTSTRAP_DIR}"/output_ria#~data" +if [[ -z ${DERIVATIVE_BOOTSTRAP_DIR} ]] +then + echo "Required argument is the path to the fmriprep bootstrap directory." + echo "This directory should contain analysis/, input_ria/ and output_ria/." + # exit 1 +fi + +cd ${PROJECTROOT} +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +cd $PROJECTROOT +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +datalad install -d . -r --source ${DERIVATIVE_INPUT} inputs/data + +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + +ZIPS=$(find inputs/data -name 'sub-*xcp*' | cut -d '/' -f 3 | sort) +if [ -z "${ZIPS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=25G +#$ -l tmpfree=200G +#$ -R y +#$ -l h_rt=24:00:00 +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +sesid="$4" +# change into the cluster-assigned temp directory. Not done by default in SGE +cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +# cd /cbica/comp_space/$(basename $HOME) +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}-${sesid}" +mkdir ${BRANCH} +cd ${BRANCH} + +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds +# all following actions are performed in the context of the superdataset +cd ds +git remote add outputstore "$pushgitremote" +git checkout -b "${BRANCH}" +# ------------------------------------------------------------------------------ +# Do the run! 
+ +html=${subid}_${sesid}.html + +datalad run \ + -i code/get_files.sh \ + -i inputs/data/${subid}_${sesid}_xcp*.zip \ + --explicit \ + -o ${subid} \ + -o ${html} \ + -m "unzipped ${subid}_${sesid}" \ + "bash code/get_files.sh inputs/data/${subid}_${sesid}_xcp*.zip" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore +echo TMPDIR TO DELETE +echo ${BRANCH} +datalad drop -r . --nocheck +datalad uninstall -r inputs/data +git annex dead here +cd ../.. +rm -rf $BRANCH +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +## the actual compute job specification +echo writing script to file... +cat > code/get_files.sh << "EOT" +#!/bin/bash +set -e -u -x +ZIP_FILE=$1 +subid=$(basename $ZIP_FILE | cut -d '_' -f 1) +sesid=$(basename $ZIP_FILE | cut -d '_' -f 2) +# unzip outputs +unzip -n $ZIP_FILE 'xcp_abcd/*' -d . +# rename html to include sesid +mv xcp_abcd/${subid}.html xcp_abcd/${subid}_${sesid}.html +# copy outputs out of xcp_abcd +cp -r xcp_abcd/* . +# remove unzip dir +rm -rf xcp_abcd +EOT + +chmod +x code/get_files.sh + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" + +for zip in ${ZIPS}; do + subject=`echo ${zip} | cut -d '_' -f 1` + session=`echo ${zip} | cut -d '_' -f 2` + echo "qsub -cwd ${env_flags} -N UNZIP${subject}_${session} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} ${session}" >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. 
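+# After these pushes, jobs are submitted from the analysis directory by running
+# the generated call file, e.g. (sketch):
+# bash code/qsub_calls.sh                # submit all subject/session jobs
+# $(tail -n 1 code/qsub_calls.sh)        # or run only the last call as a smoke test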
+datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS + diff --git a/scripts/cubic/bootstrap-unzip-xcp.sh b/scripts/cubic/bootstrap-unzip-xcp.sh new file mode 100644 index 0000000..b668dbc --- /dev/null +++ b/scripts/cubic/bootstrap-unzip-xcp.sh @@ -0,0 +1,215 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/XCP +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +mkdir -p $PROJECTROOT + +## DERIVATIVE_BOOTSTRAP_DIR will be the path to the bootstrap directory containing your derivatives +DERIVATIVE_BOOTSTRAP_DIR=$1 +DERIVATIVE_INPUT=ria+file://${DERIVATIVE_BOOTSTRAP_DIR}"/output_ria#~data" +if [[ -z ${DERIVATIVE_BOOTSTRAP_DIR} ]] +then + echo "Required argument is the path to the xcp bootstrap directory." + echo "This directory should contain analysis/, input_ria/ and output_ria/." + # exit 1 +fi + +cd ${PROJECTROOT} +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +cd $PROJECTROOT +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +datalad install -d . -r --source ${DERIVATIVE_INPUT} inputs/data + +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + +ZIPS=$(find inputs/data -name 'sub-*xcp*' | cut -d '/' -f 3 | sort) +if [ -z "${ZIPS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=25G +#$ -l tmpfree=200G +#$ -R y +#$ -l h_rt=24:00:00 +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +# change into the cluster-assigned temp directory. 
Not done by default in SGE
+cd ${CBICA_TMPDIR}
+# OR Run it on a shared network drive
+# cd /cbica/comp_space/$(basename $HOME)
+# Used for the branch names and the temp dir
+BRANCH="job-${JOB_ID}-${subid}"
+mkdir ${BRANCH}
+cd ${BRANCH}
+# get the analysis dataset, which includes the inputs as well
+# importantly, we do not clone from the location that we want to push the
+# results to, in order to avoid too many jobs blocking access to
+# the same location and creating a throughput bottleneck
+datalad clone "${dssource}" ds
+# all following actions are performed in the context of the superdataset
+cd ds
+git remote add outputstore "$pushgitremote"
+git checkout -b "${BRANCH}"
+# ------------------------------------------------------------------------------
+# Do the run!
+html=${subid}.html
+datalad run \
+ -i code/get_files.sh \
+ -i inputs/data/${subid}_xcp*.zip \
+ --explicit \
+ -o ${subid} \
+ -m "unzipped ${subid}" \
+ "bash code/get_files.sh inputs/data/${subid}_xcp*.zip"
+# file content first -- does not need a lock, no interaction with Git
+datalad push --to output-storage
+# and the output branch
+flock $DSLOCKFILE git push outputstore
+echo TMPDIR TO DELETE
+echo ${BRANCH}
+datalad drop -r . --nocheck
+datalad uninstall -r inputs/data
+git annex dead here
+cd ../..
+rm -rf $BRANCH
+echo SUCCESS
+# job handler should clean up workspace
+EOT
+
+chmod +x code/participant_job.sh
+
+## the actual compute job specification
+echo writing script to file...
+cat > code/get_files.sh << "EOT"
+#!/bin/bash
+set -e -u -x
+ZIP_FILE=$1
+subid=$(basename $ZIP_FILE | cut -d '_' -f 1)
+
+# unzip outputs
+unzip -n $ZIP_FILE 'xcp_abcd/*' -d .
+
+# copy outputs out of xcp_abcd
+cp -r xcp_abcd/* .
+
+# remove unzip dir
+rm -rf xcp_abcd
+EOT
+
+chmod +x code/get_files.sh
+
+mkdir logs
+echo .SGE_datalad_lock >> .gitignore
+echo logs >> .gitignore
+
+datalad save -m "Participant compute job implementation"
+
+# Add a script for merging outputs
+MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh
+cat > code/merge_outputs.sh << "EOT"
+#!/bin/bash
+set -e -u -x
+EOT
+echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \
+ >> code/merge_outputs.sh
+echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh
+wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh
+
+################################################################################
+# SGE SETUP START - remove or adjust to your needs
+################################################################################
+env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock"
+
+echo '#!/bin/bash' > code/qsub_calls.sh
+dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)"
+pushgitremote=$(git remote get-url --push output)
+eo_args="-e ${PWD}/logs -o ${PWD}/logs"
+
+for zip in ${ZIPS}; do
+ subject=`echo ${zip} | cut -d '_' -f 1`
+ echo "qsub -cwd ${env_flags} -N UNZIP${subject} ${eo_args} \
+ ${PWD}/code/participant_job.sh \
+ ${dssource} ${pushgitremote} ${subject}" >> code/qsub_calls.sh
+done
+datalad save -m "SGE submission setup" code/ .gitignore
+
+################################################################################
+# SGE SETUP END
+################################################################################
+
+# cleanup - we have generated the job definitions, we do not need to keep a
+# massive input dataset around.
Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-xcp-audit.sh b/scripts/cubic/bootstrap-xcp-audit.sh new file mode 100644 index 0000000..76b59f7 --- /dev/null +++ b/scripts/cubic/bootstrap-xcp-audit.sh @@ -0,0 +1,343 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/xcp-audit +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +XCP_BOOTSTRAP_DIR=$1 +XCP_INPUT=ria+file://${XCP_BOOTSTRAP_DIR}"/output_ria#~data" +if [[ -z ${XCP_BOOTSTRAP_DIR} ]] +then + echo "Required argument is the path to the xcp bootstrap directory." + echo "This directory should contain analysis/, input_ria/ and output_ria/." + # exit 1 +fi + +# Is it a directory on the filesystem? +XCP_INPUT_METHOD=clone +if [[ ! -d "${XCP_BOOTSTRAP_DIR}/output_ria/alias/data" ]] +then + echo "There must be alias in the output ria store that points to the" + echo "XCP output dataset" + # exit 1 +fi + +# Check that there are some xcp zip files present in the input +# If you only need freesurfer, comment this out +# XCP_ZIPS=$(cd ${XCP_INPUT} && ls *xcp*.zip) +# if [[ -z "${XCP_ZIPS}" ]]; then +# echo No xcp zip files found in ${XCP_INPUT} +# exit 1 +# fi + +# Check that freesurfer data exists. If you only need xcp zips, comment +# this out +# FREESURFER_ZIPS=$(cd ${XCP_INPUT} && ls *freesurfer*.zip) +# if [[ -z "${FREESURFER_ZIPS}" ]]; then +# echo No freesurfer zip files found in ${XCP_INPUT} +# exit 1 +# fi + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + + +# Create a dataset with the logs in it +mkdir xcp_logs +cd xcp_logs +datalad create -D "Logs from the xcp runs" +cp ${XCP_BOOTSTRAP_DIR}/analysis/logs/* . +datalad save -m "add logs" + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +cd $PROJECTROOT +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. 
Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +datalad install -d . -r --source ${XCP_INPUT} inputs/data +datalad install -d . -r --source ${PROJECTROOT}/xcp_logs inputs/xcp_logs + +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + +SUBJECTS=$(find inputs/data/inputs/data/inputs/data -type d -name 'sub-*' | cut -d '@' -f 5| sed 's|^inputs/data/inputs/data/inputs/data/||') +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=5G +#$ -l s_vmem=3.5G +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` + +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x + +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" + +# change into the cluster-assigned temp directory. Not done by default in SGE +cd ${CBICA_TMPDIR} + +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} +datalad clone "${dssource}" ds +cd ds +git remote add outputstore "$pushgitremote" +git checkout -b "${BRANCH}" + +# ------------------------------------------------------------------------------ +# Do the run! +BIDS_DIR=${PWD}/inputs/data/inputs/data +ZIPS_DIR=${PWD}/inputs/data +ERROR_DIR=${PWD}/inputs/xcp_logs +CSV_DIR=csvs +mkdir ${CSV_DIR} +output_file=${CSV_DIR}/${subid}_xcp_audit.csv + +datalad get -n inputs/data + +INPUT_ZIP=$(ls inputs/data/${subid}_xcp*.zip | cut -d '@' -f 1 || true) +if [ ! -z "${INPUT_ZIP}" ]; then + INPUT_ZIP="-i ${INPUT_ZIP}" +fi + +echo DATALAD RUN INPUT +echo ${INPUT_ZIP} + +datalad run \ + -i code/bootstrap_zip_audit.py \ + ${INPUT_ZIP} \ + -i inputs/xcp_logs/*${subid}* \ + --explicit \ + -o ${output_file} \ + -m "xcp-audit ${subid}" \ + "python code/bootstrap_zip_audit.py ${subid} ${BIDS_DIR} ${ZIPS_DIR} ${ERROR_DIR} ${output_file} xcp" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore + +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad drop -r . --nocheck +datalad uninstall -r inputs/data +git annex dead here +cd ../.. +rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +# Sydney, please wget your audit script here! 
+wget https://raw.githubusercontent.com/PennLINC/RBC/master/PennLINC/Generic/bootstrap_zip_audit.py +mv bootstrap_zip_audit.py code/ +chmod +x code/bootstrap_zip_audit.py + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Add logs from XCP runs" + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT + +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh + +cat >> code/merge_outputs.sh << "EOT" +datalad clone ${outputsource} merge_ds +cd merge_ds +NBRANCHES=$(git branch -a | grep job- | sort | wc -l) +echo "Found $NBRANCHES branches to merge" + +gitref=$(git show-ref master | cut -d ' ' -f1 | head -n 1) + +# query all branches for the most recent commit and check if it is identical. +# Write all branch identifiers for jobs without outputs into a file. +for i in $(git branch -a | grep job- | sort); do [ x"$(git show-ref $i \ + | cut -d ' ' -f1)" = x"${gitref}" ] && \ + echo $i; done | tee code/noresults.txt | wc -l + + +for i in $(git branch -a | grep job- | sort); \ + do [ x"$(git show-ref $i \ + | cut -d ' ' -f1)" != x"${gitref}" ] && \ + echo $i; \ +done | tee code/has_results.txt + +mkdir -p code/merge_batches +num_branches=$(wc -l < code/has_results.txt) +CHUNKSIZE=5000 + +set +e +num_chunks=$(expr ${num_branches} / ${CHUNKSIZE}) +[[ $num_chunks == 0 ]] && num_chunks=1 +set -e -x + +for chunknum in $(seq 1 $num_chunks) +do + startnum=$(expr $(expr ${chunknum} - 1) \* ${CHUNKSIZE} + 1) + endnum=$(expr ${chunknum} \* ${CHUNKSIZE}) + batch_file=code/merge_branches_$(printf %04d ${chunknum}).txt + [[ ${num_branches} -lt ${endnum} ]] && endnum=${num_branches} + branches=$(sed -n "${startnum},${endnum}p;$(expr ${endnum} + 1)q" code/has_results.txt) + echo ${branches} > ${batch_file} + git merge -m "XCP results batch ${chunknum}/${num_chunks}" $(cat ${batch_file}) + +done + +# Push the merge back +git push + +# Get the file availability info +git annex fsck --fast -f output-storage + +# This should not print anything +MISSING=$(git annex find --not --in output-storage) + +if [[ ! -z "$MISSING" ]] +then + echo Unable to find data for $MISSING + exit 1 +fi + +# stop tracking this branch +git annex dead here +datalad push --data nothing +echo SUCCESS + +EOT + +##### concat_outputs.sh START #### + +cat > code/concat_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT + +echo "PROJECT_ROOT=${PROJECTROOT}" >> code/concat_outputs.sh +echo "cd ${PROJECTROOT}" >> code/concat_outputs.sh + +cat >> code/concat_outputs.sh << "EOT" +# set up concat_ds and run concatenator on it +cd ${CBICA_TMPDIR} +datalad clone ria+file://${PROJECT_ROOT}/output_ria#~data concat_ds +cd concat_ds/code +wget https://raw.githubusercontent.com/PennLINC/RBC/master/PennLINC/Generic/concatenator.py +cd .. +datalad save -m "added concatenator script" +datalad run -i 'csvs/*' -o '${PROJECT_ROOT}/XCP_AUDIT.csv' --expand inputs --explicit "python code/concatenator.py csvs ${PROJECT_ROOT}/XCP_AUDIT.csv" +datalad save -m "generated report" +# push changes +datalad push +# remove concat_ds +git annex dead here +cd .. 
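+# git-annex keeps annexed content read-only, so the clone has to be made
+# writable before it can be deleted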
+chmod +w -R concat_ds +rm -rf concat_ds +echo SUCCESS +EOT + +#### concat_output.sh END #### + + +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N fp${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-xcp-multises.sh b/scripts/cubic/bootstrap-xcp-multises.sh new file mode 100644 index 0000000..4d2fbf2 --- /dev/null +++ b/scripts/cubic/bootstrap-xcp-multises.sh @@ -0,0 +1,279 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/xcp-multises +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + + +## fmriprep input +#e.g. FMRIPREP_BOOTSTRAP_DIR=~/testing/hrc_exemplars/fmriprep-multises + +FMRIPREP_BOOTSTRAP_DIR=$1 +FMRIPREP_INPUT=ria+file://${FMRIPREP_BOOTSTRAP_DIR}"/output_ria#~data" +if [[ -z ${FMRIPREP_BOOTSTRAP_DIR} ]] +then + echo "Required argument is the path to the fmriprep bootstrap directory." + echo "This directory should contain analysis/, input_ria/ and output_ria/." + # exit 1 +fi + +# Is it a directory on the filesystem? +FMRIPREP_INPUT_METHOD=clone +if [[ ! 
-d "${FMRIPREP_BOOTSTRAP_DIR}/output_ria/alias/data" ]] +then + echo "There must be alias in the output ria store that points to the" + echo "fmriprep output dataset" + # exit 1 +fi + + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +# register the input dataset +echo "Cloning input dataset into analysis dataset" +datalad clone -d . ria+file://${FMRIPREP_INPUT}/output_ria#~data inputs/data +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + +ZIPS=$(find inputs/data -name 'sub-*fmriprep*' | cut -d '/' -f 3 | sort) +if [ -z "${ZIPS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + + +## Add the containers as a subdataset +cd ${PROJECTROOT} + +# Clone the containers dataset. If specified on the command, use that path +CONTAINERDS=$2 +if [[ ! -z "${CONTAINERDS}" ]]; then + datalad clone ${CONTAINERDS} pennlinc-containers +else + echo "No containers dataset specified, attempting to clone from pmacs" + datalad clone \ + ria+ssh://sciget.pmacs.upenn.edu:/project/bbl_projects/containers#~pennlinc-containers \ + pennlinc-containers +fi + +cd ${PROJECTROOT}/analysis +datalad install -d . --source ${PROJECTROOT}/pennlinc-containers + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=25G +#$ -l tmpfree=200G +#$ -R y +#$ -l h_rt=24:00:00 +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +sesid="$4" +# change into the cluster-assigned temp directory. 
Not done by default in SGE
+cd ${CBICA_TMPDIR}
+# OR Run it on a shared network drive
+# cd /cbica/comp_space/$(basename $HOME)
+# Used for the branch names and the temp dir
+BRANCH="job-${JOB_ID}-${subid}-${sesid}"
+mkdir ${BRANCH}
+cd ${BRANCH}
+# get the analysis dataset, which includes the inputs as well
+# importantly, we do not clone from the location that we want to push the
+# results to, in order to avoid too many jobs blocking access to
+# the same location and creating a throughput bottleneck
+datalad clone "${dssource}" ds
+# all following actions are performed in the context of the superdataset
+cd ds
+
+# in order to avoid accumulating temporary git-annex availability information
+# and to avoid a synchronization bottleneck by having to consolidate the
+# git-annex branch across jobs, we will only push the main tracking branch
+# back to the output store (plus the actual file content). Final availability
+# information can be established via an eventual `git-annex fsck -f output-storage`.
+# this remote is never fetched, it accumulates a large number of branches
+# and we want to avoid progressive slowdown. Instead we only ever push
+# a unique branch for each job (subject AND process specific name)
+git remote add outputstore "$pushgitremote"
+
+# all results of this job will be put into a dedicated branch
+git checkout -b "${BRANCH}"
+
+# we pull down the input subject manually in order to discover relevant
+# files. We do this outside the recorded call, because on a potential
+# re-run we want to be able to do fine-grained recomputing of individual
+# outputs. The recorded calls will have specific paths that will enable
+# recomputation outside the scope of the original setup
+
+# ------------------------------------------------------------------------------
+# Do the run!
+
+datalad get -r pennlinc-containers
+
+datalad run \
+ -i code/xcp_zip.sh \
+ -i inputs/data/${subid}_${sesid}_fmriprep*.zip \
+ --explicit \
+ -o ${subid}_${sesid}_xcp-0-0-4.zip \
+ -m "xcp-abcd-run ${subid} ${sesid}" \
+ "bash ./code/xcp_zip.sh ${subid} ${sesid}"
+# file content first -- does not need a lock, no interaction with Git
+datalad push --to output-storage
+# and the output branch
+flock $DSLOCKFILE git push outputstore
+echo TMPDIR TO DELETE
+echo ${BRANCH}
+datalad uninstall -r --nocheck --if-dirty ignore inputs/data
+datalad drop -r . --nocheck
+git annex dead here
+cd ../..
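+# back in the job's temporary directory; the ephemeral clone lives entirely
+# inside ${BRANCH}, so deleting that directory discards the whole workspace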
+rm -rf $BRANCH
+echo SUCCESS
+# job handler should clean up workspace
+EOT
+
+chmod +x code/participant_job.sh
+
+cat > code/xcp_zip.sh << "EOT"
+#!/bin/bash
+set -e -u -x
+subid="$1"
+sesid="$2"
+wd=${PWD}
+
+cd inputs/data
+7z x ${subid}_${sesid}_fmriprep-20.2.3.zip
+cd $wd
+
+mkdir -p ${PWD}/.git/tmp/wkdir
+singularity run --cleanenv -B ${PWD} pennlinc-containers/.datalad/environments/xcp-abcd-0-0-4/image inputs/data/fmriprep xcp participant \
+--despike --lower-bpf 0.01 --upper-bpf 0.08 --participant_label $subid -p 36P -f 10 -w ${PWD}/.git/tmp/wkdir
+singularity run --cleanenv -B ${PWD} pennlinc-containers/.datalad/environments/xcp-abcd-0-0-4/image inputs/data/fmriprep xcp participant \
+--despike --lower-bpf 0.01 --upper-bpf 0.08 --participant_label $subid -p 36P -f 10 -w ${PWD}/.git/tmp/wkdir --cifti
+cd xcp
+7z a ../${subid}_${sesid}_xcp-0-0-4.zip xcp_abcd
+rm -rf prep .git/tmp/wkdir
+
+EOT
+
+chmod +x code/xcp_zip.sh
+cp ${FREESURFER_HOME}/license.txt code/license.txt
+
+mkdir logs
+echo .SGE_datalad_lock >> .gitignore
+echo logs >> .gitignore
+
+datalad save -m "Participant compute job implementation"
+
+# Add a script for merging outputs
+MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh
+cat > code/merge_outputs.sh << "EOT"
+#!/bin/bash
+set -e -u -x
+EOT
+echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \
+ >> code/merge_outputs.sh
+echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh
+wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh
+
+################################################################################
+# SGE SETUP START - remove or adjust to your needs
+################################################################################
+env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock"
+
+echo '#!/bin/bash' > code/qsub_calls.sh
+dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)"
+pushgitremote=$(git remote get-url --push output)
+eo_args="-e ${PWD}/logs -o ${PWD}/logs"
+
+
+for zip in ${ZIPS}; do
+ subject=`echo ${zip} | cut -d '_' -f 1`
+ session=`echo ${zip} | cut -d '_' -f 2`
+ echo "qsub -cwd ${env_flags} -N xcp${subject}_${session} ${eo_args} \
+ ${PWD}/code/participant_job.sh \
+ ${dssource} ${pushgitremote} ${subject} ${session}" >> code/qsub_calls.sh
+done
+datalad save -m "SGE submission setup" code/ .gitignore
+
+################################################################################
+# SGE SETUP END
+################################################################################
+
+# cleanup - we have generated the job definitions, we do not need to keep a
+# massive input dataset around. Having it around wastes resources and makes many
+# git operations needlessly slow
+datalad uninstall -r --nocheck inputs/data
+
+
+# make sure the fully configured output dataset is available from the designated
+# store for initial cloning and pushing the results.
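+# When the jobs are done, each result sits on its own job-* branch in the output
+# store; the merge script assembled above folds those branches back into the main
+# branch (the audit bootstrap earlier in this repo shows the same merge-and-fsck
+# pattern), e.g. (sketch):
+# bash code/merge_outputs.sh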
+datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/cubic/bootstrap-xcp.sh b/scripts/cubic/bootstrap-xcp.sh new file mode 100644 index 0000000..6ce0320 --- /dev/null +++ b/scripts/cubic/bootstrap-xcp.sh @@ -0,0 +1,298 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/xcp +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + + +FMRIPREP_BOOTSTRAP_DIR=$1 +FMRIPREP_INPUT=ria+file://${FMRIPREP_BOOTSTRAP_DIR}"/output_ria#~data" +if [[ -z ${FMRIPREP_BOOTSTRAP_DIR} ]] +then + echo "Required argument is the path to the fmriprep bootstrap directory." + echo "This directory should contain analysis/, input_ria/ and output_ria/." + # exit 1 +fi + +# Is it a directory on the filesystem? +FMRIPREP_INPUT_METHOD=clone +if [[ ! -d "${FMRIPREP_BOOTSTRAP_DIR}/output_ria/alias/data" ]] +then + echo "There must be alias in the output ria store that points to the" + echo "fmriprep output dataset" + # exit 1 +fi + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +# register the input dataset +if [[ "${FMRIPREP_INPUT_METHOD}" == "clone" ]] +then + echo "Cloning input dataset into analysis dataset" + datalad clone -d . 
${FMRIPREP_INPUT} inputs/data
+ # amend the previous commit with a nicer commit message
+ git commit --amend -m 'Register input data dataset as a subdataset'
+else
+ echo "WARNING: copying input data into repository"
+ mkdir -p inputs/data
+ cp -r ${FMRIPREP_INPUT}/* inputs/data
+ datalad save -r -m "added input data"
+fi
+
+SUBJECTS=$(find inputs/data -name '*.zip' | cut -d '/' -f 3 | cut -d '_' -f 1 | sort | uniq)
+if [ -z "${SUBJECTS}" ]
+then
+ echo "No subjects found in input data"
+ # exit 1
+fi
+
+set +u
+CONTAINERDS=$2
+set -u
+#if [[ ! -z "${CONTAINERDS}" ]]; then
+cd ${PROJECTROOT}
+datalad clone ${CONTAINERDS} pennlinc-containers
+## Add the containers as a subdataset
+cd pennlinc-containers
+datalad get -r .
+
+cd ${PROJECTROOT}/analysis
+datalad install -d . --source ${PROJECTROOT}/pennlinc-containers
+
+## the actual compute job specification
+cat > code/participant_job.sh << "EOT"
+#!/bin/bash
+#$ -S /bin/bash
+#$ -l h_vmem=32G
+#$ -l tmpfree=100G
+#$ -R y
+#$ -l h_rt=24:00:00
+# Set up the correct conda environment
+source ${CONDA_PREFIX}/bin/activate base
+echo I\'m in $PWD using `which python`
+
+# fail whenever something is fishy, use -x to get verbose logfiles
+set -e -u -x
+
+# Set up the remotes and get the subject id from the call
+dssource="$1"
+pushgitremote="$2"
+subid="$3"
+
+# change into the cluster-assigned temp directory. Not done by default in SGE
+cd ${CBICA_TMPDIR}
+# OR Run it on a shared network drive
+# cd /cbica/comp_space/$(basename $HOME)
+
+# Used for the branch names and the temp dir
+BRANCH="job-${JOB_ID}-${subid}"
+mkdir ${BRANCH}
+cd ${BRANCH}
+
+# get the analysis dataset, which includes the inputs as well
+# importantly, we do not clone from the location that we want to push the
+# results to, in order to avoid too many jobs blocking access to
+# the same location and creating a throughput bottleneck
+datalad clone "${dssource}" ds
+
+# all following actions are performed in the context of the superdataset
+cd ds
+
+# in order to avoid accumulating temporary git-annex availability information
+# and to avoid a synchronization bottleneck by having to consolidate the
+# git-annex branch across jobs, we will only push the main tracking branch
+# back to the output store (plus the actual file content). Final availability
+# information can be established via an eventual `git-annex fsck -f output-storage`.
+# this remote is never fetched, it accumulates a large number of branches
+# and we want to avoid progressive slowdown. Instead we only ever push
+# a unique branch for each job (subject AND process specific name)
+git remote add outputstore "$pushgitremote"
+
+# all results of this job will be put into a dedicated branch
+git checkout -b "${BRANCH}"
+
+# we pull down the input subject manually in order to discover relevant
+# files. We do this outside the recorded call, because on a potential
+# re-run we want to be able to do fine-grained recomputing of individual
+# outputs. The recorded calls will have specific paths that will enable
+# recomputation outside the scope of the original setup
+
+# ------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
+# Do the run!
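+# the container dataset is a subdataset whose singularity image is annexed, so
+# its content has to be fetched before the recorded call can run it; a quick
+# check that the expected image is in place (sketch, path as used in
+# code/xcp_zip.sh below):
+# test -e pennlinc-containers/.datalad/environments/xcp-abcd-0-0-8/image || echo "xcp-abcd image missing" >&2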
+ +datalad get -r pennlinc-containers + +datalad run \ + -i code/xcp_zip.sh \ + -i inputs/data/${subid}*fmriprep*.zip \ + --explicit \ + -o ${subid}_xcp-0-0-8.zip \ + -m "xcp-abcd-run ${subid}" \ + "bash ./code/xcp_zip.sh ${subid}" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore +git annex dead here + +# remove tempdir +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad uninstall -r --nocheck --if-dirty ignore inputs/data +datalad drop -r . --nocheck +git annex dead here +cd ../.. +rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + + +cat > code/xcp_zip.sh << "EOT" +#!/bin/bash +set -e -u -x + +subid="$1" +wd=${PWD} + +cd inputs/data +7z x ${subid}_fmriprep-20.2.3.zip +cd $wd + +mkdir -p ${PWD}/.git/tmp/wdir +singularity run --cleanenv -B ${PWD} pennlinc-containers/.datalad/environments/xcp-abcd-0-0-8/image inputs/data/fmriprep xcp participant \ +--despike --lower-bpf 0.01 --upper-bpf 0.08 --participant_label $subid -p 36P -f 10 -w ${PWD}/.git/tmp/wkdir +singularity run --cleanenv -B ${PWD} pennlinc-containers/.datalad/environments/xcp-abcd-0-0-8/image inputs/data/fmriprep xcp participant \ +--despike --lower-bpf 0.01 --upper-bpf 0.08 --participant_label $subid -p 36P -f 10 -w ${PWD}/.git/tmp/wkdir --cifti +cd xcp +7z a ../${subid}_xcp-0-0-8.zip xcp_abcd +rm -rf prep .git/tmp/wkdir +EOT + +chmod +x code/xcp_zip.sh +cp ${FREESURFER_HOME}/license.txt code/license.txt + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + + + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N xcp${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +if [ "${FMRIPREP_INPUT_METHOD}" = "clone" ] +then + datalad uninstall -r --nocheck inputs/data +fi + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. 
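+# As a hedged example (not executed by this script), once the jobs have run
+# and code/merge_outputs.sh has merged the per-subject branches, the results
+# can be retrieved from the store pushed below via its "data" alias:
+#   datalad clone "ria+file://${PROJECTROOT}/output_ria#~data" xcp_results
+#   cd xcp_results && datalad get sub-XXXX_xcp-0-0-8.zip   # sub-XXXX is a placeholder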
+datalad push --to input +datalad push --to output + + + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS + +#run last sge call to test +#$(tail -n 1 code/qsub_calls.sh) + diff --git a/scripts/cubic/bootstrap_matrices.sh b/scripts/cubic/bootstrap_matrices.sh new file mode 100644 index 0000000..212cff5 --- /dev/null +++ b/scripts/cubic/bootstrap_matrices.sh @@ -0,0 +1,259 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/matrices +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +XCP_BOOTSTRAP_DIR=${PWD}/xcp +XCP_INPUT=ria+file://${XCP_BOOTSTRAP_DIR}"/output_ria#~data" +if [[ -z ${XCP_BOOTSTRAP_DIR} ]] +then + echo "Required argument is the path to the xcp bootstrap directory." + echo "This directory should contain analysis/, input_ria/ and output_ria/." + # exit 1 +fi + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + + +# register the input dataset + +echo "Cloning input dataset into analysis dataset" +datalad clone -d . 
${XCP_INPUT} inputs/data +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + + +SUBJECTS=$(find inputs/data -name '*.zip' | cut -d '/' -f 3 | cut -d '_' -f 1 | sort | uniq) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +cd ${PROJECTROOT}/analysis + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=12G +#$ -l s_vmem=12G +#$ -l tmpfree=10G +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +# change into the cluster-assigned temp directory. Not done by default in SGE +cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +# cd /cbica/comp_space/$(basename $HOME) +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds +# all following actions are performed in the context of the superdataset +cd ds +# in order to avoid accumulation temporary git-annex availability information +# and to avoid a syncronization bottleneck by having to consolidate the +# git-annex branch across jobs, we will only push the main tracking branch +# back to the output store (plus the actual file content). Final availability +# information can be establish via an eventual `git-annex fsck -f joc-storage`. +# this remote is never fetched, it accumulates a larger number of branches +# and we want to avoid progressive slowdown. Instead we only ever push +# a unique branch per each job (subject AND process specific name) +git remote add outputstore "$pushgitremote" +# all results of this job will be put into a dedicated branch +git checkout -b "${BRANCH}" +# we pull down the input subject manually in order to discover relevant +# files. We do this outside the recorded call, because on a potential +# re-run we want to be able to do fine-grained recomputing of individual +# outputs. The recorded calls will have specific paths that will enable +# recomputation outside the scope of the original setup +# ------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ +# Do the run! +datalad run \ + -i code/make_matrices.py \ + -i inputs/data/${subid}*xcp*.zip \ + --explicit \ + -o ${subid}_matrices.zip \ + -m "make_matrix ${subid}" \ + "python code/make_matrices.py ${subid}" +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore +git annex dead here + +# remove tempdir +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad drop -r . --nocheck +datalad uninstall -r inputs/data +git annex dead here +cd ../.. 
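+# at this point the job-specific branch and the zipped matrices have been
+# pushed to the output store, so the ephemeral workspace can be removed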
+rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace + +EOT + +chmod +x code/participant_job.sh + +cat > code/make_matrices.py << "EOT" +#!/usr/bin/env python + +import os +import glob +import sys + +subid = sys.argv[1] + +subzip = 'inputs/data/{0}_xcp-0-0-1.zip'.format(subid) +os.system('7z x {0}'.format(subzip)) +os.mkdir('matrices') + +matrices = glob.glob('xcp_abcd/{0}/*ses*/func/*fsLR_atlas*.pconn.nii*'.format(subid)) +for m in matrices: + cmd = 'mv {0} matrices/'.format(m) + os.system(cmd) + +matrices = glob.glob('xcp_abcd/{0}/*ses*/func/*fsLR_atlas*.ptseries.nii*'.format(subid)) +for m in matrices: + cmd = 'mv {0} matrices/'.format(m) + os.system(cmd) + +qcs = glob.glob('xcp_abcd/{0}/*ses*/func/**fsLR_desc-qc_bold**'.format(subid)) +for qc in qcs: + cmd = 'mv {0} matrices/'.format(qc) + os.system(cmd) + +cmd = 'zip -r {0}_matrices.zip matrices/'.format(subid) +os.system(cmd) + +EOT + +chmod +x code/make_matrices.py + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ + +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N matrix${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. 
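+# As a hedged example (not run here), after merging the job branches the
+# per-subject matrix zips can be consumed from this store in the same way
+# scripts/cubic/bootstrap_matrix_concat.sh does:
+#   datalad clone "ria+file://${PROJECTROOT}/output_ria#~data" concat_ds
+#   datalad get concat_ds/sub-XXXX_matrices.zip   # sub-XXXX is a placeholder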
+datalad push --to input +datalad push --to output + + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +echo $RIA_DIR +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS + +#run last sge call to test +#$(tail -n 1 code/qsub_calls.sh) diff --git a/scripts/cubic/bootstrap_matrix_concat.sh b/scripts/cubic/bootstrap_matrix_concat.sh new file mode 100644 index 0000000..b023bb1 --- /dev/null +++ b/scripts/cubic/bootstrap_matrix_concat.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +DATA=$(pwd) +PROJECTROOT=$DATA/fcon +mkdir $PROJECTROOT +cd $PROJECTROOT + +datalad create -d . + +datalad clone ria+file://$DATA/matrices/output_ria#~data concat_ds + +wget https://raw.githubusercontent.com/PennLINC/RBC/master/PennLINC/Generic/matrix_concatenator.py + +datalad save -m "added matrix concatenator script" +datalad run \ + -m "generated concatenated matrices" \ + -i 'concat_ds/*matrices.zip*' \ + -o 'group_matrices.zip' \ + --expand inputs \ + --explicit \ + "python matrix_concatenator.py" + +# remove concat_ds +datalad uninstall concat_ds + +echo SUCCESS diff --git a/scripts/cubic/bootstrap_outputs.sh b/scripts/cubic/bootstrap_outputs.sh new file mode 100644 index 0000000..414c85a --- /dev/null +++ b/scripts/cubic/bootstrap_outputs.sh @@ -0,0 +1,265 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +# Arguments: +# 1. xcp bootstrap directory, eg for PNC: ria+file:///cbica/projects/RBC/production/PNC/xcp/output_ria#~data + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +## XCP input +# XCPINPUT=$1 +XCPINPUT=/cbica/projects/RBC/production/PNC/xcp/ +if [[ -z ${XCPINPUT} ]] +then + echo "Required argument is an identifier of the XCP output zips" + # exit 1 +fi + +if [[ ! -d "${XCPINPUT}/output_ria/alias/data" ]] +then + echo "There must be alias in the output ria store that points to the" + echo "XCP output dataset" + # exit 1 +fi + + +set -e -u +# set +e +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/xcp_derivatives +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. 
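+# Note on this layout: `create-sibling-ria -s output` also creates a storage
+# special remote named "output-storage", which is where the compute jobs push
+# annexed file content; the input sibling is created with
+# `--storage-sibling off`, so cloning from it only transfers the lightweight
+# git history.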
+datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +echo "Cloning input dataset into analysis dataset" +datalad clone -d . ria+file://${XCPINPUT}/output_ria#~data inputs/data +git commit --amend -m 'Register preprocessed dataset as a subdataset' + + +SUBJECTS=$(find inputs/data -name '*.zip' | cut -d '/' -f 3 | cut -d '_' -f 1 | sort | uniq) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +cd ${PROJECTROOT}/analysis +mkdir outputs +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#$ -S /bin/bash +#$ -l h_vmem=8G +#$ -l s_vmem=8G +#$ -l tmpfree=50G +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` + +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x + +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" + +# change into the cluster-assigned temp directory. Not done by default in SGE +# cd ${CBICA_TMPDIR} +# OR Run it on a shared network drive +cd /cbica/comp_space/$(basename $HOME) + +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} + +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds + +# all following actions are performed in the context of the superdataset +cd ds +mkdir outputs +# in order to avoid accumulation temporary git-annex availability information +# and to avoid a syncronization bottleneck by having to consolidate the +# git-annex branch across jobs, we will only push the main tracking branch +# back to the output store (plus the actual file content). Final availability +# information can be establish via an eventual `git-annex fsck -f joc-storage`. +# this remote is never fetched, it accumulates a larger number of branches +# and we want to avoid progressive slowdown. Instead we only ever push +# a unique branch per each job (subject AND process specific name) +git remote add outputstore "$pushgitremote" + +# all results of this job will be put into a dedicated branch +git checkout -b "${BRANCH}" + +# we pull down the input subject manually in order to discover relevant +# files. We do this outside the recorded call, because on a potential +# re-run we want to be able to do fine-grained recomputing of individual +# outputs. The recorded calls will have specific paths that will enable +# recomputation outside the scope of the original setup + +# ------------------------------------------------------------------------------ +# ------------------------------------------------------------------------------ +# Do the run! + +datalad get -n inputs/data + +datalad run \ + -i code/unpack.sh \ + -i inputs/data/${subid}*xcp*.zip \ + --explicit \ + -o outputs \ + -m "unpack ${subid}" \ + "bash ./code/unpack.sh ${subid}" + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore +git annex dead here + +# remove tempdir +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad drop -r . 
--nocheck +datalad uninstall -r inputs/data +git annex dead here +cd ../.. +rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace + +EOT + +chmod +x code/participant_job.sh + + +cat > code/unpack.sh << "EOT" +#!/bin/bash +set -e -u -x + +subid="$1" +wd=${PWD} + +7z x inputs/data/${subid}_xcp-0-0-4.zip +mv xcp_abcd/** outputs/ +cd $wd + +EOT + +chmod +x code/unpack.sh + +mkdir logs +echo .SGE_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + + + +################################################################################ +# SGE SETUP START - remove or adjust to your needs +################################################################################ +env_flags="-v DSLOCKFILE=${PWD}/.SGE_datalad_lock" + +echo '#!/bin/bash' > code/qsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "qsub -cwd ${env_flags} -N unpack${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} " >> code/qsub_calls.sh +done +datalad save -m "SGE submission setup" code/ .gitignore + +################################################################################ +# SGE SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. 
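+# As a hedged example (not run here), after code/merge_outputs.sh has merged
+# the job branches, content availability can be double-checked from within
+# the merge_ds clone it creates, e.g.:
+#   git annex find --not --in output-storage   # should print nothing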
+datalad push --to input
+datalad push --to output
+
+# Add an alias to the data in the RIA store
+RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1)
+mkdir -p ${PROJECTROOT}/output_ria/alias
+ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data
+
+# if we get here, we are happy
+echo SUCCESS
+
+#run last sge call to test
+#$(tail -n 1 code/qsub_calls.sh)
 diff --git a/scripts/cubic/concat_outputs.sh b/scripts/cubic/concat_outputs.sh new file mode 100644 index 0000000..e2dd272 --- /dev/null +++ b/scripts/cubic/concat_outputs.sh @@ -0,0 +1,30 @@
+#!/bin/bash
+
+# create alias
+mkdir -p ~/production/PNC/fmriprep-audit/output_ria/alias
+ln -s ~/production/PNC/fmriprep-audit/output_ria/adb/be4a6-4a9f-402a-92ea-e4cfd624b974 ~/production/PNC/fmriprep-audit/output_ria/alias/data
+
+# set up concat_ds and run the concatenator on it
+cd ~/testing
+datalad clone ria+file:///cbica/projects/RBC/production/PNC/fmriprep-audit/output_ria#~data concat_ds
+cd concat_ds/code
+wget https://raw.githubusercontent.com/PennLINC/RBC/master/PennLINC/Generic/concatenator.py
+cd ~/testing/concat_ds
+datalad save -m "added concatenator script"
+datalad run -i 'csvs/*' -o '~/testing/concat_ds/group_report.csv' --expand inputs --explicit "python code/concatenator.py ~/testing/concat_ds/csvs ~/testing/concat_ds/group_report.csv"
+
+# copy the report to a directory that isn't getting deleted
+cp ~/testing/concat_ds/group_report.csv ~/testing/
+
+datalad save -m "generated report"
+# push changes
+datalad push
+
+# remove concat_ds
+git annex dead here
+cd ~/testing
+chmod +w -R concat_ds
+rm -rf concat_ds
+
+echo SUCCESS
 diff --git a/scripts/cubic/cubic-setup-project-user.sh b/scripts/cubic/cubic-setup-project-user.sh new file mode 100644 index 0000000..c4a1469 --- /dev/null +++ b/scripts/cubic/cubic-setup-project-user.sh @@ -0,0 +1,47 @@
+#!/bin/bash
+
+# This script installs conda and datalad into the home
+# directory of a CUBIC user.
+
+# USAGE:
+# bash cubic-setup-project-user.sh
+
+set -e -u -x
+
+# Turn off cbica python on login
+chmod +w ${HOME}/.bashrc
+echo "# Configure python for this user" >> ${HOME}/.bashrc
+echo "module unload python/anaconda/3" >> ${HOME}/.bashrc
+echo "unset PYTHONPATH" >> ${HOME}/.bashrc
+chmod -w ${HOME}/.bashrc
+
+# Download and install conda
+cd ${HOME}
+if [ ! -d ${HOME}/miniconda3 ]
+then
+    module unload python/anaconda/3
+    unset PYTHONPATH
+
+    curl -sSLO https://repo.anaconda.com/miniconda/Miniconda3-py38_4.9.2-Linux-x86_64.sh && \
+    bash Miniconda3-py38_4.9.2-Linux-x86_64.sh -b -p ${HOME}/miniconda3 && \
+    rm -f Miniconda3-py38_4.9.2-Linux-x86_64.sh
+
+    # Unlock bashrc and edit it so conda works
+    chmod +w ${HOME}/.bashrc
+    echo "export CONDA_PREFIX=${HOME}/miniconda3" >> ${HOME}/.bashrc
+    echo ". ${HOME}/miniconda3/etc/profile.d/conda.sh" >> ${HOME}/.bashrc
+    chmod -w ${HOME}/.bashrc
+    set +u
+    source ${HOME}/.bashrc
+    # Fix some permissions errors
+    chown -R `whoami` ${HOME}/miniconda3/bin
+    set -u
+fi
+
+# Note: if your user does not have (base) in front of the prompt when you log
+# back into CUBIC, you may need to run these next lines manually
+# Activate the base conda environment
+conda activate
+# Install git-annex, datalad and datalad-container
+conda install -y -c conda-forge git-annex datalad
+pip install --upgrade datalad datalad_container
 diff --git a/scripts/cubic/merge_outputs_postscript.sh b/scripts/cubic/merge_outputs_postscript.sh new file mode 100644 index 0000000..e7e44e8 --- /dev/null +++ b/scripts/cubic/merge_outputs_postscript.sh @@ -0,0 +1,63 @@
+
+# The following should be pasted into the merge_outputs.sh script
+datalad clone ${outputsource} merge_ds
+cd merge_ds
+NBRANCHES=$(git branch -a | grep job- | sort | wc -l)
+echo "Found $NBRANCHES branches to merge"
+
+gitref=$(git show-ref master | cut -d ' ' -f1 | head -n 1)
+
+# query all branches for the most recent commit and check if it is identical.
+# Write all branch identifiers for jobs without outputs into a file.
+for i in $(git branch -a | grep job- | sort); do [ x"$(git show-ref $i \
+    | cut -d ' ' -f1)" = x"${gitref}" ] && \
+    echo $i; done | tee code/noresults.txt | wc -l
+
+for i in $(git branch -a | grep job- | sort); \
+    do [ x"$(git show-ref $i \
+    | cut -d ' ' -f1)" != x"${gitref}" ] && \
+    echo $i; \
+done | tee code/has_results.txt
+
+mkdir -p code/merge_batches
+num_branches=$(wc -l < code/has_results.txt)
+CHUNKSIZE=5000
+set +e
+num_chunks=$(expr ${num_branches} / ${CHUNKSIZE})
+if [[ $num_chunks == 0 ]]; then
+    num_chunks=1
+fi
+set -e
+for chunknum in $(seq 1 $num_chunks)
+do
+    startnum=$(expr $(expr ${chunknum} - 1) \* ${CHUNKSIZE} + 1)
+    endnum=$(expr ${chunknum} \* ${CHUNKSIZE})
+    batch_file=code/merge_branches_$(printf %04d ${chunknum}).txt
+    [[ ${num_branches} -lt ${endnum} ]] && endnum=${num_branches}
+    branches=$(sed -n "${startnum},${endnum}p;$(expr ${endnum} + 1)q" code/has_results.txt)
+    echo ${branches} > ${batch_file}
+    git merge -m "merge results batch ${chunknum}/${num_chunks}" $(cat ${batch_file})
+done
+
+# Push the merge back
+git push
+
+# Get the file availability info
+git annex fsck --fast -f output-storage
+
+# This should not print anything
+MISSING=$(git annex find --not --in output-storage)
+
+if [[ !
-z "$MISSING" ]] +then + echo Unable to find data for $MISSING + exit 1 +fi + +# stop tracking this branch +git annex dead here + +datalad push --data nothing +echo SUCCESS diff --git a/scripts/cubic/multishell_gauntlet.json b/scripts/cubic/multishell_gauntlet.json new file mode 100644 index 0000000..6220f84 --- /dev/null +++ b/scripts/cubic/multishell_gauntlet.json @@ -0,0 +1,209 @@ +{ + "name": "multishell_gauntlet", + "space": "T1w", + "atlases": [ + "schaefer100x17", + "schaefer200x17", + "schaefer400x17", + "brainnetome246", + "aicha384", + "gordon333", + "aal116" + ], + "anatomical": ["mrtrix_5tt_hsvs"], + "nodes": [ + { + "name": "msmt_csd", + "software": "MRTrix3", + "action": "csd", + "output_suffix": "msmtcsd", + "input": "qsiprep", + "parameters": { + "mtnormalize": true, + "response": { + "algorithm": "dhollander" + }, + "fod": { + "algorithm": "msmt_csd", + "max_sh": [8, 8, 8] + } + }}, + { + "name": "track_ifod2", + "software": "MRTrix3", + "action": "tractography", + "output_suffix": "ifod2", + "input": "msmt_csd", + "parameters": { + "use_5tt": true, + "method_5tt": "hsvs", + "use_sift2": true, + "tckgen": { + "algorithm": "iFOD2", + "select": 10000000, + "max_length": 250, + "min_length": 30, + "power": 0.33, + "crop_at_gmwmi": true, + "backtrack": true + }, + "sift2": {} + } + }, + { + "name": "mrtrix_conn", + "software": "MRTrix3", + "action": "connectivity", + "output_suffix": "msmtconnectome", + "input": "track_ifod2", + "parameters": { + "tck2connectome": [ + { + "zero_diagonal": false, + "search_radius": 2, + "scale_invnodevol": true, + "symmetric": true, + "use_sift_weights": true, + "stat_edge": "sum", + "measure": "sift_invnodevol_radius2_count" + }, + { + "zero_diagonal": false, + "search_radius": 2, + "scale_invnodevol": false, + "symmetric": true, + "length_scale": "length", + "use_sift_weights": false, + "stat_edge": "mean", + "measure": "radius2_meanlength" + }, + { + "zero_diagonal": false, + "search_radius": 2, + "scale_invnodevol": false, + "symmetric": true, + "use_sift_weights": false, + "stat_edge": "sum", + "measure": "radius2_count" + }, + { + "zero_diagonal": false, + "search_radius": 2, + "scale_invnodevol": false, + "symmetric": true, + "use_sift_weights": true, + "stat_edge": "sum", + "measure": "sift_radius2_count" + } + ]}}, + { + "name": "fit_noddi", + "action": "fit_noddi", + "software": "AMICO", + "input": "qsiprep", + "output_suffix": "wmNODDI", + "parameters": { + "isExvivo": false, + "dPar": 1.7E-3, + "dIso": 3.0E-3 + } + }, + { + "name": "fit_noddi_gm", + "action": "fit_noddi", + "software": "AMICO", + "input": "qsiprep", + "output_suffix": "gmNODDI", + "parameters": { + "isExvivo": false, + "dPar": 1.1E-3, + "dIso": 3.0E-3 + } + }, + { + "name": "calc_sops", + "action": "steinhardt_order_parameters", + "input": "msmt_csd", + "output_suffix": "SOP", + "parameters": { + "order": 8 + } + }, + { + "name": "mapmri_recon", + "software": "Dipy", + "action": "MAPMRI_reconstruction", + "input": "qsiprep", + "output_suffix": "MAPMRI", + "parameters": { + "radial_order": 6, + "laplacian_regularization": true, + "laplacian_weighting": 0.2, + "anisotropic_scaling": false, + "bval_threshold": 2000, + "dti_scale_estimation": false, + "write_mif": false, + "write_fibgz": false + } + }, + { + "name": "dki_recon", + "software": "Dipy", + "action": "DKI_reconstruction", + "input": "qsiprep", + "output_suffix": "DKI", + "parameters": { + "write_mif": false, + "write_fibgz": false + } + }, + { + "name": "csdsi_3dshore", + "software": "Dipy", + "action": 
"3dSHORE_reconstruction", + "input": "qsiprep", + "output_suffix": "3dSHORE", + "parameters": { + "extrapolate_scheme": "HCP", + "radial_order": 8, + "regularization": "L2", + "lambdaN": 1e-8, + "lambdaL": 1e-8, + "write_mif": false, + "write_fibgz": false + } + }, + { + "name": "msmt_csd_fromcs", + "software": "MRTrix3", + "action": "csd", + "output_suffix": "csmsmtcsd", + "input":"csdsi_3dshore", + "parameters": { + "mtnormalize": true, + "response": { + "algorithm": "dhollander" + }, + "fod": { + "algorithm": "msmt_csd", + "max_sh": [8, 8, 8] + } + } + }, + { + "name": "dsistudio_gqi", + "software": "DSI Studio", + "action": "reconstruction", + "input": "qsiprep", + "output_suffix": "gqi", + "parameters": {"method": "gqi"} + }, + { + "name": "scalar_export", + "software": "DSI Studio", + "action": "export", + "input": "dsistudio_gqi", + "output_suffix": "gqiscalar" + } + ] + } \ No newline at end of file diff --git a/scripts/cubic/rerun_hcp.py b/scripts/cubic/rerun_hcp.py new file mode 100644 index 0000000..933247c --- /dev/null +++ b/scripts/cubic/rerun_hcp.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python +import pandas as pd +import numpy as np +import os +import glob +import subprocess + +os.chdir('/cbica/projects/hcpya/xcp/analysis/code') +df = pd.read_csv('qsub_calls.sh').rename(columns={'#!/bin/bash':'call'}) + +subjects = np.zeros((df.shape[0])) +for line in df.iterrows(): + if line[1].values[0]== 'sleep 600': + subjects[line[0]] = sub + continue + + sub = line[1].values[0].split(' ')[-2] + subjects[line[0]] = sub + +df['subject'] = subjects +ran = np.zeros((df.shape[0])) +os.chdir('/cbica/projects/hcpya/xcp/analysis/logs') +for line in df.iterrows(): + + logs = glob.glob('xcp{0}.o**'.format(str(int(line[1].subject)))) + if len(logs) == 0: + ran[line[0]] = 0 + continue + latest_file = max(logs, key=os.path.getctime) + with open("{0}".format(latest_file), 'r') as f: + last_line = f.readlines()[-1].split('\n')[0] + if last_line == 'SUCCESS': + ran[line[0]] = 1 +df['ran'] = ran.astype(bool) + +for line in df.iterrows(): + if line[1].ran == True:continue + else: + os.system('sleep 600') + os.system(line[1].call) +#qsub -l h_vmem=4G,s_vmem=4G -N rerunhcp -V -j y -b y -o /cbica/projects/hcpya/xcp/analysis/logs python code/rerun_hcp.py diff --git a/scripts/cubic/xcp-hcp-d-bootstrap.py b/scripts/cubic/xcp-hcp-d-bootstrap.py new file mode 100644 index 0000000..a26a503 --- /dev/null +++ b/scripts/cubic/xcp-hcp-d-bootstrap.py @@ -0,0 +1,265 @@ +#!/usr/bin/env python +import glob +import os +import sys +import pandas as pd +import nibabel as nb +import numpy as np +from shutil import copyfile +import json +import subprocess +import h5py +import time +from sklearn.linear_model import LinearRegression +from scipy.stats import pearsonr + + +""" +FUCKING CHANGE THESE!!! +""" + +hcp_dir = '/Users/maxbertolero/Box/DCAN_test_datasets/HCP-D/hcp_processed/' +outdir = '/Users/maxbertolero/Box/DCAN_test_datasets/HCP-D/xcp_fmriprepdir/' + +#move the json +os.system('cp code/dataset_description.json {0}/dataset_description.json'.format(outdir)) + +nslots = subprocess.run(['echo $NSLOTS'], stdout=subprocess.PIPE,shell=True).stdout.decode('utf-8').split('\n')[0] +subid = str(sys.argv[1]) + + +def make_masks(segmentation, wm_mask_out, vent_mask_out, **kwargs): + """ + generates ventricular and white matter masks from a Desikan/FreeSurfer + segmentation file. label constraints may be overridden. + :param segmentation: Desikan/FreeSurfer spec segmentation nifti file. 
+ Does not need to be a cifti but must have labels according to FS lookup + table, including cortical parcellations. + :param wm_mask_out: binary white matter mask. + :param vent_mask_out: binary ventricular mask. + :param kwargs: dictionary of label value overrides. You may override + default label number bounds for white matter and ventricle masks in the + segmentation file. + :return: None + """ + + wd = os.path.dirname(wm_mask_out) + # set parameter defaults + defaults = dict(wm_lt_R=2950, wm_ut_R=3050, wm_lt_L=3950, wm_ut_L=4050, + vent_lt_R=43, vent_ut_R=43, vent_lt_L=4, vent_ut_L=4, + roi_res=2) + # set temporary filenames + tempfiles = { + 'wm_mask_L': os.path.join(wd, 'tmp_left_wm.nii.gz'), + 'wm_mask_R': os.path.join(wd, 'tmp_right_wm.nii.gz'), + 'vent_mask_L': os.path.join(wd, 'tmp_left_vent.nii.gz'), + 'vent_mask_R': os.path.join(wd, 'tmp_right_vent.nii.gz'), + 'wm_mask': os.path.join(wd, 'tmp_wm.nii.gz'), + 'vent_mask': os.path.join(wd, 'tmp_vent.nii.gz') + } + # inputs and outputs + iofiles = { + 'segmentation': segmentation, + 'wm_mask_out': wm_mask_out, + 'vent_mask_out': vent_mask_out + } + # command pipeline + cmdlist = [ + 'fslmaths {segmentation} -thr {wm_lt_R} -uthr {wm_ut_R} {wm_mask_R}', + 'fslmaths {segmentation} -thr {wm_lt_L} -uthr {wm_ut_L} {wm_mask_L}', + 'fslmaths {wm_mask_R} -add {wm_mask_L} -bin {wm_mask}', + 'fslmaths {wm_mask} -kernel gauss {roi_res:g} -ero {wm_mask_out}', + 'fslmaths {segmentation} -thr {vent_lt_R} -uthr {vent_ut_R} ' + '{vent_mask_R}', + 'fslmaths {segmentation} -thr {vent_lt_L} -uthr {vent_ut_L} ' + '{vent_mask_L}', + 'fslmaths {vent_mask_R} -add {vent_mask_L} -bin {vent_mask}', + 'fslmaths {vent_mask} -kernel gauss {roi_res:g} -ero {vent_mask_out}' + ] + + # get params + defaults.update(kwargs) + kwargs.update(defaults) + kwargs.update(iofiles) + kwargs.update(tempfiles) + # format and run commands + for cmdfmt in cmdlist: + cmd = cmdfmt.format(**kwargs) + subprocess.call(cmd.split()) + # cleanup + for key in tempfiles.keys(): + os.remove(tempfiles[key]) + +os.makedirs(outdir,exist_ok=True) +""" +Data Narrative + +All subjects from HCP-D were analyzed. For each Task ("REST1","REST2","WM","MOTOR","GAMBLING","EMOTION","LANGUAGE","SOCIAL") and Encoding Direction ("LR","RL"), we analyzed the session if the following files were present: +(1) rfMRI/tfMRI_{Task}_{Encoding_}_Atlas_MSMAll.dtseries.nii, (2) rfMRI/tfMRI_{Task}_{Encoding}.nii, (3) Movement_Regressors.txt, (4) Movement_AbsoluteRMS.txt, (5) SBRef_dc.nii.gz, and (6) rfMRI/tfMRI_{Task}_{Encoding_}_SBRef.nii.gz. +For all tasks, the global signal timeseries was generated with: wb_command -cifti-stats rfMRI/tfMRI_{Task}_{Encoding_}_Atlas_MSMAll.dtseries.nii -reduce MEAN'. For REST1 and REST2, we used the HCP distributed CSF.txt and WM.txt cerebral spinal fluid +and white matter time series. For all other tasks (i.e., all tfMRI), we generated those files in the exact manner the HCP did: fslmeants -i tfMRI_{Task}_{Encoding}.nii -o CSF.txt -m CSFReg.2.nii.gz; fslmeants -i tfMRI_{Task}_{Encoding}.nii -o WM.txt -m WMReg.2.nii.gz. +To ensure this process was identical, we generated these time series for the rfMRI sessions and compared them to the HCP distributed timeseries, ensuring they are identical. These files were then formatted into fMRIprep outputs by renaming the files, +creating the regression json, and creating dummy transforms. 
These inputs were then analyzed by xcp_abcd with the following command: +singularity run --cleanenv -B ${PWD} ~/xcp_hcp/xcp-abcd-0.0.4.sif fmriprepdir/ xcp/ participant --cifti --despike --lower-bpf 0.01 --upper-bpf 0.08 --participant_label sub-$SUBJECT -p 36P -f 100 --omp-nthreads 4 --nthreads 4 +All subjects ran successfully. +""" +orig_tasks = ["carit01","carit02","guessing02","guessing01","rest01","rest02","rest03"] + +#put this directly in here +tasklist = [] +for orig_task in orig_tasks: + if len(glob.glob('{0}/{1}/ses-V1/files/MNINonLinear/Results/*{2}*/task-{2}_Atlas.dtseries.nii'.format(hcp_dir,subid,orig_task))) != 1: continue + if len(glob.glob('{0}/{1}/ses-V1/files/MNINonLinear/Results/*{2}*/task-{2}.nii.gz'.format(hcp_dir,subid,orig_task))) != 1: continue + if len(glob.glob('{0}/{1}/ses-V1/files/MNINonLinear/Results/*{2}*/Movement_Regressors.txt'.format(hcp_dir,subid,orig_task))) != 1: continue + if len(glob.glob('{0}/{1}/ses-V1/files/MNINonLinear/Results/*{2}*/Movement_AbsoluteRMS.txt'.format(hcp_dir,subid,orig_task))) != 1: continue + if len(glob.glob('{0}/{1}/ses-V1/files/MNINonLinear/Results/*{2}*/**SBRef.nii.gz'.format(hcp_dir,subid,orig_task))) != 1: continue + if len(glob.glob('{0}/{1}/ses-V1/files/MNINonLinear/Results/*{2}*/brainmask_fs.2.0.nii.gz'.format(hcp_dir,subid,orig_task))) != 1: continue + + tdir = glob.glob('{0}/{1}/ses-V1/files/MNINonLinear/Results/*{2}*'.format(hcp_dir,subid,orig_task))[0] + task = tdir.split('/')[-1] + tasklist.append(task) + task_dir = '{0}/{1}/MNINonLinear/Results/{2}'.format(hcp_dir,subid,task) + wbs_file = '{0}/{1}/MNINonLinear/Results/{2}/task-{2}_Atlas.dtseries.nii'.format(hcp_dir,subid,task) + if os.path.exists(wbs_file): + os.system('rm {0}/{1}_WBS.txt'.format(task_dir,task)) + command = 'singularity exec -B ${PWD} --env OMP_NTHREADS=%s pennlinc-containers/.datalad/environments/xcp-abcd-0-0-4/image wb_command -cifti-stats %s -reduce MEAN >> %s/%s_WBS.txt'%(nslots,wbs_file,task_dir,task) + os.system(command) + +anatdir=outdir+'/sub-'+subid+'/anat/' +funcdir=outdir+'/sub-'+subid+'/func/' + +os.makedirs(outdir+'/sub-'+subid+'/anat',exist_ok=True) # anat dir +os.makedirs(outdir+'/sub-'+subid+'/func',exist_ok=True) # func dir + +for j in tasklist: + + bb = j.split('_') + taskname = bb[1] + acqname = bb[2] + datadir = hcp_dir +subid+'/MNINonLinear/Results/'+ j + + ResultsFolder='{0}/{1}/MNINonLinear/Results/{2}/'.format(hcp_dir,subid,j) + ROIFolder="{0}/{1}/MNINonLinear/ROIs".format(hcp_dir,subid) + + + segmentation = '{0}/{1}/ses-V1/files/MNINonLinear/ROIs/wmparc.2.nii.gz'.format(hcp_dir,subid,orig_task) + wm_mask_out = "{3}/WMReg.2.nii.gz".format(ROIFolder) + vent_mask_out = "{3}/CSFReg.2.nii.gz".format(ROIFolder) + make_masks(segmentation, wm_mask_out, vent_mask_out) + + xcp_file = '{0}/{1}/MNINonLinear/Results/{2}/{3}_WM.txt'.format(hcp_dir,subid,j,j) + cmd = "fslmeants -i {0}/{1}.nii.gz -o {2} -m {3}/WMReg.2.nii.gz".format(ResultsFolder,j,xcp_file,ROIFolder) + os.system(cmd) + xcp_file = '{0}/{1}/MNINonLinear/Results/{2}/{3}_CSF.txt'.format(hcp_dir,subid,j,j) + cmd = "fslmeants -i {0}/{1}.nii.gz -o {2} -m {3}/CSFReg.2.nii.gz".format(ResultsFolder,j,xcp_file,ROIFolder) + os.system(cmd) + + ##create confound regressors + mvreg = pd.read_csv(datadir +'/Movement_Regressors.txt',header=None,delimiter=r"\s+") + mvreg = mvreg.iloc[:,0:6] + mvreg.columns=['trans_x','trans_y','trans_z','rot_x','rot_y','rot_z'] + # convert rot to rad + mvreg['rot_x']=mvreg['rot_x']*np.pi/180 + mvreg['rot_y']=mvreg['rot_y']*np.pi/180 + 
mvreg['rot_z']=mvreg['rot_z']*np.pi/180 + + + csfreg = np.loadtxt(datadir +'/'+ j + '_CSF.txt') + wmreg = np.loadtxt(datadir +'/'+ j + '_WM.txt') + gsreg = np.loadtxt(datadir +'/'+ j + '_WBS.txt') + rsmd = np.loadtxt(datadir +'/Movement_AbsoluteRMS.txt') + + + brainreg = pd.DataFrame({'global_signal':gsreg,'white_matter':wmreg,'csf':csfreg,'rmsd':rsmd}) + + regressors = pd.concat([mvreg, brainreg], axis=1) + jsonreg = pd.DataFrame({'LR': [1,2,3]}) # just a fake json + regressors.to_csv(funcdir+'sub-'+subid+'_task-'+taskname+'_acq-'+acqname+'_desc-confounds_timeseries.tsv',index=False, + sep= '\t') + regressors.to_json(funcdir+'sub-'+subid+'_task-'+taskname+'_acq-'+acqname+'_desc-confounds_timeseries.json') + + + hcp_mask = '{0}/{1}//MNINonLinear/Results/{2}/{2}_SBRef.nii.gz'.format(hcp_dir,subid,j) + prep_mask = funcdir+'/sub-'+subid+'_task-'+taskname+'_acq-'+ acqname +'_space-MNI152NLin6Asym_boldref.nii.gz' + copyfile(hcp_mask,prep_mask) + + hcp_mask = '{0}/{1}//MNINonLinear/Results/{2}/brainmask_fs.2.nii.gz'.format(hcp_dir,subid,j) + prep_mask = funcdir+'/sub-'+subid+'_task-'+taskname+'_acq-'+ acqname +'_space-MNI152NLin6Asym_desc-brain_mask.nii.gz' + copyfile(hcp_mask,prep_mask) + + # create/copy cifti + niftip = '{0}/{1}/MNINonLinear/Results/{2}/{2}.nii.gz'.format(hcp_dir,subid,j,j) # to get TR and just sample + niftib = funcdir+'/sub-'+subid+'_task-'+taskname+'_acq-'+ acqname +'_space-MNI152NLin6Asym_desc-preproc_bold.nii.gz' + ciftip = datadir + '/'+ j +'_Atlas.dtseries.nii' + ciftib = funcdir+'/sub-'+subid+'_task-'+taskname+'_acq-'+ acqname +'_space-fsLR_den-91k_bold.dtseries.nii' + + os.system('cp {0} {1}'.format(ciftip,ciftib)) + os.system('cp {0} {1}'.format(niftip,niftib)) + + tr = nb.load(niftip).header.get_zooms()[-1]# repetition time + jsontis={"RepetitionTime": np.float(tr),"TaskName": taskname} + json2={"RepetitionTime": np.float(tr),"grayordinates": "91k", "space": "HCP grayordinates","surface": "fsLR","surface_density": "32k","volume": "MNI152NLin6Asym"} + + with open(funcdir+'/sub-'+subid+'_task-'+taskname+'_acq-'+ acqname +'_space-MNI152NLin6Asym_desc-preproc_bold.json', 'w') as outfile: + json.dump(jsontis, outfile) + + with open(funcdir+'/sub-'+subid+'_task-'+taskname+'_acq-'+ acqname +'_space-fsLR_den-91k_bold.dtseries.json', 'w') as outfile: + json.dump(json2, outfile) + + + +# if we are goin to use xcp-0.04 we dont need real `MNI152NLin2009cAsym_to-T1w_mode-image_xfm.h5` otherwise we do +# hcp doesn't produced ants trasnform file, the only thing is to generate it or change xcp to accomodate both + +anat1 = '{0}/{1}/ses-V1/files/MNINonLinear/T1w_restore.nii.gz'.format(hcp_dir,subid,orig_task) +mni2t1 = anatdir+'sub-'+subid+'_from-MNI152NLin2009cAsym_to-T1w_mode-image_xfm.h5' +t1w2mni = anatdir+'sub-'+subid+'_from-T1w_to-MNI152NLin2009cAsym_mode-image_xfm.h5' +cmd = 'cp {0} {1}'.format(anat1,mni2t1) +os.system(cmd) +cmd = 'cp {0} {1}'.format(anat1,t1w2mni) +os.system(cmd) + +os.system('export SINGULARITYENV_OMP_NUM_THREADS={0}'.format(nslots)) +cmd = 'singularity run --cleanenv -B ${PWD} pennlinc-containers/.datalad/environments/xcp-abcd-0-0-4/image fmriprepdir xcp participant --cifti --despike --lower-bpf 0.01 --upper-bpf 0.08 --participant_label sub-%s -p 36P -f 100 --nthreads %s --cifti'%(subid,nslots) +os.system(cmd) + +""" +audit +""" +data = [] +for fdir in fdirs: + for orig_task in orig_tasks: + if len(glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*/*Atlas_MSMAll.dtseries.nii'.format(hcp_dir,subid,orig_task,fdir))) != 1: continue + if 
len(glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*/*{2}_{3}.nii.gz'.format(hcp_dir,subid,orig_task,fdir))) != 1: continue + if len(glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*/Movement_Regressors.txt'.format(hcp_dir,subid,orig_task,fdir))) != 1: continue + if len(glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*/Movement_AbsoluteRMS.txt'.format(hcp_dir,subid,orig_task,fdir))) != 1: continue + if len(glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*/SBRef_dc.nii.gz'.format(hcp_dir,subid,orig_task,fdir))) != 1: continue + if len(glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*/**SBRef.nii.gz'.format(hcp_dir,subid,orig_task,fdir))) != 1: continue + data.append('_'.join([orig_task,fdir])) + +results = [] +for r in glob.glob('xcp/xcp_abcd/sub-%s/func/*Schaefer417*pconn*'%(subid)): + results.append(r.split('/')[-1].split('-')[2].split('_')[0] + '_' +r.split('/')[-1].split('-')[3].split('_')[0]) +data.sort() +results.sort() +ran = False +data = np.unique(data) +if len(np.intersect1d(data,results)) == len(data): + ran = True + line = 'No errors' + +else: line = None +if ran == False: + e_file=sorted(glob.glob('/cbica/projects/RBC/hcpya/xcp/analysis/logs/*%s*.o*'%(subid)),key=os.path.getmtime)[-1] + with open(e_file) as f: + for line in f: + pass + print (subid,line) +sdf = pd.DataFrame(columns=['ran','subject','error']) +sdf['ran'] = [ran] +sdf['subject'] = [subid] +sdf['error'] = [line] +sdf.to_csv('xcp/xcp_abcd/sub-{0}/audit_{0}.csv'.format(subid),index=False) + +os.system('7z a {0}_xcp-0.0.4.zip xcp/xcp_abcd'.format(subid)) +os.system('rm -rf prep .git/tmp/wkdir') diff --git a/scripts/cubic/xcp-hcpya-bootstrap.py b/scripts/cubic/xcp-hcpya-bootstrap.py new file mode 100644 index 0000000..76d296b --- /dev/null +++ b/scripts/cubic/xcp-hcpya-bootstrap.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python +import glob +import os +import sys +import pandas as pd +import nibabel as nb +import numpy as np +from shutil import copyfile +import json +import subprocess +import h5py +import time +from sklearn.linear_model import LinearRegression +from scipy.stats import pearsonr + +nslots = subprocess.run(['echo $NSLOTS'], stdout=subprocess.PIPE,shell=True).stdout.decode('utf-8').split('\n')[0] +subid = str(sys.argv[1]) +hcp_dir = 'inputs/data/HCP1200/' +outdir = 'fmriprepdir/' +os.makedirs(outdir,exist_ok=True) +""" +Data Narrative + +All subjects from the S1200 HCP-YA were analyzed. For each Task ("REST1","REST2","WM","MOTOR","GAMBLING","EMOTION","LANGUAGE","SOCIAL") and Encoding Direction ("LR","RL"), we analyzed the session if the following files were present: +(1) rfMRI/tfMRI_{Task}_{Encoding_}_Atlas_MSMAll.dtseries.nii, (2) rfMRI/tfMRI_{Task}_{Encoding}.nii, (3) Movement_Regressors.txt, (4) Movement_AbsoluteRMS.txt, (5) SBRef_dc.nii.gz, and (6) rfMRI/tfMRI_{Task}_{Encoding_}_SBRef.nii.gz. +For all tasks, the global signal timeseries was generated with: wb_command -cifti-stats rfMRI/tfMRI_{Task}_{Encoding_}_Atlas_MSMAll.dtseries.nii -reduce MEAN'. For REST1 and REST2, we used the HCP distributed CSF.txt and WM.txt cerebral spinal fluid +and white matter time series. For all other tasks (i.e., all tfMRI), we generated those files in the exact manner the HCP did: fslmeants -i tfMRI_{Task}_{Encoding}.nii -o CSF.txt -m CSFReg.2.nii.gz; fslmeants -i tfMRI_{Task}_{Encoding}.nii -o WM.txt -m WMReg.2.nii.gz. +To ensure this process was identical, we generated these time series for the rfMRI sessions and compared them to the HCP distributed timeseries, ensuring they are identical. 
These files were then formatted into fMRIprep outputs by renaming the files, +creating the regression json, and creating dummy transforms. These inputs were then analyzed by xcp_abcd with the following command: +singularity run --cleanenv -B ${PWD} ~/xcp_hcp/xcp-abcd-0.0.4.sif fmriprepdir/ xcp/ participant --cifti --despike --lower-bpf 0.01 --upper-bpf 0.08 --participant_label sub-$SUBJECT -p 36P -f 100 --omp-nthreads 4 --nthreads 4 +All subjects ran successfully. +""" +fdirs = ["RL","LR"] +orig_tasks = ["REST1","REST2","WM","MOTOR","GAMBLING","EMOTION","LANGUAGE","SOCIAL"] +# fdirs = ["RL"] +# orig_tasks = ["WM"] +os.system('cp code/dataset_description.json {0}/dataset_description.json'.format(outdir)) +#put this directly in here +tasklist = [] +for fdir in fdirs: + for orig_task in orig_tasks: + if len(glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*/*Atlas_MSMAll.dtseries.nii'.format(hcp_dir,subid,orig_task,fdir))) != 1: continue + if len(glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*/*{2}_{3}.nii.gz'.format(hcp_dir,subid,orig_task,fdir))) != 1: continue + if len(glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*/Movement_Regressors.txt'.format(hcp_dir,subid,orig_task,fdir))) != 1: continue + if len(glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*/Movement_AbsoluteRMS.txt'.format(hcp_dir,subid,orig_task,fdir))) != 1: continue + if len(glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*/SBRef_dc.nii.gz'.format(hcp_dir,subid,orig_task,fdir))) != 1: continue + if len(glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*/**SBRef.nii.gz'.format(hcp_dir,subid,orig_task,fdir))) != 1: continue + tdir = glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*'.format(hcp_dir,subid,orig_task,fdir))[0] + task = tdir.split('/')[-1] + tasklist.append(task) + task_dir = '{0}/{1}/MNINonLinear/Results/{2}'.format(hcp_dir,subid,task) + wbs_file = '{0}/{1}/MNINonLinear/Results/{2}/{2}_Atlas_MSMAll.dtseries.nii'.format(hcp_dir,subid,task) + if os.path.exists(wbs_file): + os.system('rm {0}/{1}_WBS.txt'.format(task_dir,task)) + command = 'singularity exec -B ${PWD} --env OMP_NTHREADS=%s pennlinc-containers/.datalad/environments/xcp-abcd-0-0-4/image wb_command -cifti-stats %s -reduce MEAN >> %s/%s_WBS.txt'%(nslots,wbs_file,task_dir,task) + os.system(command) + + anatdir=outdir+'/sub-'+subid+'/anat/' + funcdir=outdir+'/sub-'+subid+'/func/' + + os.makedirs(outdir+'/sub-'+subid+'/anat',exist_ok=True) # anat dir + os.makedirs(outdir+'/sub-'+subid+'/func',exist_ok=True) # func dir + +for j in tasklist: + + bb = j.split('_') + taskname = bb[1] + acqname = bb[2] + datadir = hcp_dir +subid+'/MNINonLinear/Results/'+ j + + if 'REST' not in j: + ResultsFolder='{0}/{1}/MNINonLinear/Results/{2}/'.format(hcp_dir,subid,j) + ROIFolder="{0}/{1}/MNINonLinear/ROIs".format(hcp_dir,subid) + + xcp_file = '{0}/{1}/MNINonLinear/Results/{2}/{3}_WM.txt'.format(hcp_dir,subid,j,j) + cmd = "fslmeants -i {0}/{1}.nii.gz -o {2} -m {3}/WMReg.2.nii.gz".format(ResultsFolder,j,xcp_file,ROIFolder) + os.system(cmd) + + xcp_file = '{0}/{1}/MNINonLinear/Results/{2}/{3}_CSF.txt'.format(hcp_dir,subid,j,j) + cmd = "fslmeants -i {0}/{1}.nii.gz -o {2} -m {3}/CSFReg.2.nii.gz".format(ResultsFolder,j,xcp_file,ROIFolder) + os.system(cmd) + + + ##create confound regressors + mvreg = pd.read_csv(datadir +'/Movement_Regressors.txt',header=None,delimiter=r"\s+") + mvreg = mvreg.iloc[:,0:6] + mvreg.columns=['trans_x','trans_y','trans_z','rot_x','rot_y','rot_z'] + # convert rot to rad + mvreg['rot_x']=mvreg['rot_x']*np.pi/180 + mvreg['rot_y']=mvreg['rot_y']*np.pi/180 + 
mvreg['rot_z']=mvreg['rot_z']*np.pi/180 + + + csfreg = np.loadtxt(datadir +'/'+ j + '_CSF.txt') + wmreg = np.loadtxt(datadir +'/'+ j + '_WM.txt') + gsreg = np.loadtxt(datadir +'/'+ j + '_WBS.txt') + rsmd = np.loadtxt(datadir +'/Movement_AbsoluteRMS.txt') + + + brainreg = pd.DataFrame({'global_signal':gsreg,'white_matter':wmreg,'csf':csfreg,'rmsd':rsmd}) + + regressors = pd.concat([mvreg, brainreg], axis=1) + jsonreg = pd.DataFrame({'LR': [1,2,3]}) # just a fake json + regressors.to_csv(funcdir+'sub-'+subid+'_task-'+taskname+'_acq-'+acqname+'_desc-confounds_timeseries.tsv',index=False, + sep= '\t') + regressors.to_json(funcdir+'sub-'+subid+'_task-'+taskname+'_acq-'+acqname+'_desc-confounds_timeseries.json') + + + hcp_mask = '{0}/{1}//MNINonLinear/Results/{2}/{2}_SBRef.nii.gz'.format(hcp_dir,subid,j) + prep_mask = funcdir+'/sub-'+subid+'_task-'+taskname+'_acq-'+ acqname +'_space-MNI152NLin6Asym_boldref.nii.gz' + copyfile(hcp_mask,prep_mask) + + hcp_mask = '{0}/{1}//MNINonLinear/Results/{2}/brainmask_fs.2.nii.gz'.format(hcp_dir,subid,j) + prep_mask = funcdir+'/sub-'+subid+'_task-'+taskname+'_acq-'+ acqname +'_space-MNI152NLin6Asym_desc-brain_mask.nii.gz' + copyfile(hcp_mask,prep_mask) + + # create/copy cifti + niftip = '{0}/{1}/MNINonLinear/Results/{2}/{2}.nii.gz'.format(hcp_dir,subid,j,j) # to get TR and just sample + niftib = funcdir+'/sub-'+subid+'_task-'+taskname+'_acq-'+ acqname +'_space-MNI152NLin6Asym_desc-preproc_bold.nii.gz' + ciftip = datadir + '/'+ j +'_Atlas_MSMAll.dtseries.nii' + ciftib = funcdir+'/sub-'+subid+'_task-'+taskname+'_acq-'+ acqname +'_space-fsLR_den-91k_bold.dtseries.nii' + + os.system('cp {0} {1}'.format(ciftip,ciftib)) + os.system('cp {0} {1}'.format(niftip,niftib)) + + tr = nb.load(niftip).header.get_zooms()[-1]# repetition time + jsontis={"RepetitionTime": np.float(tr),"TaskName": taskname} + json2={"RepetitionTime": np.float(tr),"grayordinates": "91k", "space": "HCP grayordinates","surface": "fsLR","surface_density": "32k","volume": "MNI152NLin6Asym"} + + with open(funcdir+'/sub-'+subid+'_task-'+taskname+'_acq-'+ acqname +'_space-MNI152NLin6Asym_desc-preproc_bold.json', 'w') as outfile: + json.dump(jsontis, outfile) + + with open(funcdir+'/sub-'+subid+'_task-'+taskname+'_acq-'+ acqname +'_space-fsLR_den-91k_bold.dtseries.json', 'w') as outfile: + json.dump(json2, outfile) + + + +# just fake anatomical profile for xcp, it wont be use +anat1 = datadir +'/' +'/SBRef_dc.nii.gz' +mni2t1 = anatdir+'sub-'+subid+'_from-MNI152NLin2009cAsym_to-T1w_mode-image_xfm.h5' +t1w2mni = anatdir+'sub-'+subid+'_from-T1w_to-MNI152NLin2009cAsym_mode-image_xfm.h5' +cmd = 'cp {0} {1}'.format(anat1,mni2t1) +os.system(cmd) +cmd = 'cp {0} {1}'.format(anat1,t1w2mni) +os.system(cmd) + +os.system('export SINGULARITYENV_OMP_NUM_THREADS={0}'.format(nslots)) +cmd = 'singularity run --cleanenv -B ${PWD} pennlinc-containers/.datalad/environments/xcp-abcd-0-0-4/image fmriprepdir xcp participant --cifti --despike --lower-bpf 0.01 --upper-bpf 0.08 --participant_label sub-%s -p 36P -f 100 --nthreads %s --cifti'%(subid,nslots) +os.system(cmd) + +""" +audit +""" +data = [] +for fdir in fdirs: + for orig_task in orig_tasks: + if len(glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*/*Atlas_MSMAll.dtseries.nii'.format(hcp_dir,subid,orig_task,fdir))) != 1: continue + if len(glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*/*{2}_{3}.nii.gz'.format(hcp_dir,subid,orig_task,fdir))) != 1: continue + if 
len(glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*/Movement_Regressors.txt'.format(hcp_dir,subid,orig_task,fdir))) != 1: continue + if len(glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*/Movement_AbsoluteRMS.txt'.format(hcp_dir,subid,orig_task,fdir))) != 1: continue + if len(glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*/SBRef_dc.nii.gz'.format(hcp_dir,subid,orig_task,fdir))) != 1: continue + if len(glob.glob('{0}/{1}/MNINonLinear/Results/*{2}*{3}*/**SBRef.nii.gz'.format(hcp_dir,subid,orig_task,fdir))) != 1: continue + data.append('_'.join([orig_task,fdir])) + +results = [] +for r in glob.glob('xcp/xcp_abcd/sub-%s/func/*Schaefer417*pconn*'%(subid)): + results.append(r.split('/')[-1].split('-')[2].split('_')[0] + '_' +r.split('/')[-1].split('-')[3].split('_')[0]) +data.sort() +results.sort() +ran = False +data = np.unique(data) +if len(np.intersect1d(data,results)) == len(data): + ran = True + line = 'No errors' + +else: line = None +if ran == False: + e_file=sorted(glob.glob('/cbica/projects/RBC/hcpya/xcp/analysis/logs/*%s*.o*'%(subid)),key=os.path.getmtime)[-1] + with open(e_file) as f: + for line in f: + pass + print (subid,line) +sdf = pd.DataFrame(columns=['ran','subject','error']) +sdf['ran'] = [ran] +sdf['subject'] = [subid] +sdf['error'] = [line] +sdf.to_csv('xcp/xcp_abcd/sub-{0}/audit_{0}.csv'.format(subid),index=False) + +os.system('cd xcp; 7z a ../{0}_xcp-0-0-4.zip xcp_abcd'.format(subid)) +os.system('rm -rf prep .git/tmp/wkdir') diff --git a/scripts/generic/fs_euler_checker_and_plots_simplified.py b/scripts/generic/fs_euler_checker_and_plots_simplified.py new file mode 100755 index 0000000..9fa7344 --- /dev/null +++ b/scripts/generic/fs_euler_checker_and_plots_simplified.py @@ -0,0 +1,234 @@ +from pathlib import Path +import re +import pandas as pd +import subprocess +import zipfile +import sys +import os +import shutil +import matplotlib.pyplot as plt +import nilearn.image as nim +import nilearn.plotting as nip + + +""" +Get the FreeSurfer statistics and a plot of the T1w image after a +FMRIPrep/FreeSurfer run. Must be run from a clone of the analysis directory. +Expects only one zip per subject. A one-line CSV and a SVG image are written +to the csvs/ and svg/ directory respectively. + +USAGE: fs_euler_checker_and_plots_simplified.py subjectID zips_dir + +Arguments: +---------- + + subjectID: sub-*, the BIDS subject identifier. 
NOTE: can also be + sub-X_ses-Y if multisession + zips_dir: path, relative to the current working directory +""" + +subid = sys.argv[1] +input_zip_dir = Path(sys.argv[2]) +if not input_zip_dir.exists(): + raise ValueError("Must provide a directory with zip files") + +# Set up the working/output directories +unzip_temp_dir = Path("temp") +unzip_temp_dir.mkdir(exist_ok=True) +output_dir = Path("csvs") +output_dir.mkdir(exist_ok=True) +output_svg_dir = Path("svg") +output_svg_dir.mkdir(exist_ok=True) +# This dictionary holds all the info we're going to collect on this subject +fs_audit = {'SubjectID': subid} + +# Find the zip files we need to extract +freesurfer_zips = list( + input_zip_dir.rglob("**/*{}*freesurfer*zip".format(subid))) +print("Found FreeSurfer archives:\n ", + "\n ".join(map(str, freesurfer_zips))) + +fmriprep_zips = list( + input_zip_dir.rglob("**/*{}*fmriprep*zip".format(subid))) +print("Found FMRIPrep archives:\n ", + "\n ".join(map(str, fmriprep_zips))) + +if not len(fmriprep_zips) == len(freesurfer_zips) == 1: + raise Exception("Exactly 1 FMRIPrep and 1 FreeSurfer must match " + subid) + +fmriprep_zip = str(fmriprep_zips[0]) +freesurfer_zip = str(freesurfer_zips[0]) + +# Unpack the freesurfer zip +with zipfile.ZipFile(freesurfer_zip, 'r') as zip_ref: + zip_ref.extractall(str(unzip_temp_dir)) + +# File paths +l_orig_nofix = str(unzip_temp_dir / 'freesurfer' / subid / + 'surf' / 'lh.orig.nofix') +r_orig_nofix = str(unzip_temp_dir / 'freesurfer' / subid / + 'surf' / 'rh.orig.nofix') +l_euler_textfile = str(unzip_temp_dir / "l_euler.txt") +r_euler_textfile = str(unzip_temp_dir / "r_euler.txt") + +# run mris_euler +subprocess.run(["mris_euler_number", "-o", l_euler_textfile, l_orig_nofix]) +subprocess.run(["mris_euler_number", "-o", r_euler_textfile, r_orig_nofix]) + + +def read_euler(euler_file, hemi, info): + """Reads the output from mris_euler_number + """ + with open(euler_file) as eulerf: + lines = eulerf.readlines() + print("content of", euler_file) + print("\n".join(lines)) + + # sanity check the content of the euler output + if not lines: + raise Exception("Not enough lines generated") + first_line = lines[0] + + # split into components + tokens = first_line.strip().split(" ") + + if len(tokens) > 2: + num_holes = float(tokens[3]) + elif len(tokens) == 1: + num_holes = float(tokens[0]) + else: + raise Exception("required number of outputs not available") + + info[hemi + '_NumHoles'] = num_holes + euler_number = abs(2 - 2 * num_holes) + info[hemi + '_EulerNumber'] = euler_number + + defect_index = 2 * num_holes + info[hemi + '_DefectIndex'] = defect_index + + +def read_surf_stats(stats_name, source_id, info, get_measures=False): + """Reads stats from the aparc stats table. + + Parameters: + =========== + + stats_name: str + Name of the .stats file to parts + source_id: str + ID for these stats in the output () + info: dict + Dictionary containing other collected info about the run + get_measures: bool + Should the # Measure lines be parsed and added to info? + Returns: Nothing. 
the info dict gets keys/values added to it + + """ + stats_file = unzip_temp_dir / "freesurfer" / subid / "stats" / stats_name + if not stats_file.exists(): + raise Exception(str(stats_file) + "does not exist") + + with stats_file.open("r") as statsf: + lines = statsf.readlines() + + # Get the column names by finding the line with the header tag in it + header_tag = "# ColHeaders" + header, = [line for line in lines if header_tag in line] + header = header[len(header_tag):].strip().split() + + stats_df = pd.read_csv( + str(stats_file), + sep='\s+', + comment="#", + names=header).melt(id_vars=["StructName"]) + + if stats_name.startswith("lh"): + prefix = "Left" + elif stats_name.startswith("rh"): + prefix = "Right" + else: + prefix = "Both" + + # Get it into a nice form + stats_df['FlatName'] = prefix + '_' + stats_df['variable'] + "_" \ + + source_id + "_" + stats_df['StructName'] + for _, row in stats_df.iterrows(): + info[row['FlatName']] = row['value'] + + if get_measures: + get_stat_measures(stats_file, prefix, info) + + +def get_stat_measures(stats_file, prefix, info): + """Read a "Measure" from a stats file. + + Parameters: + =========== + + stats_file: Path + Path to a .stats file containing the measure you want + info: dict + Dictionary with all this subject's info + """ + with stats_file.open("r") as statsf: + lines = statsf.readlines() + + measure_pat = re.compile( + "# Measure ([A-Za-z]+), ([A-Za-z]+),* [-A-Za-z ]+, ([0-9.]+), .*") + for line in lines: + match = re.match(measure_pat, line) + if match: + pt1, pt2, value = match.groups() + key = "{}_{}_{}".format(prefix, pt1, pt2) + info[key] = float(value) + + +# modify fs_audit inplace: stats from euler number +read_euler(l_euler_textfile, "Left", fs_audit) +read_euler(r_euler_textfile, "Right", fs_audit) +fs_audit['AverageEulerNumber'] = abs( + float(fs_audit['Right_EulerNumber'] + + fs_audit['Left_EulerNumber']) / 2.0) + +# Add stats from the DKT atlas +read_surf_stats("lh.aparc.DKTatlas.stats", "DKT", fs_audit) +read_surf_stats("rh.aparc.DKTatlas.stats", "DKT", fs_audit) +read_surf_stats("lh.aparc.pial.stats", "Pial", fs_audit, get_measures=True) +read_surf_stats("rh.aparc.pial.stats", "Pial", fs_audit, get_measures=True) +read_surf_stats("aseg.stats", "aseg", fs_audit, get_measures=True) + +pd.DataFrame([fs_audit]).to_csv(str(output_dir / (subid + "audit.csv"))) + + +# Do the plotting! 
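+# Note on the two patterns below: the tempered dot "((?!space-).)*" only matches text in which +# no "space-" entity appears, so only native-space outputs are picked up. As a hypothetical example, +# re.match(t1w_svg, "fmriprep/sub-01/figures/sub-01_desc-reconall_T1w.svg") matches, while +# re.match(t1w_svg, "fmriprep/sub-01/figures/sub-01_space-MNI152NLin6Asym_desc-reconall_T1w.svg") is None.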
+t1w_pat = re.compile("((?!space-).)*T1w.nii.gz") +t1w_svg = re.compile("((?!space-).)*reconall_T1w.svg") + +# Extract the native-space T1w SVG from the fmriprep zip +with zipfile.ZipFile(str(fmriprep_zip), 'r') as zip_ref_fmri: + zip_contents = zip_ref_fmri.namelist() + + t1_svg_candidates = [ + fname for fname in zip_contents if re.match(t1w_svg, fname)] + + if not t1_svg_candidates: + raise Exception("No native-space T1w svgs were found") + if len(t1_svg_candidates) > 1: + raise Exception("Too many possible T1w svgs were found") + + t1w_svg_path = t1_svg_candidates[0] + svg_orig_filename = Path(t1w_svg_path).name + svg_orig_path = str(Path(t1w_svg_path)) + svg_external_path = unzip_temp_dir / svg_orig_path + + + svg_fname = "%05d" % fs_audit['AverageEulerNumber'] + "_" \ + + svg_orig_filename + + svg_result_path = output_svg_dir / svg_fname + + # copy svg file to result directory + with zip_ref_fmri.open(t1w_svg_path, "r") as t1w_svg_srcf: + with svg_result_path.open('wb+') as t1w_svg_destf: + shutil.copyfileobj(t1w_svg_srcf, t1w_svg_destf) + diff --git a/scripts/generic/fs_euler_logs.py b/scripts/generic/fs_euler_logs.py new file mode 100755 index 0000000..adfdb57 --- /dev/null +++ b/scripts/generic/fs_euler_logs.py @@ -0,0 +1,45 @@ +from pathlib import Path +import pandas as pd +from glob import glob +import zipfile +from tqdm import tqdm + +def process_zip(zip_path): + freesurfer_zip = Path(zip_path) + subid = freesurfer_zip.stem.split("_")[0] + sesid = "" + if "ses-" in freesurfer_zip.stem: + sesid = freesurfer_zip.stem.split("_")[1] + if not freesurfer_zip.exists(): + raise ValueError("Must provide a zip file") + + # Get a handle for the freesurfer zip + zip_ref = zipfile.ZipFile(freesurfer_zip, 'r') + + zip_contents = zip_ref.namelist() + reconlog, = [pth for pth in zip_contents if + pth.endswith("scripts/recon-all.log") and "sub-" in pth] + with zip_ref.open(reconlog, "r") as reconlogf: + log_lines = [line.decode("utf-8").strip() for line in reconlogf] + + def read_qc(target_str): + data, = [line for line in log_lines if target_str in line] + data = data.replace(",", "") + tokens = data.split() + rh_val = float(tokens[-1]) + lh_val = float(tokens[-4]) + return rh_val, lh_val + + rh_euler, lh_euler = read_qc("lheno") + rh_holes, lh_holes = read_qc("lhholes") + return {"rbc_id": subid, "archive": zip_path, "session": sesid, + "lh_euler": lh_euler, "rh_euler": rh_euler, + "lh_holes": lh_holes, "rh_holes": rh_holes} + +zip_files = glob("*freesurfer*.zip") +results = [] +for zip_file in tqdm(zip_files): + results.append(process_zip(zip_file)) + +pd.DataFrame(results).to_csv("../surface_qc.csv", index=False) + diff --git a/scripts/generic/rerun_failures.sh b/scripts/generic/rerun_failures.sh new file mode 100644 index 0000000..6034c1b --- /dev/null +++ b/scripts/generic/rerun_failures.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# From an analysis directory, find stderr files that +# don't contain SUCCESS.
Run from analysis/ + +fails=$(grep -L "UCCESS" logs/*.e*) +nfails=$(echo $fails | wc -w ) + +echo found $nfails unsuccessful runs + +subjects=$(echo $fails | tr " " "\n" | \ + sed 's/.*sub-\([A-Za-z0-9][A-Za-z0-9]*\)\.e.*/sub-\1/') + + +>code/qsub_calls2.sh +for subject in $subjects +do + grep $subject code/qsub_calls.sh >> code/qsub_calls2.sh +done + + diff --git a/scripts/generic/rerun_remaining.py b/scripts/generic/rerun_remaining.py new file mode 100644 index 0000000..a513f31 --- /dev/null +++ b/scripts/generic/rerun_remaining.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python +# this file checks the output ria for complete job branches and crosschecks +# the original list of jobs to find the difference. The remaining subjects are +# printed to stdout or qsubbed using os.system + +# USAGE +# python code/rerun_remaining.py +# --output_ria /path/to/output/ria/short_hash/long_hash +# [--execute] + +import subprocess +import sys +import argparse +import os + + +def get_branches(ria): + + try: + assert os.path.exists(ria) + popdir = os.getcwd() + os.chdir(ria) + stdout = subprocess.check_output('git branch -a'.split()) + out = stdout.decode() + branches = [b.strip('* ') for b in out.splitlines()] + os.chdir(popdir) + + branches2 = [b for b in branches if "job" in b] + branches3 = [b[b.find("sub"):] for b in branches2] + + return branches3 + except Exception: + print("Error finding RIA branches") + print("Are you sure you gave the correct path?") + raise ValueError("No git branches found") + + +def get_all_jobs(): + + assert os.getcwd().endswith("analysis"), "Please only run this script from the ANALYSIS subdirectory of your bootstrap directory" + + with open('./code/qsub_calls.sh', 'r') as f: + qsubs = f.readlines() + + qsubs2 = [] + + for x in qsubs[1:]: + + subject_start = x[x.find(" sub-")+1:] + subject_end = subject_start[:subject_start.find(" ")] + qsubs2.append(subject_end.strip()) + + return (qsubs, qsubs2) + + +def main(): + + parser = argparse.ArgumentParser() + parser.add_argument("--output_ria", help="path to the output ria", type=str) + parser.add_argument("--execute", action="store_true") + args = parser.parse_args() + completed_jobs = get_branches(args.output_ria) + qsub_calls, all_jobs = get_all_jobs() + + # qsub_calls[0] is the '#!/bin/bash' header, so qsub_calls[i + 1] pairs with all_jobs[i]; + # build the list of remaining calls rather than deleting from the list while iterating over it + remaining = [qsub_calls[i + 1] for i, x in enumerate(all_jobs) if x not in completed_jobs] + + if args.execute: + print("Running qsub on remaining", len(remaining), "jobs") + for x in remaining: + os.system(x) + else: + print(len(remaining), "remaining jobs:") + print("\n".join(remaining)) + + +if __name__ == "__main__": + + main() \ No newline at end of file diff --git a/scripts/mit_slurm/bootstrap_fmriprep.sh b/scripts/mit_slurm/bootstrap_fmriprep.sh new file mode 100644 index 0000000..da61b80 --- /dev/null +++ b/scripts/mit_slurm/bootstrap_fmriprep.sh @@ -0,0 +1,272 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/fmriprep +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ !
-w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + # exit 1 +fi + + +## Check the BIDS input +BIDSINPUT=$1 +if [[ -z ${BIDSINPUT} ]] +then + echo "First required argument is an identifier of the BIDS source" + # exit 1 +fi + +TMPDIR=$2 +if [[ -z ${TMPDIR} ]] +then + echo "Second required argument is a path to temporary working directory" + # exit 1 +fi +mkdir -p "${TMPDIR}" + +# Is it a directory on the filesystem? +BIDS_INPUT_METHOD=clone +if [[ -d "${BIDSINPUT}" ]] +then + # Check if it's datalad + BIDS_DATALAD_ID=$(datalad -f '{infos[dataset][id]}' wtf -S \ + dataset -d ${BIDSINPUT} 2> /dev/null || true) + [ "${BIDS_DATALAD_ID}" = 'N/A' ] && BIDS_INPUT_METHOD=copy +fi + + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +# register the input dataset +echo "Cloning input dataset into analysis dataset" +datalad clone -d . ${BIDSINPUT} inputs/data +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + +SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 3 ) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +CONTAINERDS=///repronim/containers +datalad install -d . --source ${CONTAINERDS} + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#SBATCH --mem=25GB +#SBATCH --time=4-00:00:00 +# Set up the correct conda environment +source ${CONDA_PREFIX}/bin/activate base +echo I\'m in $PWD using `which python` +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" +# change into the cluster-assigned temp directory. 
Not done by default in SGE +# TODO: change to local tempdir assigned by SLURM +cd ${TMPDIR} +# OR Run it on a shared network drive +# cd /cbica/comp_space/$(basename $HOME) +# Used for the branch names and the temp dir +BRANCH="job-${JOB_ID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds +# all following actions are performed in the context of the superdataset +cd ds +# in order to avoid accumulation temporary git-annex availability information +# and to avoid a syncronization bottleneck by having to consolidate the +# git-annex branch across jobs, we will only push the main tracking branch +# back to the output store (plus the actual file content). Final availability +# information can be establish via an eventual `git-annex fsck -f joc-storage`. +# this remote is never fetched, it accumulates a larger number of branches +# and we want to avoid progressive slowdown. Instead we only ever push +# a unique branch per each job (subject AND process specific name) +git remote add outputstore "$pushgitremote" +# all results of this job will be put into a dedicated branch +git checkout -b "${BRANCH}" +# we pull down the input subject manually in order to discover relevant +# files. We do this outside the recorded call, because on a potential +# re-run we want to be able to do fine-grained recomputing of individual +# outputs. The recorded calls will have specific paths that will enable +# recomputation outside the scope of the original setup +datalad get -n "inputs/data/${subid}" +# Reomve all subjects we're not working on +(cd inputs/data && rm -rf `find . -type d -name 'sub*' | grep -v $subid`) +# ------------------------------------------------------------------------------ +# Do the run! +# TODO: Be sure the actual path to the fmriprep container is correct +datalad run \ + -i code/fmriprep_zip.sh \ + -i inputs/data/${subid} \ + -i inputs/data/*json \ + -i containers/.datalad/environments/fmriprep-20-2-3/image \ # FIX!! + --explicit \ + -o ${subid}_fmriprep-20.2.3.zip \ + -o ${subid}_freesurfer-20.2.3.zip \ + -m "fmriprep:20.2.3 ${subid}" \ + "bash ./code/fmriprep_zip.sh ${subid}" +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore +echo TMPDIR TO DELETE +echo ${BRANCH} +datalad uninstall -r --if-dirty ignore inputs/data +datalad drop -r . --nocheck +git annex dead here +cd ../.. +rm -rf $BRANCH +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +cat > code/fmriprep_zip.sh << "EOT" +#!/bin/bash +set -e -u -x +subid="$1" +mkdir -p ${PWD}/.git/tmp/wdir +# TODO: fix path to singularity image +singularity run --cleanenv -B ${PWD} \ + containers/.datalad/environments/fmriprep-20-2-3/image \ # FIX!! 
+ inputs/data \ + prep \ + participant \ + -w ${PWD}/.git/wkdir \ + --n_cpus 1 \ + --stop-on-first-crash \ + --fs-license-file code/license.txt \ + --skip-bids-validation \ + --use-aroma \ + --output-spaces MNI152NLin6Asym:res-2 anat \ + --participant-label "$subid" \ + --force-bbr \ + --cifti-output 91k -v -v +cd prep +7z a ../${subid}_fmriprep-20.2.3.zip fmriprep +7z a ../${subid}_freesurfer-20.2.3.zip freesurfer +rm -rf prep .git/tmp/wkdir +EOT + +chmod +x code/fmriprep_zip.sh +#cp ${FREESURFER_HOME}/license.txt code/license.txt +cp /om2/user/smeisler/TheWay/scripts/mit_slurm/license.txt code/license.txt #REMOVE THIS LATER + +mkdir logs +echo .SLURM_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +MERGE_POSTSCRIPT=https://raw.githubusercontent.com/PennLINC/TheWay/main/scripts/cubic/merge_outputs_postscript.sh +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh +wget -qO- ${MERGE_POSTSCRIPT} >> code/merge_outputs.sh + + +################################################################################ +# SLURM SETUP START - remove or adjust to your needs +################################################################################ +env_flags="--export=DSLOCKFILE=${PWD}/.SLURM_datalad_lock" +echo '#!/bin/bash' > code/sbatch_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs" +for subject in ${SUBJECTS}; do + echo "sbatch ${env_flags} --job-name=fp${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} " >> code/sbatch_calls.sh +done +datalad save -m "SLURM submission setup" code/ .gitignore + +################################################################################ +# SLURM SETUP END +################################################################################ + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +if [ "${BIDS_INPUT_METHOD}" = "clone" ] +then + datalad uninstall -r --nocheck inputs/data +fi + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS diff --git a/scripts/msi/bootstrap-bids-dataladdening.sh b/scripts/msi/bootstrap-bids-dataladdening.sh new file mode 100644 index 0000000..571ce4e --- /dev/null +++ b/scripts/msi/bootstrap-bids-dataladdening.sh @@ -0,0 +1,196 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +# workflow for converting many subjects into datalad + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +source /home/umii/hendr522/SW/miniconda3/etc/profile.d/conda.sh +conda activate datalad_and_nda +#if [ $? 
-gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/dataladdening +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent. Change permissions and retry + exit 1 +fi + + +## Check the BIDS input: This will be the copy of Anders's read-only ABCC BIDS data +BIDSINPUT=$1 +if [[ -z ${BIDSINPUT} ]] +then + echo "Required argument is an identifier of the BIDS source" + exit 1 +fi + +SUBJECTS=$(find ${BIDSINPUT} -maxdepth 1 -type d -name 'sub-*' | xargs -n 1 basename) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + exit 1 +fi + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# Only make a single ria store - we'll send all the subdatasets there +output_store="${PROJECTROOT}/BIDS_DATASETS" +mkdir -p ${output_store} + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +#SBATCH -J qsiprep +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=8 +#SBATCH --cpus-per-task=1 +#SBATCH --mem=20gb +#SBATCH -t 24:00:00 +#SBATCH -p small,amdsmall +#SBATCH -A feczk001 +#SBATCH --mail-type=ALL +#SBATCH --mail-user=hendr522@umn.edu + +# Set up the correct conda environment +source /home/umii/hendr522/SW/miniconda3/etc/profile.d/conda.sh +conda activate datalad_and_nda +echo I\'m in $PWD using `which python` + +# set up AWS credentials as environment variables +export AWS_ACCESS_KEY_ID=`cat ${HOME}/.s3cfg | grep access_key | awk '{print $3}'` +export AWS_SECRET_ACCESS_KEY=`cat ${HOME}/.s3cfg | grep secret_key | awk '{print $3}'` + +# fail whenever something is fishy, use -x to get verbose logfiles +export PS4='> ' +set -e -u -x + +echo $SLURM_JOB_ID +# Set up the remotes and get the subject id from the call +collector_dir="$1" +# make $2 subid/sesid to have session subdatasets ##UNTESTED## +subid="$2" +bidsroot="$3" +bucket="$4" +srname="$5" + +# change into the directory where the individual subject datasets will go +cd $collector_dir + +# New dataset to house this subject +datalad create -D "Copy subject $subid" $subid +cd $subid + +# Add the s3 output +git annex initremote "$srname" \ + type=S3 \ + autoenable=true \ + bucket=$bucket \ + encryption=none \ + "fileprefix=$subid/" \ + host=s3.msi.umn.edu \ + partsize=1GiB \ + port=443 \ + public=no + +# Copy the entire input directory into the current dataset +# and save it as a subdataset. +datalad run \ + -m "Copy in ${subid}" \ + "cp -rL ${bidsroot}/${subid}/* ." + +# Push to s3 +datalad push --to $srname + +# Cleanup +datalad drop . 
+ +# Announce +echo SUCCESS + +EOT + +chmod +x code/participant_job.sh + +mkdir logs +echo .SLURM_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +#TODO add s3info credential dynamically + +################################################################################ +# SLURM SETUP START - remove or adjust to your needs +################################################################################ +echo '#!/bin/bash' > code/srun_calls.sh +for subject in ${SUBJECTS}; do + eo_args="-e ${PWD}/logs/${subject}.err -o ${PWD}/logs/${subject}.out" + echo "sbatch -J bids${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${output_store} ${subject} ${BIDSINPUT} hendr522-dataladdening private-umn-s3" >> code/srun_calls.sh +done +datalad save -m "SLURM submission setup" code/ .gitignore + +################################################################################ +# SETUP END +################################################################################ + + +######################### +# Merge outputs script +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "output_store=${output_store}" >> code/merge_outputs.sh +echo "BIDSINPUT=${BIDSINPUT}" >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh + +cat >> code/merge_outputs.sh << "EOT" +cd $output_store +subjects=$(find . -maxdepth 1 -type d -name 'sub-*' | xargs -n 1 basename) +datalad create -D "Collection of BIDS subdatasets" -c text2git -d BIDS +cd BIDS +for subject in $subjects +do + datalad clone -d . ${output_store}/${subject} $subject +done +datalad save -m "added subject data" +EOT + +# if we get here, we are happy +echo SUCCESS \ No newline at end of file diff --git a/scripts/pmacs/bootstrap-bids-dataladdening.sh b/scripts/pmacs/bootstrap-bids-dataladdening.sh new file mode 100644 index 0000000..4056ecf --- /dev/null +++ b/scripts/pmacs/bootstrap-bids-dataladdening.sh @@ -0,0 +1,196 @@ +## NOTE ## +# This workflow is derived from the Datalad Handbook + +# workflow for converting many subjects into datalad + +## Ensure the environment is ready to bootstrap the analysis workspace +# Check that we have conda installed +#conda activate +#if [ $? -gt 0 ]; then +# echo "Error initializing conda. Exiting" +# exit $? +#fi + +DATALAD_VERSION=$(datalad --version) + +if [ $? -gt 0 ]; then + echo "No datalad available in your conda environment." + echo "Try pip install datalad" + # exit 1 +fi + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +set -e -u + + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/dataladdening +if [[ -d ${PROJECTROOT} ]] +then + echo ${PROJECTROOT} already exists + # exit 1 +fi + +if [[ ! -w $(dirname ${PROJECTROOT}) ]] +then + echo Unable to write to ${PROJECTROOT}\'s parent.
Change permissions and retry + # exit 1 +fi + + +## Check the BIDS input +#BIDSINPUT=/project/msdepression/repos/ms-depression/nifti +BIDSINPUT=$1 +if [[ -z ${BIDSINPUT} ]] +then + echo "Required argument is an identifier of the BIDS source" + # exit 1 +fi + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# Only make a single ria store - we'll send all the subdatasets there +output_store="ria+file://${PROJECTROOT}/output_ria" +# and the directory for aliases +mkdir -p ${PROJECTROOT}/output_ria/alias +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis +# Ensure the ria store is created +datalad create-sibling-ria -s output "${output_store}" + +SUBJECTS=$(find ${BIDSINPUT} -maxdepth 1 -type d -name 'sub-*' | xargs -n 1 basename) +if [ -z "${SUBJECTS}" ] +then + echo "No subjects found in input data" + # exit 1 +fi + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +# Set up the correct conda environment +echo I\'m in $PWD using `which python` + +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x + +echo $LSB_JOBID + +# Set up the remotes and get the subject id from the call +collector_ria="$1" +# make $2 subid/sesid to have session subdatasets ##UNTESTED## +subid="$2" +bidsroot="$3" + +# change into the cluster-assigned temp directory. Not done by default in LSF +workdir=/scratch/${LSB_JOBID} +mkdir -p ${workdir} +cd ${workdir} + +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad create -D "Copy subject $subid" $subid + +# all following actions are performed in the context of the superdataset +cd $subid +datalad create-sibling-ria -s output "${collector_ria}" + +# Copy the entire input directory into the current dataset +# and save it as a subdataset. +datalad run \ + -m "Copy in ${subid}" \ + "cp -r ${bidsroot}/${subid}/* ." + +ria_path=$(datalad siblings | grep 'output(-' | sed 's/.*\[\(.*\) (git)\]/\1/') + +datalad push --to output +datalad drop --nocheck . 
+git annex dead here + +# Make an alias in the RIA store +cd ${ria_path}/../../alias +pt1=$(basename `dirname $ria_path`) +pt2=$(basename $ria_path) +ln -s "../$pt1/$pt2" $subid + +# cleanup +rm -rf $workdir + +# Announce +echo SUCCESS +EOT + +chmod +x code/participant_job.sh + +mkdir logs +echo .LSF_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +################################################################################ +# LSF SETUP START - remove or adjust to your needs +################################################################################ +echo '#!/bin/bash' > code/bsub_calls.sh +eo_args="-e ${PWD}/logs -o ${PWD}/logs -n 1 -R 'rusage[mem=5000]'" +for subject in ${SUBJECTS}; do + echo "bsub -J bids${subject} ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${output_store} ${subject} ${BIDSINPUT}" >> code/bsub_calls.sh +done +datalad save -m "LSF submission setup" code/ .gitignore + +################################################################################ +# LSF SETUP END +################################################################################ + + +######################### +# Merge outputs script +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "output_store=${output_store}" >> code/merge_outputs.sh +echo "BIDSINPUT=${BIDSINPUT}" >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh + +cat >> code/merge_outputs.sh << "EOT" +subjects=$(ls output_ria/alias) +datalad create -D "Collection of BIDS subdatasets" -c text2git -d merge_ds +cd merge_ds +for subject in $subjects +do + datalad clone -d . ${output_store}"#~${subject}" $subject +done +datalad create-sibling-ria -s output "${output_store}" + +# Copy the non-subject data into here +cp $(find $BIDSINPUT -maxdepth 1 -type f) . +datalad save -m "Add subdatasets" +datalad push --to output + +ria_path=$(datalad siblings | grep 'output(-' | sed 's/.*\[\(.*\) (git)\]/\1/') + +# stop tracking this branch +datalad drop --nocheck . +git annex dead here + +cd ${ria_path}/../../alias +pt1=$(basename `dirname $ria_path`) +pt2=$(basename $ria_path) +ln -s "../$pt1/$pt2" data + +EOT + +# if we get here, we are happy +echo SUCCESS \ No newline at end of file diff --git a/scripts/pmacs/bootstrap-mimosa.sh b/scripts/pmacs/bootstrap-mimosa.sh new file mode 100644 index 0000000..5714bd6 --- /dev/null +++ b/scripts/pmacs/bootstrap-mimosa.sh @@ -0,0 +1,397 @@ +#!/bin/bash +# This workflow is derived from the Datalad Handbook + +set -euf -o pipefail + +usage() { + # Note that errors are sent to STDERR + echo "$0 --bids-input ria+file:///path/to/bids --container-ds /path/or/uri/to/containers" 1>&2 + echo 1>&2 + echo "$*" 1>&2 + exit 1 +} + +checkandexit() { + if [ $? != 0 ]; then + # there was an error + echo "$2" 1>&2 + exit $1 + fi +} + +BIDSINPUT="" +CONTAINERDS="" +FILTERFILE="" +OUTDIR="mimosa" +##### CLI parsing +while [ ${1-X} != "X" ]; do + case $1 in + -i | --bids-input) + shift + BIDSINPUT=$1 + shift + ;; + + -c | --container-ds) + shift + CONTAINERDS=$1 + shift + ;; + + -f | --filter-file) + shift + FILTERFILE=$1 + shift + ;; + + -o | --outdir) + shift + OUTDIR=$1 + shift + ;; + + *) + usage "Unrecognized argument: \"$1\"" + ;; + esac +done + +DATALAD_VERSION=$(datalad --version) +checkandexit $? 
"No datalad available in your conda environment; try pip install datalad" + +echo USING DATALAD VERSION ${DATALAD_VERSION} + +## Set up the directory that will contain the necessary directories +PROJECTROOT=${PWD}/${OUTDIR} +test ! -d ${PROJECTROOT} +checkandexit $? "${PROJECTROOT} already exists" + +test -w $(dirname ${PROJECTROOT}) +checkandexit $? "Unable to write to ${PROJECTROOT}'s parent. Change permissions and retry" + +## Check the BIDS input +test ! -z ${BIDSINPUT} +checkandexit $? "--bids-input is a required argument" + +## Check the container DS +test ! -z ${CONTAINERDS} +checkandexit $? "--container-ds is a required argument" + +# If we were given a filter file, check that it exists +test ! -z ${FILTERFILE} || true && test -f ${FILTERFILE} +checkandexit $? "Was given the filter file: '${FILTERFILE}' but no such file exists" + +## Start making things +mkdir -p ${PROJECTROOT} +cd ${PROJECTROOT} + +# Jobs are set up to not require a shared filesystem (except for the lockfile) +# ------------------------------------------------------------------------------ +# RIA-URL to a different RIA store from which the dataset will be cloned from. +# Both RIA stores will be created +input_store="ria+file://${PROJECTROOT}/input_ria" +output_store="ria+file://${PROJECTROOT}/output_ria" + +# Create a source dataset with all analysis components as an analysis access +# point. +datalad create -c yoda analysis +cd analysis + +# create dedicated input and output locations. Results will be pushed into the +# output sibling and the analysis will start with a clone from the input sibling. +datalad create-sibling-ria -s output "${output_store}" +pushremote=$(git remote get-url --push output) +datalad create-sibling-ria -s input --storage-sibling off "${input_store}" + +# register the input dataset +echo "Cloning input dataset into analysis dataset" +datalad clone -d . ${BIDSINPUT} inputs/data +# amend the previous commit with a nicer commit message +git commit --amend -m 'Register input data dataset as a subdataset' + +SUBJECTS=$(find inputs/data -type d -name 'sub-*' | cut -d '/' -f 3) +test ! -z "${SUBJECTS}" +checkandexit $? "No subjects found in input data" + +cd ${PROJECTROOT}/analysis +datalad install -d . --source ${CONTAINERDS} pennlinc-containers + +## the actual compute job specification +cat > code/participant_job.sh << "EOT" +#!/bin/bash +echo I\'m in $PWD using `which python` + +# fail whenever something is fishy, use -x to get verbose logfiles +set -e -u -x + +# Set up the remotes and get the subject id from the call +dssource="$1" +pushgitremote="$2" +subid="$3" + +# change into the cluster-assigned temp directory. 
Not done by default in LSF +cd ${TMPDIR} +# OR Run it on a shared network drive +# cd /cbica/comp_space/$(basename $HOME) + +# Used for the branch names and the temp dir +BRANCH="job-${LSB_JOBID}-${subid}" +mkdir ${BRANCH} +cd ${BRANCH} + +# get the analysis dataset, which includes the inputs as well +# importantly, we do not clone from the lcoation that we want to push the +# results to, in order to avoid too many jobs blocking access to +# the same location and creating a throughput bottleneck +datalad clone "${dssource}" ds + +# all following actions are performed in the context of the superdataset +cd ds + +# in order to avoid accumulation temporary git-annex availability information +# and to avoid a syncronization bottleneck by having to consolidate the +# git-annex branch across jobs, we will only push the main tracking branch +# back to the output store (plus the actual file content). Final availability +# information can be establish via an eventual `git-annex fsck -f joc-storage`. +# this remote is never fetched, it accumulates a larger number of branches +# and we want to avoid progressive slowdown. Instead we only ever push +# a unique branch per each job (subject AND process specific name) +git remote add outputstore "$pushgitremote" + +# all results of this job will be put into a dedicated branch +git checkout -b "${BRANCH}" + +# we pull down the input subject manually in order to discover relevant +# files. We do this outside the recorded call, because on a potential +# re-run we want to be able to do fine-grained recomputing of individual +# outputs. The recorded calls will have specific paths that will enable +# recomputation outside the scope of the original setup +datalad get -n "inputs/data/${subid}" + +# Remove all subjects we're not working on +(cd inputs/data && rm -rf `find . -type d -name 'sub*' | grep -v $subid`) + + +# ------------------------------------------------------------------------------ +# Do the run! + +if [ $# -eq 4 ]; then + datalad run \ + -i code/mimosa_zip.sh \ + -i inputs/data/${subid} \ + -i inputs/data/dataset_description.json \ + -i pennlinc-containers/.datalad/environments/mimosa-0-2-1/image \ + -i $4 \ + --explicit \ + -o ${subid}_mimosa-0.2.1.zip \ + -m "mimosa:0.2.1 ${subid}" \ + "bash ./code/mimosa_zip.sh ${subid} ${4}" +else + datalad run \ + -i code/mimosa_zip.sh \ + -i inputs/data/${subid} \ + -i inputs/data/dataset_description.json \ + -i pennlinc-containers/.datalad/environments/mimosa-0-2-1/image \ + --explicit \ + -o ${subid}_mimosa-0.2.1.zip \ + -m "mimosa:0.2.1 ${subid}" \ + "bash ./code/mimosa_zip.sh ${subid}" +fi + +# file content first -- does not need a lock, no interaction with Git +datalad push --to output-storage +# and the output branch +flock $DSLOCKFILE git push outputstore + +echo TMPDIR TO DELETE +echo ${BRANCH} + +datalad drop -r . --nocheck +datalad uninstall -r inputs/data +git annex dead here +cd ../.. 
+rm -rf $BRANCH + +echo SUCCESS +# job handler should clean up workspace +EOT + +chmod +x code/participant_job.sh + +cat > code/mimosa_zip.sh << "EOT" +#!/bin/bash +set -e -u -x + +export SINGULARITYENV_CORES=1 +export SINGULARITYENV_ITK_GLOBAL_DEFAULT_NUMBER_OF_THREADS=1 +export SINGULARITYENV_OMP_NUM_THREADS=1 +export SINGULARITYENV_OMP_THREAD_LIMIT=1 +export SINGULARITYENV_MKL_NUM_THREADS=1 +export SINGULARITYENV_OPENBLAS_NUM_THREADS=1 +export SINGULARITYENV_TMPDIR=$TMPDIR + +subid="$1" + +if [ $# -eq 2 ]; then + filterfile=$2 + + singularity run --cleanenv -B ${PWD} -B ${TMPDIR} \ + pennlinc-containers/.datalad/environments/mimosa-0-2-1/image \ + inputs/data \ + mimosa \ + participant \ + --participant_label $(echo $subid | cut -d '-' -f 2) \ + --bids-filter-file $filterfile \ + --strip mass \ + --n4 \ + --register \ + --whitestripe \ + --thresh 0.25 \ + --debug \ + --skip_bids_validator +else + singularity run --cleanenv -B ${PWD} -B ${TMPDIR} \ + pennlinc-containers/.datalad/environments/mimosa-0-2-1/image \ + inputs/data \ + mimosa \ + participant \ + --participant_label $(echo $subid | cut -d '-' -f 2) \ + --strip mass \ + --n4 \ + --register \ + --whitestripe \ + --thresh 0.25 \ + --debug \ + --skip_bids_validator +fi + +outdirs=$(ls | grep mimosa) +7z a ${subid}_mimosa-0.2.1.zip $outdirs +rm -rf $outdirs + +EOT + +chmod +x code/mimosa_zip.sh + +mkdir logs +echo .LSF_datalad_lock >> .gitignore +echo logs >> .gitignore + +datalad save -m "Participant compute job implementation" + +# Add a script for merging outputs +cat > code/merge_outputs.sh << "EOT" +#!/bin/bash +set -e -u -x +EOT +echo "outputsource=${output_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" \ + >> code/merge_outputs.sh +echo "cd ${PROJECTROOT}" >> code/merge_outputs.sh + +cat >> code/merge_outputs.sh << "EOT" + +# The following should be pasted into the merge_outputs.sh script +datalad clone ${outputsource} merge_ds +cd merge_ds +NBRANCHES=$(git branch -a | grep job- | sort | wc -l) +echo "Found $NBRANCHES branches to merge" + +gitref=$(git show-ref master | cut -d ' ' -f1 | head -n 1) + +# query all branches for the most recent commit and check if it is identical. +# Write all branch identifiers for jobs without outputs into a file. +for i in $(git branch -a | grep job- | sort); do [ x"$(git show-ref $i \ + | cut -d ' ' -f1)" = x"${gitref}" ] && \ + echo $i; done | tee code/noresults.txt | wc -l + + +for i in $(git branch -a | grep job- | sort); \ + do [ x"$(git show-ref $i \ + | cut -d ' ' -f1)" != x"${gitref}" ] && \ + echo $i; \ +done | tee code/has_results.txt + +mkdir -p code/merge_batches +num_branches=$(wc -l < code/has_results.txt) +CHUNKSIZE=5000 +set +e +num_chunks=$(expr ${num_branches} / ${CHUNKSIZE}) +if [[ $num_chunks == 0 ]]; then + num_chunks=1 +fi +set -e +for chunknum in $(seq 1 $num_chunks) +do + startnum=$(expr $(expr ${chunknum} - 1) \* ${CHUNKSIZE} + 1) + endnum=$(expr ${chunknum} \* ${CHUNKSIZE}) + batch_file=code/merge_branches_$(printf %04d ${chunknum}).txt + [[ ${num_branches} -lt ${endnum} ]] && endnum=${num_branches} + branches=$(sed -n "${startnum},${endnum}p;$(expr ${endnum} + 1)q" code/has_results.txt) + echo ${branches} > ${batch_file} + git merge -m "merge results batch ${chunknum}/${num_chunks}" $(cat ${batch_file}) + +done + +# Push the merge back +git push + +# Get the file availability info +git annex fsck --fast -f output-storage + +# This should not print anything +MISSING=$(git annex find --not --in output-storage) + +if [[ ! 
-z "$MISSING" ]] +then + echo Unable to find data for $MISSING + exit 1 +fi + +# stop tracking this branch +git annex dead here + +datalad push --data nothing +echo SUCCESS +EOT + + +################################################################################ +# LSF SETUP START - remove or adjust to your needs +################################################################################ +echo '#!/bin/bash' > code/bsub_calls.sh +echo "export DSLOCKFILE=${PWD}/.LSF_datalad_lock" >> code/bsub_calls.sh +dssource="${input_store}#$(datalad -f '{infos[dataset][id]}' wtf -S dataset)" +pushgitremote=$(git remote get-url --push output) +eo_args="-e ${PWD}/logs -o ${PWD}/logs -n 1 -R 'rusage[mem=20000]'" +if [ ! -z "${FILTERFILE}" ]; then + cp ${FILTERFILE} code/filterfile.json + FILTERFILE="code/filterfile.json" +fi +for subject in ${SUBJECTS}; do + echo "bsub ${eo_args} \ + ${PWD}/code/participant_job.sh \ + ${dssource} ${pushgitremote} ${subject} ${FILTERFILE}" >> code/bsub_calls.sh +done +datalad save -m "LSF submission setup" code/ .gitignore + +################################################################################ +# LSF SETUP END +################################################################################ + +# make sure the fully configured output dataset is available from the designated +# store for initial cloning and pushing the results. +datalad push --to input +datalad push --to output + +# cleanup - we have generated the job definitions, we do not need to keep a +# massive input dataset around. Having it around wastes resources and makes many +# git operations needlessly slow +datalad uninstall -r --nocheck inputs/data + +# Add an alias to the data in the RIA store +RIA_DIR=$(find $PROJECTROOT/output_ria/???/ -maxdepth 1 -type d | sort | tail -n 1) +mkdir -p ${PROJECTROOT}/output_ria/alias +ln -s ${RIA_DIR} ${PROJECTROOT}/output_ria/alias/data + +# if we get here, we are happy +echo SUCCESS
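For reference, a minimal usage sketch for the mimosa bootstrap above. The RIA URL and the container-dataset path are placeholders (the container dataset is assumed to provide the mimosa-0-2-1 environment referenced in the script), and --filter-file is optional; adjust for your cluster before running.

# bootstrap the analysis workspace (hypothetical paths)
bash bootstrap-mimosa.sh \
    --bids-input ria+file:///path/to/bids_ria#~data \
    --container-ds /path/to/pennlinc-containers \
    --outdir mimosa

# submit the generated jobs from the analysis dataset
cd mimosa/analysis
bash code/bsub_calls.sh

# once all jobs have finished, merge the per-job branches and verify annexed content
bash code/merge_outputs.sh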