-
Notifications
You must be signed in to change notification settings - Fork 2
/
_run_massbank.sh
229 lines (200 loc) · 6.56 KB
/
_run_massbank.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#!/bin/bash
# -----------------------
# Read the positional parameters of the job script.
#
# All 19 arguments are positional. They are only stored here; the presence
# checks that follow this section are currently commented out.
DEBUG=0                     # forwarded as --debug
N_THREADS=$1                # threads per job; divides SLURM_CPUS_PER_TASK to get N_JOBS
MOL_KERNEL=$2               # forwarded as --mol_kernel
EVAL_SET_ID=$3              # first positional argument of the Python run script
SSVM_MODEL_INDEX=$4         # second positional argument of the Python run script
N_TRAIN_SEQS=$5             # forwarded as --n_samples_train
N_EPOCHS=$6                 # forwarded as --n_epoch
BATCH_SIZE=$7               # forwarded as --batch_size
MS2SCORER=$8                # forwarded as --ms2scorer
LLOSS_MODE=$9               # forwarded as --lloss_fps_mode

#######################################
# Split a comma-separated list into the global array C_GRID.
# Uses 'read -a' instead of 'echo | tr' so that every value becomes its own
# array element and no globbing is applied to the input.
# Globals:   C_GRID (written)
# Arguments: $1 - comma-separated values, e.g. "0.5,1,2"
# Returns:   exit status of 'read'
#######################################
set_c_grid() {
  IFS=',' read -r -a C_GRID <<< "$1"
}
set_c_grid "${10}"          # forwarded as the values of --C_grid

LOAD_OPT_C=${11}            # forwarded as --load_optimal_C_from_tree_zero
SSVM_FLAVOR=${12}           # forwarded as --ssvm_flavor
MOL_FEATURES=${13}          # forwarded as --mol_feat_retention_order
MOL_IDENTIFIER=${14}        # forwarded as --molecule_identifier
MAX_N_CAND_TRAIN=${15}      # forwarded as --max_n_train_candidates
TRAINING_DATASET=${16}      # forwarded as --training_dataset
EXP_VERSION=${17}           # becomes part of the output directory name
SSVM_LIBRARY_VERSION=${18}  # pattern used to select the SSVM library git tag
CLUSTER_SYSTEM=${19}        # "triton" or "puhti"
#if [ -z "$N_THREADS" ] ; then
# echo "[ERROR] Number of threads (N_THREADS) not specified."
# exit 1
#fi
#
#if [ -z "$MOL_KERNEL" ] ; then
# echo "[ERROR] Molecule kernel (MOL_KERNEL) not specified."
# exit 1
#fi
#
#if [ -z "$EVAL_SET_ID" ] ; then
# echo "[ERROR] Evaluation set ID (EVAL_SET_ID) not specified."
# exit 1
#fi
#
#if [ -z "$SSVM_MODEL_INDEX" ] ; then
# echo "[ERROR] SSVM model index (SSVM_MODEL_INDEX) not specified."
# exit 1
#fi
#
#if [ -z "$N_TRAIN_SEQS" ] ; then
# echo "[ERROR] Number of training sequences (N_TRAIN_SEQS) not specified."
# exit 1
#fi
#
#if [ -z "$N_EPOCHS" ] ; then
# echo "[ERROR] Number of training epochs (N_EPOCHS) not specified."
# exit 1
#fi
#
#if [ -z "$BATCH_SIZE" ] ; then
# echo "[ERROR] Batch-size (BATCH_SIZE) not specified."
# exit 1
#fi
#
#if [ -z "$MS2SCORER" ] ; then
# echo "[ERROR] MS2-scoring method (MS2SCORER) not specified."
# exit 1
#fi
#
#if [ -z "$LLOSS_MODE" ] ; then
# echo "[ERROR] Label-loss mode (LLOSS_MODE) not specified."
# exit 1
#fi
#
#if [ -z "$C_GRID" ] ; then
# echo "[ERROR] C-grid (C_GRID) not specified."
# exit 1
#fi
#
#if [ -z "$LOAD_OPT_C" ] ; then
# echo "[ERROR] Not specified whether the optimal C-parameter (LOAD_OPT_C) should be loaded."
# exit 1
#fi
#
#if [ -z "$SSVM_FLAVOR" ] ; then
# echo "[ERROR] SSVM flavor (SSVM_FLAVOR) not specified."
# exit 1
#fi
#
#if [ -z "$MOL_FEATURES" ] ; then
# echo "[ERROR] Molecule features (MOL_FEATURES) not specified."
# exit 1
#fi
#
#if [ -z "$MOL_IDENTIFIER" ] ; then
# echo "[ERROR] Molecule identifier (MOL_IDENTIFIER) not specified."
# exit 1
#fi
#
#if [ -z "$MAX_N_CAND_TRAIN" ] ; then
# echo "[ERROR] Maximum number of random candidates during training (MAX_N_CAND_TRAIN) not specified."
# exit 1
#fi
#
#if [ -z "$TRAINING_DATASET" ] ; then
# echo "[ERROR] Training dataset (TRAINING_DATASET) not specified."
# exit 1
#fi
# -----------------------
# ----------------------------------------------
# Create temporary directories on computing node
#
# On "triton" the scratch directory is created here under /dev/shm, named
# after the SLURM job id. On any other system LOCAL_SCRATCH must already be
# set in the environment, otherwise the script aborts.
if [[ "${CLUSTER_SYSTEM}" == "triton" ]] ; then
  LOCAL_SCRATCH="/dev/shm/$SLURM_JOB_ID"
  mkdir "$LOCAL_SCRATCH" || exit 1

  # Register the cleanup as soon as the directory exists, so that it is also
  # removed when a later set-up step fails. The single quotes defer expansion
  # until the trap fires; ':?' guards against an empty path.
  trap 'rm -rf -- "${LOCAL_SCRATCH:?}"; exit' TERM EXIT
fi
# On puhti (CSC) the local disc is automatically wiped

if [[ -z "${LOCAL_SCRATCH}" ]] ; then
  echo "[ERROR] Local scratch directory not defined on the cluster system '${CLUSTER_SYSTEM}'." >&2
  exit 1
fi

# Local directories
LOCAL_DIR=${LOCAL_SCRATCH}               # base dir
LOCAL_DB_DIR=${LOCAL_SCRATCH}            # for SQLite DB
LOCAL_CONDA_DIR="$LOCAL_DIR/miniconda/"  # for conda environment
mkdir "$LOCAL_CONDA_DIR" || exit 1
# Set some multiprocessing parameters
N_JOBS=$(($SLURM_CPUS_PER_TASK/$N_THREADS)) # e.g. parallel sub-problems solved
echo "n_cpus=$SLURM_CPUS_PER_TASK, n_threads=$N_THREADS, n_jobs=$N_JOBS, mol_kernel=$MOL_KERNEL"
# Set up some paths
BASE_PROJECT_DIR="${SCRATCHHOME}/projects/"
PROJECT_DIR="$BASE_PROJECT_DIR/lcms2struct_experiments/"
SCRIPTPATH="$PROJECT_DIR/run_scripts/publication/massbank/run_with_gridsearch.py"
DB_DIR="$PROJECT_DIR/data/"
DB_FN="massbank.sqlite"
# ----------------------------
# Set up the conda environment
#
# Steps: pick the newest git tag of the SSVM library that matches
# SSVM_LIBRARY_VERSION, clone that tag to the local scratch directory, clone
# the conda environment "msms_rt_ssvm__base" to the local scratch directory,
# activate the clone and pip-install the library into it.
# Every step aborts the job on failure, so the experiment is never started
# with a missing or wrong library version.
module load git

# Directory where our SSVM library will be stored
SSVM_LIB_DIR="$LOCAL_DIR/msms_rt_ssvm"

if [[ "${CLUSTER_SYSTEM}" == "triton" ]] ; then
  module load miniconda

  # Which SSVM library version should be used? The tags are read from the
  # remote repository; 'sed' strips the "^{}" suffix from the selected ref.
  GIT_REPO_URL="git@github.com:aalto-ics-kepaco/msms_rt_ssvm.git"
  VERSION_TAG=$(git ls-remote --tags "$GIT_REPO_URL" | cut -d'/' -f3 | grep "$SSVM_LIBRARY_VERSION" | sort --version-sort | tail -1 | sed "s/\^{}//g")

  # Activate conda
  eval "$(conda shell.bash hook)"
elif [[ "${CLUSTER_SYSTEM}" == "puhti" ]] ; then
  # Activate conda
  eval "$("/projappl/${MYPROJECTID}/miniconda3/bin/conda" shell.bash hook)"

  # Which SSVM library version should be used? The tags are read from a
  # repository checkout below $SCRATCHHOME.
  GIT_REPO_URL=${SCRATCHHOME}/projects/msms_rt_ssvm
  VERSION_TAG=$(git --git-dir="${GIT_REPO_URL}/.git" tag --list | grep "$SSVM_LIBRARY_VERSION" | sort --version-sort | tail -1)
fi

echo "Repository URL: $GIT_REPO_URL"
echo "Version-tag: $VERSION_TAG"

# Without a tag there is nothing sensible to clone (this also covers cluster
# systems that are handled by neither branch above).
if [[ -z "$VERSION_TAG" ]] ; then
  echo "[ERROR] No tag matching SSVM library version '${SSVM_LIBRARY_VERSION}' found in '${GIT_REPO_URL}'." >&2
  exit 1
fi

# Clone the SSVM library at the selected version tag
git clone --branch "$VERSION_TAG" "$GIT_REPO_URL" "$SSVM_LIB_DIR" || exit 1

# Create the local conda environment as a clone of the base environment
cd "$LOCAL_CONDA_DIR" || exit 1
conda create --clone msms_rt_ssvm__base --prefix msms_rt_ssvm__local || exit 1
conda activate "$LOCAL_CONDA_DIR/msms_rt_ssvm__local" || exit 1
cd - || exit 1

# Install SSVM library
pip install --no-deps "$SSVM_LIB_DIR" || exit 1
# ----------------------------
# Set up path to output directory
#
# The output directory mirrors the directory of the run script, with every
# "run_scripts" in the path replaced by "results_raw", plus a sub-directory
# named after the SSVM library version and the experiment version.
OUTPUT_DIR="$(dirname -- "$SCRIPTPATH" | sed "s/run_scripts/results_raw/g")/ssvm_lib=${SSVM_LIBRARY_VERSION}__exp_ver=${EXP_VERSION}/"
mkdir -p "$OUTPUT_DIR" || exit 1
echo "Output dir: $OUTPUT_DIR"

# Copy the DB file to the node's local disk
cp -v "$DB_DIR/$DB_FN" "$LOCAL_DB_DIR" || exit 1

# Copy the run script to the local disk (protects it from changes made to the
# project tree while the job is running)
LOCAL_SCRIPTPATH="$LOCAL_DIR/$(basename -- "$SCRIPTPATH")"
cp -v "$SCRIPTPATH" "$LOCAL_SCRIPTPATH" || exit 1
# ------------------
# Run the experiment
#
# The thread limits have to be exported: plain 'VAR=value;' statements only
# create shell variables, which are not passed on to the environment of the
# 'srun python' child process.
export NUMBA_NUM_THREADS="$N_THREADS"
export OMP_NUM_THREADS="$N_THREADS"
export OPENBLAS_NUM_THREADS="$N_THREADS"

# ${C_GRID[*]} is deliberately left unquoted: word splitting turns the grid
# into one command-line argument per value, whether the values are stored as
# separate array elements or as one whitespace-separated string.
# shellcheck disable=SC2086,SC2048
srun python "$LOCAL_SCRIPTPATH" \
  "$EVAL_SET_ID" "$SSVM_MODEL_INDEX" \
  --n_jobs="$N_JOBS" \
  --n_threads_test_prediction="$SLURM_CPUS_PER_TASK" \
  --db_fn="$LOCAL_DB_DIR/$DB_FN" \
  --output_dir="$OUTPUT_DIR" \
  --n_samples_train="$N_TRAIN_SEQS" \
  --n_epoch="$N_EPOCHS" \
  --batch_size="$BATCH_SIZE" \
  --mol_kernel="$MOL_KERNEL" \
  --ms2scorer="$MS2SCORER" \
  --lloss_fps_mode="$LLOSS_MODE" \
  --stepsize="linesearch" \
  --C_grid ${C_GRID[*]} \
  --debug="$DEBUG" \
  --load_optimal_C_from_tree_zero="$LOAD_OPT_C" \
  --ssvm_flavor="$SSVM_FLAVOR" \
  --mol_feat_retention_order="$MOL_FEATURES" \
  --molecule_identifier="$MOL_IDENTIFIER" \
  --max_n_train_candidates="$MAX_N_CAND_TRAIN" \
  --training_dataset="$TRAINING_DATASET"
RUN_STATUS=$?  # must directly follow the srun command

# ------------------
# Export the conda environment.yml when the run was successful
# NOTE(review): a failed run is only reported on stderr; the exit status of
# the script itself is not set from RUN_STATUS here.
if [[ "$RUN_STATUS" -eq 0 ]] ; then
  conda env export --no-build > "$OUTPUT_DIR/conda_env__$SLURM_JOB_ID.yml"
else
  echo "[ERROR] Experiment exited with status ${RUN_STATUS}; conda environment not exported." >&2
fi