
Commit 3b3f33b

Quality improvements (#29)
1 parent a09b0ac commit 3b3f33b


67 files changed (+912, -871 lines)

DAG.pdf (69.3 KB)

Binary file not shown.

Makefile (+35, -56)

@@ -11,9 +11,12 @@ WORKSPACE=12000
 CLUSTER_CORES=16
 CONFIG=configs/config.prod.yml
 CONDA_PATH=$(SHARED_ROOT)/mambaforge
+SNAKEMAKE_OUTPUT_CACHE=$(SHARED_ROOT)/cache
+TARGET=
 ###
 
 CONDA_ACTIVATE=source $(CONDA_PATH)/etc/profile.d/conda.sh ; conda activate ; conda activate
+SNAKEMAKE=export SNAKEMAKE_OUTPUT_CACHE=$(SNAKEMAKE_OUTPUT_CACHE); snakemake
 
 ### 2. setup
 
@@ -26,7 +29,8 @@ conda:
 
 snakemake:
     $(CONDA_ACTIVATE) base
-    mamba create -c conda-forge -c bioconda -n snakemake snakemake==6.9.1 --yes
+    mamba create -c conda-forge -c bioconda -n snakemake snakemake==6.10.0 --yes
+    mkdir -p "$(SNAKEMAKE_OUTPUT_CACHE)"
 
 # build container image for cluster and run-local modes (preferred)
 build:
@@ -44,64 +48,78 @@ pull:
 
 dry-run:
     $(CONDA_ACTIVATE) snakemake
-    snakemake \
+    $(SNAKEMAKE) \
      --use-conda \
      --cores all \
+     --cache \
      --reason \
      --configfile $(CONFIG) \
      --config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) deps=true \
-     -n
+     -n \
+     $(TARGET)
 
 run-local:
+    echo "Running with config $(CONFIG)"
    $(CONDA_ACTIVATE) snakemake
-    snakemake \
+    $(SNAKEMAKE) \
      --use-conda \
      --reason \
      --cores all \
+     --cache \
      --resources gpu=$(GPUS) \
      --configfile $(CONFIG) \
-     --config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) deps=true
+     --config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) deps=true \
+     $(TARGET)
+
+test: CONFIG=configs/config.test.yml
+test: run-local
 
 run-local-container:
    $(CONDA_ACTIVATE) snakemake
    module load singularity
-    snakemake \
+    $(SNAKEMAKE) \
      --use-conda \
      --use-singularity \
      --reason \
      --cores all \
+     --cache \
      --resources gpu=$(GPUS) \
      --configfile $(CONFIG) \
      --config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) \
-     --singularity-args="--bind $(SHARED_ROOT),$(CUDA_DIR) --nv"
+     --singularity-args="--bind $(SHARED_ROOT),$(CUDA_DIR) --nv" \
+     $(TARGET)
 
 run-slurm:
    $(CONDA_ACTIVATE) snakemake
    chmod +x profiles/slurm/*
-    snakemake \
+    $(SNAKEMAKE) \
      --use-conda \
      --reason \
      --cores $(CLUSTER_CORES) \
+     --cache \
      --configfile $(CONFIG) \
      --config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) \
-     --profile=profiles/slurm
+     --profile=profiles/slurm \
+     $(TARGET)
 
 run-slurm-container:
    $(CONDA_ACTIVATE) snakemake
    chmod +x profiles/slurm/*
    module load singularity
-    snakemake \
+    $(SNAKEMAKE) \
      --use-conda \
      --use-singularity \
      --reason \
      --verbose \
      --cores $(CLUSTER_CORES) \
+     --cache \
      --configfile $(CONFIG) \
      --config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) \
      --profile=profiles/slurm \
-     --singularity-args="--bind $(SHARED_ROOT),$(CUDA_DIR),/tmp --nv --containall"
+     --singularity-args="--bind $(SHARED_ROOT),$(CUDA_DIR),/tmp --nv --containall" \
+     $(TARGET)
 # if CPU nodes don't have access to cuda dirs, use
-# export CUDA_DIR=$(CUDA_DIR)
+# export CUDA_DIR=$(CUDA_DIR); $(SNAKEMAKE) \
 #   --singularity-args="--bind $(SHARED_ROOT),/tmp --nv --containall"
 
 
@@ -123,25 +141,11 @@ run-file-server:
 ### extra
 
 dag:
-    snakemake --dag | dot -Tpdf > DAG.pdf
-
-lint:
-    snakemake --lint
-
-install-monitor:
-    $(CONDA_ACTIVATE) base
-    conda create --name panoptes
-    conda install -c panoptes-organization panoptes-ui
-
-run-monitor:
-    $(CONDA_ACTIVATE) panoptes
-    panoptes
-
-run-with-monitor:
     snakemake \
-     --use-conda \
-     --cores all \
-     --wms-monitor http://127.0.0.1:5000
+     --dag \
+     --configfile $(CONFIG) \
+     --config root="$(SHARED_ROOT)" cuda="$(CUDA_DIR)" gpus=$(GPUS) workspace=$(WORKSPACE) \
+     | dot -Tpdf > DAG.pdf
 
 install-tensorboard:
    $(CONDA_ACTIVATE) base
@@ -151,29 +155,4 @@ tensorboard:
    $(CONDA_ACTIVATE) tensorboard
    ls -d $(SHARED_ROOT)/models/*/*/* > tb-monitored-jobs; \
    tensorboard --logdir=$$MODELS --host=0.0.0.0 &; \
-    python utils/tb_log_parser.py --prefix=
-
-install-snakepit-scheduler:
-    mkdir -p $(SHARED_ROOT)/snakepit
-    cd $(SHARED_ROOT)/snakepit
-
-    curl -sL https://deb.nodesource.com/setup_12.x | sudo -E bash -
-    sudo apt install nodejs
-
-    if [ ! -e snakepit-client ]; then
-      git clone https://github.com/mozilla/snakepit-client.git
-    fi
-    cd snakepit-client
-    npm install
-    sudo npm link
-
-    echo "http://10.2.224.243" > /root/.pitconnect.txt
-
-    pit status
-
-run-snakepit:
-    chmod +x profiles/snakepit/*
-    snakemake \
-     --use-conda \
-     --cores all \
-     --profile=profiles/snakepit
+    python utils/tb_log_parser.py --prefix=

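Usage of the new Makefile variables and targets (a minimal sketch based only on the targets and values that appear in this commit: `run-local`, `run-slurm-container`, `test`, `TARGET`, `merge_corpus`):

```sh
# Run the whole pipeline locally with the default production config.
make run-local

# Quick end-to-end smoke test: the `test` target overrides CONFIG with
# configs/config.test.yml and then reuses the run-local recipe.
make test

# Stop after a specific (non-wildcard) Snakemake rule, e.g. corpus merging.
make run-local TARGET=merge_corpus

# TARGET is passed through to every snakemake invocation, so it also works
# with the cluster targets.
make run-slurm-container TARGET=merge_corpus
```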
README.md (+145, -11)

@@ -128,15 +128,24 @@ make dry-run
 
 ### Local mode
 
-Without containerization:
+#### Without containerization
+
 ```
 make run-local
 ```
-With containerization:
+To test the whole pipeline end to end (it is supposed to run quickly and does not train anything useful):
+
+```
+make test
+```
+Or run
+#### With containerization
 ```
 make run-local-container
 ```
 
+
+
 ### Cluster mode
 
 To run on Slurm
@@ -149,6 +158,18 @@ with containerization (recommended):
 ```
 make run-slurm-container
 ```
+### Specific target
+
+By default, all Snakemake rules are executed. To run the pipeline up to a specific rule, use:
+```
+make <run-command> TARGET=<non-wildcard-rule>
+```
+
+For example, to collect the corpus first:
+```
+make run-local TARGET=merge_corpus
+```
+
 
 ### Using Snakepit
 
@@ -209,20 +230,23 @@ Step | Description | Bottleneck | Comments
 --- | --- | --- | ---
 Installation | Installing dependencies and compiling | CPU | Takes ~1 hour
 Data downloading | Downloads datasets, samples sentences | Network, Disk | Time depends on dataset size, sampling of huge mono datasets (100M+ sentences) is the most intensive operation.
-Data cleaning | Basic preprocessing, language specific, rule based, deduplication, and other attempts to clean noisy data in parallel and mono datasets | CPU | Good parallelization across CPU cores. To make cleaning of a new language more efficient add it to [clean_parallel.py](/pipeline/clean/clean_parallel.py).
-Bicleaner | Filters noisy sentence pairs in a parallel corpus using [bicleaner](https://github.com/bitextor/bicleaner) or [bicleaner-ai](https://github.com/bitextor/bicleaner-ai) depending on available language packs. | CPU, GPU | If there are no pretrained language packs for bicleaner-ai, it uses bicleaner. If there are no ones for bicleaner either, this step is skipped. Cleaning threshold is controlled by `BICLEANER_THRESHOLD` config setting.
+Data cleaning | Basic preprocessing, dataset specific, language specific, rule based and other attempts to clean noisy data in parallel and mono datasets | CPU | Good parallelization across CPU cores. To make cleaning of a new language more efficient add it to [clean_parallel.py](/pipeline/clean/tools/clean_parallel.py).
+Bicleaner | Filters noisy sentence pairs in a parallel corpus using [bicleaner](https://github.com/bitextor/bicleaner) or [bicleaner-ai](https://github.com/bitextor/bicleaner-ai) depending on available language packs. | CPU, GPU | If there are no pretrained language packs for bicleaner-ai, it uses bicleaner. If there are none for bicleaner either, this step is skipped. Cleaning thresholds are configurable per dataset, see [Dataset cleaning](#dataset-cleaning).
+Merge and dedupe | Merges clean datasets and applies deduplication | CPU, Disk |
 Training s2s | Trains a backward shallow s2s model, which is useful for back-translations and ce-filtering | GPU | Inspired by a [marian example](https://github.com/marian-nmt/marian-examples/tree/master/training-basics-sentencepiece).
-Augmentation with back-translations | Translates mono corpus combined from `MONO_DATASETS_TRG` using shallow s2s model. | GPU | It is more useful for low-resource languages and can be skipped for others.
-Training teacher | Trains one or multiple big transformer models | GPU | You might want to adjust [early stopping](pipeline/train/configs/training/teacher.transformer.train.yml) parameters depending on datasets size. Inspired by [transformer](https://github.com/marian-nmt/marian-examples/tree/master/transformer) and [wmt2017-uedin](https://github.com/marian-nmt/marian-examples/tree/master/wmt2017-uedin) marian examples and extended with [SentencePiece](https://github.com/google/sentencepiece).
+Augmentation with back-translations | Translates a mono corpus combined from monolingual datasets in the target language using the shallow s2s model. | GPU | It is more useful for low-resource languages and can be skipped for others.
+Training teacher | Trains an ensemble of big transformer models on the augmented dataset | GPU | You might want to adjust the [early stopping](pipeline/train/configs/training/teacher.transformer.train.yml) or `after-epochs` parameters depending on dataset size.
+Continue training teacher | Continues training the ensemble of teachers on parallel data only | GPU | You might want to adjust the [early stopping](pipeline/train/configs/training/teacher.transformer.train.yml) parameters depending on dataset size.
 Translation by teacher | Translates a corpus and monolingual data combined from `MONO_DATASETS_SRC` using the teacher model (ensemble is not supported yet) | GPU | The slowest part of the pipeline. Can take days. It is possible to speed it up launching the same scripts ([corpus](pipeline/translate/translate-corpus.sh), [mono](pipeline/translate/translate-mono.sh)) in parallel from another machine with access to the same network directory.
 Cross-entropy filtering | Scores translated corpus with backward s2s model and removes a part of the corpus with the lowest scores to reduce noise | GPU, CPU, Disk | At this point we work with huge datasets, so it utilizes copying to a local disk to make things faster.
 Training alignments and shortlist | Trains alignments using [fast_align](https://github.com/clab/fast_align) and extracts lexical shortlist using [extract_lex](https://github.com/marian-nmt/extract-lex) tool | CPU, Disk | Some tools requires uncompressed datasets on disk and they are huge at this point. Data is copied to a local disk to make things faster. Might take 100+GB of local disk depending on a dataset size. Good CPU parallelization.
-Training student | Trains a small transformer student model on filtered data and using alignments | GPU | Run [Tensorboard](utils/tensorboard/tensorboard.sh) manually to see training visualization.
+Training student | Trains a small transformer student model on filtered data and using alignments | GPU |
 Fine-tuning student | Finetunes the student model by emulating 8bit GEMM during training | GPU | Converges very quickly and then degrades. It's quick but you might want to reduce early stopping threshold.
 Quantizaiton | Applies 8 bit quantization to the fined-tuned student model and evaluates on CPU | CPU | CPU threads must be set to 1 for this step.
+Evaluation | Calculates metrics for all models (BLEU, chrF) using [SacreBLEU](https://github.com/mjpost/sacrebleu) | GPU | Uses the `datasets.test` configuration section.
 Export | Exports trained model and shortlist to (bergamot-translator)(https://github.com/mozilla/bergamot-translator) format | |
 
-## Datasets importers
+## Dataset importers
 
 Dataset importers can be used in `datasets` sections of experiment config.
 
@@ -256,6 +280,119 @@ Example:
 Just add a shell script to [corpus](pipeline/data/importers/corpus) or [mono]() which is named as `<prefix>.sh`
 and accepts the same parameters as the other scripts from the same folder.
 
+## Dataset fixing
+
+Some datasets require fixes like detokenization. Dataset and language specific fixes are implemented in [pipeline/clean/fixes](pipeline/clean/fixes).
+Naming convention:
+- `<dataset_name>.sh` for parallel dataset cleaning
+- `<dataset_name>.<lang>.sh` for language specific cleaning of a parallel or monolingual dataset
+- `/` in a dataset name should be replaced with `_`
+
+## Dataset cleaning
+Some parallel datasets require more aggressive filtering.
+Dataset specific Bicleaner thresholds can be set in the config. Example:
+
+```yaml
+experiment:
+  ...
+  bicleaner:
+    default-threshold: 0.5
+    dataset-thresholds:
+      mtdata_neulab_tedtalksv1_train: 0.6
+```
+
+## Utilities
+
+### Tensorboard
+
+To see training graphs, run tensorboard:
+
+```
+make install-tensorboard
+make tensorboard
+```
+
+Then port forward 6006.
+
+## Directory structure
+
+├ data
+│   └ ru-en
+│      └ test
+│        ├ original
+│        │   ├ corpus
+│        │   │   ├ mtdata_JW300.en.gz
+│        │   │   └ mtdata_JW300.ru.gz
+│        │   ├ devset
+│        │   │   ├ flores_dev.en.gz
+│        │   │   └ flores_dev.ru.gz
+│        │   ├ eval
+│        │   │   ├ sacrebleu_wmt20.en.gz
+│        │   │   └ sacrebleu_wmt20.ru.gz
+│        │   ├ mono
+│        │   │   ├ news-crawl_news.2020.ru.gz
+│        │   │   └ news-crawl_news.2020.en.gz
+│        │   ├ devset.ru.gz
+│        │   └ devset.en.gz
+│        ├ clean
+│        │   ├ corpus
+│        │   │   ├ mtdata_JW300.en.gz
+│        │   │   └ mtdata_JW300.ru.gz
+│        │   ├ mono
+│        │   │   ├ news-crawl_news.2020.ru.gz
+│        │   │   └ news-crawl_news.2020.en.gz
+│        │   ├ mono.ru.gz
+│        │   └ mono.en.gz
+│        ├ biclean
+│        │   ├ corpus
+│        │   │   ├ mtdata_JW300.en.gz
+│        │   │   └ mtdata_JW300.ru.gz
+│        │   ├ corpus.ru.gz
+│        │   ├ corpus.en.gz
+│        ├ translated
+│        │   ├ mono.ru.gz
+│        │   └ mono.en.gz
+│        ├ augmented
+│        │   ├ corpus.ru.gz
+│        │   └ corpus.en.gz
+│        ├ alignment
+│        │   ├ corpus.aln.gz
+│        │   └ lex.s2t.pruned.gz
+│        ├ merged
+│        │   ├ corpus.ru.gz
+│        │   └ corpus.en.gz
+│        └ filtered
+│          ├ corpus.ru.gz
+│          └ corpus.en.gz
+├ models
+│   ├ ru-en
+│   │   └ test
+│   │      ├ teacher
+│   │      ├ student
+│   │      ├ student-finetuned
+│   │      ├ speed
+│   │      ├ evaluation
+│   │      │   ├ backward
+│   │      │   ├ teacher0
+│   │      │   ├ teacher1
+│   │      │   ├ teacher-ensemble
+│   │      │   ├ student
+│   │      │   ├ student-finetuned
+│   │      │   └ speed
+│   │      └ exported
+│   ├ en-ru
+│      └ test
+│        └ backward
+
+├ experiments
+│   └ ru-en
+│      └ test
+│        └ config.sh
+├ logs
+│   └ ru-en
+│      └ test
+│        └ clean_corpus.log
+
 ## Development
 
 ### Architecture
@@ -271,9 +408,6 @@ Snakemake parallelizes steps that can be executed simultniously. It is especiall
 The main snakemkae process (scheduler) should be launched interactively. It runs job processes on the worker nodes in cluster mode or on a local machine in local mode.
 
 ### Conventions
-
-- All scripts work with respect to repo root directory.
-  It allows to not think about relative paths and execution folders.
 
 - Scripts inside the `pipeline` directory are independent and operate only using input arguments, input files
   and global envs.

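The Tensorboard instructions added to the README end with "Then port forward 6006." One common way to do that from a local machine is an SSH tunnel; this is only a sketch, and `<user>@<training-host>` is a placeholder rather than anything defined by the repository:

```sh
# Forward remote TensorBoard (port 6006) to the local machine.
ssh -N -L 6006:localhost:6006 <user>@<training-host>

# Then open http://localhost:6006 in a local browser.
```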