Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[wip] add support for automatic uploading of artifacts (fixes #466) #590

Draft
wants to merge 11 commits into
base: main
Choose a base branch
from
3 changes: 2 additions & 1 deletion .taskcluster.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ tasks:
$switch:
'tasks_for[:19] == "github-pull-request"': ${event.pull_request.base.ref}
'tasks_for == "github-push" && event.base_ref': ${event.base_ref}
'tasks_for == "github-push"': ${event.ref}
'tasks_for == "github-push" && !(event.base_ref)': ${event.ref}
'tasks_for in ["cron", "action"]': '${push.branch}'
'tasks_for == "pr-action"': '${push.base_branch}'
head_ref:
Expand Down Expand Up @@ -225,6 +225,7 @@ tasks:

features:
taskclusterProxy: true
chainOfTrust: true

image: mozillareleases/taskgraph:decision-v11.0.0@sha256:8c57e30214c85625856479812e63212dc2873b1ffcd97467a7c93508cbb3075a
maxRunTime: 1800
Expand Down
37 changes: 21 additions & 16 deletions taskcluster/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ taskgraph:
register: translations_taskgraph:register
decision-parameters: "translations_taskgraph.parameters:get_decision_parameters"
repositories:
firefox_translations_training:
name: "firefox-translations-training"
translations:
name: "translations"

# The list of valid stages that can be used with `target-stage` and `start-stage`.
# These get attached to tasks in `kinds`.
Expand Down Expand Up @@ -93,44 +93,49 @@ workers:
worker-type: 'b-linux-large-gcp-1tb-64-512-std-d2g'
b-linux-v100-gpu:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4'
b-linux-v100-gpu-4:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4'
b-linux-v100-gpu-4-300gb:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4-300gb'
b-linux-v100-gpu-4-300gb-standard:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4-300gb'
b-linux-v100-gpu-4-1tb:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4-300gb'
b-linux-v100-gpu-4-2tb:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4-300gb'
b-linux-v100-gpu-4-1tb-standard:
provisioner: '{trust-domain}-{level}'
implementation: generic-worker
implementation: docker-worker
os: linux
worker-type: '{alias}'
worker-type: 'b-linux-v100-gpu-d2g-4-300gb'
images:
provisioner: '{trust-domain}-{level}'
implementation: docker-worker
os: linux
worker-type: '{alias}-gcp'
beetmover:
provisioner: 'scriptworker-k8s'
implementation: beetmover-translations
os: scriptworker
worker-type: 'translations-1-beetmover-dev'

# Ideally these would be in `workers.aliases` above, but those aliases are
# resolved by Taskgraph, which is unaware of the `worker-class` lookups
Expand Down
1 change: 0 additions & 1 deletion taskcluster/configs/config.ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ datasets:
- opus_ada83/v1
- opus_ELRC-3075-wikipedia_health/v1
- url_https://storage.googleapis.com/releng-translations-dev/data/en-ru/pytest-dataset.[LANG].zst
- mtdata_ELRC-web_acquired_data_related_to_scientific_research-1-eng-rus
devtest:
- flores_dev
- sacrebleu_aug-upper_wmt19
Expand Down
1 change: 1 addition & 0 deletions taskcluster/docker/base/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# cache bust
FROM ubuntu:22.04
LABEL maintainer="Mozilla Release Engineering <[email protected]>"

Expand Down
16 changes: 16 additions & 0 deletions taskcluster/docker/train/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
FROM $DOCKER_IMAGE_PARENT
LABEL maintainer="Mozilla Release Engineering <[email protected]>"

RUN curl -L https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb > /tmp/cuda-keyring.deb \
&& dpkg -i /tmp/cuda-keyring.deb \
&& rm /tmp/cuda-keyring.deb

RUN apt-get update -qq \
&& apt-get install -y python3-numpy \
python3-fasttext \
Expand All @@ -16,7 +20,19 @@ RUN apt-get update -qq \
wget \
pkg-config \
libicu-dev \
cuda-toolkit \
software-properties-common \
&& apt-get clean

RUN curl -L https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin > /tmp/cuda.pin \
&& mv /tmp/cuda.pin /etc/apt/preferences.d/cuda-repository-pin-600 \
&& apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub \
&& add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /" \
&& apt-get update -qq \
&& apt-get install -y cudnn9-cuda-12 \
libcudnn9-dev-cuda-12 \
&& apt-get clean


VOLUME /builds/worker/checkouts
VOLUME /builds/worker/.cache
1 change: 1 addition & 0 deletions taskcluster/kinds/alignments-backtranslated/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ tasks:
- attributes
worker-type: b-cpu-xlargedisk-32-256
worker:
chain-of-trust: true
docker-image: {"in-tree": "train"}
# 7 days
max-run-time: 604800
Expand Down
1 change: 1 addition & 0 deletions taskcluster/kinds/alignments-original/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ tasks:
- attributes
worker-type: b-cpu-xlargedisk-32-256
worker:
chain-of-trust: true
docker-image: {"in-tree": "train"}
# 7 days
max-run-time: 604800
Expand Down
1 change: 1 addition & 0 deletions taskcluster/kinds/alignments-student/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ tasks:
- attributes
worker-type: b-cpu-xlargedisk-32-256
worker:
chain-of-trust: true
docker-image: {"in-tree": "train"}
# 7 days
max-run-time: 604800
Expand Down
1 change: 1 addition & 0 deletions taskcluster/kinds/all-pipeline/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ kind-dependencies:
- evaluate-teacher-ensemble
- analyze-corpus
- analyze-mono
- beetmover

tasks:
all-pipeline:
Expand Down
1 change: 1 addition & 0 deletions taskcluster/kinds/all-pr-pipeline/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ kind-dependencies:
- evaluate-teacher-ensemble
- analyze-corpus
- analyze-mono
- beetmover

tasks:
all-pr-pipeline:
Expand Down
1 change: 1 addition & 0 deletions taskcluster/kinds/analyze-corpus/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ tasks:
- worker.env
- run.command
worker:
chain-of-trust: true
docker-image: {"in-tree": "train"}
max-run-time: 86400 # one day
artifacts:
Expand Down
1 change: 1 addition & 0 deletions taskcluster/kinds/analyze-mono/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ task-defaults:
- fetches
- run.command
worker:
chain-of-trust: true
docker-image: {"in-tree": "train"}
max-run-time: 86400 # one day
artifacts:
Expand Down
134 changes: 134 additions & 0 deletions taskcluster/kinds/beetmover/kind.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
---

# Kind definition for the `beetmover` tasks, which upload artifacts produced
# by tasks of the kinds listed under `kind-dependencies`.
# NOTE(review): with `from-deps` / `group-by: single` this presumably yields
# one beetmover task per upstream task -- confirm against the from_deps docs.

loader: taskgraph.loader.transform:loader

transforms:
- taskgraph.transforms.from_deps
# `step_dir` is calculated from a number of attributes of the upstream
# task. That can't reasonably be expressed in YAML, so a transform
# computes it instead.
- translations_taskgraph.transforms.beetmover
- taskgraph.transforms.task_context
- taskgraph.transforms.task

# Any kind containing tasks that we want to upload artifacts for must be
# listed here.
kind-dependencies:
- alignments-backtranslated
- alignments-original
- alignments-student
- analyze-corpus
- analyze-mono
- bicleaner
- bicleaner-model
- cefilter
- clean-corpus
- clean-mono
- collect-corpus
- collect-mono-src
- collect-mono-trg
- evaluate
- evaluate-quantized
- evaluate-teacher-ensemble
- export
- extract-best
- finetune-student
- merge-corpus
- merge-devset
- merge-mono
- merge-translated
- quantize
- score
- shortlist
- split-corpus
- split-mono-src
- split-mono-trg
- train-backwards
- train-student
- train-teacher
- train-vocab
- translate-corpus
- translate-mono-src
- translate-mono-trg

tasks:
beetmover:
description: upload artifacts
from-deps:
group-by: single
# Copies the upstream task's attributes (stage, src_locale, trg_locale,
# etc.) onto the beetmover task.
copy-attributes: true
set-name: retain-kind

task-context:
from-parameters:
src_locale: training_config.experiment.src
trg_locale: training_config.experiment.trg
experiment_name: training_config.experiment.name
substitution-fields:
- attributes
- worker.artifact-map
worker-type: beetmover
run-on-tasks-for: []
scopes:
# TODO: this will differ by worker type, and possibly by where we're running?
#   - PRs should _always_ be dep
#   - actions on PRs should _always_ be dep
#   - pushes might be either?
# In the future, L3 will be prod and everything else will be dep.
# TODO: is this actually controlling anything in scriptworker?
# If not, we should just remove it.
- project:translations:releng:beetmover:bucket:dep
- project:translations:releng:beetmover:action:upload-translations-artifacts
worker:
dryrun: false
# by-tasks-for:
# # TODO: actions on PRs should probably be off by default too?
# # maybe this should be the same as wandb-publication and be
# # adjustable in the training config?
# action: false
# default: true

release-properties:
appName: translations
# For training, evaluation, and export tasks we upload logs to /logs and
# everything else to /models. For all other tasks, we only upload logs.
# TODO: do we also need artifactMap here?
# `upstream-artifacts` is used for CoT and for fetching artifacts: it defines
# which artifacts from upstream tasks we want to upload.
# `artifact-map` maps each upstream artifact to its destination in the bucket.
upstream-artifacts:
- paths:
by-upstream-kind:
# Tasks from kinds matching this pattern will have all artifacts
# uploaded; tasks from other kinds will only have logs uploaded.
(train.*|finetune.*|evaluate.*|export):
- "public/build/*"
- "public/logs/certified.log"
default:
- "public/build/*.log"
- "public/logs/certified.log"
taskType: build
optional: true

# The artifact map determines _where_ artifacts fetched from upstream tasks
# are put. Artifacts not matching a more specific pattern fall back to the
# "*" pattern.
# Note: because artifact names are not known when taskgraph runs, this
# determination is made at upload time by beetmover
# (https://github.com/mozilla-releng/scriptworker-scripts/tree/master/beetmoverscript).
# As such, the entire artifact map specified here is given to those tasks.
# Most substitutions are handled by `task_context`.
artifact-map:
- paths:
"*.log":
destinations:
- {
"task-reference": "logs/{src_locale}-{trg_locale}/{experiment_name}_<decision>/{step_dir}"
}
"*":
destinations:
- {
"task-reference": "models/{src_locale}-{trg_locale}/{experiment_name}_<decision>/{step_dir}"
}
1 change: 1 addition & 0 deletions taskcluster/kinds/bicleaner-model/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ tasks:
- run.command
worker-type: b-cpu
worker:
chain-of-trust: true
docker-image: {in-tree: toolchain-build}
artifacts:
- name: public/build
Expand Down
5 changes: 4 additions & 1 deletion taskcluster/kinds/bicleaner/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,11 @@ tasks:

worker-type: b-largegpu-largedisk
worker:
chain-of-trust: true
docker-image: {"in-tree": "train"}
artifacts:
- name: public/build
path: artifacts
path: /builds/worker/artifacts
type: directory
# 7 days. yes, it can take a while to clean a huge dataset
max-run-time: 604800
Expand All @@ -88,6 +90,7 @@ tasks:

run:
using: run-task
cache-dotcache: true
command:
- bash
- -c
Expand Down
1 change: 1 addition & 0 deletions taskcluster/kinds/cefilter/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ tasks:
- attributes
worker-type: b-cpu-xlargedisk-32-256
worker:
chain-of-trust: true
docker-image: {"in-tree": "train"}
# 7 days
max-run-time: 604800
Expand Down
1 change: 1 addition & 0 deletions taskcluster/kinds/clean-corpus/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ tasks:
- worker.env
- run.command
worker:
chain-of-trust: true
docker-image: {"in-tree": "train"}
# 12 hours (OpusCleaner can get stuck)
max-run-time: 43200
Expand Down
1 change: 1 addition & 0 deletions taskcluster/kinds/clean-mono/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ task-defaults:
- fetches
- run.command
worker:
chain-of-trust: true
docker-image: {"in-tree": "train"}
# 7 days. yes, it can take a while to clean a huge dataset
max-run-time: 604800
Expand Down
1 change: 1 addition & 0 deletions taskcluster/kinds/collect-corpus/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ tasks:

worker-type: b-cpu-largedisk
worker:
chain-of-trust: true
docker-image: {"in-tree": "train"}
max-run-time: 86400
artifacts:
Expand Down
1 change: 1 addition & 0 deletions taskcluster/kinds/collect-mono-src/kind.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ task-defaults:
- attributes
worker-type: b-cpu-largedisk
worker:
chain-of-trust: true
docker-image: {"in-tree": "train"}
max-run-time: 86400
artifacts:
Expand Down
Loading