diff --git a/evaluation/dev/bleu-results.md b/evaluation/dev/bleu-results.md index 90b0164d..e7294fdc 100644 --- a/evaluation/dev/bleu-results.md +++ b/evaluation/dev/bleu-results.md @@ -56,57 +56,68 @@ Both absolute and relative differences in BLEU scores between Bergamot and other ## avg -| Translator/Dataset | en-ru | ru-en | en-nl | fa-en | uk-en | en-fa | is-en | nl-en | en-uk | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 29.44 | 33.69 | 27.30 | 28.70 | 35.93 | 17.30 | 23.40 | 29.65 | 26.30 | -| google | 34.49 (+5.05, +17.15%) | 38.20 (+4.51, +13.38%) | 29.30 (+2.00, +7.33%) | 40.85 (+12.15, +42.33%) | 42.43 (+6.50, +18.09%) | 27.80 (+10.50, +60.69%) | 38.90 (+15.50, +66.24%) | 33.05 (+3.40, +11.47%) | 32.63 (+6.33, +24.08%) | -| microsoft | 33.62 (+4.18, +14.21%) | 38.38 (+4.68, +13.90%) | 28.80 (+1.50, +5.49%) | 36.15 (+7.45, +25.96%) | 42.30 (+6.37, +17.72%) | 20.50 (+3.20, +18.50%) | 38.17 (+14.77, +63.11%) | 32.60 (+2.95, +9.95%) | 32.03 (+5.73, +21.80%) | +| Translator/Dataset | ru-en | en-nl | en-ru | en-fa | nl-en | uk-en | fa-en | ca-en | en-uk | is-en | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bergamot | 33.69 | 27.30 | 29.44 | 17.30 | 29.65 | 35.93 | 28.70 | 38.00 | 26.30 | 23.40 | +| google | 38.20 (+4.51, +13.38%) | 29.30 (+2.00, +7.33%) | 34.49 (+5.05, +17.15%) | 27.80 (+10.50, +60.69%) | 33.05 (+3.40, +11.47%) | 42.43 (+6.50, +18.09%) | 40.85 (+12.15, +42.33%) | 48.95 (+10.95, +28.82%) | 32.63 (+6.33, +24.08%) | 38.90 (+15.50, +66.24%) | +| microsoft | 38.38 (+4.68, +13.90%) | 28.80 (+1.50, +5.49%) | 33.62 (+4.18, +14.21%) | 20.50 (+3.20, +18.50%) | 32.60 (+2.95, +9.95%) | 42.30 (+6.37, +17.72%) | 36.15 (+7.45, +25.96%) | 46.50 (+8.50, +22.37%) | 32.03 (+5.73, +21.80%) | 38.17 (+14.77, +63.11%) | ![Results](img/avg-bleu.png) --- -## en-ru - -| Translator/Dataset | wmt20 | wmt13 | flores-test | flores-dev | wmt21 | wmt19 | wmt17 | wmt16 | wmt15 | wmt14 | wmt22 | wmt18 | -| --- | --- 
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 22.00 | 26.20 | 29.20 | 29.90 | 25.50 | 31.40 | 33.60 | 30.90 | 31.40 | 38.20 | 26.50 | 28.50 | -| google | 27.20 (+5.20, +23.64%) | 28.00 (+1.80, +6.87%) | 34.40 (+5.20, +17.81%) | 34.90 (+5.00, +16.72%) | 30.00 (+4.50, +17.65%) | 32.90 (+1.50, +4.78%) | 38.90 (+5.30, +15.77%) | 35.00 (+4.10, +13.27%) | 36.90 (+5.50, +17.52%) | 45.70 (+7.50, +19.63%) | 35.00 (+8.50, +32.08%) | 35.00 (+6.50, +22.81%) | -| microsoft | 26.30 (+4.30, +19.55%) | 27.30 (+1.10, +4.20%) | 33.60 (+4.40, +15.07%) | 33.50 (+3.60, +12.04%) | 29.20 (+3.70, +14.51%) | 33.20 (+1.80, +5.73%) | 38.60 (+5.00, +14.88%) | 34.20 (+3.30, +10.68%) | 36.10 (+4.70, +14.97%) | 44.70 (+6.50, +17.02%) | 33.10 (+6.60, +24.91%) | 33.70 (+5.20, +18.25%) | - -![Results](img/en-ru-bleu.png) ---- - ## ru-en -| Translator/Dataset | flores-dev | mtedx_test | wmt18 | wmt20 | wmt19 | wmt15 | wmt17 | wmt14 | wmt16 | wmt22 | wmt13 | flores-test | wmt21 | +| Translator/Dataset | mtedx_test | wmt19 | wmt17 | flores-dev | wmt22 | flores-test | wmt14 | wmt15 | wmt16 | wmt13 | wmt18 | wmt21 | wmt20 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 31.90 | 24.00 | 31.90 | 35.00 | 39.10 | 33.50 | 37.60 | 37.80 | 33.00 | 38.50 | 29.30 | 31.00 | 35.40 | -| google | 38.40 (+6.50, +20.38%) | 25.10 (+1.10, +4.58%) | 37.30 (+5.40, +16.93%) | 38.40 (+3.40, +9.71%) | 42.80 (+3.70, +9.46%) | 38.60 (+5.10, +15.22%) | 42.70 (+5.10, +13.56%) | 42.70 (+4.90, +12.96%) | 37.60 (+4.60, +13.94%) | 43.70 (+5.20, +13.51%) | 32.20 (+2.90, +9.90%) | 37.30 (+6.30, +20.32%) | 39.80 (+4.40, +12.43%) | -| microsoft | 36.50 (+4.60, +14.42%) | 26.20 (+2.20, +9.17%) | 37.40 (+5.50, +17.24%) | 38.80 (+3.80, +10.86%) | 43.80 (+4.70, +12.02%) | 38.50 (+5.00, +14.93%) | 43.70 (+6.10, +16.22%) | 44.10 (+6.30, +16.67%) | 38.40 (+5.40, +16.36%) | 43.90 (+5.40, +14.03%) | 32.50 (+3.20, +10.92%) | 36.10 (+5.10, +16.45%) | 39.00 
(+3.60, +10.17%) | +| bergamot | 24.00 | 39.10 | 37.60 | 31.90 | 38.50 | 31.00 | 37.80 | 33.50 | 33.00 | 29.30 | 31.90 | 35.40 | 35.00 | +| google | 25.10 (+1.10, +4.58%) | 42.80 (+3.70, +9.46%) | 42.70 (+5.10, +13.56%) | 38.40 (+6.50, +20.38%) | 43.70 (+5.20, +13.51%) | 37.30 (+6.30, +20.32%) | 42.70 (+4.90, +12.96%) | 38.60 (+5.10, +15.22%) | 37.60 (+4.60, +13.94%) | 32.20 (+2.90, +9.90%) | 37.30 (+5.40, +16.93%) | 39.80 (+4.40, +12.43%) | 38.40 (+3.40, +9.71%) | +| microsoft | 26.20 (+2.20, +9.17%) | 43.80 (+4.70, +12.02%) | 43.70 (+6.10, +16.22%) | 36.50 (+4.60, +14.42%) | 43.90 (+5.40, +14.03%) | 36.10 (+5.10, +16.45%) | 44.10 (+6.30, +16.67%) | 38.50 (+5.00, +14.93%) | 38.40 (+5.40, +16.36%) | 32.50 (+3.20, +10.92%) | 37.40 (+5.50, +17.24%) | 39.00 (+3.60, +10.17%) | 38.80 (+3.80, +10.86%) | ![Results](img/ru-en-bleu.png) --- ## en-nl -| Translator/Dataset | flores-test | flores-dev | +| Translator/Dataset | flores-dev | flores-test | | --- | --- | --- | -| bergamot | 27.00 | 27.60 | -| google | 29.20 (+2.20, +8.15%) | 29.40 (+1.80, +6.52%) | -| microsoft | 28.60 (+1.60, +5.93%) | 29.00 (+1.40, +5.07%) | +| bergamot | 27.60 | 27.00 | +| google | 29.40 (+1.80, +6.52%) | 29.20 (+2.20, +8.15%) | +| microsoft | 29.00 (+1.40, +5.07%) | 28.60 (+1.60, +5.93%) | ![Results](img/en-nl-bleu.png) --- -## fa-en +## en-ru + +| Translator/Dataset | wmt16 | wmt15 | flores-dev | wmt22 | wmt18 | wmt14 | wmt17 | wmt20 | wmt13 | wmt21 | wmt19 | flores-test | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bergamot | 30.90 | 31.40 | 29.90 | 26.50 | 28.50 | 38.20 | 33.60 | 22.00 | 26.20 | 25.50 | 31.40 | 29.20 | +| google | 35.00 (+4.10, +13.27%) | 36.90 (+5.50, +17.52%) | 34.90 (+5.00, +16.72%) | 35.00 (+8.50, +32.08%) | 35.00 (+6.50, +22.81%) | 45.70 (+7.50, +19.63%) | 38.90 (+5.30, +15.77%) | 27.20 (+5.20, +23.64%) | 28.00 (+1.80, +6.87%) | 30.00 (+4.50, +17.65%) | 32.90 (+1.50, +4.78%) | 34.40 (+5.20, +17.81%) | +| microsoft | 34.20 (+3.30, 
+10.68%) | 36.10 (+4.70, +14.97%) | 33.50 (+3.60, +12.04%) | 33.10 (+6.60, +24.91%) | 33.70 (+5.20, +18.25%) | 44.70 (+6.50, +17.02%) | 38.60 (+5.00, +14.88%) | 26.30 (+4.30, +19.55%) | 27.30 (+1.10, +4.20%) | 29.20 (+3.70, +14.51%) | 33.20 (+1.80, +5.73%) | 33.60 (+4.40, +15.07%) | + +![Results](img/en-ru-bleu.png) +--- + +## en-fa + +| Translator/Dataset | flores-test | flores-dev | +| --- | --- | --- | +| bergamot | 17.40 | 17.20 | +| google | 28.40 (+11.00, +63.22%) | 27.20 (+10.00, +58.14%) | +| microsoft | 21.10 (+3.70, +21.26%) | 19.90 (+2.70, +15.70%) | + +![Results](img/en-fa-bleu.png) +--- + +## nl-en | Translator/Dataset | flores-dev | flores-test | | --- | --- | --- | -| bergamot | 29.10 | 28.30 | -| google | 42.00 (+12.90, +44.33%) | 39.70 (+11.40, +40.28%) | -| microsoft | 36.50 (+7.40, +25.43%) | 35.80 (+7.50, +26.50%) | +| bergamot | 29.70 | 29.60 | +| google | 33.00 (+3.30, +11.11%) | 33.10 (+3.50, +11.82%) | +| microsoft | 32.40 (+2.70, +9.09%) | 32.80 (+3.20, +10.81%) | -![Results](img/fa-en-bleu.png) +![Results](img/nl-en-bleu.png) --- ## uk-en @@ -120,46 +131,46 @@ Both absolute and relative differences in BLEU scores between Bergamot and other ![Results](img/uk-en-bleu.png) --- -## en-fa +## fa-en | Translator/Dataset | flores-dev | flores-test | | --- | --- | --- | -| bergamot | 17.20 | 17.40 | -| google | 27.20 (+10.00, +58.14%) | 28.40 (+11.00, +63.22%) | -| microsoft | 19.90 (+2.70, +15.70%) | 21.10 (+3.70, +21.26%) | - -![Results](img/en-fa-bleu.png) ---- - -## is-en - -| Translator/Dataset | flores-dev | flores-test | wmt21 | -| --- | --- | --- | --- | -| bergamot | 23.60 | 23.40 | 23.20 | -| google | 39.40 (+15.80, +66.95%) | 38.60 (+15.20, +64.96%) | 38.70 (+15.50, +66.81%) | -| microsoft | 37.30 (+13.70, +58.05%) | 36.70 (+13.30, +56.84%) | 40.50 (+17.30, +74.57%) | +| bergamot | 29.10 | 28.30 | +| google | 42.00 (+12.90, +44.33%) | 39.70 (+11.40, +40.28%) | +| microsoft | 36.50 (+7.40, +25.43%) | 35.80 (+7.50, +26.50%) | 
-![Results](img/is-en-bleu.png) +![Results](img/fa-en-bleu.png) --- -## nl-en +## ca-en | Translator/Dataset | flores-dev | flores-test | | --- | --- | --- | -| bergamot | 29.70 | 29.60 | -| google | 33.00 (+3.30, +11.11%) | 33.10 (+3.50, +11.82%) | -| microsoft | 32.40 (+2.70, +9.09%) | 32.80 (+3.20, +10.81%) | +| bergamot | 38.70 | 37.30 | +| google | 49.60 (+10.90, +28.17%) | 48.30 (+11.00, +29.49%) | +| microsoft | 46.80 (+8.10, +20.93%) | 46.20 (+8.90, +23.86%) | -![Results](img/nl-en-bleu.png) +![Results](img/ca-en-bleu.png) --- ## en-uk -| Translator/Dataset | flores-test | wmt22 | flores-dev | +| Translator/Dataset | flores-dev | flores-test | wmt22 | | --- | --- | --- | --- | -| bergamot | 28.20 | 22.80 | 27.90 | -| google | 33.10 (+4.90, +17.38%) | 32.00 (+9.20, +40.35%) | 32.80 (+4.90, +17.56%) | -| microsoft | 33.50 (+5.30, +18.79%) | 30.40 (+7.60, +33.33%) | 32.20 (+4.30, +15.41%) | +| bergamot | 27.90 | 28.20 | 22.80 | +| google | 32.80 (+4.90, +17.56%) | 33.10 (+4.90, +17.38%) | 32.00 (+9.20, +40.35%) | +| microsoft | 32.20 (+4.30, +15.41%) | 33.50 (+5.30, +18.79%) | 30.40 (+7.60, +33.33%) | ![Results](img/en-uk-bleu.png) +--- + +## is-en + +| Translator/Dataset | flores-dev | flores-test | wmt21 | +| --- | --- | --- | --- | +| bergamot | 23.60 | 23.40 | 23.20 | +| google | 39.40 (+15.80, +66.95%) | 38.60 (+15.20, +64.96%) | 38.70 (+15.50, +66.81%) | +| microsoft | 37.30 (+13.70, +58.05%) | 36.70 (+13.30, +56.84%) | 40.50 (+17.30, +74.57%) | + +![Results](img/is-en-bleu.png) --- \ No newline at end of file diff --git a/evaluation/dev/ca-en/flores-dev.bergamot.en.bleu b/evaluation/dev/ca-en/flores-dev.bergamot.en.bleu new file mode 100644 index 00000000..31dd36b7 --- /dev/null +++ b/evaluation/dev/ca-en/flores-dev.bergamot.en.bleu @@ -0,0 +1 @@ +38.7 diff --git a/evaluation/dev/ca-en/flores-dev.bergamot.en.comet b/evaluation/dev/ca-en/flores-dev.bergamot.en.comet new file mode 100644 index 00000000..8149cebd --- /dev/null +++ 
b/evaluation/dev/ca-en/flores-dev.bergamot.en.comet @@ -0,0 +1 @@ +0.6699 diff --git a/evaluation/dev/ca-en/flores-dev.ca-en.cometcompare b/evaluation/dev/ca-en/flores-dev.ca-en.cometcompare new file mode 100644 index 00000000..32db9626 --- /dev/null +++ b/evaluation/dev/ca-en/flores-dev.ca-en.cometcompare @@ -0,0 +1,61 @@ +========================== +x_name: flores-dev.bergamot.en +y_name: flores-dev.microsoft.en + +Bootstrap Resampling Results: +x-mean: 0.6700 +y-mean: 0.7980 +ties (%): 0.0000 +x_wins (%): 0.0000 +y_wins (%): 1.0000 + +Paired T-Test Results: +statistic: -18.8769 +p_value: 0.0000 +Null hypothesis rejected according to t-test. +Scores differ significantly across samples. +flores-dev.microsoft.en outperforms flores-dev.bergamot.en. +========================== +x_name: flores-dev.bergamot.en +y_name: flores-dev.google.en + +Bootstrap Resampling Results: +x-mean: 0.6700 +y-mean: 0.8228 +ties (%): 0.0000 +x_wins (%): 0.0000 +y_wins (%): 1.0000 + +Paired T-Test Results: +statistic: -21.5915 +p_value: 0.0000 +Null hypothesis rejected according to t-test. +Scores differ significantly across samples. +flores-dev.google.en outperforms flores-dev.bergamot.en. +========================== +x_name: flores-dev.microsoft.en +y_name: flores-dev.google.en + +Bootstrap Resampling Results: +x-mean: 0.7980 +y-mean: 0.8228 +ties (%): 0.0000 +x_wins (%): 0.0000 +y_wins (%): 1.0000 + +Paired T-Test Results: +statistic: -6.7390 +p_value: 0.0000 +Null hypothesis rejected according to t-test. +Scores differ significantly across samples. +flores-dev.google.en outperforms flores-dev.microsoft.en. + +Summary +If system_x is better than system_y then: +Null hypothesis rejected according to t-test with p_value=0.05. +Scores differ significantly across samples. 
+system_x \ system_y flores-dev.bergamot.en flores-dev.microsoft.en flores-dev.google.en +----------------------- ------------------------ ------------------------- ---------------------- +flores-dev.bergamot.en False False +flores-dev.microsoft.en True False +flores-dev.google.en True True diff --git a/evaluation/dev/ca-en/flores-dev.google.en.bleu b/evaluation/dev/ca-en/flores-dev.google.en.bleu new file mode 100644 index 00000000..56d29426 --- /dev/null +++ b/evaluation/dev/ca-en/flores-dev.google.en.bleu @@ -0,0 +1 @@ +49.6 diff --git a/evaluation/dev/ca-en/flores-dev.google.en.comet b/evaluation/dev/ca-en/flores-dev.google.en.comet new file mode 100644 index 00000000..8f27dfaa --- /dev/null +++ b/evaluation/dev/ca-en/flores-dev.google.en.comet @@ -0,0 +1 @@ +0.8218 diff --git a/evaluation/dev/ca-en/flores-dev.microsoft.en.bleu b/evaluation/dev/ca-en/flores-dev.microsoft.en.bleu new file mode 100644 index 00000000..48068f47 --- /dev/null +++ b/evaluation/dev/ca-en/flores-dev.microsoft.en.bleu @@ -0,0 +1 @@ +46.8 diff --git a/evaluation/dev/ca-en/flores-dev.microsoft.en.comet b/evaluation/dev/ca-en/flores-dev.microsoft.en.comet new file mode 100644 index 00000000..0a51005f --- /dev/null +++ b/evaluation/dev/ca-en/flores-dev.microsoft.en.comet @@ -0,0 +1 @@ +0.7979 diff --git a/evaluation/dev/ca-en/flores-test.bergamot.en.bleu b/evaluation/dev/ca-en/flores-test.bergamot.en.bleu new file mode 100644 index 00000000..6bc73938 --- /dev/null +++ b/evaluation/dev/ca-en/flores-test.bergamot.en.bleu @@ -0,0 +1 @@ +37.3 diff --git a/evaluation/dev/ca-en/flores-test.bergamot.en.comet b/evaluation/dev/ca-en/flores-test.bergamot.en.comet new file mode 100644 index 00000000..a6c43155 --- /dev/null +++ b/evaluation/dev/ca-en/flores-test.bergamot.en.comet @@ -0,0 +1 @@ +0.6381 diff --git a/evaluation/dev/ca-en/flores-test.ca-en.cometcompare b/evaluation/dev/ca-en/flores-test.ca-en.cometcompare new file mode 100644 index 00000000..a62b2333 --- /dev/null +++ 
b/evaluation/dev/ca-en/flores-test.ca-en.cometcompare @@ -0,0 +1,61 @@ +========================== +x_name: flores-test.bergamot.en +y_name: flores-test.microsoft.en + +Bootstrap Resampling Results: +x-mean: 0.6383 +y-mean: 0.7878 +ties (%): 0.0000 +x_wins (%): 0.0000 +y_wins (%): 1.0000 + +Paired T-Test Results: +statistic: -17.4826 +p_value: 0.0000 +Null hypothesis rejected according to t-test. +Scores differ significantly across samples. +flores-test.microsoft.en outperforms flores-test.bergamot.en. +========================== +x_name: flores-test.bergamot.en +y_name: flores-test.google.en + +Bootstrap Resampling Results: +x-mean: 0.6383 +y-mean: 0.8105 +ties (%): 0.0000 +x_wins (%): 0.0000 +y_wins (%): 1.0000 + +Paired T-Test Results: +statistic: -18.9692 +p_value: 0.0000 +Null hypothesis rejected according to t-test. +Scores differ significantly across samples. +flores-test.google.en outperforms flores-test.bergamot.en. +========================== +x_name: flores-test.microsoft.en +y_name: flores-test.google.en + +Bootstrap Resampling Results: +x-mean: 0.7878 +y-mean: 0.8105 +ties (%): 0.0000 +x_wins (%): 0.0000 +y_wins (%): 1.0000 + +Paired T-Test Results: +statistic: -6.1132 +p_value: 0.0000 +Null hypothesis rejected according to t-test. +Scores differ significantly across samples. +flores-test.google.en outperforms flores-test.microsoft.en. + +Summary +If system_x is better than system_y then: +Null hypothesis rejected according to t-test with p_value=0.05. +Scores differ significantly across samples. 
+system_x \ system_y flores-test.bergamot.en flores-test.microsoft.en flores-test.google.en +------------------------ ------------------------- -------------------------- ----------------------- +flores-test.bergamot.en False False +flores-test.microsoft.en True False +flores-test.google.en True True diff --git a/evaluation/dev/ca-en/flores-test.google.en.bleu b/evaluation/dev/ca-en/flores-test.google.en.bleu new file mode 100644 index 00000000..29330f13 --- /dev/null +++ b/evaluation/dev/ca-en/flores-test.google.en.bleu @@ -0,0 +1 @@ +48.3 diff --git a/evaluation/dev/ca-en/flores-test.google.en.comet b/evaluation/dev/ca-en/flores-test.google.en.comet new file mode 100644 index 00000000..18f63a56 --- /dev/null +++ b/evaluation/dev/ca-en/flores-test.google.en.comet @@ -0,0 +1 @@ +0.8103 diff --git a/evaluation/dev/ca-en/flores-test.microsoft.en.bleu b/evaluation/dev/ca-en/flores-test.microsoft.en.bleu new file mode 100644 index 00000000..4b1058c6 --- /dev/null +++ b/evaluation/dev/ca-en/flores-test.microsoft.en.bleu @@ -0,0 +1 @@ +46.2 diff --git a/evaluation/dev/ca-en/flores-test.microsoft.en.comet b/evaluation/dev/ca-en/flores-test.microsoft.en.comet new file mode 100644 index 00000000..a8384b11 --- /dev/null +++ b/evaluation/dev/ca-en/flores-test.microsoft.en.comet @@ -0,0 +1 @@ +0.7877 diff --git a/evaluation/dev/comet-results.md b/evaluation/dev/comet-results.md index 64b69439..1550110e 100644 --- a/evaluation/dev/comet-results.md +++ b/evaluation/dev/comet-results.md @@ -6,6 +6,18 @@ Three models with different human judgments have been trained to showcase the fr The models developed by COMET have achieved new state-of-the-art performance on the WMT 2019 Metrics shared task, demonstrating robustness to high-performing systems. +## Interpreting Scores: + +When using COMET to evaluate machine translation, it's important to understand how to interpret the scores it produces. + +In general, COMET models are trained to predict quality scores for translations. 
These scores are typically normalized using a z-score transformation to account for individual differences among annotators. While the raw score itself does not have a direct interpretation, it is useful for ranking translations and systems according to their quality. + +However, for the latest COMET models like Unbabel/wmt22-comet-da, the COMET authors have introduced a new training approach that scales the scores between 0 and 1. This makes it easier to interpret the scores: a score close to 1 indicates a high-quality translation, while a score close to 0 indicates a translation that is no better than random chance. + +It's worth noting that when using COMET to compare the performance of two different translation systems, it's important to run the `comet-compare` command to obtain statistical significance measures. This command compares the output of two systems using a statistical hypothesis test, providing an estimate of the probability that the observed difference in scores between the systems is due to chance. This is an important step to ensure that any differences in scores between systems are statistically significant. + +Overall, the added interpretability of scores in the latest COMET models, combined with the ability to assess statistical significance between systems using `comet-compare`, makes COMET a valuable tool for evaluating machine translation. 
+ Source: https://aclanthology.org/2020.emnlp-main.213.pdf Tool: https://github.com/Unbabel/COMET @@ -36,323 +48,313 @@ We also compare the systems using the `comet-compare` tool that calculates the s ## avg -| Translator/Dataset | en-ru | ru-en | en-nl | fa-en | uk-en | en-fa | is-en | nl-en | en-uk | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 0.54 | 0.49 | 0.58 | 0.50 | 0.52 | 0.31 | 0.15 | 0.63 | 0.51 | -| google | 0.76 (+0.21, +39.38%) | 0.59 (+0.10, +20.83%) | 0.67 (+0.08, +14.30%) | 0.74 (+0.24, +48.00%) | 0.67 (+0.15, +28.26%) | 0.70 (+0.39, +126.54%) | 0.70 (+0.55, +370.91%) | 0.70 (+0.07, +10.71%) | 0.79 (+0.27, +53.31%) | -| microsoft | 0.72 (+0.18, +32.36%) | 0.60 (+0.11, +22.13%) | 0.65 (+0.06, +11.05%) | 0.66 (+0.16, +32.78%) | 0.64 (+0.12, +23.16%) | 0.41 (+0.10, +31.65%) | 0.67 (+0.52, +353.71%) | 0.69 (+0.06, +9.12%) | 0.75 (+0.23, +45.60%) | +| Translator/Dataset | ru-en | en-nl | en-ru | en-fa | nl-en | uk-en | fa-en | ca-en | en-uk | is-en | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bergamot | 0.49 | 0.58 | 0.54 | 0.31 | 0.63 | 0.52 | 0.50 | 0.65 | 0.51 | 0.15 | +| google | 0.59 (+0.10, +20.83%) | 0.67 (+0.08, +14.30%) | 0.76 (+0.21, +39.38%) | 0.70 (+0.39, +126.54%) | 0.70 (+0.07, +10.71%) | 0.67 (+0.15, +28.26%) | 0.74 (+0.24, +48.00%) | 0.82 (+0.16, +24.78%) | 0.79 (+0.27, +53.31%) | 0.70 (+0.55, +370.91%) | +| microsoft | 0.60 (+0.11, +22.13%) | 0.65 (+0.06, +11.05%) | 0.72 (+0.18, +32.36%) | 0.41 (+0.10, +31.65%) | 0.69 (+0.06, +9.12%) | 0.64 (+0.12, +23.16%) | 0.66 (+0.16, +32.78%) | 0.79 (+0.14, +21.22%) | 0.75 (+0.23, +45.60%) | 0.67 (+0.52, +353.71%) | ![Results](img/avg-comet.png) --- -## en-ru - -| Translator/Dataset | wmt18 | wmt21 | wmt20 | wmt16 | flores-test | wmt22 | wmt14 | wmt15 | wmt13 | flores-dev | wmt19 | wmt17 | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 0.59 | 0.40 | 0.41 | 0.59 | 0.57 | 0.43 | 0.70 | 0.64 | 
0.52 | 0.57 | 0.47 | 0.64 | -| google | 0.81 (+0.22, +37.57%) | 0.64 (+0.23, +57.93%) | 0.64 (+0.23, +57.66%) | 0.78 (+0.18, +31.02%) | 0.77 (+0.20, +34.81%) | 0.73 (+0.30, +70.07%) | 0.88 (+0.18, +26.42%) | 0.84 (+0.20, +31.50%) | 0.69 (+0.17, +32.58%) | 0.76 (+0.20, +34.95%) | 0.72 (+0.25, +53.03%) | 0.84 (+0.20, +30.61%) | -| microsoft | 0.77 (+0.18, +30.02%) | 0.59 (+0.19, +46.32%) | 0.59 (+0.18, +44.41%) | 0.74 (+0.15, +25.24%) | 0.73 (+0.16, +27.74%) | 0.67 (+0.25, +57.60%) | 0.86 (+0.16, +22.72%) | 0.81 (+0.17, +26.31%) | 0.67 (+0.14, +27.63%) | 0.72 (+0.15, +26.80%) | 0.70 (+0.22, +47.24%) | 0.81 (+0.17, +26.32%) | - -![Results](img/en-ru-comet.png) -### Comparisons between systems -*If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [wmt18.en-ru](en-ru/wmt18.en-ru.cometcompare) -- wmt18.microsoft.ru outperforms wmt18.bergamot.ru. -- wmt18.google.ru outperforms wmt18.bergamot.ru. -- wmt18.google.ru outperforms wmt18.microsoft.ru. - -#### [wmt21.en-ru](en-ru/wmt21.en-ru.cometcompare) -- wmt21.microsoft.ru outperforms wmt21.bergamot.ru. -- wmt21.google.ru outperforms wmt21.bergamot.ru. -- wmt21.google.ru outperforms wmt21.microsoft.ru. - -#### [wmt20.en-ru](en-ru/wmt20.en-ru.cometcompare) -- wmt20.microsoft.ru outperforms wmt20.bergamot.ru. -- wmt20.google.ru outperforms wmt20.bergamot.ru. -- wmt20.google.ru outperforms wmt20.microsoft.ru. - -#### [wmt16.en-ru](en-ru/wmt16.en-ru.cometcompare) -- wmt16.microsoft.ru outperforms wmt16.bergamot.ru. -- wmt16.google.ru outperforms wmt16.bergamot.ru. -- wmt16.google.ru outperforms wmt16.microsoft.ru. - -#### [flores-test.en-ru](en-ru/flores-test.en-ru.cometcompare) -- flores-test.microsoft.ru outperforms flores-test.bergamot.ru. -- flores-test.google.ru outperforms flores-test.bergamot.ru. -- flores-test.google.ru outperforms flores-test.microsoft.ru. 
- -#### [wmt22.en-ru](en-ru/wmt22.en-ru.cometcompare) -- wmt22.microsoft.ru outperforms wmt22.bergamot.ru. -- wmt22.google.ru outperforms wmt22.bergamot.ru. -- wmt22.google.ru outperforms wmt22.microsoft.ru. - -#### [wmt14.en-ru](en-ru/wmt14.en-ru.cometcompare) -- wmt14.microsoft.ru outperforms wmt14.bergamot.ru. -- wmt14.google.ru outperforms wmt14.bergamot.ru. -- wmt14.google.ru outperforms wmt14.microsoft.ru. - -#### [wmt15.en-ru](en-ru/wmt15.en-ru.cometcompare) -- wmt15.microsoft.ru outperforms wmt15.bergamot.ru. -- wmt15.google.ru outperforms wmt15.bergamot.ru. -- wmt15.google.ru outperforms wmt15.microsoft.ru. - -#### [wmt13.en-ru](en-ru/wmt13.en-ru.cometcompare) -- wmt13.microsoft.ru outperforms wmt13.bergamot.ru. -- wmt13.google.ru outperforms wmt13.bergamot.ru. -- wmt13.google.ru outperforms wmt13.microsoft.ru. - -#### [flores-dev.en-ru](en-ru/flores-dev.en-ru.cometcompare) -- flores-dev.microsoft.ru outperforms flores-dev.bergamot.ru. -- flores-dev.google.ru outperforms flores-dev.bergamot.ru. -- flores-dev.google.ru outperforms flores-dev.microsoft.ru. - -#### [wmt19.en-ru](en-ru/wmt19.en-ru.cometcompare) -- wmt19.microsoft.ru outperforms wmt19.bergamot.ru. -- wmt19.google.ru outperforms wmt19.bergamot.ru. -- wmt19.google.ru outperforms wmt19.microsoft.ru. - -#### [wmt17.en-ru](en-ru/wmt17.en-ru.cometcompare) -- wmt17.microsoft.ru outperforms wmt17.bergamot.ru. -- wmt17.google.ru outperforms wmt17.bergamot.ru. -- wmt17.google.ru outperforms wmt17.microsoft.ru. 
- ---- - ## ru-en -| Translator/Dataset | wmt16 | wmt13 | wmt17 | wmt21 | wmt14 | wmt19 | flores-dev | wmt22 | wmt20 | flores-test | wmt15 | mtedx_test | wmt18 | +| Translator/Dataset | wmt17 | wmt22 | flores-test | wmt20 | mtedx_test | wmt15 | wmt18 | wmt14 | wmt16 | wmt19 | wmt21 | flores-dev | wmt13 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 0.49 | 0.44 | 0.53 | 0.52 | 0.56 | 0.48 | 0.58 | 0.47 | 0.53 | 0.58 | 0.50 | 0.19 | 0.47 | -| google | 0.59 (+0.11, +22.02%) | 0.53 (+0.09, +19.35%) | 0.64 (+0.11, +20.17%) | 0.61 (+0.09, +18.09%) | 0.67 (+0.11, +19.24%) | 0.56 (+0.08, +16.67%) | 0.67 (+0.10, +17.05%) | 0.61 (+0.14, +29.75%) | 0.61 (+0.08, +14.10%) | 0.67 (+0.09, +16.41%) | 0.61 (+0.11, +21.38%) | 0.30 (+0.10, +51.95%) | 0.60 (+0.12, +26.13%) | -| microsoft | 0.60 (+0.11, +22.55%) | 0.54 (+0.10, +22.66%) | 0.65 (+0.11, +21.50%) | 0.62 (+0.10, +20.00%) | 0.68 (+0.11, +19.91%) | 0.59 (+0.11, +22.70%) | 0.67 (+0.09, +15.54%) | 0.62 (+0.15, +31.74%) | 0.62 (+0.09, +16.50%) | 0.66 (+0.08, +14.34%) | 0.62 (+0.11, +22.51%) | 0.30 (+0.10, +53.96%) | 0.60 (+0.13, +27.21%) | +| bergamot | 0.53 | 0.47 | 0.58 | 0.53 | 0.19 | 0.50 | 0.47 | 0.56 | 0.49 | 0.48 | 0.52 | 0.58 | 0.44 | +| google | 0.64 (+0.11, +20.17%) | 0.61 (+0.14, +29.75%) | 0.67 (+0.09, +16.41%) | 0.61 (+0.08, +14.10%) | 0.30 (+0.10, +51.95%) | 0.61 (+0.11, +21.38%) | 0.60 (+0.12, +26.13%) | 0.67 (+0.11, +19.24%) | 0.59 (+0.11, +22.02%) | 0.56 (+0.08, +16.67%) | 0.61 (+0.09, +18.09%) | 0.67 (+0.10, +17.05%) | 0.53 (+0.09, +19.35%) | +| microsoft | 0.65 (+0.11, +21.50%) | 0.62 (+0.15, +31.74%) | 0.66 (+0.08, +14.34%) | 0.62 (+0.09, +16.50%) | 0.30 (+0.10, +53.96%) | 0.62 (+0.11, +22.51%) | 0.60 (+0.13, +27.21%) | 0.68 (+0.11, +19.91%) | 0.60 (+0.11, +22.55%) | 0.59 (+0.11, +22.70%) | 0.62 (+0.10, +20.00%) | 0.67 (+0.09, +15.54%) | 0.54 (+0.10, +22.66%) | ![Results](img/ru-en-comet.png) ### Comparisons between systems *If a comparison is omitted, 
the systems have equal averages (tie). Click on the dataset for a complete report* -#### [wmt16.ru-en](ru-en/wmt16.ru-en.cometcompare) -- wmt16.microsoft.en outperforms wmt16.bergamot.en. -- wmt16.google.en outperforms wmt16.bergamot.en. - -#### [wmt13.ru-en](ru-en/wmt13.ru-en.cometcompare) -- wmt13.microsoft.en outperforms wmt13.bergamot.en. -- wmt13.google.en outperforms wmt13.bergamot.en. -- wmt13.microsoft.en outperforms wmt13.google.en. - #### [wmt17.ru-en](ru-en/wmt17.ru-en.cometcompare) - wmt17.microsoft.en outperforms wmt17.bergamot.en. - wmt17.google.en outperforms wmt17.bergamot.en. - wmt17.microsoft.en outperforms wmt17.google.en. -#### [wmt21.ru-en](ru-en/wmt21.ru-en.cometcompare) -- wmt21.microsoft.en outperforms wmt21.bergamot.en. -- wmt21.google.en outperforms wmt21.bergamot.en. - -#### [wmt14.ru-en](ru-en/wmt14.ru-en.cometcompare) -- wmt14.microsoft.en outperforms wmt14.bergamot.en. -- wmt14.google.en outperforms wmt14.bergamot.en. - -#### [wmt19.ru-en](ru-en/wmt19.ru-en.cometcompare) -- wmt19.microsoft.en outperforms wmt19.bergamot.en. -- wmt19.google.en outperforms wmt19.bergamot.en. -- wmt19.microsoft.en outperforms wmt19.google.en. - -#### [flores-dev.ru-en](ru-en/flores-dev.ru-en.cometcompare) -- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. -- flores-dev.google.en outperforms flores-dev.bergamot.en. -- flores-dev.google.en outperforms flores-dev.microsoft.en. - #### [wmt22.ru-en](ru-en/wmt22.ru-en.cometcompare) - wmt22.microsoft.en outperforms wmt22.bergamot.en. - wmt22.google.en outperforms wmt22.bergamot.en. -#### [wmt20.ru-en](ru-en/wmt20.ru-en.cometcompare) -- wmt20.microsoft.en outperforms wmt20.bergamot.en. -- wmt20.google.en outperforms wmt20.bergamot.en. -- wmt20.microsoft.en outperforms wmt20.google.en. - #### [flores-test.ru-en](ru-en/flores-test.ru-en.cometcompare) - flores-test.microsoft.en outperforms flores-test.bergamot.en. - flores-test.google.en outperforms flores-test.bergamot.en. 
- flores-test.google.en outperforms flores-test.microsoft.en. -#### [wmt15.ru-en](ru-en/wmt15.ru-en.cometcompare) -- wmt15.microsoft.en outperforms wmt15.bergamot.en. -- wmt15.google.en outperforms wmt15.bergamot.en. +#### [wmt20.ru-en](ru-en/wmt20.ru-en.cometcompare) +- wmt20.microsoft.en outperforms wmt20.bergamot.en. +- wmt20.google.en outperforms wmt20.bergamot.en. +- wmt20.microsoft.en outperforms wmt20.google.en. #### [mtedx_test.ru-en](ru-en/mtedx_test.ru-en.cometcompare) - mtedx_test.microsoft.en outperforms mtedx_test.bergamot.en. - mtedx_test.google.en outperforms mtedx_test.bergamot.en. +#### [wmt15.ru-en](ru-en/wmt15.ru-en.cometcompare) +- wmt15.microsoft.en outperforms wmt15.bergamot.en. +- wmt15.google.en outperforms wmt15.bergamot.en. + #### [wmt18.ru-en](ru-en/wmt18.ru-en.cometcompare) - wmt18.microsoft.en outperforms wmt18.bergamot.en. - wmt18.google.en outperforms wmt18.bergamot.en. +#### [wmt14.ru-en](ru-en/wmt14.ru-en.cometcompare) +- wmt14.microsoft.en outperforms wmt14.bergamot.en. +- wmt14.google.en outperforms wmt14.bergamot.en. + +#### [wmt16.ru-en](ru-en/wmt16.ru-en.cometcompare) +- wmt16.microsoft.en outperforms wmt16.bergamot.en. +- wmt16.google.en outperforms wmt16.bergamot.en. + +#### [wmt19.ru-en](ru-en/wmt19.ru-en.cometcompare) +- wmt19.microsoft.en outperforms wmt19.bergamot.en. +- wmt19.google.en outperforms wmt19.bergamot.en. +- wmt19.microsoft.en outperforms wmt19.google.en. + +#### [wmt21.ru-en](ru-en/wmt21.ru-en.cometcompare) +- wmt21.microsoft.en outperforms wmt21.bergamot.en. +- wmt21.google.en outperforms wmt21.bergamot.en. + +#### [flores-dev.ru-en](ru-en/flores-dev.ru-en.cometcompare) +- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.microsoft.en. + +#### [wmt13.ru-en](ru-en/wmt13.ru-en.cometcompare) +- wmt13.microsoft.en outperforms wmt13.bergamot.en. +- wmt13.google.en outperforms wmt13.bergamot.en. 
+- wmt13.microsoft.en outperforms wmt13.google.en. + --- ## en-nl -| Translator/Dataset | flores-test | flores-dev | +| Translator/Dataset | flores-dev | flores-test | | --- | --- | --- | -| bergamot | 0.58 | 0.59 | -| google | 0.67 (+0.09, +15.59%) | 0.67 (+0.08, +13.04%) | -| microsoft | 0.65 (+0.08, +13.25%) | 0.64 (+0.05, +8.90%) | +| bergamot | 0.59 | 0.58 | +| google | 0.67 (+0.08, +13.04%) | 0.67 (+0.09, +15.59%) | +| microsoft | 0.64 (+0.05, +8.90%) | 0.65 (+0.08, +13.25%) | ![Results](img/en-nl-comet.png) ### Comparisons between systems *If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [flores-test.en-nl](en-nl/flores-test.en-nl.cometcompare) -- flores-test.microsoft.nl outperforms flores-test.bergamot.nl. -- flores-test.google.nl outperforms flores-test.bergamot.nl. -- flores-test.google.nl outperforms flores-test.microsoft.nl. - #### [flores-dev.en-nl](en-nl/flores-dev.en-nl.cometcompare) - flores-dev.microsoft.nl outperforms flores-dev.bergamot.nl. - flores-dev.google.nl outperforms flores-dev.bergamot.nl. - flores-dev.google.nl outperforms flores-dev.microsoft.nl. +#### [flores-test.en-nl](en-nl/flores-test.en-nl.cometcompare) +- flores-test.microsoft.nl outperforms flores-test.bergamot.nl. +- flores-test.google.nl outperforms flores-test.bergamot.nl. +- flores-test.google.nl outperforms flores-test.microsoft.nl. 
+ --- -## fa-en +## en-ru -| Translator/Dataset | flores-dev | flores-test | -| --- | --- | --- | -| bergamot | 0.49 | 0.51 | -| google | 0.74 (+0.25, +50.08%) | 0.74 (+0.23, +45.97%) | -| microsoft | 0.66 (+0.16, +33.31%) | 0.67 (+0.16, +32.25%) | +| Translator/Dataset | wmt19 | wmt21 | wmt15 | wmt13 | wmt20 | wmt16 | wmt14 | flores-dev | flores-test | wmt18 | wmt22 | wmt17 | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bergamot | 0.47 | 0.40 | 0.64 | 0.52 | 0.41 | 0.59 | 0.70 | 0.57 | 0.57 | 0.59 | 0.43 | 0.64 | +| google | 0.72 (+0.25, +53.03%) | 0.64 (+0.23, +57.93%) | 0.84 (+0.20, +31.50%) | 0.69 (+0.17, +32.58%) | 0.64 (+0.23, +57.66%) | 0.78 (+0.18, +31.02%) | 0.88 (+0.18, +26.42%) | 0.76 (+0.20, +34.95%) | 0.77 (+0.20, +34.81%) | 0.81 (+0.22, +37.57%) | 0.73 (+0.30, +70.07%) | 0.84 (+0.20, +30.61%) | +| microsoft | 0.70 (+0.22, +47.24%) | 0.59 (+0.19, +46.32%) | 0.81 (+0.17, +26.31%) | 0.67 (+0.14, +27.63%) | 0.59 (+0.18, +44.41%) | 0.74 (+0.15, +25.24%) | 0.86 (+0.16, +22.72%) | 0.72 (+0.15, +26.80%) | 0.73 (+0.16, +27.74%) | 0.77 (+0.18, +30.02%) | 0.67 (+0.25, +57.60%) | 0.81 (+0.17, +26.32%) | -![Results](img/fa-en-comet.png) +![Results](img/en-ru-comet.png) ### Comparisons between systems *If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [flores-dev.fa-en](fa-en/flores-dev.fa-en.cometcompare) -- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. -- flores-dev.google.en outperforms flores-dev.bergamot.en. -- flores-dev.google.en outperforms flores-dev.microsoft.en. +#### [wmt19.en-ru](en-ru/wmt19.en-ru.cometcompare) +- wmt19.microsoft.ru outperforms wmt19.bergamot.ru. +- wmt19.google.ru outperforms wmt19.bergamot.ru. +- wmt19.google.ru outperforms wmt19.microsoft.ru. -#### [flores-test.fa-en](fa-en/flores-test.fa-en.cometcompare) -- flores-test.microsoft.en outperforms flores-test.bergamot.en. 
-- flores-test.google.en outperforms flores-test.bergamot.en. -- flores-test.google.en outperforms flores-test.microsoft.en. +#### [wmt21.en-ru](en-ru/wmt21.en-ru.cometcompare) +- wmt21.microsoft.ru outperforms wmt21.bergamot.ru. +- wmt21.google.ru outperforms wmt21.bergamot.ru. +- wmt21.google.ru outperforms wmt21.microsoft.ru. ---- +#### [wmt15.en-ru](en-ru/wmt15.en-ru.cometcompare) +- wmt15.microsoft.ru outperforms wmt15.bergamot.ru. +- wmt15.google.ru outperforms wmt15.bergamot.ru. +- wmt15.google.ru outperforms wmt15.microsoft.ru. -## uk-en +#### [wmt13.en-ru](en-ru/wmt13.en-ru.cometcompare) +- wmt13.microsoft.ru outperforms wmt13.bergamot.ru. +- wmt13.google.ru outperforms wmt13.bergamot.ru. +- wmt13.google.ru outperforms wmt13.microsoft.ru. -| Translator/Dataset | flores-dev | wmt22 | flores-test | -| --- | --- | --- | --- | -| bergamot | 0.59 | 0.38 | 0.60 | -| google | 0.70 (+0.10, +17.39%) | 0.61 (+0.23, +60.58%) | 0.71 (+0.11, +18.39%) | -| microsoft | 0.68 (+0.09, +15.01%) | 0.56 (+0.18, +47.84%) | 0.69 (+0.09, +15.47%) | +#### [wmt20.en-ru](en-ru/wmt20.en-ru.cometcompare) +- wmt20.microsoft.ru outperforms wmt20.bergamot.ru. +- wmt20.google.ru outperforms wmt20.bergamot.ru. +- wmt20.google.ru outperforms wmt20.microsoft.ru. -![Results](img/uk-en-comet.png) -### Comparisons between systems -*If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [flores-dev.uk-en](uk-en/flores-dev.uk-en.cometcompare) -- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. -- flores-dev.google.en outperforms flores-dev.bergamot.en. -- flores-dev.google.en outperforms flores-dev.microsoft.en. +#### [wmt16.en-ru](en-ru/wmt16.en-ru.cometcompare) +- wmt16.microsoft.ru outperforms wmt16.bergamot.ru. +- wmt16.google.ru outperforms wmt16.bergamot.ru. +- wmt16.google.ru outperforms wmt16.microsoft.ru. -#### [wmt22.uk-en](uk-en/wmt22.uk-en.cometcompare) -- wmt22.microsoft.en outperforms wmt22.bergamot.en. 
-- wmt22.google.en outperforms wmt22.bergamot.en. -- wmt22.google.en outperforms wmt22.microsoft.en. +#### [wmt14.en-ru](en-ru/wmt14.en-ru.cometcompare) +- wmt14.microsoft.ru outperforms wmt14.bergamot.ru. +- wmt14.google.ru outperforms wmt14.bergamot.ru. +- wmt14.google.ru outperforms wmt14.microsoft.ru. -#### [flores-test.uk-en](uk-en/flores-test.uk-en.cometcompare) -- flores-test.microsoft.en outperforms flores-test.bergamot.en. -- flores-test.google.en outperforms flores-test.bergamot.en. -- flores-test.google.en outperforms flores-test.microsoft.en. +#### [flores-dev.en-ru](en-ru/flores-dev.en-ru.cometcompare) +- flores-dev.microsoft.ru outperforms flores-dev.bergamot.ru. +- flores-dev.google.ru outperforms flores-dev.bergamot.ru. +- flores-dev.google.ru outperforms flores-dev.microsoft.ru. + +#### [flores-test.en-ru](en-ru/flores-test.en-ru.cometcompare) +- flores-test.microsoft.ru outperforms flores-test.bergamot.ru. +- flores-test.google.ru outperforms flores-test.bergamot.ru. +- flores-test.google.ru outperforms flores-test.microsoft.ru. + +#### [wmt18.en-ru](en-ru/wmt18.en-ru.cometcompare) +- wmt18.microsoft.ru outperforms wmt18.bergamot.ru. +- wmt18.google.ru outperforms wmt18.bergamot.ru. +- wmt18.google.ru outperforms wmt18.microsoft.ru. + +#### [wmt22.en-ru](en-ru/wmt22.en-ru.cometcompare) +- wmt22.microsoft.ru outperforms wmt22.bergamot.ru. +- wmt22.google.ru outperforms wmt22.bergamot.ru. +- wmt22.google.ru outperforms wmt22.microsoft.ru. + +#### [wmt17.en-ru](en-ru/wmt17.en-ru.cometcompare) +- wmt17.microsoft.ru outperforms wmt17.bergamot.ru. +- wmt17.google.ru outperforms wmt17.bergamot.ru. +- wmt17.google.ru outperforms wmt17.microsoft.ru. 
--- ## en-fa -| Translator/Dataset | flores-dev | flores-test | +| Translator/Dataset | flores-test | flores-dev | | --- | --- | --- | -| bergamot | 0.30 | 0.32 | -| google | 0.70 (+0.40, +134.13%) | 0.71 (+0.39, +119.58%) | -| microsoft | 0.40 (+0.10, +34.06%) | 0.42 (+0.10, +29.43%) | +| bergamot | 0.32 | 0.30 | +| google | 0.71 (+0.39, +119.58%) | 0.70 (+0.40, +134.13%) | +| microsoft | 0.42 (+0.10, +29.43%) | 0.40 (+0.10, +34.06%) | ![Results](img/en-fa-comet.png) ### Comparisons between systems *If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [flores-dev.en-fa](en-fa/flores-dev.en-fa.cometcompare) -- flores-dev.microsoft.fa outperforms flores-dev.bergamot.fa. -- flores-dev.google.fa outperforms flores-dev.bergamot.fa. -- flores-dev.google.fa outperforms flores-dev.microsoft.fa. - #### [flores-test.en-fa](en-fa/flores-test.en-fa.cometcompare) - flores-test.microsoft.fa outperforms flores-test.bergamot.fa. - flores-test.google.fa outperforms flores-test.bergamot.fa. - flores-test.google.fa outperforms flores-test.microsoft.fa. +#### [flores-dev.en-fa](en-fa/flores-dev.en-fa.cometcompare) +- flores-dev.microsoft.fa outperforms flores-dev.bergamot.fa. +- flores-dev.google.fa outperforms flores-dev.bergamot.fa. +- flores-dev.google.fa outperforms flores-dev.microsoft.fa. 
+ --- -## is-en +## nl-en -| Translator/Dataset | wmt21 | flores-dev | flores-test | -| --- | --- | --- | --- | -| bergamot | 0.02 | 0.21 | 0.22 | -| google | 0.67 (+0.66, +4185.35%) | 0.71 (+0.50, +236.11%) | 0.71 (+0.49, +226.94%) | -| microsoft | 0.66 (+0.64, +4101.27%) | 0.68 (+0.47, +219.75%) | 0.68 (+0.46, +213.75%) | +| Translator/Dataset | flores-test | flores-dev | +| --- | --- | --- | +| bergamot | 0.64 | 0.63 | +| google | 0.70 (+0.07, +10.23%) | 0.70 (+0.07, +11.21%) | +| microsoft | 0.69 (+0.06, +8.75%) | 0.69 (+0.06, +9.49%) | -![Results](img/is-en-comet.png) +![Results](img/nl-en-comet.png) ### Comparisons between systems *If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [wmt21.is-en](is-en/wmt21.is-en.cometcompare) -- wmt21.microsoft.en outperforms wmt21.bergamot.en. -- wmt21.google.en outperforms wmt21.bergamot.en. -- wmt21.google.en outperforms wmt21.microsoft.en. +#### [flores-test.nl-en](nl-en/flores-test.nl-en.cometcompare) +- flores-test.microsoft.en outperforms flores-test.bergamot.en. +- flores-test.google.en outperforms flores-test.bergamot.en. +- flores-test.google.en outperforms flores-test.microsoft.en. -#### [flores-dev.is-en](is-en/flores-dev.is-en.cometcompare) +#### [flores-dev.nl-en](nl-en/flores-dev.nl-en.cometcompare) - flores-dev.microsoft.en outperforms flores-dev.bergamot.en. - flores-dev.google.en outperforms flores-dev.bergamot.en. - flores-dev.google.en outperforms flores-dev.microsoft.en. 
-#### [flores-test.is-en](is-en/flores-test.is-en.cometcompare) +--- + +## uk-en + +| Translator/Dataset | wmt22 | flores-test | flores-dev | +| --- | --- | --- | --- | +| bergamot | 0.38 | 0.60 | 0.59 | +| google | 0.61 (+0.23, +60.58%) | 0.71 (+0.11, +18.39%) | 0.70 (+0.10, +17.39%) | +| microsoft | 0.56 (+0.18, +47.84%) | 0.69 (+0.09, +15.47%) | 0.68 (+0.09, +15.01%) | + +![Results](img/uk-en-comet.png) +### Comparisons between systems +*If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* +#### [wmt22.uk-en](uk-en/wmt22.uk-en.cometcompare) +- wmt22.microsoft.en outperforms wmt22.bergamot.en. +- wmt22.google.en outperforms wmt22.bergamot.en. +- wmt22.google.en outperforms wmt22.microsoft.en. + +#### [flores-test.uk-en](uk-en/flores-test.uk-en.cometcompare) - flores-test.microsoft.en outperforms flores-test.bergamot.en. - flores-test.google.en outperforms flores-test.bergamot.en. - flores-test.google.en outperforms flores-test.microsoft.en. +#### [flores-dev.uk-en](uk-en/flores-dev.uk-en.cometcompare) +- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.microsoft.en. + --- -## nl-en +## fa-en -| Translator/Dataset | flores-dev | flores-test | +| Translator/Dataset | flores-test | flores-dev | | --- | --- | --- | -| bergamot | 0.63 | 0.64 | -| google | 0.70 (+0.07, +11.21%) | 0.70 (+0.07, +10.23%) | -| microsoft | 0.69 (+0.06, +9.49%) | 0.69 (+0.06, +8.75%) | +| bergamot | 0.51 | 0.49 | +| google | 0.74 (+0.23, +45.97%) | 0.74 (+0.25, +50.08%) | +| microsoft | 0.67 (+0.16, +32.25%) | 0.66 (+0.16, +33.31%) | -![Results](img/nl-en-comet.png) +![Results](img/fa-en-comet.png) ### Comparisons between systems *If a comparison is omitted, the systems have equal averages (tie). 
Click on the dataset for a complete report* -#### [flores-dev.nl-en](nl-en/flores-dev.nl-en.cometcompare) +#### [flores-test.fa-en](fa-en/flores-test.fa-en.cometcompare) +- flores-test.microsoft.en outperforms flores-test.bergamot.en. +- flores-test.google.en outperforms flores-test.bergamot.en. +- flores-test.google.en outperforms flores-test.microsoft.en. + +#### [flores-dev.fa-en](fa-en/flores-dev.fa-en.cometcompare) - flores-dev.microsoft.en outperforms flores-dev.bergamot.en. - flores-dev.google.en outperforms flores-dev.bergamot.en. - flores-dev.google.en outperforms flores-dev.microsoft.en. -#### [flores-test.nl-en](nl-en/flores-test.nl-en.cometcompare) +--- + +## ca-en + +| Translator/Dataset | flores-test | flores-dev | +| --- | --- | --- | +| bergamot | 0.64 | 0.67 | +| google | 0.81 (+0.17, +26.99%) | 0.82 (+0.15, +22.68%) | +| microsoft | 0.79 (+0.15, +23.44%) | 0.80 (+0.13, +19.11%) | + +![Results](img/ca-en-comet.png) +### Comparisons between systems +*If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* +#### [flores-test.ca-en](ca-en/flores-test.ca-en.cometcompare) - flores-test.microsoft.en outperforms flores-test.bergamot.en. - flores-test.google.en outperforms flores-test.bergamot.en. - flores-test.google.en outperforms flores-test.microsoft.en. +#### [flores-dev.ca-en](ca-en/flores-dev.ca-en.cometcompare) +- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.microsoft.en. 
+ --- ## en-uk -| Translator/Dataset | wmt22 | flores-test | flores-dev | +| Translator/Dataset | flores-test | flores-dev | wmt22 | | --- | --- | --- | --- | -| bergamot | 0.36 | 0.60 | 0.58 | -| google | 0.73 (+0.36, +99.31%) | 0.82 (+0.23, +38.00%) | 0.81 (+0.23, +40.14%) | -| microsoft | 0.67 (+0.31, +84.35%) | 0.79 (+0.20, +32.94%) | 0.78 (+0.20, +34.26%) | +| bergamot | 0.60 | 0.58 | 0.36 | +| google | 0.82 (+0.23, +38.00%) | 0.81 (+0.23, +40.14%) | 0.73 (+0.36, +99.31%) | +| microsoft | 0.79 (+0.20, +32.94%) | 0.78 (+0.20, +34.26%) | 0.67 (+0.31, +84.35%) | ![Results](img/en-uk-comet.png) ### Comparisons between systems *If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [wmt22.en-uk](en-uk/wmt22.en-uk.cometcompare) -- wmt22.microsoft.uk outperforms wmt22.bergamot.uk. -- wmt22.google.uk outperforms wmt22.bergamot.uk. -- wmt22.google.uk outperforms wmt22.microsoft.uk. - #### [flores-test.en-uk](en-uk/flores-test.en-uk.cometcompare) - flores-test.microsoft.uk outperforms flores-test.bergamot.uk. - flores-test.google.uk outperforms flores-test.bergamot.uk. @@ -363,4 +365,37 @@ We also compare the systems using the `comet-compare` tool that calculates the s - flores-dev.google.uk outperforms flores-dev.bergamot.uk. - flores-dev.google.uk outperforms flores-dev.microsoft.uk. +#### [wmt22.en-uk](en-uk/wmt22.en-uk.cometcompare) +- wmt22.microsoft.uk outperforms wmt22.bergamot.uk. +- wmt22.google.uk outperforms wmt22.bergamot.uk. +- wmt22.google.uk outperforms wmt22.microsoft.uk. 
+ +--- + +## is-en + +| Translator/Dataset | flores-test | wmt21 | flores-dev | +| --- | --- | --- | --- | +| bergamot | 0.22 | 0.02 | 0.21 | +| google | 0.71 (+0.49, +226.94%) | 0.67 (+0.66, +4185.35%) | 0.71 (+0.50, +236.11%) | +| microsoft | 0.68 (+0.46, +213.75%) | 0.66 (+0.64, +4101.27%) | 0.68 (+0.47, +219.75%) | + +![Results](img/is-en-comet.png) +### Comparisons between systems +*If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* +#### [flores-test.is-en](is-en/flores-test.is-en.cometcompare) +- flores-test.microsoft.en outperforms flores-test.bergamot.en. +- flores-test.google.en outperforms flores-test.bergamot.en. +- flores-test.google.en outperforms flores-test.microsoft.en. + +#### [wmt21.is-en](is-en/wmt21.is-en.cometcompare) +- wmt21.microsoft.en outperforms wmt21.bergamot.en. +- wmt21.google.en outperforms wmt21.bergamot.en. +- wmt21.google.en outperforms wmt21.microsoft.en. + +#### [flores-dev.is-en](is-en/flores-dev.is-en.cometcompare) +- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.microsoft.en. 
+ --- \ No newline at end of file diff --git a/evaluation/dev/img/avg-bleu.png b/evaluation/dev/img/avg-bleu.png index b445e303..40b5de3e 100644 Binary files a/evaluation/dev/img/avg-bleu.png and b/evaluation/dev/img/avg-bleu.png differ diff --git a/evaluation/dev/img/avg-comet.png b/evaluation/dev/img/avg-comet.png index 1f40b584..8fe4280d 100644 Binary files a/evaluation/dev/img/avg-comet.png and b/evaluation/dev/img/avg-comet.png differ diff --git a/evaluation/dev/img/ca-en-bleu.png b/evaluation/dev/img/ca-en-bleu.png new file mode 100644 index 00000000..17f0d44c Binary files /dev/null and b/evaluation/dev/img/ca-en-bleu.png differ diff --git a/evaluation/dev/img/ca-en-comet.png b/evaluation/dev/img/ca-en-comet.png new file mode 100644 index 00000000..a775accc Binary files /dev/null and b/evaluation/dev/img/ca-en-comet.png differ diff --git a/evaluation/dev/img/en-fa-bleu.png b/evaluation/dev/img/en-fa-bleu.png index 2b4a593e..566d9194 100644 Binary files a/evaluation/dev/img/en-fa-bleu.png and b/evaluation/dev/img/en-fa-bleu.png differ diff --git a/evaluation/dev/img/en-fa-comet.png b/evaluation/dev/img/en-fa-comet.png index 18f22dd3..eb3b2848 100644 Binary files a/evaluation/dev/img/en-fa-comet.png and b/evaluation/dev/img/en-fa-comet.png differ diff --git a/evaluation/dev/img/en-nl-bleu.png b/evaluation/dev/img/en-nl-bleu.png index a8ed8d82..8943bdab 100644 Binary files a/evaluation/dev/img/en-nl-bleu.png and b/evaluation/dev/img/en-nl-bleu.png differ diff --git a/evaluation/dev/img/en-nl-comet.png b/evaluation/dev/img/en-nl-comet.png index 6a67a9d4..192929b5 100644 Binary files a/evaluation/dev/img/en-nl-comet.png and b/evaluation/dev/img/en-nl-comet.png differ diff --git a/evaluation/dev/img/en-ru-bleu.png b/evaluation/dev/img/en-ru-bleu.png index 9c99546e..48adc9a2 100644 Binary files a/evaluation/dev/img/en-ru-bleu.png and b/evaluation/dev/img/en-ru-bleu.png differ diff --git a/evaluation/dev/img/en-ru-comet.png b/evaluation/dev/img/en-ru-comet.png index 
cfcf9827..4ba80a14 100644 Binary files a/evaluation/dev/img/en-ru-comet.png and b/evaluation/dev/img/en-ru-comet.png differ diff --git a/evaluation/dev/img/en-uk-bleu.png b/evaluation/dev/img/en-uk-bleu.png index 772dae81..bbd4c683 100644 Binary files a/evaluation/dev/img/en-uk-bleu.png and b/evaluation/dev/img/en-uk-bleu.png differ diff --git a/evaluation/dev/img/en-uk-comet.png b/evaluation/dev/img/en-uk-comet.png index c7235404..76442cc5 100644 Binary files a/evaluation/dev/img/en-uk-comet.png and b/evaluation/dev/img/en-uk-comet.png differ diff --git a/evaluation/dev/img/fa-en-comet.png b/evaluation/dev/img/fa-en-comet.png index ab543895..ee9cdd5b 100644 Binary files a/evaluation/dev/img/fa-en-comet.png and b/evaluation/dev/img/fa-en-comet.png differ diff --git a/evaluation/dev/img/is-en-comet.png b/evaluation/dev/img/is-en-comet.png index 0a300384..292615c5 100644 Binary files a/evaluation/dev/img/is-en-comet.png and b/evaluation/dev/img/is-en-comet.png differ diff --git a/evaluation/dev/img/nl-en-comet.png b/evaluation/dev/img/nl-en-comet.png index 58741cc2..d1bb6ae2 100644 Binary files a/evaluation/dev/img/nl-en-comet.png and b/evaluation/dev/img/nl-en-comet.png differ diff --git a/evaluation/dev/img/ru-en-bleu.png b/evaluation/dev/img/ru-en-bleu.png index bb921c1a..832a3a5e 100644 Binary files a/evaluation/dev/img/ru-en-bleu.png and b/evaluation/dev/img/ru-en-bleu.png differ diff --git a/evaluation/dev/img/ru-en-comet.png b/evaluation/dev/img/ru-en-comet.png index 697380b6..4d6f4b78 100644 Binary files a/evaluation/dev/img/ru-en-comet.png and b/evaluation/dev/img/ru-en-comet.png differ diff --git a/evaluation/dev/img/uk-en-comet.png b/evaluation/dev/img/uk-en-comet.png index 1339cb7b..50702b88 100644 Binary files a/evaluation/dev/img/uk-en-comet.png and b/evaluation/dev/img/uk-en-comet.png differ diff --git a/evaluation/prod/bleu-results.md b/evaluation/prod/bleu-results.md index 254304b5..2ba7600a 100644 --- a/evaluation/prod/bleu-results.md +++ 
b/evaluation/prod/bleu-results.md @@ -56,15 +56,59 @@ Both absolute and relative differences in BLEU scores between Bergamot and other ## avg -| Translator/Dataset | en-pt | pt-en | en-bg | nb-en | it-en | en-et | fr-en | en-de | es-en | en-it | en-pl | en-es | pl-en | en-fr | et-en | bg-en | de-en | en-cs | cs-en | +| Translator/Dataset | cs-en | en-et | en-it | fr-en | en-pt | et-en | nb-en | bg-en | en-es | en-bg | en-cs | de-en | it-en | pl-en | en-fr | en-pl | pt-en | es-en | en-de | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 49.85 | 44.87 | 42.10 | 37.60 | 32.67 | 25.50 | 35.43 | 32.01 | 32.38 | 29.77 | 22.27 | 32.41 | 27.87 | 36.01 | 32.37 | 38.50 | 33.16 | 24.76 | 31.07 | -| google | 53.75 (+3.90, +7.82%) | 46.60 (+1.73, +3.86%) | 44.60 (+2.50, +5.94%) | 42.05 (+4.45, +11.84%) | 34.50 (+1.83, +5.59%) | 28.60 (+3.10, +12.16%) | 37.81 (+2.38, +6.70%) | 33.16 (+1.14, +3.58%) | 33.64 (+1.27, +3.91%) | 28.97 (-0.80, -2.69%) | 25.50 (+3.23, +14.52%) | 34.74 (+2.32, +7.17%) | 31.23 (+3.37, +12.08%) | 29.47 (-6.54, -18.15%) | 35.80 (+3.43, +10.61%) | 41.30 (+2.80, +7.27%) | 35.65 (+2.49, +7.52%) | 27.72 (+2.96, +11.95%) | 33.36 (+2.29, +7.36%) | -| microsoft | 50.15 (+0.30, +0.60%) | 46.47 (+1.60, +3.57%) | 38.55 (-3.55, -8.43%) | 42.90 (+5.30, +14.10%) | 34.55 (+1.88, +5.74%) | 28.47 (+2.97, +11.63%) | 39.13 (+3.70, +10.44%) | 33.54 (+1.53, +4.79%) | 32.93 (+0.56, +1.72%) | 32.30 (+2.53, +8.51%) | 24.83 (+2.57, +11.53%) | 33.76 (+1.35, +4.17%) | 31.83 (+3.97, +14.23%) | 36.48 (+0.47, +1.31%) | 36.17 (+3.80, +11.74%) | 41.20 (+2.70, +7.01%) | 37.73 (+4.57, +13.79%) | 28.26 (+3.50, +14.14%) | 34.67 (+3.61, +11.61%) | +| bergamot | 31.07 | 25.50 | 29.77 | 35.43 | 49.85 | 32.37 | 37.60 | 38.50 | 32.41 | 42.10 | 24.76 | 33.16 | 32.67 | 27.87 | 36.01 | 22.27 | 44.87 | 32.38 | 32.01 | +| google | 33.36 (+2.29, +7.36%) | 28.60 (+3.10, +12.16%) | 28.97 (-0.80, -2.69%) | 37.81 
(+2.38, +6.70%) | 53.75 (+3.90, +7.82%) | 35.80 (+3.43, +10.61%) | 42.05 (+4.45, +11.84%) | 41.30 (+2.80, +7.27%) | 34.74 (+2.32, +7.17%) | 44.60 (+2.50, +5.94%) | 27.72 (+2.96, +11.95%) | 35.65 (+2.49, +7.52%) | 34.50 (+1.83, +5.59%) | 31.23 (+3.37, +12.08%) | 29.47 (-6.54, -18.15%) | 25.50 (+3.23, +14.52%) | 46.60 (+1.73, +3.86%) | 33.64 (+1.27, +3.91%) | 33.16 (+1.14, +3.58%) | +| microsoft | 34.67 (+3.61, +11.61%) | 28.47 (+2.97, +11.63%) | 32.30 (+2.53, +8.51%) | 39.13 (+3.70, +10.44%) | 50.15 (+0.30, +0.60%) | 36.17 (+3.80, +11.74%) | 42.90 (+5.30, +14.10%) | 41.20 (+2.70, +7.01%) | 33.76 (+1.35, +4.17%) | 38.55 (-3.55, -8.43%) | 28.26 (+3.50, +14.14%) | 37.73 (+4.57, +13.79%) | 34.55 (+1.88, +5.74%) | 31.83 (+3.97, +14.23%) | 36.48 (+0.47, +1.31%) | 24.83 (+2.57, +11.53%) | 46.47 (+1.60, +3.57%) | 32.93 (+0.56, +1.72%) | 33.54 (+1.53, +4.79%) | ![Results](img/avg-bleu.png) --- +## cs-en + +| Translator/Dataset | wmt08 | wmt17 | wmt10 | flores-dev | wmt22 | flores-test | wmt12 | wmt11 | wmt14 | wmt15 | wmt16 | wmt13 | wmt18 | wmt09 | wmt21 | wmt20 | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bergamot | 24.50 | 30.20 | 28.20 | 35.30 | 44.50 | 35.30 | 26.50 | 28.10 | 35.00 | 32.00 | 33.40 | 30.30 | 31.30 | 27.60 | 27.90 | 27.00 | +| google | 26.30 (+1.80, +7.35%) | 31.20 (+1.00, +3.31%) | 30.50 (+2.30, +8.16%) | 38.60 (+3.30, +9.35%) | 49.40 (+4.90, +11.01%) | 39.00 (+3.70, +10.48%) | 28.60 (+2.10, +7.92%) | 30.20 (+2.10, +7.47%) | 38.00 (+3.00, +8.57%) | 33.60 (+1.60, +5.00%) | 34.80 (+1.40, +4.19%) | 32.40 (+2.10, +6.93%) | 32.10 (+0.80, +2.56%) | 29.90 (+2.30, +8.33%) | 30.70 (+2.80, +10.04%) | 28.40 (+1.40, +5.19%) | +| microsoft | 26.40 (+1.90, +7.76%) | 33.60 (+3.40, +11.26%) | 30.70 (+2.50, +8.87%) | 40.00 (+4.70, +13.31%) | 54.90 (+10.40, +23.37%) | 40.30 (+5.00, +14.16%) | 29.70 (+3.20, +12.08%) | 30.90 (+2.80, +9.96%) | 39.90 (+4.90, +14.00%) | 34.70 (+2.70, +8.44%) | 38.30 (+4.90, 
+14.67%) | 33.40 (+3.10, +10.23%) | 34.30 (+3.00, +9.58%) | 29.60 (+2.00, +7.25%) | 30.50 (+2.60, +9.32%) | 27.60 (+0.60, +2.22%) | + +![Results](img/cs-en-bleu.png) +--- + +## en-et + +| Translator/Dataset | flores-dev | flores-test | wmt18 | +| --- | --- | --- | --- | +| bergamot | 25.60 | 25.70 | 25.20 | +| google | 30.20 (+4.60, +17.97%) | 29.00 (+3.30, +12.84%) | 26.60 (+1.40, +5.56%) | +| microsoft | 28.60 (+3.00, +11.72%) | 29.20 (+3.50, +13.62%) | 27.60 (+2.40, +9.52%) | + +![Results](img/en-et-bleu.png) +--- + +## en-it + +| Translator/Dataset | flores-test | flores-dev | wmt09 | +| --- | --- | --- | --- | +| bergamot | 29.30 | 29.20 | 30.80 | +| google | 29.60 (+0.30, +1.02%) | 28.50 (-0.70, -2.40%) | 28.80 (-2.00, -6.49%) | +| microsoft | 32.10 (+2.80, +9.56%) | 31.10 (+1.90, +6.51%) | 33.70 (+2.90, +9.42%) | + +![Results](img/en-it-bleu.png) +--- + +## fr-en + +| Translator/Dataset | wmt08 | mtedx_test | iwslt17 | wmt10 | flores-dev | flores-test | wmt12 | wmt11 | wmt14 | wmt15 | wmt13 | wmt09 | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bergamot | 24.50 | 42.80 | 39.80 | 31.40 | 43.80 | 42.10 | 31.90 | 32.00 | 37.30 | 37.20 | 33.30 | 29.10 | +| google | 26.60 (+2.10, +8.57%) | 42.70 (-0.10, -0.23%) | 40.60 (+0.80, +2.01%) | 34.10 (+2.70, +8.60%) | 48.70 (+4.90, +11.19%) | 46.70 (+4.60, +10.93%) | 33.80 (+1.90, +5.96%) | 34.30 (+2.30, +7.19%) | 40.60 (+3.30, +8.85%) | 39.90 (+2.70, +7.26%) | 34.50 (+1.20, +3.60%) | 31.20 (+2.10, +7.22%) | +| microsoft | 27.40 (+2.90, +11.84%) | 46.40 (+3.60, +8.41%) | 41.80 (+2.00, +5.03%) | 35.00 (+3.60, +11.46%) | 48.90 (+5.10, +11.64%) | 47.00 (+4.90, +11.64%) | 34.60 (+2.70, +8.46%) | 35.20 (+3.20, +10.00%) | 42.30 (+5.00, +13.40%) | 42.70 (+5.50, +14.78%) | 36.10 (+2.80, +8.41%) | 32.20 (+3.10, +10.65%) | + +![Results](img/fr-en-bleu.png) +--- + ## en-pt | Translator/Dataset | flores-test | flores-dev | @@ -76,26 +120,15 @@ Both absolute and relative differences in BLEU 
scores between Bergamot and other ![Results](img/en-pt-bleu.png) --- -## pt-en +## et-en -| Translator/Dataset | flores-dev | mtedx_test | flores-test | +| Translator/Dataset | flores-dev | flores-test | wmt18 | | --- | --- | --- | --- | -| bergamot | 47.80 | 40.20 | 46.60 | -| google | 50.40 (+2.60, +5.44%) | 39.10 (-1.10, -2.74%) | 50.30 (+3.70, +7.94%) | -| microsoft | 49.80 (+2.00, +4.18%) | 41.00 (+0.80, +1.99%) | 48.60 (+2.00, +4.29%) | +| bergamot | 33.50 | 32.70 | 30.90 | +| google | 38.30 (+4.80, +14.33%) | 37.00 (+4.30, +13.15%) | 32.10 (+1.20, +3.88%) | +| microsoft | 37.40 (+3.90, +11.64%) | 37.00 (+4.30, +13.15%) | 34.10 (+3.20, +10.36%) | -![Results](img/pt-en-bleu.png) ---- - -## en-bg - -| Translator/Dataset | flores-dev | flores-test | -| --- | --- | --- | -| bergamot | 42.00 | 42.20 | -| google | 44.10 (+2.10, +5.00%) | 45.10 (+2.90, +6.87%) | -| microsoft | 38.00 (-4.00, -9.52%) | 39.10 (-3.10, -7.35%) | - -![Results](img/en-bg-bleu.png) +![Results](img/et-en-bleu.png) --- ## nb-en @@ -109,167 +142,134 @@ Both absolute and relative differences in BLEU scores between Bergamot and other ![Results](img/nb-en-bleu.png) --- -## it-en - -| Translator/Dataset | flores-dev | mtedx_test | wmt09 | flores-test | -| --- | --- | --- | --- | --- | -| bergamot | 31.10 | 35.70 | 33.50 | 30.40 | -| google | 33.40 (+2.30, +7.40%) | 35.90 (+0.20, +0.56%) | 35.40 (+1.90, +5.67%) | 33.30 (+2.90, +9.54%) | -| microsoft | 33.30 (+2.20, +7.07%) | 36.40 (+0.70, +1.96%) | 35.80 (+2.30, +6.87%) | 32.70 (+2.30, +7.57%) | - -![Results](img/it-en-bleu.png) ---- - -## en-et - -| Translator/Dataset | flores-test | wmt18 | flores-dev | -| --- | --- | --- | --- | -| bergamot | 25.70 | 25.20 | 25.60 | -| google | 29.00 (+3.30, +12.84%) | 26.60 (+1.40, +5.56%) | 30.20 (+4.60, +17.97%) | -| microsoft | 29.20 (+3.50, +13.62%) | 27.60 (+2.40, +9.52%) | 28.60 (+3.00, +11.72%) | - -![Results](img/en-et-bleu.png) ---- - -## fr-en +## bg-en -| Translator/Dataset | flores-dev | iwslt17 | 
wmt10 | wmt08 | mtedx_test | wmt12 | wmt15 | wmt09 | wmt14 | wmt11 | wmt13 | flores-test | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 43.80 | 39.80 | 31.40 | 24.50 | 42.80 | 31.90 | 37.20 | 29.10 | 37.30 | 32.00 | 33.30 | 42.10 | -| google | 48.70 (+4.90, +11.19%) | 40.60 (+0.80, +2.01%) | 34.10 (+2.70, +8.60%) | 26.60 (+2.10, +8.57%) | 42.70 (-0.10, -0.23%) | 33.80 (+1.90, +5.96%) | 39.90 (+2.70, +7.26%) | 31.20 (+2.10, +7.22%) | 40.60 (+3.30, +8.85%) | 34.30 (+2.30, +7.19%) | 34.50 (+1.20, +3.60%) | 46.70 (+4.60, +10.93%) | -| microsoft | 48.90 (+5.10, +11.64%) | 41.80 (+2.00, +5.03%) | 35.00 (+3.60, +11.46%) | 27.40 (+2.90, +11.84%) | 46.40 (+3.60, +8.41%) | 34.60 (+2.70, +8.46%) | 42.70 (+5.50, +14.78%) | 32.20 (+3.10, +10.65%) | 42.30 (+5.00, +13.40%) | 35.20 (+3.20, +10.00%) | 36.10 (+2.80, +8.41%) | 47.00 (+4.90, +11.64%) | +| Translator/Dataset | flores-dev | flores-test | +| --- | --- | --- | +| bergamot | 39.10 | 37.90 | +| google | 41.90 (+2.80, +7.16%) | 40.70 (+2.80, +7.39%) | +| microsoft | 42.10 (+3.00, +7.67%) | 40.30 (+2.40, +6.33%) | -![Results](img/fr-en-bleu.png) +![Results](img/bg-en-bleu.png) --- -## en-de +## en-es -| Translator/Dataset | wmt17 | wmt18 | iwslt17 | wmt13 | flores-test | wmt10 | wmt11 | wmt19 | wmt14 | wmt09 | wmt20 | wmt08 | wmt15 | wmt12 | wmt21 | flores-dev | wmt22 | wmt16 | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 32.00 | 47.70 | 26.70 | 28.20 | 38.80 | 26.80 | 23.40 | 44.50 | 29.80 | 23.00 | 35.70 | 23.60 | 33.10 | 24.30 | 27.70 | 38.80 | 32.10 | 40.00 | -| google | 31.50 (-0.50, -1.56%) | 47.80 (+0.10, +0.21%) | 28.90 (+2.20, +8.24%) | 28.80 (+0.60, +2.13%) | 42.30 (+3.50, +9.02%) | 26.50 (-0.30, -1.12%) | 24.10 (+0.70, +2.99%) | 43.50 (-1.00, -2.25%) | 30.90 (+1.10, +3.69%) | 23.60 (+0.60, +2.61%) | 36.50 (+0.80, +2.24%) | 23.70 (+0.10, +0.42%) | 33.70 (+0.60, +1.81%) | 24.70 
(+0.40, +1.65%) | 29.70 (+2.00, +7.22%) | 43.70 (+4.90, +12.63%) | 38.30 (+6.20, +19.31%) | 38.60 (-1.40, -3.50%) | -| microsoft | 33.10 (+1.10, +3.44%) | 48.70 (+1.00, +2.10%) | 28.20 (+1.50, +5.62%) | 28.80 (+0.60, +2.13%) | 42.90 (+4.10, +10.57%) | 27.20 (+0.40, +1.49%) | 23.70 (+0.30, +1.28%) | 43.80 (-0.70, -1.57%) | 32.20 (+2.40, +8.05%) | 23.90 (+0.90, +3.91%) | 36.10 (+0.40, +1.12%) | 24.00 (+0.40, +1.69%) | 34.30 (+1.20, +3.63%) | 25.30 (+1.00, +4.12%) | 29.80 (+2.10, +7.58%) | 44.00 (+5.20, +13.40%) | 37.30 (+5.20, +16.20%) | 40.50 (+0.50, +1.25%) | +| Translator/Dataset | wmt08 | wmt11 | wmt09 | flores-dev | wmt10 | wmt13 | wmt12 | flores-test | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bergamot | 29.00 | 37.90 | 29.90 | 25.90 | 36.70 | 34.80 | 38.90 | 26.20 | +| google | 30.00 (+1.00, +3.45%) | 39.90 (+2.00, +5.28%) | 30.90 (+1.00, +3.34%) | 30.50 (+4.60, +17.76%) | 38.80 (+2.10, +5.72%) | 36.90 (+2.10, +6.03%) | 40.50 (+1.60, +4.11%) | 30.40 (+4.20, +16.03%) | +| microsoft | 29.90 (+0.90, +3.10%) | 39.10 (+1.20, +3.17%) | 30.70 (+0.80, +2.68%) | 28.40 (+2.50, +9.65%) | 37.80 (+1.10, +3.00%) | 35.70 (+0.90, +2.59%) | 40.00 (+1.10, +2.83%) | 28.50 (+2.30, +8.78%) | -![Results](img/en-de-bleu.png) +![Results](img/en-es-bleu.png) --- -## es-en +## en-bg -| Translator/Dataset | flores-dev | wmt10 | wmt08 | mtedx_test | wmt12 | wmt09 | wmt11 | wmt13 | flores-test | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 27.50 | 35.80 | 27.30 | 36.80 | 38.30 | 29.40 | 34.30 | 35.20 | 26.80 | -| google | 30.50 (+3.00, +10.91%) | 37.00 (+1.20, +3.35%) | 28.30 (+1.00, +3.66%) | 35.40 (-1.40, -3.80%) | 38.80 (+0.50, +1.31%) | 31.60 (+2.20, +7.48%) | 35.20 (+0.90, +2.62%) | 35.70 (+0.50, +1.42%) | 30.30 (+3.50, +13.06%) | -| microsoft | 30.30 (+2.80, +10.18%) | 35.40 (-0.40, -1.12%) | 26.80 (-0.50, -1.83%) | 37.60 (+0.80, +2.17%) | 37.80 (-0.50, -1.31%) | 29.60 (+0.20, +0.68%) | 33.70 (-0.60, -1.75%) | 35.30 (+0.10, +0.28%) | 
29.90 (+3.10, +11.57%) | +| Translator/Dataset | flores-dev | flores-test | +| --- | --- | --- | +| bergamot | 42.00 | 42.20 | +| google | 44.10 (+2.10, +5.00%) | 45.10 (+2.90, +6.87%) | +| microsoft | 38.00 (-4.00, -9.52%) | 39.10 (-3.10, -7.35%) | -![Results](img/es-en-bleu.png) +![Results](img/en-bg-bleu.png) --- -## en-it +## en-cs -| Translator/Dataset | flores-test | wmt09 | flores-dev | -| --- | --- | --- | --- | -| bergamot | 29.30 | 30.80 | 29.20 | -| google | 29.60 (+0.30, +1.02%) | 28.80 (-2.00, -6.49%) | 28.50 (-0.70, -2.40%) | -| microsoft | 32.10 (+2.80, +9.56%) | 33.70 (+2.90, +9.42%) | 31.10 (+1.90, +6.51%) | +| Translator/Dataset | wmt12 | flores-dev | wmt19 | wmt21 | wmt14 | wmt17 | wmt20 | wmt15 | wmt22 | wmt08 | wmt11 | wmt18 | wmt09 | wmt10 | flores-test | wmt13 | wmt16 | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bergamot | 18.80 | 30.10 | 27.10 | 19.50 | 28.70 | 23.40 | 32.70 | 25.40 | 31.70 | 19.00 | 20.70 | 22.70 | 20.80 | 20.90 | 30.20 | 23.40 | 25.80 | +| google | 20.70 (+1.90, +10.11%) | 34.10 (+4.00, +13.29%) | 27.20 (+0.10, +0.37%) | 21.80 (+2.30, +11.79%) | 31.20 (+2.50, +8.71%) | 24.70 (+1.30, +5.56%) | 35.50 (+2.80, +8.56%) | 26.80 (+1.40, +5.51%) | 48.40 (+16.70, +52.68%) | 20.50 (+1.50, +7.89%) | 23.00 (+2.30, +11.11%) | 24.40 (+1.70, +7.49%) | 22.60 (+1.80, +8.65%) | 22.40 (+1.50, +7.18%) | 34.40 (+4.20, +13.91%) | 25.20 (+1.80, +7.69%) | 28.30 (+2.50, +9.69%) | +| microsoft | 22.90 (+4.10, +21.81%) | 33.50 (+3.40, +11.30%) | 27.20 (+0.10, +0.37%) | 22.00 (+2.50, +12.82%) | 31.90 (+3.20, +11.15%) | 25.60 (+2.20, +9.40%) | 34.10 (+1.40, +4.28%) | 27.40 (+2.00, +7.87%) | 42.10 (+10.40, +32.81%) | 22.60 (+3.60, +18.95%) | 25.30 (+4.60, +22.22%) | 24.90 (+2.20, +9.69%) | 25.00 (+4.20, +20.19%) | 24.30 (+3.40, +16.27%) | 34.00 (+3.80, +12.58%) | 27.70 (+4.30, +18.38%) | 29.90 (+4.10, +15.89%) | -![Results](img/en-it-bleu.png) +![Results](img/en-cs-bleu.png) --- -## 
en-pl +## de-en -| Translator/Dataset | flores-test | wmt20 | flores-dev | -| --- | --- | --- | --- | -| bergamot | 21.00 | 25.10 | 20.70 | -| google | 24.40 (+3.40, +16.19%) | 27.90 (+2.80, +11.16%) | 24.20 (+3.50, +16.91%) | -| microsoft | 23.80 (+2.80, +13.33%) | 27.70 (+2.60, +10.36%) | 23.00 (+2.30, +11.11%) | +| Translator/Dataset | wmt08 | wmt19 | wmt17 | iwslt17 | wmt10 | flores-dev | wmt22 | flores-test | wmt12 | wmt11 | wmt14 | wmt15 | wmt16 | wmt13 | wmt18 | wmt09 | wmt21 | wmt20 | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bergamot | 26.00 | 39.00 | 35.00 | 28.60 | 29.00 | 39.60 | 29.20 | 39.10 | 27.60 | 26.30 | 33.50 | 33.50 | 39.60 | 30.80 | 43.30 | 26.40 | 31.50 | 38.80 | +| google | 27.60 (+1.60, +6.15%) | 41.10 (+2.10, +5.38%) | 38.70 (+3.70, +10.57%) | 30.10 (+1.50, +5.24%) | 32.10 (+3.10, +10.69%) | 43.10 (+3.50, +8.84%) | 33.30 (+4.10, +14.04%) | 42.80 (+3.70, +9.46%) | 29.50 (+1.90, +6.88%) | 27.30 (+1.00, +3.80%) | 37.40 (+3.90, +11.64%) | 36.10 (+2.60, +7.76%) | 42.30 (+2.70, +6.82%) | 32.40 (+1.60, +5.19%) | 46.20 (+2.90, +6.70%) | 27.20 (+0.80, +3.03%) | 32.70 (+1.20, +3.81%) | 41.80 (+3.00, +7.73%) | +| microsoft | 29.40 (+3.40, +13.08%) | 43.80 (+4.80, +12.31%) | 40.80 (+5.80, +16.57%) | 32.50 (+3.90, +13.64%) | 33.40 (+4.40, +15.17%) | 44.90 (+5.30, +13.38%) | 33.50 (+4.30, +14.73%) | 45.80 (+6.70, +17.14%) | 31.30 (+3.70, +13.41%) | 29.20 (+2.90, +11.03%) | 39.20 (+5.70, +17.01%) | 38.10 (+4.60, +13.73%) | 46.30 (+6.70, +16.92%) | 34.30 (+3.50, +11.36%) | 49.60 (+6.30, +14.55%) | 29.10 (+2.70, +10.23%) | 34.30 (+2.80, +8.89%) | 43.60 (+4.80, +12.37%) | -![Results](img/en-pl-bleu.png) +![Results](img/de-en-bleu.png) --- -## en-es +## it-en -| Translator/Dataset | wmt11 | wmt08 | wmt12 | wmt09 | flores-dev | wmt13 | wmt10 | flores-test | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 37.90 | 29.00 | 38.90 | 29.90 | 25.90 | 34.80 | 36.70 | 
26.20 | -| google | 39.90 (+2.00, +5.28%) | 30.00 (+1.00, +3.45%) | 40.50 (+1.60, +4.11%) | 30.90 (+1.00, +3.34%) | 30.50 (+4.60, +17.76%) | 36.90 (+2.10, +6.03%) | 38.80 (+2.10, +5.72%) | 30.40 (+4.20, +16.03%) | -| microsoft | 39.10 (+1.20, +3.17%) | 29.90 (+0.90, +3.10%) | 40.00 (+1.10, +2.83%) | 30.70 (+0.80, +2.68%) | 28.40 (+2.50, +9.65%) | 35.70 (+0.90, +2.59%) | 37.80 (+1.10, +3.00%) | 28.50 (+2.30, +8.78%) | +| Translator/Dataset | mtedx_test | flores-dev | flores-test | wmt09 | +| --- | --- | --- | --- | --- | +| bergamot | 35.70 | 31.10 | 30.40 | 33.50 | +| google | 35.90 (+0.20, +0.56%) | 33.40 (+2.30, +7.40%) | 33.30 (+2.90, +9.54%) | 35.40 (+1.90, +5.67%) | +| microsoft | 36.40 (+0.70, +1.96%) | 33.30 (+2.20, +7.07%) | 32.70 (+2.30, +7.57%) | 35.80 (+2.30, +6.87%) | -![Results](img/en-es-bleu.png) +![Results](img/it-en-bleu.png) --- ## pl-en -| Translator/Dataset | flores-dev | wmt20 | flores-test | +| Translator/Dataset | flores-dev | flores-test | wmt20 | | --- | --- | --- | --- | -| bergamot | 26.80 | 31.00 | 25.80 | -| google | 30.00 (+3.20, +11.94%) | 34.10 (+3.10, +10.00%) | 29.60 (+3.80, +14.73%) | -| microsoft | 30.10 (+3.30, +12.31%) | 35.50 (+4.50, +14.52%) | 29.90 (+4.10, +15.89%) | +| bergamot | 26.80 | 25.80 | 31.00 | +| google | 30.00 (+3.20, +11.94%) | 29.60 (+3.80, +14.73%) | 34.10 (+3.10, +10.00%) | +| microsoft | 30.10 (+3.30, +12.31%) | 29.90 (+4.10, +15.89%) | 35.50 (+4.50, +14.52%) | ![Results](img/pl-en-bleu.png) --- ## en-fr -| Translator/Dataset | wmt13 | wmt14 | flores-test | wmt08 | wmt09 | flores-dev | iwslt17 | wmt12 | wmt10 | wmt15 | wmt11 | +| Translator/Dataset | iwslt17 | wmt13 | wmt09 | wmt11 | wmt10 | wmt08 | wmt12 | flores-test | wmt14 | wmt15 | flores-dev | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 33.40 | 39.70 | 48.70 | 25.50 | 28.80 | 48.50 | 38.60 | 31.40 | 31.00 | 36.90 | 33.60 | -| google | 26.50 (-6.90, -20.66%) | 32.60 (-7.10, -17.88%) | 41.80 (-6.90, -14.17%) | 
20.70 (-4.80, -18.82%) | 23.50 (-5.30, -18.40%) | 41.30 (-7.20, -14.85%) | 28.00 (-10.60, -27.46%) | 25.10 (-6.30, -20.06%) | 26.60 (-4.40, -14.19%) | 30.60 (-6.30, -17.07%) | 27.50 (-6.10, -18.15%) | -| microsoft | 31.50 (-1.90, -5.69%) | 40.40 (+0.70, +1.76%) | 52.70 (+4.00, +8.21%) | 25.10 (-0.40, -1.57%) | 28.20 (-0.60, -2.08%) | 52.50 (+4.00, +8.25%) | 36.50 (-2.10, -5.44%) | 29.60 (-1.80, -5.73%) | 33.00 (+2.00, +6.45%) | 39.70 (+2.80, +7.59%) | 32.10 (-1.50, -4.46%) | +| bergamot | 38.60 | 33.40 | 28.80 | 33.60 | 31.00 | 25.50 | 31.40 | 48.70 | 39.70 | 36.90 | 48.50 | +| google | 28.00 (-10.60, -27.46%) | 26.50 (-6.90, -20.66%) | 23.50 (-5.30, -18.40%) | 27.50 (-6.10, -18.15%) | 26.60 (-4.40, -14.19%) | 20.70 (-4.80, -18.82%) | 25.10 (-6.30, -20.06%) | 41.80 (-6.90, -14.17%) | 32.60 (-7.10, -17.88%) | 30.60 (-6.30, -17.07%) | 41.30 (-7.20, -14.85%) | +| microsoft | 36.50 (-2.10, -5.44%) | 31.50 (-1.90, -5.69%) | 28.20 (-0.60, -2.08%) | 32.10 (-1.50, -4.46%) | 33.00 (+2.00, +6.45%) | 25.10 (-0.40, -1.57%) | 29.60 (-1.80, -5.73%) | 52.70 (+4.00, +8.21%) | 40.40 (+0.70, +1.76%) | 39.70 (+2.80, +7.59%) | 52.50 (+4.00, +8.25%) | ![Results](img/en-fr-bleu.png) --- -## et-en +## en-pl -| Translator/Dataset | flores-dev | wmt18 | flores-test | +| Translator/Dataset | wmt20 | flores-dev | flores-test | | --- | --- | --- | --- | -| bergamot | 33.50 | 30.90 | 32.70 | -| google | 38.30 (+4.80, +14.33%) | 32.10 (+1.20, +3.88%) | 37.00 (+4.30, +13.15%) | -| microsoft | 37.40 (+3.90, +11.64%) | 34.10 (+3.20, +10.36%) | 37.00 (+4.30, +13.15%) | - -![Results](img/et-en-bleu.png) ---- +| bergamot | 25.10 | 20.70 | 21.00 | +| google | 27.90 (+2.80, +11.16%) | 24.20 (+3.50, +16.91%) | 24.40 (+3.40, +16.19%) | +| microsoft | 27.70 (+2.60, +10.36%) | 23.00 (+2.30, +11.11%) | 23.80 (+2.80, +13.33%) | -## bg-en - -| Translator/Dataset | flores-dev | flores-test | -| --- | --- | --- | -| bergamot | 39.10 | 37.90 | -| google | 41.90 (+2.80, +7.16%) | 40.70 (+2.80, +7.39%) | -| 
microsoft | 42.10 (+3.00, +7.67%) | 40.30 (+2.40, +6.33%) | - -![Results](img/bg-en-bleu.png) +![Results](img/en-pl-bleu.png) --- -## de-en +## pt-en -| Translator/Dataset | flores-dev | iwslt17 | wmt10 | wmt08 | wmt18 | wmt20 | wmt19 | wmt12 | wmt15 | wmt09 | wmt17 | wmt14 | wmt11 | wmt16 | wmt22 | wmt13 | flores-test | wmt21 | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 39.60 | 28.60 | 29.00 | 26.00 | 43.30 | 38.80 | 39.00 | 27.60 | 33.50 | 26.40 | 35.00 | 33.50 | 26.30 | 39.60 | 29.20 | 30.80 | 39.10 | 31.50 | -| google | 43.10 (+3.50, +8.84%) | 30.10 (+1.50, +5.24%) | 32.10 (+3.10, +10.69%) | 27.60 (+1.60, +6.15%) | 46.20 (+2.90, +6.70%) | 41.80 (+3.00, +7.73%) | 41.10 (+2.10, +5.38%) | 29.50 (+1.90, +6.88%) | 36.10 (+2.60, +7.76%) | 27.20 (+0.80, +3.03%) | 38.70 (+3.70, +10.57%) | 37.40 (+3.90, +11.64%) | 27.30 (+1.00, +3.80%) | 42.30 (+2.70, +6.82%) | 33.30 (+4.10, +14.04%) | 32.40 (+1.60, +5.19%) | 42.80 (+3.70, +9.46%) | 32.70 (+1.20, +3.81%) | -| microsoft | 44.90 (+5.30, +13.38%) | 32.50 (+3.90, +13.64%) | 33.40 (+4.40, +15.17%) | 29.40 (+3.40, +13.08%) | 49.60 (+6.30, +14.55%) | 43.60 (+4.80, +12.37%) | 43.80 (+4.80, +12.31%) | 31.30 (+3.70, +13.41%) | 38.10 (+4.60, +13.73%) | 29.10 (+2.70, +10.23%) | 40.80 (+5.80, +16.57%) | 39.20 (+5.70, +17.01%) | 29.20 (+2.90, +11.03%) | 46.30 (+6.70, +16.92%) | 33.50 (+4.30, +14.73%) | 34.30 (+3.50, +11.36%) | 45.80 (+6.70, +17.14%) | 34.30 (+2.80, +8.89%) | +| Translator/Dataset | mtedx_test | flores-dev | flores-test | +| --- | --- | --- | --- | +| bergamot | 40.20 | 47.80 | 46.60 | +| google | 39.10 (-1.10, -2.74%) | 50.40 (+2.60, +5.44%) | 50.30 (+3.70, +7.94%) | +| microsoft | 41.00 (+0.80, +1.99%) | 49.80 (+2.00, +4.18%) | 48.60 (+2.00, +4.29%) | -![Results](img/de-en-bleu.png) +![Results](img/pt-en-bleu.png) --- -## en-cs +## es-en -| Translator/Dataset | wmt21 | wmt11 | wmt09 | wmt19 | wmt16 | wmt20 | flores-dev | wmt13 | 
wmt08 | wmt15 | wmt18 | wmt10 | wmt12 | wmt22 | wmt14 | wmt17 | flores-test | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 19.50 | 20.70 | 20.80 | 27.10 | 25.80 | 32.70 | 30.10 | 23.40 | 19.00 | 25.40 | 22.70 | 20.90 | 18.80 | 31.70 | 28.70 | 23.40 | 30.20 | -| google | 21.80 (+2.30, +11.79%) | 23.00 (+2.30, +11.11%) | 22.60 (+1.80, +8.65%) | 27.20 (+0.10, +0.37%) | 28.30 (+2.50, +9.69%) | 35.50 (+2.80, +8.56%) | 34.10 (+4.00, +13.29%) | 25.20 (+1.80, +7.69%) | 20.50 (+1.50, +7.89%) | 26.80 (+1.40, +5.51%) | 24.40 (+1.70, +7.49%) | 22.40 (+1.50, +7.18%) | 20.70 (+1.90, +10.11%) | 48.40 (+16.70, +52.68%) | 31.20 (+2.50, +8.71%) | 24.70 (+1.30, +5.56%) | 34.40 (+4.20, +13.91%) | -| microsoft | 22.00 (+2.50, +12.82%) | 25.30 (+4.60, +22.22%) | 25.00 (+4.20, +20.19%) | 27.20 (+0.10, +0.37%) | 29.90 (+4.10, +15.89%) | 34.10 (+1.40, +4.28%) | 33.50 (+3.40, +11.30%) | 27.70 (+4.30, +18.38%) | 22.60 (+3.60, +18.95%) | 27.40 (+2.00, +7.87%) | 24.90 (+2.20, +9.69%) | 24.30 (+3.40, +16.27%) | 22.90 (+4.10, +21.81%) | 42.10 (+10.40, +32.81%) | 31.90 (+3.20, +11.15%) | 25.60 (+2.20, +9.40%) | 34.00 (+3.80, +12.58%) | +| Translator/Dataset | wmt08 | mtedx_test | wmt10 | flores-dev | flores-test | wmt12 | wmt11 | wmt13 | wmt09 | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bergamot | 27.30 | 36.80 | 35.80 | 27.50 | 26.80 | 38.30 | 34.30 | 35.20 | 29.40 | +| google | 28.30 (+1.00, +3.66%) | 35.40 (-1.40, -3.80%) | 37.00 (+1.20, +3.35%) | 30.50 (+3.00, +10.91%) | 30.30 (+3.50, +13.06%) | 38.80 (+0.50, +1.31%) | 35.20 (+0.90, +2.62%) | 35.70 (+0.50, +1.42%) | 31.60 (+2.20, +7.48%) | +| microsoft | 26.80 (-0.50, -1.83%) | 37.60 (+0.80, +2.17%) | 35.40 (-0.40, -1.12%) | 30.30 (+2.80, +10.18%) | 29.90 (+3.10, +11.57%) | 37.80 (-0.50, -1.31%) | 33.70 (-0.60, -1.75%) | 35.30 (+0.10, +0.28%) | 29.60 (+0.20, +0.68%) | -![Results](img/en-cs-bleu.png) +![Results](img/es-en-bleu.png) --- -## 
cs-en +## en-de -| Translator/Dataset | flores-dev | wmt10 | wmt08 | wmt18 | wmt20 | wmt12 | wmt15 | wmt09 | wmt17 | wmt14 | wmt11 | wmt16 | wmt22 | wmt13 | flores-test | wmt21 | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 35.30 | 28.20 | 24.50 | 31.30 | 27.00 | 26.50 | 32.00 | 27.60 | 30.20 | 35.00 | 28.10 | 33.40 | 44.50 | 30.30 | 35.30 | 27.90 | -| google | 38.60 (+3.30, +9.35%) | 30.50 (+2.30, +8.16%) | 26.30 (+1.80, +7.35%) | 32.10 (+0.80, +2.56%) | 28.40 (+1.40, +5.19%) | 28.60 (+2.10, +7.92%) | 33.60 (+1.60, +5.00%) | 29.90 (+2.30, +8.33%) | 31.20 (+1.00, +3.31%) | 38.00 (+3.00, +8.57%) | 30.20 (+2.10, +7.47%) | 34.80 (+1.40, +4.19%) | 49.40 (+4.90, +11.01%) | 32.40 (+2.10, +6.93%) | 39.00 (+3.70, +10.48%) | 30.70 (+2.80, +10.04%) | -| microsoft | 40.00 (+4.70, +13.31%) | 30.70 (+2.50, +8.87%) | 26.40 (+1.90, +7.76%) | 34.30 (+3.00, +9.58%) | 27.60 (+0.60, +2.22%) | 29.70 (+3.20, +12.08%) | 34.70 (+2.70, +8.44%) | 29.60 (+2.00, +7.25%) | 33.60 (+3.40, +11.26%) | 39.90 (+4.90, +14.00%) | 30.90 (+2.80, +9.96%) | 38.30 (+4.90, +14.67%) | 54.90 (+10.40, +23.37%) | 33.40 (+3.10, +10.23%) | 40.30 (+5.00, +14.16%) | 30.50 (+2.60, +9.32%) | +| Translator/Dataset | flores-test | wmt22 | iwslt17 | wmt20 | wmt08 | wmt12 | wmt19 | wmt10 | wmt11 | flores-dev | wmt09 | wmt13 | wmt17 | wmt16 | wmt18 | wmt14 | wmt21 | wmt15 | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bergamot | 38.80 | 32.10 | 26.70 | 35.70 | 23.60 | 24.30 | 44.50 | 26.80 | 23.40 | 38.80 | 23.00 | 28.20 | 32.00 | 40.00 | 47.70 | 29.80 | 27.70 | 33.10 | +| google | 42.30 (+3.50, +9.02%) | 38.30 (+6.20, +19.31%) | 28.90 (+2.20, +8.24%) | 36.50 (+0.80, +2.24%) | 23.70 (+0.10, +0.42%) | 24.70 (+0.40, +1.65%) | 43.50 (-1.00, -2.25%) | 26.50 (-0.30, -1.12%) | 24.10 (+0.70, +2.99%) | 43.70 (+4.90, +12.63%) | 23.60 (+0.60, +2.61%) | 28.80 (+0.60, +2.13%) | 31.50 
(-0.50, -1.56%) | 38.60 (-1.40, -3.50%) | 47.80 (+0.10, +0.21%) | 30.90 (+1.10, +3.69%) | 29.70 (+2.00, +7.22%) | 33.70 (+0.60, +1.81%) | +| microsoft | 42.90 (+4.10, +10.57%) | 37.30 (+5.20, +16.20%) | 28.20 (+1.50, +5.62%) | 36.10 (+0.40, +1.12%) | 24.00 (+0.40, +1.69%) | 25.30 (+1.00, +4.12%) | 43.80 (-0.70, -1.57%) | 27.20 (+0.40, +1.49%) | 23.70 (+0.30, +1.28%) | 44.00 (+5.20, +13.40%) | 23.90 (+0.90, +3.91%) | 28.80 (+0.60, +2.13%) | 33.10 (+1.10, +3.44%) | 40.50 (+0.50, +1.25%) | 48.70 (+1.00, +2.10%) | 32.20 (+2.40, +8.05%) | 29.80 (+2.10, +7.58%) | 34.30 (+1.20, +3.63%) | -![Results](img/cs-en-bleu.png) +![Results](img/en-de-bleu.png) --- \ No newline at end of file diff --git a/evaluation/prod/comet-results.md b/evaluation/prod/comet-results.md index fbe7247b..85f6f211 100644 --- a/evaluation/prod/comet-results.md +++ b/evaluation/prod/comet-results.md @@ -6,6 +6,18 @@ Three models with different human judgments have been trained to showcase the fr The models developed by COMET have achieved new state-of-the-art performance on the WMT 2019 Metrics shared task, demonstrating robustness to high-performing systems. +## Interpreting Scores + +When using COMET to evaluate machine translation, it's important to understand how to interpret the scores it produces. + +In general, COMET models are trained to predict quality scores for translations. These scores are typically normalized using a z-score transformation to account for individual differences among annotators. While the raw score itself does not have a direct interpretation, it is useful for ranking translations and systems according to their quality. + +However, for the latest COMET models such as Unbabel/wmt22-comet-da, the COMET authors introduced a new training approach that scales the scores between 0 and 1. This makes it easier to interpret the scores: a score close to 1 indicates a high-quality translation, while a score close to 0 indicates a translation that is no better than random chance.
+ +It's worth noting that when using COMET to compare the performance of two different translation systems, it's important to run the `comet-compare` command to obtain statistical significance measures. This command compares the output of two systems using a statistical hypothesis test, providing an estimate of the probability that the observed difference in scores between the systems is due to chance. This is an important step for verifying whether any differences in scores between systems are statistically significant. + +Overall, the added interpretability of scores in the latest COMET models, combined with the ability to assess statistical significance between systems using `comet-compare`, makes COMET a valuable tool for evaluating machine translation. + Source: https://aclanthology.org/2020.emnlp-main.213.pdf Tool: https://github.com/Unbabel/COMET @@ -36,150 +48,117 @@ We also compare the systems using the `comet-compare` tool that calculates the s ## avg -| Translator/Dataset | en-pt | pt-en | en-bg | nb-en | it-en | en-et | fr-en | en-de | es-en | en-it | en-pl | en-es | pl-en | en-fr | et-en | bg-en | de-en | en-cs | cs-en | +| Translator/Dataset | cs-en | en-et | en-it | fr-en | en-pt | et-en | nb-en | bg-en | en-es | en-bg | en-cs | de-en | it-en | pl-en | en-fr | en-pl | pt-en | es-en | en-de | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 0.86 | 0.75 | 0.80 | 0.64 | 0.62 | 0.86 | 0.59 | 0.51 | 0.55 | 0.68 | 0.66 | 0.61 | 0.56 | 0.56 | 0.60 | 0.68 | 0.49 | 0.61 | 0.48 | -| google | 0.93 (+0.06, +7.42%) | 0.80 (+0.05, +6.85%) | 0.88 (+0.09, +10.75%) | 0.81 (+0.17, +26.28%) | 0.68 (+0.06, +10.18%) | 1.07 (+0.21, +24.57%) | 0.65 (+0.06, +10.69%) | 0.60 (+0.09, +17.91%) | 0.62 (+0.07, +11.99%) | 0.76 (+0.08, +12.35%) | 0.82 (+0.17, +25.52%) | 0.69 (+0.09, +14.08%) | 0.66 (+0.09, +16.81%) | 0.67 (+0.11, +20.19%) | 0.77 (+0.17, +28.18%) | 0.75 (+0.07, +10.49%) | 0.61 (+0.11,
+22.99%) | 0.83 (+0.22, +35.44%) | 0.61 (+0.13, +26.78%) | -| microsoft | 0.91 (+0.05, +5.54%) | 0.79 (+0.05, +6.20%) | 0.86 (+0.07, +8.17%) | 0.81 (+0.16, +25.57%) | 0.68 (+0.06, +10.18%) | 1.05 (+0.19, +22.64%) | 0.67 (+0.08, +13.12%) | 0.61 (+0.10, +20.20%) | 0.63 (+0.08, +13.82%) | 0.76 (+0.09, +12.87%) | 0.80 (+0.14, +21.56%) | 0.70 (+0.09, +14.92%) | 0.66 (+0.10, +17.36%) | 0.69 (+0.13, +23.23%) | 0.74 (+0.14, +23.46%) | 0.73 (+0.05, +7.05%) | 0.63 (+0.13, +26.85%) | 0.84 (+0.23, +37.82%) | 0.62 (+0.14, +27.97%) | +| bergamot | 0.48 | 0.86 | 0.68 | 0.59 | 0.86 | 0.60 | 0.64 | 0.68 | 0.61 | 0.80 | 0.61 | 0.49 | 0.62 | 0.56 | 0.56 | 0.66 | 0.75 | 0.55 | 0.51 | +| google | 0.61 (+0.13, +26.78%) | 1.07 (+0.21, +24.57%) | 0.76 (+0.08, +12.35%) | 0.65 (+0.06, +10.69%) | 0.93 (+0.06, +7.42%) | 0.77 (+0.17, +28.18%) | 0.81 (+0.17, +26.28%) | 0.75 (+0.07, +10.49%) | 0.69 (+0.09, +14.08%) | 0.88 (+0.09, +10.75%) | 0.83 (+0.22, +35.44%) | 0.61 (+0.11, +22.99%) | 0.68 (+0.06, +10.18%) | 0.66 (+0.09, +16.81%) | 0.67 (+0.11, +20.19%) | 0.82 (+0.17, +25.52%) | 0.80 (+0.05, +6.85%) | 0.62 (+0.07, +11.99%) | 0.60 (+0.09, +17.91%) | +| microsoft | 0.62 (+0.14, +27.97%) | 1.05 (+0.19, +22.64%) | 0.76 (+0.09, +12.87%) | 0.67 (+0.08, +13.12%) | 0.91 (+0.05, +5.54%) | 0.74 (+0.14, +23.46%) | 0.81 (+0.16, +25.57%) | 0.73 (+0.05, +7.05%) | 0.70 (+0.09, +14.92%) | 0.86 (+0.07, +8.17%) | 0.84 (+0.23, +37.82%) | 0.63 (+0.13, +26.85%) | 0.68 (+0.06, +10.18%) | 0.66 (+0.10, +17.36%) | 0.69 (+0.13, +23.23%) | 0.80 (+0.14, +21.56%) | 0.79 (+0.05, +6.20%) | 0.63 (+0.08, +13.82%) | 0.61 (+0.10, +20.20%) | ![Results](img/avg-comet.png) --- -## en-pt +## cs-en -| Translator/Dataset | flores-dev | flores-test | -| --- | --- | --- | -| bergamot | 0.87 | 0.86 | -| google | 0.93 (+0.06, +7.41%) | 0.92 (+0.06, +7.44%) | -| microsoft | 0.91 (+0.04, +4.89%) | 0.91 (+0.05, +6.19%) | +| Translator/Dataset | wmt17 | wmt22 | wmt08 | flores-test | wmt20 | wmt15 | wmt18 | wmt09 | wmt14 | wmt16 | wmt11 | 
wmt12 | wmt10 | wmt21 | flores-dev | wmt13 | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bergamot | 0.45 | 0.55 | 0.37 | 0.65 | 0.35 | 0.49 | 0.48 | 0.43 | 0.57 | 0.48 | 0.41 | 0.42 | 0.44 | 0.47 | 0.65 | 0.50 | +| google | 0.58 (+0.13, +28.12%) | 0.70 (+0.15, +27.55%) | 0.52 (+0.15, +41.56%) | 0.77 (+0.11, +17.45%) | 0.51 (+0.16, +46.19%) | 0.62 (+0.13, +26.40%) | 0.59 (+0.11, +23.27%) | 0.56 (+0.13, +30.83%) | 0.70 (+0.13, +22.40%) | 0.62 (+0.14, +28.50%) | 0.53 (+0.12, +29.26%) | 0.54 (+0.12, +28.77%) | 0.57 (+0.13, +29.95%) | 0.60 (+0.13, +27.51%) | 0.76 (+0.10, +15.71%) | 0.61 (+0.12, +23.25%) | +| microsoft | 0.57 (+0.13, +28.01%) | 0.72 (+0.17, +30.46%) | 0.52 (+0.15, +41.59%) | 0.76 (+0.11, +16.30%) | 0.50 (+0.16, +44.44%) | 0.63 (+0.14, +27.93%) | 0.60 (+0.12, +25.68%) | 0.57 (+0.14, +32.48%) | 0.72 (+0.14, +24.79%) | 0.62 (+0.14, +28.46%) | 0.56 (+0.15, +35.20%) | 0.56 (+0.14, +33.29%) | 0.58 (+0.14, +30.79%) | 0.59 (+0.12, +24.74%) | 0.75 (+0.10, +14.95%) | 0.63 (+0.14, +27.56%) | -![Results](img/en-pt-comet.png) +![Results](img/cs-en-comet.png) ### Comparisons between systems *If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [flores-dev.en-pt](en-pt/flores-dev.en-pt.cometcompare) -- flores-dev.microsoft.pt outperforms flores-dev.bergamot.pt. -- flores-dev.google.pt outperforms flores-dev.bergamot.pt. -- flores-dev.google.pt outperforms flores-dev.microsoft.pt. - -#### [flores-test.en-pt](en-pt/flores-test.en-pt.cometcompare) -- flores-test.microsoft.pt outperforms flores-test.bergamot.pt. -- flores-test.google.pt outperforms flores-test.bergamot.pt. -- flores-test.google.pt outperforms flores-test.microsoft.pt. - ---- - -## pt-en +#### [wmt17.cs-en](cs-en/wmt17.cs-en.cometcompare) +- wmt17.microsoft.en outperforms wmt17.bergamot.en. +- wmt17.google.en outperforms wmt17.bergamot.en. 
-| Translator/Dataset | flores-dev | flores-test | mtedx_test | -| --- | --- | --- | --- | -| bergamot | 0.81 | 0.80 | 0.63 | -| google | 0.85 (+0.04, +4.40%) | 0.85 (+0.05, +6.39%) | 0.70 (+0.07, +10.56%) | -| microsoft | 0.84 (+0.03, +4.17%) | 0.85 (+0.05, +6.39%) | 0.69 (+0.05, +8.54%) | +#### [wmt22.cs-en](cs-en/wmt22.cs-en.cometcompare) +- wmt22.microsoft.en outperforms wmt22.bergamot.en. +- wmt22.google.en outperforms wmt22.bergamot.en. +- wmt22.microsoft.en outperforms wmt22.google.en. -![Results](img/pt-en-comet.png) -### Comparisons between systems -*If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [flores-dev.pt-en](pt-en/flores-dev.pt-en.cometcompare) -- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. -- flores-dev.google.en outperforms flores-dev.bergamot.en. +#### [wmt08.cs-en](cs-en/wmt08.cs-en.cometcompare) +- wmt08.microsoft.en outperforms wmt08.bergamot.en. +- wmt08.google.en outperforms wmt08.bergamot.en. -#### [flores-test.pt-en](pt-en/flores-test.pt-en.cometcompare) +#### [flores-test.cs-en](cs-en/flores-test.cs-en.cometcompare) - flores-test.microsoft.en outperforms flores-test.bergamot.en. - flores-test.google.en outperforms flores-test.bergamot.en. +- flores-test.google.en outperforms flores-test.microsoft.en. -#### [mtedx_test.pt-en](pt-en/mtedx_test.pt-en.cometcompare) -- mtedx_test.microsoft.en outperforms mtedx_test.bergamot.en. -- mtedx_test.google.en outperforms mtedx_test.bergamot.en. -- mtedx_test.google.en outperforms mtedx_test.microsoft.en. - ---- - -## en-bg - -| Translator/Dataset | flores-test | flores-dev | -| --- | --- | --- | -| bergamot | 0.80 | 0.79 | -| google | 0.89 (+0.09, +10.78%) | 0.88 (+0.09, +10.71%) | -| microsoft | 0.87 (+0.07, +8.38%) | 0.86 (+0.06, +7.95%) | - -![Results](img/en-bg-comet.png) -### Comparisons between systems -*If a comparison is omitted, the systems have equal averages (tie). 
Click on the dataset for a complete report* -#### [flores-test.en-bg](en-bg/flores-test.en-bg.cometcompare) -- flores-test.microsoft.bg outperforms flores-test.bergamot.bg. -- flores-test.google.bg outperforms flores-test.bergamot.bg. -- flores-test.google.bg outperforms flores-test.microsoft.bg. +#### [wmt20.cs-en](cs-en/wmt20.cs-en.cometcompare) +- wmt20.microsoft.en outperforms wmt20.bergamot.en. +- wmt20.google.en outperforms wmt20.bergamot.en. -#### [flores-dev.en-bg](en-bg/flores-dev.en-bg.cometcompare) -- flores-dev.microsoft.bg outperforms flores-dev.bergamot.bg. -- flores-dev.google.bg outperforms flores-dev.bergamot.bg. -- flores-dev.google.bg outperforms flores-dev.microsoft.bg. +#### [wmt15.cs-en](cs-en/wmt15.cs-en.cometcompare) +- wmt15.microsoft.en outperforms wmt15.bergamot.en. +- wmt15.google.en outperforms wmt15.bergamot.en. ---- +#### [wmt18.cs-en](cs-en/wmt18.cs-en.cometcompare) +- wmt18.microsoft.en outperforms wmt18.bergamot.en. +- wmt18.google.en outperforms wmt18.bergamot.en. +- wmt18.microsoft.en outperforms wmt18.google.en. -## nb-en +#### [wmt09.cs-en](cs-en/wmt09.cs-en.cometcompare) +- wmt09.microsoft.en outperforms wmt09.bergamot.en. +- wmt09.google.en outperforms wmt09.bergamot.en. +- wmt09.microsoft.en outperforms wmt09.google.en. -| Translator/Dataset | flores-dev | flores-test | -| --- | --- | --- | -| bergamot | 0.64 | 0.65 | -| google | 0.81 (+0.18, +27.71%) | 0.81 (+0.16, +24.86%) | -| microsoft | 0.81 (+0.17, +26.75%) | 0.80 (+0.16, +24.41%) | +#### [wmt14.cs-en](cs-en/wmt14.cs-en.cometcompare) +- wmt14.microsoft.en outperforms wmt14.bergamot.en. +- wmt14.google.en outperforms wmt14.bergamot.en. +- wmt14.microsoft.en outperforms wmt14.google.en. -![Results](img/nb-en-comet.png) -### Comparisons between systems -*If a comparison is omitted, the systems have equal averages (tie). 
Click on the dataset for a complete report* -#### [flores-dev.nb-en](nb-en/flores-dev.nb-en.cometcompare) -- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. -- flores-dev.google.en outperforms flores-dev.bergamot.en. -- flores-dev.google.en outperforms flores-dev.microsoft.en. +#### [wmt16.cs-en](cs-en/wmt16.cs-en.cometcompare) +- wmt16.microsoft.en outperforms wmt16.bergamot.en. +- wmt16.google.en outperforms wmt16.bergamot.en. -#### [flores-test.nb-en](nb-en/flores-test.nb-en.cometcompare) -- flores-test.microsoft.en outperforms flores-test.bergamot.en. -- flores-test.google.en outperforms flores-test.bergamot.en. +#### [wmt11.cs-en](cs-en/wmt11.cs-en.cometcompare) +- wmt11.microsoft.en outperforms wmt11.bergamot.en. +- wmt11.google.en outperforms wmt11.bergamot.en. +- wmt11.microsoft.en outperforms wmt11.google.en. ---- +#### [wmt12.cs-en](cs-en/wmt12.cs-en.cometcompare) +- wmt12.microsoft.en outperforms wmt12.bergamot.en. +- wmt12.google.en outperforms wmt12.bergamot.en. +- wmt12.microsoft.en outperforms wmt12.google.en. -## it-en +#### [wmt10.cs-en](cs-en/wmt10.cs-en.cometcompare) +- wmt10.microsoft.en outperforms wmt10.bergamot.en. +- wmt10.google.en outperforms wmt10.bergamot.en. -| Translator/Dataset | flores-dev | flores-test | mtedx_test | wmt09 | -| --- | --- | --- | --- | --- | -| bergamot | 0.72 | 0.70 | 0.55 | 0.53 | -| google | 0.76 (+0.04, +6.00%) | 0.76 (+0.06, +9.33%) | 0.62 (+0.07, +12.79%) | 0.60 (+0.08, +14.29%) | -| microsoft | 0.76 (+0.04, +6.16%) | 0.76 (+0.06, +8.80%) | 0.61 (+0.06, +11.31%) | 0.61 (+0.09, +16.28%) | +#### [wmt21.cs-en](cs-en/wmt21.cs-en.cometcompare) +- wmt21.microsoft.en outperforms wmt21.bergamot.en. +- wmt21.google.en outperforms wmt21.bergamot.en. +- wmt21.google.en outperforms wmt21.microsoft.en. -![Results](img/it-en-comet.png) -### Comparisons between systems -*If a comparison is omitted, the systems have equal averages (tie). 
Click on the dataset for a complete report* -#### [flores-dev.it-en](it-en/flores-dev.it-en.cometcompare) +#### [flores-dev.cs-en](cs-en/flores-dev.cs-en.cometcompare) - flores-dev.microsoft.en outperforms flores-dev.bergamot.en. - flores-dev.google.en outperforms flores-dev.bergamot.en. -#### [flores-test.it-en](it-en/flores-test.it-en.cometcompare) -- flores-test.microsoft.en outperforms flores-test.bergamot.en. -- flores-test.google.en outperforms flores-test.bergamot.en. - -#### [mtedx_test.it-en](it-en/mtedx_test.it-en.cometcompare) -- mtedx_test.microsoft.en outperforms mtedx_test.bergamot.en. -- mtedx_test.google.en outperforms mtedx_test.bergamot.en. - -#### [wmt09.it-en](it-en/wmt09.it-en.cometcompare) -- wmt09.microsoft.en outperforms wmt09.bergamot.en. -- wmt09.google.en outperforms wmt09.bergamot.en. -- wmt09.microsoft.en outperforms wmt09.google.en. +#### [wmt13.cs-en](cs-en/wmt13.cs-en.cometcompare) +- wmt13.microsoft.en outperforms wmt13.bergamot.en. +- wmt13.google.en outperforms wmt13.bergamot.en. +- wmt13.microsoft.en outperforms wmt13.google.en. --- ## en-et -| Translator/Dataset | wmt18 | flores-test | flores-dev | +| Translator/Dataset | flores-dev | wmt18 | flores-test | | --- | --- | --- | --- | -| bergamot | 0.83 | 0.87 | 0.86 | -| google | 1.03 (+0.20, +23.75%) | 1.07 (+0.20, +23.56%) | 1.09 (+0.23, +26.39%) | -| microsoft | 1.02 (+0.18, +22.12%) | 1.07 (+0.20, +23.04%) | 1.06 (+0.20, +22.76%) | +| bergamot | 0.86 | 0.83 | 0.87 | +| google | 1.09 (+0.23, +26.39%) | 1.03 (+0.20, +23.75%) | 1.07 (+0.20, +23.56%) | +| microsoft | 1.06 (+0.20, +22.76%) | 1.02 (+0.18, +22.12%) | 1.07 (+0.20, +23.04%) | ![Results](img/en-et-comet.png) ### Comparisons between systems *If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* +#### [flores-dev.en-et](en-et/flores-dev.en-et.cometcompare) +- flores-dev.microsoft.et outperforms flores-dev.bergamot.et. 
+- flores-dev.google.et outperforms flores-dev.bergamot.et. +- flores-dev.google.et outperforms flores-dev.microsoft.et. + #### [wmt18.en-et](en-et/wmt18.en-et.cometcompare) - wmt18.microsoft.et outperforms wmt18.bergamot.et. - wmt18.google.et outperforms wmt18.bergamot.et. @@ -189,38 +168,71 @@ We also compare the systems using the `comet-compare` tool that calculates the s - flores-test.microsoft.et outperforms flores-test.bergamot.et. - flores-test.google.et outperforms flores-test.bergamot.et. -#### [flores-dev.en-et](en-et/flores-dev.en-et.cometcompare) -- flores-dev.microsoft.et outperforms flores-dev.bergamot.et. -- flores-dev.google.et outperforms flores-dev.bergamot.et. -- flores-dev.google.et outperforms flores-dev.microsoft.et. +--- + +## en-it + +| Translator/Dataset | flores-test | wmt09 | flores-dev | +| --- | --- | --- | --- | +| bergamot | 0.68 | 0.64 | 0.70 | +| google | 0.78 (+0.10, +14.43%) | 0.72 (+0.07, +11.31%) | 0.78 (+0.08, +11.28%) | +| microsoft | 0.78 (+0.10, +14.07%) | 0.73 (+0.09, +13.86%) | 0.78 (+0.08, +10.79%) | + +![Results](img/en-it-comet.png) +### Comparisons between systems +*If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* +#### [flores-test.en-it](en-it/flores-test.en-it.cometcompare) +- flores-test.microsoft.it outperforms flores-test.bergamot.it. +- flores-test.google.it outperforms flores-test.bergamot.it. + +#### [wmt09.en-it](en-it/wmt09.en-it.cometcompare) +- wmt09.microsoft.it outperforms wmt09.bergamot.it. +- wmt09.google.it outperforms wmt09.bergamot.it. +- wmt09.microsoft.it outperforms wmt09.google.it. + +#### [flores-dev.en-it](en-it/flores-dev.en-it.cometcompare) +- flores-dev.microsoft.it outperforms flores-dev.bergamot.it. +- flores-dev.google.it outperforms flores-dev.bergamot.it. 
--- ## fr-en -| Translator/Dataset | wmt13 | iwslt17 | wmt10 | wmt14 | wmt11 | flores-dev | flores-test | wmt15 | wmt08 | mtedx_test | wmt12 | wmt09 | +| Translator/Dataset | wmt08 | flores-test | mtedx_test | wmt15 | iwslt17 | wmt09 | wmt14 | wmt11 | wmt12 | wmt10 | flores-dev | wmt13 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 0.58 | 0.67 | 0.53 | 0.64 | 0.51 | 0.77 | 0.77 | 0.56 | 0.44 | 0.62 | 0.50 | 0.50 | -| google | 0.63 (+0.04, +7.32%) | 0.71 (+0.04, +5.84%) | 0.61 (+0.08, +15.85%) | 0.71 (+0.06, +9.51%) | 0.57 (+0.06, +11.60%) | 0.82 (+0.05, +6.94%) | 0.82 (+0.05, +6.04%) | 0.66 (+0.10, +17.10%) | 0.52 (+0.09, +19.65%) | 0.69 (+0.07, +11.22%) | 0.55 (+0.05, +10.25%) | 0.57 (+0.07, +13.97%) | -| microsoft | 0.65 (+0.07, +11.46%) | 0.73 (+0.06, +8.36%) | 0.63 (+0.10, +18.76%) | 0.72 (+0.08, +12.36%) | 0.60 (+0.09, +17.14%) | 0.82 (+0.05, +7.03%) | 0.82 (+0.05, +6.45%) | 0.66 (+0.10, +17.71%) | 0.53 (+0.10, +22.35%) | 0.69 (+0.07, +11.77%) | 0.58 (+0.08, +16.77%) | 0.58 (+0.08, +16.90%) | +| bergamot | 0.44 | 0.77 | 0.62 | 0.56 | 0.67 | 0.50 | 0.64 | 0.51 | 0.50 | 0.53 | 0.77 | 0.58 | +| google | 0.52 (+0.09, +19.65%) | 0.82 (+0.05, +6.04%) | 0.69 (+0.07, +11.22%) | 0.66 (+0.10, +17.10%) | 0.71 (+0.04, +5.84%) | 0.57 (+0.07, +13.97%) | 0.71 (+0.06, +9.51%) | 0.57 (+0.06, +11.60%) | 0.55 (+0.05, +10.25%) | 0.61 (+0.08, +15.85%) | 0.82 (+0.05, +6.94%) | 0.63 (+0.04, +7.32%) | +| microsoft | 0.53 (+0.10, +22.35%) | 0.82 (+0.05, +6.45%) | 0.69 (+0.07, +11.77%) | 0.66 (+0.10, +17.71%) | 0.73 (+0.06, +8.36%) | 0.58 (+0.08, +16.90%) | 0.72 (+0.08, +12.36%) | 0.60 (+0.09, +17.14%) | 0.58 (+0.08, +16.77%) | 0.63 (+0.10, +18.76%) | 0.82 (+0.05, +7.03%) | 0.65 (+0.07, +11.46%) | ![Results](img/fr-en-comet.png) ### Comparisons between systems *If a comparison is omitted, the systems have equal averages (tie). 
Click on the dataset for a complete report* -#### [wmt13.fr-en](fr-en/wmt13.fr-en.cometcompare) -- wmt13.microsoft.en outperforms wmt13.bergamot.en. -- wmt13.google.en outperforms wmt13.bergamot.en. -- wmt13.microsoft.en outperforms wmt13.google.en. +#### [wmt08.fr-en](fr-en/wmt08.fr-en.cometcompare) +- wmt08.microsoft.en outperforms wmt08.bergamot.en. +- wmt08.google.en outperforms wmt08.bergamot.en. +- wmt08.microsoft.en outperforms wmt08.google.en. + +#### [flores-test.fr-en](fr-en/flores-test.fr-en.cometcompare) +- flores-test.microsoft.en outperforms flores-test.bergamot.en. +- flores-test.google.en outperforms flores-test.bergamot.en. + +#### [mtedx_test.fr-en](fr-en/mtedx_test.fr-en.cometcompare) +- mtedx_test.microsoft.en outperforms mtedx_test.bergamot.en. +- mtedx_test.google.en outperforms mtedx_test.bergamot.en. + +#### [wmt15.fr-en](fr-en/wmt15.fr-en.cometcompare) +- wmt15.microsoft.en outperforms wmt15.bergamot.en. +- wmt15.google.en outperforms wmt15.bergamot.en. #### [iwslt17.fr-en](fr-en/iwslt17.fr-en.cometcompare) - iwslt17.microsoft.en outperforms iwslt17.bergamot.en. - iwslt17.google.en outperforms iwslt17.bergamot.en. - iwslt17.microsoft.en outperforms iwslt17.google.en. -#### [wmt10.fr-en](fr-en/wmt10.fr-en.cometcompare) -- wmt10.microsoft.en outperforms wmt10.bergamot.en. -- wmt10.google.en outperforms wmt10.bergamot.en. -- wmt10.microsoft.en outperforms wmt10.google.en. +#### [wmt09.fr-en](fr-en/wmt09.fr-en.cometcompare) +- wmt09.microsoft.en outperforms wmt09.bergamot.en. +- wmt09.google.en outperforms wmt09.bergamot.en. +- wmt09.microsoft.en outperforms wmt09.google.en. #### [wmt14.fr-en](fr-en/wmt14.fr-en.cometcompare) - wmt14.microsoft.en outperforms wmt14.bergamot.en. @@ -232,718 +244,718 @@ We also compare the systems using the `comet-compare` tool that calculates the s - wmt11.google.en outperforms wmt11.bergamot.en. - wmt11.microsoft.en outperforms wmt11.google.en. 
+#### [wmt12.fr-en](fr-en/wmt12.fr-en.cometcompare) +- wmt12.microsoft.en outperforms wmt12.bergamot.en. +- wmt12.google.en outperforms wmt12.bergamot.en. +- wmt12.microsoft.en outperforms wmt12.google.en. + +#### [wmt10.fr-en](fr-en/wmt10.fr-en.cometcompare) +- wmt10.microsoft.en outperforms wmt10.bergamot.en. +- wmt10.google.en outperforms wmt10.bergamot.en. +- wmt10.microsoft.en outperforms wmt10.google.en. + #### [flores-dev.fr-en](fr-en/flores-dev.fr-en.cometcompare) - flores-dev.microsoft.en outperforms flores-dev.bergamot.en. - flores-dev.google.en outperforms flores-dev.bergamot.en. -#### [flores-test.fr-en](fr-en/flores-test.fr-en.cometcompare) -- flores-test.microsoft.en outperforms flores-test.bergamot.en. -- flores-test.google.en outperforms flores-test.bergamot.en. +#### [wmt13.fr-en](fr-en/wmt13.fr-en.cometcompare) +- wmt13.microsoft.en outperforms wmt13.bergamot.en. +- wmt13.google.en outperforms wmt13.bergamot.en. +- wmt13.microsoft.en outperforms wmt13.google.en. -#### [wmt15.fr-en](fr-en/wmt15.fr-en.cometcompare) -- wmt15.microsoft.en outperforms wmt15.bergamot.en. -- wmt15.google.en outperforms wmt15.bergamot.en. +--- -#### [wmt08.fr-en](fr-en/wmt08.fr-en.cometcompare) -- wmt08.microsoft.en outperforms wmt08.bergamot.en. -- wmt08.google.en outperforms wmt08.bergamot.en. -- wmt08.microsoft.en outperforms wmt08.google.en. +## en-pt -#### [mtedx_test.fr-en](fr-en/mtedx_test.fr-en.cometcompare) -- mtedx_test.microsoft.en outperforms mtedx_test.bergamot.en. -- mtedx_test.google.en outperforms mtedx_test.bergamot.en. +| Translator/Dataset | flores-test | flores-dev | +| --- | --- | --- | +| bergamot | 0.86 | 0.87 | +| google | 0.92 (+0.06, +7.44%) | 0.93 (+0.06, +7.41%) | +| microsoft | 0.91 (+0.05, +6.19%) | 0.91 (+0.04, +4.89%) | -#### [wmt12.fr-en](fr-en/wmt12.fr-en.cometcompare) -- wmt12.microsoft.en outperforms wmt12.bergamot.en. -- wmt12.google.en outperforms wmt12.bergamot.en. -- wmt12.microsoft.en outperforms wmt12.google.en. 
+![Results](img/en-pt-comet.png) +### Comparisons between systems +*If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* +#### [flores-test.en-pt](en-pt/flores-test.en-pt.cometcompare) +- flores-test.microsoft.pt outperforms flores-test.bergamot.pt. +- flores-test.google.pt outperforms flores-test.bergamot.pt. +- flores-test.google.pt outperforms flores-test.microsoft.pt. -#### [wmt09.fr-en](fr-en/wmt09.fr-en.cometcompare) -- wmt09.microsoft.en outperforms wmt09.bergamot.en. -- wmt09.google.en outperforms wmt09.bergamot.en. -- wmt09.microsoft.en outperforms wmt09.google.en. +#### [flores-dev.en-pt](en-pt/flores-dev.en-pt.cometcompare) +- flores-dev.microsoft.pt outperforms flores-dev.bergamot.pt. +- flores-dev.google.pt outperforms flores-dev.bergamot.pt. +- flores-dev.google.pt outperforms flores-dev.microsoft.pt. --- -## en-de +## et-en -| Translator/Dataset | wmt12 | wmt16 | wmt15 | flores-dev | wmt21 | wmt17 | wmt10 | flores-test | wmt08 | wmt18 | wmt22 | wmt20 | wmt19 | wmt09 | wmt11 | wmt13 | wmt14 | iwslt17 | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 0.43 | 0.60 | 0.54 | 0.60 | 0.46 | 0.55 | 0.49 | 0.62 | 0.42 | 0.65 | 0.46 | 0.50 | 0.55 | 0.43 | 0.41 | 0.51 | 0.55 | 0.43 | -| google | 0.52 (+0.08, +19.17%) | 0.67 (+0.08, +12.64%) | 0.64 (+0.10, +17.92%) | 0.70 (+0.09, +15.43%) | 0.52 (+0.06, +14.16%) | 0.64 (+0.09, +16.72%) | 0.57 (+0.07, +15.28%) | 0.70 (+0.09, +14.23%) | 0.52 (+0.10, +24.24%) | 0.72 (+0.07, +10.80%) | 0.62 (+0.16, +34.59%) | 0.60 (+0.10, +19.53%) | 0.62 (+0.07, +13.58%) | 0.53 (+0.10, +22.21%) | 0.50 (+0.10, +24.37%) | 0.59 (+0.08, +15.78%) | 0.66 (+0.10, +18.81%) | 0.52 (+0.10, +22.49%) | -| microsoft | 0.54 (+0.10, +23.89%) | 0.69 (+0.09, +15.21%) | 0.64 (+0.10, +18.87%) | 0.70 (+0.09, +15.61%) | 0.55 (+0.09, +19.86%) | 0.64 (+0.10, +18.33%) | 0.59 (+0.10, +19.38%) | 0.71 (+0.09, +14.73%) | 
0.54 (+0.12, +27.31%) | 0.73 (+0.08, +12.41%) | 0.63 (+0.17, +36.02%) | 0.61 (+0.11, +22.10%) | 0.64 (+0.09, +16.54%) | 0.54 (+0.11, +25.27%) | 0.52 (+0.12, +29.30%) | 0.60 (+0.09, +18.10%) | 0.66 (+0.11, +19.77%) | 0.52 (+0.10, +22.70%) | +| Translator/Dataset | flores-test | wmt18 | flores-dev | +| --- | --- | --- | --- | +| bergamot | 0.64 | 0.52 | 0.63 | +| google | 0.79 (+0.15, +24.06%) | 0.70 (+0.18, +33.71%) | 0.81 (+0.18, +27.76%) | +| microsoft | 0.76 (+0.12, +19.44%) | 0.67 (+0.15, +29.00%) | 0.78 (+0.15, +22.94%) | -![Results](img/en-de-comet.png) -### Comparisons between systems -*If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [wmt12.en-de](en-de/wmt12.en-de.cometcompare) -- wmt12.microsoft.de outperforms wmt12.bergamot.de. -- wmt12.google.de outperforms wmt12.bergamot.de. -- wmt12.microsoft.de outperforms wmt12.google.de. - -#### [wmt16.en-de](en-de/wmt16.en-de.cometcompare) -- wmt16.microsoft.de outperforms wmt16.bergamot.de. -- wmt16.google.de outperforms wmt16.bergamot.de. -- wmt16.microsoft.de outperforms wmt16.google.de. - -#### [wmt15.en-de](en-de/wmt15.en-de.cometcompare) -- wmt15.microsoft.de outperforms wmt15.bergamot.de. -- wmt15.google.de outperforms wmt15.bergamot.de. - -#### [flores-dev.en-de](en-de/flores-dev.en-de.cometcompare) -- flores-dev.microsoft.de outperforms flores-dev.bergamot.de. -- flores-dev.google.de outperforms flores-dev.bergamot.de. - -#### [wmt21.en-de](en-de/wmt21.en-de.cometcompare) -- wmt21.microsoft.de outperforms wmt21.bergamot.de. -- wmt21.google.de outperforms wmt21.bergamot.de. -- wmt21.microsoft.de outperforms wmt21.google.de. - -#### [wmt17.en-de](en-de/wmt17.en-de.cometcompare) -- wmt17.microsoft.de outperforms wmt17.bergamot.de. -- wmt17.google.de outperforms wmt17.bergamot.de. -- wmt17.microsoft.de outperforms wmt17.google.de. - -#### [wmt10.en-de](en-de/wmt10.en-de.cometcompare) -- wmt10.microsoft.de outperforms wmt10.bergamot.de. 
-- wmt10.google.de outperforms wmt10.bergamot.de. -- wmt10.microsoft.de outperforms wmt10.google.de. - -#### [flores-test.en-de](en-de/flores-test.en-de.cometcompare) -- flores-test.microsoft.de outperforms flores-test.bergamot.de. -- flores-test.google.de outperforms flores-test.bergamot.de. - -#### [wmt08.en-de](en-de/wmt08.en-de.cometcompare) -- wmt08.microsoft.de outperforms wmt08.bergamot.de. -- wmt08.google.de outperforms wmt08.bergamot.de. -- wmt08.microsoft.de outperforms wmt08.google.de. - -#### [wmt18.en-de](en-de/wmt18.en-de.cometcompare) -- wmt18.microsoft.de outperforms wmt18.bergamot.de. -- wmt18.google.de outperforms wmt18.bergamot.de. -- wmt18.microsoft.de outperforms wmt18.google.de. - -#### [wmt22.en-de](en-de/wmt22.en-de.cometcompare) -- wmt22.microsoft.de outperforms wmt22.bergamot.de. -- wmt22.google.de outperforms wmt22.bergamot.de. - -#### [wmt20.en-de](en-de/wmt20.en-de.cometcompare) -- wmt20.microsoft.de outperforms wmt20.bergamot.de. -- wmt20.google.de outperforms wmt20.bergamot.de. -- wmt20.microsoft.de outperforms wmt20.google.de. - -#### [wmt19.en-de](en-de/wmt19.en-de.cometcompare) -- wmt19.microsoft.de outperforms wmt19.bergamot.de. -- wmt19.google.de outperforms wmt19.bergamot.de. -- wmt19.microsoft.de outperforms wmt19.google.de. - -#### [wmt09.en-de](en-de/wmt09.en-de.cometcompare) -- wmt09.microsoft.de outperforms wmt09.bergamot.de. -- wmt09.google.de outperforms wmt09.bergamot.de. -- wmt09.microsoft.de outperforms wmt09.google.de. - -#### [wmt11.en-de](en-de/wmt11.en-de.cometcompare) -- wmt11.microsoft.de outperforms wmt11.bergamot.de. -- wmt11.google.de outperforms wmt11.bergamot.de. -- wmt11.microsoft.de outperforms wmt11.google.de. - -#### [wmt13.en-de](en-de/wmt13.en-de.cometcompare) -- wmt13.microsoft.de outperforms wmt13.bergamot.de. -- wmt13.google.de outperforms wmt13.bergamot.de. -- wmt13.microsoft.de outperforms wmt13.google.de. 
- -#### [wmt14.en-de](en-de/wmt14.en-de.cometcompare) -- wmt14.microsoft.de outperforms wmt14.bergamot.de. -- wmt14.google.de outperforms wmt14.bergamot.de. -- wmt14.microsoft.de outperforms wmt14.google.de. - -#### [iwslt17.en-de](en-de/iwslt17.en-de.cometcompare) -- iwslt17.microsoft.de outperforms iwslt17.bergamot.de. -- iwslt17.google.de outperforms iwslt17.bergamot.de. - ---- - -## es-en - -| Translator/Dataset | wmt13 | wmt10 | wmt11 | flores-dev | flores-test | wmt08 | mtedx_test | wmt12 | wmt09 | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 0.60 | 0.58 | 0.52 | 0.66 | 0.66 | 0.46 | 0.43 | 0.56 | 0.47 | -| google | 0.65 (+0.05, +7.48%) | 0.65 (+0.07, +11.52%) | 0.57 (+0.04, +8.52%) | 0.74 (+0.08, +12.89%) | 0.74 (+0.08, +12.52%) | 0.52 (+0.06, +12.89%) | 0.53 (+0.09, +21.84%) | 0.61 (+0.04, +7.32%) | 0.55 (+0.07, +15.87%) | -| microsoft | 0.67 (+0.07, +10.80%) | 0.66 (+0.07, +12.86%) | 0.60 (+0.07, +14.17%) | 0.73 (+0.07, +11.07%) | 0.73 (+0.07, +11.40%) | 0.53 (+0.07, +14.99%) | 0.54 (+0.11, +24.18%) | 0.63 (+0.07, +11.90%) | 0.55 (+0.08, +17.25%) | - -![Results](img/es-en-comet.png) +![Results](img/et-en-comet.png) ### Comparisons between systems *If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [wmt13.es-en](es-en/wmt13.es-en.cometcompare) -- wmt13.microsoft.en outperforms wmt13.bergamot.en. -- wmt13.google.en outperforms wmt13.bergamot.en. -- wmt13.microsoft.en outperforms wmt13.google.en. - -#### [wmt10.es-en](es-en/wmt10.es-en.cometcompare) -- wmt10.microsoft.en outperforms wmt10.bergamot.en. -- wmt10.google.en outperforms wmt10.bergamot.en. -- wmt10.microsoft.en outperforms wmt10.google.en. - -#### [wmt11.es-en](es-en/wmt11.es-en.cometcompare) -- wmt11.microsoft.en outperforms wmt11.bergamot.en. -- wmt11.google.en outperforms wmt11.bergamot.en. -- wmt11.microsoft.en outperforms wmt11.google.en. 
- -#### [flores-dev.es-en](es-en/flores-dev.es-en.cometcompare) -- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. -- flores-dev.google.en outperforms flores-dev.bergamot.en. -- flores-dev.google.en outperforms flores-dev.microsoft.en. - -#### [flores-test.es-en](es-en/flores-test.es-en.cometcompare) +#### [flores-test.et-en](et-en/flores-test.et-en.cometcompare) - flores-test.microsoft.en outperforms flores-test.bergamot.en. - flores-test.google.en outperforms flores-test.bergamot.en. - flores-test.google.en outperforms flores-test.microsoft.en. -#### [wmt08.es-en](es-en/wmt08.es-en.cometcompare) -- wmt08.microsoft.en outperforms wmt08.bergamot.en. -- wmt08.google.en outperforms wmt08.bergamot.en. -- wmt08.microsoft.en outperforms wmt08.google.en. - -#### [mtedx_test.es-en](es-en/mtedx_test.es-en.cometcompare) -- mtedx_test.microsoft.en outperforms mtedx_test.bergamot.en. -- mtedx_test.google.en outperforms mtedx_test.bergamot.en. - -#### [wmt12.es-en](es-en/wmt12.es-en.cometcompare) -- wmt12.microsoft.en outperforms wmt12.bergamot.en. -- wmt12.google.en outperforms wmt12.bergamot.en. -- wmt12.microsoft.en outperforms wmt12.google.en. +#### [wmt18.et-en](et-en/wmt18.et-en.cometcompare) +- wmt18.microsoft.en outperforms wmt18.bergamot.en. +- wmt18.google.en outperforms wmt18.bergamot.en. +- wmt18.google.en outperforms wmt18.microsoft.en. -#### [wmt09.es-en](es-en/wmt09.es-en.cometcompare) -- wmt09.microsoft.en outperforms wmt09.bergamot.en. -- wmt09.google.en outperforms wmt09.bergamot.en. -- wmt09.microsoft.en outperforms wmt09.google.en. +#### [flores-dev.et-en](et-en/flores-dev.et-en.cometcompare) +- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.microsoft.en. 
--- -## en-it +## nb-en -| Translator/Dataset | wmt09 | flores-test | flores-dev | -| --- | --- | --- | --- | -| bergamot | 0.64 | 0.68 | 0.70 | -| google | 0.72 (+0.07, +11.31%) | 0.78 (+0.10, +14.43%) | 0.78 (+0.08, +11.28%) | -| microsoft | 0.73 (+0.09, +13.86%) | 0.78 (+0.10, +14.07%) | 0.78 (+0.08, +10.79%) | +| Translator/Dataset | flores-test | flores-dev | +| --- | --- | --- | +| bergamot | 0.65 | 0.64 | +| google | 0.81 (+0.16, +24.86%) | 0.81 (+0.18, +27.71%) | +| microsoft | 0.80 (+0.16, +24.41%) | 0.81 (+0.17, +26.75%) | -![Results](img/en-it-comet.png) +![Results](img/nb-en-comet.png) ### Comparisons between systems *If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [wmt09.en-it](en-it/wmt09.en-it.cometcompare) -- wmt09.microsoft.it outperforms wmt09.bergamot.it. -- wmt09.google.it outperforms wmt09.bergamot.it. -- wmt09.microsoft.it outperforms wmt09.google.it. - -#### [flores-test.en-it](en-it/flores-test.en-it.cometcompare) -- flores-test.microsoft.it outperforms flores-test.bergamot.it. -- flores-test.google.it outperforms flores-test.bergamot.it. +#### [flores-test.nb-en](nb-en/flores-test.nb-en.cometcompare) +- flores-test.microsoft.en outperforms flores-test.bergamot.en. +- flores-test.google.en outperforms flores-test.bergamot.en. -#### [flores-dev.en-it](en-it/flores-dev.en-it.cometcompare) -- flores-dev.microsoft.it outperforms flores-dev.bergamot.it. -- flores-dev.google.it outperforms flores-dev.bergamot.it. +#### [flores-dev.nb-en](nb-en/flores-dev.nb-en.cometcompare) +- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.microsoft.en. 
--- -## en-pl +## bg-en -| Translator/Dataset | wmt20 | flores-test | flores-dev | -| --- | --- | --- | --- | -| bergamot | 0.62 | 0.67 | 0.68 | -| google | 0.78 (+0.16, +25.87%) | 0.84 (+0.17, +25.53%) | 0.85 (+0.17, +25.18%) | -| microsoft | 0.77 (+0.15, +23.45%) | 0.81 (+0.14, +21.62%) | 0.81 (+0.13, +19.77%) | +| Translator/Dataset | flores-test | flores-dev | +| --- | --- | --- | +| bergamot | 0.68 | 0.68 | +| google | 0.75 (+0.07, +10.26%) | 0.75 (+0.07, +10.71%) | +| microsoft | 0.73 (+0.05, +6.92%) | 0.73 (+0.05, +7.18%) | -![Results](img/en-pl-comet.png) +![Results](img/bg-en-comet.png) ### Comparisons between systems *If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [wmt20.en-pl](en-pl/wmt20.en-pl.cometcompare) -- wmt20.microsoft.pl outperforms wmt20.bergamot.pl. -- wmt20.google.pl outperforms wmt20.bergamot.pl. -- wmt20.google.pl outperforms wmt20.microsoft.pl. - -#### [flores-test.en-pl](en-pl/flores-test.en-pl.cometcompare) -- flores-test.microsoft.pl outperforms flores-test.bergamot.pl. -- flores-test.google.pl outperforms flores-test.bergamot.pl. -- flores-test.google.pl outperforms flores-test.microsoft.pl. +#### [flores-test.bg-en](bg-en/flores-test.bg-en.cometcompare) +- flores-test.microsoft.en outperforms flores-test.bergamot.en. +- flores-test.google.en outperforms flores-test.bergamot.en. +- flores-test.google.en outperforms flores-test.microsoft.en. -#### [flores-dev.en-pl](en-pl/flores-dev.en-pl.cometcompare) -- flores-dev.microsoft.pl outperforms flores-dev.bergamot.pl. -- flores-dev.google.pl outperforms flores-dev.bergamot.pl. -- flores-dev.google.pl outperforms flores-dev.microsoft.pl. +#### [flores-dev.bg-en](bg-en/flores-dev.bg-en.cometcompare) +- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.microsoft.en. 
--- ## en-es -| Translator/Dataset | flores-test | wmt12 | flores-dev | wmt13 | wmt10 | wmt11 | wmt09 | wmt08 | +| Translator/Dataset | wmt11 | wmt13 | wmt08 | flores-dev | flores-test | wmt10 | wmt09 | wmt12 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 0.64 | 0.63 | 0.64 | 0.64 | 0.64 | 0.58 | 0.58 | 0.51 | -| google | 0.76 (+0.12, +19.02%) | 0.69 (+0.06, +10.04%) | 0.75 (+0.11, +17.02%) | 0.71 (+0.06, +9.98%) | 0.72 (+0.08, +13.00%) | 0.66 (+0.08, +13.74%) | 0.65 (+0.07, +11.55%) | 0.60 (+0.10, +18.96%) | -| microsoft | 0.74 (+0.10, +16.21%) | 0.71 (+0.08, +13.38%) | 0.74 (+0.09, +14.75%) | 0.72 (+0.08, +12.08%) | 0.73 (+0.09, +13.86%) | 0.68 (+0.10, +16.92%) | 0.66 (+0.08, +13.63%) | 0.60 (+0.10, +19.53%) | +| bergamot | 0.58 | 0.64 | 0.51 | 0.64 | 0.64 | 0.64 | 0.58 | 0.63 | +| google | 0.66 (+0.08, +13.74%) | 0.71 (+0.06, +9.98%) | 0.60 (+0.10, +18.96%) | 0.75 (+0.11, +17.02%) | 0.76 (+0.12, +19.02%) | 0.72 (+0.08, +13.00%) | 0.65 (+0.07, +11.55%) | 0.69 (+0.06, +10.04%) | +| microsoft | 0.68 (+0.10, +16.92%) | 0.72 (+0.08, +12.08%) | 0.60 (+0.10, +19.53%) | 0.74 (+0.09, +14.75%) | 0.74 (+0.10, +16.21%) | 0.73 (+0.09, +13.86%) | 0.66 (+0.08, +13.63%) | 0.71 (+0.08, +13.38%) | ![Results](img/en-es-comet.png) ### Comparisons between systems *If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [flores-test.en-es](en-es/flores-test.en-es.cometcompare) -- flores-test.microsoft.es outperforms flores-test.bergamot.es. -- flores-test.google.es outperforms flores-test.bergamot.es. -- flores-test.google.es outperforms flores-test.microsoft.es. +#### [wmt11.en-es](en-es/wmt11.en-es.cometcompare) +- wmt11.microsoft.es outperforms wmt11.bergamot.es. +- wmt11.google.es outperforms wmt11.bergamot.es. +- wmt11.microsoft.es outperforms wmt11.google.es. -#### [wmt12.en-es](en-es/wmt12.en-es.cometcompare) -- wmt12.microsoft.es outperforms wmt12.bergamot.es. 
-- wmt12.google.es outperforms wmt12.bergamot.es. -- wmt12.microsoft.es outperforms wmt12.google.es. +#### [wmt13.en-es](en-es/wmt13.en-es.cometcompare) +- wmt13.microsoft.es outperforms wmt13.bergamot.es. +- wmt13.google.es outperforms wmt13.bergamot.es. +- wmt13.microsoft.es outperforms wmt13.google.es. + +#### [wmt08.en-es](en-es/wmt08.en-es.cometcompare) +- wmt08.microsoft.es outperforms wmt08.bergamot.es. +- wmt08.google.es outperforms wmt08.bergamot.es. #### [flores-dev.en-es](en-es/flores-dev.en-es.cometcompare) - flores-dev.microsoft.es outperforms flores-dev.bergamot.es. - flores-dev.google.es outperforms flores-dev.bergamot.es. - flores-dev.google.es outperforms flores-dev.microsoft.es. -#### [wmt13.en-es](en-es/wmt13.en-es.cometcompare) -- wmt13.microsoft.es outperforms wmt13.bergamot.es. -- wmt13.google.es outperforms wmt13.bergamot.es. -- wmt13.microsoft.es outperforms wmt13.google.es. +#### [flores-test.en-es](en-es/flores-test.en-es.cometcompare) +- flores-test.microsoft.es outperforms flores-test.bergamot.es. +- flores-test.google.es outperforms flores-test.bergamot.es. +- flores-test.google.es outperforms flores-test.microsoft.es. #### [wmt10.en-es](en-es/wmt10.en-es.cometcompare) - wmt10.microsoft.es outperforms wmt10.bergamot.es. - wmt10.google.es outperforms wmt10.bergamot.es. -#### [wmt11.en-es](en-es/wmt11.en-es.cometcompare) -- wmt11.microsoft.es outperforms wmt11.bergamot.es. -- wmt11.google.es outperforms wmt11.bergamot.es. -- wmt11.microsoft.es outperforms wmt11.google.es. - #### [wmt09.en-es](en-es/wmt09.en-es.cometcompare) - wmt09.microsoft.es outperforms wmt09.bergamot.es. - wmt09.google.es outperforms wmt09.bergamot.es. - wmt09.microsoft.es outperforms wmt09.google.es. -#### [wmt08.en-es](en-es/wmt08.en-es.cometcompare) -- wmt08.microsoft.es outperforms wmt08.bergamot.es. -- wmt08.google.es outperforms wmt08.bergamot.es. +#### [wmt12.en-es](en-es/wmt12.en-es.cometcompare) +- wmt12.microsoft.es outperforms wmt12.bergamot.es. 
+- wmt12.google.es outperforms wmt12.bergamot.es. +- wmt12.microsoft.es outperforms wmt12.google.es. --- -## pl-en +## en-bg -| Translator/Dataset | flores-dev | wmt20 | flores-test | -| --- | --- | --- | --- | -| bergamot | 0.59 | 0.53 | 0.57 | -| google | 0.68 (+0.09, +15.30%) | 0.62 (+0.09, +17.51%) | 0.68 (+0.10, +17.70%) | -| microsoft | 0.67 (+0.08, +13.89%) | 0.64 (+0.11, +20.84%) | 0.68 (+0.10, +17.70%) | +| Translator/Dataset | flores-test | flores-dev | +| --- | --- | --- | +| bergamot | 0.80 | 0.79 | +| google | 0.89 (+0.09, +10.78%) | 0.88 (+0.09, +10.71%) | +| microsoft | 0.87 (+0.07, +8.38%) | 0.86 (+0.06, +7.95%) | -![Results](img/pl-en-comet.png) +![Results](img/en-bg-comet.png) ### Comparisons between systems *If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [flores-dev.pl-en](pl-en/flores-dev.pl-en.cometcompare) -- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. -- flores-dev.google.en outperforms flores-dev.bergamot.en. -- flores-dev.google.en outperforms flores-dev.microsoft.en. +#### [flores-test.en-bg](en-bg/flores-test.en-bg.cometcompare) +- flores-test.microsoft.bg outperforms flores-test.bergamot.bg. +- flores-test.google.bg outperforms flores-test.bergamot.bg. +- flores-test.google.bg outperforms flores-test.microsoft.bg. -#### [wmt20.pl-en](pl-en/wmt20.pl-en.cometcompare) -- wmt20.microsoft.en outperforms wmt20.bergamot.en. -- wmt20.google.en outperforms wmt20.bergamot.en. -- wmt20.microsoft.en outperforms wmt20.google.en. - -#### [flores-test.pl-en](pl-en/flores-test.pl-en.cometcompare) -- flores-test.microsoft.en outperforms flores-test.bergamot.en. -- flores-test.google.en outperforms flores-test.bergamot.en. +#### [flores-dev.en-bg](en-bg/flores-dev.en-bg.cometcompare) +- flores-dev.microsoft.bg outperforms flores-dev.bergamot.bg. +- flores-dev.google.bg outperforms flores-dev.bergamot.bg. +- flores-dev.google.bg outperforms flores-dev.microsoft.bg. 
--- -## en-fr +## en-cs -| Translator/Dataset | wmt10 | flores-dev | iwslt17 | flores-test | wmt14 | wmt09 | wmt11 | wmt08 | wmt13 | wmt15 | wmt12 | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 0.52 | 0.73 | 0.59 | 0.71 | 0.63 | 0.48 | 0.52 | 0.40 | 0.55 | 0.50 | 0.49 | -| google | 0.63 (+0.12, +22.59%) | 0.83 (+0.10, +13.28%) | 0.67 (+0.08, +13.94%) | 0.84 (+0.13, +17.81%) | 0.75 (+0.12, +19.13%) | 0.59 (+0.11, +21.82%) | 0.62 (+0.10, +18.30%) | 0.54 (+0.13, +33.43%) | 0.64 (+0.10, +17.68%) | 0.67 (+0.17, +34.26%) | 0.58 (+0.09, +18.36%) | -| microsoft | 0.65 (+0.14, +26.13%) | 0.85 (+0.11, +15.24%) | 0.69 (+0.10, +16.63%) | 0.85 (+0.14, +18.99%) | 0.78 (+0.15, +23.35%) | 0.61 (+0.13, +26.38%) | 0.64 (+0.12, +22.03%) | 0.54 (+0.14, +35.05%) | 0.66 (+0.12, +21.10%) | 0.68 (+0.18, +35.82%) | 0.61 (+0.12, +24.12%) | +| Translator/Dataset | wmt13 | wmt18 | wmt22 | flores-dev | wmt16 | wmt11 | wmt08 | wmt10 | wmt21 | wmt09 | wmt15 | wmt19 | wmt14 | flores-test | wmt17 | wmt12 | wmt20 | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bergamot | 0.68 | 0.63 | 0.62 | 0.68 | 0.64 | 0.56 | 0.54 | 0.61 | 0.46 | 0.60 | 0.66 | 0.52 | 0.75 | 0.70 | 0.62 | 0.55 | 0.55 | +| google | 0.85 (+0.17, +24.72%) | 0.81 (+0.18, +29.20%) | 0.97 (+0.36, +57.96%) | 0.93 (+0.25, +36.44%) | 0.85 (+0.21, +32.72%) | 0.77 (+0.21, +37.08%) | 0.78 (+0.23, +43.36%) | 0.81 (+0.20, +33.54%) | 0.65 (+0.19, +40.85%) | 0.81 (+0.22, +36.87%) | 0.86 (+0.20, +29.98%) | 0.72 (+0.20, +38.74%) | 0.95 (+0.20, +26.46%) | 0.94 (+0.23, +33.13%) | 0.81 (+0.19, +30.14%) | 0.76 (+0.21, +37.61%) | 0.78 (+0.23, +41.54%) | +| microsoft | 0.87 (+0.19, +27.50%) | 0.82 (+0.20, +31.49%) | 0.94 (+0.32, +52.78%) | 0.93 (+0.25, +36.25%) | 0.85 (+0.21, +33.51%) | 0.80 (+0.24, +41.92%) | 0.78 (+0.24, +44.75%) | 0.83 (+0.22, +35.78%) | 0.69 (+0.23, +49.02%) | 0.82 (+0.22, +37.76%) | 0.87 (+0.21, +31.11%) | 0.77 (+0.25, 
+49.14%) | 0.97 (+0.22, +29.34%) | 0.93 (+0.23, +32.76%) | 0.83 (+0.20, +32.21%) | 0.78 (+0.23, +41.83%) | 0.81 (+0.26, +47.71%) | -![Results](img/en-fr-comet.png) +![Results](img/en-cs-comet.png) ### Comparisons between systems *If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [wmt10.en-fr](en-fr/wmt10.en-fr.cometcompare) -- wmt10.microsoft.fr outperforms wmt10.bergamot.fr. -- wmt10.google.fr outperforms wmt10.bergamot.fr. -- wmt10.microsoft.fr outperforms wmt10.google.fr. - -#### [flores-dev.en-fr](en-fr/flores-dev.en-fr.cometcompare) -- flores-dev.microsoft.fr outperforms flores-dev.bergamot.fr. -- flores-dev.google.fr outperforms flores-dev.bergamot.fr. -- flores-dev.microsoft.fr outperforms flores-dev.google.fr. - -#### [iwslt17.en-fr](en-fr/iwslt17.en-fr.cometcompare) -- iwslt17.microsoft.fr outperforms iwslt17.bergamot.fr. -- iwslt17.google.fr outperforms iwslt17.bergamot.fr. -- iwslt17.microsoft.fr outperforms iwslt17.google.fr. - -#### [flores-test.en-fr](en-fr/flores-test.en-fr.cometcompare) -- flores-test.microsoft.fr outperforms flores-test.bergamot.fr. -- flores-test.google.fr outperforms flores-test.bergamot.fr. - -#### [wmt14.en-fr](en-fr/wmt14.en-fr.cometcompare) -- wmt14.microsoft.fr outperforms wmt14.bergamot.fr. -- wmt14.google.fr outperforms wmt14.bergamot.fr. -- wmt14.microsoft.fr outperforms wmt14.google.fr. - -#### [wmt09.en-fr](en-fr/wmt09.en-fr.cometcompare) -- wmt09.microsoft.fr outperforms wmt09.bergamot.fr. -- wmt09.google.fr outperforms wmt09.bergamot.fr. -- wmt09.microsoft.fr outperforms wmt09.google.fr. +#### [wmt13.en-cs](en-cs/wmt13.en-cs.cometcompare) +- wmt13.microsoft.cs outperforms wmt13.bergamot.cs. +- wmt13.google.cs outperforms wmt13.bergamot.cs. +- wmt13.microsoft.cs outperforms wmt13.google.cs. -#### [wmt11.en-fr](en-fr/wmt11.en-fr.cometcompare) -- wmt11.microsoft.fr outperforms wmt11.bergamot.fr. -- wmt11.google.fr outperforms wmt11.bergamot.fr. 
-- wmt11.microsoft.fr outperforms wmt11.google.fr. +#### [wmt18.en-cs](en-cs/wmt18.en-cs.cometcompare) +- wmt18.microsoft.cs outperforms wmt18.bergamot.cs. +- wmt18.google.cs outperforms wmt18.bergamot.cs. +- wmt18.microsoft.cs outperforms wmt18.google.cs. -#### [wmt08.en-fr](en-fr/wmt08.en-fr.cometcompare) -- wmt08.microsoft.fr outperforms wmt08.bergamot.fr. -- wmt08.google.fr outperforms wmt08.bergamot.fr. +#### [wmt22.en-cs](en-cs/wmt22.en-cs.cometcompare) +- wmt22.microsoft.cs outperforms wmt22.bergamot.cs. +- wmt22.google.cs outperforms wmt22.bergamot.cs. +- wmt22.google.cs outperforms wmt22.microsoft.cs. -#### [wmt13.en-fr](en-fr/wmt13.en-fr.cometcompare) -- wmt13.microsoft.fr outperforms wmt13.bergamot.fr. -- wmt13.google.fr outperforms wmt13.bergamot.fr. -- wmt13.microsoft.fr outperforms wmt13.google.fr. +#### [flores-dev.en-cs](en-cs/flores-dev.en-cs.cometcompare) +- flores-dev.microsoft.cs outperforms flores-dev.bergamot.cs. +- flores-dev.google.cs outperforms flores-dev.bergamot.cs. -#### [wmt15.en-fr](en-fr/wmt15.en-fr.cometcompare) -- wmt15.microsoft.fr outperforms wmt15.bergamot.fr. -- wmt15.google.fr outperforms wmt15.bergamot.fr. +#### [wmt16.en-cs](en-cs/wmt16.en-cs.cometcompare) +- wmt16.microsoft.cs outperforms wmt16.bergamot.cs. +- wmt16.google.cs outperforms wmt16.bergamot.cs. -#### [wmt12.en-fr](en-fr/wmt12.en-fr.cometcompare) -- wmt12.microsoft.fr outperforms wmt12.bergamot.fr. -- wmt12.google.fr outperforms wmt12.bergamot.fr. -- wmt12.microsoft.fr outperforms wmt12.google.fr. +#### [wmt11.en-cs](en-cs/wmt11.en-cs.cometcompare) +- wmt11.microsoft.cs outperforms wmt11.bergamot.cs. +- wmt11.google.cs outperforms wmt11.bergamot.cs. +- wmt11.microsoft.cs outperforms wmt11.google.cs. ---- +#### [wmt08.en-cs](en-cs/wmt08.en-cs.cometcompare) +- wmt08.microsoft.cs outperforms wmt08.bergamot.cs. +- wmt08.google.cs outperforms wmt08.bergamot.cs. 
-## et-en +#### [wmt10.en-cs](en-cs/wmt10.en-cs.cometcompare) +- wmt10.microsoft.cs outperforms wmt10.bergamot.cs. +- wmt10.google.cs outperforms wmt10.bergamot.cs. +- wmt10.microsoft.cs outperforms wmt10.google.cs. -| Translator/Dataset | flores-dev | flores-test | wmt18 | -| --- | --- | --- | --- | -| bergamot | 0.63 | 0.64 | 0.52 | -| google | 0.81 (+0.18, +27.76%) | 0.79 (+0.15, +24.06%) | 0.70 (+0.18, +33.71%) | -| microsoft | 0.78 (+0.15, +22.94%) | 0.76 (+0.12, +19.44%) | 0.67 (+0.15, +29.00%) | +#### [wmt21.en-cs](en-cs/wmt21.en-cs.cometcompare) +- wmt21.microsoft.cs outperforms wmt21.bergamot.cs. +- wmt21.google.cs outperforms wmt21.bergamot.cs. +- wmt21.microsoft.cs outperforms wmt21.google.cs. -![Results](img/et-en-comet.png) -### Comparisons between systems -*If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [flores-dev.et-en](et-en/flores-dev.et-en.cometcompare) -- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. -- flores-dev.google.en outperforms flores-dev.bergamot.en. -- flores-dev.google.en outperforms flores-dev.microsoft.en. +#### [wmt09.en-cs](en-cs/wmt09.en-cs.cometcompare) +- wmt09.microsoft.cs outperforms wmt09.bergamot.cs. +- wmt09.google.cs outperforms wmt09.bergamot.cs. -#### [flores-test.et-en](et-en/flores-test.et-en.cometcompare) -- flores-test.microsoft.en outperforms flores-test.bergamot.en. -- flores-test.google.en outperforms flores-test.bergamot.en. -- flores-test.google.en outperforms flores-test.microsoft.en. +#### [wmt15.en-cs](en-cs/wmt15.en-cs.cometcompare) +- wmt15.microsoft.cs outperforms wmt15.bergamot.cs. +- wmt15.google.cs outperforms wmt15.bergamot.cs. -#### [wmt18.et-en](et-en/wmt18.et-en.cometcompare) -- wmt18.microsoft.en outperforms wmt18.bergamot.en. -- wmt18.google.en outperforms wmt18.bergamot.en. -- wmt18.google.en outperforms wmt18.microsoft.en. 
+#### [wmt19.en-cs](en-cs/wmt19.en-cs.cometcompare) +- wmt19.microsoft.cs outperforms wmt19.bergamot.cs. +- wmt19.google.cs outperforms wmt19.bergamot.cs. +- wmt19.microsoft.cs outperforms wmt19.google.cs. ---- +#### [wmt14.en-cs](en-cs/wmt14.en-cs.cometcompare) +- wmt14.microsoft.cs outperforms wmt14.bergamot.cs. +- wmt14.google.cs outperforms wmt14.bergamot.cs. +- wmt14.microsoft.cs outperforms wmt14.google.cs. -## bg-en +#### [flores-test.en-cs](en-cs/flores-test.en-cs.cometcompare) +- flores-test.microsoft.cs outperforms flores-test.bergamot.cs. +- flores-test.google.cs outperforms flores-test.bergamot.cs. -| Translator/Dataset | flores-dev | flores-test | -| --- | --- | --- | -| bergamot | 0.68 | 0.68 | -| google | 0.75 (+0.07, +10.71%) | 0.75 (+0.07, +10.26%) | -| microsoft | 0.73 (+0.05, +7.18%) | 0.73 (+0.05, +6.92%) | +#### [wmt17.en-cs](en-cs/wmt17.en-cs.cometcompare) +- wmt17.microsoft.cs outperforms wmt17.bergamot.cs. +- wmt17.google.cs outperforms wmt17.bergamot.cs. +- wmt17.microsoft.cs outperforms wmt17.google.cs. -![Results](img/bg-en-comet.png) -### Comparisons between systems -*If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [flores-dev.bg-en](bg-en/flores-dev.bg-en.cometcompare) -- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. -- flores-dev.google.en outperforms flores-dev.bergamot.en. -- flores-dev.google.en outperforms flores-dev.microsoft.en. +#### [wmt12.en-cs](en-cs/wmt12.en-cs.cometcompare) +- wmt12.microsoft.cs outperforms wmt12.bergamot.cs. +- wmt12.google.cs outperforms wmt12.bergamot.cs. +- wmt12.microsoft.cs outperforms wmt12.google.cs. -#### [flores-test.bg-en](bg-en/flores-test.bg-en.cometcompare) -- flores-test.microsoft.en outperforms flores-test.bergamot.en. -- flores-test.google.en outperforms flores-test.bergamot.en. -- flores-test.google.en outperforms flores-test.microsoft.en. 
+#### [wmt20.en-cs](en-cs/wmt20.en-cs.cometcompare) +- wmt20.microsoft.cs outperforms wmt20.bergamot.cs. +- wmt20.google.cs outperforms wmt20.bergamot.cs. +- wmt20.microsoft.cs outperforms wmt20.google.cs. --- ## de-en -| Translator/Dataset | wmt16 | wmt13 | iwslt17 | wmt10 | wmt17 | wmt21 | wmt14 | wmt19 | wmt11 | flores-dev | wmt22 | wmt20 | flores-test | wmt15 | wmt08 | wmt12 | wmt09 | wmt18 | +| Translator/Dataset | wmt17 | wmt22 | wmt08 | flores-test | wmt20 | wmt15 | wmt18 | iwslt17 | wmt09 | wmt14 | wmt16 | wmt11 | wmt12 | wmt10 | wmt19 | wmt21 | flores-dev | wmt13 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 0.56 | 0.51 | 0.48 | 0.45 | 0.52 | 0.48 | 0.49 | 0.41 | 0.39 | 0.68 | 0.40 | 0.54 | 0.68 | 0.52 | 0.39 | 0.39 | 0.41 | 0.58 | -| google | 0.67 (+0.11, +19.50%) | 0.59 (+0.09, +16.79%) | 0.57 (+0.09, +18.39%) | 0.57 (+0.12, +26.27%) | 0.65 (+0.13, +25.09%) | 0.61 (+0.13, +27.30%) | 0.62 (+0.13, +27.07%) | 0.56 (+0.14, +34.37%) | 0.48 (+0.09, +24.10%) | 0.76 (+0.08, +11.51%) | 0.56 (+0.16, +40.53%) | 0.66 (+0.12, +22.01%) | 0.76 (+0.08, +11.57%) | 0.64 (+0.12, +22.71%) | 0.51 (+0.12, +29.36%) | 0.52 (+0.13, +32.18%) | 0.51 (+0.10, +25.46%) | 0.70 (+0.11, +19.06%) | -| microsoft | 0.69 (+0.13, +22.98%) | 0.62 (+0.11, +21.80%) | 0.58 (+0.10, +20.87%) | 0.59 (+0.14, +31.46%) | 0.66 (+0.14, +27.41%) | 0.62 (+0.14, +30.21%) | 0.63 (+0.14, +29.25%) | 0.59 (+0.17, +41.07%) | 0.53 (+0.14, +34.74%) | 0.77 (+0.09, +13.67%) | 0.55 (+0.15, +38.95%) | 0.69 (+0.15, +27.97%) | 0.77 (+0.09, +13.19%) | 0.65 (+0.13, +24.73%) | 0.53 (+0.13, +34.22%) | 0.55 (+0.16, +39.48%) | 0.54 (+0.13, +31.15%) | 0.72 (+0.13, +23.05%) | +| bergamot | 0.52 | 0.40 | 0.39 | 0.68 | 0.54 | 0.52 | 0.58 | 0.48 | 0.41 | 0.49 | 0.56 | 0.39 | 0.39 | 0.45 | 0.41 | 0.48 | 0.68 | 0.51 | +| google | 0.65 (+0.13, +25.09%) | 0.56 (+0.16, +40.53%) | 0.51 (+0.12, +29.36%) | 0.76 (+0.08, +11.57%) | 0.66 (+0.12, +22.01%) | 
0.64 (+0.12, +22.71%) | 0.70 (+0.11, +19.06%) | 0.57 (+0.09, +18.39%) | 0.51 (+0.10, +25.46%) | 0.62 (+0.13, +27.07%) | 0.67 (+0.11, +19.50%) | 0.48 (+0.09, +24.10%) | 0.52 (+0.13, +32.18%) | 0.57 (+0.12, +26.27%) | 0.56 (+0.14, +34.37%) | 0.61 (+0.13, +27.30%) | 0.76 (+0.08, +11.51%) | 0.59 (+0.09, +16.79%) | +| microsoft | 0.66 (+0.14, +27.41%) | 0.55 (+0.15, +38.95%) | 0.53 (+0.13, +34.22%) | 0.77 (+0.09, +13.19%) | 0.69 (+0.15, +27.97%) | 0.65 (+0.13, +24.73%) | 0.72 (+0.13, +23.05%) | 0.58 (+0.10, +20.87%) | 0.54 (+0.13, +31.15%) | 0.63 (+0.14, +29.25%) | 0.69 (+0.13, +22.98%) | 0.53 (+0.14, +34.74%) | 0.55 (+0.16, +39.48%) | 0.59 (+0.14, +31.46%) | 0.59 (+0.17, +41.07%) | 0.62 (+0.14, +30.21%) | 0.77 (+0.09, +13.67%) | 0.62 (+0.11, +21.80%) | ![Results](img/de-en-comet.png) ### Comparisons between systems *If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [wmt16.de-en](de-en/wmt16.de-en.cometcompare) -- wmt16.microsoft.en outperforms wmt16.bergamot.en. -- wmt16.google.en outperforms wmt16.bergamot.en. -- wmt16.microsoft.en outperforms wmt16.google.en. - -#### [wmt13.de-en](de-en/wmt13.de-en.cometcompare) -- wmt13.microsoft.en outperforms wmt13.bergamot.en. -- wmt13.google.en outperforms wmt13.bergamot.en. -- wmt13.microsoft.en outperforms wmt13.google.en. - -#### [iwslt17.de-en](de-en/iwslt17.de-en.cometcompare) -- iwslt17.microsoft.en outperforms iwslt17.bergamot.en. -- iwslt17.google.en outperforms iwslt17.bergamot.en. -- iwslt17.microsoft.en outperforms iwslt17.google.en. - -#### [wmt10.de-en](de-en/wmt10.de-en.cometcompare) -- wmt10.microsoft.en outperforms wmt10.bergamot.en. -- wmt10.google.en outperforms wmt10.bergamot.en. -- wmt10.microsoft.en outperforms wmt10.google.en. - #### [wmt17.de-en](de-en/wmt17.de-en.cometcompare) - wmt17.microsoft.en outperforms wmt17.bergamot.en. - wmt17.google.en outperforms wmt17.bergamot.en. - wmt17.microsoft.en outperforms wmt17.google.en. 
-#### [wmt21.de-en](de-en/wmt21.de-en.cometcompare) -- wmt21.microsoft.en outperforms wmt21.bergamot.en. -- wmt21.google.en outperforms wmt21.bergamot.en. -- wmt21.microsoft.en outperforms wmt21.google.en. - -#### [wmt14.de-en](de-en/wmt14.de-en.cometcompare) -- wmt14.microsoft.en outperforms wmt14.bergamot.en. -- wmt14.google.en outperforms wmt14.bergamot.en. -- wmt14.microsoft.en outperforms wmt14.google.en. - -#### [wmt19.de-en](de-en/wmt19.de-en.cometcompare) -- wmt19.microsoft.en outperforms wmt19.bergamot.en. -- wmt19.google.en outperforms wmt19.bergamot.en. -- wmt19.microsoft.en outperforms wmt19.google.en. - -#### [wmt11.de-en](de-en/wmt11.de-en.cometcompare) -- wmt11.microsoft.en outperforms wmt11.bergamot.en. -- wmt11.google.en outperforms wmt11.bergamot.en. -- wmt11.microsoft.en outperforms wmt11.google.en. - -#### [flores-dev.de-en](de-en/flores-dev.de-en.cometcompare) -- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. -- flores-dev.google.en outperforms flores-dev.bergamot.en. -- flores-dev.microsoft.en outperforms flores-dev.google.en. - #### [wmt22.de-en](de-en/wmt22.de-en.cometcompare) - wmt22.microsoft.en outperforms wmt22.bergamot.en. - wmt22.google.en outperforms wmt22.bergamot.en. -#### [wmt20.de-en](de-en/wmt20.de-en.cometcompare) -- wmt20.microsoft.en outperforms wmt20.bergamot.en. -- wmt20.google.en outperforms wmt20.bergamot.en. -- wmt20.microsoft.en outperforms wmt20.google.en. +#### [wmt08.de-en](de-en/wmt08.de-en.cometcompare) +- wmt08.microsoft.en outperforms wmt08.bergamot.en. +- wmt08.google.en outperforms wmt08.bergamot.en. +- wmt08.microsoft.en outperforms wmt08.google.en. #### [flores-test.de-en](de-en/flores-test.de-en.cometcompare) - flores-test.microsoft.en outperforms flores-test.bergamot.en. - flores-test.google.en outperforms flores-test.bergamot.en. - flores-test.microsoft.en outperforms flores-test.google.en. +#### [wmt20.de-en](de-en/wmt20.de-en.cometcompare) +- wmt20.microsoft.en outperforms wmt20.bergamot.en. 
+- wmt20.google.en outperforms wmt20.bergamot.en. +- wmt20.microsoft.en outperforms wmt20.google.en. + #### [wmt15.de-en](de-en/wmt15.de-en.cometcompare) - wmt15.microsoft.en outperforms wmt15.bergamot.en. - wmt15.google.en outperforms wmt15.bergamot.en. - wmt15.microsoft.en outperforms wmt15.google.en. -#### [wmt08.de-en](de-en/wmt08.de-en.cometcompare) -- wmt08.microsoft.en outperforms wmt08.bergamot.en. -- wmt08.google.en outperforms wmt08.bergamot.en. -- wmt08.microsoft.en outperforms wmt08.google.en. +#### [wmt18.de-en](de-en/wmt18.de-en.cometcompare) +- wmt18.microsoft.en outperforms wmt18.bergamot.en. +- wmt18.google.en outperforms wmt18.bergamot.en. +- wmt18.microsoft.en outperforms wmt18.google.en. -#### [wmt12.de-en](de-en/wmt12.de-en.cometcompare) -- wmt12.microsoft.en outperforms wmt12.bergamot.en. -- wmt12.google.en outperforms wmt12.bergamot.en. -- wmt12.microsoft.en outperforms wmt12.google.en. +#### [iwslt17.de-en](de-en/iwslt17.de-en.cometcompare) +- iwslt17.microsoft.en outperforms iwslt17.bergamot.en. +- iwslt17.google.en outperforms iwslt17.bergamot.en. +- iwslt17.microsoft.en outperforms iwslt17.google.en. #### [wmt09.de-en](de-en/wmt09.de-en.cometcompare) - wmt09.microsoft.en outperforms wmt09.bergamot.en. - wmt09.google.en outperforms wmt09.bergamot.en. - wmt09.microsoft.en outperforms wmt09.google.en. -#### [wmt18.de-en](de-en/wmt18.de-en.cometcompare) -- wmt18.microsoft.en outperforms wmt18.bergamot.en. -- wmt18.google.en outperforms wmt18.bergamot.en. -- wmt18.microsoft.en outperforms wmt18.google.en. +#### [wmt14.de-en](de-en/wmt14.de-en.cometcompare) +- wmt14.microsoft.en outperforms wmt14.bergamot.en. +- wmt14.google.en outperforms wmt14.bergamot.en. +- wmt14.microsoft.en outperforms wmt14.google.en. ---- +#### [wmt16.de-en](de-en/wmt16.de-en.cometcompare) +- wmt16.microsoft.en outperforms wmt16.bergamot.en. +- wmt16.google.en outperforms wmt16.bergamot.en. +- wmt16.microsoft.en outperforms wmt16.google.en. 
-## en-cs +#### [wmt11.de-en](de-en/wmt11.de-en.cometcompare) +- wmt11.microsoft.en outperforms wmt11.bergamot.en. +- wmt11.google.en outperforms wmt11.bergamot.en. +- wmt11.microsoft.en outperforms wmt11.google.en. -| Translator/Dataset | wmt13 | wmt08 | wmt20 | wmt18 | wmt12 | flores-test | wmt22 | wmt15 | wmt19 | flores-dev | wmt10 | wmt21 | wmt11 | wmt17 | wmt16 | wmt14 | wmt09 | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 0.68 | 0.54 | 0.55 | 0.63 | 0.55 | 0.70 | 0.62 | 0.66 | 0.52 | 0.68 | 0.61 | 0.46 | 0.56 | 0.62 | 0.64 | 0.75 | 0.60 | -| google | 0.85 (+0.17, +24.72%) | 0.78 (+0.23, +43.36%) | 0.78 (+0.23, +41.54%) | 0.81 (+0.18, +29.20%) | 0.76 (+0.21, +37.61%) | 0.94 (+0.23, +33.13%) | 0.97 (+0.36, +57.96%) | 0.86 (+0.20, +29.98%) | 0.72 (+0.20, +38.74%) | 0.93 (+0.25, +36.44%) | 0.81 (+0.20, +33.54%) | 0.65 (+0.19, +40.85%) | 0.77 (+0.21, +37.08%) | 0.81 (+0.19, +30.14%) | 0.85 (+0.21, +32.72%) | 0.95 (+0.20, +26.46%) | 0.81 (+0.22, +36.87%) | -| microsoft | 0.87 (+0.19, +27.50%) | 0.78 (+0.24, +44.75%) | 0.81 (+0.26, +47.71%) | 0.82 (+0.20, +31.49%) | 0.78 (+0.23, +41.83%) | 0.93 (+0.23, +32.76%) | 0.94 (+0.32, +52.78%) | 0.87 (+0.21, +31.11%) | 0.77 (+0.25, +49.14%) | 0.93 (+0.25, +36.25%) | 0.83 (+0.22, +35.78%) | 0.69 (+0.23, +49.02%) | 0.80 (+0.24, +41.92%) | 0.83 (+0.20, +32.21%) | 0.85 (+0.21, +33.51%) | 0.97 (+0.22, +29.34%) | 0.82 (+0.22, +37.76%) | +#### [wmt12.de-en](de-en/wmt12.de-en.cometcompare) +- wmt12.microsoft.en outperforms wmt12.bergamot.en. +- wmt12.google.en outperforms wmt12.bergamot.en. +- wmt12.microsoft.en outperforms wmt12.google.en. -![Results](img/en-cs-comet.png) +#### [wmt10.de-en](de-en/wmt10.de-en.cometcompare) +- wmt10.microsoft.en outperforms wmt10.bergamot.en. +- wmt10.google.en outperforms wmt10.bergamot.en. +- wmt10.microsoft.en outperforms wmt10.google.en. 
+ +#### [wmt19.de-en](de-en/wmt19.de-en.cometcompare) +- wmt19.microsoft.en outperforms wmt19.bergamot.en. +- wmt19.google.en outperforms wmt19.bergamot.en. +- wmt19.microsoft.en outperforms wmt19.google.en. + +#### [wmt21.de-en](de-en/wmt21.de-en.cometcompare) +- wmt21.microsoft.en outperforms wmt21.bergamot.en. +- wmt21.google.en outperforms wmt21.bergamot.en. +- wmt21.microsoft.en outperforms wmt21.google.en. + +#### [flores-dev.de-en](de-en/flores-dev.de-en.cometcompare) +- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.bergamot.en. +- flores-dev.microsoft.en outperforms flores-dev.google.en. + +#### [wmt13.de-en](de-en/wmt13.de-en.cometcompare) +- wmt13.microsoft.en outperforms wmt13.bergamot.en. +- wmt13.google.en outperforms wmt13.bergamot.en. +- wmt13.microsoft.en outperforms wmt13.google.en. + +--- + +## it-en + +| Translator/Dataset | flores-test | mtedx_test | wmt09 | flores-dev | +| --- | --- | --- | --- | --- | +| bergamot | 0.70 | 0.55 | 0.53 | 0.72 | +| google | 0.76 (+0.06, +9.33%) | 0.62 (+0.07, +12.79%) | 0.60 (+0.08, +14.29%) | 0.76 (+0.04, +6.00%) | +| microsoft | 0.76 (+0.06, +8.80%) | 0.61 (+0.06, +11.31%) | 0.61 (+0.09, +16.28%) | 0.76 (+0.04, +6.16%) | + +![Results](img/it-en-comet.png) ### Comparisons between systems *If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [wmt13.en-cs](en-cs/wmt13.en-cs.cometcompare) -- wmt13.microsoft.cs outperforms wmt13.bergamot.cs. -- wmt13.google.cs outperforms wmt13.bergamot.cs. -- wmt13.microsoft.cs outperforms wmt13.google.cs. +#### [flores-test.it-en](it-en/flores-test.it-en.cometcompare) +- flores-test.microsoft.en outperforms flores-test.bergamot.en. +- flores-test.google.en outperforms flores-test.bergamot.en. -#### [wmt08.en-cs](en-cs/wmt08.en-cs.cometcompare) -- wmt08.microsoft.cs outperforms wmt08.bergamot.cs. -- wmt08.google.cs outperforms wmt08.bergamot.cs. 
+#### [mtedx_test.it-en](it-en/mtedx_test.it-en.cometcompare) +- mtedx_test.microsoft.en outperforms mtedx_test.bergamot.en. +- mtedx_test.google.en outperforms mtedx_test.bergamot.en. -#### [wmt20.en-cs](en-cs/wmt20.en-cs.cometcompare) -- wmt20.microsoft.cs outperforms wmt20.bergamot.cs. -- wmt20.google.cs outperforms wmt20.bergamot.cs. -- wmt20.microsoft.cs outperforms wmt20.google.cs. +#### [wmt09.it-en](it-en/wmt09.it-en.cometcompare) +- wmt09.microsoft.en outperforms wmt09.bergamot.en. +- wmt09.google.en outperforms wmt09.bergamot.en. +- wmt09.microsoft.en outperforms wmt09.google.en. -#### [wmt18.en-cs](en-cs/wmt18.en-cs.cometcompare) -- wmt18.microsoft.cs outperforms wmt18.bergamot.cs. -- wmt18.google.cs outperforms wmt18.bergamot.cs. -- wmt18.microsoft.cs outperforms wmt18.google.cs. +#### [flores-dev.it-en](it-en/flores-dev.it-en.cometcompare) +- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.bergamot.en. -#### [wmt12.en-cs](en-cs/wmt12.en-cs.cometcompare) -- wmt12.microsoft.cs outperforms wmt12.bergamot.cs. -- wmt12.google.cs outperforms wmt12.bergamot.cs. -- wmt12.microsoft.cs outperforms wmt12.google.cs. +--- -#### [flores-test.en-cs](en-cs/flores-test.en-cs.cometcompare) -- flores-test.microsoft.cs outperforms flores-test.bergamot.cs. -- flores-test.google.cs outperforms flores-test.bergamot.cs. +## pl-en -#### [wmt22.en-cs](en-cs/wmt22.en-cs.cometcompare) -- wmt22.microsoft.cs outperforms wmt22.bergamot.cs. -- wmt22.google.cs outperforms wmt22.bergamot.cs. -- wmt22.google.cs outperforms wmt22.microsoft.cs. 
+| Translator/Dataset | flores-test | wmt20 | flores-dev | +| --- | --- | --- | --- | +| bergamot | 0.57 | 0.53 | 0.59 | +| google | 0.68 (+0.10, +17.70%) | 0.62 (+0.09, +17.51%) | 0.68 (+0.09, +15.30%) | +| microsoft | 0.68 (+0.10, +17.70%) | 0.64 (+0.11, +20.84%) | 0.67 (+0.08, +13.89%) | -#### [wmt15.en-cs](en-cs/wmt15.en-cs.cometcompare) -- wmt15.microsoft.cs outperforms wmt15.bergamot.cs. -- wmt15.google.cs outperforms wmt15.bergamot.cs. +![Results](img/pl-en-comet.png) +### Comparisons between systems +*If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* +#### [flores-test.pl-en](pl-en/flores-test.pl-en.cometcompare) +- flores-test.microsoft.en outperforms flores-test.bergamot.en. +- flores-test.google.en outperforms flores-test.bergamot.en. -#### [wmt19.en-cs](en-cs/wmt19.en-cs.cometcompare) -- wmt19.microsoft.cs outperforms wmt19.bergamot.cs. -- wmt19.google.cs outperforms wmt19.bergamot.cs. -- wmt19.microsoft.cs outperforms wmt19.google.cs. +#### [wmt20.pl-en](pl-en/wmt20.pl-en.cometcompare) +- wmt20.microsoft.en outperforms wmt20.bergamot.en. +- wmt20.google.en outperforms wmt20.bergamot.en. +- wmt20.microsoft.en outperforms wmt20.google.en. -#### [flores-dev.en-cs](en-cs/flores-dev.en-cs.cometcompare) -- flores-dev.microsoft.cs outperforms flores-dev.bergamot.cs. -- flores-dev.google.cs outperforms flores-dev.bergamot.cs. +#### [flores-dev.pl-en](pl-en/flores-dev.pl-en.cometcompare) +- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.microsoft.en. -#### [wmt10.en-cs](en-cs/wmt10.en-cs.cometcompare) -- wmt10.microsoft.cs outperforms wmt10.bergamot.cs. -- wmt10.google.cs outperforms wmt10.bergamot.cs. -- wmt10.microsoft.cs outperforms wmt10.google.cs. +--- -#### [wmt21.en-cs](en-cs/wmt21.en-cs.cometcompare) -- wmt21.microsoft.cs outperforms wmt21.bergamot.cs. 
-- wmt21.google.cs outperforms wmt21.bergamot.cs. -- wmt21.microsoft.cs outperforms wmt21.google.cs. +## en-fr -#### [wmt11.en-cs](en-cs/wmt11.en-cs.cometcompare) -- wmt11.microsoft.cs outperforms wmt11.bergamot.cs. -- wmt11.google.cs outperforms wmt11.bergamot.cs. -- wmt11.microsoft.cs outperforms wmt11.google.cs. +| Translator/Dataset | wmt11 | wmt10 | wmt14 | wmt09 | flores-test | wmt08 | wmt15 | wmt12 | iwslt17 | wmt13 | flores-dev | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bergamot | 0.52 | 0.52 | 0.63 | 0.48 | 0.71 | 0.40 | 0.50 | 0.49 | 0.59 | 0.55 | 0.73 | +| google | 0.62 (+0.10, +18.30%) | 0.63 (+0.12, +22.59%) | 0.75 (+0.12, +19.13%) | 0.59 (+0.11, +21.82%) | 0.84 (+0.13, +17.81%) | 0.54 (+0.13, +33.43%) | 0.67 (+0.17, +34.26%) | 0.58 (+0.09, +18.36%) | 0.67 (+0.08, +13.94%) | 0.64 (+0.10, +17.68%) | 0.83 (+0.10, +13.28%) | +| microsoft | 0.64 (+0.12, +22.03%) | 0.65 (+0.14, +26.13%) | 0.78 (+0.15, +23.35%) | 0.61 (+0.13, +26.38%) | 0.85 (+0.14, +18.99%) | 0.54 (+0.14, +35.05%) | 0.68 (+0.18, +35.82%) | 0.61 (+0.12, +24.12%) | 0.69 (+0.10, +16.63%) | 0.66 (+0.12, +21.10%) | 0.85 (+0.11, +15.24%) | -#### [wmt17.en-cs](en-cs/wmt17.en-cs.cometcompare) -- wmt17.microsoft.cs outperforms wmt17.bergamot.cs. -- wmt17.google.cs outperforms wmt17.bergamot.cs. -- wmt17.microsoft.cs outperforms wmt17.google.cs. +![Results](img/en-fr-comet.png) +### Comparisons between systems +*If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* +#### [wmt11.en-fr](en-fr/wmt11.en-fr.cometcompare) +- wmt11.microsoft.fr outperforms wmt11.bergamot.fr. +- wmt11.google.fr outperforms wmt11.bergamot.fr. +- wmt11.microsoft.fr outperforms wmt11.google.fr. -#### [wmt16.en-cs](en-cs/wmt16.en-cs.cometcompare) -- wmt16.microsoft.cs outperforms wmt16.bergamot.cs. -- wmt16.google.cs outperforms wmt16.bergamot.cs. 
+#### [wmt10.en-fr](en-fr/wmt10.en-fr.cometcompare) +- wmt10.microsoft.fr outperforms wmt10.bergamot.fr. +- wmt10.google.fr outperforms wmt10.bergamot.fr. +- wmt10.microsoft.fr outperforms wmt10.google.fr. -#### [wmt14.en-cs](en-cs/wmt14.en-cs.cometcompare) -- wmt14.microsoft.cs outperforms wmt14.bergamot.cs. -- wmt14.google.cs outperforms wmt14.bergamot.cs. -- wmt14.microsoft.cs outperforms wmt14.google.cs. +#### [wmt14.en-fr](en-fr/wmt14.en-fr.cometcompare) +- wmt14.microsoft.fr outperforms wmt14.bergamot.fr. +- wmt14.google.fr outperforms wmt14.bergamot.fr. +- wmt14.microsoft.fr outperforms wmt14.google.fr. -#### [wmt09.en-cs](en-cs/wmt09.en-cs.cometcompare) -- wmt09.microsoft.cs outperforms wmt09.bergamot.cs. -- wmt09.google.cs outperforms wmt09.bergamot.cs. +#### [wmt09.en-fr](en-fr/wmt09.en-fr.cometcompare) +- wmt09.microsoft.fr outperforms wmt09.bergamot.fr. +- wmt09.google.fr outperforms wmt09.bergamot.fr. +- wmt09.microsoft.fr outperforms wmt09.google.fr. + +#### [flores-test.en-fr](en-fr/flores-test.en-fr.cometcompare) +- flores-test.microsoft.fr outperforms flores-test.bergamot.fr. +- flores-test.google.fr outperforms flores-test.bergamot.fr. + +#### [wmt08.en-fr](en-fr/wmt08.en-fr.cometcompare) +- wmt08.microsoft.fr outperforms wmt08.bergamot.fr. +- wmt08.google.fr outperforms wmt08.bergamot.fr. + +#### [wmt15.en-fr](en-fr/wmt15.en-fr.cometcompare) +- wmt15.microsoft.fr outperforms wmt15.bergamot.fr. +- wmt15.google.fr outperforms wmt15.bergamot.fr. + +#### [wmt12.en-fr](en-fr/wmt12.en-fr.cometcompare) +- wmt12.microsoft.fr outperforms wmt12.bergamot.fr. +- wmt12.google.fr outperforms wmt12.bergamot.fr. +- wmt12.microsoft.fr outperforms wmt12.google.fr. + +#### [iwslt17.en-fr](en-fr/iwslt17.en-fr.cometcompare) +- iwslt17.microsoft.fr outperforms iwslt17.bergamot.fr. +- iwslt17.google.fr outperforms iwslt17.bergamot.fr. +- iwslt17.microsoft.fr outperforms iwslt17.google.fr. 
+ +#### [wmt13.en-fr](en-fr/wmt13.en-fr.cometcompare) +- wmt13.microsoft.fr outperforms wmt13.bergamot.fr. +- wmt13.google.fr outperforms wmt13.bergamot.fr. +- wmt13.microsoft.fr outperforms wmt13.google.fr. + +#### [flores-dev.en-fr](en-fr/flores-dev.en-fr.cometcompare) +- flores-dev.microsoft.fr outperforms flores-dev.bergamot.fr. +- flores-dev.google.fr outperforms flores-dev.bergamot.fr. +- flores-dev.microsoft.fr outperforms flores-dev.google.fr. --- -## cs-en +## en-pl -| Translator/Dataset | wmt16 | wmt13 | wmt10 | wmt17 | wmt21 | wmt14 | wmt11 | flores-dev | wmt22 | wmt20 | flores-test | wmt15 | wmt08 | wmt12 | wmt09 | wmt18 | -| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| bergamot | 0.48 | 0.50 | 0.44 | 0.45 | 0.47 | 0.57 | 0.41 | 0.65 | 0.55 | 0.35 | 0.65 | 0.49 | 0.37 | 0.42 | 0.43 | 0.48 | -| google | 0.62 (+0.14, +28.50%) | 0.61 (+0.12, +23.25%) | 0.57 (+0.13, +29.95%) | 0.58 (+0.13, +28.12%) | 0.60 (+0.13, +27.51%) | 0.70 (+0.13, +22.40%) | 0.53 (+0.12, +29.26%) | 0.76 (+0.10, +15.71%) | 0.70 (+0.15, +27.55%) | 0.51 (+0.16, +46.19%) | 0.77 (+0.11, +17.45%) | 0.62 (+0.13, +26.40%) | 0.52 (+0.15, +41.56%) | 0.54 (+0.12, +28.77%) | 0.56 (+0.13, +30.83%) | 0.59 (+0.11, +23.27%) | -| microsoft | 0.62 (+0.14, +28.46%) | 0.63 (+0.14, +27.56%) | 0.58 (+0.14, +30.79%) | 0.57 (+0.13, +28.01%) | 0.59 (+0.12, +24.74%) | 0.72 (+0.14, +24.79%) | 0.56 (+0.15, +35.20%) | 0.75 (+0.10, +14.95%) | 0.72 (+0.17, +30.46%) | 0.50 (+0.16, +44.44%) | 0.76 (+0.11, +16.30%) | 0.63 (+0.14, +27.93%) | 0.52 (+0.15, +41.59%) | 0.56 (+0.14, +33.29%) | 0.57 (+0.14, +32.48%) | 0.60 (+0.12, +25.68%) | +| Translator/Dataset | flores-dev | flores-test | wmt20 | +| --- | --- | --- | --- | +| bergamot | 0.68 | 0.67 | 0.62 | +| google | 0.85 (+0.17, +25.18%) | 0.84 (+0.17, +25.53%) | 0.78 (+0.16, +25.87%) | +| microsoft | 0.81 (+0.13, +19.77%) | 0.81 (+0.14, +21.62%) | 0.77 (+0.15, +23.45%) | -![Results](img/cs-en-comet.png) 
+![Results](img/en-pl-comet.png) ### Comparisons between systems *If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* -#### [wmt16.cs-en](cs-en/wmt16.cs-en.cometcompare) -- wmt16.microsoft.en outperforms wmt16.bergamot.en. -- wmt16.google.en outperforms wmt16.bergamot.en. +#### [flores-dev.en-pl](en-pl/flores-dev.en-pl.cometcompare) +- flores-dev.microsoft.pl outperforms flores-dev.bergamot.pl. +- flores-dev.google.pl outperforms flores-dev.bergamot.pl. +- flores-dev.google.pl outperforms flores-dev.microsoft.pl. -#### [wmt13.cs-en](cs-en/wmt13.cs-en.cometcompare) -- wmt13.microsoft.en outperforms wmt13.bergamot.en. -- wmt13.google.en outperforms wmt13.bergamot.en. -- wmt13.microsoft.en outperforms wmt13.google.en. +#### [flores-test.en-pl](en-pl/flores-test.en-pl.cometcompare) +- flores-test.microsoft.pl outperforms flores-test.bergamot.pl. +- flores-test.google.pl outperforms flores-test.bergamot.pl. +- flores-test.google.pl outperforms flores-test.microsoft.pl. -#### [wmt10.cs-en](cs-en/wmt10.cs-en.cometcompare) -- wmt10.microsoft.en outperforms wmt10.bergamot.en. -- wmt10.google.en outperforms wmt10.bergamot.en. +#### [wmt20.en-pl](en-pl/wmt20.en-pl.cometcompare) +- wmt20.microsoft.pl outperforms wmt20.bergamot.pl. +- wmt20.google.pl outperforms wmt20.bergamot.pl. +- wmt20.google.pl outperforms wmt20.microsoft.pl. -#### [wmt17.cs-en](cs-en/wmt17.cs-en.cometcompare) -- wmt17.microsoft.en outperforms wmt17.bergamot.en. -- wmt17.google.en outperforms wmt17.bergamot.en. +--- -#### [wmt21.cs-en](cs-en/wmt21.cs-en.cometcompare) -- wmt21.microsoft.en outperforms wmt21.bergamot.en. -- wmt21.google.en outperforms wmt21.bergamot.en. -- wmt21.google.en outperforms wmt21.microsoft.en. +## pt-en -#### [wmt14.cs-en](cs-en/wmt14.cs-en.cometcompare) -- wmt14.microsoft.en outperforms wmt14.bergamot.en. -- wmt14.google.en outperforms wmt14.bergamot.en. -- wmt14.microsoft.en outperforms wmt14.google.en. 
+| Translator/Dataset | flores-test | mtedx_test | flores-dev | +| --- | --- | --- | --- | +| bergamot | 0.80 | 0.63 | 0.81 | +| google | 0.85 (+0.05, +6.39%) | 0.70 (+0.07, +10.56%) | 0.85 (+0.04, +4.40%) | +| microsoft | 0.85 (+0.05, +6.39%) | 0.69 (+0.05, +8.54%) | 0.84 (+0.03, +4.17%) | -#### [wmt11.cs-en](cs-en/wmt11.cs-en.cometcompare) -- wmt11.microsoft.en outperforms wmt11.bergamot.en. -- wmt11.google.en outperforms wmt11.bergamot.en. -- wmt11.microsoft.en outperforms wmt11.google.en. +![Results](img/pt-en-comet.png) +### Comparisons between systems +*If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* +#### [flores-test.pt-en](pt-en/flores-test.pt-en.cometcompare) +- flores-test.microsoft.en outperforms flores-test.bergamot.en. +- flores-test.google.en outperforms flores-test.bergamot.en. -#### [flores-dev.cs-en](cs-en/flores-dev.cs-en.cometcompare) +#### [mtedx_test.pt-en](pt-en/mtedx_test.pt-en.cometcompare) +- mtedx_test.microsoft.en outperforms mtedx_test.bergamot.en. +- mtedx_test.google.en outperforms mtedx_test.bergamot.en. +- mtedx_test.google.en outperforms mtedx_test.microsoft.en. + +#### [flores-dev.pt-en](pt-en/flores-dev.pt-en.cometcompare) - flores-dev.microsoft.en outperforms flores-dev.bergamot.en. - flores-dev.google.en outperforms flores-dev.bergamot.en. -#### [wmt22.cs-en](cs-en/wmt22.cs-en.cometcompare) -- wmt22.microsoft.en outperforms wmt22.bergamot.en. -- wmt22.google.en outperforms wmt22.bergamot.en. -- wmt22.microsoft.en outperforms wmt22.google.en. +--- -#### [wmt20.cs-en](cs-en/wmt20.cs-en.cometcompare) -- wmt20.microsoft.en outperforms wmt20.bergamot.en. -- wmt20.google.en outperforms wmt20.bergamot.en. 
+## es-en -#### [flores-test.cs-en](cs-en/flores-test.cs-en.cometcompare) +| Translator/Dataset | wmt08 | flores-test | mtedx_test | wmt09 | wmt11 | wmt12 | wmt10 | flores-dev | wmt13 | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bergamot | 0.46 | 0.66 | 0.43 | 0.47 | 0.52 | 0.56 | 0.58 | 0.66 | 0.60 | +| google | 0.52 (+0.06, +12.89%) | 0.74 (+0.08, +12.52%) | 0.53 (+0.09, +21.84%) | 0.55 (+0.07, +15.87%) | 0.57 (+0.04, +8.52%) | 0.61 (+0.04, +7.32%) | 0.65 (+0.07, +11.52%) | 0.74 (+0.08, +12.89%) | 0.65 (+0.05, +7.48%) | +| microsoft | 0.53 (+0.07, +14.99%) | 0.73 (+0.07, +11.40%) | 0.54 (+0.11, +24.18%) | 0.55 (+0.08, +17.25%) | 0.60 (+0.07, +14.17%) | 0.63 (+0.07, +11.90%) | 0.66 (+0.07, +12.86%) | 0.73 (+0.07, +11.07%) | 0.67 (+0.07, +10.80%) | + +![Results](img/es-en-comet.png) +### Comparisons between systems +*If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* +#### [wmt08.es-en](es-en/wmt08.es-en.cometcompare) +- wmt08.microsoft.en outperforms wmt08.bergamot.en. +- wmt08.google.en outperforms wmt08.bergamot.en. +- wmt08.microsoft.en outperforms wmt08.google.en. + +#### [flores-test.es-en](es-en/flores-test.es-en.cometcompare) - flores-test.microsoft.en outperforms flores-test.bergamot.en. - flores-test.google.en outperforms flores-test.bergamot.en. - flores-test.google.en outperforms flores-test.microsoft.en. -#### [wmt15.cs-en](cs-en/wmt15.cs-en.cometcompare) -- wmt15.microsoft.en outperforms wmt15.bergamot.en. -- wmt15.google.en outperforms wmt15.bergamot.en. +#### [mtedx_test.es-en](es-en/mtedx_test.es-en.cometcompare) +- mtedx_test.microsoft.en outperforms mtedx_test.bergamot.en. +- mtedx_test.google.en outperforms mtedx_test.bergamot.en. -#### [wmt08.cs-en](cs-en/wmt08.cs-en.cometcompare) -- wmt08.microsoft.en outperforms wmt08.bergamot.en. -- wmt08.google.en outperforms wmt08.bergamot.en. 
+#### [wmt09.es-en](es-en/wmt09.es-en.cometcompare) +- wmt09.microsoft.en outperforms wmt09.bergamot.en. +- wmt09.google.en outperforms wmt09.bergamot.en. +- wmt09.microsoft.en outperforms wmt09.google.en. -#### [wmt12.cs-en](cs-en/wmt12.cs-en.cometcompare) +#### [wmt11.es-en](es-en/wmt11.es-en.cometcompare) +- wmt11.microsoft.en outperforms wmt11.bergamot.en. +- wmt11.google.en outperforms wmt11.bergamot.en. +- wmt11.microsoft.en outperforms wmt11.google.en. + +#### [wmt12.es-en](es-en/wmt12.es-en.cometcompare) - wmt12.microsoft.en outperforms wmt12.bergamot.en. - wmt12.google.en outperforms wmt12.bergamot.en. - wmt12.microsoft.en outperforms wmt12.google.en. -#### [wmt09.cs-en](cs-en/wmt09.cs-en.cometcompare) -- wmt09.microsoft.en outperforms wmt09.bergamot.en. -- wmt09.google.en outperforms wmt09.bergamot.en. -- wmt09.microsoft.en outperforms wmt09.google.en. +#### [wmt10.es-en](es-en/wmt10.es-en.cometcompare) +- wmt10.microsoft.en outperforms wmt10.bergamot.en. +- wmt10.google.en outperforms wmt10.bergamot.en. +- wmt10.microsoft.en outperforms wmt10.google.en. -#### [wmt18.cs-en](cs-en/wmt18.cs-en.cometcompare) -- wmt18.microsoft.en outperforms wmt18.bergamot.en. -- wmt18.google.en outperforms wmt18.bergamot.en. -- wmt18.microsoft.en outperforms wmt18.google.en. +#### [flores-dev.es-en](es-en/flores-dev.es-en.cometcompare) +- flores-dev.microsoft.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.bergamot.en. +- flores-dev.google.en outperforms flores-dev.microsoft.en. + +#### [wmt13.es-en](es-en/wmt13.es-en.cometcompare) +- wmt13.microsoft.en outperforms wmt13.bergamot.en. +- wmt13.google.en outperforms wmt13.bergamot.en. +- wmt13.microsoft.en outperforms wmt13.google.en. 
+ +--- + +## en-de + +| Translator/Dataset | wmt12 | wmt13 | wmt22 | flores-dev | wmt15 | wmt08 | wmt18 | wmt14 | wmt19 | wmt17 | wmt21 | iwslt17 | wmt20 | wmt11 | flores-test | wmt09 | wmt10 | wmt16 | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| bergamot | 0.43 | 0.51 | 0.46 | 0.60 | 0.54 | 0.42 | 0.65 | 0.55 | 0.55 | 0.55 | 0.46 | 0.43 | 0.50 | 0.41 | 0.62 | 0.43 | 0.49 | 0.60 | +| google | 0.52 (+0.08, +19.17%) | 0.59 (+0.08, +15.78%) | 0.62 (+0.16, +34.59%) | 0.70 (+0.09, +15.43%) | 0.64 (+0.10, +17.92%) | 0.52 (+0.10, +24.24%) | 0.72 (+0.07, +10.80%) | 0.66 (+0.10, +18.81%) | 0.62 (+0.07, +13.58%) | 0.64 (+0.09, +16.72%) | 0.52 (+0.06, +14.16%) | 0.52 (+0.10, +22.49%) | 0.60 (+0.10, +19.53%) | 0.50 (+0.10, +24.37%) | 0.70 (+0.09, +14.23%) | 0.53 (+0.10, +22.21%) | 0.57 (+0.07, +15.28%) | 0.67 (+0.08, +12.64%) | +| microsoft | 0.54 (+0.10, +23.89%) | 0.60 (+0.09, +18.10%) | 0.63 (+0.17, +36.02%) | 0.70 (+0.09, +15.61%) | 0.64 (+0.10, +18.87%) | 0.54 (+0.12, +27.31%) | 0.73 (+0.08, +12.41%) | 0.66 (+0.11, +19.77%) | 0.64 (+0.09, +16.54%) | 0.64 (+0.10, +18.33%) | 0.55 (+0.09, +19.86%) | 0.52 (+0.10, +22.70%) | 0.61 (+0.11, +22.10%) | 0.52 (+0.12, +29.30%) | 0.71 (+0.09, +14.73%) | 0.54 (+0.11, +25.27%) | 0.59 (+0.10, +19.38%) | 0.69 (+0.09, +15.21%) | + +![Results](img/en-de-comet.png) +### Comparisons between systems +*If a comparison is omitted, the systems have equal averages (tie). Click on the dataset for a complete report* +#### [wmt12.en-de](en-de/wmt12.en-de.cometcompare) +- wmt12.microsoft.de outperforms wmt12.bergamot.de. +- wmt12.google.de outperforms wmt12.bergamot.de. +- wmt12.microsoft.de outperforms wmt12.google.de. + +#### [wmt13.en-de](en-de/wmt13.en-de.cometcompare) +- wmt13.microsoft.de outperforms wmt13.bergamot.de. +- wmt13.google.de outperforms wmt13.bergamot.de. +- wmt13.microsoft.de outperforms wmt13.google.de. 
+ +#### [wmt22.en-de](en-de/wmt22.en-de.cometcompare) +- wmt22.microsoft.de outperforms wmt22.bergamot.de. +- wmt22.google.de outperforms wmt22.bergamot.de. + +#### [flores-dev.en-de](en-de/flores-dev.en-de.cometcompare) +- flores-dev.microsoft.de outperforms flores-dev.bergamot.de. +- flores-dev.google.de outperforms flores-dev.bergamot.de. + +#### [wmt15.en-de](en-de/wmt15.en-de.cometcompare) +- wmt15.microsoft.de outperforms wmt15.bergamot.de. +- wmt15.google.de outperforms wmt15.bergamot.de. + +#### [wmt08.en-de](en-de/wmt08.en-de.cometcompare) +- wmt08.microsoft.de outperforms wmt08.bergamot.de. +- wmt08.google.de outperforms wmt08.bergamot.de. +- wmt08.microsoft.de outperforms wmt08.google.de. + +#### [wmt18.en-de](en-de/wmt18.en-de.cometcompare) +- wmt18.microsoft.de outperforms wmt18.bergamot.de. +- wmt18.google.de outperforms wmt18.bergamot.de. +- wmt18.microsoft.de outperforms wmt18.google.de. + +#### [wmt14.en-de](en-de/wmt14.en-de.cometcompare) +- wmt14.microsoft.de outperforms wmt14.bergamot.de. +- wmt14.google.de outperforms wmt14.bergamot.de. +- wmt14.microsoft.de outperforms wmt14.google.de. + +#### [wmt19.en-de](en-de/wmt19.en-de.cometcompare) +- wmt19.microsoft.de outperforms wmt19.bergamot.de. +- wmt19.google.de outperforms wmt19.bergamot.de. +- wmt19.microsoft.de outperforms wmt19.google.de. + +#### [wmt17.en-de](en-de/wmt17.en-de.cometcompare) +- wmt17.microsoft.de outperforms wmt17.bergamot.de. +- wmt17.google.de outperforms wmt17.bergamot.de. +- wmt17.microsoft.de outperforms wmt17.google.de. + +#### [wmt21.en-de](en-de/wmt21.en-de.cometcompare) +- wmt21.microsoft.de outperforms wmt21.bergamot.de. +- wmt21.google.de outperforms wmt21.bergamot.de. +- wmt21.microsoft.de outperforms wmt21.google.de. + +#### [iwslt17.en-de](en-de/iwslt17.en-de.cometcompare) +- iwslt17.microsoft.de outperforms iwslt17.bergamot.de. +- iwslt17.google.de outperforms iwslt17.bergamot.de. 
+ +#### [wmt20.en-de](en-de/wmt20.en-de.cometcompare) +- wmt20.microsoft.de outperforms wmt20.bergamot.de. +- wmt20.google.de outperforms wmt20.bergamot.de. +- wmt20.microsoft.de outperforms wmt20.google.de. + +#### [wmt11.en-de](en-de/wmt11.en-de.cometcompare) +- wmt11.microsoft.de outperforms wmt11.bergamot.de. +- wmt11.google.de outperforms wmt11.bergamot.de. +- wmt11.microsoft.de outperforms wmt11.google.de. + +#### [flores-test.en-de](en-de/flores-test.en-de.cometcompare) +- flores-test.microsoft.de outperforms flores-test.bergamot.de. +- flores-test.google.de outperforms flores-test.bergamot.de. + +#### [wmt09.en-de](en-de/wmt09.en-de.cometcompare) +- wmt09.microsoft.de outperforms wmt09.bergamot.de. +- wmt09.google.de outperforms wmt09.bergamot.de. +- wmt09.microsoft.de outperforms wmt09.google.de. + +#### [wmt10.en-de](en-de/wmt10.en-de.cometcompare) +- wmt10.microsoft.de outperforms wmt10.bergamot.de. +- wmt10.google.de outperforms wmt10.bergamot.de. +- wmt10.microsoft.de outperforms wmt10.google.de. + +#### [wmt16.en-de](en-de/wmt16.en-de.cometcompare) +- wmt16.microsoft.de outperforms wmt16.bergamot.de. +- wmt16.google.de outperforms wmt16.bergamot.de. +- wmt16.microsoft.de outperforms wmt16.google.de. 
--- \ No newline at end of file diff --git a/evaluation/prod/img/avg-bleu.png b/evaluation/prod/img/avg-bleu.png index 729200d7..adcd11d9 100644 Binary files a/evaluation/prod/img/avg-bleu.png and b/evaluation/prod/img/avg-bleu.png differ diff --git a/evaluation/prod/img/avg-comet.png b/evaluation/prod/img/avg-comet.png index 4beb54fd..2edda401 100644 Binary files a/evaluation/prod/img/avg-comet.png and b/evaluation/prod/img/avg-comet.png differ diff --git a/evaluation/prod/img/bg-en-comet.png b/evaluation/prod/img/bg-en-comet.png index d17a225c..65a481cc 100644 Binary files a/evaluation/prod/img/bg-en-comet.png and b/evaluation/prod/img/bg-en-comet.png differ diff --git a/evaluation/prod/img/cs-en-bleu.png b/evaluation/prod/img/cs-en-bleu.png index 63e770f3..d13f92a9 100644 Binary files a/evaluation/prod/img/cs-en-bleu.png and b/evaluation/prod/img/cs-en-bleu.png differ diff --git a/evaluation/prod/img/cs-en-comet.png b/evaluation/prod/img/cs-en-comet.png index fa7f756b..e452f06a 100644 Binary files a/evaluation/prod/img/cs-en-comet.png and b/evaluation/prod/img/cs-en-comet.png differ diff --git a/evaluation/prod/img/de-en-bleu.png b/evaluation/prod/img/de-en-bleu.png index 2a06509a..7d9f3684 100644 Binary files a/evaluation/prod/img/de-en-bleu.png and b/evaluation/prod/img/de-en-bleu.png differ diff --git a/evaluation/prod/img/de-en-comet.png b/evaluation/prod/img/de-en-comet.png index 378372ef..406fca8b 100644 Binary files a/evaluation/prod/img/de-en-comet.png and b/evaluation/prod/img/de-en-comet.png differ diff --git a/evaluation/prod/img/en-cs-bleu.png b/evaluation/prod/img/en-cs-bleu.png index d5969c71..2b5dd2c3 100644 Binary files a/evaluation/prod/img/en-cs-bleu.png and b/evaluation/prod/img/en-cs-bleu.png differ diff --git a/evaluation/prod/img/en-cs-comet.png b/evaluation/prod/img/en-cs-comet.png index 948247ad..4bc16930 100644 Binary files a/evaluation/prod/img/en-cs-comet.png and b/evaluation/prod/img/en-cs-comet.png differ diff --git 
a/evaluation/prod/img/en-de-bleu.png b/evaluation/prod/img/en-de-bleu.png index a8fe7531..4835aec2 100644 Binary files a/evaluation/prod/img/en-de-bleu.png and b/evaluation/prod/img/en-de-bleu.png differ diff --git a/evaluation/prod/img/en-de-comet.png b/evaluation/prod/img/en-de-comet.png index 12be96da..1c9d5451 100644 Binary files a/evaluation/prod/img/en-de-comet.png and b/evaluation/prod/img/en-de-comet.png differ diff --git a/evaluation/prod/img/en-es-bleu.png b/evaluation/prod/img/en-es-bleu.png index 23667146..59ea34e3 100644 Binary files a/evaluation/prod/img/en-es-bleu.png and b/evaluation/prod/img/en-es-bleu.png differ diff --git a/evaluation/prod/img/en-es-comet.png b/evaluation/prod/img/en-es-comet.png index fcdaf012..d8b9ca92 100644 Binary files a/evaluation/prod/img/en-es-comet.png and b/evaluation/prod/img/en-es-comet.png differ diff --git a/evaluation/prod/img/en-et-bleu.png b/evaluation/prod/img/en-et-bleu.png index 7475b249..d9934ba6 100644 Binary files a/evaluation/prod/img/en-et-bleu.png and b/evaluation/prod/img/en-et-bleu.png differ diff --git a/evaluation/prod/img/en-et-comet.png b/evaluation/prod/img/en-et-comet.png index e1ef51ec..b868b332 100644 Binary files a/evaluation/prod/img/en-et-comet.png and b/evaluation/prod/img/en-et-comet.png differ diff --git a/evaluation/prod/img/en-fr-bleu.png b/evaluation/prod/img/en-fr-bleu.png index 42bfa4b8..f4ee86b7 100644 Binary files a/evaluation/prod/img/en-fr-bleu.png and b/evaluation/prod/img/en-fr-bleu.png differ diff --git a/evaluation/prod/img/en-fr-comet.png b/evaluation/prod/img/en-fr-comet.png index 522c9878..1b6f2a23 100644 Binary files a/evaluation/prod/img/en-fr-comet.png and b/evaluation/prod/img/en-fr-comet.png differ diff --git a/evaluation/prod/img/en-it-bleu.png b/evaluation/prod/img/en-it-bleu.png index ad59e8a8..2865aa92 100644 Binary files a/evaluation/prod/img/en-it-bleu.png and b/evaluation/prod/img/en-it-bleu.png differ diff --git a/evaluation/prod/img/en-it-comet.png 
b/evaluation/prod/img/en-it-comet.png index 137c7b66..1e8fefae 100644 Binary files a/evaluation/prod/img/en-it-comet.png and b/evaluation/prod/img/en-it-comet.png differ diff --git a/evaluation/prod/img/en-pl-bleu.png b/evaluation/prod/img/en-pl-bleu.png index 7761d25a..f1119e8c 100644 Binary files a/evaluation/prod/img/en-pl-bleu.png and b/evaluation/prod/img/en-pl-bleu.png differ diff --git a/evaluation/prod/img/en-pl-comet.png b/evaluation/prod/img/en-pl-comet.png index 427e8f4d..fb12b125 100644 Binary files a/evaluation/prod/img/en-pl-comet.png and b/evaluation/prod/img/en-pl-comet.png differ diff --git a/evaluation/prod/img/en-pt-comet.png b/evaluation/prod/img/en-pt-comet.png index 472576f2..19143477 100644 Binary files a/evaluation/prod/img/en-pt-comet.png and b/evaluation/prod/img/en-pt-comet.png differ diff --git a/evaluation/prod/img/es-en-bleu.png b/evaluation/prod/img/es-en-bleu.png index cc13103e..8760133c 100644 Binary files a/evaluation/prod/img/es-en-bleu.png and b/evaluation/prod/img/es-en-bleu.png differ diff --git a/evaluation/prod/img/es-en-comet.png b/evaluation/prod/img/es-en-comet.png index a8794e39..97314ff1 100644 Binary files a/evaluation/prod/img/es-en-comet.png and b/evaluation/prod/img/es-en-comet.png differ diff --git a/evaluation/prod/img/et-en-bleu.png b/evaluation/prod/img/et-en-bleu.png index fb8e2c9d..4fbbc326 100644 Binary files a/evaluation/prod/img/et-en-bleu.png and b/evaluation/prod/img/et-en-bleu.png differ diff --git a/evaluation/prod/img/et-en-comet.png b/evaluation/prod/img/et-en-comet.png index a65a401b..07cb3ec7 100644 Binary files a/evaluation/prod/img/et-en-comet.png and b/evaluation/prod/img/et-en-comet.png differ diff --git a/evaluation/prod/img/fr-en-bleu.png b/evaluation/prod/img/fr-en-bleu.png index 7825c5c2..35dabcc7 100644 Binary files a/evaluation/prod/img/fr-en-bleu.png and b/evaluation/prod/img/fr-en-bleu.png differ diff --git a/evaluation/prod/img/fr-en-comet.png b/evaluation/prod/img/fr-en-comet.png index 
e7003229..ee4acab9 100644 Binary files a/evaluation/prod/img/fr-en-comet.png and b/evaluation/prod/img/fr-en-comet.png differ diff --git a/evaluation/prod/img/it-en-bleu.png b/evaluation/prod/img/it-en-bleu.png index 6b1d203d..078aa5bb 100644 Binary files a/evaluation/prod/img/it-en-bleu.png and b/evaluation/prod/img/it-en-bleu.png differ diff --git a/evaluation/prod/img/it-en-comet.png b/evaluation/prod/img/it-en-comet.png index dd78b03b..2bc36fb2 100644 Binary files a/evaluation/prod/img/it-en-comet.png and b/evaluation/prod/img/it-en-comet.png differ diff --git a/evaluation/prod/img/nb-en-comet.png b/evaluation/prod/img/nb-en-comet.png index 5a5096e0..2b970821 100644 Binary files a/evaluation/prod/img/nb-en-comet.png and b/evaluation/prod/img/nb-en-comet.png differ diff --git a/evaluation/prod/img/pl-en-bleu.png b/evaluation/prod/img/pl-en-bleu.png index 7c6c038c..ef472a36 100644 Binary files a/evaluation/prod/img/pl-en-bleu.png and b/evaluation/prod/img/pl-en-bleu.png differ diff --git a/evaluation/prod/img/pl-en-comet.png b/evaluation/prod/img/pl-en-comet.png index d4f231f9..d6e5eb32 100644 Binary files a/evaluation/prod/img/pl-en-comet.png and b/evaluation/prod/img/pl-en-comet.png differ diff --git a/evaluation/prod/img/pt-en-bleu.png b/evaluation/prod/img/pt-en-bleu.png index 4716878f..11775806 100644 Binary files a/evaluation/prod/img/pt-en-bleu.png and b/evaluation/prod/img/pt-en-bleu.png differ diff --git a/evaluation/prod/img/pt-en-comet.png b/evaluation/prod/img/pt-en-comet.png index 3fe5292d..e4b7433c 100644 Binary files a/evaluation/prod/img/pt-en-comet.png and b/evaluation/prod/img/pt-en-comet.png differ diff --git a/models/dev/caen/lex.50.50.caen.s2t.bin.gz b/models/dev/caen/lex.50.50.caen.s2t.bin.gz new file mode 100644 index 00000000..e31815c5 --- /dev/null +++ b/models/dev/caen/lex.50.50.caen.s2t.bin.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8f1b9054453a0bced8600d67150b4a468c6607b7ba69600a64d29112b998823 +size 
2735538 diff --git a/models/dev/caen/model.caen.intgemm.alphas.bin.gz b/models/dev/caen/model.caen.intgemm.alphas.bin.gz new file mode 100644 index 00000000..60fde7f0 --- /dev/null +++ b/models/dev/caen/model.caen.intgemm.alphas.bin.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7a76467335bc45389b695465996eecca6a5613ce8a9f3e4db865b2afad1672b +size 12825781 diff --git a/models/dev/caen/vocab.caen.spm.gz b/models/dev/caen/vocab.caen.spm.gz new file mode 100644 index 00000000..ef25556e --- /dev/null +++ b/models/dev/caen/vocab.caen.spm.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a34751849d5cce1430bafb374105252e2b2f6ace8d7e7e70d1a8c5279644702e +size 413033 diff --git a/registry.json b/registry.json index 561e2f3f..89550850 100644 --- a/registry.json +++ b/registry.json @@ -457,6 +457,29 @@ "modelType": "prod" } }, + "caen": { + "model": { + "name": "model.caen.intgemm.alphas.bin", + "size": 17140899, + "estimatedCompressedSize": 12825781, + "expectedSha256Hash": "3a315266490d87f72adf9e5387ee567b2fb76a30018e51586b882b1d87bf5aed", + "modelType": "dev" + }, + "lex": { + "name": "lex.50.50.caen.s2t.bin", + "size": 5244644, + "estimatedCompressedSize": 2735538, + "expectedSha256Hash": "a648be17d6f008feee687b455d00dbfaedba2ead8bee32658783c4325a8d3ece", + "modelType": "dev" + }, + "vocab": { + "name": "vocab.caen.spm", + "size": 811443, + "estimatedCompressedSize": 413033, + "expectedSha256Hash": "10a1f25e5640f596b547190082f87ba4994f8714693904c82a35d965b9cc7470", + "modelType": "dev" + } + }, "enfa": { "model": { "name": "model.enfa.intgemm.alphas.bin",