diff --git a/.gitignore b/.gitignore
index aaae2d2db..a9b67e00e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,7 +16,7 @@ fastspeech_output
.bash_history.local
# Byte-compiled / optimized / DLL files
-__pycache__/
+**/__pycache__/
*.py[cod]
*$py.class
**.pyc
diff --git a/nemo_gym/config_types.py b/nemo_gym/config_types.py
index b340081c7..dd1b18831 100644
--- a/nemo_gym/config_types.py
+++ b/nemo_gym/config_types.py
@@ -312,6 +312,7 @@ class DatasetConfig(BaseModel):
Literal["MIT"],
Literal["Creative Commons Attribution 4.0 International"],
Literal["Creative Commons Attribution-ShareAlike 4.0 International"],
+ Literal["NVIDIA Internal Use Only, Do Not Distribute"],
Literal["TBD"],
Literal["MIT"],
]
diff --git a/resources_servers/translation_bleu/README.md b/resources_servers/translation_bleu/README.md
new file mode 100644
index 000000000..6a7d6bffc
--- /dev/null
+++ b/resources_servers/translation_bleu/README.md
@@ -0,0 +1,11 @@
+# Description
+
+Data links: ?
+
+# Licensing information
+Code: Apache 2.0
+Data: NVIDIA Internal Use Only, Do Not Distribute
+
+# Dependencies
+- nemo_gym: Apache 2.0
+- sacrebleu: Apache 2.0
diff --git a/resources_servers/translation_bleu/app.py b/resources_servers/translation_bleu/app.py
new file mode 100644
index 000000000..6cb85a1bb
--- /dev/null
+++ b/resources_servers/translation_bleu/app.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict
+
+from fastapi import FastAPI
+from sacrebleu.metrics import BLEU
+
+from nemo_gym.base_resources_server import (
+ BaseResourcesServerConfig,
+ BaseVerifyRequest,
+ BaseVerifyResponse,
+ SimpleResourcesServer,
+)
+
+
+class TranslationBleuResourcesServerConfig(BaseResourcesServerConfig):
+ reasoning_split_word: str = "</think>"
+
+
+class TranslationBleuVerifyRequest(BaseVerifyRequest):
+ trg_txt: str
+ trg_lang: str
+
+
+class TranslationBleuVerifyResponse(BaseVerifyResponse):
+ trg_txt: str
+ trg_lang: str
+ extracted_answer: str
+
+
+class TranslationBleuResourcesServer(SimpleResourcesServer):
+ config: TranslationBleuResourcesServerConfig
+
+ TOKENIZER_MAP: Dict[str, str] = {
+ "zh": "zh",
+ "zh-cn": "zh",
+ "zh-tw": "zh",
+ "zho-CN": "zh",
+ "zho_simpl": "zh",
+ "ja": "ja-mecab",
+ "jpn": "ja-mecab",
+ "th": "flores200",
+ "ko": "ko-mecab",
+ }
+
+ def setup_webserver(self) -> FastAPI:
+ app = super().setup_webserver()
+
+ # Additional server routes go here! e.g.:
+ # app.post("/get_weather")(self.get_weather)
+
+ return app
+
+ async def verify(self, body: TranslationBleuVerifyRequest) -> TranslationBleuVerifyResponse:
+ assistant_responses = []
+ for output_item in body.response.output:
+ if output_item.type != "message":
+ continue
+
+ for content_item in output_item.content:
+ if content_item.type != "output_text":
+ continue
+
+ assistant_responses.append(content_item.text)
+
+ combined_response = "".join(assistant_responses)
+
+ (reward, extracted_answer) = self._verify_answer(
+ ground_truth=body.trg_txt, target_lang=body.trg_lang, model_response=combined_response
+ )
+
+ return TranslationBleuVerifyResponse(**body.model_dump(), extracted_answer=extracted_answer, reward=reward)
+
+ def _verify_answer(self, ground_truth: str, target_lang: str, model_response: str) -> tuple[float, str]:
+ extracted_answer = self._extract_answer(model_response)
+
+ if target_lang in self.TOKENIZER_MAP:
+ tokenize = self.TOKENIZER_MAP[target_lang]
+ else:
+ tokenize = None
+ # Use effective_order for sentence-level BLEU
+ bleu = BLEU(trg_lang=target_lang, effective_order=True, tokenize=tokenize)
+
+ bleu_output = bleu.sentence_score(extracted_answer, [ground_truth])
+ # TODO Do we want to report any other BLEU outputs?
+ bleu_score = bleu_output.score
+ reward = bleu_score / 100.0
+
+ return reward, extracted_answer
+
+ def _extract_answer(self, model_response: str) -> str:
+ # Strip any thinking
+ no_think_response = model_response.split(self.config.reasoning_split_word)[-1]
+ no_think_response = no_think_response.strip()
+ return no_think_response
+
+
+if __name__ == "__main__":
+ TranslationBleuResourcesServer.run_webserver()
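Review note: a minimal standalone sketch of the reward computation in `_verify_answer` above, assuming the `</think>` default for `reasoning_split_word`. `sentence_score` returns BLEU on a 0-100 scale, so the reward lands in [0, 1]; languages absent from `TOKENIZER_MAP` fall back to sacrebleu's default tokenizer via `tokenize=None`.

```python
from sacrebleu.metrics import BLEU

TOKENIZER_MAP = {"zh": "zh", "ja": "ja-mecab", "ko": "ko-mecab", "th": "flores200"}


def bleu_reward(model_response: str, ground_truth: str, target_lang: str) -> float:
    # Strip any reasoning prefix, mirroring _extract_answer.
    hypothesis = model_response.split("</think>")[-1].strip()
    # effective_order=True keeps short sentences from scoring zero when
    # higher-order n-gram matches are absent (sentence-level BLEU).
    bleu = BLEU(
        trg_lang=target_lang,
        effective_order=True,
        tokenize=TOKENIZER_MAP.get(target_lang),
    )
    return bleu.sentence_score(hypothesis, [ground_truth]).score / 100.0


# e.g. an imperfect hypothesis scores well below 1.0 against its reference:
print(bleu_reward("Why waste time looking for the keys?", "Why waste your time looking for the keys?", "en"))
```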
diff --git a/resources_servers/translation_bleu/configs/translation_bleu.yaml b/resources_servers/translation_bleu/configs/translation_bleu.yaml
new file mode 100644
index 000000000..81b004aac
--- /dev/null
+++ b/resources_servers/translation_bleu/configs/translation_bleu.yaml
@@ -0,0 +1,38 @@
+translation_bleu:
+ resources_servers:
+ translation_bleu:
+ entrypoint: app.py
+ domain: translation
+translation_bleu_simple_agent:
+ responses_api_agents:
+ simple_agent:
+ entrypoint: app.py
+ resources_server:
+ type: resources_servers
+ name: translation_bleu
+ model_server:
+ type: responses_api_models
+ name: policy_model
+ datasets:
+ - name: train
+ type: train
+ jsonl_fpath: resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_train.jsonl
+ num_repeats: 1
+ gitlab_identifier:
+ dataset_name: riva_mt_v3_nothinkInSys_train
+ version: 0.0.3
+ artifact_fpath: riva_mt_v3_nothinkInSys_train.jsonl
+ license: NVIDIA Internal Use Only, Do Not Distribute
+ - name: validation
+ type: validation
+ jsonl_fpath: resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_validation.jsonl
+ num_repeats: 1
+ gitlab_identifier:
+ dataset_name: riva_mt_v3_nothinkInSys_validation
+ version: 0.0.1
+ artifact_fpath: riva_mt_v3_nothinkInSys_validation.jsonl
+ license: NVIDIA Internal Use Only, Do Not Distribute
+ - name: example
+ type: example
+ jsonl_fpath: resources_servers/translation_bleu/data/example.jsonl
+ num_repeats: 1
diff --git a/resources_servers/translation_bleu/data/.gitignore b/resources_servers/translation_bleu/data/.gitignore
new file mode 100644
index 000000000..4424b6fde
--- /dev/null
+++ b/resources_servers/translation_bleu/data/.gitignore
@@ -0,0 +1,5 @@
+*train.jsonl
+*validation.jsonl
+*train_prepare.jsonl
+*validation_prepare.jsonl
+*example_prepare.jsonl
diff --git a/resources_servers/translation_bleu/data/example.jsonl b/resources_servers/translation_bleu/data/example.jsonl
new file mode 100644
index 000000000..2c921e654
--- /dev/null
+++ b/resources_servers/translation_bleu/data/example.jsonl
@@ -0,0 +1,5 @@
+{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into English. Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```"}]}, "src_txt": "¿Por qué malgastar el tiempo buscando las llaves?", "trg_txt": "Why waste your time looking for the keys?", "src_lang": "es-us", "trg_lang": "en"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt."}]}, "src_txt": "Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "src_lang": "de", "trg_lang": "en"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```"}]}, "src_txt": "All questions have been correctly answered by the treasurer Thomas Kräuchi.", "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "src_lang": "en", "trg_lang": "fr"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```"}]}, "src_txt": "The next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.", "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "src_lang": "en", "trg_lang": "ja"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。"}]}, "src_txt": "比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "src_lang": "zh-cn", "trg_lang": "en"}
\ No newline at end of file
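Review note: each row pairs a Responses API request with the reference fields the verifier reads (`trg_txt`, `trg_lang`); `src_txt` and `src_lang` ride along for bookkeeping. A sketch of emitting one row in this schema (the en→de pair is illustrative, not from the dataset):

```python
import json

row = {
    "responses_create_params": {
        "input": [
            {"role": "user", "content": 'Translate this into German: "What is the name of your cat?"'}
        ]
    },
    "src_txt": "What is the name of your cat?",
    "trg_txt": "Was ist der Name deiner Katze?",
    "src_lang": "en",
    "trg_lang": "de",
}
print(json.dumps(row, ensure_ascii=False))
```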
diff --git a/resources_servers/translation_bleu/data/example_nothink.jsonl b/resources_servers/translation_bleu/data/example_nothink.jsonl
new file mode 100644
index 000000000..42529ab9a
--- /dev/null
+++ b/resources_servers/translation_bleu/data/example_nothink.jsonl
@@ -0,0 +1,5 @@
+{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Convert the following text into English. Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```"}]}, "src_txt": "¿Por qué malgastar el tiempo buscando las llaves?", "trg_txt": "Why waste your time looking for the keys?", "src_lang": "es-us", "trg_lang": "en"}
+{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt."}]}, "src_txt": "Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "src_lang": "de", "trg_lang": "en"}
+{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```"}]}, "src_txt": "All questions have been correctly answered by the treasurer Thomas Kräuchi.", "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "src_lang": "en", "trg_lang": "fr"}
+{"responses_create_params": {"input": [{"role": "system", "content": "/no_think"}, {"role": "user", "content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```"}]}, "src_txt": "The next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.", "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "src_lang": "en", "trg_lang": "ja"}
+{"responses_create_params": {"input": [{"role": "system", "content": "/no_think"}, {"role": "user", "content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。"}]}, "src_txt": "比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "src_lang": "zh-cn", "trg_lang": "en"}
\ No newline at end of file
diff --git a/resources_servers/translation_bleu/data/example_nothink_rollouts.jsonl b/resources_servers/translation_bleu/data/example_nothink_rollouts.jsonl
new file mode 100644
index 000000000..7c7c2c018
--- /dev/null
+++ b/resources_servers/translation_bleu/data/example_nothink_rollouts.jsonl
@@ -0,0 +1,5 @@
+{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_de822fae9a3048609f9d29124711ef7e", "created_at": 1761320055.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_ca1ab146369449f6af314513647808ef", "content": [{"annotations": [], "text": "The Conservatory and the theater were dedicated to him, in which every year the \"Rossini Opera Festival\" is held, attracting opera lovers from around the world.\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.1826249361348376, "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "trg_lang": "en", "extracted_answer": "The Conservatory and the theater were dedicated to him, in which every year the \"Rossini Opera Festival\" is held, attracting opera lovers from around the world."}
+{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_1c8244efef744b38a8adb1e3265e6ca9", "created_at": 1761320055.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_ff0e183cd1f6448a8840104b9beb235c", "content": [{"annotations": [], "text": "次の図は、原子が光子を放出している様子を示しています。もちろん、実際には光子はその図の中のものよりもずっと小さいです。\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.41614625199196853, "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "trg_lang": "ja", "extracted_answer": "次の図は、原子が光子を放出している様子を示しています。もちろん、実際には光子はその図の中のものよりもずっと小さいです。"}
+{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Convert the following text into English. Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_ac13960214d348b1b341ab0c8db10cfc", "created_at": 1761320055.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_94ba9b4835954fceaa534381996ace6c", "content": [{"annotations": [], "text": "Why waste time looking for the keys?\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.6752918218126552, "trg_txt": "Why waste your time looking for the keys?", "trg_lang": "en", "extracted_answer": "Why waste time looking for the keys?"}
+{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_a8cda57b843444168553e60a61b6d0e7", "created_at": 1761320055.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_4da079740c9440169da96073a99113bc", "content": [{"annotations": [], "text": "Toutes les questions ont été répondues correctement par le trésorier Thomas Kräuchi.\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.15727800941615358, "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "trg_lang": "fr", "extracted_answer": "Toutes les questions ont été répondues correctement par le trésorier Thomas Kräuchi."}
+{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_027e2b003199426d88afbf0be7b18ab0", "created_at": 1761320055.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_8f45cceaf9c741fb8dda838c68338f3b", "content": [{"annotations": [], "text": "The competition takes place on the grass, and the grass around the holes is cut shorter, referred to as the green.\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.3204144198812986, "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "trg_lang": "en", "extracted_answer": "The competition takes place on the grass, and the grass around the holes is cut shorter, referred to as the green."}
diff --git a/resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_train_metrics.json b/resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_train_metrics.json
new file mode 100644
index 000000000..369e43852
--- /dev/null
+++ b/resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_train_metrics.json
@@ -0,0 +1,61 @@
+{
+ "name": "train",
+ "type": "train",
+ "jsonl_fpath": "resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_train.jsonl",
+ "num_repeats": 1,
+ "gitlab_identifier": {
+ "dataset_name": "riva_mt_v3_nothinkInSys_train",
+ "version": "0.0.3",
+ "artifact_fpath": "riva_mt_v3_nothinkInSys_train.jsonl"
+ },
+ "license": "NVIDIA Internal Use Only, Do Not Distribute",
+ "Number of examples": 770000,
+ "Number of tools": {
+ "Total # non-null values": 0,
+ "Average": 0.0,
+ "Min": 0.0,
+ "Max": 0.0,
+ "Median": 0.0,
+ "Standard deviation": 0.0
+ },
+ "Json-dumped number of words (proxy for token count)": {
+ "Total # non-null values": 770000,
+ "Average": 26.18,
+ "Min": 10.0,
+ "Max": 267.0,
+ "Median": 24.44,
+ "Standard deviation": 9.86
+ },
+ "Number of turns": {
+ "Total # non-null values": 770000,
+ "Average": 1.0,
+ "Min": 1.0,
+ "Max": 1.0,
+ "Median": 1.0,
+ "Standard deviation": 0.0
+ },
+ "Temperature": {
+ "Total # non-null values": 0,
+ "Average": 0.0,
+ "Min": 0.0,
+ "Max": 0.0,
+ "Median": 0.0,
+ "Standard deviation": 0.0
+ },
+ "src_txt": {
+ "unique_count": 767619,
+ "total_count": 770000
+ },
+ "trg_txt": {
+ "unique_count": 767567,
+ "total_count": 770000
+ },
+ "src_lang": {
+ "unique_count": 12,
+ "total_count": 770000
+ },
+ "trg_lang": {
+ "unique_count": 12,
+ "total_count": 770000
+ }
+}
\ No newline at end of file
diff --git a/resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_validation_metrics.json b/resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_validation_metrics.json
new file mode 100644
index 000000000..1d6645b38
--- /dev/null
+++ b/resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_validation_metrics.json
@@ -0,0 +1,61 @@
+{
+ "name": "validation",
+ "type": "validation",
+ "jsonl_fpath": "resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_validation.jsonl",
+ "num_repeats": 1,
+ "gitlab_identifier": {
+ "dataset_name": "riva_mt_v3_nothinkInSys_validation",
+ "version": "0.0.1",
+ "artifact_fpath": "riva_mt_v3_nothinkInSys_validation.jsonl"
+ },
+ "license": "NVIDIA Internal Use Only, Do Not Distribute",
+ "Number of examples": 19940,
+ "Number of tools": {
+ "Total # non-null values": 0,
+ "Average": 0.0,
+ "Min": 0.0,
+ "Max": 0.0,
+ "Median": 0.0,
+ "Standard deviation": 0.0
+ },
+ "Json-dumped number of words (proxy for token count)": {
+ "Total # non-null values": 19940,
+ "Average": 32.68,
+ "Min": 10.0,
+ "Max": 82.0,
+ "Median": 33.05,
+ "Standard deviation": 10.38
+ },
+ "Number of turns": {
+ "Total # non-null values": 19940,
+ "Average": 1.0,
+ "Min": 1.0,
+ "Max": 1.0,
+ "Median": 1.0,
+ "Standard deviation": 0.0
+ },
+ "Temperature": {
+ "Total # non-null values": 0,
+ "Average": 0.0,
+ "Min": 0.0,
+ "Max": 0.0,
+ "Median": 0.0,
+ "Standard deviation": 0.0
+ },
+ "src_txt": {
+ "unique_count": 10965,
+ "total_count": 19940
+ },
+ "trg_txt": {
+ "unique_count": 10965,
+ "total_count": 19940
+ },
+ "src_lang": {
+ "unique_count": 11,
+ "total_count": 19940
+ },
+ "trg_lang": {
+ "unique_count": 11,
+ "total_count": 19940
+ }
+}
\ No newline at end of file
diff --git a/resources_servers/translation_bleu/requirements.txt b/resources_servers/translation_bleu/requirements.txt
new file mode 100644
index 000000000..f3ee63442
--- /dev/null
+++ b/resources_servers/translation_bleu/requirements.txt
@@ -0,0 +1,3 @@
+-e nemo-gym[dev] @ ../../
+mecab-ko==1.0.2
+sacrebleu[ja,ko]>=2.0.0
\ No newline at end of file
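Review note: `sacrebleu[ja,ko]` plus the `mecab-ko` pin back the `ja-mecab` and `ko-mecab` entries in `TOKENIZER_MAP`. A quick smoke test, assuming this requirements file is installed (constructing `BLEU` instantiates the tokenizer, so a missing MeCab backend fails here rather than at scoring time):

```python
from sacrebleu.metrics import BLEU

for tok in ("ja-mecab", "ko-mecab"):
    BLEU(effective_order=True, tokenize=tok)
print("MeCab-backed tokenizers resolved")
```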
diff --git a/resources_servers/translation_bleu/tests/test_app.py b/resources_servers/translation_bleu/tests/test_app.py
new file mode 100644
index 000000000..06dff1afc
--- /dev/null
+++ b/resources_servers/translation_bleu/tests/test_app.py
@@ -0,0 +1,143 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from copy import deepcopy
+from typing import Any
+from unittest.mock import MagicMock
+
+from pytest import approx, fixture
+
+from nemo_gym.openai_utils import (
+ NeMoGymResponse,
+ NeMoGymResponseCreateParamsNonStreaming,
+ NeMoGymResponseOutputMessage,
+ NeMoGymResponseOutputText,
+)
+from nemo_gym.server_utils import ServerClient
+from resources_servers.translation_bleu.app import (
+ TranslationBleuResourcesServer,
+ TranslationBleuResourcesServerConfig,
+ TranslationBleuVerifyRequest,
+)
+
+
+class TestApp:
+ @fixture
+ def config(self) -> TranslationBleuResourcesServerConfig:
+ return TranslationBleuResourcesServerConfig(
+ host="0.0.0.0",
+ port=8080,
+ entrypoint="",
+ name="",
+ )
+
+ def _create_response(self, id: str, model_response_text: str) -> dict[str, Any]:
+ return NeMoGymResponse(
+ id=id,
+ created_at=1234.5,
+ model="response_model",
+ object="response",
+ parallel_tool_calls=False,
+ tool_choice="none",
+ tools=[],
+ output=[
+ NeMoGymResponseOutputMessage(
+ id=f"ID for {model_response_text}",
+ role="assistant",
+ status="in_progress",
+ type="message",
+ content=[NeMoGymResponseOutputText(annotations=[], text=model_response_text, type="output_text")],
+ )
+ ],
+ ).model_dump()
+
+ async def test_verify_identical(self, config: TranslationBleuResourcesServerConfig) -> None:
+ server_mock = MagicMock(spec=ServerClient)
+ resources_server = TranslationBleuResourcesServer(config=config, server_client=server_mock)
+
+ source_text = "What is the name of your cat?"
+ ground_truth = "Was ist der Name deiner Katze?"
+ target_lang = "de"
+ target_lang_name = "German"
+ model_create_params = NeMoGymResponseCreateParamsNonStreaming(
+ input=[
+ {
+ "role": "user",
+ "content": f'Translate this into {target_lang_name}: "{source_text}"',
+ }
+ ]
+ )
+ model_response = NeMoGymResponse(**self._create_response("model_response_id", ground_truth))
+ identical_verify_request = TranslationBleuVerifyRequest(
+ responses_create_params=deepcopy(model_create_params),
+ response=model_response.model_copy(deep=True),
+ trg_txt=ground_truth,
+ trg_lang=target_lang,
+ )
+ identical_verify_response = await resources_server.verify(identical_verify_request)
+ assert identical_verify_response.responses_create_params == model_create_params
+ assert identical_verify_response.response == model_response
+ assert identical_verify_response.trg_txt == ground_truth
+ assert identical_verify_response.trg_lang == target_lang
+ assert identical_verify_response.reward == approx(1.0)
+ assert identical_verify_response.extracted_answer == ground_truth
+
+ assert sorted(list(identical_verify_response.model_dump())) == [
+ "extracted_answer",
+ "response",
+ "responses_create_params",
+ "reward",
+ "trg_lang",
+ "trg_txt",
+ ]
+
+ def test_verify_answer_identical(self, config: TranslationBleuResourcesServerConfig) -> None:
+ resources_server = TranslationBleuResourcesServer(config=config, server_client=MagicMock(spec=ServerClient))
+
+ # source_text = "What is the name of your cat?"
+ ground_truth = "Was ist der Name deiner Katze?"
+ target_lang = "de"
+ model_response_text = ground_truth
+
+ assert resources_server._verify_answer(ground_truth, target_lang, model_response_text) == (
+ approx(1.0),
+ ground_truth,
+ )
+
+ def test_verify_answer_think_tags(self, config: TranslationBleuResourcesServerConfig) -> None:
+ resources_server = TranslationBleuResourcesServer(config=config, server_client=MagicMock(spec=ServerClient))
+
+ # source_text = "What is the name of your cat?"
+ ground_truth = "Was ist der Name deiner Katze?"
+ target_lang = "de"
+ model_response_text = f"<think></think>\n\n{ground_truth}"
+
+ assert resources_server._verify_answer(ground_truth, target_lang, model_response_text) == (
+ approx(1.0),
+ ground_truth,
+ )
+
+ def test_verify_answer_no_match(self, config: TranslationBleuResourcesServerConfig) -> None:
+ resources_server = TranslationBleuResourcesServer(config=config, server_client=MagicMock(spec=ServerClient))
+
+ # source_text = "What is the name of your cat?"
+ ground_truth = "Was ist der Name deiner Katze?"
+ target_lang = "de"
+ model_response_text = "Incorrect translation."
+
+ assert resources_server._verify_answer(ground_truth, target_lang, model_response_text) == (
+ approx(0.0),
+ model_response_text,
+ )
diff --git a/resources_servers/translation_comet/README.md b/resources_servers/translation_comet/README.md
new file mode 100644
index 000000000..cf6920b56
--- /dev/null
+++ b/resources_servers/translation_comet/README.md
@@ -0,0 +1,11 @@
+# Description
+
+Data links: ?
+
+# Licensing information
+Code: Apache 2.0
+Data: NVIDIA Internal Use Only, Do Not Distribute
+
+# Dependencies
+- nemo_gym: Apache 2.0
+- unbabel-comet: Apache 2.0
diff --git a/resources_servers/translation_comet/app.py b/resources_servers/translation_comet/app.py
new file mode 100644
index 000000000..83ae1fc12
--- /dev/null
+++ b/resources_servers/translation_comet/app.py
@@ -0,0 +1,129 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, List, Optional, Union
+
+from comet.models import download_model, load_from_checkpoint
+from fastapi import FastAPI
+
+from nemo_gym.base_resources_server import (
+ BaseResourcesServerConfig,
+ BaseVerifyRequest,
+ BaseVerifyResponse,
+ SimpleResourcesServer,
+)
+
+
+class TranslationCometResourcesServerConfig(BaseResourcesServerConfig):
+ use_reference: bool = True # Must match model set in comet_model_name
+ comet_model_name: str = "Unbabel/wmt22-comet-da"
+ comet_gpu_count: Optional[int] = None  # CPU only
+ comet_gpu_devices: Union[List[int], str, int] = "auto"
+ model_cache_dir: Optional[str] = None
+ reasoning_split_word: str = "</think>"
+
+
+class TranslationCometVerifyRequest(BaseVerifyRequest):
+ src_txt: str
+ trg_txt: Optional[str] = None
+
+
+class TranslationCometVerifyResponse(BaseVerifyResponse):
+ src_txt: str
+ trg_txt: Optional[str] = None
+ extracted_answer: str
+
+
+class TranslationCometResourcesServer(SimpleResourcesServer):
+ config: TranslationCometResourcesServerConfig
+ batch_size: int = 1 # We only process one item at a time so this is always 1
+
+ def model_post_init(self, context: Any) -> None:
+ super().model_post_init(context)
+
+ # # Manually load the model without the Comet wrapper class so we can control the GPU allocation
+ # # https://stackoverflow.com/questions/75879866/how-to-load-unbabel-comet-model-without-nested-wrapper-initialization
+
+ # model_path = snapshot_download(repo_id=self.config.comet_model_name)
+ # model_checkpoint_path = f'{model_path}/checkpoints/model.ckpt'
+ # if self.config.use_reference:
+ # self._comet_model = RegressionMetric.load_from_checkpoint(model_checkpoint_path, layer_transformation='softmax')
+ # else:
+ # self._comet_model = ReferencelessRegression.load_from_checkpoint(model_checkpoint_path, layer_transformation='softmax')
+
+ model_path = download_model(model=self.config.comet_model_name, saving_directory=self.config.model_cache_dir)
+ self._comet_model = load_from_checkpoint(model_path)
+
+ # TODO long-term we want to pull logic out of predict in base.py in COMET
+ # so that we keep one PTL Trainer for the whole server, and don't make a new one for every datapoint
+ # since each PTL trainer moves the model weights from CPU to GPU each time which will be slow
+
+ def setup_webserver(self) -> FastAPI:
+ app = super().setup_webserver()
+
+ # Additional server routes go here! e.g.:
+ # app.post("/get_weather")(self.get_weather)
+
+ return app
+
+ async def verify(self, body: TranslationCometVerifyRequest) -> TranslationCometVerifyResponse:
+ assistant_responses = []
+ for output_item in body.response.output:
+ if output_item.type != "message":
+ continue
+
+ for content_item in output_item.content:
+ if content_item.type != "output_text":
+ continue
+
+ assistant_responses.append(content_item.text)
+
+ combined_response = "".join(assistant_responses)
+
+ (reward, extracted_answer) = self._verify_answer(
+ source_text=body.src_txt, target_text=body.trg_txt, model_response=combined_response
+ )
+
+ return TranslationCometVerifyResponse(**body.model_dump(), extracted_answer=extracted_answer, reward=reward)
+
+ def _verify_answer(self, source_text: str, target_text: Optional[str], model_response: str) -> tuple[float, str]:
+ extracted_answer = self._extract_answer(model_response)
+
+ if self.config.use_reference:
+ comet_data = [{"src": source_text, "mt": extracted_answer, "ref": target_text}]
+ else:
+ comet_data = [{"src": source_text, "mt": extracted_answer}]
+
+ # TODO this is inefficient and sets up a new PTL Trainer each time
+ # It's designed to be run on a whole dataset at once
+ # This means the weights get moved from CPU to GPU (if applicable) each time this is called
+ model_output = self._comet_model.predict(
+ comet_data,
+ batch_size=self.batch_size,
+ gpus=self.config.comet_gpu_count,
+ devices=self.config.comet_gpu_devices,
+ progress_bar=False,
+ )
+ reward = model_output.system_score
+
+ return reward, extracted_answer
+
+ def _extract_answer(self, model_response: str) -> str:
+ # Strip any thinking
+ no_think_response = model_response.split(self.config.reasoning_split_word)[-1]
+ no_think_response = no_think_response.strip()
+ return no_think_response
+
+
+if __name__ == "__main__":
+ TranslationCometResourcesServer.run_webserver()
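Review note: a minimal sketch of the scoring path in `_verify_answer` above, using the documented top-level `comet` import (this file imports the same helpers via `comet.models`). `predict` builds a fresh PTL Trainer per call, which is the inefficiency the TODO flags.

```python
from comet import download_model, load_from_checkpoint

# Reference-based WMT22 model; small enough to run on CPU with gpus=0.
model_path = download_model("Unbabel/wmt22-comet-da")
model = load_from_checkpoint(model_path)

data = [{
    "src": "What is the name of your cat?",
    "mt": "Was ist der Name deiner Katze?",
    "ref": "Was ist der Name deiner Katze?",
}]
output = model.predict(data, batch_size=1, gpus=0, progress_bar=False)
print(output.system_score)  # roughly in [0, 1] for this model; used directly as the reward
```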
diff --git a/resources_servers/translation_comet/configs/translation_comet.yaml b/resources_servers/translation_comet/configs/translation_comet.yaml
new file mode 100644
index 000000000..e0f5c5e75
--- /dev/null
+++ b/resources_servers/translation_comet/configs/translation_comet.yaml
@@ -0,0 +1,40 @@
+translation_comet:
+ resources_servers:
+ translation_comet:
+ entrypoint: app.py
+ domain: translation
+ use_reference: true
+ comet_model_name: Unbabel/wmt22-comet-da
+translation_comet_simple_agent:
+ responses_api_agents:
+ simple_agent:
+ entrypoint: app.py
+ resources_server:
+ type: resources_servers
+ name: translation_comet
+ model_server:
+ type: responses_api_models
+ name: policy_model
+ datasets:
+ - name: train
+ type: train
+ jsonl_fpath: resources_servers/translation_comet/data/riva_mt_v3_nothinkInSys_train.jsonl
+ num_repeats: 1
+ gitlab_identifier:
+ dataset_name: riva_mt_v3_nothinkInSys_train
+ version: 0.0.1
+ artifact_fpath: riva_mt_v3_nothinkInSys_train.jsonl
+ license: NVIDIA Internal Use Only, Do Not Distribute
+ - name: validation
+ type: validation
+ jsonl_fpath: resources_servers/translation_comet/data/riva_mt_v3_nothinkInSys_validation.jsonl
+ num_repeats: 1
+ gitlab_identifier:
+ dataset_name: riva_mt_v3_nothinkInSys_validation
+ version: 0.0.1
+ artifact_fpath: riva_mt_v3_nothinkInSys_validation.jsonl
+ license: NVIDIA Internal Use Only, Do Not Distribute
+ - name: example
+ type: example
+ jsonl_fpath: resources_servers/translation_comet/data/example.jsonl
+ num_repeats: 1
diff --git a/resources_servers/translation_comet/configs/translation_comet_referencefree.yaml b/resources_servers/translation_comet/configs/translation_comet_referencefree.yaml
new file mode 100644
index 000000000..944df2982
--- /dev/null
+++ b/resources_servers/translation_comet/configs/translation_comet_referencefree.yaml
@@ -0,0 +1,40 @@
+translation_comet:
+ resources_servers:
+ translation_comet:
+ entrypoint: app.py
+ domain: translation
+ use_reference: false
+ comet_model_name: Unbabel/wmt22-cometkiwi-da
+translation_comet_simple_agent:
+ responses_api_agents:
+ simple_agent:
+ entrypoint: app.py
+ resources_server:
+ type: resources_servers
+ name: translation_comet
+ model_server:
+ type: responses_api_models
+ name: policy_model
+ datasets:
+ - name: train
+ type: train
+ jsonl_fpath: resources_servers/translation_comet/data/riva_mt_v3_nothinkInSys_train.jsonl
+ num_repeats: 1
+ gitlab_identifier:
+ dataset_name: riva_mt_v3_nothinkInSys_train
+ version: 0.0.1
+ artifact_fpath: riva_mt_v3_nothinkInSys_train.jsonl
+ license: NVIDIA Internal Use Only, Do Not Distribute
+ - name: validation
+ type: validation
+ jsonl_fpath: resources_servers/translation_comet/data/riva_mt_v3_nothinkInSys_validation.jsonl
+ num_repeats: 1
+ gitlab_identifier:
+ dataset_name: riva_mt_v3_nothinkInSys_validation
+ version: 0.0.1
+ artifact_fpath: riva_mt_v3_nothinkInSys_validation.jsonl
+ license: NVIDIA Internal Use Only, Do Not Distribute
+ - name: example
+ type: example
+ jsonl_fpath: resources_servers/translation_comet/data/example.jsonl
+ num_repeats: 1
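Review note: this config differs from `translation_comet.yaml` only in `use_reference: false` and the `Unbabel/wmt22-cometkiwi-da` checkpoint (a quality-estimation model that may require accepting its license on Hugging Face). On the server side the flag only controls whether a `ref` field is attached to the COMET input, per `_verify_answer`:

```python
use_reference = False  # as set in this config

src, mt, ref = "source text", "model translation", "reference translation"
if use_reference:
    comet_data = [{"src": src, "mt": mt, "ref": ref}]
else:
    # Reference-free quality estimation: cometkiwi scores src/mt pairs alone.
    comet_data = [{"src": src, "mt": mt}]
```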
diff --git a/resources_servers/translation_comet/data/.gitignore b/resources_servers/translation_comet/data/.gitignore
new file mode 100644
index 000000000..4424b6fde
--- /dev/null
+++ b/resources_servers/translation_comet/data/.gitignore
@@ -0,0 +1,5 @@
+*train.jsonl
+*validation.jsonl
+*train_prepare.jsonl
+*validation_prepare.jsonl
+*example_prepare.jsonl
diff --git a/resources_servers/translation_comet/data/example.jsonl b/resources_servers/translation_comet/data/example.jsonl
new file mode 100644
index 000000000..2c921e654
--- /dev/null
+++ b/resources_servers/translation_comet/data/example.jsonl
@@ -0,0 +1,5 @@
+{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into English. Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```"}]}, "src_txt": "¿Por qué malgastar el tiempo buscando las llaves?", "trg_txt": "Why waste your time looking for the keys?", "src_lang": "es-us", "trg_lang": "en"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt."}]}, "src_txt": "Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "src_lang": "de", "trg_lang": "en"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```"}]}, "src_txt": "All questions have been correctly answered by the treasurer Thomas Kräuchi.", "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "src_lang": "en", "trg_lang": "fr"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```"}]}, "src_txt": "The next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.", "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "src_lang": "en", "trg_lang": "ja"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。"}]}, "src_txt": "比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "src_lang": "zh-cn", "trg_lang": "en"}
\ No newline at end of file
diff --git a/resources_servers/translation_comet/data/example_nothink.jsonl b/resources_servers/translation_comet/data/example_nothink.jsonl
new file mode 100644
index 000000000..42529ab9a
--- /dev/null
+++ b/resources_servers/translation_comet/data/example_nothink.jsonl
@@ -0,0 +1,5 @@
+{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Convert the following text into English. Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```"}]}, "src_txt": "¿Por qué malgastar el tiempo buscando las llaves?", "trg_txt": "Why waste your time looking for the keys?", "src_lang": "es-us", "trg_lang": "en"}
+{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt."}]}, "src_txt": "Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "src_lang": "de", "trg_lang": "en"}
+{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```"}]}, "src_txt": "All questions have been correctly answered by the treasurer Thomas Kräuchi.", "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "src_lang": "en", "trg_lang": "fr"}
+{"responses_create_params": {"input": [{"role": "system", "content": "/no_think"}, {"role": "user", "content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```"}]}, "src_txt": "The next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.", "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "src_lang": "en", "trg_lang": "ja"}
+{"responses_create_params": {"input": [{"role": "system", "content": "/no_think"}, {"role": "user", "content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。"}]}, "src_txt": "比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "src_lang": "zh-cn", "trg_lang": "en"}
\ No newline at end of file
diff --git a/resources_servers/translation_comet/data/example_nothink_rollouts.jsonl b/resources_servers/translation_comet/data/example_nothink_rollouts.jsonl
new file mode 100644
index 000000000..8b253272b
--- /dev/null
+++ b/resources_servers/translation_comet/data/example_nothink_rollouts.jsonl
@@ -0,0 +1,5 @@
+{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Convert the following text into English. Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_9ba0dbafb376409e8a7ed772adab1ecf", "created_at": 1761320110.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_f0f3a1e9b0784933bdd719fd569e6077", "content": [{"annotations": [], "text": "Why waste time looking for the keys?\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.9511789679527283, "src_txt": "¿Por qué malgastar el tiempo buscando las llaves?", "trg_txt": "Why waste your time looking for the keys?", "extracted_answer": "Why waste time looking for the keys?"}
+{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_411f1d1acdeb4190ac860db659abc3cf", "created_at": 1761320110.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_edbfc6e1503143dc82f1b94a9ea51bdc", "content": [{"annotations": [], "text": "The competition was held on a lawn, where the grass around the hole was cut shorter and is called the putting green.\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.8277285695075989, "src_txt": "比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "extracted_answer": "The competition was held on a lawn, where the grass around the hole was cut shorter and is called the putting green."}
+{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_4b200f77ec6d47cebbd6f4dec45239e8", "created_at": 1761320110.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_eb5d7cb1c8f74a4ead4d9d1d8d0e7cd5", "content": [{"annotations": [], "text": "Toutes les questions ont été répondues correctement par le trésorier Thomas Kräuchi.\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.9093431234359741, "src_txt": "All questions have been correctly answered by the treasurer Thomas Kräuchi.", "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "extracted_answer": "Toutes les questions ont été répondues correctement par le trésorier Thomas Kräuchi."}
+{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_64b8e9cb25a8414b9a61615c90d63b68", "created_at": 1761320110.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_71c84f51f02c437d9421550f6870d827", "content": [{"annotations": [], "text": "画像の次に原子が光子を放出している様子が示されています。もちろん、実際の光子はそのようにはとても小さいです。\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.9037765264511108, "src_txt": "The next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.", "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "extracted_answer": "画像の次に原子が光子を放出している様子が示されています。もちろん、実際の光子はそのようにはとても小さいです。"}
+{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_ef4433e72c5d46ed8a39460bf8039518", "created_at": 1761320110.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_d60da844dc104c3aa41836580f4970bc", "content": [{"annotations": [], "text": "The conservatory and the theater were dedicated to him, in which every year the \"Rossini Opera Festival\" takes place, attracting opera lovers from all over the world.\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.8553621172904968, "src_txt": "Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "extracted_answer": "The conservatory and the theater were dedicated to him, in which every year the \"Rossini Opera Festival\" takes place, attracting opera lovers from all over the world."}
diff --git a/resources_servers/translation_comet/requirements.txt b/resources_servers/translation_comet/requirements.txt
new file mode 100644
index 000000000..302adca33
--- /dev/null
+++ b/resources_servers/translation_comet/requirements.txt
@@ -0,0 +1,2 @@
+-e nemo-gym[dev] @ ../../
+unbabel-comet>=2.2.4
\ No newline at end of file
diff --git a/resources_servers/translation_comet/tests/test_app.py b/resources_servers/translation_comet/tests/test_app.py
new file mode 100644
index 000000000..68b2477b4
--- /dev/null
+++ b/resources_servers/translation_comet/tests/test_app.py
@@ -0,0 +1,211 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from copy import deepcopy
+from os.path import dirname, join
+from typing import Any
+from unittest.mock import MagicMock
+
+from pytest import approx, fixture
+
+from nemo_gym import CACHE_DIR
+from nemo_gym.openai_utils import (
+ NeMoGymResponse,
+ NeMoGymResponseCreateParamsNonStreaming,
+ NeMoGymResponseOutputMessage,
+ NeMoGymResponseOutputText,
+)
+from nemo_gym.server_utils import ServerClient
+from resources_servers.translation_comet.app import (
+ TranslationCometResourcesServer,
+ TranslationCometResourcesServerConfig,
+ TranslationCometVerifyRequest,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+class TestApp:
+ @fixture(scope="class")
+ def resources_server(self) -> TranslationCometResourcesServer:
+ """We only want to spin up the server once since it has to load the comet model.
+        That said, the slowest part is actually `import comet` :)
+ """
+ logger.info("Spinning up server with COMET model...")
+
+ server = TranslationCometResourcesServer(
+ config=TranslationCometResourcesServerConfig(
+ host="0.0.0.0",
+ port=8080,
+ entrypoint="",
+ name="",
+ use_reference=True,
+ comet_model_name="Unbabel/wmt22-comet-da", # 0.5B parameter model runs fine on CPU
+ # Need to use the actual model as the cometinho model does not return values in [0,1]
+ comet_gpu_count=0, # CPU
+ comet_gpu_devices="auto", # CPU
+ model_cache_dir=join(CACHE_DIR, "ptl_cache"),
+ ),
+ server_client=MagicMock(spec=ServerClient),
+ )
+
+ logger.info("Model loaded and server started successfully")
+ return server
+
+ def reference_free_resources_server(self) -> TranslationCometResourcesServer:
+ logger.info("Spinning up server with reference-free COMET model...")
+
+ server = TranslationCometResourcesServer(
+ config=TranslationCometResourcesServerConfig(
+ host="0.0.0.0",
+ port=8080,
+ entrypoint="",
+ name="",
+ use_reference=False,
+ comet_model_name="Unbabel/wmt22-cometkiwi-da", # reference-free COMET
+ comet_gpu_count=0, # CPU
+ comet_gpu_devices="auto", # CPU
+ model_cache_dir=join(dirname(__file__), "..", "..", "..", "cache", "ptl_cache"),
+ ),
+ server_client=MagicMock(spec=ServerClient),
+ )
+
+ logger.info("Reference-free COMET model loaded and server started successfully")
+ return server
+
+ def _create_response(self, id: str, model_response_text: str) -> dict[str, Any]:
+ return NeMoGymResponse(
+ id=id,
+ created_at=1234.5,
+ model="response_model",
+ object="response",
+ parallel_tool_calls=False,
+ tool_choice="none",
+ tools=[],
+ output=[
+ NeMoGymResponseOutputMessage(
+ id=f"ID for {model_response_text}",
+ role="assistant",
+ status="in_progress",
+ type="message",
+ content=[NeMoGymResponseOutputText(annotations=[], text=model_response_text, type="output_text")],
+ )
+ ],
+ ).model_dump()
+
+ async def test_verify_identical(self, resources_server: TranslationCometResourcesServer) -> None:
+ source_text = "What is the name of your cat?"
+ target_text = "Was ist der Name deiner Katze?"
+ target_lang_name = "German"
+ model_create_params = NeMoGymResponseCreateParamsNonStreaming(
+ input=[
+ {
+ "role": "user",
+ "content": f'Translate this into {target_lang_name}: "{source_text}"',
+ }
+ ]
+ )
+ model_response = NeMoGymResponse(**self._create_response("model_response_id", target_text))
+ identical_verify_request = TranslationCometVerifyRequest(
+ responses_create_params=deepcopy(model_create_params),
+ response=model_response.model_copy(deep=True),
+ src_txt=source_text,
+ trg_txt=target_text,
+ )
+ identical_verify_response = await resources_server.verify(identical_verify_request)
+ assert identical_verify_response.responses_create_params == model_create_params
+ assert identical_verify_response.response == model_response
+ assert identical_verify_response.src_txt == source_text
+ assert identical_verify_response.trg_txt == target_text
+ assert identical_verify_response.reward == approx(1.0, abs=0.05)
+ assert identical_verify_response.extracted_answer == target_text
+
+ assert sorted(list(identical_verify_response.model_dump())) == [
+ "extracted_answer",
+ "response",
+ "responses_create_params",
+ "reward",
+ "src_txt",
+ "trg_txt",
+ ]
+
+ def test_verify_answer_identical(self, resources_server: TranslationCometResourcesServer) -> None:
+ source_text = "What is the name of your cat?"
+ target_text = "Was ist der Name deiner Katze?"
+ model_response_text = target_text
+
+ assert resources_server._verify_answer(source_text, target_text, model_response_text) == (
+ approx(1.0, abs=0.05), # It's a model output so it won't be exact
+ target_text,
+ )
+
+ def test_verify_answer_think_tags(self, resources_server: TranslationCometResourcesServer) -> None:
+ source_text = "What is the name of your cat?"
+ target_text = "Was ist der Name deiner Katze?"
+        model_response_text = f"<think>Let me work out this translation.</think>\n\n{target_text}"
+
+ assert resources_server._verify_answer(source_text, target_text, model_response_text) == (
+ approx(1.0, abs=0.05), # It's a model output so it won't be exact
+ target_text,
+ )
+
+ def test_verify_answer_no_match(self, resources_server: TranslationCometResourcesServer) -> None:
+ source_text = "What is the name of your cat?"
+ target_text = "Was ist der Name deiner Katze?"
+ model_response_text = "Incorrect translation."
+
+ assert resources_server._verify_answer(source_text, target_text, model_response_text) == (
+ approx(0.0, abs=0.5), # This returns about 0.3 in practice but it's fine as long as it's low
+ model_response_text,
+ )
+
+    async def test_verify_identical_reference_free(self) -> None:
+ reference_free_resources_server = self.reference_free_resources_server()
+
+ source_text = "two three"
+ target_text = "zwei drei"
+ target_lang_name = "German"
+ model_create_params = NeMoGymResponseCreateParamsNonStreaming(
+ input=[
+ {
+ "role": "user",
+ "content": f'Translate this into {target_lang_name}: "{source_text}"',
+ }
+ ]
+ )
+ model_response = NeMoGymResponse(**self._create_response("model_response_id", target_text))
+ identical_verify_request = TranslationCometVerifyRequest(
+ responses_create_params=deepcopy(model_create_params),
+ response=model_response.model_copy(deep=True),
+ src_txt=source_text,
+ )
+ identical_verify_response = await reference_free_resources_server.verify(identical_verify_request)
+ assert identical_verify_response.responses_create_params == model_create_params
+ assert identical_verify_response.response == model_response
+ assert identical_verify_response.src_txt == source_text
+ assert identical_verify_response.trg_txt is None
+ assert identical_verify_response.reward == approx(
+ 1.0, abs=0.25
+ ) # It's hard to get a score near 1.0 with the reference-free model
+ assert identical_verify_response.extracted_answer == target_text
+
+ assert sorted(list(identical_verify_response.model_dump())) == [
+ "extracted_answer",
+ "response",
+ "responses_create_params",
+ "reward",
+ "src_txt",
+ "trg_txt", # Should be present but None
+ ]
diff --git a/resources_servers/translation_llm_judge/README.md b/resources_servers/translation_llm_judge/README.md
new file mode 100644
index 000000000..c427b79a4
--- /dev/null
+++ b/resources_servers/translation_llm_judge/README.md
@@ -0,0 +1,10 @@
+# Description
+
+Data links: ?
+
+# Licensing information
+Code: Apache 2.0
+Data: NVIDIA Internal Use Only, Do Not Distribute
+
+Dependencies
+- nemo_gym: Apache 2.0
diff --git a/resources_servers/translation_llm_judge/app.py b/resources_servers/translation_llm_judge/app.py
new file mode 100644
index 000000000..b5e3923f5
--- /dev/null
+++ b/resources_servers/translation_llm_judge/app.py
@@ -0,0 +1,207 @@
+"""
+LLM-as-judge resources server.
+
+Compares a model's generated answer to an expected answer using an LLM judge.
+The judge prompt is fully configurable via server config.
+"""
+
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import re
+from typing import Optional
+
+from fastapi import FastAPI
+from pydantic import BaseModel
+
+from nemo_gym.base_resources_server import (
+ BaseResourcesServerConfig,
+ BaseRunRequest,
+ BaseVerifyRequest,
+ BaseVerifyResponse,
+ SimpleResourcesServer,
+)
+from nemo_gym.config_types import ModelServerRef
+from nemo_gym.openai_utils import (
+ NeMoGymEasyInputMessage,
+ NeMoGymResponse,
+ NeMoGymResponseCreateParamsNonStreaming,
+)
+
+
+class TranslationLLMJudgeResourcesServerConfig(BaseResourcesServerConfig):
+ """Configuration for the LLM judge server.
+
+ - judge_model_server: target model server to use as the judge.
+ - judge_responses_create_params: base create params; input will be set per request.
+ - judge_system_message: optional custom system message for the judge.
+    - judge_prompt_template: prompt template for the judge. Supported placeholders:
+      {generated_text}, {src_txt}, {src_lang}, {trg_lang}, and {trg_txt}
+      ({trg_txt} is filled only when use_reference is True and a reference is provided).
+    - judge_score_extract_regex: regex whose first capture group extracts the judge's score.
+    - judge_max_score: maximum judge score; used to normalize the reward to [0, 1].
+    - reasoning_split_word: delimiter that ends a reasoning block; text before its last
+      occurrence is stripped from the model response before judging.
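+
+    A minimal sketch of a binary (0/1) judge configuration, mirroring the test
+    setup (values here are illustrative, not defaults):
+
+        config = TranslationLLMJudgeResourcesServerConfig(
+            host="0.0.0.0",
+            port=8080,
+            entrypoint="",
+            judge_model_server=ModelServerRef(type="responses_api_models", name="judge"),
+            judge_responses_create_params=NeMoGymResponseCreateParamsNonStreaming(input=[]),
+            judge_prompt_template="Translation check: {src_txt} -> {generated_text}. Answer 1 or 0.",
+            judge_score_extract_regex="(1|0)",
+            judge_max_score=1,
+        )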
+ """
+
+ name: str = "translation_llm_judge"
+ judge_model_server: ModelServerRef
+ judge_responses_create_params: NeMoGymResponseCreateParamsNonStreaming
+
+ judge_system_message: Optional[str] = None
+ judge_prompt_template: str
+ judge_score_extract_regex: str
+ judge_max_score: int
+ use_reference: bool = True # If True, judge_prompt_template should include {trg_text}
+    reasoning_split_word: str = "</think>"
+
+
+class TranslationLLMJudgeRunRequest(BaseRunRequest):
+ src_txt: str
+ src_lang: str # TODO should this be optional? Not all judge prompts will use it
+ trg_lang: str # TODO should this be optional? Not all judge prompts will use it
+ trg_txt: Optional[str] = None
+
+
+class TranslationLLMJudgeVerifyRequest(TranslationLLMJudgeRunRequest, BaseVerifyRequest):
+ pass
+
+
+class TranslationLLMJudgeEvaluation(BaseModel):
+ responses_create_params: NeMoGymResponseCreateParamsNonStreaming
+ response: NeMoGymResponse
+ # Extracted score from judge output
+ score: Optional[float] = None
+
+
+class TranslationLLMJudgeVerifyResponse(BaseVerifyResponse):
+ src_txt: str
+ src_lang: str
+ trg_lang: str
+ trg_txt: Optional[str] = None
+ judge_evaluation: TranslationLLMJudgeEvaluation
+
+
+class TranslationLLMJudgeResourcesServer(SimpleResourcesServer):
+ """Judge-only verifier using an LLM to evaluate translation quality."""
+
+ config: TranslationLLMJudgeResourcesServerConfig
+
+ def setup_webserver(self) -> FastAPI:
+ app = super().setup_webserver()
+ return app
+
+ def _extract_last_assistant_text(self, body: BaseVerifyRequest) -> str:
+ """Extract the last assistant message text from the response.
+
+ - If the assistant message has multiple text blocks, they are joined with newlines.
+        - Any reasoning preceding the configured ``reasoning_split_word`` is stripped.
+ - Returns an empty string when no assistant text is available.
+ """
+ # Return only the last assistant message's text content.
+        for o in reversed(body.response.output):
+            if getattr(o, "type", None) != "message" or getattr(o, "role", None) != "assistant":
+                continue
+            content = getattr(o, "content", None)
+            text = ""
+            if isinstance(content, list):
+                # Some providers split a single assistant message into multiple text blocks.
+                # Join all text blocks to reconstruct the full message text.
+                texts: list[str] = []
+                for c in content:
+                    t = getattr(c, "text", None)
+                    if isinstance(t, str):
+                        texts.append(t)
+                text = "\n".join(texts).strip()
+            elif isinstance(content, str):
+                text = content.strip()
+
+            # Strip thinking if not already removed by a reasoning parser.
+            return self._strip_thinking(text)
+        return ""
+
+ def _strip_thinking(self, model_response: str) -> str:
+ # Strip any thinking
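+        # e.g. "<think>draft reasoning</think>\n\nHallo" -> "Hallo"; taking the last
+        # split segment is a no-op when the response contains no reasoning block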
+ no_think_response = model_response.split(self.config.reasoning_split_word)[-1]
+ no_think_response = no_think_response.strip()
+ return no_think_response
+
+ async def verify(self, body: TranslationLLMJudgeVerifyRequest) -> TranslationLLMJudgeVerifyResponse:
+ generated = self._extract_last_assistant_text(body)
+
+        evaluation = await self._generate_judge_evaluation(
+ generated_text=generated,
+ src_txt=body.src_txt,
+ src_lang=body.src_lang,
+ trg_lang=body.trg_lang,
+ trg_txt=body.trg_txt,
+ )
+
+ payload = body.model_dump()
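+        # Normalize the judge score to [0, 1], e.g. score 1 with judge_max_score 1 -> reward 1.0.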
+        reward = evaluation.score / self.config.judge_max_score
+        return TranslationLLMJudgeVerifyResponse(**payload, reward=reward, judge_evaluation=evaluation)
+
+ async def _generate_judge_evaluation(
+ self, *, generated_text: str, src_txt: str, src_lang: str, trg_lang: str, trg_txt: Optional[str] = None
+ ) -> TranslationLLMJudgeEvaluation:
+ cfg = self.config
+
+ responses_create_params = cfg.judge_responses_create_params.model_copy(deep=True)
+ prompt_template = cfg.judge_prompt_template
+ system_message = cfg.judge_system_message
+
+ if self.config.use_reference and trg_txt is not None:
+ user_prompt = prompt_template.format(
+ generated_text=generated_text, src_txt=src_txt, src_lang=src_lang, trg_lang=trg_lang, trg_txt=trg_txt
+ )
+ else:
+ user_prompt = prompt_template.format(
+ generated_text=generated_text, src_txt=src_txt, src_lang=src_lang, trg_lang=trg_lang
+ )
+
+ msgs: list[NeMoGymEasyInputMessage] = []
+ if system_message is not None and system_message != "":
+ msgs.append(NeMoGymEasyInputMessage(role="system", content=system_message))
+ msgs.append(NeMoGymEasyInputMessage(role="user", content=user_prompt))
+ responses_create_params.input = msgs
+
+ response = await self.server_client.post(
+ server_name=cfg.judge_model_server.name,
+ url_path="/v1/responses",
+ json=responses_create_params,
+ )
+ judge_response = NeMoGymResponse.model_validate(await response.json())
+ eval_record = TranslationLLMJudgeEvaluation(
+ responses_create_params=responses_create_params,
+ response=judge_response,
+ score=0.0,
+ )
+
+ # Parse the last output; fall back to a score of 0 if unexpected.
+ try:
+ last_output = judge_response.output[-1]
+ if getattr(last_output, "type", None) != "message":
+ return eval_record
+ last_content = last_output.content[-1]
+ text = getattr(last_content, "text", "")
+ except Exception:
+ return eval_record
+
+ # Extract the score from the judge output
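+        # (e.g. the VIP config's regex "(1|0)" over a judge reply of "1" yields score 1)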
+ match = re.search(cfg.judge_score_extract_regex, text)
+ if match is None:
+ return eval_record
+ score = int(match.group(1))
+ eval_record.score = score
+ return eval_record
+
+
+if __name__ == "__main__":
+ TranslationLLMJudgeResourcesServer.run_webserver()
diff --git a/resources_servers/translation_llm_judge/configs/translation_llm_judge_vip.yaml b/resources_servers/translation_llm_judge/configs/translation_llm_judge_vip.yaml
new file mode 100644
index 000000000..8a752c9e5
--- /dev/null
+++ b/resources_servers/translation_llm_judge/configs/translation_llm_judge_vip.yaml
@@ -0,0 +1,66 @@
+translation_llm_judge:
+ resources_servers:
+ translation_llm_judge:
+ entrypoint: app.py
+ domain: translation
+ judge_model_server:
+ type: responses_api_models
+ name: policy_model
+ judge_responses_create_params:
+ input: []
+ judge_system_message: "You are a professional translation evaluator."
+ # This uses the VIP process from Cheng et al. 2024 (https://arxiv.org/abs/2407.21646)
+ # as implemented by Ouyang et al. 2025 (NVIDIA internal, not yet published)
+ judge_prompt_template: |-
+ Your task is to assess whether a translation segment successfully conveys the semantic content of the original speech according to the following criteria:
+
+ 1. Key Information Recognition: Identify whether the key information in the source (e.g., proper nouns, keywords, terminologies, or sentence structures) is present in the translation.
+ 2. Correctness Assessment: Determine whether the translation accurately conveys the speaker’s intention, without misinterpretation or contextual errors.
+ 3. Expressiveness Assessment: Evaluate whether the translation is fluent, clear, and intuitive to human readers. It should avoid unnecessary verbosity, ambiguous phrases, or awkward grammar.
+
+ Given a source text and its translation, answer "1" if the translation meets all three criteria and answer "0" otherwise. Only output the answer, no other text.
+
+      <source>
+      {src_txt}
+      </source>
+
+      <translation>
+      {generated_text}
+      </translation>
+ judge_score_extract_regex: "(1|0)"
+ judge_max_score: 1
+translation_llm_judge_simple_agent:
+ responses_api_agents:
+ simple_agent:
+ entrypoint: app.py
+ resources_server:
+ type: resources_servers
+ name: translation_llm_judge
+ model_server:
+ type: responses_api_models
+ name: policy_model
+ datasets:
+ - name: train
+ type: train
+ jsonl_fpath: resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_train.jsonl
+ num_repeats: 1
+ gitlab_identifier:
+ dataset_name: riva_mt_v3_nothinkInSys_train
+ version: 0.0.1
+ artifact_fpath: riva_mt_v3_nothinkInSys_train.jsonl
+ license: NVIDIA Internal Use Only, Do Not Distribute
+ - name: validation
+ type: validation
+ jsonl_fpath: resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_validation.jsonl
+ num_repeats: 1
+ gitlab_identifier:
+ dataset_name: riva_mt_v3_nothinkInSys_validation
+ version: 0.0.1
+ artifact_fpath: riva_mt_v3_nothinkInSys_validation.jsonl
+ license: NVIDIA Internal Use Only, Do Not Distribute
+ - name: example
+ type: example
+ license: NVIDIA Internal Use Only, Do Not Distribute
+ jsonl_fpath: resources_servers/translation_llm_judge/data/example.jsonl
+
+
diff --git a/resources_servers/translation_llm_judge/data/example.jsonl b/resources_servers/translation_llm_judge/data/example.jsonl
new file mode 100644
index 000000000..2c921e654
--- /dev/null
+++ b/resources_servers/translation_llm_judge/data/example.jsonl
@@ -0,0 +1,5 @@
+{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into English. Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```"}]}, "src_txt": "¿Por qué malgastar el tiempo buscando las llaves?", "trg_txt": "Why waste your time looking for the keys?", "src_lang": "es-us", "trg_lang": "en"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt."}]}, "src_txt": "Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "src_lang": "de", "trg_lang": "en"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```"}]}, "src_txt": "All questions have been correctly answered by the treasurer Thomas Kräuchi.", "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "src_lang": "en", "trg_lang": "fr"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```"}]}, "src_txt": "The next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.", "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "src_lang": "en", "trg_lang": "ja"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。"}]}, "src_txt": "比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "src_lang": "zh-cn", "trg_lang": "en"}
\ No newline at end of file
diff --git a/resources_servers/translation_llm_judge/data/example_nothink.jsonl b/resources_servers/translation_llm_judge/data/example_nothink.jsonl
new file mode 100644
index 000000000..42529ab9a
--- /dev/null
+++ b/resources_servers/translation_llm_judge/data/example_nothink.jsonl
@@ -0,0 +1,5 @@
+{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Convert the following text into English. Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```"}]}, "src_txt": "¿Por qué malgastar el tiempo buscando las llaves?", "trg_txt": "Why waste your time looking for the keys?", "src_lang": "es-us", "trg_lang": "en"}
+{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt."}]}, "src_txt": "Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "src_lang": "de", "trg_lang": "en"}
+{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```"}]}, "src_txt": "All questions have been correctly answered by the treasurer Thomas Kräuchi.", "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "src_lang": "en", "trg_lang": "fr"}
+{"responses_create_params": {"input": [{"role": "system", "content": "/no_think"}, {"role": "user", "content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```"}]}, "src_txt": "The next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.", "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "src_lang": "en", "trg_lang": "ja"}
+{"responses_create_params": {"input": [{"role": "system", "content": "/no_think"}, {"role": "user", "content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。"}]}, "src_txt": "比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "src_lang": "zh-cn", "trg_lang": "en"}
\ No newline at end of file
diff --git a/resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_train_metrics.json b/resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_train_metrics.json
new file mode 100644
index 000000000..0d0c53e2e
--- /dev/null
+++ b/resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_train_metrics.json
@@ -0,0 +1,61 @@
+{
+ "name": "train",
+ "type": "train",
+ "jsonl_fpath": "resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_train.jsonl",
+ "num_repeats": 1,
+ "gitlab_identifier": {
+ "dataset_name": "riva_mt_v3_nothinkInSys_train",
+ "version": "0.0.1",
+ "artifact_fpath": "riva_mt_v3_nothinkInSys_train.jsonl"
+ },
+ "license": "NVIDIA Internal Use Only, Do Not Distribute",
+ "Number of examples": 770000,
+ "Number of tools": {
+ "Total # non-null values": 0,
+ "Average": 0.0,
+ "Min": 0.0,
+ "Max": 0.0,
+ "Median": 0.0,
+ "Standard deviation": 0.0
+ },
+ "Json-dumped number of words (proxy for token count)": {
+ "Total # non-null values": 770000,
+ "Average": 26.18,
+ "Min": 10.0,
+ "Max": 267.0,
+ "Median": 24.44,
+ "Standard deviation": 9.86
+ },
+ "Number of turns": {
+ "Total # non-null values": 770000,
+ "Average": 1.0,
+ "Min": 1.0,
+ "Max": 1.0,
+ "Median": 1.0,
+ "Standard deviation": 0.0
+ },
+ "Temperature": {
+ "Total # non-null values": 0,
+ "Average": 0.0,
+ "Min": 0.0,
+ "Max": 0.0,
+ "Median": 0.0,
+ "Standard deviation": 0.0
+ },
+ "src_txt": {
+ "unique_count": 767619,
+ "total_count": 770000
+ },
+ "trg_txt": {
+ "unique_count": 767567,
+ "total_count": 770000
+ },
+ "src_lang": {
+ "unique_count": 12,
+ "total_count": 770000
+ },
+ "trg_lang": {
+ "unique_count": 12,
+ "total_count": 770000
+ }
+}
\ No newline at end of file
diff --git a/resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_validation_metrics.json b/resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_validation_metrics.json
new file mode 100644
index 000000000..bc35bb4d6
--- /dev/null
+++ b/resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_validation_metrics.json
@@ -0,0 +1,61 @@
+{
+ "name": "validation",
+ "type": "validation",
+ "jsonl_fpath": "resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_validation.jsonl",
+ "num_repeats": 1,
+ "gitlab_identifier": {
+ "dataset_name": "riva_mt_v3_nothinkInSys_validation",
+ "version": "0.0.1",
+ "artifact_fpath": "riva_mt_v3_nothinkInSys_validation.jsonl"
+ },
+ "license": "NVIDIA Internal Use Only, Do Not Distribute",
+ "Number of examples": 19940,
+ "Number of tools": {
+ "Total # non-null values": 0,
+ "Average": 0.0,
+ "Min": 0.0,
+ "Max": 0.0,
+ "Median": 0.0,
+ "Standard deviation": 0.0
+ },
+ "Json-dumped number of words (proxy for token count)": {
+ "Total # non-null values": 19940,
+ "Average": 32.68,
+ "Min": 10.0,
+ "Max": 82.0,
+ "Median": 33.05,
+ "Standard deviation": 10.38
+ },
+ "Number of turns": {
+ "Total # non-null values": 19940,
+ "Average": 1.0,
+ "Min": 1.0,
+ "Max": 1.0,
+ "Median": 1.0,
+ "Standard deviation": 0.0
+ },
+ "Temperature": {
+ "Total # non-null values": 0,
+ "Average": 0.0,
+ "Min": 0.0,
+ "Max": 0.0,
+ "Median": 0.0,
+ "Standard deviation": 0.0
+ },
+ "src_txt": {
+ "unique_count": 10965,
+ "total_count": 19940
+ },
+ "trg_txt": {
+ "unique_count": 10965,
+ "total_count": 19940
+ },
+ "src_lang": {
+ "unique_count": 11,
+ "total_count": 19940
+ },
+ "trg_lang": {
+ "unique_count": 11,
+ "total_count": 19940
+ }
+}
\ No newline at end of file
diff --git a/resources_servers/translation_llm_judge/requirements.txt b/resources_servers/translation_llm_judge/requirements.txt
new file mode 100644
index 000000000..5042d4648
--- /dev/null
+++ b/resources_servers/translation_llm_judge/requirements.txt
@@ -0,0 +1,2 @@
+-e nemo-gym[dev] @ ../../
+
diff --git a/resources_servers/translation_llm_judge/tests/test_app.py b/resources_servers/translation_llm_judge/tests/test_app.py
new file mode 100644
index 000000000..3f5f3ade3
--- /dev/null
+++ b/resources_servers/translation_llm_judge/tests/test_app.py
@@ -0,0 +1,158 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from copy import deepcopy
+from pathlib import Path
+from typing import Any
+from unittest.mock import AsyncMock, MagicMock
+
+from omegaconf import OmegaConf
+from pytest import approx, fixture
+
+from nemo_gym.config_types import ModelServerRef
+from nemo_gym.openai_utils import (
+ NeMoGymResponse,
+ NeMoGymResponseCreateParamsNonStreaming,
+ NeMoGymResponseOutputItem,
+ NeMoGymResponseOutputMessage,
+ NeMoGymResponseOutputText,
+)
+from nemo_gym.server_utils import ServerClient
+from resources_servers.translation_llm_judge.app import (
+ TranslationLLMJudgeResourcesServer,
+ TranslationLLMJudgeResourcesServerConfig,
+ TranslationLLMJudgeVerifyRequest,
+)
+
+
+class TestApp:
+ @fixture
+ def config(self) -> TranslationLLMJudgeResourcesServerConfig:
+ # Load judge template from YAML so tests mirror runtime config
+ yaml_path = Path(__file__).resolve().parents[1] / "configs" / "translation_llm_judge_vip.yaml"
+ yaml_cfg = OmegaConf.load(str(yaml_path))
+ judge_template: str = (
+ yaml_cfg.translation_llm_judge.resources_servers.translation_llm_judge.judge_prompt_template
+ )
+ judge_score_extract_regex: str = (
+ yaml_cfg.translation_llm_judge.resources_servers.translation_llm_judge.judge_score_extract_regex
+ )
+ judge_max_score: int = yaml_cfg.translation_llm_judge.resources_servers.translation_llm_judge.judge_max_score
+
+ cfg = TranslationLLMJudgeResourcesServerConfig(
+ host="0.0.0.0",
+ port=8080,
+ entrypoint="",
+ judge_model_server=ModelServerRef(type="responses_api_models", name="judge"),
+ judge_responses_create_params=NeMoGymResponseCreateParamsNonStreaming(input=[]),
+ judge_prompt_template=judge_template,
+ judge_score_extract_regex=judge_score_extract_regex,
+ judge_max_score=judge_max_score,
+ )
+ return cfg
+
+ def _create_response(self, id: str, output_item: NeMoGymResponseOutputItem) -> dict[str, Any]:
+ return NeMoGymResponse(
+ id=id,
+ created_at=123.0,
+ model="judge_model",
+ object="response",
+ output=[output_item],
+ parallel_tool_calls=False,
+ tool_choice="none",
+ tools=[],
+ ).model_dump()
+
+ def _msg(self, text: str) -> NeMoGymResponseOutputMessage:
+ return NeMoGymResponseOutputMessage(
+ id="msg_id",
+ content=[NeMoGymResponseOutputText(annotations=[], text=text, type="output_text")],
+ role="assistant",
+ status="completed",
+ type="message",
+ )
+
+ async def test_verify_identical(self, config: TranslationLLMJudgeResourcesServerConfig) -> None:
+ server_mock = MagicMock(spec=ServerClient)
+ rs = TranslationLLMJudgeResourcesServer(config=config, server_client=server_mock)
+
+ post_mock = MagicMock()
+ post_mock.json = AsyncMock()
+ server_mock.post = AsyncMock(return_value=post_mock)
+
+ post_mock.json.side_effect = [
+ self._create_response("first", self._msg("1")),
+ ]
+
+ src_txt = "One two three"
+ trg_txt = "Eins zwei drei"
+ src_lang = "en"
+ trg_lang = "de"
+
+ model_create_params = NeMoGymResponseCreateParamsNonStreaming(input=[{"role": "user", "content": "Q: 1+1?"}])
+ model_response = NeMoGymResponse(
+ id="resp",
+ created_at=0.0,
+ model="m",
+ object="response",
+ output=[self._msg(trg_txt)],
+ parallel_tool_calls=False,
+ tool_choice="none",
+ tools=[],
+ )
+
+ req = TranslationLLMJudgeVerifyRequest(
+ responses_create_params=deepcopy(model_create_params),
+ response=model_response.model_copy(deep=True),
+ src_txt=src_txt,
+ trg_txt=trg_txt,
+ src_lang=src_lang,
+ trg_lang=trg_lang,
+ )
+ res = await rs.verify(req)
+ assert res.reward == approx(1.0)
+ assert res.src_txt == src_txt
+ assert res.trg_txt == trg_txt
+ assert res.src_lang == src_lang
+ assert res.trg_lang == trg_lang
+ assert res.judge_evaluation.score == 1
+
+ async def test_unexpected_judge_output_defaults_to_zero(
+ self, config: TranslationLLMJudgeResourcesServerConfig
+ ) -> None:
+ server_mock = MagicMock(spec=ServerClient)
+ rs = TranslationLLMJudgeResourcesServer(config=config, server_client=server_mock)
+
+ post_mock = MagicMock()
+ post_mock.json = AsyncMock(return_value=self._create_response("f", self._msg("no label present")))
+ server_mock.post = AsyncMock(return_value=post_mock)
+
+ req = TranslationLLMJudgeVerifyRequest(
+ responses_create_params=NeMoGymResponseCreateParamsNonStreaming(input=[]),
+ response=NeMoGymResponse(
+ id="r",
+ created_at=0.0,
+ model="m",
+ object="response",
+ output=[self._msg("text")],
+ parallel_tool_calls=False,
+ tool_choice="none",
+ tools=[],
+ ),
+ src_txt="One two three",
+ trg_txt="Eins zwei drei",
+ src_lang="en",
+ trg_lang="de",
+ )
+ res = await rs.verify(req)
+ assert res.reward == approx(0.0)
diff --git a/resources_servers/translation_metricx/README.md b/resources_servers/translation_metricx/README.md
new file mode 100644
index 000000000..654132fd8
--- /dev/null
+++ b/resources_servers/translation_metricx/README.md
@@ -0,0 +1,11 @@
+# Description
+
+Data links: ?
+
+# Licensing information
+Code: Apache 2.0
+Data: NVIDIA Internal Use Only, Do Not Distribute
+
+Dependencies
+- nemo_gym: Apache 2.0
+- google-research/metricx: Apache 2.0
diff --git a/resources_servers/translation_metricx/app.py b/resources_servers/translation_metricx/app.py
new file mode 100644
index 000000000..dcb17d9bf
--- /dev/null
+++ b/resources_servers/translation_metricx/app.py
@@ -0,0 +1,196 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from pathlib import Path
+from typing import Any, Optional
+
+import datasets
+import transformers
+from fastapi import FastAPI
+from metricx24.models import MT5ForRegression
+
+from nemo_gym import CACHE_DIR
+from nemo_gym.base_resources_server import (
+ BaseResourcesServerConfig,
+ BaseVerifyRequest,
+ BaseVerifyResponse,
+ SimpleResourcesServer,
+)
+
+
+class TranslationMetricxResourcesServerConfig(BaseResourcesServerConfig):
+ """
+ Configuration for the TranslationMetricxResourcesServer.
+
+ Attributes:
+ use_reference (bool): Whether to use a reference translation
+ metricx_model_name (str): The MetricX model name to use. The default "google/metricx-24-hybrid-large-v2p6-bfloat16"
+ is the smallest model at 1.2B parameters.
+ tokenizer_name (str): The name of the mT5 tokenizer to use with the MetricX model. Size must match MetricX model.
+ For the default model above, use "google/mt5-large".
+ device_map (str): Device placement for the model. Options include "cpu", specific GPU (e.g., "cuda:1"),
+ "auto", "balanced", "balanced_low_0", "sequential".
+ max_input_length (int): Maximum input sequence length (see MetricX documentation, default 1536)
+        output_dir (str): Output directory required by the Trainer. Nothing is written
+            during prediction, but the argument is mandatory.
+        reasoning_split_word (str): Delimiter that ends a reasoning block; only text after
+            its last occurrence is scored (default "</think>").
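+
+    A minimal sketch of a CPU setup, mirroring the test configuration (values are
+    illustrative):
+
+        config = TranslationMetricxResourcesServerConfig(
+            host="0.0.0.0",
+            port=8080,
+            entrypoint="",
+            name="",
+            use_reference=True,
+            metricx_model_name="google/metricx-24-hybrid-large-v2p6-bfloat16",
+            tokenizer_name="google/mt5-large",
+            device_map="cpu",
+        )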
+ """
+
+ use_reference: bool = True
+ metricx_model_name: str = "google/metricx-24-hybrid-large-v2p6-bfloat16"
+ tokenizer_name: str = "google/mt5-large"
+ device_map: str = "cpu"
+ max_input_length: int = 1536
+ output_dir: str = str(Path(CACHE_DIR) / "metricx_output")
+    reasoning_split_word: str = "</think>"
+
+
+class TranslationMetricxVerifyRequest(BaseVerifyRequest):
+ src_txt: str
+ trg_txt: Optional[str] = None
+
+
+class TranslationMetricxVerifyResponse(BaseVerifyResponse):
+ src_txt: str
+ trg_txt: Optional[str] = None
+ extracted_answer: str
+
+
+class TranslationMetricxResourcesServer(SimpleResourcesServer):
+ config: TranslationMetricxResourcesServerConfig
+
+ def model_post_init(self, context: Any) -> None:
+ super().model_post_init(context)
+
+ # Load tokenizer (MetricX models use MT5 tokenizers, separate from the model name)
+ tokenizer = transformers.AutoTokenizer.from_pretrained(self.config.tokenizer_name)
+ self._tokenizer = tokenizer
+
+ # Load model with device placement
+ model = MT5ForRegression.from_pretrained(
+ self.config.metricx_model_name, torch_dtype="auto", device_map=self.config.device_map
+ )
+ # Inputs should go to the device where the first layer is
+ # Get device from the first model parameter
+ self._inputs_device = next(model.parameters()).device
+
+ model.eval()
+ self._metricx_model = model
+
+ # Ensure output directory exists (following predict.py lines 167-169)
+ os.makedirs(self.config.output_dir, exist_ok=True)
+
+ # Create trainer
+ training_args = transformers.TrainingArguments(
+ output_dir=self.config.output_dir,
+ per_device_eval_batch_size=1,
+ dataloader_pin_memory=False,
+ )
+ trainer = transformers.Trainer(
+ model=model,
+ args=training_args,
+ )
+ self._metricx_trainer = trainer
+
+ def setup_webserver(self) -> FastAPI:
+ app = super().setup_webserver()
+
+ # Additional server routes go here! e.g.:
+ # app.post("/get_weather")(self.get_weather)
+
+ return app
+
+ async def verify(self, body: TranslationMetricxVerifyRequest) -> TranslationMetricxVerifyResponse:
+ assistant_responses = []
+ for output_item in body.response.output:
+ if output_item.type != "message":
+ continue
+
+ for content_item in output_item.content:
+ if content_item.type != "output_text":
+ continue
+
+ assistant_responses.append(content_item.text)
+
+ combined_response = "".join(assistant_responses)
+
+ (reward, extracted_answer) = self._verify_answer(
+ model_response=combined_response, source_text=body.src_txt, target_text=body.trg_txt
+ )
+
+ return TranslationMetricxVerifyResponse(**body.model_dump(), extracted_answer=extracted_answer, reward=reward)
+
+ def _verify_answer(
+ self, model_response: str, source_text: str, target_text: Optional[str] = None
+ ) -> tuple[float, str]:
+ extracted_answer = self._extract_answer(model_response)
+ ds = self._create_dataset_from_example(extracted_answer, source_text, target_text)
+ predictions, _, _ = self._metricx_trainer.predict(test_dataset=ds)
+ score = float(predictions[0])
+
+ # MetricX scores are between 0 and 25, where 25 is worst, so we normalize to 0 to 1 where 0 is worst
+ reward = (25 - score) / 25.0
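+        # e.g. a raw MetricX score of 2.5 maps to a reward of (25 - 2.5) / 25 = 0.9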
+ return reward, extracted_answer
+
+ def _create_dataset_from_example(
+ self, hypothesis: str, source_text: str, reference_text: Optional[str] = None
+ ) -> datasets.Dataset:
+ """Create a dataset from a single example, following get_dataset logic from predict.py."""
+ # Create input string based on reference ("QE") mode (QE is when use_reference is False)
+ if not self.config.use_reference or reference_text is None:
+ input_text = "source: " + source_text + " candidate: " + hypothesis
+ else:
+ input_text = "source: " + source_text + " candidate: " + hypothesis + " reference: " + reference_text
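+        # e.g. reference mode yields: "source: Hallo Welt candidate: Hello world reference: Hello world"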
+
+ # Tokenize (returns dict with lists)
+ tokenized = self._tokenizer(
+ input_text,
+ max_length=self.config.max_input_length,
+ truncation=True,
+ padding=False,
+ )
+
+ # Create a single example dict (matching the format that predict.py creates)
+ example_dict = {
+ "input_ids": tokenized["input_ids"],
+ "attention_mask": tokenized["attention_mask"],
+ }
+
+ # If last token is EOS, remove it (following predict.py _remove_eos function logic)
+ eos_token_id = self._tokenizer.eos_token_id
+ if eos_token_id is not None and example_dict["input_ids"][-1] == eos_token_id:
+ example_dict["input_ids"] = example_dict["input_ids"][:-1]
+ example_dict["attention_mask"] = example_dict["attention_mask"][:-1]
+
+ # Create dataset from a list of examples (each example is one dict)
+ # Following predict.py structure: ds.map() operations create per-example dicts
+ ds = datasets.Dataset.from_list([example_dict])
+
+ # Set format to torch and move to device (following predict.py line 119-124)
+ ds.set_format(
+ type="torch",
+ columns=["input_ids", "attention_mask"],
+ device=self._inputs_device,
+ output_all_columns=True,
+ )
+
+ return ds
+
+ def _extract_answer(self, model_response: str) -> str:
+ # Strip any thinking
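+        # e.g. "<think>draft</think>\n\nHallo Welt" -> "Hallo Welt"; responses with no
+        # reasoning block pass through unchanged apart from whitespace stripping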
+ no_think_response = model_response.split(self.config.reasoning_split_word)[-1]
+ no_think_response = no_think_response.strip()
+ return no_think_response
+
+
+if __name__ == "__main__":
+ TranslationMetricxResourcesServer.run_webserver()
diff --git a/resources_servers/translation_metricx/configs/translation_metricx.yaml b/resources_servers/translation_metricx/configs/translation_metricx.yaml
new file mode 100644
index 000000000..8a8b18bcc
--- /dev/null
+++ b/resources_servers/translation_metricx/configs/translation_metricx.yaml
@@ -0,0 +1,42 @@
+translation_metricx:
+ resources_servers:
+ translation_metricx:
+ entrypoint: app.py
+ domain: translation
+ use_reference: true
+ metricx_model_name: google/metricx-24-hybrid-xl-v2p6
+ tokenizer_name: google/mt5-large
+ max_input_length: 1536
+translation_metricx_simple_agent:
+ responses_api_agents:
+ simple_agent:
+ entrypoint: app.py
+ resources_server:
+ type: resources_servers
+ name: translation_metricx
+ model_server:
+ type: responses_api_models
+ name: policy_model
+ datasets:
+ - name: train
+ type: train
+ jsonl_fpath: resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_train.jsonl
+ num_repeats: 1
+ gitlab_identifier:
+ dataset_name: riva_mt_v3_nothinkInSys_train
+ version: 0.0.1
+ artifact_fpath: riva_mt_v3_nothinkInSys_train.jsonl
+ license: NVIDIA Internal Use Only, Do Not Distribute
+ - name: validation
+ type: validation
+ jsonl_fpath: resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_validation.jsonl
+ num_repeats: 1
+ gitlab_identifier:
+ dataset_name: riva_mt_v3_nothinkInSys_validation
+ version: 0.0.1
+ artifact_fpath: riva_mt_v3_nothinkInSys_validation.jsonl
+ license: NVIDIA Internal Use Only, Do Not Distribute
+ - name: example
+ type: example
+ jsonl_fpath: resources_servers/translation_metricx/data/example.jsonl
+ num_repeats: 1
diff --git a/resources_servers/translation_metricx/data/.gitignore b/resources_servers/translation_metricx/data/.gitignore
new file mode 100644
index 000000000..4424b6fde
--- /dev/null
+++ b/resources_servers/translation_metricx/data/.gitignore
@@ -0,0 +1,5 @@
+*train.jsonl
+*validation.jsonl
+*train_prepare.jsonl
+*validation_prepare.jsonl
+*example_prepare.jsonl
diff --git a/resources_servers/translation_metricx/data/example.jsonl b/resources_servers/translation_metricx/data/example.jsonl
new file mode 100644
index 000000000..2c921e654
--- /dev/null
+++ b/resources_servers/translation_metricx/data/example.jsonl
@@ -0,0 +1,5 @@
+{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into English. Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```"}]}, "src_txt": "¿Por qué malgastar el tiempo buscando las llaves?", "trg_txt": "Why waste your time looking for the keys?", "src_lang": "es-us", "trg_lang": "en"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt."}]}, "src_txt": "Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "src_lang": "de", "trg_lang": "en"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```"}]}, "src_txt": "All questions have been correctly answered by the treasurer Thomas Kräuchi.", "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "src_lang": "en", "trg_lang": "fr"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```"}]}, "src_txt": "The next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.", "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "src_lang": "en", "trg_lang": "ja"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。"}]}, "src_txt": "比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "src_lang": "zh-cn", "trg_lang": "en"}
\ No newline at end of file
diff --git a/resources_servers/translation_metricx/data/example_nothink.jsonl b/resources_servers/translation_metricx/data/example_nothink.jsonl
new file mode 100644
index 000000000..42529ab9a
--- /dev/null
+++ b/resources_servers/translation_metricx/data/example_nothink.jsonl
@@ -0,0 +1,5 @@
+{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Convert the following text into English. Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```"}]}, "src_txt": "¿Por qué malgastar el tiempo buscando las llaves?", "trg_txt": "Why waste your time looking for the keys?", "src_lang": "es-us", "trg_lang": "en"}
+{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt."}]}, "src_txt": "Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "src_lang": "de", "trg_lang": "en"}
+{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```"}]}, "src_txt": "All questions have been correctly answered by the treasurer Thomas Kräuchi.", "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "src_lang": "en", "trg_lang": "fr"}
+{"responses_create_params": {"input": [{"role": "system", "content": "/no_think"}, {"role": "user", "content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```"}]}, "src_txt": "The next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.", "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "src_lang": "en", "trg_lang": "ja"}
+{"responses_create_params": {"input": [{"role": "system", "content": "/no_think"}, {"role": "user", "content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。"}]}, "src_txt": "比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "src_lang": "zh-cn", "trg_lang": "en"}
\ No newline at end of file
diff --git a/resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_train_metrics.json b/resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_train_metrics.json
new file mode 100644
index 000000000..146d1f6f9
--- /dev/null
+++ b/resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_train_metrics.json
@@ -0,0 +1,61 @@
+{
+ "name": "train",
+ "type": "train",
+ "jsonl_fpath": "resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_train.jsonl",
+ "num_repeats": 1,
+ "gitlab_identifier": {
+ "dataset_name": "riva_mt_v3_nothinkInSys_train",
+ "version": "0.0.1",
+ "artifact_fpath": "riva_mt_v3_nothinkInSys_train.jsonl"
+ },
+ "license": "NVIDIA Internal Use Only, Do Not Distribute",
+ "Number of examples": 770000,
+ "Number of tools": {
+ "Total # non-null values": 0,
+ "Average": 0.0,
+ "Min": 0.0,
+ "Max": 0.0,
+ "Median": 0.0,
+ "Standard deviation": 0.0
+ },
+ "Json-dumped number of words (proxy for token count)": {
+ "Total # non-null values": 770000,
+ "Average": 26.18,
+ "Min": 10.0,
+ "Max": 267.0,
+ "Median": 24.44,
+ "Standard deviation": 9.86
+ },
+ "Number of turns": {
+ "Total # non-null values": 770000,
+ "Average": 1.0,
+ "Min": 1.0,
+ "Max": 1.0,
+ "Median": 1.0,
+ "Standard deviation": 0.0
+ },
+ "Temperature": {
+ "Total # non-null values": 0,
+ "Average": 0.0,
+ "Min": 0.0,
+ "Max": 0.0,
+ "Median": 0.0,
+ "Standard deviation": 0.0
+ },
+ "src_txt": {
+ "unique_count": 767619,
+ "total_count": 770000
+ },
+ "trg_txt": {
+ "unique_count": 767567,
+ "total_count": 770000
+ },
+ "src_lang": {
+ "unique_count": 12,
+ "total_count": 770000
+ },
+ "trg_lang": {
+ "unique_count": 12,
+ "total_count": 770000
+ }
+}
\ No newline at end of file
diff --git a/resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_validation_metrics.json b/resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_validation_metrics.json
new file mode 100644
index 000000000..9070a8dee
--- /dev/null
+++ b/resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_validation_metrics.json
@@ -0,0 +1,61 @@
+{
+ "name": "validation",
+ "type": "validation",
+ "jsonl_fpath": "resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_validation.jsonl",
+ "num_repeats": 1,
+ "gitlab_identifier": {
+ "dataset_name": "riva_mt_v3_nothinkInSys_validation",
+ "version": "0.0.1",
+ "artifact_fpath": "riva_mt_v3_nothinkInSys_validation.jsonl"
+ },
+ "license": "NVIDIA Internal Use Only, Do Not Distribute",
+ "Number of examples": 19940,
+ "Number of tools": {
+ "Total # non-null values": 0,
+ "Average": 0.0,
+ "Min": 0.0,
+ "Max": 0.0,
+ "Median": 0.0,
+ "Standard deviation": 0.0
+ },
+ "Json-dumped number of words (proxy for token count)": {
+ "Total # non-null values": 19940,
+ "Average": 32.68,
+ "Min": 10.0,
+ "Max": 82.0,
+ "Median": 33.05,
+ "Standard deviation": 10.38
+ },
+ "Number of turns": {
+ "Total # non-null values": 19940,
+ "Average": 1.0,
+ "Min": 1.0,
+ "Max": 1.0,
+ "Median": 1.0,
+ "Standard deviation": 0.0
+ },
+ "Temperature": {
+ "Total # non-null values": 0,
+ "Average": 0.0,
+ "Min": 0.0,
+ "Max": 0.0,
+ "Median": 0.0,
+ "Standard deviation": 0.0
+ },
+ "src_txt": {
+ "unique_count": 10965,
+ "total_count": 19940
+ },
+ "trg_txt": {
+ "unique_count": 10965,
+ "total_count": 19940
+ },
+ "src_lang": {
+ "unique_count": 11,
+ "total_count": 19940
+ },
+ "trg_lang": {
+ "unique_count": 11,
+ "total_count": 19940
+ }
+}
\ No newline at end of file
diff --git a/resources_servers/translation_metricx/requirements.txt b/resources_servers/translation_metricx/requirements.txt
new file mode 100644
index 000000000..3950124ba
--- /dev/null
+++ b/resources_servers/translation_metricx/requirements.txt
@@ -0,0 +1,2 @@
+-e nemo-gym[dev] @ ../../
+metricx @ git+https://github.com/hrossnv/metricx.git@main
\ No newline at end of file
diff --git a/resources_servers/translation_metricx/tests/test_app.py b/resources_servers/translation_metricx/tests/test_app.py
new file mode 100644
index 000000000..0cafd5f9a
--- /dev/null
+++ b/resources_servers/translation_metricx/tests/test_app.py
@@ -0,0 +1,199 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import os
+from copy import deepcopy
+from pathlib import Path
+from typing import Any
+from unittest.mock import MagicMock
+
+from pytest import approx, fixture
+
+from nemo_gym import CACHE_DIR
+from nemo_gym.openai_utils import (
+ NeMoGymResponse,
+ NeMoGymResponseCreateParamsNonStreaming,
+ NeMoGymResponseOutputMessage,
+ NeMoGymResponseOutputText,
+)
+from nemo_gym.server_utils import ServerClient
+from resources_servers.translation_metricx.app import (
+ TranslationMetricxResourcesServer,
+ TranslationMetricxResourcesServerConfig,
+ TranslationMetricxVerifyRequest,
+)
+
+
+logger = logging.getLogger(__name__)
+
+os.environ["HF_HOME"] = str(Path(CACHE_DIR) / "hf_cache")
+
+
+class TestApp:
+ @fixture(scope="class")
+ def resources_server(self) -> TranslationMetricxResourcesServer:
+ """We only want to spin up the server once since it has to load the model."""
+ logger.info("Spinning up server with MetricX model...")
+
+ server = TranslationMetricxResourcesServer(
+ config=TranslationMetricxResourcesServerConfig(
+ host="0.0.0.0",
+ port=8080,
+ entrypoint="",
+ name="",
+ use_reference=True,
+ # 1.2B parameter model runs fine on CPU, though tests will take a couple of minutes
+ metricx_model_name="google/metricx-24-hybrid-large-v2p6-bfloat16",
+ tokenizer_name="google/mt5-large",
+ device_map="cpu",
+ max_input_length=1536,
+ output_dir=str(Path(CACHE_DIR) / "metricx_output"),
+ ),
+ server_client=MagicMock(spec=ServerClient),
+ )
+
+ logger.info("Model loaded and server started successfully")
+ return server
+
+ def _create_response(self, id: str, model_response_text: str) -> dict[str, Any]:
+ return NeMoGymResponse(
+ id=id,
+ created_at=1234.5,
+ model="response_model",
+ object="response",
+ parallel_tool_calls=False,
+ tool_choice="none",
+ tools=[],
+ output=[
+ NeMoGymResponseOutputMessage(
+ id=f"ID for {model_response_text}",
+ role="assistant",
+ status="in_progress",
+ type="message",
+ content=[NeMoGymResponseOutputText(annotations=[], text=model_response_text, type="output_text")],
+ )
+ ],
+ ).model_dump()
+
+ async def test_verify_identical(self, resources_server: TranslationMetricxResourcesServer) -> None:
+ source_text = "What is the name of your cat?"
+ target_text = "Was ist der Name deiner Katze?"
+ target_lang_name = "German"
+ model_create_params = NeMoGymResponseCreateParamsNonStreaming(
+ input=[
+ {
+ "role": "user",
+ "content": f'Translate this into {target_lang_name}: "{source_text}"',
+ }
+ ]
+ )
+ model_response = NeMoGymResponse(**self._create_response("model_response_id", target_text))
+ identical_verify_request = TranslationMetricxVerifyRequest(
+ responses_create_params=deepcopy(model_create_params),
+ response=model_response.model_copy(deep=True),
+ src_txt=source_text,
+ trg_txt=target_text,
+ )
+ identical_verify_response = await resources_server.verify(identical_verify_request)
+ assert identical_verify_response.responses_create_params == model_create_params
+ assert identical_verify_response.response == model_response
+ assert identical_verify_response.src_txt == source_text
+ assert identical_verify_response.trg_txt == target_text
+ assert identical_verify_response.reward == approx(1.0, abs=0.1)
+ assert identical_verify_response.extracted_answer == target_text
+
+ assert sorted(list(identical_verify_response.model_dump())) == [
+ "extracted_answer",
+ "response",
+ "responses_create_params",
+ "reward",
+ "src_txt",
+ "trg_txt",
+ ]
+
+ async def test_verify_identical_without_reference(
+ self, resources_server: TranslationMetricxResourcesServer
+ ) -> None:
+ source_text = "two three"
+ target_text = "zwei drei"
+ target_lang_name = "German"
+ model_create_params = NeMoGymResponseCreateParamsNonStreaming(
+ input=[
+ {
+ "role": "user",
+ "content": f'Translate this into {target_lang_name}: "{source_text}"',
+ }
+ ]
+ )
+ model_response = NeMoGymResponse(**self._create_response("model_response_id", target_text))
+ identical_verify_request = TranslationMetricxVerifyRequest(
+ responses_create_params=deepcopy(model_create_params),
+ response=model_response.model_copy(deep=True),
+ src_txt=source_text,
+ trg_txt=None, # Technically the model config is set up to use a reference but this triggers the same behavior
+ )
+ identical_verify_response = await resources_server.verify(identical_verify_request)
+ assert identical_verify_response.responses_create_params == model_create_params
+ assert identical_verify_response.response == model_response
+ assert identical_verify_response.src_txt == source_text
+ assert identical_verify_response.trg_txt is None
+ assert identical_verify_response.reward == approx(1.0, abs=0.1)
+ assert identical_verify_response.extracted_answer == target_text
+
+ assert sorted(list(identical_verify_response.model_dump())) == [
+ "extracted_answer",
+ "response",
+ "responses_create_params",
+ "reward",
+ "src_txt",
+ "trg_txt",
+ ]
+
+ def test_verify_answer_identical(self, resources_server: TranslationMetricxResourcesServer) -> None:
+ source_text = "two three"
+ target_text = "zwei drei"
+ model_response_text = target_text
+
+ assert resources_server._verify_answer(model_response_text, source_text, target_text) == (
+ approx(1.0, abs=0.1), # It's a model output so it won't be exact
+ target_text,
+ )
+
+ def test_verify_answer_think_tags(self, resources_server: TranslationMetricxResourcesServer) -> None:
+ source_text = "What is the name of your cat?"
+ target_text = "Was ist der Name deiner Katze?"
+        model_response_text = f"<think>Let me work out this translation.</think>\n\n{target_text}"
+
+ assert resources_server._verify_answer(model_response_text, source_text, target_text) == (
+ approx(1.0, abs=0.1), # It's a model output so it won't be exact
+ target_text,
+ )
+
+ def test_verify_answer_no_match(self, resources_server: TranslationMetricxResourcesServer) -> None:
+ source_text = "What is the name of your cat?"
+ target_text = "Was ist der Name deiner Katze?"
+ model_response_text = "Incorrect translation."
+
+ reward, extracted_answer = resources_server._verify_answer(model_response_text, source_text, target_text)
+ assert reward <= 0.6 # Raw score is around 10 for this example, where 25 is worst
+ assert extracted_answer == model_response_text
+
+ def test_verify_answer_without_reference(self, resources_server: TranslationMetricxResourcesServer) -> None:
+ source_text = "two three"
+ model_response_text = "zwei drei"
+
+ assert resources_server._verify_answer(model_response_text, source_text) == (
+ approx(1.0, abs=0.1), # It's a model output so it won't be exact
+ model_response_text,
+ )