diff --git a/.gitignore b/.gitignore index aaae2d2db..a9b67e00e 100644 --- a/.gitignore +++ b/.gitignore @@ -16,7 +16,7 @@ fastspeech_output .bash_history.local # Byte-compiled / optimized / DLL files -__pycache__/ +**/__pycache__/ *.py[cod] *$py.class **.pyc diff --git a/nemo_gym/config_types.py b/nemo_gym/config_types.py index b340081c7..dd1b18831 100644 --- a/nemo_gym/config_types.py +++ b/nemo_gym/config_types.py @@ -312,6 +312,7 @@ class DatasetConfig(BaseModel): Literal["MIT"], Literal["Creative Commons Attribution 4.0 International"], Literal["Creative Commons Attribution-ShareAlike 4.0 International"], + Literal["NVIDIA Internal Use Only, Do Not Distribute"], Literal["TBD"], Literal["MIT"], ] diff --git a/resources_servers/translation_bleu/README.md b/resources_servers/translation_bleu/README.md new file mode 100644 index 000000000..6a7d6bffc --- /dev/null +++ b/resources_servers/translation_bleu/README.md @@ -0,0 +1,11 @@ +# Description + +Data links: ? + +# Licensing information +Code: Apache 2.0 +Data: NVIDIA Internal Use Only, Do Not Distribute + +# Dependencies +- nemo_gym: Apache 2.0 +- sacrebleu: Apache 2.0 diff --git a/resources_servers/translation_bleu/app.py b/resources_servers/translation_bleu/app.py new file mode 100644 index 000000000..6cb85a1bb --- /dev/null +++ b/resources_servers/translation_bleu/app.py @@ -0,0 +1,110 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict + +from fastapi import FastAPI +from sacrebleu.metrics import BLEU + +from nemo_gym.base_resources_server import ( + BaseResourcesServerConfig, + BaseVerifyRequest, + BaseVerifyResponse, + SimpleResourcesServer, +) + + +class TranslationBleuResourcesServerConfig(BaseResourcesServerConfig): + reasoning_split_word: str = "</think>"  # delimiter marking the end of the reasoning block + + +class TranslationBleuVerifyRequest(BaseVerifyRequest): + trg_txt: str + trg_lang: str + + +class TranslationBleuVerifyResponse(BaseVerifyResponse): + trg_txt: str + trg_lang: str + extracted_answer: str + + +class TranslationBleuResourcesServer(SimpleResourcesServer): + config: TranslationBleuResourcesServerConfig + + TOKENIZER_MAP: Dict[str, str] = { + "zh": "zh", + "zh-cn": "zh", + "zh-tw": "zh", + "zho-CN": "zh", + "zho_simpl": "zh", + "ja": "ja-mecab", + "jpn": "ja-mecab", + "th": "flores200", + "ko": "ko-mecab", + } + + def setup_webserver(self) -> FastAPI: + app = super().setup_webserver() + + # Additional server routes go here! 
e.g.: + # app.post("/get_weather")(self.get_weather) + + return app + + async def verify(self, body: TranslationBleuVerifyRequest) -> TranslationBleuVerifyResponse: + assistant_responses = [] + for output_item in body.response.output: + if output_item.type != "message": + continue + + for content_item in output_item.content: + if content_item.type != "output_text": + continue + + assistant_responses.append(content_item.text) + + combined_response = "".join(assistant_responses) + + (reward, extracted_answer) = self._verify_answer( + ground_truth=body.trg_txt, target_lang=body.trg_lang, model_response=combined_response + ) + + return TranslationBleuVerifyResponse(**body.model_dump(), extracted_answer=extracted_answer, reward=reward) + + def _verify_answer(self, ground_truth: str, target_lang: str, model_response: str) -> tuple[float, str]: + extracted_answer = self._extract_answer(model_response) + + if target_lang in self.TOKENIZER_MAP: + tokenize = self.TOKENIZER_MAP[target_lang] + else: + tokenize = None + # Use effective_order for sentence-level BLEU + bleu = BLEU(trg_lang=target_lang, effective_order=True, tokenize=tokenize) + + bleu_output = bleu.sentence_score(extracted_answer, [ground_truth]) + # TODO Do we want to report any other BLEU outputs? + bleu_score = bleu_output.score + reward = bleu_score / 100.0 + + return reward, extracted_answer + + def _extract_answer(self, model_response: str) -> str: + # Strip any thinking + no_think_response = model_response.split(self.config.reasoning_split_word)[-1] + no_think_response = no_think_response.strip() + return no_think_response + + +if __name__ == "__main__": + TranslationBleuResourcesServer.run_webserver() diff --git a/resources_servers/translation_bleu/configs/translation_bleu.yaml b/resources_servers/translation_bleu/configs/translation_bleu.yaml new file mode 100644 index 000000000..81b004aac --- /dev/null +++ b/resources_servers/translation_bleu/configs/translation_bleu.yaml @@ -0,0 +1,38 @@ +translation_bleu: + resources_servers: + translation_bleu: + entrypoint: app.py + domain: translation +translation_bleu_simple_agent: + responses_api_agents: + simple_agent: + entrypoint: app.py + resources_server: + type: resources_servers + name: translation_bleu + model_server: + type: responses_api_models + name: policy_model + datasets: + - name: train + type: train + jsonl_fpath: resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_train.jsonl + num_repeats: 1 + gitlab_identifier: + dataset_name: riva_mt_v3_nothinkInSys_train + version: 0.0.3 + artifact_fpath: riva_mt_v3_nothinkInSys_train.jsonl + license: NVIDIA Internal Use Only, Do Not Distribute + - name: validation + type: validation + jsonl_fpath: resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_validation.jsonl + num_repeats: 1 + gitlab_identifier: + dataset_name: riva_mt_v3_nothinkInSys_validation + version: 0.0.1 + artifact_fpath: riva_mt_v3_nothinkInSys_validation.jsonl + license: NVIDIA Internal Use Only, Do Not Distribute + - name: example + type: example + jsonl_fpath: resources_servers/translation_bleu/data/example.jsonl + num_repeats: 1 diff --git a/resources_servers/translation_bleu/data/.gitignore b/resources_servers/translation_bleu/data/.gitignore new file mode 100644 index 000000000..4424b6fde --- /dev/null +++ b/resources_servers/translation_bleu/data/.gitignore @@ -0,0 +1,5 @@ +*train.jsonl +*validation.jsonl +*train_prepare.jsonl +*validation_prepare.jsonl +*example_prepare.jsonl diff --git 
a/resources_servers/translation_bleu/data/example.jsonl b/resources_servers/translation_bleu/data/example.jsonl new file mode 100644 index 000000000..2c921e654 --- /dev/null +++ b/resources_servers/translation_bleu/data/example.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into English. Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```"}]}, "src_txt": "¿Por qué malgastar el tiempo buscando las llaves?", "trg_txt": "Why waste your time looking for the keys?", "src_lang": "es-us", "trg_lang": "en"} +{"responses_create_params": {"input": [{"role": "user", "content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt."}]}, "src_txt": "Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "src_lang": "de", "trg_lang": "en"} +{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```"}]}, "src_txt": "All questions have been correctly answered by the treasurer Thomas Kräuchi.", "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "src_lang": "en", "trg_lang": "fr"} +{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```"}]}, "src_txt": "The next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.", "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "src_lang": "en", "trg_lang": "ja"} +{"responses_create_params": {"input": [{"role": "user", "content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。"}]}, "src_txt": "比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "src_lang": "zh-cn", "trg_lang": "en"} \ No newline at end of file diff --git a/resources_servers/translation_bleu/data/example_nothink.jsonl b/resources_servers/translation_bleu/data/example_nothink.jsonl new file mode 100644 index 000000000..42529ab9a --- /dev/null +++ b/resources_servers/translation_bleu/data/example_nothink.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Convert the following text into English. 
Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```"}]}, "src_txt": "¿Por qué malgastar el tiempo buscando las llaves?", "trg_txt": "Why waste your time looking for the keys?", "src_lang": "es-us", "trg_lang": "en"} +{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt."}]}, "src_txt": "Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "src_lang": "de", "trg_lang": "en"} +{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```"}]}, "src_txt": "All questions have been correctly answered by the treasurer Thomas Kräuchi.", "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "src_lang": "en", "trg_lang": "fr"} +{"responses_create_params": {"input": [{"role": "system", "content": "/no_think"}, {"role": "user", "content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```"}]}, "src_txt": "The next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.", "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "src_lang": "en", "trg_lang": "ja"} +{"responses_create_params": {"input": [{"role": "system", "content": "/no_think"}, {"role": "user", "content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。"}]}, "src_txt": "比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "src_lang": "zh-cn", "trg_lang": "en"} \ No newline at end of file diff --git a/resources_servers/translation_bleu/data/example_nothink_rollouts.jsonl b/resources_servers/translation_bleu/data/example_nothink_rollouts.jsonl new file mode 100644 index 000000000..7c7c2c018 --- /dev/null +++ b/resources_servers/translation_bleu/data/example_nothink_rollouts.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_de822fae9a3048609f9d29124711ef7e", "created_at": 1761320055.0, "error": null, "incomplete_details": 
null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_ca1ab146369449f6af314513647808ef", "content": [{"annotations": [], "text": "The Conservatory and the theater were dedicated to him, in which every year the \"Rossini Opera Festival\" is held, attracting opera lovers from around the world.\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.1826249361348376, "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "trg_lang": "en", "extracted_answer": "The Conservatory and the theater were dedicated to him, in which every year the \"Rossini Opera Festival\" is held, attracting opera lovers from around the world."} +{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_1c8244efef744b38a8adb1e3265e6ca9", "created_at": 1761320055.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_ff0e183cd1f6448a8840104b9beb235c", "content": [{"annotations": [], "text": "次の図は、原子が光子を放出している様子を示しています。もちろん、実際には光子はその図の中のものよりもずっと小さいです。\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.41614625199196853, "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "trg_lang": "ja", "extracted_answer": "次の図は、原子が光子を放出している様子を示しています。もちろん、実際には光子はその図の中のものよりもずっと小さいです。"} +{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Convert the following text into English. 
Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_ac13960214d348b1b341ab0c8db10cfc", "created_at": 1761320055.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_94ba9b4835954fceaa534381996ace6c", "content": [{"annotations": [], "text": "Why waste time looking for the keys?\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.6752918218126552, "trg_txt": "Why waste your time looking for the keys?", "trg_lang": "en", "extracted_answer": "Why waste time looking for the keys?"} +{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_a8cda57b843444168553e60a61b6d0e7", "created_at": 1761320055.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_4da079740c9440169da96073a99113bc", "content": [{"annotations": [], "text": "Toutes les questions ont été répondues correctement par le trésorier Thomas Kräuchi.\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.15727800941615358, "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "trg_lang": "fr", "extracted_answer": "Toutes les questions ont été répondues correctement par le trésorier Thomas Kräuchi."} +{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Translate the following 
text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_027e2b003199426d88afbf0be7b18ab0", "created_at": 1761320055.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_8f45cceaf9c741fb8dda838c68338f3b", "content": [{"annotations": [], "text": "The competition takes place on the grass, and the grass around the holes is cut shorter, referred to as the green.\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.3204144198812986, "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "trg_lang": "en", "extracted_answer": "The competition takes place on the grass, and the grass around the holes is cut shorter, referred to as the green."} diff --git a/resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_train_metrics.json b/resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_train_metrics.json new file mode 100644 index 000000000..369e43852 --- /dev/null +++ b/resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_train_metrics.json @@ -0,0 +1,61 @@ +{ + "name": "train", + "type": "train", + "jsonl_fpath": "resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_train.jsonl", + "num_repeats": 1, + "gitlab_identifier": { + "dataset_name": "riva_mt_v3_nothinkInSys_train", + "version": "0.0.3", + "artifact_fpath": "riva_mt_v3_nothinkInSys_train.jsonl" + }, + "license": "NVIDIA Internal Use Only, Do Not Distribute", + "Number of examples": 770000, + "Number of tools": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Median": 0.0, + "Standard deviation": 0.0 + }, + "Json-dumped number of words (proxy for token count)": { + "Total # non-null values": 770000, + "Average": 26.18, + "Min": 10.0, + "Max": 267.0, + "Median": 24.44, + "Standard deviation": 9.86 + }, + "Number of turns": { + "Total # non-null values": 770000, + "Average": 1.0, + "Min": 1.0, + "Max": 1.0, + "Median": 1.0, + "Standard deviation": 0.0 + }, + "Temperature": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Median": 0.0, + "Standard deviation": 0.0 + }, + "src_txt": { + "unique_count": 767619, + "total_count": 770000 + }, + "trg_txt": { + "unique_count": 767567, + "total_count": 770000 + }, + "src_lang": { + "unique_count": 12, + "total_count": 770000 + }, + "trg_lang": { + "unique_count": 12, + "total_count": 770000 + } +} \ No newline at end of file diff --git a/resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_validation_metrics.json 
b/resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_validation_metrics.json new file mode 100644 index 000000000..1d6645b38 --- /dev/null +++ b/resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_validation_metrics.json @@ -0,0 +1,61 @@ +{ + "name": "validation", + "type": "validation", + "jsonl_fpath": "resources_servers/translation_bleu/data/riva_mt_v3_nothinkInSys_validation.jsonl", + "num_repeats": 1, + "gitlab_identifier": { + "dataset_name": "riva_mt_v3_nothinkInSys_validation", + "version": "0.0.1", + "artifact_fpath": "riva_mt_v3_nothinkInSys_validation.jsonl" + }, + "license": "NVIDIA Internal Use Only, Do Not Distribute", + "Number of examples": 19940, + "Number of tools": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Median": 0.0, + "Standard deviation": 0.0 + }, + "Json-dumped number of words (proxy for token count)": { + "Total # non-null values": 19940, + "Average": 32.68, + "Min": 10.0, + "Max": 82.0, + "Median": 33.05, + "Standard deviation": 10.38 + }, + "Number of turns": { + "Total # non-null values": 19940, + "Average": 1.0, + "Min": 1.0, + "Max": 1.0, + "Median": 1.0, + "Standard deviation": 0.0 + }, + "Temperature": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Median": 0.0, + "Standard deviation": 0.0 + }, + "src_txt": { + "unique_count": 10965, + "total_count": 19940 + }, + "trg_txt": { + "unique_count": 10965, + "total_count": 19940 + }, + "src_lang": { + "unique_count": 11, + "total_count": 19940 + }, + "trg_lang": { + "unique_count": 11, + "total_count": 19940 + } +} \ No newline at end of file diff --git a/resources_servers/translation_bleu/requirements.txt b/resources_servers/translation_bleu/requirements.txt new file mode 100644 index 000000000..f3ee63442 --- /dev/null +++ b/resources_servers/translation_bleu/requirements.txt @@ -0,0 +1,3 @@ +-e nemo-gym[dev] @ ../../ +mecab-ko==1.0.2 +sacrebleu[ja,ko]>=2.0.0 \ No newline at end of file diff --git a/resources_servers/translation_bleu/tests/test_app.py b/resources_servers/translation_bleu/tests/test_app.py new file mode 100644 index 000000000..06dff1afc --- /dev/null +++ b/resources_servers/translation_bleu/tests/test_app.py @@ -0,0 +1,143 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from copy import deepcopy +from typing import Any +from unittest.mock import MagicMock + +from pytest import approx, fixture + +from nemo_gym.openai_utils import ( + NeMoGymResponse, + NeMoGymResponseCreateParamsNonStreaming, + NeMoGymResponseOutputMessage, + NeMoGymResponseOutputText, +) +from nemo_gym.server_utils import ServerClient +from resources_servers.translation_bleu.app import ( + TranslationBleuResourcesServer, + TranslationBleuResourcesServerConfig, + TranslationBleuVerifyRequest, +) + + +class TestApp: + @fixture + def config(self) -> TranslationBleuResourcesServerConfig: + return TranslationBleuResourcesServerConfig( + host="0.0.0.0", + port=8080, + entrypoint="", + name="", + ) + + def _create_response(self, id: str, model_response_text: str) -> dict[str, Any]: + return NeMoGymResponse( + id=id, + created_at=1234.5, + model="response_model", + object="response", + parallel_tool_calls=False, + tool_choice="none", + tools=[], + output=[ + NeMoGymResponseOutputMessage( + id=f"ID for {model_response_text}", + role="assistant", + status="in_progress", + type="message", + content=[NeMoGymResponseOutputText(annotations=[], text=model_response_text, type="output_text")], + ) + ], + ).model_dump() + + async def test_verify_identical(self, config: TranslationBleuResourcesServerConfig) -> None: + server_mock = MagicMock(spec=ServerClient) + resources_server = TranslationBleuResourcesServer(config=config, server_client=server_mock) + + source_text = "What is the name of your cat?" + ground_truth = "Was ist der Name deiner Katze?" + target_lang = "de" + target_lang_name = "German" + model_create_params = NeMoGymResponseCreateParamsNonStreaming( + input=[ + { + "role": "user", + "content": f'Translate this into {target_lang_name}: "{source_text}"', + } + ] + ) + model_response = NeMoGymResponse(**self._create_response("model_response_id", ground_truth)) + identical_verify_request = TranslationBleuVerifyRequest( + responses_create_params=deepcopy(model_create_params), + response=model_response.model_copy(deep=True), + trg_txt=ground_truth, + trg_lang=target_lang, + ) + identical_verify_response = await resources_server.verify(identical_verify_request) + assert identical_verify_response.responses_create_params == model_create_params + assert identical_verify_response.response == model_response + assert identical_verify_response.trg_txt == ground_truth + assert identical_verify_response.trg_lang == target_lang + assert identical_verify_response.reward == approx(1.0) + assert identical_verify_response.extracted_answer == ground_truth + + assert sorted(list(identical_verify_response.model_dump())) == [ + "extracted_answer", + "response", + "responses_create_params", + "reward", + "trg_lang", + "trg_txt", + ] + + def test_verify_answer_identical(self, config: TranslationBleuResourcesServerConfig) -> None: + resources_server = TranslationBleuResourcesServer(config=config, server_client=MagicMock(spec=ServerClient)) + + # source_text = "What is the name of your cat?" + ground_truth = "Was ist der Name deiner Katze?" + target_lang = "de" + model_response_text = ground_truth + + assert resources_server._verify_answer(ground_truth, target_lang, model_response_text) == ( + approx(1.0), + ground_truth, + ) + + def test_verify_answer_think_tags(self, config: TranslationBleuResourcesServerConfig) -> None: + resources_server = TranslationBleuResourcesServer(config=config, server_client=MagicMock(spec=ServerClient)) + + # source_text = "What is the name of your cat?" 
+ ground_truth = "Was ist der Name deiner Katze?" + target_lang = "de" + model_response_text = f"<think></think>\n\n{ground_truth}"  # empty reasoning block, stripped by _extract_answer + + assert resources_server._verify_answer(ground_truth, target_lang, model_response_text) == ( + approx(1.0), + ground_truth, + ) + + def test_verify_answer_no_match(self, config: TranslationBleuResourcesServerConfig) -> None: + resources_server = TranslationBleuResourcesServer(config=config, server_client=MagicMock(spec=ServerClient)) + + # source_text = "What is the name of your cat?" + ground_truth = "Was ist der Name deiner Katze?" + target_lang = "de" + model_response_text = "Incorrect translation." + + assert resources_server._verify_answer(ground_truth, target_lang, model_response_text) == ( + approx(0.0), + model_response_text, + ) diff --git a/resources_servers/translation_comet/README.md b/resources_servers/translation_comet/README.md new file mode 100644 index 000000000..cf6920b56 --- /dev/null +++ b/resources_servers/translation_comet/README.md @@ -0,0 +1,11 @@ +# Description + +Data links: ? + +# Licensing information +Code: Apache 2.0 +Data: NVIDIA Internal Use Only, Do Not Distribute + +# Dependencies +- nemo_gym: Apache 2.0 +- unbabel-comet: Apache 2.0 diff --git a/resources_servers/translation_comet/app.py b/resources_servers/translation_comet/app.py new file mode 100644 index 000000000..83ae1fc12 --- /dev/null +++ b/resources_servers/translation_comet/app.py @@ -0,0 +1,139 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
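+"""Resources server that scores candidate translations with a COMET model. + +Illustrative sketch of a verify payload (field names follow the TranslationCometVerifyRequest model defined below; the values here are hypothetical examples): + + {"responses_create_params": {"input": [{"role": "user", "content": "..."}]}, + "response": {...}, + "src_txt": "What is the name of your cat?", + "trg_txt": "Was ist der Name deiner Katze?"} +""" +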
+from typing import Any, List, Optional, Union + +from comet.models import download_model, load_from_checkpoint +from fastapi import FastAPI + +from nemo_gym.base_resources_server import ( + BaseResourcesServerConfig, + BaseVerifyRequest, + BaseVerifyResponse, + SimpleResourcesServer, +) + + +class TranslationCometResourcesServerConfig(BaseResourcesServerConfig): + use_reference: bool = True # Must match the model set in comet_model_name + comet_model_name: str = "Unbabel/wmt22-comet-da" + comet_gpu_count: Optional[int] = None # None means CPU only + comet_gpu_devices: Union[List[int], str, int] = "auto" + model_cache_dir: Optional[str] = None + reasoning_split_word: str = "</think>"  # delimiter marking the end of the reasoning block + + +class TranslationCometVerifyRequest(BaseVerifyRequest): + src_txt: str + trg_txt: Optional[str] = None + + +class TranslationCometVerifyResponse(BaseVerifyResponse): + src_txt: str + trg_txt: Optional[str] = None + extracted_answer: str + + +class TranslationCometResourcesServer(SimpleResourcesServer): + config: TranslationCometResourcesServerConfig + batch_size: int = 1 # We only process one item at a time so this is always 1 + + def model_post_init(self, context: Any) -> None: + super().model_post_init(context) + + # # Manually load the model without the Comet wrapper class so we can control the GPU allocation + # # https://stackoverflow.com/questions/75879866/how-to-load-unbabel-comet-model-without-nested-wrapper-initialization + + # model_path = snapshot_download(repo_id=self.config.comet_model_name) + # model_checkpoint_path = f'{model_path}/checkpoints/model.ckpt' + # if self.config.use_reference: + # self._comet_model = RegressionMetric.load_from_checkpoint(model_checkpoint_path, layer_transformation='softmax') + # else: + # self._comet_model = ReferencelessRegression.load_from_checkpoint(model_checkpoint_path, layer_transformation='softmax') + + model_path = download_model(model=self.config.comet_model_name, saving_directory=self.config.model_cache_dir) + self._comet_model = load_from_checkpoint(model_path) + + # TODO long-term we want to pull logic out of predict in base.py in COMET + # so that we keep one PTL Trainer for the whole server, and don't make a new one for every datapoint + # since each PTL trainer moves the model weights from CPU to GPU each time which will be slow + + def setup_webserver(self) -> FastAPI: + app = super().setup_webserver() + + # Additional server routes go here! 
e.g.: + # app.post("/get_weather")(self.get_weather) + + return app + + async def verify(self, body: TranslationCometVerifyRequest) -> TranslationCometVerifyResponse: + assistant_responses = [] + for output_item in body.response.output: + if output_item.type != "message": + continue + + for content_item in output_item.content: + if content_item.type != "output_text": + continue + + assistant_responses.append(content_item.text) + + combined_response = "".join(assistant_responses) + + (reward, extracted_answer) = self._verify_answer( + source_text=body.src_txt, target_text=body.trg_txt, model_response=combined_response + ) + + return TranslationCometVerifyResponse(**body.model_dump(), extracted_answer=extracted_answer, reward=reward) + + def _verify_answer(self, source_text: str, target_text: str, model_response: str) -> tuple[float, str]: + extracted_answer = self._extract_answer(model_response) + + if self.config.use_reference: + comet_data = [{"src": source_text, "mt": extracted_answer, "ref": target_text}] + else: + comet_data = [{"src": source_text, "mt": extracted_answer}] + + # TODO this is inefficient and sets up a new PTL Trainer each time + # It's designed to be run on a whole dataset at once + # This means the weights get moved from CPU to GPU (if applicable) each time this is called + model_output = self._comet_model.predict( + comet_data, + batch_size=self.batch_size, + gpus=self.config.comet_gpu_count, + devices=self.config.comet_gpu_devices, + progress_bar=False, + ) + reward = model_output.system_score + + return reward, extracted_answer + + def _extract_answer(self, model_response: str) -> str: + # Strip any thinking + no_think_response = model_response.split(self.config.reasoning_split_word)[-1] + no_think_response = no_think_response.strip() + return no_think_response + + +if __name__ == "__main__": + TranslationCometResourcesServer.run_webserver() diff --git a/resources_servers/translation_comet/configs/translation_comet.yaml b/resources_servers/translation_comet/configs/translation_comet.yaml new file mode 100644 index 000000000..e0f5c5e75 --- /dev/null +++ b/resources_servers/translation_comet/configs/translation_comet.yaml @@ -0,0 +1,40 @@ +translation_comet: + resources_servers: + translation_comet: + entrypoint: app.py + domain: translation + use_reference: true + comet_model_name: Unbabel/wmt22-comet-da +translation_comet_simple_agent: + responses_api_agents: + simple_agent: + entrypoint: app.py + resources_server: + type: resources_servers + name: translation_comet + model_server: + type: responses_api_models + name: policy_model + datasets: + - name: train + type: train + jsonl_fpath: resources_servers/translation_comet/data/riva_mt_v3_nothinkInSys_train.jsonl + num_repeats: 1 + gitlab_identifier: + dataset_name: riva_mt_v3_nothinkInSys_train + version: 0.0.1 + artifact_fpath: riva_mt_v3_nothinkInSys_train.jsonl + license: NVIDIA Internal Use Only, Do Not Distribute + - name: validation + type: validation + jsonl_fpath: resources_servers/translation_comet/data/riva_mt_v3_nothinkInSys_validation.jsonl + num_repeats: 1 + gitlab_identifier: + dataset_name: riva_mt_v3_nothinkInSys_validation + version: 0.0.1 + artifact_fpath: riva_mt_v3_nothinkInSys_validation.jsonl + license: NVIDIA Internal Use Only, Do Not Distribute + - name: example + type: example + jsonl_fpath: resources_servers/translation_comet/data/example.jsonl + num_repeats: 1 diff --git a/resources_servers/translation_comet/configs/translation_comet_referencefree.yaml 
b/resources_servers/translation_comet/configs/translation_comet_referencefree.yaml new file mode 100644 index 000000000..944df2982 --- /dev/null +++ b/resources_servers/translation_comet/configs/translation_comet_referencefree.yaml @@ -0,0 +1,40 @@ +translation_comet: + resources_servers: + translation_comet: + entrypoint: app.py + domain: translation + use_reference: false + comet_model_name: Unbabel/wmt22-cometkiwi-da +translation_comet_simple_agent: + responses_api_agents: + simple_agent: + entrypoint: app.py + resources_server: + type: resources_servers + name: translation_comet + model_server: + type: responses_api_models + name: policy_model + datasets: + - name: train + type: train + jsonl_fpath: resources_servers/translation_comet/data/riva_mt_v3_nothinkInSys_train.jsonl + num_repeats: 1 + gitlab_identifier: + dataset_name: riva_mt_v3_nothinkInSys_train + version: 0.0.1 + artifact_fpath: riva_mt_v3_nothinkInSys_train.jsonl + license: NVIDIA Internal Use Only, Do Not Distribute + - name: validation + type: validation + jsonl_fpath: resources_servers/translation_comet/data/riva_mt_v3_nothinkInSys_validation.jsonl + num_repeats: 1 + gitlab_identifier: + dataset_name: riva_mt_v3_nothinkInSys_validation + version: 0.0.1 + artifact_fpath: riva_mt_v3_nothinkInSys_validation.jsonl + license: NVIDIA Internal Use Only, Do Not Distribute + - name: example + type: example + jsonl_fpath: resources_servers/translation_comet/data/example.jsonl + num_repeats: 1 diff --git a/resources_servers/translation_comet/data/.gitignore b/resources_servers/translation_comet/data/.gitignore new file mode 100644 index 000000000..4424b6fde --- /dev/null +++ b/resources_servers/translation_comet/data/.gitignore @@ -0,0 +1,5 @@ +*train.jsonl +*validation.jsonl +*train_prepare.jsonl +*validation_prepare.jsonl +*example_prepare.jsonl diff --git a/resources_servers/translation_comet/data/example.jsonl b/resources_servers/translation_comet/data/example.jsonl new file mode 100644 index 000000000..2c921e654 --- /dev/null +++ b/resources_servers/translation_comet/data/example.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into English. Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```"}]}, "src_txt": "¿Por qué malgastar el tiempo buscando las llaves?", "trg_txt": "Why waste your time looking for the keys?", "src_lang": "es-us", "trg_lang": "en"} +{"responses_create_params": {"input": [{"role": "user", "content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt."}]}, "src_txt": "Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "src_lang": "de", "trg_lang": "en"} +{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into French. 
Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```"}]}, "src_txt": "All questions have been correctly answered by the treasurer Thomas Kräuchi.", "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "src_lang": "en", "trg_lang": "fr"} +{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```"}]}, "src_txt": "The next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.", "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "src_lang": "en", "trg_lang": "ja"} +{"responses_create_params": {"input": [{"role": "user", "content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。"}]}, "src_txt": "比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "src_lang": "zh-cn", "trg_lang": "en"} \ No newline at end of file diff --git a/resources_servers/translation_comet/data/example_nothink.jsonl b/resources_servers/translation_comet/data/example_nothink.jsonl new file mode 100644 index 000000000..42529ab9a --- /dev/null +++ b/resources_servers/translation_comet/data/example_nothink.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Convert the following text into English. Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```"}]}, "src_txt": "¿Por qué malgastar el tiempo buscando las llaves?", "trg_txt": "Why waste your time looking for the keys?", "src_lang": "es-us", "trg_lang": "en"} +{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt."}]}, "src_txt": "Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "src_lang": "de", "trg_lang": "en"} +{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```"}]}, "src_txt": "All questions have been correctly answered by the treasurer Thomas Kräuchi.", "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "src_lang": "en", "trg_lang": "fr"} +{"responses_create_params": {"input": [{"role": "system", "content": "/no_think"}, {"role": "user", "content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```"}]}, "src_txt": "The next picture shows the atoms emitting photons. 
Of course, in reality photons are a lot smaller than those in the picture.", "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "src_lang": "en", "trg_lang": "ja"} +{"responses_create_params": {"input": [{"role": "system", "content": "/no_think"}, {"role": "user", "content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。"}]}, "src_txt": "比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "src_lang": "zh-cn", "trg_lang": "en"} \ No newline at end of file diff --git a/resources_servers/translation_comet/data/example_nothink_rollouts.jsonl b/resources_servers/translation_comet/data/example_nothink_rollouts.jsonl new file mode 100644 index 000000000..8b253272b --- /dev/null +++ b/resources_servers/translation_comet/data/example_nothink_rollouts.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Convert the following text into English. Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_9ba0dbafb376409e8a7ed772adab1ecf", "created_at": 1761320110.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_f0f3a1e9b0784933bdd719fd569e6077", "content": [{"annotations": [], "text": "Why waste time looking for the keys?\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.9511789679527283, "src_txt": "¿Por qué malgastar el tiempo buscando las llaves?", "trg_txt": "Why waste your time looking for the keys?", "extracted_answer": "Why waste time looking for the keys?"} +{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_411f1d1acdeb4190ac860db659abc3cf", "created_at": 1761320110.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": 
"nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_edbfc6e1503143dc82f1b94a9ea51bdc", "content": [{"annotations": [], "text": "The competition was held on a lawn, where the grass around the hole was cut shorter and is called the putting green.\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.8277285695075989, "src_txt": "比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "extracted_answer": "The competition was held on a lawn, where the grass around the hole was cut shorter and is called the putting green."} +{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_4b200f77ec6d47cebbd6f4dec45239e8", "created_at": 1761320110.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_eb5d7cb1c8f74a4ead4d9d1d8d0e7cd5", "content": [{"annotations": [], "text": "Toutes les questions ont été répondues correctement par le trésorier Thomas Kräuchi.\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.9093431234359741, "src_txt": "All questions have been correctly answered by the treasurer Thomas Kräuchi.", "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "extracted_answer": "Toutes les questions ont été répondues correctement par le trésorier Thomas Kräuchi."} +{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. 
Of course, in reality photons are a lot smaller than those in the picture.\n```", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_64b8e9cb25a8414b9a61615c90d63b68", "created_at": 1761320110.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_71c84f51f02c437d9421550f6870d827", "content": [{"annotations": [], "text": "画像の次に原子が光子を放出している様子が示されています。もちろん、実際の光子はそのようにはとても小さいです。\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.9037765264511108, "src_txt": "The next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.", "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "extracted_answer": "画像の次に原子が光子を放出している様子が示されています。もちろん、実際の光子はそのようにはとても小さいです。"} +{"responses_create_params": {"background": null, "include": null, "input": [{"content": "/no_think", "role": "system", "type": "message"}, {"content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "role": "user", "type": "message"}], "instructions": null, "max_output_tokens": 4096, "max_tool_calls": null, "metadata": null, "model": null, "parallel_tool_calls": true, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "store": null, "temperature": null, "text": null, "tool_choice": "auto", "tools": [], "top_logprobs": null, "top_p": null, "truncation": null, "user": null, "stream": null}, "response": {"id": "resp_ef4433e72c5d46ed8a39460bf8039518", "created_at": 1761320110.0, "error": null, "incomplete_details": null, "instructions": null, "metadata": null, "model": "nvidia/nvidia-nemotron-nano-9b-v2", "object": "response", "output": [{"id": "msg_d60da844dc104c3aa41836580f4970bc", "content": [{"annotations": [], "text": "The conservatory and the theater were dedicated to him, in which every year the \"Rossini Opera Festival\" takes place, attracting opera lovers from all over the world.\n", "type": "output_text", "logprobs": null}], "role": "assistant", "status": "completed", "type": "message"}], "parallel_tool_calls": true, "temperature": null, "tool_choice": "auto", "tools": [], "top_p": null, "background": null, "max_output_tokens": 4096, "max_tool_calls": null, "previous_response_id": null, "prompt": null, "reasoning": null, "service_tier": null, "status": null, "text": null, "top_logprobs": null, "truncation": null, "usage": null, "user": null}, "reward": 0.8553621172904968, "src_txt": "Ihm wurden das Konservatorium und das Theater gewidmet, in dem 
jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "extracted_answer": "The conservatory and the theater were dedicated to him, in which every year the \"Rossini Opera Festival\" takes place, attracting opera lovers from all over the world."} diff --git a/resources_servers/translation_comet/requirements.txt b/resources_servers/translation_comet/requirements.txt new file mode 100644 index 000000000..302adca33 --- /dev/null +++ b/resources_servers/translation_comet/requirements.txt @@ -0,0 +1,2 @@ +-e nemo-gym[dev] @ ../../ +unbabel-comet>=2.2.4 \ No newline at end of file diff --git a/resources_servers/translation_comet/tests/test_app.py b/resources_servers/translation_comet/tests/test_app.py new file mode 100644 index 000000000..68b2477b4 --- /dev/null +++ b/resources_servers/translation_comet/tests/test_app.py @@ -0,0 +1,211 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +from copy import deepcopy +from os.path import dirname, join +from typing import Any +from unittest.mock import MagicMock + +from pytest import approx, fixture + +from nemo_gym import CACHE_DIR +from nemo_gym.openai_utils import ( + NeMoGymResponse, + NeMoGymResponseCreateParamsNonStreaming, + NeMoGymResponseOutputMessage, + NeMoGymResponseOutputText, +) +from nemo_gym.server_utils import ServerClient +from resources_servers.translation_comet.app import ( + TranslationCometResourcesServer, + TranslationCometResourcesServerConfig, + TranslationCometVerifyRequest, +) + + +logger = logging.getLogger(__name__) + + +class TestApp: + @fixture(scope="class") + def resources_server(self) -> TranslationCometResourcesServer: + """We only want to spin up the server once since it has to load the comet model. 
+        That said, the slowest part is actually `import comet` :)
+        """
+        logger.info("Spinning up server with COMET model...")
+
+        server = TranslationCometResourcesServer(
+            config=TranslationCometResourcesServerConfig(
+                host="0.0.0.0",
+                port=8080,
+                entrypoint="",
+                name="",
+                use_reference=True,
+                comet_model_name="Unbabel/wmt22-comet-da",  # 0.5B parameter model runs fine on CPU
+                # Need to use the actual model as the cometinho model does not return values in [0,1]
+                comet_gpu_count=0,  # CPU
+                comet_gpu_devices="auto",  # CPU
+                model_cache_dir=join(CACHE_DIR, "ptl_cache"),
+            ),
+            server_client=MagicMock(spec=ServerClient),
+        )
+
+        logger.info("Model loaded and server started successfully")
+        return server
+
+    def reference_free_resources_server(self) -> TranslationCometResourcesServer:
+        logger.info("Spinning up server with reference-free COMET model...")
+
+        server = TranslationCometResourcesServer(
+            config=TranslationCometResourcesServerConfig(
+                host="0.0.0.0",
+                port=8080,
+                entrypoint="",
+                name="",
+                use_reference=False,
+                comet_model_name="Unbabel/wmt22-cometkiwi-da",  # reference-free COMET
+                comet_gpu_count=0,  # CPU
+                comet_gpu_devices="auto",  # CPU
+                model_cache_dir=join(dirname(__file__), "..", "..", "..", "cache", "ptl_cache"),
+            ),
+            server_client=MagicMock(spec=ServerClient),
+        )
+
+        logger.info("Reference-free COMET model loaded and server started successfully")
+        return server
+
+    def _create_response(self, id: str, model_response_text: str) -> dict[str, Any]:
+        return NeMoGymResponse(
+            id=id,
+            created_at=1234.5,
+            model="response_model",
+            object="response",
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            output=[
+                NeMoGymResponseOutputMessage(
+                    id=f"ID for {model_response_text}",
+                    role="assistant",
+                    status="in_progress",
+                    type="message",
+                    content=[NeMoGymResponseOutputText(annotations=[], text=model_response_text, type="output_text")],
+                )
+            ],
+        ).model_dump()
+
+    async def test_verify_identical(self, resources_server: TranslationCometResourcesServer) -> None:
+        source_text = "What is the name of your cat?"
+        target_text = "Was ist der Name deiner Katze?"
+        target_lang_name = "German"
+        model_create_params = NeMoGymResponseCreateParamsNonStreaming(
+            input=[
+                {
+                    "role": "user",
+                    "content": f'Translate this into {target_lang_name}: "{source_text}"',
+                }
+            ]
+        )
+        model_response = NeMoGymResponse(**self._create_response("model_response_id", target_text))
+        identical_verify_request = TranslationCometVerifyRequest(
+            responses_create_params=deepcopy(model_create_params),
+            response=model_response.model_copy(deep=True),
+            src_txt=source_text,
+            trg_txt=target_text,
+        )
+        identical_verify_response = await resources_server.verify(identical_verify_request)
+        assert identical_verify_response.responses_create_params == model_create_params
+        assert identical_verify_response.response == model_response
+        assert identical_verify_response.src_txt == source_text
+        assert identical_verify_response.trg_txt == target_text
+        assert identical_verify_response.reward == approx(1.0, abs=0.05)
+        assert identical_verify_response.extracted_answer == target_text
+
+        assert sorted(list(identical_verify_response.model_dump())) == [
+            "extracted_answer",
+            "response",
+            "responses_create_params",
+            "reward",
+            "src_txt",
+            "trg_txt",
+        ]
+
+    def test_verify_answer_identical(self, resources_server: TranslationCometResourcesServer) -> None:
+        source_text = "What is the name of your cat?"
+        target_text = "Was ist der Name deiner Katze?"
+        model_response_text = target_text
+
+        assert resources_server._verify_answer(source_text, target_text, model_response_text) == (
+            approx(1.0, abs=0.05),  # It's a model output so it won't be exact
+            target_text,
+        )
+
+    def test_verify_answer_think_tags(self, resources_server: TranslationCometResourcesServer) -> None:
+        source_text = "What is the name of your cat?"
+        target_text = "Was ist der Name deiner Katze?"
+        model_response_text = f"<think>Some internal reasoning.</think>\n\n{target_text}"
+
+        assert resources_server._verify_answer(source_text, target_text, model_response_text) == (
+            approx(1.0, abs=0.05),  # It's a model output so it won't be exact
+            target_text,
+        )
+
+    def test_verify_answer_no_match(self, resources_server: TranslationCometResourcesServer) -> None:
+        source_text = "What is the name of your cat?"
+        target_text = "Was ist der Name deiner Katze?"
+        model_response_text = "Incorrect translation."
+
+        assert resources_server._verify_answer(source_text, target_text, model_response_text) == (
+            approx(0.0, abs=0.5),  # This returns about 0.3 in practice but it's fine as long as it's low
+            model_response_text,
+        )
+
+    async def test_verify_identical_reference_free(self) -> None:
+        reference_free_resources_server = self.reference_free_resources_server()
+
+        source_text = "two three"
+        target_text = "zwei drei"
+        target_lang_name = "German"
+        model_create_params = NeMoGymResponseCreateParamsNonStreaming(
+            input=[
+                {
+                    "role": "user",
+                    "content": f'Translate this into {target_lang_name}: "{source_text}"',
+                }
+            ]
+        )
+        model_response = NeMoGymResponse(**self._create_response("model_response_id", target_text))
+        identical_verify_request = TranslationCometVerifyRequest(
+            responses_create_params=deepcopy(model_create_params),
+            response=model_response.model_copy(deep=True),
+            src_txt=source_text,
+        )
+        identical_verify_response = await reference_free_resources_server.verify(identical_verify_request)
+        assert identical_verify_response.responses_create_params == model_create_params
+        assert identical_verify_response.response == model_response
+        assert identical_verify_response.src_txt == source_text
+        assert identical_verify_response.trg_txt is None
+        assert identical_verify_response.reward == approx(
+            1.0, abs=0.25
+        )  # It's hard to get a score near 1.0 with the reference-free model
+        assert identical_verify_response.extracted_answer == target_text
+
+        assert sorted(list(identical_verify_response.model_dump())) == [
+            "extracted_answer",
+            "response",
+            "responses_create_params",
+            "reward",
+            "src_txt",
+            "trg_txt",  # Should be present but None
+        ]
diff --git a/resources_servers/translation_llm_judge/README.md b/resources_servers/translation_llm_judge/README.md
new file mode 100644
index 000000000..c427b79a4
--- /dev/null
+++ b/resources_servers/translation_llm_judge/README.md
@@ -0,0 +1,10 @@
+# Description
+
+Data links: ?
+
+# Licensing information
+Code: Apache 2.0
+Data: NVIDIA Internal Use Only, Do Not Distribute
+
+Dependencies
+- nemo_gym: Apache 2.0
diff --git a/resources_servers/translation_llm_judge/app.py b/resources_servers/translation_llm_judge/app.py
new file mode 100644
index 000000000..b5e3923f5
--- /dev/null
+++ b/resources_servers/translation_llm_judge/app.py
@@ -0,0 +1,207 @@
+"""
+LLM-as-judge resources server.
+
+Compares a model's generated answer to an expected answer using an LLM judge.
+The judge prompt is fully configurable via server config.
+"""
+
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import annotations
+
+import re
+from typing import Optional
+
+from fastapi import FastAPI
+from pydantic import BaseModel
+
+from nemo_gym.base_resources_server import (
+    BaseResourcesServerConfig,
+    BaseRunRequest,
+    BaseVerifyRequest,
+    BaseVerifyResponse,
+    SimpleResourcesServer,
+)
+from nemo_gym.config_types import ModelServerRef
+from nemo_gym.openai_utils import (
+    NeMoGymEasyInputMessage,
+    NeMoGymResponse,
+    NeMoGymResponseCreateParamsNonStreaming,
+)
+
+
+class TranslationLLMJudgeResourcesServerConfig(BaseResourcesServerConfig):
+    """Configuration for the LLM judge server.
+
+    - judge_model_server: target model server to use as the judge.
+    - judge_responses_create_params: base create params; input will be set per request.
+    - judge_system_message: optional custom system message for the judge.
+    - judge_prompt_template: prompt template for the judge. Supported placeholders:
+      {generated_text}, {src_txt}, {src_lang}, {trg_lang}, and {trg_txt}; {trg_txt} is
+      only substituted when use_reference is True and a reference is supplied.
+    - judge_score_extract_regex: regex whose first capture group extracts the judge's score.
+    - judge_max_score: maximum score the judge can award; used to normalize the reward.
+    - reasoning_split_word: delimiter marking the end of a reasoning block; everything up
+      to and including it is stripped from model output.
+    """
+
+    name: str = "translation_llm_judge"
+    judge_model_server: ModelServerRef
+    judge_responses_create_params: NeMoGymResponseCreateParamsNonStreaming
+
+    judge_system_message: Optional[str] = None
+    judge_prompt_template: str
+    judge_score_extract_regex: str
+    judge_max_score: int
+    use_reference: bool = True  # If True, judge_prompt_template should include {trg_txt}
+    reasoning_split_word: str = "</think>"
+
+
+class TranslationLLMJudgeRunRequest(BaseRunRequest):
+    src_txt: str
+    src_lang: str  # TODO should this be optional? Not all judge prompts will use it
+    trg_lang: str  # TODO should this be optional? Not all judge prompts will use it
+    trg_txt: Optional[str] = None
+
+
+class TranslationLLMJudgeVerifyRequest(TranslationLLMJudgeRunRequest, BaseVerifyRequest):
+    pass
+
+
+class TranslationLLMJudgeEvaluation(BaseModel):
+    responses_create_params: NeMoGymResponseCreateParamsNonStreaming
+    response: NeMoGymResponse
+    # Extracted score from judge output
+    score: Optional[float] = None
+
+
+class TranslationLLMJudgeVerifyResponse(BaseVerifyResponse):
+    src_txt: str
+    src_lang: str
+    trg_lang: str
+    trg_txt: Optional[str] = None
+    judge_evaluation: TranslationLLMJudgeEvaluation
+
+
+class TranslationLLMJudgeResourcesServer(SimpleResourcesServer):
+    """Judge-only verifier using an LLM to evaluate translation quality."""
+
+    config: TranslationLLMJudgeResourcesServerConfig
+
+    def setup_webserver(self) -> FastAPI:
+        app = super().setup_webserver()
+        return app
+
+    def _extract_last_assistant_text(self, body: BaseVerifyRequest) -> str:
+        """Extract the last assistant message text from the response.
+
+        - If the assistant message has multiple text blocks, they are joined with newlines.
+        - Returns an empty string when no assistant text is available.
+ """ + # Return only the last assistant message's text content. + for o in reversed(body.response.output): + text = "" + if getattr(o, "type", None) == "message" and getattr(o, "role", None) == "assistant": + content = getattr(o, "content", None) + if isinstance(content, list): + # Some providers split a single assistant message into multiple text blocks. + # Join all text blocks to reconstruct the full message text. + texts: list[str] = [] + for c in content: + t = getattr(c, "text", None) + if isinstance(t, str): + texts.append(t) + text = "\n".join(texts).strip() + elif isinstance(content, str): + text = content.strip() + + # Strip thinking if not already removed by reasoning parser + text = self._strip_thinking(text) + return text + + def _strip_thinking(self, model_response: str) -> str: + # Strip any thinking + no_think_response = model_response.split(self.config.reasoning_split_word)[-1] + no_think_response = no_think_response.strip() + return no_think_response + + async def verify(self, body: TranslationLLMJudgeVerifyRequest) -> TranslationLLMJudgeVerifyResponse: + generated = self._extract_last_assistant_text(body) + + eval = await self._generate_judge_evaluation( + generated_text=generated, + src_txt=body.src_txt, + src_lang=body.src_lang, + trg_lang=body.trg_lang, + trg_txt=body.trg_txt, + ) + + payload = body.model_dump() + reward = eval.score / self.config.judge_max_score + return TranslationLLMJudgeVerifyResponse(**payload, reward=reward, judge_evaluation=eval) + + async def _generate_judge_evaluation( + self, *, generated_text: str, src_txt: str, src_lang: str, trg_lang: str, trg_txt: Optional[str] = None + ) -> TranslationLLMJudgeEvaluation: + cfg = self.config + + responses_create_params = cfg.judge_responses_create_params.model_copy(deep=True) + prompt_template = cfg.judge_prompt_template + system_message = cfg.judge_system_message + + if self.config.use_reference and trg_txt is not None: + user_prompt = prompt_template.format( + generated_text=generated_text, src_txt=src_txt, src_lang=src_lang, trg_lang=trg_lang, trg_txt=trg_txt + ) + else: + user_prompt = prompt_template.format( + generated_text=generated_text, src_txt=src_txt, src_lang=src_lang, trg_lang=trg_lang + ) + + msgs: list[NeMoGymEasyInputMessage] = [] + if system_message is not None and system_message != "": + msgs.append(NeMoGymEasyInputMessage(role="system", content=system_message)) + msgs.append(NeMoGymEasyInputMessage(role="user", content=user_prompt)) + responses_create_params.input = msgs + + response = await self.server_client.post( + server_name=cfg.judge_model_server.name, + url_path="/v1/responses", + json=responses_create_params, + ) + judge_response = NeMoGymResponse.model_validate(await response.json()) + eval_record = TranslationLLMJudgeEvaluation( + responses_create_params=responses_create_params, + response=judge_response, + score=0.0, + ) + + # Parse the last output; fall back to a score of 0 if unexpected. 
+ try: + last_output = judge_response.output[-1] + if getattr(last_output, "type", None) != "message": + return eval_record + last_content = last_output.content[-1] + text = getattr(last_content, "text", "") + except Exception: + return eval_record + + # Extract the score from the judge output + match = re.search(cfg.judge_score_extract_regex, text) + if match is None: + return eval_record + score = int(match.group(1)) + eval_record.score = score + return eval_record + + +if __name__ == "__main__": + TranslationLLMJudgeResourcesServer.run_webserver() diff --git a/resources_servers/translation_llm_judge/configs/translation_llm_judge_vip.yaml b/resources_servers/translation_llm_judge/configs/translation_llm_judge_vip.yaml new file mode 100644 index 000000000..8a752c9e5 --- /dev/null +++ b/resources_servers/translation_llm_judge/configs/translation_llm_judge_vip.yaml @@ -0,0 +1,66 @@ +translation_llm_judge: + resources_servers: + translation_llm_judge: + entrypoint: app.py + domain: translation + judge_model_server: + type: responses_api_models + name: policy_model + judge_responses_create_params: + input: [] + judge_system_message: "You are a professional translation evaluator." + # This uses the VIP process from Cheng et al. 2024 (https://arxiv.org/abs/2407.21646) + # as implemented by Ouyang et al. 2025 (NVIDIA internal, not yet published) + judge_prompt_template: |- + Your task is to assess whether a translation segment successfully conveys the semantic content of the original speech according to the following criteria: + + 1. Key Information Recognition: Identify whether the key information in the source (e.g., proper nouns, keywords, terminologies, or sentence structures) is present in the translation. + 2. Correctness Assessment: Determine whether the translation accurately conveys the speaker’s intention, without misinterpretation or contextual errors. + 3. Expressiveness Assessment: Evaluate whether the translation is fluent, clear, and intuitive to human readers. It should avoid unnecessary verbosity, ambiguous phrases, or awkward grammar. + + Given a source text and its translation, answer "1" if the translation meets all three criteria and answer "0" otherwise. Only output the answer, no other text. 
+ + + {src_txt} + + + + {generated_text} + + judge_score_extract_regex: "(1|0)" + judge_max_score: 1 +translation_llm_judge_simple_agent: + responses_api_agents: + simple_agent: + entrypoint: app.py + resources_server: + type: resources_servers + name: translation_llm_judge + model_server: + type: responses_api_models + name: policy_model + datasets: + - name: train + type: train + jsonl_fpath: resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_train.jsonl + num_repeats: 1 + gitlab_identifier: + dataset_name: riva_mt_v3_nothinkInSys_train + version: 0.0.1 + artifact_fpath: riva_mt_v3_nothinkInSys_train.jsonl + license: NVIDIA Internal Use Only, Do Not Distribute + - name: validation + type: validation + jsonl_fpath: resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_validation.jsonl + num_repeats: 1 + gitlab_identifier: + dataset_name: riva_mt_v3_nothinkInSys_validation + version: 0.0.1 + artifact_fpath: riva_mt_v3_nothinkInSys_validation.jsonl + license: NVIDIA Internal Use Only, Do Not Distribute + - name: example + type: example + license: NVIDIA Internal Use Only, Do Not Distribute + jsonl_fpath: resources_servers/translation_llm_judge/data/example.jsonl + + diff --git a/resources_servers/translation_llm_judge/data/example.jsonl b/resources_servers/translation_llm_judge/data/example.jsonl new file mode 100644 index 000000000..2c921e654 --- /dev/null +++ b/resources_servers/translation_llm_judge/data/example.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into English. Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```"}]}, "src_txt": "¿Por qué malgastar el tiempo buscando las llaves?", "trg_txt": "Why waste your time looking for the keys?", "src_lang": "es-us", "trg_lang": "en"} +{"responses_create_params": {"input": [{"role": "user", "content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt."}]}, "src_txt": "Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "src_lang": "de", "trg_lang": "en"} +{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```"}]}, "src_txt": "All questions have been correctly answered by the treasurer Thomas Kräuchi.", "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "src_lang": "en", "trg_lang": "fr"} +{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```"}]}, "src_txt": "The next picture shows the atoms emitting photons. 
Of course, in reality photons are a lot smaller than those in the picture.", "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "src_lang": "en", "trg_lang": "ja"} +{"responses_create_params": {"input": [{"role": "user", "content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。"}]}, "src_txt": "比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "src_lang": "zh-cn", "trg_lang": "en"} \ No newline at end of file diff --git a/resources_servers/translation_llm_judge/data/example_nothink.jsonl b/resources_servers/translation_llm_judge/data/example_nothink.jsonl new file mode 100644 index 000000000..42529ab9a --- /dev/null +++ b/resources_servers/translation_llm_judge/data/example_nothink.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Convert the following text into English. Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```"}]}, "src_txt": "¿Por qué malgastar el tiempo buscando las llaves?", "trg_txt": "Why waste your time looking for the keys?", "src_lang": "es-us", "trg_lang": "en"} +{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt."}]}, "src_txt": "Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "src_lang": "de", "trg_lang": "en"} +{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```"}]}, "src_txt": "All questions have been correctly answered by the treasurer Thomas Kräuchi.", "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "src_lang": "en", "trg_lang": "fr"} +{"responses_create_params": {"input": [{"role": "system", "content": "/no_think"}, {"role": "user", "content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```"}]}, "src_txt": "The next picture shows the atoms emitting photons. 
Of course, in reality photons are a lot smaller than those in the picture.", "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "src_lang": "en", "trg_lang": "ja"} +{"responses_create_params": {"input": [{"role": "system", "content": "/no_think"}, {"role": "user", "content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。"}]}, "src_txt": "比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "src_lang": "zh-cn", "trg_lang": "en"} \ No newline at end of file diff --git a/resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_train_metrics.json b/resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_train_metrics.json new file mode 100644 index 000000000..0d0c53e2e --- /dev/null +++ b/resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_train_metrics.json @@ -0,0 +1,61 @@ +{ + "name": "train", + "type": "train", + "jsonl_fpath": "resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_train.jsonl", + "num_repeats": 1, + "gitlab_identifier": { + "dataset_name": "riva_mt_v3_nothinkInSys_train", + "version": "0.0.1", + "artifact_fpath": "riva_mt_v3_nothinkInSys_train.jsonl" + }, + "license": "NVIDIA Internal Use Only, Do Not Distribute", + "Number of examples": 770000, + "Number of tools": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Median": 0.0, + "Standard deviation": 0.0 + }, + "Json-dumped number of words (proxy for token count)": { + "Total # non-null values": 770000, + "Average": 26.18, + "Min": 10.0, + "Max": 267.0, + "Median": 24.44, + "Standard deviation": 9.86 + }, + "Number of turns": { + "Total # non-null values": 770000, + "Average": 1.0, + "Min": 1.0, + "Max": 1.0, + "Median": 1.0, + "Standard deviation": 0.0 + }, + "Temperature": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Median": 0.0, + "Standard deviation": 0.0 + }, + "src_txt": { + "unique_count": 767619, + "total_count": 770000 + }, + "trg_txt": { + "unique_count": 767567, + "total_count": 770000 + }, + "src_lang": { + "unique_count": 12, + "total_count": 770000 + }, + "trg_lang": { + "unique_count": 12, + "total_count": 770000 + } +} \ No newline at end of file diff --git a/resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_validation_metrics.json b/resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_validation_metrics.json new file mode 100644 index 000000000..bc35bb4d6 --- /dev/null +++ b/resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_validation_metrics.json @@ -0,0 +1,61 @@ +{ + "name": "validation", + "type": "validation", + "jsonl_fpath": "resources_servers/translation_llm_judge/data/riva_mt_v3_nothinkInSys_validation.jsonl", + "num_repeats": 1, + "gitlab_identifier": { + "dataset_name": "riva_mt_v3_nothinkInSys_validation", + "version": "0.0.1", + "artifact_fpath": "riva_mt_v3_nothinkInSys_validation.jsonl" + }, + "license": "NVIDIA Internal Use Only, Do Not Distribute", + "Number of examples": 19940, + "Number of tools": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Median": 0.0, + "Standard deviation": 0.0 + }, + "Json-dumped number of words (proxy for token count)": { + "Total # non-null values": 19940, + "Average": 32.68, + "Min": 10.0, + "Max": 82.0, + "Median": 33.05, + "Standard deviation": 10.38 + }, + "Number of turns": { + "Total # 
non-null values": 19940, + "Average": 1.0, + "Min": 1.0, + "Max": 1.0, + "Median": 1.0, + "Standard deviation": 0.0 + }, + "Temperature": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Median": 0.0, + "Standard deviation": 0.0 + }, + "src_txt": { + "unique_count": 10965, + "total_count": 19940 + }, + "trg_txt": { + "unique_count": 10965, + "total_count": 19940 + }, + "src_lang": { + "unique_count": 11, + "total_count": 19940 + }, + "trg_lang": { + "unique_count": 11, + "total_count": 19940 + } +} \ No newline at end of file diff --git a/resources_servers/translation_llm_judge/requirements.txt b/resources_servers/translation_llm_judge/requirements.txt new file mode 100644 index 000000000..5042d4648 --- /dev/null +++ b/resources_servers/translation_llm_judge/requirements.txt @@ -0,0 +1,2 @@ +-e nemo-gym[dev] @ ../../ + diff --git a/resources_servers/translation_llm_judge/tests/test_app.py b/resources_servers/translation_llm_judge/tests/test_app.py new file mode 100644 index 000000000..3f5f3ade3 --- /dev/null +++ b/resources_servers/translation_llm_judge/tests/test_app.py @@ -0,0 +1,158 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from copy import deepcopy +from pathlib import Path +from typing import Any +from unittest.mock import AsyncMock, MagicMock + +from omegaconf import OmegaConf +from pytest import approx, fixture + +from nemo_gym.config_types import ModelServerRef +from nemo_gym.openai_utils import ( + NeMoGymResponse, + NeMoGymResponseCreateParamsNonStreaming, + NeMoGymResponseOutputItem, + NeMoGymResponseOutputMessage, + NeMoGymResponseOutputText, +) +from nemo_gym.server_utils import ServerClient +from resources_servers.translation_llm_judge.app import ( + TranslationLLMJudgeResourcesServer, + TranslationLLMJudgeResourcesServerConfig, + TranslationLLMJudgeVerifyRequest, +) + + +class TestApp: + @fixture + def config(self) -> TranslationLLMJudgeResourcesServerConfig: + # Load judge template from YAML so tests mirror runtime config + yaml_path = Path(__file__).resolve().parents[1] / "configs" / "translation_llm_judge_vip.yaml" + yaml_cfg = OmegaConf.load(str(yaml_path)) + judge_template: str = ( + yaml_cfg.translation_llm_judge.resources_servers.translation_llm_judge.judge_prompt_template + ) + judge_score_extract_regex: str = ( + yaml_cfg.translation_llm_judge.resources_servers.translation_llm_judge.judge_score_extract_regex + ) + judge_max_score: int = yaml_cfg.translation_llm_judge.resources_servers.translation_llm_judge.judge_max_score + + cfg = TranslationLLMJudgeResourcesServerConfig( + host="0.0.0.0", + port=8080, + entrypoint="", + judge_model_server=ModelServerRef(type="responses_api_models", name="judge"), + judge_responses_create_params=NeMoGymResponseCreateParamsNonStreaming(input=[]), + judge_prompt_template=judge_template, + judge_score_extract_regex=judge_score_extract_regex, + judge_max_score=judge_max_score, + ) + return cfg + + def _create_response(self, id: str, 
output_item: NeMoGymResponseOutputItem) -> dict[str, Any]: + return NeMoGymResponse( + id=id, + created_at=123.0, + model="judge_model", + object="response", + output=[output_item], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + ).model_dump() + + def _msg(self, text: str) -> NeMoGymResponseOutputMessage: + return NeMoGymResponseOutputMessage( + id="msg_id", + content=[NeMoGymResponseOutputText(annotations=[], text=text, type="output_text")], + role="assistant", + status="completed", + type="message", + ) + + async def test_verify_identical(self, config: TranslationLLMJudgeResourcesServerConfig) -> None: + server_mock = MagicMock(spec=ServerClient) + rs = TranslationLLMJudgeResourcesServer(config=config, server_client=server_mock) + + post_mock = MagicMock() + post_mock.json = AsyncMock() + server_mock.post = AsyncMock(return_value=post_mock) + + post_mock.json.side_effect = [ + self._create_response("first", self._msg("1")), + ] + + src_txt = "One two three" + trg_txt = "Eins zwei drei" + src_lang = "en" + trg_lang = "de" + + model_create_params = NeMoGymResponseCreateParamsNonStreaming(input=[{"role": "user", "content": "Q: 1+1?"}]) + model_response = NeMoGymResponse( + id="resp", + created_at=0.0, + model="m", + object="response", + output=[self._msg(trg_txt)], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + ) + + req = TranslationLLMJudgeVerifyRequest( + responses_create_params=deepcopy(model_create_params), + response=model_response.model_copy(deep=True), + src_txt=src_txt, + trg_txt=trg_txt, + src_lang=src_lang, + trg_lang=trg_lang, + ) + res = await rs.verify(req) + assert res.reward == approx(1.0) + assert res.src_txt == src_txt + assert res.trg_txt == trg_txt + assert res.src_lang == src_lang + assert res.trg_lang == trg_lang + assert res.judge_evaluation.score == 1 + + async def test_unexpected_judge_output_defaults_to_zero( + self, config: TranslationLLMJudgeResourcesServerConfig + ) -> None: + server_mock = MagicMock(spec=ServerClient) + rs = TranslationLLMJudgeResourcesServer(config=config, server_client=server_mock) + + post_mock = MagicMock() + post_mock.json = AsyncMock(return_value=self._create_response("f", self._msg("no label present"))) + server_mock.post = AsyncMock(return_value=post_mock) + + req = TranslationLLMJudgeVerifyRequest( + responses_create_params=NeMoGymResponseCreateParamsNonStreaming(input=[]), + response=NeMoGymResponse( + id="r", + created_at=0.0, + model="m", + object="response", + output=[self._msg("text")], + parallel_tool_calls=False, + tool_choice="none", + tools=[], + ), + src_txt="One two three", + trg_txt="Eins zwei drei", + src_lang="en", + trg_lang="de", + ) + res = await rs.verify(req) + assert res.reward == approx(0.0) diff --git a/resources_servers/translation_metricx/README.md b/resources_servers/translation_metricx/README.md new file mode 100644 index 000000000..654132fd8 --- /dev/null +++ b/resources_servers/translation_metricx/README.md @@ -0,0 +1,11 @@ +# Description + +Data links: ? + +# Licensing information +Code: Apache 2.0 +Data: NVIDIA Internal Use Only, Do Not Distribute + +Dependencies +- nemo_gym: Apache 2.0 +- google-research/metricx: Apache 2.0 diff --git a/resources_servers/translation_metricx/app.py b/resources_servers/translation_metricx/app.py new file mode 100644 index 000000000..dcb17d9bf --- /dev/null +++ b/resources_servers/translation_metricx/app.py @@ -0,0 +1,196 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from pathlib import Path +from typing import Any, Optional + +import datasets +import transformers +from fastapi import FastAPI +from metricx24.models import MT5ForRegression + +from nemo_gym import CACHE_DIR +from nemo_gym.base_resources_server import ( + BaseResourcesServerConfig, + BaseVerifyRequest, + BaseVerifyResponse, + SimpleResourcesServer, +) + + +class TranslationMetricxResourcesServerConfig(BaseResourcesServerConfig): + """ + Configuration for the TranslationMetricxResourcesServer. + + Attributes: + use_reference (bool): Whether to use a reference translation + metricx_model_name (str): The MetricX model name to use. The default "google/metricx-24-hybrid-large-v2p6-bfloat16" + is the smallest model at 1.2B parameters. + tokenizer_name (str): The name of the mT5 tokenizer to use with the MetricX model. Size must match MetricX model. + For the default model above, use "google/mt5-large". + device_map (str): Device placement for the model. Options include "cpu", specific GPU (e.g., "cuda:1"), + "auto", "balanced", "balanced_low_0", "sequential". + max_input_length (int): Maximum input sequence length (see MetricX documentation, default 1536) + output_dir (str): Output directory for Trainer class. Nothing is actually output during prediction, but it's mandatory to supply. 
+ """ + + use_reference: bool = True + metricx_model_name: str = "google/metricx-24-hybrid-large-v2p6-bfloat16" + tokenizer_name: str = "google/mt5-large" + device_map: str = "cpu" + max_input_length: int = 1536 + output_dir: str = str(Path(CACHE_DIR) / "metricx_output") + reasoning_split_word: str = "" + + +class TranslationMetricxVerifyRequest(BaseVerifyRequest): + src_txt: str + trg_txt: Optional[str] = None + + +class TranslationMetricxVerifyResponse(BaseVerifyResponse): + src_txt: str + trg_txt: Optional[str] = None + extracted_answer: str + + +class TranslationMetricxResourcesServer(SimpleResourcesServer): + config: TranslationMetricxResourcesServerConfig + + def model_post_init(self, context: Any) -> None: + super().model_post_init(context) + + # Load tokenizer (MetricX models use MT5 tokenizers, separate from the model name) + tokenizer = transformers.AutoTokenizer.from_pretrained(self.config.tokenizer_name) + self._tokenizer = tokenizer + + # Load model with device placement + model = MT5ForRegression.from_pretrained( + self.config.metricx_model_name, torch_dtype="auto", device_map=self.config.device_map + ) + # Inputs should go to the device where the first layer is + # Get device from the first model parameter + self._inputs_device = next(model.parameters()).device + + model.eval() + self._metricx_model = model + + # Ensure output directory exists (following predict.py lines 167-169) + os.makedirs(self.config.output_dir, exist_ok=True) + + # Create trainer + training_args = transformers.TrainingArguments( + output_dir=self.config.output_dir, + per_device_eval_batch_size=1, + dataloader_pin_memory=False, + ) + trainer = transformers.Trainer( + model=model, + args=training_args, + ) + self._metricx_trainer = trainer + + def setup_webserver(self) -> FastAPI: + app = super().setup_webserver() + + # Additional server routes go here! 
e.g.: + # app.post("/get_weather")(self.get_weather) + + return app + + async def verify(self, body: TranslationMetricxVerifyRequest) -> TranslationMetricxVerifyResponse: + assistant_responses = [] + for output_item in body.response.output: + if output_item.type != "message": + continue + + for content_item in output_item.content: + if content_item.type != "output_text": + continue + + assistant_responses.append(content_item.text) + + combined_response = "".join(assistant_responses) + + (reward, extracted_answer) = self._verify_answer( + model_response=combined_response, source_text=body.src_txt, target_text=body.trg_txt + ) + + return TranslationMetricxVerifyResponse(**body.model_dump(), extracted_answer=extracted_answer, reward=reward) + + def _verify_answer( + self, model_response: str, source_text: str, target_text: Optional[str] = None + ) -> tuple[float, str]: + extracted_answer = self._extract_answer(model_response) + ds = self._create_dataset_from_example(extracted_answer, source_text, target_text) + predictions, _, _ = self._metricx_trainer.predict(test_dataset=ds) + score = float(predictions[0]) + + # MetricX scores are between 0 and 25, where 25 is worst, so we normalize to 0 to 1 where 0 is worst + reward = (25 - score) / 25.0 + return reward, extracted_answer + + def _create_dataset_from_example( + self, hypothesis: str, source_text: str, reference_text: Optional[str] = None + ) -> datasets.Dataset: + """Create a dataset from a single example, following get_dataset logic from predict.py.""" + # Create input string based on reference ("QE") mode (QE is when use_reference is False) + if not self.config.use_reference or reference_text is None: + input_text = "source: " + source_text + " candidate: " + hypothesis + else: + input_text = "source: " + source_text + " candidate: " + hypothesis + " reference: " + reference_text + + # Tokenize (returns dict with lists) + tokenized = self._tokenizer( + input_text, + max_length=self.config.max_input_length, + truncation=True, + padding=False, + ) + + # Create a single example dict (matching the format that predict.py creates) + example_dict = { + "input_ids": tokenized["input_ids"], + "attention_mask": tokenized["attention_mask"], + } + + # If last token is EOS, remove it (following predict.py _remove_eos function logic) + eos_token_id = self._tokenizer.eos_token_id + if eos_token_id is not None and example_dict["input_ids"][-1] == eos_token_id: + example_dict["input_ids"] = example_dict["input_ids"][:-1] + example_dict["attention_mask"] = example_dict["attention_mask"][:-1] + + # Create dataset from a list of examples (each example is one dict) + # Following predict.py structure: ds.map() operations create per-example dicts + ds = datasets.Dataset.from_list([example_dict]) + + # Set format to torch and move to device (following predict.py line 119-124) + ds.set_format( + type="torch", + columns=["input_ids", "attention_mask"], + device=self._inputs_device, + output_all_columns=True, + ) + + return ds + + def _extract_answer(self, model_response: str) -> str: + # Strip any thinking + no_think_response = model_response.split(self.config.reasoning_split_word)[-1] + no_think_response = no_think_response.strip() + return no_think_response + + +if __name__ == "__main__": + TranslationMetricxResourcesServer.run_webserver() diff --git a/resources_servers/translation_metricx/configs/translation_metricx.yaml b/resources_servers/translation_metricx/configs/translation_metricx.yaml new file mode 100644 index 000000000..8a8b18bcc --- /dev/null +++ 
b/resources_servers/translation_metricx/configs/translation_metricx.yaml
@@ -0,0 +1,42 @@
+translation_metricx:
+  resources_servers:
+    translation_metricx:
+      entrypoint: app.py
+      domain: translation
+      use_reference: true
+      metricx_model_name: google/metricx-24-hybrid-xl-v2p6
+      tokenizer_name: google/mt5-xl  # tokenizer size must match the MetricX model size
+      max_input_length: 1536
+translation_metricx_simple_agent:
+  responses_api_agents:
+    simple_agent:
+      entrypoint: app.py
+      resources_server:
+        type: resources_servers
+        name: translation_metricx
+      model_server:
+        type: responses_api_models
+        name: policy_model
+      datasets:
+      - name: train
+        type: train
+        jsonl_fpath: resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_train.jsonl
+        num_repeats: 1
+        gitlab_identifier:
+          dataset_name: riva_mt_v3_nothinkInSys_train
+          version: 0.0.1
+          artifact_fpath: riva_mt_v3_nothinkInSys_train.jsonl
+        license: NVIDIA Internal Use Only, Do Not Distribute
+      - name: validation
+        type: validation
+        jsonl_fpath: resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_validation.jsonl
+        num_repeats: 1
+        gitlab_identifier:
+          dataset_name: riva_mt_v3_nothinkInSys_validation
+          version: 0.0.1
+          artifact_fpath: riva_mt_v3_nothinkInSys_validation.jsonl
+        license: NVIDIA Internal Use Only, Do Not Distribute
+      - name: example
+        type: example
+        jsonl_fpath: resources_servers/translation_metricx/data/example.jsonl
+        num_repeats: 1
diff --git a/resources_servers/translation_metricx/data/.gitignore b/resources_servers/translation_metricx/data/.gitignore
new file mode 100644
index 000000000..4424b6fde
--- /dev/null
+++ b/resources_servers/translation_metricx/data/.gitignore
@@ -0,0 +1,5 @@
+*train.jsonl
+*validation.jsonl
+*train_prepare.jsonl
+*validation_prepare.jsonl
+*example_prepare.jsonl
diff --git a/resources_servers/translation_metricx/data/example.jsonl b/resources_servers/translation_metricx/data/example.jsonl
new file mode 100644
index 000000000..2c921e654
--- /dev/null
+++ b/resources_servers/translation_metricx/data/example.jsonl
@@ -0,0 +1,5 @@
+{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into English. Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```"}]}, "src_txt": "¿Por qué malgastar el tiempo buscando las llaves?", "trg_txt": "Why waste your time looking for the keys?", "src_lang": "es-us", "trg_lang": "en"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt."}]}, "src_txt": "Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "src_lang": "de", "trg_lang": "en"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```"}]}, "src_txt": "All questions have been correctly answered by the treasurer Thomas Kräuchi.", "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "src_lang": "en", "trg_lang": "fr"}
+{"responses_create_params": {"input": [{"role": "user", "content": "Convert the following text into Japanese.
Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```"}]}, "src_txt": "The next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.", "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "src_lang": "en", "trg_lang": "ja"} +{"responses_create_params": {"input": [{"role": "user", "content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。"}]}, "src_txt": "比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "src_lang": "zh-cn", "trg_lang": "en"} \ No newline at end of file diff --git a/resources_servers/translation_metricx/data/example_nothink.jsonl b/resources_servers/translation_metricx/data/example_nothink.jsonl new file mode 100644 index 000000000..42529ab9a --- /dev/null +++ b/resources_servers/translation_metricx/data/example_nothink.jsonl @@ -0,0 +1,5 @@ +{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Convert the following text into English. Text:\n```\n¿Por qué malgastar el tiempo buscando las llaves?\n```"}]}, "src_txt": "¿Por qué malgastar el tiempo buscando las llaves?", "trg_txt": "Why waste your time looking for the keys?", "src_lang": "es-us", "trg_lang": "en"} +{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Translate this into English: Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt."}]}, "src_txt": "Ihm wurden das Konservatorium und das Theater gewidmet, in dem jedes Jahr das „Rossini Opera Festival“ stattfindet, das Opernliebhaber aus aller Welt anlockt.", "trg_txt": "Every year, the conservatory and theatre, both named after him, host the Rossini Opera Festival, drawing enthusiasts from around the world.", "src_lang": "de", "trg_lang": "en"} +{"responses_create_params": {"input": [{"role": "system", "content":"/no_think"}, {"role": "user", "content": "Convert the following text into French. Text:\n```\nAll questions have been correctly answered by the treasurer Thomas Kräuchi.\n```"}]}, "src_txt": "All questions have been correctly answered by the treasurer Thomas Kräuchi.", "trg_txt": "Le trésorier Thomas Kräuchi a répondu correctement à toutes les questions.", "src_lang": "en", "trg_lang": "fr"} +{"responses_create_params": {"input": [{"role": "system", "content": "/no_think"}, {"role": "user", "content": "Convert the following text into Japanese. Text:\n```\nThe next picture shows the atoms emitting photons. Of course, in reality photons are a lot smaller than those in the picture.\n```"}]}, "src_txt": "The next picture shows the atoms emitting photons. 
Of course, in reality photons are a lot smaller than those in the picture.", "trg_txt": "次の写真は、原子が光子を放出している様子です。もちろん、実際には写真よりもはるかに微小です。", "src_lang": "en", "trg_lang": "ja"} +{"responses_create_params": {"input": [{"role": "system", "content": "/no_think"}, {"role": "user", "content": "Translate the following text from Simplified Chinese to English: 比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。"}]}, "src_txt": "比赛是在草地上进行的,洞周围的草被修剪得更短,被称为果岭。", "trg_txt": "The game is played on grass, and the grass around the hole is mown shorter and called the green.", "src_lang": "zh-cn", "trg_lang": "en"} \ No newline at end of file diff --git a/resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_train_metrics.json b/resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_train_metrics.json new file mode 100644 index 000000000..146d1f6f9 --- /dev/null +++ b/resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_train_metrics.json @@ -0,0 +1,61 @@ +{ + "name": "train", + "type": "train", + "jsonl_fpath": "resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_train.jsonl", + "num_repeats": 1, + "gitlab_identifier": { + "dataset_name": "riva_mt_v3_nothinkInSys_train", + "version": "0.0.1", + "artifact_fpath": "riva_mt_v3_nothinkInSys_train.jsonl" + }, + "license": "NVIDIA Internal Use Only, Do Not Distribute", + "Number of examples": 770000, + "Number of tools": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Median": 0.0, + "Standard deviation": 0.0 + }, + "Json-dumped number of words (proxy for token count)": { + "Total # non-null values": 770000, + "Average": 26.18, + "Min": 10.0, + "Max": 267.0, + "Median": 24.44, + "Standard deviation": 9.86 + }, + "Number of turns": { + "Total # non-null values": 770000, + "Average": 1.0, + "Min": 1.0, + "Max": 1.0, + "Median": 1.0, + "Standard deviation": 0.0 + }, + "Temperature": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Median": 0.0, + "Standard deviation": 0.0 + }, + "src_txt": { + "unique_count": 767619, + "total_count": 770000 + }, + "trg_txt": { + "unique_count": 767567, + "total_count": 770000 + }, + "src_lang": { + "unique_count": 12, + "total_count": 770000 + }, + "trg_lang": { + "unique_count": 12, + "total_count": 770000 + } +} \ No newline at end of file diff --git a/resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_validation_metrics.json b/resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_validation_metrics.json new file mode 100644 index 000000000..9070a8dee --- /dev/null +++ b/resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_validation_metrics.json @@ -0,0 +1,61 @@ +{ + "name": "validation", + "type": "validation", + "jsonl_fpath": "resources_servers/translation_metricx/data/riva_mt_v3_nothinkInSys_validation.jsonl", + "num_repeats": 1, + "gitlab_identifier": { + "dataset_name": "riva_mt_v3_nothinkInSys_validation", + "version": "0.0.1", + "artifact_fpath": "riva_mt_v3_nothinkInSys_validation.jsonl" + }, + "license": "NVIDIA Internal Use Only, Do Not Distribute", + "Number of examples": 19940, + "Number of tools": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Median": 0.0, + "Standard deviation": 0.0 + }, + "Json-dumped number of words (proxy for token count)": { + "Total # non-null values": 19940, + "Average": 32.68, + "Min": 10.0, + "Max": 82.0, + "Median": 33.05, + "Standard deviation": 10.38 + }, + "Number of turns": { + "Total # non-null values": 
19940, + "Average": 1.0, + "Min": 1.0, + "Max": 1.0, + "Median": 1.0, + "Standard deviation": 0.0 + }, + "Temperature": { + "Total # non-null values": 0, + "Average": 0.0, + "Min": 0.0, + "Max": 0.0, + "Median": 0.0, + "Standard deviation": 0.0 + }, + "src_txt": { + "unique_count": 10965, + "total_count": 19940 + }, + "trg_txt": { + "unique_count": 10965, + "total_count": 19940 + }, + "src_lang": { + "unique_count": 11, + "total_count": 19940 + }, + "trg_lang": { + "unique_count": 11, + "total_count": 19940 + } +} \ No newline at end of file diff --git a/resources_servers/translation_metricx/requirements.txt b/resources_servers/translation_metricx/requirements.txt new file mode 100644 index 000000000..3950124ba --- /dev/null +++ b/resources_servers/translation_metricx/requirements.txt @@ -0,0 +1,2 @@ +-e nemo-gym[dev] @ ../../ +metricx @ git+https://github.com/hrossnv/metricx.git@main \ No newline at end of file diff --git a/resources_servers/translation_metricx/tests/test_app.py b/resources_servers/translation_metricx/tests/test_app.py new file mode 100644 index 000000000..0cafd5f9a --- /dev/null +++ b/resources_servers/translation_metricx/tests/test_app.py @@ -0,0 +1,199 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
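+
+# NOTE: these tests download the real MetricX checkpoint from Hugging Face on
+# first run, so they need network access and a few GB of disk for the cache.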
+import logging
+import os
+from copy import deepcopy
+from pathlib import Path
+from typing import Any
+from unittest.mock import MagicMock
+
+from pytest import approx, fixture
+
+from nemo_gym import CACHE_DIR
+
+# Point the Hugging Face cache at the repo cache dir. This must happen before
+# importing the app module, which imports transformers/huggingface_hub; both
+# read HF_HOME when they are first imported.
+os.environ["HF_HOME"] = str(Path(CACHE_DIR) / "hf_cache")
+
+from nemo_gym.openai_utils import (  # noqa: E402
+    NeMoGymResponse,
+    NeMoGymResponseCreateParamsNonStreaming,
+    NeMoGymResponseOutputMessage,
+    NeMoGymResponseOutputText,
+)
+from nemo_gym.server_utils import ServerClient  # noqa: E402
+from resources_servers.translation_metricx.app import (  # noqa: E402
+    TranslationMetricxResourcesServer,
+    TranslationMetricxResourcesServerConfig,
+    TranslationMetricxVerifyRequest,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+class TestApp:
+    @fixture(scope="class")
+    def resources_server(self) -> TranslationMetricxResourcesServer:
+        """We only want to spin up the server once since it has to load the model."""
+        logger.info("Spinning up server with MetricX model...")
+
+        server = TranslationMetricxResourcesServer(
+            config=TranslationMetricxResourcesServerConfig(
+                host="0.0.0.0",
+                port=8080,
+                entrypoint="",
+                name="",
+                use_reference=True,
+                # 1.2B parameter model runs fine on CPU, though tests will take a couple of minutes
+                metricx_model_name="google/metricx-24-hybrid-large-v2p6-bfloat16",
+                tokenizer_name="google/mt5-large",
+                device_map="cpu",
+                max_input_length=1536,
+                output_dir=str(Path(CACHE_DIR) / "metricx_output"),
+            ),
+            server_client=MagicMock(spec=ServerClient),
+        )
+
+        logger.info("Model loaded and server started successfully")
+        return server
+
+    def _create_response(self, id: str, model_response_text: str) -> dict[str, Any]:
+        return NeMoGymResponse(
+            id=id,
+            created_at=1234.5,
+            model="response_model",
+            object="response",
+            parallel_tool_calls=False,
+            tool_choice="none",
+            tools=[],
+            output=[
+                NeMoGymResponseOutputMessage(
+                    id=f"ID for {model_response_text}",
+                    role="assistant",
+                    status="in_progress",
+                    type="message",
+                    content=[NeMoGymResponseOutputText(annotations=[], text=model_response_text, type="output_text")],
+                )
+            ],
+        ).model_dump()
+
+    async def test_verify_identical(self, resources_server: TranslationMetricxResourcesServer) -> None:
+        source_text = "What is the name of your cat?"
+        target_text = "Was ist der Name deiner Katze?"
+ target_lang_name = "German" + model_create_params = NeMoGymResponseCreateParamsNonStreaming( + input=[ + { + "role": "user", + "content": f'Translate this into {target_lang_name}: "{source_text}"', + } + ] + ) + model_response = NeMoGymResponse(**self._create_response("model_response_id", target_text)) + identical_verify_request = TranslationMetricxVerifyRequest( + responses_create_params=deepcopy(model_create_params), + response=model_response.model_copy(deep=True), + src_txt=source_text, + trg_txt=target_text, + ) + identical_verify_response = await resources_server.verify(identical_verify_request) + assert identical_verify_response.responses_create_params == model_create_params + assert identical_verify_response.response == model_response + assert identical_verify_response.src_txt == source_text + assert identical_verify_response.trg_txt == target_text + assert identical_verify_response.reward == approx(1.0, abs=0.1) + assert identical_verify_response.extracted_answer == target_text + + assert sorted(list(identical_verify_response.model_dump())) == [ + "extracted_answer", + "response", + "responses_create_params", + "reward", + "src_txt", + "trg_txt", + ] + + async def test_verify_identical_without_reference( + self, resources_server: TranslationMetricxResourcesServer + ) -> None: + source_text = "two three" + target_text = "zwei drei" + target_lang_name = "German" + model_create_params = NeMoGymResponseCreateParamsNonStreaming( + input=[ + { + "role": "user", + "content": f'Translate this into {target_lang_name}: "{source_text}"', + } + ] + ) + model_response = NeMoGymResponse(**self._create_response("model_response_id", target_text)) + identical_verify_request = TranslationMetricxVerifyRequest( + responses_create_params=deepcopy(model_create_params), + response=model_response.model_copy(deep=True), + src_txt=source_text, + trg_txt=None, # Technically the model config is set up to use a reference but this triggers the same behavior + ) + identical_verify_response = await resources_server.verify(identical_verify_request) + assert identical_verify_response.responses_create_params == model_create_params + assert identical_verify_response.response == model_response + assert identical_verify_response.src_txt == source_text + assert identical_verify_response.trg_txt is None + assert identical_verify_response.reward == approx(1.0, abs=0.1) + assert identical_verify_response.extracted_answer == target_text + + assert sorted(list(identical_verify_response.model_dump())) == [ + "extracted_answer", + "response", + "responses_create_params", + "reward", + "src_txt", + "trg_txt", + ] + + def test_verify_answer_identical(self, resources_server: TranslationMetricxResourcesServer) -> None: + source_text = "two three" + target_text = "zwei drei" + model_response_text = target_text + + assert resources_server._verify_answer(model_response_text, source_text, target_text) == ( + approx(1.0, abs=0.1), # It's a model output so it won't be exact + target_text, + ) + + def test_verify_answer_think_tags(self, resources_server: TranslationMetricxResourcesServer) -> None: + source_text = "What is the name of your cat?" + target_text = "Was ist der Name deiner Katze?" 
+ model_response_text = f"\n\n{target_text}" + + assert resources_server._verify_answer(model_response_text, source_text, target_text) == ( + approx(1.0, abs=0.1), # It's a model output so it won't be exact + target_text, + ) + + def test_verify_answer_no_match(self, resources_server: TranslationMetricxResourcesServer) -> None: + source_text = "What is the name of your cat?" + target_text = "Was ist der Name deiner Katze?" + model_response_text = "Incorrect translation." + + reward, extracted_answer = resources_server._verify_answer(model_response_text, source_text, target_text) + assert reward <= 0.6 # Raw score is around 10 for this example, where 25 is worst + assert extracted_answer == model_response_text + + def test_verify_answer_without_reference(self, resources_server: TranslationMetricxResourcesServer) -> None: + source_text = "two three" + model_response_text = "zwei drei" + + assert resources_server._verify_answer(model_response_text, source_text) == ( + approx(1.0, abs=0.1), # It's a model output so it won't be exact + model_response_text, + )