
Commit 9d50c27

Merge branch 'main' into pablo-garay/dependabot_automerge_if_successful
2 parents 473f805 + 6bc0f00 commit 9d50c27

27 files changed: +680 −447

.github/workflows/install-test.yml

Lines changed: 90 additions & 56 deletions

````diff
@@ -23,77 +23,111 @@ on:
       - main
       - "pull-request/[0-9]+"
       - "deploy-release/*"
-env:
-  UV_PROJECT_ENVIRONMENT: "./venv"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
 
 jobs:
-  pre-flight:
-    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
+  test-nemo-evaluator:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
 
-  pip-test:
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install uv
+        run: |
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
+
+      - name: Install nemo-evaluator
+        working-directory: packages/nemo-evaluator
+        run: |
+          uv pip install --system -e .
+
+      - name: Verify installation
+        run: |
+          python -c "import nemo_evaluator; print(f'nemo-evaluator version: {nemo_evaluator.__version__}')"
+          which nemo-evaluator
+          which eval-factory
+          nemo-evaluator --help
+
+  test-nemo-evaluator-launcher:
     runs-on: ubuntu-latest
-    name: Pip - Python${{ matrix.python-version }} - ${{ matrix.package }} - AMD64/Linux - Ubuntu Latest
-    needs: [pre-flight]
-    if: |
-      !(needs.pre-flight.outputs.docs_only == 'true'
-      || needs.pre-flight.outputs.is_deployment_workflow == 'true')
-    container:
-      image: ubuntu:24.04
-    environment: nemo-ci
     strategy:
-      fail-fast: false
       matrix:
-        python-version: ["3.10", "3.11", "3.12"]
-        package: ["nemo-evaluator"]
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
 
-      - name: Install ${{ matrix.package }}
-        shell: bash -x -e -u -o pipefail {0}
-        run: bash docker/common/install.sh --package ${{ matrix.package }} --python-version "${{ matrix.python-version }}"
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install uv
+        run: |
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
 
-      - name: Checkout check-imports
+      - name: Install nemo-evaluator-launcher
+        working-directory: packages/nemo-evaluator-launcher
+        run: |
+          uv pip install --system -e .
+
+      - name: Verify installation
+        run: |
+          python -c "import nemo_evaluator_launcher; print(f'nemo-evaluator-launcher version: {nemo_evaluator_launcher.__version__}')"
+          which nemo-evaluator-launcher
+          which nv-eval
+          nemo-evaluator-launcher --help
+
+  check-imports:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
+        include:
+          - package: "nemo-evaluator"
+            module: "nemo_evaluator"
+          - package: "nemo-evaluator-launcher"
+            module: "nemo_evaluator_launcher"
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Checkout FW-CI-templates
         uses: actions/checkout@v4
         with:
           repository: NVIDIA-NeMo/FW-CI-templates
-          ref: v0.39.0
-          path: FW-CI-templates
+          path: ./FW-CI-templates
 
-      - name: Check imports for nemo_evaluator
-        uses: ./FW-CI-templates/.github/actions/check-imports
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
         with:
-          package-name: nemo_evaluator
-          python-binary: /opt/venv/bin/python
+          python-version: ${{ matrix.python-version }}
 
-  install-test-summary:
-    needs: [pre-flight, pip-test]
-    runs-on: ubuntu-latest
-    name: Install test summary
-    if: |
-      (
-        needs.pre-flight.outputs.docs_only == 'true'
-        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
-        || always()
-      )
-      && !cancelled()
-    steps:
-      - name: Get workflow result
-        id: result
-        shell: bash -x -e -u -o pipefail {0}
-        env:
-          GH_TOKEN: ${{ github.token }}
-          RUN_ID: ${{ github.run_id }}
-          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
+      - name: Install uv
         run: |
-          FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
-
-          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
-            echo "✅ All previous jobs completed successfully"
-            exit 0
-          else
-            echo "❌ Found $FAILED_JOBS failed job(s)"
-            # Show which jobs failed
-            gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
-            exit 1
-          fi
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
+
+      - name: Install package
+        working-directory: packages/${{ matrix.package }}
+        run: |
+          uv pip install --system -e .
+
+      - name: Check imports
+        uses: ./FW-CI-templates/.github/actions/check-imports
+        with:
+          package-name: ${{ matrix.module }}
+          python-binary: python3
````
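The new jobs boil down to: install each package editable with uv, then confirm the module imports and its console scripts are on PATH. A local equivalent of the "Verify installation" steps, as a minimal Python sketch (standard library only; the package-to-script mapping below simply mirrors what the workflow probes with `which`):

```python
# verify_install.py -- local approximation of the workflow's "Verify installation" steps.
# Assumes the packages were installed first, e.g. `uv pip install -e .` inside
# packages/nemo-evaluator and packages/nemo-evaluator-launcher.
import importlib.metadata
import shutil

# Distribution name -> console scripts the workflow checks with `which`.
CHECKS = {
    "nemo-evaluator": ["nemo-evaluator", "eval-factory"],
    "nemo-evaluator-launcher": ["nemo-evaluator-launcher", "nv-eval"],
}

for dist, scripts in CHECKS.items():
    try:
        print(f"{dist} version: {importlib.metadata.version(dist)}")
    except importlib.metadata.PackageNotFoundError:
        print(f"{dist}: not installed")
        continue
    for script in scripts:
        # shutil.which mirrors the workflow's `which <script>` check.
        print(f"  {script}: {shutil.which(script) or 'NOT on PATH'}")
```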

README.md

Lines changed: 3 additions & 3 deletions

````diff
@@ -12,15 +12,15 @@
 
 NeMo Evaluator is an open-source platform for robust, reproducible, and scalable evaluation of Large Language Models. It enables you to run hundreds of benchmarks across popular evaluation harnesses against any OpenAI-compatible model API. Evaluations execute in open-source Docker containers for auditable and trustworthy results. The platform's containerized architecture allows for the rapid integration of public benchmarks and private datasets.
 
-[Tutorial](./docs/nemo-evaluator-launcher/tutorial.md) | [NeMo FW model evaluations](#-evaluate-checkpoints-trained-by-nemo-framework) | [Supported Benchmarks](#supported-benchmarks-and-evaluation-harnesses) | [Configuration Examples](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/packages/nemo-evaluator-launcher/examples) | [Contribution Guide](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/CONTRIBUTING.md)
+[Tutorial](./docs/nemo-evaluator-launcher/tutorial.md) | [NeMo FW model evaluations](#-evaluate-checkpoints-trained-by-nemo-framework) | [Supported Benchmarks](#-supported-benchmarks-and-evaluation-harnesses) | [Configuration Examples](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/packages/nemo-evaluator-launcher/examples) | [Contribution Guide](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/CONTRIBUTING.md)
 
 ## ✨ Key Pillars
 
 NeMo Evaluator is built on four core principles to provide a reliable and versatile evaluation experience:
 
 - **Reproducibility by Default**: All configurations, random seeds, and software provenance are captured automatically for auditable and repeatable evaluations.
 - **Scale Anywhere**: Run evaluations from a local machine to a Slurm cluster or cloud-native backends like Lepton AI without changing your workflow.
-- **State-of-the-Art Benchmarking**: Access a comprehensive suite of over 100 benchmarks from 18 popular open-source evaluation harnesses. See the full list of [Supported benchmarks and evaluation harnesses](#supported-benchmarks-and-evaluation-harnesses).
+- **State-of-the-Art Benchmarking**: Access a comprehensive suite of over 100 benchmarks from 18 popular open-source evaluation harnesses. See the full list of [Supported benchmarks and evaluation harnesses](#-supported-benchmarks-and-evaluation-harnesses).
 - **Extensible and Customizable**: Integrate new evaluation harnesses, add custom benchmarks with proprietary data, and define custom result exporters for existing MLOps tooling.
 
 ## How It Works: Launcher and Core Engine
@@ -115,7 +115,7 @@ nemo-evaluator-launcher status <job_id_or_invocation_id>
 nemo-evaluator-launcher ls tasks
 ```
 
-- Explore the [Supported Benchmarks](#supported-benchmarks-and-evaluation-harnesses) to see all available harnesses and benchmarks.
+- Explore the [Supported Benchmarks](#-supported-benchmarks-and-evaluation-harnesses) to see all available harnesses and benchmarks.
 - Scale up your evaluations using the [Slurm Executor](./docs/nemo-evaluator-launcher/executors/slurm.md) or [Lepton Executor](./docs/nemo-evaluator-launcher/executors/lepton.md).
 - Learn to evaluate self-hosted models in the extended [Tutorial guide](./docs/nemo-evaluator-launcher/tutorial.md) for nemo-evaluator-launcher.
 - Customize your workflow with [Custom Exporters](./docs/nemo-evaluator-launcher/exporters/overview.md) or by evaluating with [proprietary data](./docs/nemo-evaluator/extending/framework-definition-file.md).
````

docs/nemo-evaluator-launcher/configuration/deployment/vllm.md

Lines changed: 1 addition & 0 deletions

````diff
@@ -16,6 +16,7 @@ See the complete configuration structure in the [vLLM Config File](../../../../p
 - **`tensor_parallel_size`**: Number of GPUs for tensor parallelism (default: 8)
 - **`pipeline_parallel_size`**: Number of pipeline parallel stages (default: 1)
 - **`data_parallel_size`**: Number of replicas for data parallelism (default: 1)
+- **`gpu_memory_utilization`**: Fraction of GPU memory to use for the model (default: 0.95)
 - **`extra_args`**: Additional arguments passed to vLLM server
 - **`env_vars`**: Environment variables for the container
````
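These fields correspond to vLLM's own engine arguments. A minimal sketch of what they control, using vLLM's offline Python API (assumes `pip install vllm`, GPUs that fit the request, and an illustrative model name; the launcher itself starts a vLLM server rather than calling this API directly):

```python
# Illustrative only: the documented launcher fields map onto vLLM engine arguments.
from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # hypothetical example model
    tensor_parallel_size=8,       # deployment.tensor_parallel_size
    pipeline_parallel_size=1,     # deployment.pipeline_parallel_size
    gpu_memory_utilization=0.95,  # deployment.gpu_memory_utilization
)
print(llm.generate("Hello!")[0].outputs[0].text)
```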

docs/nemo-evaluator-launcher/tutorial.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -24,7 +24,7 @@ Hosted endpoints (fastest):
 
 Minimal usage (override endpoint URL and key):
 ```bash
-nemo-evaluator-launcher run --config-dir examples \
+nemo-evaluator-launcher run --config-dir packages/nemo-evaluator-launcher/examples \
   --config-name local_llama_3_1_8b_instruct \
   -o target.api_endpoint.url=https://integrate.api.nvidia.com/v1/chat/completions \
   -o target.api_endpoint.api_key_name=API_KEY
````

docs/nemo-evaluator/reference/api.md

Lines changed: 7 additions & 4 deletions

````diff
@@ -373,20 +373,23 @@ interceptor_config = {
           "enable_thinking": False
         }
       }
-    }
+    },
+    "params_to_remove": ["field_in_msgs_to_remove"],
+    "params_to_rename": {"max_tokens": "max_completion_tokens"}
   }
 }
 ```
 
 **Explanation:**
 
-This interceptor is particularly useful when custom behavior is needed. In this example, the `enable_thinking` parameter is a custom key that controls the reasoning mode of the model. When set to `False`, it disables the model's internal reasoning/thinking process, which can be useful for scenarios where you want more direct responses without the model's step-by-step reasoning output.
+This interceptor is particularly useful when custom behavior is needed. In this example, the `enable_thinking` parameter is a custom key that controls the reasoning mode of the model. When set to `False`, it disables the model's internal reasoning/thinking process, which can be useful for scenarios where you want more direct responses without the model's step-by-step reasoning output.
+The `field_in_msgs_to_remove` field would be removed recursively from all messages in the payload.
 
 **Features:**
 
-- Request payload modification
 - Custom parameter injection
-- Flexible configuration options
+- Remove fields recursively at all levels of the payload
+- Rename top-level payload keys
 
 ### 7. Client Error Interceptor
````
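A plain-Python sketch of the two documented options, to make the semantics concrete (not the interceptor's actual implementation: `params_to_remove` drops a key at every nesting level, while `params_to_rename` touches only top-level keys):

```python
# Illustrative sketch of the documented behavior, not the interceptor's real code.
from typing import Any


def remove_fields(obj: Any, fields: set[str]) -> Any:
    """Drop the named keys at every nesting level (params_to_remove)."""
    if isinstance(obj, dict):
        return {k: remove_fields(v, fields) for k, v in obj.items() if k not in fields}
    if isinstance(obj, list):
        return [remove_fields(item, fields) for item in obj]
    return obj


def rename_top_level(payload: dict, renames: dict[str, str]) -> dict:
    """Rename top-level payload keys only (params_to_rename)."""
    return {renames.get(k, k): v for k, v in payload.items()}


payload = {
    "max_tokens": 256,
    "messages": [{"role": "user", "content": "hi", "field_in_msgs_to_remove": 1}],
}
payload = remove_fields(payload, {"field_in_msgs_to_remove"})
payload = rename_top_level(payload, {"max_tokens": "max_completion_tokens"})
print(payload)  # {'max_completion_tokens': 256, 'messages': [{'role': 'user', 'content': 'hi'}]}
```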

packages/nemo-evaluator-launcher/examples/local_auto_export_llama_3_1_8b_instruct.yaml

Lines changed: 11 additions & 49 deletions

````diff
@@ -18,28 +18,18 @@
 # After successful evaluation, results will be automatically exported to W&B, MLFlow, and GSheets, depending on
 # which exporters you have enabled.
 
-# specify default configs for execution and deployment
+# specify default configs for execution, deployment, and export
 defaults:
   - execution: local
   - deployment: none
   - _self_
 
 execution:
-  output_dir: llama_3_1_8b_instruct_results
+  output_dir: llama_3_1_8b_instruct_auto_export_results
 
 # Auto-export destinations
 auto_export:
-  destinations: ["wandb", "mlflow", "gsheets"]
-
-  # Export-related env vars (for auto-export only)
-  env_vars:
-    # evaluation:
-    #   env vars needed by evaluation container
-    export:
-      # env vars needed by auto-export
-      WANDB_API_KEY: WANDB_API_KEY
-      MLFLOW_TRACKING_URI: MLFLOW_TRACKING_URI # either set here or in the exporter config via tracking_uri
-      PATH: "/path/to/env/bin:$PATH" # needed to load nemo-evaluator-launcher binaries on host machine
+  destinations: ["mlflow"]
 
 target:
   api_endpoint:
@@ -49,52 +39,24 @@ target:
 
 evaluation:
   tasks:
-    - name: simple_evals.gpqa_diamond
-      overrides:
-        config.params.limit_samples: 10
+    - name: gpqa_diamond
       env_vars:
-        HF_TOKEN: HF_TOKEN
+        HF_TOKEN: HF_TOKEN_FOR_GPQA_DIAMOND
 
-# Exporter configurations (for auto-export only)
+# Exporter configurations (paired with auto-export only)
 export:
-  wandb:
-    entity: "nvidia"
-    project: "nemo-evaluator-launcher-test"
-    name: "llama-3.1-8b-instruct_experiment-v1.12"
-    group: "eval-formatting"
-    job_type: "evaluation"
-    tags: ["llama-3.1", "experiment-v1.12", "latest"]
-    description: "Evaluation of Llama 3.1 with prompts formatting"
-    log_metrics: ["accuracy", "pass@1"]
-    log_mode: "multi_task"
-
-    # Additional metadata goes into wandb.config
-    extra_metadata:
-      checkpoint_dir: "path/to/checkpoint"
-      experiment_phase: "baseline"
-      hardware: "H100"
-      custom_param: "any_value"
-
   mlflow:
-    tracking_uri: "http://mlflow.nvidia.com:5000"
-    experiment_name: "AIME-2024_v2"
-    description: "Evaluation of Llama 3.1 with prompts formatting"
+    tracking_uri: "http://mlflow.nvidia.com:5003"
+    experiment_name: "nv-eval"
+    description: "nemo-evaluator-launcher Evaluation test run with auto-export"
     log_metrics: ["accuracy", "pass@1"]
    log_logs: true
     only_required: false
 
     # MLflow tags (key-value pairs)
     tags:
-      framework: "vLLM"
-      precision: "bf16"
+      framework: "none"
 
     # Additional metadata goes into mlflow.log_params()
     extra_metadata:
-      checkpoint_dir: "path/to/checkpoint"
-      experiment_phase: "baseline"
-      hardware: "H100"
-      custom_param: "any_value"
-
-  gsheets:
-    spreadsheet_name: "LLM Evaluation Results"
-    log_metrics: ["accuracy", "pass@1"]
+      experiment_phase: "baseline"
````
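For orientation, the `export.mlflow` block above maps onto standard MLflow client calls, roughly as in this sketch (assumes the `mlflow` package; this is not the launcher's actual exporter code, and the metric value is a placeholder):

```python
# Rough mapping from the example's `export.mlflow` block to MLflow client calls.
# Not the launcher's exporter implementation; illustrative only.
import mlflow

mlflow.set_tracking_uri("http://mlflow.nvidia.com:5003")  # export.mlflow.tracking_uri
mlflow.set_experiment("nv-eval")                          # export.mlflow.experiment_name

with mlflow.start_run(description="nemo-evaluator-launcher Evaluation test run with auto-export"):
    mlflow.set_tags({"framework": "none"})                # export.mlflow.tags
    mlflow.log_params({"experiment_phase": "baseline"})   # export.mlflow.extra_metadata
    # log_metrics: the exporter publishes the selected scores; placeholder value here.
    # (Names like "pass@1" may need sanitizing, since MLflow restricts metric names.)
    mlflow.log_metrics({"accuracy": 0.0})
```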
