
Commit 9d50c27

Merge branch 'main' into pablo-garay/dependabot_automerge_if_successful
2 parents 473f805 + 6bc0f00 commit 9d50c27

27 files changed: +680 −447

.github/workflows/install-test.yml

Lines changed: 90 additions & 56 deletions

````diff
@@ -23,77 +23,111 @@ on:
       - main
       - "pull-request/[0-9]+"
       - "deploy-release/*"
-env:
-  UV_PROJECT_ENVIRONMENT: "./venv"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
 
 jobs:
-  pre-flight:
-    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected]
+  test-nemo-evaluator:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
 
-  pip-test:
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install uv
+        run: |
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
+
+      - name: Install nemo-evaluator
+        working-directory: packages/nemo-evaluator
+        run: |
+          uv pip install --system -e .
+
+      - name: Verify installation
+        run: |
+          python -c "import nemo_evaluator; print(f'nemo-evaluator version: {nemo_evaluator.__version__}')"
+          which nemo-evaluator
+          which eval-factory
+          nemo-evaluator --help
+
+  test-nemo-evaluator-launcher:
     runs-on: ubuntu-latest
-    name: Pip - Python${{ matrix.python-version }} - ${{ matrix.package }} - AMD64/Linux - Ubuntu Latest
-    needs: [pre-flight]
-    if: |
-      !(needs.pre-flight.outputs.docs_only == 'true'
-      || needs.pre-flight.outputs.is_deployment_workflow == 'true')
-    container:
-      image: ubuntu:24.04
-    environment: nemo-ci
     strategy:
-      fail-fast: false
       matrix:
-        python-version: ["3.10", "3.11", "3.12"]
-        package: ["nemo-evaluator"]
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
 
-      - name: Install ${{ matrix.package }}
-        shell: bash -x -e -u -o pipefail {0}
-        run: bash docker/common/install.sh --package ${{ matrix.package }} --python-version "${{ matrix.python-version }}"
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install uv
+        run: |
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
 
-      - name: Checkout check-imports
+      - name: Install nemo-evaluator-launcher
+        working-directory: packages/nemo-evaluator-launcher
+        run: |
+          uv pip install --system -e .
+
+      - name: Verify installation
+        run: |
+          python -c "import nemo_evaluator_launcher; print(f'nemo-evaluator-launcher version: {nemo_evaluator_launcher.__version__}')"
+          which nemo-evaluator-launcher
+          which nv-eval
+          nemo-evaluator-launcher --help
+
+  check-imports:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
+        include:
+          - package: "nemo-evaluator"
+            module: "nemo_evaluator"
+          - package: "nemo-evaluator-launcher"
+            module: "nemo_evaluator_launcher"
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Checkout FW-CI-templates
         uses: actions/checkout@v4
         with:
           repository: NVIDIA-NeMo/FW-CI-templates
-          ref: v0.39.0
-          path: FW-CI-templates
+          path: ./FW-CI-templates
 
-      - name: Check imports for nemo_evaluator
-        uses: ./FW-CI-templates/.github/actions/check-imports
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
         with:
-          package-name: nemo_evaluator
-          python-binary: /opt/venv/bin/python
+          python-version: ${{ matrix.python-version }}
 
-  install-test-summary:
-    needs: [pre-flight, pip-test]
-    runs-on: ubuntu-latest
-    name: Install test summary
-    if: |
-      (
-        needs.pre-flight.outputs.docs_only == 'true'
-        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
-        || always()
-      )
-      && !cancelled()
-    steps:
-      - name: Get workflow result
-        id: result
-        shell: bash -x -e -u -o pipefail {0}
-        env:
-          GH_TOKEN: ${{ github.token }}
-          RUN_ID: ${{ github.run_id }}
-          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
+      - name: Install uv
         run: |
-          FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
-
-          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
-            echo "✅ All previous jobs completed successfully"
-            exit 0
-          else
-            echo "❌ Found $FAILED_JOBS failed job(s)"
-            # Show which jobs failed
-            gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
-            exit 1
-          fi
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
+
+      - name: Install package
+        working-directory: packages/${{ matrix.package }}
+        run: |
+          uv pip install --system -e .
+
+      - name: Check imports
+        uses: ./FW-CI-templates/.github/actions/check-imports
+        with:
+          package-name: ${{ matrix.module }}
+          python-binary: python3
````
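The new jobs boil down to: install each package editable with uv, then confirm the module imports and its console scripts are on PATH. A local equivalent of the "Verify installation" steps, as a minimal Python sketch (standard library only; the package-to-script mapping below simply mirrors what the workflow probes with `which`):

```python
# verify_install.py -- local approximation of the workflow's "Verify installation" steps.
# Assumes the packages were installed first, e.g. `uv pip install -e .` inside
# packages/nemo-evaluator and packages/nemo-evaluator-launcher.
import importlib.metadata
import shutil

# Distribution name -> console scripts the workflow checks with `which`.
CHECKS = {
    "nemo-evaluator": ["nemo-evaluator", "eval-factory"],
    "nemo-evaluator-launcher": ["nemo-evaluator-launcher", "nv-eval"],
}

for dist, scripts in CHECKS.items():
    try:
        print(f"{dist} version: {importlib.metadata.version(dist)}")
    except importlib.metadata.PackageNotFoundError:
        print(f"{dist}: not installed")
        continue
    for script in scripts:
        # shutil.which mirrors the workflow's `which <script>` check.
        print(f"  {script}: {shutil.which(script) or 'NOT on PATH'}")
```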

README.md

Lines changed: 3 additions & 3 deletions

````diff
@@ -12,15 +12,15 @@
 
 NeMo Evaluator is an open-source platform for robust, reproducible, and scalable evaluation of Large Language Models. It enables you to run hundreds of benchmarks across popular evaluation harnesses against any OpenAI-compatible model API. Evaluations execute in open-source Docker containers for auditable and trustworthy results. The platform's containerized architecture allows for the rapid integration of public benchmarks and private datasets.
 
-[Tutorial](./docs/nemo-evaluator-launcher/tutorial.md) | [NeMo FW model evaluations](#-evaluate-checkpoints-trained-by-nemo-framework) | [Supported Benchmarks](#supported-benchmarks-and-evaluation-harnesses) | [Configuration Examples](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/packages/nemo-evaluator-launcher/examples) | [Contribution Guide](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/CONTRIBUTING.md)
+[Tutorial](./docs/nemo-evaluator-launcher/tutorial.md) | [NeMo FW model evaluations](#-evaluate-checkpoints-trained-by-nemo-framework) | [Supported Benchmarks](#-supported-benchmarks-and-evaluation-harnesses) | [Configuration Examples](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/packages/nemo-evaluator-launcher/examples) | [Contribution Guide](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/CONTRIBUTING.md)
 
 ## ✨ Key Pillars
 
 NeMo Evaluator is built on four core principles to provide a reliable and versatile evaluation experience:
 
 - **Reproducibility by Default**: All configurations, random seeds, and software provenance are captured automatically for auditable and repeatable evaluations.
 - **Scale Anywhere**: Run evaluations from a local machine to a Slurm cluster or cloud-native backends like Lepton AI without changing your workflow.
-- **State-of-the-Art Benchmarking**: Access a comprehensive suite of over 100 benchmarks from 18 popular open-source evaluation harnesses. See the full list of [Supported benchmarks and evaluation harnesses](#supported-benchmarks-and-evaluation-harnesses).
+- **State-of-the-Art Benchmarking**: Access a comprehensive suite of over 100 benchmarks from 18 popular open-source evaluation harnesses. See the full list of [Supported benchmarks and evaluation harnesses](#-supported-benchmarks-and-evaluation-harnesses).
 - **Extensible and Customizable**: Integrate new evaluation harnesses, add custom benchmarks with proprietary data, and define custom result exporters for existing MLOps tooling.
 
 ## How It Works: Launcher and Core Engine
@@ -115,7 +115,7 @@ nemo-evaluator-launcher status <job_id_or_invocation_id>
 nemo-evaluator-launcher ls tasks
 ```
 
-- Explore the [Supported Benchmarks](#supported-benchmarks-and-evaluation-harnesses) to see all available harnesses and benchmarks.
+- Explore the [Supported Benchmarks](#-supported-benchmarks-and-evaluation-harnesses) to see all available harnesses and benchmarks.
 - Scale up your evaluations using the [Slurm Executor](./docs/nemo-evaluator-launcher/executors/slurm.md) or [Lepton Executor](./docs/nemo-evaluator-launcher/executors/lepton.md).
 - Learn to evaluate self-hosted models in the extended [Tutorial guide](./docs/nemo-evaluator-launcher/tutorial.md) for nemo-evaluator-launcher.
 - Customize your workflow with [Custom Exporters](./docs/nemo-evaluator-launcher/exporters/overview.md) or by evaluating with [proprietary data](./docs/nemo-evaluator/extending/framework-definition-file.md).
````

docs/nemo-evaluator-launcher/configuration/deployment/vllm.md

Lines changed: 1 addition & 0 deletions

````diff
@@ -16,6 +16,7 @@ See the complete configuration structure in the [vLLM Config File](../../../../p
 - **`tensor_parallel_size`**: Number of GPUs for tensor parallelism (default: 8)
 - **`pipeline_parallel_size`**: Number of pipeline parallel stages (default: 1)
 - **`data_parallel_size`**: Number of replicas for data parallelism (default: 1)
+- **`gpu_memory_utilization`**: Fraction of GPU memory to use for the model (default: 0.95)
 - **`extra_args`**: Additional arguments passed to vLLM server
 - **`env_vars`**: Environment variables for the container
````
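These fields correspond to vLLM's own engine arguments. A minimal sketch of what they control, using vLLM's offline Python API (assumes `pip install vllm`, GPUs that fit the request, and an illustrative model name; the launcher itself starts a vLLM server rather than calling this API directly):

```python
# Illustrative only: the documented launcher fields map onto vLLM engine arguments.
from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # hypothetical example model
    tensor_parallel_size=8,       # deployment.tensor_parallel_size
    pipeline_parallel_size=1,     # deployment.pipeline_parallel_size
    gpu_memory_utilization=0.95,  # deployment.gpu_memory_utilization
)
print(llm.generate("Hello!")[0].outputs[0].text)
```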

docs/nemo-evaluator-launcher/tutorial.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -24,7 +24,7 @@ Hosted endpoints (fastest):
 
 Minimal usage (override endpoint URL and key):
 ```bash
-nemo-evaluator-launcher run --config-dir examples \
+nemo-evaluator-launcher run --config-dir packages/nemo-evaluator-launcher/examples \
   --config-name local_llama_3_1_8b_instruct \
   -o target.api_endpoint.url=https://integrate.api.nvidia.com/v1/chat/completions \
   -o target.api_endpoint.api_key_name=API_KEY
````

docs/nemo-evaluator/reference/api.md

Lines changed: 7 additions & 4 deletions

````diff
@@ -373,20 +373,23 @@ interceptor_config = {
           "enable_thinking": False
         }
       }
-    }
+    },
+    "params_to_remove": ["field_in_msgs_to_remove"],
+    "params_to_rename": {"max_tokens": "max_completion_tokens"}
   }
 }
 ```
 
 **Explanation:**
 
-This interceptor is particularly useful when custom behavior is needed. In this example, the `enable_thinking` parameter is a custom key that controls the reasoning mode of the model. When set to `False`, it disables the model's internal reasoning/thinking process, which can be useful for scenarios where you want more direct responses without the model's step-by-step reasoning output.
+This interceptor is particularly useful when custom behavior is needed. In this example, the `enable_thinking` parameter is a custom key that controls the reasoning mode of the model. When set to `False`, it disables the model's internal reasoning/thinking process, which can be useful for scenarios where you want more direct responses without the model's step-by-step reasoning output.
+The `field_in_msgs_to_remove` field would be removed recursively from all messages in the payload.
 
 **Features:**
 
-- Request payload modification
 - Custom parameter injection
-- Flexible configuration options
+- Remove fields recursively at all levels of the payload
+- Rename top-level payload keys
 
 ### 7. Client Error Interceptor
````
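A plain-Python sketch of the two documented options, to make the semantics concrete (not the interceptor's actual implementation: `params_to_remove` drops a key at every nesting level, while `params_to_rename` touches only top-level keys):

```python
# Illustrative sketch of the documented behavior, not the interceptor's real code.
from typing import Any


def remove_fields(obj: Any, fields: set[str]) -> Any:
    """Drop the named keys at every nesting level (params_to_remove)."""
    if isinstance(obj, dict):
        return {k: remove_fields(v, fields) for k, v in obj.items() if k not in fields}
    if isinstance(obj, list):
        return [remove_fields(item, fields) for item in obj]
    return obj


def rename_top_level(payload: dict, renames: dict[str, str]) -> dict:
    """Rename top-level payload keys only (params_to_rename)."""
    return {renames.get(k, k): v for k, v in payload.items()}


payload = {
    "max_tokens": 256,
    "messages": [{"role": "user", "content": "hi", "field_in_msgs_to_remove": 1}],
}
payload = remove_fields(payload, {"field_in_msgs_to_remove"})
payload = rename_top_level(payload, {"max_tokens": "max_completion_tokens"})
print(payload)  # {'max_completion_tokens': 256, 'messages': [{'role': 'user', 'content': 'hi'}]}
```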

packages/nemo-evaluator-launcher/examples/local_auto_export_llama_3_1_8b_instruct.yaml

Lines changed: 11 additions & 49 deletions

````diff
@@ -18,28 +18,18 @@
 # After successful evaluation, results will be automatically exported to W&B, MLFlow, and GSheets, depending on
 # which exporters you have enabled.
 
-# specify default configs for execution and deployment
+# specify default configs for execution, deployment, and export
 defaults:
   - execution: local
   - deployment: none
   - _self_
 
 execution:
-  output_dir: llama_3_1_8b_instruct_results
+  output_dir: llama_3_1_8b_instruct_auto_export_results
 
 # Auto-export destinations
 auto_export:
-  destinations: ["wandb", "mlflow", "gsheets"]
-
-  # Export-related env vars (for auto-export only)
-  env_vars:
-    # evaluation:
-    #   env vars needed by evaluation container
-    export:
-      # env vars needed by auto-export
-      WANDB_API_KEY: WANDB_API_KEY
-      MLFLOW_TRACKING_URI: MLFLOW_TRACKING_URI # either set here or in the exporter config via tracking_uri
-      PATH: "/path/to/env/bin:$PATH" # needed to load nemo-evaluator-launcher binaries on host machine
+  destinations: ["mlflow"]
 
 target:
   api_endpoint:
@@ -49,52 +39,24 @@ target:
 
 evaluation:
   tasks:
-    - name: simple_evals.gpqa_diamond
-      overrides:
-        config.params.limit_samples: 10
+    - name: gpqa_diamond
       env_vars:
-        HF_TOKEN: HF_TOKEN
+        HF_TOKEN: HF_TOKEN_FOR_GPQA_DIAMOND
 
-# Exporter configurations (for auto-export only)
+# Exporter configurations (paired with auto-export only)
 export:
-  wandb:
-    entity: "nvidia"
-    project: "nemo-evaluator-launcher-test"
-    name: "llama-3.1-8b-instruct_experiment-v1.12"
-    group: "eval-formatting"
-    job_type: "evaluation"
-    tags: ["llama-3.1", "experiment-v1.12", "latest"]
-    description: "Evaluation of Llama 3.1 with prompts formatting"
-    log_metrics: ["accuracy", "pass@1"]
-    log_mode: "multi_task"
-
-    # Additional metadata goes into wandb.config
-    extra_metadata:
-      checkpoint_dir: "path/to/checkpoint"
-      experiment_phase: "baseline"
-      hardware: "H100"
-      custom_param: "any_value"
-
   mlflow:
-    tracking_uri: "http://mlflow.nvidia.com:5000"
-    experiment_name: "AIME-2024_v2"
-    description: "Evaluation of Llama 3.1 with prompts formatting"
+    tracking_uri: "http://mlflow.nvidia.com:5003"
+    experiment_name: "nv-eval"
+    description: "nemo-evaluator-launcher Evaluation test run with auto-export"
     log_metrics: ["accuracy", "pass@1"]
    log_logs: true
     only_required: false
 
     # MLflow tags (key-value pairs)
     tags:
-      framework: "vLLM"
-      precision: "bf16"
+      framework: "none"
 
     # Additional metadata goes into mlflow.log_params()
     extra_metadata:
-      checkpoint_dir: "path/to/checkpoint"
-      experiment_phase: "baseline"
-      hardware: "H100"
-      custom_param: "any_value"
-
-  gsheets:
-    spreadsheet_name: "LLM Evaluation Results"
-    log_metrics: ["accuracy", "pass@1"]
+      experiment_phase: "baseline"
````
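For orientation, the `export.mlflow` block above maps onto standard MLflow client calls, roughly as in this sketch (assumes the `mlflow` package; this is not the launcher's actual exporter code, and the metric value is a placeholder):

```python
# Rough mapping from the example's `export.mlflow` block to MLflow client calls.
# Not the launcher's exporter implementation; illustrative only.
import mlflow

mlflow.set_tracking_uri("http://mlflow.nvidia.com:5003")  # export.mlflow.tracking_uri
mlflow.set_experiment("nv-eval")                          # export.mlflow.experiment_name

with mlflow.start_run(description="nemo-evaluator-launcher Evaluation test run with auto-export"):
    mlflow.set_tags({"framework": "none"})                # export.mlflow.tags
    mlflow.log_params({"experiment_phase": "baseline"})   # export.mlflow.extra_metadata
    # log_metrics: the exporter publishes the selected scores; placeholder value here.
    # (Names like "pass@1" may need sanitizing, since MLflow restricts metric names.)
    mlflow.log_metrics({"accuracy": 0.0})
```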
