diff --git a/README.md b/README.md index 8303905..30956fa 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,6 @@ # Harbor Cookbook + [![](https://dcbadge.limes.pink/api/server/https://discord.gg/6xWPKhGDbA)](https://discord.gg/6xWPKhGDbA) [![Docs](https://img.shields.io/badge/Docs-000000?style=for-the-badge&logo=mdbook&color=105864)](https://harborframework.com/docs) Realistic examples of building evals and optimizing agents using [Harbor](https://github.com/harbor-framework/harbor). diff --git a/harbor_cookbook/recipes/dns-blacklisting/tests/test.sh b/harbor_cookbook/recipes/dns-blacklisting/tests/test.sh index b569242..c43b4f0 100644 --- a/harbor_cookbook/recipes/dns-blacklisting/tests/test.sh +++ b/harbor_cookbook/recipes/dns-blacklisting/tests/test.sh @@ -1,15 +1,18 @@ #!/bin/bash -set -uo pipefail + +apt-get update +apt-get install -y curl curl -LsSf https://astral.sh/uv/0.9.7/install.sh | sh + source $HOME/.local/bin/env uvx \ --with pytest==8.4.1 \ --with pytest-json-ctrf==0.3.5 \ - pytest --ctrf /logs/verifier/ctrf.json /tests/test_dns.py -rA || true + pytest --ctrf /logs/verifier/ctrf.json /tests/test_dns.py -rA -if [ "${PIPESTATUS[0]}" -eq 0 ]; then +if [ $? -eq 0 ]; then echo 1 > /logs/verifier/reward.txt else echo 0 > /logs/verifier/reward.txt diff --git a/harbor_cookbook/recipes/multi-reward/README.md b/harbor_cookbook/recipes/multi-reward/README.md index 17c68d5..e9d8a8f 100644 --- a/harbor_cookbook/recipes/multi-reward/README.md +++ b/harbor_cookbook/recipes/multi-reward/README.md @@ -24,16 +24,14 @@ multi-reward/ ## Run +This recipe writes two reward dimensions (`correctness`, `performance`) to `reward.json`. Harbor's default `mean` metric only supports single-key rewards, so you must pass the included `config.yaml` which uses a custom per-dimension metric: + ```bash -harbor trials start -p harbor_cookbook/recipes/multi-reward +harbor run -p harbor_cookbook/recipes/multi-reward -c harbor_cookbook/recipes/multi-reward/config.yaml ``` -## Metrics note - -Harbor's default `mean` metric only supports single-key `reward.json`. Since this recipe writes two keys (`correctness`, `performance`), running `harbor run` requires a custom metric config: +To run a single trial without metrics (useful for quick iteration): ```bash -harbor run -p harbor_cookbook/recipes/multi-reward -c harbor_cookbook/recipes/multi-reward/config.yaml +harbor trials start -p harbor_cookbook/recipes/multi-reward ``` - -The included `config.yaml` uses a `uv-script` metric (`metrics/per_dimension.py`) that computes mean reward per dimension. diff --git a/harbor_cookbook/recipes/multi-reward/task.toml b/harbor_cookbook/recipes/multi-reward/task.toml index 57a6e5e..7221e29 100644 --- a/harbor_cookbook/recipes/multi-reward/task.toml +++ b/harbor_cookbook/recipes/multi-reward/task.toml @@ -9,5 +9,5 @@ timeout_sec = 120.0 [environment] build_timeout_sec = 600.0 cpus = 1 -memory = "2G" -storage = "10G" +memory_mb = 2048 +storage_mb = 10240 diff --git a/harbor_cookbook/recipes/simple-task/task.toml b/harbor_cookbook/recipes/simple-task/task.toml index 57a6e5e..7221e29 100644 --- a/harbor_cookbook/recipes/simple-task/task.toml +++ b/harbor_cookbook/recipes/simple-task/task.toml @@ -9,5 +9,5 @@ timeout_sec = 120.0 [environment] build_timeout_sec = 600.0 cpus = 1 -memory = "2G" -storage = "10G" +memory_mb = 2048 +storage_mb = 10240