
Commit 37a37fc

feat: Init command improvements (#229)
* docs: remove uvx instructions
* chore: improve empty handler comments and var names
* fix: correct example format in testcase name prompt
* fix: update init command
* fix: enhance error handling and logging in _handle_errors function
* fix: improve logging formatting in error handling and file utilities
* fix: remove unused import of Path in base_init.py
* test: add unit tests for print_table function handling normal and UnicodeError cases
* fix: standardize string quotes in test_print_table_unicodeerror function
* pr fixes
1 parent d94bf6d commit 37a37fc

9 files changed: +142 additions, −30 deletions

docs/getting-started/tutorial.md

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ See supported providers and their required credentials in the [Judge Models and
 simpleval init
 ```

-- [ ] For `Enter eval folder (enter to stop):` enter `story-q-and-a`
+- [ ] For `Enter eval folder...:` enter `story-q-and-a`
 - [ ] For `Enter test case name (enter to stop):` enter `prompt1`
 - [ ] Select a judge provider you want to use, which ever you have access to. <br>For example `open-ai`<br>
 Don't worry if you get an error saying that the necessary credentials are not set; you can set them later.

simpleval/commands/init_command/base_init.py

Lines changed: 38 additions & 14 deletions
@@ -12,6 +12,7 @@
 from simpleval.evaluation.utils import get_empty_eval_set_folder, get_empty_testcase_folder
 from simpleval.exceptions import TerminationError
 from simpleval.utilities.console import print_boxed_message
+from simpleval.utilities.files import is_subpath


 class BaseInit(ABC):
@@ -104,42 +105,65 @@ def _print_specific_instructions(self):
         """

     def _print_common_instructions_pre(self):
-        print(f'{Fore.CYAN}Now it`s your turn, perform the following steps:{Fore.RESET}')
         print()
+        print(f'{Fore.CYAN}Now it`s your turn, perform the following steps:{Fore.RESET}')

     def _print_common_instructions_post(self, new_eval_set_folder: str, new_testcases_folder: str, testcase: str):
         idx = self.post_instructions_start_index

+        if is_subpath(new_eval_set_folder, os.getcwd()):
+            eval_set_folder_to_show = os.path.relpath(new_eval_set_folder, os.getcwd())
+        else:
+            eval_set_folder_to_show = new_eval_set_folder
+
         print()
-        print(f'{Fore.CYAN}{idx}. Populate the ground truth file: {os.path.join(new_eval_set_folder, GROUND_TRUTH_FILE)}{Fore.RESET}')
+        print(
+            f'{Fore.CYAN}{idx}. Populate the ground truth file: {Fore.YELLOW}{os.path.join(new_eval_set_folder, GROUND_TRUTH_FILE)}{Fore.RESET}'
+        )
         idx += 1
         print(f'{Fore.CYAN} This is a jsonl file - each line a valid json representing a test to run{Fore.RESET}')
-        print(f'{Fore.CYAN} set `name`, `description` (optional) and `expected_result`{Fore.RESET}')
         print(
-            f'{Fore.CYAN} payload is whatever you want to pass to the testcase logic (the code you`ll write in `task_handler.py`) as json{Fore.RESET}'
+            f'{Fore.CYAN} set {Fore.YELLOW}`name`{Fore.RESET}, {Fore.YELLOW}`description`{Fore.CYAN} (optional) and {Fore.YELLOW}`expected_result`{Fore.RESET}'
+        )
+        print(
+            f'{Fore.CYAN} {Fore.YELLOW}`payload`{Fore.CYAN} is whatever you want to pass to the testcase logic (the code you`ll write in {Fore.YELLOW}`task_handler.py`{Fore.CYAN}) as json{Fore.RESET}'
         )
         print(f'{Fore.CYAN} e.g. path to image files to use during llm inference{Fore.RESET}')
-        print(f'{Fore.YELLOW} NOTE: Names are unique{Fore.RESET}')
+        print(f'{Fore.YELLOW} NOTE: `name` is unique{Fore.RESET}')
         print()
         print(f'{Fore.CYAN}{idx}. Optionally update the README.md to describe the evaluation{Fore.RESET}')
         idx += 1
         print()
         print(f'{Fore.CYAN}{idx}. Implement the testcase logic{Fore.RESET}')
         idx += 1
-        print(f'{Fore.CYAN} Open {os.path.join(new_testcases_folder, testcase, PLUGIN_FILE_NAME)}{Fore.RESET}')
-        print(f'{Fore.CYAN} Implement the `{PLUGIN_FUNCTION_NAME}` function{Fore.RESET}')
+        print(f'{Fore.CYAN} Open {Fore.YELLOW}{os.path.join(new_testcases_folder, PLUGIN_FILE_NAME)}{Fore.RESET}')
+        print(f'{Fore.CYAN} Implement the {Fore.YELLOW}`{PLUGIN_FUNCTION_NAME}`{Fore.CYAN} function{Fore.RESET}')
         print(f'{Fore.CYAN} This is a typical implementation:{Fore.RESET}')
         print(f'{Fore.CYAN} - Call an llm using the input from payload{Fore.RESET}')
-        print(f'{Fore.CYAN} - Set `prompt` with the prompt you used to call the llm{Fore.RESET}')
-        print(f'{Fore.CYAN} - Set `prediction` with the result from your llm call (the llm model prediction){Fore.RESET}')
-        print(f'{Fore.CYAN} - Set name and payload from your input args as is - this is used by the framework as metadata{Fore.RESET}')
-        print(f'{Fore.YELLOW} NOTE: If it recommended to implement retries on rate limit errors{Fore.RESET}')
-        print(f'{Fore.YELLOW} With bedrock, add @bedrock_limits_retry decorator to your `{PLUGIN_FUNCTION_NAME}`{Fore.RESET}')
-        print(f'{Fore.CYAN} See `configuration` in the docs to set different concurrency per testcase{Fore.RESET}')
+        print(f'{Fore.CYAN} - When returning LlmTaskResult: ')
+        print(f'{Fore.CYAN} - Set {Fore.YELLOW}`prompt`{Fore.CYAN} with the prompt you called the llm with{Fore.RESET}')
+        print(
+            f'{Fore.CYAN} - Set {Fore.YELLOW}`prediction`{Fore.CYAN} with the result from your llm call (the llm model prediction){Fore.RESET}'
+        )
+        print(
+            f'{Fore.CYAN} - Set {Fore.YELLOW}`name`{Fore.CYAN} and {Fore.YELLOW}`payload`{Fore.CYAN} from your input args as is - this is used by the framework as metadata{Fore.RESET}'
+        )
+        print()
+        print(
+            f'{Fore.YELLOW} NOTE: If it recommended to implement retries on rate limit errors on the call to {Fore.YELLOW}`{PLUGIN_FUNCTION_NAME}`{Fore.RESET}'
+        )
+        print(
+            f'{Fore.YELLOW} Check out the built-in retry decorators in {Fore.YELLOW}`simpleval/utilities/retryables.py`{Fore.RESET}'
+        )
+        print()
+        print(
+            f'{Fore.CYAN}See https://cyberark.github.io/simple-llm-eval/latest/users/configuration/ on how to set different concurrency per testcase{Fore.RESET}'
+        )

         print()
         print(f'{Fore.CYAN}{idx}. You are ready to run the evaluation with:{Fore.RESET}')
-        print(f'{Fore.CYAN} `simpleval run -e {new_eval_set_folder} -t {new_testcases_folder}`{Fore.RESET}')
+        print(f'{Fore.YELLOW} `simpleval run -e {eval_set_folder_to_show} -t {testcase}`{Fore.RESET}')
+        print()
         print(f'{Fore.YELLOW} NOTE: {LLM_TASKS_RESULT_FILE} is created in the testcase folder on first run{Fore.RESET}')
         print(f'{Fore.YELLOW} if results exist from previous run you will be prompted on how to proceed{Fore.RESET}')
         print(f'{Fore.YELLOW} you can also pass -o/--overwrite to overwrite all existing results{Fore.RESET}')

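For readers following these printed instructions, a rough sketch of what one ground truth line could contain may help. The field names (`name`, `description`, `expected_result`, `payload`) come from the instructions above; the file name and any further schema requirements are not shown in this diff, so treat the values below as illustrative only.

    import json

    # One line of the ground truth jsonl file - each line is a standalone json object
    ground_truth_line = {
        'name': 'prompt1-basic-story',  # unique, per the NOTE above
        'description': 'Sanity check for the story Q&A prompt',  # optional
        'expected_result': 'The hero finds the hidden key',
        'payload': {'question': 'What does the hero find?'},  # passed as-is to your task logic
    }
    print(json.dumps(ground_truth_line))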
simpleval/commands/init_command/user_functions.py

Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@ def get_eval_dir_from_user() -> str:
     logger.info(f'{Fore.CYAN}The folder name should describe the evaluation you are going to perform.{Fore.RESET}')
     logger.info(f'{Fore.CYAN}For example: {Fore.YELLOW}my_eval{Fore.CYAN} or {Fore.YELLOW}{_example_dir()}{Fore.RESET}')

-    eval_dir = input(f'{Fore.CYAN}\nEnter eval folder (enter to stop): {Fore.RESET}')
+    eval_dir = input(f'{Fore.CYAN}\nEnter eval folder - absolute or relative (enter to stop): {Fore.RESET}')
     if not eval_dir:
         raise TerminationError(f'{Fore.RED}No folder name provided, exiting...{Fore.RESET}')

@@ -65,7 +65,7 @@ def get_testcase_name_from_user() -> str:
     logger.info(f'{Fore.CYAN}Enter the name of your first testcase.{Fore.RESET}')
     logger.info(f'{Fore.CYAN}This should reflect the conditions that you want to run.{Fore.RESET}')
     logger.info(f'{Fore.CYAN}This can be: using different model, different set of prompts, etc.{Fore.RESET}')
-    logger.info(f'{Fore.CYAN}For example: {Fore.YELLOW}sonnet37{Fore.CYAN}, {Fore.YELLOW}prompt_v1`{Fore.RESET}')
+    logger.info(f'{Fore.CYAN}For example: {Fore.YELLOW}sonnet37-prompt-v1{Fore.CYAN}')

     testcase_name = ''
     while not testcase_name:

simpleval/commands/run_command.py

Lines changed: 20 additions & 6 deletions
@@ -148,18 +148,32 @@ def _handle_errors(eval_dir: str, testcase: str, llm_task_errors: list, eval_err
     logger = logging.getLogger(LOGGER_NAME)

     if llm_task_errors:
-        logger.error(f'{Fore.RED}Errors occurred during llm tasks run. {len(llm_task_errors)} error(s) found. Terminating evaluation.')
-        with open(get_llm_task_errors_file(eval_set_dir=eval_dir, testcase=testcase), 'w', encoding='utf-8') as file:
+        logger.error(
+            f'{Fore.RED}Errors occurred during llm tasks run. {len(llm_task_errors)} error(s) found. Terminating evaluation.{Fore.RESET}'
+        )
+        llm_task_error_file = get_llm_task_errors_file(eval_set_dir=eval_dir, testcase=testcase)
+        with open(llm_task_error_file, 'w', encoding='utf-8') as file:
             file.writelines(f'{error}\n{"-" * 120}\n' for error in llm_task_errors)

+        logger.error(f'{Fore.YELLOW}LLM task errors saved to {llm_task_error_file}{Fore.RESET}')
+        print()
+
     if eval_errors:
-        logger.error(f'{Fore.RED}Errors occurred during evaluation. {len(eval_errors)} error(s) found. Terminating evaluation.')
-        logger.error(f'{Fore.RED}Run with --verbose/-v for more details')
-        logger.error(f'{Fore.CYAN}If temporary failures, run again without overwriting (no -o) to run only the failures{Fore.RESET}')
+        logger.error(f'{Fore.RED}Errors occurred during evaluation. {len(eval_errors)} error(s) found. Terminating evaluation.{Fore.RESET}')

-        with open(get_eval_errors_file(eval_set_dir=eval_dir, testcase=testcase), 'w', encoding='utf-8') as file:
+        eval_errors_file = get_eval_errors_file(eval_set_dir=eval_dir, testcase=testcase)
+        with open(eval_errors_file, 'w', encoding='utf-8') as file:
             file.writelines(f'{error}\n{"-" * 120}\n' for error in eval_errors)

+        logger.error(f'{Fore.YELLOW}Eval errors saved to {eval_errors_file}{Fore.RESET}')
+
+    if llm_task_errors or eval_errors:
+        print()
+        logger.error(f'{Fore.YELLOW}Run with --verbose/-v for more details{Fore.RESET}')
+        logger.error(
+            f'{Fore.YELLOW}If these are temporary failures, run again without overwriting (no -o) to run only the failures{Fore.RESET}'
+        )
+

 def print_exec_time(elapsed_time_testcases: float, elapsed_time_runeval: float):
     print()

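The reworked `_handle_errors` boils down to one pattern per error type: write every error to a file, separated by a 120-character rule, then tell the user where that file is. A minimal standalone sketch of the pattern, with a hypothetical file path and logger name (the real code uses `get_llm_task_errors_file` / `get_eval_errors_file` and the simpleval logger):

    import logging

    def save_errors(errors: list, errors_file: str) -> None:
        # Write each error followed by a 120-dash separator, as in the diff above
        with open(errors_file, 'w', encoding='utf-8') as file:
            file.writelines(f'{error}\n{"-" * 120}\n' for error in errors)
        logging.getLogger('example').error(f'Errors saved to {errors_file}')

    save_errors(['timeout calling judge', 'rate limit exceeded'], 'errors.txt')  # made-up errors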
simpleval/eval_sets/empty/testcases/empty/task_handler.py

Lines changed: 7 additions & 5 deletions
@@ -6,27 +6,29 @@
 from simpleval.testcases.schemas.llm_task_result import LlmTaskResult


-# @bedrock_limits_retry - use if using bedrock to call llm
+# Recommended to implement retry here, see built-in @litellm_limits_retry, @bedrock_limits_retry as reference.
 def task_logic(name: str, payload: dict) -> LlmTaskResult:
     """
     Your llm task logic goes here.
     You can (but you don't have to) use the simpleval logger which works with the verbose flag.
     """
-    print('NOTE: implement retries on rate limits. for bedrock, use `@bedrock_limits_retry` decorator (simpleval.utilities.retryables)')
+    print('NOTE: implement retries on rate limits. see simpleval.utilities.retryables for built-in decorators')

     logger = logging.getLogger(LOGGER_NAME)
     logger.debug(f'{__name__}: Running task logic for {name} with payload: {payload}')

     # Implement your logic here - typically call an llm to do your work, using the inputs from payload
-    your_prompt_for_the_llm = 'this is what you send to your llm'
-    llm_response = 'hi there!'
+    # The user prompt is also returned in LlmTaskResult since the judge will use it when making judgement.
+    user_prompt_to_llm = 'Hi LLM, please respond to this prompt, replace with your own prompt'
+    # llm_response = call_an_llm_here(user_prompt=user_prompt_to_llm)
+    llm_response = 'This is the response from the LLM'  # The llm response is returned in LlmTaskResult

     # To log token usage, call this with your token count, when verbose is on (-v) it will write it to tokens-bookkeeping.log
     # log_bookkeeping_data(source='llm', model_name=model_id, input_tokens=input_tokens, output_tokens=output_tokens)

     result = LlmTaskResult(
         name=name,
-        prompt=your_prompt_for_the_llm,  # This is what you sent to your llm
+        prompt=user_prompt_to_llm,  # This is what you sent to your llm
         prediction=llm_response,  # This is what your llm responded
         payload=payload,
     )

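To make the updated template concrete, here is a rough sketch of what a filled-in `task_logic` might look like once a real model call is wired in. The `call_my_llm` helper and the `question` payload key are purely hypothetical, and the retry decorator import is inferred from the `simpleval/utilities/retryables.py` reference above, so verify the exact names before copying.

    from simpleval.testcases.schemas.llm_task_result import LlmTaskResult
    from simpleval.utilities.retryables import bedrock_limits_retry  # name taken from the template comment; verify it


    def call_my_llm(user_prompt: str) -> str:
        # Hypothetical placeholder for a real model call (bedrock, litellm, openai, ...)
        return f'model answer for: {user_prompt}'


    @bedrock_limits_retry  # retry on rate-limit errors, per the NOTE in the template
    def task_logic(name: str, payload: dict) -> LlmTaskResult:
        user_prompt_to_llm = f'Answer the question: {payload["question"]}'  # assumes payload carries a `question` key
        llm_response = call_my_llm(user_prompt_to_llm)

        return LlmTaskResult(
            name=name,                  # returned as-is, used by the framework as metadata
            prompt=user_prompt_to_llm,  # what was sent to the llm; the judge uses it when judging
            prediction=llm_response,    # what the llm responded
            payload=payload,            # returned as-is
        )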
simpleval/utilities/files.py

Lines changed: 15 additions & 0 deletions
@@ -1,5 +1,6 @@
 import logging
 import os
+from pathlib import Path

 from colorama import Fore

@@ -19,3 +20,17 @@ def delete_file(file_path: str, log: bool = True):
     if log:
         logger = logging.getLogger(LOGGER_NAME)
         logger.debug(f'{Fore.YELLOW}`{file_path}` deleted{Fore.RESET}\n')
+
+
+def is_subpath(child_path: str, parent_path: str):
+    child = Path(child_path).resolve()
+    parent = Path(parent_path).resolve()
+
+    if child == parent:
+        return False
+
+    try:
+        child.relative_to(parent)
+        return True
+    except ValueError:
+        return False

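This helper is what lets the init summary above print a short relative path when the new eval set sits under the current working directory, and the absolute path otherwise. A small usage sketch mirroring that call site (the example path is made up):

    import os

    from simpleval.utilities.files import is_subpath

    new_eval_set_folder = '/tmp/work/story-q-and-a'  # example path only

    # Same decision _print_common_instructions_post now makes before printing the `simpleval run` hint
    if is_subpath(new_eval_set_folder, os.getcwd()):
        folder_to_show = os.path.relpath(new_eval_set_folder, os.getcwd())
    else:
        folder_to_show = new_eval_set_folder
    print(folder_to_show)

Note that `is_subpath(path, path)` is deliberately False, which the new unit tests below pin down.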
tests/unit/test_handlers.py

Lines changed: 2 additions & 2 deletions
@@ -11,8 +11,8 @@ def test_empty_task_logic():
     result = empty_task_logic('test_empty', payload)
     assert isinstance(result, LlmTaskResult)
     assert result.name == 'test_empty'
-    assert result.prompt == 'this is what you send to your llm'
-    assert result.prediction == 'hi there!'
+    assert result.prompt == 'Hi LLM, please respond to this prompt, replace with your own prompt'
+    assert result.prediction == 'This is the response from the LLM'
     assert result.payload == payload


tests/unit/test_print_table.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+import builtins
+import pytest
+from unittest.mock import patch
+from simpleval.commands.reporting.utils import print_table
+
+
+@pytest.fixture
+def capture_print(monkeypatch):
+    """Fixture to capture printed output."""
+    output = []
+    monkeypatch.setattr(builtins, 'print', output.append)
+    return output
+
+
+def test_print_table_normal(capture_print):
+    """Test normal table printing with heavy_grid format."""
+    class DummyTabulate:
+        def __init__(self):
+            self.calls = []
+        def __call__(self, table, headers=None, tablefmt=None):
+            self.calls.append((table, headers, tablefmt))
+            return 'formatted-table'
+    dummy_tabulate = DummyTabulate()
+    with patch('simpleval.commands.reporting.utils.tabulate', dummy_tabulate):
+        print_table([[1, 2]], headers=['a', 'b'])
+    assert capture_print == ['formatted-table']
+    assert dummy_tabulate.calls[0] == ([[1, 2]], ['a', 'b'], 'heavy_grid')
+
+
+def test_print_table_unicodeerror(capture_print):
+    """Test fallback when tabulate raises UnicodeError."""
+    def fake_tabulate(table, headers=None, tablefmt=None):
+        if tablefmt == 'heavy_grid':
+            raise UnicodeError('fake unicode error')
+        return 'fallback-table'
+    with patch('simpleval.commands.reporting.utils.tabulate', fake_tabulate):
+        print_table([[1, 2]], headers=['a', 'b'])
+    assert capture_print == ['fallback-table']

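The new tests also double as usage documentation for `print_table`: it renders with tabulate's `heavy_grid` format and falls back to a different format when that raises `UnicodeError`. A one-line usage sketch with made-up data:

    from simpleval.commands.reporting.utils import print_table

    print_table([['story-q-and-a', 'prompt1', 0.92]], headers=['eval set', 'testcase', 'score'])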
tests/unit/test_utils.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+from simpleval.utilities.files import is_subpath
+
+def test_is_subpath_child():
+    path = '/parent/child'
+    parent = '/parent'
+
+    assert is_subpath(path, parent) is True
+
+def test_is_subpath_not_child():
+    path = '/some-dir/child'
+    parent = '/parent'
+
+    assert is_subpath(path, parent) is False
+
+def test_is_subpath_same_path():
+    path = '/parent'
+    parent = '/parent'
+
+    assert is_subpath(path, parent) is False
