
Commit 37a37fc

feat: Init command improvements (#229)
* docs: remove uvx instructions
* chore: improve empty handler comments and var names
* fix: correct example format in testcase name prompt
* fix: update init command
* fix: enhance error handling and logging in _handle_errors function
* fix: improve logging formatting in error handling and file utilities
* fix: remove unused import of Path in base_init.py
* test: add unit tests for print_table function handling normal and UnicodeError cases
* fix: standardize string quotes in test_print_table_unicodeerror function
* pr fixes
1 parent d94bf6d commit 37a37fc

9 files changed: +142 additions, −30 deletions

docs/getting-started/tutorial.md

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ See supported providers and their required credentials in the [Judge Models and
 simpleval init
 ```

-- [ ] For `Enter eval folder (enter to stop):` enter `story-q-and-a`
+- [ ] For `Enter eval folder...:` enter `story-q-and-a`
 - [ ] For `Enter test case name (enter to stop):` enter `prompt1`
 - [ ] Select a judge provider you want to use, which ever you have access to. <br>For example `open-ai`<br>
 Don't worry if you get an error saying that the necessary credentials are not set; you can set them later.

simpleval/commands/init_command/base_init.py

Lines changed: 38 additions & 14 deletions
@@ -12,6 +12,7 @@
 from simpleval.evaluation.utils import get_empty_eval_set_folder, get_empty_testcase_folder
 from simpleval.exceptions import TerminationError
 from simpleval.utilities.console import print_boxed_message
+from simpleval.utilities.files import is_subpath


 class BaseInit(ABC):
@@ -104,42 +105,65 @@ def _print_specific_instructions(self):
         """

     def _print_common_instructions_pre(self):
-        print(f'{Fore.CYAN}Now it`s your turn, perform the following steps:{Fore.RESET}')
         print()
+        print(f'{Fore.CYAN}Now it`s your turn, perform the following steps:{Fore.RESET}')

     def _print_common_instructions_post(self, new_eval_set_folder: str, new_testcases_folder: str, testcase: str):
         idx = self.post_instructions_start_index

+        if is_subpath(new_eval_set_folder, os.getcwd()):
+            eval_set_folder_to_show = os.path.relpath(new_eval_set_folder, os.getcwd())
+        else:
+            eval_set_folder_to_show = new_eval_set_folder
+
         print()
-        print(f'{Fore.CYAN}{idx}. Populate the ground truth file: {os.path.join(new_eval_set_folder, GROUND_TRUTH_FILE)}{Fore.RESET}')
+        print(
+            f'{Fore.CYAN}{idx}. Populate the ground truth file: {Fore.YELLOW}{os.path.join(new_eval_set_folder, GROUND_TRUTH_FILE)}{Fore.RESET}'
+        )
         idx += 1
         print(f'{Fore.CYAN} This is a jsonl file - each line a valid json representing a test to run{Fore.RESET}')
-        print(f'{Fore.CYAN} set `name`, `description` (optional) and `expected_result`{Fore.RESET}')
         print(
-            f'{Fore.CYAN} payload is whatever you want to pass to the testcase logic (the code you`ll write in `task_handler.py`) as json{Fore.RESET}'
+            f'{Fore.CYAN} set {Fore.YELLOW}`name`{Fore.RESET}, {Fore.YELLOW}`description`{Fore.CYAN} (optional) and {Fore.YELLOW}`expected_result`{Fore.RESET}'
+        )
+        print(
+            f'{Fore.CYAN} {Fore.YELLOW}`payload`{Fore.CYAN} is whatever you want to pass to the testcase logic (the code you`ll write in {Fore.YELLOW}`task_handler.py`{Fore.CYAN}) as json{Fore.RESET}'
         )
         print(f'{Fore.CYAN} e.g. path to image files to use during llm inference{Fore.RESET}')
-        print(f'{Fore.YELLOW} NOTE: Names are unique{Fore.RESET}')
+        print(f'{Fore.YELLOW} NOTE: `name` is unique{Fore.RESET}')
         print()
         print(f'{Fore.CYAN}{idx}. Optionally update the README.md to describe the evaluation{Fore.RESET}')
         idx += 1
         print()
         print(f'{Fore.CYAN}{idx}. Implement the testcase logic{Fore.RESET}')
         idx += 1
-        print(f'{Fore.CYAN} Open {os.path.join(new_testcases_folder, testcase, PLUGIN_FILE_NAME)}{Fore.RESET}')
-        print(f'{Fore.CYAN} Implement the `{PLUGIN_FUNCTION_NAME}` function{Fore.RESET}')
+        print(f'{Fore.CYAN} Open {Fore.YELLOW}{os.path.join(new_testcases_folder, PLUGIN_FILE_NAME)}{Fore.RESET}')
+        print(f'{Fore.CYAN} Implement the {Fore.YELLOW}`{PLUGIN_FUNCTION_NAME}`{Fore.CYAN} function{Fore.RESET}')
         print(f'{Fore.CYAN} This is a typical implementation:{Fore.RESET}')
         print(f'{Fore.CYAN} - Call an llm using the input from payload{Fore.RESET}')
-        print(f'{Fore.CYAN} - Set `prompt` with the prompt you used to call the llm{Fore.RESET}')
-        print(f'{Fore.CYAN} - Set `prediction` with the result from your llm call (the llm model prediction){Fore.RESET}')
-        print(f'{Fore.CYAN} - Set name and payload from your input args as is - this is used by the framework as metadata{Fore.RESET}')
-        print(f'{Fore.YELLOW} NOTE: If it recommended to implement retries on rate limit errors{Fore.RESET}')
-        print(f'{Fore.YELLOW} With bedrock, add @bedrock_limits_retry decorator to your `{PLUGIN_FUNCTION_NAME}`{Fore.RESET}')
-        print(f'{Fore.CYAN} See `configuration` in the docs to set different concurrency per testcase{Fore.RESET}')
+        print(f'{Fore.CYAN} - When returning LlmTaskResult: ')
+        print(f'{Fore.CYAN} - Set {Fore.YELLOW}`prompt`{Fore.CYAN} with the prompt you called the llm with{Fore.RESET}')
+        print(
+            f'{Fore.CYAN} - Set {Fore.YELLOW}`prediction`{Fore.CYAN} with the result from your llm call (the llm model prediction){Fore.RESET}'
+        )
+        print(
+            f'{Fore.CYAN} - Set {Fore.YELLOW}`name`{Fore.CYAN} and {Fore.YELLOW}`payload`{Fore.CYAN} from your input args as is - this is used by the framework as metadata{Fore.RESET}'
+        )
+        print()
+        print(
+            f'{Fore.YELLOW} NOTE: If it recommended to implement retries on rate limit errors on the call to {Fore.YELLOW}`{PLUGIN_FUNCTION_NAME}`{Fore.RESET}'
+        )
+        print(
+            f'{Fore.YELLOW} Check out the built-in retry decorators in {Fore.YELLOW}`simpleval/utilities/retryables.py`{Fore.RESET}'
+        )
+        print()
+        print(
+            f'{Fore.CYAN}See https://cyberark.github.io/simple-llm-eval/latest/users/configuration/ on how to set different concurrency per testcase{Fore.RESET}'
+        )

         print()
         print(f'{Fore.CYAN}{idx}. You are ready to run the evaluation with:{Fore.RESET}')
-        print(f'{Fore.CYAN} `simpleval run -e {new_eval_set_folder} -t {new_testcases_folder}`{Fore.RESET}')
+        print(f'{Fore.YELLOW} `simpleval run -e {eval_set_folder_to_show} -t {testcase}`{Fore.RESET}')
+        print()
         print(f'{Fore.YELLOW} NOTE: {LLM_TASKS_RESULT_FILE} is created in the testcase folder on first run{Fore.RESET}')
         print(f'{Fore.YELLOW} if results exist from previous run you will be prompted on how to proceed{Fore.RESET}')
         print(f'{Fore.YELLOW} you can also pass -o/--overwrite to overwrite all existing results{Fore.RESET}')

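For readers following these printed instructions, a rough sketch of what one ground truth line could contain may help. The field names (`name`, `description`, `expected_result`, `payload`) come from the instructions above; the file name and any further schema requirements are not shown in this diff, so treat the values below as illustrative only.

    import json

    # One line of the ground truth jsonl file - each line is a standalone json object
    ground_truth_line = {
        'name': 'prompt1-basic-story',  # unique, per the NOTE above
        'description': 'Sanity check for the story Q&A prompt',  # optional
        'expected_result': 'The hero finds the hidden key',
        'payload': {'question': 'What does the hero find?'},  # passed as-is to your task logic
    }
    print(json.dumps(ground_truth_line))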
simpleval/commands/init_command/user_functions.py

Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@ def get_eval_dir_from_user() -> str:
     logger.info(f'{Fore.CYAN}The folder name should describe the evaluation you are going to perform.{Fore.RESET}')
     logger.info(f'{Fore.CYAN}For example: {Fore.YELLOW}my_eval{Fore.CYAN} or {Fore.YELLOW}{_example_dir()}{Fore.RESET}')

-    eval_dir = input(f'{Fore.CYAN}\nEnter eval folder (enter to stop): {Fore.RESET}')
+    eval_dir = input(f'{Fore.CYAN}\nEnter eval folder - absolute or relative (enter to stop): {Fore.RESET}')
     if not eval_dir:
         raise TerminationError(f'{Fore.RED}No folder name provided, exiting...{Fore.RESET}')

@@ -65,7 +65,7 @@ def get_testcase_name_from_user() -> str:
     logger.info(f'{Fore.CYAN}Enter the name of your first testcase.{Fore.RESET}')
     logger.info(f'{Fore.CYAN}This should reflect the conditions that you want to run.{Fore.RESET}')
     logger.info(f'{Fore.CYAN}This can be: using different model, different set of prompts, etc.{Fore.RESET}')
-    logger.info(f'{Fore.CYAN}For example: {Fore.YELLOW}sonnet37{Fore.CYAN}, {Fore.YELLOW}prompt_v1`{Fore.RESET}')
+    logger.info(f'{Fore.CYAN}For example: {Fore.YELLOW}sonnet37-prompt-v1{Fore.CYAN}')

     testcase_name = ''
     while not testcase_name:

simpleval/commands/run_command.py

Lines changed: 20 additions & 6 deletions
@@ -148,18 +148,32 @@ def _handle_errors(eval_dir: str, testcase: str, llm_task_errors: list, eval_err
     logger = logging.getLogger(LOGGER_NAME)

     if llm_task_errors:
-        logger.error(f'{Fore.RED}Errors occurred during llm tasks run. {len(llm_task_errors)} error(s) found. Terminating evaluation.')
-        with open(get_llm_task_errors_file(eval_set_dir=eval_dir, testcase=testcase), 'w', encoding='utf-8') as file:
+        logger.error(
+            f'{Fore.RED}Errors occurred during llm tasks run. {len(llm_task_errors)} error(s) found. Terminating evaluation.{Fore.RESET}'
+        )
+        llm_task_error_file = get_llm_task_errors_file(eval_set_dir=eval_dir, testcase=testcase)
+        with open(llm_task_error_file, 'w', encoding='utf-8') as file:
             file.writelines(f'{error}\n{"-" * 120}\n' for error in llm_task_errors)

+        logger.error(f'{Fore.YELLOW}LLM task errors saved to {llm_task_error_file}{Fore.RESET}')
+        print()
+
     if eval_errors:
-        logger.error(f'{Fore.RED}Errors occurred during evaluation. {len(eval_errors)} error(s) found. Terminating evaluation.')
-        logger.error(f'{Fore.RED}Run with --verbose/-v for more details')
-        logger.error(f'{Fore.CYAN}If temporary failures, run again without overwriting (no -o) to run only the failures{Fore.RESET}')
+        logger.error(f'{Fore.RED}Errors occurred during evaluation. {len(eval_errors)} error(s) found. Terminating evaluation.{Fore.RESET}')

-        with open(get_eval_errors_file(eval_set_dir=eval_dir, testcase=testcase), 'w', encoding='utf-8') as file:
+        eval_errors_file = get_eval_errors_file(eval_set_dir=eval_dir, testcase=testcase)
+        with open(eval_errors_file, 'w', encoding='utf-8') as file:
             file.writelines(f'{error}\n{"-" * 120}\n' for error in eval_errors)

+        logger.error(f'{Fore.YELLOW}Eval errors saved to {eval_errors_file}{Fore.RESET}')
+
+    if llm_task_errors or eval_errors:
+        print()
+        logger.error(f'{Fore.YELLOW}Run with --verbose/-v for more details{Fore.RESET}')
+        logger.error(
+            f'{Fore.YELLOW}If these are temporary failures, run again without overwriting (no -o) to run only the failures{Fore.RESET}'
+        )
+

 def print_exec_time(elapsed_time_testcases: float, elapsed_time_runeval: float):
     print()

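The reworked `_handle_errors` boils down to one pattern per error type: write every error to a file, separated by a 120-character rule, then tell the user where that file is. A minimal standalone sketch of the pattern, with a hypothetical file path and logger name (the real code uses `get_llm_task_errors_file` / `get_eval_errors_file` and the simpleval logger):

    import logging

    def save_errors(errors: list, errors_file: str) -> None:
        # Write each error followed by a 120-dash separator, as in the diff above
        with open(errors_file, 'w', encoding='utf-8') as file:
            file.writelines(f'{error}\n{"-" * 120}\n' for error in errors)
        logging.getLogger('example').error(f'Errors saved to {errors_file}')

    save_errors(['timeout calling judge', 'rate limit exceeded'], 'errors.txt')  # made-up errors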
simpleval/eval_sets/empty/testcases/empty/task_handler.py

Lines changed: 7 additions & 5 deletions
@@ -6,27 +6,29 @@
 from simpleval.testcases.schemas.llm_task_result import LlmTaskResult


-# @bedrock_limits_retry - use if using bedrock to call llm
+# Recommended to implement retry here, see built-in @litellm_limits_retry, @bedrock_limits_retry as reference.
 def task_logic(name: str, payload: dict) -> LlmTaskResult:
     """
     Your llm task logic goes here.
     You can (but you don't have to) use the simpleval logger which works with the verbose flag.
     """
-    print('NOTE: implement retries on rate limits. for bedrock, use `@bedrock_limits_retry` decorator (simpleval.utilities.retryables)')
+    print('NOTE: implement retries on rate limits. see simpleval.utilities.retryables for built-in decorators')

     logger = logging.getLogger(LOGGER_NAME)
     logger.debug(f'{__name__}: Running task logic for {name} with payload: {payload}')

     # Implement your logic here - typically call an llm to do your work, using the inputs from payload
-    your_prompt_for_the_llm = 'this is what you send to your llm'
-    llm_response = 'hi there!'
+    # The user prompt is also returned in LlmTaskResult since the judge will use it when making judgement.
+    user_prompt_to_llm = 'Hi LLM, please respond to this prompt, replace with your own prompt'
+    # llm_response = call_an_llm_here(user_prompt=user_prompt_to_llm)
+    llm_response = 'This is the response from the LLM'  # The llm response is returned in LlmTaskResult

     # To log token usage, call this with your token count, when verbose is on (-v) it will write it to tokens-bookkeeping.log
     # log_bookkeeping_data(source='llm', model_name=model_id, input_tokens=input_tokens, output_tokens=output_tokens)

     result = LlmTaskResult(
         name=name,
-        prompt=your_prompt_for_the_llm,  # This is what you sent to your llm
+        prompt=user_prompt_to_llm,  # This is what you sent to your llm
         prediction=llm_response,  # This is what your llm responded
         payload=payload,
     )

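To make the updated template concrete, here is a rough sketch of what a filled-in `task_logic` might look like once a real model call is wired in. The `call_my_llm` helper and the `question` payload key are purely hypothetical, and the retry decorator import is inferred from the `simpleval/utilities/retryables.py` reference above, so verify the exact names before copying.

    from simpleval.testcases.schemas.llm_task_result import LlmTaskResult
    from simpleval.utilities.retryables import bedrock_limits_retry  # name taken from the template comment; verify it


    def call_my_llm(user_prompt: str) -> str:
        # Hypothetical placeholder for a real model call (bedrock, litellm, openai, ...)
        return f'model answer for: {user_prompt}'


    @bedrock_limits_retry  # retry on rate-limit errors, per the NOTE in the template
    def task_logic(name: str, payload: dict) -> LlmTaskResult:
        user_prompt_to_llm = f'Answer the question: {payload["question"]}'  # assumes payload carries a `question` key
        llm_response = call_my_llm(user_prompt_to_llm)

        return LlmTaskResult(
            name=name,                  # returned as-is, used by the framework as metadata
            prompt=user_prompt_to_llm,  # what was sent to the llm; the judge uses it when judging
            prediction=llm_response,    # what the llm responded
            payload=payload,            # returned as-is
        )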
simpleval/utilities/files.py

Lines changed: 15 additions & 0 deletions
@@ -1,5 +1,6 @@
 import logging
 import os
+from pathlib import Path

 from colorama import Fore

@@ -19,3 +20,17 @@ def delete_file(file_path: str, log: bool = True):
     if log:
         logger = logging.getLogger(LOGGER_NAME)
         logger.debug(f'{Fore.YELLOW}`{file_path}` deleted{Fore.RESET}\n')
+
+
+def is_subpath(child_path: str, parent_path: str):
+    child = Path(child_path).resolve()
+    parent = Path(parent_path).resolve()
+
+    if child == parent:
+        return False
+
+    try:
+        child.relative_to(parent)
+        return True
+    except ValueError:
+        return False

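This helper is what lets the init summary above print a short relative path when the new eval set sits under the current working directory, and the absolute path otherwise. A small usage sketch mirroring that call site (the example path is made up):

    import os

    from simpleval.utilities.files import is_subpath

    new_eval_set_folder = '/tmp/work/story-q-and-a'  # example path only

    # Same decision _print_common_instructions_post now makes before printing the `simpleval run` hint
    if is_subpath(new_eval_set_folder, os.getcwd()):
        folder_to_show = os.path.relpath(new_eval_set_folder, os.getcwd())
    else:
        folder_to_show = new_eval_set_folder
    print(folder_to_show)

Note that `is_subpath(path, path)` is deliberately False, which the new unit tests below pin down.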
tests/unit/test_handlers.py

Lines changed: 2 additions & 2 deletions
@@ -11,8 +11,8 @@ def test_empty_task_logic():
     result = empty_task_logic('test_empty', payload)
     assert isinstance(result, LlmTaskResult)
     assert result.name == 'test_empty'
-    assert result.prompt == 'this is what you send to your llm'
-    assert result.prediction == 'hi there!'
+    assert result.prompt == 'Hi LLM, please respond to this prompt, replace with your own prompt'
+    assert result.prediction == 'This is the response from the LLM'
     assert result.payload == payload


tests/unit/test_print_table.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+import builtins
+import pytest
+from unittest.mock import patch
+from simpleval.commands.reporting.utils import print_table
+
+
+@pytest.fixture
+def capture_print(monkeypatch):
+    """Fixture to capture printed output."""
+    output = []
+    monkeypatch.setattr(builtins, 'print', output.append)
+    return output
+
+
+def test_print_table_normal(capture_print):
+    """Test normal table printing with heavy_grid format."""
+    class DummyTabulate:
+        def __init__(self):
+            self.calls = []
+        def __call__(self, table, headers=None, tablefmt=None):
+            self.calls.append((table, headers, tablefmt))
+            return 'formatted-table'
+    dummy_tabulate = DummyTabulate()
+    with patch('simpleval.commands.reporting.utils.tabulate', dummy_tabulate):
+        print_table([[1, 2]], headers=['a', 'b'])
+    assert capture_print == ['formatted-table']
+    assert dummy_tabulate.calls[0] == ([[1, 2]], ['a', 'b'], 'heavy_grid')
+
+
+def test_print_table_unicodeerror(capture_print):
+    """Test fallback when tabulate raises UnicodeError."""
+    def fake_tabulate(table, headers=None, tablefmt=None):
+        if tablefmt == 'heavy_grid':
+            raise UnicodeError('fake unicode error')
+        return 'fallback-table'
+    with patch('simpleval.commands.reporting.utils.tabulate', fake_tabulate):
+        print_table([[1, 2]], headers=['a', 'b'])
+    assert capture_print == ['fallback-table']

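The new tests also double as usage documentation for `print_table`: it renders with tabulate's `heavy_grid` format and falls back to a different format when that raises `UnicodeError`. A one-line usage sketch with made-up data:

    from simpleval.commands.reporting.utils import print_table

    print_table([['story-q-and-a', 'prompt1', 0.92]], headers=['eval set', 'testcase', 'score'])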
tests/unit/test_utils.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+from simpleval.utilities.files import is_subpath
+
+def test_is_subpath_child():
+    path = '/parent/child'
+    parent = '/parent'
+
+    assert is_subpath(path, parent) is True
+
+def test_is_subpath_not_child():
+    path = '/some-dir/child'
+    parent = '/parent'
+
+    assert is_subpath(path, parent) is False
+
+def test_is_subpath_same_path():
+    path = '/parent'
+    parent = '/parent'
+
+    assert is_subpath(path, parent) is False
