@@ -12,6 +12,7 @@
 from simpleval.evaluation.utils import get_empty_eval_set_folder, get_empty_testcase_folder
 from simpleval.exceptions import TerminationError
 from simpleval.utilities.console import print_boxed_message
+from simpleval.utilities.files import is_subpath
 
 
 class BaseInit(ABC):
@@ -104,42 +105,65 @@ def _print_specific_instructions(self): |
         """
 
     def _print_common_instructions_pre(self):
-        print(f'{Fore.CYAN}Now it`s your turn, perform the following steps:{Fore.RESET}')
         print()
+        print(f'{Fore.CYAN}Now it`s your turn, perform the following steps:{Fore.RESET}')
 
     def _print_common_instructions_post(self, new_eval_set_folder: str, new_testcases_folder: str, testcase: str):
         idx = self.post_instructions_start_index
 
+        if is_subpath(new_eval_set_folder, os.getcwd()):
+            eval_set_folder_to_show = os.path.relpath(new_eval_set_folder, os.getcwd())
+        else:
+            eval_set_folder_to_show = new_eval_set_folder
+
         print()
-        print(f'{Fore.CYAN}{idx}. Populate the ground truth file: {os.path.join(new_eval_set_folder, GROUND_TRUTH_FILE)}{Fore.RESET}')
+        print(
+            f'{Fore.CYAN}{idx}. Populate the ground truth file: {Fore.YELLOW}{os.path.join(new_eval_set_folder, GROUND_TRUTH_FILE)}{Fore.RESET}'
+        )
         idx += 1
         print(f'{Fore.CYAN} This is a jsonl file - each line a valid json representing a test to run{Fore.RESET}')
-        print(f'{Fore.CYAN} set `name`, `description` (optional) and `expected_result`{Fore.RESET}')
         print(
-            f'{Fore.CYAN} payload is whatever you want to pass to the testcase logic (the code you`ll write in `task_handler.py`) as json{Fore.RESET}'
+            f'{Fore.CYAN} set {Fore.YELLOW}`name`{Fore.RESET}, {Fore.YELLOW}`description`{Fore.CYAN} (optional) and {Fore.YELLOW}`expected_result`{Fore.RESET}'
+        )
+        print(
+            f'{Fore.CYAN} {Fore.YELLOW}`payload`{Fore.CYAN} is whatever you want to pass to the testcase logic (the code you`ll write in {Fore.YELLOW}`task_handler.py`{Fore.CYAN}) as json{Fore.RESET}'
         )
         print(f'{Fore.CYAN} e.g. path to image files to use during llm inference{Fore.RESET}')
-        print(f'{Fore.YELLOW} NOTE: Names are unique{Fore.RESET}')
+        print(f'{Fore.YELLOW} NOTE: `name` must be unique{Fore.RESET}')
         print()
         print(f'{Fore.CYAN}{idx}. Optionally update the README.md to describe the evaluation{Fore.RESET}')
         idx += 1
         print()
         print(f'{Fore.CYAN}{idx}. Implement the testcase logic{Fore.RESET}')
         idx += 1
-        print(f'{Fore.CYAN} Open {os.path.join(new_testcases_folder, testcase, PLUGIN_FILE_NAME)}{Fore.RESET}')
-        print(f'{Fore.CYAN} Implement the `{PLUGIN_FUNCTION_NAME}` function{Fore.RESET}')
+        print(f'{Fore.CYAN} Open {Fore.YELLOW}{os.path.join(new_testcases_folder, PLUGIN_FILE_NAME)}{Fore.RESET}')
+        print(f'{Fore.CYAN} Implement the {Fore.YELLOW}`{PLUGIN_FUNCTION_NAME}`{Fore.CYAN} function{Fore.RESET}')
         print(f'{Fore.CYAN} This is a typical implementation:{Fore.RESET}')
         print(f'{Fore.CYAN} - Call an llm using the input from payload{Fore.RESET}')
-        print(f'{Fore.CYAN} - Set `prompt` with the prompt you used to call the llm{Fore.RESET}')
-        print(f'{Fore.CYAN} - Set `prediction` with the result from your llm call (the llm model prediction){Fore.RESET}')
-        print(f'{Fore.CYAN} - Set name and payload from your input args as is - this is used by the framework as metadata{Fore.RESET}')
-        print(f'{Fore.YELLOW} NOTE: If it recommended to implement retries on rate limit errors{Fore.RESET}')
-        print(f'{Fore.YELLOW} With bedrock, add @bedrock_limits_retry decorator to your `{PLUGIN_FUNCTION_NAME}`{Fore.RESET}')
-        print(f'{Fore.CYAN} See `configuration` in the docs to set different concurrency per testcase{Fore.RESET}')
+        print(f'{Fore.CYAN} - When returning LlmTaskResult:{Fore.RESET}')
+        print(f'{Fore.CYAN} - Set {Fore.YELLOW}`prompt`{Fore.CYAN} with the prompt you called the llm with{Fore.RESET}')
+        print(
+            f'{Fore.CYAN} - Set {Fore.YELLOW}`prediction`{Fore.CYAN} with the result from your llm call (the llm model prediction){Fore.RESET}'
+        )
+        print(
+            f'{Fore.CYAN} - Set {Fore.YELLOW}`name`{Fore.CYAN} and {Fore.YELLOW}`payload`{Fore.CYAN} from your input args as is - this is used by the framework as metadata{Fore.RESET}'
+        )
+        print()
+        print(
+            f'{Fore.YELLOW} NOTE: It is recommended to implement retries on rate limit errors on the call to {Fore.YELLOW}`{PLUGIN_FUNCTION_NAME}`{Fore.RESET}'
+        )
+        print(
+            f'{Fore.YELLOW} Check out the built-in retry decorators in {Fore.YELLOW}`simpleval/utilities/retryables.py`{Fore.RESET}'
+        )
+        print()
+        print(
+            f'{Fore.CYAN}See https://cyberark.github.io/simple-llm-eval/latest/users/configuration/ on how to set different concurrency per testcase{Fore.RESET}'
+        )
 
         print()
         print(f'{Fore.CYAN}{idx}. You are ready to run the evaluation with:{Fore.RESET}')
-        print(f'{Fore.CYAN} `simpleval run -e {new_eval_set_folder} -t {new_testcases_folder}`{Fore.RESET}')
+        print(f'{Fore.YELLOW} `simpleval run -e {eval_set_folder_to_show} -t {testcase}`{Fore.RESET}')
+        print()
         print(f'{Fore.YELLOW} NOTE: {LLM_TASKS_RESULT_FILE} is created in the testcase folder on first run{Fore.RESET}')
         print(f'{Fore.YELLOW} if results exist from previous run you will be prompted on how to proceed{Fore.RESET}')
         print(f'{Fore.YELLOW} you can also pass -o/--overwrite to overwrite all existing results{Fore.RESET}')
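
For reference, a ground truth line with the fields these instructions name (`name`, `description` optional, `expected_result`, and a free-form `payload` that is handed to the testcase logic) might look like the following; the values and the image-path payload are invented for illustration:

```jsonl
{"name": "describe-cat-image", "description": "caption a photo of a cat", "expected_result": "A cat sitting on a windowsill", "payload": {"image_path": "images/cat.png"}}
```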
|
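The instructions refer to the plugin function only through the `PLUGIN_FUNCTION_NAME` constant and show neither `LlmTaskResult` nor the retryables module, so the following is a minimal sketch under assumed names: `task_logic` stands in for the real plugin function, `call_my_llm` for your model call, the import paths are guesses to adjust, and `bedrock_limits_retry` is the decorator the pre-change text named for Bedrock rate limits:

```python
# Hypothetical sketch of a testcase plugin (task_handler.py) - names and import
# paths are assumptions; check PLUGIN_FUNCTION_NAME, LlmTaskResult's real module,
# and simpleval/utilities/retryables.py in the repo for the actual API.
from simpleval.evaluation.schemas.llm_task_result import LlmTaskResult  # assumed path
from simpleval.utilities.retryables import bedrock_limits_retry  # assumed path


def call_my_llm(prompt: str) -> str:
    """Stand-in for your actual llm call (e.g. a Bedrock invoke)."""
    return 'stubbed model output'


@bedrock_limits_retry  # retry on rate limit errors, as the instructions recommend
def task_logic(name: str, payload: dict) -> LlmTaskResult:
    # Build the prompt from whatever you put in the ground truth payload
    # (here, the hypothetical image_path from the jsonl example above)
    prompt = f'Describe the image at {payload["image_path"]}'
    prediction = call_my_llm(prompt)  # the llm model prediction

    return LlmTaskResult(
        name=name,  # echo the input args as-is - used by the framework as metadata
        payload=payload,
        prompt=prompt,  # the prompt you called the llm with
        prediction=prediction,
    )
```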
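The diff imports the new `is_subpath` helper from `simpleval.utilities.files` without showing its body. Assuming it does the usual resolved-path containment check, a sketch could look like this; it is what lets `_print_common_instructions_post` print a short relative path whenever the eval set folder sits under the current working directory:

```python
import os


def is_subpath(path: str, parent: str) -> bool:
    """Sketch only - the real helper in simpleval.utilities.files may differ.

    True if `path` resolves to a location inside `parent`.
    """
    path = os.path.realpath(path)
    parent = os.path.realpath(parent)
    # commonpath raises ValueError for paths on different drives (Windows),
    # in which case `path` cannot be under `parent`
    try:
        return os.path.commonpath([path, parent]) == parent
    except ValueError:
        return False
```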