Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ share/python-wheels/
*.egg
MANIFEST
requirements.txt

temp/
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
Expand Down
94 changes: 77 additions & 17 deletions evaluation/benchmarks/swe_bench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,27 +120,37 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata) -> MessageActio
mode = metadata.details['mode']
llm_model = metadata.llm_config.model

# Determine the template file based on mode and LLM
if metadata.instruction_template_name:
template_name = metadata.instruction_template_name
elif mode.startswith('swt'):
template_name = 'swt.j2'
elif mode == 'swe':
if 'gpt-4.1' in llm_model:
template_name = 'swe_gpt4.j2'
else:
template_name = (
'swe_default.j2' # Default for 'swe' mode (regular swe-bench)
)
# Check for a custom instruction template path; when it points to an existing file it takes precedence over the name/mode-based selection below
custom_instruction_template_path = metadata.details.get('instruction_template_path')

if custom_instruction_template_path and os.path.isfile(custom_instruction_template_path):
# Use custom instruction template from provided path
prompts_dir = os.path.dirname(custom_instruction_template_path)
template_name = os.path.basename(custom_instruction_template_path)
logger.info(f'Using custom instruction template: {custom_instruction_template_path}')
else:
# Fallback or error handling if mode is unexpected
logger.error(f'Unexpected evaluation mode: {mode}. Falling back to default.')
template_name = 'swe_default.j2'
# Determine the template file based on mode and LLM
if metadata.instruction_template_name:
template_name = metadata.instruction_template_name
elif mode.startswith('swt'):
template_name = 'swt.j2'
elif mode == 'swe':
if 'gpt-4.1' in llm_model:
template_name = 'swe_gpt4.j2'
else:
template_name = (
'swe_default.j2' # Default for 'swe' mode (regular swe-bench)
)
else:
# Fallback or error handling if mode is unexpected
logger.error(f'Unexpected evaluation mode: {mode}. Falling back to default.')
template_name = 'swe_default.j2'

# Default prompts directory
prompts_dir = os.path.join(os.path.dirname(__file__), 'prompts')

logger.debug(f'Using instruction template file: {template_name}')
# Set up Jinja2 environment
# Assuming templates are in 'evaluation/benchmarks/swe_bench/prompts' relative to this script
prompts_dir = os.path.join(os.path.dirname(__file__), 'prompts')
env = Environment(loader=FileSystemLoader(prompts_dir))
template = env.get_template(template_name)

Expand Down Expand Up @@ -214,6 +224,9 @@ def get_instance_docker_image(
return (docker_image_prefix.rstrip('/') + '/' + image_name).lower()





def get_config(
instance: pd.Series,
metadata: EvalMetadata,
Expand Down Expand Up @@ -273,6 +286,8 @@ def get_config(
system_prompt_filename=metadata.agent_config.system_prompt_filename
if metadata.agent_config
else 'system_prompt.j2',
system_prompt_path=SYSTEM_PROMPT_PATH,
system_prompt_long_horizon_path=SYSTEM_PROMPT_LONG_HORIZON_PATH,
)
config.set_agent_config(agent_config)

Expand Down Expand Up @@ -867,6 +882,13 @@ def filter_dataset(


if __name__ == '__main__':
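# NOTE(review): `global` has no effect at module scope; the names below are bound only when this file runs as a script, so code that reads them (e.g. get_config) may hit a NameError when the module is imported — confirm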
global SYSTEM_PROMPT_PATH, SYSTEM_PROMPT_LONG_HORIZON_PATH

# Module-level variables to store custom prompt paths
SYSTEM_PROMPT_PATH = None
SYSTEM_PROMPT_LONG_HORIZON_PATH = None

parser = get_evaluation_parser()
parser.add_argument(
'--dataset',
Expand Down Expand Up @@ -899,6 +921,24 @@ def filter_dataset(
default=None,
help='Path to a JSON file containing instance data to use instead of loading from HuggingFace (e.g., \'{"instance_id": "...", "repo": "...", ...}\')',
)
parser.add_argument(
'--instruction-template-path',
type=str,
default=None,
help='Path to a custom instruction template file (overrides swe_default.j2)',
)
parser.add_argument(
'--system-prompt-path',
type=str,
default=None,
help='Path to a custom system_prompt.j2 file',
)
parser.add_argument(
'--system-prompt-long-horizon-path',
type=str,
default=None,
help='Path to a custom system_prompt_long_horizon.j2 file',
)

args, _ = parser.parse_known_args()

Expand Down Expand Up @@ -996,7 +1036,27 @@ def filter_dataset(
if args.agent_config:
agent_config = get_agent_config_arg(args.agent_config, args.config_file)

# Set up custom system prompt paths if provided
if args.system_prompt_path:
if os.path.isfile(args.system_prompt_path):
SYSTEM_PROMPT_PATH = args.system_prompt_path
logger.info(f'Using custom system_prompt.j2: {SYSTEM_PROMPT_PATH}')
else:
raise ValueError(f'System prompt file does not exist: {args.system_prompt_path}')

if args.system_prompt_long_horizon_path:
if os.path.isfile(args.system_prompt_long_horizon_path):
SYSTEM_PROMPT_LONG_HORIZON_PATH = args.system_prompt_long_horizon_path
logger.info(f'Using custom system_prompt_long_horizon.j2: {SYSTEM_PROMPT_LONG_HORIZON_PATH}')
else:
raise ValueError(f'System prompt long horizon file does not exist: {args.system_prompt_long_horizon_path}')

# Build details dict with custom prompt paths
details = {'mode': args.mode}
if args.instruction_template_path:
details['instruction_template_path'] = args.instruction_template_path
logger.info(f'Custom instruction template path: {args.instruction_template_path}')

_agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)

dataset_description = (
Expand Down
26 changes: 24 additions & 2 deletions evaluation/benchmarks/swe_bench/scripts/run_infer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,12 @@ EVAL_OUTPUT_DIR=${9}
SELECTED_ID=${10}
INSTANCE_DICT_PATH=${11}
CONFIG_FILE=${12}
N_RUNS=${13}
MODE=${14}
INSTRUCTION_TEMPLATE_PATH=${13}
SYSTEM_PROMPT_PATH=${14}
SYSTEM_PROMPT_LONG_HORIZON_PATH=${15}
N_RUNS=${16}
MODE=${17}


if [ -z "$NUM_WORKERS" ]; then
NUM_WORKERS=1
Expand Down Expand Up @@ -133,6 +137,9 @@ echo "EVAL_CONDENSER: $EVAL_CONDENSER"
echo "EVAL_OUTPUT_DIR: $EVAL_OUTPUT_DIR"
echo "SELECTED_ID: $SELECTED_ID"
echo "INSTANCE_DICT_PATH: $INSTANCE_DICT_PATH"
echo "INSTRUCTION_TEMPLATE_PATH: $INSTRUCTION_TEMPLATE_PATH"
echo "SYSTEM_PROMPT_PATH: $SYSTEM_PROMPT_PATH"
echo "SYSTEM_PROMPT_LONG_HORIZON_PATH: $SYSTEM_PROMPT_LONG_HORIZON_PATH"
echo "TMUX_MEMORY_LIMIT: $TMUX_MEMORY_LIMIT"
echo "COMMAND_EXEC_TIMEOUT: $COMMAND_EXEC_TIMEOUT"

Expand Down Expand Up @@ -199,6 +206,21 @@ function run_eval() {
COMMAND="$COMMAND --config-file $CONFIG_FILE"
fi

if [ -n "$INSTRUCTION_TEMPLATE_PATH" ]; then
echo "INSTRUCTION_TEMPLATE_PATH: $INSTRUCTION_TEMPLATE_PATH"
COMMAND="$COMMAND --instruction-template-path $INSTRUCTION_TEMPLATE_PATH"
fi

if [ -n "$SYSTEM_PROMPT_PATH" ]; then
echo "SYSTEM_PROMPT_PATH: $SYSTEM_PROMPT_PATH"
COMMAND="$COMMAND --system-prompt-path $SYSTEM_PROMPT_PATH"
fi

if [ -n "$SYSTEM_PROMPT_LONG_HORIZON_PATH" ]; then
echo "SYSTEM_PROMPT_LONG_HORIZON_PATH: $SYSTEM_PROMPT_LONG_HORIZON_PATH"
COMMAND="$COMMAND --system-prompt-long-horizon-path $SYSTEM_PROMPT_LONG_HORIZON_PATH"
fi

# Run the command
eval $COMMAND
}
Expand Down
Loading
Loading