89 changes: 88 additions & 1 deletion evaluation/benchmarks/swe_bench/run_infer.py
@@ -55,11 +55,12 @@
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.critic import AgentFinishedCritic
from openhands.events.action import CmdRunAction, FileReadAction, MessageAction
from openhands.events.observation import (
    CmdOutputObservation,
    ErrorObservation,
    FileReadObservation,
    Observation,
)
from openhands.events.serialization.event import event_from_dict, event_to_dict
from openhands.runtime.base import Runtime
@@ -635,12 +636,76 @@ def complete_runtime(
    return {'git_patch': git_patch}


def _has_existing_result(eval_output_dir: str, instance_id: str) -> tuple[bool, dict | None]:
    """Check whether an instance already has saved LLM completions, and return any
    prior result with a non-empty git_patch found in output.jsonl (or None)."""
    completions_dir = os.path.join(eval_output_dir, 'llm_completions', instance_id)
    has_completions = False
    if os.path.exists(completions_dir):
        json_files = [f for f in os.listdir(completions_dir) if f.endswith('.json')]
        has_completions = len(json_files) > 0

    if not has_completions:
        return False, None

    output_file = os.path.join(eval_output_dir, 'output.jsonl')
    existing_result = None
    if os.path.exists(output_file):
        try:
            with open(output_file, 'r') as f:
                for line in f:
                    try:
                        result = json.loads(line.strip())
                        if result.get('instance_id') == instance_id:
                            git_patch = result.get('test_result', {}).get('git_patch', '')
                            if git_patch and git_patch.strip():
                                existing_result = result
                                break
                    except json.JSONDecodeError:
                        continue
        except Exception as e:
            logger.warning(f'Error reading output file for existing result: {e}')

    # Completions exist, so the instance can be skipped; existing_result stays None
    # if no prior output row with a usable git_patch was found.
    return True, existing_result
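For illustration only (not part of the diff), a minimal sketch of how this resume check could be exercised in isolation. It assumes _has_existing_result is importable from run_infer.py; the instance id and file names below are made up, but the on-disk layout (llm_completions/<instance_id>/*.json plus output.jsonl) mirrors what the function inspects.

import json
import os
import tempfile

# Sketch: build the layout the resume check looks for, then confirm it reports
# the instance as skippable together with its previously recorded result.
with tempfile.TemporaryDirectory() as eval_dir:
    inst = 'django__django-11099'  # illustrative instance id
    os.makedirs(os.path.join(eval_dir, 'llm_completions', inst))
    # Any saved completion file marks the instance as already attempted.
    with open(os.path.join(eval_dir, 'llm_completions', inst, 'response_001.json'), 'w') as f:
        json.dump({}, f)
    # A prior output row with a non-empty git_patch makes the result reusable.
    with open(os.path.join(eval_dir, 'output.jsonl'), 'w') as f:
        row = {'instance_id': inst, 'test_result': {'git_patch': 'diff --git a/x b/x'}}
        f.write(json.dumps(row) + '\n')

    should_skip, prior = _has_existing_result(eval_dir, inst)
    print(should_skip)           # True
    print(prior['instance_id'])  # django__django-11099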


def process_instance(
    instance: pd.Series,
    metadata: EvalMetadata,
    reset_logger: bool = True,
    runtime_failure_count: int = 0,
) -> EvalOutput:

    # Skip instances that already have saved LLM completions from a previous run.
    should_skip, existing_result = _has_existing_result(
        metadata.eval_output_dir, instance.instance_id
    )
    if should_skip:
        if existing_result:
            # Reuse the previously recorded result as-is.
            return EvalOutput(
                instance_id=existing_result.get('instance_id', instance.instance_id),
                instruction=existing_result.get('instruction', ''),
                instance=existing_result.get('instance', instance.to_dict()),
                test_result=existing_result.get('test_result', {}),
                metadata=metadata,
                history=existing_result.get('history', []),
                metrics=existing_result.get('metrics', {}),
                error=existing_result.get('error'),
            )
        else:
            # Completions exist but no usable prior result was found; mark as skipped.
            return EvalOutput(
                instance_id=instance.instance_id,
                instruction='',
                instance=instance.to_dict(),
                test_result={
                    'git_patch': '',
                    'skipped': True,
                    'skip_reason': 'completions_exist_no_result',
                },
                metadata=metadata,
                history=[],
                metrics={},
                error=None,
            )

    config = get_config(instance, metadata)

    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
@@ -733,6 +798,9 @@ def process_instance(
    histories = [event_to_dict(event) for event in state.history]
    metrics = get_metrics(state)

    # Calculate action execution times from history
    metrics['action_execution_latencies'] = get_action_execution_latencies(state.history)

    # Save the output
    instruction = message_action.content
    if message_action.image_urls:
@@ -752,6 +820,25 @@
    return output


def get_action_execution_latencies(history: list) -> list[dict]:
    """Extract execution latencies from observations in the history."""
    latencies = []
    for event in history:
        if isinstance(event, Observation):
            execution_latency = getattr(event, '_execution_latency', None)
            if execution_latency is None:
                execution_latency = getattr(event, 'execution_latency', None)
            if execution_latency is not None:
                latencies.append({
                    'observation_type': type(event).__name__,
                    'observation_id': str(event.id),
                    'latency': float(execution_latency),
                    'message': event.message,
                    'timestamp': event.timestamp,
                })
    return latencies
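As a usage note (again not part of the change), the records built above end up under metrics['action_execution_latencies'] in each output.jsonl row, so they can be summarized offline. A small sketch, assuming a finished output file at a hypothetical path:

import json
from collections import defaultdict

# Sketch: per-observation-type latency summary for the first row of an output file.
with open('evaluation/evaluation_outputs/output.jsonl') as f:  # illustrative path
    row = json.loads(f.readline())

by_type = defaultdict(list)
for rec in row.get('metrics', {}).get('action_execution_latencies', []):
    by_type[rec['observation_type']].append(rec['latency'])

for obs_type, values in sorted(by_type.items()):
    mean = sum(values) / len(values)
    print(f'{obs_type}: n={len(values)} mean={mean:.2f}s max={max(values):.2f}s')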


def filter_dataset(
dataset: pd.DataFrame,
filter_column: str,
2 changes: 2 additions & 0 deletions evaluation/benchmarks/swe_bench/scripts/run_infer.sh
@@ -133,6 +133,8 @@ echo "EVAL_CONDENSER: $EVAL_CONDENSER"
echo "EVAL_OUTPUT_DIR: $EVAL_OUTPUT_DIR"
echo "SELECTED_ID: $SELECTED_ID"
echo "INSTANCE_DICT_PATH: $INSTANCE_DICT_PATH"
echo "TMUX_MEMORY_LIMIT: $TMUX_MEMORY_LIMIT"
echo "COMMAND_EXEC_TIMEOUT: $COMMAND_EXEC_TIMEOUT"

# Default to NOT use Hint
if [ -z "$USE_HINT_TEXT" ]; then