Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 40 additions & 31 deletions resources_servers/math_with_code/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,39 +206,48 @@ async def execute_python(self, request: Request, body: ExecutePythonRequest) ->
)

async def end_session(self, request: Request) -> ExecutePythonResponse:
    """End the caller's session and release what is registered for it.

    The session id is read from ``request.session[SESSION_ID_KEY]`` and the
    actual teardown is delegated to ``self._cleanup_session`` so that this
    endpoint and ``verify`` share a single cleanup path instead of each
    repeating the close-and-delete logic inline.

    Args:
        request: Incoming request; its ``session`` mapping must contain
            ``SESSION_ID_KEY`` (the subscript lookup raises otherwise).

    Returns:
        An ``ExecutePythonResponse`` with ``success=True`` and empty
        ``stdout``/``stderr``. The same response is returned whether or not
        a session was registered under the id.
    """
    session_id = request.session[SESSION_ID_KEY]
    # One lookup, one cleanup: the helper already tolerates unknown ids.
    self._cleanup_session(session_id)
    return ExecutePythonResponse(success=True, stdout="", stderr="")

async def verify(self, request: Request, body: PythonMathVerifyRequest) -> PythonMathVerifyResponse:
    r"""Score a model response against the expected result, then clean up.

    The text parts (``type == "output_text"``) of the final assistant
    message in ``body.response.output`` are joined, and the contents of the
    first non-empty ``\boxed{...}`` in that text are taken as the model's
    answer. The answer is compared to ``body.expected_result`` as strings.

    Args:
        request: Incoming request; ``request.session[SESSION_ID_KEY]`` is
            read to find the session to clean up (the subscript lookup
            raises if the key is absent, before any scoring happens).
        body: Verification payload carrying ``expected_result`` and the
            model ``response``.

    Returns:
        A ``PythonMathVerifyResponse`` built from ``body.model_dump()`` plus
        ``reward`` (``1.0`` on a match, else ``0.0``), ``extracted_answer``
        (``None`` when no boxed answer was found) and ``accuracy``.
    """
    session_id = request.session[SESSION_ID_KEY]

    try:
        expected = body.expected_result

        # Extract actual answer from final assistant message.
        # NOTE(review): only the last assistant message is inspected, as the
        # comment above states; confirm that falling back to earlier
        # messages is not expected by callers.
        actual = None
        for output in reversed(body.response.output):
            if output.type == "message" and output.role == "assistant":
                text_content = "".join(
                    content.text for content in output.content if content.type == "output_text"
                )
                actual = self._extract_boxed_answer(text_content)
                break

        # A missing answer must never count as correct, even if the expected
        # value happens to stringify to "None".
        accuracy = actual is not None and str(actual) == str(expected)
        reward = 1.0 if accuracy else 0.0

        return PythonMathVerifyResponse(
            **body.model_dump(),
            reward=reward,
            extracted_answer=actual,
            accuracy=accuracy,
        )
    finally:
        # Runs on success and on error so the session is always released.
        self._cleanup_session(session_id)

@staticmethod
def _extract_boxed_answer(text: str):
    r"""Return the stripped contents of the first non-empty ``\boxed{...}``.

    Braces are matched by depth, so nested groups such as
    ``\boxed{\frac{1}{2}}`` are returned whole (``\frac{1}{2}``) rather than
    being cut at the first ``}``. Empty (``\boxed{}``) and unterminated
    occurrences are skipped. Returns ``None`` when nothing qualifies.
    """
    marker = "\\boxed{"
    search_from = 0
    while True:
        start = text.find(marker, search_from)
        if start == -1:
            return None
        content_start = start + len(marker)
        depth = 1
        for idx in range(content_start, len(text)):
            char = text[idx]
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth == 0:
                    content = text[content_start:idx]
                    if content:
                        return content.strip()
                    break
        # Empty or unterminated box: keep looking after this marker.
        search_from = content_start

def _cleanup_session(self, session_id: str) -> None:
    """Close and forget the session registered under *session_id*, if any.

    Does nothing when no entry exists in ``self._sessions`` for the id;
    otherwise calls ``close()`` on the stored object and then removes the
    entry from the mapping.
    """
    sessions = self._sessions
    if session_id not in sessions:
        # Unknown id (never registered or already cleaned up): no-op.
        return
    sessions[session_id].close()
    del sessions[session_id]


def _get_last_expr_value(code: str, globals_dict: dict, locals_dict: dict):
Expand Down