Route session teardown through a shared _cleanup_session helper.

end_session now delegates to _cleanup_session, and verify takes the incoming
Request, reads the session id, and calls _cleanup_session in a finally block
so the session's subprocess is closed once verification has run.

NOTE(review): line breaks and leading indentation in this patch were
reconstructed from a whitespace-collapsed copy; confirm the context lines
against app.py before applying.

diff --git a/resources_servers/math_with_code/app.py b/resources_servers/math_with_code/app.py
index 02e9b7dc4..c0f8b2245 100644
--- a/resources_servers/math_with_code/app.py
+++ b/resources_servers/math_with_code/app.py
@@ -206,39 +206,48 @@ async def execute_python(self, request: Request, body: ExecutePythonRequest) ->
         )
 
     async def end_session(self, request: Request) -> ExecutePythonResponse:
-        sid = request.session[SESSION_ID_KEY]
-        if sid in self._sessions:
-            self._sessions[sid].close()
-            del self._sessions[sid]
+        session_id = request.session[SESSION_ID_KEY]
+        self._cleanup_session(session_id)
         return ExecutePythonResponse(success=True, stdout="", stderr="")
 
-    async def verify(self, body: PythonMathVerifyRequest) -> PythonMathVerifyResponse:
-        expected = body.expected_result
-
-        # Extract actual answer from final assistant message
-        actual = None
-        for output in reversed(body.response.output):
-            if output.type == "message" and output.role == "assistant":
-                text_content = ""
-                for content in output.content:
-                    if content.type == "output_text":
-                        text_content += content.text
-
-                # Extract boxed answer
-                match = re.search(r"\\boxed\{([^}]+)\}", text_content)
-                if match:
-                    actual = match.group(1).strip()
-                    break
-
-        accuracy = str(actual) == str(expected)
-        reward = 1.0 if accuracy else 0.0
-
-        return PythonMathVerifyResponse(
-            **body.model_dump(),
-            reward=reward,
-            extracted_answer=actual,
-            accuracy=accuracy,
-        )
+    async def verify(self, request: Request, body: PythonMathVerifyRequest) -> PythonMathVerifyResponse:
+        session_id = request.session[SESSION_ID_KEY]
+
+        try:
+            expected = body.expected_result
+
+            # Extract actual answer from final assistant message
+            actual = None
+            for output in reversed(body.response.output):
+                if output.type == "message" and output.role == "assistant":
+                    text_content = ""
+                    for content in output.content:
+                        if content.type == "output_text":
+                            text_content += content.text
+
+                    # Extract boxed answer
+                    match = re.search(r"\\boxed\{([^}]+)\}", text_content)
+                    if match:
+                        actual = match.group(1).strip()
+                        break
+
+            accuracy = str(actual) == str(expected)
+            reward = 1.0 if accuracy else 0.0
+
+            return PythonMathVerifyResponse(
+                **body.model_dump(),
+                reward=reward,
+                extracted_answer=actual,
+                accuracy=accuracy,
+            )
+        finally:
+            self._cleanup_session(session_id)
+
+    def _cleanup_session(self, session_id: str) -> None:
+        """Clean up subprocess for the given session."""
+        if session_id in self._sessions:
+            self._sessions[session_id].close()
+            del self._sessions[session_id]
 
 
 def _get_last_expr_value(code: str, globals_dict: dict, locals_dict: dict):