diff --git a/.gitignore b/.gitignore index 5d1e9ac..dcd89ff 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ .DS_Store Thumbs.db -# 环境变量(保护敏感信息) +# Environment variables (protect sensitive info) .env .env.local .env.*.local @@ -36,7 +36,7 @@ yarn-error.log* *.swp *.swo -# 测试 +# Testing .pytest_cache/ .coverage htmlcov/ @@ -45,17 +45,17 @@ htmlcov/ .cursor/ .claude/ -# 文档与测试程序 +# Docs and test programs mydoc/ mytest/ -# 日志文件 +# Log files backend/logs/ *.log -# 上传文件 +# Uploaded files backend/uploads/ -# Docker 数据 +# Docker data data/backend/venv311/ backend/venv311/ diff --git a/Dockerfile b/Dockerfile index e656468..4b46c15 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,29 +1,29 @@ FROM python:3.11 -# 安装 Node.js (满足 >=18)及必要工具 +# Install Node.js (>=18) and necessary tools RUN apt-get update \ && apt-get install -y --no-install-recommends nodejs npm \ && rm -rf /var/lib/apt/lists/* -# 从 uv 官方镜像复制 uv +# Copy uv from official image COPY --from=ghcr.io/astral-sh/uv:0.9.26 /uv /uvx /bin/ WORKDIR /app -# 先复制依赖描述文件以利用缓存 +# Copy dependency files first for caching COPY package.json package-lock.json ./ COPY frontend/package.json frontend/package-lock.json ./frontend/ COPY backend/pyproject.toml backend/uv.lock ./backend/ -# 安装依赖(Node + Python) +# Install dependencies (Node + Python) RUN npm ci \ && npm ci --prefix frontend \ - && cd backend && uv sync --frozen + && cd backend && uv lock && uv sync -# 复制项目源码 +# Copy project source code COPY . . 
EXPOSE 3000 5001 -# 同时启动前后端(开发模式) +# Start frontend and backend (dev mode) CMD ["npm", "run", "dev"] \ No newline at end of file diff --git a/backend/app/__init__.py b/backend/app/__init__.py index e874cea..f0915f0 100644 --- a/backend/app/__init__.py +++ b/backend/app/__init__.py @@ -1,12 +1,12 @@ """ -MiroFish Backend - Flask应用工厂 +MiroFish Backend - Flask application factory """ import os import warnings -# 抑制 multiprocessing resource_tracker 的警告(来自第三方库如 transformers) -# 需要在所有其他导入之前设置 +# Suppress multiprocessing resource_tracker warnings (from third-party libraries like transformers) +# Must be set before all other imports warnings.filterwarnings("ignore", message=".*resource_tracker.*") from flask import Flask, request @@ -17,76 +17,75 @@ def create_app(config_class=Config): - """Flask应用工厂函数""" + """Flask application factory function""" app = Flask(__name__) app.config.from_object(config_class) - - # 设置JSON编码:确保中文直接显示(而不是 \uXXXX 格式) - # Flask >= 2.3 使用 app.json.ensure_ascii,旧版本使用 JSON_AS_ASCII 配置 + + # Set JSON encoding: ensure non-ASCII characters display directly (instead of \uXXXX format) + # Flask >= 2.3 uses app.json.ensure_ascii, older versions use JSON_AS_ASCII config if hasattr(app, 'json') and hasattr(app.json, 'ensure_ascii'): app.json.ensure_ascii = False - - # 设置日志 + + # Set up logging logger = setup_logger('mirofish') - - # 只在 reloader 子进程中打印启动信息(避免 debug 模式下打印两次) + + # Only print startup info in the reloader subprocess (avoid printing twice in debug mode) is_reloader_process = os.environ.get('WERKZEUG_RUN_MAIN') == 'true' debug_mode = app.config.get('DEBUG', False) should_log_startup = not debug_mode or is_reloader_process - + if should_log_startup: logger.info("=" * 50) - logger.info("MiroFish-Offline Backend 启动中...") + logger.info("MiroFish-Offline Backend starting...") logger.info("=" * 50) - - # 启用CORS + + # Enable CORS CORS(app, resources={r"/api/*": {"origins": "*"}}) - # --- 初始化 Neo4jStorage 单例(DI via app.extensions) --- + # --- Initialize 
Neo4jStorage singleton (DI via app.extensions) --- from .storage import Neo4jStorage try: neo4j_storage = Neo4jStorage() app.extensions['neo4j_storage'] = neo4j_storage if should_log_startup: - logger.info("Neo4jStorage 已初始化(连接 %s)", Config.NEO4J_URI) + logger.info("Neo4jStorage initialized (connected to %s)", Config.NEO4J_URI) except Exception as e: - logger.error("Neo4jStorage 初始化失败: %s", e) + logger.error("Neo4jStorage initialization failed: %s", e) # Store None so endpoints can return 503 gracefully app.extensions['neo4j_storage'] = None - - # 注册模拟进程清理函数(确保服务器关闭时终止所有模拟进程) + + # Register simulation process cleanup function (ensure all simulation processes are terminated when the server shuts down) from .services.simulation_runner import SimulationRunner SimulationRunner.register_cleanup() if should_log_startup: - logger.info("已注册模拟进程清理函数") - - # 请求日志中间件 + logger.info("Simulation process cleanup function registered") + + # Request logging middleware @app.before_request def log_request(): logger = get_logger('mirofish.request') - logger.debug(f"请求: {request.method} {request.path}") + logger.debug(f"Request: {request.method} {request.path}") if request.content_type and 'json' in request.content_type: - logger.debug(f"请求体: {request.get_json(silent=True)}") - + logger.debug(f"Request body: {request.get_json(silent=True)}") + @app.after_request def log_response(response): logger = get_logger('mirofish.request') - logger.debug(f"响应: {response.status_code}") + logger.debug(f"Response: {response.status_code}") return response - - # 注册蓝图 + + # Register blueprints from .api import graph_bp, simulation_bp, report_bp app.register_blueprint(graph_bp, url_prefix='/api/graph') app.register_blueprint(simulation_bp, url_prefix='/api/simulation') app.register_blueprint(report_bp, url_prefix='/api/report') - - # 健康检查 + + # Health check @app.route('/health') def health(): return {'status': 'ok', 'service': 'MiroFish-Offline Backend'} - + if should_log_startup: - 
logger.info("MiroFish-Offline Backend 启动完成") - - return app + logger.info("MiroFish-Offline Backend startup complete") + return app diff --git a/backend/app/api/__init__.py b/backend/app/api/__init__.py index ffda743..79fd83e 100644 --- a/backend/app/api/__init__.py +++ b/backend/app/api/__init__.py @@ -1,5 +1,5 @@ """ -API路由模块 +API routing module """ from flask import Blueprint diff --git a/backend/app/api/graph.py b/backend/app/api/graph.py index e2e95d3..fb4b9fd 100644 --- a/backend/app/api/graph.py +++ b/backend/app/api/graph.py @@ -1,6 +1,6 @@ """ -图谱相关API路由 -采用项目上下文机制,服务端持久化状态 +Graph-related API routes +Uses project context mechanism with server-side persistent state """ import os @@ -18,7 +18,7 @@ from ..models.task import TaskManager, TaskStatus from ..models.project import ProjectManager, ProjectStatus -# 获取日志器 +# Get logger logger = get_logger('mirofish.api') @@ -31,26 +31,26 @@ def _get_storage(): def allowed_file(filename: str) -> bool: - """检查文件扩展名是否允许""" + """Check if the file extension is allowed""" if not filename or '.' 
not in filename: return False ext = os.path.splitext(filename)[1].lower().lstrip('.') return ext in Config.ALLOWED_EXTENSIONS -# ============== 项目管理接口 ============== +# ============== Project management endpoints ============== @graph_bp.route('/project/', methods=['GET']) def get_project(project_id: str): """ - 获取项目详情 + Get project details """ project = ProjectManager.get_project(project_id) if not project: return jsonify({ "success": False, - "error": f"项目不存在: {project_id}" + "error": f"Project not found: {project_id}" }), 404 return jsonify({ @@ -62,7 +62,7 @@ def get_project(project_id: str): @graph_bp.route('/project/list', methods=['GET']) def list_projects(): """ - 列出所有项目 + List all projects """ limit = request.args.get('limit', 50, type=int) projects = ProjectManager.list_projects(limit=limit) @@ -77,36 +77,36 @@ def list_projects(): @graph_bp.route('/project/', methods=['DELETE']) def delete_project(project_id: str): """ - 删除项目 + Delete project """ success = ProjectManager.delete_project(project_id) if not success: return jsonify({ "success": False, - "error": f"项目不存在或删除失败: {project_id}" + "error": f"Project not found or deletion failed: {project_id}" }), 404 return jsonify({ "success": True, - "message": f"项目已删除: {project_id}" + "message": f"Project deleted: {project_id}" }) @graph_bp.route('/project//reset', methods=['POST']) def reset_project(project_id: str): """ - 重置项目状态(用于重新构建图谱) + Reset project status (for rebuilding graph) """ project = ProjectManager.get_project(project_id) if not project: return jsonify({ "success": False, - "error": f"项目不存在: {project_id}" + "error": f"Project not found: {project_id}" }), 404 - # 重置到本体已生成状态 + # Reset to ontology generated state if project.ontology: project.status = ProjectStatus.ONTOLOGY_GENERATED else: @@ -119,27 +119,27 @@ def reset_project(project_id: str): return jsonify({ "success": True, - "message": f"项目已重置: {project_id}", + "message": f"Project reset: {project_id}", "data": project.to_dict() }) -# 
============== 接口1:上传文件并生成本体 ============== +# ============== Endpoint 1: Upload files and generate ontology ============== @graph_bp.route('/ontology/generate', methods=['POST']) def generate_ontology(): """ - 接口1:上传文件,分析生成本体定义 + Endpoint 1: Upload files, analyze and generate ontology definition - 请求方式:multipart/form-data + Request method: multipart/form-data - 参数: - files: 上传的文件(PDF/MD/TXT),可多个 - simulation_requirement: 模拟需求描述(必填) - project_name: 项目名称(可选) - additional_context: 额外说明(可选) + Parameters: + files: Uploaded files (PDF/MD/TXT), multiple allowed + simulation_requirement: Simulation requirement description (required) + project_name: Project name (optional) + additional_context: Additional context (optional) - 返回: + Response: { "success": true, "data": { @@ -155,42 +155,42 @@ def generate_ontology(): } """ try: - logger.info("=== 开始生成本体定义 ===") + logger.info("=== Starting ontology definition generation ===") - # 获取参数 + # Get parameters simulation_requirement = request.form.get('simulation_requirement', '') project_name = request.form.get('project_name', 'Unnamed Project') additional_context = request.form.get('additional_context', '') - logger.debug(f"项目名称: {project_name}") - logger.debug(f"模拟需求: {simulation_requirement[:100]}...") + logger.debug(f"Project name: {project_name}") + logger.debug(f"Simulation requirement: {simulation_requirement[:100]}...") if not simulation_requirement: return jsonify({ "success": False, - "error": "请提供模拟需求描述 (simulation_requirement)" + "error": "Please provide simulation requirement description (simulation_requirement)" }), 400 - # 获取上传的文件 + # Get uploaded files uploaded_files = request.files.getlist('files') if not uploaded_files or all(not f.filename for f in uploaded_files): return jsonify({ "success": False, - "error": "请至少上传一个文档文件" + "error": "Please upload at least one document file" }), 400 - # 创建项目 + # Create project project = ProjectManager.create_project(name=project_name) project.simulation_requirement = 
simulation_requirement - logger.info(f"创建项目: {project.project_id}") + logger.info(f"Created project: {project.project_id}") - # 保存文件并提取文本 + # Save files and extract text document_texts = [] all_text = "" for file in uploaded_files: if file and file.filename and allowed_file(file.filename): - # 保存文件到项目目录 + # Save file to project directory file_info = ProjectManager.save_file_to_project( project.project_id, file, @@ -201,7 +201,7 @@ def generate_ontology(): "size": file_info["size"] }) - # 提取文本 + # Extract text text = FileParser.extract_text(file_info["path"]) text = TextProcessor.preprocess_text(text) document_texts.append(text) @@ -211,16 +211,16 @@ def generate_ontology(): ProjectManager.delete_project(project.project_id) return jsonify({ "success": False, - "error": "没有成功处理任何文档,请检查文件格式" + "error": "No documents were successfully processed. Please check the file formats" }), 400 - # 保存提取的文本 + # Save extracted text project.total_text_length = len(all_text) ProjectManager.save_extracted_text(project.project_id, all_text) - logger.info(f"文本提取完成,共 {len(all_text)} 字符") + logger.info(f"Text extraction complete, {len(all_text)} characters total") - # 生成本体 - logger.info("调用 LLM 生成本体定义...") + # Generate ontology + logger.info("Calling LLM to generate ontology definition...") generator = OntologyGenerator() ontology = generator.generate( document_texts=document_texts, @@ -228,10 +228,10 @@ def generate_ontology(): additional_context=additional_context if additional_context else None ) - # 保存本体到项目 + # Save ontology to project entity_count = len(ontology.get("entity_types", [])) edge_count = len(ontology.get("edge_types", [])) - logger.info(f"本体生成完成: {entity_count} 个实体类型, {edge_count} 个关系类型") + logger.info(f"Ontology generation complete: {entity_count} entity types, {edge_count} relationship types") project.ontology = { "entity_types": ontology.get("entity_types", []), @@ -240,7 +240,7 @@ def generate_ontology(): project.analysis_summary = ontology.get("analysis_summary", "") 
project.status = ProjectStatus.ONTOLOGY_GENERATED ProjectManager.save_project(project) - logger.info(f"=== 本体生成完成 === 项目ID: {project.project_id}") + logger.info(f"=== Ontology generation complete === Project ID: {project.project_id}") return jsonify({ "success": True, @@ -262,132 +262,132 @@ def generate_ontology(): }), 500 -# ============== 接口2:构建图谱 ============== +# ============== Endpoint 2: Build graph ============== @graph_bp.route('/build', methods=['POST']) def build_graph(): """ - 接口2:根据project_id构建图谱 + Endpoint 2: Build graph based on project_id - 请求(JSON): + Request (JSON): { - "project_id": "proj_xxxx", // 必填,来自接口1 - "graph_name": "图谱名称", // 可选 - "chunk_size": 500, // 可选,默认500 - "chunk_overlap": 50 // 可选,默认50 + "project_id": "proj_xxxx", // Required, from Endpoint 1 + "graph_name": "Graph Name", // Optional + "chunk_size": 500, // Optional, default 500 + "chunk_overlap": 50 // Optional, default 50 } - 返回: + Response: { "success": true, "data": { "project_id": "proj_xxxx", "task_id": "task_xxxx", - "message": "图谱构建任务已启动" + "message": "Graph build task started" } } """ try: - logger.info("=== 开始构建图谱 ===") + logger.info("=== Starting graph build ===") - # 解析请求 + # Parse request data = request.get_json() or {} project_id = data.get('project_id') - logger.debug(f"请求参数: project_id={project_id}") + logger.debug(f"Request parameters: project_id={project_id}") if not project_id: return jsonify({ "success": False, - "error": "请提供 project_id" + "error": "Please provide project_id" }), 400 - # 获取项目 + # Get project project = ProjectManager.get_project(project_id) if not project: return jsonify({ "success": False, - "error": f"项目不存在: {project_id}" + "error": f"Project not found: {project_id}" }), 404 - # 检查项目状态 - force = data.get('force', False) # 强制重新构建 + # Check project status + force = data.get('force', False) # Force rebuild if project.status == ProjectStatus.CREATED: return jsonify({ "success": False, - "error": "项目尚未生成本体,请先调用 /ontology/generate" + "error": 
"Project has not generated ontology yet. Please call /ontology/generate first" }), 400 if project.status == ProjectStatus.GRAPH_BUILDING and not force: return jsonify({ "success": False, - "error": "图谱正在构建中,请勿重复提交。如需强制重建,请添加 force: true", + "error": "Graph is currently being built. Please do not submit again. To force rebuild, add force: true", "task_id": project.graph_build_task_id }), 400 - # 如果强制重建,重置状态 + # If force rebuild, reset status if force and project.status in [ProjectStatus.GRAPH_BUILDING, ProjectStatus.FAILED, ProjectStatus.GRAPH_COMPLETED]: project.status = ProjectStatus.ONTOLOGY_GENERATED project.graph_id = None project.graph_build_task_id = None project.error = None - # 获取配置 + # Get configuration graph_name = data.get('graph_name', project.name or 'MiroFish Graph') chunk_size = data.get('chunk_size', project.chunk_size or Config.DEFAULT_CHUNK_SIZE) chunk_overlap = data.get('chunk_overlap', project.chunk_overlap or Config.DEFAULT_CHUNK_OVERLAP) - # 更新项目配置 + # Update project configuration project.chunk_size = chunk_size project.chunk_overlap = chunk_overlap - # 获取提取的文本 + # Get extracted text text = ProjectManager.get_extracted_text(project_id) if not text: return jsonify({ "success": False, - "error": "未找到提取的文本内容" + "error": "Extracted text content not found" }), 400 - # 获取本体 + # Get ontology ontology = project.ontology if not ontology: return jsonify({ "success": False, - "error": "未找到本体定义" + "error": "Ontology definition not found" }), 400 - # 获取 storage 在请求上下文中(后台线程无法访问 current_app) + # Get storage in request context (background thread cannot access current_app) storage = _get_storage() - # 创建异步任务 + # Create async task task_manager = TaskManager() - task_id = task_manager.create_task(f"构建图谱: {graph_name}") - logger.info(f"创建图谱构建任务: task_id={task_id}, project_id={project_id}") + task_id = task_manager.create_task(f"Build graph: {graph_name}") + logger.info(f"Created graph build task: task_id={task_id}, project_id={project_id}") - # 更新项目状态 + # Update 
project status project.status = ProjectStatus.GRAPH_BUILDING project.graph_build_task_id = task_id ProjectManager.save_project(project) - # 启动后台任务 + # Start background task def build_task(): build_logger = get_logger('mirofish.build') try: - build_logger.info(f"[{task_id}] 开始构建图谱...") + build_logger.info(f"[{task_id}] Starting graph build...") task_manager.update_task( task_id, status=TaskStatus.PROCESSING, - message="初始化图谱构建服务..." + message="Initializing graph build service..." ) - # 创建图谱构建服务(storage 从外部闭包传入) + # Create graph build service (storage passed in from outer closure) builder = GraphBuilderService(storage=storage) - # 分块 + # Chunking task_manager.update_task( task_id, - message="文本分块中...", + message="Splitting text into chunks...", progress=5 ) chunks = TextProcessor.split_text( @@ -397,27 +397,27 @@ def build_task(): ) total_chunks = len(chunks) - # 创建图谱 + # Create graph task_manager.update_task( task_id, - message="创建Zep图谱...", + message="Creating Zep graph...", progress=10 ) graph_id = builder.create_graph(name=graph_name) - # 更新项目的graph_id + # Update project graph_id project.graph_id = graph_id ProjectManager.save_project(project) - # 设置本体 + # Set ontology task_manager.update_task( task_id, - message="设置本体定义...", + message="Setting ontology definition...", progress=15 ) builder.set_ontology(graph_id, ontology) - # 添加文本(progress_callback 签名是 (msg, progress_ratio)) + # Add text (progress_callback signature is (msg, progress_ratio)) def add_progress_callback(msg, progress_ratio): progress = 15 + int(progress_ratio * 40) # 15% - 55% task_manager.update_task( @@ -428,7 +428,7 @@ def add_progress_callback(msg, progress_ratio): task_manager.update_task( task_id, - message=f"开始添加 {total_chunks} 个文本块...", + message=f"Starting to add {total_chunks} text chunks...", progress=15 ) @@ -439,34 +439,34 @@ def add_progress_callback(msg, progress_ratio): progress_callback=add_progress_callback ) - # Neo4j处理是同步的,无需等待 + # Neo4j processing is synchronous, no need to 
wait task_manager.update_task( task_id, - message="文本处理完成,生成图谱数据...", + message="Text processing complete, generating graph data...", progress=90 ) - # 获取图谱数据 + # Get graph data task_manager.update_task( task_id, - message="获取图谱数据...", + message="Fetching graph data...", progress=95 ) graph_data = builder.get_graph_data(graph_id) - # 更新项目状态 + # Update project status project.status = ProjectStatus.GRAPH_COMPLETED ProjectManager.save_project(project) node_count = graph_data.get("node_count", 0) edge_count = graph_data.get("edge_count", 0) - build_logger.info(f"[{task_id}] 图谱构建完成: graph_id={graph_id}, 节点={node_count}, 边={edge_count}") + build_logger.info(f"[{task_id}] Graph build complete: graph_id={graph_id}, nodes={node_count}, edges={edge_count}") - # 完成 + # Complete task_manager.update_task( task_id, status=TaskStatus.COMPLETED, - message="图谱构建完成", + message="Graph build complete", progress=100, result={ "project_id": project_id, @@ -478,8 +478,8 @@ def add_progress_callback(msg, progress_ratio): ) except Exception as e: - # 更新项目状态为失败 - build_logger.error(f"[{task_id}] 图谱构建失败: {str(e)}") + # Update project status to failed + build_logger.error(f"[{task_id}] Graph build failed: {str(e)}") build_logger.debug(traceback.format_exc()) project.status = ProjectStatus.FAILED @@ -489,11 +489,11 @@ def add_progress_callback(msg, progress_ratio): task_manager.update_task( task_id, status=TaskStatus.FAILED, - message=f"构建失败: {str(e)}", + message=f"Build failed: {str(e)}", error=traceback.format_exc() ) - # 启动后台线程 + # Start background thread thread = threading.Thread(target=build_task, daemon=True) thread.start() @@ -502,7 +502,7 @@ def add_progress_callback(msg, progress_ratio): "data": { "project_id": project_id, "task_id": task_id, - "message": "图谱构建任务已启动,请通过 /task/{task_id} 查询进度" + "message": "Graph build task started. 
Query progress via /task/{task_id}" } }) @@ -514,19 +514,19 @@ def add_progress_callback(msg, progress_ratio): }), 500 -# ============== 任务查询接口 ============== +# ============== Task query endpoints ============== @graph_bp.route('/task/', methods=['GET']) def get_task(task_id: str): """ - 查询任务状态 + Query task status """ task = TaskManager().get_task(task_id) if not task: return jsonify({ "success": False, - "error": f"任务不存在: {task_id}" + "error": f"Task not found: {task_id}" }), 404 return jsonify({ @@ -538,7 +538,7 @@ def get_task(task_id: str): @graph_bp.route('/tasks', methods=['GET']) def list_tasks(): """ - 列出所有任务 + List all tasks """ tasks = TaskManager().list_tasks() @@ -549,12 +549,12 @@ def list_tasks(): }) -# ============== 图谱数据接口 ============== +# ============== Graph data endpoints ============== @graph_bp.route('/data/', methods=['GET']) def get_graph_data(graph_id: str): """ - 获取图谱数据(节点和边) + Get graph data (nodes and edges) """ try: storage = _get_storage() @@ -577,7 +577,7 @@ def get_graph_data(graph_id: str): @graph_bp.route('/delete/', methods=['DELETE']) def delete_graph(graph_id: str): """ - 删除图谱 + Delete graph """ try: storage = _get_storage() @@ -586,7 +586,7 @@ def delete_graph(graph_id: str): return jsonify({ "success": True, - "message": f"图谱已删除: {graph_id}" + "message": f"Graph deleted: {graph_id}" }) except Exception as e: diff --git a/backend/app/api/report.py b/backend/app/api/report.py index ab7f4a5..67902a1 100644 --- a/backend/app/api/report.py +++ b/backend/app/api/report.py @@ -1,6 +1,6 @@ """ -Report API路由 -提供模拟报告生成、获取、对话等接口 +Report API routes +Provides interfaces for simulation report generation, retrieval, conversation, etc. 
""" import os @@ -19,30 +19,30 @@ logger = get_logger('mirofish.api.report') -# ============== 报告生成接口 ============== +# ============== Report generation endpoints ============== @report_bp.route('/generate', methods=['POST']) def generate_report(): """ - 生成模拟分析报告(异步任务) - - 这是一个耗时操作,接口会立即返回task_id, - 使用 GET /api/report/generate/status 查询进度 - - 请求(JSON): + Generate simulation analysis report (async task) + + This is a time-consuming operation. The endpoint returns task_id immediately. + Use GET /api/report/generate/status to query progress. + + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "force_regenerate": false // 可选,强制重新生成 + "simulation_id": "sim_xxxx", // Required, simulation ID + "force_regenerate": false // Optional, force regeneration } - - 返回: + + Response: { "success": true, "data": { "simulation_id": "sim_xxxx", "task_id": "task_xxxx", "status": "generating", - "message": "报告生成任务已启动" + "message": "Report generation task started" } } """ @@ -53,22 +53,22 @@ def generate_report(): if not simulation_id: return jsonify({ "success": False, - "error": "请提供 simulation_id" + "error": "Please provide simulation_id" }), 400 - + force_regenerate = data.get('force_regenerate', False) - - # 获取模拟信息 + + # Get simulation info manager = SimulationManager() state = manager.get_simulation(simulation_id) if not state: return jsonify({ "success": False, - "error": f"模拟不存在: {simulation_id}" + "error": f"Simulation not found: {simulation_id}" }), 404 - - # 检查是否已有报告 + + # Check if a report already exists if not force_regenerate: existing_report = ReportManager.get_report_by_simulation(simulation_id) if existing_report and existing_report.status == ReportStatus.COMPLETED: @@ -78,38 +78,38 @@ def generate_report(): "simulation_id": simulation_id, "report_id": existing_report.report_id, "status": "completed", - "message": "报告已存在", + "message": "Report already exists", "already_generated": True } }) - # 获取项目信息 + # Get project info project = 
ProjectManager.get_project(state.project_id) if not project: return jsonify({ "success": False, - "error": f"项目不存在: {state.project_id}" + "error": f"Project not found: {state.project_id}" }), 404 graph_id = state.graph_id or project.graph_id if not graph_id: return jsonify({ "success": False, - "error": "缺少图谱ID,请确保已构建图谱" + "error": "Missing graph ID, please ensure the graph has been built" }), 400 simulation_requirement = project.simulation_requirement if not simulation_requirement: return jsonify({ "success": False, - "error": "缺少模拟需求描述" + "error": "Missing simulation requirement description" }), 400 - # 提前生成 report_id,以便立即返回给前端 + # Pre-generate report_id so it can be returned to the frontend immediately import uuid report_id = f"report_{uuid.uuid4().hex[:12]}" - # 创建异步任务 + # Create async task task_manager = TaskManager() task_id = task_manager.create_task( task_type="report_generate", @@ -120,24 +120,32 @@ def generate_report(): } ) - # 定义后台任务 + # Grab storage from app context before spawning thread + from flask import current_app + from ..services.graph_tools import GraphToolsService + storage = current_app.extensions['neo4j_storage'] + + # Define background task def run_generate(): try: task_manager.update_task( task_id, status=TaskStatus.PROCESSING, progress=0, - message="初始化Report Agent..." + message="Initializing Report Agent..." 
) - - # 创建Report Agent + + # Create Report Agent with GraphToolsService + graph_tools = GraphToolsService(storage=storage) + agent = ReportAgent( graph_id=graph_id, simulation_id=simulation_id, - simulation_requirement=simulation_requirement + simulation_requirement=simulation_requirement, + graph_tools=graph_tools ) - # 进度回调 + # Progress callback def progress_callback(stage, progress, message): task_manager.update_task( task_id, @@ -145,13 +153,13 @@ def progress_callback(stage, progress, message): message=f"[{stage}] {message}" ) - # 生成报告(传入预先生成的 report_id) + # Generate report (pass in pre-generated report_id) report = agent.generate_report( progress_callback=progress_callback, report_id=report_id ) - # 保存报告 + # Save report ReportManager.save_report(report) if report.status == ReportStatus.COMPLETED: @@ -164,13 +172,13 @@ def progress_callback(stage, progress, message): } ) else: - task_manager.fail_task(task_id, report.error or "报告生成失败") + task_manager.fail_task(task_id, report.error or "Report generation failed") except Exception as e: - logger.error(f"报告生成失败: {str(e)}") + logger.error(f"Report generation failed: {str(e)}") task_manager.fail_task(task_id, str(e)) - # 启动后台线程 + # Start background thread thread = threading.Thread(target=run_generate, daemon=True) thread.start() @@ -181,13 +189,13 @@ def progress_callback(stage, progress, message): "report_id": report_id, "task_id": task_id, "status": "generating", - "message": "报告生成任务已启动,请通过 /api/report/generate/status 查询进度", + "message": "Report generation task started. 
Query progress via /api/report/generate/status", "already_generated": False } }) except Exception as e: - logger.error(f"启动报告生成任务失败: {str(e)}") + logger.error(f"Failed to start report generation task: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -198,15 +206,15 @@ def progress_callback(stage, progress, message): @report_bp.route('/generate/status', methods=['POST']) def get_generate_status(): """ - 查询报告生成任务进度 - - 请求(JSON): + Query report generation task progress + + Request (JSON): { - "task_id": "task_xxxx", // 可选,generate返回的task_id - "simulation_id": "sim_xxxx" // 可选,模拟ID + "task_id": "task_xxxx", // Optional, task_id returned by generate + "simulation_id": "sim_xxxx" // Optional, simulation ID } - - 返回: + + Response: { "success": true, "data": { @@ -223,7 +231,7 @@ def get_generate_status(): task_id = data.get('task_id') simulation_id = data.get('simulation_id') - # 如果提供了simulation_id,先检查是否已有完成的报告 + # If simulation_id is provided, first check if a completed report already exists if simulation_id: existing_report = ReportManager.get_report_by_simulation(simulation_id) if existing_report and existing_report.status == ReportStatus.COMPLETED: @@ -234,7 +242,7 @@ def get_generate_status(): "report_id": existing_report.report_id, "status": "completed", "progress": 100, - "message": "报告已生成", + "message": "Report already generated", "already_completed": True } }) @@ -242,7 +250,7 @@ def get_generate_status(): if not task_id: return jsonify({ "success": False, - "error": "请提供 task_id 或 simulation_id" + "error": "Please provide task_id or simulation_id" }), 400 task_manager = TaskManager() @@ -251,7 +259,7 @@ def get_generate_status(): if not task: return jsonify({ "success": False, - "error": f"任务不存在: {task_id}" + "error": f"Task not found: {task_id}" }), 404 return jsonify({ @@ -260,21 +268,21 @@ def get_generate_status(): }) except Exception as e: - logger.error(f"查询任务状态失败: {str(e)}") + logger.error(f"Failed to query task status: {str(e)}") return 
jsonify({ "success": False, "error": str(e) }), 500 -# ============== 报告获取接口 ============== +# ============== Report retrieval endpoints ============== @report_bp.route('/', methods=['GET']) def get_report(report_id: str): """ - 获取报告详情 - - 返回: + Get report details + + Response: { "success": true, "data": { @@ -294,7 +302,7 @@ def get_report(report_id: str): if not report: return jsonify({ "success": False, - "error": f"报告不存在: {report_id}" + "error": f"Report not found: {report_id}" }), 404 return jsonify({ @@ -303,7 +311,7 @@ def get_report(report_id: str): }) except Exception as e: - logger.error(f"获取报告失败: {str(e)}") + logger.error(f"Failed to get report: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -314,9 +322,9 @@ def get_report(report_id: str): @report_bp.route('/by-simulation/', methods=['GET']) def get_report_by_simulation(simulation_id: str): """ - 根据模拟ID获取报告 - - 返回: + Get report by simulation ID + + Response: { "success": true, "data": { @@ -331,7 +339,7 @@ def get_report_by_simulation(simulation_id: str): if not report: return jsonify({ "success": False, - "error": f"该模拟暂无报告: {simulation_id}", + "error": f"No report found for this simulation: {simulation_id}", "has_report": False }), 404 @@ -342,7 +350,7 @@ def get_report_by_simulation(simulation_id: str): }) except Exception as e: - logger.error(f"获取报告失败: {str(e)}") + logger.error(f"Failed to get report: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -353,13 +361,13 @@ def get_report_by_simulation(simulation_id: str): @report_bp.route('/list', methods=['GET']) def list_reports(): """ - 列出所有报告 - - Query参数: - simulation_id: 按模拟ID过滤(可选) - limit: 返回数量限制(默认50) - - 返回: + List all reports + + Query parameters: + simulation_id: Filter by simulation ID (optional) + limit: Return count limit (default 50) + + Response: { "success": true, "data": [...], @@ -382,7 +390,7 @@ def list_reports(): }) except Exception as e: - logger.error(f"列出报告失败: {str(e)}") + logger.error(f"Failed 
to list reports: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -393,9 +401,9 @@ def list_reports(): @report_bp.route('//download', methods=['GET']) def download_report(report_id: str): """ - 下载报告(Markdown格式) - - 返回Markdown文件 + Download report (Markdown format) + + Returns a Markdown file """ try: report = ReportManager.get_report(report_id) @@ -403,13 +411,13 @@ def download_report(report_id: str): if not report: return jsonify({ "success": False, - "error": f"报告不存在: {report_id}" + "error": f"Report not found: {report_id}" }), 404 md_path = ReportManager._get_report_markdown_path(report_id) if not os.path.exists(md_path): - # 如果MD文件不存在,生成一个临时文件 + # If the MD file doesn't exist, generate a temporary file import tempfile with tempfile.NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f: f.write(report.markdown_content) @@ -428,7 +436,7 @@ def download_report(report_id: str): ) except Exception as e: - logger.error(f"下载报告失败: {str(e)}") + logger.error(f"Failed to download report: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -438,23 +446,23 @@ def download_report(report_id: str): @report_bp.route('/', methods=['DELETE']) def delete_report(report_id: str): - """删除报告""" + """Delete report""" try: success = ReportManager.delete_report(report_id) if not success: return jsonify({ "success": False, - "error": f"报告不存在: {report_id}" + "error": f"Report not found: {report_id}" }), 404 return jsonify({ "success": True, - "message": f"报告已删除: {report_id}" + "message": f"Report deleted: {report_id}" }) except Exception as e: - logger.error(f"删除报告失败: {str(e)}") + logger.error(f"Failed to delete report: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -462,32 +470,32 @@ def delete_report(report_id: str): }), 500 -# ============== Report Agent对话接口 ============== +# ============== Report Agent conversation endpoints ============== @report_bp.route('/chat', methods=['POST']) def chat_with_report_agent(): """ - 与Report 
Agent对话 - - Report Agent可以在对话中自主调用检索工具来回答问题 - - 请求(JSON): + Chat with Report Agent + + Report Agent can autonomously invoke retrieval tools during conversation to answer questions + + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "message": "请解释一下舆情走向", // 必填,用户消息 - "chat_history": [ // 可选,对话历史 + "simulation_id": "sim_xxxx", // Required, simulation ID + "message": "Please explain the opinion trend", // Required, user message + "chat_history": [ // Optional, chat history {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ] } - - 返回: + + Response: { "success": true, "data": { - "response": "Agent回复...", - "tool_calls": [调用的工具列表], - "sources": [信息来源] + "response": "Agent reply...", + "tool_calls": [list of tools called], + "sources": [information sources] } } """ @@ -501,42 +509,42 @@ def chat_with_report_agent(): if not simulation_id: return jsonify({ "success": False, - "error": "请提供 simulation_id" + "error": "Please provide simulation_id" }), 400 if not message: return jsonify({ "success": False, - "error": "请提供 message" + "error": "Please provide message" }), 400 - # 获取模拟和项目信息 + # Get simulation and project info manager = SimulationManager() state = manager.get_simulation(simulation_id) if not state: return jsonify({ "success": False, - "error": f"模拟不存在: {simulation_id}" + "error": f"Simulation not found: {simulation_id}" }), 404 project = ProjectManager.get_project(state.project_id) if not project: return jsonify({ "success": False, - "error": f"项目不存在: {state.project_id}" + "error": f"Project not found: {state.project_id}" }), 404 graph_id = state.graph_id or project.graph_id if not graph_id: return jsonify({ "success": False, - "error": "缺少图谱ID" + "error": "Missing graph ID" }), 400 simulation_requirement = project.simulation_requirement or "" - # 创建Agent并进行对话 + # Create Agent and conduct conversation agent = ReportAgent( graph_id=graph_id, simulation_id=simulation_id, @@ -551,7 +559,7 @@ def chat_with_report_agent(): }) 
except Exception as e: - logger.error(f"对话失败: {str(e)}") + logger.error(f"Chat failed: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -559,22 +567,22 @@ def chat_with_report_agent(): }), 500 -# ============== 报告进度与分章节接口 ============== +# ============== Report progress and section endpoints ============== @report_bp.route('//progress', methods=['GET']) def get_report_progress(report_id: str): """ - 获取报告生成进度(实时) - - 返回: + Get report generation progress (real-time) + + Response: { "success": true, "data": { "status": "generating", "progress": 45, - "message": "正在生成章节: 关键发现", - "current_section": "关键发现", - "completed_sections": ["执行摘要", "模拟背景"], + "message": "Generating section: Key Findings", + "current_section": "Key Findings", + "completed_sections": ["Executive Summary", "Simulation Background"], "updated_at": "2025-12-09T..." } } @@ -585,7 +593,7 @@ def get_report_progress(report_id: str): if not progress: return jsonify({ "success": False, - "error": f"报告不存在或进度信息不可用: {report_id}" + "error": f"Report not found or progress info unavailable: {report_id}" }), 404 return jsonify({ @@ -594,7 +602,7 @@ def get_report_progress(report_id: str): }) except Exception as e: - logger.error(f"获取报告进度失败: {str(e)}") + logger.error(f"Failed to get report progress: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -605,11 +613,11 @@ def get_report_progress(report_id: str): @report_bp.route('//sections', methods=['GET']) def get_report_sections(report_id: str): """ - 获取已生成的章节列表(分章节输出) - - 前端可以轮询此接口获取已生成的章节内容,无需等待整个报告完成 - - 返回: + Get the list of generated sections (section-by-section output) + + The frontend can poll this endpoint to get generated section content without waiting for the entire report + + Response: { "success": true, "data": { @@ -618,7 +626,7 @@ def get_report_sections(report_id: str): { "filename": "section_01.md", "section_index": 1, - "content": "## 执行摘要\\n\\n..." + "content": "## Executive Summary\\n\\n..." }, ... 
], @@ -630,7 +638,7 @@ def get_report_sections(report_id: str): try: sections = ReportManager.get_generated_sections(report_id) - # 获取报告状态 + # Get report status report = ReportManager.get_report(report_id) is_complete = report is not None and report.status == ReportStatus.COMPLETED @@ -645,7 +653,7 @@ def get_report_sections(report_id: str): }) except Exception as e: - logger.error(f"获取章节列表失败: {str(e)}") + logger.error(f"Failed to get section list: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -656,14 +664,14 @@ def get_report_sections(report_id: str): @report_bp.route('//section/', methods=['GET']) def get_single_section(report_id: str, section_index: int): """ - 获取单个章节内容 - - 返回: + Get a single section's content + + Response: { "success": true, "data": { "filename": "section_01.md", - "content": "## 执行摘要\\n\\n..." + "content": "## Executive Summary\\n\\n..." } } """ @@ -673,7 +681,7 @@ def get_single_section(report_id: str, section_index: int): if not os.path.exists(section_path): return jsonify({ "success": False, - "error": f"章节不存在: section_{section_index:02d}.md" + "error": f"Section not found: section_{section_index:02d}.md" }), 404 with open(section_path, 'r', encoding='utf-8') as f: @@ -689,7 +697,7 @@ def get_single_section(report_id: str, section_index: int): }) except Exception as e: - logger.error(f"获取章节内容失败: {str(e)}") + logger.error(f"Failed to get section content: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -697,16 +705,16 @@ def get_single_section(report_id: str, section_index: int): }), 500 -# ============== 报告状态检查接口 ============== +# ============== Report status check endpoints ============== @report_bp.route('/check/', methods=['GET']) def check_report_status(simulation_id: str): """ - 检查模拟是否有报告,以及报告状态 - - 用于前端判断是否解锁Interview功能 - - 返回: + Check whether a simulation has a report and the report status + + Used by the frontend to determine whether to unlock Interview functionality + + Response: { "success": 
true, "data": { @@ -725,7 +733,7 @@ def check_report_status(simulation_id: str): report_status = report.status.value if report else None report_id = report.report_id if report else None - # 只有报告完成后才解锁interview + # Only unlock interview after the report is completed interview_unlocked = has_report and report.status == ReportStatus.COMPLETED return jsonify({ @@ -740,7 +748,7 @@ def check_report_status(simulation_id: str): }) except Exception as e: - logger.error(f"检查报告状态失败: {str(e)}") + logger.error(f"Failed to check report status: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -748,22 +756,22 @@ def check_report_status(simulation_id: str): }), 500 -# ============== Agent 日志接口 ============== +# ============== Agent log endpoints ============== @report_bp.route('//agent-log', methods=['GET']) def get_agent_log(report_id: str): """ - 获取 Report Agent 的详细执行日志 - - 实时获取报告生成过程中的每一步动作,包括: - - 报告开始、规划开始/完成 - - 每个章节的开始、工具调用、LLM响应、完成 - - 报告完成或失败 - - Query参数: - from_line: 从第几行开始读取(可选,默认0,用于增量获取) - - 返回: + Get detailed execution log of the Report Agent + + Retrieve each step action during report generation in real-time, including: + - Report start, planning start/complete + - Each section's start, tool calls, LLM responses, completion + - Report completion or failure + + Query parameters: + from_line: Start reading from which line (optional, default 0, for incremental retrieval) + + Response: { "success": true, "data": { @@ -774,7 +782,7 @@ def get_agent_log(report_id: str): "report_id": "report_xxxx", "action": "tool_call", "stage": "generating", - "section_title": "执行摘要", + "section_title": "Executive Summary", "section_index": 1, "details": { "tool_name": "insight_forge", @@ -801,7 +809,7 @@ def get_agent_log(report_id: str): }) except Exception as e: - logger.error(f"获取Agent日志失败: {str(e)}") + logger.error(f"Failed to get Agent log: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -812,9 +820,9 @@ def get_agent_log(report_id: str): 
@report_bp.route('//agent-log/stream', methods=['GET']) def stream_agent_log(report_id: str): """ - 获取完整的 Agent 日志(一次性获取全部) - - 返回: + Get the complete Agent log (fetch all at once) + + Response: { "success": true, "data": { @@ -835,7 +843,7 @@ def stream_agent_log(report_id: str): }) except Exception as e: - logger.error(f"获取Agent日志失败: {str(e)}") + logger.error(f"Failed to get Agent log: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -843,27 +851,27 @@ def stream_agent_log(report_id: str): }), 500 -# ============== 控制台日志接口 ============== +# ============== Console log endpoints ============== @report_bp.route('//console-log', methods=['GET']) def get_console_log(report_id: str): """ - 获取 Report Agent 的控制台输出日志 - - 实时获取报告生成过程中的控制台输出(INFO、WARNING等), - 这与 agent-log 接口返回的结构化 JSON 日志不同, - 是纯文本格式的控制台风格日志。 - - Query参数: - from_line: 从第几行开始读取(可选,默认0,用于增量获取) - - 返回: + Get Report Agent console output log + + Retrieve console output (INFO, WARNING, etc.) during report generation in real-time. + This is different from the structured JSON log returned by the agent-log endpoint; + it is a plain text format console-style log. + + Query parameters: + from_line: Start reading from which line (optional, default 0, for incremental retrieval) + + Response: { "success": true, "data": { "logs": [ - "[19:46:14] INFO: 搜索完成: 找到 15 条相关事实", - "[19:46:14] INFO: 图谱搜索: graph_id=xxx, query=...", + "[19:46:14] INFO: Search complete: found 15 relevant facts", + "[19:46:14] INFO: Graph search: graph_id=xxx, query=...", ... 
], "total_lines": 100, @@ -883,7 +891,7 @@ def get_console_log(report_id: str): }) except Exception as e: - logger.error(f"获取控制台日志失败: {str(e)}") + logger.error(f"Failed to get console log: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -894,9 +902,9 @@ def get_console_log(report_id: str): @report_bp.route('//console-log/stream', methods=['GET']) def stream_console_log(report_id: str): """ - 获取完整的控制台日志(一次性获取全部) - - 返回: + Get the complete console log (fetch all at once) + + Response: { "success": true, "data": { @@ -917,7 +925,7 @@ def stream_console_log(report_id: str): }) except Exception as e: - logger.error(f"获取控制台日志失败: {str(e)}") + logger.error(f"Failed to get console log: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -925,17 +933,17 @@ def stream_console_log(report_id: str): }), 500 -# ============== 工具调用接口(供调试使用)============== +# ============== Tool call endpoints (for debugging) ============== @report_bp.route('/tools/search', methods=['POST']) def search_graph_tool(): """ - 图谱搜索工具接口(供调试使用) - - 请求(JSON): + Graph search tool endpoint (for debugging) + + Request (JSON): { "graph_id": "mirofish_xxxx", - "query": "搜索查询", + "query": "search query", "limit": 10 } """ @@ -949,7 +957,7 @@ def search_graph_tool(): if not graph_id or not query: return jsonify({ "success": False, - "error": "请提供 graph_id 和 query" + "error": "Please provide graph_id and query" }), 400 from ..services.graph_tools import GraphToolsService @@ -970,7 +978,7 @@ def search_graph_tool(): }) except Exception as e: - logger.error(f"图谱搜索失败: {str(e)}") + logger.error(f"Graph search failed: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -981,9 +989,9 @@ def search_graph_tool(): @report_bp.route('/tools/statistics', methods=['POST']) def get_graph_statistics_tool(): """ - 图谱统计工具接口(供调试使用) - - 请求(JSON): + Graph statistics tool endpoint (for debugging) + + Request (JSON): { "graph_id": "mirofish_xxxx" } @@ -996,7 +1004,7 @@ def 
get_graph_statistics_tool(): if not graph_id: return jsonify({ "success": False, - "error": "请提供 graph_id" + "error": "Please provide graph_id" }), 400 from ..services.graph_tools import GraphToolsService @@ -1013,7 +1021,7 @@ def get_graph_statistics_tool(): }) except Exception as e: - logger.error(f"获取图谱统计失败: {str(e)}") + logger.error(f"Failed to get graph statistics: {str(e)}") return jsonify({ "success": False, "error": str(e), diff --git a/backend/app/api/simulation.py b/backend/app/api/simulation.py index ff798df..2f4e3df 100644 --- a/backend/app/api/simulation.py +++ b/backend/app/api/simulation.py @@ -1,6 +1,6 @@ """ -模拟相关API路由 -Step2: 实体读取与过滤、OASIS模拟准备与运行(全程自动化) +Simulation-related API routes +Step2: Entity reading and filtering, OASIS simulation preparation and execution (fully automated) """ import os @@ -19,48 +19,48 @@ logger = get_logger('mirofish.api.simulation') -# Interview prompt 优化前缀 -# 添加此前缀可以避免Agent调用工具,直接用文本回复 -INTERVIEW_PROMPT_PREFIX = "结合你的人设、所有的过往记忆与行动,不调用任何工具直接用文本回复我:" +# Interview prompt optimization prefix +# Adding this prefix prevents the Agent from calling tools, and forces a direct text reply +INTERVIEW_PROMPT_PREFIX = "Based on your persona, all past memories and actions, reply directly in text without calling any tools: " def optimize_interview_prompt(prompt: str) -> str: """ - 优化Interview提问,添加前缀避免Agent调用工具 + Optimize Interview prompt by adding prefix to prevent Agent from calling tools Args: - prompt: 原始提问 + prompt: Original prompt Returns: - 优化后的提问 + Optimized prompt """ if not prompt: return prompt - # 避免重复添加前缀 + # Avoid adding prefix repeatedly if prompt.startswith(INTERVIEW_PROMPT_PREFIX): return prompt return f"{INTERVIEW_PROMPT_PREFIX}{prompt}" -# ============== 实体读取接口 ============== +# ============== Entity reading endpoints ============== @simulation_bp.route('/entities/', methods=['GET']) def get_graph_entities(graph_id: str): """ - 获取图谱中的所有实体(已过滤) + Get all entities in the graph (filtered) - 
只返回符合预定义实体类型的节点(Labels不只是Entity的节点) + Only returns nodes matching predefined entity types (nodes with labels other than just Entity) - Query参数: - entity_types: 逗号分隔的实体类型列表(可选,用于进一步过滤) - enrich: 是否获取相关边信息(默认true) + Query parameters: + entity_types: Comma-separated list of entity types (optional, for further filtering) + enrich: Whether to get related edge info (default true) """ try: entity_types_str = request.args.get('entity_types', '') entity_types = [t.strip() for t in entity_types_str.split(',') if t.strip()] if entity_types_str else None enrich = request.args.get('enrich', 'true').lower() == 'true' - logger.info(f"获取图谱实体: graph_id={graph_id}, entity_types={entity_types}, enrich={enrich}") + logger.info(f"Get graph entities: graph_id={graph_id}, entity_types={entity_types}, enrich={enrich}") storage = current_app.extensions.get('neo4j_storage') if not storage: @@ -78,7 +78,7 @@ def get_graph_entities(graph_id: str): }) except Exception as e: - logger.error(f"获取图谱实体失败: {str(e)}") + logger.error(f"Failed to get graph entities: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -88,7 +88,7 @@ def get_graph_entities(graph_id: str): @simulation_bp.route('/entities//', methods=['GET']) def get_entity_detail(graph_id: str, entity_uuid: str): - """获取单个实体的详细信息""" + """Get detailed information of a single entity""" try: storage = current_app.extensions.get('neo4j_storage') if not storage: @@ -99,7 +99,7 @@ def get_entity_detail(graph_id: str, entity_uuid: str): if not entity: return jsonify({ "success": False, - "error": f"实体不存在: {entity_uuid}" + "error": f"Entity not found: {entity_uuid}" }), 404 return jsonify({ @@ -108,7 +108,7 @@ def get_entity_detail(graph_id: str, entity_uuid: str): }) except Exception as e: - logger.error(f"获取实体详情失败: {str(e)}") + logger.error(f"Failed to get entity details: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -118,7 +118,7 @@ def get_entity_detail(graph_id: str, entity_uuid: str): 
@simulation_bp.route('/entities//by-type/', methods=['GET']) def get_entities_by_type(graph_id: str, entity_type: str): - """获取指定类型的所有实体""" + """Get all entities of a specified type""" try: enrich = request.args.get('enrich', 'true').lower() == 'true' @@ -142,7 +142,7 @@ def get_entities_by_type(graph_id: str, entity_type: str): }) except Exception as e: - logger.error(f"获取实体失败: {str(e)}") + logger.error(f"Failed to get entities: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -150,24 +150,24 @@ def get_entities_by_type(graph_id: str, entity_type: str): }), 500 -# ============== 模拟管理接口 ============== +# ============== Simulation management endpoints ============== @simulation_bp.route('/create', methods=['POST']) def create_simulation(): """ - 创建新的模拟 + Create a new simulation - 注意:max_rounds等参数由LLM智能生成,无需手动设置 + Note: Parameters like max_rounds are intelligently generated by LLM, no manual setup needed - 请求(JSON): + Request (JSON): { - "project_id": "proj_xxxx", // 必填 - "graph_id": "mirofish_xxxx", // 可选,如不提供则从project获取 - "enable_twitter": true, // 可选,默认true - "enable_reddit": true // 可选,默认true + "project_id": "proj_xxxx", // Required + "graph_id": "mirofish_xxxx", // Optional, fetched from project if not provided + "enable_twitter": true, // Optional, default true + "enable_reddit": true // Optional, default true } - 返回: + Response: { "success": true, "data": { @@ -188,21 +188,21 @@ def create_simulation(): if not project_id: return jsonify({ "success": False, - "error": "请提供 project_id" + "error": "Please provide project_id" }), 400 project = ProjectManager.get_project(project_id) if not project: return jsonify({ "success": False, - "error": f"项目不存在: {project_id}" + "error": f"Project not found: {project_id}" }), 404 graph_id = data.get('graph_id') or project.graph_id if not graph_id: return jsonify({ "success": False, - "error": "项目尚未构建图谱,请先调用 /api/graph/build" + "error": "Project has not built a graph yet. 
Please call /api/graph/build first" }), 400 manager = SimulationManager() @@ -219,7 +219,7 @@ def create_simulation(): }) except Exception as e: - logger.error(f"创建模拟失败: {str(e)}") + logger.error(f"Failed to create simulation: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -229,16 +229,16 @@ def create_simulation(): def _check_simulation_prepared(simulation_id: str) -> tuple: """ - 检查模拟是否已经准备完成 + Check if the simulation has been prepared - 检查条件: - 1. state.json 存在且 status 为 "ready" - 2. 必要文件存在:reddit_profiles.json, twitter_profiles.csv, simulation_config.json + Check conditions: + 1. state.json exists and status is "ready" + 2. Required files exist: reddit_profiles.json, twitter_profiles.csv, simulation_config.json - 注意:运行脚本(run_*.py)保留在 backend/scripts/ 目录,不再复制到模拟目录 + Note: Run scripts (run_*.py) remain in the backend/scripts/ directory and are no longer copied to the simulation directory Args: - simulation_id: 模拟ID + simulation_id: Simulation ID Returns: (is_prepared: bool, info: dict) @@ -248,11 +248,11 @@ def _check_simulation_prepared(simulation_id: str) -> tuple: simulation_dir = os.path.join(Config.OASIS_SIMULATION_DATA_DIR, simulation_id) - # 检查目录是否存在 + # Check if directory exists if not os.path.exists(simulation_dir): - return False, {"reason": "模拟目录不存在"} + return False, {"reason": "Simulation directory does not exist"} - # 必要文件列表(不包括脚本,脚本位于 backend/scripts/) + # Required files list (excluding scripts, scripts are in backend/scripts/) required_files = [ "state.json", "simulation_config.json", @@ -260,7 +260,7 @@ def _check_simulation_prepared(simulation_id: str) -> tuple: "twitter_profiles.csv" ] - # 检查文件是否存在 + # Check if files exist existing_files = [] missing_files = [] for f in required_files: @@ -272,12 +272,12 @@ def _check_simulation_prepared(simulation_id: str) -> tuple: if missing_files: return False, { - "reason": "缺少必要文件", + "reason": "Missing required files", "missing_files": missing_files, "existing_files": existing_files } - 
# 检查state.json中的状态 + # Check state in state.json state_file = os.path.join(simulation_dir, "state.json") try: import json @@ -287,20 +287,20 @@ def _check_simulation_prepared(simulation_id: str) -> tuple: status = state_data.get("status", "") config_generated = state_data.get("config_generated", False) - # 详细日志 - logger.debug(f"检测模拟准备状态: {simulation_id}, status={status}, config_generated={config_generated}") - - # 如果 config_generated=True 且文件存在,认为准备完成 - # 以下状态都说明准备工作已完成: - # - ready: 准备完成,可以运行 - # - preparing: 如果 config_generated=True 说明已完成 - # - running: 正在运行,说明准备早就完成了 - # - completed: 运行完成,说明准备早就完成了 - # - stopped: 已停止,说明准备早就完成了 - # - failed: 运行失败(但准备是完成的) + # Detailed log + logger.debug(f"Check simulation preparation status: {simulation_id}, status={status}, config_generated={config_generated}") + + # If config_generated=True and files exist, consider preparation complete + # The following statuses indicate preparation is complete: + # - ready: Preparation complete, ready to run + # - preparing: If config_generated=True it means already completed + # - running: Currently running, which means preparation was completed long ago + # - completed: Run completed, which means preparation was completed long ago + # - stopped: Stopped, which means preparation was completed long ago + # - failed: Run failed (but preparation was completed) prepared_statuses = ["ready", "preparing", "running", "completed", "stopped", "failed"] if status in prepared_statuses and config_generated: - # 获取文件统计信息 + # Get file statistics profiles_file = os.path.join(simulation_dir, "reddit_profiles.json") config_file = os.path.join(simulation_dir, "simulation_config.json") @@ -310,7 +310,7 @@ def _check_simulation_prepared(simulation_id: str) -> tuple: profiles_data = json.load(f) profiles_count = len(profiles_data) if isinstance(profiles_data, list) else 0 - # 如果状态是preparing但文件已完成,自动更新状态为ready + # If status is preparing but files are complete, automatically update status to ready if status == 
"preparing": try: state_data["status"] = "ready" @@ -318,12 +318,12 @@ def _check_simulation_prepared(simulation_id: str) -> tuple: state_data["updated_at"] = datetime.now().isoformat() with open(state_file, 'w', encoding='utf-8') as f: json.dump(state_data, f, ensure_ascii=False, indent=2) - logger.info(f"自动更新模拟状态: {simulation_id} preparing -> ready") + logger.info(f"Auto-updating simulation status: {simulation_id} preparing -> ready") status = "ready" except Exception as e: - logger.warning(f"自动更新状态失败: {e}") + logger.warning(f"Failed to auto-update status: {e}") - logger.info(f"模拟 {simulation_id} 检测结果: 已准备完成 (status={status}, config_generated={config_generated})") + logger.info(f"Simulation {simulation_id} check result: preparation complete (status={status}, config_generated={config_generated})") return True, { "status": status, "entities_count": state_data.get("entities_count", 0), @@ -335,55 +335,55 @@ def _check_simulation_prepared(simulation_id: str) -> tuple: "existing_files": existing_files } else: - logger.warning(f"模拟 {simulation_id} 检测结果: 未准备完成 (status={status}, config_generated={config_generated})") + logger.warning(f"Simulation {simulation_id} check result: preparation not complete (status={status}, config_generated={config_generated})") return False, { - "reason": f"状态不在已准备列表中或config_generated为false: status={status}, config_generated={config_generated}", + "reason": f"Status not in prepared list or config_generated is false: status={status}, config_generated={config_generated}", "status": status, "config_generated": config_generated } except Exception as e: - return False, {"reason": f"读取状态文件失败: {str(e)}"} + return False, {"reason": f"Failed to read state file: {str(e)}"} @simulation_bp.route('/prepare', methods=['POST']) def prepare_simulation(): """ - 准备模拟环境(异步任务,LLM智能生成所有参数) + Prepare simulation environment (async task, LLM intelligently generates all parameters) - 这是一个耗时操作,接口会立即返回task_id, - 使用 GET /api/simulation/prepare/status 查询进度 + This is a 
time-consuming operation. The endpoint returns task_id immediately. + Use GET /api/simulation/prepare/status to query progress - 特性: - - 自动检测已完成的准备工作,避免重复生成 - - 如果已准备完成,直接返回已有结果 - - 支持强制重新生成(force_regenerate=true) + Features: + - Automatically detects completed preparation to avoid duplicate generation + - If already prepared, directly returns existing results + - Supports forced regeneration (force_regenerate=true) - 步骤: - 1. 检查是否已有完成的准备工作 - 2. 从Zep图谱读取并过滤实体 - 3. 为每个实体生成OASIS Agent Profile(带重试机制) - 4. LLM智能生成模拟配置(带重试机制) - 5. 保存配置文件和预设脚本 + Steps: + 1. Check if completed preparation work already exists + 2. Read and filter entities from Zep graph + 3. Generate OASIS Agent Profile for each entity (with retry mechanism) + 4. LLM intelligently generates simulation configuration (with retry mechanism) + 5. Save configuration files and preset scripts - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "entity_types": ["Student", "PublicFigure"], // 可选,指定实体类型 - "use_llm_for_profiles": true, // 可选,是否用LLM生成人设 - "parallel_profile_count": 5, // 可选,并行生成人设数量,默认5 - "force_regenerate": false // 可选,强制重新生成,默认false + "simulation_id": "sim_xxxx", // Required, simulation ID + "entity_types": ["Student", "PublicFigure"], // Optional, specify entity types + "use_llm_for_profiles": true, // Optional, whether to use LLM to generate personas + "parallel_profile_count": 5, // Optional, number of parallel persona generations, default 5 + "force_regenerate": false // Optional, force regeneration, default false } - 返回: + Response: { "success": true, "data": { "simulation_id": "sim_xxxx", - "task_id": "task_xxxx", // 新任务时返回 + "task_id": "task_xxxx", // Returned for new tasks "status": "preparing|ready", - "message": "准备任务已启动|已有完成的准备工作", - "already_prepared": true|false // 是否已准备完成 + "message": "Preparation task started|Completed preparation work exists", + "already_prepared": true|false // Whether preparation is complete } } """ @@ -399,7 +399,7 @@ def 
prepare_simulation(): if not simulation_id: return jsonify({ "success": False, - "error": "请提供 simulation_id" + "error": "Please provide simulation_id" }), 400 manager = SimulationManager() @@ -408,81 +408,81 @@ def prepare_simulation(): if not state: return jsonify({ "success": False, - "error": f"模拟不存在: {simulation_id}" + "error": f"Simulation not found: {simulation_id}" }), 404 - # 检查是否强制重新生成 + # Check whether to force regeneration force_regenerate = data.get('force_regenerate', False) - logger.info(f"开始处理 /prepare 请求: simulation_id={simulation_id}, force_regenerate={force_regenerate}") + logger.info(f"Processing /prepare request: simulation_id={simulation_id}, force_regenerate={force_regenerate}") - # 检查是否已经准备完成(避免重复生成) + # Check if already prepared (avoid duplicate generation) if not force_regenerate: - logger.debug(f"检查模拟 {simulation_id} 是否已准备完成...") + logger.debug(f"Checking whether simulation {simulation_id} preparation is complete...") is_prepared, prepare_info = _check_simulation_prepared(simulation_id) - logger.debug(f"检查结果: is_prepared={is_prepared}, prepare_info={prepare_info}") + logger.debug(f"Check result: is_prepared={is_prepared}, prepare_info={prepare_info}") if is_prepared: - logger.info(f"模拟 {simulation_id} 已准备完成,跳过重复生成") + logger.info(f"Simulation {simulation_id} already prepared, skipping duplicate generation") return jsonify({ "success": True, "data": { "simulation_id": simulation_id, "status": "ready", - "message": "已有完成的准备工作,无需重复生成", + "message": "Completed preparation work exists, no need for duplicate generation", "already_prepared": True, "prepare_info": prepare_info } }) else: - logger.info(f"模拟 {simulation_id} 未准备完成,将启动准备任务") + logger.info(f"Simulation {simulation_id} not prepared yet, will start preparation task") - # 从项目获取必要信息 + # Get required info from project project = ProjectManager.get_project(state.project_id) if not project: return jsonify({ "success": False, - "error": f"项目不存在: {state.project_id}" + "error": f"Project not found: 
{state.project_id}" }), 404 - # 获取模拟需求 + # Get simulation requirement simulation_requirement = project.simulation_requirement or "" if not simulation_requirement: return jsonify({ "success": False, - "error": "项目缺少模拟需求描述 (simulation_requirement)" + "error": "Project missing simulation requirement description (simulation_requirement)" }), 400 - # 获取文档文本 + # Get document text document_text = ProjectManager.get_extracted_text(state.project_id) or "" entity_types_list = data.get('entity_types') use_llm_for_profiles = data.get('use_llm_for_profiles', True) parallel_profile_count = data.get('parallel_profile_count', 5) - # ========== 获取 GraphStorage(在后台任务启动前捕获引用) ========== + # ========== Get GraphStorage (capture reference before background task starts) ========== storage = current_app.extensions.get('neo4j_storage') if not storage: raise ValueError("GraphStorage not initialized — check Neo4j connection") - # ========== 同步获取实体数量(在后台任务启动前) ========== - # 这样前端在调用prepare后立即就能获取到预期Agent总数 + # ========== Synchronously get entity count (before background task starts) ========== + # This allows the frontend to get the expected Agent count immediately after calling prepare try: - logger.info(f"同步获取实体数量: graph_id={state.graph_id}") + logger.info(f"Synchronously getting entity count: graph_id={state.graph_id}") reader = EntityReader(storage) - # 快速读取实体(不需要边信息,只统计数量) + # Quick entity read (no edge info needed, just count) filtered_preview = reader.filter_defined_entities( graph_id=state.graph_id, defined_entity_types=entity_types_list, - enrich_with_edges=False # 不获取边信息,加快速度 + enrich_with_edges=False # Skip edge info to speed up ) - # 保存实体数量到状态(供前端立即获取) + # Save entity count to state (for frontend immediate access) state.entities_count = filtered_preview.filtered_count state.entity_types = list(filtered_preview.entity_types) - logger.info(f"预期实体数量: {filtered_preview.filtered_count}, 类型: {filtered_preview.entity_types}") + logger.info(f"Expected entity count: 
{filtered_preview.filtered_count}, Types: {filtered_preview.entity_types}") except Exception as e: - logger.warning(f"同步获取实体数量失败(将在后台任务中重试): {e}") - # 失败不影响后续流程,后台任务会重新获取 + logger.warning(f"Failed to synchronously get entity count (will retry in background task): {e}") + # Failure does not affect subsequent flow, background task will retry - # 创建异步任务 + # Create async task task_manager = TaskManager() task_id = task_manager.create_task( task_type="simulation_prepare", @@ -492,26 +492,26 @@ def prepare_simulation(): } ) - # 更新模拟状态(包含预先获取的实体数量) + # Update simulation status (including pre-fetched entity count) state.status = SimulationStatus.PREPARING manager._save_simulation_state(state) - # 定义后台任务 + # Define background task def run_prepare(): try: task_manager.update_task( task_id, status=TaskStatus.PROCESSING, progress=0, - message="开始准备模拟环境..." + message="Starting simulation environment preparation..." ) - # 准备模拟(带进度回调) - # 存储阶段进度详情 + # Prepare simulation (with progress callback) + # Store stage progress details stage_details = {} def progress_callback(stage, progress, message, **kwargs): - # 计算总进度 + # Calculate total progress stage_weights = { "reading": (0, 20), # 0-20% "generating_profiles": (20, 70), # 20-70% @@ -522,18 +522,18 @@ def progress_callback(stage, progress, message, **kwargs): start, end = stage_weights.get(stage, (0, 100)) current_progress = int(start + (end - start) * progress / 100) - # 构建详细进度信息 + # Build detailed progress info stage_names = { - "reading": "读取图谱实体", - "generating_profiles": "生成Agent人设", - "generating_config": "生成模拟配置", - "copying_scripts": "准备模拟脚本" + "reading": "Reading graph entities", + "generating_profiles": "Generating Agent personas", + "generating_config": "Generating simulation config", + "copying_scripts": "Preparing simulation scripts" } stage_index = list(stage_weights.keys()).index(stage) + 1 if stage in stage_weights else 1 total_stages = len(stage_weights) - # 更新阶段详情 + # Update stage details stage_details[stage] = { 
"stage_name": stage_names.get(stage, stage), "stage_progress": progress, @@ -542,7 +542,7 @@ def progress_callback(stage, progress, message, **kwargs): "item_name": kwargs.get("item_name", "") } - # 构建详细进度信息 + # Build detailed progress info detail = stage_details[stage] progress_detail_data = { "current_stage": stage, @@ -555,7 +555,7 @@ def progress_callback(stage, progress, message, **kwargs): "item_description": message } - # 构建简洁消息 + # Build concise message if detail["total"] > 0: detailed_message = ( f"[{stage_index}/{total_stages}] {stage_names.get(stage, stage)}: " @@ -582,24 +582,24 @@ def progress_callback(stage, progress, message, **kwargs): storage=storage, ) - # 任务完成 + # Task complete task_manager.complete_task( task_id, result=result_state.to_simple_dict() ) except Exception as e: - logger.error(f"准备模拟失败: {str(e)}") + logger.error(f"Failed to prepare simulation: {str(e)}") task_manager.fail_task(task_id, str(e)) - # 更新模拟状态为失败 + # Update simulation status to failed state = manager.get_simulation(simulation_id) if state: state.status = SimulationStatus.FAILED state.error = str(e) manager._save_simulation_state(state) - # 启动后台线程 + # Start background thread thread = threading.Thread(target=run_prepare, daemon=True) thread.start() @@ -609,10 +609,10 @@ def progress_callback(stage, progress, message, **kwargs): "simulation_id": simulation_id, "task_id": task_id, "status": "preparing", - "message": "准备任务已启动,请通过 /api/simulation/prepare/status 查询进度", + "message": "Preparation task started. 
Query progress via /api/simulation/prepare/status", "already_prepared": False, - "expected_entities_count": state.entities_count, # 预期的Agent总数 - "entity_types": state.entity_types # 实体类型列表 + "expected_entities_count": state.entities_count, # Expected total Agent count + "entity_types": state.entity_types # Entity type list } }) @@ -623,7 +623,7 @@ def progress_callback(stage, progress, message, **kwargs): }), 404 except Exception as e: - logger.error(f"启动准备任务失败: {str(e)}") + logger.error(f"Failed to start preparation task: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -634,19 +634,19 @@ def progress_callback(stage, progress, message, **kwargs): @simulation_bp.route('/prepare/status', methods=['POST']) def get_prepare_status(): """ - 查询准备任务进度 + Query preparation task progress - 支持两种查询方式: - 1. 通过task_id查询正在进行的任务进度 - 2. 通过simulation_id检查是否已有完成的准备工作 + Supports two query methods: + 1. Query ongoing task progress via task_id + 2. Check via simulation_id if completed preparation work already exists - 请求(JSON): + Request (JSON): { - "task_id": "task_xxxx", // 可选,prepare返回的task_id - "simulation_id": "sim_xxxx" // 可选,模拟ID(用于检查已完成的准备) + "task_id": "task_xxxx", // Optional, task_id returned by prepare + "simulation_id": "sim_xxxx" // Optional, simulation ID (for checking completed preparation) } - 返回: + Response: { "success": true, "data": { @@ -654,8 +654,8 @@ def get_prepare_status(): "status": "processing|completed|ready", "progress": 45, "message": "...", - "already_prepared": true|false, // 是否已有完成的准备 - "prepare_info": {...} // 已准备完成时的详细信息 + "already_prepared": true|false, // Whether already has completed preparation + "prepare_info": {...} // Detailed info when preparation is complete } } """ @@ -667,7 +667,7 @@ def get_prepare_status(): task_id = data.get('task_id') simulation_id = data.get('simulation_id') - # 如果提供了simulation_id,先检查是否已准备完成 + # If simulation_id is provided, first check whether preparation is complete if simulation_id: is_prepared, 
prepare_info = _check_simulation_prepared(simulation_id) if is_prepared: @@ -677,36 +677,36 @@ def get_prepare_status(): "simulation_id": simulation_id, "status": "ready", "progress": 100, - "message": "已有完成的准备工作", + "message": "Completed preparation work already exists", "already_prepared": True, "prepare_info": prepare_info } }) - # 如果没有task_id,返回错误 + # If no task_id, return error if not task_id: if simulation_id: - # 有simulation_id但未准备完成 + # Has simulation_id but preparation not complete return jsonify({ "success": True, "data": { "simulation_id": simulation_id, "status": "not_started", "progress": 0, - "message": "尚未开始准备,请调用 /api/simulation/prepare 开始", + "message": "Preparation not started yet. Please call /api/simulation/prepare to begin", "already_prepared": False } }) return jsonify({ "success": False, - "error": "请提供 task_id 或 simulation_id" + "error": "Please provide task_id or simulation_id" }), 400 task_manager = TaskManager() task = task_manager.get_task(task_id) if not task: - # 任务不存在,但如果有simulation_id,检查是否已准备完成 + # Task not found, but if simulation_id exists, check whether preparation is complete if simulation_id: is_prepared, prepare_info = _check_simulation_prepared(simulation_id) if is_prepared: @@ -717,7 +717,7 @@ def get_prepare_status(): "task_id": task_id, "status": "ready", "progress": 100, - "message": "任务已完成(准备工作已存在)", + "message": "Task completed (preparation work already exists)", "already_prepared": True, "prepare_info": prepare_info } @@ -725,7 +725,7 @@ def get_prepare_status(): return jsonify({ "success": False, - "error": f"任务不存在: {task_id}" + "error": f"Task not found: {task_id}" }), 404 task_dict = task.to_dict() @@ -737,7 +737,7 @@ def get_prepare_status(): }) except Exception as e: - logger.error(f"查询任务状态失败: {str(e)}") + logger.error(f"Failed to query task status: {str(e)}") return jsonify({ "success": False, "error": str(e) @@ -746,7 +746,7 @@ def get_prepare_status(): @simulation_bp.route('/', methods=['GET']) def 
get_simulation(simulation_id: str): - """获取模拟状态""" + """Get simulation status""" try: manager = SimulationManager() state = manager.get_simulation(simulation_id) @@ -754,12 +754,12 @@ def get_simulation(simulation_id: str): if not state: return jsonify({ "success": False, - "error": f"模拟不存在: {simulation_id}" + "error": f"Simulation not found: {simulation_id}" }), 404 result = state.to_dict() - # 如果模拟已准备好,附加运行说明 + # If simulation is ready, attach run instructions if state.status == SimulationStatus.READY: result["run_instructions"] = manager.get_run_instructions(simulation_id) @@ -769,7 +769,7 @@ def get_simulation(simulation_id: str): }) except Exception as e: - logger.error(f"获取模拟状态失败: {str(e)}") + logger.error(f"Failed to get simulation status: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -780,10 +780,10 @@ def get_simulation(simulation_id: str): @simulation_bp.route('/list', methods=['GET']) def list_simulations(): """ - 列出所有模拟 + List all simulations - Query参数: - project_id: 按项目ID过滤(可选) + Query parameters: + project_id: Filter by project ID (optional) """ try: project_id = request.args.get('project_id') @@ -798,7 +798,7 @@ def list_simulations(): }) except Exception as e: - logger.error(f"列出模拟失败: {str(e)}") + logger.error(f"Failed to list simulations: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -808,22 +808,22 @@ def list_simulations(): def _get_report_id_for_simulation(simulation_id: str) -> str: """ - 获取 simulation 对应的最新 report_id + Get the latest report_id for the simulation - 遍历 reports 目录,找出 simulation_id 匹配的 report, - 如果有多个则返回最新的(按 created_at 排序) + Traverse reports directory, find reports matching simulation_id, + if multiple exist, return the newest one (sorted by created_at) Args: - simulation_id: 模拟ID + simulation_id: Simulation ID Returns: - report_id 或 None + report_id or None """ import json from datetime import datetime - # reports 目录路径:backend/uploads/reports - # __file__ 是 app/api/simulation.py,需要向上两级到 
backend/ + # Reports directory path: backend/uploads/reports + # __file__ is app/api/simulation.py, need to go up two levels to backend/ reports_dir = os.path.join(os.path.dirname(__file__), '../../uploads/reports') if not os.path.exists(reports_dir): return None @@ -856,34 +856,34 @@ def _get_report_id_for_simulation(simulation_id: str) -> str: if not matching_reports: return None - # 按创建时间倒序排序,返回最新的 + # Sort by creation time in descending order, return the newest matching_reports.sort(key=lambda x: x.get("created_at", ""), reverse=True) return matching_reports[0].get("report_id") except Exception as e: - logger.warning(f"查找 simulation {simulation_id} 的 report 失败: {e}") + logger.warning(f"Failed to find report for simulation {simulation_id}: {e}") return None @simulation_bp.route('/history', methods=['GET']) def get_simulation_history(): """ - 获取历史模拟列表(带项目详情) + Get historical simulation list (with project details) - 用于首页历史项目展示,返回包含项目名称、描述等丰富信息的模拟列表 + For homepage historical project display, returns simulation list with rich info like project name, description, etc. 
- Query参数: - limit: 返回数量限制(默认20) + Query parameters: + limit: Return count limit (default 20) - 返回: + Response: { "success": true, "data": [ { "simulation_id": "sim_xxxx", "project_id": "proj_xxxx", - "project_name": "武大舆情分析", - "simulation_requirement": "如果武汉大学发布...", + "project_name": "WHU Public Opinion Analysis", + "simulation_requirement": "If Wuhan University releases...", "status": "completed", "entities_count": 68, "profiles_count": 68, @@ -906,18 +906,18 @@ def get_simulation_history(): manager = SimulationManager() simulations = manager.list_simulations()[:limit] - # 增强模拟数据,只从 Simulation 文件读取 + # Enrich simulation data, only read from Simulation files enriched_simulations = [] for sim in simulations: sim_dict = sim.to_dict() - # 获取模拟配置信息(从 simulation_config.json 读取 simulation_requirement) + # Get simulation config info (read simulation_requirement from simulation_config.json) config = manager.get_simulation_config(sim.simulation_id) if config: sim_dict["simulation_requirement"] = config.get("simulation_requirement", "") time_config = config.get("time_config", {}) sim_dict["total_simulation_hours"] = time_config.get("total_simulation_hours", 0) - # 推荐轮数(后备值) + # Recommended rounds (fallback value) recommended_rounds = int( time_config.get("total_simulation_hours", 0) * 60 / max(time_config.get("minutes_per_round", 60), 1) @@ -927,35 +927,35 @@ def get_simulation_history(): sim_dict["total_simulation_hours"] = 0 recommended_rounds = 0 - # 获取运行状态(从 run_state.json 读取用户设置的实际轮数) + # Get run state (read actual rounds set by user from run_state.json) run_state = SimulationRunner.get_run_state(sim.simulation_id) if run_state: sim_dict["current_round"] = run_state.current_round sim_dict["runner_status"] = run_state.runner_status.value - # 使用用户设置的 total_rounds,若无则使用推荐轮数 + # Use user-set total_rounds, otherwise use recommended rounds sim_dict["total_rounds"] = run_state.total_rounds if run_state.total_rounds > 0 else recommended_rounds else: sim_dict["current_round"] 
= 0 sim_dict["runner_status"] = "idle" sim_dict["total_rounds"] = recommended_rounds - # 获取关联项目的文件列表(最多3个) + # Get associated project file list (max 3) project = ProjectManager.get_project(sim.project_id) if project and hasattr(project, 'files') and project.files: sim_dict["files"] = [ - {"filename": f.get("filename", "未知文件")} + {"filename": f.get("filename", "Unknown file")} for f in project.files[:3] ] else: sim_dict["files"] = [] - # 获取关联的 report_id(查找该 simulation 最新的 report) + # Get associated report_id (find the latest report for this simulation) sim_dict["report_id"] = _get_report_id_for_simulation(sim.simulation_id) - # 添加版本号 + # Add version number sim_dict["version"] = "v1.0.2" - # 格式化日期 + # Format date try: created_date = sim_dict.get("created_at", "")[:10] sim_dict["created_date"] = created_date @@ -971,7 +971,7 @@ def get_simulation_history(): }) except Exception as e: - logger.error(f"获取历史模拟失败: {str(e)}") + logger.error(f"Failed to get simulation history: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -982,10 +982,10 @@ def get_simulation_history(): @simulation_bp.route('//profiles', methods=['GET']) def get_simulation_profiles(simulation_id: str): """ - 获取模拟的Agent Profile + Get simulation Agent Profiles - Query参数: - platform: 平台类型(reddit/twitter,默认reddit) + Query parameters: + platform: Platform type (reddit/twitter, default reddit) """ try: platform = request.args.get('platform', 'reddit') @@ -1009,7 +1009,7 @@ def get_simulation_profiles(simulation_id: str): }), 404 except Exception as e: - logger.error(f"获取Profile失败: {str(e)}") + logger.error(f"Failed to get profiles: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1020,25 +1020,25 @@ def get_simulation_profiles(simulation_id: str): @simulation_bp.route('//profiles/realtime', methods=['GET']) def get_simulation_profiles_realtime(simulation_id: str): """ - 实时获取模拟的Agent Profile(用于在生成过程中实时查看进度) + Get simulation Agent Profiles in real-time (for viewing progress 
during generation) - 与 /profiles 接口的区别: - - 直接读取文件,不经过 SimulationManager - - 适用于生成过程中的实时查看 - - 返回额外的元数据(如文件修改时间、是否正在生成等) + Differences from the /profiles endpoint: + - Reads files directly, without going through SimulationManager + - Suitable for real-time viewing during generation + - Returns additional metadata (such as file modification time, whether generating, etc.) - Query参数: - platform: 平台类型(reddit/twitter,默认reddit) + Query parameters: + platform: Platform type (reddit/twitter, default reddit) - 返回: + Response: { "success": true, "data": { "simulation_id": "sim_xxxx", "platform": "reddit", "count": 15, - "total_expected": 93, // 预期总数(如果有) - "is_generating": true, // 是否正在生成 + "total_expected": 93, // Expected total (if available) + "is_generating": true, // Whether currently generating "file_exists": true, "file_modified_at": "2025-12-04T18:20:00", "profiles": [...] @@ -1052,28 +1052,28 @@ def get_simulation_profiles_realtime(simulation_id: str): try: platform = request.args.get('platform', 'reddit') - # 获取模拟目录 + # Get simulation directory sim_dir = os.path.join(Config.OASIS_SIMULATION_DATA_DIR, simulation_id) if not os.path.exists(sim_dir): return jsonify({ "success": False, - "error": f"模拟不存在: {simulation_id}" + "error": f"Simulation not found: {simulation_id}" }), 404 - # 确定文件路径 + # Determine file path if platform == "reddit": profiles_file = os.path.join(sim_dir, "reddit_profiles.json") else: profiles_file = os.path.join(sim_dir, "twitter_profiles.csv") - # 检查文件是否存在 + # Check if files exist file_exists = os.path.exists(profiles_file) profiles = [] file_modified_at = None if file_exists: - # 获取文件修改时间 + # Get file modification time file_stat = os.stat(profiles_file) file_modified_at = datetime.fromtimestamp(file_stat.st_mtime).isoformat() @@ -1086,10 +1086,10 @@ def get_simulation_profiles_realtime(simulation_id: str): reader = csv.DictReader(f) profiles = list(reader) except (json.JSONDecodeError, Exception) as e: - logger.warning(f"读取 profiles 
文件失败(可能正在写入中): {e}") + logger.warning(f"Failed to read profiles file (may be in progress): {e}") profiles = [] - # 检查是否正在生成(通过 state.json 判断) + # Check if generating (by checking state.json) is_generating = False total_expected = None @@ -1119,7 +1119,7 @@ def get_simulation_profiles_realtime(simulation_id: str): }) except Exception as e: - logger.error(f"实时获取Profile失败: {str(e)}") + logger.error(f"Failed to get profiles in real-time: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1130,24 +1130,24 @@ def get_simulation_profiles_realtime(simulation_id: str): @simulation_bp.route('//config/realtime', methods=['GET']) def get_simulation_config_realtime(simulation_id: str): """ - 实时获取模拟配置(用于在生成过程中实时查看进度) + Get simulation config in real-time (for viewing progress during generation) - 与 /config 接口的区别: - - 直接读取文件,不经过 SimulationManager - - 适用于生成过程中的实时查看 - - 返回额外的元数据(如文件修改时间、是否正在生成等) - - 即使配置还没生成完也能返回部分信息 + Differences from the /config endpoint: + - Reads files directly, without going through SimulationManager + - Suitable for real-time viewing during generation + - Returns additional metadata (such as file modification time, whether generating, etc.) 
+ - Can return partial information even if config generation is not complete - 返回: + Response: { "success": true, "data": { "simulation_id": "sim_xxxx", "file_exists": true, "file_modified_at": "2025-12-04T18:20:00", - "is_generating": true, // 是否正在生成 - "generation_stage": "generating_config", // 当前生成阶段 - "config": {...} // 配置内容(如果存在) + "is_generating": true, // Whether currently generating + "generation_stage": "generating_config", // Current generation stage + "config": {...} // Config content (if exists) } } """ @@ -1155,25 +1155,25 @@ def get_simulation_config_realtime(simulation_id: str): from datetime import datetime try: - # 获取模拟目录 + # Get simulation directory sim_dir = os.path.join(Config.OASIS_SIMULATION_DATA_DIR, simulation_id) if not os.path.exists(sim_dir): return jsonify({ "success": False, - "error": f"模拟不存在: {simulation_id}" + "error": f"Simulation not found: {simulation_id}" }), 404 - # 配置文件路径 + # Config file path config_file = os.path.join(sim_dir, "simulation_config.json") - # 检查文件是否存在 + # Check if files exist file_exists = os.path.exists(config_file) config = None file_modified_at = None if file_exists: - # 获取文件修改时间 + # Get file modification time file_stat = os.stat(config_file) file_modified_at = datetime.fromtimestamp(file_stat.st_mtime).isoformat() @@ -1181,10 +1181,10 @@ def get_simulation_config_realtime(simulation_id: str): with open(config_file, 'r', encoding='utf-8') as f: config = json.load(f) except (json.JSONDecodeError, Exception) as e: - logger.warning(f"读取 config 文件失败(可能正在写入中): {e}") + logger.warning(f"Failed to read config file (may be in progress): {e}") config = None - # 检查是否正在生成(通过 state.json 判断) + # Check if generating (by checking state.json) is_generating = False generation_stage = None config_generated = False @@ -1198,7 +1198,7 @@ def get_simulation_config_realtime(simulation_id: str): is_generating = status == "preparing" config_generated = state_data.get("config_generated", False) - # 判断当前阶段 + # Determine current stage if 
is_generating: if state_data.get("profiles_generated", False): generation_stage = "generating_config" @@ -1209,7 +1209,7 @@ def get_simulation_config_realtime(simulation_id: str): except Exception: pass - # 构建返回数据 + # Build response data response_data = { "simulation_id": simulation_id, "file_exists": file_exists, @@ -1220,7 +1220,7 @@ def get_simulation_config_realtime(simulation_id: str): "config": config } - # 如果配置存在,提取一些关键统计信息 + # If config exists, extract some key statistics if config: response_data["summary"] = { "total_agents": len(config.get("agent_configs", [])), @@ -1239,7 +1239,7 @@ def get_simulation_config_realtime(simulation_id: str): }) except Exception as e: - logger.error(f"实时获取Config失败: {str(e)}") + logger.error(f"Failed to get config in real-time: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1250,14 +1250,14 @@ def get_simulation_config_realtime(simulation_id: str): @simulation_bp.route('//config', methods=['GET']) def get_simulation_config(simulation_id: str): """ - 获取模拟配置(LLM智能生成的完整配置) - - 返回包含: - - time_config: 时间配置(模拟时长、轮次、高峰/低谷时段) - - agent_configs: 每个Agent的活动配置(活跃度、发言频率、立场等) - - event_config: 事件配置(初始帖子、热点话题) - - platform_configs: 平台配置 - - generation_reasoning: LLM的配置推理说明 + Get simulation configuration (complete configuration intelligently generated by LLM) + + Response includes: + - time_config: Time configuration (simulation duration, rounds, peak/off-peak periods) + - agent_configs: Activity configuration for each Agent (activity level, posting frequency, stance, etc.) + - event_config: Event configuration (initial posts, hot topics) + - platform_configs: Platform configuration + - generation_reasoning: LLM's configuration reasoning explanation """ try: manager = SimulationManager() @@ -1266,7 +1266,7 @@ def get_simulation_config(simulation_id: str): if not config: return jsonify({ "success": False, - "error": f"模拟配置不存在,请先调用 /prepare 接口" + "error": f"Simulation config does not exist. 
Please call /prepare endpoint first" }), 404 return jsonify({ @@ -1275,7 +1275,7 @@ def get_simulation_config(simulation_id: str): }) except Exception as e: - logger.error(f"获取配置失败: {str(e)}") + logger.error(f"Failed to get config: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1285,7 +1285,7 @@ def get_simulation_config(simulation_id: str): @simulation_bp.route('//config/download', methods=['GET']) def download_simulation_config(simulation_id: str): - """下载模拟配置文件""" + """Download simulation config file""" try: manager = SimulationManager() sim_dir = manager._get_simulation_dir(simulation_id) @@ -1294,7 +1294,7 @@ def download_simulation_config(simulation_id: str): if not os.path.exists(config_path): return jsonify({ "success": False, - "error": "配置文件不存在,请先调用 /prepare 接口" + "error": "Config file does not exist. Please call /prepare endpoint first" }), 404 return send_file( @@ -1304,7 +1304,7 @@ def download_simulation_config(simulation_id: str): ) except Exception as e: - logger.error(f"下载配置失败: {str(e)}") + logger.error(f"Failed to download config: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1315,19 +1315,19 @@ def download_simulation_config(simulation_id: str): @simulation_bp.route('/script//download', methods=['GET']) def download_simulation_script(script_name: str): """ - 下载模拟运行脚本文件(通用脚本,位于 backend/scripts/) + Download simulation run script files (generic scripts located in backend/scripts/) - script_name可选值: + script_name possible values: - run_twitter_simulation.py - run_reddit_simulation.py - run_parallel_simulation.py - action_logger.py """ try: - # 脚本位于 backend/scripts/ 目录 + # Scripts are located in the backend/scripts/ directory scripts_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../scripts')) - # 验证脚本名称 + # Validate script name allowed_scripts = [ "run_twitter_simulation.py", "run_reddit_simulation.py", @@ -1338,7 +1338,7 @@ def download_simulation_script(script_name: str): if script_name 
not in allowed_scripts: return jsonify({ "success": False, - "error": f"未知脚本: {script_name},可选: {allowed_scripts}" + "error": f"Unknown script: {script_name}. Available: {allowed_scripts}" }), 400 script_path = os.path.join(scripts_dir, script_name) @@ -1346,7 +1346,7 @@ if not os.path.exists(script_path): return jsonify({ "success": False, - "error": f"脚本文件不存在: {script_name}" + "error": f"Script file does not exist: {script_name}" }), 404 return send_file( @@ -1356,7 +1356,7 @@ ) except Exception as e: - logger.error(f"下载脚本失败: {str(e)}") + logger.error(f"Failed to download script: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1364,19 +1364,19 @@ }), 500 -# ============== Profile生成接口(独立使用) ============== +# ============== Profile generation endpoints (standalone use) ============== @simulation_bp.route('/generate-profiles', methods=['POST']) def generate_profiles(): """ - 直接从图谱生成OASIS Agent Profile(不创建模拟) + Generate OASIS Agent Profile directly from graph (without creating a simulation) - 请求(JSON): + Request (JSON): { - "graph_id": "mirofish_xxxx", // 必填 - "entity_types": ["Student"], // 可选 - "use_llm": true, // 可选 - "platform": "reddit" // 可选 + "graph_id": "mirofish_xxxx", // Required + "entity_types": ["Student"], // Optional + "use_llm": true, // Optional + "platform": "reddit" // Optional } """ try: @@ -1386,7 +1386,7 @@ if not graph_id: return jsonify({ "success": False, - "error": "请提供 graph_id" + "error": "Please provide graph_id" }), 400 entity_types = data.get('entity_types') @@ -1406,7 +1406,7 @@ if filtered.filtered_count == 0: return jsonify({ "success": False, - "error": "没有找到符合条件的实体" + "error": "No matching entities found" }), 400 generator = OasisProfileGenerator() @@ -1433,7 +1433,7 @@ }) except Exception as 
e: - logger.error(f"生成Profile失败: {str(e)}") + logger.error(f"Failed to generate profiles: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1441,35 +1441,35 @@ def generate_profiles(): }), 500 -# ============== 模拟运行控制接口 ============== +# ============== Simulation run control endpoints ============== @simulation_bp.route('/start', methods=['POST']) def start_simulation(): """ - 开始运行模拟 + Start running simulation - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "platform": "parallel", // 可选: twitter / reddit / parallel (默认) - "max_rounds": 100, // 可选: 最大模拟轮数,用于截断过长的模拟 - "enable_graph_memory_update": false, // 可选: 是否将Agent活动动态更新到Zep图谱记忆 - "force": false // 可选: 强制重新开始(会停止运行中的模拟并清理日志) + "simulation_id": "sim_xxxx", // Required, simulation ID + "platform": "parallel", // Optional: twitter / reddit / parallel (default) + "max_rounds": 100, // Optional: Maximum simulation rounds, for truncating overly long simulations + "enable_graph_memory_update": false, // Optional: Whether to dynamically update Agent activities to Zep graph memory + "force": false // Optional: Force restart (stops running simulation and cleans up logs) } - 关于 force 参数: - - 启用后,如果模拟正在运行或已完成,会先停止并清理运行日志 - - 清理的内容包括:run_state.json, actions.jsonl, simulation.log 等 - - 不会清理配置文件(simulation_config.json)和 profile 文件 - - 适用于需要重新运行模拟的场景 + About the force parameter: + - When enabled, if simulation is running or completed, it will first stop and clean up run logs + - Cleaned content includes: run_state.json, actions.jsonl, simulation.log, etc. + - Will not clean up config files (simulation_config.json) and profile files + - Suitable for scenarios requiring simulation re-run - 关于 enable_graph_memory_update: - - 启用后,模拟中所有Agent的活动(发帖、评论、点赞等)都会实时更新到Zep图谱 - - 这可以让图谱"记住"模拟过程,用于后续分析或AI对话 - - 需要模拟关联的项目有有效的 graph_id - - 采用批量更新机制,减少API调用次数 + About enable_graph_memory_update: + - When enabled, all Agent activities (posting, commenting, liking, etc.) 
during simulation will be updated to Zep graph in real-time + - This allows the graph to "remember" the simulation process for subsequent analysis or AI conversation + - Requires the associated project to have a valid graph_id + - Uses batch update mechanism to reduce API call count - 返回: + Response: { "success": true, "data": { @@ -1479,8 +1479,8 @@ def start_simulation(): "twitter_running": true, "reddit_running": true, "started_at": "2025-12-01T10:00:00", - "graph_memory_update_enabled": true, // 是否启用了图谱记忆更新 - "force_restarted": true // 是否是强制重新开始 + "graph_memory_update_enabled": true, // Whether graph memory update is enabled + "force_restarted": true // Whether it is a forced restart } } """ @@ -1491,98 +1491,98 @@ def start_simulation(): if not simulation_id: return jsonify({ "success": False, - "error": "请提供 simulation_id" + "error": "Please provide simulation_id" }), 400 platform = data.get('platform', 'parallel') - max_rounds = data.get('max_rounds') # 可选:最大模拟轮数 - enable_graph_memory_update = data.get('enable_graph_memory_update', False) # 可选:是否启用图谱记忆更新 - force = data.get('force', False) # 可选:强制重新开始 + max_rounds = data.get('max_rounds') # Optional: maximum simulation rounds + enable_graph_memory_update = data.get('enable_graph_memory_update', False) # Optional: whether to enable graph memory update + force = data.get('force', False) # Optional: force restart - # 验证 max_rounds 参数 + # Validate max_rounds parameter if max_rounds is not None: try: max_rounds = int(max_rounds) if max_rounds <= 0: return jsonify({ "success": False, - "error": "max_rounds 必须是正整数" + "error": "max_rounds must be a positive integer" }), 400 except (ValueError, TypeError): return jsonify({ "success": False, - "error": "max_rounds 必须是有效的整数" + "error": "max_rounds must be a valid integer" }), 400 if platform not in ['twitter', 'reddit', 'parallel']: return jsonify({ "success": False, - "error": f"无效的平台类型: {platform},可选: twitter/reddit/parallel" + "error": f"Invalid platform type: 
{platform}. Available: twitter/reddit/parallel" }), 400 - # 检查模拟是否已准备好 + # Check if simulation is ready manager = SimulationManager() state = manager.get_simulation(simulation_id) if not state: return jsonify({ "success": False, - "error": f"模拟不存在: {simulation_id}" + "error": f"Simulation not found: {simulation_id}" }), 404 force_restarted = False - # 智能处理状态:如果准备工作已完成,允许重新启动 + # Smart status handling: if preparation is complete, allow restart if state.status != SimulationStatus.READY: - # 检查准备工作是否已完成 + # Check if preparation is complete is_prepared, prepare_info = _check_simulation_prepared(simulation_id) if is_prepared: - # 准备工作已完成,检查是否有正在运行的进程 + # Preparation complete, check if there is a running process if state.status == SimulationStatus.RUNNING: - # 检查模拟进程是否真的在运行 + # Check if simulation process is actually running run_state = SimulationRunner.get_run_state(simulation_id) if run_state and run_state.runner_status.value == "running": - # 进程确实在运行 + # Process is actually running if force: - # 强制模式:停止运行中的模拟 - logger.info(f"强制模式:停止运行中的模拟 {simulation_id}") + # Force mode: stopping running simulation + logger.info(f"Force mode: stopping running simulation {simulation_id}") try: SimulationRunner.stop_simulation(simulation_id) except Exception as e: - logger.warning(f"停止模拟时出现警告: {str(e)}") + logger.warning(f"Warning while stopping simulation: {str(e)}") else: return jsonify({ "success": False, - "error": f"模拟正在运行中,请先调用 /stop 接口停止,或使用 force=true 强制重新开始" + "error": f"Simulation is running. 
Please call /stop endpoint to stop first, or use force=true to force restart" }), 400 - # 如果是强制模式,清理运行日志 + # If force mode, clean up run logs if force: - logger.info(f"强制模式:清理模拟日志 {simulation_id}") + logger.info(f"Force mode: cleaning up simulation logs {simulation_id}") cleanup_result = SimulationRunner.cleanup_simulation_logs(simulation_id) if not cleanup_result.get("success"): - logger.warning(f"清理日志时出现警告: {cleanup_result.get('errors')}") + logger.warning(f"Warning while cleaning logs: {cleanup_result.get('errors')}") force_restarted = True - # 进程不存在或已结束,重置状态为 ready - logger.info(f"模拟 {simulation_id} 准备工作已完成,重置状态为 ready(原状态: {state.status.value})") + # Process does not exist or has ended, resetting status to ready + logger.info(f"Simulation {simulation_id} preparation complete, resetting status to ready (original status: {state.status.value})") state.status = SimulationStatus.READY manager._save_simulation_state(state) else: - # 准备工作未完成 + # Preparation not complete return jsonify({ "success": False, - "error": f"模拟未准备好,当前状态: {state.status.value},请先调用 /prepare 接口" + "error": f"Simulation not ready, current status: {state.status.value}, please call /prepare endpoint first" }), 400 - # 获取图谱ID(用于图谱记忆更新) + # Get graph ID (for graph memory update) graph_id = None if enable_graph_memory_update: - # 从模拟状态或项目中获取 graph_id + # Get graph_id from simulation state or project graph_id = state.graph_id if not graph_id: - # 尝试从项目中获取 + # Try to get from project project = ProjectManager.get_project(state.project_id) if project: graph_id = project.graph_id @@ -1590,12 +1590,12 @@ if not graph_id: return jsonify({ "success": False, - "error": "启用图谱记忆更新需要有效的 graph_id,请确保项目已构建图谱" + "error": "Enabling graph memory update requires a valid graph_id. 
Please ensure the project has built a graph" }), 400 - logger.info(f"启用图谱记忆更新: simulation_id={simulation_id}, graph_id={graph_id}") + logger.info(f"Enabling graph memory update: simulation_id={simulation_id}, graph_id={graph_id}") - # 启动模拟 + # Start simulation run_state = SimulationRunner.start_simulation( simulation_id=simulation_id, platform=platform, @@ -1604,7 +1604,7 @@ def start_simulation(): graph_id=graph_id ) - # 更新模拟状态 + # Update simulation status state.status = SimulationStatus.RUNNING manager._save_simulation_state(state) @@ -1628,7 +1628,7 @@ def start_simulation(): }), 400 except Exception as e: - logger.error(f"启动模拟失败: {str(e)}") + logger.error(f"Failed to start simulation: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1639,14 +1639,14 @@ def start_simulation(): @simulation_bp.route('/stop', methods=['POST']) def stop_simulation(): """ - 停止模拟 + Stop simulation - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx" // 必填,模拟ID + "simulation_id": "sim_xxxx" // Required, simulation ID } - 返回: + Response: { "success": true, "data": { @@ -1663,12 +1663,12 @@ def stop_simulation(): if not simulation_id: return jsonify({ "success": False, - "error": "请提供 simulation_id" + "error": "Please provide simulation_id" }), 400 run_state = SimulationRunner.stop_simulation(simulation_id) - # 更新模拟状态 + # Update simulation status manager = SimulationManager() state = manager.get_simulation(simulation_id) if state: @@ -1687,7 +1687,7 @@ def stop_simulation(): }), 400 except Exception as e: - logger.error(f"停止模拟失败: {str(e)}") + logger.error(f"Failed to stop simulation: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1695,14 +1695,14 @@ def stop_simulation(): }), 500 -# ============== 实时状态监控接口 ============== +# ============== Real-time status monitoring endpoints ============== @simulation_bp.route('//run-status', methods=['GET']) def get_run_status(simulation_id: str): """ - 获取模拟运行实时状态(用于前端轮询) + Get simulation real-time run 
status (for frontend polling) - 返回: + Response: { "success": true, "data": { @@ -1747,7 +1747,7 @@ def get_run_status(simulation_id: str): }) except Exception as e: - logger.error(f"获取运行状态失败: {str(e)}") + logger.error(f"Failed to get run status: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1758,14 +1758,14 @@ def get_run_status(simulation_id: str): @simulation_bp.route('//run-status/detail', methods=['GET']) def get_run_status_detail(simulation_id: str): """ - 获取模拟运行详细状态(包含所有动作) + Get detailed simulation run status (including all actions) - 用于前端展示实时动态 + For frontend real-time display - Query参数: - platform: 过滤平台(twitter/reddit,可选) + Query parameters: + platform: Filter platform (twitter/reddit, optional) - 返回: + Response: { "success": true, "data": { @@ -1787,8 +1787,8 @@ def get_run_status_detail(simulation_id: str): }, ... ], - "twitter_actions": [...], # Twitter 平台的所有动作 - "reddit_actions": [...] # Reddit 平台的所有动作 + "twitter_actions": [...], # All actions from the Twitter platform + "reddit_actions": [...] 
# All actions from the Reddit platform } } """ @@ -1808,13 +1808,13 @@ def get_run_status_detail(simulation_id: str): } }) - # 获取完整的动作列表 + # Get the complete action list all_actions = SimulationRunner.get_all_actions( simulation_id=simulation_id, platform=platform_filter ) - # 分平台获取动作 + # Get actions by platform twitter_actions = SimulationRunner.get_all_actions( simulation_id=simulation_id, platform="twitter" @@ -1825,7 +1825,7 @@ def get_run_status_detail(simulation_id: str): platform="reddit" ) if not platform_filter or platform_filter == "reddit" else [] - # 获取当前轮次的动作(recent_actions 只展示最新一轮) + # Get current round actions (recent_actions only shows the latest round) current_round = run_state.current_round recent_actions = SimulationRunner.get_all_actions( simulation_id=simulation_id, @@ -1833,13 +1833,13 @@ def get_run_status_detail(simulation_id: str): round_num=current_round ) if current_round > 0 else [] - # 获取基础状态信息 + # Get basic status info result = run_state.to_dict() result["all_actions"] = [a.to_dict() for a in all_actions] result["twitter_actions"] = [a.to_dict() for a in twitter_actions] result["reddit_actions"] = [a.to_dict() for a in reddit_actions] result["rounds_count"] = len(run_state.rounds) - # recent_actions 只展示当前最新一轮两个平台的内容 + # recent_actions only shows the latest round from both platforms result["recent_actions"] = [a.to_dict() for a in recent_actions] return jsonify({ @@ -1848,7 +1848,7 @@ def get_run_status_detail(simulation_id: str): }) except Exception as e: - logger.error(f"获取详细状态失败: {str(e)}") + logger.error(f"Failed to get detailed status: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1859,16 +1859,16 @@ def get_run_status_detail(simulation_id: str): @simulation_bp.route('//actions', methods=['GET']) def get_simulation_actions(simulation_id: str): """ - 获取模拟中的Agent动作历史 + Get Agent action history in the simulation - Query参数: - limit: 返回数量(默认100) - offset: 偏移量(默认0) - platform: 过滤平台(twitter/reddit) - agent_id: 过滤Agent 
ID - round_num: 过滤轮次 + Query parameters: + limit: Return count (default 100) + offset: Offset (default 0) + platform: Filter platform (twitter/reddit) + agent_id: Filter Agent ID + round_num: Filter round - 返回: + Response: { "success": true, "data": { @@ -1902,7 +1902,7 @@ def get_simulation_actions(simulation_id: str): }) except Exception as e: - logger.error(f"获取动作历史失败: {str(e)}") + logger.error(f"Failed to get action history: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1913,15 +1913,15 @@ def get_simulation_actions(simulation_id: str): @simulation_bp.route('//timeline', methods=['GET']) def get_simulation_timeline(simulation_id: str): """ - 获取模拟时间线(按轮次汇总) + Get simulation timeline (summarized by round) - 用于前端展示进度条和时间线视图 + For frontend progress bar and timeline view display - Query参数: - start_round: 起始轮次(默认0) - end_round: 结束轮次(默认全部) + Query parameters: + start_round: Start round (default 0) + end_round: End round (default all) - 返回每轮的汇总信息 + Returns summary info for each round """ try: start_round = request.args.get('start_round', 0, type=int) @@ -1942,7 +1942,7 @@ def get_simulation_timeline(simulation_id: str): }) except Exception as e: - logger.error(f"获取时间线失败: {str(e)}") + logger.error(f"Failed to get timeline: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1953,9 +1953,9 @@ def get_simulation_timeline(simulation_id: str): @simulation_bp.route('//agent-stats', methods=['GET']) def get_agent_stats(simulation_id: str): """ - 获取每个Agent的统计信息 + Get statistics for each Agent - 用于前端展示Agent活跃度排行、动作分布等 + For frontend Agent activity ranking, action distribution display, etc. 
""" try: stats = SimulationRunner.get_agent_stats(simulation_id) @@ -1969,7 +1969,7 @@ def get_agent_stats(simulation_id: str): }) except Exception as e: - logger.error(f"获取Agent统计失败: {str(e)}") + logger.error(f"Failed to get Agent statistics: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -1977,19 +1977,19 @@ def get_agent_stats(simulation_id: str): }), 500 -# ============== 数据库查询接口 ============== +# ============== Database query endpoints ============== @simulation_bp.route('//posts', methods=['GET']) def get_simulation_posts(simulation_id: str): """ - 获取模拟中的帖子 + Get posts from the simulation - Query参数: - platform: 平台类型(twitter/reddit) - limit: 返回数量(默认50) - offset: 偏移量 + Query parameters: + platform: Platform type (twitter/reddit) + limit: Return count (default 50) + offset: Offset - 返回帖子列表(从SQLite数据库读取) + Returns post list (read from SQLite database) """ try: platform = request.args.get('platform', 'reddit') @@ -2011,7 +2011,7 @@ def get_simulation_posts(simulation_id: str): "platform": platform, "count": 0, "posts": [], - "message": "数据库不存在,模拟可能尚未运行" + "message": "Database does not exist, simulation may not have been run yet" } }) @@ -2049,7 +2049,7 @@ def get_simulation_posts(simulation_id: str): }) except Exception as e: - logger.error(f"获取帖子失败: {str(e)}") + logger.error(f"Failed to get posts: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -2060,12 +2060,12 @@ def get_simulation_posts(simulation_id: str): @simulation_bp.route('//comments', methods=['GET']) def get_simulation_comments(simulation_id: str): """ - 获取模拟中的评论(仅Reddit) + Get comments from the simulation (Reddit only) - Query参数: - post_id: 过滤帖子ID(可选) - limit: 返回数量 - offset: 偏移量 + Query parameters: + post_id: Filter by post ID (optional) + limit: Return count + offset: Offset """ try: post_id = request.args.get('post_id') @@ -2124,7 +2124,7 @@ def get_simulation_comments(simulation_id: str): }) except Exception as e: - logger.error(f"获取评论失败: {str(e)}") + 
logger.error(f"Failed to get comments: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -2132,31 +2132,31 @@ def get_simulation_comments(simulation_id: str): }), 500 -# ============== Interview 采访接口 ============== +# ============== Interview endpoints ============== @simulation_bp.route('/interview', methods=['POST']) def interview_agent(): """ - 采访单个Agent + Interview a single Agent - 注意:此功能需要模拟环境处于运行状态(完成模拟循环后进入等待命令模式) + Note: This feature requires the simulation environment to be running (in command waiting mode after completing the simulation loop) - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "agent_id": 0, // 必填,Agent ID - "prompt": "你对这件事有什么看法?", // 必填,采访问题 - "platform": "twitter", // 可选,指定平台(twitter/reddit) - // 不指定时:双平台模拟同时采访两个平台 - "timeout": 60 // 可选,超时时间(秒),默认60 + "simulation_id": "sim_xxxx", // Required, simulation ID + "agent_id": 0, // Required, Agent ID + "prompt": "What do you think about this?", // Required, interview question + "platform": "twitter", // Optional, specify platform (twitter/reddit) + // When not specified: dual-platform simulation interviews both platforms simultaneously + "timeout": 60 // Optional, timeout in seconds, default 60 } - 返回(不指定platform,双平台模式): + Response (without specifying platform, dual-platform mode): { "success": true, "data": { "agent_id": 0, - "prompt": "你对这件事有什么看法?", + "prompt": "What do you think about this?", "result": { "agent_id": 0, "prompt": "...", @@ -2169,15 +2169,15 @@ def interview_agent(): } } - 返回(指定platform): + Response (with specified platform): { "success": true, "data": { "agent_id": 0, - "prompt": "你对这件事有什么看法?", + "prompt": "What do you think about this?", "result": { "agent_id": 0, - "response": "我认为...", + "response": "I think...", "platform": "twitter", "timestamp": "2025-12-08T10:00:00" }, @@ -2191,42 +2191,42 @@ def interview_agent(): simulation_id = data.get('simulation_id') agent_id = data.get('agent_id') prompt = data.get('prompt') - platform =
data.get('platform') # 可选:twitter/reddit/None + platform = data.get('platform') # Optional: twitter/reddit/None timeout = data.get('timeout', 60) if not simulation_id: return jsonify({ "success": False, - "error": "请提供 simulation_id" + "error": "Please provide simulation_id" }), 400 if agent_id is None: return jsonify({ "success": False, - "error": "请提供 agent_id" + "error": "Please provide agent_id" }), 400 if not prompt: return jsonify({ "success": False, - "error": "请提供 prompt(采访问题)" + "error": "Please provide prompt (interview question)" }), 400 - # 验证platform参数 + # Validate platform parameter if platform and platform not in ("twitter", "reddit"): return jsonify({ "success": False, - "error": "platform 参数只能是 'twitter' 或 'reddit'" + "error": "platform parameter can only be 'twitter' or 'reddit'" }), 400 - # 检查环境状态 + # Check environment status if not SimulationRunner.check_env_alive(simulation_id): return jsonify({ "success": False, - "error": "模拟环境未运行或已关闭。请确保模拟已完成并进入等待命令模式。" + "error": "Simulation environment is not running or has been closed. Please ensure the simulation has completed and entered command waiting mode." 
}), 400 - # 优化prompt,添加前缀避免Agent调用工具 + # Optimize prompt by adding prefix to prevent Agent from calling tools optimized_prompt = optimize_interview_prompt(prompt) result = SimulationRunner.interview_agent( @@ -2251,11 +2251,11 @@ def interview_agent(): except TimeoutError as e: return jsonify({ "success": False, - "error": f"等待Interview响应超时: {str(e)}" + "error": f"Timed out waiting for Interview response: {str(e)}" }), 504 except Exception as e: - logger.error(f"Interview失败: {str(e)}") + logger.error(f"Interview failed: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -2266,30 +2266,30 @@ def interview_agent(): @simulation_bp.route('/interview/batch', methods=['POST']) def interview_agents_batch(): """ - 批量采访多个Agent + Batch interview multiple Agents - 注意:此功能需要模拟环境处于运行状态 + Note: This feature requires the simulation environment to be running - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "interviews": [ // 必填,采访列表 + "simulation_id": "sim_xxxx", // Required, simulation ID + "interviews": [ // Required, interview list { "agent_id": 0, - "prompt": "你对A有什么看法?", - "platform": "twitter" // 可选,指定该Agent的采访平台 + "prompt": "What do you think about A?", + "platform": "twitter" // Optional, specify this Agent's interview platform }, { "agent_id": 1, - "prompt": "你对B有什么看法?" // 不指定platform则使用默认值 + "prompt": "What do you think about B?" 
// Uses default if platform not specified } ], - "platform": "reddit", // 可选,默认平台(被每项的platform覆盖) - // 不指定时:双平台模拟每个Agent同时采访两个平台 - "timeout": 120 // 可选,超时时间(秒),默认120 + "platform": "reddit", // Optional, default platform (overridden by each item's platform) + // When not specified: dual-platform simulation interviews each Agent on both platforms simultaneously + "timeout": 120 // Optional, timeout in seconds, default 120 } - 返回: + Response: { "success": true, "data": { @@ -2312,56 +2312,56 @@ simulation_id = data.get('simulation_id') interviews = data.get('interviews') - platform = data.get('platform') # 可选:twitter/reddit/None + platform = data.get('platform') # Optional: twitter/reddit/None timeout = data.get('timeout', 120) if not simulation_id: return jsonify({ "success": False, - "error": "请提供 simulation_id" + "error": "Please provide simulation_id" }), 400 if not interviews or not isinstance(interviews, list): return jsonify({ "success": False, - "error": "请提供 interviews(采访列表)" + "error": "Please provide interviews (interview list)" }), 400 - # 验证platform参数 + # Validate platform parameter if platform and platform not in ("twitter", "reddit"): return jsonify({ "success": False, - "error": "platform 参数只能是 'twitter' 或 'reddit'" + "error": "platform parameter can only be 'twitter' or 'reddit'" }), 400 - # 验证每个采访项 + # Validate each interview item for i, interview in enumerate(interviews): if 'agent_id' not in interview: return jsonify({ "success": False, - "error": f"采访列表第{i+1}项缺少 agent_id" + "error": f"Interview list item #{i+1} missing agent_id" }), 400 if 'prompt' not in interview: return jsonify({ "success": False, - "error": f"采访列表第{i+1}项缺少 prompt" + "error": f"Interview list item #{i+1} missing prompt" }), 400 - # 验证每项的platform(如果有) + # Validate each item's platform (if present) item_platform = interview.get('platform') if item_platform and item_platform not in ("twitter", "reddit"): return jsonify({ "success": False, - "error":
f"采访列表第{i+1}项的platform只能是 'twitter' 或 'reddit'" + "error": f"Interview list item #{i+1}'s platform can only be 'twitter' or 'reddit'" }), 400 - # 检查环境状态 + # Check environment status if not SimulationRunner.check_env_alive(simulation_id): return jsonify({ "success": False, - "error": "模拟环境未运行或已关闭。请确保模拟已完成并进入等待命令模式。" + "error": "Simulation environment is not running or has been closed. Please ensure the simulation has completed and entered command waiting mode." }), 400 - # 优化每个采访项的prompt,添加前缀避免Agent调用工具 + # Optimize each interview item's prompt by adding prefix to prevent Agent from calling tools optimized_interviews = [] for interview in interviews: optimized_interview = interview.copy() @@ -2389,11 +2389,11 @@ def interview_agents_batch(): except TimeoutError as e: return jsonify({ "success": False, - "error": f"等待批量Interview响应超时: {str(e)}" + "error": f"Timed out waiting for batch Interview response: {str(e)}" }), 504 except Exception as e: - logger.error(f"批量Interview失败: {str(e)}") + logger.error(f"Batch Interview failed: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -2404,20 +2404,20 @@ def interview_agents_batch(): @simulation_bp.route('/interview/all', methods=['POST']) def interview_all_agents(): """ - 全局采访 - 使用相同问题采访所有Agent + Global interview - interview all Agents with the same question - 注意:此功能需要模拟环境处于运行状态 + Note: This feature requires the simulation environment to be running - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "prompt": "你对这件事整体有什么看法?", // 必填,采访问题(所有Agent使用相同问题) - "platform": "reddit", // 可选,指定平台(twitter/reddit) - // 不指定时:双平台模拟每个Agent同时采访两个平台 - "timeout": 180 // 可选,超时时间(秒),默认180 + "simulation_id": "sim_xxxx", // Required, simulation ID + "prompt": "What is your overall view on this?", // Required, interview question (same for all Agents) + "platform": "reddit", // Optional, specify platform (twitter/reddit) + // When not specified: dual-platform simulation interviews each Agent on both platforms 
simultaneously + "timeout": 180 // Optional, timeout in seconds, default 180 } - 返回: + Response: { "success": true, "data": { @@ -2439,36 +2439,36 @@ def interview_all_agents(): simulation_id = data.get('simulation_id') prompt = data.get('prompt') - platform = data.get('platform') # 可选:twitter/reddit/None + platform = data.get('platform') # Optional: twitter/reddit/None timeout = data.get('timeout', 180) if not simulation_id: return jsonify({ "success": False, - "error": "请提供 simulation_id" + "error": "Please provide simulation_id" }), 400 if not prompt: return jsonify({ "success": False, - "error": "请提供 prompt(采访问题)" + "error": "Please provide prompt (interview question)" }), 400 - # 验证platform参数 + # Validate platform parameter if platform and platform not in ("twitter", "reddit"): return jsonify({ "success": False, - "error": "platform 参数只能是 'twitter' 或 'reddit'" + "error": "platform parameter can only be 'twitter' or 'reddit'" }), 400 - # 检查环境状态 + # Check environment status if not SimulationRunner.check_env_alive(simulation_id): return jsonify({ "success": False, - "error": "模拟环境未运行或已关闭。请确保模拟已完成并进入等待命令模式。" + "error": "Simulation environment is not running or has been closed. Please ensure the simulation has completed and entered command waiting mode." 
}), 400 - # 优化prompt,添加前缀避免Agent调用工具 + # Optimize prompt by adding prefix to prevent Agent from calling tools optimized_prompt = optimize_interview_prompt(prompt) result = SimulationRunner.interview_all_agents( @@ -2492,11 +2492,11 @@ def interview_all_agents(): except TimeoutError as e: return jsonify({ "success": False, - "error": f"等待全局Interview响应超时: {str(e)}" + "error": f"Timed out waiting for global Interview response: {str(e)}" }), 504 except Exception as e: - logger.error(f"全局Interview失败: {str(e)}") + logger.error(f"Global Interview failed: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -2507,20 +2507,20 @@ def interview_all_agents(): @simulation_bp.route('/interview/history', methods=['POST']) def get_interview_history(): """ - 获取Interview历史记录 + Get Interview history - 从模拟数据库中读取所有Interview记录 + Read all Interview records from simulation database - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "platform": "reddit", // 可选,平台类型(reddit/twitter) - // 不指定则返回两个平台的所有历史 - "agent_id": 0, // 可选,只获取该Agent的采访历史 - "limit": 100 // 可选,返回数量,默认100 + "simulation_id": "sim_xxxx", // Required, simulation ID + "platform": "reddit", // Optional, platform type (reddit/twitter) + // Returns history from both platforms if not specified + "agent_id": 0, // Optional, get only this Agent's interview history + "limit": 100 // Optional, return count, default 100 } - 返回: + Response: { "success": true, "data": { @@ -2528,8 +2528,8 @@ def get_interview_history(): "history": [ { "agent_id": 0, - "response": "我认为...", - "prompt": "你对这件事有什么看法?", + "response": "I think...", + "prompt": "What do you think about this?", "timestamp": "2025-12-08T10:00:00", "platform": "reddit" }, @@ -2542,14 +2542,14 @@ def get_interview_history(): data = request.get_json() or {} simulation_id = data.get('simulation_id') - platform = data.get('platform') # 不指定则返回两个平台的历史 + platform = data.get('platform') # Returns history from both platforms if not specified agent_id = 
data.get('agent_id') limit = data.get('limit', 100) if not simulation_id: return jsonify({ "success": False, - "error": "请提供 simulation_id" + "error": "Please provide simulation_id" }), 400 history = SimulationRunner.get_interview_history( @@ -2568,7 +2568,7 @@ def get_interview_history(): }) except Exception as e: - logger.error(f"获取Interview历史失败: {str(e)}") + logger.error(f"Failed to get Interview history: {str(e)}") return jsonify({ "success": False, "error": str(e), @@ -2579,16 +2579,16 @@ def get_interview_history(): @simulation_bp.route('/env-status', methods=['POST']) def get_env_status(): """ - 获取模拟环境状态 + Get simulation environment status - 检查模拟环境是否存活(可以接收Interview命令) + Check if the simulation environment is alive (can receive Interview commands) - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx" // 必填,模拟ID + "simulation_id": "sim_xxxx" // Required, simulation ID } - 返回: + Response: { "success": true, "data": { @@ -2596,7 +2596,7 @@ def get_env_status(): "env_alive": true, "twitter_available": true, "reddit_available": true, - "message": "环境正在运行,可以接收Interview命令" + "message": "Environment is running and can receive Interview commands" } } """ @@ -2608,18 +2608,18 @@ def get_env_status(): if not simulation_id: return jsonify({ "success": False, - "error": "请提供 simulation_id" + "error": "Please provide simulation_id" }), 400 env_alive = SimulationRunner.check_env_alive(simulation_id) - # 获取更详细的状态信息 + # Get more detailed status info env_status = SimulationRunner.get_env_status_detail(simulation_id) if env_alive: - message = "环境正在运行,可以接收Interview命令" + message = "Environment is running and can receive Interview commands" else: - message = "环境未运行或已关闭" + message = "Environment is not running or has been closed" return jsonify({ "success": True, @@ -2633,7 +2633,7 @@ def get_env_status(): }) except Exception as e: - logger.error(f"获取环境状态失败: {str(e)}") + logger.error(f"Failed to get environment status: {str(e)}") return jsonify({ "success": False, "error": 
str(e), @@ -2644,24 +2644,24 @@ def get_env_status(): @simulation_bp.route('/close-env', methods=['POST']) def close_simulation_env(): """ - 关闭模拟环境 + Close simulation environment - 向模拟发送关闭环境命令,使其优雅退出等待命令模式。 + Send close environment command to the simulation for graceful exit from command waiting mode. - 注意:这不同于 /stop 接口,/stop 会强制终止进程, - 而此接口会让模拟优雅地关闭环境并退出。 + Note: This differs from the /stop endpoint, which forcefully terminates the process, + while this endpoint lets the simulation gracefully close the environment and exit. - 请求(JSON): + Request (JSON): { - "simulation_id": "sim_xxxx", // 必填,模拟ID - "timeout": 30 // 可选,超时时间(秒),默认30 + "simulation_id": "sim_xxxx", // Required, simulation ID + "timeout": 30 // Optional, timeout in seconds, default 30 } - 返回: + Response: { "success": true, "data": { - "message": "环境关闭命令已发送", + "message": "Close environment command sent", "result": {...}, "timestamp": "2025-12-08T10:00:01" } @@ -2676,7 +2676,7 @@ def close_simulation_env(): if not simulation_id: return jsonify({ "success": False, - "error": "请提供 simulation_id" + "error": "Please provide simulation_id" }), 400 result = SimulationRunner.close_simulation_env( @@ -2684,7 +2684,7 @@ def close_simulation_env(): timeout=timeout ) - # 更新模拟状态 + # Update simulation status manager = SimulationManager() state = manager.get_simulation(simulation_id) if state: @@ -2703,7 +2703,7 @@ def close_simulation_env(): }), 400 except Exception as e: - logger.error(f"关闭环境失败: {str(e)}") + logger.error(f"Failed to close environment: {str(e)}") return jsonify({ "success": False, "error": str(e), diff --git a/backend/app/config.py b/backend/app/config.py index 6b8eb75..1cae133 100644 --- a/backend/app/config.py +++ b/backend/app/config.py @@ -1,60 +1,60 @@ """ -配置管理 -统一从项目根目录的 .env 文件加载配置 +Configuration management +Loads configuration from the .env file in the project root directory """ import os from dotenv import load_dotenv -# 加载项目根目录的 .env 文件 -# 路径: MiroFish/.env (相对于 backend/app/config.py) +# 
Load .env file from project root directory +# Path: MiroFish/.env (relative to backend/app/config.py) project_root_env = os.path.join(os.path.dirname(__file__), '../../.env') if os.path.exists(project_root_env): load_dotenv(project_root_env, override=True) else: - # 如果根目录没有 .env,尝试加载环境变量(用于生产环境) + # If no .env in root directory, try loading environment variables (for production) load_dotenv(override=True) class Config: - """Flask配置类""" + """Flask configuration class""" - # Flask配置 + # Flask configuration SECRET_KEY = os.environ.get('SECRET_KEY', 'mirofish-secret-key') DEBUG = os.environ.get('FLASK_DEBUG', 'True').lower() == 'true' - # JSON配置 - 禁用ASCII转义,让中文直接显示(而不是 \uXXXX 格式) + # JSON configuration - disable ASCII escaping so non-ASCII characters display directly (instead of \uXXXX format) JSON_AS_ASCII = False - # LLM配置(统一使用OpenAI格式) + # LLM configuration (unified OpenAI format) LLM_API_KEY = os.environ.get('LLM_API_KEY') LLM_BASE_URL = os.environ.get('LLM_BASE_URL', 'http://localhost:11434/v1') LLM_MODEL_NAME = os.environ.get('LLM_MODEL_NAME', 'qwen2.5:32b') - # Neo4j配置 + # Neo4j configuration NEO4J_URI = os.environ.get('NEO4J_URI', 'bolt://localhost:7687') NEO4J_USER = os.environ.get('NEO4J_USER', 'neo4j') NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD', 'mirofish') - # Embedding配置 + # Embedding configuration EMBEDDING_MODEL = os.environ.get('EMBEDDING_MODEL', 'nomic-embed-text') EMBEDDING_BASE_URL = os.environ.get('EMBEDDING_BASE_URL', 'http://localhost:11434') - # 文件上传配置 + # File upload configuration MAX_CONTENT_LENGTH = 50 * 1024 * 1024 # 50MB UPLOAD_FOLDER = os.path.join(os.path.dirname(__file__), '../uploads') ALLOWED_EXTENSIONS = {'pdf', 'md', 'txt', 'markdown'} - # 文本处理配置 - DEFAULT_CHUNK_SIZE = 500 # 默认切块大小 - DEFAULT_CHUNK_OVERLAP = 50 # 默认重叠大小 + # Text processing configuration + DEFAULT_CHUNK_SIZE = 500 # Default chunk size + DEFAULT_CHUNK_OVERLAP = 50 # Default overlap size - # OASIS模拟配置 + # OASIS simulation configuration OASIS_DEFAULT_MAX_ROUNDS =
int(os.environ.get('OASIS_DEFAULT_MAX_ROUNDS', '10')) OASIS_SIMULATION_DATA_DIR = os.path.join(os.path.dirname(__file__), '../uploads/simulations') - # OASIS平台可用动作配置 + # OASIS platform available actions configuration OASIS_TWITTER_ACTIONS = [ 'CREATE_POST', 'LIKE_POST', 'REPOST', 'FOLLOW', 'DO_NOTHING', 'QUOTE_POST' ] @@ -64,19 +64,19 @@ class Config: 'TREND', 'REFRESH', 'DO_NOTHING', 'FOLLOW', 'MUTE' ] - # Report Agent配置 + # Report Agent configuration REPORT_AGENT_MAX_TOOL_CALLS = int(os.environ.get('REPORT_AGENT_MAX_TOOL_CALLS', '5')) REPORT_AGENT_MAX_REFLECTION_ROUNDS = int(os.environ.get('REPORT_AGENT_MAX_REFLECTION_ROUNDS', '2')) REPORT_AGENT_TEMPERATURE = float(os.environ.get('REPORT_AGENT_TEMPERATURE', '0.5')) @classmethod def validate(cls): - """验证必要配置""" + """Validate required configuration""" errors = [] if not cls.LLM_API_KEY: - errors.append("LLM_API_KEY 未配置 (设置为任意非空值, 例如 'ollama')") + errors.append("LLM_API_KEY is not configured (set to any non-empty value, e.g. 'ollama')") if not cls.NEO4J_URI: - errors.append("NEO4J_URI 未配置") + errors.append("NEO4J_URI is not configured") if not cls.NEO4J_PASSWORD: - errors.append("NEO4J_PASSWORD 未配置") + errors.append("NEO4J_PASSWORD is not configured") return errors diff --git a/backend/app/models/__init__.py b/backend/app/models/__init__.py index 55bec61..b348366 100644 --- a/backend/app/models/__init__.py +++ b/backend/app/models/__init__.py @@ -1,5 +1,5 @@ """ -数据模型模块 +Data models module """ from .task import TaskManager, TaskStatus diff --git a/backend/app/models/project.py b/backend/app/models/project.py index 0897893..b975465 100644 --- a/backend/app/models/project.py +++ b/backend/app/models/project.py @@ -1,6 +1,6 @@ """ -项目上下文管理 -用于在服务端持久化项目状态,避免前端在接口间传递大量数据 +Project context management +Used to persist project state on the server side, avoiding the need to pass large data between API calls from the frontend """ import os @@ -15,45 +15,45 @@ class ProjectStatus(str, Enum): - """项目状态""" - CREATED = "created" 
# 刚创建,文件已上传 - ONTOLOGY_GENERATED = "ontology_generated" # 本体已生成 - GRAPH_BUILDING = "graph_building" # 图谱构建中 - GRAPH_COMPLETED = "graph_completed" # 图谱构建完成 - FAILED = "failed" # 失败 + """Project status""" + CREATED = "created" # Just created, files uploaded + ONTOLOGY_GENERATED = "ontology_generated" # Ontology generated + GRAPH_BUILDING = "graph_building" # Graph under construction + GRAPH_COMPLETED = "graph_completed" # Graph construction completed + FAILED = "failed" # Failed @dataclass class Project: - """项目数据模型""" + """Project data model""" project_id: str name: str status: ProjectStatus created_at: str updated_at: str - - # 文件信息 + + # File information files: List[Dict[str, str]] = field(default_factory=list) # [{filename, path, size}] total_text_length: int = 0 - - # 本体信息(接口1生成后填充) + + # Ontology information (populated after API step 1) ontology: Optional[Dict[str, Any]] = None analysis_summary: Optional[str] = None - - # 图谱信息(接口2完成后填充) + + # Graph information (populated after API step 2) graph_id: Optional[str] = None graph_build_task_id: Optional[str] = None - - # 配置 + + # Configuration simulation_requirement: Optional[str] = None chunk_size: int = 500 chunk_overlap: int = 50 - - # 错误信息 + + # Error information error: Optional[str] = None - + def to_dict(self) -> Dict[str, Any]: - """转换为字典""" + """Convert to dictionary""" return { "project_id": self.project_id, "name": self.name, @@ -71,14 +71,14 @@ def to_dict(self) -> Dict[str, Any]: "chunk_overlap": self.chunk_overlap, "error": self.error } - + @classmethod def from_dict(cls, data: Dict[str, Any]) -> 'Project': - """从字典创建""" + """Create from dictionary""" status = data.get('status', 'created') if isinstance(status, str): status = ProjectStatus(status) - + return cls( project_id=data['project_id'], name=data.get('name', 'Unnamed Project'), @@ -99,52 +99,52 @@ def from_dict(cls, data: Dict[str, Any]) -> 'Project': class ProjectManager: - """项目管理器 - 负责项目的持久化存储和检索""" - - # 项目存储根目录 + """Project manager - handles 
persistent storage and retrieval of projects""" + + # Project storage root directory PROJECTS_DIR = os.path.join(Config.UPLOAD_FOLDER, 'projects') - + @classmethod def _ensure_projects_dir(cls): - """确保项目目录存在""" + """Ensure the projects directory exists""" os.makedirs(cls.PROJECTS_DIR, exist_ok=True) - + @classmethod def _get_project_dir(cls, project_id: str) -> str: - """获取项目目录路径""" + """Get the project directory path""" return os.path.join(cls.PROJECTS_DIR, project_id) - + @classmethod def _get_project_meta_path(cls, project_id: str) -> str: - """获取项目元数据文件路径""" + """Get the project metadata file path""" return os.path.join(cls._get_project_dir(project_id), 'project.json') - + @classmethod def _get_project_files_dir(cls, project_id: str) -> str: - """获取项目文件存储目录""" + """Get the project file storage directory""" return os.path.join(cls._get_project_dir(project_id), 'files') - + @classmethod def _get_project_text_path(cls, project_id: str) -> str: - """获取项目提取文本存储路径""" + """Get the project extracted text storage path""" return os.path.join(cls._get_project_dir(project_id), 'extracted_text.txt') - + @classmethod def create_project(cls, name: str = "Unnamed Project") -> Project: """ - 创建新项目 - + Create a new project + Args: - name: 项目名称 - + name: Project name + Returns: - 新创建的Project对象 + Newly created Project object """ cls._ensure_projects_dir() - + project_id = f"proj_{uuid.uuid4().hex[:12]}" now = datetime.now().isoformat() - + project = Project( project_id=project_id, name=name, @@ -152,154 +152,153 @@ def create_project(cls, name: str = "Unnamed Project") -> Project: created_at=now, updated_at=now ) - - # 创建项目目录结构 + + # Create project directory structure project_dir = cls._get_project_dir(project_id) files_dir = cls._get_project_files_dir(project_id) os.makedirs(project_dir, exist_ok=True) os.makedirs(files_dir, exist_ok=True) - - # 保存项目元数据 + + # Save project metadata cls.save_project(project) - + return project - + @classmethod def save_project(cls, project: 
Project) -> None: - """保存项目元数据""" + """Save project metadata""" project.updated_at = datetime.now().isoformat() meta_path = cls._get_project_meta_path(project.project_id) - + with open(meta_path, 'w', encoding='utf-8') as f: json.dump(project.to_dict(), f, ensure_ascii=False, indent=2) - + @classmethod def get_project(cls, project_id: str) -> Optional[Project]: """ - 获取项目 - + Get a project + Args: - project_id: 项目ID - + project_id: Project ID + Returns: - Project对象,如果不存在返回None + Project object, or None if not found """ meta_path = cls._get_project_meta_path(project_id) - + if not os.path.exists(meta_path): return None - + with open(meta_path, 'r', encoding='utf-8') as f: data = json.load(f) - + return Project.from_dict(data) - + @classmethod def list_projects(cls, limit: int = 50) -> List[Project]: """ - 列出所有项目 - + List all projects + Args: - limit: 返回数量限制 - + limit: Maximum number of results + Returns: - 项目列表,按创建时间倒序 + List of projects, sorted by creation time descending """ cls._ensure_projects_dir() - + projects = [] for project_id in os.listdir(cls.PROJECTS_DIR): project = cls.get_project(project_id) if project: projects.append(project) - - # 按创建时间倒序排序 + + # Sort by creation time descending projects.sort(key=lambda p: p.created_at, reverse=True) - + return projects[:limit] - + @classmethod def delete_project(cls, project_id: str) -> bool: """ - 删除项目及其所有文件 - + Delete a project and all its files + Args: - project_id: 项目ID - + project_id: Project ID + Returns: - 是否删除成功 + Whether the deletion was successful """ project_dir = cls._get_project_dir(project_id) - + if not os.path.exists(project_dir): return False - + shutil.rmtree(project_dir) return True - + @classmethod def save_file_to_project(cls, project_id: str, file_storage, original_filename: str) -> Dict[str, str]: """ - 保存上传的文件到项目目录 - + Save an uploaded file to the project directory + Args: - project_id: 项目ID - file_storage: Flask的FileStorage对象 - original_filename: 原始文件名 - + project_id: Project ID + 
file_storage: Flask FileStorage object + original_filename: Original filename + Returns: - 文件信息字典 {filename, path, size} + File info dictionary {filename, path, size} """ files_dir = cls._get_project_files_dir(project_id) os.makedirs(files_dir, exist_ok=True) - - # 生成安全的文件名 + + # Generate a safe filename ext = os.path.splitext(original_filename)[1].lower() safe_filename = f"{uuid.uuid4().hex[:8]}{ext}" file_path = os.path.join(files_dir, safe_filename) - - # 保存文件 + + # Save file file_storage.save(file_path) - - # 获取文件大小 + + # Get file size file_size = os.path.getsize(file_path) - + return { "original_filename": original_filename, "saved_filename": safe_filename, "path": file_path, "size": file_size } - + @classmethod def save_extracted_text(cls, project_id: str, text: str) -> None: - """保存提取的文本""" + """Save extracted text""" text_path = cls._get_project_text_path(project_id) with open(text_path, 'w', encoding='utf-8') as f: f.write(text) - + @classmethod def get_extracted_text(cls, project_id: str) -> Optional[str]: - """获取提取的文本""" + """Get extracted text""" text_path = cls._get_project_text_path(project_id) - + if not os.path.exists(text_path): return None - + with open(text_path, 'r', encoding='utf-8') as f: return f.read() - + @classmethod def get_project_files(cls, project_id: str) -> List[str]: - """获取项目的所有文件路径""" + """Get all file paths for a project""" files_dir = cls._get_project_files_dir(project_id) - + if not os.path.exists(files_dir): return [] - + return [ - os.path.join(files_dir, f) - for f in os.listdir(files_dir) + os.path.join(files_dir, f) + for f in os.listdir(files_dir) if os.path.isfile(os.path.join(files_dir, f)) ] - diff --git a/backend/app/models/task.py b/backend/app/models/task.py index e15f35f..25dd480 100644 --- a/backend/app/models/task.py +++ b/backend/app/models/task.py @@ -1,6 +1,6 @@ """ -任务状态管理 -用于跟踪长时间运行的任务(如图谱构建) +Task status management +Used to track long-running tasks (such as graph construction) """ import uuid @@ -12,30 
+12,30 @@ class TaskStatus(str, Enum): - """任务状态枚举""" - PENDING = "pending" # 等待中 - PROCESSING = "processing" # 处理中 - COMPLETED = "completed" # 已完成 - FAILED = "failed" # 失败 + """Task status enum""" + PENDING = "pending" # Waiting + PROCESSING = "processing" # In progress + COMPLETED = "completed" # Completed + FAILED = "failed" # Failed @dataclass class Task: - """任务数据类""" + """Task data class""" task_id: str task_type: str status: TaskStatus created_at: datetime updated_at: datetime - progress: int = 0 # 总进度百分比 0-100 - message: str = "" # 状态消息 - result: Optional[Dict] = None # 任务结果 - error: Optional[str] = None # 错误信息 - metadata: Dict = field(default_factory=dict) # 额外元数据 - progress_detail: Dict = field(default_factory=dict) # 详细进度信息 - + progress: int = 0 # Overall progress percentage 0-100 + message: str = "" # Status message + result: Optional[Dict] = None # Task result + error: Optional[str] = None # Error message + metadata: Dict = field(default_factory=dict) # Additional metadata + progress_detail: Dict = field(default_factory=dict) # Detailed progress info + def to_dict(self) -> Dict[str, Any]: - """转换为字典""" + """Convert to dictionary""" return { "task_id": self.task_id, "task_type": self.task_type, @@ -53,15 +53,15 @@ def to_dict(self) -> Dict[str, Any]: class TaskManager: """ - 任务管理器 - 线程安全的任务状态管理 + Task manager + Thread-safe task status management """ - + _instance = None _lock = threading.Lock() - + def __new__(cls): - """单例模式""" + """Singleton pattern""" if cls._instance is None: with cls._lock: if cls._instance is None: @@ -69,21 +69,21 @@ def __new__(cls): cls._instance._tasks: Dict[str, Task] = {} cls._instance._task_lock = threading.Lock() return cls._instance - + def create_task(self, task_type: str, metadata: Optional[Dict] = None) -> str: """ - 创建新任务 - + Create a new task + Args: - task_type: 任务类型 - metadata: 额外元数据 - + task_type: Task type + metadata: Additional metadata + Returns: - 任务ID + Task ID """ task_id = str(uuid.uuid4()) now = 
datetime.now() - + task = Task( task_id=task_id, task_type=task_type, @@ -92,17 +92,17 @@ def create_task(self, task_type: str, metadata: Optional[Dict] = None) -> str: updated_at=now, metadata=metadata or {} ) - + with self._task_lock: self._tasks[task_id] = task - + return task_id - + def get_task(self, task_id: str) -> Optional[Task]: - """获取任务""" + """Get a task""" with self._task_lock: return self._tasks.get(task_id) - + def update_task( self, task_id: str, @@ -114,16 +114,16 @@ def update_task( progress_detail: Optional[Dict] = None ): """ - 更新任务状态 - + Update task status + Args: - task_id: 任务ID - status: 新状态 - progress: 进度 - message: 消息 - result: 结果 - error: 错误信息 - progress_detail: 详细进度信息 + task_id: Task ID + status: New status + progress: Progress + message: Message + result: Result + error: Error message + progress_detail: Detailed progress info """ with self._task_lock: task = self._tasks.get(task_id) @@ -141,39 +141,39 @@ def update_task( task.error = error if progress_detail is not None: task.progress_detail = progress_detail - + def complete_task(self, task_id: str, result: Dict): - """标记任务完成""" + """Mark task as completed""" self.update_task( task_id, status=TaskStatus.COMPLETED, progress=100, - message="任务完成", + message="Task completed", result=result ) - + def fail_task(self, task_id: str, error: str): - """标记任务失败""" + """Mark task as failed""" self.update_task( task_id, status=TaskStatus.FAILED, - message="任务失败", + message="Task failed", error=error ) - + def list_tasks(self, task_type: Optional[str] = None) -> list: - """列出任务""" + """List tasks""" with self._task_lock: tasks = list(self._tasks.values()) if task_type: tasks = [t for t in tasks if t.task_type == task_type] return [t.to_dict() for t in sorted(tasks, key=lambda x: x.created_at, reverse=True)] - + def cleanup_old_tasks(self, max_age_hours: int = 24): - """清理旧任务""" + """Clean up old tasks""" from datetime import timedelta cutoff = datetime.now() - timedelta(hours=max_age_hours) - + with 
self._task_lock: old_ids = [ tid for tid, task in self._tasks.items() @@ -181,4 +181,3 @@ def cleanup_old_tasks(self, max_age_hours: int = 24): ] for tid in old_ids: del self._tasks[tid] - diff --git a/backend/app/services/__init__.py b/backend/app/services/__init__.py index 0328bc6..006f1d7 100644 --- a/backend/app/services/__init__.py +++ b/backend/app/services/__init__.py @@ -1,5 +1,5 @@ """ -业务服务模块 +Business service modules """ from .ontology_generator import OntologyGenerator diff --git a/backend/app/services/entity_reader.py b/backend/app/services/entity_reader.py index e5aac1b..ca1fff1 100644 --- a/backend/app/services/entity_reader.py +++ b/backend/app/services/entity_reader.py @@ -1,6 +1,6 @@ """ -实体读取与过滤服务 -从 Neo4j 图谱中读取节点,筛选出符合预定义实体类型的节点 +Entity Reading and Filtering Service +Reads nodes from Neo4j graph and filters those matching predefined entity types Replaces zep_entity_reader.py — all Zep Cloud calls replaced by GraphStorage. """ @@ -16,15 +16,15 @@ @dataclass class EntityNode: - """实体节点数据结构""" + """Entity node data structure""" uuid: str name: str labels: List[str] summary: str attributes: Dict[str, Any] - # 相关的边信息 + # Related edge info related_edges: List[Dict[str, Any]] = field(default_factory=list) - # 相关的其他节点信息 + # Related node info related_nodes: List[Dict[str, Any]] = field(default_factory=list) def to_dict(self) -> Dict[str, Any]: @@ -39,7 +39,7 @@ def to_dict(self) -> Dict[str, Any]: } def get_entity_type(self) -> Optional[str]: - """获取实体类型(排除默认的Entity标签)""" + """Get entity type (excluding default Entity label)""" for label in self.labels: if label not in ["Entity", "Node"]: return label @@ -48,7 +48,7 @@ def get_entity_type(self) -> Optional[str]: @dataclass class FilteredEntities: - """过滤后的实体集合""" + """Filtered entity set""" entities: List[EntityNode] entity_types: Set[str] total_count: int @@ -65,12 +65,12 @@ def to_dict(self) -> Dict[str, Any]: class EntityReader: """ - 实体读取与过滤服务 (via GraphStorage / Neo4j) + Entity Reading and Filtering 
Service (via GraphStorage / Neo4j) - 主要功能: - 1. 从图谱读取所有节点 - 2. 筛选出符合预定义实体类型的节点(Labels不只是Entity的节点) - 3. 获取每个实体的相关边和关联节点信息 + Main features: + 1. Read all nodes from graph + 2. Filter nodes matching predefined entity types (nodes with labels beyond just Entity) + 3. Get related edges and associated node info for each entity """ def __init__(self, storage: GraphStorage): @@ -78,48 +78,48 @@ def __init__(self, storage: GraphStorage): def get_all_nodes(self, graph_id: str) -> List[Dict[str, Any]]: """ - 获取图谱的所有节点 + Get all nodes in the graph Args: - graph_id: 图谱ID + graph_id: Graph ID Returns: - 节点列表 + Node list """ - logger.info(f"获取图谱 {graph_id} 的所有节点...") + logger.info(f"Getting all nodes in graph {graph_id}...") nodes = self.storage.get_all_nodes(graph_id) - logger.info(f"共获取 {len(nodes)} 个节点") + logger.info(f"Retrieved {len(nodes)} nodes") return nodes def get_all_edges(self, graph_id: str) -> List[Dict[str, Any]]: """ - 获取图谱的所有边 + Get all edges in the graph Args: - graph_id: 图谱ID + graph_id: Graph ID Returns: - 边列表 + Edge list """ - logger.info(f"获取图谱 {graph_id} 的所有边...") + logger.info(f"Getting all edges in graph {graph_id}...") edges = self.storage.get_all_edges(graph_id) - logger.info(f"共获取 {len(edges)} 条边") + logger.info(f"Retrieved {len(edges)} edges") return edges def get_node_edges(self, node_uuid: str) -> List[Dict[str, Any]]: """ - 获取指定节点的所有相关边 + Get all related edges for a specified node Args: - node_uuid: 节点UUID + node_uuid: Node UUID Returns: - 边列表 + Edge list """ try: return self.storage.get_node_edges(node_uuid) except Exception as e: - logger.warning(f"获取节点 {node_uuid} 的边失败: {str(e)}") + logger.warning(f"Failed to get edges for node {node_uuid}: {str(e)}") return [] def filter_defined_entities( @@ -129,47 +129,47 @@ def filter_defined_entities( enrich_with_edges: bool = True ) -> FilteredEntities: """ - 筛选出符合预定义实体类型的节点 + Filter nodes matching predefined entity types - 筛选逻辑: - - 如果节点的Labels只有一个"Entity",说明这个实体不符合我们预定义的类型,跳过 - - 
如果节点的Labels包含除"Entity"和"Node"之外的标签,说明符合预定义类型,保留 + Filtering logic: + - If a node only has the "Entity" label, it does not match our predefined types — skip + - If a node has labels other than "Entity" and "Node", it matches predefined types — keep Args: - graph_id: 图谱ID - defined_entity_types: 预定义的实体类型列表(可选,如果提供则只保留这些类型) - enrich_with_edges: 是否获取每个实体的相关边信息 + graph_id: Graph ID + defined_entity_types: Predefined entity type list (optional, if provided only these types are kept) + enrich_with_edges: Whether to get related edge info for each entity Returns: - FilteredEntities: 过滤后的实体集合 + FilteredEntities: Filtered entity set """ - logger.info(f"开始筛选图谱 {graph_id} 的实体...") + logger.info(f"Starting entity filtering for graph {graph_id}...") - # 获取所有节点 + # Get all nodes all_nodes = self.get_all_nodes(graph_id) total_count = len(all_nodes) - # 获取所有边(用于后续关联查找) + # Get all edges (for subsequent relation lookup) all_edges = self.get_all_edges(graph_id) if enrich_with_edges else [] - # 构建节点UUID到节点数据的映射 + # Build node UUID to node data mapping node_map = {n["uuid"]: n for n in all_nodes} - # 筛选符合条件的实体 + # Filter matching entities filtered_entities = [] entity_types_found: Set[str] = set() for node in all_nodes: labels = node.get("labels", []) - # 筛选逻辑:Labels必须包含除"Entity"和"Node"之外的标签 + # Filtering logic:Labels must contain labels other than "Entity" and "Node" custom_labels = [la for la in labels if la not in ["Entity", "Node"]] if not custom_labels: - # 只有默认标签,跳过 + # Only default labels, skip continue - # 如果指定了预定义类型,检查是否匹配 + # If predefined types specified, check for match if defined_entity_types: matching_labels = [la for la in custom_labels if la in defined_entity_types] if not matching_labels: @@ -180,7 +180,7 @@ def filter_defined_entities( entity_types_found.add(entity_type) - # 创建实体节点对象 + # Create entity node object entity = EntityNode( uuid=node["uuid"], name=node["name"], @@ -189,7 +189,7 @@ def filter_defined_entities( attributes=node.get("attributes", {}), ) - # 
获取相关边和节点 + # Get related edges and nodes if enrich_with_edges: related_edges = [] related_node_uuids: Set[str] = set() @@ -214,7 +214,7 @@ def filter_defined_entities( entity.related_edges = related_edges - # 获取关联节点的基本信息 + # Get basic info of associated nodes related_nodes = [] for related_uuid in related_node_uuids: if related_uuid in node_map: @@ -230,8 +230,8 @@ def filter_defined_entities( filtered_entities.append(entity) - logger.info(f"筛选完成: 总节点 {total_count}, 符合条件 {len(filtered_entities)}, " - f"实体类型: {entity_types_found}") + logger.info(f"Filtering complete: total nodes {total_count}, matching {len(filtered_entities)}, " + f"entity types: {entity_types_found}") return FilteredEntities( entities=filtered_entities, @@ -246,17 +246,17 @@ def get_entity_with_context( entity_uuid: str ) -> Optional[EntityNode]: """ - 获取单个实体及其完整上下文(边和关联节点) + Get a single entity with its full context (edges and associated nodes) Optimized: uses get_node() + get_node_edges() instead of loading ALL nodes. Only fetches related nodes individually as needed. Args: - graph_id: 图谱ID - entity_uuid: 实体UUID + graph_id: Graph ID + entity_uuid: Entity UUID Returns: - EntityNode或None + EntityNode or None """ try: # Get the node directly by UUID (O(1) lookup) @@ -312,7 +312,7 @@ def get_entity_with_context( ) except Exception as e: - logger.error(f"获取实体 {entity_uuid} 失败: {str(e)}") + logger.error(f"Failed to get entity {entity_uuid}: {str(e)}") return None def get_entities_by_type( @@ -322,15 +322,15 @@ def get_entities_by_type( enrich_with_edges: bool = True ) -> List[EntityNode]: """ - 获取指定类型的所有实体 + Get all entities of a specified type Args: - graph_id: 图谱ID - entity_type: 实体类型(如 "Student", "PublicFigure" 等) - enrich_with_edges: 是否获取相关边信息 + graph_id: Graph ID + entity_type: Entity type (e.g. "Student", "PublicFigure", etc.) 
+ enrich_with_edges: Whether to get related edge info Returns: - 实体列表 + Entity list """ result = self.filter_defined_entities( graph_id=graph_id, diff --git a/backend/app/services/graph_builder.py b/backend/app/services/graph_builder.py index cec8fdf..045fcee 100644 --- a/backend/app/services/graph_builder.py +++ b/backend/app/services/graph_builder.py @@ -1,6 +1,6 @@ """ -图谱构建服务 -使用 GraphStorage (Neo4j) 替代 Zep Cloud API +Graph Building Service +Uses GraphStorage (Neo4j) to replace Zep Cloud API """ import time @@ -19,7 +19,7 @@ @dataclass class GraphInfo: - """图谱信息""" + """Graph info""" graph_id: str node_count: int edge_count: int @@ -36,8 +36,8 @@ def to_dict(self) -> Dict[str, Any]: class GraphBuilderService: """ - 图谱构建服务 - 通过 GraphStorage 接口构建知识图谱 + Graph Building Service + Builds knowledge graph via GraphStorage interface """ def __init__(self, storage: GraphStorage): @@ -54,20 +54,20 @@ def build_graph_async( batch_size: int = 3 ) -> str: """ - 异步构建图谱 + Build graph asynchronously Args: - text: 输入文本 - ontology: 本体定义(来自接口1的输出) - graph_name: 图谱名称 - chunk_size: 文本块大小 - chunk_overlap: 块重叠大小 - batch_size: 每批发送的块数量 + text: Input text + ontology: Ontology definition (from interface 1 output) + graph_name: Graph name + chunk_size: Text chunk size + chunk_overlap: Chunk overlap size + batch_size: Number of chunks per batch Returns: - 任务ID + Task ID """ - # 创建任务 + # Create task task_id = self.task_manager.create_task( task_type="graph_build", metadata={ @@ -77,7 +77,7 @@ def build_graph_async( } ) - # 在后台线程中执行构建 + # Execute build in background thread thread = threading.Thread( target=self._build_graph_worker, args=(task_id, text, ontology, graph_name, chunk_size, chunk_overlap, batch_size) @@ -97,41 +97,41 @@ def _build_graph_worker( chunk_overlap: int, batch_size: int ): - """图谱构建工作线程""" + """Graph build worker thread""" try: self.task_manager.update_task( task_id, status=TaskStatus.PROCESSING, progress=5, - message="开始构建图谱..." + message="Starting graph build..." 
) - # 1. 创建图谱 + # 1. Create graph graph_id = self.create_graph(graph_name) self.task_manager.update_task( task_id, progress=10, - message=f"图谱已创建: {graph_id}" + message=f"Graph created: {graph_id}" ) - # 2. 设置本体 + # 2. Set ontology self.set_ontology(graph_id, ontology) self.task_manager.update_task( task_id, progress=15, - message="本体已设置" + message="Ontology set" ) - # 3. 文本分块 + # 3. Text chunking chunks = TextProcessor.split_text(text, chunk_size, chunk_overlap) total_chunks = len(chunks) self.task_manager.update_task( task_id, progress=20, - message=f"文本已分割为 {total_chunks} 个块" + message=f"Text split into {total_chunks} chunks" ) - # 4. 分批发送数据 (NER + embedding + Neo4j insert — synchronous) + # 4. Batch send data (NER + embedding + Neo4j insert — synchronous) episode_uuids = self.add_text_batches( graph_id, chunks, batch_size, lambda msg, prog: self.task_manager.update_task( @@ -141,19 +141,19 @@ def _build_graph_worker( ) ) - # 5. 等待处理 (no-op for Neo4j — already synchronous) + # 5. Wait for processing (no-op for Neo4j — already synchronous) self.storage.wait_for_processing(episode_uuids) self.task_manager.update_task( task_id, progress=85, - message="数据处理完成,获取图谱信息..." + message="Data processing complete, getting graph info..." ) - # 6. 获取图谱信息 + # 6. Get graph info graph_info = self._get_graph_info(graph_id) - # 完成 + # Complete self.task_manager.complete_task(task_id, { "graph_id": graph_id, "graph_info": graph_info.to_dict(), @@ -166,7 +166,7 @@ def _build_graph_worker( self.task_manager.fail_task(task_id, error_msg) def create_graph(self, name: str) -> str: - """创建图谱""" + """Create graph""" return self.storage.create_graph( name=name, description="MiroFish Social Simulation Graph" @@ -174,7 +174,7 @@ def create_graph(self, name: str) -> str: def set_ontology(self, graph_id: str, ontology: Dict[str, Any]): """ - 设置图谱本体 + Set graph ontology Simply stores ontology as JSON in the Graph node. No more dynamic Pydantic class creation (was Zep-specific). 
@@ -189,7 +189,7 @@ def add_text_batches( batch_size: int = 3, progress_callback: Optional[Callable] = None ) -> List[str]: - """分批添加文本到图谱,返回所有 episode 的 uuid 列表""" + """Add text to graph in batches, return all episode uuid list""" episode_uuids = [] total_chunks = len(chunks) total_batches = (total_chunks + batch_size - 1) // batch_size @@ -203,7 +203,7 @@ def add_text_batches( if progress_callback: progress = (i + len(batch_chunks)) / total_chunks progress_callback( - f"处理第 {batch_num}/{total_batches} 批数据 ({len(batch_chunks)} 块)...", + f"Processing batch {batch_num}/{total_batches} ({len(batch_chunks)} chunks)...", progress ) @@ -229,14 +229,14 @@ def add_text_batches( f"after {elapsed:.1f}s: {e}" ) if progress_callback: - progress_callback(f"批次 {batch_num} 处理失败: {str(e)}", 0) + progress_callback(f"Batch {batch_num} processing failed: {str(e)}", 0) raise logger.info(f"[graph_build] All {total_chunks} chunks processed successfully") return episode_uuids def _get_graph_info(self, graph_id: str) -> GraphInfo: - """获取图谱信息""" + """Get graph info""" info = self.storage.get_graph_info(graph_id) return GraphInfo( graph_id=info["graph_id"], @@ -246,9 +246,9 @@ def _get_graph_info(self, graph_id: str) -> GraphInfo: ) def get_graph_data(self, graph_id: str) -> Dict[str, Any]: - """获取完整图谱数据(包含详细信息)""" + """Get complete graph data (with detailed info)""" return self.storage.get_graph_data(graph_id) def delete_graph(self, graph_id: str): - """删除图谱""" + """Delete graph""" self.storage.delete_graph(graph_id) diff --git a/backend/app/services/graph_memory_updater.py b/backend/app/services/graph_memory_updater.py index b8af069..7a5979f 100644 --- a/backend/app/services/graph_memory_updater.py +++ b/backend/app/services/graph_memory_updater.py @@ -1,6 +1,6 @@ """ -图谱记忆更新服务 -将模拟中的Agent活动动态更新到 Neo4j 图谱中 +Graph Memory Update Service +Dynamically updates Agent activities from simulation into Neo4j graph Replaces zep_graph_memory_updater.py — Zep client replaced by GraphStorage. 
""" @@ -23,7 +23,7 @@ @dataclass class AgentActivity: - """Agent活动记录""" + """Agent activity record""" platform: str # twitter / reddit agent_id: int agent_name: str @@ -34,9 +34,9 @@ class AgentActivity: def to_episode_text(self) -> str: """ - 将活动转换为自然语言文本描述 + Convert activity to natural language text description - 采用自然语言描述格式,让 NER 提取器能够从中提取实体和关系 + Uses natural language description format so NER extractor can extract entities and relationships """ action_descriptions = { "CREATE_POST": self._describe_create_post, @@ -61,41 +61,41 @@ def to_episode_text(self) -> str: def _describe_create_post(self) -> str: content = self.action_args.get("content", "") if content: - return f"发布了一条帖子:「{content}」" - return "发布了一条帖子" + return f'published a post: "{content}"' + return "published a post" def _describe_like_post(self) -> str: post_content = self.action_args.get("post_content", "") post_author = self.action_args.get("post_author_name", "") if post_content and post_author: - return f"点赞了{post_author}的帖子:「{post_content}」" + return f'liked {post_author}\'s post: "{post_content}"' elif post_content: - return f"点赞了一条帖子:「{post_content}」" + return f'liked a post: "{post_content}"' elif post_author: - return f"点赞了{post_author}的一条帖子" - return "点赞了一条帖子" + return f"liked a post by {post_author}" + return "liked a post" def _describe_dislike_post(self) -> str: post_content = self.action_args.get("post_content", "") post_author = self.action_args.get("post_author_name", "") if post_content and post_author: - return f"踩了{post_author}的帖子:「{post_content}」" + return f'disliked {post_author}\'s post: "{post_content}"' elif post_content: - return f"踩了一条帖子:「{post_content}」" + return f'disliked a post: "{post_content}"' elif post_author: - return f"踩了{post_author}的一条帖子" - return "踩了一条帖子" + return f"disliked a post by {post_author}" + return "disliked a post" def _describe_repost(self) -> str: original_content = self.action_args.get("original_content", "") original_author = 
self.action_args.get("original_author_name", "") if original_content and original_author: - return f"转发了{original_author}的帖子:「{original_content}」" + return f'reposted {original_author}\'s post: "{original_content}"' elif original_content: - return f"转发了一条帖子:「{original_content}」" + return f'reposted a post: "{original_content}"' elif original_author: - return f"转发了{original_author}的一条帖子" - return "转发了一条帖子" + return f"reposted a post by {original_author}" + return "reposted a post" def _describe_quote_post(self) -> str: original_content = self.action_args.get("original_content", "") @@ -103,22 +103,22 @@ def _describe_quote_post(self) -> str: quote_content = self.action_args.get("quote_content", "") or self.action_args.get("content", "") base = "" if original_content and original_author: - base = f"引用了{original_author}的帖子「{original_content}」" + base = f'quoted {original_author}\'s post "{original_content}"' elif original_content: - base = f"引用了一条帖子「{original_content}」" + base = f'quoted a post "{original_content}"' elif original_author: - base = f"引用了{original_author}的一条帖子" + base = f"quoted a post by {original_author}" else: - base = "引用了一条帖子" + base = "quoted a post" if quote_content: - base += f",并评论道:「{quote_content}」" + base += f', and commented: "{quote_content}"' return base def _describe_follow(self) -> str: target_user_name = self.action_args.get("target_user_name", "") if target_user_name: - return f"关注了用户「{target_user_name}」" - return "关注了一个用户" + return f'followed user "{target_user_name}"' + return "followed a user" def _describe_create_comment(self) -> str: content = self.action_args.get("content", "") @@ -126,67 +126,67 @@ def _describe_create_comment(self) -> str: post_author = self.action_args.get("post_author_name", "") if content: if post_content and post_author: - return f"在{post_author}的帖子「{post_content}」下评论道:「{content}」" + return f'commented on {post_author}\'s post "{post_content}": "{content}"' elif post_content: - return 
f"在帖子「{post_content}」下评论道:「{content}」" + return f'commented on post "{post_content}": "{content}"' elif post_author: - return f"在{post_author}的帖子下评论道:「{content}」" - return f"评论道:「{content}」" - return "发表了评论" + return f'commented on {post_author}\'s post: "{content}"' + return f'commented: "{content}"' + return "posted a comment" def _describe_like_comment(self) -> str: comment_content = self.action_args.get("comment_content", "") comment_author = self.action_args.get("comment_author_name", "") if comment_content and comment_author: - return f"点赞了{comment_author}的评论:「{comment_content}」" + return f'liked {comment_author}\'s comment: "{comment_content}"' elif comment_content: - return f"点赞了一条评论:「{comment_content}」" + return f'liked a comment: "{comment_content}"' elif comment_author: - return f"点赞了{comment_author}的一条评论" - return "点赞了一条评论" + return f"liked a comment by {comment_author}" + return "liked a comment" def _describe_dislike_comment(self) -> str: comment_content = self.action_args.get("comment_content", "") comment_author = self.action_args.get("comment_author_name", "") if comment_content and comment_author: - return f"踩了{comment_author}的评论:「{comment_content}」" + return f'disliked {comment_author}\'s comment: "{comment_content}"' elif comment_content: - return f"踩了一条评论:「{comment_content}」" + return f'disliked a comment: "{comment_content}"' elif comment_author: - return f"踩了{comment_author}的一条评论" - return "踩了一条评论" + return f"disliked a comment by {comment_author}" + return "disliked a comment" def _describe_search(self) -> str: query = self.action_args.get("query", "") or self.action_args.get("keyword", "") - return f"搜索了「{query}」" if query else "进行了搜索" + return f'searched for "{query}"' if query else "performed a search" def _describe_search_user(self) -> str: query = self.action_args.get("query", "") or self.action_args.get("username", "") - return f"搜索了用户「{query}」" if query else "搜索了用户" + return f'searched for user "{query}"' if query else "searched for a 
user" def _describe_mute(self) -> str: target_user_name = self.action_args.get("target_user_name", "") if target_user_name: - return f"屏蔽了用户「{target_user_name}」" - return "屏蔽了一个用户" + return f'muted user "{target_user_name}"' + return "muted a user" def _describe_generic(self) -> str: - return f"执行了{self.action_type}操作" + return f"performed {self.action_type} action" class GraphMemoryUpdater: """ - 图谱记忆更新器 (via GraphStorage / Neo4j) + Graph Memory Updater (via GraphStorage / Neo4j) - 监控模拟的actions日志文件,将新的agent活动实时更新到图谱中。 - 按平台分组,每累积BATCH_SIZE条活动后批量发送到图谱。 + Monitors simulation action log files and updates new agent activities to the graph in real-time. + Groups by platform and batch-sends to graph after accumulating BATCH_SIZE activities. """ BATCH_SIZE = 5 PLATFORM_DISPLAY_NAMES = { - 'twitter': '世界1', - 'reddit': '世界2', + 'twitter': 'World 1', + 'reddit': 'World 2', } SEND_INTERVAL = 0.5 @@ -195,10 +195,10 @@ class GraphMemoryUpdater: def __init__(self, graph_id: str, storage: GraphStorage): """ - 初始化更新器 + Initialize updater Args: - graph_id: 图谱ID + graph_id: Graph ID storage: GraphStorage instance (injected) """ self.graph_id = graph_id @@ -221,13 +221,13 @@ def __init__(self, graph_id: str, storage: GraphStorage): self._failed_count = 0 self._skipped_count = 0 - logger.info(f"GraphMemoryUpdater 初始化完成: graph_id={graph_id}, batch_size={self.BATCH_SIZE}") + logger.info(f"GraphMemoryUpdater initialized: graph_id={graph_id}, batch_size={self.BATCH_SIZE}") def _get_platform_display_name(self, platform: str) -> str: return self.PLATFORM_DISPLAY_NAMES.get(platform.lower(), platform) def start(self): - """启动后台工作线程""" + """Start background worker thread""" if self._running: return @@ -238,10 +238,10 @@ def start(self): name=f"GraphMemoryUpdater-{self.graph_id[:8]}" ) self._worker_thread.start() - logger.info(f"GraphMemoryUpdater 已启动: graph_id={self.graph_id}") + logger.info(f"GraphMemoryUpdater started: graph_id={self.graph_id}") def stop(self): - """停止后台工作线程""" + """Stop 
background worker thread""" self._running = False self._flush_remaining() @@ -249,7 +249,7 @@ def stop(self): if self._worker_thread and self._worker_thread.is_alive(): self._worker_thread.join(timeout=10) - logger.info(f"GraphMemoryUpdater 已停止: graph_id={self.graph_id}, " + logger.info(f"GraphMemoryUpdater stopped: graph_id={self.graph_id}, " f"total_activities={self._total_activities}, " f"batches_sent={self._total_sent}, " f"items_sent={self._total_items_sent}, " @@ -257,17 +257,17 @@ def stop(self): f"skipped={self._skipped_count}") def add_activity(self, activity: AgentActivity): - """添加一个agent活动到队列""" + """Add an agent activity to the queue""" if activity.action_type == "DO_NOTHING": self._skipped_count += 1 return self._activity_queue.put(activity) self._total_activities += 1 - logger.debug(f"添加活动到队列: {activity.agent_name} - {activity.action_type}") + logger.debug(f"Added activity to queue: {activity.agent_name} - {activity.action_type}") def add_activity_from_dict(self, data: Dict[str, Any], platform: str): - """从字典数据添加活动""" + """Add activity from dict data""" if "event_type" in data: return @@ -284,7 +284,7 @@ def add_activity_from_dict(self, data: Dict[str, Any], platform: str): self.add_activity(activity) def _worker_loop(self): - """后台工作循环 - 按平台批量发送活动到图谱""" + """Background worker loop - batch send activities to graph by platform""" while self._running or not self._activity_queue.empty(): try: try: @@ -306,12 +306,12 @@ def _worker_loop(self): pass except Exception as e: - logger.error(f"工作循环异常: {e}") + logger.error(f"Worker loop exception: {e}") time.sleep(1) def _send_batch_activities(self, activities: List[AgentActivity], platform: str): """ - 批量发送活动到图谱(合并为一条文本,通过 add_text 触发 NER) + Batch send activities to graph (merged into single text, triggers NER via add_text) """ if not activities: return @@ -326,20 +326,20 @@ def _send_batch_activities(self, activities: List[AgentActivity], platform: str) self._total_sent += 1 self._total_items_sent += 
len(activities) display_name = self._get_platform_display_name(platform) - logger.info(f"成功批量发送 {len(activities)} 条{display_name}活动到图谱 {self.graph_id}") - logger.debug(f"批量内容预览: {combined_text[:200]}...") + logger.info(f"Successfully batch sent {len(activities)} {display_name} activities to graph {self.graph_id}") + logger.debug(f"Batch content preview: {combined_text[:200]}...") return except Exception as e: if attempt < self.MAX_RETRIES - 1: - logger.warning(f"批量发送到图谱失败 (尝试 {attempt + 1}/{self.MAX_RETRIES}): {e}") + logger.warning(f"Batch send to graph failed (attempt {attempt + 1}/{self.MAX_RETRIES}): {e}") time.sleep(self.RETRY_DELAY * (attempt + 1)) else: - logger.error(f"批量发送到图谱失败,已重试{self.MAX_RETRIES}次: {e}") + logger.error(f"Batch send to graph failed after {self.MAX_RETRIES} retries: {e}") self._failed_count += 1 def _flush_remaining(self): - """发送队列和缓冲区中剩余的活动""" + """Send remaining activities in queue and buffer""" while not self._activity_queue.empty(): try: activity = self._activity_queue.get_nowait() @@ -355,13 +355,13 @@ def _flush_remaining(self): for platform, buffer in self._platform_buffers.items(): if buffer: display_name = self._get_platform_display_name(platform) - logger.info(f"发送{display_name}平台剩余的 {len(buffer)} 条活动") + logger.info(f"Sending remaining {len(buffer)} {display_name} platform activities") self._send_batch_activities(buffer, platform) for platform in self._platform_buffers: self._platform_buffers[platform] = [] def get_stats(self) -> Dict[str, Any]: - """获取统计信息""" + """Get statistics""" with self._buffer_lock: buffer_sizes = {p: len(b) for p, b in self._platform_buffers.items()} @@ -381,9 +381,9 @@ def get_stats(self) -> Dict[str, Any]: class GraphMemoryManager: """ - 管理多个模拟的图谱记忆更新器 + Manages graph memory updaters for multiple simulations - 每个模拟可以有自己的更新器实例。 + Each simulation can have its own updater instance. NOTE: create_updater() requires a GraphStorage instance — must be passed in. 
""" @@ -395,11 +395,11 @@ def create_updater( cls, simulation_id: str, graph_id: str, storage: GraphStorage ) -> GraphMemoryUpdater: """ - 为模拟创建图谱记忆更新器 + Create graph memory updater for simulation Args: - simulation_id: 模拟ID - graph_id: 图谱ID + simulation_id: Simulation ID + graph_id: Graph ID storage: GraphStorage instance """ with cls._lock: @@ -410,28 +410,28 @@ def create_updater( updater.start() cls._updaters[simulation_id] = updater - logger.info(f"创建图谱记忆更新器: simulation_id={simulation_id}, graph_id={graph_id}") + logger.info(f"Created graph memory updater: simulation_id={simulation_id}, graph_id={graph_id}") return updater @classmethod def get_updater(cls, simulation_id: str) -> Optional[GraphMemoryUpdater]: - """获取模拟的更新器""" + """Get updater for simulation""" return cls._updaters.get(simulation_id) @classmethod def stop_updater(cls, simulation_id: str): - """停止并移除模拟的更新器""" + """Stop and remove updater for simulation""" with cls._lock: if simulation_id in cls._updaters: cls._updaters[simulation_id].stop() del cls._updaters[simulation_id] - logger.info(f"已停止图谱记忆更新器: simulation_id={simulation_id}") + logger.info(f"Stopped graph memory updater: simulation_id={simulation_id}") _stop_all_done = False @classmethod def stop_all(cls): - """停止所有更新器""" + """Stop all updaters""" if cls._stop_all_done: return cls._stop_all_done = True @@ -442,13 +442,13 @@ def stop_all(cls): try: updater.stop() except Exception as e: - logger.error(f"停止更新器失败: simulation_id={simulation_id}, error={e}") + logger.error(f"Failed to stop updater: simulation_id={simulation_id}, error={e}") cls._updaters.clear() - logger.info("已停止所有图谱记忆更新器") + logger.info("All graph memory updaters stopped") @classmethod def get_all_stats(cls) -> Dict[str, Dict[str, Any]]: - """获取所有更新器的统计信息""" + """Get statistics for all updaters""" return { sim_id: updater.get_stats() for sim_id, updater in cls._updaters.items() diff --git a/backend/app/services/graph_tools.py b/backend/app/services/graph_tools.py index 
b3d1bb6..39911ad 100644 --- a/backend/app/services/graph_tools.py +++ b/backend/app/services/graph_tools.py @@ -1,13 +1,13 @@ """ -图谱检索工具服务 -封装图谱搜索、节点读取、边查询等工具,供Report Agent使用 +Graph Retrieval Tools Service +Wraps graph search, node reading, edge query tools for Report Agent use Replaces zep_tools.py — all Zep Cloud calls replaced by GraphStorage. -核心检索工具(优化后): -1. InsightForge(深度洞察检索)- 最强大的混合检索,自动生成子问题并多维度检索 -2. PanoramaSearch(广度搜索)- 获取全貌,包括过期内容 -3. QuickSearch(简单搜索)- 快速检索 +Core retrieval tools (optimized): +1. InsightForge (deep insight retrieval) - most powerful hybrid retrieval, auto-generates sub-questions and multi-dimensional search +2. PanoramaSearch (breadth search) - get full picture, including expired content +3. QuickSearch (simple search) - quick retrieval """ import json @@ -23,7 +23,7 @@ @dataclass class SearchResult: - """搜索结果""" + """Search result""" facts: List[str] edges: List[Dict[str, Any]] nodes: List[Dict[str, Any]] @@ -40,11 +40,11 @@ def to_dict(self) -> Dict[str, Any]: } def to_text(self) -> str: - """转换为文本格式,供LLM理解""" - text_parts = [f"搜索查询: {self.query}", f"找到 {self.total_count} 条相关信息"] + """Convert to text format for LLM understanding""" + text_parts = [f"Search query: {self.query}", f"Found {self.total_count} related items"] if self.facts: - text_parts.append("\n### 相关事实:") + text_parts.append("\n### Related facts:") for i, fact in enumerate(self.facts, 1): text_parts.append(f"{i}. 
{fact}") @@ -53,7 +53,7 @@ def to_text(self) -> str: @dataclass class NodeInfo: - """节点信息""" + """Node info""" uuid: str name: str labels: List[str] @@ -70,14 +70,14 @@ def to_dict(self) -> Dict[str, Any]: } def to_text(self) -> str: - """转换为文本格式""" - entity_type = next((la for la in self.labels if la not in ["Entity", "Node"]), "未知类型") - return f"实体: {self.name} (类型: {entity_type})\n摘要: {self.summary}" + """Convert to text format""" + entity_type = next((la for la in self.labels if la not in ["Entity", "Node"]), "Unknown type") + return f"Entity: {self.name} (type: {entity_type})\nSummary: {self.summary}" @dataclass class EdgeInfo: - """边信息""" + """Edge info""" uuid: str name: str fact: str @@ -85,7 +85,7 @@ class EdgeInfo: target_node_uuid: str source_node_name: Optional[str] = None target_node_name: Optional[str] = None - # 时间信息 (may be absent in Neo4j — kept for interface compat) + # Temporal info (may be absent in Neo4j — kept for interface compat) created_at: Optional[str] = None valid_at: Optional[str] = None invalid_at: Optional[str] = None @@ -107,47 +107,47 @@ def to_dict(self) -> Dict[str, Any]: } def to_text(self, include_temporal: bool = False) -> str: - """转换为文本格式""" + """Convert to text format""" source = self.source_node_name or self.source_node_uuid[:8] target = self.target_node_name or self.target_node_uuid[:8] - base_text = f"关系: {source} --[{self.name}]--> {target}\n事实: {self.fact}" + base_text = f"Relationship: {source} --[{self.name}]--> {target}\nFact: {self.fact}" if include_temporal: - valid_at = self.valid_at or "未知" - invalid_at = self.invalid_at or "至今" - base_text += f"\n时效: {valid_at} - {invalid_at}" + valid_at = self.valid_at or "Unknown" + invalid_at = self.invalid_at or "Present" + base_text += f"\nValidity: {valid_at} - {invalid_at}" if self.expired_at: - base_text += f" (已过期: {self.expired_at})" + base_text += f" (expired: {self.expired_at})" return base_text @property def is_expired(self) -> bool: - """是否已过期""" + """Whether 
expired""" return self.expired_at is not None @property def is_invalid(self) -> bool: - """是否已失效""" + """Whether invalidated""" return self.invalid_at is not None @dataclass class InsightForgeResult: """ - 深度洞察检索结果 (InsightForge) - 包含多个子问题的检索结果,以及综合分析 + Deep insight retrieval result (InsightForge) + Contains retrieval results from multiple sub-queries, plus comprehensive analysis """ query: str simulation_requirement: str sub_queries: List[str] - # 各维度检索结果 + # Multi-dimensional retrieval results semantic_facts: List[str] = field(default_factory=list) entity_insights: List[Dict[str, Any]] = field(default_factory=list) relationship_chains: List[str] = field(default_factory=list) - # 统计信息 + # Statistics total_facts: int = 0 total_entities: int = 0 total_relationships: int = 0 @@ -166,38 +166,38 @@ def to_dict(self) -> Dict[str, Any]: } def to_text(self) -> str: - """转换为详细的文本格式,供LLM理解""" + """Convert to detailed text format for LLM understanding""" text_parts = [ - f"## 未来预测深度分析", - f"分析问题: {self.query}", - f"预测场景: {self.simulation_requirement}", - f"\n### 预测数据统计", - f"- 相关预测事实: {self.total_facts}条", - f"- 涉及实体: {self.total_entities}个", - f"- 关系链: {self.total_relationships}条" + f"## Deep Future Prediction Analysis", + f"Analysis question: {self.query}", + f"Prediction scenario: {self.simulation_requirement}", + f"\n### Prediction Data Statistics", + f"- Related prediction facts: {self.total_facts}", + f"- Entities involved: {self.total_entities}", + f"- Relationship chains: {self.total_relationships}" ] if self.sub_queries: - text_parts.append(f"\n### 分析的子问题") + text_parts.append(f"\n### Analyzed Sub-questions") for i, sq in enumerate(self.sub_queries, 1): text_parts.append(f"{i}. {sq}") if self.semantic_facts: - text_parts.append(f"\n### 【关键事实】(请在报告中引用这些原文)") + text_parts.append(f"\n### [Key Facts] (cite these in the report)") for i, fact in enumerate(self.semantic_facts, 1): text_parts.append(f'{i}. 
"{fact}"') if self.entity_insights: - text_parts.append(f"\n### 【核心实体】") + text_parts.append(f"\n### [Core Entities]") for entity in self.entity_insights: - text_parts.append(f"- **{entity.get('name', '未知')}** ({entity.get('type', '实体')})") + text_parts.append(f"- **{entity.get('name', 'Unknown')}** ({entity.get('type', 'Entity')})") if entity.get('summary'): - text_parts.append(f" 摘要: \"{entity.get('summary')}\"") + text_parts.append(f" Summary: \"{entity.get('summary')}\"") if entity.get('related_facts'): - text_parts.append(f" 相关事实: {len(entity.get('related_facts', []))}条") + text_parts.append(f" Related facts: {len(entity.get('related_facts', []))}") if self.relationship_chains: - text_parts.append(f"\n### 【关系链】") + text_parts.append(f"\n### [Relationship Chains]") for chain in self.relationship_chains: text_parts.append(f"- {chain}") @@ -207,8 +207,8 @@ def to_text(self) -> str: @dataclass class PanoramaResult: """ - 广度搜索结果 (Panorama) - 包含所有相关信息,包括过期内容 + Breadth search result (Panorama) + Contains all related info, including expired content """ query: str @@ -236,31 +236,31 @@ def to_dict(self) -> Dict[str, Any]: } def to_text(self) -> str: - """转换为文本格式(完整版本,不截断)""" + """Convert to text format (full version, no truncation)""" text_parts = [ - f"## 广度搜索结果(未来全景视图)", - f"查询: {self.query}", - f"\n### 统计信息", - f"- 总节点数: {self.total_nodes}", - f"- 总边数: {self.total_edges}", - f"- 当前有效事实: {self.active_count}条", - f"- 历史/过期事实: {self.historical_count}条" + f"## Breadth Search Results (Future Panorama View)", + f"Query: {self.query}", + f"\n### Statistics", + f"- Total nodes: {self.total_nodes}", + f"- Total edges: {self.total_edges}", + f"- Current active facts: {self.active_count}", + f"- Historical/expired facts: {self.historical_count}" ] if self.active_facts: - text_parts.append(f"\n### 【当前有效事实】(模拟结果原文)") + text_parts.append(f"\n### [Current Active Facts] (simulation result text)") for i, fact in enumerate(self.active_facts, 1): text_parts.append(f'{i}. 
"{fact}"') if self.historical_facts: - text_parts.append(f"\n### 【历史/过期事实】(演变过程记录)") + text_parts.append(f"\n### [Historical/Expired Facts] (evolution records)") for i, fact in enumerate(self.historical_facts, 1): text_parts.append(f'{i}. "{fact}"') if self.all_nodes: - text_parts.append(f"\n### 【涉及实体】") + text_parts.append(f"\n### [Involved Entities]") for node in self.all_nodes: - entity_type = next((la for la in node.labels if la not in ["Entity", "Node"]), "实体") + entity_type = next((la for la in node.labels if la not in ["Entity", "Node"]), "Entity") text_parts.append(f"- **{node.name}** ({entity_type})") return "\n".join(text_parts) @@ -268,7 +268,7 @@ def to_text(self) -> str: @dataclass class AgentInterview: - """单个Agent的采访结果""" + """Interview result for a single Agent""" agent_name: str agent_role: str agent_bio: str @@ -288,11 +288,11 @@ def to_dict(self) -> Dict[str, Any]: def to_text(self) -> str: text = f"**{self.agent_name}** ({self.agent_role})\n" - text += f"_简介: {self.agent_bio}_\n\n" + text += f"_Bio: {self.agent_bio}_\n\n" text += f"**Q:** {self.question}\n\n" text += f"**A:** {self.response}\n" if self.key_quotes: - text += "\n**关键引言:**\n" + text += "\n**Key Quotes:**\n" for quote in self.key_quotes: clean_quote = quote.replace('\u201c', '').replace('\u201d', '').replace('"', '') clean_quote = clean_quote.replace('\u300c', '').replace('\u300d', '') @@ -320,8 +320,8 @@ def to_text(self) -> str: @dataclass class InterviewResult: """ - 采访结果 (Interview) - 包含多个模拟Agent的采访回答 + Interview result (Interview) + Contains interview responses from multiple simulated Agents """ interview_topic: str interview_questions: List[str] @@ -348,64 +348,64 @@ def to_dict(self) -> Dict[str, Any]: } def to_text(self) -> str: - """转换为详细的文本格式,供LLM理解和报告引用""" + """Convert to detailed text format for LLM understanding and report citation""" text_parts = [ - "## 深度采访报告", - f"**采访主题:** {self.interview_topic}", - f"**采访人数:** {self.interviewed_count} / {self.total_agents} 
位模拟Agent", - "\n### 采访对象选择理由", - self.selection_reasoning or "(自动选择)", + "## In-depth Interview Report", + f"**Interview Topic:** {self.interview_topic}", + f"**Interviewees:** {self.interviewed_count} / {self.total_agents} simulated Agents", + "\n### Interview Subject Selection Reasoning", + self.selection_reasoning or "(Auto-selected)", "\n---", - "\n### 采访实录", + "\n### Interview Transcripts", ] if self.interviews: for i, interview in enumerate(self.interviews, 1): - text_parts.append(f"\n#### 采访 #{i}: {interview.agent_name}") + text_parts.append(f"\n#### Interview #{i}: {interview.agent_name}") text_parts.append(interview.to_text()) text_parts.append("\n---") else: - text_parts.append("(无采访记录)\n\n---") + text_parts.append("(No interview records)\n\n---") - text_parts.append("\n### 采访摘要与核心观点") - text_parts.append(self.summary or "(无摘要)") + text_parts.append("\n### Interview Summary and Core Views") + text_parts.append(self.summary or "(No summary)") return "\n".join(text_parts) class GraphToolsService: """ - 图谱检索工具服务 (via GraphStorage / Neo4j) - - 【核心检索工具 - 优化后】 - 1. insight_forge - 深度洞察检索(最强大,自动生成子问题,多维度检索) - 2. panorama_search - 广度搜索(获取全貌,包括过期内容) - 3. quick_search - 简单搜索(快速检索) - 4. interview_agents - 深度采访(采访模拟Agent,获取多视角观点) - - 【基础工具】 - - search_graph - 图谱语义搜索 - - get_all_nodes - 获取图谱所有节点 - - get_all_edges - 获取图谱所有边(含时间信息) - - get_node_detail - 获取节点详细信息 - - get_node_edges - 获取节点相关的边 - - get_entities_by_type - 按类型获取实体 - - get_entity_summary - 获取实体的关系摘要 + Graph Retrieval Tools Service (via GraphStorage / Neo4j) + + [Core Retrieval Tools - Optimized] + 1. insight_forge - deep insight retrieval (most powerful, auto sub-questions, multi-dimensional) + 2. panorama_search - breadth search (full picture, including expired content) + 3. quick_search - simple search (quick retrieval) + 4. 
interview_agents - in-depth interview (interview simulation Agents, get multi-perspective views) + + [Basic Tools] + - search_graph - Graph semantic search + - get_all_nodes - get all graph nodes + - get_all_edges - get all graph edges (with temporal info) + - get_node_detail - get node details + - get_node_edges - get edges related to node + - get_entities_by_type - get entities by type + - get_entity_summary - get entity relationship summary """ def __init__(self, storage: GraphStorage, llm_client: Optional[LLMClient] = None): self.storage = storage self._llm_client = llm_client - logger.info("GraphToolsService 初始化完成") + logger.info("GraphToolsService initialized") @property def llm(self) -> LLMClient: - """延迟初始化LLM客户端""" + """Lazy-initialize LLM client""" if self._llm_client is None: self._llm_client = LLMClient() return self._llm_client - # ========== 基础工具 ========== + # ========== Basic Tools ========== def search_graph( self, @@ -415,18 +415,18 @@ def search_graph( scope: str = "edges" ) -> SearchResult: """ - 图谱语义搜索 (hybrid: vector + BM25 via Neo4j) + Graph semantic search (hybrid: vector + BM25 via Neo4j) Args: - graph_id: 图谱ID - query: 搜索查询 - limit: 返回结果数量 - scope: 搜索范围,"edges" 或 "nodes" 或 "both" + graph_id: Graph ID + query: Search query + limit: Number of results to return + scope: Search scope, "edges" or "nodes" or "both" Returns: SearchResult """ - logger.info(f"图谱搜索: graph_id={graph_id}, query={query[:50]}...") + logger.info(f"Graph search: graph_id={graph_id}, query={query[:50]}...") try: search_results = self.storage.search( @@ -481,7 +481,7 @@ def search_graph( if summary: facts.append(f"[{node.get('name', '')}]: {summary}") - logger.info(f"搜索完成: 找到 {len(facts)} 条相关事实") + logger.info(f"Search complete: found {len(facts)} related facts") return SearchResult( facts=facts, @@ -492,7 +492,7 @@ def search_graph( ) except Exception as e: - logger.warning(f"图谱搜索失败,降级为本地搜索: {str(e)}") + logger.warning(f"Graph search failed, falling back to local search: 
{str(e)}") return self._local_search(graph_id, query, limit, scope) def _local_search( @@ -503,9 +503,9 @@ def _local_search( scope: str = "edges" ) -> SearchResult: """ - 本地关键词匹配搜索(降级方案) + Local keyword matching search (fallback) """ - logger.info(f"使用本地搜索: query={query[:30]}...") + logger.info(f"Using local search: query={query[:30]}...") facts = [] edges_result = [] @@ -570,10 +570,10 @@ def match_score(text: str) -> int: if summary: facts.append(f"[{node.get('name', '')}]: {summary}") - logger.info(f"本地搜索完成: 找到 {len(facts)} 条相关事实") + logger.info(f"Local search complete: found {len(facts)} related facts") except Exception as e: - logger.error(f"本地搜索失败: {str(e)}") + logger.error(f"Local search failed: {str(e)}") return SearchResult( facts=facts, @@ -584,8 +584,8 @@ def match_score(text: str) -> int: ) def get_all_nodes(self, graph_id: str) -> List[NodeInfo]: - """获取图谱的所有节点""" - logger.info(f"获取图谱 {graph_id} 的所有节点...") + """Get all nodes in the graph""" + logger.info(f"Getting all nodes in graph {graph_id}...") raw_nodes = self.storage.get_all_nodes(graph_id) @@ -599,12 +599,12 @@ def get_all_nodes(self, graph_id: str) -> List[NodeInfo]: attributes=node.get("attributes", {}) )) - logger.info(f"获取到 {len(result)} 个节点") + logger.info(f"Retrieved {len(result)} nodes") return result def get_all_edges(self, graph_id: str, include_temporal: bool = True) -> List[EdgeInfo]: - """获取图谱的所有边(含时间信息)""" - logger.info(f"获取图谱 {graph_id} 的所有边...") + """Get all edges in the graph (with temporal info)""" + logger.info(f"Getting all edges in graph {graph_id}...") raw_edges = self.storage.get_all_edges(graph_id) @@ -626,12 +626,12 @@ def get_all_edges(self, graph_id: str, include_temporal: bool = True) -> List[Ed result.append(edge_info) - logger.info(f"获取到 {len(result)} 条边") + logger.info(f"Retrieved {len(result)} edges") return result def get_node_detail(self, node_uuid: str) -> Optional[NodeInfo]: - """获取单个节点的详细信息""" - logger.info(f"获取节点详情: {node_uuid[:8]}...") + """Get detailed 
info for a single node""" + logger.info(f"Getting node details: {node_uuid[:8]}...") try: node = self.storage.get_node(node_uuid) @@ -646,17 +646,17 @@ def get_node_detail(self, node_uuid: str) -> Optional[NodeInfo]: attributes=node.get("attributes", {}) ) except Exception as e: - logger.error(f"获取节点详情失败: {str(e)}") + logger.error(f"Failed to get node details: {str(e)}") return None def get_node_edges(self, graph_id: str, node_uuid: str) -> List[EdgeInfo]: """ - 获取节点相关的所有边 + Get all edges related to a node Optimized: uses storage.get_node_edges() (O(degree) Cypher) instead of loading ALL edges and filtering. """ - logger.info(f"获取节点 {node_uuid[:8]}... 的相关边") + logger.info(f"Getting related edges for node {node_uuid[:8]}...") try: raw_edges = self.storage.get_node_edges(node_uuid) @@ -675,11 +675,11 @@ def get_node_edges(self, graph_id: str, node_uuid: str) -> List[EdgeInfo]: expired_at=edge.get("expired_at"), )) - logger.info(f"找到 {len(result)} 条与节点相关的边") + logger.info(f"Found {len(result)} edges related to node") return result except Exception as e: - logger.warning(f"获取节点边失败: {str(e)}") + logger.warning(f"Failed to get node edges: {str(e)}") return [] def get_entities_by_type( @@ -687,8 +687,8 @@ def get_entities_by_type( graph_id: str, entity_type: str ) -> List[NodeInfo]: - """按类型获取实体""" - logger.info(f"获取类型为 {entity_type} 的实体...") + """Get entities by type""" + logger.info(f"Getting entities of type {entity_type}...") # Use optimized label-based query from storage raw_nodes = self.storage.get_nodes_by_label(graph_id, entity_type) @@ -703,7 +703,7 @@ def get_entities_by_type( attributes=node.get("attributes", {}) )) - logger.info(f"找到 {len(result)} 个 {entity_type} 类型的实体") + logger.info(f"Found {len(result)} entities of type {entity_type}") return result def get_entity_summary( @@ -711,8 +711,8 @@ def get_entity_summary( graph_id: str, entity_name: str ) -> Dict[str, Any]: - """获取指定实体的关系摘要""" - logger.info(f"获取实体 {entity_name} 的关系摘要...") + """Get relationship 
summary for specified entity""" + logger.info(f"Getting relationship summary for entity {entity_name}...") search_result = self.search_graph( graph_id=graph_id, @@ -740,8 +740,8 @@ def get_entity_summary( } def get_graph_statistics(self, graph_id: str) -> Dict[str, Any]: - """获取图谱的统计信息""" - logger.info(f"获取图谱 {graph_id} 的统计信息...") + """Get graph statistics""" + logger.info(f"Getting statistics for graph {graph_id}...") nodes = self.get_all_nodes(graph_id) edges = self.get_all_edges(graph_id) @@ -770,8 +770,8 @@ def get_simulation_context( simulation_requirement: str, limit: int = 30 ) -> Dict[str, Any]: - """获取模拟相关的上下文信息""" - logger.info(f"获取模拟上下文: {simulation_requirement[:50]}...") + """Get simulation-related context info""" + logger.info(f"Getting simulation context: {simulation_requirement[:50]}...") search_result = self.search_graph( graph_id=graph_id, @@ -801,7 +801,7 @@ def get_simulation_context( "total_entities": len(entities) } - # ========== 核心检索工具(优化后) ========== + # ========== Core Retrieval Tools (Optimized) ========== def insight_forge( self, @@ -812,16 +812,16 @@ def insight_forge( max_sub_queries: int = 5 ) -> InsightForgeResult: """ - 【InsightForge - 深度洞察检索】 - - 最强大的混合检索函数,自动分解问题并多维度检索: - 1. 使用LLM将问题分解为多个子问题 - 2. 对每个子问题进行语义搜索 - 3. 提取相关实体并获取其详细信息 - 4. 追踪关系链 - 5. 整合所有结果,生成深度洞察 + [InsightForge - Deep Insight Retrieval] + + Most powerful hybrid retrieval function, auto-decomposes questions and searches across dimensions: + 1. Use LLM to decompose question into multiple sub-queries + 2. Perform semantic search on each sub-query + 3. Extract related entities and get their details + 4. Trace relationship chains + 5. 
Integrate all results and generate deep insights """ - logger.info(f"InsightForge 深度洞察检索: {query[:50]}...") + logger.info(f"InsightForge deep insight retrieval: {query[:50]}...") result = InsightForgeResult( query=query, @@ -829,7 +829,7 @@ def insight_forge( sub_queries=[] ) - # Step 1: 使用LLM生成子问题 + # Step 1: Generate sub-queries using LLM sub_queries = self._generate_sub_queries( query=query, simulation_requirement=simulation_requirement, @@ -837,9 +837,9 @@ def insight_forge( max_queries=max_sub_queries ) result.sub_queries = sub_queries - logger.info(f"生成 {len(sub_queries)} 个子问题") + logger.info(f"Generated {len(sub_queries)} sub-queries") - # Step 2: 对每个子问题进行语义搜索 + # Step 2: Semantic search on each sub-query all_facts = [] all_edges = [] seen_facts = set() @@ -859,7 +859,7 @@ def insight_forge( all_edges.extend(search_result.edges) - # 对原始问题也进行搜索 + # Also search the original question main_search = self.search_graph( graph_id=graph_id, query=query, @@ -874,7 +874,7 @@ def insight_forge( result.semantic_facts = all_facts result.total_facts = len(all_facts) - # Step 3: 从边中提取相关实体UUID + # Step 3: Extract related entity UUIDs from edges entity_uuids = set() for edge_data in all_edges: if isinstance(edge_data, dict): @@ -885,7 +885,7 @@ def insight_forge( if target_uuid: entity_uuids.add(target_uuid) - # 获取相关实体详情 + # Get related entity details entity_insights = [] node_map = {} @@ -896,7 +896,7 @@ def insight_forge( node = self.get_node_detail(uuid) if node: node_map[uuid] = node - entity_type = next((la for la in node.labels if la not in ["Entity", "Node"]), "实体") + entity_type = next((la for la in node.labels if la not in ["Entity", "Node"]), "Entity") related_facts = [ f for f in all_facts @@ -911,13 +911,13 @@ def insight_forge( "related_facts": related_facts }) except Exception as e: - logger.debug(f"获取节点 {uuid} 失败: {e}") + logger.debug(f"Failed to get node {uuid}: {e}") continue result.entity_insights = entity_insights result.total_entities = 
len(entity_insights) - # Step 4: 构建关系链 + # Step 4: Build relationship chains relationship_chains = [] for edge_data in all_edges: if isinstance(edge_data, dict): @@ -935,7 +935,7 @@ def insight_forge( result.relationship_chains = relationship_chains result.total_relationships = len(relationship_chains) - logger.info(f"InsightForge完成: {result.total_facts}条事实, {result.total_entities}个实体, {result.total_relationships}条关系") + logger.info(f"InsightForge complete: {result.total_facts} facts, {result.total_entities} entities, {result.total_relationships} relationships") return result def _generate_sub_queries( @@ -945,24 +945,24 @@ def _generate_sub_queries( report_context: str = "", max_queries: int = 5 ) -> List[str]: - """使用LLM生成子问题""" - system_prompt = """你是一个专业的问题分析专家。你的任务是将一个复杂问题分解为多个可以在模拟世界中独立观察的子问题。 + """Generate sub-questions using LLM""" + system_prompt = """You are a professional question analysis expert. Your task is to decompose a complex question into multiple sub-questions that can be independently observed in the simulation world. -要求: -1. 每个子问题应该足够具体,可以在模拟世界中找到相关的Agent行为或事件 -2. 子问题应该覆盖原问题的不同维度(如:谁、什么、为什么、怎么样、何时、何地) -3. 子问题应该与模拟场景相关 -4. 返回JSON格式:{"sub_queries": ["子问题1", "子问题2", ...]}""" +Requirements: +1. Each sub-question should be specific enough to find related Agent behaviors or events in the simulation world +2. Sub-questions should cover different dimensions of the original question (e.g.: who, what, why, how, when, where) +3. Sub-questions should be relevant to the simulation scenario +4. 
Return JSON format: {"sub_queries": ["sub-question 1", "sub-question 2", ...]}""" - user_prompt = f"""模拟需求背景: + user_prompt = f"""Simulation requirement background: {simulation_requirement} -{f"报告上下文:{report_context[:500]}" if report_context else ""} +{f"Report context: {report_context[:500]}" if report_context else ""} -请将以下问题分解为{max_queries}个子问题: +Please decompose the following question into {max_queries} sub-questions: {query} -返回JSON格式的子问题列表。""" +Return a JSON-format list of sub-questions.""" try: response = self.llm.chat_json( @@ -977,12 +977,12 @@ def _generate_sub_queries( return [str(sq) for sq in sub_queries[:max_queries]] except Exception as e: - logger.warning(f"生成子问题失败: {str(e)},使用默认子问题") + logger.warning(f"Failed to generate sub-questions: {str(e)}, using defaults") return [ query, - f"{query} 的主要参与者", - f"{query} 的原因和影响", - f"{query} 的发展过程" + f"{query} main participants", + f"{query} causes and impacts", + f"{query} development process" ][:max_queries] def panorama_search( @@ -993,26 +993,26 @@ def panorama_search( limit: int = 50 ) -> PanoramaResult: """ - 【PanoramaSearch - 广度搜索】 + [PanoramaSearch - Breadth Search] - 获取全貌视图,包括所有相关内容和历史/过期信息。 + Get full panorama view, including all related content and historical/expired info. 
""" - logger.info(f"PanoramaSearch 广度搜索: {query[:50]}...") + logger.info(f"PanoramaSearch breadth search: {query[:50]}...") result = PanoramaResult(query=query) - # 获取所有节点 + # Get all nodes all_nodes = self.get_all_nodes(graph_id) node_map = {n.uuid: n for n in all_nodes} result.all_nodes = all_nodes result.total_nodes = len(all_nodes) - # 获取所有边(包含时间信息) + # Get all edges (with temporal info) all_edges = self.get_all_edges(graph_id, include_temporal=True) result.all_edges = all_edges result.total_edges = len(all_edges) - # 分类事实 + # Categorize facts active_facts = [] historical_facts = [] @@ -1026,14 +1026,14 @@ def panorama_search( is_historical = edge.is_expired or edge.is_invalid if is_historical: - valid_at = edge.valid_at or "未知" - invalid_at = edge.invalid_at or edge.expired_at or "未知" + valid_at = edge.valid_at or "Unknown" + invalid_at = edge.invalid_at or edge.expired_at or "Unknown" fact_with_time = f"[{valid_at} - {invalid_at}] {edge.fact}" historical_facts.append(fact_with_time) else: active_facts.append(edge.fact) - # 基于查询进行相关性排序 + # Sort by relevance based on query query_lower = query.lower() keywords = [w.strip() for w in query_lower.replace(',', ' ').replace(',', ' ').split() if len(w.strip()) > 1] @@ -1055,7 +1055,7 @@ def relevance_score(fact: str) -> int: result.active_count = len(active_facts) result.historical_count = len(historical_facts) - logger.info(f"PanoramaSearch完成: {result.active_count}条有效, {result.historical_count}条历史") + logger.info(f"PanoramaSearch complete: {result.active_count} active, {result.historical_count} historical") return result def quick_search( @@ -1065,10 +1065,10 @@ def quick_search( limit: int = 10 ) -> SearchResult: """ - 【QuickSearch - 简单搜索】 - 快速、轻量级的检索工具。 + [QuickSearch - Simple Search] + Fast, lightweight retrieval tool. 
""" - logger.info(f"QuickSearch 简单搜索: {query[:50]}...") + logger.info(f"QuickSearch simple search: {query[:50]}...") result = self.search_graph( graph_id=graph_id, @@ -1077,7 +1077,7 @@ def quick_search( scope="edges" ) - logger.info(f"QuickSearch完成: {result.total_count}条结果") + logger.info(f"QuickSearch complete: {result.total_count} results") return result def interview_agents( @@ -1089,33 +1089,33 @@ def interview_agents( custom_questions: List[str] = None ) -> InterviewResult: """ - 【InterviewAgents - 深度采访】 + [InterviewAgents - In-depth Interview] - 调用真实的OASIS采访API,采访模拟中正在运行的Agent。 + Calls real OASIS interview API to interview running simulation Agents. This method does NOT use GraphStorage — it calls SimulationRunner and reads agent profiles from disk. """ from .simulation_runner import SimulationRunner - logger.info(f"InterviewAgents 深度采访(真实API): {interview_requirement[:50]}...") + logger.info(f"InterviewAgents in-depth interview (real API): {interview_requirement[:50]}...") result = InterviewResult( interview_topic=interview_requirement, interview_questions=custom_questions or [] ) - # Step 1: 读取人设文件 + # Step 1: Load persona files profiles = self._load_agent_profiles(simulation_id) if not profiles: - logger.warning(f"未找到模拟 {simulation_id} 的人设文件") - result.summary = "未找到可采访的Agent人设文件" + logger.warning(f"No persona files found for simulation {simulation_id}") + result.summary = "No Agent persona files found for interview" return result result.total_agents = len(profiles) - logger.info(f"加载到 {len(profiles)} 个Agent人设") + logger.info(f"Loaded {len(profiles)} Agent personas") - # Step 2: 使用LLM选择要采访的Agent + # Step 2: Use LLM to select Agents for interview selected_agents, selected_indices, selection_reasoning = self._select_agents_for_interview( profiles=profiles, interview_requirement=interview_requirement, @@ -1125,33 +1125,33 @@ def interview_agents( result.selected_agents = selected_agents result.selection_reasoning = selection_reasoning - logger.info(f"选择了 
{len(selected_agents)} 个Agent进行采访: {selected_indices}") + logger.info(f"Selected {len(selected_agents)} Agents for interview: {selected_indices}") - # Step 3: 生成采访问题 + # Step 3: Generate interview questions if not result.interview_questions: result.interview_questions = self._generate_interview_questions( interview_requirement=interview_requirement, simulation_requirement=simulation_requirement, selected_agents=selected_agents ) - logger.info(f"生成了 {len(result.interview_questions)} 个采访问题") + logger.info(f"Generated {len(result.interview_questions)} interview questions") combined_prompt = "\n".join([f"{i+1}. {q}" for i, q in enumerate(result.interview_questions)]) INTERVIEW_PROMPT_PREFIX = ( - "你正在接受一次采访。请结合你的人设、所有的过往记忆与行动," - "以纯文本方式直接回答以下问题。\n" - "回复要求:\n" - "1. 直接用自然语言回答,不要调用任何工具\n" - "2. 不要返回JSON格式或工具调用格式\n" - "3. 不要使用Markdown标题(如#、##、###)\n" - "4. 按问题编号逐一回答,每个回答以「问题X:」开头(X为问题编号)\n" - "5. 每个问题的回答之间用空行分隔\n" - "6. 回答要有实质内容,每个问题至少回答2-3句话\n\n" + "You are being interviewed. Based on your persona, all past memories and actions, " + "please directly answer the following questions in plain text.\n" + "Response requirements:\n" + "1. Answer directly in natural language, do not call any tools\n" + "2. Do not return JSON format or tool call format\n" + "3. Do not use Markdown headings (e.g. #, ##, ###)\n" + "4. Answer each question by number, starting each answer with \"Question X:\" (X is the question number)\n" + "5. Separate answers with blank lines\n" + "6. 
Answers should be substantive, at least 2-3 sentences per question\n\n" ) optimized_prompt = f"{INTERVIEW_PROMPT_PREFIX}{combined_prompt}" - # Step 4: 调用真实的采访API + # Step 4: Call real interview API try: interviews_request = [] for agent_idx in selected_indices: @@ -1160,7 +1160,7 @@ def interview_agents( "prompt": optimized_prompt }) - logger.info(f"调用批量采访API(双平台): {len(interviews_request)} 个Agent") + logger.info(f"Calling batch interview API (dual platform): {len(interviews_request)} Agents") api_result = SimulationRunner.interview_agents_batch( simulation_id=simulation_id, @@ -1169,22 +1169,22 @@ def interview_agents( timeout=180.0 ) - logger.info(f"采访API返回: {api_result.get('interviews_count', 0)} 个结果, success={api_result.get('success')}") + logger.info(f"Interview API returned: {api_result.get('interviews_count', 0)} results, success={api_result.get('success')}") if not api_result.get("success", False): - error_msg = api_result.get("error", "未知错误") - logger.warning(f"采访API返回失败: {error_msg}") - result.summary = f"采访API调用失败:{error_msg}。请检查OASIS模拟环境状态。" + error_msg = api_result.get("error", "Unknown error") + logger.warning(f"Interview API returned failure: {error_msg}") + result.summary = f"Interview API call failed: {error_msg}. Please check OASIS simulation environment status." 
return result - # Step 5: 解析API返回结果 + # Step 5: Parse API return results api_data = api_result.get("result", {}) results_dict = api_data.get("results", {}) if isinstance(api_data, dict) else {} for i, agent_idx in enumerate(selected_indices): agent = selected_agents[i] agent_name = agent.get("realname", agent.get("username", f"Agent_{agent_idx}")) - agent_role = agent.get("profession", "未知") + agent_role = agent.get("profession", "Unknown") agent_bio = agent.get("bio", "") twitter_result = results_dict.get(f"twitter_{agent_idx}", {}) @@ -1196,9 +1196,9 @@ def interview_agents( twitter_response = self._clean_tool_call_response(twitter_response) reddit_response = self._clean_tool_call_response(reddit_response) - twitter_text = twitter_response if twitter_response else "(该平台未获得回复)" - reddit_text = reddit_response if reddit_response else "(该平台未获得回复)" - response_text = f"【Twitter平台回答】\n{twitter_text}\n\n【Reddit平台回答】\n{reddit_text}" + twitter_text = twitter_response if twitter_response else "(No response from this platform)" + reddit_text = reddit_response if reddit_response else "(No response from this platform)" + response_text = f"[Twitter Platform Response]\n{twitter_text}\n\n[Reddit Platform Response]\n{reddit_text}" import re combined_responses = f"{twitter_response} {reddit_response}" @@ -1206,7 +1206,7 @@ def interview_agents( clean_text = re.sub(r'#{1,6}\s+', '', combined_responses) clean_text = re.sub(r'\{[^}]*tool_name[^}]*\}', '', clean_text) clean_text = re.sub(r'[*_`|>~\-]{2,}', '', clean_text) - clean_text = re.sub(r'问题\d+[::]\s*', '', clean_text) + clean_text = re.sub(r'Question\s*\d+[::]\s*', '', clean_text) clean_text = re.sub(r'【[^】]+】', '', clean_text) sentences = re.split(r'[。!?]', clean_text) @@ -1214,7 +1214,7 @@ def interview_agents( s.strip() for s in sentences if 20 <= len(s.strip()) <= 150 and not re.match(r'^[\s\W,,;;::、]+', s.strip()) - and not s.strip().startswith(('{', '问题')) + and not s.strip().startswith(('{', 'Question')) ] 
meaningful.sort(key=len, reverse=True) key_quotes = [s + "。" for s in meaningful[:3]] @@ -1237,29 +1237,29 @@ def interview_agents( result.interviewed_count = len(result.interviews) except ValueError as e: - logger.warning(f"采访API调用失败(环境未运行?): {e}") - result.summary = f"采访失败:{str(e)}。模拟环境可能已关闭,请确保OASIS环境正在运行。" + logger.warning(f"Interview API call failed (environment not running?): {e}") + result.summary = f"Interview failed: {str(e)}. Simulation environment may be closed. Please ensure OASIS environment is running." return result except Exception as e: - logger.error(f"采访API调用异常: {e}") + logger.error(f"Interview API call exception: {e}") import traceback logger.error(traceback.format_exc()) - result.summary = f"采访过程发生错误:{str(e)}" + result.summary = f"Error during interview: {str(e)}" return result - # Step 6: 生成采访摘要 + # Step 6: Generate interview summary if result.interviews: result.summary = self._generate_interview_summary( interviews=result.interviews, interview_requirement=interview_requirement ) - logger.info(f"InterviewAgents完成: 采访了 {result.interviewed_count} 个Agent(双平台)") + logger.info(f"InterviewAgents complete: interviewed {result.interviewed_count} Agents (dual platform)") return result @staticmethod def _clean_tool_call_response(response: str) -> str: - """清理 Agent 回复中的 JSON 工具调用包裹,提取实际内容""" + """Clean JSON tool call wrappers from Agent responses, extract actual content""" if not response or not response.strip().startswith('{'): return response text = response.strip() @@ -1279,7 +1279,7 @@ def _clean_tool_call_response(response: str) -> str: return response def _load_agent_profiles(self, simulation_id: str) -> List[Dict[str, Any]]: - """加载模拟的Agent人设文件""" + """Load Agent persona files for simulation""" import os import csv @@ -1290,18 +1290,18 @@ def _load_agent_profiles(self, simulation_id: str) -> List[Dict[str, Any]]: profiles = [] - # 优先尝试读取Reddit JSON格式 + # Prefer reading Reddit JSON format reddit_profile_path = os.path.join(sim_dir, 
"reddit_profiles.json") if os.path.exists(reddit_profile_path): try: with open(reddit_profile_path, 'r', encoding='utf-8') as f: profiles = json.load(f) - logger.info(f"从 reddit_profiles.json 加载了 {len(profiles)} 个人设") + logger.info(f"Loaded {len(profiles)} personas from reddit_profiles.json") return profiles except Exception as e: - logger.warning(f"读取 reddit_profiles.json 失败: {e}") + logger.warning(f"Failed to read reddit_profiles.json: {e}") - # 尝试读取Twitter CSV格式 + # Try reading Twitter CSV format twitter_profile_path = os.path.join(sim_dir, "twitter_profiles.csv") if os.path.exists(twitter_profile_path): try: @@ -1313,12 +1313,12 @@ def _load_agent_profiles(self, simulation_id: str) -> List[Dict[str, Any]]: "username": row.get("username", ""), "bio": row.get("description", ""), "persona": row.get("user_char", ""), - "profession": "未知" + "profession": "Unknown" }) - logger.info(f"从 twitter_profiles.csv 加载了 {len(profiles)} 个人设") + logger.info(f"Loaded {len(profiles)} personas from twitter_profiles.csv") return profiles except Exception as e: - logger.warning(f"读取 twitter_profiles.csv 失败: {e}") + logger.warning(f"Failed to read twitter_profiles.csv: {e}") return profiles @@ -1329,43 +1329,43 @@ def _select_agents_for_interview( simulation_requirement: str, max_agents: int ) -> tuple: - """使用LLM选择要采访的Agent""" + """Use LLM to select Agents for interview""" agent_summaries = [] for i, profile in enumerate(profiles): summary = { "index": i, "name": profile.get("realname", profile.get("username", f"Agent_{i}")), - "profession": profile.get("profession", "未知"), + "profession": profile.get("profession", "Unknown"), "bio": profile.get("bio", "")[:200], "interested_topics": profile.get("interested_topics", []) } agent_summaries.append(summary) - system_prompt = """你是一个专业的采访策划专家。你的任务是根据采访需求,从模拟Agent列表中选择最适合采访的对象。 + system_prompt = """You are a professional interview planning expert. 
Your task is to select the most suitable interview subjects from a simulation Agent list based on interview requirements. -选择标准: -1. Agent的身份/职业与采访主题相关 -2. Agent可能持有独特或有价值的观点 -3. 选择多样化的视角(如:支持方、反对方、中立方、专业人士等) -4. 优先选择与事件直接相关的角色 +Selection criteria: +1. Agent identity/profession is related to interview topic +2. Agent may hold unique or valuable perspectives +3. Select diverse viewpoints (e.g.: supporters, opponents, neutrals, professionals) +4. Prioritize roles directly related to the event -返回JSON格式: +Return JSON format: { - "selected_indices": [选中Agent的索引列表], - "reasoning": "选择理由说明" + "selected_indices": [list of selected Agent indices], + "reasoning": "explanation of selection reasoning" }""" - user_prompt = f"""采访需求: + user_prompt = f"""Interview requirements: {interview_requirement} -模拟背景: -{simulation_requirement if simulation_requirement else "未提供"} +Simulation background: +{simulation_requirement if simulation_requirement else "Not provided"} -可选择的Agent列表(共{len(agent_summaries)}个): +Available Agent list ({len(agent_summaries)} total): {json.dumps(agent_summaries, ensure_ascii=False, indent=2)} -请选择最多{max_agents}个最适合采访的Agent,并说明选择理由。""" +Please select up to {max_agents} most suitable Agents for interview and explain your selection reasoning.""" try: response = self.llm.chat_json( @@ -1377,7 +1377,7 @@ def _select_agents_for_interview( ) selected_indices = response.get("selected_indices", [])[:max_agents] - reasoning = response.get("reasoning", "基于相关性自动选择") + reasoning = response.get("reasoning", "Auto-selected based on relevance") selected_agents = [] valid_indices = [] @@ -1389,10 +1389,10 @@ def _select_agents_for_interview( return selected_agents, valid_indices, reasoning except Exception as e: - logger.warning(f"LLM选择Agent失败,使用默认选择: {e}") + logger.warning(f"LLM Agent selection failed, using default selection: {e}") selected = profiles[:max_agents] indices = list(range(min(max_agents, len(profiles)))) - return selected, indices, "使用默认选择策略" + return 
selected, indices, "Using default selection strategy" def _generate_interview_questions( self, @@ -1400,29 +1400,29 @@ def _generate_interview_questions( simulation_requirement: str, selected_agents: List[Dict[str, Any]] ) -> List[str]: - """使用LLM生成采访问题""" + """Generate interview questions using LLM""" - agent_roles = [a.get("profession", "未知") for a in selected_agents] + agent_roles = [a.get("profession", "Unknown") for a in selected_agents] - system_prompt = """你是一个专业的记者/采访者。根据采访需求,生成3-5个深度采访问题。 + system_prompt = """You are a professional journalist/interviewer. Generate 3-5 in-depth interview questions based on interview requirements. -问题要求: -1. 开放性问题,鼓励详细回答 -2. 针对不同角色可能有不同答案 -3. 涵盖事实、观点、感受等多个维度 -4. 语言自然,像真实采访一样 -5. 每个问题控制在50字以内,简洁明了 -6. 直接提问,不要包含背景说明或前缀 +Question requirements: +1. Open-ended questions that encourage detailed answers +2. Different roles may have different answers +3. Cover multiple dimensions: facts, opinions, feelings +4. Natural language, like a real interview +5. Keep each question under 50 words, concise and clear +6. 
Ask directly, do not include background descriptions or prefixes -返回JSON格式:{"questions": ["问题1", "问题2", ...]}""" +Return JSON format: {"questions": ["question 1", "question 2", ...]}""" - user_prompt = f"""采访需求:{interview_requirement} + user_prompt = f"""Interview requirements: {interview_requirement} -模拟背景:{simulation_requirement if simulation_requirement else "未提供"} +Simulation background: {simulation_requirement if simulation_requirement else "Not provided"} -采访对象角色:{', '.join(agent_roles)} +Interviewee roles: {', '.join(agent_roles)} -请生成3-5个采访问题。""" +Please generate 3-5 interview questions.""" try: response = self.llm.chat_json( @@ -1433,14 +1433,14 @@ def _generate_interview_questions( temperature=0.5 ) - return response.get("questions", [f"关于{interview_requirement},您有什么看法?"]) + return response.get("questions", [f"What are your views on {interview_requirement}?"]) except Exception as e: - logger.warning(f"生成采访问题失败: {e}") + logger.warning(f"Failed to generate interview questions: {e}") return [ - f"关于{interview_requirement},您的观点是什么?", - "这件事对您或您所代表的群体有什么影响?", - "您认为应该如何解决或改进这个问题?" + f"What is your perspective on {interview_requirement}?", + "What impact does this have on you or the group you represent?", + "How do you think this issue should be resolved or improved?" ] def _generate_interview_summary( @@ -1448,37 +1448,37 @@ def _generate_interview_summary( interviews: List[AgentInterview], interview_requirement: str ) -> str: - """生成采访摘要""" + """Generate interview summary""" if not interviews: - return "未完成任何采访" + return "No interviews completed" interview_texts = [] for interview in interviews: interview_texts.append(f"【{interview.agent_name}({interview.agent_role})】\n{interview.response[:500]}") - system_prompt = """你是一个专业的新闻编辑。请根据多位受访者的回答,生成一份采访摘要。 + system_prompt = """You are a professional news editor. Generate an interview summary based on multiple respondents' answers. -摘要要求: -1. 提炼各方主要观点 -2. 指出观点的共识和分歧 -3. 突出有价值的引言 -4. 客观中立,不偏袒任何一方 -5. 
控制在1000字内 +Summary requirements: +1. Extract main viewpoints from all parties +2. Identify consensus and disagreements +3. Highlight valuable quotes +4. Be objective and neutral, do not favor any party +5. Keep within 1000 words -格式约束(必须遵守): -- 使用纯文本段落,用空行分隔不同部分 -- 不要使用Markdown标题(如#、##、###) -- 不要使用分割线(如---、***) -- 引用受访者原话时使用中文引号「」 -- 可以使用**加粗**标记关键词,但不要使用其他Markdown语法""" +Format constraints (must follow): +- Use plain text paragraphs, separate sections with blank lines +- Do not use Markdown headings (e.g. #, ##, ###) +- Do not use divider lines (e.g. ---, ***) +- Use quotation marks when citing respondent quotes +- You may use **bold** for keywords, but do not use other Markdown syntax""" - user_prompt = f"""采访主题:{interview_requirement} + user_prompt = f"""Interview topic: {interview_requirement} -采访内容: +Interview content: {"".join(interview_texts)} -请生成采访摘要。""" +Please generate an interview summary.""" try: summary = self.llm.chat( @@ -1492,5 +1492,5 @@ def _generate_interview_summary( return summary except Exception as e: - logger.warning(f"生成采访摘要失败: {e}") - return f"共采访了{len(interviews)}位受访者,包括:" + "、".join([i.agent_name for i in interviews]) + logger.warning(f"Failed to generate interview summary: {e}") + return f"Interviewed {len(interviews)} respondents, including: " + ", ".join([i.agent_name for i in interviews]) diff --git a/backend/app/services/oasis_profile_generator.py b/backend/app/services/oasis_profile_generator.py index 10dcc52..d6c5229 100644 --- a/backend/app/services/oasis_profile_generator.py +++ b/backend/app/services/oasis_profile_generator.py @@ -1,11 +1,11 @@ """ -OASIS Agent Profile生成器 -将图谱中的实体转换为OASIS模拟平台所需的Agent Profile格式 +OASIS Agent Profile Generator +Converts entities from the graph into Agent Profile format required by OASIS simulation platform -优化改进: -1. 调用图谱检索功能二次丰富节点信息 -2. 优化提示词生成非常详细的人设 -3. 区分个人实体和抽象群体实体 +Optimizations: +1. Calls graph retrieval to further enrich node info +2. Optimized prompts to generate very detailed personas +3. 
Distinguishes individual entities from abstract group entities """ import json @@ -27,23 +27,23 @@ @dataclass class OasisAgentProfile: - """OASIS Agent Profile数据结构""" - # 通用字段 + """OASIS Agent Profile data structure""" + # Common fields user_id: int user_name: str name: str bio: str persona: str - # 可选字段 - Reddit风格 + # Optional fields - Reddit style karma: int = 1000 - # 可选字段 - Twitter风格 + # Optional fields - Twitter style friend_count: int = 100 follower_count: int = 150 statuses_count: int = 500 - # 额外人设信息 + # Additional persona info age: Optional[int] = None gender: Optional[str] = None mbti: Optional[str] = None @@ -51,17 +51,17 @@ class OasisAgentProfile: profession: Optional[str] = None interested_topics: List[str] = field(default_factory=list) - # 来源实体信息 + # Source entity info source_entity_uuid: Optional[str] = None source_entity_type: Optional[str] = None created_at: str = field(default_factory=lambda: datetime.now().strftime("%Y-%m-%d")) def to_reddit_format(self) -> Dict[str, Any]: - """转换为Reddit平台格式""" + """Convert to Reddit platform format""" profile = { "user_id": self.user_id, - "username": self.user_name, # OASIS 库要求字段名为 username(无下划线) + "username": self.user_name, # OASIS library requires field name username (no underscore) "name": self.name, "bio": self.bio, "persona": self.persona, @@ -69,7 +69,7 @@ def to_reddit_format(self) -> Dict[str, Any]: "created_at": self.created_at, } - # 添加额外人设信息(如果有) + # Add additional persona info (if available) if self.age: profile["age"] = self.age if self.gender: @@ -86,10 +86,10 @@ def to_reddit_format(self) -> Dict[str, Any]: return profile def to_twitter_format(self) -> Dict[str, Any]: - """转换为Twitter平台格式""" + """Convert to Twitter platform format""" profile = { "user_id": self.user_id, - "username": self.user_name, # OASIS 库要求字段名为 username(无下划线) + "username": self.user_name, # OASIS library requires field name username (no underscore) "name": self.name, "bio": self.bio, "persona": self.persona, @@ -99,7 +99,7 
@@ def to_twitter_format(self) -> Dict[str, Any]: "created_at": self.created_at, } - # 添加额外人设信息 + # Add additional persona info if self.age: profile["age"] = self.age if self.gender: @@ -116,7 +116,7 @@ def to_twitter_format(self) -> Dict[str, Any]: return profile def to_dict(self) -> Dict[str, Any]: - """转换为完整字典格式""" + """Convert to complete dictionary format""" return { "user_id": self.user_id, "user_name": self.user_name, @@ -141,17 +141,17 @@ def to_dict(self) -> Dict[str, Any]: class OasisProfileGenerator: """ - OASIS Profile生成器 + OASIS Profile Generator - 将图谱中的实体转换为OASIS模拟所需的Agent Profile + Converts graph entities into Agent Profiles required for OASIS simulation - 优化特性: - 1. 调用图谱检索功能获取更丰富的上下文 - 2. 生成非常详细的人设(包括基本信息、职业经历、性格特征、社交媒体行为等) - 3. 区分个人实体和抽象群体实体 + Optimized features: + 1. Calls graph retrieval for richer context + 2. Generates very detailed personas (including basic info, career history, personality traits, social media behavior, etc.) + 3. Distinguishes individual entities from abstract group entities """ - # MBTI类型列表 + # MBTI type list MBTI_TYPES = [ "INTJ", "INTP", "ENTJ", "ENTP", "INFJ", "INFP", "ENFJ", "ENFP", @@ -159,19 +159,19 @@ class OasisProfileGenerator: "ISTP", "ISFP", "ESTP", "ESFP" ] - # 常见国家列表 + # Common countries list COUNTRIES = [ "China", "US", "UK", "Japan", "Germany", "France", "Canada", "Australia", "Brazil", "India", "South Korea" ] - # 个人类型实体(需要生成具体人设) + # Individual entity types (need specific persona generation) INDIVIDUAL_ENTITY_TYPES = [ "student", "alumni", "professor", "person", "publicfigure", "expert", "faculty", "official", "journalist", "activist" ] - # 群体/机构类型实体(需要生成群体代表人设) + # Group/institution entity types (need representative account persona) GROUP_ENTITY_TYPES = [ "university", "governmentagency", "organization", "ngo", "mediaoutlet", "company", "institution", "group", "community" @@ -190,7 +190,7 @@ def __init__( self.model_name = model_name or Config.LLM_MODEL_NAME if not self.api_key: - raise 
ValueError("LLM_API_KEY 未配置") + raise ValueError("LLM_API_KEY not configured") self.client = OpenAI( api_key=self.api_key, @@ -208,27 +208,27 @@ def generate_profile_from_entity( use_llm: bool = True ) -> OasisAgentProfile: """ - 从图谱实体生成OASIS Agent Profile + Generate OASIS Agent Profile from graph entity Args: - entity: 图谱实体节点 - user_id: 用户ID(用于OASIS) - use_llm: 是否使用LLM生成详细人设 + entity: Graph entity node + user_id: User ID (for OASIS) + use_llm: Whether to use LLM to generate detailed persona Returns: OasisAgentProfile """ entity_type = entity.get_entity_type() or "Entity" - # 基础信息 + # Basic info name = entity.name user_name = self._generate_username(name) - # 构建上下文信息 + # Build context info context = self._build_entity_context(entity) if use_llm: - # 使用LLM生成详细人设 + # Use LLM to generate detailed persona profile_data = self._generate_profile_with_llm( entity_name=name, entity_type=entity_type, @@ -237,7 +237,7 @@ def generate_profile_from_entity( context=context ) else: - # 使用规则生成基础人设 + # Use rules to generate basic persona profile_data = self._generate_profile_rule_based( entity_name=name, entity_type=entity_type, @@ -266,26 +266,26 @@ def generate_profile_from_entity( ) def _generate_username(self, name: str) -> str: - """生成用户名""" - # 移除特殊字符,转换为小写 + """Generate username""" + # Remove special chars, convert to lowercase username = name.lower().replace(" ", "_") username = ''.join(c for c in username if c.isalnum() or c == '_') - # 添加随机后缀避免重复 + # Add random suffix to avoid duplicates suffix = random.randint(100, 999) return f"{username}_{suffix}" def _search_graph_for_entity(self, entity: EntityNode) -> Dict[str, Any]: """ - 使用 GraphStorage 混合搜索获取实体相关的丰富信息 + Use GraphStorage hybrid search to get rich entity-related info Uses storage.search() (hybrid vector + BM25) for both edges and nodes. 
Args: - entity: 实体节点对象 + entity: Entity node object Returns: - 包含facts, node_summaries, context的字典 + Dictionary containing facts, node_summaries, context """ if not self.storage: return {"facts": [], "node_summaries": [], "context": ""} @@ -299,10 +299,10 @@ def _search_graph_for_entity(self, entity: EntityNode) -> Dict[str, Any]: } if not self.graph_id: - logger.debug(f"跳过图谱检索:未设置graph_id") + logger.debug(f"Skipping graph retrieval: graph_id not set") return results - comprehensive_query = f"关于{entity_name}的所有信息、活动、事件、关系和背景" + comprehensive_query = f"All information, activities, events, relationships and background about {entity_name}" try: # Search edges (facts) @@ -337,49 +337,49 @@ def _search_graph_for_entity(self, entity: EntityNode) -> Dict[str, Any]: all_summaries.add(summary) name = node.get('name', '') if name and name != entity_name: - all_summaries.add(f"相关实体: {name}") + all_summaries.add(f"Related entity: {name}") results["node_summaries"] = list(all_summaries) # Build combined context context_parts = [] if results["facts"]: - context_parts.append("事实信息:\n" + "\n".join(f"- {f}" for f in results["facts"][:20])) + context_parts.append("Facts:\n" + "\n".join(f"- {f}" for f in results["facts"][:20])) if results["node_summaries"]: - context_parts.append("相关实体:\n" + "\n".join(f"- {s}" for s in results["node_summaries"][:10])) + context_parts.append("Related entities:\n" + "\n".join(f"- {s}" for s in results["node_summaries"][:10])) results["context"] = "\n\n".join(context_parts) - logger.info(f"图谱混合检索完成: {entity_name}, 获取 {len(results['facts'])} 条事实, {len(results['node_summaries'])} 个相关节点") + logger.info(f"Graph hybrid search complete: {entity_name}, retrieved {len(results['facts'])} facts, {len(results['node_summaries'])} related nodes") except Exception as e: - logger.warning(f"图谱检索失败 ({entity_name}): {e}") + logger.warning(f"Graph retrieval failed ({entity_name}): {e}") return results def _build_entity_context(self, entity: EntityNode) -> str: """ - 
构建实体的完整上下文信息 + Build complete context info for entity - 包括: - 1. 实体本身的边信息(事实) - 2. 关联节点的详细信息 - 3. 图谱混合检索到的丰富信息 + Includes: + 1. Edge info from the entity itself (facts) + 2. Detailed info of associated nodes + 3. Rich info from graph hybrid retrieval """ context_parts = [] - # 1. 添加实体属性信息 + # 1. Add entity attribute info if entity.attributes: attrs = [] for key, value in entity.attributes.items(): if value and str(value).strip(): attrs.append(f"- {key}: {value}") if attrs: - context_parts.append("### 实体属性\n" + "\n".join(attrs)) + context_parts.append("### Entity Attributes\n" + "\n".join(attrs)) - # 2. 添加相关边信息(事实/关系) + # 2. Add related edge info (facts/relationships) existing_facts = set() if entity.related_edges: relationships = [] - for edge in entity.related_edges: # 不限制数量 + for edge in entity.related_edges: # No limit on count fact = edge.get("fact", "") edge_name = edge.get("edge_name", "") direction = edge.get("direction", "") @@ -389,22 +389,22 @@ def _build_entity_context(self, entity: EntityNode) -> str: existing_facts.add(fact) elif edge_name: if direction == "outgoing": - relationships.append(f"- {entity.name} --[{edge_name}]--> (相关实体)") + relationships.append(f"- {entity.name} --[{edge_name}]--> (related entity)") else: - relationships.append(f"- (相关实体) --[{edge_name}]--> {entity.name}") + relationships.append(f"- (related entity) --[{edge_name}]--> {entity.name}") if relationships: - context_parts.append("### 相关事实和关系\n" + "\n".join(relationships)) + context_parts.append("### Related Facts and Relationships\n" + "\n".join(relationships)) - # 3. 添加关联节点的详细信息 + # 3. 
Add detailed info of associated nodes if entity.related_nodes: related_info = [] - for node in entity.related_nodes: # 不限制数量 + for node in entity.related_nodes: # No limit on count node_name = node.get("name", "") node_labels = node.get("labels", []) node_summary = node.get("summary", "") - # 过滤掉默认标签 + # Filter out default labels custom_labels = [l for l in node_labels if l not in ["Entity", "Node"]] label_str = f" ({', '.join(custom_labels)})" if custom_labels else "" @@ -414,28 +414,28 @@ def _build_entity_context(self, entity: EntityNode) -> str: related_info.append(f"- **{node_name}**{label_str}") if related_info: - context_parts.append("### 关联实体信息\n" + "\n".join(related_info)) + context_parts.append("### Associated Entity Info\n" + "\n".join(related_info)) - # 4. 使用图谱混合检索获取更丰富的信息 + # 4. Use graph hybrid retrieval for richer info graph_results = self._search_graph_for_entity(entity) if graph_results.get("facts"): - # 去重:排除已存在的事实 + # Deduplicate: exclude existing facts new_facts = [f for f in graph_results["facts"] if f not in existing_facts] if new_facts: - context_parts.append("### 图谱检索到的事实信息\n" + "\n".join(f"- {f}" for f in new_facts[:15])) + context_parts.append("### Facts Retrieved from Graph\n" + "\n".join(f"- {f}" for f in new_facts[:15])) if graph_results.get("node_summaries"): - context_parts.append("### 图谱检索到的相关节点\n" + "\n".join(f"- {s}" for s in graph_results["node_summaries"][:10])) + context_parts.append("### Related Nodes Retrieved from Graph\n" + "\n".join(f"- {s}" for s in graph_results["node_summaries"][:10])) return "\n\n".join(context_parts) def _is_individual_entity(self, entity_type: str) -> bool: - """判断是否是个人类型实体""" + """Determine if entity is individual type""" return entity_type.lower() in self.INDIVIDUAL_ENTITY_TYPES def _is_group_entity(self, entity_type: str) -> bool: - """判断是否是群体/机构类型实体""" + """Determine if entity is group/institution type""" return entity_type.lower() in self.GROUP_ENTITY_TYPES def _generate_profile_with_llm( @@ 
-447,11 +447,11 @@ def _generate_profile_with_llm( context: str ) -> Dict[str, Any]: """ - 使用LLM生成非常详细的人设 + Use LLM to generate very detailed persona - 根据实体类型区分: - - 个人实体:生成具体的人物设定 - - 群体/机构实体:生成代表性账号设定 + Differentiated by entity type: + - Individual entities: generate specific character settings + - Group/institution entities: generate representative account settings """ is_individual = self._is_individual_entity(entity_type) @@ -465,7 +465,7 @@ def _generate_profile_with_llm( entity_name, entity_type, entity_summary, entity_attributes, context ) - # 尝试多次生成,直到成功或达到最大重试次数 + # Try multiple times until success or max retries reached max_attempts = 3 last_error = None @@ -478,34 +478,34 @@ def _generate_profile_with_llm( {"role": "user", "content": prompt} ], response_format={"type": "json_object"}, - temperature=0.7 - (attempt * 0.1) # 每次重试降低温度 - # 不设置max_tokens,让LLM自由发挥 + temperature=0.7 - (attempt * 0.1) # Lower temperature with each retry + # Do not set max_tokens, let LLM generate freely ) content = response.choices[0].message.content - # 检查是否被截断(finish_reason不是'stop') + # Check if truncated (finish_reason is not 'stop') finish_reason = response.choices[0].finish_reason if finish_reason == 'length': - logger.warning(f"LLM输出被截断 (attempt {attempt+1}), 尝试修复...") + logger.warning(f"LLM output truncated (attempt {attempt+1}), attempting repair...") content = self._fix_truncated_json(content) - # 尝试解析JSON + # Try to parse JSON try: result = json.loads(content) - # 验证必需字段 + # Validate required fields if "bio" not in result or not result["bio"]: result["bio"] = entity_summary[:200] if entity_summary else f"{entity_type}: {entity_name}" if "persona" not in result or not result["persona"]: - result["persona"] = entity_summary or f"{entity_name}是一个{entity_type}。" + result["persona"] = entity_summary or f"{entity_name} is a {entity_type}." 
return result except json.JSONDecodeError as je: - logger.warning(f"JSON解析失败 (attempt {attempt+1}): {str(je)[:80]}") + logger.warning(f"JSON parse failed (attempt {attempt+1}): {str(je)[:80]}") - # 尝试修复JSON + # Try to fix JSON result = self._try_fix_json(content, entity_name, entity_type, entity_summary) if result.get("_fixed"): del result["_fixed"] @@ -514,75 +514,75 @@ def _generate_profile_with_llm( last_error = je except Exception as e: - logger.warning(f"LLM调用失败 (attempt {attempt+1}): {str(e)[:80]}") + logger.warning(f"LLM call failed (attempt {attempt+1}): {str(e)[:80]}") last_error = e import time - time.sleep(1 * (attempt + 1)) # 指数退避 + time.sleep(1 * (attempt + 1)) # Exponential backoff - logger.warning(f"LLM生成人设失败({max_attempts}次尝试): {last_error}, 使用规则生成") + logger.warning(f"LLM persona generation failed ({max_attempts} attempts): {last_error}, using rule-based generation") return self._generate_profile_rule_based( entity_name, entity_type, entity_summary, entity_attributes ) def _fix_truncated_json(self, content: str) -> str: - """修复被截断的JSON(输出被max_tokens限制截断)""" + """Fix truncated JSON (output truncated by max_tokens limit)""" import re - # 如果JSON被截断,尝试闭合它 + # If JSON is truncated, try to close it content = content.strip() - # 计算未闭合的括号 + # Count unclosed brackets open_braces = content.count('{') - content.count('}') open_brackets = content.count('[') - content.count(']') - # 检查是否有未闭合的字符串 - # 简单检查:如果最后一个引号后没有逗号或闭合括号,可能是字符串被截断 + # Check for unclosed strings + # Simple check: if no comma or closing bracket after last quote, string may be truncated if content and content[-1] not in '",}]': - # 尝试闭合字符串 + # Try to close string content += '"' - # 闭合括号 + # Close brackets content += ']' * open_brackets content += '}' * open_braces return content def _try_fix_json(self, content: str, entity_name: str, entity_type: str, entity_summary: str = "") -> Dict[str, Any]: - """尝试修复损坏的JSON""" + """Try to fix broken JSON""" import re - # 1. 首先尝试修复被截断的情况 + # 1. 
First try to fix truncated case content = self._fix_truncated_json(content) - # 2. 尝试提取JSON部分 + # 2. Try to extract JSON part json_match = re.search(r'\{[\s\S]*\}', content) if json_match: json_str = json_match.group() - # 3. 处理字符串中的换行符问题 - # 找到所有字符串值并替换其中的换行符 + # 3. Handle newline issues in strings + # Find all string values and replace newlines in them def fix_string_newlines(match): s = match.group(0) - # 替换字符串内的实际换行符为空格 + # Replace actual newlines in string with spaces s = s.replace('\n', ' ').replace('\r', ' ') - # 替换多余空格 + # Replace excess spaces s = re.sub(r'\s+', ' ', s) return s - # 匹配JSON字符串值 + # Match JSON string values json_str = re.sub(r'"[^"\\]*(?:\\.[^"\\]*)*"', fix_string_newlines, json_str) - # 4. 尝试解析 + # 4. Try to parse try: result = json.loads(json_str) result["_fixed"] = True return result except json.JSONDecodeError as e: - # 5. 如果还是失败,尝试更激进的修复 + # 5. If still failing, try more aggressive repair try: - # 移除所有控制字符 + # Remove all control characters json_str = re.sub(r'[\x00-\x1f\x7f-\x9f]', ' ', json_str) - # 替换所有连续空白 + # Replace all consecutive whitespace json_str = re.sub(r'\s+', ' ', json_str) result = json.loads(json_str) result["_fixed"] = True @@ -590,32 +590,32 @@ def fix_string_newlines(match): except: pass - # 6. 尝试从内容中提取部分信息 + # 6. 
Try to extract partial info from content bio_match = re.search(r'"bio"\s*:\s*"([^"]*)"', content) - persona_match = re.search(r'"persona"\s*:\s*"([^"]*)', content) # 可能被截断 + persona_match = re.search(r'"persona"\s*:\s*"([^"]*)', content) # May be truncated bio = bio_match.group(1) if bio_match else (entity_summary[:200] if entity_summary else f"{entity_type}: {entity_name}") - persona = persona_match.group(1) if persona_match else (entity_summary or f"{entity_name}是一个{entity_type}。") + persona = persona_match.group(1) if persona_match else (entity_summary or f"{entity_name} is a {entity_type}.") - # 如果提取到了有意义的内容,标记为已修复 + # If meaningful content extracted, mark as fixed if bio_match or persona_match: - logger.info(f"从损坏的JSON中提取了部分信息") + logger.info(f"Extracted partial info from broken JSON") return { "bio": bio, "persona": persona, "_fixed": True } - # 7. 完全失败,返回基础结构 - logger.warning(f"JSON修复失败,返回基础结构") + # 7. Complete failure, return basic structure + logger.warning(f"JSON repair failed, returning basic structure") return { "bio": entity_summary[:200] if entity_summary else f"{entity_type}: {entity_name}", - "persona": entity_summary or f"{entity_name}是一个{entity_type}。" + "persona": entity_summary or f"{entity_name} is a {entity_type}." } def _get_system_prompt(self, is_individual: bool) -> str: - """获取系统提示词""" - base_prompt = "你是社交媒体用户画像生成专家。生成详细、真实的人设用于舆论模拟,最大程度还原已有现实情况。必须返回有效的JSON格式,所有字符串值不能包含未转义的换行符。使用中文。" + """Get system prompt""" + base_prompt = "You are a social media user persona generation expert. Generate detailed, realistic personas for opinion simulation, maximally restoring existing real-world conditions. Must return valid JSON format. All string values must not contain unescaped newlines. Always respond in English." 
return base_prompt def _build_individual_persona_prompt( @@ -626,45 +626,45 @@ def _build_individual_persona_prompt( entity_attributes: Dict[str, Any], context: str ) -> str: - """构建个人实体的详细人设提示词""" + """Build detailed persona prompt for individual entities""" - attrs_str = json.dumps(entity_attributes, ensure_ascii=False) if entity_attributes else "无" - context_str = context[:3000] if context else "无额外上下文" + attrs_str = json.dumps(entity_attributes, ensure_ascii=False) if entity_attributes else "None" + context_str = context[:3000] if context else "No additional context" - return f"""为实体生成详细的社交媒体用户人设,最大程度还原已有现实情况。 + return f"""Generate a detailed social media user persona for this entity, maximally restoring existing real-world conditions. -实体名称: {entity_name} -实体类型: {entity_type} -实体摘要: {entity_summary} -实体属性: {attrs_str} +Entity name: {entity_name} +Entity type: {entity_type} +Entity summary: {entity_summary} +Entity attributes: {attrs_str} -上下文信息: +Context info: {context_str} -请生成JSON,包含以下字段: +Please generate JSON with the following fields: -1. bio: 社交媒体简介,200字 -2. persona: 详细人设描述(2000字的纯文本),需包含: - - 基本信息(年龄、职业、教育背景、所在地) - - 人物背景(重要经历、与事件的关联、社会关系) - - 性格特征(MBTI类型、核心性格、情绪表达方式) - - 社交媒体行为(发帖频率、内容偏好、互动风格、语言特点) - - 立场观点(对话题的态度、可能被激怒/感动的内容) - - 独特特征(口头禅、特殊经历、个人爱好) - - 个人记忆(人设的重要部分,要介绍这个个体与事件的关联,以及这个个体在事件中的已有动作与反应) -3. age: 年龄数字(必须是整数) -4. gender: 性别,必须是英文: "male" 或 "female" -5. mbti: MBTI类型(如INTJ、ENFP等) -6. country: 国家(使用中文,如"中国") -7. profession: 职业 -8. interested_topics: 感兴趣话题数组 +1. bio: Social media bio, 200 characters +2. 
persona: Detailed persona description (2000 characters of plain text), should include: + - Basic info (age, profession, educational background, location) + - Background (important experiences, connection to events, social relationships) + - Personality traits (MBTI type, core personality, emotional expression style) + - Social media behavior (posting frequency, content preferences, interaction style, language characteristics) + - Stance and views (attitude toward topics, content that may anger/move them) + - Unique features (catchphrases, special experiences, personal hobbies) + - Personal memory (important part of persona: introduce this individual's connection to the event, and their existing actions and reactions in the event) +3. age: Age number (must be integer) +4. gender: Gender, must be English: "male" or "female" +5. mbti: MBTI type (e.g. INTJ, ENFP) +6. country: Country (use English, e.g. "China") +7. profession: Profession +8. interested_topics: Array of interest topics -重要: -- 所有字段值必须是字符串或数字,不要使用换行符 -- persona必须是一段连贯的文字描述 -- 使用中文(除了gender字段必须用英文male/female) -- 内容要与实体信息保持一致 -- age必须是有效的整数,gender必须是"male"或"female" +Important: +- All field values must be strings or numbers, do not use newlines +- persona must be a coherent text description +- Always respond in English +- Content must be consistent with entity info +- age must be a valid integer, gender must be "male" or "female" """ def _build_group_persona_prompt( self, @@ -675,45 +675,45 @@ def _build_group_persona_prompt( entity_attributes: Dict[str, Any], context: str ) -> str: - """构建群体/机构实体的详细人设提示词""" + """Build group/institution entity detailed persona prompt""" - attrs_str = json.dumps(entity_attributes, ensure_ascii=False) if entity_attributes else "无" - context_str = context[:3000] if context else "无额外上下文" + attrs_str = json.dumps(entity_attributes, ensure_ascii=False) if entity_attributes else "None" + context_str = context[:3000] if context else "No additional context" - return 
f"""为机构/群体实体生成详细的社交媒体账号设定,最大程度还原已有现实情况。 + return f"""Generate a detailed social media account profile for this institution/group entity, maximally restoring existing real-world conditions. -实体名称: {entity_name} -实体类型: {entity_type} -实体摘要: {entity_summary} -实体属性: {attrs_str} +Entity name: {entity_name} +Entity type: {entity_type} +Entity summary: {entity_summary} +Entity attributes: {attrs_str} -上下文信息: +Context info: {context_str} -请生成JSON,包含以下字段: +Please generate JSON with the following fields: -1. bio: 官方账号简介,200字,专业得体 -2. persona: 详细账号设定描述(2000字的纯文本),需包含: - - 机构基本信息(正式名称、机构性质、成立背景、主要职能) - - 账号定位(账号类型、目标受众、核心功能) - - 发言风格(语言特点、常用表达、禁忌话题) - - 发布内容特点(内容类型、发布频率、活跃时间段) - - 立场态度(对核心话题的官方立场、面对争议的处理方式) - - 特殊说明(代表的群体画像、运营习惯) - - 机构记忆(机构人设的重要部分,要介绍这个机构与事件的关联,以及这个机构在事件中的已有动作与反应) -3. age: 固定填30(机构账号的虚拟年龄) -4. gender: 固定填"other"(机构账号使用other表示非个人) -5. mbti: MBTI类型,用于描述账号风格,如ISTJ代表严谨保守 -6. country: 国家(使用中文,如"中国") -7. profession: 机构职能描述 -8. interested_topics: 关注领域数组 +1. bio: Official account bio, 200 characters, professional and appropriate +2. persona: Detailed account profile description (2000 characters of plain text), should include: + - Institution basic info (official name, nature, founding background, main functions) + - Account positioning (account type, target audience, core functions) + - Speaking style (language characteristics, common expressions, taboo topics) + - Content characteristics (content types, posting frequency, active hours) + - Stance and attitude (official stance on core topics, how controversies are handled) + - Special notes (represented group profile, operational habits) + - Institutional memory (important part of persona: introduce this institution's connection to the event, and its existing actions and reactions in the event) +3. age: Fixed at 30 (virtual age for institutional accounts) +4. gender: Fixed "other" (institutional accounts use other to indicate non-individual) +5. mbti: MBTI type, used to describe account style, e.g. 
ISTJ represents rigorous and conservative +6. country: Country (use English, e.g. "China") +7. profession: Institutional function description +8. interested_topics: Array of focus areas -重要: -- 所有字段值必须是字符串或数字,不允许null值 -- persona必须是一段连贯的文字描述,不要使用换行符 -- 使用中文(除了gender字段必须用英文"other") -- age必须是整数30,gender必须是字符串"other" -- 机构账号发言要符合其身份定位""" +Important: +- All field values must be strings or numbers, null values not allowed +- persona must be a coherent text description, do not use newlines +- Always respond in English +- age must be integer 30, gender must be string "other" +- Institutional account speech must match its identity positioning""" def _generate_profile_rule_based( self, @@ -722,9 +722,9 @@ def _generate_profile_rule_based( entity_summary: str, entity_attributes: Dict[str, Any] ) -> Dict[str, Any]: - """使用规则生成基础人设""" + """Generate basic persona using rules""" - # 根据实体类型生成不同的人设 + # Generate different personas based on entity type entity_type_lower = entity_type.lower() if entity_type_lower in ["student", "alumni"]: @@ -755,10 +755,10 @@ def _generate_profile_rule_based( return { "bio": f"Official account for {entity_name}. News and updates.", "persona": f"{entity_name} is a media entity that reports news and facilitates public discourse. 
The account shares timely updates and engages with the audience on current events.", - "age": 30, # 机构虚拟年龄 - "gender": "other", # 机构使用other - "mbti": "ISTJ", # 机构风格:严谨保守 - "country": "中国", + "age": 30, # Institution virtual age + "gender": "other", # Institutions use other + "mbti": "ISTJ", # Institution style: rigorous and conservative + "country": "China", "profession": "Media", "interested_topics": ["General News", "Current Events", "Public Affairs"], } @@ -767,16 +767,16 @@ def _generate_profile_rule_based( return { "bio": f"Official account of {entity_name}.", "persona": f"{entity_name} is an institutional entity that communicates official positions, announcements, and engages with stakeholders on relevant matters.", - "age": 30, # 机构虚拟年龄 - "gender": "other", # 机构使用other - "mbti": "ISTJ", # 机构风格:严谨保守 - "country": "中国", + "age": 30, # Institution virtual age + "gender": "other", # Institutions use other + "mbti": "ISTJ", # Institution style: rigorous and conservative + "country": "China", "profession": entity_type, "interested_topics": ["Public Policy", "Community", "Official Announcements"], } else: - # 默认人设 + # Default persona return { "bio": entity_summary[:150] if entity_summary else f"{entity_type}: {entity_name}", "persona": entity_summary or f"{entity_name} is a {entity_type.lower()} participating in social discussions.", @@ -789,7 +789,7 @@ def _generate_profile_rule_based( } def set_graph_id(self, graph_id: str): - """设置图谱ID用于图谱检索""" + """Set graph ID for graph retrieval""" self.graph_id = graph_id def generate_profiles_from_entities( @@ -803,52 +803,52 @@ def generate_profiles_from_entities( output_platform: str = "reddit" ) -> List[OasisAgentProfile]: """ - 批量从实体生成Agent Profile(支持并行生成) + Batch generate Agent Profiles from entities (supports parallel generation) Args: - entities: 实体列表 - use_llm: 是否使用LLM生成详细人设 - progress_callback: 进度回调函数 (current, total, message) - graph_id: 图谱ID,用于图谱检索获取更丰富上下文 - parallel_count: 并行生成数量,默认5 - realtime_output_path: 
实时写入的文件路径(如果提供,每生成一个就写入一次) - output_platform: 输出平台格式 ("reddit" 或 "twitter") + entities: Entity list + use_llm: Whether to use LLM to generate detailed persona + progress_callback: Progress callback function (current, total, message) + graph_id: Graph ID for graph retrieval to get richer context + parallel_count: Parallel generation count, default 5 + realtime_output_path: Real-time write file path (if provided, writes after each generation) + output_platform: Output platform format ("reddit" or "twitter") Returns: - Agent Profile列表 + Agent Profile list """ import concurrent.futures from threading import Lock - # 设置graph_id用于图谱检索 + # Set graph_id for graph retrieval if graph_id: self.graph_id = graph_id total = len(entities) - profiles = [None] * total # 预分配列表保持顺序 - completed_count = [0] # 使用列表以便在闭包中修改 + profiles = [None] * total # Pre-allocate list to maintain order + completed_count = [0] # Use list to allow modification in closure lock = Lock() - # 实时写入文件的辅助函数 + # Helper function for real-time file writing def save_profiles_realtime(): - """实时保存已生成的 profiles 到文件""" + """Save generated profiles to file in real-time""" if not realtime_output_path: return with lock: - # 过滤出已生成的 profiles + # Filter out already generated profiles existing_profiles = [p for p in profiles if p is not None] if not existing_profiles: return try: if output_platform == "reddit": - # Reddit JSON 格式 + # Reddit JSON format profiles_data = [p.to_reddit_format() for p in existing_profiles] with open(realtime_output_path, 'w', encoding='utf-8') as f: json.dump(profiles_data, f, ensure_ascii=False, indent=2) else: - # Twitter CSV 格式 + # Twitter CSV format import csv profiles_data = [p.to_twitter_format() for p in existing_profiles] if profiles_data: @@ -858,10 +858,10 @@ def save_profiles_realtime(): writer.writeheader() writer.writerows(profiles_data) except Exception as e: - logger.warning(f"实时保存 profiles 失败: {e}") + logger.warning(f"Real-time profile save failed: {e}") def 
generate_single_profile(idx: int, entity: EntityNode) -> tuple: - """生成单个profile的工作函数""" + """Worker function for generating a single profile""" entity_type = entity.get_entity_type() or "Entity" try: @@ -871,14 +871,14 @@ def generate_single_profile(idx: int, entity: EntityNode) -> tuple: use_llm=use_llm ) - # 实时输出生成的人设到控制台和日志 + # Output generated persona to console and log in real-time self._print_generated_profile(entity.name, entity_type, profile) return idx, profile, None except Exception as e: - logger.error(f"生成实体 {entity.name} 的人设失败: {str(e)}") - # 创建一个基础profile + logger.error(f"Persona generation failed for entity {entity.name}: {str(e)}") + # Create a basic profile fallback_profile = OasisAgentProfile( user_id=idx, user_name=self._generate_username(entity.name), @@ -890,20 +890,20 @@ def generate_single_profile(idx: int, entity: EntityNode) -> tuple: ) return idx, fallback_profile, str(e) - logger.info(f"开始并行生成 {total} 个Agent人设(并行数: {parallel_count})...") + logger.info(f"Starting parallel generation of {total} Agent personas (parallel count: {parallel_count})...") print(f"\n{'='*60}") - print(f"开始生成Agent人设 - 共 {total} 个实体,并行数: {parallel_count}") + print(f"Starting Agent persona generation - {total} entities, parallel count: {parallel_count}") print(f"{'='*60}\n") - # 使用线程池并行执行 + # Execute in parallel using thread pool with concurrent.futures.ThreadPoolExecutor(max_workers=parallel_count) as executor: - # 提交所有任务 + # Submit all tasks future_to_entity = { executor.submit(generate_single_profile, idx, entity): (idx, entity) for idx, entity in enumerate(entities) } - # 收集结果 + # Collect results for future in concurrent.futures.as_completed(future_to_entity): idx, entity = future_to_entity[future] entity_type = entity.get_entity_type() or "Entity" @@ -916,23 +916,23 @@ def generate_single_profile(idx: int, entity: EntityNode) -> tuple: completed_count[0] += 1 current = completed_count[0] - # 实时写入文件 + # Write to file in real-time save_profiles_realtime() if 
progress_callback: progress_callback( current, total, - f"已完成 {current}/{total}: {entity.name}({entity_type})" + f"Completed {current}/{total}: {entity.name} ({entity_type})" ) if error: - logger.warning(f"[{current}/{total}] {entity.name} 使用备用人设: {error}") + logger.warning(f"[{current}/{total}] {entity.name} using fallback persona: {error}") else: - logger.info(f"[{current}/{total}] 成功生成人设: {entity.name} ({entity_type})") + logger.info(f"[{current}/{total}] Successfully generated persona: {entity.name} ({entity_type})") except Exception as e: - logger.error(f"处理实体 {entity.name} 时发生异常: {str(e)}") + logger.error(f"Exception processing entity {entity.name}: {str(e)}") with lock: completed_count[0] += 1 profiles[idx] = OasisAgentProfile( @@ -944,44 +944,44 @@ def generate_single_profile(idx: int, entity: EntityNode) -> tuple: source_entity_uuid=entity.uuid, source_entity_type=entity_type, ) - # 实时写入文件(即使是备用人设) + # Write to file in real-time (even for fallback persona) save_profiles_realtime() print(f"\n{'='*60}") - print(f"人设生成完成!共生成 {len([p for p in profiles if p])} 个Agent") + print(f"Persona generation complete! 
Generated {len([p for p in profiles if p])} Agents") print(f"{'='*60}\n") return profiles def _print_generated_profile(self, entity_name: str, entity_type: str, profile: OasisAgentProfile): - """实时输出生成的人设到控制台(完整内容,不截断)""" + """Output generated persona to console in real-time (full content, no truncation)""" separator = "-" * 70 - # 构建完整输出内容(不截断) - topics_str = ', '.join(profile.interested_topics) if profile.interested_topics else '无' + # Build full output content (no truncation) + topics_str = ', '.join(profile.interested_topics) if profile.interested_topics else 'None' output_lines = [ f"\n{separator}", - f"[已生成] {entity_name} ({entity_type})", + f"[Generated] {entity_name} ({entity_type})", f"{separator}", - f"用户名: {profile.user_name}", + f"Username: {profile.user_name}", f"", - f"【简介】", + f"[Bio]", f"{profile.bio}", f"", - f"【详细人设】", + f"[Detailed Persona]", f"{profile.persona}", f"", - f"【基本属性】", - f"年龄: {profile.age} | 性别: {profile.gender} | MBTI: {profile.mbti}", - f"职业: {profile.profession} | 国家: {profile.country}", - f"兴趣话题: {topics_str}", + f"[Basic Attributes]", + f"Age: {profile.age} | Gender: {profile.gender} | MBTI: {profile.mbti}", + f"Profession: {profile.profession} | Country: {profile.country}", + f"Interest topics: {topics_str}", separator ] output = "\n".join(output_lines) - # 只输出到控制台(避免重复,logger不再输出完整内容) + # Only output to console (avoid duplication, logger no longer outputs full content) print(output) def save_profiles( @@ -991,16 +991,16 @@ def save_profiles( platform: str = "reddit" ): """ - 保存Profile到文件(根据平台选择正确格式) + Save profiles to file (choose correct format based on platform) - OASIS平台格式要求: - - Twitter: CSV格式 - - Reddit: JSON格式 + OASIS platform format requirements: + - Twitter: CSV format + - Reddit: JSON format Args: - profiles: Profile列表 - file_path: 文件路径 - platform: 平台类型 ("reddit" 或 "twitter") + profiles: Profile list + file_path: File path + platform: Platform type ("reddit" or "twitter") """ if platform == "twitter": 
self._save_twitter_csv(profiles, file_path) @@ -1009,117 +1009,112 @@ def save_profiles( def _save_twitter_csv(self, profiles: List[OasisAgentProfile], file_path: str): """ - 保存Twitter Profile为CSV格式(符合OASIS官方要求) - - OASIS Twitter要求的CSV字段: - - user_id: 用户ID(根据CSV顺序从0开始) - - name: 用户真实姓名 - - username: 系统中的用户名 - - user_char: 详细人设描述(注入到LLM系统提示中,指导Agent行为) - - description: 简短的公开简介(显示在用户资料页面) - - user_char vs description 区别: - - user_char: 内部使用,LLM系统提示,决定Agent如何思考和行动 - - description: 外部显示,其他用户可见的简介 + Save Twitter Profiles as CSV format (per OASIS official requirements) + + OASIS Twitter required CSV fields: + - user_id: User ID (starts from 0 based on CSV order) + - name: User real name + - username: System username + - user_char: Detailed persona description (injected into LLM system prompt, guides Agent behavior) + - description: Short public bio (shown on user profile page) + + user_char vs description difference: + - user_char: Internal use, LLM system prompt, determines how Agent thinks and acts + - description: External display, bio visible to other users """ import csv - # 确保文件扩展名是.csv + # Ensure file extension is .csv if not file_path.endswith('.csv'): file_path = file_path.replace('.json', '.csv') with open(file_path, 'w', newline='', encoding='utf-8') as f: writer = csv.writer(f) - # 写入OASIS要求的表头 + # Write OASIS-required headers headers = ['user_id', 'name', 'username', 'user_char', 'description'] writer.writerow(headers) - # 写入数据行 + # Write data rows for idx, profile in enumerate(profiles): - # user_char: 完整人设(bio + persona),用于LLM系统提示 + # user_char: Full persona (bio + persona), for LLM system prompt user_char = profile.bio if profile.persona and profile.persona != profile.bio: user_char = f"{profile.bio} {profile.persona}" - # 处理换行符(CSV中用空格替代) + # Handle newlines (replace with spaces in CSV) user_char = user_char.replace('\n', ' ').replace('\r', ' ') - # description: 简短简介,用于外部显示 + # description: Short bio, for external display description = 
profile.bio.replace('\n', ' ').replace('\r', ' ') row = [ - idx, # user_id: 从0开始的顺序ID - profile.name, # name: 真实姓名 - profile.user_name, # username: 用户名 - user_char, # user_char: 完整人设(内部LLM使用) - description # description: 简短简介(外部显示) + idx, # user_id: Sequential ID starting from 0 + profile.name, # name: Real name + profile.user_name, # username: Username + user_char, # user_char: Full persona (internal LLM use) + description # description: Short bio (external display) ] writer.writerow(row) - logger.info(f"已保存 {len(profiles)} 个Twitter Profile到 {file_path} (OASIS CSV格式)") + logger.info(f"Saved {len(profiles)} Twitter Profiles to {file_path} (OASIS CSV format)") def _normalize_gender(self, gender: Optional[str]) -> str: """ - 标准化gender字段为OASIS要求的英文格式 + Normalize gender field to English format required by OASIS - OASIS要求: male, female, other + OASIS requires: male, female, other """ if not gender: return "other" gender_lower = gender.lower().strip() - # 中文映射 gender_map = { - "男": "male", - "女": "female", - "机构": "other", - "其他": "other", - # 英文已有 "male": "male", "female": "female", "other": "other", + "institution": "other", } return gender_map.get(gender_lower, "other") def _save_reddit_json(self, profiles: List[OasisAgentProfile], file_path: str): """ - 保存Reddit Profile为JSON格式 - - 使用与 to_reddit_format() 一致的格式,确保 OASIS 能正确读取。 - 必须包含 user_id 字段,这是 OASIS agent_graph.get_agent() 匹配的关键! - - 必需字段: - - user_id: 用户ID(整数,用于匹配 initial_posts 中的 poster_agent_id) - - username: 用户名 - - name: 显示名称 - - bio: 简介 - - persona: 详细人设 - - age: 年龄(整数) - - gender: "male", "female", 或 "other" - - mbti: MBTI类型 - - country: 国家 + Save Reddit Profiles as JSON format + + Uses format consistent with to_reddit_format() to ensure OASIS can read correctly. + Must include user_id field — this is key for OASIS agent_graph.get_agent() matching! 
+ + Required fields: + - user_id: User ID (integer, used for matching poster_agent_id in initial_posts) + - username: Username + - name: Display name + - bio: Bio + - persona: Detailed persona + - age: Age (integer) + - gender: "male", "female", or "other" + - mbti: MBTI type + - country: Country """ data = [] for idx, profile in enumerate(profiles): - # 使用与 to_reddit_format() 一致的格式 + # Use format consistent with to_reddit_format() item = { - "user_id": profile.user_id if profile.user_id is not None else idx, # 关键:必须包含 user_id + "user_id": profile.user_id if profile.user_id is not None else idx, # Key: must include user_id "username": profile.user_name, "name": profile.name, "bio": profile.bio[:150] if profile.bio else f"{profile.name}", "persona": profile.persona or f"{profile.name} is a participant in social discussions.", "karma": profile.karma if profile.karma else 1000, "created_at": profile.created_at, - # OASIS必需字段 - 确保都有默认值 + # OASIS required fields - ensure all have default values "age": profile.age if profile.age else 30, "gender": self._normalize_gender(profile.gender), "mbti": profile.mbti if profile.mbti else "ISTJ", - "country": profile.country if profile.country else "中国", + "country": profile.country if profile.country else "China", } - # 可选字段 + # Optional fields if profile.profession: item["profession"] = profile.profession if profile.interested_topics: @@ -1130,16 +1125,16 @@ def _save_reddit_json(self, profiles: List[OasisAgentProfile], file_path: str): with open(file_path, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=2) - logger.info(f"已保存 {len(profiles)} 个Reddit Profile到 {file_path} (JSON格式,包含user_id字段)") + logger.info(f"Saved {len(profiles)} Reddit Profiles to {file_path} (JSON format, includes user_id field)") - # 保留旧方法名作为别名,保持向后兼容 + # Keep old method name as alias for backward compatibility def save_profiles_to_json( self, profiles: List[OasisAgentProfile], file_path: str, platform: str = "reddit" ): - """[已废弃] 
请使用 save_profiles() 方法""" - logger.warning("save_profiles_to_json已废弃,请使用save_profiles方法") + """[Deprecated] Please use save_profiles() method""" + logger.warning("save_profiles_to_json is deprecated, please use save_profiles method") self.save_profiles(profiles, file_path, platform) diff --git a/backend/app/services/ontology_generator.py b/backend/app/services/ontology_generator.py index cc44f2a..467301d 100644 --- a/backend/app/services/ontology_generator.py +++ b/backend/app/services/ontology_generator.py @@ -1,6 +1,6 @@ """ -本体生成服务 -接口1:分析文本内容,生成适合社会模拟的实体和关系类型定义 +Ontology Generation Service +Interface 1: Analyze text content and generate entity and relationship type definitions suitable for social simulation """ import json @@ -8,157 +8,158 @@ from ..utils.llm_client import LLMClient -# 本体生成的系统提示词 -ONTOLOGY_SYSTEM_PROMPT = """你是一个专业的知识图谱本体设计专家。你的任务是分析给定的文本内容和模拟需求,设计适合**社交媒体舆论模拟**的实体类型和关系类型。 +# Ontology generation system prompt +ONTOLOGY_SYSTEM_PROMPT = """You are a professional knowledge graph ontology design expert. Your task is to analyze given text content and simulation requirements, and design entity types and relationship types suitable for **social media opinion simulation**. -**重要:你必须输出有效的JSON格式数据,不要输出任何其他内容。** +**Important: You must output valid JSON format data. Do not output anything else.** -## 核心任务背景 +## Core Task Background -我们正在构建一个**社交媒体舆论模拟系统**。在这个系统中: -- 每个实体都是一个可以在社交媒体上发声、互动、传播信息的"账号"或"主体" -- 实体之间会相互影响、转发、评论、回应 -- 我们需要模拟舆论事件中各方的反应和信息传播路径 +We are building a **social media opinion simulation system**. 
In this system: +- Each entity is an "account" or "actor" that can speak, interact, and spread information on social media +- Entities influence each other through reposts, comments, and responses +- We need to simulate the reactions of various parties in opinion events and information propagation paths -因此,**实体必须是现实中真实存在的、可以在社媒上发声和互动的主体**: +Therefore, **entities must be real-world actors that can speak and interact on social media**: -**可以是**: -- 具体的个人(公众人物、当事人、意见领袖、专家学者、普通人) -- 公司、企业(包括其官方账号) -- 组织机构(大学、协会、NGO、工会等) -- 政府部门、监管机构 -- 媒体机构(报纸、电视台、自媒体、网站) -- 社交媒体平台本身 -- 特定群体代表(如校友会、粉丝团、维权群体等) +**Can be**: +- Specific individuals (public figures, parties involved, opinion leaders, experts, ordinary people) +- Companies and enterprises (including their official accounts) +- Organizations (universities, associations, NGOs, unions, etc.) +- Government departments, regulatory agencies +- Media organizations (newspapers, TV stations, self-media, websites) +- Social media platforms themselves +- Representatives of specific groups (alumni associations, fan groups, advocacy groups, etc.) -**不可以是**: -- 抽象概念(如"舆论"、"情绪"、"趋势") -- 主题/话题(如"学术诚信"、"教育改革") -- 观点/态度(如"支持方"、"反对方") +**Cannot be**: +- Abstract concepts (e.g. "public opinion", "emotion", "trend") +- Topics/themes (e.g. "academic integrity", "education reform") +- Views/attitudes (e.g. 
"supporters", "opponents") -## 输出格式 +## Output Format -请输出JSON格式,包含以下结构: +Please output JSON format with the following structure: ```json { "entity_types": [ { - "name": "实体类型名称(英文,PascalCase)", - "description": "简短描述(英文,不超过100字符)", + "name": "Entity type name (English, PascalCase)", + "description": "Brief description (English, max 100 characters)", "attributes": [ { - "name": "属性名(英文,snake_case)", + "name": "Attribute name (English, snake_case)", "type": "text", - "description": "属性描述" + "description": "Attribute description" } ], - "examples": ["示例实体1", "示例实体2"] + "examples": ["Example entity 1", "Example entity 2"] } ], "edge_types": [ { - "name": "关系类型名称(英文,UPPER_SNAKE_CASE)", - "description": "简短描述(英文,不超过100字符)", + "name": "Relationship type name (English, UPPER_SNAKE_CASE)", + "description": "Brief description (English, max 100 characters)", "source_targets": [ - {"source": "源实体类型", "target": "目标实体类型"} + {"source": "Source entity type", "target": "Target entity type"} ], "attributes": [] } ], - "analysis_summary": "对文本内容的简要分析说明(中文)" + "analysis_summary": "Brief analysis of the text content" } ``` -## 设计指南(极其重要!) - -### 1. 实体类型设计 - 必须严格遵守 - -**数量要求:必须正好10个实体类型** - -**层次结构要求(必须同时包含具体类型和兜底类型)**: - -你的10个实体类型必须包含以下层次: - -A. **兜底类型(必须包含,放在列表最后2个)**: - - `Person`: 任何自然人个体的兜底类型。当一个人不属于其他更具体的人物类型时,归入此类。 - - `Organization`: 任何组织机构的兜底类型。当一个组织不属于其他更具体的组织类型时,归入此类。 - -B. **具体类型(8个,根据文本内容设计)**: - - 针对文本中出现的主要角色,设计更具体的类型 - - 例如:如果文本涉及学术事件,可以有 `Student`, `Professor`, `University` - - 例如:如果文本涉及商业事件,可以有 `Company`, `CEO`, `Employee` - -**为什么需要兜底类型**: -- 文本中会出现各种人物,如"中小学教师"、"路人甲"、"某位网友" -- 如果没有专门的类型匹配,他们应该被归入 `Person` -- 同理,小型组织、临时团体等应该归入 `Organization` - -**具体类型的设计原则**: -- 从文本中识别出高频出现或关键的角色类型 -- 每个具体类型应该有明确的边界,避免重叠 -- description 必须清晰说明这个类型和兜底类型的区别 - -### 2. 关系类型设计 - -- 数量:6-10个 -- 关系应该反映社媒互动中的真实联系 -- 确保关系的 source_targets 涵盖你定义的实体类型 - -### 3. 
属性设计 - -- 每个实体类型1-3个关键属性 -- **注意**:属性名不能使用 `name`、`uuid`、`group_id`、`created_at`、`summary`(这些是系统保留字) -- 推荐使用:`full_name`, `title`, `role`, `position`, `location`, `description` 等 - -## 实体类型参考 - -**个人类(具体)**: -- Student: 学生 -- Professor: 教授/学者 -- Journalist: 记者 -- Celebrity: 明星/网红 -- Executive: 高管 -- Official: 政府官员 -- Lawyer: 律师 -- Doctor: 医生 - -**个人类(兜底)**: -- Person: 任何自然人(不属于上述具体类型时使用) - -**组织类(具体)**: -- University: 高校 -- Company: 公司企业 -- GovernmentAgency: 政府机构 -- MediaOutlet: 媒体机构 -- Hospital: 医院 -- School: 中小学 -- NGO: 非政府组织 - -**组织类(兜底)**: -- Organization: 任何组织机构(不属于上述具体类型时使用) - -## 关系类型参考 - -- WORKS_FOR: 工作于 -- STUDIES_AT: 就读于 -- AFFILIATED_WITH: 隶属于 -- REPRESENTS: 代表 -- REGULATES: 监管 -- REPORTS_ON: 报道 -- COMMENTS_ON: 评论 -- RESPONDS_TO: 回应 -- SUPPORTS: 支持 -- OPPOSES: 反对 -- COLLABORATES_WITH: 合作 -- COMPETES_WITH: 竞争 +## Design Guidelines (Extremely Important!) + +### 1. Entity Type Design - Must Strictly Follow + +**Quantity requirement: Generate as many entity types as needed to capture all distinct actors in the text (typically 15-40 types for complex documents). Be thorough — more types means richer simulation.** + +**Hierarchy requirement (must include both specific and fallback types)**: + +Your entity types must include the following levels: + +A. **Fallback types (must include, placed as last 2 in list)**: + - `Person`: Fallback type for any natural person. When a person does not fit other more specific person types, classify here. + - `Organization`: Fallback type for any organization. When an organization does not fit other more specific organization types, classify here. + +B. 
**Specific types (designed based on text content)**: + - Design specific types for ALL distinct roles and actors appearing in the text + - Be granular — create separate types for meaningfully different actors + - Example: If text involves academic events, can have `Student`, `Professor`, `University`, `ResearchGroup`, `AcademicJournal` + - Example: If text involves geopolitics, can have `HeadOfState`, `Government`, `MilitaryOrganization`, `Diplomat`, `MediaOutlet`, `ThinkTank`, `CorporateEntity` + +**Why fallback types are needed**: +- Various characters appear in text, such as "elementary school teachers", "random bystanders", "anonymous netizens" +- If no specific type matches, they should be classified under `Person` +- Similarly, small organizations, temporary groups, etc. should be classified under `Organization` + +**Design principles for specific types**: +- Identify frequently appearing or key role types from the text +- Each specific type should have clear boundaries, avoiding overlap +- Description must clearly explain the difference between this type and the fallback type + +### 2. Relationship Type Design + +- Quantity: 10-30 (as many as needed to capture all meaningful connections) +- Relationships should reflect real connections in social media interactions +- Ensure source_targets of relationships cover your defined entity types + +### 3. Attribute Design + +- 1-3 key attributes per entity type +- **Note**: Attribute names cannot use `name`, `uuid`, `group_id`, `created_at`, `summary` (these are system reserved words) +- Recommended: `full_name`, `title`, `role`, `position`, `location`, `description`, etc. 
+ +## Entity Type Reference + +**Individual types (specific)**: +- Student: Student +- Professor: Professor/Scholar +- Journalist: Journalist +- Celebrity: Celebrity/Influencer +- Executive: Executive +- Official: Government official +- Lawyer: Lawyer +- Doctor: Doctor + +**Individual types (fallback)**: +- Person: Any natural person (used when not fitting above specific types) + +**Organization types (specific)**: +- University: University/College +- Company: Company/Enterprise +- GovernmentAgency: Government agency +- MediaOutlet: Media organization +- Hospital: Hospital +- School: Primary/Secondary school +- NGO: Non-governmental organization + +**Organization types (fallback)**: +- Organization: Any organization (used when not fitting above specific types) + +## Relationship Type Reference + +- WORKS_FOR: Works for +- STUDIES_AT: Studies at +- AFFILIATED_WITH: Affiliated with +- REPRESENTS: Represents +- REGULATES: Regulates +- REPORTS_ON: Reports on +- COMMENTS_ON: Comments on +- RESPONDS_TO: Responds to +- SUPPORTS: Supports +- OPPOSES: Opposes +- COLLABORATES_WITH: Collaborates with +- COMPETES_WITH: Competes with """ class OntologyGenerator: """ - 本体生成器 - 分析文本内容,生成实体和关系类型定义 + Ontology Generator + Analyzes text content and generates entity and relationship type definitions """ def __init__(self, llm_client: Optional[LLMClient] = None): @@ -171,17 +172,17 @@ def generate( additional_context: Optional[str] = None ) -> Dict[str, Any]: """ - 生成本体定义 + Generate ontology definition Args: - document_texts: 文档文本列表 - simulation_requirement: 模拟需求描述 - additional_context: 额外上下文 + document_texts: Document text list + simulation_requirement: Simulation requirement description + additional_context: Additional context Returns: - 本体定义(entity_types, edge_types等) + Ontology definition (entity_types, edge_types, etc.) 
""" - # 构建用户消息 + # Build user message user_message = self._build_user_message( document_texts, simulation_requirement, @@ -193,19 +194,19 @@ def generate( {"role": "user", "content": user_message} ] - # 调用LLM + # Call LLM result = self.llm_client.chat_json( messages=messages, temperature=0.3, max_tokens=4096 ) - # 验证和后处理 + # Validate and post-process result = self._validate_and_process(result) return result - # 传给 LLM 的文本最大长度(5万字) + # Max text length sent to LLM (50k chars) MAX_TEXT_LENGTH_FOR_LLM = 50000 def _build_user_message( @@ -214,50 +215,50 @@ def _build_user_message( simulation_requirement: str, additional_context: Optional[str] ) -> str: - """构建用户消息""" + """Build user message""" - # 合并文本 + # Merge texts combined_text = "\n\n---\n\n".join(document_texts) original_length = len(combined_text) - # 如果文本超过5万字,截断(仅影响传给LLM的内容,不影响图谱构建) + # If text exceeds 50k chars, truncate (only affects content sent to LLM, not graph building) if len(combined_text) > self.MAX_TEXT_LENGTH_FOR_LLM: combined_text = combined_text[:self.MAX_TEXT_LENGTH_FOR_LLM] - combined_text += f"\n\n...(原文共{original_length}字,已截取前{self.MAX_TEXT_LENGTH_FOR_LLM}字用于本体分析)..." + combined_text += f"\n\n...(original text has {original_length} chars, first {self.MAX_TEXT_LENGTH_FOR_LLM} chars used for ontology analysis)..." - message = f"""## 模拟需求 + message = f"""## Simulation Requirements {simulation_requirement} -## 文档内容 +## Document Content {combined_text} """ if additional_context: message += f""" -## 额外说明 +## Additional Notes {additional_context} """ message += """ -请根据以上内容,设计适合社会舆论模拟的实体类型和关系类型。 - -**必须遵守的规则**: -1. 必须正好输出10个实体类型 -2. 最后2个必须是兜底类型:Person(个人兜底)和 Organization(组织兜底) -3. 前8个是根据文本内容设计的具体类型 -4. 所有实体类型必须是现实中可以发声的主体,不能是抽象概念 -5. 属性名不能使用 name、uuid、group_id 等保留字,用 full_name、org_name 等替代 +Based on the above content, design entity types and relationship types suitable for social opinion simulation. + +**Rules that must be followed:** +1. Must output exactly 10 entity types +2. 
Last 2 must be fallback types: Person (individual fallback) and Organization (organization fallback) +3. All other types are specific types designed based on text content +4. All entity types must be real-world actors that can speak, not abstract concepts +5. Attribute names cannot use reserved words like name, uuid, group_id; use full_name, org_name instead +""" return message def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]: - """验证和后处理结果""" + """Validate and post-process results""" - # 确保必要字段存在 + # Ensure required fields exist if "entity_types" not in result: result["entity_types"] = [] if "edge_types" not in result: @@ -265,17 +266,17 @@ def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]: if "analysis_summary" not in result: result["analysis_summary"] = "" - # 验证实体类型 + # Validate entity types for entity in result["entity_types"]: if "attributes" not in entity: entity["attributes"] = [] if "examples" not in entity: entity["examples"] = [] - # 确保description不超过100字符 + # Ensure description does not exceed 100 chars if len(entity.get("description", "")) > 100: entity["description"] = entity["description"][:97] + "..." - # 验证关系类型 + # Validate relationship types for edge in result["edge_types"]: if "source_targets" not in edge: edge["source_targets"] = [] @@ -284,11 +285,11 @@ def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]: if len(edge.get("description", "")) > 100: edge["description"] = edge["description"][:97] + "..." 
- # Zep API 限制:最多 10 个自定义实体类型,最多 10 个自定义边类型 - MAX_ENTITY_TYPES = 10 - MAX_EDGE_TYPES = 10 + # Neo4j has no hard limit on types — allow generous ontology + MAX_ENTITY_TYPES = 50 + MAX_EDGE_TYPES = 50 - # 兜底类型定义 + # Fallback type definitions person_fallback = { "name": "Person", "description": "Any individual person not fitting other specific person types.", @@ -309,12 +310,12 @@ def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]: "examples": ["small business", "community group"] } - # 检查是否已有兜底类型 + # Check if fallback types already exist entity_names = {e["name"] for e in result["entity_types"]} has_person = "Person" in entity_names has_organization = "Organization" in entity_names - # 需要添加的兜底类型 + # Fallback types to add fallbacks_to_add = [] if not has_person: fallbacks_to_add.append(person_fallback) @@ -325,17 +326,17 @@ def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]: current_count = len(result["entity_types"]) needed_slots = len(fallbacks_to_add) - # 如果添加后会超过 10 个,需要移除一些现有类型 + # If adding would exceed the limit, need to remove some existing types if current_count + needed_slots > MAX_ENTITY_TYPES: - # 计算需要移除多少个 + # Calculate how many to remove to_remove = current_count + needed_slots - MAX_ENTITY_TYPES - # 从末尾移除(保留前面更重要的具体类型) + # Remove from end (preserve more important specific types at front) result["entity_types"] = result["entity_types"][:-to_remove] - # 添加兜底类型 + # Add fallback types result["entity_types"].extend(fallbacks_to_add) - # 最终确保不超过限制(防御性编程) + # Final check to ensure limits are not exceeded (defensive programming) if len(result["entity_types"]) > MAX_ENTITY_TYPES: result["entity_types"] = result["entity_types"][:MAX_ENTITY_TYPES] @@ -346,25 +347,25 @@ def _validate_and_process(self, result: Dict[str, Any]) -> Dict[str, Any]: def generate_python_code(self, ontology: Dict[str, Any]) -> str: """ - [DEPRECATED] 将本体定义转换为Zep-format Pydantic代码。 + [DEPRECATED] Convert ontology definition to Zep-format Pydantic code. 
Not used in MiroFish-Offline (ontology stored as JSON in Neo4j). Kept for reference only. """ code_lines = [ '"""', - '自定义实体类型定义', - '由MiroFish自动生成,用于社会舆论模拟', + 'Custom entity type definitions', + 'Auto-generated by MiroFish for social opinion simulation', '"""', '', 'from pydantic import Field', 'from zep_cloud.external_clients.ontology import EntityModel, EntityText, EdgeModel', '', '', - '# ============== 实体类型定义 ==============', + '# ============== Entity Type Definitions ==============', '', ] - # 生成实体类型 + # Generate entity types for entity in ontology.get("entity_types", []): name = entity["name"] desc = entity.get("description", f"A {name} entity.") @@ -387,13 +388,13 @@ def generate_python_code(self, ontology: Dict[str, Any]) -> str: code_lines.append('') code_lines.append('') - code_lines.append('# ============== 关系类型定义 ==============') + code_lines.append('# ============== Relationship Type Definitions ==============') code_lines.append('') - # 生成关系类型 + # Generate relationship types for edge in ontology.get("edge_types", []): name = edge["name"] - # 转换为PascalCase类名 + # Convert to PascalCase class name class_name = ''.join(word.capitalize() for word in name.split('_')) desc = edge.get("description", f"A {name} relationship.") @@ -415,8 +416,8 @@ def generate_python_code(self, ontology: Dict[str, Any]) -> str: code_lines.append('') code_lines.append('') - # 生成类型字典 - code_lines.append('# ============== 类型配置 ==============') + # Generate type dictionaries + code_lines.append('# ============== Type Configuration ==============') code_lines.append('') code_lines.append('ENTITY_TYPES = {') for entity in ontology.get("entity_types", []): @@ -432,7 +433,7 @@ def generate_python_code(self, ontology: Dict[str, Any]) -> str: code_lines.append('}') code_lines.append('') - # 生成边的source_targets映射 + # Generate edge source_targets mapping code_lines.append('EDGE_SOURCE_TARGETS = {') for edge in ontology.get("edge_types", []): name = edge["name"] diff --git 
a/backend/app/services/report_agent.py b/backend/app/services/report_agent.py index e922d02..b0aa367 100644 --- a/backend/app/services/report_agent.py +++ b/backend/app/services/report_agent.py @@ -1,12 +1,12 @@ """ -Report Agent服务 -使用 ReACT 模式生成模拟报告(via GraphStorage / Neo4j) - -功能: -1. 根据模拟需求和图谱信息生成报告 -2. 先规划目录结构,然后分段生成 -3. 每段采用ReACT多轮思考与反思模式 -4. 支持与用户对话,在对话中自主调用检索工具 +Report Agent Service +Generates simulation reports using ReACT pattern (via GraphStorage / Neo4j) + +Features: +1. Generate reports based on simulation requirements and graph info +2. Plan outline structure first, then generate section by section +3. Each section uses ReACT multi-round thinking and reflection pattern +4. Supports conversation with user, autonomously calling retrieval tools during conversation """ import os @@ -34,18 +34,18 @@ class ReportLogger: """ - Report Agent 详细日志记录器 + Report Agent detailed logger - 在报告文件夹中生成 agent_log.jsonl 文件,记录每一步详细动作。 - 每行是一个完整的 JSON 对象,包含时间戳、动作类型、详细内容等。 + Generates agent_log.jsonl file in report folder, recording every detailed action step. + Each line is a complete JSON object with timestamp, action type, detailed content, etc. 
""" def __init__(self, report_id: str): """ - 初始化日志记录器 + Initialize logger Args: - report_id: 报告ID,用于确定日志文件路径 + report_id: Report ID, used to determine log file path """ self.report_id = report_id self.log_file_path = os.path.join( @@ -55,12 +55,12 @@ def __init__(self, report_id: str): self._ensure_log_file() def _ensure_log_file(self): - """确保日志文件所在目录存在""" + """Ensure log file directory exists""" log_dir = os.path.dirname(self.log_file_path) os.makedirs(log_dir, exist_ok=True) def _get_elapsed_time(self) -> float: - """获取从开始到现在的耗时(秒)""" + """Get elapsed time from start to now (seconds)""" return (datetime.now() - self.start_time).total_seconds() def log( @@ -72,14 +72,14 @@ def log( section_index: int = None ): """ - 记录一条日志 + Record a log entry Args: - action: 动作类型,如 'start', 'tool_call', 'llm_response', 'section_complete' 等 - stage: 当前阶段,如 'planning', 'generating', 'completed' - details: 详细内容字典,不截断 - section_title: 当前章节标题(可选) - section_index: 当前章节索引(可选) + action: Action type, e.g. 'start', 'tool_call', 'llm_response', 'section_complete' + stage: Current stage, e.g. 
'planning', 'generating', 'completed' + details: Detail content dictionary, not truncated + section_title: Current section title (optional) + section_index: Current section index (optional) """ log_entry = { "timestamp": datetime.now().isoformat(), @@ -92,12 +92,12 @@ def log( "details": details } - # 追加写入 JSONL 文件 + # Append write to JSONL file with open(self.log_file_path, 'a', encoding='utf-8') as f: f.write(json.dumps(log_entry, ensure_ascii=False) + '\n') def log_start(self, simulation_id: str, graph_id: str, simulation_requirement: str): - """记录报告生成开始""" + """Record report generation start""" self.log( action="report_start", stage="pending", @@ -105,52 +105,52 @@ def log_start(self, simulation_id: str, graph_id: str, simulation_requirement: s "simulation_id": simulation_id, "graph_id": graph_id, "simulation_requirement": simulation_requirement, - "message": "报告生成任务开始" + "message": "Report generation task started" } ) def log_planning_start(self): - """记录大纲规划开始""" + """Record outline planning start""" self.log( action="planning_start", stage="planning", - details={"message": "开始规划报告大纲"} + details={"message": "Starting report outline planning"} ) def log_planning_context(self, context: Dict[str, Any]): - """记录规划时获取的上下文信息""" + """Record context info obtained during planning""" self.log( action="planning_context", stage="planning", details={ - "message": "获取模拟上下文信息", + "message": "Getting simulation context info", "context": context } ) def log_planning_complete(self, outline_dict: Dict[str, Any]): - """记录大纲规划完成""" + """Record outline planning complete""" self.log( action="planning_complete", stage="planning", details={ - "message": "大纲规划完成", + "message": "Outline planning complete", "outline": outline_dict } ) def log_section_start(self, section_title: str, section_index: int): - """记录章节生成开始""" + """Record section generation start""" self.log( action="section_start", stage="generating", section_title=section_title, section_index=section_index, - 
details={"message": f"开始生成章节: {section_title}"} + details={"message": f"Starting section generation: {section_title}"} ) def log_react_thought(self, section_title: str, section_index: int, iteration: int, thought: str): - """记录 ReACT 思考过程""" + """Record ReACT thinking process""" self.log( action="react_thought", stage="generating", @@ -159,7 +159,7 @@ def log_react_thought(self, section_title: str, section_index: int, iteration: i details={ "iteration": iteration, "thought": thought, - "message": f"ReACT 第{iteration}轮思考" + "message": f"ReACT round {iteration} thinking" } ) @@ -171,7 +171,7 @@ def log_tool_call( parameters: Dict[str, Any], iteration: int ): - """记录工具调用""" + """Record tool call""" self.log( action="tool_call", stage="generating", @@ -181,7 +181,7 @@ def log_tool_call( "iteration": iteration, "tool_name": tool_name, "parameters": parameters, - "message": f"调用工具: {tool_name}" + "message": f"Calling tool: {tool_name}" } ) @@ -193,7 +193,7 @@ def log_tool_result( result: str, iteration: int ): - """记录工具调用结果(完整内容,不截断)""" + """Record tool call result (full content, not truncated)""" self.log( action="tool_result", stage="generating", @@ -202,9 +202,9 @@ def log_tool_result( details={ "iteration": iteration, "tool_name": tool_name, - "result": result, # 完整结果,不截断 + "result": result, # Full result, not truncated "result_length": len(result), - "message": f"工具 {tool_name} 返回结果" + "message": f"Tool {tool_name} returned result" } ) @@ -217,7 +217,7 @@ def log_llm_response( has_tool_calls: bool, has_final_answer: bool ): - """记录 LLM 响应(完整内容,不截断)""" + """Record LLM response (full content, not truncated)""" self.log( action="llm_response", stage="generating", @@ -225,11 +225,11 @@ def log_llm_response( section_index=section_index, details={ "iteration": iteration, - "response": response, # 完整响应,不截断 + "response": response, # Full response, not truncated "response_length": len(response), "has_tool_calls": has_tool_calls, "has_final_answer": has_final_answer, - 
"message": f"LLM 响应 (工具调用: {has_tool_calls}, 最终答案: {has_final_answer})" + "message": f"LLM response (tool calls: {has_tool_calls}, final answer: {has_final_answer})" } ) @@ -240,17 +240,17 @@ def log_section_content( content: str, tool_calls_count: int ): - """记录章节内容生成完成(仅记录内容,不代表整个章节完成)""" + """Record section content generation complete (records content only, does not mean entire section is complete)""" self.log( action="section_content", stage="generating", section_title=section_title, section_index=section_index, details={ - "content": content, # 完整内容,不截断 + "content": content, # Full content, not truncated "content_length": len(content), "tool_calls_count": tool_calls_count, - "message": f"章节 {section_title} 内容生成完成" + "message": f"Section {section_title} content generation complete" } ) @@ -261,9 +261,9 @@ def log_section_full_complete( full_content: str ): """ - 记录章节生成完成 + Record section generation complete - 前端应监听此日志来判断一个章节是否真正完成,并获取完整内容 + Frontend should monitor this log to determine if a section is truly complete and get full content """ self.log( action="section_complete", @@ -273,24 +273,24 @@ def log_section_full_complete( details={ "content": full_content, "content_length": len(full_content), - "message": f"章节 {section_title} 生成完成" + "message": f"Section {section_title} generation complete" } ) def log_report_complete(self, total_sections: int, total_time_seconds: float): - """记录报告生成完成""" + """Record report generation complete""" self.log( action="report_complete", stage="completed", details={ "total_sections": total_sections, "total_time_seconds": round(total_time_seconds, 2), - "message": "报告生成完成" + "message": "Report generation complete" } ) def log_error(self, error_message: str, stage: str, section_title: str = None): - """记录错误""" + """Record error""" self.log( action="error", stage=stage, @@ -298,25 +298,25 @@ def log_error(self, error_message: str, stage: str, section_title: str = None): section_index=None, details={ "error": error_message, - 
"message": f"发生错误: {error_message}" + "message": f"Error occurred: {error_message}" } ) class ReportConsoleLogger: """ - Report Agent 控制台日志记录器 + Report Agent console logger - 将控制台风格的日志(INFO、WARNING等)写入报告文件夹中的 console_log.txt 文件。 - 这些日志与 agent_log.jsonl 不同,是纯文本格式的控制台输出。 + Writes console-style logs (INFO, WARNING, etc.) to console_log.txt file in report folder. + These logs differ from agent_log.jsonl — they are plain text console output. """ def __init__(self, report_id: str): """ - 初始化控制台日志记录器 + Initialize console logger Args: - report_id: 报告ID,用于确定日志文件路径 + report_id: Report ID, used to determine log file path """ self.report_id = report_id self.log_file_path = os.path.join( @@ -327,15 +327,15 @@ def __init__(self, report_id: str): self._setup_file_handler() def _ensure_log_file(self): - """确保日志文件所在目录存在""" + """Ensure log file directory exists""" log_dir = os.path.dirname(self.log_file_path) os.makedirs(log_dir, exist_ok=True) def _setup_file_handler(self): - """设置文件处理器,将日志同时写入文件""" + """Set up file handler to write logs to file""" import logging - # 创建文件处理器 + # Create file handler self._file_handler = logging.FileHandler( self.log_file_path, mode='a', @@ -343,14 +343,14 @@ def _setup_file_handler(self): ) self._file_handler.setLevel(logging.INFO) - # 使用与控制台相同的简洁格式 + # Use same concise format as console formatter = logging.Formatter( '[%(asctime)s] %(levelname)s: %(message)s', datefmt='%H:%M:%S' ) self._file_handler.setFormatter(formatter) - # 添加到 report_agent 相关的 logger + # Add to report_agent related loggers loggers_to_attach = [ 'mirofish.report_agent', 'mirofish.graph_tools', @@ -358,12 +358,12 @@ def _setup_file_handler(self): for logger_name in loggers_to_attach: target_logger = logging.getLogger(logger_name) - # 避免重复添加 + # Avoid duplicate addition if self._file_handler not in target_logger.handlers: target_logger.addHandler(self._file_handler) def close(self): - """关闭文件处理器并从 logger 中移除""" + """Close file handler and remove from logger""" import logging if 
self._file_handler: @@ -381,12 +381,12 @@ def close(self): self._file_handler = None def __del__(self): - """析构时确保关闭文件处理器""" + """Ensure file handler is closed on destruction""" self.close() class ReportStatus(str, Enum): - """报告状态""" + """Report status""" PENDING = "pending" PLANNING = "planning" GENERATING = "generating" @@ -396,7 +396,7 @@ class ReportStatus(str, Enum): @dataclass class ReportSection: - """报告章节""" + """Report section""" title: str content: str = "" @@ -407,7 +407,7 @@ def to_dict(self) -> Dict[str, Any]: } def to_markdown(self, level: int = 2) -> str: - """转换为Markdown格式""" + """Convert to Markdown format""" md = f"{'#' * level} {self.title}\n\n" if self.content: md += f"{self.content}\n\n" @@ -416,7 +416,7 @@ def to_markdown(self, level: int = 2) -> str: @dataclass class ReportOutline: - """报告大纲""" + """Report outline""" title: str summary: str sections: List[ReportSection] @@ -429,7 +429,7 @@ def to_dict(self) -> Dict[str, Any]: } def to_markdown(self) -> str: - """转换为Markdown格式""" + """Convert to Markdown format""" md = f"# {self.title}\n\n" md += f"> {self.summary}\n\n" for section in self.sections: @@ -439,7 +439,7 @@ def to_markdown(self) -> str: @dataclass class Report: - """完整报告""" + """Complete report""" report_id: str simulation_id: str graph_id: str @@ -467,417 +467,417 @@ def to_dict(self) -> Dict[str, Any]: # ═══════════════════════════════════════════════════════════════ -# Prompt 模板常量 +# Prompt template constants # ═══════════════════════════════════════════════════════════════ -# ── 工具描述 ── +# ── Tool descriptions ── TOOL_DESC_INSIGHT_FORGE = """\ -【深度洞察检索 - 强大的检索工具】 -这是我们强大的检索函数,专为深度分析设计。它会: -1. 自动将你的问题分解为多个子问题 -2. 从多个维度检索模拟图谱中的信息 -3. 整合语义搜索、实体分析、关系链追踪的结果 -4. 返回最全面、最深度的检索内容 - -【使用场景】 -- 需要深入分析某个话题 -- 需要了解事件的多个方面 -- 需要获取支撑报告章节的丰富素材 - -【返回内容】 -- 相关事实原文(可直接引用) -- 核心实体洞察 -- 关系链分析""" +[Deep Insight Retrieval - Powerful Retrieval Tool] +This is our powerful retrieval function, designed for deep analysis. It will: +1. 
Automatically decompose your question into multiple sub-questions +2. Retrieve information from the simulation graph across multiple dimensions +3. Integrate results from semantic search, entity analysis, and relationship chain tracking +4. Return the most comprehensive, in-depth retrieval content + +[Use Cases] +- Need to deeply analyze a topic +- Need to understand multiple aspects of an event +- Need to obtain rich material to support report sections + +[Returns] +- Related facts in original text (can be directly quoted) +- Core entity insights +- Relationship chain analysis""" TOOL_DESC_PANORAMA_SEARCH = """\ -【广度搜索 - 获取全貌视图】 -这个工具用于获取模拟结果的完整全貌,特别适合了解事件演变过程。它会: -1. 获取所有相关节点和关系 -2. 区分当前有效的事实和历史/过期的事实 -3. 帮助你了解舆情是如何演变的 - -【使用场景】 -- 需要了解事件的完整发展脉络 -- 需要对比不同阶段的舆情变化 -- 需要获取全面的实体和关系信息 - -【返回内容】 -- 当前有效事实(模拟最新结果) -- 历史/过期事实(演变记录) -- 所有涉及的实体""" +[Breadth Search - Get Full Picture View] +This tool is for getting the complete picture of simulation results, especially suitable for understanding event evolution. It will: +1. Get all related nodes and relationships +2. Distinguish currently valid facts from historical/expired facts +3. Help you understand how public opinion evolved + +[Use Cases] +- Need to understand the complete development timeline of an event +- Need to compare public opinion changes across different stages +- Need to get comprehensive entity and relationship information + +[Returns] +- Currently valid facts (latest simulation results) +- Historical/expired facts (evolution records) +- All involved entities""" TOOL_DESC_QUICK_SEARCH = """\ -【简单搜索 - 快速检索】 -轻量级的快速检索工具,适合简单、直接的信息查询。 +[Simple Search - Quick Retrieval] +Lightweight quick retrieval tool, suitable for simple, direct information queries. 
-【使用场景】 -- 需要快速查找某个具体信息 -- 需要验证某个事实 -- 简单的信息检索 +[Use Cases] +- Need to quickly find specific information +- Need to verify a fact +- Simple information retrieval -【返回内容】 -- 与查询最相关的事实列表""" +[Returns] +- List of facts most relevant to the query""" TOOL_DESC_INTERVIEW_AGENTS = """\ -【深度采访 - 真实Agent采访(双平台)】 -调用OASIS模拟环境的采访API,对正在运行的模拟Agent进行真实采访! -这不是LLM模拟,而是调用真实的采访接口获取模拟Agent的原始回答。 -默认在Twitter和Reddit两个平台同时采访,获取更全面的观点。 - -功能流程: -1. 自动读取人设文件,了解所有模拟Agent -2. 智能选择与采访主题最相关的Agent(如学生、媒体、官方等) -3. 自动生成采访问题 -4. 调用 /api/simulation/interview/batch 接口在双平台进行真实采访 -5. 整合所有采访结果,提供多视角分析 - -【使用场景】 -- 需要从不同角色视角了解事件看法(学生怎么看?媒体怎么看?官方怎么说?) -- 需要收集多方意见和立场 -- 需要获取模拟Agent的真实回答(来自OASIS模拟环境) -- 想让报告更生动,包含"采访实录" - -【返回内容】 -- 被采访Agent的身份信息 -- 各Agent在Twitter和Reddit两个平台的采访回答 -- 关键引言(可直接引用) -- 采访摘要和观点对比 - -【重要】需要OASIS模拟环境正在运行才能使用此功能!""" - -# ── 大纲规划 prompt ── +[In-Depth Interview - Real Agent Interview (Dual Platform)] +Calls the OASIS simulation environment's interview API to conduct real interviews with running simulated Agents! +This is not LLM simulation — it calls the real interview interface to get original responses from simulated Agents. +By default, interviews are conducted simultaneously on both Twitter and Reddit platforms for more comprehensive perspectives. + +Workflow: +1. Automatically reads persona files to understand all simulated Agents +2. Intelligently selects Agents most relevant to the interview topic (e.g., students, media, officials, etc.) +3. Automatically generates interview questions +4. Calls the /api/simulation/interview/batch endpoint to conduct real interviews on both platforms +5. Integrates all interview results, providing multi-perspective analysis + +[Use Cases] +- Need to understand event perspectives from different roles (What do students think? What does media think? What do officials say?) 
+- Need to collect opinions and positions from multiple parties +- Need to obtain real responses from simulated Agents (from the OASIS simulation environment) +- Want to make the report more vivid by including "interview transcripts" + +[Returns] +- Identity information of interviewed Agents +- Each Agent's interview responses on both Twitter and Reddit platforms +- Key quotes (can be directly cited) +- Interview summary and comparison of viewpoints + +[Important] The OASIS simulation environment must be running to use this feature!""" + +# ── Outline planning prompt ── PLAN_SYSTEM_PROMPT = """\ -你是一个「未来预测报告」的撰写专家,拥有对模拟世界的「上帝视角」——你可以洞察模拟中每一位Agent的行为、言论和互动。 - -【核心理念】 -我们构建了一个模拟世界,并向其中注入了特定的「模拟需求」作为变量。模拟世界的演化结果,就是对未来可能发生情况的预测。你正在观察的不是"实验数据",而是"未来的预演"。 - -【你的任务】 -撰写一份「未来预测报告」,回答: -1. 在我们设定的条件下,未来发生了什么? -2. 各类Agent(人群)是如何反应和行动? -3. 这个模拟揭示了哪些值得关注的未来趋势和风险? - -【报告定位】 -- ✅ 这是一份基于模拟的未来预测报告,揭示"如果这样,未来会怎样" -- ✅ 聚焦于预测结果:事件走向、群体反应、涌现现象、潜在风险 -- ✅ 模拟世界中的Agent言行就是对未来人群行为的预测 -- ❌ 不是对现实世界现状的分析 -- ❌ 不是泛泛而谈的舆情综述 - -【章节数量限制】 -- 最少2个章节,最多5个章节 -- 不需要子章节,每个章节直接撰写完整内容 -- 内容要精炼,聚焦于核心预测发现 -- 章节结构由你根据预测结果自主设计 - -请输出JSON格式的报告大纲,格式如下: +You are an expert in writing "Future Prediction Reports," possessing a "God's perspective" over the simulated world — you can observe every Agent's behavior, statements, and interactions within the simulation. + +[Core Concept] +We have built a simulated world and injected specific "simulation requirements" as variables. The evolution results of the simulated world are predictions of what may happen in the future. What you are observing is not "experimental data," but a "rehearsal of the future." + +[Your Task] +Write a "Future Prediction Report" that answers: +1. Under the conditions we set, what happened in the future? +2. How did various Agents (population groups) react and act? +3. What noteworthy future trends and risks does this simulation reveal? 
+ +[Report Positioning] +- This is a simulation-based Future Prediction Report, revealing "if this happens, what will the future look like" +- Focus on prediction results: event trajectories, group reactions, emergent phenomena, potential risks +- The statements and actions of Agents in the simulated world are predictions of future population behavior +- This is NOT an analysis of the current real-world situation +- This is NOT a generic public opinion summary + +[Section Count Limit] +- Minimum 2 sections, maximum 5 sections +- No sub-sections needed; each section should contain complete content directly +- Content should be concise, focusing on core prediction findings +- Section structure should be designed by you based on the prediction results + +Please output the Report outline in JSON format as follows: { - "title": "报告标题", - "summary": "报告摘要(一句话概括核心预测发现)", + "title": "Report title", + "summary": "Report summary (one sentence summarizing the core prediction findings)", "sections": [ { - "title": "章节标题", - "description": "章节内容描述" + "title": "Section title", + "description": "Section content description" } ] } -注意:sections数组最少2个,最多5个元素!""" +Note: the sections array must have a minimum of 2 and a maximum of 5 elements!""" PLAN_USER_PROMPT_TEMPLATE = """\ -【预测场景设定】 -我们向模拟世界注入的变量(模拟需求):{simulation_requirement} +[Prediction Scenario Setting] +The variable we injected into the simulation world (simulation requirement):{simulation_requirement} -【模拟世界规模】 -- 参与模拟的实体数量: {total_nodes} -- 实体间产生的关系数量: {total_edges} -- 实体类型分布: {entity_types} -- 活跃Agent数量: {total_entities} +[Simulation World Scale] +- Number of entities in simulation: {total_nodes} +- Number of relationships between entities: {total_edges} +- Entity type distribution: {entity_types} +- Number of active Agents: {total_entities} -【模拟预测到的部分未来事实样本】 +[Sample Future Facts Predicted by Simulation] {related_facts_json} -请以「上帝视角」审视这个未来预演: -1. 在我们设定的条件下,未来呈现出了什么样的状态? -2. 各类人群(Agent)是如何反应和行动的? -3. 这个模拟揭示了哪些值得关注的未来趋势? 
+Please examine this future rehearsal from "God's perspective": +1. Under our set conditions, what state did the future present? +2. How did various groups (Agents) react and act? +3. What noteworthy future trends does this simulation reveal? -根据预测结果,设计最合适的报告章节结构。 +Based on prediction results, design the most appropriate report section structure. -【再次提醒】报告章节数量:最少2个,最多5个,内容要精炼聚焦于核心预测发现。""" +[Reminder] Report section count: minimum 2, maximum 5, content should be concise and focused on core prediction findings.""" -# ── 章节生成 prompt ── +# ── Section generation prompt ── SECTION_SYSTEM_PROMPT_TEMPLATE = """\ -你是一个「未来预测报告」的撰写专家,正在撰写报告的一个章节。 +You are an expert in writing "Future Prediction Reports," currently writing one section of the report. -报告标题: {report_title} -报告摘要: {report_summary} -预测场景(模拟需求): {simulation_requirement} +Report title: {report_title} +Report summary: {report_summary} +Prediction scenario (simulation requirement): {simulation_requirement} -当前要撰写的章节: {section_title} +Current section to write: {section_title} ═══════════════════════════════════════════════════════════════ -【核心理念】 +[Core Concept] ═══════════════════════════════════════════════════════════════ -模拟世界是对未来的预演。我们向模拟世界注入了特定条件(模拟需求), -模拟中Agent的行为和互动,就是对未来人群行为的预测。 +The simulated world is a rehearsal of the future. We injected specific conditions (simulation requirements) into the simulated world. +The behavior and interactions of Agents in the simulation are predictions of future population behavior. -你的任务是: -- 揭示在设定条件下,未来发生了什么 -- 预测各类人群(Agent)是如何反应和行动的 -- 发现值得关注的未来趋势、风险和机会 +Your task is to: +- Reveal what happened in the future under the set conditions +- Predict how various population groups (Agents) reacted and acted +- Discover noteworthy future trends, risks, and opportunities -❌ 不要写成对现实世界现状的分析 -✅ 要聚焦于"未来会怎样"——模拟结果就是预测的未来 +Do NOT write this as an analysis of the current real-world situation. +DO focus on "what will the future look like" — the simulation results ARE the predicted future. 
═══════════════════════════════════════════════════════════════ -【最重要的规则 - 必须遵守】 +[Most Important Rules - Must Follow] ═══════════════════════════════════════════════════════════════ -1. 【必须调用工具观察模拟世界】 - - 你正在以「上帝视角」观察未来的预演 - - 所有内容必须来自模拟世界中发生的事件和Agent言行 - - 禁止使用你自己的知识来编写报告内容 - - 每个章节至少调用3次工具(最多5次)来观察模拟的世界,它代表了未来 - -2. 【必须引用Agent的原始言行】 - - Agent的发言和行为是对未来人群行为的预测 - - 在报告中使用引用格式展示这些预测,例如: - > "某类人群会表示:原文内容..." - - 这些引用是模拟预测的核心证据 - -3. 【语言一致性 - 引用内容必须翻译为报告语言】 - - 工具返回的内容可能包含英文或中英文混杂的表述 - - 如果模拟需求和材料原文是中文的,报告必须全部使用中文撰写 - - 当你引用工具返回的英文或中英混杂内容时,必须将其翻译为流畅的中文后再写入报告 - - 翻译时保持原意不变,确保表述自然通顺 - - 这一规则同时适用于正文和引用块(> 格式)中的内容 - -4. 【忠实呈现预测结果】 - - 报告内容必须反映模拟世界中的代表未来的模拟结果 - - 不要添加模拟中不存在的信息 - - 如果某方面信息不足,如实说明 +1. [Must Call Tools to Observe the Simulated World] + - You are observing a rehearsal of the future from "God's perspective" + - All content must come from events and Agent statements/actions in the simulated world + - Do NOT use your own knowledge to write report content + - Each section must call tools at least 3 times (maximum 5) to observe the simulated world, which represents the future + +2. [Must Quote Agents' Original Statements and Actions] + - Agent statements and behaviors are predictions of future population behavior + - Use quote format in the report to display these predictions, for example: + > "A certain group would say: original content..." + - These quotes are the core evidence of the simulation predictions + +3. 
[Language Consistency - Quoted Content Must Be Translated to Report Language] + - Content returned by tools may contain mixed languages + - The report should be written in the same language as the simulation requirements and source materials + - When quoting content returned by tools in a different language, you must translate it into fluent report language before writing it into the report + - Maintain the original meaning during translation, ensuring natural and smooth expression + - This rule applies to both body text and quoted blocks (> format) + +4. [Faithfully Present Prediction Results] + - Report content must reflect the simulation results representing the future from the simulated world + - Do not add information that does not exist in the simulation + - If information is insufficient in certain areas, state this honestly ═══════════════════════════════════════════════════════════════ -【⚠️ 格式规范 - 极其重要!】 +[Format Guidelines - Extremely Important!] ═══════════════════════════════════════════════════════════════ -【一个章节 = 最小内容单位】 -- 每个章节是报告的最小分块单位 -- ❌ 禁止在章节内使用任何 Markdown 标题(#、##、###、#### 等) -- ❌ 禁止在内容开头添加章节主标题 -- ✅ 章节标题由系统自动添加,你只需撰写纯正文内容 -- ✅ 使用**粗体**、段落分隔、引用、列表来组织内容,但不要用标题 +[One Section = Minimum Content Unit] +- Each section is the smallest unit of the report +- Do NOT use any Markdown headings (#, ##, ###, ####, etc.) within the section +- Do NOT add the section main title at the beginning of the content +- The section title is automatically added by the system; you only need to write plain body content +- Use **bold text**, paragraph breaks, quotes, and lists to organize content, but do not use headings -【正确示例】 +[Correct Example] ``` -本章节分析了事件的舆论传播态势。通过对模拟数据的深入分析,我们发现... +This section analyzes the public opinion propagation trends of the event. Through in-depth analysis of simulation data, we found... 
-**首发引爆阶段** +**Initial Breakout Phase** -微博作为舆情的第一现场,承担了信息首发的核心功能: +Weibo, as the first scene of public opinion, served as the core platform for initial information dissemination: -> "微博贡献了68%的首发声量..." +> "Weibo contributed 68% of the initial voice volume..." -**情绪放大阶段** +**Emotion Amplification Phase** -抖音平台进一步放大了事件影响力: +The TikTok platform further amplified the event's impact: -- 视觉冲击力强 -- 情绪共鸣度高 +- Strong visual impact +- High emotional resonance ``` -【错误示例】 +[Incorrect Example] ``` -## 执行摘要 ← 错误!不要添加任何标题 -### 一、首发阶段 ← 错误!不要用###分小节 -#### 1.1 详细分析 ← 错误!不要用####细分 +## Executive Summary <- Wrong! Do not add any headings +### 1. Initial Phase <- Wrong! Do not use ### for subsections +#### 1.1 Detailed Analysis <- Wrong! Do not use #### for further subdivision -本章节分析了... +This section analyzes... ``` ═══════════════════════════════════════════════════════════════ -【可用检索工具】(每章节调用3-5次) +[Available Retrieval Tools] (call 3-5 times per section) ═══════════════════════════════════════════════════════════════ {tools_description} -【工具使用建议 - 请混合使用不同工具,不要只用一种】 -- insight_forge: 深度洞察分析,自动分解问题并多维度检索事实和关系 -- panorama_search: 广角全景搜索,了解事件全貌、时间线和演变过程 -- quick_search: 快速验证某个具体信息点 -- interview_agents: 采访模拟Agent,获取不同角色的第一人称观点和真实反应 +[Tool Usage Suggestions - Mix different tools, do not use only one type] +- insight_forge: Deep insight analysis, automatically decomposes questions and retrieves facts and relationships across multiple dimensions +- panorama_search: Wide-angle panoramic search, understand the full picture of events, timelines, and evolution process +- quick_search: Quickly verify a specific piece of information +- interview_agents: Interview simulated Agents, obtain first-person perspectives and real reactions from different roles ═══════════════════════════════════════════════════════════════ -【工作流程】 +[Workflow] ═══════════════════════════════════════════════════════════════ -每次回复你只能做以下两件事之一(不可同时做): +Each response can only do one of the following two things (cannot do both 
simultaneously): -选项A - 调用工具: -输出你的思考,然后用以下格式调用一个工具: +Option A - Call a tool: +Output your thinking, then call a tool using the following format: -{{"name": "工具名称", "parameters": {{"参数名": "参数值"}}}} +{{"name": "Tool name", "parameters": {{"param_name": "param_value"}}}} -系统会执行工具并把结果返回给你。你不需要也不能自己编写工具返回结果。 +The system will execute the tool and return the result to you. You do not need to and cannot write tool return results yourself. -选项B - 输出最终内容: -当你已通过工具获取了足够信息,以 "Final Answer:" 开头输出章节内容。 +Option B - Output final content: +When you have obtained sufficient information through tools, output the section content starting with "Final Answer:". -⚠️ 严格禁止: -- 禁止在一次回复中同时包含工具调用和 Final Answer -- 禁止自己编造工具返回结果(Observation),所有工具结果由系统注入 -- 每次回复最多调用一个工具 +Strict prohibitions: +- Do NOT include both a tool call and Final Answer in the same response +- Do NOT fabricate tool return results (Observation); all tool results are injected by the system +- Call at most one tool per response ═══════════════════════════════════════════════════════════════ -【章节内容要求】 +[Section Content Requirements] ═══════════════════════════════════════════════════════════════ -1. 内容必须基于工具检索到的模拟数据 -2. 大量引用原文来展示模拟效果 -3. 使用Markdown格式(但禁止使用标题): - - 使用 **粗体文字** 标记重点(代替子标题) - - 使用列表(-或1.2.3.)组织要点 - - 使用空行分隔不同段落 - - ❌ 禁止使用 #、##、###、#### 等任何标题语法 -4. 【引用格式规范 - 必须单独成段】 - 引用必须独立成段,前后各有一个空行,不能混在段落中: - - ✅ 正确格式: +1. Content must be based on simulation data retrieved by tools +2. Extensively quote original text to demonstrate simulation effects +3. Use Markdown format (but do NOT use headings): + - Use **bold text** to mark key points (instead of subheadings) + - Use lists (- or 1.2.3.) to organize key points + - Use blank lines to separate different paragraphs + - Do NOT use #, ##, ###, #### or any heading syntax +4. 
[Quote Format - Must Be Standalone Paragraphs] + Quotes must be standalone paragraphs with a blank line before and after, not mixed into paragraphs: + + Correct format: ``` - 校方的回应被认为缺乏实质内容。 + The school's response was considered lacking in substance. - > "校方的应对模式在瞬息万变的社交媒体环境中显得僵化和迟缓。" + > "The school's response pattern appeared rigid and sluggish in the rapidly changing social media environment." - 这一评价反映了公众的普遍不满。 + This assessment reflects the widespread public dissatisfaction. ``` - ❌ 错误格式: + Incorrect format: ``` - 校方的回应被认为缺乏实质内容。> "校方的应对模式..." 这一评价反映了... + The school's response was considered lacking in substance. > "The school's response pattern..." This assessment reflects... ``` -5. 保持与其他章节的逻辑连贯性 -6. 【避免重复】仔细阅读下方已完成的章节内容,不要重复描述相同的信息 -7. 【再次强调】不要添加任何标题!用**粗体**代替小节标题""" +5. Maintain logical coherence with other sections +6. [Avoid Repetition] Carefully read the completed section content below; do not repeat the same information +7. [Emphasis] Do not add any headings! Use **bold text** instead of subsection headings""" SECTION_USER_PROMPT_TEMPLATE = """\ -已完成的章节内容(请仔细阅读,避免重复): +Completed section content (please read carefully to avoid repetition): {previous_content} ═══════════════════════════════════════════════════════════════ -【当前任务】撰写章节: {section_title} +[Current Task] Write section: {section_title} ═══════════════════════════════════════════════════════════════ -【重要提醒】 -1. 仔细阅读上方已完成的章节,避免重复相同的内容! -2. 开始前必须先调用工具获取模拟数据 -3. 请混合使用不同工具,不要只用一种 -4. 报告内容必须来自检索结果,不要使用自己的知识 +[Important Reminders] +1. Carefully read the completed sections above to avoid repeating the same content! +2. You must call tools to retrieve simulation data before starting +3. Mix different tools; do not use only one type +4. 
Report content must come from retrieval results; do not use your own knowledge -【⚠️ 格式警告 - 必须遵守】 -- ❌ 不要写任何标题(#、##、###、####都不行) -- ❌ 不要写"{section_title}"作为开头 -- ✅ 章节标题由系统自动添加 -- ✅ 直接写正文,用**粗体**代替小节标题 +[Format Warning - Must Follow] +- Do NOT write any headings (#, ##, ###, #### are all prohibited) +- Do NOT write "{section_title}" as the opening +- The section title is automatically added by the system +- Write body text directly, use **bold text** instead of subsection headings -请开始: -1. 首先思考(Thought)这个章节需要什么信息 -2. 然后调用工具(Action)获取模拟数据 -3. 收集足够信息后输出 Final Answer(纯正文,无任何标题)""" +Please begin: +1. First think (Thought) about what information this section needs +2. Then call tools (Action) to retrieve simulation data +3. After collecting sufficient information, output Final Answer (plain body text, no headings)""" -# ── ReACT 循环内消息模板 ── +# ── ReACT loop message templates ── REACT_OBSERVATION_TEMPLATE = """\ -Observation(检索结果): +Observation (retrieval results): -═══ 工具 {tool_name} 返回 ═══ +=== Tool {tool_name} returned === {result} ═══════════════════════════════════════════════════════════════ -已调用工具 {tool_calls_count}/{max_tool_calls} 次(已用: {used_tools_str}){unused_hint} -- 如果信息充分:以 "Final Answer:" 开头输出章节内容(必须引用上述原文) -- 如果需要更多信息:调用一个工具继续检索 +Tools called {tool_calls_count}/{max_tool_calls} times (used: {used_tools_str}){unused_hint} +- If information is sufficient: output section content starting with "Final Answer:" (must quote the original text above) +- If more information is needed: call a tool to continue retrieving ═══════════════════════════════════════════════════════════════""" REACT_INSUFFICIENT_TOOLS_MSG = ( - "【注意】你只调用了{tool_calls_count}次工具,至少需要{min_tool_calls}次。" - "请再调用工具获取更多模拟数据,然后再输出 Final Answer。{unused_hint}" + "[Notice] You have only called tools {tool_calls_count} times; at least {min_tool_calls} calls are required. " + "Please call more tools to retrieve additional simulation data before outputting Final Answer. 
{unused_hint}" ) REACT_INSUFFICIENT_TOOLS_MSG_ALT = ( - "当前只调用了 {tool_calls_count} 次工具,至少需要 {min_tool_calls} 次。" - "请调用工具获取模拟数据。{unused_hint}" + "Currently only {tool_calls_count} tool calls have been made; at least {min_tool_calls} are required. " + "Please call tools to retrieve simulation data. {unused_hint}" ) REACT_TOOL_LIMIT_MSG = ( - "工具调用次数已达上限({tool_calls_count}/{max_tool_calls}),不能再调用工具。" - '请立即基于已获取的信息,以 "Final Answer:" 开头输出章节内容。' + "Tool call limit reached ({tool_calls_count}/{max_tool_calls}); no more tool calls allowed. " + 'Please immediately output section content starting with "Final Answer:" based on the information already obtained.' ) -REACT_UNUSED_TOOLS_HINT = "\n💡 你还没有使用过: {unused_list},建议尝试不同工具获取多角度信息" +REACT_UNUSED_TOOLS_HINT = "\nTip: You have not yet used: {unused_list}. Consider trying different tools to get multi-perspective information." -REACT_FORCE_FINAL_MSG = "已达到工具调用限制,请直接输出 Final Answer: 并生成章节内容。" +REACT_FORCE_FINAL_MSG = "Tool call limit reached. Please directly output Final Answer: and generate section content." # ── Chat prompt ── CHAT_SYSTEM_PROMPT_TEMPLATE = """\ -你是一个简洁高效的模拟预测助手。 +You are a concise and efficient simulation prediction assistant. -【背景】 -预测条件: {simulation_requirement} +[Background] +Prediction conditions: {simulation_requirement} -【已生成的分析报告】 +[Generated Analysis Report] {report_content} -【规则】 -1. 优先基于上述报告内容回答问题 -2. 直接回答问题,避免冗长的思考论述 -3. 仅在报告内容不足以回答时,才调用工具检索更多数据 -4. 回答要简洁、清晰、有条理 +[Rules] +1. Prioritize answering questions based on the report content above +2. Answer questions directly; avoid lengthy deliberations +3. Only call tools to retrieve more data when the report content is insufficient to answer +4. 
Answers should be concise, clear, and well-organized -【可用工具】(仅在需要时使用,最多调用1-2次) +[Available Tools] (use only when needed, call at most 1-2 times) {tools_description} -【工具调用格式】 +[Tool Call Format] -{{"name": "工具名称", "parameters": {{"参数名": "参数值"}}}} +{{"name": "Tool name", "parameters": {{"param_name": "param_value"}}}} -【回答风格】 -- 简洁直接,不要长篇大论 -- 使用 > 格式引用关键内容 -- 优先给出结论,再解释原因""" +[Response Style] +- Concise and direct; avoid lengthy elaboration +- Use > format to quote key content +- Provide conclusions first, then explain reasons""" -CHAT_OBSERVATION_SUFFIX = "\n\n请简洁回答问题。" +CHAT_OBSERVATION_SUFFIX = "\n\nPlease answer the question concisely." # ═══════════════════════════════════════════════════════════════ -# ReportAgent 主类 +# ReportAgent main class # ═══════════════════════════════════════════════════════════════ class ReportAgent: """ - Report Agent - 模拟报告生成Agent + Report Agent - Simulation Report Generation Agent - 采用ReACT(Reasoning + Acting)模式: - 1. 规划阶段:分析模拟需求,规划报告目录结构 - 2. 生成阶段:逐章节生成内容,每章节可多次调用工具获取信息 - 3. 反思阶段:检查内容完整性和准确性 + Uses ReACT (Reasoning + Acting) pattern: + 1. Planning stage: analyze simulation requirements, plan report outline structure + 2. Generation stage: generate content section by section, each section can call tools multiple times + 3. 
Reflection stage: check content completeness and accuracy """ - # 最大工具调用次数(每个章节) + # Max tool calls per section MAX_TOOL_CALLS_PER_SECTION = 5 - # 最大反思轮数 + # Max reflection rounds MAX_REFLECTION_ROUNDS = 3 - # 对话中的最大工具调用次数 + # Max tool calls per chat MAX_TOOL_CALLS_PER_CHAT = 2 def __init__( @@ -889,14 +889,14 @@ def __init__( graph_tools: Optional[GraphToolsService] = None ): """ - 初始化Report Agent + Initialize Report Agent Args: - graph_id: 图谱ID - simulation_id: 模拟ID - simulation_requirement: 模拟需求描述 - llm_client: LLM客户端(可选) - graph_tools: 图谱工具服务(可选,需外部注入 GraphStorage) + graph_id: Graph ID + simulation_id: Simulation ID + simulation_requirement: Simulation requirement description + llm_client: LLM client (optional) + graph_tools: Graph tools service (optional, requires externally injected GraphStorage) """ self.graph_id = graph_id self.simulation_id = simulation_id @@ -910,66 +910,66 @@ def __init__( ) self.graph_tools = graph_tools - # 工具定义 + # Tool definitions self.tools = self._define_tools() - # 日志记录器(在 generate_report 中初始化) + # Logger (initialized in generate_report) self.report_logger: Optional[ReportLogger] = None - # 控制台日志记录器(在 generate_report 中初始化) + # Console logger (initialized in generate_report) self.console_logger: Optional[ReportConsoleLogger] = None - logger.info(f"ReportAgent 初始化完成: graph_id={graph_id}, simulation_id={simulation_id}") + logger.info(f"ReportAgent initialized: graph_id={graph_id}, simulation_id={simulation_id}") def _define_tools(self) -> Dict[str, Dict[str, Any]]: - """定义可用工具""" + """Define available tools""" return { "insight_forge": { "name": "insight_forge", "description": TOOL_DESC_INSIGHT_FORGE, "parameters": { - "query": "你想深入分析的问题或话题", - "report_context": "当前报告章节的上下文(可选,有助于生成更精准的子问题)" + "query": "The question or topic you want to deeply analyze", + "report_context": "Current report section context (optional, helps generate more precise sub-questions)" } }, "panorama_search": { "name": "panorama_search", "description": 
TOOL_DESC_PANORAMA_SEARCH, "parameters": { - "query": "搜索查询,用于相关性排序", - "include_expired": "是否包含过期/历史内容(默认True)" + "query": "Search query, used for relevance sorting", + "include_expired": "Whether to include expired/historical content (default True)" } }, "quick_search": { "name": "quick_search", "description": TOOL_DESC_QUICK_SEARCH, "parameters": { - "query": "搜索查询字符串", - "limit": "返回结果数量(可选,默认10)" + "query": "Search query string", + "limit": "Number of results (optional, default 10)" } }, "interview_agents": { "name": "interview_agents", "description": TOOL_DESC_INTERVIEW_AGENTS, "parameters": { - "interview_topic": "采访主题或需求描述(如:'了解学生对宿舍甲醛事件的看法')", - "max_agents": "最多采访的Agent数量(可选,默认5,最大10)" + "interview_topic": "Interview topic or requirement description (e.g., 'understand students' views on the dormitory formaldehyde incident')", + "max_agents": "Max Agents to interview (optional, default 5, max 10)" } } } def _execute_tool(self, tool_name: str, parameters: Dict[str, Any], report_context: str = "") -> str: """ - 执行工具调用 + Execute tool call Args: - tool_name: 工具名称 - parameters: 工具参数 - report_context: 报告上下文(用于InsightForge) + tool_name: Tool name + parameters: Tool parameters + report_context: Report context (for InsightForge) Returns: - 工具执行结果(文本格式) + Tool execution result (text format) """ - logger.info(f"执行工具: {tool_name}, 参数: {parameters}") + logger.info(f"Executing tool: {tool_name}, parameters: {parameters}") try: if tool_name == "insight_forge": @@ -984,7 +984,7 @@ def _execute_tool(self, tool_name: str, parameters: Dict[str, Any], report_conte return result.to_text() elif tool_name == "panorama_search": - # 广度搜索 - 获取全貌 + # Breadth search - get full picture query = parameters.get("query", "") include_expired = parameters.get("include_expired", True) if isinstance(include_expired, str): @@ -997,7 +997,7 @@ def _execute_tool(self, tool_name: str, parameters: Dict[str, Any], report_conte return result.to_text() elif tool_name == "quick_search": - # 简单搜索 - 
快速检索 + # Simple search - quick retrieval query = parameters.get("query", "") limit = parameters.get("limit", 10) if isinstance(limit, str): @@ -1010,7 +1010,7 @@ def _execute_tool(self, tool_name: str, parameters: Dict[str, Any], report_conte return result.to_text() elif tool_name == "interview_agents": - # 深度采访 - 调用真实的OASIS采访API获取模拟Agent的回答(双平台) + # In-depth interview - calls real OASIS interview API to get simulated Agent responses (dual platform) interview_topic = parameters.get("interview_topic", parameters.get("query", "")) max_agents = parameters.get("max_agents", 5) if isinstance(max_agents, str): @@ -1024,11 +1024,11 @@ def _execute_tool(self, tool_name: str, parameters: Dict[str, Any], report_conte ) return result.to_text() - # ========== 向后兼容的旧工具(内部重定向到新工具) ========== + # ========== Backward-compatible old tools (internally redirected to new tools) ========== elif tool_name == "search_graph": - # 重定向到 quick_search - logger.info("search_graph 已重定向到 quick_search") + # Redirect to quick_search + logger.info("search_graph redirected to quick_search") return self._execute_tool("quick_search", parameters, report_context) elif tool_name == "get_graph_statistics": @@ -1044,8 +1044,8 @@ def _execute_tool(self, tool_name: str, parameters: Dict[str, Any], report_conte return json.dumps(result, ensure_ascii=False, indent=2) elif tool_name == "get_simulation_context": - # 重定向到 insight_forge,因为它更强大 - logger.info("get_simulation_context 已重定向到 insight_forge") + # Redirect to insight_forge as it is more powerful + logger.info("get_simulation_context redirected to insight_forge") query = parameters.get("query", self.simulation_requirement) return self._execute_tool("insight_forge", {"query": query}, report_context) @@ -1059,26 +1059,26 @@ def _execute_tool(self, tool_name: str, parameters: Dict[str, Any], report_conte return json.dumps(result, ensure_ascii=False, indent=2) else: - return f"未知工具: {tool_name}。请使用以下工具之一: insight_forge, panorama_search, quick_search" + return 
f"Unknown tool: {tool_name}. Please use one of: insight_forge, panorama_search, quick_search" except Exception as e: - logger.error(f"工具执行失败: {tool_name}, 错误: {str(e)}") - return f"工具执行失败: {str(e)}" + logger.error(f"Tool execution failed: {tool_name}, error: {str(e)}") + return f"Tool execution failed: {str(e)}" - # 合法的工具名称集合,用于裸 JSON 兜底解析时校验 + # Valid tool name set, used for validation when falling back to bare JSON parsing VALID_TOOL_NAMES = {"insight_forge", "panorama_search", "quick_search", "interview_agents"} def _parse_tool_calls(self, response: str) -> List[Dict[str, Any]]: """ - 从LLM响应中解析工具调用 + Parse tool calls from LLM response - 支持的格式(按优先级): + Supported formats (by priority): 1. {"name": "tool_name", "parameters": {...}} - 2. 裸 JSON(响应整体或单行就是一个工具调用 JSON) + 2. Bare JSON (the entire response or a single line is a tool call JSON) """ tool_calls = [] - # 格式1: XML风格(标准格式) + # Format 1: XML style (standard format) xml_pattern = r'\s*(\{.*?\})\s*' for match in re.finditer(xml_pattern, response, re.DOTALL): try: @@ -1090,8 +1090,8 @@ def _parse_tool_calls(self, response: str) -> List[Dict[str, Any]]: if tool_calls: return tool_calls - # 格式2: 兜底 - LLM 直接输出裸 JSON(没包 标签) - # 只在格式1未匹配时尝试,避免误匹配正文中的 JSON + # Format 2: fallback - LLM directly outputs bare JSON (without tags) + # Only try when format 1 did not match, to avoid false matching JSON in body text stripped = response.strip() if stripped.startswith('{') and stripped.endswith('}'): try: @@ -1102,7 +1102,7 @@ def _parse_tool_calls(self, response: str) -> List[Dict[str, Any]]: except json.JSONDecodeError: pass - # 响应可能包含思考文字 + 裸 JSON,尝试提取最后一个 JSON 对象 + # Response may contain thinking text + bare JSON, try to extract last JSON object json_pattern = r'(\{"(?:name|tool)"\s*:.*?\})\s*$' match = re.search(json_pattern, stripped, re.DOTALL) if match: @@ -1116,11 +1116,11 @@ def _parse_tool_calls(self, response: str) -> List[Dict[str, Any]]: return tool_calls def _is_valid_tool_call(self, data: dict) -> bool: - 
"""校验解析出的 JSON 是否是合法的工具调用""" - # 支持 {"name": ..., "parameters": ...} 和 {"tool": ..., "params": ...} 两种键名 + """Validate if parsed JSON is a valid tool call""" + # Support both {"name": ..., "parameters": ...} and {"tool": ..., "params": ...} key names tool_name = data.get("name") or data.get("tool") if tool_name and tool_name in self.VALID_TOOL_NAMES: - # 统一键名为 name / parameters + # Normalize key names to name / parameters if "tool" in data: data["name"] = data.pop("tool") if "params" in data and "parameters" not in data: @@ -1129,13 +1129,13 @@ def _is_valid_tool_call(self, data: dict) -> bool: return False def _get_tools_description(self) -> str: - """生成工具描述文本""" - desc_parts = ["可用工具:"] + """Generate tool description text""" + desc_parts = ["Available tools:"] for name, tool in self.tools.items(): params_desc = ", ".join([f"{k}: {v}" for k, v in tool["parameters"].items()]) desc_parts.append(f"- {name}: {tool['description']}") if params_desc: - desc_parts.append(f" 参数: {params_desc}") + desc_parts.append(f" Parameters: {params_desc}") return "\n".join(desc_parts) def plan_outline( @@ -1143,29 +1143,29 @@ def plan_outline( progress_callback: Optional[Callable] = None ) -> ReportOutline: """ - 规划报告大纲 + Plan report outline - 使用LLM分析模拟需求,规划报告的目录结构 + Use LLM to analyze simulation requirements and plan report outline structure Args: - progress_callback: 进度回调函数 + progress_callback: Progress callback function Returns: - ReportOutline: 报告大纲 + ReportOutline: report outline """ - logger.info("开始规划报告大纲...") + logger.info("Starting to plan report outline...") if progress_callback: - progress_callback("planning", 0, "正在分析模拟需求...") + progress_callback("planning", 0, "Analyzing simulation requirements...") - # 首先获取模拟上下文 + # First get simulation context context = self.graph_tools.get_simulation_context( graph_id=self.graph_id, simulation_requirement=self.simulation_requirement ) if progress_callback: - progress_callback("planning", 30, "正在生成报告大纲...") + 
progress_callback("planning", 30, "Generating report outline...") system_prompt = PLAN_SYSTEM_PROMPT user_prompt = PLAN_USER_PROMPT_TEMPLATE.format( @@ -1187,9 +1187,9 @@ def plan_outline( ) if progress_callback: - progress_callback("planning", 80, "正在解析大纲结构...") + progress_callback("planning", 80, "Parsing outline structure...") - # 解析大纲 + # Parse outline sections = [] for section_data in response.get("sections", []): sections.append(ReportSection( @@ -1198,27 +1198,27 @@ def plan_outline( )) outline = ReportOutline( - title=response.get("title", "模拟分析报告"), + title=response.get("title", "Simulation Analysis Report"), summary=response.get("summary", ""), sections=sections ) if progress_callback: - progress_callback("planning", 100, "大纲规划完成") + progress_callback("planning", 100, "Outline planning complete") - logger.info(f"大纲规划完成: {len(sections)} 个章节") + logger.info(f"Outline planning complete: {len(sections)} sections") return outline except Exception as e: - logger.error(f"大纲规划失败: {str(e)}") - # 返回默认大纲(3个章节,作为fallback) + logger.error(f"Outline planning failed: {str(e)}") + # Return default outline (3 sections, as fallback) return ReportOutline( - title="未来预测报告", - summary="基于模拟预测的未来趋势与风险分析", + title="Future Prediction Report", + summary="Future trends and risk analysis based on simulation prediction", sections=[ - ReportSection(title="预测场景与核心发现"), - ReportSection(title="人群行为预测分析"), - ReportSection(title="趋势展望与风险提示") + ReportSection(title="Prediction Scenario and Core Findings"), + ReportSection(title="Population Behavior Prediction Analysis"), + ReportSection(title="Trend Outlook and Risk Alerts") ] ) @@ -1231,28 +1231,28 @@ def _generate_section_react( section_index: int = 0 ) -> str: """ - 使用ReACT模式生成单个章节内容 + Generate single section content using ReACT pattern - ReACT循环: - 1. Thought(思考)- 分析需要什么信息 - 2. Action(行动)- 调用工具获取信息 - 3. Observation(观察)- 分析工具返回结果 - 4. 重复直到信息足够或达到最大次数 - 5. Final Answer(最终回答)- 生成章节内容 + ReACT loop: + 1. 
Thought - analyze what info is needed + 2. Action - call tools to get info + 3. Observation - analyze tool return results + 4. Repeat until info sufficient or max iterations reached + 5. Final Answer - generate section content Args: - section: 要生成的章节 - outline: 完整大纲 - previous_sections: 之前章节的内容(用于保持连贯性) - progress_callback: 进度回调 - section_index: 章节索引(用于日志记录) + section: Section to generate + outline: Complete outline + previous_sections: Previous section content (for maintaining coherence) + progress_callback: Progress callback + section_index: Section index (for logging) Returns: - 章节内容(Markdown格式) + Section content (Markdown format) """ - logger.info(f"ReACT生成章节: {section.title}") + logger.info(f"ReACT generating section: {section.title}") - # 记录章节开始日志 + # Record section start log if self.report_logger: self.report_logger.log_section_start(section.title, section_index) @@ -1264,16 +1264,16 @@ def _generate_section_react( tools_description=self._get_tools_description(), ) - # 构建用户prompt - 每个已完成章节各传入最大4000字 + # Build user prompt - pass max 4000 chars for each completed section if previous_sections: previous_parts = [] for sec in previous_sections: - # 每个章节最多4000字 + # Max 4000 chars per section truncated = sec[:4000] + "..." 
if len(sec) > 4000 else sec previous_parts.append(truncated) previous_content = "\n\n---\n\n".join(previous_parts) else: - previous_content = "(这是第一个章节)" + previous_content = "(This is the first section)" user_prompt = SECTION_USER_PROMPT_TEMPLATE.format( previous_content=previous_content, @@ -1285,77 +1285,77 @@ def _generate_section_react( {"role": "user", "content": user_prompt} ] - # ReACT循环 + # ReACT loop tool_calls_count = 0 - max_iterations = 5 # 最大迭代轮数 - min_tool_calls = 3 # 最少工具调用次数 - conflict_retries = 0 # 工具调用与Final Answer同时出现的连续冲突次数 - used_tools = set() # 记录已调用过的工具名 + max_iterations = 5 # Max iterations + min_tool_calls = 3 # Min tool calls + conflict_retries = 0 # Consecutive conflicts where tool call and Final Answer appear simultaneously + used_tools = set() # Track called tool names all_tools = {"insight_forge", "panorama_search", "quick_search", "interview_agents"} - # 报告上下文,用于InsightForge的子问题生成 - report_context = f"章节标题: {section.title}\n模拟需求: {self.simulation_requirement}" + # Report context for InsightForge sub-question generation + report_context = f"Section title: {section.title}\nSimulation requirement: {self.simulation_requirement}" for iteration in range(max_iterations): if progress_callback: progress_callback( "generating", int((iteration / max_iterations) * 100), - f"深度检索与撰写中 ({tool_calls_count}/{self.MAX_TOOL_CALLS_PER_SECTION})" + f"Deep retrieval and writing ({tool_calls_count}/{self.MAX_TOOL_CALLS_PER_SECTION})" ) - # 调用LLM + # Call LLM response = self.llm.chat( messages=messages, temperature=0.5, max_tokens=4096 ) - # 检查 LLM 返回是否为 None(API 异常或内容为空) + # Check if LLM return is None (API exception or empty content) if response is None: - logger.warning(f"章节 {section.title} 第 {iteration + 1} 次迭代: LLM 返回 None") - # 如果还有迭代次数,添加消息并重试 + logger.warning(f"Section {section.title} iteration {iteration + 1}: LLM returned None") + # If iterations remain, add message and retry if iteration < max_iterations - 1: - messages.append({"role": 
"assistant", "content": "(响应为空)"}) - messages.append({"role": "user", "content": "请继续生成内容。"}) + messages.append({"role": "assistant", "content": "(Empty response)"}) + messages.append({"role": "user", "content": "Please continue generating content."}) continue - # 最后一次迭代也返回 None,跳出循环进入强制收尾 + # Last iteration also returned None, break to forced conclusion break - logger.debug(f"LLM响应: {response[:200]}...") + logger.debug(f"LLM response: {response[:200]}...") - # 解析一次,复用结果 + # Parse once, reuse result tool_calls = self._parse_tool_calls(response) has_tool_calls = bool(tool_calls) has_final_answer = "Final Answer:" in response - # ── 冲突处理:LLM 同时输出了工具调用和 Final Answer ── + # ── Conflict handling: LLM output both tool call and Final Answer ── if has_tool_calls and has_final_answer: conflict_retries += 1 logger.warning( - f"章节 {section.title} 第 {iteration+1} 轮: " - f"LLM 同时输出工具调用和 Final Answer(第 {conflict_retries} 次冲突)" + f"Section {section.title} round {iteration+1}: " + f"LLM output both tool call and Final Answer (conflict #{conflict_retries})" ) if conflict_retries <= 2: - # 前两次:丢弃本次响应,要求 LLM 重新回复 + # First two times: discard response, ask LLM to retry messages.append({"role": "assistant", "content": response}) messages.append({ "role": "user", "content": ( - "【格式错误】你在一次回复中同时包含了工具调用和 Final Answer,这是不允许的。\n" - "每次回复只能做以下两件事之一:\n" - "- 调用一个工具(输出一个 块,不要写 Final Answer)\n" - "- 输出最终内容(以 'Final Answer:' 开头,不要包含 )\n" - "请重新回复,只做其中一件事。" + "[Format Error] Your response included both a tool call and Final Answer, which is not allowed.\n" + "Each response can only do one of the following:\n" + "- Call a tool (output a block, do not write Final Answer)\n" + "- Output final content (start with 'Final Answer:', do not include )\n" + "Please respond again, doing only one of these." 
), }) continue else: - # 第三次:降级处理,截断到第一个工具调用,强制执行 + # Third time: degrade, truncate to first tool call, force execute logger.warning( - f"章节 {section.title}: 连续 {conflict_retries} 次冲突," - "降级为截断执行第一个工具调用" + f"Section {section.title}: {conflict_retries} consecutive conflicts, " + "degrading to truncated execution of first tool call" ) first_tool_end = response.find('') if first_tool_end != -1: @@ -1365,7 +1365,7 @@ def _generate_section_react( has_final_answer = False conflict_retries = 0 - # 记录 LLM 响应日志 + # Record LLM response log if self.report_logger: self.report_logger.log_llm_response( section_title=section.title, @@ -1376,13 +1376,13 @@ def _generate_section_react( has_final_answer=has_final_answer ) - # ── 情况1:LLM 输出了 Final Answer ── + # ── Case 1: LLM output Final Answer ── if has_final_answer: - # 工具调用次数不足,拒绝并要求继续调工具 + # Insufficient tool calls, reject and require more tool calls if tool_calls_count < min_tool_calls: messages.append({"role": "assistant", "content": response}) unused_tools = all_tools - used_tools - unused_hint = f"(这些工具还未使用,推荐用一下他们: {', '.join(unused_tools)})" if unused_tools else "" + unused_hint = f"(These tools have not been used yet, recommend trying them: {', '.join(unused_tools)})" if unused_tools else "" messages.append({ "role": "user", "content": REACT_INSUFFICIENT_TOOLS_MSG.format( @@ -1393,9 +1393,9 @@ def _generate_section_react( }) continue - # 正常结束 + # Normal end final_answer = response.split("Final Answer:")[-1].strip() - logger.info(f"章节 {section.title} 生成完成(工具调用: {tool_calls_count}次)") + logger.info(f"Section {section.title} generation complete (tool calls: {tool_calls_count})") if self.report_logger: self.report_logger.log_section_content( @@ -1406,9 +1406,9 @@ def _generate_section_react( ) return final_answer - # ── 情况2:LLM 尝试调用工具 ── + # ── Case 2: LLM attempted tool call ── if has_tool_calls: - # 工具额度已耗尽 → 明确告知,要求输出 Final Answer + # Tool quota exhausted -> explicitly inform, require Final Answer output if 
tool_calls_count >= self.MAX_TOOL_CALLS_PER_SECTION: messages.append({"role": "assistant", "content": response}) messages.append({ @@ -1420,10 +1420,10 @@ def _generate_section_react( }) continue - # 只执行第一个工具调用 + # Only execute first tool call call = tool_calls[0] if len(tool_calls) > 1: - logger.info(f"LLM 尝试调用 {len(tool_calls)} 个工具,只执行第一个: {call['name']}") + logger.info(f"LLM attempted {len(tool_calls)} tool calls, executing only first: {call['name']}") if self.report_logger: self.report_logger.log_tool_call( @@ -1452,7 +1452,7 @@ def _generate_section_react( tool_calls_count += 1 used_tools.add(call['name']) - # 构建未使用工具提示 + # Build unused tools hint unused_tools = all_tools - used_tools unused_hint = "" if unused_tools and tool_calls_count < self.MAX_TOOL_CALLS_PER_SECTION: @@ -1472,13 +1472,13 @@ def _generate_section_react( }) continue - # ── 情况3:既没有工具调用,也没有 Final Answer ── + # ── Case 3: Neither tool call nor Final Answer ── messages.append({"role": "assistant", "content": response}) if tool_calls_count < min_tool_calls: - # 工具调用次数不足,推荐未用过的工具 + # Insufficient tool calls, recommend unused tools unused_tools = all_tools - used_tools - unused_hint = f"(这些工具还未使用,推荐用一下他们: {', '.join(unused_tools)})" if unused_tools else "" + unused_hint = f"(These tools have not been used yet, recommend trying them: {', '.join(unused_tools)})" if unused_tools else "" messages.append({ "role": "user", @@ -1490,9 +1490,9 @@ def _generate_section_react( }) continue - # 工具调用已足够,LLM 输出了内容但没带 "Final Answer:" 前缀 - # 直接将这段内容作为最终答案,不再空转 - logger.info(f"章节 {section.title} 未检测到 'Final Answer:' 前缀,直接采纳LLM输出作为最终内容(工具调用: {tool_calls_count}次)") + # Tool calls sufficient, LLM output content without "Final Answer:" prefix + # Use this content directly as final answer, no more idle loops + logger.info(f"Section {section.title}: no 'Final Answer:' prefix detected, accepting LLM output as final content (tool calls: {tool_calls_count})") final_answer = response.strip() if self.report_logger: @@ 
-1504,8 +1504,8 @@ def _generate_section_react( ) return final_answer - # 达到最大迭代次数,强制生成内容 - logger.warning(f"章节 {section.title} 达到最大迭代次数,强制生成") + # Max iterations reached, force generate content + logger.warning(f"Section {section.title} reached max iterations, forcing generation") messages.append({"role": "user", "content": REACT_FORCE_FINAL_MSG}) response = self.llm.chat( @@ -1514,16 +1514,16 @@ def _generate_section_react( max_tokens=4096 ) - # 检查强制收尾时 LLM 返回是否为 None + # Check if LLM returns None during forced conclusion if response is None: - logger.error(f"章节 {section.title} 强制收尾时 LLM 返回 None,使用默认错误提示") - final_answer = f"(本章节生成失败:LLM 返回空响应,请稍后重试)" + logger.error(f"Section {section.title}: LLM returned None during forced conclusion, using default error message") + final_answer = f"(This section failed to generate: LLM returned empty response, please retry later)" elif "Final Answer:" in response: final_answer = response.split("Final Answer:")[-1].strip() else: final_answer = response - # 记录章节内容生成完成日志 + # Record section content generation complete log if self.report_logger: self.report_logger.log_section_content( section_title=section.title, @@ -1540,29 +1540,29 @@ def generate_report( report_id: Optional[str] = None ) -> Report: """ - 生成完整报告(分章节实时输出) + Generate complete report (real-time section-by-section output) - 每个章节生成完成后立即保存到文件夹,不需要等待整个报告完成。 - 文件结构: + Each section is saved to folder immediately after generation, no need to wait for entire report. + File structure: reports/{report_id}/ - meta.json - 报告元信息 - outline.json - 报告大纲 - progress.json - 生成进度 - section_01.md - 第1章节 - section_02.md - 第2章节 + meta.json - Report metadata + outline.json - Report outline + progress.json - Generation progress + section_01.md - Section 1 + section_02.md - Section 2 ... 
- full_report.md - 完整报告 + full_report.md - Complete report Args: - progress_callback: 进度回调函数 (stage, progress, message) - report_id: 报告ID(可选,如果不传则自动生成) + progress_callback: Progress callback function (stage, progress, message) + report_id: Report ID (optional, auto-generated if not provided) Returns: - Report: 完整报告 + Report: Complete report """ import uuid - # 如果没有传入 report_id,则自动生成 + # If no report_id passed, auto-generate if not report_id: report_id = f"report_{uuid.uuid4().hex[:12]}" start_time = datetime.now() @@ -1576,14 +1576,14 @@ def generate_report( created_at=datetime.now().isoformat() ) - # 已完成的章节标题列表(用于进度追踪) + # List of completed section titles (for progress tracking) completed_section_titles = [] try: - # 初始化:创建报告文件夹并保存初始状态 + # Initialize: create report folder and save initial state ReportManager._ensure_report_folder(report_id) - # 初始化日志记录器(结构化日志 agent_log.jsonl) + # Initialize logger (structured log agent_log.jsonl) self.report_logger = ReportLogger(report_id) self.report_logger.log_start( simulation_id=self.simulation_id, @@ -1591,27 +1591,27 @@ def generate_report( simulation_requirement=self.simulation_requirement ) - # 初始化控制台日志记录器(console_log.txt) + # Initialize console logger (console_log.txt) self.console_logger = ReportConsoleLogger(report_id) ReportManager.update_progress( - report_id, "pending", 0, "初始化报告...", + report_id, "pending", 0, "Initializing report...", completed_sections=[] ) ReportManager.save_report(report) - # 阶段1: 规划大纲 + # Stage 1: Plan outline report.status = ReportStatus.PLANNING ReportManager.update_progress( - report_id, "planning", 5, "开始规划报告大纲...", + report_id, "planning", 5, "Starting to plan report outline...", completed_sections=[] ) - # 记录规划开始日志 + # Record planning start log self.report_logger.log_planning_start() if progress_callback: - progress_callback("planning", 0, "开始规划报告大纲...") + progress_callback("planning", 0, "Starting to plan report outline...") outline = self.plan_outline( progress_callback=lambda stage, 
prog, msg: @@ -1619,33 +1619,33 @@ def generate_report( ) report.outline = outline - # 记录规划完成日志 + # Record planning complete log self.report_logger.log_planning_complete(outline.to_dict()) - # 保存大纲到文件 + # Save outline to file ReportManager.save_outline(report_id, outline) ReportManager.update_progress( - report_id, "planning", 15, f"大纲规划完成,共{len(outline.sections)}个章节", + report_id, "planning", 15, f"Outline planning complete, {len(outline.sections)} sections", completed_sections=[] ) ReportManager.save_report(report) - logger.info(f"大纲已保存到文件: {report_id}/outline.json") + logger.info(f"Outline saved to file: {report_id}/outline.json") - # 阶段2: 逐章节生成(分章节保存) + # Stage 2: Generate section by section (save per section) report.status = ReportStatus.GENERATING total_sections = len(outline.sections) - generated_sections = [] # 保存内容用于上下文 + generated_sections = [] # Save content for context for i, section in enumerate(outline.sections): section_num = i + 1 base_progress = 20 + int((i / total_sections) * 70) - # 更新进度 + # Update progress ReportManager.update_progress( report_id, "generating", base_progress, - f"正在生成章节: {section.title} ({section_num}/{total_sections})", + f"Generating section: {section.title} ({section_num}/{total_sections})", current_section=section.title, completed_sections=completed_section_titles ) @@ -1654,10 +1654,10 @@ def generate_report( progress_callback( "generating", base_progress, - f"正在生成章节: {section.title} ({section_num}/{total_sections})" + f"Generating section: {section.title} ({section_num}/{total_sections})" ) - # 生成主章节内容 + # Generate main section content section_content = self._generate_section_react( section=section, outline=outline, @@ -1674,11 +1674,11 @@ def generate_report( section.content = section_content generated_sections.append(f"## {section.title}\n\n{section_content}") - # 保存章节 + # Save section ReportManager.save_section(report_id, section_num, section) completed_section_titles.append(section.title) - # 记录章节完成日志 + # Record 
section complete log full_section_content = f"## {section.title}\n\n{section_content}" if self.report_logger: @@ -1688,54 +1688,54 @@ def generate_report( full_content=full_section_content.strip() ) - logger.info(f"章节已保存: {report_id}/section_{section_num:02d}.md") + logger.info(f"Section saved: {report_id}/section_{section_num:02d}.md") - # 更新进度 + # Update progress ReportManager.update_progress( report_id, "generating", base_progress + int(70 / total_sections), - f"章节 {section.title} 已完成", + f"Section {section.title} completed", current_section=None, completed_sections=completed_section_titles ) - # 阶段3: 组装完整报告 + # Stage 3: Assemble complete report if progress_callback: - progress_callback("generating", 95, "正在组装完整报告...") - + progress_callback("generating", 95, "Assembling complete report...") + ReportManager.update_progress( - report_id, "generating", 95, "正在组装完整报告...", + report_id, "generating", 95, "Assembling complete report...", completed_sections=completed_section_titles ) - # 使用ReportManager组装完整报告 + # Use ReportManager to assemble complete report report.markdown_content = ReportManager.assemble_full_report(report_id, outline) report.status = ReportStatus.COMPLETED report.completed_at = datetime.now().isoformat() - # 计算总耗时 + # Calculate total time total_time_seconds = (datetime.now() - start_time).total_seconds() - # 记录报告完成日志 + # Record report complete log if self.report_logger: self.report_logger.log_report_complete( total_sections=total_sections, total_time_seconds=total_time_seconds ) - # 保存最终报告 + # Save final report ReportManager.save_report(report) ReportManager.update_progress( - report_id, "completed", 100, "报告生成完成", + report_id, "completed", 100, "Report generation complete", completed_sections=completed_section_titles ) if progress_callback: - progress_callback("completed", 100, "报告生成完成") + progress_callback("completed", 100, "Report generation complete") - logger.info(f"报告生成完成: {report_id}") + logger.info(f"Report generation complete: {report_id}") 
- # 关闭控制台日志记录器 + # Close console logger if self.console_logger: self.console_logger.close() self.console_logger = None @@ -1743,25 +1743,25 @@ def generate_report( return report except Exception as e: - logger.error(f"报告生成失败: {str(e)}") + logger.error(f"Report generation failed: {str(e)}") report.status = ReportStatus.FAILED report.error = str(e) - # 记录错误日志 + # Record error log if self.report_logger: self.report_logger.log_error(str(e), "failed") - # 保存失败状态 + # Save failed state try: ReportManager.save_report(report) ReportManager.update_progress( - report_id, "failed", -1, f"报告生成失败: {str(e)}", + report_id, "failed", -1, f"Report generation failed: {str(e)}", completed_sections=completed_section_titles ) except Exception: - pass # 忽略保存失败的错误 + pass # Ignore save failure errors - # 关闭控制台日志记录器 + # Close console logger if self.console_logger: self.console_logger.close() self.console_logger = None @@ -1774,59 +1774,59 @@ def chat( chat_history: List[Dict[str, str]] = None ) -> Dict[str, Any]: """ - 与Report Agent对话 + Chat with Report Agent - 在对话中Agent可以自主调用检索工具来回答问题 + In conversation Agent can autonomously call retrieval tools to answer questions Args: - message: 用户消息 - chat_history: 对话历史 + message: User message + chat_history: Chat history Returns: { - "response": "Agent回复", - "tool_calls": [调用的工具列表], - "sources": [信息来源] + "response": "Agent response", + "tool_calls": [list of tools called], + "sources": [information sources] } """ - logger.info(f"Report Agent对话: {message[:50]}...") + logger.info(f"Report Agent chat: {message[:50]}...") chat_history = chat_history or [] - # 获取已生成的报告内容 + # Get already generated report content report_content = "" try: report = ReportManager.get_report_by_simulation(self.simulation_id) if report and report.markdown_content: - # 限制报告长度,避免上下文过长 + # Limit report length to avoid overly long context report_content = report.markdown_content[:15000] if len(report.markdown_content) > 15000: - report_content += "\n\n... [报告内容已截断] ..." 
+ report_content += "\n\n... [Report content truncated] ..." except Exception as e: - logger.warning(f"获取报告内容失败: {e}") + logger.warning(f"Failed to get report content: {e}") system_prompt = CHAT_SYSTEM_PROMPT_TEMPLATE.format( simulation_requirement=self.simulation_requirement, - report_content=report_content if report_content else "(暂无报告)", + report_content=report_content if report_content else "(No report yet)", tools_description=self._get_tools_description(), ) - # 构建消息 + # Build messages messages = [{"role": "system", "content": system_prompt}] - # 添加历史对话 - for h in chat_history[-10:]: # 限制历史长度 + # Add chat history + for h in chat_history[-10:]: # Limit history length messages.append(h) - # 添加用户消息 + # Add user message messages.append({ "role": "user", "content": message }) - # ReACT循环(简化版) + # ReACT loop (simplified version) tool_calls_made = [] - max_iterations = 2 # 减少迭代轮数 + max_iterations = 2 # Reduce iteration rounds for iteration in range(max_iterations): response = self.llm.chat( @@ -1834,11 +1834,11 @@ def chat( temperature=0.5 ) - # 解析工具调用 + # Parse tool calls tool_calls = self._parse_tool_calls(response) if not tool_calls: - # 没有工具调用,直接返回响应 + # No tool calls, return response directly clean_response = re.sub(r'.*?', '', response, flags=re.DOTALL) clean_response = re.sub(r'\[TOOL_CALL\].*?\)', '', clean_response) @@ -1848,33 +1848,33 @@ def chat( "sources": [tc.get("parameters", {}).get("query", "") for tc in tool_calls_made] } - # 执行工具调用(限制数量) + # Execute tool call (limit quantity) tool_results = [] - for call in tool_calls[:1]: # 每轮最多执行1次工具调用 + for call in tool_calls[:1]: # Max 1 tool call execution per round if len(tool_calls_made) >= self.MAX_TOOL_CALLS_PER_CHAT: break result = self._execute_tool(call["name"], call.get("parameters", {})) tool_results.append({ "tool": call["name"], - "result": result[:1500] # 限制结果长度 + "result": result[:1500] # Limit result length }) tool_calls_made.append(call) - # 将结果添加到消息 + # Add results to messages 
messages.append({"role": "assistant", "content": response}) - observation = "\n".join([f"[{r['tool']}结果]\n{r['result']}" for r in tool_results]) + observation = "\n".join([f"[{r['tool']} result]\n{r['result']}" for r in tool_results]) messages.append({ "role": "user", "content": observation + CHAT_OBSERVATION_SUFFIX }) - # 达到最大迭代,获取最终响应 + # Max iterations reached, get final response final_response = self.llm.chat( messages=messages, temperature=0.5 ) - # 清理响应 + # Clean response clean_response = re.sub(r'.*?', '', final_response, flags=re.DOTALL) clean_response = re.sub(r'\[TOOL_CALL\].*?\)', '', clean_response) @@ -1887,95 +1887,95 @@ def chat( class ReportManager: """ - 报告管理器 + Report Manager - 负责报告的持久化存储和检索 + Responsible for persistent storage and retrieval of reports - 文件结构(分章节输出): + File structure (section-by-section output): reports/ {report_id}/ - meta.json - 报告元信息和状态 - outline.json - 报告大纲 - progress.json - 生成进度 - section_01.md - 第1章节 - section_02.md - 第2章节 + meta.json - Report metadata and status + outline.json - Report outline + progress.json - Generation progress + section_01.md - Section 1 + section_02.md - Section 2 ... 
- full_report.md - 完整报告 + full_report.md - Complete report """ - # 报告存储目录 + # Report storage directory REPORTS_DIR = os.path.join(Config.UPLOAD_FOLDER, 'reports') @classmethod def _ensure_reports_dir(cls): - """确保报告根目录存在""" + """Ensure report root directory exists""" os.makedirs(cls.REPORTS_DIR, exist_ok=True) @classmethod def _get_report_folder(cls, report_id: str) -> str: - """获取报告文件夹路径""" + """Get report folder path""" return os.path.join(cls.REPORTS_DIR, report_id) @classmethod def _ensure_report_folder(cls, report_id: str) -> str: - """确保报告文件夹存在并返回路径""" + """Ensure report folder exists and return path""" folder = cls._get_report_folder(report_id) os.makedirs(folder, exist_ok=True) return folder @classmethod def _get_report_path(cls, report_id: str) -> str: - """获取报告元信息文件路径""" + """Get report metadata file path""" return os.path.join(cls._get_report_folder(report_id), "meta.json") @classmethod def _get_report_markdown_path(cls, report_id: str) -> str: - """获取完整报告Markdown文件路径""" + """Get complete report Markdown file path""" return os.path.join(cls._get_report_folder(report_id), "full_report.md") @classmethod def _get_outline_path(cls, report_id: str) -> str: - """获取大纲文件路径""" + """Get outline file path""" return os.path.join(cls._get_report_folder(report_id), "outline.json") @classmethod def _get_progress_path(cls, report_id: str) -> str: - """获取进度文件路径""" + """Get progress file path""" return os.path.join(cls._get_report_folder(report_id), "progress.json") @classmethod def _get_section_path(cls, report_id: str, section_index: int) -> str: - """获取章节Markdown文件路径""" + """Get section Markdown file path""" return os.path.join(cls._get_report_folder(report_id), f"section_{section_index:02d}.md") @classmethod def _get_agent_log_path(cls, report_id: str) -> str: - """获取 Agent 日志文件路径""" + """Get Agent log file path""" return os.path.join(cls._get_report_folder(report_id), "agent_log.jsonl") @classmethod def _get_console_log_path(cls, report_id: str) -> str: - 
"""获取控制台日志文件路径""" + """Get console log file path""" return os.path.join(cls._get_report_folder(report_id), "console_log.txt") @classmethod def get_console_log(cls, report_id: str, from_line: int = 0) -> Dict[str, Any]: """ - 获取控制台日志内容 + Get console log content - 这是报告生成过程中的控制台输出日志(INFO、WARNING等), - 与 agent_log.jsonl 的结构化日志不同。 + These are console output logs during report generation (INFO, WARNING, etc.), + different from the structured logs in agent_log.jsonl. Args: - report_id: 报告ID - from_line: 从第几行开始读取(用于增量获取,0 表示从头开始) - + report_id: Report ID + from_line: Start reading from which line (for incremental retrieval, 0 means from beginning) + Returns: { - "logs": [日志行列表], - "total_lines": 总行数, - "from_line": 起始行号, - "has_more": 是否还有更多日志 + "logs": [list of log lines], + "total_lines": total line count, + "from_line": starting line number, + "has_more": whether there are more logs } """ log_path = cls._get_console_log_path(report_id) @@ -1995,26 +1995,26 @@ def get_console_log(cls, report_id: str, from_line: int = 0) -> Dict[str, Any]: for i, line in enumerate(f): total_lines = i + 1 if i >= from_line: - # 保留原始日志行,去掉末尾换行符 + # Keep original log line, remove trailing newline logs.append(line.rstrip('\n\r')) return { "logs": logs, "total_lines": total_lines, "from_line": from_line, - "has_more": False # 已读取到末尾 + "has_more": False # Read to end } @classmethod def get_console_log_stream(cls, report_id: str) -> List[str]: """ - 获取完整的控制台日志(一次性获取全部) + Get complete console log (all at once) Args: - report_id: 报告ID - + report_id: Report ID + Returns: - 日志行列表 + List of log lines """ result = cls.get_console_log(report_id, from_line=0) return result["logs"] @@ -2022,18 +2022,18 @@ def get_console_log_stream(cls, report_id: str) -> List[str]: @classmethod def get_agent_log(cls, report_id: str, from_line: int = 0) -> Dict[str, Any]: """ - 获取 Agent 日志内容 + Get Agent log content Args: - report_id: 报告ID - from_line: 从第几行开始读取(用于增量获取,0 表示从头开始) - + report_id: Report ID + from_line: Start 
reading from which line (for incremental retrieval, 0 means from beginning) + Returns: { - "logs": [日志条目列表], - "total_lines": 总行数, - "from_line": 起始行号, - "has_more": 是否还有更多日志 + "logs": [list of log entries], + "total_lines": total line count, + "from_line": starting line number, + "has_more": whether there are more logs } """ log_path = cls._get_agent_log_path(report_id) @@ -2057,26 +2057,26 @@ def get_agent_log(cls, report_id: str, from_line: int = 0) -> Dict[str, Any]: log_entry = json.loads(line.strip()) logs.append(log_entry) except json.JSONDecodeError: - # 跳过解析失败的行 + # Skip lines that fail to parse continue return { "logs": logs, "total_lines": total_lines, "from_line": from_line, - "has_more": False # 已读取到末尾 + "has_more": False # Read to end } @classmethod def get_agent_log_stream(cls, report_id: str) -> List[Dict[str, Any]]: """ - 获取完整的 Agent 日志(用于一次性获取全部) + Get complete Agent log (for getting all at once) Args: - report_id: 报告ID - + report_id: Report ID + Returns: - 日志条目列表 + List of log entries """ result = cls.get_agent_log(report_id, from_line=0) return result["logs"] @@ -2084,16 +2084,16 @@ def get_agent_log_stream(cls, report_id: str) -> List[Dict[str, Any]]: @classmethod def save_outline(cls, report_id: str, outline: ReportOutline) -> None: """ - 保存报告大纲 - - 在规划阶段完成后立即调用 + Save report outline + + Called immediately after planning stage completes """ cls._ensure_report_folder(report_id) with open(cls._get_outline_path(report_id), 'w', encoding='utf-8') as f: json.dump(outline.to_dict(), f, ensure_ascii=False, indent=2) - logger.info(f"大纲已保存: {report_id}") + logger.info(f"Outline saved: {report_id}") @classmethod def save_section( @@ -2103,49 +2103,49 @@ def save_section( section: ReportSection ) -> str: """ - 保存单个章节 + Save a single section - 在每个章节生成完成后立即调用,实现分章节输出 + Called immediately after each section is generated, enabling section-by-section output Args: - report_id: 报告ID - section_index: 章节索引(从1开始) - section: 章节对象 + report_id: Report ID + 
section_index: Section index (starting from 1) + section: Section object Returns: - 保存的文件路径 + Saved file path """ cls._ensure_report_folder(report_id) - # 构建章节Markdown内容 - 清理可能存在的重复标题 + # Build section Markdown content - clean possible duplicate titles cleaned_content = cls._clean_section_content(section.content, section.title) md_content = f"## {section.title}\n\n" if cleaned_content: md_content += f"{cleaned_content}\n\n" - # 保存文件 + # Save file file_suffix = f"section_{section_index:02d}.md" file_path = os.path.join(cls._get_report_folder(report_id), file_suffix) with open(file_path, 'w', encoding='utf-8') as f: f.write(md_content) - logger.info(f"章节已保存: {report_id}/{file_suffix}") + logger.info(f"Section saved: {report_id}/{file_suffix}") return file_path @classmethod def _clean_section_content(cls, content: str, section_title: str) -> str: """ - 清理章节内容 + Clean section content - 1. 移除内容开头与章节标题重复的Markdown标题行 - 2. 将所有 ### 及以下级别的标题转换为粗体文本 + 1. Remove Markdown heading lines at content start that duplicate section title + 2. 
Convert all ### and lower level headings to bold text Args: - content: 原始内容 - section_title: 章节标题 + content: Original content + section_title: Section title Returns: - 清理后的内容 + Cleaned content """ import re @@ -2160,26 +2160,26 @@ def _clean_section_content(cls, content: str, section_title: str) -> str: for i, line in enumerate(lines): stripped = line.strip() - # 检查是否是Markdown标题行 + # Check if line is Markdown heading heading_match = re.match(r'^(#{1,6})\s+(.+)$', stripped) if heading_match: level = len(heading_match.group(1)) title_text = heading_match.group(2).strip() - # 检查是否是与章节标题重复的标题(跳过前5行内的重复) + # Check if this is a heading that duplicates the section title (skip duplicates within the first 5 lines) if i < 5: if title_text == section_title or title_text.replace(' ', '') == section_title.replace(' ', ''): skip_next_empty = True continue - # 将所有级别的标题(#, ##, ###, ####等)转换为粗体 - # 因为章节标题由系统添加,内容中不应有任何标题 + # Convert all heading levels (#, ##, ###, ####, etc.) to bold + # Because the section title is added by the system, content should not have any headings cleaned_lines.append(f"**{title_text}**") - cleaned_lines.append("") # 添加空行 + cleaned_lines.append("") # Add blank line continue - # 如果上一行是被跳过的标题,且当前行为空,也跳过 + # If previous line was a skipped heading and current line is empty, skip too if skip_next_empty and stripped == '': skip_next_empty = False continue @@ -2187,14 +2187,14 @@ def _clean_section_content(cls, content: str, section_title: str) -> str: skip_next_empty = False cleaned_lines.append(line) - # 移除开头的空行 + # Remove leading empty lines while cleaned_lines and cleaned_lines[0].strip() == '': cleaned_lines.pop(0) - # 移除开头的分隔线 + # Remove leading separator lines while cleaned_lines and cleaned_lines[0].strip() in ['---', '***', '___']: cleaned_lines.pop(0) - # 同时移除分隔线后的空行 + # Also remove empty lines after separator while cleaned_lines and cleaned_lines[0].strip() == '': cleaned_lines.pop(0) @@ -2211,9 +2211,9 @@ def update_progress( completed_sections: 
List[str] = None ) -> None: """ - 更新报告生成进度 - - 前端可以通过读取progress.json获取实时进度 + Update report generation progress + + Frontend can get real-time progress by reading progress.json """ cls._ensure_report_folder(report_id) @@ -2231,7 +2231,7 @@ def update_progress( @classmethod def get_progress(cls, report_id: str) -> Optional[Dict[str, Any]]: - """获取报告生成进度""" + """Get report generation progress""" path = cls._get_progress_path(report_id) if not os.path.exists(path): @@ -2243,9 +2243,9 @@ def get_progress(cls, report_id: str) -> Optional[Dict[str, Any]]: @classmethod def get_generated_sections(cls, report_id: str) -> List[Dict[str, Any]]: """ - 获取已生成的章节列表 + Get list of generated sections - 返回所有已保存的章节文件信息 + Returns info of all saved section files """ folder = cls._get_report_folder(report_id) @@ -2259,7 +2259,7 @@ def get_generated_sections(cls, report_id: str) -> List[Dict[str, Any]]: with open(file_path, 'r', encoding='utf-8') as f: content = f.read() - # 从文件名解析章节索引 + # Parse section index from filename parts = filename.replace('.md', '').split('_') section_index = int(parts[1]) @@ -2274,48 +2274,48 @@ def get_generated_sections(cls, report_id: str) -> List[Dict[str, Any]]: @classmethod def assemble_full_report(cls, report_id: str, outline: ReportOutline) -> str: """ - 组装完整报告 - - 从已保存的章节文件组装完整报告,并进行标题清理 + Assemble complete report + + Assembles the complete report from saved section files, with heading cleanup """ folder = cls._get_report_folder(report_id) - # 构建报告头部 + # Build report header md_content = f"# {outline.title}\n\n" md_content += f"> {outline.summary}\n\n" md_content += f"---\n\n" - # 按顺序读取所有章节文件 + # Read all section files in order sections = cls.get_generated_sections(report_id) for section_info in sections: md_content += section_info["content"] - # 后处理:清理整个报告的标题问题 + # Post-process: clean title issues in entire report md_content = cls._post_process_report(md_content, outline) - # 保存完整报告 + # Save complete report full_path = 
cls._get_report_markdown_path(report_id) with open(full_path, 'w', encoding='utf-8') as f: f.write(md_content) - logger.info(f"完整报告已组装: {report_id}") + logger.info(f"Complete report assembled: {report_id}") return md_content @classmethod def _post_process_report(cls, content: str, outline: ReportOutline) -> str: """ - 后处理报告内容 + Post-process report content - 1. 移除重复的标题 - 2. 保留报告主标题(#)和章节标题(##),移除其他级别的标题(###, ####等) - 3. 清理多余的空行和分隔线 + 1. Remove duplicate titles + 2. Keep report main title (#) and section titles (##), remove other heading levels (###, ####, etc.) + 3. Clean excess empty lines and separator lines Args: - content: 原始报告内容 - outline: 报告大纲 + content: Original report content + outline: Report outline Returns: - 处理后的内容 + Processed content """ import re @@ -2323,7 +2323,7 @@ def _post_process_report(cls, content: str, outline: ReportOutline) -> str: processed_lines = [] prev_was_heading = False - # 收集大纲中的所有章节标题 + # Collect all section titles from the outline section_titles = set() for section in outline.sections: section_titles.add(section.title) @@ -2333,14 +2333,14 @@ def _post_process_report(cls, content: str, outline: ReportOutline) -> str: line = lines[i] stripped = line.strip() - # 检查是否是标题行 + # Check if line is heading heading_match = re.match(r'^(#{1,6})\s+(.+)$', stripped) if heading_match: level = len(heading_match.group(1)) title = heading_match.group(2).strip() - # 检查是否是重复标题(在连续5行内出现相同内容的标题) + # Check for duplicate title (same content heading within 5 consecutive lines) is_duplicate = False for j in range(max(0, len(processed_lines) - 5), len(processed_lines)): prev_line = processed_lines[j].strip() @@ -2352,43 +2352,43 @@ def _post_process_report(cls, content: str, outline: ReportOutline) -> str: break if is_duplicate: - # 跳过重复标题及其后的空行 + # Skip duplicate title and trailing empty lines i += 1 while i < len(lines) and lines[i].strip() == '': i += 1 continue - # 标题层级处理: - # - # (level=1) 只保留报告主标题 - # - ## (level=2) 保留章节标题 - # - ### 及以下 (level>=3) 
转换为粗体文本 + # Heading level handling: + # - # (level=1) keep only report main title + # - ## (level=2) keep section titles + # - ### and below (level>=3) convert to bold text if level == 1: if title == outline.title: - # 保留报告主标题 + # Keep report main title processed_lines.append(line) prev_was_heading = True elif title in section_titles: - # 章节标题错误使用了#,修正为## + # Section title incorrectly used #, correcting to ## processed_lines.append(f"## {title}") prev_was_heading = True else: - # 其他一级标题转为粗体 + # Convert other level-1 titles to bold processed_lines.append(f"**{title}**") processed_lines.append("") prev_was_heading = False elif level == 2: if title in section_titles or title == outline.title: - # 保留章节标题 + # Keep section title processed_lines.append(line) prev_was_heading = True else: - # 非章节的二级标题转为粗体 + # Convert non-section level-2 titles to bold processed_lines.append(f"**{title}**") processed_lines.append("") prev_was_heading = False else: - # ### 及以下级别的标题转换为粗体文本 + # Convert ### and lower level headings to bold text processed_lines.append(f"**{title}**") processed_lines.append("") prev_was_heading = False @@ -2397,12 +2397,12 @@ def _post_process_report(cls, content: str, outline: ReportOutline) -> str: continue elif stripped == '---' and prev_was_heading: - # 跳过标题后紧跟的分隔线 + # Skip separator line immediately after heading i += 1 continue elif stripped == '' and prev_was_heading: - # 标题后只保留一个空行 + # Keep only one empty line after heading if processed_lines and processed_lines[-1].strip() != '': processed_lines.append(line) prev_was_heading = False @@ -2413,7 +2413,7 @@ def _post_process_report(cls, content: str, outline: ReportOutline) -> str: i += 1 - # 清理连续的多个空行(保留最多2个) + # Clean consecutive empty lines (keep at most 2) result_lines = [] empty_count = 0 for line in processed_lines: @@ -2429,31 +2429,31 @@ def _post_process_report(cls, content: str, outline: ReportOutline) -> str: @classmethod def save_report(cls, report: Report) -> None: - """保存报告元信息和完整报告""" + 
"""Save report metadata and complete report""" cls._ensure_report_folder(report.report_id) - # 保存元信息JSON + # Save metadata JSON with open(cls._get_report_path(report.report_id), 'w', encoding='utf-8') as f: json.dump(report.to_dict(), f, ensure_ascii=False, indent=2) - # 保存大纲 + # Save outline if report.outline: cls.save_outline(report.report_id, report.outline) - # 保存完整Markdown报告 + # Save complete Markdown report if report.markdown_content: with open(cls._get_report_markdown_path(report.report_id), 'w', encoding='utf-8') as f: f.write(report.markdown_content) - logger.info(f"报告已保存: {report.report_id}") + logger.info(f"Report saved: {report.report_id}") @classmethod def get_report(cls, report_id: str) -> Optional[Report]: - """获取报告""" + """Get report""" path = cls._get_report_path(report_id) if not os.path.exists(path): - # 兼容旧格式:检查直接存储在reports目录下的文件 + # Backward compatible: check files stored directly in reports directory old_path = os.path.join(cls.REPORTS_DIR, f"{report_id}.json") if os.path.exists(old_path): path = old_path @@ -2463,7 +2463,7 @@ def get_report(cls, report_id: str) -> Optional[Report]: with open(path, 'r', encoding='utf-8') as f: data = json.load(f) - # 重建Report对象 + # Rebuild Report object outline = None if data.get('outline'): outline_data = data['outline'] @@ -2479,7 +2479,7 @@ def get_report(cls, report_id: str) -> Optional[Report]: sections=sections ) - # 如果markdown_content为空,尝试从full_report.md读取 + # If markdown_content is empty, try reading from full_report.md markdown_content = data.get('markdown_content', '') if not markdown_content: full_report_path = cls._get_report_markdown_path(report_id) @@ -2502,17 +2502,17 @@ def get_report(cls, report_id: str) -> Optional[Report]: @classmethod def get_report_by_simulation(cls, simulation_id: str) -> Optional[Report]: - """根据模拟ID获取报告""" + """Get report by simulation ID""" cls._ensure_reports_dir() for item in os.listdir(cls.REPORTS_DIR): item_path = os.path.join(cls.REPORTS_DIR, item) - # 新格式:文件夹 + # 
New format: folder if os.path.isdir(item_path): report = cls.get_report(item) if report and report.simulation_id == simulation_id: return report - # 兼容旧格式:JSON文件 + # Backward compatible: JSON file elif item.endswith('.json'): report_id = item[:-5] report = cls.get_report(report_id) @@ -2523,19 +2523,19 @@ def get_report_by_simulation(cls, simulation_id: str) -> Optional[Report]: @classmethod def list_reports(cls, simulation_id: Optional[str] = None, limit: int = 50) -> List[Report]: - """列出报告""" + """List reports""" cls._ensure_reports_dir() reports = [] for item in os.listdir(cls.REPORTS_DIR): item_path = os.path.join(cls.REPORTS_DIR, item) - # 新格式:文件夹 + # New format: folder if os.path.isdir(item_path): report = cls.get_report(item) if report: if simulation_id is None or report.simulation_id == simulation_id: reports.append(report) - # 兼容旧格式:JSON文件 + # Backward compatible: JSON file elif item.endswith('.json'): report_id = item[:-5] report = cls.get_report(report_id) @@ -2543,25 +2543,25 @@ def list_reports(cls, simulation_id: Optional[str] = None, limit: int = 50) -> L if simulation_id is None or report.simulation_id == simulation_id: reports.append(report) - # 按创建时间倒序 + # Sort by creation time descending reports.sort(key=lambda r: r.created_at, reverse=True) return reports[:limit] @classmethod def delete_report(cls, report_id: str) -> bool: - """删除报告(整个文件夹)""" + """Delete report (entire folder)""" import shutil folder_path = cls._get_report_folder(report_id) - # 新格式:删除整个文件夹 + # New format: delete entire folder if os.path.exists(folder_path) and os.path.isdir(folder_path): shutil.rmtree(folder_path) - logger.info(f"报告文件夹已删除: {report_id}") + logger.info(f"Report folder deleted: {report_id}") return True - # 兼容旧格式:删除单独的文件 + # Backward compatible: delete individual files deleted = False old_json_path = os.path.join(cls.REPORTS_DIR, f"{report_id}.json") old_md_path = os.path.join(cls.REPORTS_DIR, f"{report_id}.md") diff --git 
a/backend/app/services/simulation_config_generator.py b/backend/app/services/simulation_config_generator.py index 371d594..a3998bf 100644 --- a/backend/app/services/simulation_config_generator.py +++ b/backend/app/services/simulation_config_generator.py @@ -1,13 +1,13 @@ """ -模拟配置智能生成器 -使用LLM根据模拟需求、文档内容、图谱信息自动生成细致的模拟参数 -实现全程自动化,无需人工设置参数 - -采用分步生成策略,避免一次性生成过长内容导致失败: -1. 生成时间配置 -2. 生成事件配置 -3. 分批生成Agent配置 -4. 生成平台配置 +Intelligent Simulation Configuration Generator +Uses LLM to automatically generate detailed simulation parameters based on requirements, document content, and graph info +Fully automated, no manual parameter setting required + +Uses step-by-step generation strategy to avoid failure from generating overly long content at once: +1. Generate time config +2. Generate event config +3. Generate Agent configs in batches +4. Generate platform config """ import json @@ -24,156 +24,156 @@ logger = get_logger('mirofish.simulation_config') -# 中国作息时间配置(北京时间) +# China daily schedule config (Beijing time) CHINA_TIMEZONE_CONFIG = { - # 深夜时段(几乎无人活动) + # Late night period (almost no activity) "dead_hours": [0, 1, 2, 3, 4, 5], - # 早间时段(逐渐醒来) + # Morning period (gradually waking up) "morning_hours": [6, 7, 8], - # 工作时段 + # Work period "work_hours": [9, 10, 11, 12, 13, 14, 15, 16, 17, 18], - # 晚间高峰(最活跃) + # Evening peak (most active) "peak_hours": [19, 20, 21, 22], - # 夜间时段(活跃度下降) + # Night period (activity declining) "night_hours": [23], - # 活跃度系数 + # Activity coefficients "activity_multipliers": { - "dead": 0.05, # 凌晨几乎无人 - "morning": 0.4, # 早间逐渐活跃 - "work": 0.7, # 工作时段中等 - "peak": 1.5, # 晚间高峰 - "night": 0.5 # 深夜下降 + "dead": 0.05, # Almost no one at dawn + "morning": 0.4, # Gradually active in morning + "work": 0.7, # Moderate during work period + "peak": 1.5, # Evening peak + "night": 0.5 # Declining late at night } } @dataclass class AgentActivityConfig: - """单个Agent的活动配置""" + """Activity config for a single Agent""" agent_id: int entity_uuid: str entity_name: str 
entity_type: str - # 活跃度配置 (0.0-1.0) - activity_level: float = 0.5 # 整体活跃度 + # Activity level config (0.0-1.0) + activity_level: float = 0.5 # Overall activity level - # 发言频率(每小时预期发言次数) + # Posting frequency (expected posts per hour) posts_per_hour: float = 1.0 comments_per_hour: float = 2.0 - # 活跃时间段(24小时制,0-23) + # Active hours (24-hour format, 0-23) active_hours: List[int] = field(default_factory=lambda: list(range(8, 23))) - # 响应速度(对热点事件的反应延迟,单位:模拟分钟) + # Response speed (reaction delay to hot events, unit: simulated minutes) response_delay_min: int = 5 response_delay_max: int = 60 - # 情感倾向 (-1.0到1.0,负面到正面) + # Sentiment bias (-1.0 to 1.0, negative to positive) sentiment_bias: float = 0.0 - # 立场(对特定话题的态度) + # Stance (attitude toward specific topics) stance: str = "neutral" # supportive, opposing, neutral, observer - # 影响力权重(决定其发言被其他Agent看到的概率) + # Influence weight (determines probability of posts being seen by other Agents) influence_weight: float = 1.0 @dataclass class TimeSimulationConfig: - """时间模拟配置(基于中国人作息习惯)""" - # 模拟总时长(模拟小时数) - total_simulation_hours: int = 72 # 默认模拟72小时(3天) + """Time simulation config (based on Chinese daily schedule)""" + # Total simulation duration (simulated hours) + total_simulation_hours: int = 72 # Default 72 hours (3 days) - # 每轮代表的时间(模拟分钟)- 默认60分钟(1小时),加快时间流速 + # Time per round (simulated minutes) - default 60 minutes (1 hour), accelerated time flow minutes_per_round: int = 60 - # 每小时激活的Agent数量范围 + # Range of Agents activated per hour agents_per_hour_min: int = 5 agents_per_hour_max: int = 20 - # 高峰时段(晚间19-22点,中国人最活跃的时间) + # Peak hours (19-22, most active time for Chinese users) peak_hours: List[int] = field(default_factory=lambda: [19, 20, 21, 22]) peak_activity_multiplier: float = 1.5 - # 低谷时段(凌晨0-5点,几乎无人活动) + # Off-peak hours (0-5 AM, almost no activity) off_peak_hours: List[int] = field(default_factory=lambda: [0, 1, 2, 3, 4, 5]) - off_peak_activity_multiplier: float = 0.05 # 凌晨活跃度极低 + off_peak_activity_multiplier: float = 
0.05 # Dawn activity extremely low - # 早间时段 + # Morning hours morning_hours: List[int] = field(default_factory=lambda: [6, 7, 8]) morning_activity_multiplier: float = 0.4 - # 工作时段 + # Work period work_hours: List[int] = field(default_factory=lambda: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18]) work_activity_multiplier: float = 0.7 @dataclass class EventConfig: - """事件配置""" - # 初始事件(模拟开始时的触发事件) + """Event config""" + # Initial events (triggered at simulation start) initial_posts: List[Dict[str, Any]] = field(default_factory=list) - # 定时事件(在特定时间触发的事件) + # Scheduled events (triggered at specific times) scheduled_events: List[Dict[str, Any]] = field(default_factory=list) - # 热点话题关键词 + # Hot topic keywords hot_topics: List[str] = field(default_factory=list) - # 舆论引导方向 + # Narrative direction narrative_direction: str = "" @dataclass class PlatformConfig: - """平台特定配置""" + """Platform-specific config""" platform: str # twitter or reddit - # 推荐算法权重 - recency_weight: float = 0.4 # 时间新鲜度 - popularity_weight: float = 0.3 # 热度 - relevance_weight: float = 0.3 # 相关性 + # Recommendation algorithm weights + recency_weight: float = 0.4 # Recency + popularity_weight: float = 0.3 # Popularity + relevance_weight: float = 0.3 # Relevance - # 病毒传播阈值(达到多少互动后触发扩散) + # Viral threshold (interactions needed to trigger spread) viral_threshold: int = 10 - # 回声室效应强度(相似观点聚集程度) + # Echo chamber strength (degree of similar opinion clustering) echo_chamber_strength: float = 0.5 @dataclass class SimulationParameters: - """完整的模拟参数配置""" - # 基础信息 + """Complete simulation parameter config""" + # Basic info simulation_id: str project_id: str graph_id: str simulation_requirement: str - # 时间配置 + # Time config time_config: TimeSimulationConfig = field(default_factory=TimeSimulationConfig) - # Agent配置列表 + # Agent config list agent_configs: List[AgentActivityConfig] = field(default_factory=list) - # 事件配置 + # Event config event_config: EventConfig = field(default_factory=EventConfig) - # 平台配置 + # Platform config 
twitter_config: Optional[PlatformConfig] = None reddit_config: Optional[PlatformConfig] = None - # LLM配置 + # LLM config llm_model: str = "" llm_base_url: str = "" - # 生成元数据 + # Generation metadata generated_at: str = field(default_factory=lambda: datetime.now().isoformat()) - generation_reasoning: str = "" # LLM的推理说明 + generation_reasoning: str = "" # LLM's reasoning explanation def to_dict(self) -> Dict[str, Any]: - """转换为字典""" + """Convert to dictionary""" time_dict = asdict(self.time_config) return { "simulation_id": self.simulation_id, @@ -192,34 +192,34 @@ def to_dict(self) -> Dict[str, Any]: } def to_json(self, indent: int = 2) -> str: - """转换为JSON字符串""" + """Convert to JSON string""" return json.dumps(self.to_dict(), ensure_ascii=False, indent=indent) class SimulationConfigGenerator: """ - 模拟配置智能生成器 + Intelligent Simulation Configuration Generator - 使用LLM分析模拟需求、文档内容、图谱实体信息, - 自动生成最佳的模拟参数配置 + Uses LLM to analyze simulation requirements, document content, graph entity info, + and automatically generate optimal simulation parameter configs - 采用分步生成策略: - 1. 生成时间配置和事件配置(轻量级) - 2. 分批生成Agent配置(每批10-20个) - 3. 生成平台配置 + Uses step-by-step generation strategy: + 1. Generate time and event configs (lightweight) + 2. Generate Agent configs in batches (10-20 per batch) + 3. 
Generate platform config """ - # 上下文最大字符数 + # Max context character count MAX_CONTEXT_LENGTH = 50000 - # 每批生成的Agent数量 + # Agents per batch AGENTS_PER_BATCH = 15 - # 各步骤的上下文截断长度(字符数) - TIME_CONFIG_CONTEXT_LENGTH = 10000 # 时间配置 - EVENT_CONFIG_CONTEXT_LENGTH = 8000 # 事件配置 - ENTITY_SUMMARY_LENGTH = 300 # 实体摘要 - AGENT_SUMMARY_LENGTH = 300 # Agent配置中的实体摘要 - ENTITIES_PER_TYPE_DISPLAY = 20 # 每类实体显示数量 + # Context truncation length for each step (chars) + TIME_CONFIG_CONTEXT_LENGTH = 10000 # Time config + EVENT_CONFIG_CONTEXT_LENGTH = 8000 # Event config + ENTITY_SUMMARY_LENGTH = 300 # Entity summary + AGENT_SUMMARY_LENGTH = 300 # Entity summary in Agent config + ENTITIES_PER_TYPE_DISPLAY = 20 # Entities displayed per type def __init__( self, @@ -232,7 +232,7 @@ def __init__( self.model_name = model_name or Config.LLM_MODEL_NAME if not self.api_key: - raise ValueError("LLM_API_KEY 未配置") + raise ValueError("LLM_API_KEY not configured") self.client = OpenAI( api_key=self.api_key, @@ -252,27 +252,27 @@ def generate_config( progress_callback: Optional[Callable[[int, int, str], None]] = None, ) -> SimulationParameters: """ - 智能生成完整的模拟配置(分步生成) + Intelligently generate complete simulation config (step by step) Args: - simulation_id: 模拟ID - project_id: 项目ID - graph_id: 图谱ID - simulation_requirement: 模拟需求描述 - document_text: 原始文档内容 - entities: 过滤后的实体列表 - enable_twitter: 是否启用Twitter - enable_reddit: 是否启用Reddit - progress_callback: 进度回调函数(current_step, total_steps, message) + simulation_id: Simulation ID + project_id: Project ID + graph_id: Graph ID + simulation_requirement: Simulation requirement description + document_text: Original document content + entities: Filtered entity list + enable_twitter: Whether to enable Twitter + enable_reddit: Whether to enable Reddit + progress_callback: Progress callback function(current_step, total_steps, message) Returns: - SimulationParameters: 完整的模拟参数 + SimulationParameters: Complete simulation parameters """ - logger.info(f"开始智能生成模拟配置: 
simulation_id={simulation_id}, 实体数={len(entities)}") + logger.info(f"Starting intelligent config generation: simulation_id={simulation_id}, entity_count={len(entities)}") - # 计算总步骤数 + # Calculate total steps num_batches = math.ceil(len(entities) / self.AGENTS_PER_BATCH) - total_steps = 3 + num_batches # 时间配置 + 事件配置 + N批Agent + 平台配置 + total_steps = 3 + num_batches # Time config + Event config + N batches of Agents + Platform config current_step = 0 def report_progress(step: int, message: str): @@ -282,7 +282,7 @@ def report_progress(step: int, message: str): progress_callback(step, total_steps, message) logger.info(f"[{step}/{total_steps}] {message}") - # 1. 构建基础上下文信息 + # 1. Build base context context = self._build_context( simulation_requirement=simulation_requirement, document_text=document_text, @@ -291,20 +291,20 @@ def report_progress(step: int, message: str): reasoning_parts = [] - # ========== 步骤1: 生成时间配置 ========== - report_progress(1, "生成时间配置...") + # ========== Step 1: Generate Time config ========== + report_progress(1, "Generating Time config...") num_entities = len(entities) time_config_result = self._generate_time_config(context, num_entities) time_config = self._parse_time_config(time_config_result, num_entities) - reasoning_parts.append(f"时间配置: {time_config_result.get('reasoning', '成功')}") - - # ========== 步骤2: 生成事件配置 ========== - report_progress(2, "生成事件配置和热点话题...") + reasoning_parts.append(f"Time config: {time_config_result.get('reasoning', 'success')}") + + # ========== Step 2: Generate Event config ========== + report_progress(2, "Generating Event config and hot topics...") event_config_result = self._generate_event_config(context, simulation_requirement, entities) event_config = self._parse_event_config(event_config_result) - reasoning_parts.append(f"事件配置: {event_config_result.get('reasoning', '成功')}") + reasoning_parts.append(f"Event config: {event_config_result.get('reasoning', 'success')}") - # ========== 步骤3-N: 分批生成Agent配置 ========== + # 
========== Steps 3-N: Generate Agent configs in batches ========== all_agent_configs = [] for batch_idx in range(num_batches): start_idx = batch_idx * self.AGENTS_PER_BATCH @@ -313,7 +313,7 @@ def report_progress(step: int, message: str): report_progress( 3 + batch_idx, - f"生成Agent配置 ({start_idx + 1}-{end_idx}/{len(entities)})..." + f"Generating Agent configs ({start_idx + 1}-{end_idx}/{len(entities)})..." ) batch_configs = self._generate_agent_configs_batch( @@ -324,16 +324,16 @@ def report_progress(step: int, message: str): ) all_agent_configs.extend(batch_configs) - reasoning_parts.append(f"Agent配置: 成功生成 {len(all_agent_configs)} 个") + reasoning_parts.append(f"Agent configs: successfully generated {len(all_agent_configs)}") - # ========== 为初始帖子分配发布者 Agent ========== - logger.info("为初始帖子分配合适的发布者 Agent...") + # ========== Assign poster Agents to initial posts ========== + logger.info("Assigning suitable poster Agents to initial posts...") event_config = self._assign_initial_post_agents(event_config, all_agent_configs) assigned_count = len([p for p in event_config.initial_posts if p.get("poster_agent_id") is not None]) - reasoning_parts.append(f"初始帖子分配: {assigned_count} 个帖子已分配发布者") + reasoning_parts.append(f"Initial post assignment: {assigned_count} posts assigned to posters") - # ========== 最后一步: 生成平台配置 ========== - report_progress(total_steps, "生成平台配置...") + # ========== Final step: Generate platform config ========== + report_progress(total_steps, "Generating platform config...") twitter_config = None reddit_config = None @@ -357,7 +357,7 @@ def report_progress(step: int, message: str): echo_chamber_strength=0.6 ) - # 构建最终参数 + # Build final parameters params = SimulationParameters( simulation_id=simulation_id, project_id=project_id, @@ -373,7 +373,7 @@ def report_progress(step: int, message: str): generation_reasoning=" | ".join(reasoning_parts) ) - logger.info(f"模拟配置生成完成: {len(params.agent_configs)} 个Agent配置") + logger.info(f"Simulation config generation 
complete: {len(params.agent_configs)} Agent configs") return params @@ -383,33 +383,33 @@ def _build_context( document_text: str, entities: List[EntityNode] ) -> str: - """构建LLM上下文,截断到最大长度""" + """Build LLM context, truncated to max length""" - # 实体摘要 + # Entity summary entity_summary = self._summarize_entities(entities) - # 构建上下文 + # Build context context_parts = [ - f"## 模拟需求\n{simulation_requirement}", - f"\n## 实体信息 ({len(entities)}个)\n{entity_summary}", + f"## Simulation Requirements\n{simulation_requirement}", + f"\n## Entity Info ({len(entities)} entities)\n{entity_summary}", ] current_length = sum(len(p) for p in context_parts) - remaining_length = self.MAX_CONTEXT_LENGTH - current_length - 500 # 留500字符余量 + remaining_length = self.MAX_CONTEXT_LENGTH - current_length - 500 # leave 500 chars margin if remaining_length > 0 and document_text: doc_text = document_text[:remaining_length] if len(document_text) > remaining_length: - doc_text += "\n...(文档已截断)" - context_parts.append(f"\n## 原始文档内容\n{doc_text}") + doc_text += "\n...(document truncated)" + context_parts.append(f"\n## Original Document Content\n{doc_text}") return "\n".join(context_parts) def _summarize_entities(self, entities: List[EntityNode]) -> str: - """生成实体摘要""" + """Generate entity summary""" lines = [] - # 按类型分组 + # Group by type by_type: Dict[str, List[EntityNode]] = {} for e in entities: t = e.get_entity_type() or "Unknown" @@ -418,20 +418,20 @@ def _summarize_entities(self, entities: List[EntityNode]) -> str: by_type[t].append(e) for entity_type, type_entities in by_type.items(): - lines.append(f"\n### {entity_type} ({len(type_entities)}个)") - # 使用配置的显示数量和摘要长度 + lines.append(f"\n### {entity_type} ({len(type_entities)} entities)") + # Use configured display count and summary length display_count = self.ENTITIES_PER_TYPE_DISPLAY summary_len = self.ENTITY_SUMMARY_LENGTH for e in type_entities[:display_count]: summary_preview = (e.summary[:summary_len] + "...") if len(e.summary) > summary_len else 
e.summary lines.append(f"- {e.name}: {summary_preview}") if len(type_entities) > display_count: - lines.append(f" ... 还有 {len(type_entities) - display_count} 个") + lines.append(f" ... and {len(type_entities) - display_count} more") return "\n".join(lines) def _call_llm_with_retry(self, prompt: str, system_prompt: str) -> Dict[str, Any]: - """带重试的LLM调用,包含JSON修复逻辑""" + """LLM call with retry, includes JSON repair logic""" import re max_attempts = 3 @@ -446,25 +446,25 @@ def _call_llm_with_retry(self, prompt: str, system_prompt: str) -> Dict[str, Any {"role": "user", "content": prompt} ], response_format={"type": "json_object"}, - temperature=0.7 - (attempt * 0.1) # 每次重试降低温度 - # 不设置max_tokens,让LLM自由发挥 + temperature=0.7 - (attempt * 0.1) # Lower temperature with each retry + # Do not set max_tokens, let LLM generate freely ) content = response.choices[0].message.content finish_reason = response.choices[0].finish_reason - # 检查是否被截断 + # Check if truncated if finish_reason == 'length': - logger.warning(f"LLM输出被截断 (attempt {attempt+1})") + logger.warning(f"LLM output truncated (attempt {attempt+1})") content = self._fix_truncated_json(content) - # 尝试解析JSON + # Try to parse JSON try: return json.loads(content) except json.JSONDecodeError as e: - logger.warning(f"JSON解析失败 (attempt {attempt+1}): {str(e)[:80]}") + logger.warning(f"JSON parse failed (attempt {attempt+1}): {str(e)[:80]}") - # 尝试修复JSON + # Try to fix JSON fixed = self._try_fix_config_json(content) if fixed: return fixed @@ -472,44 +472,44 @@ def _call_llm_with_retry(self, prompt: str, system_prompt: str) -> Dict[str, Any last_error = e except Exception as e: - logger.warning(f"LLM调用失败 (attempt {attempt+1}): {str(e)[:80]}") + logger.warning(f"LLM call failed (attempt {attempt+1}): {str(e)[:80]}") last_error = e import time time.sleep(2 * (attempt + 1)) - raise last_error or Exception("LLM调用失败") + raise last_error or Exception("LLM call failed") def _fix_truncated_json(self, content: str) -> str: - """修复被截断的JSON""" 
+ """Fix truncated JSON""" content = content.strip() - # 计算未闭合的括号 + # Count unclosed brackets open_braces = content.count('{') - content.count('}') open_brackets = content.count('[') - content.count(']') - # 检查是否有未闭合的字符串 + # Check for unclosed strings if content and content[-1] not in '",}]': content += '"' - # 闭合括号 + # Close brackets content += ']' * open_brackets content += '}' * open_braces return content def _try_fix_config_json(self, content: str) -> Optional[Dict[str, Any]]: - """尝试修复配置JSON""" + """Try to fix config JSON""" import re - # 修复被截断的情况 + # Fix truncated case content = self._fix_truncated_json(content) - # 提取JSON部分 + # Extract JSON part json_match = re.search(r'\{[\s\S]*\}', content) if json_match: json_str = json_match.group() - # 移除字符串中的换行符 + # Remove newlines from strings def fix_string(match): s = match.group(0) s = s.replace('\n', ' ').replace('\r', ' ') @@ -521,7 +521,7 @@ def fix_string(match): try: return json.loads(json_str) except: - # 尝试移除所有控制字符 + # Try to remove all control characters json_str = re.sub(r'[\x00-\x1f\x7f-\x9f]', ' ', json_str) json_str = re.sub(r'\s+', ' ', json_str) try: @@ -532,35 +532,35 @@ def fix_string(match): return None def _generate_time_config(self, context: str, num_entities: int) -> Dict[str, Any]: - """生成时间配置""" - # 使用配置的上下文截断长度 + """Generate time config""" + # Use configured context truncation length context_truncated = context[:self.TIME_CONFIG_CONTEXT_LENGTH] - # 计算最大允许值(80%的agent数) + # Calculate max allowed value (80% of agent count) max_agents_allowed = max(1, int(num_entities * 0.9)) - prompt = f"""基于以下模拟需求,生成时间模拟配置。 + prompt = f"""Based on the following simulation requirements, generate time simulation config. {context_truncated} -## 任务 -请生成时间配置JSON。 +## Task +Generate a time config JSON. 
-### 基本原则(仅供参考,需根据具体事件和参与群体灵活调整): -- 用户群体为中国人,需符合北京时间作息习惯 -- 凌晨0-5点几乎无人活动(活跃度系数0.05) -- 早上6-8点逐渐活跃(活跃度系数0.4) -- 工作时间9-18点中等活跃(活跃度系数0.7) -- 晚间19-22点是高峰期(活跃度系数1.5) -- 23点后活跃度下降(活跃度系数0.5) -- 一般规律:凌晨低活跃、早间渐增、工作时段中等、晚间高峰 -- **重要**:以下示例值仅供参考,你需要根据事件性质、参与群体特点来调整具体时段 - - 例如:学生群体高峰可能是21-23点;媒体全天活跃;官方机构只在工作时间 - - 例如:突发热点可能导致深夜也有讨论,off_peak_hours 可适当缩短 +### Basic principles (for reference only, adjust flexibly based on specific events and participating groups): +- User group is Chinese, should follow Beijing time daily schedule +- Almost no activity at 0-5 AM (activity coefficient 0.05) +- Gradually active at 6-8 AM (activity coefficient 0.4) +- Moderately active at 9-18 work hours (activity coefficient 0.7) +- Peak at 19-22 evening (activity coefficient 1.5) +- Activity declining after 23:00 (activity coefficient 0.5) +- General pattern: low activity at dawn, gradually increasing in morning, moderate during work hours, evening peak +- **Important**: The following example values are for reference only; you need to adjust specific time periods based on event nature and participating group characteristics + - Example: student peak may be 21-23; media active all day; official institutions only during work hours + - Example: breaking news may cause late-night discussion, off_peak_hours can be shortened -### 返回JSON格式(不要markdown) +### Return JSON format (no markdown) -示例: +Example: {{ "total_simulation_hours": 72, "minutes_per_round": 60, @@ -570,70 +570,70 @@ def _generate_time_config(self, context: str, num_entities: int) -> Dict[str, An "off_peak_hours": [0, 1, 2, 3, 4, 5], "morning_hours": [6, 7, 8], "work_hours": [9, 10, 11, 12, 13, 14, 15, 16, 17, 18], - "reasoning": "针对该事件的时间配置说明" + "reasoning": "Time config explanation for this event" }} -字段说明: -- total_simulation_hours (int): 模拟总时长,24-168小时,突发事件短、持续话题长 -- minutes_per_round (int): 每轮时长,30-120分钟,建议60分钟 -- agents_per_hour_min (int): 每小时最少激活Agent数(取值范围: 1-{max_agents_allowed}) -- agents_per_hour_max (int): 每小时最多激活Agent数(取值范围: 
1-{max_agents_allowed}) -- peak_hours (int数组): 高峰时段,根据事件参与群体调整 -- off_peak_hours (int数组): 低谷时段,通常深夜凌晨 -- morning_hours (int数组): 早间时段 -- work_hours (int数组): 工作时段 -- reasoning (string): 简要说明为什么这样配置""" +Field descriptions: +- total_simulation_hours (int): Total simulation duration, 24-168 hours, shorter for breaking events, longer for ongoing topics +- minutes_per_round (int): Duration per round, 30-120 minutes, recommended 60 minutes +- agents_per_hour_min (int): Minimum Agents activated per hour (value range: 1-{max_agents_allowed}) +- agents_per_hour_max (int): Maximum Agents activated per hour (value range: 1-{max_agents_allowed}) +- peak_hours (int array): Peak hours, adjust based on event participating groups +- off_peak_hours (int array): Off-peak hours, usually late night/early morning +- morning_hours (int array): Morning hours +- work_hours (int array): Work hours +- reasoning (string): Brief explanation of why this configuration""" - system_prompt = "你是社交媒体模拟专家。返回纯JSON格式,时间配置需符合中国人作息习惯。" + system_prompt = "You are a social media simulation expert. Return pure JSON format. Time config should follow Chinese daily schedule patterns." 
try: return self._call_llm_with_retry(prompt, system_prompt) except Exception as e: - logger.warning(f"时间配置LLM生成失败: {e}, 使用默认配置") + logger.warning(f"Time config LLM generation failed: {e}, using default config") return self._get_default_time_config(num_entities) def _get_default_time_config(self, num_entities: int) -> Dict[str, Any]: - """获取默认时间配置(中国人作息)""" + """Get default time config (Chinese daily schedule)""" return { "total_simulation_hours": 72, - "minutes_per_round": 60, # 每轮1小时,加快时间流速 + "minutes_per_round": 60, # 1 hour per round, accelerated time flow "agents_per_hour_min": max(1, num_entities // 15), "agents_per_hour_max": max(5, num_entities // 5), "peak_hours": [19, 20, 21, 22], "off_peak_hours": [0, 1, 2, 3, 4, 5], "morning_hours": [6, 7, 8], "work_hours": [9, 10, 11, 12, 13, 14, 15, 16, 17, 18], - "reasoning": "使用默认中国人作息配置(每轮1小时)" + "reasoning": "Using default Chinese daily schedule config (1 hour per round)" } def _parse_time_config(self, result: Dict[str, Any], num_entities: int) -> TimeSimulationConfig: - """解析时间配置结果,并验证agents_per_hour值不超过总agent数""" - # 获取原始值 + """Parse time config result and validate agents_per_hour does not exceed total agent count""" + # Get raw values agents_per_hour_min = result.get("agents_per_hour_min", max(1, num_entities // 15)) agents_per_hour_max = result.get("agents_per_hour_max", max(5, num_entities // 5)) - # 验证并修正:确保不超过总agent数 + # Validate and correct: ensure does not exceed total agent count if agents_per_hour_min > num_entities: - logger.warning(f"agents_per_hour_min ({agents_per_hour_min}) 超过总Agent数 ({num_entities}),已修正") + logger.warning(f"agents_per_hour_min ({agents_per_hour_min}) exceeds total Agent count ({num_entities}), corrected") agents_per_hour_min = max(1, num_entities // 10) if agents_per_hour_max > num_entities: - logger.warning(f"agents_per_hour_max ({agents_per_hour_max}) 超过总Agent数 ({num_entities}),已修正") + logger.warning(f"agents_per_hour_max ({agents_per_hour_max}) exceeds total Agent count 
({num_entities}), corrected") agents_per_hour_max = max(agents_per_hour_min + 1, num_entities // 2) - # 确保 min < max + # Ensure min < max if agents_per_hour_min >= agents_per_hour_max: agents_per_hour_min = max(1, agents_per_hour_max // 2) - logger.warning(f"agents_per_hour_min >= max,已修正为 {agents_per_hour_min}") + logger.warning(f"agents_per_hour_min >= max, corrected to {agents_per_hour_min}") return TimeSimulationConfig( total_simulation_hours=result.get("total_simulation_hours", 72), - minutes_per_round=result.get("minutes_per_round", 60), # 默认每轮1小时 + minutes_per_round=result.get("minutes_per_round", 60), # Default 1 hour per round agents_per_hour_min=agents_per_hour_min, agents_per_hour_max=agents_per_hour_max, peak_hours=result.get("peak_hours", [19, 20, 21, 22]), off_peak_hours=result.get("off_peak_hours", [0, 1, 2, 3, 4, 5]), - off_peak_activity_multiplier=0.05, # 凌晨几乎无人 + off_peak_activity_multiplier=0.05, # Almost no one at dawn morning_hours=result.get("morning_hours", [6, 7, 8]), morning_activity_multiplier=0.4, work_hours=result.get("work_hours", list(range(9, 19))), @@ -642,19 +642,19 @@ def _parse_time_config(self, result: Dict[str, Any], num_entities: int) -> TimeS ) def _generate_event_config( - self, - context: str, + self, + context: str, simulation_requirement: str, entities: List[EntityNode] ) -> Dict[str, Any]: - """生成事件配置""" + """Generate event config""" - # 获取可用的实体类型列表,供 LLM 参考 + # Get available entity type list for LLM reference entity_types_available = list(set( e.get_entity_type() or "Unknown" for e in entities )) - # 为每种类型列出代表性实体名称 + # List representative entity names for each type type_examples = {} for e in entities: etype = e.get_entity_type() or "Unknown" @@ -668,53 +668,53 @@ def _generate_event_config( for t, examples in type_examples.items() ]) - # 使用配置的上下文截断长度 + # Use configured context truncation length context_truncated = context[:self.EVENT_CONFIG_CONTEXT_LENGTH] - prompt = f"""基于以下模拟需求,生成事件配置。 + prompt = f"""Based on the 
following simulation requirements, generate event config. -模拟需求: {simulation_requirement} +Simulation requirement: {simulation_requirement} {context_truncated} -## 可用实体类型及示例 +## Available entity types and examples {type_info} -## 任务 -请生成事件配置JSON: -- 提取热点话题关键词 -- 描述舆论发展方向 -- 设计初始帖子内容,**每个帖子必须指定 poster_type(发布者类型)** +## Task +Generate an event config JSON: +- Extract hot topic keywords +- Describe narrative development direction +- Design initial post content, **each post must specify poster_type (publisher type)** -**重要**: poster_type 必须从上面的"可用实体类型"中选择,这样初始帖子才能分配给合适的 Agent 发布。 -例如:官方声明应由 Official/University 类型发布,新闻由 MediaOutlet 发布,学生观点由 Student 发布。 +**Important**: poster_type must be selected from the "available entity types" above, so initial posts can be assigned to suitable Agents for publishing. +Example: official statements should be published by Official/University types, news by MediaOutlet, student views by Student. -返回JSON格式(不要markdown): +Return JSON format (no markdown): {{ - "hot_topics": ["关键词1", "关键词2", ...], - "narrative_direction": "<舆论发展方向描述>", + "hot_topics": ["keyword1", "keyword2", ...], + "narrative_direction": "<description of narrative development direction>", "initial_posts": [ - {{"content": "帖子内容", "poster_type": "实体类型(必须从可用类型中选择)"}}, + {{"content": "post content", "poster_type": "entity type (must select from available types)"}}, ... ], - "reasoning": "<简要说明>" + "reasoning": "<brief explanation>" }}""" - system_prompt = "你是舆论分析专家。返回纯JSON格式。注意 poster_type 必须精确匹配可用实体类型。" + system_prompt = "You are a public opinion analysis expert. Return pure JSON format. Note: poster_type must exactly match available entity types." 
try: return self._call_llm_with_retry(prompt, system_prompt) except Exception as e: - logger.warning(f"事件配置LLM生成失败: {e}, 使用默认配置") + logger.warning(f"Event config LLM generation failed: {e}, using default config") return { "hot_topics": [], "narrative_direction": "", "initial_posts": [], - "reasoning": "使用默认配置" + "reasoning": "Using default config" } def _parse_event_config(self, result: Dict[str, Any]) -> EventConfig: - """解析事件配置结果""" + """Parse event config result""" return EventConfig( initial_posts=result.get("initial_posts", []), scheduled_events=[], @@ -728,14 +728,14 @@ def _assign_initial_post_agents( agent_configs: List[AgentActivityConfig] ) -> EventConfig: """ - 为初始帖子分配合适的发布者 Agent + Assign suitable poster Agents to initial posts - 根据每个帖子的 poster_type 匹配最合适的 agent_id + Match the most suitable agent_id based on each post poster_type """ if not event_config.initial_posts: return event_config - # 按实体类型建立 agent 索引 + # Build agent index by entity type agents_by_type: Dict[str, List[AgentActivityConfig]] = {} for agent in agent_configs: etype = agent.entity_type.lower() @@ -743,7 +743,7 @@ def _assign_initial_post_agents( agents_by_type[etype] = [] agents_by_type[etype].append(agent) - # 类型映射表(处理 LLM 可能输出的不同格式) + # Type alias mapping (handles different formats LLM may output) type_aliases = { "official": ["official", "university", "governmentagency", "government"], "university": ["university", "official"], @@ -755,7 +755,7 @@ def _assign_initial_post_agents( "person": ["person", "student", "alumni"], } - # 记录每种类型已使用的 agent 索引,避免重复使用同一个 agent + # Track used agent index per type to avoid reusing the same agent used_indices: Dict[str, int] = {} updated_posts = [] @@ -763,17 +763,17 @@ def _assign_initial_post_agents( poster_type = post.get("poster_type", "").lower() content = post.get("content", "") - # 尝试找到匹配的 agent + # Try to find matching agent matched_agent_id = None - # 1. 直接匹配 + # 1. 
Direct match if poster_type in agents_by_type: agents = agents_by_type[poster_type] idx = used_indices.get(poster_type, 0) % len(agents) matched_agent_id = agents[idx].agent_id used_indices[poster_type] = idx + 1 else: - # 2. 使用别名匹配 + # 2. Use alias matching for alias_key, aliases in type_aliases.items(): if poster_type in aliases or alias_key == poster_type: for alias in aliases: @@ -786,11 +786,11 @@ def _assign_initial_post_agents( if matched_agent_id is not None: break - # 3. 如果仍未找到,使用影响力最高的 agent + # 3. If still not found, use agent with highest influence if matched_agent_id is None: - logger.warning(f"未找到类型 '{poster_type}' 的匹配 Agent,使用影响力最高的 Agent") + logger.warning(f"No matching Agent found for type '{poster_type}', using Agent with highest influence") if agent_configs: - # 按影响力排序,选择影响力最高的 + # Sort by influence, select highest sorted_agents = sorted(agent_configs, key=lambda a: a.influence_weight, reverse=True) matched_agent_id = sorted_agents[0].agent_id else: @@ -802,7 +802,7 @@ def _assign_initial_post_agents( "poster_agent_id": matched_agent_id }) - logger.info(f"初始帖子分配: poster_type='{poster_type}' -> agent_id={matched_agent_id}") + logger.info(f"Initial post assignment: poster_type='{poster_type}' -> agent_id={matched_agent_id}") event_config.initial_posts = updated_posts return event_config @@ -814,9 +814,9 @@ def _generate_agent_configs_batch( start_idx: int, simulation_requirement: str ) -> List[AgentActivityConfig]: - """分批生成Agent配置""" + """Generate Agent configs in batches""" - # 构建实体信息(使用配置的摘要长度) + # Build entity info (using configured summary length) entity_list = [] summary_len = self.AGENT_SUMMARY_LENGTH for i, e in enumerate(entities): @@ -827,58 +827,58 @@ def _generate_agent_configs_batch( "summary": e.summary[:summary_len] if e.summary else "" }) - prompt = f"""基于以下信息,为每个实体生成社交媒体活动配置。 + prompt = f"""Based on the following info, generate social media activity config for each entity. 
-模拟需求: {simulation_requirement} +Simulation requirement: {simulation_requirement} -## 实体列表 +## Entity list ```json {json.dumps(entity_list, ensure_ascii=False, indent=2)} ``` -## 任务 -为每个实体生成活动配置,注意: -- **时间符合中国人作息**:凌晨0-5点几乎不活动,晚间19-22点最活跃 -- **官方机构**(University/GovernmentAgency):活跃度低(0.1-0.3),工作时间(9-17)活动,响应慢(60-240分钟),影响力高(2.5-3.0) -- **媒体**(MediaOutlet):活跃度中(0.4-0.6),全天活动(8-23),响应快(5-30分钟),影响力高(2.0-2.5) -- **个人**(Student/Person/Alumni):活跃度高(0.6-0.9),主要晚间活动(18-23),响应快(1-15分钟),影响力低(0.8-1.2) -- **公众人物/专家**:活跃度中(0.4-0.6),影响力中高(1.5-2.0) +## Task +Generate activity config for each entity, note: +- **Time should follow Chinese daily schedule**: Almost no activity at 0-5 AM, most active at 19-22 evening +- **Official institutions** (University/GovernmentAgency): Low activity (0.1-0.3), work hours (9-17) activity, slow response (60-240 min), high influence (2.5-3.0) +- **Media** (MediaOutlet): Moderate activity (0.4-0.6), all-day activity (8-23), fast response (5-30 min), high influence (2.0-2.5) +- **Individual** (Student/Person/Alumni): High activity (0.6-0.9), primarily evening activity (18-23), fast response (1-15 min), low influence (0.8-1.2) +- **Public figures/Experts**: Moderate activity (0.4-0.6), medium-high influence (1.5-2.0) -返回JSON格式(不要markdown): +Return JSON format (no markdown): {{ "agent_configs": [ {{ - "agent_id": <必须与输入一致>, + "agent_id": <must match input>, "activity_level": <0.0-1.0>, - "posts_per_hour": <发帖频率>, - "comments_per_hour": <评论频率>, - "active_hours": [<活跃小时列表,考虑中国人作息>], - "response_delay_min": <最小响应延迟分钟>, - "response_delay_max": <最大响应延迟分钟>, - "sentiment_bias": <-1.0到1.0>, + "posts_per_hour": <posting frequency>, + "comments_per_hour": <commenting frequency>, + "active_hours": [<list of active hours, considering Chinese daily schedule>], + "response_delay_min": <min response delay in minutes>, + "response_delay_max": <max response delay in minutes>, + "sentiment_bias": <-1.0 to 1.0>, "stance": "", - "influence_weight": <影响力权重> + "influence_weight": <influence weight> }}, ... ] }}""" - system_prompt = "你是社交媒体行为分析专家。返回纯JSON,配置需符合中国人作息习惯。" + system_prompt = "You are a social media behavior analysis expert. Return pure JSON. 
Config should follow Chinese daily schedule patterns." try: result = self._call_llm_with_retry(prompt, system_prompt) llm_configs = {cfg["agent_id"]: cfg for cfg in result.get("agent_configs", [])} except Exception as e: - logger.warning(f"Agent配置批次LLM生成失败: {e}, 使用规则生成") + logger.warning(f"Agent config batch LLM generation failed: {e}, using rule-based generation") llm_configs = {} - # 构建AgentActivityConfig对象 + # Build AgentActivityConfig objects configs = [] for i, entity in enumerate(entities): agent_id = start_idx + i cfg = llm_configs.get(agent_id, {}) - # 如果LLM没有生成,使用规则生成 + # If LLM did not generate, use rule-based generation if not cfg: cfg = self._generate_agent_config_by_rule(entity) @@ -902,11 +902,11 @@ def _generate_agent_configs_batch( return configs def _generate_agent_config_by_rule(self, entity: EntityNode) -> Dict[str, Any]: - """基于规则生成单个Agent配置(中国人作息)""" + """Generate single Agent config by rules (Chinese daily schedule)""" entity_type = (entity.get_entity_type() or "Unknown").lower() if entity_type in ["university", "governmentagency", "ngo"]: - # 官方机构:工作时间活动,低频率,高影响力 + # Official institutions: work hours activity, low frequency, high influence return { "activity_level": 0.2, "posts_per_hour": 0.1, @@ -919,7 +919,7 @@ def _generate_agent_config_by_rule(self, entity: EntityNode) -> Dict[str, Any]: "influence_weight": 3.0 } elif entity_type in ["mediaoutlet"]: - # 媒体:全天活动,中等频率,高影响力 + # Media: all-day activity, moderate frequency, high influence return { "activity_level": 0.5, "posts_per_hour": 0.8, @@ -932,7 +932,7 @@ def _generate_agent_config_by_rule(self, entity: EntityNode) -> Dict[str, Any]: "influence_weight": 2.5 } elif entity_type in ["professor", "expert", "official"]: - # 专家/教授:工作+晚间活动,中等频率 + # Experts/Professors: work + evening activity, moderate frequency return { "activity_level": 0.4, "posts_per_hour": 0.3, @@ -945,12 +945,12 @@ def _generate_agent_config_by_rule(self, entity: EntityNode) -> Dict[str, Any]: "influence_weight": 2.0 } 
elif entity_type in ["student"]: - # 学生:晚间为主,高频率 + # Students: primarily evening, high frequency return { "activity_level": 0.8, "posts_per_hour": 0.6, "comments_per_hour": 1.5, - "active_hours": [8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23], # 上午+晚间 + "active_hours": [8, 9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23], # Morning + evening "response_delay_min": 1, "response_delay_max": 15, "sentiment_bias": 0.0, @@ -958,12 +958,12 @@ def _generate_agent_config_by_rule(self, entity: EntityNode) -> Dict[str, Any]: "influence_weight": 0.8 } elif entity_type in ["alumni"]: - # 校友:晚间为主 + # Alumni: primarily evening return { "activity_level": 0.6, "posts_per_hour": 0.4, "comments_per_hour": 0.8, - "active_hours": [12, 13, 19, 20, 21, 22, 23], # 午休+晚间 + "active_hours": [12, 13, 19, 20, 21, 22, 23], # Lunch break + evening "response_delay_min": 5, "response_delay_max": 30, "sentiment_bias": 0.0, @@ -971,12 +971,12 @@ def _generate_agent_config_by_rule(self, entity: EntityNode) -> Dict[str, Any]: "influence_weight": 1.0 } else: - # 普通人:晚间高峰 + # Regular person: Evening peak return { "activity_level": 0.7, "posts_per_hour": 0.5, "comments_per_hour": 1.2, - "active_hours": [9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23], # 白天+晚间 + "active_hours": [9, 10, 11, 12, 13, 18, 19, 20, 21, 22, 23], # Daytime + evening "response_delay_min": 2, "response_delay_max": 20, "sentiment_bias": 0.0, diff --git a/backend/app/services/simulation_ipc.py b/backend/app/services/simulation_ipc.py index 9d70d0b..155dc60 100644 --- a/backend/app/services/simulation_ipc.py +++ b/backend/app/services/simulation_ipc.py @@ -1,11 +1,11 @@ """ -模拟IPC通信模块 -用于Flask后端和模拟脚本之间的进程间通信 +Simulation IPC Communication Module +Inter-process communication between Flask backend and simulation scripts -通过文件系统实现简单的命令/响应模式: -1. Flask写入命令到 commands/ 目录 -2. 模拟脚本轮询命令目录,执行命令并写入响应到 responses/ 目录 -3. Flask轮询响应目录获取结果 +Simple command/response pattern via file system: +1. Flask writes commands to commands/ directory +2. 
Simulation script polls command directory, executes commands and writes responses to responses/ directory +3. Flask polls response directory for results """ import os @@ -23,14 +23,14 @@ class CommandType(str, Enum): - """命令类型""" - INTERVIEW = "interview" # 单个Agent采访 - BATCH_INTERVIEW = "batch_interview" # 批量采访 - CLOSE_ENV = "close_env" # 关闭环境 + """Command type""" + INTERVIEW = "interview" # Single Agent interview + BATCH_INTERVIEW = "batch_interview" # Batch interview + CLOSE_ENV = "close_env" # Close environment class CommandStatus(str, Enum): - """命令状态""" + """Command status""" PENDING = "pending" PROCESSING = "processing" COMPLETED = "completed" @@ -39,7 +39,7 @@ class CommandStatus(str, Enum): @dataclass class IPCCommand: - """IPC命令""" + """IPC command""" command_id: str command_type: CommandType args: Dict[str, Any] @@ -65,7 +65,7 @@ def from_dict(cls, data: Dict[str, Any]) -> 'IPCCommand': @dataclass class IPCResponse: - """IPC响应""" + """IPC response""" command_id: str status: CommandStatus result: Optional[Dict[str, Any]] = None @@ -94,23 +94,23 @@ def from_dict(cls, data: Dict[str, Any]) -> 'IPCResponse': class SimulationIPCClient: """ - 模拟IPC客户端(Flask端使用) + Simulation IPC Client (used by Flask side) - 用于向模拟进程发送命令并等待响应 + Used to send commands to simulation process and wait for responses """ def __init__(self, simulation_dir: str): """ - 初始化IPC客户端 + Initialize IPC client Args: - simulation_dir: 模拟数据目录 + simulation_dir: Simulation data directory """ self.simulation_dir = simulation_dir self.commands_dir = os.path.join(simulation_dir, "ipc_commands") self.responses_dir = os.path.join(simulation_dir, "ipc_responses") - # 确保目录存在 + # Ensure directories exist os.makedirs(self.commands_dir, exist_ok=True) os.makedirs(self.responses_dir, exist_ok=True) @@ -122,19 +122,19 @@ def send_command( poll_interval: float = 0.5 ) -> IPCResponse: """ - 发送命令并等待响应 + Send command and wait for response Args: - command_type: 命令类型 - args: 命令参数 - timeout: 超时时间(秒) - poll_interval: 
轮询间隔(秒) + command_type: Command type + args: Command parameters + timeout: Timeout in seconds + poll_interval: Polling interval in seconds Returns: IPCResponse Raises: - TimeoutError: 等待响应超时 + TimeoutError: Waiting for response timed out """ command_id = str(uuid.uuid4()) command = IPCCommand( @@ -143,14 +143,14 @@ def send_command( args=args ) - # 写入命令文件 + # Write command file command_file = os.path.join(self.commands_dir, f"{command_id}.json") with open(command_file, 'w', encoding='utf-8') as f: json.dump(command.to_dict(), f, ensure_ascii=False, indent=2) - logger.info(f"发送IPC命令: {command_type.value}, command_id={command_id}") + logger.info(f"Sent IPC command: {command_type.value}, command_id={command_id}") - # 等待响应 + # Wait for response response_file = os.path.join(self.responses_dir, f"{command_id}.json") start_time = time.time() @@ -161,30 +161,30 @@ def send_command( response_data = json.load(f) response = IPCResponse.from_dict(response_data) - # 清理命令和响应文件 + # Clean up command and response files try: os.remove(command_file) os.remove(response_file) except OSError: pass - logger.info(f"收到IPC响应: command_id={command_id}, status={response.status.value}") + logger.info(f"Received IPC response: command_id={command_id}, status={response.status.value}") return response except (json.JSONDecodeError, KeyError) as e: - logger.warning(f"解析响应失败: {e}") + logger.warning(f"Failed to parse response: {e}") time.sleep(poll_interval) - # 超时 - logger.error(f"等待IPC响应超时: command_id={command_id}") + # Timeout + logger.error(f"IPC response timed out: command_id={command_id}") - # 清理命令文件 + # Clean up command file try: os.remove(command_file) except OSError: pass - raise TimeoutError(f"等待命令响应超时 ({timeout}秒)") + raise TimeoutError(f"Command response timed out ({timeout}s)") def send_interview( self, @@ -194,19 +194,19 @@ def send_interview( timeout: float = 60.0 ) -> IPCResponse: """ - 发送单个Agent采访命令 + Send single Agent interview command Args: agent_id: Agent ID - prompt: 采访问题 - 
platform: 指定平台(可选) - - "twitter": 只采访Twitter平台 - - "reddit": 只采访Reddit平台 - - None: 双平台模拟时同时采访两个平台,单平台模拟时采访该平台 - timeout: 超时时间 + prompt: Interview question + platform: Specify platform (optional) + - "twitter": Interview Twitter platform only + - "reddit": Interview Reddit platform only + - None: In dual-platform simulation, interview both platforms simultaneously; in single-platform, interview that platform + timeout: Timeout Returns: - IPCResponse,result字段包含采访结果 + IPCResponse, result field contains interview results """ args = { "agent_id": agent_id, @@ -228,18 +228,18 @@ def send_batch_interview( timeout: float = 120.0 ) -> IPCResponse: """ - 发送批量采访命令 + Send batch interview command Args: - interviews: 采访列表,每个元素包含 {"agent_id": int, "prompt": str, "platform": str(可选)} - platform: 默认平台(可选,会被每个采访项的platform覆盖) - - "twitter": 默认只采访Twitter平台 - - "reddit": 默认只采访Reddit平台 - - None: 双平台模拟时每个Agent同时采访两个平台 - timeout: 超时时间 + interviews: Interview list, each element contains {"agent_id": int, "prompt": str, "platform": str(optional)} + platform: Default platform (optional, overridden by each interview item platform) + - "twitter": Default: interview Twitter platform only + - "reddit": Default: interview Reddit platform only + - None: In dual-platform simulation, interview both platforms for each Agent + timeout: Timeout Returns: - IPCResponse,result字段包含所有采访结果 + IPCResponse, result field contains all interview results """ args = {"interviews": interviews} if platform: @@ -253,10 +253,10 @@ def send_batch_interview( def send_close_env(self, timeout: float = 30.0) -> IPCResponse: """ - 发送关闭环境命令 + Send close environment command Args: - timeout: 超时时间 + timeout: Timeout Returns: IPCResponse @@ -269,9 +269,9 @@ def send_close_env(self, timeout: float = 30.0) -> IPCResponse: def check_env_alive(self) -> bool: """ - 检查模拟环境是否存活 + Check if simulation environment is alive - 通过检查 env_status.json 文件来判断 + Determined by checking env_status.json file """ status_file = 
os.path.join(self.simulation_dir, "env_status.json") if not os.path.exists(status_file): @@ -287,41 +287,41 @@ def check_env_alive(self) -> bool: class SimulationIPCServer: """ - 模拟IPC服务器(模拟脚本端使用) + Simulation IPC Server (used by simulation script side) - 轮询命令目录,执行命令并返回响应 + Polls command directory, executes commands and returns responses """ def __init__(self, simulation_dir: str): """ - 初始化IPC服务器 + Initialize IPC server Args: - simulation_dir: 模拟数据目录 + simulation_dir: Simulation data directory """ self.simulation_dir = simulation_dir self.commands_dir = os.path.join(simulation_dir, "ipc_commands") self.responses_dir = os.path.join(simulation_dir, "ipc_responses") - # 确保目录存在 + # Ensure directories exist os.makedirs(self.commands_dir, exist_ok=True) os.makedirs(self.responses_dir, exist_ok=True) - # 环境状态 + # Environment status self._running = False def start(self): - """标记服务器为运行状态""" + """Mark server as running""" self._running = True self._update_env_status("alive") def stop(self): - """标记服务器为停止状态""" + """Mark server as stopped""" self._running = False self._update_env_status("stopped") def _update_env_status(self, status: str): - """更新环境状态文件""" + """Update environment status file""" status_file = os.path.join(self.simulation_dir, "env_status.json") with open(status_file, 'w', encoding='utf-8') as f: json.dump({ @@ -331,15 +331,15 @@ def _update_env_status(self, status: str): def poll_commands(self) -> Optional[IPCCommand]: """ - 轮询命令目录,返回第一个待处理的命令 + Poll command directory, return first pending command Returns: - IPCCommand 或 None + IPCCommand or None """ if not os.path.exists(self.commands_dir): return None - # 按时间排序获取命令文件 + # Get command files sorted by time command_files = [] for filename in os.listdir(self.commands_dir): if filename.endswith('.json'): @@ -354,23 +354,23 @@ def poll_commands(self) -> Optional[IPCCommand]: data = json.load(f) return IPCCommand.from_dict(data) except (json.JSONDecodeError, KeyError, OSError) as e: - logger.warning(f"读取命令文件失败: 
{filepath}, {e}") + logger.warning(f"Failed to read command file: {filepath}, {e}") continue return None def send_response(self, response: IPCResponse): """ - 发送响应 + Send response Args: - response: IPC响应 + response: IPC response """ response_file = os.path.join(self.responses_dir, f"{response.command_id}.json") with open(response_file, 'w', encoding='utf-8') as f: json.dump(response.to_dict(), f, ensure_ascii=False, indent=2) - # 删除命令文件 + # Delete command file command_file = os.path.join(self.commands_dir, f"{response.command_id}.json") try: os.remove(command_file) @@ -378,7 +378,7 @@ def send_response(self, response: IPCResponse): pass def send_success(self, command_id: str, result: Dict[str, Any]): - """发送成功响应""" + """Send success response""" self.send_response(IPCResponse( command_id=command_id, status=CommandStatus.COMPLETED, @@ -386,7 +386,7 @@ def send_success(self, command_id: str, result: Dict[str, Any]): )) def send_error(self, command_id: str, error: str): - """发送错误响应""" + """Send error response""" self.send_response(IPCResponse( command_id=command_id, status=CommandStatus.FAILED, diff --git a/backend/app/services/simulation_manager.py b/backend/app/services/simulation_manager.py index 4468b50..d53edd3 100644 --- a/backend/app/services/simulation_manager.py +++ b/backend/app/services/simulation_manager.py @@ -1,7 +1,7 @@ """ -OASIS模拟管理器 -管理Twitter和Reddit双平台并行模拟 -使用预设脚本 + LLM智能生成配置参数 +OASIS Simulation Manager +Manages parallel simulation across Twitter and Reddit dual platforms +Uses preset scripts + LLM-powered intelligent configuration generation """ import os @@ -22,60 +22,60 @@ class SimulationStatus(str, Enum): - """模拟状态""" + """Simulation status""" CREATED = "created" PREPARING = "preparing" READY = "ready" RUNNING = "running" PAUSED = "paused" - STOPPED = "stopped" # 模拟被手动停止 - COMPLETED = "completed" # 模拟自然完成 + STOPPED = "stopped" # Simulation manually stopped + COMPLETED = "completed" # Simulation completed naturally FAILED = "failed" class 
PlatformType(str, Enum): - """平台类型""" + """Platform type""" TWITTER = "twitter" REDDIT = "reddit" @dataclass class SimulationState: - """模拟状态""" + """Simulation status""" simulation_id: str project_id: str graph_id: str - # 平台启用状态 + # Platform enable status enable_twitter: bool = True enable_reddit: bool = True - - # 状态 + + # Status status: SimulationStatus = SimulationStatus.CREATED - - # 准备阶段数据 + + # Preparation stage data entities_count: int = 0 profiles_count: int = 0 entity_types: List[str] = field(default_factory=list) - - # 配置生成信息 + + # Config generation info config_generated: bool = False config_reasoning: str = "" - - # 运行时数据 + + # Runtime data current_round: int = 0 twitter_status: str = "not_started" reddit_status: str = "not_started" - - # 时间戳 + + # Timestamps created_at: str = field(default_factory=lambda: datetime.now().isoformat()) updated_at: str = field(default_factory=lambda: datetime.now().isoformat()) - - # 错误信息 + + # Error info error: Optional[str] = None def to_dict(self) -> Dict[str, Any]: - """完整状态字典(内部使用)""" + """Full state dictionary (for internal use)""" return { "simulation_id": self.simulation_id, "project_id": self.project_id, @@ -97,7 +97,7 @@ def to_dict(self) -> Dict[str, Any]: } def to_simple_dict(self) -> Dict[str, Any]: - """简化状态字典(API返回使用)""" + """Simplified state dictionary (for API responses)""" return { "simulation_id": self.simulation_id, "project_id": self.project_id, @@ -113,36 +113,36 @@ def to_simple_dict(self) -> Dict[str, Any]: class SimulationManager: """ - 模拟管理器 - - 核心功能: - 1. 从图谱读取实体并过滤 - 2. 生成OASIS Agent Profile - 3. 使用LLM智能生成模拟配置参数 - 4. 准备预设脚本所需的所有文件 + Simulation Manager + + Core features: + 1. Read entities from the graph and filter them + 2. Generate OASIS Agent Profiles + 3. Use LLM to intelligently generate simulation configuration parameters + 4. 
Prepare all files required by preset scripts """ - - # 模拟数据存储目录 + + # Simulation data storage directory SIMULATION_DATA_DIR = os.path.join( os.path.dirname(__file__), '../../uploads/simulations' ) def __init__(self): - # 确保目录存在 + # Ensure directory exists os.makedirs(self.SIMULATION_DATA_DIR, exist_ok=True) - - # 内存中的模拟状态缓存 + + # In-memory simulation state cache self._simulations: Dict[str, SimulationState] = {} def _get_simulation_dir(self, simulation_id: str) -> str: - """获取模拟数据目录""" + """Get simulation data directory""" sim_dir = os.path.join(self.SIMULATION_DATA_DIR, simulation_id) os.makedirs(sim_dir, exist_ok=True) return sim_dir def _save_simulation_state(self, state: SimulationState): - """保存模拟状态到文件""" + """Save simulation state to file""" sim_dir = self._get_simulation_dir(state.simulation_id) state_file = os.path.join(sim_dir, "state.json") @@ -154,7 +154,7 @@ def _save_simulation_state(self, state: SimulationState): self._simulations[state.simulation_id] = state def _load_simulation_state(self, simulation_id: str) -> Optional[SimulationState]: - """从文件加载模拟状态""" + """Load simulation state from file""" if simulation_id in self._simulations: return self._simulations[simulation_id] @@ -198,13 +198,13 @@ def create_simulation( enable_reddit: bool = True, ) -> SimulationState: """ - 创建新的模拟 + Create a new simulation Args: - project_id: 项目ID - graph_id: 图谱ID - enable_twitter: 是否启用Twitter模拟 - enable_reddit: 是否启用Reddit模拟 + project_id: Project ID + graph_id: Graph ID + enable_twitter: Whether to enable Twitter simulation + enable_reddit: Whether to enable Reddit simulation Returns: SimulationState @@ -222,7 +222,7 @@ def create_simulation( ) self._save_simulation_state(state) - logger.info(f"创建模拟: {simulation_id}, project={project_id}, graph={graph_id}") + logger.info(f"Created simulation: {simulation_id}, project={project_id}, graph={graph_id}") return state @@ -238,30 +238,30 @@ def prepare_simulation( storage: 'GraphStorage' = None, ) -> SimulationState: """ - 
准备模拟环境(全程自动化) + Prepare simulation environment (fully automated) - 步骤: - 1. 从图谱读取并过滤实体 - 2. 为每个实体生成OASIS Agent Profile(可选LLM增强,支持并行) - 3. 使用LLM智能生成模拟配置参数(时间、活跃度、发言频率等) - 4. 保存配置文件和Profile文件 - 5. 复制预设脚本到模拟目录 + Steps: + 1. Read and filter entities from graph + 2. Generate OASIS Agent Profile for each entity (optional LLM enhancement, parallel support) + 3. Use LLM to intelligently generate simulation config parameters (time, activity, posting frequency, etc.) + 4. Save config and profile files + 5. Copy preset scripts to simulation directory Args: - simulation_id: 模拟ID - simulation_requirement: 模拟需求描述(用于LLM生成配置) - document_text: 原始文档内容(用于LLM理解背景) - defined_entity_types: 预定义的实体类型(可选) - use_llm_for_profiles: 是否使用LLM生成详细人设 - progress_callback: 进度回调函数 (stage, progress, message) - parallel_profile_count: 并行生成人设的数量,默认3 + simulation_id: Simulation ID + simulation_requirement: Simulation requirement description (for LLM config generation) + document_text: Original document content (for LLM context understanding) + defined_entity_types: Predefined entity types (optional) + use_llm_for_profiles: Whether to use LLM to generate detailed personas + progress_callback: Progress callback function (stage, progress, message) + parallel_profile_count: Number of parallel persona generations, default 3 Returns: SimulationState """ state = self._load_simulation_state(simulation_id) if not state: - raise ValueError(f"模拟不存在: {simulation_id}") + raise ValueError(f"Simulation does not exist: {simulation_id}") try: state.status = SimulationStatus.PREPARING @@ -269,16 +269,16 @@ def prepare_simulation( sim_dir = self._get_simulation_dir(simulation_id) - # ========== 阶段1: 读取并过滤实体 ========== + # ========== Stage 1: Read and filter entities ========== if progress_callback: - progress_callback("reading", 0, "正在连接图谱...") + progress_callback("reading", 0, "Connecting to graph...") if not storage: raise ValueError("storage (GraphStorage) is required for prepare_simulation") reader = 
EntityReader(storage) if progress_callback: - progress_callback("reading", 30, "正在读取节点数据...") + progress_callback("reading", 30, "Reading node data...") filtered = reader.filter_defined_entities( graph_id=state.graph_id, @@ -292,29 +292,29 @@ def prepare_simulation( if progress_callback: progress_callback( "reading", 100, - f"完成,共 {filtered.filtered_count} 个实体", + f"Done, total {filtered.filtered_count} entities", current=filtered.filtered_count, total=filtered.filtered_count ) if filtered.filtered_count == 0: state.status = SimulationStatus.FAILED - state.error = "没有找到符合条件的实体,请检查图谱是否正确构建" + state.error = "No matching entities found, please check if the graph is correctly built" self._save_simulation_state(state) return state - # ========== 阶段2: 生成Agent Profile ========== + # ========== Stage 2: Generate Agent Profiles ========== total_entities = len(filtered.entities) if progress_callback: progress_callback( "generating_profiles", 0, - "开始生成...", + "Starting generation...", current=0, total=total_entities ) - # 传入graph_id以启用图谱检索功能,获取更丰富的上下文 + # Pass graph_id to enable graph retrieval for richer context generator = OasisProfileGenerator(storage=storage, graph_id=state.graph_id) def profile_progress(current, total, msg): @@ -328,7 +328,7 @@ def profile_progress(current, total, msg): item_name=msg ) - # 设置实时保存的文件路径(优先使用 Reddit JSON 格式) + # Set realtime save file path (prefer Reddit JSON format) realtime_output_path = None realtime_platform = "reddit" if state.enable_reddit: @@ -342,20 +342,20 @@ def profile_progress(current, total, msg): entities=filtered.entities, use_llm=use_llm_for_profiles, progress_callback=profile_progress, - graph_id=state.graph_id, # 传入graph_id用于图谱检索 - parallel_count=parallel_profile_count, # 并行生成数量 - realtime_output_path=realtime_output_path, # 实时保存路径 - output_platform=realtime_platform # 输出格式 + graph_id=state.graph_id, # Pass graph_id for graph retrieval + parallel_count=parallel_profile_count, # Parallel generation count + 
realtime_output_path=realtime_output_path, # Realtime save path + output_platform=realtime_platform # Output format ) state.profiles_count = len(profiles) - # 保存Profile文件(注意:Twitter使用CSV格式,Reddit使用JSON格式) - # Reddit 已经在生成过程中实时保存了,这里再保存一次确保完整性 + # Save profile files (note: Twitter uses CSV format, Reddit uses JSON format) + # Reddit profiles are saved in realtime during generation; save again here to ensure completeness if progress_callback: progress_callback( "generating_profiles", 95, - "保存Profile文件...", + "Saving profile files...", current=total_entities, total=total_entities ) @@ -368,7 +368,7 @@ def profile_progress(current, total, msg): ) if state.enable_twitter: - # Twitter使用CSV格式!这是OASIS的要求 + # Twitter uses CSV format! This is required by OASIS generator.save_profiles( profiles=profiles, file_path=os.path.join(sim_dir, "twitter_profiles.csv"), @@ -378,16 +378,16 @@ def profile_progress(current, total, msg): if progress_callback: progress_callback( "generating_profiles", 100, - f"完成,共 {len(profiles)} 个Profile", + f"Done, total {len(profiles)} profiles", current=len(profiles), total=len(profiles) ) - # ========== 阶段3: LLM智能生成模拟配置 ========== + # ========== Stage 3: LLM intelligent simulation config generation ========== if progress_callback: progress_callback( "generating_config", 0, - "正在分析模拟需求...", + "Analyzing simulation requirements...", current=0, total=3 ) @@ -397,7 +397,7 @@ def profile_progress(current, total, msg): if progress_callback: progress_callback( "generating_config", 30, - "正在调用LLM生成配置...", + "Calling LLM to generate config...", current=1, total=3 ) @@ -416,12 +416,12 @@ def profile_progress(current, total, msg): if progress_callback: progress_callback( "generating_config", 70, - "正在保存配置文件...", + "Saving config files...", current=2, total=3 ) - # 保存配置文件 + # Save config file config_path = os.path.join(sim_dir, "simulation_config.json") with open(config_path, 'w', encoding='utf-8') as f: f.write(sim_params.to_json()) @@ -432,25 +432,25 @@ def 
profile_progress(current, total, msg): if progress_callback: progress_callback( "generating_config", 100, - "配置生成完成", + "Config generation complete", current=3, total=3 ) - # 注意:运行脚本保留在 backend/scripts/ 目录,不再复制到模拟目录 - # 启动模拟时,simulation_runner 会从 scripts/ 目录运行脚本 + # Note: Run scripts remain in backend/scripts/ directory, no longer copied to simulation directory + # When starting simulation, simulation_runner will run scripts from scripts/ directory - # 更新状态 + # Update status state.status = SimulationStatus.READY self._save_simulation_state(state) - logger.info(f"模拟准备完成: {simulation_id}, " + logger.info(f"Simulation preparation complete: {simulation_id}, " f"entities={state.entities_count}, profiles={state.profiles_count}") return state except Exception as e: - logger.error(f"模拟准备失败: {simulation_id}, error={str(e)}") + logger.error(f"Simulation preparation failed: {simulation_id}, error={str(e)}") import traceback logger.error(traceback.format_exc()) state.status = SimulationStatus.FAILED @@ -459,16 +459,16 @@ def profile_progress(current, total, msg): raise def get_simulation(self, simulation_id: str) -> Optional[SimulationState]: - """获取模拟状态""" + """Get simulation status""" return self._load_simulation_state(simulation_id) def list_simulations(self, project_id: Optional[str] = None) -> List[SimulationState]: - """列出所有模拟""" + """List all simulations""" simulations = [] if os.path.exists(self.SIMULATION_DATA_DIR): for sim_id in os.listdir(self.SIMULATION_DATA_DIR): - # 跳过隐藏文件(如 .DS_Store)和非目录文件 + # Skip hidden files (e.g. 
.DS_Store) and non-directory files sim_path = os.path.join(self.SIMULATION_DATA_DIR, sim_id) if sim_id.startswith('.') or not os.path.isdir(sim_path): continue @@ -481,10 +481,10 @@ def list_simulations(self, project_id: Optional[str] = None) -> List[SimulationS return simulations def get_profiles(self, simulation_id: str, platform: str = "reddit") -> List[Dict[str, Any]]: - """获取模拟的Agent Profile""" + """Get Agent Profiles for simulation""" state = self._load_simulation_state(simulation_id) if not state: - raise ValueError(f"模拟不存在: {simulation_id}") + raise ValueError(f"Simulation does not exist: {simulation_id}") sim_dir = self._get_simulation_dir(simulation_id) profile_path = os.path.join(sim_dir, f"{platform}_profiles.json") @@ -496,7 +496,7 @@ def get_profiles(self, simulation_id: str, platform: str = "reddit") -> List[Dic return json.load(f) def get_simulation_config(self, simulation_id: str) -> Optional[Dict[str, Any]]: - """获取模拟配置""" + """Get simulation config""" sim_dir = self._get_simulation_dir(simulation_id) config_path = os.path.join(sim_dir, "simulation_config.json") @@ -507,7 +507,7 @@ def get_simulation_config(self, simulation_id: str) -> Optional[Dict[str, Any]]: return json.load(f) def get_run_instructions(self, simulation_id: str) -> Dict[str, str]: - """获取运行说明""" + """Get run instructions""" sim_dir = self._get_simulation_dir(simulation_id) config_path = os.path.join(sim_dir, "simulation_config.json") scripts_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../scripts')) @@ -522,10 +522,10 @@ def get_run_instructions(self, simulation_id: str) -> Dict[str, str]: "parallel": f"python {scripts_dir}/run_parallel_simulation.py --config {config_path}", }, "instructions": ( - f"1. 激活conda环境: conda activate MiroFish\n" - f"2. 
运行模拟 (脚本位于 {scripts_dir}):\n" - f" - 单独运行Twitter: python {scripts_dir}/run_twitter_simulation.py --config {config_path}\n" - f" - 单独运行Reddit: python {scripts_dir}/run_reddit_simulation.py --config {config_path}\n" - f" - 并行运行双平台: python {scripts_dir}/run_parallel_simulation.py --config {config_path}" + f"1. Activate conda environment: conda activate MiroFish\n" + f"2. Run simulation (scripts located at {scripts_dir}):\n" + f" - Run Twitter only: python {scripts_dir}/run_twitter_simulation.py --config {config_path}\n" + f" - Run Reddit only: python {scripts_dir}/run_reddit_simulation.py --config {config_path}\n" + f" - Run both platforms in parallel: python {scripts_dir}/run_parallel_simulation.py --config {config_path}" ) } diff --git a/backend/app/services/simulation_runner.py b/backend/app/services/simulation_runner.py index d6f608c..852ff2c 100644 --- a/backend/app/services/simulation_runner.py +++ b/backend/app/services/simulation_runner.py @@ -1,6 +1,6 @@ """ -OASIS模拟运行器 -在后台运行模拟并记录每个Agent的动作,支持实时状态监控 +OASIS Simulation Runner +Runs simulation in background, records each Agent action, supports real-time status monitoring """ import os @@ -25,15 +25,15 @@ logger = get_logger('mirofish.simulation_runner') -# 标记是否已注册清理函数 +# Flag whether cleanup function has been registered _cleanup_registered = False -# 平台检测 +# Platform detection IS_WINDOWS = sys.platform == 'win32' class RunnerStatus(str, Enum): - """运行器状态""" + """Runner status""" IDLE = "idle" STARTING = "starting" RUNNING = "running" @@ -46,7 +46,7 @@ class RunnerStatus(str, Enum): @dataclass class AgentAction: - """Agent动作记录""" + """Agent action record""" round_num: int timestamp: str platform: str # twitter / reddit @@ -73,7 +73,7 @@ def to_dict(self) -> Dict[str, Any]: @dataclass class RoundSummary: - """每轮摘要""" + """Per-round summary""" round_num: int start_time: str end_time: Optional[str] = None @@ -99,52 +99,52 @@ def to_dict(self) -> Dict[str, Any]: @dataclass class SimulationRunState: - 
"""模拟运行状态(实时)""" + """Simulation run state (real-time)""" simulation_id: str runner_status: RunnerStatus = RunnerStatus.IDLE - # 进度信息 + # Progress info current_round: int = 0 total_rounds: int = 0 simulated_hours: int = 0 total_simulation_hours: int = 0 - # 各平台独立轮次和模拟时间(用于双平台并行显示) + # Per-platform independent rounds and simulated time (for dual-platform parallel display) twitter_current_round: int = 0 reddit_current_round: int = 0 twitter_simulated_hours: int = 0 reddit_simulated_hours: int = 0 - # 平台状态 + # Platform status twitter_running: bool = False reddit_running: bool = False twitter_actions_count: int = 0 reddit_actions_count: int = 0 - # 平台完成状态(通过检测 actions.jsonl 中的 simulation_end 事件) + # Platform completion status (detected via simulation_end event in actions.jsonl) twitter_completed: bool = False reddit_completed: bool = False - # 每轮摘要 + # Per-round summaries rounds: List[RoundSummary] = field(default_factory=list) - # 最近动作(用于前端实时展示) + # Recent actions (for frontend real-time display) recent_actions: List[AgentAction] = field(default_factory=list) max_recent_actions: int = 50 - # 时间戳 + # Timestamps started_at: Optional[str] = None updated_at: str = field(default_factory=lambda: datetime.now().isoformat()) completed_at: Optional[str] = None - # 错误信息 + # Error info error: Optional[str] = None - # 进程ID(用于停止) + # Process ID (for stopping) process_pid: Optional[int] = None def add_action(self, action: AgentAction): - """添加动作到最近动作列表""" + """Add action to recent actions list""" self.recent_actions.insert(0, action) if len(self.recent_actions) > self.max_recent_actions: self.recent_actions = self.recent_actions[:self.max_recent_actions] @@ -165,7 +165,7 @@ def to_dict(self) -> Dict[str, Any]: "simulated_hours": self.simulated_hours, "total_simulation_hours": self.total_simulation_hours, "progress_percent": round(self.current_round / max(self.total_rounds, 1) * 100, 1), - # 各平台独立轮次和时间 + # Per-platform independent rounds and time "twitter_current_round": 
self.twitter_current_round, "reddit_current_round": self.reddit_current_round, "twitter_simulated_hours": self.twitter_simulated_hours, @@ -185,7 +185,7 @@ def to_dict(self) -> Dict[str, Any]: } def to_detail_dict(self) -> Dict[str, Any]: - """包含最近动作的详细信息""" + """Detailed info including recent actions""" result = self.to_dict() result["recent_actions"] = [a.to_dict() for a in self.recent_actions] result["rounds_count"] = len(self.rounds) @@ -194,45 +194,45 @@ def to_detail_dict(self) -> Dict[str, Any]: class SimulationRunner: """ - 模拟运行器 + Simulation Runner - 负责: - 1. 在后台进程中运行OASIS模拟 - 2. 解析运行日志,记录每个Agent的动作 - 3. 提供实时状态查询接口 - 4. 支持暂停/停止/恢复操作 + Responsible for: + 1. Running OASIS simulation in a background process + 2. Parsing run logs, recording each Agent action + 3. Providing real-time status query interface + 4. Supporting pause/stop/resume operations """ - # 运行状态存储目录 + # Run state storage directory RUN_STATE_DIR = os.path.join( os.path.dirname(__file__), '../../uploads/simulations' ) - # 脚本目录 + # Scripts directory SCRIPTS_DIR = os.path.join( os.path.dirname(__file__), '../../scripts' ) - # 内存中的运行状态 + # In-memory run states _run_states: Dict[str, SimulationRunState] = {} _processes: Dict[str, subprocess.Popen] = {} _action_queues: Dict[str, Queue] = {} _monitor_threads: Dict[str, threading.Thread] = {} - _stdout_files: Dict[str, Any] = {} # 存储 stdout 文件句柄 - _stderr_files: Dict[str, Any] = {} # 存储 stderr 文件句柄 + _stdout_files: Dict[str, Any] = {} # Store stdout file handles + _stderr_files: Dict[str, Any] = {} # Store stderr file handles - # 图谱记忆更新配置 + # Graph memory update config _graph_memory_enabled: Dict[str, bool] = {} # simulation_id -> enabled @classmethod def get_run_state(cls, simulation_id: str) -> Optional[SimulationRunState]: - """获取运行状态""" + """Get run state""" if simulation_id in cls._run_states: return cls._run_states[simulation_id] - # 尝试从文件加载 + # Try to load from file state = cls._load_run_state(simulation_id) if state: 
cls._run_states[simulation_id] = state @@ -240,7 +240,7 @@ def get_run_state(cls, simulation_id: str) -> Optional[SimulationRunState]: @classmethod def _load_run_state(cls, simulation_id: str) -> Optional[SimulationRunState]: - """从文件加载运行状态""" + """Load run state from file""" state_file = os.path.join(cls.RUN_STATE_DIR, simulation_id, "run_state.json") if not os.path.exists(state_file): return None @@ -256,7 +256,7 @@ def _load_run_state(cls, simulation_id: str) -> Optional[SimulationRunState]: total_rounds=data.get("total_rounds", 0), simulated_hours=data.get("simulated_hours", 0), total_simulation_hours=data.get("total_simulation_hours", 0), - # 各平台独立轮次和时间 + # Per-platform independent rounds and time twitter_current_round=data.get("twitter_current_round", 0), reddit_current_round=data.get("reddit_current_round", 0), twitter_simulated_hours=data.get("twitter_simulated_hours", 0), @@ -274,7 +274,7 @@ def _load_run_state(cls, simulation_id: str) -> Optional[SimulationRunState]: process_pid=data.get("process_pid"), ) - # 加载最近动作 + # Load recent actions actions_data = data.get("recent_actions", []) for a in actions_data: state.recent_actions.append(AgentAction( @@ -291,12 +291,12 @@ def _load_run_state(cls, simulation_id: str) -> Optional[SimulationRunState]: return state except Exception as e: - logger.error(f"加载运行状态失败: {str(e)}") + logger.error(f"Failed to load run state: {str(e)}") return None @classmethod def _save_run_state(cls, state: SimulationRunState): - """保存运行状态到文件""" + """Save run state to file""" sim_dir = os.path.join(cls.RUN_STATE_DIR, state.simulation_id) os.makedirs(sim_dir, exist_ok=True) state_file = os.path.join(sim_dir, "run_state.json") @@ -313,51 +313,51 @@ def start_simulation( cls, simulation_id: str, platform: str = "parallel", # twitter / reddit / parallel - max_rounds: int = None, # 最大模拟轮数(可选,用于截断过长的模拟) - enable_graph_memory_update: bool = False, # 是否将活动更新到图谱 - graph_id: str = None, # 图谱ID(启用图谱更新时必需) + max_rounds: int = None, # Max 
simulation rounds (optional, truncate overly long simulations) + enable_graph_memory_update: bool = False, # Whether to update activities to graph + graph_id: str = None, # Graph ID (required when graph update enabled) storage: 'GraphStorage' = None # GraphStorage instance (required if enable_graph_memory_update) ) -> SimulationRunState: """ - 启动模拟 + Start simulation Args: - simulation_id: 模拟ID - platform: 运行平台 (twitter/reddit/parallel) - max_rounds: 最大模拟轮数(可选,用于截断过长的模拟) - enable_graph_memory_update: 是否将Agent活动动态更新到图谱 - graph_id: 图谱ID(启用图谱更新时必需) + simulation_id: Simulation ID + platform: Run platform (twitter/reddit/parallel) + max_rounds: Max simulation rounds (optional, truncate overly long simulations) + enable_graph_memory_update: Whether to dynamically update Agent activities to graph + graph_id: Graph ID (required when graph update enabled) Returns: SimulationRunState """ - # 检查是否已在运行 + # Check if already running existing = cls.get_run_state(simulation_id) if existing and existing.runner_status in [RunnerStatus.RUNNING, RunnerStatus.STARTING]: - raise ValueError(f"模拟已在运行中: {simulation_id}") + raise ValueError(f"Simulation already running: {simulation_id}") - # 加载模拟配置 + # Load simulation config sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) config_path = os.path.join(sim_dir, "simulation_config.json") if not os.path.exists(config_path): - raise ValueError(f"模拟配置不存在,请先调用 /prepare 接口") + raise ValueError(f"Simulation config does not exist, please call /prepare endpoint first") with open(config_path, 'r', encoding='utf-8') as f: config = json.load(f) - # 初始化运行状态 + # Initialize run state time_config = config.get("time_config", {}) total_hours = time_config.get("total_simulation_hours", 72) minutes_per_round = time_config.get("minutes_per_round", 30) total_rounds = int(total_hours * 60 / minutes_per_round) - # 如果指定了最大轮数,则截断 + # If max rounds specified, truncate if max_rounds is not None and max_rounds > 0: original_rounds = total_rounds total_rounds = 
min(total_rounds, max_rounds) if total_rounds < original_rounds: - logger.info(f"轮数已截断: {original_rounds} -> {total_rounds} (max_rounds={max_rounds})") + logger.info(f"Rounds truncated: {original_rounds} -> {total_rounds} (max_rounds={max_rounds})") state = SimulationRunState( simulation_id=simulation_id, @@ -369,24 +369,24 @@ def start_simulation( cls._save_run_state(state) - # 如果启用图谱记忆更新,创建更新器 + # If graph memory update enabled, create updater if enable_graph_memory_update: if not graph_id: - raise ValueError("启用图谱记忆更新时必须提供 graph_id") + raise ValueError("graph_id must be provided when graph memory update is enabled") try: if not storage: - raise ValueError("启用图谱记忆更新时必须提供 storage (GraphStorage)") + raise ValueError("storage (GraphStorage) must be provided when graph memory update is enabled") GraphMemoryManager.create_updater(simulation_id, graph_id, storage) cls._graph_memory_enabled[simulation_id] = True - logger.info(f"已启用图谱记忆更新: simulation_id={simulation_id}, graph_id={graph_id}") + logger.info(f"Graph memory update enabled: simulation_id={simulation_id}, graph_id={graph_id}") except Exception as e: - logger.error(f"创建图谱记忆更新器失败: {e}") + logger.error(f"Failed to create graph memory updater: {e}") cls._graph_memory_enabled[simulation_id] = False else: cls._graph_memory_enabled[simulation_id] = False - # 确定运行哪个脚本(脚本位于 backend/scripts/ 目录) + # Determine which script to run (scripts in backend/scripts/ directory) if platform == "twitter": script_name = "run_twitter_simulation.py" state.twitter_running = True @@ -401,64 +401,64 @@ def start_simulation( script_path = os.path.join(cls.SCRIPTS_DIR, script_name) if not os.path.exists(script_path): - raise ValueError(f"脚本不存在: {script_path}") + raise ValueError(f"Script does not exist: {script_path}") - # 创建动作队列 + # Create action queue action_queue = Queue() cls._action_queues[simulation_id] = action_queue - # 启动模拟进程 + # Start simulation process try: - # 构建运行命令,使用完整路径 - # 新的日志结构: - # twitter/actions.jsonl - Twitter 动作日志 - 
# reddit/actions.jsonl - Reddit 动作日志 - # simulation.log - 主进程日志 + # Build run command with full paths + # New log structure: + # twitter/actions.jsonl - Twitter action log + # reddit/actions.jsonl - Reddit action log + # simulation.log - Main process log cmd = [ - sys.executable, # Python解释器 + sys.executable, # Python interpreter script_path, - "--config", config_path, # 使用完整配置文件路径 + "--config", config_path, # Use full config file path ] - # 如果指定了最大轮数,添加到命令行参数 + # If max rounds specified, add to command line args if max_rounds is not None and max_rounds > 0: cmd.extend(["--max-rounds", str(max_rounds)]) - # 创建主日志文件,避免 stdout/stderr 管道缓冲区满导致进程阻塞 + # Create main log file to avoid process blocking from full stdout/stderr pipe buffer main_log_path = os.path.join(sim_dir, "simulation.log") main_log_file = open(main_log_path, 'w', encoding='utf-8') - # 设置子进程环境变量,确保 Windows 上使用 UTF-8 编码 - # 这可以修复第三方库(如 OASIS)读取文件时未指定编码的问题 + # Set subprocess env vars to ensure UTF-8 encoding on Windows + # This fixes encoding issues when third-party libraries (e.g. OASIS) read files without specifying encoding env = os.environ.copy() - env['PYTHONUTF8'] = '1' # Python 3.7+ 支持,让所有 open() 默认使用 UTF-8 - env['PYTHONIOENCODING'] = 'utf-8' # 确保 stdout/stderr 使用 UTF-8 + env['PYTHONUTF8'] = '1' # Python 3.7+ support, make all open() default to UTF-8 + env['PYTHONIOENCODING'] = 'utf-8' # Ensure stdout/stderr use UTF-8 - # 设置工作目录为模拟目录(数据库等文件会生成在此) - # 使用 start_new_session=True 创建新的进程组,确保可以通过 os.killpg 终止所有子进程 + # Set working directory to simulation directory (databases etc. 
are generated here) + # Use start_new_session=True to create new process group, ensuring all child processes can be terminated via os.killpg process = subprocess.Popen( cmd, cwd=sim_dir, stdout=main_log_file, - stderr=subprocess.STDOUT, # stderr 也写入同一个文件 + stderr=subprocess.STDOUT, # stderr also written to same file text=True, - encoding='utf-8', # 显式指定编码 + encoding='utf-8', # Explicitly specify encoding bufsize=1, - env=env, # 传递带有 UTF-8 设置的环境变量 + env=env, # Pass environment variables with UTF-8 settings - start_new_session=True, # 创建新进程组,确保服务器关闭时能终止所有相关进程 + start_new_session=True, # Create new process group to ensure all related processes can be terminated when server shuts down ) - # 保存文件句柄以便后续关闭 + # Save file handles for later closing cls._stdout_files[simulation_id] = main_log_file - cls._stderr_files[simulation_id] = None # 不再需要单独的 stderr + cls._stderr_files[simulation_id] = None # No longer need separate stderr state.process_pid = process.pid state.runner_status = RunnerStatus.RUNNING cls._processes[simulation_id] = process cls._save_run_state(state) - # 启动监控线程 + # Start monitoring thread monitor_thread = threading.Thread( target=cls._monitor_simulation, args=(simulation_id,), @@ -467,7 +467,7 @@ def start_simulation( monitor_thread.start() cls._monitor_threads[simulation_id] = monitor_thread - logger.info(f"模拟启动成功: {simulation_id}, pid={process.pid}, platform={platform}") + logger.info(f"Simulation started successfully: {simulation_id}, pid={process.pid}, platform={platform}") except Exception as e: state.runner_status = RunnerStatus.FAILED @@ -479,10 +479,10 @@ def start_simulation( @classmethod def _monitor_simulation(cls, simulation_id: str): - """监控模拟进程,解析动作日志""" + """Monitor simulation process, parse action logs""" sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) - # 新的日志结构:分平台的动作日志 + # New log structure: per-platform action logs twitter_actions_log = os.path.join(sim_dir, "twitter", "actions.jsonl") reddit_actions_log = os.path.join(sim_dir,
"reddit", "actions.jsonl") @@ -496,75 +496,75 @@ def _monitor_simulation(cls, simulation_id: str): reddit_position = 0 try: - while process.poll() is None: # 进程仍在运行 - # 读取 Twitter 动作日志 + while process.poll() is None: # Process still running + # Read Twitter action log if os.path.exists(twitter_actions_log): twitter_position = cls._read_action_log( twitter_actions_log, twitter_position, state, "twitter" ) - # 读取 Reddit 动作日志 + # Read Reddit action log if os.path.exists(reddit_actions_log): reddit_position = cls._read_action_log( reddit_actions_log, reddit_position, state, "reddit" ) - # 更新状态 + # Update status cls._save_run_state(state) time.sleep(2) - # 进程结束后,最后读取一次日志 + # After process ends, read logs one final time if os.path.exists(twitter_actions_log): cls._read_action_log(twitter_actions_log, twitter_position, state, "twitter") if os.path.exists(reddit_actions_log): cls._read_action_log(reddit_actions_log, reddit_position, state, "reddit") - # 进程结束 + # Process ended exit_code = process.returncode if exit_code == 0: state.runner_status = RunnerStatus.COMPLETED state.completed_at = datetime.now().isoformat() - logger.info(f"模拟完成: {simulation_id}") + logger.info(f"Simulation completed: {simulation_id}") else: state.runner_status = RunnerStatus.FAILED - # 从主日志文件读取错误信息 + # Read error info from main log file main_log_path = os.path.join(sim_dir, "simulation.log") error_info = "" try: if os.path.exists(main_log_path): with open(main_log_path, 'r', encoding='utf-8') as f: - error_info = f.read()[-2000:] # 取最后2000字符 + error_info = f.read()[-2000:] # Get last 2000 chars except Exception: pass - state.error = f"进程退出码: {exit_code}, 错误: {error_info}" - logger.error(f"模拟失败: {simulation_id}, error={state.error}") + state.error = f"Process exit code: {exit_code}, error: {error_info}" + logger.error(f"Simulation failed: {simulation_id}, error={state.error}") state.twitter_running = False state.reddit_running = False cls._save_run_state(state) except Exception as e: - 
logger.error(f"监控线程异常: {simulation_id}, error={str(e)}") + logger.error(f"Monitor thread exception: {simulation_id}, error={str(e)}") state.runner_status = RunnerStatus.FAILED state.error = str(e) cls._save_run_state(state) finally: - # 停止图谱记忆更新器 + # Stop graph memory updater if cls._graph_memory_enabled.get(simulation_id, False): try: GraphMemoryManager.stop_updater(simulation_id) - logger.info(f"已停止图谱记忆更新: simulation_id={simulation_id}") + logger.info(f"Stopped graph memory update: simulation_id={simulation_id}") except Exception as e: - logger.error(f"停止图谱记忆更新器失败: {e}") + logger.error(f"Failed to stop graph memory updater: {e}") cls._graph_memory_enabled.pop(simulation_id, None) - # 清理进程资源 + # Clean up process resources cls._processes.pop(simulation_id, None) cls._action_queues.pop(simulation_id, None) - # 关闭日志文件句柄 + # Close log file handles if simulation_id in cls._stdout_files: try: cls._stdout_files[simulation_id].close() @@ -587,18 +587,18 @@ def _read_action_log( platform: str ) -> int: """ - 读取动作日志文件 + Read action log file Args: - log_path: 日志文件路径 - position: 上次读取位置 - state: 运行状态对象 - platform: 平台名称 (twitter/reddit) + log_path: Log file path + position: Last read position + state: Run state object + platform: Platform name (twitter/reddit) Returns: - 新的读取位置 + New read position """ - # 检查是否启用了图谱记忆更新 + # Check if graph memory update is enabled graph_memory_enabled = cls._graph_memory_enabled.get(state.simulation_id, False) graph_updater = None if graph_memory_enabled: @@ -613,36 +613,36 @@ def _read_action_log( try: action_data = json.loads(line) - # 处理事件类型的条目 + # Handle event type entries if "event_type" in action_data: event_type = action_data.get("event_type") - # 检测 simulation_end 事件,标记平台已完成 + # Detect simulation_end event, mark platform as completed if event_type == "simulation_end": if platform == "twitter": state.twitter_completed = True state.twitter_running = False - logger.info(f"Twitter 模拟已完成: {state.simulation_id}, 
total_rounds={action_data.get('total_rounds')}, total_actions={action_data.get('total_actions')}") + logger.info(f"Twitter simulation completed: {state.simulation_id}, total_rounds={action_data.get('total_rounds')}, total_actions={action_data.get('total_actions')}") elif platform == "reddit": state.reddit_completed = True state.reddit_running = False - logger.info(f"Reddit 模拟已完成: {state.simulation_id}, total_rounds={action_data.get('total_rounds')}, total_actions={action_data.get('total_actions')}") + logger.info(f"Reddit simulation completed: {state.simulation_id}, total_rounds={action_data.get('total_rounds')}, total_actions={action_data.get('total_actions')}") - # 检查是否所有启用的平台都已完成 - # 如果只运行了一个平台,只检查那个平台 - # 如果运行了两个平台,需要两个都完成 + # Check if all enabled platforms have completed + # If only one platform ran, check only that one + # If two platforms ran, both need to complete all_completed = cls._check_all_platforms_completed(state) if all_completed: state.runner_status = RunnerStatus.COMPLETED state.completed_at = datetime.now().isoformat() - logger.info(f"所有平台模拟已完成: {state.simulation_id}") + logger.info(f"All platform simulations completed: {state.simulation_id}") - # 更新轮次信息(从 round_end 事件) + # Update round info (from round_end event) elif event_type == "round_end": round_num = action_data.get("round", 0) simulated_hours = action_data.get("simulated_hours", 0) - # 更新各平台独立的轮次和时间 + # Update per-platform independent rounds and time if platform == "twitter": if round_num > state.twitter_current_round: state.twitter_current_round = round_num @@ -652,10 +652,10 @@ def _read_action_log( state.reddit_current_round = round_num state.reddit_simulated_hours = simulated_hours - # 总体轮次取两个平台的最大值 + # Overall round takes the max of both platforms if round_num > state.current_round: state.current_round = round_num - # 总体时间取两个平台的最大值 + # Overall time takes the max of both platforms state.simulated_hours = max(state.twitter_simulated_hours, state.reddit_simulated_hours) continue @@ 
-673,11 +673,11 @@ def _read_action_log( ) state.add_action(action) - # 更新轮次 + # Update round if action.round_num and action.round_num > state.current_round: state.current_round = action.round_num - # 如果启用了图谱记忆更新,将活动发送到图谱 + # If graph memory update is enabled, send activity to graph if graph_updater: graph_updater.add_activity_from_dict(action_data, platform) @@ -685,52 +685,52 @@ def _read_action_log( pass return f.tell() except Exception as e: - logger.warning(f"读取动作日志失败: {log_path}, error={e}") + logger.warning(f"Failed to read action log: {log_path}, error={e}") return position @classmethod def _check_all_platforms_completed(cls, state: SimulationRunState) -> bool: """ - 检查所有启用的平台是否都已完成模拟 + Check if all enabled platforms have completed simulation - 通过检查对应的 actions.jsonl 文件是否存在来判断平台是否被启用 + Determine if platform is enabled by checking if corresponding actions.jsonl file exists Returns: - True 如果所有启用的平台都已完成 + True if all enabled platforms have completed """ sim_dir = os.path.join(cls.RUN_STATE_DIR, state.simulation_id) twitter_log = os.path.join(sim_dir, "twitter", "actions.jsonl") reddit_log = os.path.join(sim_dir, "reddit", "actions.jsonl") - # 检查哪些平台被启用(通过文件是否存在判断) + # Check which platforms are enabled (by file existence) twitter_enabled = os.path.exists(twitter_log) reddit_enabled = os.path.exists(reddit_log) - # 如果平台被启用但未完成,则返回 False + # If platform is enabled but not completed, return False if twitter_enabled and not state.twitter_completed: return False if reddit_enabled and not state.reddit_completed: return False - # 至少有一个平台被启用且已完成 + # At least one platform is enabled and completed return twitter_enabled or reddit_enabled @classmethod def _terminate_process(cls, process: subprocess.Popen, simulation_id: str, timeout: int = 10): """ - 跨平台终止进程及其子进程 + Cross-platform termination of process and child processes Args: - process: 要终止的进程 - simulation_id: 模拟ID(用于日志) - timeout: 等待进程退出的超时时间(秒) + process: Process to terminate + simulation_id: Simulation ID (used for 
logging) + timeout: Timeout for process exit (seconds) """ if IS_WINDOWS: - # Windows: 使用 taskkill 命令终止进程树 - # /F = 强制终止, /T = 终止进程树(包括子进程) - logger.info(f"终止进程树 (Windows): simulation={simulation_id}, pid={process.pid}") + # Windows: Use taskkill command to terminate process tree + # /F = force terminate, /T = terminate process tree (including child processes) + logger.info(f"Terminating process tree (Windows): simulation={simulation_id}, pid={process.pid}") try: - # 先尝试优雅终止 + # Try graceful termination first subprocess.run( ['taskkill', '/PID', str(process.pid), '/T'], capture_output=True, @@ -739,8 +739,8 @@ def _terminate_process(cls, process: subprocess.Popen, simulation_id: str, timeo try: process.wait(timeout=timeout) except subprocess.TimeoutExpired: - # 强制终止 - logger.warning(f"进程未响应,强制终止: {simulation_id}") + # Force terminate + logger.warning(f"Process not responding, force terminating: {simulation_id}") subprocess.run( ['taskkill', '/F', '/PID', str(process.pid), '/T'], capture_output=True, @@ -748,53 +748,53 @@ def _terminate_process(cls, process: subprocess.Popen, simulation_id: str, timeo ) process.wait(timeout=5) except Exception as e: - logger.warning(f"taskkill 失败,尝试 terminate: {e}") + logger.warning(f"taskkill failed, trying terminate: {e}") process.terminate() try: process.wait(timeout=5) except subprocess.TimeoutExpired: process.kill() else: - # Unix: 使用进程组终止 - # 由于使用了 start_new_session=True,进程组 ID 等于主进程 PID + # Unix: Use process group termination + # Since start_new_session=True was used, process group ID equals main process PID pgid = os.getpgid(process.pid) - logger.info(f"终止进程组 (Unix): simulation={simulation_id}, pgid={pgid}") + logger.info(f"Terminating process group (Unix): simulation={simulation_id}, pgid={pgid}") - # 先发送 SIGTERM 给整个进程组 + # Send SIGTERM to entire process group first os.killpg(pgid, signal.SIGTERM) try: process.wait(timeout=timeout) except subprocess.TimeoutExpired: - # 如果超时后还没结束,强制发送 SIGKILL - logger.warning(f"进程组未响应 
SIGTERM,强制终止: {simulation_id}") + # If still running after timeout, force send SIGKILL + logger.warning(f"Process group not responding to SIGTERM, force terminating: {simulation_id}") os.killpg(pgid, signal.SIGKILL) process.wait(timeout=5) @classmethod def stop_simulation(cls, simulation_id: str) -> SimulationRunState: - """停止模拟""" + """Stop simulation""" state = cls.get_run_state(simulation_id) if not state: - raise ValueError(f"模拟不存在: {simulation_id}") + raise ValueError(f"Simulation does not exist: {simulation_id}") if state.runner_status not in [RunnerStatus.RUNNING, RunnerStatus.PAUSED]: - raise ValueError(f"模拟未在运行: {simulation_id}, status={state.runner_status}") + raise ValueError(f"Simulation not running: {simulation_id}, status={state.runner_status}") state.runner_status = RunnerStatus.STOPPING cls._save_run_state(state) - # 终止进程 + # Terminate process process = cls._processes.get(simulation_id) if process and process.poll() is None: try: cls._terminate_process(process, simulation_id) except ProcessLookupError: - # 进程已经不存在 + # Process no longer exists pass except Exception as e: - logger.error(f"终止进程组失败: {simulation_id}, error={e}") - # 回退到直接终止进程 + logger.error(f"Failed to terminate process group: {simulation_id}, error={e}") + # Fall back to direct process termination try: process.terminate() process.wait(timeout=5) @@ -807,16 +807,16 @@ def stop_simulation(cls, simulation_id: str) -> SimulationRunState: state.completed_at = datetime.now().isoformat() cls._save_run_state(state) - # 停止图谱记忆更新器 + # Stop graph memory updater if cls._graph_memory_enabled.get(simulation_id, False): try: GraphMemoryManager.stop_updater(simulation_id) - logger.info(f"已停止图谱记忆更新: simulation_id={simulation_id}") + logger.info(f"Stopped graph memory update: simulation_id={simulation_id}") except Exception as e: - logger.error(f"停止图谱记忆更新器失败: {e}") + logger.error(f"Failed to stop graph memory updater: {e}") cls._graph_memory_enabled.pop(simulation_id, None) - logger.info(f"模拟已停止: 
{simulation_id}") + logger.info(f"Simulation stopped: {simulation_id}") return state @classmethod @@ -829,14 +829,14 @@ def _read_actions_from_file( round_num: Optional[int] = None ) -> List[AgentAction]: """ - 从单个动作文件中读取动作 + Read actions from a single action file Args: - file_path: 动作日志文件路径 - default_platform: 默认平台(当动作记录中没有 platform 字段时使用) - platform_filter: 过滤平台 - agent_id: 过滤 Agent ID - round_num: 过滤轮次 + file_path: Action log file path + default_platform: Default platform (used when action record has no platform field) + platform_filter: Filter by platform + agent_id: Filter by Agent ID + round_num: Filter by round """ if not os.path.exists(file_path): return [] @@ -852,18 +852,18 @@ def _read_actions_from_file( try: data = json.loads(line) - # 跳过非动作记录(如 simulation_start, round_start, round_end 等事件) + # Skip non-action records (e.g. simulation_start, round_start, round_end events) if "event_type" in data: continue - # 跳过没有 agent_id 的记录(非 Agent 动作) + # Skip records without agent_id (non-Agent actions) if "agent_id" not in data: continue - # 获取平台:优先使用记录中的 platform,否则使用默认平台 + # Get platform: prefer platform from record, otherwise use default record_platform = data.get("platform") or default_platform or "" - # 过滤 + # Filter if platform_filter and record_platform != platform_filter: continue if agent_id is not None and data.get("agent_id") != agent_id: @@ -897,54 +897,54 @@ def get_all_actions( round_num: Optional[int] = None ) -> List[AgentAction]: """ - 获取所有平台的完整动作历史(无分页限制) + Get complete action history from all platforms (no pagination limit) Args: - simulation_id: 模拟ID - platform: 过滤平台(twitter/reddit) - agent_id: 过滤Agent - round_num: 过滤轮次 + simulation_id: Simulation ID + platform: Filter by platform (twitter/reddit) + agent_id: Filter by Agent + round_num: Filter by round Returns: - 完整的动作列表(按时间戳排序,新的在前) + Complete action list (sorted by timestamp, newest first) """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) actions = [] - # 读取 Twitter 
动作文件(根据文件路径自动设置 platform 为 twitter) + # Read Twitter action file (auto-set platform to twitter based on file path) twitter_actions_log = os.path.join(sim_dir, "twitter", "actions.jsonl") if not platform or platform == "twitter": actions.extend(cls._read_actions_from_file( twitter_actions_log, - default_platform="twitter", # 自动填充 platform 字段 + default_platform="twitter", # Auto-fill platform field platform_filter=platform, agent_id=agent_id, round_num=round_num )) - # 读取 Reddit 动作文件(根据文件路径自动设置 platform 为 reddit) + # Read Reddit action file (auto-set platform to reddit based on file path) reddit_actions_log = os.path.join(sim_dir, "reddit", "actions.jsonl") if not platform or platform == "reddit": actions.extend(cls._read_actions_from_file( reddit_actions_log, - default_platform="reddit", # 自动填充 platform 字段 + default_platform="reddit", # Auto-fill platform field platform_filter=platform, agent_id=agent_id, round_num=round_num )) - # 如果分平台文件不存在,尝试读取旧的单一文件格式 + # If per-platform files not found, try reading old single-file format if not actions: actions_log = os.path.join(sim_dir, "actions.jsonl") actions = cls._read_actions_from_file( actions_log, - default_platform=None, # 旧格式文件中应该有 platform 字段 + default_platform=None, # Old format files should have platform field platform_filter=platform, agent_id=agent_id, round_num=round_num ) - # 按时间戳排序(新的在前) + # Sort by timestamp (newest first) actions.sort(key=lambda x: x.timestamp, reverse=True) return actions @@ -960,18 +960,18 @@ def get_actions( round_num: Optional[int] = None ) -> List[AgentAction]: """ - 获取动作历史(带分页) + Get action history (with pagination) Args: - simulation_id: 模拟ID - limit: 返回数量限制 - offset: 偏移量 - platform: 过滤平台 - agent_id: 过滤Agent - round_num: 过滤轮次 + simulation_id: Simulation ID + limit: Return count limit + offset: Offset + platform: Filter by platform + agent_id: Filter by Agent + round_num: Filter by round Returns: - 动作列表 + Action list """ actions = cls.get_all_actions( simulation_id=simulation_id, @@ 
-980,7 +980,7 @@ def get_actions( round_num=round_num ) - # 分页 + # Pagination return actions[offset:offset + limit] @classmethod @@ -991,19 +991,19 @@ def get_timeline( end_round: Optional[int] = None ) -> List[Dict[str, Any]]: """ - 获取模拟时间线(按轮次汇总) + Get simulation timeline (summarized by round) Args: - simulation_id: 模拟ID - start_round: 起始轮次 - end_round: 结束轮次 + simulation_id: Simulation ID + start_round: Start round + end_round: End round Returns: - 每轮的汇总信息 + Summary info per round """ actions = cls.get_actions(simulation_id, limit=10000) - # 按轮次分组 + # Group by round rounds: Dict[int, Dict[str, Any]] = {} for action in actions: @@ -1036,7 +1036,7 @@ def get_timeline( r["action_types"][action.action_type] = r["action_types"].get(action.action_type, 0) + 1 r["last_action_time"] = action.timestamp - # 转换为列表 + # Convert to list result = [] for round_num in sorted(rounds.keys()): r = rounds[round_num] @@ -1057,10 +1057,10 @@ def get_timeline( @classmethod def get_agent_stats(cls, simulation_id: str) -> List[Dict[str, Any]]: """ - 获取每个Agent的统计信息 + Get statistics for each Agent Returns: - Agent统计列表 + Agent statistics list """ actions = cls.get_actions(simulation_id, limit=10000) @@ -1092,7 +1092,7 @@ def get_agent_stats(cls, simulation_id: str) -> List[Dict[str, Any]]: stats["action_types"][action.action_type] = stats["action_types"].get(action.action_type, 0) + 1 stats["last_action_time"] = action.timestamp - # 按总动作数排序 + # Sort by total action count result = sorted(agent_stats.values(), key=lambda x: x["total_actions"], reverse=True) return result @@ -1100,51 +1100,51 @@ def get_agent_stats(cls, simulation_id: str) -> List[Dict[str, Any]]: @classmethod def cleanup_simulation_logs(cls, simulation_id: str) -> Dict[str, Any]: """ - 清理模拟的运行日志(用于强制重新开始模拟) + Clean up simulation run logs (for forcing a restart) - 会删除以下文件: + Will delete the following files: - run_state.json - twitter/actions.jsonl - reddit/actions.jsonl - simulation.log - stdout.log / stderr.log - - 
twitter_simulation.db(模拟数据库) - - reddit_simulation.db(模拟数据库) - - env_status.json(环境状态) + - twitter_simulation.db (simulation database) + - reddit_simulation.db (simulation database) + - env_status.json (environment status) - 注意:不会删除配置文件(simulation_config.json)和 profile 文件 + Note: Will not delete config files (simulation_config.json) and profile files Args: - simulation_id: 模拟ID + simulation_id: Simulation ID Returns: - 清理结果信息 + Cleanup result info """ import shutil sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) if not os.path.exists(sim_dir): - return {"success": True, "message": "模拟目录不存在,无需清理"} + return {"success": True, "message": "Simulation directory does not exist, no cleanup needed"} cleaned_files = [] errors = [] - # 要删除的文件列表(包括数据库文件) + # Files to delete (including database files) files_to_delete = [ "run_state.json", "simulation.log", "stdout.log", "stderr.log", - "twitter_simulation.db", # Twitter 平台数据库 - "reddit_simulation.db", # Reddit 平台数据库 - "env_status.json", # 环境状态文件 + "twitter_simulation.db", # Twitter platform database + "reddit_simulation.db", # Reddit platform database + "env_status.json", # Environment status file ] - # 要删除的目录列表(包含动作日志) + # Directories to delete (containing action logs) dirs_to_clean = ["twitter", "reddit"] - # 删除文件 + # Delete files for filename in files_to_delete: file_path = os.path.join(sim_dir, filename) if os.path.exists(file_path): @@ -1152,9 +1152,9 @@ def cleanup_simulation_logs(cls, simulation_id: str) -> Dict[str, Any]: os.remove(file_path) cleaned_files.append(filename) except Exception as e: - errors.append(f"删除 (unknown) 失败: {str(e)}") + errors.append(f"Failed to delete {filename}: {str(e)}") - # 清理平台目录中的动作日志 + # Clean action logs in platform directories for dir_name in dirs_to_clean: dir_path = os.path.join(sim_dir, dir_name) if os.path.exists(dir_path): @@ -1164,13 +1164,13 @@ def cleanup_simulation_logs(cls, simulation_id: str) -> Dict[str, Any]: try: os.remove(actions_file)
cleaned_files.append(f"{dir_name}/actions.jsonl") except Exception as e: - errors.append(f"删除 {dir_name}/actions.jsonl 失败: {str(e)}") + errors.append(f"Failed to delete {dir_name}/actions.jsonl: {str(e)}") - # 清理内存中的运行状态 + # Clear in-memory run state if simulation_id in cls._run_states: del cls._run_states[simulation_id] - logger.info(f"清理模拟日志完成: {simulation_id}, 删除文件: {cleaned_files}") + logger.info(f"Simulation log cleanup complete: {simulation_id}, deleted files: {cleaned_files}") return { "success": len(errors) == 0, @@ -1178,71 +1178,71 @@ def cleanup_simulation_logs(cls, simulation_id: str) -> Dict[str, Any]: "errors": errors if errors else None } - # 防止重复清理的标志 + # Flag to prevent duplicate cleanup _cleanup_done = False @classmethod def cleanup_all_simulations(cls): """ - 清理所有运行中的模拟进程 + Clean up all running simulation processes - 在服务器关闭时调用,确保所有子进程被终止 + Called on server shutdown to ensure all child processes are terminated """ - # 防止重复清理 + # Prevent duplicate cleanup if cls._cleanup_done: return cls._cleanup_done = True - # 检查是否有内容需要清理(避免空进程的进程打印无用日志) + # Check if there is anything to clean up (avoid useless logging for empty processes) has_processes = bool(cls._processes) has_updaters = bool(cls._graph_memory_enabled) if not has_processes and not has_updaters: - return # 没有需要清理的内容,静默返回 + return # Nothing to clean up, return silently - logger.info("正在清理所有模拟进程...") + logger.info("Cleaning up all simulation processes...") - # 首先停止所有图谱记忆更新器(stop_all 内部会打印日志) + # First stop all graph memory updaters (stop_all prints logs internally) try: GraphMemoryManager.stop_all() except Exception as e: - logger.error(f"停止图谱记忆更新器失败: {e}") + logger.error(f"Failed to stop graph memory updater: {e}") cls._graph_memory_enabled.clear() - # 复制字典以避免在迭代时修改 + # Copy dict to avoid modification during iteration processes = list(cls._processes.items()) for simulation_id, process in processes: try: - if process.poll() is None: # 进程仍在运行 - logger.info(f"终止模拟进程: {simulation_id}, 
pid={process.pid}") + if process.poll() is None: # Process still running + logger.info(f"Terminating simulation process: {simulation_id}, pid={process.pid}") try: - # 使用跨平台的进程终止方法 + # Use cross-platform process termination method cls._terminate_process(process, simulation_id, timeout=5) except (ProcessLookupError, OSError): - # 进程可能已经不存在,尝试直接终止 + # Process may no longer exist, try direct termination try: process.terminate() process.wait(timeout=3) except Exception: process.kill() - # 更新 run_state.json + # Update run_state.json state = cls.get_run_state(simulation_id) if state: state.runner_status = RunnerStatus.STOPPED state.twitter_running = False state.reddit_running = False state.completed_at = datetime.now().isoformat() - state.error = "服务器关闭,模拟被终止" + state.error = "Server shutdown, simulation terminated" cls._save_run_state(state) - # 同时更新 state.json,将状态设为 stopped + # Also update state.json, set status to stopped try: sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) state_file = os.path.join(sim_dir, "state.json") - logger.info(f"尝试更新 state.json: {state_file}") + logger.info(f"Trying to update state.json: {state_file}") if os.path.exists(state_file): with open(state_file, 'r', encoding='utf-8') as f: state_data = json.load(f) @@ -1250,16 +1250,16 @@ def cleanup_all_simulations(cls): state_data['updated_at'] = datetime.now().isoformat() with open(state_file, 'w', encoding='utf-8') as f: json.dump(state_data, f, indent=2, ensure_ascii=False) - logger.info(f"已更新 state.json 状态为 stopped: {simulation_id}") + logger.info(f"Updated state.json status to stopped: {simulation_id}") else: - logger.warning(f"state.json 不存在: {state_file}") + logger.warning(f"state.json does not exist: {state_file}") except Exception as state_err: - logger.warning(f"更新 state.json 失败: {simulation_id}, error={state_err}") + logger.warning(f"Failed to update state.json: {simulation_id}, error={state_err}") except Exception as e: - logger.error(f"清理进程失败: {simulation_id}, error={e}") + 
logger.error(f"Failed to clean up process: {simulation_id}, error={e}") - # 清理文件句柄 + # Clean up file handles for simulation_id, file_handle in list(cls._stdout_files.items()): try: if file_handle: @@ -1276,89 +1276,89 @@ def cleanup_all_simulations(cls): pass cls._stderr_files.clear() - # 清理内存中的状态 + # Clear in-memory state cls._processes.clear() cls._action_queues.clear() - logger.info("模拟进程清理完成") + logger.info("Simulation process cleanup complete") @classmethod def register_cleanup(cls): """ - 注册清理函数 + Register cleanup function - 在 Flask 应用启动时调用,确保服务器关闭时清理所有模拟进程 + Called on Flask app startup to ensure all simulation processes are cleaned up on server shutdown """ global _cleanup_registered if _cleanup_registered: return - # Flask debug 模式下,只在 reloader 子进程中注册清理(实际运行应用的进程) - # WERKZEUG_RUN_MAIN=true 表示是 reloader 子进程 - # 如果不是 debug 模式,则没有这个环境变量,也需要注册 + # In Flask debug mode, only register cleanup in the reloader child process (the process actually running the app) + # WERKZEUG_RUN_MAIN=true indicates this is the reloader child process + # If not in debug mode, this env variable won't exist, and we still need to register is_reloader_process = os.environ.get('WERKZEUG_RUN_MAIN') == 'true' is_debug_mode = os.environ.get('FLASK_DEBUG') == '1' or os.environ.get('WERKZEUG_RUN_MAIN') is not None - # 在 debug 模式下,只在 reloader 子进程中注册;非 debug 模式下始终注册 + # In debug mode, only register in the reloader child process; in non-debug mode, always register if is_debug_mode and not is_reloader_process: - _cleanup_registered = True # 标记已注册,防止子进程再次尝试 + _cleanup_registered = True # Mark as registered to prevent child process from trying again return - # 保存原有的信号处理器 + # Save original signal handlers original_sigint = signal.getsignal(signal.SIGINT) original_sigterm = signal.getsignal(signal.SIGTERM) - # SIGHUP 只在 Unix 系统存在(macOS/Linux),Windows 没有 + # SIGHUP only exists on Unix systems (macOS/Linux), not on Windows original_sighup = None has_sighup = hasattr(signal, 'SIGHUP') if has_sighup: 
original_sighup = signal.getsignal(signal.SIGHUP) def cleanup_handler(signum=None, frame=None): - """信号处理器:先清理模拟进程,再调用原处理器""" - # 只有在有进程需要清理时才打印日志 + """Signal handler: clean up simulation processes first, then call original handler""" + # Only print logs when there are processes to clean up if cls._processes or cls._graph_memory_enabled: - logger.info(f"收到信号 {signum},开始清理...") + logger.info(f"Received signal {signum}, starting cleanup...") cls.cleanup_all_simulations() - # 调用原有的信号处理器,让 Flask 正常退出 + # Call original signal handler to let Flask exit normally if signum == signal.SIGINT and callable(original_sigint): original_sigint(signum, frame) elif signum == signal.SIGTERM and callable(original_sigterm): original_sigterm(signum, frame) elif has_sighup and signum == signal.SIGHUP: - # SIGHUP: 终端关闭时发送 + # SIGHUP: sent when terminal closes if callable(original_sighup): original_sighup(signum, frame) else: - # 默认行为:正常退出 + # Default behavior: exit normally sys.exit(0) else: - # 如果原处理器不可调用(如 SIG_DFL),则使用默认行为 + # If original handler is not callable (e.g. 
SIG_DFL), use default behavior raise KeyboardInterrupt - # 注册 atexit 处理器(作为备用) + # Register atexit handler (as backup) atexit.register(cls.cleanup_all_simulations) - # 注册信号处理器(仅在主线程中) + # Register signal handlers (only in main thread) try: - # SIGTERM: kill 命令默认信号 + # SIGTERM: default signal for kill command signal.signal(signal.SIGTERM, cleanup_handler) # SIGINT: Ctrl+C signal.signal(signal.SIGINT, cleanup_handler) - # SIGHUP: 终端关闭(仅 Unix 系统) + # SIGHUP: terminal close (Unix only) if has_sighup: signal.signal(signal.SIGHUP, cleanup_handler) except ValueError: - # 不在主线程中,只能使用 atexit - logger.warning("无法注册信号处理器(不在主线程),仅使用 atexit") + # Not in main thread, can only use atexit + logger.warning("Cannot register signal handlers (not in main thread), using atexit only") _cleanup_registered = True @classmethod def get_running_simulations(cls) -> List[str]: """ - 获取所有正在运行的模拟ID列表 + Get list of all running simulation IDs """ running = [] for sim_id, process in cls._processes.items(): @@ -1366,18 +1366,18 @@ def get_running_simulations(cls) -> List[str]: running.append(sim_id) return running - # ============== Interview 功能 ============== + # ============== Interview Features ============== @classmethod def check_env_alive(cls, simulation_id: str) -> bool: """ - 检查模拟环境是否存活(可以接收Interview命令) + Check if simulation environment is alive (can receive Interview commands) Args: - simulation_id: 模拟ID + simulation_id: Simulation ID Returns: - True 表示环境存活,False 表示环境已关闭 + True if the environment is alive, False if the environment has been closed """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) if not os.path.exists(sim_dir): @@ -1389,13 +1389,13 @@ def check_env_alive(cls, simulation_id: str) -> bool: @classmethod def get_env_status_detail(cls, simulation_id: str) -> Dict[str, Any]: """ - 获取模拟环境的详细状态信息 + Get detailed status info of simulation environment Args: - simulation_id: 模拟ID + simulation_id: Simulation ID Returns: - 状态详情字典,包含 status, twitter_available, reddit_available, 
timestamp + Status detail dictionary containing status, twitter_available, reddit_available, timestamp """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) status_file = os.path.join(sim_dir, "env_status.json") @@ -1432,35 +1432,35 @@ def interview_agent( timeout: float = 60.0 ) -> Dict[str, Any]: """ - 采访单个Agent + Interview a single Agent Args: - simulation_id: 模拟ID + simulation_id: Simulation ID agent_id: Agent ID - prompt: 采访问题 - platform: 指定平台(可选) - - "twitter": 只采访Twitter平台 - - "reddit": 只采访Reddit平台 - - None: 双平台模拟时同时采访两个平台,返回整合结果 - timeout: 超时时间(秒) + prompt: Interview question + platform: Specify platform (optional) + - "twitter": Interview on Twitter platform only + - "reddit": Interview on Reddit platform only + - None: In dual-platform simulation, interview on both platforms and return combined results + timeout: Timeout in seconds Returns: - 采访结果字典 + Interview result dictionary Raises: - ValueError: 模拟不存在或环境未运行 - TimeoutError: 等待响应超时 + ValueError: Simulation does not exist or environment is not running + TimeoutError: Timed out waiting for response """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) if not os.path.exists(sim_dir): - raise ValueError(f"模拟不存在: {simulation_id}") + raise ValueError(f"Simulation does not exist: {simulation_id}") ipc_client = SimulationIPCClient(sim_dir) if not ipc_client.check_env_alive(): - raise ValueError(f"模拟环境未运行或已关闭,无法执行Interview: {simulation_id}") + raise ValueError(f"Simulation environment not running or closed, cannot perform Interview: {simulation_id}") - logger.info(f"发送Interview命令: simulation_id={simulation_id}, agent_id={agent_id}, platform={platform}") + logger.info(f"Sending Interview command: simulation_id={simulation_id}, agent_id={agent_id}, platform={platform}") response = ipc_client.send_interview( agent_id=agent_id, @@ -1495,34 +1495,34 @@ def interview_agents_batch( timeout: float = 120.0 ) -> Dict[str, Any]: """ - 批量采访多个Agent + Batch interview multiple Agents Args: - simulation_id: 模拟ID 
- interviews: 采访列表,每个元素包含 {"agent_id": int, "prompt": str, "platform": str(可选)} - platform: 默认平台(可选,会被每个采访项的platform覆盖) - - "twitter": 默认只采访Twitter平台 - - "reddit": 默认只采访Reddit平台 - - None: 双平台模拟时每个Agent同时采访两个平台 - timeout: 超时时间(秒) + simulation_id: Simulation ID + interviews: Interview list, each element contains {"agent_id": int, "prompt": str, "platform": str (optional)} + platform: Default platform (optional, overridden by each interview item's platform) + - "twitter": Default to interview on Twitter platform only + - "reddit": Default to interview on Reddit platform only + - None: In dual-platform simulation, interview each Agent on both platforms + timeout: Timeout in seconds Returns: - 批量采访结果字典 + Batch interview result dictionary Raises: - ValueError: 模拟不存在或环境未运行 - TimeoutError: 等待响应超时 + ValueError: Simulation does not exist or environment is not running + TimeoutError: Timed out waiting for response """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) if not os.path.exists(sim_dir): - raise ValueError(f"模拟不存在: {simulation_id}") + raise ValueError(f"Simulation does not exist: {simulation_id}") ipc_client = SimulationIPCClient(sim_dir) if not ipc_client.check_env_alive(): - raise ValueError(f"模拟环境未运行或已关闭,无法执行Interview: {simulation_id}") + raise ValueError(f"Simulation environment not running or closed, cannot perform Interview: {simulation_id}") - logger.info(f"发送批量Interview命令: simulation_id={simulation_id}, count={len(interviews)}, platform={platform}") + logger.info(f"Sending batch Interview command: simulation_id={simulation_id}, count={len(interviews)}, platform={platform}") response = ipc_client.send_batch_interview( interviews=interviews, @@ -1554,39 +1554,39 @@ def interview_all_agents( timeout: float = 180.0 ) -> Dict[str, Any]: """ - 采访所有Agent(全局采访) + Interview all Agents (global interview) - 使用相同的问题采访模拟中的所有Agent + Interview all Agents in the simulation with the same question Args: - simulation_id: 模拟ID - prompt: 采访问题(所有Agent使用相同问题) - platform: 
指定平台(可选) - - "twitter": 只采访Twitter平台 - - "reddit": 只采访Reddit平台 - - None: 双平台模拟时每个Agent同时采访两个平台 - timeout: 超时时间(秒) + simulation_id: Simulation ID + prompt: Interview question (same question used for all Agents) + platform: Specify platform (optional) + - "twitter": Interview on Twitter platform only + - "reddit": Interview on Reddit platform only + - None: In dual-platform simulation, interview each Agent on both platforms + timeout: Timeout in seconds Returns: - 全局采访结果字典 + Global interview result dictionary """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) if not os.path.exists(sim_dir): - raise ValueError(f"模拟不存在: {simulation_id}") + raise ValueError(f"Simulation does not exist: {simulation_id}") - # 从配置文件获取所有Agent信息 + # Get all Agent info from config file config_path = os.path.join(sim_dir, "simulation_config.json") if not os.path.exists(config_path): - raise ValueError(f"模拟配置不存在: {simulation_id}") + raise ValueError(f"Simulation config does not exist: {simulation_id}") with open(config_path, 'r', encoding='utf-8') as f: config = json.load(f) agent_configs = config.get("agent_configs", []) if not agent_configs: - raise ValueError(f"模拟配置中没有Agent: {simulation_id}") + raise ValueError(f"No Agents in simulation config: {simulation_id}") - # 构建批量采访列表 + # Build batch interview list interviews = [] for agent_config in agent_configs: agent_id = agent_config.get("agent_id") @@ -1596,7 +1596,7 @@ def interview_all_agents( "prompt": prompt }) - logger.info(f"发送全局Interview命令: simulation_id={simulation_id}, agent_count={len(interviews)}, platform={platform}") + logger.info(f"Sending global Interview command: simulation_id={simulation_id}, agent_count={len(interviews)}, platform={platform}") return cls.interview_agents_batch( simulation_id=simulation_id, @@ -1612,45 +1612,45 @@ def close_simulation_env( timeout: float = 30.0 ) -> Dict[str, Any]: """ - 关闭模拟环境(而不是停止模拟进程) + Close simulation environment (without stopping simulation process) - 向模拟发送关闭环境命令,使其优雅退出等待命令模式 + 
Send close environment command to simulation for graceful exit from command-waiting mode Args: - simulation_id: 模拟ID - timeout: 超时时间(秒) - + simulation_id: Simulation ID + timeout: Timeout in seconds + Returns: - 操作结果字典 + Operation result dictionary """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) if not os.path.exists(sim_dir): - raise ValueError(f"模拟不存在: {simulation_id}") + raise ValueError(f"Simulation does not exist: {simulation_id}") ipc_client = SimulationIPCClient(sim_dir) if not ipc_client.check_env_alive(): return { "success": True, - "message": "环境已经关闭" + "message": "Environment already closed" } - logger.info(f"发送关闭环境命令: simulation_id={simulation_id}") + logger.info(f"Sending close environment command: simulation_id={simulation_id}") try: response = ipc_client.send_close_env(timeout=timeout) return { "success": response.status.value == "completed", - "message": "环境关闭命令已发送", + "message": "Environment close command sent", "result": response.result, "timestamp": response.timestamp } except TimeoutError: - # 超时可能是因为环境正在关闭 + # Timeout may be because environment is shutting down return { "success": True, - "message": "环境关闭命令已发送(等待响应超时,环境可能正在关闭)" + "message": "Environment close command sent (response timed out, environment may be shutting down)" } @classmethod @@ -1661,7 +1661,7 @@ def _get_interview_history_from_db( agent_id: Optional[int] = None, limit: int = 100 ) -> List[Dict[str, Any]]: - """从单个数据库获取Interview历史""" + """Get Interview history from a single database""" import sqlite3 if not os.path.exists(db_path): @@ -1707,7 +1707,7 @@ def _get_interview_history_from_db( conn.close() except Exception as e: - logger.error(f"读取Interview历史失败 ({platform_name}): {e}") + logger.error(f"Failed to read Interview history ({platform_name}): {e}") return results @@ -1720,29 +1720,29 @@ def get_interview_history( limit: int = 100 ) -> List[Dict[str, Any]]: """ - 获取Interview历史记录(从数据库读取) + Get Interview history (read from database) Args: - simulation_id: 模拟ID - 
platform: 平台类型(reddit/twitter/None) - - "reddit": 只获取Reddit平台的历史 - - "twitter": 只获取Twitter平台的历史 - - None: 获取两个平台的所有历史 - agent_id: 指定Agent ID(可选,只获取该Agent的历史) - limit: 每个平台返回数量限制 + simulation_id: Simulation ID + platform: Platform type (reddit/twitter/None) + - "reddit": Get Reddit platform history only + - "twitter": Get Twitter platform history only + - None: Get all history from both platforms + agent_id: Specify Agent ID (optional, get only this Agent history) + limit: Return count limit per platform Returns: - Interview历史记录列表 + Interview history record list """ sim_dir = os.path.join(cls.RUN_STATE_DIR, simulation_id) results = [] - # 确定要查询的平台 + # Determine platforms to query if platform in ("reddit", "twitter"): platforms = [platform] else: - # 不指定platform时,查询两个平台 + # When platform not specified, query both platforms platforms = ["twitter", "reddit"] for p in platforms: @@ -1755,10 +1755,10 @@ def get_interview_history( ) results.extend(platform_results) - # 按时间降序排序 + # Sort by time in descending order results.sort(key=lambda x: x.get("timestamp", ""), reverse=True) - # 如果查询了多个平台,限制总数 + # If multiple platforms were queried, limit the total count if len(platforms) > 1 and len(results) > limit: results = results[:limit] diff --git a/backend/app/services/text_processor.py b/backend/app/services/text_processor.py index 91e32ac..4b8ea5b 100644 --- a/backend/app/services/text_processor.py +++ b/backend/app/services/text_processor.py @@ -1,5 +1,5 @@ """ -文本处理服务 +Text processing service """ from typing import List, Optional @@ -7,13 +7,13 @@ class TextProcessor: - """文本处理器""" - + """Text processor""" + @staticmethod def extract_from_files(file_paths: List[str]) -> str: - """从多个文件提取文本""" + """Extract text from multiple files""" return FileParser.extract_from_multiple(file_paths) - + @staticmethod def split_text( text: str, @@ -21,48 +21,48 @@ def split_text( overlap: int = 50 ) -> List[str]: """ - 分割文本 - + Split text into chunks + Args: - text: 原始文本 - chunk_size: 块大小 - 
overlap: 重叠大小 - + text: Original text + chunk_size: Chunk size + overlap: Overlap size + Returns: - 文本块列表 + List of text chunks """ return split_text_into_chunks(text, chunk_size, overlap) - + @staticmethod def preprocess_text(text: str) -> str: """ - 预处理文本 - - 移除多余空白 - - 标准化换行 - + Preprocess text + - Remove excess whitespace + - Normalize line breaks + Args: - text: 原始文本 - + text: Original text + Returns: - 处理后的文本 + Processed text """ import re - - # 标准化换行 + + # Normalize line breaks text = text.replace('\r\n', '\n').replace('\r', '\n') - - # 移除连续空行(保留最多两个换行) + + # Remove consecutive blank lines (keep at most two newlines) text = re.sub(r'\n{3,}', '\n\n', text) - - # 移除行首行尾空白 + + # Remove leading/trailing whitespace from lines lines = [line.strip() for line in text.split('\n')] text = '\n'.join(lines) - + return text.strip() - + @staticmethod def get_text_stats(text: str) -> dict: - """获取文本统计信息""" + """Get text statistics""" return { "total_chars": len(text), "total_lines": text.count('\n') + 1, diff --git a/backend/app/storage/ner_extractor.py b/backend/app/storage/ner_extractor.py index 5213357..dab8d93 100644 --- a/backend/app/storage/ner_extractor.py +++ b/backend/app/storage/ner_extractor.py @@ -21,12 +21,12 @@ {ontology_description} RULES: -1. Only extract entity types and relation types defined in the ontology. +1. Extract ALL named entities (people, organizations, places, etc.) from the text. Use ontology types when they fit, but also use generic types like "Person", "Organization", "Location", "Entity" for anything that doesn't match a specific ontology type. Do NOT skip entities just because they lack a matching ontology type. 2. Normalize entity names: strip whitespace, use canonical form (e.g., "Jack Ma" not "ma jack"). -3. Each entity must have: name, type (from ontology), and optional attributes. -4. Each relation must have: source entity name, target entity name, type (from ontology), and a fact sentence describing the relationship. +3. 
Each entity must have: name, type (from ontology or a generic type), and optional attributes. +4. Each relation must have: source entity name, target entity name, type (from ontology or a descriptive type like RELATED_TO, PART_OF, LOCATED_IN), and a fact sentence describing the relationship. 5. If no entities or relations are found, return empty lists. -6. Be precise — only extract what is explicitly stated or strongly implied in the text. +6. Be thorough — extract every person, organization, place, and concept mentioned in the text. More entities is better than fewer. Return ONLY valid JSON in this exact format: {{ @@ -82,10 +82,17 @@ def extract(self, text: str, ontology: Dict[str, Any]) -> Dict[str, Any]: try: result = self.llm.chat_json( messages=messages, - temperature=0.1, # Low temp for extraction precision + temperature=0.35, # Balanced: precise but catches implicit entities max_tokens=4096, ) - return self._validate_and_clean(result, ontology) + logger.info(f"[NER] Raw LLM result: {len(result.get('entities', []))} entities, {len(result.get('relations', []))} relations") + if not result.get('entities') and not result.get('relations'): + logger.warning(f"[NER] LLM returned empty extraction. 
Raw keys: {list(result.keys())}") + logger.warning(f"[NER] Raw result preview: {str(result)[:500]}") + cleaned = self._validate_and_clean(result, ontology) + if len(cleaned.get('entities', [])) < len(result.get('entities', [])): + logger.info(f"[NER] Validation dropped {len(result.get('entities', [])) - len(cleaned.get('entities', []))} entities") + return cleaned except ValueError as e: last_error = e diff --git a/backend/app/utils/__init__.py b/backend/app/utils/__init__.py index 5848792..656a5f0 100644 --- a/backend/app/utils/__init__.py +++ b/backend/app/utils/__init__.py @@ -1,5 +1,5 @@ """ -工具模块 +Utilities module """ from .file_parser import FileParser diff --git a/backend/app/utils/file_parser.py b/backend/app/utils/file_parser.py index 3f1d8ed..29ee55c 100644 --- a/backend/app/utils/file_parser.py +++ b/backend/app/utils/file_parser.py @@ -1,6 +1,6 @@ """ -文件解析工具 -支持PDF、Markdown、TXT文件的文本提取 +File parsing utilities +Supports text extraction from PDF, Markdown, and TXT files """ import os @@ -10,29 +10,29 @@ def _read_text_with_fallback(file_path: str) -> str: """ - 读取文本文件,UTF-8失败时自动探测编码。 - - 采用多级回退策略: - 1. 首先尝试 UTF-8 解码 - 2. 使用 charset_normalizer 检测编码 - 3. 回退到 chardet 检测编码 - 4. 最终使用 UTF-8 + errors='replace' 兜底 - + Read a text file, automatically detecting encoding if UTF-8 fails. + + Uses a multi-level fallback strategy: + 1. First try UTF-8 decoding + 2. Use charset_normalizer to detect encoding + 3. Fall back to chardet for encoding detection + 4. 
Final fallback: UTF-8 with errors='replace' + Args: - file_path: 文件路径 - + file_path: Path to the file + Returns: - 解码后的文本内容 + Decoded text content """ data = Path(file_path).read_bytes() - - # 首先尝试 UTF-8 + + # First try UTF-8 try: return data.decode('utf-8') except UnicodeDecodeError: pass - - # 尝试使用 charset_normalizer 检测编码 + + # Try using charset_normalizer to detect encoding encoding = None try: from charset_normalizer import from_bytes @@ -41,8 +41,8 @@ def _read_text_with_fallback(file_path: str) -> str: encoding = best.encoding except Exception: pass - - # 回退到 chardet + + # Fall back to chardet if not encoding: try: import chardet @@ -50,140 +50,139 @@ def _read_text_with_fallback(file_path: str) -> str: encoding = result.get('encoding') if result else None except Exception: pass - - # 最终兜底:使用 UTF-8 + replace + + # Final fallback: UTF-8 + replace if not encoding: encoding = 'utf-8' - + return data.decode(encoding, errors='replace') class FileParser: - """文件解析器""" - + """File parser""" + SUPPORTED_EXTENSIONS = {'.pdf', '.md', '.markdown', '.txt'} - + @classmethod def extract_text(cls, file_path: str) -> str: """ - 从文件中提取文本 - + Extract text from a file + Args: - file_path: 文件路径 - + file_path: Path to the file + Returns: - 提取的文本内容 + Extracted text content """ path = Path(file_path) - + if not path.exists(): - raise FileNotFoundError(f"文件不存在: {file_path}") - + raise FileNotFoundError(f"File not found: {file_path}") + suffix = path.suffix.lower() - + if suffix not in cls.SUPPORTED_EXTENSIONS: - raise ValueError(f"不支持的文件格式: {suffix}") - + raise ValueError(f"Unsupported file format: {suffix}") + if suffix == '.pdf': return cls._extract_from_pdf(file_path) elif suffix in {'.md', '.markdown'}: return cls._extract_from_md(file_path) elif suffix == '.txt': return cls._extract_from_txt(file_path) - - raise ValueError(f"无法处理的文件格式: {suffix}") - + + raise ValueError(f"Unable to process file format: {suffix}") + @staticmethod def _extract_from_pdf(file_path: str) -> str: - 
"""从PDF提取文本""" + """Extract text from PDF""" try: import fitz # PyMuPDF except ImportError: - raise ImportError("需要安装PyMuPDF: pip install PyMuPDF") - + raise ImportError("PyMuPDF is required: pip install PyMuPDF") + text_parts = [] with fitz.open(file_path) as doc: for page in doc: text = page.get_text() if text.strip(): text_parts.append(text) - + return "\n\n".join(text_parts) - + @staticmethod def _extract_from_md(file_path: str) -> str: - """从Markdown提取文本,支持自动编码检测""" + """Extract text from Markdown, with automatic encoding detection""" return _read_text_with_fallback(file_path) - + @staticmethod def _extract_from_txt(file_path: str) -> str: - """从TXT提取文本,支持自动编码检测""" + """Extract text from TXT, with automatic encoding detection""" return _read_text_with_fallback(file_path) - + @classmethod def extract_from_multiple(cls, file_paths: List[str]) -> str: """ - 从多个文件提取文本并合并 - + Extract and merge text from multiple files + Args: - file_paths: 文件路径列表 - + file_paths: List of file paths + Returns: - 合并后的文本 + Merged text """ all_texts = [] - + for i, file_path in enumerate(file_paths, 1): try: text = cls.extract_text(file_path) filename = Path(file_path).name - all_texts.append(f"=== 文档 {i}: {filename} ===\n{text}") + all_texts.append(f"=== Document {i}: {filename} ===\n{text}") except Exception as e: - all_texts.append(f"=== 文档 {i}: {file_path} (提取失败: {str(e)}) ===") - + all_texts.append(f"=== Document {i}: {file_path} (extraction failed: {str(e)}) ===") + return "\n\n".join(all_texts) def split_text_into_chunks( - text: str, - chunk_size: int = 500, + text: str, + chunk_size: int = 500, overlap: int = 50 ) -> List[str]: """ - 将文本分割成小块 - + Split text into smaller chunks + Args: - text: 原始文本 - chunk_size: 每块的字符数 - overlap: 重叠字符数 - + text: Original text + chunk_size: Number of characters per chunk + overlap: Number of overlapping characters + Returns: - 文本块列表 + List of text chunks """ if len(text) <= chunk_size: return [text] if text.strip() else [] - + chunks = [] start = 
0 - + while start < len(text): end = start + chunk_size - - # 尝试在句子边界处分割 + + # Try to split at sentence boundaries if end < len(text): - # 查找最近的句子结束符 - for sep in ['。', '!', '?', '.\n', '!\n', '?\n', '\n\n', '. ', '! ', '? ']: + # Find the nearest sentence-ending delimiter + for sep in ['.\n', '!\n', '?\n', '\n\n', '. ', '! ', '? ']: last_sep = text[start:end].rfind(sep) if last_sep != -1 and last_sep > chunk_size * 0.3: end = start + last_sep + len(sep) break - + chunk = text[start:end].strip() if chunk: chunks.append(chunk) - - # 下一个块从重叠位置开始 + + # Next chunk starts from the overlap position start = end - overlap if end < len(text) else len(text) - - return chunks + return chunks diff --git a/backend/app/utils/llm_client.py b/backend/app/utils/llm_client.py index 69389e2..a579e5b 100644 --- a/backend/app/utils/llm_client.py +++ b/backend/app/utils/llm_client.py @@ -1,7 +1,7 @@ """ -LLM客户端封装 -统一使用OpenAI格式调用 -支持 Ollama num_ctx 参数防止 prompt 被截断 +LLM client wrapper +Unified OpenAI format for API calls +Supports Ollama num_ctx parameter to prevent prompt truncation """ import json @@ -14,7 +14,7 @@ class LLMClient: - """LLM客户端""" + """LLM client""" def __init__( self, @@ -28,7 +28,7 @@ def __init__( self.model = model or Config.LLM_MODEL_NAME if not self.api_key: - raise ValueError("LLM_API_KEY 未配置") + raise ValueError("LLM_API_KEY is not configured") self.client = OpenAI( api_key=self.api_key, @@ -52,16 +52,16 @@ def chat( response_format: Optional[Dict] = None ) -> str: """ - 发送聊天请求 + Send a chat request Args: - messages: 消息列表 - temperature: 温度参数 - max_tokens: 最大token数 - response_format: 响应格式(如JSON模式) + messages: List of messages + temperature: Temperature parameter + max_tokens: Maximum number of tokens + response_format: Response format (e.g. 
JSON mode) Returns: - 模型响应文本 + Model response text """ kwargs = { "model": self.model, @@ -81,7 +81,7 @@ def chat( response = self.client.chat.completions.create(**kwargs) content = response.choices[0].message.content - # 部分模型(如MiniMax M2.5)会在content中包含思考内容,需要移除 + # Some models (e.g. MiniMax M2.5) include reasoning content in the response, which needs to be removed content = re.sub(r'<think>[\s\S]*?</think>', '', content).strip() return content @@ -92,15 +92,15 @@ def chat_json( max_tokens: int = 4096 ) -> Dict[str, Any]: """ - 发送聊天请求并返回JSON + Send a chat request and return JSON Args: - messages: 消息列表 - temperature: 温度参数 - max_tokens: 最大token数 + messages: List of messages + temperature: Temperature parameter + max_tokens: Maximum number of tokens Returns: - 解析后的JSON对象 + Parsed JSON object """ response = self.chat( messages=messages, @@ -108,7 +108,7 @@ def chat_json( max_tokens=max_tokens, response_format={"type": "json_object"} ) - # 清理markdown代码块标记 + # Clean up markdown code block markers cleaned_response = response.strip() cleaned_response = re.sub(r'^```(?:json)?\s*\n?', '', cleaned_response, flags=re.IGNORECASE) cleaned_response = re.sub(r'\n?```\s*$', '', cleaned_response) @@ -117,4 +117,4 @@ def chat_json( try: return json.loads(cleaned_response) except json.JSONDecodeError: - raise ValueError(f"LLM返回的JSON格式无效: {cleaned_response}") + raise ValueError(f"Invalid JSON returned by LLM: {cleaned_response}") diff --git a/backend/app/utils/logger.py b/backend/app/utils/logger.py index 1978c0b..0134313 100644 --- a/backend/app/utils/logger.py +++ b/backend/app/utils/logger.py @@ -1,6 +1,6 @@ """ -日志配置模块 -提供统一的日志管理,同时输出到控制台和文件 +Logging configuration module +Provides unified log management with output to both console and file """ import os @@ -12,58 +12,58 @@ def _ensure_utf8_stdout(): """ - 确保 stdout/stderr 使用 UTF-8 编码 - 解决 Windows 控制台中文乱码问题 + Ensure stdout/stderr use UTF-8 encoding + Fixes encoding issues on the Windows console """ if sys.platform == 'win32': - # Windows 
下重新配置标准输出为 UTF-8 + # Reconfigure standard output to UTF-8 on Windows if hasattr(sys.stdout, 'reconfigure'): sys.stdout.reconfigure(encoding='utf-8', errors='replace') if hasattr(sys.stderr, 'reconfigure'): sys.stderr.reconfigure(encoding='utf-8', errors='replace') -# 日志目录 +# Log directory LOG_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'logs') def setup_logger(name: str = 'mirofish', level: int = logging.DEBUG) -> logging.Logger: """ - 设置日志器 - + Set up a logger + Args: - name: 日志器名称 - level: 日志级别 - + name: Logger name + level: Log level + Returns: - 配置好的日志器 + Configured logger """ - # 确保日志目录存在 + # Ensure log directory exists os.makedirs(LOG_DIR, exist_ok=True) - - # 创建日志器 + + # Create logger logger = logging.getLogger(name) logger.setLevel(level) - - # 阻止日志向上传播到根 logger,避免重复输出 + + # Prevent log propagation to root logger to avoid duplicate output logger.propagate = False - - # 如果已经有处理器,不重复添加 + + # If handlers already exist, do not add duplicates if logger.handlers: return logger - - # 日志格式 + + # Log format detailed_formatter = logging.Formatter( '[%(asctime)s] %(levelname)s [%(name)s.%(funcName)s:%(lineno)d] %(message)s', datefmt='%Y-%m-%d %H:%M:%S' ) - + simple_formatter = logging.Formatter( '[%(asctime)s] %(levelname)s: %(message)s', datefmt='%H:%M:%S' ) - - # 1. 文件处理器 - 详细日志(按日期命名,带轮转) + + # 1. File handler - detailed logs (named by date, with rotation) log_filename = datetime.now().strftime('%Y-%m-%d') + '.log' file_handler = RotatingFileHandler( os.path.join(LOG_DIR, log_filename), @@ -73,30 +73,30 @@ def setup_logger(name: str = 'mirofish', level: int = logging.DEBUG) -> logging. ) file_handler.setLevel(logging.DEBUG) file_handler.setFormatter(detailed_formatter) - - # 2. 控制台处理器 - 简洁日志(INFO及以上) - # 确保 Windows 下使用 UTF-8 编码,避免中文乱码 + + # 2. 
Console handler - concise logs (INFO and above) + # Ensure UTF-8 encoding on Windows to avoid garbled output _ensure_utf8_stdout() console_handler = logging.StreamHandler(sys.stdout) console_handler.setLevel(logging.INFO) console_handler.setFormatter(simple_formatter) - - # 添加处理器 + + # Add handlers logger.addHandler(file_handler) logger.addHandler(console_handler) - + return logger def get_logger(name: str = 'mirofish') -> logging.Logger: """ - 获取日志器(如果不存在则创建) - + Get a logger (creates one if it doesn't exist) + Args: - name: 日志器名称 - + name: Logger name + Returns: - 日志器实例 + Logger instance """ logger = logging.getLogger(name) if not logger.handlers: @@ -104,11 +104,11 @@ def get_logger(name: str = 'mirofish') -> logging.Logger: return logger -# 创建默认日志器 +# Create default logger logger = setup_logger() -# 便捷方法 +# Convenience methods def debug(msg, *args, **kwargs): logger.debug(msg, *args, **kwargs) @@ -123,4 +123,3 @@ def error(msg, *args, **kwargs): def critical(msg, *args, **kwargs): logger.critical(msg, *args, **kwargs) - diff --git a/backend/app/utils/retry.py b/backend/app/utils/retry.py index 819b1cf..736d2de 100644 --- a/backend/app/utils/retry.py +++ b/backend/app/utils/retry.py @@ -1,6 +1,6 @@ """ -API调用重试机制 -用于处理LLM等外部API调用的重试逻辑 +API call retry mechanism +Handles retry logic for external API calls such as LLM """ import time @@ -22,17 +22,17 @@ def retry_with_backoff( on_retry: Optional[Callable[[Exception, int], None]] = None ): """ - 带指数退避的重试装饰器 - + Retry decorator with exponential backoff + Args: - max_retries: 最大重试次数 - initial_delay: 初始延迟(秒) - max_delay: 最大延迟(秒) - backoff_factor: 退避因子 - jitter: 是否添加随机抖动 - exceptions: 需要重试的异常类型 - on_retry: 重试时的回调函数 (exception, retry_count) - + max_retries: Maximum number of retries + initial_delay: Initial delay in seconds + max_delay: Maximum delay in seconds + backoff_factor: Backoff multiplier + jitter: Whether to add random jitter + exceptions: Exception types that trigger a retry + on_retry: Callback function on 
retry (exception, retry_count) + Usage: @retry_with_backoff(max_retries=3) def call_llm_api(): @@ -43,36 +43,36 @@ def decorator(func: Callable) -> Callable: def wrapper(*args, **kwargs) -> Any: last_exception = None delay = initial_delay - + for attempt in range(max_retries + 1): try: return func(*args, **kwargs) - + except exceptions as e: last_exception = e - + if attempt == max_retries: - logger.error(f"函数 {func.__name__} 在 {max_retries} 次重试后仍失败: {str(e)}") + logger.error(f"Function {func.__name__} still failed after {max_retries} retries: {str(e)}") raise - - # 计算延迟 + + # Calculate delay current_delay = min(delay, max_delay) if jitter: current_delay = current_delay * (0.5 + random.random()) - + logger.warning( - f"函数 {func.__name__} 第 {attempt + 1} 次尝试失败: {str(e)}, " - f"{current_delay:.1f}秒后重试..." + f"Function {func.__name__} attempt {attempt + 1} failed: {str(e)}, " + f"retrying in {current_delay:.1f}s..." ) - + if on_retry: on_retry(e, attempt + 1) - + time.sleep(current_delay) delay *= backoff_factor - + raise last_exception - + return wrapper return decorator @@ -87,53 +87,53 @@ def retry_with_backoff_async( on_retry: Optional[Callable[[Exception, int], None]] = None ): """ - 异步版本的重试装饰器 + Async version of the retry decorator """ import asyncio - + def decorator(func: Callable) -> Callable: @functools.wraps(func) async def wrapper(*args, **kwargs) -> Any: last_exception = None delay = initial_delay - + for attempt in range(max_retries + 1): try: return await func(*args, **kwargs) - + except exceptions as e: last_exception = e - + if attempt == max_retries: - logger.error(f"异步函数 {func.__name__} 在 {max_retries} 次重试后仍失败: {str(e)}") + logger.error(f"Async function {func.__name__} still failed after {max_retries} retries: {str(e)}") raise - + current_delay = min(delay, max_delay) if jitter: current_delay = current_delay * (0.5 + random.random()) - + logger.warning( - f"异步函数 {func.__name__} 第 {attempt + 1} 次尝试失败: {str(e)}, " - f"{current_delay:.1f}秒后重试..." 
+ f"Async function {func.__name__} attempt {attempt + 1} failed: {str(e)}, " + f"retrying in {current_delay:.1f}s..." ) - + if on_retry: on_retry(e, attempt + 1) - + await asyncio.sleep(current_delay) delay *= backoff_factor - + raise last_exception - + return wrapper return decorator class RetryableAPIClient: """ - 可重试的API客户端封装 + Retryable API client wrapper """ - + def __init__( self, max_retries: int = 3, @@ -145,7 +145,7 @@ def __init__( self.initial_delay = initial_delay self.max_delay = max_delay self.backoff_factor = backoff_factor - + def call_with_retry( self, func: Callable, @@ -154,44 +154,44 @@ def call_with_retry( **kwargs ) -> Any: """ - 执行函数调用并在失败时重试 - + Execute a function call with retry on failure + Args: - func: 要调用的函数 - *args: 函数参数 - exceptions: 需要重试的异常类型 - **kwargs: 函数关键字参数 - + func: Function to call + *args: Function arguments + exceptions: Exception types that trigger a retry + **kwargs: Function keyword arguments + Returns: - 函数返回值 + Function return value """ last_exception = None delay = self.initial_delay - + for attempt in range(self.max_retries + 1): try: return func(*args, **kwargs) - + except exceptions as e: last_exception = e - + if attempt == self.max_retries: - logger.error(f"API调用在 {self.max_retries} 次重试后仍失败: {str(e)}") + logger.error(f"API call still failed after {self.max_retries} retries: {str(e)}") raise - + current_delay = min(delay, self.max_delay) current_delay = current_delay * (0.5 + random.random()) - + logger.warning( - f"API调用第 {attempt + 1} 次尝试失败: {str(e)}, " - f"{current_delay:.1f}秒后重试..." + f"API call attempt {attempt + 1} failed: {str(e)}, " + f"retrying in {current_delay:.1f}s..." 
) - + time.sleep(current_delay) delay *= self.backoff_factor - + raise last_exception - + def call_batch_with_retry( self, items: list, @@ -200,20 +200,20 @@ def call_batch_with_retry( continue_on_failure: bool = True ) -> Tuple[list, list]: """ - 批量调用并对每个失败项单独重试 - + Batch call with individual retry for each failed item + Args: - items: 要处理的项目列表 - process_func: 处理函数,接收单个item作为参数 - exceptions: 需要重试的异常类型 - continue_on_failure: 单项失败后是否继续处理其他项 - + items: List of items to process + process_func: Processing function that accepts a single item as argument + exceptions: Exception types that trigger a retry + continue_on_failure: Whether to continue processing other items after a single item fails + Returns: - (成功结果列表, 失败项列表) + (list of successful results, list of failed items) """ results = [] failures = [] - + for idx, item in enumerate(items): try: result = self.call_with_retry( @@ -222,17 +222,16 @@ def call_batch_with_retry( exceptions=exceptions ) results.append(result) - + except Exception as e: - logger.error(f"处理第 {idx + 1} 项失败: {str(e)}") + logger.error(f"Processing item {idx + 1} failed: {str(e)}") failures.append({ "index": idx, "item": item, "error": str(e) }) - + if not continue_on_failure: raise - - return results, failures + return results, failures diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 59ed9aa..4d51f71 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "mirofish-offline-backend" -version = "0.2.0-offline" +version = "0.2.0.post1" description = "MiroFish-Offline - Offline-first fork running on Neo4j + Ollama" requires-python = ">=3.11" license = { text = "AGPL-3.0" } @@ -9,27 +9,27 @@ authors = [ ] dependencies = [ - # 核心框架 + # Core framework "flask>=3.0.0", "flask-cors>=6.0.0", - # LLM 相关 + # LLM related "openai>=1.0.0", # Neo4j graph database driver "neo4j>=5.15.0", - # OASIS 社交媒体模拟 + # OASIS social media simulation "camel-oasis==0.2.5", "camel-ai==0.2.78", - # 文件处理 + # File 
processing "PyMuPDF>=1.24.0", - # 编码检测(支持非UTF-8编码的文本文件) + # Encoding detection (supports non-UTF-8 encoded text files) "charset-normalizer>=3.0.0", "chardet>=5.0.0", - # 工具库 + # Utility libraries "python-dotenv>=1.0.0", "pydantic>=2.0.0", ] diff --git a/backend/requirements.txt b/backend/requirements.txt index 5cffdbf..f80aa7e 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -5,12 +5,12 @@ # Install: pip install -r requirements.txt # =========================================== -# ============= 核心框架 ============= +# ============= Core Framework ============= flask>=3.0.0 flask-cors>=6.0.0 -# ============= LLM 相关 ============= -# OpenAI SDK(统一使用 OpenAI 格式调用 LLM / Ollama) +# ============= LLM Related ============= +# OpenAI SDK (unified OpenAI format for calling LLM / Ollama) openai>=1.0.0 # HTTP client for Ollama embedding API requests>=2.28.0 @@ -18,20 +18,20 @@ requests>=2.28.0 # ============= Neo4j Graph Database ============= neo4j>=5.15.0 -# ============= OASIS 社交媒体模拟 ============= -# OASIS 社交模拟框架 +# ============= OASIS Social Media Simulation ============= +# OASIS social simulation framework camel-oasis==0.2.5 camel-ai==0.2.78 -# ============= 文件处理 ============= +# ============= File Processing ============= PyMuPDF>=1.24.0 -# 编码检测(支持非UTF-8编码的文本文件) +# Encoding detection (supports non-UTF-8 encoded text files) charset-normalizer>=3.0.0 chardet>=5.0.0 -# ============= 工具库 ============= -# 环境变量加载 +# ============= Utility Libraries ============= +# Environment variable loading python-dotenv>=1.0.0 -# 数据验证 +# Data validation pydantic>=2.0.0 diff --git a/backend/run.py b/backend/run.py index 4e3b04f..fb5c295 100644 --- a/backend/run.py +++ b/backend/run.py @@ -1,21 +1,21 @@ """ -MiroFish Backend 启动入口 +MiroFish Backend entry point """ import os import sys -# 解决 Windows 控制台中文乱码问题:在所有导入之前设置 UTF-8 编码 +# Fix Windows console encoding issues: set UTF-8 encoding before all imports if sys.platform == 'win32': - # 设置环境变量确保 Python 使用 UTF-8 + # Set 
environment variable to ensure Python uses UTF-8 os.environ.setdefault('PYTHONIOENCODING', 'utf-8') - # 重新配置标准输出流为 UTF-8 + # Reconfigure standard output streams to UTF-8 if hasattr(sys.stdout, 'reconfigure'): sys.stdout.reconfigure(encoding='utf-8', errors='replace') if hasattr(sys.stderr, 'reconfigure'): sys.stderr.reconfigure(encoding='utf-8', errors='replace') -# 添加项目根目录到路径 +# Add project root directory to path sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from app import create_app @@ -23,28 +23,27 @@ def main(): - """主函数""" - # 验证配置 + """Main function""" + # Validate configuration errors = Config.validate() if errors: - print("配置错误:") + print("Configuration errors:") for err in errors: print(f" - {err}") - print("\n请检查 .env 文件中的配置") + print("\nPlease check the configuration in your .env file") sys.exit(1) - - # 创建应用 + + # Create application app = create_app() - - # 获取运行配置 + + # Get runtime configuration host = os.environ.get('FLASK_HOST', '0.0.0.0') port = int(os.environ.get('FLASK_PORT', 5001)) debug = Config.DEBUG - - # 启动服务 + + # Start server app.run(host=host, port=port, debug=debug, threaded=True) if __name__ == '__main__': main() - diff --git a/backend/scripts/action_logger.py b/backend/scripts/action_logger.py index 38d025a..39773ec 100644 --- a/backend/scripts/action_logger.py +++ b/backend/scripts/action_logger.py @@ -1,15 +1,15 @@ """ -动作日志记录器 -用于记录OASIS模拟中每个Agent的动作,供后端监控使用 +Action logger +Records each Agent's actions during OASIS simulation for backend monitoring -日志结构: +Log structure: sim_xxx/ ├── twitter/ - │ └── actions.jsonl # Twitter 平台动作日志 + │ └── actions.jsonl # Twitter platform action log ├── reddit/ - │ └── actions.jsonl # Reddit 平台动作日志 - ├── simulation.log # 主模拟进程日志 - └── run_state.json # 运行状态(API 查询用) + │ └── actions.jsonl # Reddit platform action log + ├── simulation.log # Main simulation process log + └── run_state.json # Run state (for API queries) """ import json @@ -20,15 +20,15 @@ class PlatformActionLogger: - 
"""单平台动作日志记录器""" - + """Single-platform action logger""" + def __init__(self, platform: str, base_dir: str): """ - 初始化日志记录器 - + Initialize the logger + Args: - platform: 平台名称 (twitter/reddit) - base_dir: 模拟目录的基础路径 + platform: Platform name (twitter/reddit) + base_dir: Base path of the simulation directory """ self.platform = platform self.base_dir = base_dir @@ -37,7 +37,7 @@ def __init__(self, platform: str, base_dir: str): self._ensure_dir() def _ensure_dir(self): - """确保目录存在""" + """Ensure the directory exists""" os.makedirs(self.log_dir, exist_ok=True) def log_action( @@ -50,7 +50,7 @@ def log_action( result: Optional[str] = None, success: bool = True ): - """记录一个动作""" + """Record an action""" entry = { "round": round_num, "timestamp": datetime.now().isoformat(), @@ -66,7 +66,7 @@ def log_action( f.write(json.dumps(entry, ensure_ascii=False) + '\n') def log_round_start(self, round_num: int, simulated_hour: int): - """记录轮次开始""" + """Record round start""" entry = { "round": round_num, "timestamp": datetime.now().isoformat(), @@ -78,7 +78,7 @@ def log_round_start(self, round_num: int, simulated_hour: int): f.write(json.dumps(entry, ensure_ascii=False) + '\n') def log_round_end(self, round_num: int, actions_count: int): - """记录轮次结束""" + """Record round end""" entry = { "round": round_num, "timestamp": datetime.now().isoformat(), @@ -90,7 +90,7 @@ def log_round_end(self, round_num: int, actions_count: int): f.write(json.dumps(entry, ensure_ascii=False) + '\n') def log_simulation_start(self, config: Dict[str, Any]): - """记录模拟开始""" + """Record simulation start""" entry = { "timestamp": datetime.now().isoformat(), "event_type": "simulation_start", @@ -103,7 +103,7 @@ def log_simulation_start(self, config: Dict[str, Any]): f.write(json.dumps(entry, ensure_ascii=False) + '\n') def log_simulation_end(self, total_rounds: int, total_actions: int): - """记录模拟结束""" + """Record simulation end""" entry = { "timestamp": datetime.now().isoformat(), "event_type": "simulation_end", 
@@ -118,35 +118,35 @@ def log_simulation_end(self, total_rounds: int, total_actions: int): class SimulationLogManager: """ - 模拟日志管理器 - 统一管理所有日志文件,按平台分离 + Simulation log manager + Manages all log files uniformly, separated by platform """ - + def __init__(self, simulation_dir: str): """ - 初始化日志管理器 - + Initialize the log manager + Args: - simulation_dir: 模拟目录路径 + simulation_dir: Simulation directory path """ self.simulation_dir = simulation_dir self.twitter_logger: Optional[PlatformActionLogger] = None self.reddit_logger: Optional[PlatformActionLogger] = None self._main_logger: Optional[logging.Logger] = None - # 设置主日志 + # Set up main logger self._setup_main_logger() - + def _setup_main_logger(self): - """设置主模拟日志""" + """Set up main simulation logger""" log_path = os.path.join(self.simulation_dir, "simulation.log") - # 创建 logger + # Create logger self._main_logger = logging.getLogger(f"simulation.{os.path.basename(self.simulation_dir)}") self._main_logger.setLevel(logging.INFO) self._main_logger.handlers.clear() - # 文件处理器 + # File handler file_handler = logging.FileHandler(log_path, encoding='utf-8', mode='w') file_handler.setLevel(logging.INFO) file_handler.setFormatter(logging.Formatter( @@ -155,7 +155,7 @@ def _setup_main_logger(self): )) self._main_logger.addHandler(file_handler) - # 控制台处理器 + # Console handler console_handler = logging.StreamHandler() console_handler.setLevel(logging.INFO) console_handler.setFormatter(logging.Formatter( @@ -167,19 +167,19 @@ def _setup_main_logger(self): self._main_logger.propagate = False def get_twitter_logger(self) -> PlatformActionLogger: - """获取 Twitter 平台日志记录器""" + """Get the Twitter platform logger""" if self.twitter_logger is None: self.twitter_logger = PlatformActionLogger("twitter", self.simulation_dir) return self.twitter_logger def get_reddit_logger(self) -> PlatformActionLogger: - """获取 Reddit 平台日志记录器""" + """Get the Reddit platform logger""" if self.reddit_logger is None: self.reddit_logger = 
PlatformActionLogger("reddit", self.simulation_dir) return self.reddit_logger def log(self, message: str, level: str = "info"): - """记录主日志""" + """Record main log""" if self._main_logger: getattr(self._main_logger, level.lower(), self._main_logger.info)(message) @@ -196,12 +196,12 @@ def debug(self, message: str): self.log(message, "debug") -# ============ 兼容旧接口 ============ +# ============ Legacy interface compatibility ============ class ActionLogger: """ - 动作日志记录器(兼容旧接口) - 建议使用 SimulationLogManager 代替 + Action logger (legacy interface compatibility) + It is recommended to use SimulationLogManager instead """ def __init__(self, log_path: str): @@ -288,12 +288,12 @@ def log_simulation_end(self, platform: str, total_rounds: int, total_actions: in f.write(json.dumps(entry, ensure_ascii=False) + '\n') -# 全局日志实例(兼容旧接口) +# Global logger instance (legacy interface compatibility) _global_logger: Optional[ActionLogger] = None def get_logger(log_path: Optional[str] = None) -> ActionLogger: - """获取全局日志实例(兼容旧接口)""" + """Get the global logger instance (legacy interface compatibility)""" global _global_logger if log_path: diff --git a/backend/scripts/run_parallel_simulation.py b/backend/scripts/run_parallel_simulation.py index 2a627ff..6df1a23 100644 --- a/backend/scripts/run_parallel_simulation.py +++ b/backend/scripts/run_parallel_simulation.py @@ -1,62 +1,62 @@ """ -OASIS 双平台并行模拟预设脚本 -同时运行Twitter和Reddit模拟,读取相同的配置文件 +OASIS dual-platform parallel simulation preset script +Runs Twitter and Reddit simulations simultaneously, reading the same config file -功能特性: -- 双平台(Twitter + Reddit)并行模拟 -- 完成模拟后不立即关闭环境,进入等待命令模式 -- 支持通过IPC接收Interview命令 -- 支持单个Agent采访和批量采访 -- 支持远程关闭环境命令 +Features: +- Dual-platform (Twitter + Reddit) parallel simulation +- After simulation completes, does not close environment immediately, enters command waiting mode +- Supports receiving Interview commands via IPC +- Supports single Agent interview and batch interview +- Supports remote close environment 
command -使用方式: +Usage: python run_parallel_simulation.py --config simulation_config.json - python run_parallel_simulation.py --config simulation_config.json --no-wait # 完成后立即关闭 + python run_parallel_simulation.py --config simulation_config.json --no-wait # Close immediately after completion python run_parallel_simulation.py --config simulation_config.json --twitter-only python run_parallel_simulation.py --config simulation_config.json --reddit-only -日志结构: +Log structure: sim_xxx/ ├── twitter/ - │ └── actions.jsonl # Twitter 平台动作日志 + │ └── actions.jsonl # Twitter platform action log ├── reddit/ - │ └── actions.jsonl # Reddit 平台动作日志 - ├── simulation.log # 主模拟进程日志 - └── run_state.json # 运行状态(API 查询用) + │ └── actions.jsonl # Reddit platform action log + ├── simulation.log # Main simulation process log + └── run_state.json # Run state (for API queries) """ # ============================================================ -# 解决 Windows 编码问题:在所有 import 之前设置 UTF-8 编码 -# 这是为了修复 OASIS 第三方库读取文件时未指定编码的问题 +# Fix Windows encoding issue: set UTF-8 encoding before all imports +# This fixes the issue where OASIS third-party libraries read files without specifying encoding # ============================================================ import sys import os if sys.platform == 'win32': - # 设置 Python 默认 I/O 编码为 UTF-8 - # 这会影响所有未指定编码的 open() 调用 + # Set Python default I/O encoding to UTF-8 + # This affects all open() calls without specified encoding os.environ.setdefault('PYTHONUTF8', '1') os.environ.setdefault('PYTHONIOENCODING', 'utf-8') - # 重新配置标准输出流为 UTF-8(解决控制台中文乱码) + # Reconfigure standard output streams to UTF-8 (fix console encoding issues) if hasattr(sys.stdout, 'reconfigure'): sys.stdout.reconfigure(encoding='utf-8', errors='replace') if hasattr(sys.stderr, 'reconfigure'): sys.stderr.reconfigure(encoding='utf-8', errors='replace') - # 强制设置默认编码(影响 open() 函数的默认编码) - # 注意:这需要在 Python 启动时就设置,运行时设置可能不生效 - # 所以我们还需要 monkey-patch 内置的 open 函数 + # Force set default encoding (affects open() 
function default encoding) + # Note: This needs to be set at Python startup, runtime setting may not take effect + # So we also need to monkey-patch the built-in open function import builtins _original_open = builtins.open def _utf8_open(file, mode='r', buffering=-1, encoding=None, errors=None, newline=None, closefd=True, opener=None): """ - 包装 open() 函数,对于文本模式默认使用 UTF-8 编码 - 这可以修复第三方库(如 OASIS)读取文件时未指定编码的问题 + Wrapper for open() that defaults to UTF-8 encoding for text mode + This fixes the issue where third-party libraries (like OASIS) read files without specifying encoding """ - # 只对文本模式(非二进制)且未指定编码的情况设置默认编码 + # Only set default encoding for text mode (non-binary) when encoding is not specified if encoding is None and 'b' not in mode: encoding = 'utf-8' return _original_open(file, mode, buffering, encoding, errors, @@ -77,52 +77,52 @@ def _utf8_open(file, mode='r', buffering=-1, encoding=None, errors=None, from typing import Dict, Any, List, Optional, Tuple -# 全局变量:用于信号处理 +# Global variables: for signal handling _shutdown_event = None _cleanup_done = False -# 添加 backend 目录到路径 -# 脚本固定位于 backend/scripts/ 目录 +# Add backend directory to path +# Script is fixed in backend/scripts/ directory _scripts_dir = os.path.dirname(os.path.abspath(__file__)) _backend_dir = os.path.abspath(os.path.join(_scripts_dir, '..')) _project_root = os.path.abspath(os.path.join(_backend_dir, '..')) sys.path.insert(0, _scripts_dir) sys.path.insert(0, _backend_dir) -# 加载项目根目录的 .env 文件(包含 LLM_API_KEY 等配置) +# Load .env file from project root (contains LLM_API_KEY and other configs) from dotenv import load_dotenv _env_file = os.path.join(_project_root, '.env') if os.path.exists(_env_file): load_dotenv(_env_file) - print(f"已加载环境配置: {_env_file}") + print(f"Loaded environment config: {_env_file}") else: - # 尝试加载 backend/.env + # Try loading backend/.env _backend_env = os.path.join(_backend_dir, '.env') if os.path.exists(_backend_env): load_dotenv(_backend_env) - print(f"已加载环境配置: {_backend_env}") + 
print(f"Loaded environment config: {_backend_env}") class MaxTokensWarningFilter(logging.Filter): - """过滤掉 camel-ai 关于 max_tokens 的警告(我们故意不设置 max_tokens,让模型自行决定)""" + """Filter out camel-ai warnings about max_tokens (we intentionally do not set max_tokens, letting the model decide)""" def filter(self, record): - # 过滤掉包含 max_tokens 警告的日志 + # Filter out logs containing max_tokens warnings if "max_tokens" in record.getMessage() and "Invalid or missing" in record.getMessage(): return False return True -# 在模块加载时立即添加过滤器,确保在 camel 代码执行前生效 +# Add filter immediately at module load time to ensure it takes effect before camel code executes logging.getLogger().addFilter(MaxTokensWarningFilter()) def disable_oasis_logging(): """ - 禁用 OASIS 库的详细日志输出 - OASIS 的日志太冗余(记录每个 agent 的观察和动作),我们使用自己的 action_logger + Disable detailed OASIS library log output + OASIS logging is too verbose (logs each agent's observations and actions), we use our own action_logger """ - # 禁用 OASIS 的所有日志器 + # Disable all OASIS loggers oasis_loggers = [ "social.agent", "social.twitter", @@ -133,22 +133,22 @@ def disable_oasis_logging(): for logger_name in oasis_loggers: logger = logging.getLogger(logger_name) - logger.setLevel(logging.CRITICAL) # 只记录严重错误 + logger.setLevel(logging.CRITICAL) # Only log critical errors logger.handlers.clear() logger.propagate = False def init_logging_for_simulation(simulation_dir: str): """ - 初始化模拟的日志配置 + Initialize simulation logging config Args: - simulation_dir: 模拟目录路径 + simulation_dir: Simulation directory path """ - # 禁用 OASIS 的详细日志 + # Disable detailed OASIS logging disable_oasis_logging() - # 清理旧的 log 目录(如果存在) + # Clean up old log directory (if exists) old_log_dir = os.path.join(simulation_dir, "log") if os.path.exists(old_log_dir): import shutil @@ -169,12 +169,12 @@ def init_logging_for_simulation(simulation_dir: str): generate_reddit_agent_graph ) except ImportError as e: - print(f"错误: 缺少依赖 {e}") - print("请先安装: pip install oasis-ai camel-ai") + print(f"Error: Missing 
dependency {e}") + print("Please install first: pip install oasis-ai camel-ai") sys.exit(1) -# Twitter可用动作(不包含INTERVIEW,INTERVIEW只能通过ManualAction手动触发) +# Twitter available actions (excludes INTERVIEW, INTERVIEW can only be triggered manually via ManualAction) TWITTER_ACTIONS = [ ActionType.CREATE_POST, ActionType.LIKE_POST, @@ -184,7 +184,7 @@ def init_logging_for_simulation(simulation_dir: str): ActionType.QUOTE_POST, ] -# Reddit可用动作(不包含INTERVIEW,INTERVIEW只能通过ManualAction手动触发) +# Reddit available actions (excludes INTERVIEW, INTERVIEW can only be triggered manually via ManualAction) REDDIT_ACTIONS = [ ActionType.LIKE_POST, ActionType.DISLIKE_POST, @@ -202,13 +202,13 @@ def init_logging_for_simulation(simulation_dir: str): ] -# IPC相关常量 +# IPC-related constants IPC_COMMANDS_DIR = "ipc_commands" IPC_RESPONSES_DIR = "ipc_responses" ENV_STATUS_FILE = "env_status.json" class CommandType: - """命令类型常量""" + """Command type constants""" INTERVIEW = "interview" BATCH_INTERVIEW = "batch_interview" CLOSE_ENV = "close_env" @@ -216,9 +216,9 @@ class CommandType: class ParallelIPCHandler: """ - 双平台IPC命令处理器 + Dual-platform IPC command handler - 管理两个平台的环境,处理Interview命令 + Manages environments for both platforms, handles Interview commands """ def __init__( @@ -239,12 +239,12 @@ def __init__( self.responses_dir = os.path.join(simulation_dir, IPC_RESPONSES_DIR) self.status_file = os.path.join(simulation_dir, ENV_STATUS_FILE) - # 确保目录存在 + # Ensure directories exist os.makedirs(self.commands_dir, exist_ok=True) os.makedirs(self.responses_dir, exist_ok=True) def update_status(self, status: str): - """更新环境状态""" + """Update environment status""" with open(self.status_file, 'w', encoding='utf-8') as f: json.dump({ "status": status, @@ -254,11 +254,11 @@ def update_status(self, status: str): }, f, ensure_ascii=False, indent=2) def poll_command(self) -> Optional[Dict[str, Any]]: - """轮询获取待处理命令""" + """Poll for pending commands""" if not os.path.exists(self.commands_dir): return None - # 
获取命令文件(按时间排序) + # Get command files (sorted by time) command_files = [] for filename in os.listdir(self.commands_dir): if filename.endswith('.json'): @@ -277,7 +277,7 @@ def poll_command(self) -> Optional[Dict[str, Any]]: return None def send_response(self, command_id: str, status: str, result: Dict = None, error: str = None): - """发送响应""" + """Send response""" response = { "command_id": command_id, "status": status, @@ -290,7 +290,7 @@ def send_response(self, command_id: str, status: str, result: Dict = None, error with open(response_file, 'w', encoding='utf-8') as f: json.dump(response, f, ensure_ascii=False, indent=2) - # 删除命令文件 + # Delete command file command_file = os.path.join(self.commands_dir, f"{command_id}.json") try: os.remove(command_file) @@ -299,13 +299,13 @@ def send_response(self, command_id: str, status: str, result: Dict = None, error def _get_env_and_graph(self, platform: str): """ - 获取指定平台的环境和agent_graph + Get environment and agent_graph for specified platform Args: - platform: 平台名称 ("twitter" 或 "reddit") + platform: Platform name ("twitter" or "reddit") Returns: - (env, agent_graph, platform_name) 或 (None, None, None) + (env, agent_graph, platform_name) or (None, None, None) """ if platform == "twitter" and self.twitter_env: return self.twitter_env, self.twitter_agent_graph, "twitter" @@ -316,15 +316,15 @@ def _get_env_and_graph(self, platform: str): async def _interview_single_platform(self, agent_id: int, prompt: str, platform: str) -> Dict[str, Any]: """ - 在单个平台上执行Interview + Execute Interview on a single platform Returns: - 包含结果的字典,或包含error的字典 + Dictionary containing result, or dictionary containing error """ env, agent_graph, actual_platform = self._get_env_and_graph(platform) if not env or not agent_graph: - return {"platform": platform, "error": f"{platform}平台不可用"} + return {"platform": platform, "error": f"{platform} platform unavailable"} try: agent = agent_graph.get_agent(agent_id) @@ -344,36 +344,36 @@ async def 
_interview_single_platform(self, agent_id: int, prompt: str, platform: async def handle_interview(self, command_id: str, agent_id: int, prompt: str, platform: str = None) -> bool: """ - 处理单个Agent采访命令 + Handle single Agent interview command Args: - command_id: 命令ID + command_id: Command ID agent_id: Agent ID - prompt: 采访问题 - platform: 指定平台(可选) - - "twitter": 只采访Twitter平台 - - "reddit": 只采访Reddit平台 - - None/不指定: 同时采访两个平台,返回整合结果 + prompt: Interview question + platform: Specify platform (optional) + - "twitter": Interview Twitter platform only + - "reddit": Interview Reddit platform only + - None/not specified: Interview both platforms simultaneously, return integrated results Returns: - True 表示成功,False 表示失败 + True indicates success, False indicates failure """ - # 如果指定了平台,只采访该平台 + # If platform specified, only interview that platform if platform in ("twitter", "reddit"): result = await self._interview_single_platform(agent_id, prompt, platform) if "error" in result: self.send_response(command_id, "failed", error=result["error"]) - print(f" Interview失败: agent_id={agent_id}, platform={platform}, error={result['error']}") + print(f" Interview failed: agent_id={agent_id}, platform={platform}, error={result['error']}") return False else: self.send_response(command_id, "completed", result=result) - print(f" Interview完成: agent_id={agent_id}, platform={platform}") + print(f" Interview completed: agent_id={agent_id}, platform={platform}") return True - # 未指定平台:同时采访两个平台 + # Platform not specified: interview both platforms simultaneously if not self.twitter_env and not self.reddit_env: - self.send_response(command_id, "failed", error="没有可用的模拟环境") + self.send_response(command_id, "failed", error="No available simulation environment") return False results = { @@ -383,7 +383,7 @@ async def handle_interview(self, command_id: str, agent_id: int, prompt: str, pl } success_count = 0 - # 并行采访两个平台 + # Interview both platforms in parallel tasks = [] platforms_to_interview = [] @@ -395,7 
+395,7 @@ async def handle_interview(self, command_id: str, agent_id: int, prompt: str, pl tasks.append(self._interview_single_platform(agent_id, prompt, "reddit")) platforms_to_interview.append("reddit") - # 并行执行 + # Execute in parallel platform_results = await asyncio.gather(*tasks) for platform_name, platform_result in zip(platforms_to_interview, platform_results): @@ -405,30 +405,30 @@ async def handle_interview(self, command_id: str, agent_id: int, prompt: str, pl if success_count > 0: self.send_response(command_id, "completed", result=results) - print(f" Interview完成: agent_id={agent_id}, 成功平台数={success_count}/{len(platforms_to_interview)}") + print(f" Interview completed: agent_id={agent_id}, successful platforms={success_count}/{len(platforms_to_interview)}") return True else: - errors = [f"{p}: {r.get('error', '未知错误')}" for p, r in results["platforms"].items()] + errors = [f"{p}: {r.get('error', 'Unknown error')}" for p, r in results["platforms"].items()] self.send_response(command_id, "failed", error="; ".join(errors)) - print(f" Interview失败: agent_id={agent_id}, 所有平台都失败") + print(f" Interview failed: agent_id={agent_id}, All platforms failed") return False async def handle_batch_interview(self, command_id: str, interviews: List[Dict], platform: str = None) -> bool: """ - 处理批量采访命令 + Handle batch interview command Args: - command_id: 命令ID + command_id: Command ID interviews: [{"agent_id": int, "prompt": str, "platform": str(optional)}, ...] 
- platform: 默认平台(可被每个interview项覆盖) - - "twitter": 只采访Twitter平台 - - "reddit": 只采访Reddit平台 - - None/不指定: 每个Agent同时采访两个平台 + platform: Default platform (can be overridden by each interview item) + - "twitter": Interview Twitter platform only + - "reddit": Interview Reddit platform only + - None/not specified: Interview each Agent on both platforms simultaneously """ - # 按平台分组 + # Group by platform twitter_interviews = [] reddit_interviews = [] - both_platforms_interviews = [] # 需要同时采访两个平台的 + both_platforms_interviews = [] # Those needing to interview on both platforms for interview in interviews: item_platform = interview.get("platform", platform) @@ -437,10 +437,10 @@ async def handle_batch_interview(self, command_id: str, interviews: List[Dict], elif item_platform == "reddit": reddit_interviews.append(interview) else: - # 未指定平台:两个平台都采访 + # Platform not specified: interview on both platforms both_platforms_interviews.append(interview) - # 把 both_platforms_interviews 拆分到两个平台 + # Split both_platforms_interviews to both platforms if both_platforms_interviews: if self.twitter_env: twitter_interviews.extend(both_platforms_interviews) @@ -449,7 +449,7 @@ async def handle_batch_interview(self, command_id: str, interviews: List[Dict], results = {} - # 处理Twitter平台的采访 + # Process Twitter platform interviews if twitter_interviews and self.twitter_env: try: twitter_actions = {} @@ -463,7 +463,7 @@ async def handle_batch_interview(self, command_id: str, interviews: List[Dict], action_args={"prompt": prompt} ) except Exception as e: - print(f" 警告: 无法获取Twitter Agent {agent_id}: {e}") + print(f" Warning: Unable to get Twitter Agent {agent_id}: {e}") if twitter_actions: await self.twitter_env.step(twitter_actions) @@ -474,9 +474,9 @@ async def handle_batch_interview(self, command_id: str, interviews: List[Dict], result["platform"] = "twitter" results[f"twitter_{agent_id}"] = result except Exception as e: - print(f" Twitter批量Interview失败: {e}") + print(f" Twitter batch Interview failed: 
{e}") - # 处理Reddit平台的采访 + # Process Reddit platform interviews if reddit_interviews and self.reddit_env: try: reddit_actions = {} @@ -490,7 +490,7 @@ async def handle_batch_interview(self, command_id: str, interviews: List[Dict], action_args={"prompt": prompt} ) except Exception as e: - print(f" 警告: 无法获取Reddit Agent {agent_id}: {e}") + print(f" Warning: Unable to get Reddit Agent {agent_id}: {e}") if reddit_actions: await self.reddit_env.step(reddit_actions) @@ -501,21 +501,21 @@ async def handle_batch_interview(self, command_id: str, interviews: List[Dict], result["platform"] = "reddit" results[f"reddit_{agent_id}"] = result except Exception as e: - print(f" Reddit批量Interview失败: {e}") + print(f" Reddit batch Interview failed: {e}") if results: self.send_response(command_id, "completed", result={ "interviews_count": len(results), "results": results }) - print(f" 批量Interview完成: {len(results)} 个Agent") + print(f" Batch Interview completed: {len(results)} Agents") return True else: - self.send_response(command_id, "failed", error="没有成功的采访") + self.send_response(command_id, "failed", error="No successful interviews") return False def _get_interview_result(self, agent_id: int, platform: str) -> Dict[str, Any]: - """从数据库获取最新的Interview结果""" + """Get latest Interview result from database""" db_path = os.path.join(self.simulation_dir, f"{platform}_simulation.db") result = { @@ -531,7 +531,7 @@ def _get_interview_result(self, agent_id: int, platform: str) -> Dict[str, Any]: conn = sqlite3.connect(db_path) cursor = conn.cursor() - # 查询最新的Interview记录 + # Query latest Interview record cursor.execute(""" SELECT user_id, info, created_at FROM trace @@ -553,16 +553,16 @@ def _get_interview_result(self, agent_id: int, platform: str) -> Dict[str, Any]: conn.close() except Exception as e: - print(f" 读取Interview结果失败: {e}") + print(f" Failed to read Interview result: {e}") return result async def process_commands(self) -> bool: """ - 处理所有待处理命令 + Process all pending commands Returns: - 
True 表示继续运行,False 表示应该退出 + True means continue running, False means should exit """ command = self.poll_command() if not command: @@ -572,7 +572,7 @@ async def process_commands(self) -> bool: command_type = command.get("command_type") args = command.get("args", {}) - print(f"\n收到IPC命令: {command_type}, id={command_id}") + print(f"\nReceived IPC command: {command_type}, id={command_id}") if command_type == CommandType.INTERVIEW: await self.handle_interview( @@ -592,25 +592,25 @@ async def process_commands(self) -> bool: return True elif command_type == CommandType.CLOSE_ENV: - print("收到关闭环境命令") - self.send_response(command_id, "completed", result={"message": "环境即将关闭"}) + print("Received close environment command") + self.send_response(command_id, "completed", result={"message": "Environment is about to close"}) return False else: - self.send_response(command_id, "failed", error=f"未知命令类型: {command_type}") + self.send_response(command_id, "failed", error=f"Unknown command type: {command_type}") return True def load_config(config_path: str) -> Dict[str, Any]: - """加载配置文件""" + """Load config file""" with open(config_path, 'r', encoding='utf-8') as f: return json.load(f) -# 需要过滤掉的非核心动作类型(这些动作对分析价值较低) +# Non-core action types to filter out (these actions have lower analytical value) FILTERED_ACTIONS = {'refresh', 'sign_up'} -# 动作类型映射表(数据库中的名称 -> 标准名称) +# Action type mapping (database name -> standard name) ACTION_TYPE_MAP = { 'create_post': 'CREATE_POST', 'like_post': 'LIKE_POST', @@ -632,15 +632,15 @@ def load_config(config_path: str) -> Dict[str, Any]: def get_agent_names_from_config(config: Dict[str, Any]) -> Dict[int, str]: """ - 从 simulation_config 中获取 agent_id -> entity_name 的映射 + Get agent_id -> entity_name mapping from simulation_config - 这样可以在 actions.jsonl 中显示真实的实体名称,而不是 "Agent_0" 这样的代号 + This allows displaying real entity names in actions.jsonl instead of codes like "Agent_0" Args: - config: simulation_config.json 的内容 + config: Contents of simulation_config.json 
Returns: - agent_id -> entity_name 的映射字典 + agent_id -> entity_name mapping dictionary """ agent_names = {} agent_configs = config.get("agent_configs", []) @@ -660,17 +660,17 @@ def fetch_new_actions_from_db( agent_names: Dict[int, str] ) -> Tuple[List[Dict[str, Any]], int]: """ - 从数据库中获取新的动作记录,并补充完整的上下文信息 + Get new action records from database and supplement with complete context info Args: - db_path: 数据库文件路径 - last_rowid: 上次读取的最大 rowid 值(使用 rowid 而不是 created_at,因为不同平台的 created_at 格式不同) - agent_names: agent_id -> agent_name 映射 + db_path: Database file path + last_rowid: Last read max rowid value (using rowid instead of created_at, because created_at formats differ across platforms) + agent_names: agent_id -> agent_name mapping Returns: (actions_list, new_last_rowid) - - actions_list: 动作列表,每个元素包含 agent_id, agent_name, action_type, action_args(含上下文信息) - - new_last_rowid: 新的最大 rowid 值 + - actions_list: Action list, each element contains agent_id, agent_name, action_type, action_args (with context info) + - new_last_rowid: New max rowid value """ actions = [] new_last_rowid = last_rowid @@ -682,8 +682,8 @@ def fetch_new_actions_from_db( conn = sqlite3.connect(db_path) cursor = conn.cursor() - # 使用 rowid 来追踪已处理的记录(rowid 是 SQLite 的内置自增字段) - # 这样可以避免 created_at 格式差异问题(Twitter 用整数,Reddit 用日期时间字符串) + # Use rowid to track processed records (rowid is SQLite built-in auto-increment field) + # This avoids created_at format difference issues (Twitter uses integers, Reddit uses datetime strings) cursor.execute(""" SELECT rowid, user_id, action, info FROM trace @@ -692,20 +692,20 @@ def fetch_new_actions_from_db( """, (last_rowid,)) for rowid, user_id, action, info_json in cursor.fetchall(): - # 更新最大 rowid + # Update max rowid new_last_rowid = rowid - # 过滤非核心动作 + # Filter out non-core actions if action in FILTERED_ACTIONS: continue - # 解析动作参数 + # Parse action parameters try: action_args = json.loads(info_json) if info_json else {} except json.JSONDecodeError: action_args = {} - # 
精简 action_args,只保留关键字段(保留完整内容,不截断) + # Simplify action_args, keep only key fields (preserve full content, no truncation) simplified_args = {} if 'content' in action_args: simplified_args['content'] = action_args['content'] @@ -726,10 +726,10 @@ def fetch_new_actions_from_db( if 'dislike_id' in action_args: simplified_args['dislike_id'] = action_args['dislike_id'] - # 转换动作类型名称 + # Convert action type name action_type = ACTION_TYPE_MAP.get(action, action.upper()) - # 补充上下文信息(帖子内容、用户名等) + # Supplement context info (post content, username, etc.) _enrich_action_context(cursor, action_type, simplified_args, agent_names) actions.append({ @@ -741,7 +741,7 @@ def fetch_new_actions_from_db( conn.close() except Exception as e: - print(f"读取数据库动作失败: {e}") + print(f"Failed to read database actions: {e}") return actions, new_last_rowid @@ -753,16 +753,16 @@ def _enrich_action_context( agent_names: Dict[int, str] ) -> None: """ - 为动作补充上下文信息(帖子内容、用户名等) + Supplement context info for action (post content, username, etc.) Args: - cursor: 数据库游标 - action_type: 动作类型 - action_args: 动作参数(会被修改) - agent_names: agent_id -> agent_name 映射 + cursor: Database cursor + action_type: Action type + action_args: Action parameters (will be modified) + agent_names: agent_id -> agent_name mapping """ try: - # 点赞/踩帖子:补充帖子内容和作者 + # Like/dislike post: supplement post content and author if action_type in ('LIKE_POST', 'DISLIKE_POST'): post_id = action_args.get('post_id') if post_id: @@ -771,11 +771,11 @@ def _enrich_action_context( action_args['post_content'] = post_info.get('content', '') action_args['post_author_name'] = post_info.get('author_name', '') - # 转发帖子:补充原帖内容和作者 + # Repost: supplement original post content and author elif action_type == 'REPOST': new_post_id = action_args.get('new_post_id') if new_post_id: - # 转发帖子的 original_post_id 指向原帖 + # Repost's original_post_id points to the original post cursor.execute(""" SELECT original_post_id FROM post WHERE post_id = ? 
""", (new_post_id,)) @@ -787,7 +787,7 @@ def _enrich_action_context( action_args['original_content'] = original_info.get('content', '') action_args['original_author_name'] = original_info.get('author_name', '') - # 引用帖子:补充原帖内容、作者和引用评论 + # Quote post: supplement original post content, author and quote comment elif action_type == 'QUOTE_POST': quoted_id = action_args.get('quoted_id') new_post_id = action_args.get('new_post_id') @@ -798,7 +798,7 @@ def _enrich_action_context( action_args['original_content'] = original_info.get('content', '') action_args['original_author_name'] = original_info.get('author_name', '') - # 获取引用帖子的评论内容(quote_content) + # Get quote post comment content (quote_content) if new_post_id: cursor.execute(""" SELECT quote_content FROM post WHERE post_id = ? @@ -807,11 +807,11 @@ def _enrich_action_context( if row and row[0]: action_args['quote_content'] = row[0] - # 关注用户:补充被关注用户的名称 + # Follow user: supplement followed user name elif action_type == 'FOLLOW': follow_id = action_args.get('follow_id') if follow_id: - # 从 follow 表获取 followee_id + # Get followee_id from follow table cursor.execute(""" SELECT followee_id FROM follow WHERE follow_id = ? 
""", (follow_id,)) @@ -822,16 +822,16 @@ def _enrich_action_context( if target_name: action_args['target_user_name'] = target_name - # 屏蔽用户:补充被屏蔽用户的名称 + # Mute user: supplement muted user name elif action_type == 'MUTE': - # 从 action_args 中获取 user_id 或 target_id + # Get user_id or target_id from action_args target_id = action_args.get('user_id') or action_args.get('target_id') if target_id: target_name = _get_user_name(cursor, target_id, agent_names) if target_name: action_args['target_user_name'] = target_name - # 点赞/踩评论:补充评论内容和作者 + # Like/dislike comment: supplement comment content and author elif action_type in ('LIKE_COMMENT', 'DISLIKE_COMMENT'): comment_id = action_args.get('comment_id') if comment_id: @@ -840,7 +840,7 @@ def _enrich_action_context( action_args['comment_content'] = comment_info.get('content', '') action_args['comment_author_name'] = comment_info.get('author_name', '') - # 发表评论:补充所评论的帖子信息 + # Create comment: supplement commented post info elif action_type == 'CREATE_COMMENT': post_id = action_args.get('post_id') if post_id: @@ -850,8 +850,8 @@ def _enrich_action_context( action_args['post_author_name'] = post_info.get('author_name', '') except Exception as e: - # 补充上下文失败不影响主流程 - print(f"补充动作上下文失败: {e}") + # Context supplementation failure does not affect main flow + print(f"Failed to supplement action context: {e}") def _get_post_info( @@ -860,15 +860,15 @@ def _get_post_info( agent_names: Dict[int, str] ) -> Optional[Dict[str, str]]: """ - 获取帖子信息 + Get post info Args: - cursor: 数据库游标 - post_id: 帖子ID - agent_names: agent_id -> agent_name 映射 + cursor: Database cursor + post_id: Post ID + agent_names: agent_id -> agent_name mapping Returns: - 包含 content 和 author_name 的字典,或 None + Dictionary containing content and author_name, or None """ try: cursor.execute(""" @@ -883,12 +883,12 @@ def _get_post_info( user_id = row[1] agent_id = row[2] - # 优先使用 agent_names 中的名称 + # Prefer names from agent_names author_name = '' if agent_id is not None and 
agent_id in agent_names: author_name = agent_names[agent_id] elif user_id: - # 从 user 表获取名称 + # Get name from user table cursor.execute("SELECT name, user_name FROM user WHERE user_id = ?", (user_id,)) user_row = cursor.fetchone() if user_row: @@ -906,15 +906,15 @@ def _get_user_name( agent_names: Dict[int, str] ) -> Optional[str]: """ - 获取用户名称 + Get user name Args: - cursor: 数据库游标 - user_id: 用户ID - agent_names: agent_id -> agent_name 映射 + cursor: Database cursor + user_id: User ID + agent_names: agent_id -> agent_name mapping Returns: - 用户名称,或 None + User name, or None """ try: cursor.execute(""" @@ -926,7 +926,7 @@ def _get_user_name( name = row[1] user_name = row[2] - # 优先使用 agent_names 中的名称 + # Prefer names from agent_names if agent_id is not None and agent_id in agent_names: return agent_names[agent_id] return name or user_name or '' @@ -941,15 +941,15 @@ def _get_comment_info( agent_names: Dict[int, str] ) -> Optional[Dict[str, str]]: """ - 获取评论信息 + Get comment info Args: - cursor: 数据库游标 - comment_id: 评论ID - agent_names: agent_id -> agent_name 映射 + cursor: Database cursor + comment_id: Comment ID + agent_names: agent_id -> agent_name mapping Returns: - 包含 content 和 author_name 的字典,或 None + Dictionary containing content and author_name, or None """ try: cursor.execute(""" @@ -964,12 +964,12 @@ def _get_comment_info( user_id = row[1] agent_id = row[2] - # 优先使用 agent_names 中的名称 + # Prefer names from agent_names author_name = '' if agent_id is not None and agent_id in agent_names: author_name = agent_names[agent_id] elif user_id: - # 从 user 表获取名称 + # Get name from user table cursor.execute("SELECT name, user_name FROM user WHERE user_id = ?", (user_id,)) user_row = cursor.fetchone() if user_row: @@ -983,53 +983,53 @@ def _get_comment_info( def create_model(config: Dict[str, Any], use_boost: bool = False): """ - 创建LLM模型 + Create LLM model - 支持双 LLM 配置,用于并行模拟时提速: - - 通用配置:LLM_API_KEY, LLM_BASE_URL, LLM_MODEL_NAME - - 加速配置(可选):LLM_BOOST_API_KEY, LLM_BOOST_BASE_URL, 
LLM_BOOST_MODEL_NAME + Supports dual LLM configuration for speeding up parallel simulations: + - General config:LLM_API_KEY, LLM_BASE_URL, LLM_MODEL_NAME + - Boost config (optional):LLM_BOOST_API_KEY, LLM_BOOST_BASE_URL, LLM_BOOST_MODEL_NAME - 如果配置了加速 LLM,并行模拟时可以让不同平台使用不同的 API 服务商,提高并发能力。 + If boost LLM is configured, different platforms can use different API providers during parallel simulation to improve concurrency. Args: - config: 模拟配置字典 - use_boost: 是否使用加速 LLM 配置(如果可用) + config: Simulation config dictionary + use_boost: Whether to use boost LLM config (if available) """ - # 检查是否有加速配置 + # Check if boost config exists boost_api_key = os.environ.get("LLM_BOOST_API_KEY", "") boost_base_url = os.environ.get("LLM_BOOST_BASE_URL", "") boost_model = os.environ.get("LLM_BOOST_MODEL_NAME", "") has_boost_config = bool(boost_api_key) - # 根据参数和配置情况选择使用哪个 LLM + # Choose which LLM to use based on parameters and config if use_boost and has_boost_config: - # 使用加速配置 + # Use boost config llm_api_key = boost_api_key llm_base_url = boost_base_url llm_model = boost_model or os.environ.get("LLM_MODEL_NAME", "") - config_label = "[加速LLM]" + config_label = "[Boost LLM]" else: - # 使用通用配置 + # Use general config llm_api_key = os.environ.get("LLM_API_KEY", "") llm_base_url = os.environ.get("LLM_BASE_URL", "") llm_model = os.environ.get("LLM_MODEL_NAME", "") - config_label = "[通用LLM]" + config_label = "[General LLM]" - # 如果 .env 中没有模型名,则使用 config 作为备用 + # If no model name in .env, use config as fallback if not llm_model: llm_model = config.get("llm_model", "gpt-4o-mini") - # 设置 camel-ai 所需的环境变量 + # Set environment variables required by camel-ai if llm_api_key: os.environ["OPENAI_API_KEY"] = llm_api_key if not os.environ.get("OPENAI_API_KEY"): - raise ValueError("缺少 API Key 配置,请在项目根目录 .env 文件中设置 LLM_API_KEY") + raise ValueError("Missing API Key configuration. 
Please set LLM_API_KEY in the project root .env file") if llm_base_url: os.environ["OPENAI_API_BASE_URL"] = llm_base_url - print(f"{config_label} model={llm_model}, base_url={llm_base_url[:40] if llm_base_url else '默认'}...") + print(f"{config_label} model={llm_model}, base_url={llm_base_url[:40] if llm_base_url else 'default'}...") return ModelFactory.create( model_platform=ModelPlatformType.OPENAI, @@ -1043,7 +1043,7 @@ def get_active_agents_for_round( current_hour: int, round_num: int ) -> List: - """根据时间和配置决定本轮激活哪些Agent""" + """Determine which Agents to activate this round based on time and config""" time_config = config.get("time_config", {}) agent_configs = config.get("agent_configs", []) @@ -1091,7 +1091,7 @@ def get_active_agents_for_round( class PlatformSimulation: - """平台模拟结果容器""" + """Platform simulation result container""" def __init__(self): self.env = None self.agent_graph = None @@ -1105,17 +1105,17 @@ async def run_twitter_simulation( main_logger: Optional[SimulationLogManager] = None, max_rounds: Optional[int] = None ) -> PlatformSimulation: - """运行Twitter模拟 + """Run Twitter simulation Args: - config: 模拟配置 - simulation_dir: 模拟目录 - action_logger: 动作日志记录器 - main_logger: 主日志管理器 - max_rounds: 最大模拟轮数(可选,用于截断过长的模拟) + config: Simulation config + simulation_dir: Simulation directory + action_logger: Action logger + main_logger: Main log manager + max_rounds: Maximum simulation rounds (optional, for truncating overly long simulations) Returns: - PlatformSimulation: 包含env和agent_graph的结果对象 + PlatformSimulation: Result object containing env and agent_graph """ result = PlatformSimulation() @@ -1124,15 +1124,15 @@ def log_info(msg): main_logger.info(f"[Twitter] {msg}") print(f"[Twitter] {msg}") - log_info("初始化...") + log_info("Initializing...") - # Twitter 使用通用 LLM 配置 + # Twitter uses general LLM config model = create_model(config, use_boost=False) - # OASIS Twitter使用CSV格式 + # OASIS Twitter uses CSV format profile_path = os.path.join(simulation_dir, 
"twitter_profiles.csv") if not os.path.exists(profile_path): - log_info(f"错误: Profile文件不存在: {profile_path}") + log_info(f"Error: Profile file does not exist: {profile_path}") return result result.agent_graph = await generate_twitter_agent_graph( @@ -1141,9 +1141,9 @@ def log_info(msg): available_actions=TWITTER_ACTIONS, ) - # 从配置文件获取 Agent 真实名称映射(使用 entity_name 而非默认的 Agent_X) + # Get Agent real name mapping from config (using entity_name instead of default Agent_X) agent_names = get_agent_names_from_config(config) - # 如果配置中没有某个 agent,则使用 OASIS 的默认名称 + # If an agent is not in config, use OASIS's default name for agent_id, agent in result.agent_graph.get_agents(): if agent_id not in agent_names: agent_names[agent_id] = getattr(agent, 'name', f'Agent_{agent_id}') @@ -1156,23 +1156,23 @@ def log_info(msg): agent_graph=result.agent_graph, platform=oasis.DefaultPlatformType.TWITTER, database_path=db_path, - semaphore=30, # 限制最大并发 LLM 请求数,防止 API 过载 + semaphore=30, # Limit max concurrent LLM requests to prevent API overload ) await result.env.reset() - log_info("环境已启动") + log_info("Environment started") if action_logger: action_logger.log_simulation_start(config) total_actions = 0 - last_rowid = 0 # 跟踪数据库中最后处理的行号(使用 rowid 避免 created_at 格式差异) + last_rowid = 0 # Track last processed row in database (using rowid to avoid created_at format differences) - # 执行初始事件 + # Execute initial events event_config = config.get("event_config", {}) initial_posts = event_config.get("initial_posts", []) - # 记录 round 0 开始(初始事件阶段) + # Record round 0 start (initial events phase) if action_logger: action_logger.log_round_start(0, 0) # round 0, simulated_hour 0 @@ -1204,32 +1204,32 @@ def log_info(msg): if initial_actions: await result.env.step(initial_actions) - log_info(f"已发布 {len(initial_actions)} 条初始帖子") + log_info(f"Published {len(initial_actions)} initial posts") - # 记录 round 0 结束 + # Record round 0 end if action_logger: action_logger.log_round_end(0, initial_action_count) - # 主模拟循环 + # Main 
simulation loop time_config = config.get("time_config", {}) total_hours = time_config.get("total_simulation_hours", 72) minutes_per_round = time_config.get("minutes_per_round", 30) total_rounds = (total_hours * 60) // minutes_per_round - # 如果指定了最大轮数,则截断 + # If max rounds specified, truncate if max_rounds is not None and max_rounds > 0: original_rounds = total_rounds total_rounds = min(total_rounds, max_rounds) if total_rounds < original_rounds: - log_info(f"轮数已截断: {original_rounds} -> {total_rounds} (max_rounds={max_rounds})") + log_info(f"Rounds truncated: {original_rounds} -> {total_rounds} (max_rounds={max_rounds})") start_time = datetime.now() for round_num in range(total_rounds): - # 检查是否收到退出信号 + # Check for exit signal if _shutdown_event and _shutdown_event.is_set(): if main_logger: - main_logger.info(f"收到退出信号,在第 {round_num + 1} 轮停止模拟") + main_logger.info(f"Received exit signal, stopping simulation at round {round_num + 1} ") break simulated_minutes = round_num * minutes_per_round @@ -1240,12 +1240,12 @@ def log_info(msg): result.env, config, simulated_hour, round_num ) - # 无论是否有活跃agent,都记录round开始 + # Record round start regardless of active agents if action_logger: action_logger.log_round_start(round_num + 1, simulated_hour) if not active_agents: - # 没有活跃agent时也记录round结束(actions_count=0) + # Record round end even with no active agents (actions_count=0) if action_logger: action_logger.log_round_end(round_num + 1, 0) continue @@ -1253,7 +1253,7 @@ def log_info(msg): actions = {agent: LLMAction() for _, agent in active_agents} await result.env.step(actions) - # 从数据库获取实际执行的动作并记录 + # Get actually executed actions from database and record them actual_actions, last_rowid = fetch_new_actions_from_db( db_path, last_rowid, agent_names ) @@ -1278,14 +1278,14 @@ def log_info(msg): progress = (round_num + 1) / total_rounds * 100 log_info(f"Day {simulated_day}, {simulated_hour:02d}:00 - Round {round_num + 1}/{total_rounds} ({progress:.1f}%)") - # 注意:不关闭环境,保留给Interview使用 + 
# Note: do not close environment, keep it for Interview use if action_logger: action_logger.log_simulation_end(total_rounds, total_actions) result.total_actions = total_actions elapsed = (datetime.now() - start_time).total_seconds() - log_info(f"模拟循环完成! 耗时: {elapsed:.1f}秒, 总动作: {total_actions}") + log_info(f"Simulation loop complete! Time: {elapsed:.1f}s, Total actions: {total_actions}") return result @@ -1297,17 +1297,17 @@ async def run_reddit_simulation( main_logger: Optional[SimulationLogManager] = None, max_rounds: Optional[int] = None ) -> PlatformSimulation: - """运行Reddit模拟 + """Run Reddit simulation Args: - config: 模拟配置 - simulation_dir: 模拟目录 - action_logger: 动作日志记录器 - main_logger: 主日志管理器 - max_rounds: 最大模拟轮数(可选,用于截断过长的模拟) + config: Simulation config + simulation_dir: Simulation directory + action_logger: Action logger + main_logger: Main log manager + max_rounds: Maximum simulation rounds (optional, for truncating overly long simulations) Returns: - PlatformSimulation: 包含env和agent_graph的结果对象 + PlatformSimulation: Result object containing env and agent_graph """ result = PlatformSimulation() @@ -1316,14 +1316,14 @@ def log_info(msg): main_logger.info(f"[Reddit] {msg}") print(f"[Reddit] {msg}") - log_info("初始化...") + log_info("Initializing...") - # Reddit 使用加速 LLM 配置(如果有的话,否则回退到通用配置) + # Reddit uses boost LLM config (if available, otherwise falls back to general config) model = create_model(config, use_boost=True) profile_path = os.path.join(simulation_dir, "reddit_profiles.json") if not os.path.exists(profile_path): - log_info(f"错误: Profile文件不存在: {profile_path}") + log_info(f"Error: Profile file does not exist: {profile_path}") return result result.agent_graph = await generate_reddit_agent_graph( @@ -1332,9 +1332,9 @@ def log_info(msg): available_actions=REDDIT_ACTIONS, ) - # 从配置文件获取 Agent 真实名称映射(使用 entity_name 而非默认的 Agent_X) + # Get Agent real name mapping from config (using entity_name instead of default Agent_X) agent_names = 
get_agent_names_from_config(config) - # 如果配置中没有某个 agent,则使用 OASIS 的默认名称 + # If an agent is not in config, use OASIS's default name for agent_id, agent in result.agent_graph.get_agents(): if agent_id not in agent_names: agent_names[agent_id] = getattr(agent, 'name', f'Agent_{agent_id}') @@ -1347,23 +1347,23 @@ def log_info(msg): agent_graph=result.agent_graph, platform=oasis.DefaultPlatformType.REDDIT, database_path=db_path, - semaphore=30, # 限制最大并发 LLM 请求数,防止 API 过载 + semaphore=30, # Limit max concurrent LLM requests to prevent API overload ) await result.env.reset() - log_info("环境已启动") + log_info("Environment started") if action_logger: action_logger.log_simulation_start(config) total_actions = 0 - last_rowid = 0 # 跟踪数据库中最后处理的行号(使用 rowid 避免 created_at 格式差异) + last_rowid = 0 # Track last processed row in database (using rowid to avoid created_at format differences) - # 执行初始事件 + # Execute initial events event_config = config.get("event_config", {}) initial_posts = event_config.get("initial_posts", []) - # 记录 round 0 开始(初始事件阶段) + # Record round 0 start (initial events phase) if action_logger: action_logger.log_round_start(0, 0) # round 0, simulated_hour 0 @@ -1403,32 +1403,32 @@ def log_info(msg): if initial_actions: await result.env.step(initial_actions) - log_info(f"已发布 {len(initial_actions)} 条初始帖子") + log_info(f"Published {len(initial_actions)} initial posts") - # 记录 round 0 结束 + # Record round 0 end if action_logger: action_logger.log_round_end(0, initial_action_count) - # 主模拟循环 + # Main simulation loop time_config = config.get("time_config", {}) total_hours = time_config.get("total_simulation_hours", 72) minutes_per_round = time_config.get("minutes_per_round", 30) total_rounds = (total_hours * 60) // minutes_per_round - # 如果指定了最大轮数,则截断 + # If max rounds specified, truncate if max_rounds is not None and max_rounds > 0: original_rounds = total_rounds total_rounds = min(total_rounds, max_rounds) if total_rounds < original_rounds: - log_info(f"轮数已截断: 
{original_rounds} -> {total_rounds} (max_rounds={max_rounds})") + log_info(f"Rounds truncated: {original_rounds} -> {total_rounds} (max_rounds={max_rounds})") start_time = datetime.now() for round_num in range(total_rounds): - # 检查是否收到退出信号 + # Check for exit signal if _shutdown_event and _shutdown_event.is_set(): if main_logger: - main_logger.info(f"收到退出信号,在第 {round_num + 1} 轮停止模拟") + main_logger.info(f"Received exit signal, stopping simulation at round {round_num + 1} ") break simulated_minutes = round_num * minutes_per_round @@ -1439,12 +1439,12 @@ def log_info(msg): result.env, config, simulated_hour, round_num ) - # 无论是否有活跃agent,都记录round开始 + # Record round start regardless of active agents if action_logger: action_logger.log_round_start(round_num + 1, simulated_hour) if not active_agents: - # 没有活跃agent时也记录round结束(actions_count=0) + # Record round end even with no active agents (actions_count=0) if action_logger: action_logger.log_round_end(round_num + 1, 0) continue @@ -1452,7 +1452,7 @@ def log_info(msg): actions = {agent: LLMAction() for _, agent in active_agents} await result.env.step(actions) - # 从数据库获取实际执行的动作并记录 + # Get actually executed actions from database and record them actual_actions, last_rowid = fetch_new_actions_from_db( db_path, last_rowid, agent_names ) @@ -1477,76 +1477,76 @@ def log_info(msg): progress = (round_num + 1) / total_rounds * 100 log_info(f"Day {simulated_day}, {simulated_hour:02d}:00 - Round {round_num + 1}/{total_rounds} ({progress:.1f}%)") - # 注意:不关闭环境,保留给Interview使用 + # Note: do not close environment, keep it for Interview use if action_logger: action_logger.log_simulation_end(total_rounds, total_actions) result.total_actions = total_actions elapsed = (datetime.now() - start_time).total_seconds() - log_info(f"模拟循环完成! 耗时: {elapsed:.1f}秒, 总动作: {total_actions}") + log_info(f"Simulation loop complete! 
Time: {elapsed:.1f}s, Total actions: {total_actions}") return result async def main(): - parser = argparse.ArgumentParser(description='OASIS双平台并行模拟') + parser = argparse.ArgumentParser(description='OASIS dual-platform parallel simulation') parser.add_argument( '--config', type=str, required=True, - help='配置文件路径 (simulation_config.json)' + help='Config file path (simulation_config.json)' ) parser.add_argument( '--twitter-only', action='store_true', - help='只运行Twitter模拟' + help='Run Twitter simulation only' ) parser.add_argument( '--reddit-only', action='store_true', - help='只运行Reddit模拟' + help='Run Reddit simulation only' ) parser.add_argument( '--max-rounds', type=int, default=None, - help='最大模拟轮数(可选,用于截断过长的模拟)' + help='Maximum simulation rounds (optional, for truncating overly long simulations)' ) parser.add_argument( '--no-wait', action='store_true', default=False, - help='模拟完成后立即关闭环境,不进入等待命令模式' + help='Close environment immediately after simulation, do not enter command waiting mode' ) args = parser.parse_args() - # 在 main 函数开始时创建 shutdown 事件,确保整个程序都能响应退出信号 + # Create shutdown event at the start of main to ensure entire program can respond to exit signals global _shutdown_event _shutdown_event = asyncio.Event() if not os.path.exists(args.config): - print(f"错误: 配置文件不存在: {args.config}") + print(f"Error: Config file does not exist: {args.config}") sys.exit(1) config = load_config(args.config) simulation_dir = os.path.dirname(args.config) or "." 
wait_for_commands = not args.no_wait - # 初始化日志配置(禁用 OASIS 日志,清理旧文件) + # Initialize logging config (disable OASIS logs, clean up old files) init_logging_for_simulation(simulation_dir) - # 创建日志管理器 + # Create log manager log_manager = SimulationLogManager(simulation_dir) twitter_logger = log_manager.get_twitter_logger() reddit_logger = log_manager.get_reddit_logger() log_manager.info("=" * 60) - log_manager.info("OASIS 双平台并行模拟") - log_manager.info(f"配置文件: {args.config}") - log_manager.info(f"模拟ID: {config.get('simulation_id', 'unknown')}") - log_manager.info(f"等待命令模式: {'启用' if wait_for_commands else '禁用'}") + log_manager.info("OASIS Dual-Platform Parallel Simulation") + log_manager.info(f"Config file: {args.config}") + log_manager.info(f"Simulation ID: {config.get('simulation_id', 'unknown')}") + log_manager.info(f"Command waiting mode: {'Enabled' if wait_for_commands else 'Disabled'}") log_manager.info("=" * 60) time_config = config.get("time_config", {}) @@ -1554,25 +1554,25 @@ async def main(): minutes_per_round = time_config.get('minutes_per_round', 30) config_total_rounds = (total_hours * 60) // minutes_per_round - log_manager.info(f"模拟参数:") - log_manager.info(f" - 总模拟时长: {total_hours}小时") - log_manager.info(f" - 每轮时间: {minutes_per_round}分钟") - log_manager.info(f" - 配置总轮数: {config_total_rounds}") + log_manager.info(f"Simulation parameters:") + log_manager.info(f" - Total simulation duration: {total_hours}hours") + log_manager.info(f" - Time per round: {minutes_per_round}minutes") + log_manager.info(f" - Config total rounds: {config_total_rounds}") if args.max_rounds: - log_manager.info(f" - 最大轮数限制: {args.max_rounds}") + log_manager.info(f" - Max rounds limit: {args.max_rounds}") if args.max_rounds < config_total_rounds: - log_manager.info(f" - 实际执行轮数: {args.max_rounds} (已截断)") - log_manager.info(f" - Agent数量: {len(config.get('agent_configs', []))}") + log_manager.info(f" - Actual execution rounds: {args.max_rounds} (truncated)") + log_manager.info(f" - Agent 
count: {len(config.get('agent_configs', []))}") - log_manager.info("日志结构:") - log_manager.info(f" - 主日志: simulation.log") - log_manager.info(f" - Twitter动作: twitter/actions.jsonl") - log_manager.info(f" - Reddit动作: reddit/actions.jsonl") + log_manager.info("Log structure:") + log_manager.info(f" - Main log: simulation.log") + log_manager.info(f" - Twitter actions: twitter/actions.jsonl") + log_manager.info(f" - Reddit actions: reddit/actions.jsonl") log_manager.info("=" * 60) start_time = datetime.now() - # 存储两个平台的模拟结果 + # Store simulation results for both platforms twitter_result: Optional[PlatformSimulation] = None reddit_result: Optional[PlatformSimulation] = None @@ -1581,7 +1581,7 @@ async def main(): elif args.reddit_only: reddit_result = await run_reddit_simulation(config, simulation_dir, reddit_logger, log_manager, args.max_rounds) else: - # 并行运行(每个平台使用独立的日志记录器) + # Run in parallel (each platform uses independent logger) results = await asyncio.gather( run_twitter_simulation(config, simulation_dir, twitter_logger, log_manager, args.max_rounds), run_reddit_simulation(config, simulation_dir, reddit_logger, log_manager, args.max_rounds), @@ -1590,17 +1590,17 @@ async def main(): total_elapsed = (datetime.now() - start_time).total_seconds() log_manager.info("=" * 60) - log_manager.info(f"模拟循环完成! 总耗时: {total_elapsed:.1f}秒") + log_manager.info(f"Simulation loop complete! 
Total time: {total_elapsed:.1f}s") - # 是否进入等待命令模式 + # Whether to enter command waiting mode if wait_for_commands: log_manager.info("") log_manager.info("=" * 60) - log_manager.info("进入等待命令模式 - 环境保持运行") - log_manager.info("支持的命令: interview, batch_interview, close_env") + log_manager.info("Entering command waiting mode - environment stays running") + log_manager.info("Supported commands: interview, batch_interview, close_env") log_manager.info("=" * 60) - # 创建IPC处理器 + # Create IPC handler ipc_handler = ParallelIPCHandler( simulation_dir=simulation_dir, twitter_env=twitter_result.env if twitter_result else None, @@ -1610,40 +1610,40 @@ async def main(): ) ipc_handler.update_status("alive") - # 等待命令循环(使用全局 _shutdown_event) + # Command waiting loop (using global _shutdown_event) try: while not _shutdown_event.is_set(): should_continue = await ipc_handler.process_commands() if not should_continue: break - # 使用 wait_for 替代 sleep,这样可以响应 shutdown_event + # Use wait_for instead of sleep to respond to shutdown_event try: await asyncio.wait_for(_shutdown_event.wait(), timeout=0.5) - break # 收到退出信号 + break # Exit signal received except asyncio.TimeoutError: - pass # 超时继续循环 + pass # Timeout, continue loop except KeyboardInterrupt: - print("\n收到中断信号") + print("\nInterrupt signal received") except asyncio.CancelledError: - print("\n任务被取消") + print("\nTask cancelled") except Exception as e: - print(f"\n命令处理出错: {e}") + print(f"\nCommand processing error: {e}") - log_manager.info("\n关闭环境...") + log_manager.info("\nClosing environment...") ipc_handler.update_status("stopped") - # 关闭环境 + # Close environment if twitter_result and twitter_result.env: await twitter_result.env.close() - log_manager.info("[Twitter] 环境已关闭") + log_manager.info("[Twitter] Environment closed") if reddit_result and reddit_result.env: await reddit_result.env.close() - log_manager.info("[Reddit] 环境已关闭") + log_manager.info("[Reddit] Environment closed") log_manager.info("=" * 60) - log_manager.info(f"全部完成!") - 
log_manager.info(f"日志文件:") + log_manager.info(f"All done!") + log_manager.info(f"Log files:") log_manager.info(f" - {os.path.join(simulation_dir, 'simulation.log')}") log_manager.info(f" - {os.path.join(simulation_dir, 'twitter', 'actions.jsonl')}") log_manager.info(f" - {os.path.join(simulation_dir, 'reddit', 'actions.jsonl')}") @@ -1652,29 +1652,29 @@ async def main(): def setup_signal_handlers(loop=None): """ - 设置信号处理器,确保收到 SIGTERM/SIGINT 时能够正确退出 + Set up signal handlers to ensure proper exit on SIGTERM/SIGINT - 持久化模拟场景:模拟完成后不退出,等待 interview 命令 - 当收到终止信号时,需要: - 1. 通知 asyncio 循环退出等待 - 2. 让程序有机会正常清理资源(关闭数据库、环境等) - 3. 然后才退出 + Persistent simulation scenario: do not exit after simulation, wait for interview commands + When receiving termination signal, need to: + 1. Notify asyncio loop to exit waiting + 2. Give program a chance to clean up resources (close database, environment, etc.) + 3. Then exit """ def signal_handler(signum, frame): global _cleanup_done sig_name = "SIGTERM" if signum == signal.SIGTERM else "SIGINT" - print(f"\n收到 {sig_name} 信号,正在退出...") + print(f"\nReceived {sig_name} signal, exiting...") if not _cleanup_done: _cleanup_done = True - # 设置事件通知 asyncio 循环退出(让循环有机会清理资源) + # Set event to notify asyncio loop to exit (give loop a chance to clean up resources) if _shutdown_event: _shutdown_event.set() - # 不要直接 sys.exit(),让 asyncio 循环正常退出并清理资源 - # 如果是重复收到信号,才强制退出 + # Do not call sys.exit() directly, let asyncio loop exit normally and clean up resources + # Force exit only on repeated signal else: - print("强制退出...") + print("Force exiting...") sys.exit(1) signal.signal(signal.SIGTERM, signal_handler) @@ -1686,14 +1686,14 @@ def signal_handler(signum, frame): try: asyncio.run(main()) except KeyboardInterrupt: - print("\n程序被中断") + print("\nProgram interrupted") except SystemExit: pass finally: - # 清理 multiprocessing 资源跟踪器(防止退出时的警告) + # Clean up multiprocessing resource tracker (prevent warnings on exit) try: from multiprocessing import resource_tracker 
resource_tracker._resource_tracker._stop() except Exception: pass - print("模拟进程已退出") + print("Simulation process has exited") diff --git a/backend/scripts/run_reddit_simulation.py b/backend/scripts/run_reddit_simulation.py index 14907cb..7602417 100644 --- a/backend/scripts/run_reddit_simulation.py +++ b/backend/scripts/run_reddit_simulation.py @@ -1,16 +1,16 @@ """ -OASIS Reddit模拟预设脚本 -此脚本读取配置文件中的参数来执行模拟,实现全程自动化 +OASIS Reddit simulation preset script +This script reads parameters from config files to execute simulation, fully automated -功能特性: -- 完成模拟后不立即关闭环境,进入等待命令模式 -- 支持通过IPC接收Interview命令 -- 支持单个Agent采访和批量采访 -- 支持远程关闭环境命令 +Features: +- After simulation completes, does not close environment immediately, enters command waiting mode +- Supports receiving Interview commands via IPC +- Supports single Agent interview and batch interview +- Supports remote close environment command -使用方式: +Usage: python run_reddit_simulation.py --config /path/to/simulation_config.json - python run_reddit_simulation.py --config /path/to/simulation_config.json --no-wait # 完成后立即关闭 + python run_reddit_simulation.py --config /path/to/simulation_config.json --no-wait # Close immediately after completion """ import argparse @@ -25,18 +25,18 @@ from datetime import datetime from typing import Dict, Any, List, Optional -# 全局变量:用于信号处理 +# Global variables: for signal handling _shutdown_event = None _cleanup_done = False -# 添加项目路径 +# Add project paths _scripts_dir = os.path.dirname(os.path.abspath(__file__)) _backend_dir = os.path.abspath(os.path.join(_scripts_dir, '..')) _project_root = os.path.abspath(os.path.join(_backend_dir, '..')) sys.path.insert(0, _scripts_dir) sys.path.insert(0, _backend_dir) -# 加载项目根目录的 .env 文件(包含 LLM_API_KEY 等配置) +# Load .env file from project root (contains LLM_API_KEY and other configs) from dotenv import load_dotenv _env_file = os.path.join(_project_root, '.env') if os.path.exists(_env_file): @@ -51,7 +51,7 @@ class UnicodeFormatter(logging.Formatter): - 
"""自定义格式化器,将 Unicode 转义序列转换为可读字符""" + """Custom formatter that converts Unicode escape sequences to readable characters""" UNICODE_ESCAPE_PATTERN = re.compile(r'\\u([0-9a-fA-F]{4})') @@ -68,24 +68,24 @@ def replace_unicode(match): class MaxTokensWarningFilter(logging.Filter): - """过滤掉 camel-ai 关于 max_tokens 的警告(我们故意不设置 max_tokens,让模型自行决定)""" + """Filter out camel-ai warnings about max_tokens (we intentionally do not set max_tokens, letting the model decide)""" def filter(self, record): - # 过滤掉包含 max_tokens 警告的日志 + # Filter out logs containing max_tokens warnings if "max_tokens" in record.getMessage() and "Invalid or missing" in record.getMessage(): return False return True -# 在模块加载时立即添加过滤器,确保在 camel 代码执行前生效 +# Add filter immediately at module load time to ensure it takes effect before camel code executes logging.getLogger().addFilter(MaxTokensWarningFilter()) def setup_oasis_logging(log_dir: str): - """配置 OASIS 的日志,使用固定名称的日志文件""" + """Configure OASIS logging with fixed-name log files""" os.makedirs(log_dir, exist_ok=True) - # 清理旧的日志文件 + # Clean up old log files for f in os.listdir(log_dir): old_log = os.path.join(log_dir, f) if os.path.isfile(old_log) and f.endswith('.log'): @@ -126,25 +126,25 @@ def setup_oasis_logging(log_dir: str): generate_reddit_agent_graph ) except ImportError as e: - print(f"错误: 缺少依赖 {e}") - print("请先安装: pip install oasis-ai camel-ai") + print(f"Error: Missing dependency {e}") + print("Please install first: pip install oasis-ai camel-ai") sys.exit(1) -# IPC相关常量 +# IPC-related constants IPC_COMMANDS_DIR = "ipc_commands" IPC_RESPONSES_DIR = "ipc_responses" ENV_STATUS_FILE = "env_status.json" class CommandType: - """命令类型常量""" + """Command type constants""" INTERVIEW = "interview" BATCH_INTERVIEW = "batch_interview" CLOSE_ENV = "close_env" class IPCHandler: - """IPC命令处理器""" + """IPC command handler""" def __init__(self, simulation_dir: str, env, agent_graph): self.simulation_dir = simulation_dir @@ -155,12 +155,12 @@ def __init__(self, 
simulation_dir: str, env, agent_graph): self.status_file = os.path.join(simulation_dir, ENV_STATUS_FILE) self._running = True - # 确保目录存在 + # Ensure directories exist os.makedirs(self.commands_dir, exist_ok=True) os.makedirs(self.responses_dir, exist_ok=True) def update_status(self, status: str): - """更新环境状态""" + """Update environment status""" with open(self.status_file, 'w', encoding='utf-8') as f: json.dump({ "status": status, @@ -168,11 +168,11 @@ def update_status(self, status: str): }, f, ensure_ascii=False, indent=2) def poll_command(self) -> Optional[Dict[str, Any]]: - """轮询获取待处理命令""" + """Poll for pending commands""" if not os.path.exists(self.commands_dir): return None - # 获取命令文件(按时间排序) + # Get command files (sorted by time) command_files = [] for filename in os.listdir(self.commands_dir): if filename.endswith('.json'): @@ -191,7 +191,7 @@ def poll_command(self) -> Optional[Dict[str, Any]]: return None def send_response(self, command_id: str, status: str, result: Dict = None, error: str = None): - """发送响应""" + """Send response""" response = { "command_id": command_id, "status": status, @@ -204,7 +204,7 @@ def send_response(self, command_id: str, status: str, result: Dict = None, error with open(response_file, 'w', encoding='utf-8') as f: json.dump(response, f, ensure_ascii=False, indent=2) - # 删除命令文件 + # Delete command file command_file = os.path.join(self.commands_dir, f"{command_id}.json") try: os.remove(command_file) @@ -213,49 +213,49 @@ def send_response(self, command_id: str, status: str, result: Dict = None, error async def handle_interview(self, command_id: str, agent_id: int, prompt: str) -> bool: """ - 处理单个Agent采访命令 + Handle single Agent interview command Returns: - True 表示成功,False 表示失败 + True indicates success, False indicates failure """ try: - # 获取Agent + # Get Agent agent = self.agent_graph.get_agent(agent_id) - # 创建Interview动作 + # Create Interview action interview_action = ManualAction( action_type=ActionType.INTERVIEW, 
action_args={"prompt": prompt} ) - # 执行Interview + # Execute Interview actions = {agent: interview_action} await self.env.step(actions) - # 从数据库获取结果 + # Get results from database result = self._get_interview_result(agent_id) self.send_response(command_id, "completed", result=result) - print(f" Interview完成: agent_id={agent_id}") + print(f" Interview completed: agent_id={agent_id}") return True except Exception as e: error_msg = str(e) - print(f" Interview失败: agent_id={agent_id}, error={error_msg}") + print(f" Interview failed: agent_id={agent_id}, error={error_msg}") self.send_response(command_id, "failed", error=error_msg) return False async def handle_batch_interview(self, command_id: str, interviews: List[Dict]) -> bool: """ - 处理批量采访命令 + Handle batch interview command Args: interviews: [{"agent_id": int, "prompt": str}, ...] """ try: - # 构建动作字典 + # Build action dictionary actions = {} - agent_prompts = {} # 记录每个agent的prompt + agent_prompts = {} # Record each agent prompt for interview in interviews: agent_id = interview.get("agent_id") @@ -269,16 +269,16 @@ async def handle_batch_interview(self, command_id: str, interviews: List[Dict]) ) agent_prompts[agent_id] = prompt except Exception as e: - print(f" 警告: 无法获取Agent {agent_id}: {e}") + print(f" Warning: Unable to get Agent {agent_id}: {e}") if not actions: - self.send_response(command_id, "failed", error="没有有效的Agent") + self.send_response(command_id, "failed", error="No valid Agents") return False - # 执行批量Interview + # Execute batch Interview await self.env.step(actions) - # 获取所有结果 + # Get all results results = {} for agent_id in agent_prompts.keys(): result = self._get_interview_result(agent_id) @@ -288,17 +288,17 @@ async def handle_batch_interview(self, command_id: str, interviews: List[Dict]) "interviews_count": len(results), "results": results }) - print(f" 批量Interview完成: {len(results)} 个Agent") + print(f" Batch Interview completed: {len(results)} Agents") return True except Exception as e: error_msg = 
str(e) - print(f" 批量Interview失败: {error_msg}") + print(f" Batch Interview failed: {error_msg}") self.send_response(command_id, "failed", error=error_msg) return False def _get_interview_result(self, agent_id: int) -> Dict[str, Any]: - """从数据库获取最新的Interview结果""" + """Get latest Interview result from database""" db_path = os.path.join(self.simulation_dir, "reddit_simulation.db") result = { @@ -314,7 +314,7 @@ def _get_interview_result(self, agent_id: int) -> Dict[str, Any]: conn = sqlite3.connect(db_path) cursor = conn.cursor() - # 查询最新的Interview记录 + # Query latest Interview record cursor.execute(""" SELECT user_id, info, created_at FROM trace @@ -336,16 +336,16 @@ def _get_interview_result(self, agent_id: int) -> Dict[str, Any]: conn.close() except Exception as e: - print(f" 读取Interview结果失败: {e}") + print(f" Failed to read Interview result: {e}") return result async def process_commands(self) -> bool: """ - 处理所有待处理命令 + Process all pending commands Returns: - True 表示继续运行,False 表示应该退出 + True means continue running, False means should exit """ command = self.poll_command() if not command: @@ -355,7 +355,7 @@ async def process_commands(self) -> bool: command_type = command.get("command_type") args = command.get("args", {}) - print(f"\n收到IPC命令: {command_type}, id={command_id}") + print(f"\nReceived IPC command: {command_type}, id={command_id}") if command_type == CommandType.INTERVIEW: await self.handle_interview( @@ -373,19 +373,19 @@ async def process_commands(self) -> bool: return True elif command_type == CommandType.CLOSE_ENV: - print("收到关闭环境命令") - self.send_response(command_id, "completed", result={"message": "环境即将关闭"}) + print("Received close environment command") + self.send_response(command_id, "completed", result={"message": "Environment is about to close"}) return False else: - self.send_response(command_id, "failed", error=f"未知命令类型: {command_type}") + self.send_response(command_id, "failed", error=f"Unknown command type: {command_type}") return True class 
RedditSimulationRunner: - """Reddit模拟运行器""" + """Reddit simulation runner""" - # Reddit可用动作(不包含INTERVIEW,INTERVIEW只能通过ManualAction手动触发) + # Reddit available actions (excludes INTERVIEW, INTERVIEW can only be triggered manually via ManualAction) AVAILABLE_ACTIONS = [ ActionType.LIKE_POST, ActionType.DISLIKE_POST, @@ -404,11 +404,11 @@ class RedditSimulationRunner: def __init__(self, config_path: str, wait_for_commands: bool = True): """ - 初始化模拟运行器 + Initialize simulation runner Args: - config_path: 配置文件路径 (simulation_config.json) - wait_for_commands: 模拟完成后是否等待命令(默认True) + config_path: Config file path (simulation_config.json) + wait_for_commands: Whether to wait for commands after simulation completes (default True) """ self.config_path = config_path self.config = self._load_config() @@ -419,47 +419,47 @@ def __init__(self, config_path: str, wait_for_commands: bool = True): self.ipc_handler = None def _load_config(self) -> Dict[str, Any]: - """加载配置文件""" + """Load config file""" with open(self.config_path, 'r', encoding='utf-8') as f: return json.load(f) def _get_profile_path(self) -> str: - """获取Profile文件路径""" + """Get profile file path""" return os.path.join(self.simulation_dir, "reddit_profiles.json") def _get_db_path(self) -> str: - """获取数据库路径""" + """Get database path""" return os.path.join(self.simulation_dir, "reddit_simulation.db") def _create_model(self): """ - 创建LLM模型 + Create LLM model - 统一使用项目根目录 .env 文件中的配置(优先级最高): - - LLM_API_KEY: API密钥 - - LLM_BASE_URL: API基础URL - - LLM_MODEL_NAME: 模型名称 + Unified use of config from project root .env file (highest priority): + - LLM_API_KEY: API key + - LLM_BASE_URL: API base URL + - LLM_MODEL_NAME: Model name """ - # 优先从 .env 读取配置 + # Read config from .env first llm_api_key = os.environ.get("LLM_API_KEY", "") llm_base_url = os.environ.get("LLM_BASE_URL", "") llm_model = os.environ.get("LLM_MODEL_NAME", "") - # 如果 .env 中没有,则使用 config 作为备用 + # If not in .env, use config as fallback if not llm_model: llm_model = 
self.config.get("llm_model", "gpt-4o-mini") - # 设置 camel-ai 所需的环境变量 + # Set environment variables required by camel-ai if llm_api_key: os.environ["OPENAI_API_KEY"] = llm_api_key if not os.environ.get("OPENAI_API_KEY"): - raise ValueError("缺少 API Key 配置,请在项目根目录 .env 文件中设置 LLM_API_KEY") + raise ValueError("Missing API Key configuration. Please set LLM_API_KEY in the project root .env file") if llm_base_url: os.environ["OPENAI_API_BASE_URL"] = llm_base_url - print(f"LLM配置: model={llm_model}, base_url={llm_base_url[:40] if llm_base_url else '默认'}...") + print(f"LLM config: model={llm_model}, base_url={llm_base_url[:40] if llm_base_url else 'default'}...") return ModelFactory.create( model_platform=ModelPlatformType.OPENAI, @@ -473,7 +473,7 @@ def _get_active_agents_for_round( round_num: int ) -> List: """ - 根据时间和配置决定本轮激活哪些Agent + Determine which Agents to activate this round based on time and config """ time_config = self.config.get("time_config", {}) agent_configs = self.config.get("agent_configs", []) @@ -521,16 +521,16 @@ def _get_active_agents_for_round( return active_agents async def run(self, max_rounds: int = None): - """运行Reddit模拟 + """Run Reddit simulation Args: - max_rounds: 最大模拟轮数(可选,用于截断过长的模拟) + max_rounds: Maximum simulation rounds (optional, for truncating overly long simulations) """ print("=" * 60) - print("OASIS Reddit模拟") - print(f"配置文件: {self.config_path}") - print(f"模拟ID: {self.config.get('simulation_id', 'unknown')}") - print(f"等待命令模式: {'启用' if self.wait_for_commands else '禁用'}") + print("OASIS Reddit Simulation") + print(f"Config file: {self.config_path}") + print(f"Simulation ID: {self.config.get('simulation_id', 'unknown')}") + print(f"Command waiting mode: {'Enabled' if self.wait_for_commands else 'Disabled'}") print("=" * 60) time_config = self.config.get("time_config", {}) @@ -538,28 +538,28 @@ async def run(self, max_rounds: int = None): minutes_per_round = time_config.get("minutes_per_round", 30) total_rounds = (total_hours * 60) // 
minutes_per_round - # 如果指定了最大轮数,则截断 + # If max rounds specified, truncate if max_rounds is not None and max_rounds > 0: original_rounds = total_rounds total_rounds = min(total_rounds, max_rounds) if total_rounds < original_rounds: - print(f"\n轮数已截断: {original_rounds} -> {total_rounds} (max_rounds={max_rounds})") + print(f"\nRounds truncated: {original_rounds} -> {total_rounds} (max_rounds={max_rounds})") - print(f"\n模拟参数:") - print(f" - 总模拟时长: {total_hours}小时") - print(f" - 每轮时间: {minutes_per_round}分钟") - print(f" - 总轮数: {total_rounds}") + print(f"\nSimulation parameters:") + print(f" - Total simulation duration: {total_hours}hours") + print(f" - Time per round: {minutes_per_round}minutes") + print(f" - Total rounds: {total_rounds}") if max_rounds: - print(f" - 最大轮数限制: {max_rounds}") - print(f" - Agent数量: {len(self.config.get('agent_configs', []))}") + print(f" - Max rounds limit: {max_rounds}") + print(f" - Agent count: {len(self.config.get('agent_configs', []))}") - print("\n初始化LLM模型...") + print("\nInitializing LLM model...") model = self._create_model() - print("加载Agent Profile...") + print("Loading Agent Profile...") profile_path = self._get_profile_path() if not os.path.exists(profile_path): - print(f"错误: Profile文件不存在: {profile_path}") + print(f"Error: Profile file does not exist: {profile_path}") return self.agent_graph = await generate_reddit_agent_graph( @@ -571,29 +571,29 @@ async def run(self, max_rounds: int = None): db_path = self._get_db_path() if os.path.exists(db_path): os.remove(db_path) - print(f"已删除旧数据库: {db_path}") + print(f"Deleted old database: {db_path}") - print("创建OASIS环境...") + print("Creating OASIS environment...") self.env = oasis.make( agent_graph=self.agent_graph, platform=oasis.DefaultPlatformType.REDDIT, database_path=db_path, - semaphore=30, # 限制最大并发 LLM 请求数,防止 API 过载 + semaphore=30, # Limit max concurrent LLM requests to prevent API overload ) await self.env.reset() - print("环境初始化完成\n") + print("Environment initialization 
complete\n") - # 初始化IPC处理器 + # Initialize IPC handler self.ipc_handler = IPCHandler(self.simulation_dir, self.env, self.agent_graph) self.ipc_handler.update_status("running") - # 执行初始事件 + # Execute initial events event_config = self.config.get("event_config", {}) initial_posts = event_config.get("initial_posts", []) if initial_posts: - print(f"执行初始事件 ({len(initial_posts)}条初始帖子)...") + print(f"Executing initial events ({len(initial_posts)} initial posts)...") initial_actions = {} for post in initial_posts: agent_id = post.get("poster_agent_id", 0) @@ -613,14 +613,14 @@ async def run(self, max_rounds: int = None): action_args={"content": content} ) except Exception as e: - print(f" 警告: 无法为Agent {agent_id}创建初始帖子: {e}") + print(f" Warning: Unable to create initial post for Agent {agent_id} {e}") if initial_actions: await self.env.step(initial_actions) - print(f" 已发布 {len(initial_actions)} 条初始帖子") + print(f" Published {len(initial_actions)} initial posts") - # 主模拟循环 - print("\n开始模拟循环...") + # Main simulation loop + print("\nStarting simulation loop...") start_time = datetime.now() for round_num in range(total_rounds): @@ -651,20 +651,20 @@ async def run(self, max_rounds: int = None): f"- elapsed: {elapsed:.1f}s") total_elapsed = (datetime.now() - start_time).total_seconds() - print(f"\n模拟循环完成!") - print(f" - 总耗时: {total_elapsed:.1f}秒") - print(f" - 数据库: {db_path}") + print(f"\nSimulation loop complete!") + print(f" - Total time: {total_elapsed:.1f}s") + print(f" - Database: {db_path}") - # 是否进入等待命令模式 + # Whether to enter command waiting mode if self.wait_for_commands: print("\n" + "=" * 60) - print("进入等待命令模式 - 环境保持运行") - print("支持的命令: interview, batch_interview, close_env") + print("Entering command waiting mode - environment stays running") + print("Supported commands: interview, batch_interview, close_env") print("=" * 60) self.ipc_handler.update_status("alive") - # 等待命令循环(使用全局 _shutdown_event) + # Command waiting loop (using global _shutdown_event) try: while not 
_shutdown_event.is_set(): should_continue = await self.ipc_handler.process_commands() @@ -672,58 +672,58 @@ async def run(self, max_rounds: int = None): break try: await asyncio.wait_for(_shutdown_event.wait(), timeout=0.5) - break # 收到退出信号 + break # Exit signal received except asyncio.TimeoutError: pass except KeyboardInterrupt: - print("\n收到中断信号") + print("\nInterrupt signal received") except asyncio.CancelledError: - print("\n任务被取消") + print("\nTask cancelled") except Exception as e: - print(f"\n命令处理出错: {e}") + print(f"\nCommand processing error: {e}") - print("\n关闭环境...") + print("\nClosing environment...") - # 关闭环境 + # Close environment self.ipc_handler.update_status("stopped") await self.env.close() - print("环境已关闭") + print("Environment closed") print("=" * 60) async def main(): - parser = argparse.ArgumentParser(description='OASIS Reddit模拟') + parser = argparse.ArgumentParser(description='OASIS Reddit Simulation') parser.add_argument( '--config', type=str, required=True, - help='配置文件路径 (simulation_config.json)' + help='Config file path (simulation_config.json)' ) parser.add_argument( '--max-rounds', type=int, default=None, - help='最大模拟轮数(可选,用于截断过长的模拟)' + help='Maximum simulation rounds (optional, for truncating overly long simulations)' ) parser.add_argument( '--no-wait', action='store_true', default=False, - help='模拟完成后立即关闭环境,不进入等待命令模式' + help='Close environment immediately after simulation, do not enter command waiting mode' ) args = parser.parse_args() - # 在 main 函数开始时创建 shutdown 事件 + # Create shutdown event at the start of main function global _shutdown_event _shutdown_event = asyncio.Event() if not os.path.exists(args.config): - print(f"错误: 配置文件不存在: {args.config}") + print(f"Error: Config file does not exist: {args.config}") sys.exit(1) - # 初始化日志配置(使用固定文件名,清理旧日志) + # Initialize logging config (using fixed filenames, clean up old logs) simulation_dir = os.path.dirname(args.config) or "." 
setup_oasis_logging(os.path.join(simulation_dir, "log")) @@ -736,20 +736,20 @@ async def main(): def setup_signal_handlers(): """ - 设置信号处理器,确保收到 SIGTERM/SIGINT 时能够正确退出 - 让程序有机会正常清理资源(关闭数据库、环境等) + Set up signal handlers to ensure proper exit on SIGTERM/SIGINT + Give the program a chance to clean up resources (close database, environment, etc.) """ def signal_handler(signum, frame): global _cleanup_done sig_name = "SIGTERM" if signum == signal.SIGTERM else "SIGINT" - print(f"\n收到 {sig_name} 信号,正在退出...") + print(f"\nReceived {sig_name} signal, exiting...") if not _cleanup_done: _cleanup_done = True if _shutdown_event: _shutdown_event.set() else: - # 重复收到信号才强制退出 - print("强制退出...") + # Force exit only on repeated signal + print("Force exiting...") sys.exit(1) signal.signal(signal.SIGTERM, signal_handler) @@ -761,9 +761,9 @@ def signal_handler(signum, frame): try: asyncio.run(main()) except KeyboardInterrupt: - print("\n程序被中断") + print("\nProgram interrupted") except SystemExit: pass finally: - print("模拟进程已退出") + print("Simulation process has exited") diff --git a/backend/scripts/run_twitter_simulation.py b/backend/scripts/run_twitter_simulation.py index caab9e9..31adda8 100644 --- a/backend/scripts/run_twitter_simulation.py +++ b/backend/scripts/run_twitter_simulation.py @@ -1,16 +1,16 @@ """ -OASIS Twitter模拟预设脚本 -此脚本读取配置文件中的参数来执行模拟,实现全程自动化 +OASIS Twitter simulation preset script +This script reads parameters from config files to execute simulation, fully automated -功能特性: -- 完成模拟后不立即关闭环境,进入等待命令模式 -- 支持通过IPC接收Interview命令 -- 支持单个Agent采访和批量采访 -- 支持远程关闭环境命令 +Features: +- After simulation completes, does not close environment immediately, enters command waiting mode +- Supports receiving Interview commands via IPC +- Supports single Agent interview and batch interview +- Supports remote close environment command -使用方式: +Usage: python run_twitter_simulation.py --config /path/to/simulation_config.json - python run_twitter_simulation.py --config /path/to/simulation_config.json 
--no-wait # 完成后立即关闭 + python run_twitter_simulation.py --config /path/to/simulation_config.json --no-wait # Close immediately after completion """ import argparse @@ -25,18 +25,18 @@ from datetime import datetime from typing import Dict, Any, List, Optional -# 全局变量:用于信号处理 +# Global variables: for signal handling _shutdown_event = None _cleanup_done = False -# 添加项目路径 +# Add project paths _scripts_dir = os.path.dirname(os.path.abspath(__file__)) _backend_dir = os.path.abspath(os.path.join(_scripts_dir, '..')) _project_root = os.path.abspath(os.path.join(_backend_dir, '..')) sys.path.insert(0, _scripts_dir) sys.path.insert(0, _backend_dir) -# 加载项目根目录的 .env 文件(包含 LLM_API_KEY 等配置) +# Load .env file from project root (contains LLM_API_KEY and other configs) from dotenv import load_dotenv _env_file = os.path.join(_project_root, '.env') if os.path.exists(_env_file): @@ -51,7 +51,7 @@ class UnicodeFormatter(logging.Formatter): - """自定义格式化器,将 Unicode 转义序列转换为可读字符""" + """Custom formatter that converts Unicode escape sequences to readable characters""" UNICODE_ESCAPE_PATTERN = re.compile(r'\\u([0-9a-fA-F]{4})') @@ -68,24 +68,24 @@ def replace_unicode(match): class MaxTokensWarningFilter(logging.Filter): - """过滤掉 camel-ai 关于 max_tokens 的警告(我们故意不设置 max_tokens,让模型自行决定)""" + """Filter out camel-ai warnings about max_tokens (we intentionally do not set max_tokens, letting the model decide)""" def filter(self, record): - # 过滤掉包含 max_tokens 警告的日志 + # Filter out logs containing max_tokens warnings if "max_tokens" in record.getMessage() and "Invalid or missing" in record.getMessage(): return False return True -# 在模块加载时立即添加过滤器,确保在 camel 代码执行前生效 +# Add filter immediately at module load time to ensure it takes effect before camel code executes logging.getLogger().addFilter(MaxTokensWarningFilter()) def setup_oasis_logging(log_dir: str): - """配置 OASIS 的日志,使用固定名称的日志文件""" + """Configure OASIS logging with fixed-name log files""" os.makedirs(log_dir, exist_ok=True) - # 清理旧的日志文件 + # Clean up 
old log files for f in os.listdir(log_dir): old_log = os.path.join(log_dir, f) if os.path.isfile(old_log) and f.endswith('.log'): @@ -126,25 +126,25 @@ def setup_oasis_logging(log_dir: str): generate_twitter_agent_graph ) except ImportError as e: - print(f"错误: 缺少依赖 {e}") - print("请先安装: pip install oasis-ai camel-ai") + print(f"Error: Missing dependency {e}") + print("Please install first: pip install oasis-ai camel-ai") sys.exit(1) -# IPC相关常量 +# IPC-related constants IPC_COMMANDS_DIR = "ipc_commands" IPC_RESPONSES_DIR = "ipc_responses" ENV_STATUS_FILE = "env_status.json" class CommandType: - """命令类型常量""" + """Command type constants""" INTERVIEW = "interview" BATCH_INTERVIEW = "batch_interview" CLOSE_ENV = "close_env" class IPCHandler: - """IPC命令处理器""" + """IPC command handler""" def __init__(self, simulation_dir: str, env, agent_graph): self.simulation_dir = simulation_dir @@ -155,12 +155,12 @@ def __init__(self, simulation_dir: str, env, agent_graph): self.status_file = os.path.join(simulation_dir, ENV_STATUS_FILE) self._running = True - # 确保目录存在 + # Ensure directories exist os.makedirs(self.commands_dir, exist_ok=True) os.makedirs(self.responses_dir, exist_ok=True) def update_status(self, status: str): - """更新环境状态""" + """Update environment status""" with open(self.status_file, 'w', encoding='utf-8') as f: json.dump({ "status": status, @@ -168,11 +168,11 @@ def update_status(self, status: str): }, f, ensure_ascii=False, indent=2) def poll_command(self) -> Optional[Dict[str, Any]]: - """轮询获取待处理命令""" + """Poll for pending commands""" if not os.path.exists(self.commands_dir): return None - # 获取命令文件(按时间排序) + # Get command files (sorted by time) command_files = [] for filename in os.listdir(self.commands_dir): if filename.endswith('.json'): @@ -191,7 +191,7 @@ def poll_command(self) -> Optional[Dict[str, Any]]: return None def send_response(self, command_id: str, status: str, result: Dict = None, error: str = None): - """发送响应""" + """Send response""" response = { 
"command_id": command_id, "status": status, @@ -204,7 +204,7 @@ def send_response(self, command_id: str, status: str, result: Dict = None, error with open(response_file, 'w', encoding='utf-8') as f: json.dump(response, f, ensure_ascii=False, indent=2) - # 删除命令文件 + # Delete command file command_file = os.path.join(self.commands_dir, f"{command_id}.json") try: os.remove(command_file) @@ -213,49 +213,49 @@ def send_response(self, command_id: str, status: str, result: Dict = None, error async def handle_interview(self, command_id: str, agent_id: int, prompt: str) -> bool: """ - 处理单个Agent采访命令 + Handle single Agent interview command Returns: - True 表示成功,False 表示失败 + True indicates success, False indicates failure """ try: - # 获取Agent + # Get Agent agent = self.agent_graph.get_agent(agent_id) - # 创建Interview动作 + # Create Interview action interview_action = ManualAction( action_type=ActionType.INTERVIEW, action_args={"prompt": prompt} ) - # 执行Interview + # Execute Interview actions = {agent: interview_action} await self.env.step(actions) - # 从数据库获取结果 + # Get results from database result = self._get_interview_result(agent_id) self.send_response(command_id, "completed", result=result) - print(f" Interview完成: agent_id={agent_id}") + print(f" Interview completed: agent_id={agent_id}") return True except Exception as e: error_msg = str(e) - print(f" Interview失败: agent_id={agent_id}, error={error_msg}") + print(f" Interview failed: agent_id={agent_id}, error={error_msg}") self.send_response(command_id, "failed", error=error_msg) return False async def handle_batch_interview(self, command_id: str, interviews: List[Dict]) -> bool: """ - 处理批量采访命令 + Handle batch interview command Args: interviews: [{"agent_id": int, "prompt": str}, ...] 
""" try: - # 构建动作字典 + # Build action dictionary actions = {} - agent_prompts = {} # 记录每个agent的prompt + agent_prompts = {} # Record each agent prompt for interview in interviews: agent_id = interview.get("agent_id") @@ -269,16 +269,16 @@ async def handle_batch_interview(self, command_id: str, interviews: List[Dict]) ) agent_prompts[agent_id] = prompt except Exception as e: - print(f" 警告: 无法获取Agent {agent_id}: {e}") + print(f" Warning: Unable to get Agent {agent_id}: {e}") if not actions: - self.send_response(command_id, "failed", error="没有有效的Agent") + self.send_response(command_id, "failed", error="No valid Agents") return False - # 执行批量Interview + # Execute batch Interview await self.env.step(actions) - # 获取所有结果 + # Get all results results = {} for agent_id in agent_prompts.keys(): result = self._get_interview_result(agent_id) @@ -288,17 +288,17 @@ async def handle_batch_interview(self, command_id: str, interviews: List[Dict]) "interviews_count": len(results), "results": results }) - print(f" 批量Interview完成: {len(results)} 个Agent") + print(f" Batch Interview completed: {len(results)} Agents") return True except Exception as e: error_msg = str(e) - print(f" 批量Interview失败: {error_msg}") + print(f" Batch Interview failed: {error_msg}") self.send_response(command_id, "failed", error=error_msg) return False def _get_interview_result(self, agent_id: int) -> Dict[str, Any]: - """从数据库获取最新的Interview结果""" + """Get latest Interview result from database""" db_path = os.path.join(self.simulation_dir, "twitter_simulation.db") result = { @@ -314,7 +314,7 @@ def _get_interview_result(self, agent_id: int) -> Dict[str, Any]: conn = sqlite3.connect(db_path) cursor = conn.cursor() - # 查询最新的Interview记录 + # Query latest Interview record cursor.execute(""" SELECT user_id, info, created_at FROM trace @@ -336,16 +336,16 @@ def _get_interview_result(self, agent_id: int) -> Dict[str, Any]: conn.close() except Exception as e: - print(f" 读取Interview结果失败: {e}") + print(f" Failed to read 
Interview result: {e}") return result async def process_commands(self) -> bool: """ - 处理所有待处理命令 + Process all pending commands Returns: - True 表示继续运行,False 表示应该退出 + True means continue running, False means should exit """ command = self.poll_command() if not command: @@ -355,7 +355,7 @@ async def process_commands(self) -> bool: command_type = command.get("command_type") args = command.get("args", {}) - print(f"\n收到IPC命令: {command_type}, id={command_id}") + print(f"\nReceived IPC command: {command_type}, id={command_id}") if command_type == CommandType.INTERVIEW: await self.handle_interview( @@ -373,19 +373,19 @@ async def process_commands(self) -> bool: return True elif command_type == CommandType.CLOSE_ENV: - print("收到关闭环境命令") - self.send_response(command_id, "completed", result={"message": "环境即将关闭"}) + print("Received close environment command") + self.send_response(command_id, "completed", result={"message": "Environment is about to close"}) return False else: - self.send_response(command_id, "failed", error=f"未知命令类型: {command_type}") + self.send_response(command_id, "failed", error=f"Unknown command type: {command_type}") return True class TwitterSimulationRunner: - """Twitter模拟运行器""" + """Twitter simulation runner""" - # Twitter可用动作(不包含INTERVIEW,INTERVIEW只能通过ManualAction手动触发) + # Twitter available actions (excludes INTERVIEW, INTERVIEW can only be triggered manually via ManualAction) AVAILABLE_ACTIONS = [ ActionType.CREATE_POST, ActionType.LIKE_POST, @@ -397,11 +397,11 @@ class TwitterSimulationRunner: def __init__(self, config_path: str, wait_for_commands: bool = True): """ - 初始化模拟运行器 + Initialize simulation runner Args: - config_path: 配置文件路径 (simulation_config.json) - wait_for_commands: 模拟完成后是否等待命令(默认True) + config_path: Config file path (simulation_config.json) + wait_for_commands: Whether to wait for commands after simulation completes (default True) """ self.config_path = config_path self.config = self._load_config() @@ -412,47 +412,47 @@ def 
__init__(self, config_path: str, wait_for_commands: bool = True): self.ipc_handler = None def _load_config(self) -> Dict[str, Any]: - """加载配置文件""" + """Load config file""" with open(self.config_path, 'r', encoding='utf-8') as f: return json.load(f) def _get_profile_path(self) -> str: - """获取Profile文件路径(OASIS Twitter使用CSV格式)""" + """Get profile file path (OASIS Twitter uses CSV format)""" return os.path.join(self.simulation_dir, "twitter_profiles.csv") def _get_db_path(self) -> str: - """获取数据库路径""" + """Get database path""" return os.path.join(self.simulation_dir, "twitter_simulation.db") def _create_model(self): """ - 创建LLM模型 + Create LLM model - 统一使用项目根目录 .env 文件中的配置(优先级最高): - - LLM_API_KEY: API密钥 - - LLM_BASE_URL: API基础URL - - LLM_MODEL_NAME: 模型名称 + Unified use of config from project root .env file (highest priority): + - LLM_API_KEY: API key + - LLM_BASE_URL: API base URL + - LLM_MODEL_NAME: Model name """ - # 优先从 .env 读取配置 + # Read config from .env first llm_api_key = os.environ.get("LLM_API_KEY", "") llm_base_url = os.environ.get("LLM_BASE_URL", "") llm_model = os.environ.get("LLM_MODEL_NAME", "") - # 如果 .env 中没有,则使用 config 作为备用 + # If not in .env, use config as fallback if not llm_model: llm_model = self.config.get("llm_model", "gpt-4o-mini") - # 设置 camel-ai 所需的环境变量 + # Set environment variables required by camel-ai if llm_api_key: os.environ["OPENAI_API_KEY"] = llm_api_key if not os.environ.get("OPENAI_API_KEY"): - raise ValueError("缺少 API Key 配置,请在项目根目录 .env 文件中设置 LLM_API_KEY") + raise ValueError("Missing API Key configuration. 
Please set LLM_API_KEY in the project root .env file") if llm_base_url: os.environ["OPENAI_API_BASE_URL"] = llm_base_url - print(f"LLM配置: model={llm_model}, base_url={llm_base_url[:40] if llm_base_url else '默认'}...") + print(f"LLM config: model={llm_model}, base_url={llm_base_url[:40] if llm_base_url else 'default'}...") return ModelFactory.create( model_platform=ModelPlatformType.OPENAI, @@ -466,24 +466,24 @@ def _get_active_agents_for_round( round_num: int ) -> List: """ - 根据时间和配置决定本轮激活哪些Agent + Determine which Agents to activate this round based on time and config Args: - env: OASIS环境 - current_hour: 当前模拟小时(0-23) - round_num: 当前轮数 + env: OASIS environment + current_hour: Current simulated hour (0-23) + round_num: Current round Returns: - 激活的Agent列表 + List of activated Agents """ time_config = self.config.get("time_config", {}) agent_configs = self.config.get("agent_configs", []) - # 基础激活数量 + # Base activation count base_min = time_config.get("agents_per_hour_min", 5) base_max = time_config.get("agents_per_hour_max", 20) - # 根据时段调整 + # Adjust by time period peak_hours = time_config.get("peak_hours", [9, 10, 11, 14, 15, 20, 21, 22]) off_peak_hours = time_config.get("off_peak_hours", [0, 1, 2, 3, 4, 5]) @@ -496,28 +496,28 @@ def _get_active_agents_for_round( target_count = int(random.uniform(base_min, base_max) * multiplier) - # 根据每个Agent的配置计算激活概率 + # Calculate activation probability based on each Agent config candidates = [] for cfg in agent_configs: agent_id = cfg.get("agent_id", 0) active_hours = cfg.get("active_hours", list(range(8, 23))) activity_level = cfg.get("activity_level", 0.5) - # 检查是否在活跃时间 + # Check if within active hours if current_hour not in active_hours: continue - # 根据活跃度计算概率 + # Calculate probability based on activity level if random.random() < activity_level: candidates.append(agent_id) - # 随机选择 + # Random selection selected_ids = random.sample( candidates, min(target_count, len(candidates)) ) if candidates else [] - # 转换为Agent对象 + # Convert to 
Agent objects active_agents = [] for agent_id in selected_ids: try: @@ -529,50 +529,50 @@ def _get_active_agents_for_round( return active_agents async def run(self, max_rounds: int = None): - """运行Twitter模拟 + """Run Twitter simulation Args: - max_rounds: 最大模拟轮数(可选,用于截断过长的模拟) + max_rounds: Maximum simulation rounds (optional, for truncating overly long simulations) """ print("=" * 60) - print("OASIS Twitter模拟") - print(f"配置文件: {self.config_path}") - print(f"模拟ID: {self.config.get('simulation_id', 'unknown')}") - print(f"等待命令模式: {'启用' if self.wait_for_commands else '禁用'}") + print("OASIS Twitter Simulation") + print(f"Config file: {self.config_path}") + print(f"Simulation ID: {self.config.get('simulation_id', 'unknown')}") + print(f"Command waiting mode: {'Enabled' if self.wait_for_commands else 'Disabled'}") print("=" * 60) - # 加载时间配置 + # Load time config time_config = self.config.get("time_config", {}) total_hours = time_config.get("total_simulation_hours", 72) minutes_per_round = time_config.get("minutes_per_round", 30) - # 计算总轮数 + # Calculate total rounds total_rounds = (total_hours * 60) // minutes_per_round - # 如果指定了最大轮数,则截断 + # If max rounds specified, truncate if max_rounds is not None and max_rounds > 0: original_rounds = total_rounds total_rounds = min(total_rounds, max_rounds) if total_rounds < original_rounds: - print(f"\n轮数已截断: {original_rounds} -> {total_rounds} (max_rounds={max_rounds})") + print(f"\nRounds truncated: {original_rounds} -> {total_rounds} (max_rounds={max_rounds})") - print(f"\n模拟参数:") - print(f" - 总模拟时长: {total_hours}小时") - print(f" - 每轮时间: {minutes_per_round}分钟") - print(f" - 总轮数: {total_rounds}") + print(f"\nSimulation parameters:") + print(f" - Total simulation duration: {total_hours}hours") + print(f" - Time per round: {minutes_per_round}minutes") + print(f" - Total rounds: {total_rounds}") if max_rounds: - print(f" - 最大轮数限制: {max_rounds}") - print(f" - Agent数量: {len(self.config.get('agent_configs', []))}") + print(f" - Max rounds 
limit: {max_rounds}") + print(f" - Agent count: {len(self.config.get('agent_configs', []))}") - # 创建模型 - print("\n初始化LLM模型...") + # Create model + print("\nInitializing LLM model...") model = self._create_model() - # 加载Agent图 - print("加载Agent Profile...") + # Load Agent graph + print("Loading Agent Profile...") profile_path = self._get_profile_path() if not os.path.exists(profile_path): - print(f"错误: Profile文件不存在: {profile_path}") + print(f"Error: Profile file does not exist: {profile_path}") return self.agent_graph = await generate_twitter_agent_graph( @@ -581,34 +581,34 @@ async def run(self, max_rounds: int = None): available_actions=self.AVAILABLE_ACTIONS, ) - # 数据库路径 + # Database path db_path = self._get_db_path() if os.path.exists(db_path): os.remove(db_path) - print(f"已删除旧数据库: {db_path}") + print(f"Deleted old database: {db_path}") - # 创建环境 - print("创建OASIS环境...") + # Create environment + print("Creating OASIS environment...") self.env = oasis.make( agent_graph=self.agent_graph, platform=oasis.DefaultPlatformType.TWITTER, database_path=db_path, - semaphore=30, # 限制最大并发 LLM 请求数,防止 API 过载 + semaphore=30, # Limit max concurrent LLM requests to prevent API overload ) await self.env.reset() - print("环境初始化完成\n") + print("Environment initialization complete\n") - # 初始化IPC处理器 + # Initialize IPC handler self.ipc_handler = IPCHandler(self.simulation_dir, self.env, self.agent_graph) self.ipc_handler.update_status("running") - # 执行初始事件 + # Execute initial events event_config = self.config.get("event_config", {}) initial_posts = event_config.get("initial_posts", []) if initial_posts: - print(f"执行初始事件 ({len(initial_posts)}条初始帖子)...") + print(f"Executing initial events ({len(initial_posts)} initial posts)...") initial_actions = {} for post in initial_posts: agent_id = post.get("poster_agent_id", 0) @@ -620,23 +620,23 @@ async def run(self, max_rounds: int = None): action_args={"content": content} ) except Exception as e: - print(f" 警告: 无法为Agent {agent_id}创建初始帖子: {e}") + 
print(f" Warning: Unable to create initial post for Agent {agent_id} {e}") if initial_actions: await self.env.step(initial_actions) - print(f" 已发布 {len(initial_actions)} 条初始帖子") + print(f" Published {len(initial_actions)} initial posts") - # 主模拟循环 - print("\n开始模拟循环...") + # Main simulation loop + print("\nStarting simulation loop...") start_time = datetime.now() for round_num in range(total_rounds): - # 计算当前模拟时间 + # Calculate current simulation time simulated_minutes = round_num * minutes_per_round simulated_hour = (simulated_minutes // 60) % 24 simulated_day = simulated_minutes // (60 * 24) + 1 - # 获取本轮激活的Agent + # Get activated Agents for this round active_agents = self._get_active_agents_for_round( self.env, simulated_hour, round_num ) @@ -644,16 +644,16 @@ async def run(self, max_rounds: int = None): if not active_agents: continue - # 构建动作 + # Build actions actions = { agent: LLMAction() for _, agent in active_agents } - # 执行动作 + # Execute actions await self.env.step(actions) - # 打印进度 + # Print progress if (round_num + 1) % 10 == 0 or round_num == 0: elapsed = (datetime.now() - start_time).total_seconds() progress = (round_num + 1) / total_rounds * 100 @@ -663,20 +663,20 @@ async def run(self, max_rounds: int = None): f"- elapsed: {elapsed:.1f}s") total_elapsed = (datetime.now() - start_time).total_seconds() - print(f"\n模拟循环完成!") - print(f" - 总耗时: {total_elapsed:.1f}秒") - print(f" - 数据库: {db_path}") + print(f"\nSimulation loop complete!") + print(f" - Total time: {total_elapsed:.1f}s") + print(f" - Database: {db_path}") - # 是否进入等待命令模式 + # Whether to enter command waiting mode if self.wait_for_commands: print("\n" + "=" * 60) - print("进入等待命令模式 - 环境保持运行") - print("支持的命令: interview, batch_interview, close_env") + print("Entering command waiting mode - environment stays running") + print("Supported commands: interview, batch_interview, close_env") print("=" * 60) self.ipc_handler.update_status("alive") - # 等待命令循环(使用全局 _shutdown_event) + # Command waiting loop 
(using global _shutdown_event) try: while not _shutdown_event.is_set(): should_continue = await self.ipc_handler.process_commands() @@ -684,58 +684,58 @@ async def run(self, max_rounds: int = None): break try: await asyncio.wait_for(_shutdown_event.wait(), timeout=0.5) - break # 收到退出信号 + break # Exit signal received except asyncio.TimeoutError: pass except KeyboardInterrupt: - print("\n收到中断信号") + print("\nInterrupt signal received") except asyncio.CancelledError: - print("\n任务被取消") + print("\nTask cancelled") except Exception as e: - print(f"\n命令处理出错: {e}") + print(f"\nCommand processing error: {e}") - print("\n关闭环境...") + print("\nClosing environment...") - # 关闭环境 + # Close environment self.ipc_handler.update_status("stopped") await self.env.close() - print("环境已关闭") + print("Environment closed") print("=" * 60) async def main(): - parser = argparse.ArgumentParser(description='OASIS Twitter模拟') + parser = argparse.ArgumentParser(description='OASIS Twitter Simulation') parser.add_argument( '--config', type=str, required=True, - help='配置文件路径 (simulation_config.json)' + help='Config file path (simulation_config.json)' ) parser.add_argument( '--max-rounds', type=int, default=None, - help='最大模拟轮数(可选,用于截断过长的模拟)' + help='Maximum simulation rounds (optional, for truncating overly long simulations)' ) parser.add_argument( '--no-wait', action='store_true', default=False, - help='模拟完成后立即关闭环境,不进入等待命令模式' + help='Close environment immediately after simulation, do not enter command waiting mode' ) args = parser.parse_args() - # 在 main 函数开始时创建 shutdown 事件 + # Create shutdown event at the start of main function global _shutdown_event _shutdown_event = asyncio.Event() if not os.path.exists(args.config): - print(f"错误: 配置文件不存在: {args.config}") + print(f"Error: Config file does not exist: {args.config}") sys.exit(1) - # 初始化日志配置(使用固定文件名,清理旧日志) + # Initialize logging config (using fixed filenames, clean up old logs) simulation_dir = os.path.dirname(args.config) or "." 
setup_oasis_logging(os.path.join(simulation_dir, "log")) @@ -748,20 +748,20 @@ async def main(): def setup_signal_handlers(): """ - 设置信号处理器,确保收到 SIGTERM/SIGINT 时能够正确退出 - 让程序有机会正常清理资源(关闭数据库、环境等) + Set up signal handlers to ensure proper exit on SIGTERM/SIGINT + Give the program a chance to clean up resources (close database, environment, etc.) """ def signal_handler(signum, frame): global _cleanup_done sig_name = "SIGTERM" if signum == signal.SIGTERM else "SIGINT" - print(f"\n收到 {sig_name} 信号,正在退出...") + print(f"\nReceived {sig_name} signal, exiting...") if not _cleanup_done: _cleanup_done = True if _shutdown_event: _shutdown_event.set() else: - # 重复收到信号才强制退出 - print("强制退出...") + # Force exit only on repeated signal + print("Force exiting...") sys.exit(1) signal.signal(signal.SIGTERM, signal_handler) @@ -773,8 +773,8 @@ def signal_handler(signum, frame): try: asyncio.run(main()) except KeyboardInterrupt: - print("\n程序被中断") + print("\nProgram interrupted") except SystemExit: pass finally: - print("模拟进程已退出") + print("Simulation process has exited") diff --git a/backend/scripts/test_profile_format.py b/backend/scripts/test_profile_format.py index 354e8b5..e8841b1 100644 --- a/backend/scripts/test_profile_format.py +++ b/backend/scripts/test_profile_format.py @@ -1,8 +1,8 @@ """ -测试Profile格式生成是否符合OASIS要求 -验证: -1. Twitter Profile生成CSV格式 -2. Reddit Profile生成JSON详细格式 +Test whether Profile format generation meets OASIS requirements +Verify: +1. Twitter Profile generates CSV format +2. 
Reddit Profile generates JSON detailed format """ import os @@ -11,19 +11,19 @@ import csv import tempfile -# 添加项目路径 +# Add project paths sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from app.services.oasis_profile_generator import OasisProfileGenerator, OasisAgentProfile def test_profile_formats(): - """测试Profile格式""" + """Test Profile formats""" print("=" * 60) - print("OASIS Profile格式测试") + print("OASIS Profile Format Test") print("=" * 60) - # 创建测试Profile数据 + # Create test Profile data test_profiles = [ OasisAgentProfile( user_id=0, @@ -63,84 +63,84 @@ def test_profile_formats(): generator = OasisProfileGenerator.__new__(OasisProfileGenerator) - # 使用临时目录 + # Use temporary directory with tempfile.TemporaryDirectory() as temp_dir: twitter_path = os.path.join(temp_dir, "twitter_profiles.csv") reddit_path = os.path.join(temp_dir, "reddit_profiles.json") - # 测试Twitter CSV格式 - print("\n1. 测试Twitter Profile (CSV格式)") + # Test Twitter CSV format + print("\n1. 
Test Twitter Profile (CSV format)") print("-" * 40) generator._save_twitter_csv(test_profiles, twitter_path) - # 读取并验证CSV + # Read and verify CSV with open(twitter_path, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) rows = list(reader) - print(f" 文件: {twitter_path}") - print(f" 行数: {len(rows)}") - print(f" 表头: {list(rows[0].keys())}") - print(f"\n 示例数据 (第1行):") + print(f" File: {twitter_path}") + print(f" Rows: {len(rows)}") + print(f" Headers: {list(rows[0].keys())}") + print(f"\n Sample data (row 1):") for key, value in rows[0].items(): print(f" {key}: {value}") - # 验证必需字段 + # Verify required fields required_twitter_fields = ['user_id', 'user_name', 'name', 'bio', 'friend_count', 'follower_count', 'statuses_count', 'created_at'] missing = set(required_twitter_fields) - set(rows[0].keys()) if missing: - print(f"\n [错误] 缺少字段: {missing}") + print(f"\n [ERROR] Missing fields: {missing}") else: - print(f"\n [通过] 所有必需字段都存在") + print(f"\n [PASS] All required fields are present") - # 测试Reddit JSON格式 - print("\n2. 测试Reddit Profile (JSON详细格式)") + # Test Reddit JSON format + print("\n2. 
Test Reddit Profile (JSON detailed format)") print("-" * 40) generator._save_reddit_json(test_profiles, reddit_path) - # 读取并验证JSON + # Read and verify JSON with open(reddit_path, 'r', encoding='utf-8') as f: reddit_data = json.load(f) - print(f" 文件: {reddit_path}") - print(f" 条目数: {len(reddit_data)}") - print(f" 字段: {list(reddit_data[0].keys())}") - print(f"\n 示例数据 (第1条):") + print(f" File: {reddit_path}") + print(f" Entries: {len(reddit_data)}") + print(f" Fields: {list(reddit_data[0].keys())}") + print(f"\n Sample data (entry 1):") print(json.dumps(reddit_data[0], ensure_ascii=False, indent=4)) - # 验证详细格式字段 + # Verify detailed format fields required_reddit_fields = ['realname', 'username', 'bio', 'persona'] optional_reddit_fields = ['age', 'gender', 'mbti', 'country', 'profession', 'interested_topics'] missing = set(required_reddit_fields) - set(reddit_data[0].keys()) if missing: - print(f"\n [错误] 缺少必需字段: {missing}") + print(f"\n [ERROR] Missing required fields: {missing}") else: - print(f"\n [通过] 所有必需字段都存在") + print(f"\n [PASS] All required fields are present") present_optional = set(optional_reddit_fields) & set(reddit_data[0].keys()) - print(f" [信息] 可选字段: {present_optional}") + print(f" [INFO] Optional fields: {present_optional}") print("\n" + "=" * 60) - print("测试完成!") + print("Test complete!") print("=" * 60) def show_expected_formats(): - """显示OASIS期望的格式""" + """Display expected OASIS formats""" print("\n" + "=" * 60) - print("OASIS 期望的Profile格式参考") + print("OASIS Expected Profile Format Reference") print("=" * 60) - print("\n1. Twitter Profile (CSV格式)") + print("\n1. Twitter Profile (CSV format)") print("-" * 40) twitter_example = """user_id,user_name,name,bio,friend_count,follower_count,statuses_count,created_at 0,user0,User Zero,I am user zero with interests in technology.,100,150,500,2023-01-01 1,user1,User One,Tech enthusiast and coffee lover.,200,250,1000,2023-01-02""" print(twitter_example) - print("\n2. Reddit Profile (JSON详细格式)") + print("\n2. 
Reddit Profile (JSON detailed format)") print("-" * 40) reddit_example = [ { diff --git a/frontend/CHINESE_TEXT_INVENTORY.md b/frontend/CHINESE_TEXT_INVENTORY.md deleted file mode 100644 index a1a6957..0000000 --- a/frontend/CHINESE_TEXT_INVENTORY.md +++ /dev/null @@ -1,451 +0,0 @@ -# MiroFish Frontend - Chinese Text Translation Inventory - -## Overview -Complete inventory of Chinese text found in the MiroFish frontend application that requires translation to English. - -**Scan Date:** 2026-03-15 -**Total Files with Chinese:** 20 files -**Total Chinese Character Occurrences:** 1,986 -**Total Unique Chinese Strings:** ~1,100+ - ---- - -## File-by-File Breakdown - -### 1. **App.vue** (5 occurrences) -**Location:** `/src/App.vue` -- Comments only (no user-facing text) -- Examples: - - Line 6: `使用 Vue Router 来管理页面` (Using Vue Router to manage pages) - - Line 10: `全局样式重置` (Global style reset) - - Line 25: `滚动条样式` (Scrollbar style) - - Line 43: `全局按钮样式` (Global button style) - -**Priority:** LOW (Comments only) - ---- - -### 2. **api/graph.js** (13 occurrences) -**Location:** `/src/api/graph.js` -- JSDoc comments for API functions -- Examples: - - `生成本体(上传文档和模拟需求)` (Generate ontology - upload docs and simulation requirements) - - `构建图谱` (Build graph) - - `查询任务状态` (Query task status) - - `获取图谱数据` (Get graph data) - - `获取项目信息` (Get project information) - -**Priority:** LOW (Comments only) - ---- - -### 3. **api/index.js** (12 occurrences) -**Location:** `/src/api/index.js` -- Comments in API configuration -- Examples: - - `创建axios实例` (Create axios instance) - - `5分钟超时(本体生成可能需要较长时间)` (5-minute timeout - ontology generation may take longer) - - `请求拦截器` (Request interceptor) - - `响应拦截器(容错重试机制)` (Response interceptor - error handling retry mechanism) - - `处理网络错误` (Handle network errors) - -**Priority:** LOW (Comments only) - ---- - -### 4. 
**api/report.js** (12 occurrences) -**Location:** `/src/api/report.js` -- JSDoc comments -- Examples: - - `开始报告生成` (Start report generation) - - `获取报告生成状态` (Get report generation status) - - `获取 Agent 日志(增量)` (Get Agent logs - incremental) - - `获取控制台日志` (Get console logs) - - `与 Report Agent 对话` (Dialogue with Report Agent) - -**Priority:** LOW (Comments only) - ---- - -### 5. **api/simulation.js** (38 occurrences) -**Location:** `/src/api/simulation.js` -- JSDoc comments for simulation API functions -- Examples: - - `创建模拟` (Create simulation) - - `准备模拟环境(异步任务)` (Prepare simulation environment - async task) - - `获取模拟的 Agent Profiles` (Get simulation Agent Profiles) - - `启动模拟` (Start simulation) - - `停止模拟` (Stop simulation) - - `关闭模拟环境(优雅退出)` (Close simulation environment - graceful exit) - - `获取模拟运行实时状态` (Get simulation run real-time status) - - `获取历史模拟列表(带项目详情)` (Get historical simulation list - with project details) - -**Priority:** LOW (Comments only) - ---- - -### 6. **components/GraphPanel.vue** (134 occurrences, 128 unique) -**Location:** `/src/components/GraphPanel.vue` -**Priority:** MEDIUM (Mix of UI text and comments) - -**Key User-Facing Strings:** -- `刷新图谱` (Refresh graph) - Title attribute -- `最大化/还原` (Maximize/Restore) - Title attribute -- `GraphRAG长短期记忆实时更新中` (GraphRAG short/long-term memory real-time updating) -- `实时更新中...` (Real-time updating...) -- `还有少量内容处理中,建议稍后手动刷新图谱` (Some content is still being processed, suggest manually refreshing the graph later) -- `关闭提示` (Close hint) - Title attribute -- `图谱数据加载中...` (Graph data loading...) -- `等待本体生成...` (Waiting for ontology generation...) - -**UI Elements:** -- Node Details panel labels -- Relationship information text - ---- - -### 7. 
**components/HistoryDatabase.vue** (218 occurrences, 184 unique) -**Location:** `/src/components/HistoryDatabase.vue` -**Priority:** HIGH (Significant user-facing content) - -**Key User-Facing Strings:** -- `推演记录` (Simulation Records/History) -- `暂无文件` (No files) -- Various status indicators and labels -- Project card titles and descriptions -- Navigation and interaction text - -**Notable Sections:** -- Title: `推演记录` (visible on home page) -- Empty state text: `暂无文件` (No files) -- File count display: `+X 个文件` (+X files) -- Status icons with tooltips: `图谱构建`, `环境搭建`, `分析报告` - ---- - -### 8. **components/Step1GraphBuild.vue** (36 occurrences, 31 unique) -**Location:** `/src/components/Step1GraphBuild.vue` -**Priority:** MEDIUM (Workflow step component) - -**Key User-Facing Strings:** -- `本体生成` (Ontology Generation) - Step title -- `正在分析文档...` (Analyzing documents...) -- `GENERATED ENTITY TYPES` - Labels -- `GENERATED RELATION TYPES` - Labels -- `GraphRAG构建` (GraphRAG Building) - Step title -- `基于生成的本体,将文档自动分块后调用 Zep 构建知识图谱...` (Based on generated ontology, automatically chunk documents and call Zep to build knowledge graph...) -- `实体节点` (Entity nodes) -- `关系边` (Relationship edges) -- `构建完成` (Build completed) -- `图谱构建已完成,请进入下一步进行模拟环境搭建` (Graph building completed, proceed to next step for simulation environment setup) -- `进入环境搭建 ➝` (Enter environment setup) - ---- - -### 9. 
**components/Step2EnvSetup.vue** (313 occurrences, 241 unique) -**Location:** `/src/components/Step2EnvSetup.vue` -**Priority:** VERY HIGH (Major workflow step with extensive Chinese UI text) - -**Key Sections & Strings:** -- Step 1: `模拟实例初始化` (Simulation Instance Initialization) -- Step 2: `生成 Agent 人设` (Generate Agent Personas) - - `当前Agent数` (Current Agent Count) - - `预期Agent总数` (Expected Total Agents) - - `现实种子当前关联话题数` (Current Topic Count for Seed) - - `已生成的 Agent 人设` (Generated Agent Personas) - - Profile fields: `username`, `profession`, `bio`, `interested_topics` - - `未知职业` (Unknown profession) - - `暂无简介` (No biography) - -- Step 3: `配置模拟参数` (Configure Simulation Parameters) - - `模拟配置(从模板生成)` (Simulation Configuration - from template) - - `配置编辑` (Configuration edit) - - Time-related settings and labels - -**Extensive Chinese UI Text** - requires careful translation of all configuration labels and instructions - ---- - -### 10. **components/Step3Simulation.vue** (113 occurrences, 103 unique) -**Location:** `/src/components/Step3Simulation.vue` -**Priority:** HIGH (Real-time simulation monitoring) - -**Key User-Facing Strings:** -- Platform headers: `Info Plaza`, `Topic Community` -- Status labels and metrics -- `开启图谱实时刷新 (30s)` (Enable real-time graph refresh - 30s) -- `停止图谱实时刷新` (Stop real-time graph refresh) -- `开始生成结果报告` (Start generating result report) -- Round progress indicators -- Elapsed time displays -- Action counters (ACTS) - ---- - -### 11. **components/Step4Report.vue** (389 occurrences, 271 unique) -**Location:** `/src/components/Step4Report.vue` -**Priority:** VERY HIGH (Report generation with extensive content) - -**Key Sections:** -- Report generation status tracking -- Section generation progress -- `正在生成...` (Generating...) 
-- `Waiting for Report Agent...` (partial English) -- Timeline and workflow visualization -- Agent interaction labels -- Content markdown rendering with Chinese labels - -**Major Chinese Content Areas:** -- Report outline structure -- Section titles and descriptions -- Status indicators -- Tool usage tracking -- Timeline metrics - ---- - -### 12. **components/Step5Interaction.vue** (141 occurrences, 115 unique) -**Location:** `/src/components/Step5Interaction.vue` -**Priority:** HIGH (Interactive component) - -**Key User-Facing Strings:** -- `深度互动` (Deep Interaction) - Step title -- `与世界中任意个体对话` (Dialogue with any individual in the world) -- `与模拟个体对话` (Dialogue with simulated individuals) -- `与 ReportAgent 进行对话` (Dialogue with ReportAgent) -- Profile selection and interaction labels -- Chat interface labels -- Message history display - ---- - -### 13. **store/pendingUpload.js** (5 occurrences) -**Location:** `/src/store/pendingUpload.js` -- Comments only -- `临时存储待上传的文件和需求` (Temporarily store files and requirements to upload) -- `用于首页点击启动引擎后立即跳转,在Process页面再进行API调用` (Used for immediate navigation after clicking start engine on home page, then perform API calls on Process page) - -**Priority:** LOW (Comments only) - ---- - -### 14. 
**views/Home.vue** (121 occurrences, 116 unique) -**Location:** `/src/views/Home.vue` -**Priority:** VERY HIGH (Landing page - user-facing) - -**Key Hero Section Strings:** -- `简洁通用的群体智能引擎` (Concise universal collective intelligence engine) -- `v0.1-预览版` (v0.1-preview) -- `上传任意报告 / 即刻推演未来` (Upload any report / Instantly predict the future) -- `上帝视角注入变量,在复杂的群体交互中寻找动态环境下的"局部最优解"` (Inject variables from god's perspective, find "local optimal solutions" in dynamic environments within complex group interactions) -- `让未来在 Agent 群中预演,让决策在百战后胜出` (Let the future be rehearsed in the Agent swarm, let decisions prevail after hundreds of battles) - -**Left Panel:** -- `系统状态` (System Status) -- `准备就绪` (Ready) -- `预测引擎待命中,可上传多份非结构化数据以初始化模拟序列` (Prediction engine on standby, can upload multiple unstructured data to initialize simulation sequence) -- `低成本` (Low cost) -- `常规模拟平均5$/次` (Regular simulation average $5/time) -- `高可用` (High availability) -- `最多百万级Agent模拟` (Up to millions of Agent simulations) -- `工作流序列` (Workflow sequence) - -**Workflow Steps (5 steps):** -1. `图谱构建` - `现实种子提取 & 个体与群体记忆注入 & GraphRAG构建` -2. `环境搭建` - `实体关系抽取 & 人设生成 & 环境配置Agent注入仿真参数` -3. `开始模拟` - `双平台并行模拟 & 自动解析预测需求 & 动态更新时序记忆` -4. `报告生成` - `ReportAgent拥有丰富的工具集与模拟后环境进行深度交互` -5. `深度互动` - `与模拟世界中的任意一位进行对话 & 与ReportAgent进行对话` - -**Right Panel (Upload Console):** -- `01 / 现实种子` (01 / Reality Seed) -- `支持格式: PDF, MD, TXT` (Supported formats: PDF, MD, TXT) -- `拖拽文件上传` (Drag and drop files to upload) -- `或点击浏览文件系统` (Or click to browse file system) -- `02 / 模拟提示词` (02 / Simulation Prompt) -- `// 用自然语言输入模拟或预测需求...` (// Input simulation or prediction requirements in natural language...) -- `启动引擎` (Start engine) -- `初始化中...` (Initializing...) - ---- - -### 15. 
**views/InteractionView.vue** (22 occurrences) -**Location:** `/src/views/InteractionView.vue` -**Priority:** MEDIUM (Logs and status text) - -**Strings:** -- `加载报告数据: ${reportId}` (Loading report data) -- `项目加载成功: ${projectId}` (Project loaded successfully) -- `获取报告信息失败` (Failed to get report information) -- `加载异常` (Loading exception) -- `InteractionView 初始化` (InteractionView initialization) - ---- - -### 16. **views/MainView.vue** (22 occurrences) -**Location:** `/src/views/MainView.vue` -**Priority:** MEDIUM (Step navigation) - -**Strings:** -- `图谱` (Graph) -- `双栏` (Split view) -- `工作台` (Workbench) -- Step names with numbers - ---- - -### 17. **views/Process.vue** (231 occurrences, 196 unique) -**Location:** `/src/views/Process.vue` -**Priority:** VERY HIGH (Main workflow orchestration) - -**Key Strings:** -- `顶部导航栏` (Top navigation bar) -- `中间步骤指示器` (Middle step indicator) -- `实时知识图谱` (Real-time knowledge graph) -- `节点` (Nodes) -- `关系` (Relationships) -- `刷新图谱` (Refresh graph) -- `退出全屏` (Exit fullscreen) -- `全屏显示` (Fullscreen) -- Navigation and workflow step labels -- Status messages and logs -- Error handling text - ---- - -### 18. **views/ReportView.vue** (22 occurrences) -**Location:** `/src/views/ReportView.vue` -**Priority:** MEDIUM (Report viewing step) - -**Strings:** -- Similar to InteractionView -- `加载报告数据` (Loading report data) -- `项目加载成功` (Project loaded successfully) -- `图谱数据加载成功` (Graph data loaded successfully) -- `ReportView 初始化` (ReportView initialization) - ---- - -### 19. **views/SimulationRunView.vue** (72 occurrences, 64 unique) -**Location:** `/src/views/SimulationRunView.vue` -**Priority:** HIGH (Simulation execution) - -**Strings:** -- `开启图谱实时刷新 (30s)` (Enable real-time graph refresh) -- `停止图谱实时刷新` (Stop real-time graph refresh) -- `准备返回 Step 2,正在关闭模拟...` (Preparing to return to Step 2, closing simulation...) -- `正在关闭模拟环境...` (Closing simulation environment...) 
-- `模拟环境已关闭` (Simulation environment closed) -- `关闭模拟环境失败,尝试强制停止...` (Failed to close simulation environment, attempting force stop...) -- `模拟已强制停止` (Simulation force stopped) -- Error and status messages - ---- - -### 20. **views/SimulationView.vue** (67 occurrences, 56 unique) -**Location:** `/src/views/SimulationView.vue` -**Priority:** HIGH (Environment setup step) - -**Strings:** -- `环境搭建` (Environment setup) -- `模拟初始化` (Simulation initialization) -- `检测到模拟环境正在运行,正在关闭...` (Detected running simulation environment, closing...) -- `检测到模拟状态为运行中,正在停止...` (Detected running simulation state, stopping...) -- `使用自动配置的模拟轮数` (Using auto-configured simulation rounds) -- Error and status messages - ---- - -### 21. **index.html** (2 occurrences) -**Location:** `/frontend/index.html` -**Priority:** MEDIUM - -**Strings:** -- `` - Language attribute -- `` (MiroFish - Social Media Opinion Simulation System) -- `MiroFish - 预测万物` (MiroFish - Predict Everything) - ---- - -## Translation Categories - -### 1. **UI Labels & Navigation** (HIGH PRIORITY) -- Step titles and names -- Button labels -- Menu items -- Form labels -- Tab titles - -### 2. **User-Facing Messages** (HIGH PRIORITY) -- Status messages -- Success/error notifications -- Loading states -- Empty state messages -- Placeholder text - -### 3. **Instructions & Descriptions** (MEDIUM PRIORITY) -- Help text -- Instructions -- Tool tips -- Descriptive text -- Feature explanations - -### 4. **Code Comments** (LOW PRIORITY) -- JSDoc comments -- Inline comments -- File headers -- Algorithm explanations - -### 5. **Data Labels** (MEDIUM PRIORITY) -- Field names -- Column headers -- Status indicators -- Configuration options - ---- - -## Estimated Translation Effort - -| Component | Complexity | Est. 
Strings | Priority | -|-----------|-----------|--------------|----------| -| Home.vue | High | 116 | VERY HIGH | -| Step2EnvSetup.vue | Very High | 241 | VERY HIGH | -| Step4Report.vue | Very High | 271 | VERY HIGH | -| Process.vue | High | 196 | VERY HIGH | -| HistoryDatabase.vue | High | 184 | HIGH | -| Step5Interaction.vue | Medium | 115 | HIGH | -| GraphPanel.vue | Medium | 128 | MEDIUM | -| Step3Simulation.vue | Medium | 103 | HIGH | -| SimulationView.vue | Medium | 56 | HIGH | -| SimulationRunView.vue | Medium | 64 | HIGH | -| Step1GraphBuild.vue | Medium | 31 | MEDIUM | -| MainView.vue | Low | 14 | MEDIUM | -| InteractionView.vue | Low | 17 | MEDIUM | -| ReportView.vue | Low | 17 | MEDIUM | -| API files (graph.js, index.js, report.js, simulation.js) | Low | 75 | LOW | -| App.vue & Other | Very Low | 10 | LOW | -| **TOTAL** | - | **1,400+** | - | - ---- - -## Notes - -1. **Most Critical Files for User Experience:** - - `Home.vue` - Landing page marketing copy - - `Step2EnvSetup.vue` - Complex workflow with many configuration options - - `Step4Report.vue` - Report generation with extensive content - -2. **Internationalization Strategy:** - - Consider implementing i18n (Vue I18n) for dynamic language switching - - Store translations in separate locale files (`en.json`, `zh.json`, etc.) - - Use translation keys instead of hardcoded strings where possible - -3. **Testing Considerations:** - - Text length may vary significantly between Chinese and English - - UI layout adjustments may be needed for longer English text - - Date/time formatting should be locale-aware - -4. 
**Comment Translation:** - - While low priority, consider translating comments for developer experience - - This aids future maintenance and onboarding - diff --git a/frontend/src/components/Step4Report.vue b/frontend/src/components/Step4Report.vue index 0853618..49a238a 100644 --- a/frontend/src/components/Step4Report.vue +++ b/frontend/src/components/Step4Report.vue @@ -552,30 +552,30 @@ const parseInsightForge = (text) => { try { // Extract analysis query - const queryMatch = text.match(/分析问题:\s*(.+?)(?:\n|$)/) + const queryMatch = text.match(/Analysis question:\s*(.+?)(?:\n|$)/) if (queryMatch) result.query = queryMatch[1].trim() // Extract prediction scenario - const reqMatch = text.match(/预测场景:\s*(.+?)(?:\n|$)/) + const reqMatch = text.match(/Prediction scenario:\s*(.+?)(?:\n|$)/) if (reqMatch) result.simulationRequirement = reqMatch[1].trim() - // Extract statistics - match "相关预测事实: X条" format - const factMatch = text.match(/相关预测事实:\s*(\d+)/) - const entityMatch = text.match(/涉及实体:\s*(\d+)/) - const relMatch = text.match(/关系链:\s*(\d+)/) + // Extract statistics + const factMatch = text.match(/Related prediction facts:\s*(\d+)/) + const entityMatch = text.match(/Entities involved:\s*(\d+)/) + const relMatch = text.match(/Relationship chains:\s*(\d+)/) if (factMatch) result.stats.facts = parseInt(factMatch[1]) if (entityMatch) result.stats.entities = parseInt(entityMatch[1]) if (relMatch) result.stats.relationships = parseInt(relMatch[1]) // Extract sub-questions - full extraction, unlimited quantity - const subQSection = text.match(/### 分析的子问题\n([\s\S]*?)(?=\n###|$)/) + const subQSection = text.match(/### Analyzed Sub-questions\n([\s\S]*?)(?=\n###|$)/) if (subQSection) { const lines = subQSection[1].split('\n').filter(l => l.match(/^\d+\./)) result.subQueries = lines.map(l => l.replace(/^\d+\.\s*/, '').trim()).filter(Boolean) } // Extract key facts - full extraction, unlimited quantity - const factsSection = text.match(/### 
【关键事实】[\s\S]*?\n([\s\S]*?)(?=\n###|$)/) + const factsSection = text.match(/### \[Key Facts\][\s\S]*?\n([\s\S]*?)(?=\n###|$)/) if (factsSection) { const lines = factsSection[1].split('\n').filter(l => l.match(/^\d+\./)) result.facts = lines.map(l => { @@ -585,15 +585,15 @@ const parseInsightForge = (text) => { } // Extract core entities - full extraction, with summary and related fact counts - const entitySection = text.match(/### 【核心实体】\n([\s\S]*?)(?=\n###|$)/) + const entitySection = text.match(/### \[Core Entities\]\n([\s\S]*?)(?=\n###|$)/) if (entitySection) { const entityText = entitySection[1] // Split entity blocks by "- **" const entityBlocks = entityText.split(/\n(?=- \*\*)/).filter(b => b.trim().startsWith('- **')) result.entities = entityBlocks.map(block => { const nameMatch = block.match(/^-\s*\*\*(.+?)\*\*\s*\((.+?)\)/) - const summaryMatch = block.match(/摘要:\s*"?(.+?)"?(?:\n|$)/) - const relatedMatch = block.match(/相关事实:\s*(\d+)/) + const summaryMatch = block.match(/Summary:\s*"?(.+?)"?(?:\n|$)/) + const relatedMatch = block.match(/Related facts:\s*(\d+)/) return { name: nameMatch ? nameMatch[1].trim() : '', type: nameMatch ? 
nameMatch[2].trim() : '', @@ -604,7 +604,7 @@ const parseInsightForge = (text) => { } // Extract relationship chains - full extraction, unlimited quantity - const relSection = text.match(/### 【关系链】\n([\s\S]*?)(?=\n###|$)/) + const relSection = text.match(/### \[Relationship Chains\]\n([\s\S]*?)(?=\n###|$)/) if (relSection) { const lines = relSection[1].split('\n').filter(l => l.trim().startsWith('-')) result.relations = lines.map(l => { @@ -633,21 +633,21 @@ const parsePanorama = (text) => { try { // Extract query - const queryMatch = text.match(/查询:\s*(.+?)(?:\n|$)/) + const queryMatch = text.match(/Query:\s*(.+?)(?:\n|$)/) if (queryMatch) result.query = queryMatch[1].trim() // Extract statistics - const nodesMatch = text.match(/总节点数:\s*(\d+)/) - const edgesMatch = text.match(/总边数:\s*(\d+)/) - const activeMatch = text.match(/当前有效事实:\s*(\d+)/) - const histMatch = text.match(/历史\/过期事实:\s*(\d+)/) + const nodesMatch = text.match(/Total nodes:\s*(\d+)/) + const edgesMatch = text.match(/Total edges:\s*(\d+)/) + const activeMatch = text.match(/Current active facts:\s*(\d+)/) + const histMatch = text.match(/Historical\/expired facts:\s*(\d+)/) if (nodesMatch) result.stats.nodes = parseInt(nodesMatch[1]) if (edgesMatch) result.stats.edges = parseInt(edgesMatch[1]) if (activeMatch) result.stats.activeFacts = parseInt(activeMatch[1]) if (histMatch) result.stats.historicalFacts = parseInt(histMatch[1]) // Extract current active facts - full extraction, unlimited quantity - const activeSection = text.match(/### 【当前有效事实】[\s\S]*?\n([\s\S]*?)(?=\n###|$)/) + const activeSection = text.match(/### \[Current Active Facts\][\s\S]*?\n([\s\S]*?)(?=\n###|$)/) if (activeSection) { const lines = activeSection[1].split('\n').filter(l => l.match(/^\d+\./)) result.activeFacts = lines.map(l => { @@ -658,7 +658,7 @@ const parsePanorama = (text) => { } // Extract historical/expired facts - full extraction, unlimited quantity - const histSection = text.match(/### 
【历史\/过期事实】[\s\S]*?\n([\s\S]*?)(?=\n###|$)/) + const histSection = text.match(/### \[Historical\/Expired Facts\][\s\S]*?\n([\s\S]*?)(?=\n###|$)/) if (histSection) { const lines = histSection[1].split('\n').filter(l => l.match(/^\d+\./)) result.historicalFacts = lines.map(l => { @@ -668,7 +668,7 @@ const parsePanorama = (text) => { } // Extract involved entities - full extraction, unlimited quantity - const entitySection = text.match(/### 【涉及实体】\n([\s\S]*?)(?=\n###|$)/) + const entitySection = text.match(/### \[Involved Entities\]\n([\s\S]*?)(?=\n###|$)/) if (entitySection) { const lines = entitySection[1].split('\n').filter(l => l.trim().startsWith('-')) result.entities = lines.map(l => { @@ -697,11 +697,11 @@ const parseInterview = (text) => { try { // Extract interview topic - const topicMatch = text.match(/\*\*采访主题:\*\*\s*(.+?)(?:\n|$)/) + const topicMatch = text.match(/\*\*Interview Topic:\*\*\s*(.+?)(?:\n|$)/) if (topicMatch) result.topic = topicMatch[1].trim() - // Extract interview count (e.g., "5 / 9 Simulated Agents") - const countMatch = text.match(/\*\*采访人数:\*\*\s*(\d+)\s*\/\s*(\d+)/) + // Extract interview count (e.g., "5 / 9 simulated Agents") + const countMatch = text.match(/\*\*Interviewees:\*\*\s*(\d+)\s*\/\s*(\d+)/) if (countMatch) { result.successCount = parseInt(countMatch[1]) result.totalCount = parseInt(countMatch[2]) @@ -709,7 +709,7 @@ const parseInterview = (text) => { } // Extract selection reasoning for interview subjects - const reasonMatch = text.match(/### 采访对象选择理由\n([\s\S]*?)(?=\n---\n|\n### 采访实录)/) + const reasonMatch = text.match(/### Interview Subject Selection Reasoning\n([\s\S]*?)(?=\n---\n|\n### Interview Transcripts)/) if (reasonMatch) { result.selectionReason = reasonMatch[1].trim() } @@ -739,7 +739,7 @@ const parseInterview = (text) => { // Format 2: - Select Name (index X): Reason // Example: - Select Parent_601 (index 0): As parent group representative... 
if (!headerMatch) { - headerMatch = line.match(/^-\s*选择([^((]+)(?:[((]index\s*=?\s*\d+[))])?[::]\s*(.*)/) + headerMatch = line.match(/^-\s*Select\s+([^((]+)(?:[((]index\s*=?\s*\d+[))])?[::]\s*(.*)/) if (headerMatch) { name = headerMatch[1].trim() reasonStart = headerMatch[2] @@ -764,7 +764,7 @@ const parseInterview = (text) => { // Start new person currentName = name currentReason = reasonStart ? [reasonStart.trim()] : [] - } else if (currentName && line.trim() && !line.match(/^未选|^综上|^最终选择/)) { + } else if (currentName && line.trim() && !line.match(/^Not selected|^In summary|^Final selection/)) { // Continuation of reason (exclude ending summary paragraphs) currentReason.push(line.trim()) } @@ -781,7 +781,7 @@ const parseInterview = (text) => { const individualReasons = parseIndividualReasons(result.selectionReason) // Extract each interview record - const interviewBlocks = text.split(/#### 采访 #\d+:/).slice(1) + const interviewBlocks = text.split(/#### Interview #\d+:/).slice(1) interviewBlocks.forEach((block, index) => { const interview = { @@ -811,7 +811,7 @@ const parseInterview = (text) => { } // Extract bio - const bioMatch = block.match(/_简介:\s*([\s\S]*?)_\n/) + const bioMatch = block.match(/_Bio:\s*([\s\S]*?)_\n/) if (bioMatch) { interview.bio = bioMatch[1].trim().replace(/\.\.\.$/, '...') } @@ -834,13 +834,13 @@ const parseInterview = (text) => { } // Extract answers - separate Twitter and Reddit - const answerMatch = block.match(/\*\*A:\*\*\s*([\s\S]*?)(?=\*\*关键引言|$)/) + const answerMatch = block.match(/\*\*A:\*\*\s*([\s\S]*?)(?=\*\*Key Quotes|$)/) if (answerMatch) { const answerText = answerMatch[1].trim() // Separate Twitter and Reddit answers - const twitterMatch = answerText.match(/【Twitter平台回答】\n?([\s\S]*?)(?=【Reddit平台回答】|$)/) - const redditMatch = answerText.match(/【Reddit平台回答】\n?([\s\S]*?)$/) + const twitterMatch = answerText.match(/\[Twitter Platform Response\]\n?([\s\S]*?)(?=\[Reddit Platform Response\]|$)/) + const redditMatch = 
answerText.match(/\[Reddit Platform Response\]\n?([\s\S]*?)$/) if (twitterMatch) { interview.twitterAnswer = twitterMatch[1].trim() @@ -852,11 +852,11 @@ const parseInterview = (text) => { // Platform fallback logic (compatible with old format: only one platform marker) if (!twitterMatch && redditMatch) { // Only Reddit answer, copy as default display if not placeholder - if (interview.redditAnswer && interview.redditAnswer !== '(该平台未获得回复)') { + if (interview.redditAnswer && interview.redditAnswer !== '(No response from this platform)') { interview.twitterAnswer = interview.redditAnswer } } else if (twitterMatch && !redditMatch) { - if (interview.twitterAnswer && interview.twitterAnswer !== '(该平台未获得回复)') { + if (interview.twitterAnswer && interview.twitterAnswer !== '(No response from this platform)') { interview.redditAnswer = interview.twitterAnswer } } else if (!twitterMatch && !redditMatch) { @@ -866,7 +866,7 @@ const parseInterview = (text) => { } // Extract key quotes (compatible with various quote formats) - const quotesMatch = block.match(/\*\*关键引言:\*\*\n([\s\S]*?)(?=\n---|\n####|$)/) + const quotesMatch = block.match(/\*\*Key Quotes:\*\*\n([\s\S]*?)(?=\n---|\n####|$)/) if (quotesMatch) { const quotesText = quotesMatch[1] // Prefer matching > "text" format @@ -888,7 +888,7 @@ const parseInterview = (text) => { }) // Extract interview summary - const summaryMatch = text.match(/### 采访摘要与核心观点\n([\s\S]*?)$/) + const summaryMatch = text.match(/### Interview Summary and Core Views\n([\s\S]*?)$/) if (summaryMatch) { result.summary = summaryMatch[1].trim() } @@ -910,22 +910,22 @@ const parseQuickSearch = (text) => { try { // Extract search query - const queryMatch = text.match(/搜索查询:\s*(.+?)(?:\n|$)/) + const queryMatch = text.match(/Search query:\s*(.+?)(?:\n|$)/) if (queryMatch) result.query = queryMatch[1].trim() // Extract result count - const countMatch = text.match(/找到\s*(\d+)\s*条/) + const countMatch = text.match(/Found\s*(\d+)\s*related/) if (countMatch) 
result.count = parseInt(countMatch[1]) // Extract related facts - full extraction, unlimited quantity - const factsSection = text.match(/### 相关事实:\n([\s\S]*)$/) + const factsSection = text.match(/### Related facts:\n([\s\S]*)$/) if (factsSection) { const lines = factsSection[1].split('\n').filter(l => l.match(/^\d+\./)) result.facts = lines.map(l => l.replace(/^\d+\.\s*/, '').trim()).filter(Boolean) } // Try to extract edge information (if present) - const edgesSection = text.match(/### 相关边:\n([\s\S]*?)(?=\n###|$)/) + const edgesSection = text.match(/### Related edges:\n([\s\S]*?)(?=\n###|$)/) if (edgesSection) { const lines = edgesSection[1].split('\n').filter(l => l.trim().startsWith('-')) result.edges = lines.map(l => { @@ -938,7 +938,7 @@ const parseQuickSearch = (text) => { } // Try to extract node information (if present) - const nodesSection = text.match(/### 相关节点:\n([\s\S]*?)(?=\n###|$)/) + const nodesSection = text.match(/### Related nodes:\n([\s\S]*?)(?=\n###|$)/) if (nodesSection) { const lines = nodesSection[1].split('\n').filter(l => l.trim().startsWith('-')) result.nodes = lines.map(l => { @@ -1327,7 +1327,7 @@ const InterviewDisplay = { const isPlaceholderText = (text) => { if (!text) return true const t = text.trim() - return t === '(该平台未获得回复)' || t === '(该平台未获得回复)' || t === '[无回复]' + return t === '(No response from this platform)' || t === '[No response]' } // Try to split answer by question numbering @@ -1336,14 +1336,14 @@ const InterviewDisplay = { if (isPlaceholderText(answerText)) return [''] // Support two numbering formats: - // 1. "问题X:" or "问题X:" (Chinese format, new backend format) + // 1. "Question X:" (English format, new backend format) // 2. "1. " or "\n1. 
" (number+dot, old format compatible) let matches = [] let match - // Prefer to try "问题X:" format - const cnPattern = /(?:^|[\r\n]+)问题(\d+)[::]\s*/g - while ((match = cnPattern.exec(answerText)) !== null) { + // Prefer to try "Question X:" format + const qPattern = /(?:^|[\r\n]+)Question\s*(\d+)[::]\s*/g + while ((match = qPattern.exec(answerText)) !== null) { matches.push({ num: parseInt(match[1]), index: match.index, @@ -1366,7 +1366,7 @@ const InterviewDisplay = { // If no numbering or only one found, return whole text if (matches.length <= 1) { const cleaned = answerText - .replace(/^问题\d+[::]\s*/, '') + .replace(/^Question\s*\d+[::]\s*/, '') .replace(/^\d+\.\s+/, '') .trim() return [cleaned || answerText] @@ -2006,8 +2006,8 @@ const getActionLabel = (action) => { } const getLogLevelClass = (log) => { - if (log.includes('ERROR') || log.includes('错误')) return 'error' - if (log.includes('WARNING') || log.includes('警告')) return 'warning' + if (log.includes('ERROR') || log.includes('error')) return 'error' + if (log.includes('WARNING') || log.includes('warning')) return 'warning' // INFO uses default color, not marked as success return '' } @@ -2097,10 +2097,10 @@ const extractFinalContent = (response) => { return finalAnswerMatch[1].trim() } - // Try to find content after "最终答案:" - const chineseFinalMatch = response.match(/最终答案[::]\s*\n*([\s\S]*)$/i) - if (chineseFinalMatch) { - return chineseFinalMatch[1].trim() + // Try to find content after "Final Answer:" (alternate format) + const altFinalMatch = response.match(/Final Answer[:]\s*\n*([\s\S]*)$/i) + if (altFinalMatch) { + return altFinalMatch[1].trim() } // If starts with ## or # or >, it might be direct markdown content