From ac8168d1f5525483a8e936fb9bce9e006aa23488 Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Tue, 24 Mar 2026 16:33:37 +0800 Subject: [PATCH 1/6] =?UTF-8?q?=E4=B8=B0=E5=AF=8C=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ARCHITECTURE.md | 239 ++++++++++++++++++++ DEVELOPMENT.md | 181 +++++++++++++++ README.md | 26 ++- backend/README.md | 137 +++++++++++ backend/api-gateway/README.md | 130 +++++++++++ backend/services/main-application/README.md | 138 +++++++++++ backend/shared/README.md | 144 ++++++++++++ runtime/README.md | 147 ++++++++++++ runtime/datax/README.md | 151 +++++++++++++ runtime/deer-flow/README.md | 148 ++++++++++++ runtime/python-executor/README.md | 221 ++++++++++++++++++ 11 files changed, 1661 insertions(+), 1 deletion(-) create mode 100644 ARCHITECTURE.md create mode 100644 DEVELOPMENT.md create mode 100644 backend/README.md create mode 100644 backend/api-gateway/README.md create mode 100644 backend/services/main-application/README.md create mode 100644 backend/shared/README.md create mode 100644 runtime/README.md create mode 100644 runtime/datax/README.md create mode 100644 runtime/deer-flow/README.md create mode 100644 runtime/python-executor/README.md diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 000000000..394f6100e --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,239 @@ +# DataMate Architecture + +## Overview + +DataMate is a microservices-based data management platform for model fine-tuning and RAG retrieval. It follows a polyglot architecture with Java backend, Python runtime, and React frontend. 
+ +## High-Level Architecture + +``` +┌─────────────────────────────────────────────────────────────────┐ +│ Frontend (React) │ +│ localhost:5173 │ +└────────────────────────┬────────────────────────────────────────┘ + │ HTTP/REST + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ API Gateway │ +│ (Spring Cloud) │ +│ localhost:8080 │ +│ ┌──────────────────────────────────────────────────────────┐ │ +│ │ Authentication (JWT) │ │ +│ │ Route Forwarding │ │ +│ │ Rate Limiting │ │ +│ └──────────────────────────────────────────────────────────┘ │ +└────────────────┬────────────────────────────────────────────────┘ + │ + ├─────────────────┬─────────────────┐ + ▼ ▼ ▼ +┌─────────────────────────┐ ┌─────────────────────────┐ ┌─────────────────────────┐ +│ Main Application │ │ Data Management │ │ RAG Indexer │ +│ (Spring Boot) │ │ Service │ │ Service │ +│ - Data Cleaning │ │ - Dataset Mgmt │ │ - Knowledge Base │ +│ - Operator Market │ │ - File Operations │ │ - Vector Search │ +│ - Data Collection │ │ - Tag Management │ │ - Milvus Integration │ +└─────────┬───────────┘ └─────────┬───────────┘ └─────────┬───────────┘ + │ │ │ + │ │ │ + ▼ ▼ ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ PostgreSQL (Metadata) │ +│ Redis (Cache) │ +│ Milvus (Vectors) │ +│ MinIO (Files) │ +└─────────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Python Runtime (FastAPI) │ +│ localhost:18000 │ +│ ┌──────────────────────────────────────────────────────────┐ │ +│ │ Data Synthesis │ │ +│ │ Data Annotation (Label Studio) │ │ +│ │ Data Evaluation │ │ +│ │ RAG Indexing │ │ +│ └──────────────────────────────────────────────────────────┘ │ +└────────────────┬────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Ray Executor (Distributed) │ +│ 
┌──────────────────────────────────────────────────────────┐ │ +│ │ Operator Execution │ │ +│ │ Task Scheduling │ │ +│ │ Distributed Computing │ │ +│ └──────────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Components + +### Frontend Layer +- **Framework**: React 18 + TypeScript + Vite +- **UI Library**: Ant Design +- **Styling**: TailwindCSS v4 +- **State Management**: Redux Toolkit +- **Routing**: React Router v7 + +### Backend Layer (Java) +- **API Gateway**: Spring Cloud Gateway + - Route forwarding + - JWT authentication + - Rate limiting + +- **Main Application**: Spring Boot 3.5 + - Data cleaning pipeline + - Operator marketplace + - Data collection tasks + +- **Data Management Service**: Spring Boot 3.5 + - Dataset CRUD + - File operations + - Tag management + +- **RAG Indexer Service**: Spring Boot 3.5 + - Knowledge base management + - Vector search + - Milvus integration + +### Runtime Layer (Python) +- **FastAPI Backend**: Port 18000 + - Data synthesis (QA generation) + - Data annotation (Label Studio integration) + - Model evaluation + - RAG indexing + +- **Ray Executor**: Distributed execution + - Operator execution + - Task scheduling + - Multi-node parallelism + +### Operator Ecosystem +- **filter**: Data filtering (duplicates, sensitive content, quality) +- **mapper**: Data transformation (cleaning, normalization) +- **slicer**: Data segmentation (text splitting, slide extraction) +- **formatter**: Format conversion (PDF → text, slide → JSON) +- **llms**: LLM-based operators (quality evaluation, condition checking) + +## Data Flow + +### 1. Data Ingestion +``` +User Upload → Frontend → API Gateway → Data Management Service → PostgreSQL/MinIO +``` + +### 2. Data Processing +``` +Dataset → Frontend → API Gateway → Main Application → Python Runtime +→ Ray Executor → Operators → Processed Data → PostgreSQL/MinIO +``` + +### 3. 
RAG Indexing +``` +Processed Data → Python Runtime → RAG Indexer Service → Milvus (Vectors) +``` + +### 4. RAG Retrieval +``` +Query → Frontend → API Gateway → RAG Indexer Service → Milvus → Results +``` + +## Technology Stack + +| Layer | Technology | +|--------|-----------| +| **Frontend** | React 18, TypeScript, Vite, Ant Design, TailwindCSS | +| **Backend** | Spring Boot 3.5, Java 21, MyBatis-Plus, PostgreSQL | +| **Runtime** | FastAPI, Python 3.12, Ray, SQLAlchemy | +| **Vector DB** | Milvus | +| **Cache** | Redis | +| **Object Storage** | MinIO | +| **Deployment** | Docker Compose, Kubernetes/Helm | + +## Communication Patterns + +### Service-to-Service +- **REST API**: HTTP/JSON between frontend and backend +- **gRPC**: (if any) between backend services +- **Message Queue**: (if any) for async tasks + +### Backend-to-Runtime +- **HTTP/REST**: Java backend calls Python runtime APIs +- **Ray**: Python runtime submits tasks to Ray executor + +## Security + +### Authentication +- **JWT**: Token-based authentication via API Gateway +- **Session**: (if any) session management + +### Authorization +- **Role-based**: (if any) RBAC +- **Resource-based**: (if any) resource-level access control + +## Scalability + +### Horizontal Scaling +- **Backend Services**: Kubernetes pod scaling via Helm +- **Ray Executor**: Multi-node Ray cluster +- **Frontend**: Static asset serving + CDN + +### Vertical Scaling +- **Database**: PostgreSQL connection pooling +- **Cache**: Redis clustering +- **Vector DB**: Milvus cluster + +## Deployment + +### Docker Compose +```bash +make install INSTALLER=docker +``` + +### Kubernetes/Helm +```bash +make install INSTALLER=k8s +``` + +## Monitoring + +### Metrics +- **Spring Boot Actuator**: `/actuator/metrics` +- **Prometheus**: (if configured) metrics collection +- **Ray**: Ray dashboard for executor monitoring + +### Logging +- **Java**: Log4j2 +- **Python**: Ray dashboard for executor monitoring + +## Architecture Decisions + +###
Why Polyglot? +- **Java Backend**: Enterprise-grade, mature ecosystem, strong typing +- **Python Runtime**: Rich ML/AI ecosystem, flexible, fast prototyping +- **React Frontend**: Modern UI, component-based, large ecosystem + +### Why Microservices? +- **Scalability**: Independent scaling of services +- **Maintainability**: Clear service boundaries +- **Technology Diversity**: Use best tool for each job + +### Why Ray? +- **Distributed Computing**: Seamless multi-node execution +- **Fault Tolerance**: Automatic task retry and recovery +- **Resource Management**: Dynamic resource allocation + +## Future Enhancements + +- [ ] Service Mesh (Istio/Linkerd) +- [ ] Event Bus (Kafka/Pulsar) +- [ ] GraphQL API +- [ ] Real-time-Updates (WebSocket) +- [ ] Advanced Monitoring (Grafana, Loki) + +## References + +- [Backend Architecture](./backend/README.md) +- [Runtime Architecture](./runtime/README.md) +- [Frontend Architecture](./frontend/README.md) +- [AGENTS.md](./AGENTS.md) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md new file mode 100644 index 000000000..bf509f71f --- /dev/null +++ b/DEVELOPMENT.md @@ -0,0 +1,181 @@ +# DEVELOPMENT GUIDE for DataMate + +This document provides a comprehensive development guide for DataMate, a polyglot, microservices-based project consisting of Java, Python, and React components. It describes how to set up, build, test, run, and contribute in a local Docker Compose-based environment, without exposing secrets. + + + +## Overview + +DataMate is composed of multiple services (Java backend, Python runtime, and React frontend) coordinated via Docker Compose for local development. The guide below covers prerequisites, quick-start steps, project structure, development workflow, environment configuration, testing, debugging, common issues, documentation, contribution workflow, and licensing. 
+ +Refer to the component READMEs for detailed implementation notes: +- Backend: backend/README.md +- Runtime: runtime/datamate-python/README.md +- Frontend: frontend/README.md + +For code style guidelines, see AGENTS.md in the repository root. + +## Prerequisites + +- Java Development: JDK 21 and Maven +- Python: Python 3.12 and Poetry +- Node.js: Node.js 18 +- Docker and Docker Compose +- Optional: Make (for convenience) + +Notes: +- Ensure Java and Python environments are on the system PATH where applicable. +- Docker Compose will orchestrate the local development stack. + +## Quick Start + +1) Clone the repository and install dependencies: +- git clone https://github.com/your-org/datamate.git +- cd datamate +- (Optional) Create and activate a Python virtual environment if not using Poetry-managed envs. +- Build dependencies per component as described below. + +2) Start the local stack with Docker Compose: +- docker compose up -d +- This brings up the Java backend, Python runtime, and React frontend services along with any required databases and caches as defined in the docker-compose.yml. + +3) Start individual components (if you prefer not to use the Docker stack): +- Java backend + - mvn -f backend/pom.xml -DskipTests package + - Run the main application (path may vary): java -jar backend/services/main-application/target/*.jar +- Python runtime + - cd runtime/datamate-python + - poetry install + - uvicorn app.main:app --reload --port 18000 --host 0.0.0.0 +- React frontend + - cd frontend + - npm ci + - npm run dev + +4) Stop the stack: +- docker compose down + +> Tip: In a team setting, prefer Docker Compose for consistency across development environments.
+ +## Project Structure + +- backend/ +- frontend/ +- runtime/ +- deployment/ +- docs/ +- AGENTS.md (code style guidelines) +- docker/ (docker-related tooling) +- .env* files (per-component configurations, see Environment Configuration section) + +This is a polyglot project with the following language footprints: +- Java for the backend services under backend/ +- Python for the runtime under runtime/datamate-python/ +- React/TypeScript for the frontend under frontend/ + +## Development Workflow + +Language-specific workflows: + +- Java (Backend) + - Build: mvn -f backend/pom.xml -DskipTests package + - Test: mvn -f backend/pom.xml test + - Run: mvn -f backend/pom.xml -Dexec.mainClass=... spring-boot:run (or run the packaged jar) +- Python (Runtime) + - Install: cd runtime/datamate-python && poetry install + - Test: pytest + - Run: uvicorn app.main:app --reload --port 18000 --host 0.0.0.0 +- Frontend (React) + - Install: cd frontend && npm ci + - Test: No frontend tests configured + - Build: npm run build + - Run: npm run dev + +General tips: +- Use Docker Compose for a repeatable local stack. +- Run linters and tests before creating PRs. +- Keep dependencies in sync across environments. + +## Environment Configuration + +Each component can have its own environment file(s). Do not commit secrets. Use sample/.env.example files as references when available. + +- Backend + - Path: backend/.env (example keys below) + - Typical keys: DB_URL, DB_USER, DB_PASSWORD, JWT_SECRET, REDIS_URL, CLOUD_STORAGE_ENDPOINT +- Runtime (Python) + - Path: runtime/datamate-python/.env + - Typical keys: DATABASE_URL, RAY_ADDRESS, CELERY_BROKER_URL, APP_SETTINGS +- Frontend + - Path: frontend/.env + - Typical keys: VITE_API_BASE_URL, VITE_DEFAULT_LOCALE, NODE_ENV + +Notes: +- Copy the corresponding .env.example to .env and fill in values as needed. +- Do not commit .env files containing secrets. + +## Testing + +- Java: JUnit 5 tests run via Maven (mvn test). 
+- Python: pytest in runtime/datamate-python/test or relevant tests. +- Frontend: No frontend tests configured in this repo. + +## Code Style + +Code style follows the repository-wide guidelines described in AGENTS.md. See: +- AGENTS.md (root): Code style guidelines for all languages. +- Java: Follow Java conventions in backend/, in accordance with project conventions. +- Python: Follow PEP 8 and project-specific conventions in runtime/datamate-python. +- React: Follow the frontend conventions in frontend/ (TypeScript/TSX). + +Link to guidelines: AGENTS.md + +## Debugging + +- Java (Backend): Enable JPDA debugging by starting the JVM with a debug port and attach a debugger. + - Example (local): export JAVA_TOOL_OPTIONS='-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005' && java -jar path/to/app.jar + - Attach with IDE on port 5005 after launch. +- Python (Runtime): Run with debugpy listening on port 5678 to attach from IDEs. + - Example: cd runtime/datamate-python && poetry install + python -m debugpy --listen 5678 --wait-for-client -m uvicorn app.main:app --reload --port 18000 --host 0.0.0.0 +- Frontend (React): Use Node inspector to debug front-end code in dev server. + - Example: npm run dev -- --inspect-brk=9229 + +Tips: Use your preferred IDEs (IntelliJ/VSCode/WebStorm) to attach to the running processes on their respective ports. + +## Common Issues + +- Port conflicts: Check which process is using a port with lsof -i TCP:<port> or ss -ltnp. Stop or reconfigure conflicting services. +- Database connection errors: Ensure .env contains correct DATABASE_URL and credentials; ensure the database service is up in Docker Compose. +- Ray cluster issues (Python runtime): Ensure Ray is started and accessible at the configured RAY_ADDRESS; check logs for worker failures and bootstrap status. + +## Documentation + +Component READMEs provide detailed usage and design decisions.
See: +- backend/README.md +- runtime/datamate-python/README.md +- frontend/README.md +- deployment/README.md + +## Contributing + +Contributions follow a PR workflow: +- Create a feature/bugfix branch from main (e.g., feature/new-action) +- Implement changes with tests where applicable +- Run unit tests for the changed components +- Open a PR with a clear description of the changes and the rationale +- Ensure CI checks pass (build, unit tests, lint) +- Obtain reviews and address feedback +- Merge to main after approval + +## License + +Apache 2.0 + +--- + +References: +- AGENTS.md for code style guidelines: AGENTS.md +- Java dependencies: backend/pom.xml +- Node dependencies: frontend/package.json +- Python dependencies: runtime/datamate-python/pyproject.toml diff --git a/README.md b/README.md index 8b30c5973..3116fa8f7 100644 --- a/README.md +++ b/README.md @@ -113,10 +113,34 @@ make uninstall When running make uninstall, the installer will prompt once whether to delete volumes; that single choice is applied to all components. The uninstall order is: milvus -> label-studio -> datamate, which ensures the datamate network is removed cleanly after services that use it have stopped. 
+## 📚 Documentation + +### Core Documentation +- **[ARCHITECTURE.md](./ARCHITECTURE.md)** - System architecture, microservices communication, data flow +- **[DEVELOPMENT.md](./DEVELOPMENT.md)** - Local development environment setup and workflow +- **[AGENTS.md](./AGENTS.md)** - AI assistant guidelines and code style + +### Backend Documentation +- **[backend/README.md](./backend/README.md)** - Backend architecture, services, and technology stack +- **[backend/api-gateway/README.md](./backend/api-gateway/README.md)** - API Gateway configuration and routing +- **[backend/services/main-application/README.md](./backend/services/main-application/README.md)** - Main application modules +- **[backend/shared/README.md](./backend/shared/README.md)** - Shared libraries (domain-common, security-common) + +### Runtime Documentation +- **[runtime/README.md](./runtime/README.md)** - Runtime architecture and components +- **[runtime/datamate-python/README.md](./runtime/datamate-python/README.md)** - FastAPI backend service +- **[runtime/python-executor/README.md](./runtime/python-executor/README.md)** - Ray executor framework +- **[runtime/ops/README.md](./runtime/ops/README.md)** - Operator ecosystem +- **[runtime/datax/README.md](./runtime/datax/README.md)** - DataX data framework +- **[runtime/deer-flow/README.md](./runtime/deer-flow/README.md)** - DeerFlow LLM service + +### Frontend Documentation +- **[frontend/README.md](./frontend/README.md)** - React frontend application + ## 🤝 Contribution Guidelines Thank you for your interest in this project! We warmly welcome contributions from the community. Whether it's submitting -bug reports, suggesting new features, or directly participating in code development, all forms of help make the project +bug reports, suggesting new features, or directly participating in code development, all forms of help make a project better. • 📮 [GitHub Issues](../../issues): Submit bugs or feature suggestions. 
diff --git a/backend/README.md b/backend/README.md new file mode 100644 index 000000000..6b4c42fd4 --- /dev/null +++ b/backend/README.md @@ -0,0 +1,137 @@ +# DataMate Backend + +## Overview + +DataMate Backend 是基于 Spring Boot 3.5 + Java 21 的微服务架构,提供数据管理、RAG 索引、API 网关等核心功能。 + +## Architecture + +``` +backend/ +├── api-gateway/ # API Gateway + 认证 +├── services/ +│ ├── data-management-service/ # 数据集管理 +│ ├── rag-indexer-service/ # RAG 索引 +│ └── main-application/ # 主应用入口 +└── shared/ + ├── domain-common/ # DDD 构建块、异常处理 + └── security-common/ # JWT 工具 +``` + +## Services + +| Service | Port | Description | +|---------|-------|-------------| +| **main-application** | 8080 | 主应用,包含数据管理、数据清洗、算子市场等模块 | +| **api-gateway** | 8080 | API Gateway,路由转发和认证 | + +## Technology Stack + +- **Framework**: Spring Boot 3.5.6, Spring Cloud 2025.0.0 +- **Language**: Java 21 +- **Database**: PostgreSQL 8.0.33 + MyBatis-Plus 3.5.14 +- **Cache**: Redis 3.2.0 +- **Vector DB**: Milvus (via SDK 2.6.6) +- **Documentation**: SpringDoc OpenAPI 2.2.0 +- **Build**: Maven + +## Dependencies + +### External Services +- **PostgreSQL**: `datamate-database:5432` +- **Redis**: `datamate-redis:6379` +- **Milvus**: 向量数据库(RAG 索引) + +### Shared Libraries +- **domain-common**: 业务异常、系统参数、领域实体基类 +- **security-common**: JWT 工具、认证辅助 + +## Quick Start + +### Prerequisites +- JDK 21+ +- Maven 3.8+ +- PostgreSQL 12+ +- Redis 6+ + +### Build +```bash +cd backend +mvn clean install +``` + +### Run Main Application +```bash +cd backend/services/main-application +mvn spring-boot:run +``` + +### Run API Gateway +```bash +cd backend/api-gateway +mvn spring-boot:run +``` + +## Development + +### Module Structure (DDD) +``` +com.datamate.{module}/ +├── interfaces/ +│ ├── rest/ # Controllers +│ ├── dto/ # Request/Response DTOs +│ ├── converter/ # MapStruct converters +│ └── validation/ # Custom validators +├── application/ # Application services +├── domain/ +│ ├── model/ # Entities +│ └── repository/ # Repository interfaces 
+└── infrastructure/ + ├── persistence/ # Repository implementations + ├── client/ # External API clients + └── config/ # Service configuration +``` + +### Code Conventions +- **Entities**: Extend `BaseEntity`, use `@TableName("t_*")` +- **Controllers**: `@RestController` + `@RequiredArgsConstructor` +- **Services**: `@Service` + `@Transactional` +- **Error Handling**: `throw BusinessException.of(ErrorCode.XXX)` +- **MapStruct**: `@Mapper(componentModel = "spring")` + +## Testing + +```bash +# Run all tests +mvn test + +# Run specific test +mvn test -Dtest=ClassName#methodName + +# Run specific module tests +mvn -pl services/data-management-service -am test +``` + +## Configuration + +### Environment Variables +- `DB_USERNAME`: Database username +- `DB_PASSWORD`: Database password +- `REDIS_PASSWORD`: Redis password +- `JWT_SECRET`: JWT secret key + +### Profiles +- `application.yml`: Default configuration +- `application-dev.yml`: Development overrides + +## Documentation + +- **API Docs**: http://localhost:8080/api/swagger-ui.html +- **AGENTS.md**: See `backend/shared/AGENTS.md` for shared libraries +- **Service Docs**: See individual service READMEs + +## Related Links + +- [Spring Boot Documentation](https://docs.spring.io/spring-boot/) +- [MyBatis-Plus Documentation](https://baomidou.com/) +- [PostgreSQL Documentation](https://www.postgresql.org/docs/) diff --git a/backend/api-gateway/README.md b/backend/api-gateway/README.md new file mode 100644 index 000000000..afba70c3d --- /dev/null +++ b/backend/api-gateway/README.md @@ -0,0 +1,130 @@ +# API Gateway + +## Overview + +API Gateway 是 DataMate 的统一入口,基于 Spring Cloud Gateway 实现,负责路由转发、JWT 认证和限流。 + +## Architecture + +``` +backend/api-gateway/ +├── src/main/java/com/datamate/gateway/ +│ ├── config/ # Gateway configuration +│ ├── filter/ # JWT authentication filter +│ └── route/ # Route definitions +└── src/main/resources/ + └── application.yml # Gateway configuration +``` + +## Configuration + +### Port +-
**Default**: 8080 +- **Nacos Discovery Port**: 30000 + +### Key Configuration +```yaml +spring: + application: + name: datamate-gateway + cloud: + nacos: + discovery: + port: 30000 + server-addr: ${NACOS_ADDR} + username: consul + password: +datamate: + jwt: + secret: ${JWT_SECRET} + expiration-seconds: 3600 +``` + +## Features + +### 1. Route Forwarding +- 将前端请求转发到对应的后端服务 +- 支持负载均衡 +- 路径重写 + +### 2. JWT Authentication +- 基于 JWT Token 的认证 +- Token 验证和过期检查 +- 用户上下文传递 + +### 3. Rate Limiting +- (如果配置)请求频率限制 +- 防止 API 滥用 + +## Quick Start + +### Prerequisites +- JDK 21+ +- Maven 3.8+ +- Nacos 服务(如果使用服务发现) + +### Build +```bash +cd backend/api-gateway +mvn clean install +``` + +### Run +```bash +cd backend/api-gateway +mvn spring-boot:run +``` + +## Development + +### 添加新路由 +在 `application.yml` 或通过 Nacos 配置路由规则: + +```yaml +spring: + cloud: + gateway: + routes: + - id: data-management + uri: lb://data-management-service + predicates: + - Path=/api/data-management/** + filters: + - StripPrefix=3 +``` + +### 添加自定义过滤器 +创建 `GlobalFilter` 或 `GatewayFilter`: + +```java +@Component +public class AuthFilter implements GlobalFilter { + @Override + public Mono<Void> filter(ServerWebExchange exchange, GatewayFilterChain chain) { + // Filter logic + return chain.filter(exchange); + } +} +``` + +## Testing + +### 测试路由转发 +```bash +curl http://localhost:8080/api/data-management/datasets +``` + +### 测试 JWT 认证 +```bash +curl -H "Authorization: Bearer <token>" http://localhost:8080/api/protected-endpoint +``` + +## Documentation + +- **Spring Cloud Gateway Docs**: https://docs.spring.io/spring-cloud-gateway/ +- **Nacos Discovery**: https://nacos.io/ + +## Related Links + +- [Backend README](../README.md) +- [Main Application README](../services/main-application/README.md) diff --git a/backend/services/main-application/README.md b/backend/services/main-application/README.md new file mode 100644 index 000000000..be122675d --- /dev/null +++ b/backend/services/main-application/README.md @@ -0,0 +1,138
@@ +# Main Application + +## Overview + +Main Application 是 DataMate 的核心 Spring Boot 服务,包含数据管理、数据清洗、算子市场、数据收集等主要功能模块。 + +## Architecture + +``` +backend/services/main-application/ +├── src/main/java/com/datamate/main/ +│ ├── interfaces/ +│ │ ├── rest/ # Controllers +│ │ ├── dto/ # Request/Response DTOs +│ │ └── converter/ # MapStruct converters +│ ├── application/ # Application services +│ ├── domain/ +│ │ ├── model/ # Entities +│ │ └── repository/ # Repository interfaces +│ └── infrastructure/ +│ ├── persistence/ # Repository implementations +│ ├── client/ # External API clients +│ └── config/ # Service configuration +└── src/main/resources/ + ├── application.yml # Main configuration + ├── config/application-datamanagement.yml # Data management config + └── config/application-datacollection.yml # Data collection config +``` + +## Modules + +### 1. Data Management +- 数据集 CRUD 操作 +- 文件上传/下载 +- 标签管理 +- 数据集版本控制 + +### 2. Data Cleaning +- 数据清洗管道 +- 数据质量检查 +- 数据去重 +- 数据格式转换 + +### 3. Operator Market +- 算子上传/下载 +- 算子版本管理 +- 算子分类和搜索 +- 算子执行配置 + +### 4. Data Collection +- 数据源配置 +- 定时数据收集任务 +- 数据同步 +- 数据导入/导出 + +## Configuration + +### Port +- **Default**: 8080 +- **Context Path**: `/api` + +### Key Configuration +```yaml +server: + port: 8080 + servlet: + context-path: /api + +datamate: + data-management: + base-path: /dataset + operator-market: + repository-path: ./runtime/operators + max-upload-size: 50MB + ray: + enabled: false + address: ray://localhost:10001 +``` + +## Quick Start + +### Prerequisites +- JDK 21+ +- Maven 3.8+ +- PostgreSQL 12+ +- Redis 6+ + +### Build +```bash +cd backend/services/main-application +mvn clean install +``` + +### Run +```bash +cd backend/services/main-application +mvn spring-boot:run +``` + +## Development + +### 添加新模块 +1. 在 `domain/model/` 创建实体类 +2. 在 `domain/repository/` 创建 repository 接口 +3. 在 `infrastructure/persistence/` 实现 repository +4. 在 `application/` 创建 application service +5. 
在 `interfaces/rest/` 创建 controller + +### 集成 Ray Executor +```yaml +datamate: + ray: + enabled: true + address: ray://localhost:10001 +``` + +## Testing + +### 运行测试 +```bash +cd backend/services/main-application +mvn test +``` + +### 运行特定测试 +```bash +mvn test -Dtest=DatasetControllerTest +``` + +## Documentation + +- **Spring Boot Docs**: https://docs.spring.io/spring-boot/ +- [AGENTS.md](../../shared/AGENTS.md) + +## Related Links + +- [Backend README](../../README.md) +- [API Gateway README](../../api-gateway/README.md) diff --git a/backend/shared/README.md b/backend/shared/README.md new file mode 100644 index 000000000..69b17856d --- /dev/null +++ b/backend/shared/README.md @@ -0,0 +1,144 @@ +# Shared Libraries + +## Overview + +Shared Libraries 包含所有后端服务共用的代码和工具,包括领域构建块、异常处理、JWT 工具等。 + +## Architecture + +``` +backend/shared/ +├── domain-common/ # DDD 构建块、异常处理 +│ └── src/main/java/com/datamate/common/ +│ ├── infrastructure/exception/ # BusinessException, ErrorCode +│ ├── setting/ # System params, model configs +│ └── domain/ # Base entities, repositories +└── security-common/ # JWT 工具、认证辅助 + └── src/main/java/com/datamate/security/ +``` + +## Libraries + +### 1. 
domain-common + +#### BusinessException +统一的业务异常处理机制: + +```java +// 抛出业务异常 +throw BusinessException.of(ErrorCode.DATASET_NOT_FOUND) + .withDetail("dataset_id", datasetId); + +// 带上下文的异常 +throw BusinessException.of(ErrorCode.VALIDATION_FAILED) + .withDetail("field", "email") + .withDetail("reason", "Invalid format"); +``` + +#### ErrorCode +错误码枚举接口: + +```java +public interface ErrorCode { + String getCode(); + String getMessage(); + HttpStatus getHttpStatus(); +} + +// 示例 +public enum CommonErrorCode implements ErrorCode { + SUCCESS("0000", "Success", HttpStatus.OK), + DATABASE_NOT_FOUND("4001", "Database not found", HttpStatus.NOT_FOUND); +} +``` + +#### BaseEntity +所有实体的基类,包含审计字段: + +```java +@Data +@EqualsAndHashCode(callSuper = true) +public class BaseEntity implements Serializable { + @TableId(type = IdType.ASSIGN_ID) + private String id; + + @TableField(fill = FieldFill.INSERT) + private LocalDateTime createdAt; + + @TableField(fill = FieldFill.INSERT_UPDATE) + private LocalDateTime updatedAt; + + @TableField(fill = FieldFill.INSERT) + private String createdBy; + + @TableField(fill = FieldFill.INSERT_UPDATE) + private String updatedBy; +} +``` + +### 2. 
security-common + +#### JWT Utilities +JWT Token 生成和验证: + +```java +// 生成 Token +String token = JwtUtil.generateToken(userId, secret, expiration); + +// 验证 Token +Claims claims = JwtUtil.validateToken(token, secret); +String userId = claims.getSubject(); +``` + +## Usage + +### 在服务中使用共享库 + +#### Maven 依赖 +```xml +<dependency> + <groupId>com.datamate</groupId> + <artifactId>domain-common</artifactId> + <version>1.0.0-SNAPSHOT</version> +</dependency> +<dependency> + <groupId>com.datamate</groupId> + <artifactId>security-common</artifactId> + <version>1.0.0-SNAPSHOT</version> +</dependency> +``` + +#### 使用 BusinessException +```java +@RestController +@RequiredArgsConstructor +public class DatasetController { + + public ResponseEntity<DatasetResponse> getDataset(String id) { + Dataset dataset = datasetService.findById(id); + if (dataset == null) { + throw BusinessException.of(ErrorCode.DATASET_NOT_FOUND); + } + return ResponseEntity.ok(DatasetResponse.from(dataset)); + } +} +``` + +## Quick Start + +### 构建共享库 +```bash +cd backend +mvn clean install +``` + +### 在服务中使用 +共享库会自动被所有后端服务继承。 + +## Documentation + +- [AGENTS.md](./AGENTS.md) + +## Related Links + +- [Backend README](../README.md) diff --git a/runtime/README.md b/runtime/README.md new file mode 100644 index 000000000..da1a53d77 --- /dev/null +++ b/runtime/README.md @@ -0,0 +1,147 @@ +# DataMate Runtime + +## Overview + +DataMate Runtime 提供数据处理、算子执行、数据收集等核心功能,基于 Python 3.12+ 和 FastAPI 框架。 + +## Architecture + +``` +runtime/ +├── datamate-python/ # FastAPI 后端服务(port 18000) +├── python-executor/ # Ray 分布式执行器 +├── ops/ # 算子生态 +├── datax/ # DataX 数据读写框架 +└── deer-flow/ # DeerFlow 服务 +``` + +## Components + +### 1. datamate-python (FastAPI Backend) +**Port**: 18000 + +核心 Python 服务,提供以下功能: +- **数据合成**: QA 生成、文档处理 +- **数据标注**: Label Studio 集成、自动标注 +- **数据评估**: 模型评估、质量检查 +- **数据清洗**: 数据清洗管道 +- **算子市场**: 算子管理、上传 +- **RAG 索引**: 向量索引、知识库管理 +- **数据收集**: 定时任务、数据源集成 + +**Technology Stack**: +- FastAPI 0.124+ +- SQLAlchemy 2.0+ (async) +- Pydantic 2.12+ +- PostgreSQL (via asyncpg) +- Milvus (via pymilvus) +- APScheduler (定时任务) + +### 2.
python-executor (Ray Executor) +Ray 分布式执行框架,负责: +- **算子执行**: 执行数据处理算子 +- **任务调度**: 异步任务管理 +- **分布式计算**: 多节点并行处理 + +**Technology Stack**: +- Ray 2.7.0 +- FastAPI (执行器 API) +- Data-Juicer (数据处理) + +### 3. ops (Operator Ecosystem) +算子生态,包含: +- **filter**: 数据过滤(去重、敏感内容、质量过滤) +- **mapper**: 数据转换(清洗、归一化) +- **slicer**: 数据切片(文本分割、幻灯片提取) +- **formatter**: 格式转换(PDF → text, slide → JSON) +- **llms**: LLM 算子(质量评估、条件检查) +- **annotation**: 标注算子(目标检测、分割) + +**See**: `runtime/ops/README.md` for operator development guide. + +### 4. datax (DataX Framework) +DataX 数据读写框架,支持多种数据源: +- **Readers**: MySQL, PostgreSQL, Oracle, MongoDB, Elasticsearch, HDFS, S3, NFS, GlusterFS, API, 等 +- **Writers**: 同上,支持写入目标 + +**Technology Stack**: Java (Maven 构建) + +### 5. deer-flow (DeerFlow Service) +DeerFlowService(配置见 `conf.yaml`)。 + +## Quick Start + +### Prerequisites +- Python 3.12+ +- Poetry (for datamate-python) +- Ray 2.7.0+ (for python-executor) + +### Run datamate-python +```bash +cd runtime/datamate-python +poetry install +poetry run uvicorn app.main:app --reload --port 18000 +``` + +### Run python-executor +```bash +cd runtime/python-executor +poetry install +ray start --head +``` + +## Development + +### datamate-python Module Structure +``` +app/ +├── core/ # Logging, exception, config +├── db/ +│ ├── models/ # SQLAlchemy models +│ └── session.py # Async session +├── module/ +│ ├── annotation/ # Label Studio integration +│ ├── collection/ # Data collection +│ ├── cleaning/ # Data cleaning +│ ├── dataset/ # Dataset management +│ ├── evaluation/ # Model evaluation +│ ├── generation/ # QA synthesis +│ ├── operator/ # Operator marketplace +│ ├── rag/ # RAG indexing +│ └── shared/ # Shared schemas +└── main.py # FastAPI entry +``` + +### Code Conventions +- **Routes**: `APIRouter` in `interface/*.py` +- **DI**: `Depends(get_db)` for session +- **Error**: `raise BusinessError(ErrorCodes.XXX, context)` +- **Transaction**: `async with transaction(db):` +- **Models**: Extend `BaseEntity` (audit 
fields auto-filled) + +## Testing + +```bash +cd runtime/datamate-python +poetry run pytest +``` + +## Configuration + +### Environment Variables +- `DATABASE_URL`: PostgreSQL connection string +- `LABEL_STUDIO_BASE_URL`: Label Studio URL +- `RAY_ENABLED`: Enable Ray executor +- `RAY_ADDRESS`: Ray cluster address + +## Documentation + +- **API Docs**: http://localhost:18000/redoc +- **AGENTS.md**: See `runtime/datamate-python/app/AGENTS.md` for detailed module docs +- **Operator Guide**: See `runtime/ops/README.md` for operator development + +## Related Links + +- [FastAPI Documentation](https://fastapi.tiangolo.com/) +- [Ray Documentation](https://docs.ray.io/) +- [SQLAlchemy Documentation](https://docs.sqlalchemy.org/) diff --git a/runtime/datax/README.md b/runtime/datax/README.md new file mode 100644 index 000000000..85ac02135 --- /dev/null +++ b/runtime/datax/README.md @@ -0,0 +1,151 @@ +# DataX Framework + +## Overview + +DataX 是一个数据传输框架,支持多种数据源和数据目标之间的数据传输,用于数据收集和同步。 + +## Architecture + +``` +runtime/datax/ +├── core/ # DataX 核心组件 +├── transformer/ # 数据转换器 +├── readers/ # 数据读取器 +│ ├── mysqlreader/ +│ ├── postgresqlreader/ +│ ├── oracleReader/ +│ ├── mongodbreader/ +│ ├── hdfsreader/ +│ ├── s3reader/ +│ ├── nfsreader/ +│ ├── glusterfsreader/ +│ └── apireader/ +└── writers/ # 数据写入器 + ├── mysqlwriter/ + ├── postgresqlwriter/ + ├── oraclewriter/ + ├── mongodbwriter/ + ├── hdfswriter/ + ├── s3writer/ + ├── nfswriter/ + ├── glusterfswriter/ + └── txtfilewriter/ +``` + +## Supported Data Sources + +### 关系型数据库 +- MySQL +- PostgreSQL +- Oracle +- SQL Server +- DB2 +- KingbaseES +- GaussDB + +### NoSQL 数据库 +- MongoDB +- Elasticsearch +- Cassandra +- HBase +- Redis + +### 文件系统 +- HDFS +- S3 (AWS S3, MinIO, 阿里云 OSS) +- NFS +- GlusterFS +- 本地文件系统 + +### 其他 +- API 接口 +- Kafka +- Pulsar +- DataHub +- LogHub + +## Usage + +### 基本配置 +```json +{ + "job": { + "content": [ + { + "reader": { + "name": "mysqlreader", + "parameter": { + "username": "root", + "password": "password", 
+ "column": ["id", "name", "email"], + "connection": [ + { + "jdbcUrl": "jdbc:mysql://localhost:3306/database", + "table": ["users"] + } + ] + } + }, + "writer": { + "name": "txtfilewriter", + "parameter": { + "path": "/output/users.txt", + "fileName": "users", + "writeMode": "truncate" + } + } + } + ] + } +} +``` + +### 运行 DataX +```bash +# 构建 DataX +cd runtime/datax +mvn clean package + +# 运行 +python datax.py -j job.json +``` + +## Quick Start + +### Prerequisites +- JDK 8+ +- Maven 3.8+ +- Python 3.6+ + +### 构建 +```bash +cd runtime/datax +mvn clean package +``` + +### 运行示例 +```bash +python datax.py -j examples/mysql2text.json +``` + +## Development + +### 添加新的 Reader +1. 在 `readers/` 创建新模块 +2. 实现 Reader 接口 +3. 配置 reader 参数 +4. 添加到 package.xml + +### 添加新的 Writer +1. 在 `writers/` 创建新模块 +2. 实现 Writer 接口 +3. 配置 writer 参数 +4. 添加到 package.xml + +## Documentation + +- [DataX 官方文档](https://github.com/alibaba/DataX) + +## Related Links + +- [Runtime README](../README.md) diff --git a/runtime/deer-flow/README.md b/runtime/deer-flow/README.md new file mode 100644 index 000000000..5870ec44d --- /dev/null +++ b/runtime/deer-flow/README.md @@ -0,0 +1,148 @@ +# DeerFlow Service + +## Overview + +DeerFlow 是一个 LLM 驱动的服务,用于规划和推理任务,支持多种 LLM 提供商。 + +## Architecture + +``` +runtime/deer-flow/ +├── conf.yaml # DeerFlow 配置文件 +├── .env # 环境变量 +└── (其他源代码) +``` + +## Configuration + +### 基本配置 (conf.yaml) + +```yaml +# 基础模型配置 +BASIC_MODEL: + base_url: https://api.example.com/v1 + model: "model-name" + api_key: your_api_key + max_retries: 3 + verify_ssl: false # 如果使用自签名证书,设为 false + +# 推理模型配置(可选) +REASONING_MODEL: + base_url: https://api.example.com/v1 + model: "reasoning-model-name" + api_key: your_api_key + max_retries: 3 + +# 搜索引擎配置(可选) +SEARCH_ENGINE: + engine: tavily + include_domains: + - example.com + - trusted-news.com + exclude_domains: + - spam-site.com + search_depth: "advanced" + include_raw_content: true + include_images: true + include_image_descriptions: true + 
min_score_threshold: 0.0 + max_content_length_per_page: 4000 +``` + +### 支持的 LLM 提供商 + +#### OpenAI +```yaml +BASIC_MODEL: + base_url: https://api.openai.com/v1 + model: "gpt-4" + api_key: sk-... +``` + +#### Ollama (本地部署) +```yaml +BASIC_MODEL: + base_url: "http://localhost:11434/v1" + model: "qwen2:7b" + api_key: "ollama" + verify_ssl: false +``` + +#### Google AI Studio +```yaml +BASIC_MODEL: + platform: "google_aistudio" + model: "gemini-2.5-flash" + api_key: your_gemini_api_key +``` + +#### 火山引擎 (豆包) +```yaml +BASIC_MODEL: + base_url: https://ark.cn-beijing.volces.com/api/v3 + model: "doubao-1-5-pro-32k-250115" + api_key: your_api_key +``` + +## Quick Start + +### Prerequisites +- Python 3.8+ +- LLM API Key 或本地 LLM + +### 配置 +1. 复制 `conf.yaml.example` 为 `conf.yaml` +2. 配置 LLM 提供商和 API Key +3. (可选)配置推理模型和搜索引擎 + +### 运行 +```bash +cd runtime/deer-flow +python -m deerflow +``` + +## Usage + +### 基本规划 +```python +from deerflow import DeerFlow + +flow = DeerFlow() +result = flow.plan( + task="设计一个数据处理流程", + context="需要处理CSV文件,进行数据清洗和转换" +) +print(result) +``` + +### 推理任务 +```python +from deerflow import DeerFlow + +flow = DeerFlow() +result = flow.reason( + task="分析数据质量", + context="数据包含缺失值和异常值" +) +print(result) +``` + +## Development + +### 添加新的 LLM 提供商 +1. 在 `conf.yaml` 添加新的模型配置 +2. 实现对应的 API 调用逻辑 +3. 测试连接和推理 + +### 自定义提示词模板 +1. 创建提示词模板文件 +2. 在 `conf.yaml` 引用模板 +3. 
测试提示词效果 + +## Documentation + +- [DeerFlow 官方文档](https://github.com/ModelEngine-Group/DeerFlow) + +## Related Links + +- [Runtime README](../README.md) diff --git a/runtime/python-executor/README.md b/runtime/python-executor/README.md new file mode 100644 index 000000000..b9580d915 --- /dev/null +++ b/runtime/python-executor/README.md @@ -0,0 +1,221 @@ +# Ray Executor + +## Overview + +Ray Executor 是基于 Ray 的分布式执行框架,负责执行数据处理算子、任务调度和分布式计算。 + +## Architecture + +``` +runtime/python-executor/ +└── datamate/ + ├── core/ + │ ├── base_op.py # BaseOp, Mapper, Filter, Slicer, LLM + │ ├── dataset.py # Dataset 处理 + │ └── constant.py # 常量定义 + ├── scheduler/ + │ ├── scheduler.py # TaskScheduler, Task, TaskStatus + │ ├── func_task_scheduler.py # 函数任务调度 + │ └── cmd_task_scheduler.py # 命令任务调度 + ├── wrappers/ + │ ├── executor.py # Ray 执行器入口 + │ ├── datamate_wrapper.py # DataMate 任务包装 + │ └── data_juicer_wrapper.py # DataJuicer 集成 + └── common/utils/ # 工具函数 + ├── bytes_transform.py + ├── file_scanner.py + ├── lazy_loader.py + └── text_splitter.py +``` + +## Components + +### 1. Base Classes + +#### BaseOp +所有算子的基类: + +```python +class BaseOp: + def __init__(self, *args, **kwargs): + self.accelerator = kwargs.get('accelerator', "cpu") + self.text_key = kwargs.get('text_key', "text") + # ... 其他配置 + + def execute(self, sample): + raise NotImplementedError +``` + +#### Mapper +数据转换算子基类(1:1): + +```python +class Mapper(BaseOp): + def execute(self, sample: Dict) -> Dict: + # 转换逻辑 + return processed_sample +``` + +#### Filter +数据过滤算子基类(返回 bool): + +```python +class Filter(BaseOp): + def execute(self, sample: Dict) -> bool: + # 过滤逻辑 + return True # 保留或过滤 +``` + +#### Slicer +数据切片算子基类(1:N): + +```python +class Slicer(BaseOp): + def execute(self, sample: Dict) -> List[Dict]: + # 切片逻辑 + return [sample1, sample2, ...] 
+``` + +#### LLM +LLM 算子基类: + +```python +class LLM(Mapper): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.llm = self.get_llm(*args, **kwargs) + + def build_llm_prompt(self, *args, **kwargs): + raise NotImplementedError +``` + +### 2. Task Scheduler + +异步任务调度器: + +```python +class TaskScheduler: + def __init__(self, max_concurrent: int = 10): + self.tasks: Dict[str, Task] = {} + self.semaphore = asyncio.Semaphore(max_concurrent) + + async def submit(self, task_id, task, *args, **kwargs): + # 提交任务 + pass + + def get_task_status(self, task_id: str) -> Optional[TaskResult]: + # 获取任务状态 + pass + + def cancel_task(self, task_id: str) -> bool: + # 取消任务 + pass +``` + +### 3. Operator Execution + +#### 算子注册 +```python +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name='YourOperatorName', + module_path="ops.user.operator_package.process" +) +``` + +#### 执行算子 +```python +from datamate.core.base_op import Mapper + +class MyMapper(Mapper): + def execute(self, sample): + text = sample.get('text', '') + processed = text.upper() + sample['text'] = processed + return sample +``` + +## Quick Start + +### Prerequisites +- Python 3.11+ +- Ray 2.7.0+ +- Poetry + +### 安装 +```bash +cd runtime/python-executor +poetry install +``` + +### 启动 Ray Head +```bash +ray start --head +``` + +### 启动 Ray Worker +```bash +ray start --address=<head-node-ip>:6379 +``` + +## Usage + +### 提交任务到 Ray +```python +import ray + +@ray.remote +def execute_operator(sample, operator_config): + # 执行算子逻辑 + return result + +# 提交任务 +result_ref = execute_operator.remote(sample, config) +result = ray.get(result_ref) +``` + +### 使用 Task Scheduler +```python +from datamate.scheduler.scheduler import TaskScheduler + +scheduler = TaskScheduler(max_concurrent=10) +task_id = "task-001" +await scheduler.submit(task_id, my_function, arg1, arg2) # submit 是协程,需在 async 函数中调用 +status = scheduler.get_task_status(task_id) +``` + +## Development + +### 添加新算子 +1. 在 `runtime/ops/` 创建算子目录 +2. 
实现 `process.py` 和 `__init__.py` +3. 在 `__init__.py` 注册算子 +4. 测试算子 + +### 调试算子 +```bash +# 本地测试 +python -c "from ops.user.operator_package.process import YourOperatorName; op = YourOperatorName(); print(op.execute({'text': 'test'}))" +``` + +## Performance + +### 并行执行 +Ray 自动处理并行执行和资源分配。 + +### 容错 +Ray 提供自动任务重试和故障转移。 + +### 资源管理 +Ray 动态分配 CPU、GPU、内存资源。 + +## Documentation + +- [Ray 文档](https://docs.ray.io/) +- [AGENTS.md](./AGENTS.md) + +## Related Links + +- [Runtime README](../README.md) +- [Operator Ecosystem](../ops/README.md) From fe1804b754798436e54d8891af81580bb954ccec Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Tue, 24 Mar 2026 17:13:51 +0800 Subject: [PATCH 2/6] docs: add high and medium priority documentation - Add comprehensive documentation for backend, runtime, and core modules - Add backend/README.md with architecture, services, and tech stack - Add runtime/README.md with components and architecture - Add DEVELOPMENT.md with local development setup guide - Add ARCHITECTURE.md with system architecture and data flow - Add backend/api-gateway/README.md with gateway configuration - Add backend/services/main-application/README.md with module overview - Add backend/shared/README.md with shared libraries documentation - Add runtime/datax/README.md with DataX framework guide - Add runtime/deer-flow/README.md with DeerFlow service guide - Add runtime/python-executor/README.md with Ray executor guide - Update README.md with documentation index - Update README-zh.md with documentation index Files: backend/README.md, runtime/README.md, DEVELOPMENT.md, ARCHITECTURE.md, backend/api-gateway/README.md, backend/services/main-application/README.md, backend/shared/README.md, runtime/datax/README.md, runtime/deer-flow/README.md, runtime/python-executor/README.md, README.md, README-zh.md --- .sisyphus/boulder.json | 19 + .sisyphus/plans/add-documentation.md | 655 +++++++++++++++++++++++++++ README-zh.md | 24 + 3 files changed, 698 insertions(+) create 
mode 100644 .sisyphus/boulder.json create mode 100644 .sisyphus/plans/add-documentation.md diff --git a/.sisyphus/boulder.json b/.sisyphus/boulder.json new file mode 100644 index 000000000..54d1434b2 --- /dev/null +++ b/.sisyphus/boulder.json @@ -0,0 +1,19 @@ +{ + "active_plan": "/Users/hsc/Applications/opensource/DataMate/.sisyphus/plans/add-documentation.md", + "started_at": "2026-03-24T07:53:28.467Z", + "session_ids": [ + "ses_2e1786466ffebN7bw1pqZhHWwD", + "ses_2e1236f1affeab7qN32dYM1ZFB", + "ses_2e1234d66ffeEgvMQ5SVS2y6uo", + "ses_2e1230e90ffe3uKb0MBfgLgHFC", + "ses_2e122f9fcffeMh5lU9ipSmz0hq", + "ses_2e122e4cfffe0GGn3fQXMGnUm8", + "ses_2e122cd8effe9fkKbT7Od5cx5B", + "ses_2e122b2ccffeHHEWypTyHV4lLq", + "ses_2e1229ca2ffe5toRFk1auHa8bc", + "ses_2e12288b7ffeq30gRXccrzmAdU", + "ses_2e1232d73ffe8ibloZlm0mzG02" + ], + "plan_name": "add-documentation", + "agent": "atlas" +} \ No newline at end of file diff --git a/.sisyphus/plans/add-documentation.md b/.sisyphus/plans/add-documentation.md new file mode 100644 index 000000000..0e26eccb3 --- /dev/null +++ b/.sisyphus/plans/add-documentation.md @@ -0,0 +1,655 @@ +# Add High and Medium Priority Documentation + +## TL;DR + +> **Quick Summary**: Add 10 missing documentation files (4 high priority, 6 medium priority) to improve project coverage from 60% to ~85%. +> +> **Deliverables**: 10 new documentation files with comprehensive content +> - backend/README.md +> - runtime/README.md +> - DEVELOPMENT.md +> - ARCHITECTURE.md +> - backend/api-gateway/README.md +> - backend/services/main-application/README.md +> - backend/shared/README.md +> - runtime/datax/README.md +> - runtime/deer-flow/README.md +> - runtime/python-executor/README.md +> +> **Estimated Effort**: Short +> **Parallel Execution**: YES - 10 parallel tasks +> **Critical Path**: None (all independent) + +--- + +## Context + +### Original Request +User requested to add high and medium priority documentation files to DataMate project. 
+ +### Analysis Summary +**Current Documentation Coverage**: ~60% +- Existing: 23 README.md + 8 AGENTS.md +- Missing: 15+ critical documentation files + +**Key Findings**: +- Backend has no overall README +- Runtime has no overall README +- No development guide for local setup +- No architecture documentation +- Individual service READMEs missing + +--- + +## Work Objectives + +### Core Objective +Create comprehensive documentation for high and medium priority modules to improve project maintainability and onboarding experience. + +### Concrete Deliverables +- 4 high-priority docs: backend/README.md, runtime/README.md, DEVELOPMENT.md, ARCHITECTURE.md +- 6 medium-priority docs: service and component READMEs + +### Definition of Done +- [ ] All 10 documentation files created +- [ ] Each file has proper structure (Overview, Quick Start, Development) +- [ ] Links to related documentation included +- [ ] Code examples where applicable + +### Must Have +- Clear overview of each component +- Quick start instructions +- Technology stack information +- Development guidelines +- Links to related docs + +### Must NOT Have (Guardrails) +- Generic "placeholder" content +- Outdated information +- Broken internal links +- Duplicate content from other docs + +--- + +## Verification Strategy + +> **ZERO HUMAN INTERVENTION** — ALL verification is agent-executed. 
+ +### Test Decision +- **Infrastructure exists**: NO +- **Automated tests**: None +- **Framework**: None + +### QA Policy +Every task MUST include agent-executed QA scenarios: +- Verify file exists +- Verify file is not empty +- Verify markdown syntax +- Verify internal links work + +--- + +## Execution Strategy + +### Parallel Execution Waves + +``` +Wave 1 (Start Immediately — all docs independent): +├── Task 1: Create backend/README.md [quick] +├── Task 2: Create runtime/README.md [quick] +├── Task 3: Create DEVELOPMENT.md [quick] +├── Task 4: Create ARCHITECTURE.md [quick] +├── Task 5: Create backend/api-gateway/README.md [quick] +├── Task 6: Create backend/services/main-application/README.md [quick] +├── Task 7: Create backend/shared/README.md [quick] +├── Task 8: Create runtime/datax/README.md [quick] +├── Task 9: Create runtime/deer-flow/README.md [quick] +└── Task 10: Create runtime/python-executor/README.md [quick] + +Wave FINAL (After ALL tasks): +├── Task F1: Verify all files exist [quick] +└── Task F2: Verify no broken links [quick] + +Critical Path: None (all independent) +Parallel Speedup: ~90% faster than sequential +Max Concurrent: 10 +``` + +### Dependency Matrix + +- **1-10**: — — F1, F2, 1 +- **F1**: 1-10 — F2, 2 +- **F2**: 1-10, F1 — 3 + +### Agent Dispatch Summary + +- **1**: **10** — T1-T10 → `quick` +- **2**: **2** — F1 → `quick`, F2 → `quick` + +--- + +## TODOs + +- [ ] 1. 
Create backend/README.md + + **What to do**: + - Create comprehensive README for backend module + - Include: Overview, Architecture, Services, Tech Stack, Quick Start, Development, Testing + - Reference: backend/pom.xml, services/pom.xml, AGENTS.md + + **Must NOT do**: + - Duplicate content from individual service READMEs + - Include outdated configuration examples + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Simple file creation with well-defined structure + - **Skills**: `[]` + - **Skills Evaluated but Omitted**: None needed + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 1 (with Tasks 2-10) + - **Blocks**: F1, F2 + - **Blocked By**: None + + **References**: + - `backend/pom.xml` - Module structure and dependencies + - `backend/services/pom.xml` - Service modules + - `backend/shared/AGENTS.md` - Shared libraries documentation + + **Acceptance Criteria**: + - [ ] File created: backend/README.md + - [ ] File is valid markdown (can be parsed) + - [ ] Contains all required sections + + **QA Scenarios**: + ``` + Scenario: Verify backend/README.md exists and is valid + Tool: Bash + Preconditions: None + Steps: + 1. Check file exists: test -f backend/README.md + 2. Check file is not empty: test -s backend/README.md + 3. Check line count > 50: wc -l backend/README.md + Expected Result: File exists, not empty, >50 lines + Failure Indicators: File not found, empty file, too short + Evidence: .sisyphus/evidence/task-1-backend-readme-verify.txt + + Scenario: Verify markdown syntax + Tool: Bash + Preconditions: File exists + Steps: + 1. Check for proper markdown headers: grep -c "^#" backend/README.md + Expected Result: At least 5 markdown headers found + Failure Indicators: No headers found + Evidence: .sisyphus/evidence/task-1-markdown-syntax.txt + ``` + + **Commit**: NO (group with final task) + +- [ ] 2. 
Create runtime/README.md + + **What to do**: + - Create comprehensive README for runtime module + - Include: Overview, Components (datamate-python, python-executor, ops, datax, deer-flow), Tech Stack, Quick Start, Development + - Reference: runtime/datamate-python/pyproject.toml, AGENTS.md files + + **Must NOT do**: + - Duplicate content from individual component READMEs + - Include outdated Ray configuration + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Simple file creation with well-defined structure + - **Skills**: `[]` + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 1 (with Tasks 1, 3-10) + - **Blocks**: F1, F2 + - **Blocked By**: None + + **References**: + - `runtime/datamate-python/pyproject.toml` - Python dependencies and project info + - `runtime/datamate-python/app/AGENTS.md` - Python backend docs + - `runtime/ops/AGENTS.md` - Operator ecosystem docs + - `runtime/python-executor/AGENTS.md` - Ray executor docs + + **Acceptance Criteria**: + - [ ] File created: runtime/README.md + - [ ] File is valid markdown + - [ ] Contains all component descriptions + + **QA Scenarios**: + ``` + Scenario: Verify runtime/README.md exists and is valid + Tool: Bash + Preconditions: None + Steps: + 1. Check file exists: test -f runtime/README.md + 2. Check file is not empty: test -s runtime/README.md + 3. Check line count > 50: wc -l runtime/README.md + Expected Result: File exists, not empty, >50 lines + Failure Indicators: File not found, empty file, too short + Evidence: .sisyphus/evidence/task-2-runtime-readme-verify.txt + ``` + + **Commit**: NO (group with final task) + +- [ ] 3. 
Create DEVELOPMENT.md + + **What to do**: + - Create comprehensive development guide + - Include: Prerequisites, Quick Start, Project Structure, Development Workflow, Environment Config, Testing, Debugging, Common Issues + - Cover Java, Python, and React development + + **Must NOT do**: + - Include environment-specific secrets + - Duplicate content from individual READMEs + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Simple file creation with well-defined structure + - **Skills**: `[]` + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 1 (with Tasks 1-2, 4-10) + - **Blocks**: F1, F2 + - **Blocked By**: None + + **References**: + - `AGENTS.md` - Code style guidelines + - `backend/pom.xml` - Java dependencies + - `frontend/package.json` - Node dependencies + - `runtime/datamate-python/pyproject.toml` - Python dependencies + + **Acceptance Criteria**: + - [ ] File created: DEVELOPMENT.md + - [ ] File is valid markdown + - [ ] Covers all three languages (Java, Python, React) + + **QA Scenarios**: + ``` + Scenario: Verify DEVELOPMENT.md exists and is valid + Tool: Bash + Preconditions: None + Steps: + 1. Check file exists: test -f DEVELOPMENT.md + 2. Check file is not empty: test -s DEVELOPMENT.md + 3. Check line count > 100: wc -l DEVELOPMENT.md + Expected Result: File exists, not empty, >100 lines + Failure Indicators: File not found, empty file, too short + Evidence: .sisyphus/evidence/task-3-development-verify.txt + ``` + + **Commit**: NO (group with final task) + +- [ ] 4. 
Create ARCHITECTURE.md + + **What to do**: + - Create comprehensive architecture documentation + - Include: High-level architecture diagram, Components, Data Flow, Technology Stack, Communication Patterns, Security, Scalability, Deployment, Monitoring + - Include ASCII art diagram + + **Must NOT do**: + - Include outdated diagrams + - Duplicate content from other docs + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Simple file creation with well-defined structure + - **Skills**: `[]` + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 1 (with Tasks 1-3, 5-10) + - **Blocks**: F1, F2 + - **Blocked By**: None + + **References**: + - `backend/services/main-application/src/main/resources/application.yml` - Service configuration + - `backend/api-gateway/src/main/resources/application.yml` - Gateway configuration + - `runtime/datamate-python/app/main.py` - Python entry point + + **Acceptance Criteria**: + - [ ] File created: ARCHITECTURE.md + - [ ] File is valid markdown + - [ ] Contains architecture diagram + - [ ] Contains all major sections + + **QA Scenarios**: + ``` + Scenario: Verify ARCHITECTURE.md exists and is valid + Tool: Bash + Preconditions: None + Steps: + 1. Check file exists: test -f ARCHITECTURE.md + 2. Check file is not empty: test -s ARCHITECTURE.md + 3. Check line count > 100: wc -l ARCHITECTURE.md + Expected Result: File exists, not empty, >100 lines + Failure Indicators: File not found, empty file, too short + Evidence: .sisyphus/evidence/task-4-architecture-verify.txt + ``` + + **Commit**: NO (group with final task) + +- [ ] 5. 
Create backend/api-gateway/README.md + + **What to do**: + - Create README for API Gateway + - Include: Overview, Configuration (ports, routes, auth), Development, Testing + - Reference: backend/api-gateway/src/main/resources/application.yml + + **Must NOT do**: + - Include JWT secrets + - Duplicate backend/README.md content + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Simple file creation with well-defined structure + - **Skills**: `[]` + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 1 (with Tasks 1-4, 6-10) + - **Blocks**: F1, F2 + - **Blocked By**: None + + **References**: + - `backend/api-gateway/src/main/resources/application.yml` - Gateway configuration + - `backend/api-gateway/pom.xml` - Dependencies + + **Acceptance Criteria**: + - [ ] File created: backend/api-gateway/README.md + - [ ] File is valid markdown + - [ ] Contains configuration details + + **QA Scenarios**: + ``` + Scenario: Verify api-gateway/README.md exists + Tool: Bash + Preconditions: None + Steps: + 1. Check file exists: test -f backend/api-gateway/README.md + 2. Check file is not empty: test -s backend/api-gateway/README.md + Expected Result: File exists, not empty + Failure Indicators: File not found, empty file + Evidence: .sisyphus/evidence/task-5-api-gateway-verify.txt + ``` + + **Commit**: NO (group with final task) + +- [ ] 6. 
Create backend/services/main-application/README.md + + **What to do**: + - Create README for Main Application + - Include: Overview, Modules (data management, data cleaning, operator market), Configuration, Development + - Reference: backend/services/main-application/src/main/resources/application.yml + + **Must NOT do**: + - Duplicate backend/README.md content + - Include database credentials + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Simple file creation with well-defined structure + - **Skills**: `[]` + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 1 (with Tasks 1-5, 7-10) + - **Blocks**: F1, F2 + - **Blocked By**: None + + **References**: + - `backend/services/main-application/src/main/resources/application.yml` - Application configuration + - `backend/services/main-application/pom.xml` - Dependencies + + **Acceptance Criteria**: + - [ ] File created: backend/services/main-application/README.md + - [ ] File is valid markdown + - [ ] Contains module descriptions + + **QA Scenarios**: + ``` + Scenario: Verify main-application/README.md exists + Tool: Bash + Preconditions: None + Steps: + 1. Check file exists: test -f backend/services/main-application/README.md + 2. Check file is not empty: test -s backend/services/main-application/README.md + Expected Result: File exists, not empty + Failure Indicators: File not found, empty file + Evidence: .sisyphus/evidence/task-6-main-app-verify.txt + ``` + + **Commit**: NO (group with final task) + +- [ ] 7. 
Create backend/shared/README.md + + **What to do**: + - Create README for shared libraries + - Include: Overview, domain-common (exceptions, entities), security-common (JWT), Usage examples + - Reference: backend/shared/AGENTS.md + + **Must NOT do**: + - Duplicate AGENTS.md content + - Include internal implementation details + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Simple file creation with well-defined structure + - **Skills**: `[]` + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 1 (with Tasks 1-6, 8-10) + - **Blocks**: F1, F2 + - **Blocked By**: None + + **References**: + - `backend/shared/AGENTS.md` - Shared libraries documentation + - `backend/shared/domain-common/pom.xml` - Dependencies + - `backend/shared/security-common/pom.xml` - Dependencies + + **Acceptance Criteria**: + - [ ] File created: backend/shared/README.md + - [ ] File is valid markdown + - [ ] Contains library descriptions + + **QA Scenarios**: + ``` + Scenario: Verify shared/README.md exists + Tool: Bash + Preconditions: None + Steps: + 1. Check file exists: test -f backend/shared/README.md + 2. Check file is not empty: test -s backend/shared/README.md + Expected Result: File exists, not empty + Failure Indicators: File not found, empty file + Evidence: .sisyphus/evidence/task-7-shared-verify.txt + ``` + + **Commit**: NO (group with final task) + +- [ ] 8. 
Create runtime/datax/README.md + + **What to do**: + - Create README for DataX framework + - Include: Overview, Supported readers/writers (MySQL, PostgreSQL, Oracle, MongoDB, HDFS, S3, NFS, etc.), Usage examples + - Reference: runtime/datax/package.xml + + **Must NOT do**: + - Include database credentials + - Duplicate runtime/README.md content + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Simple file creation with well-defined structure + - **Skills**: `[]` + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 1 (with Tasks 1-7, 9-10) + - **Blocks**: F1, F2 + - **Blocked By**: None + + **References**: + - `runtime/datax/package.xml` - DataX assembly configuration + + **Acceptance Criteria**: + - [ ] File created: runtime/datax/README.md + - [ ] File is valid markdown + - [ ] Contains reader/writer list + + **QA Scenarios**: + ``` + Scenario: Verify datax/README.md exists + Tool: Bash + Preconditions: None + Steps: + 1. Check file exists: test -f runtime/datax/README.md + 2. Check file is not empty: test -s runtime/datax/README.md + Expected Result: File exists, not empty + Failure Indicators: File not found, empty file + Evidence: .sisyphus/evidence/task-8-datax-verify.txt + ``` + + **Commit**: NO (group with final task) + +- [ ] 9. 
Create runtime/deer-flow/README.md + + **What to do**: + - Create README for DeerFlow service + - Include: Overview, Configuration (conf.yaml), Usage, LLM integration + - Reference: runtime/deer-flow/conf.yaml + + **Must NOT do**: + - Include API keys + - Duplicate runtime/README.md content + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Simple file creation with well-defined structure + - **Skills**: `[]` + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 1 (with Tasks 1-8, 10) + - **Blocks**: F1, F2 + - **Blocked By**: None + + **References**: + - `runtime/deer-flow/conf.yaml` - DeerFlow configuration + - `runtime/deer-flow/.env` - Environment variables + + **Acceptance Criteria**: + - [ ] File created: runtime/deer-flow/README.md + - [ ] File is valid markdown + - [ ] Contains configuration guide + + **QA Scenarios**: + ``` + Scenario: Verify deer-flow/README.md exists + Tool: Bash + Preconditions: None + Steps: + 1. Check file exists: test -f runtime/deer-flow/README.md + 2. Check file is not empty: test -s runtime/deer-flow/README.md + Expected Result: File exists, not empty + Failure Indicators: File not found, empty file + Evidence: .sisyphus/evidence/task-9-deer-flow-verify.txt + ``` + + **Commit**: NO (group with final task) + +- [ ] 10. 
Create runtime/python-executor/README.md + + **What to do**: + - Create README for Ray executor + - Include: Overview, Architecture (scheduler, wrappers, core), Operator execution, Quick start + - Reference: runtime/python-executor/AGENTS.md, pyproject.toml + + **Must NOT do**: + - Duplicate AGENTS.md content + - Include Ray cluster credentials + + **Recommended Agent Profile**: + - **Category**: `quick` + - Reason: Simple file creation with well-defined structure + - **Skills**: `[]` + + **Parallelization**: + - **Can Run In Parallel**: YES + - **Parallel Group**: Wave 1 (with Tasks 1-9) + - **Blocks**: F1, F2 + - **Blocked By**: None + + **References**: + - `runtime/python-executor/AGENTS.md` - Ray executor documentation + - `runtime/python-executor/pyproject.toml` - Dependencies + + **Acceptance Criteria**: + - [ ] File created: runtime/python-executor/README.md + - [ ] File is valid markdown + - [ ] Contains architecture description + + **QA Scenarios**: + ``` + Scenario: Verify python-executor/README.md exists + Tool: Bash + Preconditions: None + Steps: + 1. Check file exists: test -f runtime/python-executor/README.md + 2. Check file is not empty: test -s runtime/python-executor/README.md + Expected Result: File exists, not empty + Failure Indicators: File not found, empty file + Evidence: .sisyphus/evidence/task-10-executor-verify.txt + ``` + + **Commit**: NO (group with final task) + +--- + +## Final Verification Wave + +- [ ] F1. **Verify All Files Exist** — `quick` + Check that all 10 documentation files were created successfully. + - Verify each file exists + - Verify each file is not empty + - Verify each file has valid markdown syntax + Output: `Files [10/10] | VERDICT: APPROVE/REJECT` + +- [ ] F2. **Verify No Broken Links** — `quick` + Check internal links in documentation files. 
+ - Search for markdown links `[text](path)` + - Verify referenced files exist + - Report any broken links + Output: `Links [N/N valid] | VERDICT: APPROVE/REJECT` + +--- + +## Commit Strategy + +- **10**: `docs: add high and medium priority documentation` — backend/README.md, runtime/README.md, DEVELOPMENT.md, ARCHITECTURE.md, backend/api-gateway/README.md, backend/services/main-application/README.md, backend/shared/README.md, runtime/datax/README.md, runtime/deer-flow/README.md, runtime/python-executor/README.md + +--- + +## Success Criteria + +### Verification Commands +```bash +# Check all files exist +test -f backend/README.md && test -f runtime/README.md && test -f DEVELOPMENT.md && test -f ARCHITECTURE.md + +# Count files +find . -name "README.md" -not -path "*/node_modules/*" -not -path "*/.venv/*" | wc -l +``` + +### Final Checklist +- [ ] All 10 documentation files created +- [ ] Each file has proper structure +- [ ] No broken internal links +- [ ] Documentation coverage improved to ~85% diff --git a/README-zh.md b/README-zh.md index 91e443d3a..3afb2ee99 100644 --- a/README-zh.md +++ b/README-zh.md @@ -110,6 +110,30 @@ make uninstall 在运行 `make uninstall` 时,卸载流程会只询问一次是否删除卷(数据),该选择会应用到所有组件。卸载顺序为:milvus -> label-studio -> datamate,确保在移除 datamate 网络前,所有使用该网络的服务已先停止。 +## 📚 文档 + +### 核心文档 +- **[ARCHITECTURE.md](./ARCHITECTURE.md)** - 系统架构、微服务通信、数据流 +- **[DEVELOPMENT.md](./DEVELOPMENT.md)** - 本地开发环境搭建和工作流程 +- **[AGENTS.md](./AGENTS.md)** - AI 助手指南和代码规范 + +### 后端文档 +- **[backend/README.md](./backend/README.md)** - 后端架构、服务和技术栈 +- **[backend/api-gateway/README.md](./backend/api-gateway/README.md)** - API Gateway 配置和路由 +- **[backend/services/main-application/README.md](./backend/services/main-application/README.md)** - 主应用模块 +- **[backend/shared/README.md](./backend/shared/README.md)** - 共享库(domain-common, security-common) + +### 运行时文档 +- **[runtime/README.md](./runtime/README.md)** - 运行时架构和组件 +- 
**[runtime/datamate-python/README.md](./runtime/datamate-python/README.md)** - FastAPI 后端服务 +- **[runtime/python-executor/README.md](./runtime/python-executor/README.md)** - Ray 执行器框架 +- **[runtime/ops/README.md](./runtime/ops/README.md)** - 算子生态 +- **[runtime/datax/README.md](./runtime/datax/README.md)** - DataX 数据框架 +- **[runtime/deer-flow/README.md](./runtime/deer-flow/README.md)** - DeerFlow LLM 服务 + +### 前端文档 +- **[frontend/README.md](./frontend/README.md)** - React 前端应用 + ## 🤝 贡献指南 感谢您对本项目的关注!我们非常欢迎社区的贡献,无论是提交 Bug 报告、提出功能建议,还是直接参与代码开发,都能帮助项目变得更好。 From 99cbed771da45eca30a9e11f07b0644b082fef9e Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Tue, 24 Mar 2026 17:40:49 +0800 Subject: [PATCH 3/6] =?UTF-8?q?docs:=20=E6=B7=BB=E5=8A=A0=E4=B8=AD?= =?UTF-8?q?=E6=96=87=E7=89=88=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 创建中文版 README 文件:backend/README-zh.md, runtime/README-zh.md, DEVELOPMENT-zh.md, ARCHITECTURE-zh.md - 更新主 README-zh.md 的文档索引,指向中文版文档 - 更新主 README.md 的文档索引,指向中文版文档 --- ARCHITECTURE-zh.md | 238 +++++++++++++++++ DEVELOPMENT-zh.md | 251 ++++++++++++++++++ backend/README-zh.md | 137 ++++++++++ backend/api-gateway/README-zh.md | 130 +++++++++ .../services/main-application/README-zh.md | 138 ++++++++++ backend/shared/README-zh.md | 144 ++++++++++ runtime/README-zh.md | 147 ++++++++++ runtime/datax/README-zh.md | 151 +++++++++++ runtime/deer-flow/README-zh.md | 148 +++++++++++ runtime/python-executor/README-zh.md | 221 +++++++++++++++ 10 files changed, 1705 insertions(+) create mode 100644 ARCHITECTURE-zh.md create mode 100644 DEVELOPMENT-zh.md create mode 100644 backend/README-zh.md create mode 100644 backend/api-gateway/README-zh.md create mode 100644 backend/services/main-application/README-zh.md create mode 100644 backend/shared/README-zh.md create mode 100644 runtime/README-zh.md create mode 100644 runtime/datax/README-zh.md create mode 100644 runtime/deer-flow/README-zh.md 
create mode 100644 runtime/python-executor/README-zh.md diff --git a/ARCHITECTURE-zh.md b/ARCHITECTURE-zh.md new file mode 100644 index 000000000..355fcf60a --- /dev/null +++ b/ARCHITECTURE-zh.md @@ -0,0 +1,238 @@ +# DataMate 架构 + +## 概述 + +DataMate 是一个基于微服务的数据管理平台,用于模型微调和 RAG 检索。它采用多语言架构,包含 Java 后端、Python 运行时和 React 前端。 + +## 高层架构 + +``` +┌─────────────────────────────────────────────────────────┐ +│ 前端 (React) │ +│ localhost:5173 │ +└────────────────┬────────────────────────────────────────┘ + │ HTTP/REST + ▼ +┌─────────────────────────────────────────────────────────┐ +│ API Gateway │ +│ (Spring Cloud) │ +│ localhost:8080 │ +│ ┌──────────────────────────────────────────────────┐ │ +│ │ 认证 (JWT) │ │ +│ │ 路由转发 │ │ +│ │ 限流 │ │ +│ └──────────────────────────────────────────────────┘ │ +└────────────────┬────────────────────────────────────────┘ + │ + ├─────────────────┬─────────────────┐ + ▼ ▼ ▼ +┌─────────────────────────┐ ┌─────────────────────────┐ ┌─────────────────────────┐ +│ 主应用 │ │ 数据管理服务 │ │ RAG 索引器 │ +│ (Spring Boot) │ │ Service │ │ Service │ +│ - 数据清洗 │ │ - 数据集管理 │ │ - 知识库管理 │ │ +│ - 算子市场 │ │ - 文件操作 │ │ - 向量搜索 │ │ +│ - 数据收集 │ │ - 标签管理 │ │ - Milvus 集成 │ │ +└─────────┬───────────┘ └─────────┬───────────┘ └─────────┬───────────┘ + │ │ │ + ▼ ▼ ▼ +┌─────────────────────────────────────────────────────────────────────────────────┐ +│ PostgreSQL (元数据) │ +│ Redis (缓存) │ +│ Milvus (向量) │ +│ MinIO (文件) │ +└─────────────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────┐ +│ Python 运行时 (FastAPI) │ +│ localhost:18000 │ +│ ┌──────────────────────────────────────────────────┐ │ +│ │ 数据合成 │ │ +│ │ 数据标注 (Label Studio 集成) │ │ +│ │ 数据评估 │ │ +│ │ RAG 索引 │ │ +│ └──────────────────────────────────────────────────┘ │ +└────────────────┬────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────┐ +│ Ray 执行器 (分布式) │ +│ 
┌──────────────────────────────────────────────────┐ │ +│ │ 算子执行 │ │ +│ │ 任务调度 │ │ +│ │ 分布式计算 │ │ +│ └──────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────┘ +``` + +## 组件 + +### 前端层 +- **框架**: React 18 + TypeScript + Vite +- **UI 库**: Ant Design +- **样式**: TailwindCSS v4 +- **状态管理**: Redux Toolkit +- **路由**: React Router v7 + +### 后端层(Java) +- **API Gateway**: Spring Cloud Gateway + - 路由转发 + - JWT 认证 + - 限流 + +- **主应用**: Spring Boot 3.5 + - 数据清洗管道 + - 算子市场 + - 数据收集任务 + +- **数据管理服务**: Spring Boot 3.5 + - 数据集 CRUD + - 文件操作 + - 标签管理 + +- **RAG 索引器服务**: Spring Boot 3.5 + - 知识库管理 + - 向量搜索 + - Milvus 集成 + +### 运行时层(Python) +- **FastAPI 后端**: 端口 18000 + - 数据合成(QA 生成) + - 数据标注(Label Studio 集成) + - 模型评估 + - RAG 索引 + +- **Ray 执行器**: 分布式执行 + - 算子执行 + - 任务调度 + - 多节点并行性 + +### 算子生态 +- **filter**: 数据过滤(去重、敏感内容、质量) +- **mapper**: 数据转换(清洗、归一化) +- **slicer**: 数据切片(文本分割、幻灯片提取) +- **formatter**: 格式转换(PDF → text, slide → JSON) +- **llms**: LLM 算子(质量评估、条件检查) + +## 数据流 + +### 1. 数据摄入 +``` +用户上传 → 前端 → API Gateway → 数据管理服务 → PostgreSQL/MinIO +``` + +### 2. 数据处理 +``` +数据集 → 前端 → API Gateway → 主应用 → Python 运行时 +→ Ray 执行器 → 算子 → 处理后的数据 → PostgreSQL/MinIO +``` + +### 3. RAG 索引 +``` +处理后的数据 → Python 运行时 → RAG 索引器服务 → Milvus (向量) +``` + +### 4. 
RAG 检索 +``` +查询 → 前端 → API Gateway → RAG 索引器服务 → Milvus → 结果 +``` + +## 技术栈 + +| 层级 | 技术 | +|--------|------| +| **前端** | React 18, TypeScript, Vite, Ant Design, TailwindCSS | +| **后端** | Spring Boot 3.5, Java 21, MyBatis-Plus, PostgreSQL | +| **运行时** | FastAPI, Python 3.12, Ray, SQLAlchemy | +| **向量数据库** | Milvus | +| **缓存** | Redis | +| **对象存储** | MinIO | +| **部署** | Docker Compose, Kubernetes/Helm | + +## 通信模式 + +### 服务间通信 +- **REST API**: 前端和后端之间的 HTTP/JSON +- **gRPC**: (如有)后端服务之间 +- **消息队列**: (如有)用于异步任务 + +### 后端到运行时 +- **HTTP/REST**: Java 后端调用 Python 运行时 API +- **Ray**: Python 运行时提交任务到 Ray 执行器 + +## 安全 + +### 认证 +- **JWT**: 基于 Token 的认证,通过 API Gateway +- **会话**: (如有)会话管理 + +### 授权 +- **基于角色的**: (如有)RBAC +- **基于资源的**: (如有)资源级访问控制 + +## 可扩展性 + +### 水平扩展 +- **后端服务**: Kubernetes pod 扩展(通过 Helm) +- **Ray 执行器**: 多节点 Ray 集群 +- **前端**: 静态资源服务 + CDN + +### 垂直扩展 +- **数据库**: PostgreSQL 连接池 +- **缓存**: Redis 集群 +- **向量数据库**: Milvus 集群 + +## 部署 + +### Docker Compose +```bash +make install INSTALLER=docker +``` + +### Kubernetes/Helm +```bash +make install INSTALLER=k8s +``` + +## 监控 + +### 指标 +- **Spring Boot Actuator**: `/actuator/metrics` +- **Prometheus**: (如已配置)指标收集 +- **Ray**: Ray dashboard 用于执行器监控 + +### 日志 +- **Java**: Log4j2 +- **Python**: Ray dashboard 用于执行器监控 + +## 架构决策 + +### 为什么选择多语言? +- **Java 后端**: 企业级、成熟生态系统、强类型 +- **Python 运行时**: 丰富的 ML/AI 生态系统、灵活、快速原型开发 +- **React 前端**: 现代 UI、组件化、大型生态系统 + +### 为什么选择微服务? +- **可扩展性**: 服务独立扩展 +- **可维护性**: 清晰的服务边界 +- **技术多样性**: 为每个任务使用最佳工具 + +### 为什么选择 Ray? 
+- **分布式计算**: 无缝多节点执行 +- **容错**: 自动任务重试和恢复 +- **资源管理**: 动态资源分配 + +## 未来增强 + +- [ ] 服务网格(Istio/Linkerd) +- [ ] 事件总线(Kafka/Pulsar) +- [ ] GraphQL API +- [ ] 实时更新(WebSocket) +- [ ] 高级监控(Grafana, Loki) + +## 引用 + +- [后端架构](./backend/README.md) +- [运行时架构](./runtime/README.md) +- [前端架构](./frontend/README.md) +- [AGENTS.md](./AGENTS.md) diff --git a/DEVELOPMENT-zh.md b/DEVELOPMENT-zh.md new file mode 100644 index 000000000..6c096d68f --- /dev/null +++ b/DEVELOPMENT-zh.md @@ -0,0 +1,251 @@ +# 开发指南 + +本文档为 DataMate 提供全面的本地开发环境搭建和工作流程指南,涵盖 Java、Python、React 三种语言。 + +## 概述 + +DataMate 是由多语言(Java 后端、Python 运行时、React 前端)组成的微服务项目,通过 Docker Compose 进行本地开发协调。 + +## 前置条件 + +- Git (用于拉取源码) +- Make (用于构建和安装) +- Docker (用于构建镜像和部署服务) +- Docker Compose (用于部署服务 - docker 方式) +- Kubernetes (用于部署服务 - k8s 方式) +- Helm (用于部署服务 - k8s 方式) + +注意: +- 确保 Java 和 Python 环境在系统 PATH 中(如适用) +- Docker Compose 将编排本地开发栈 + +## 快速开始 + +### 1. 克隆仓库并安装依赖 +```bash +git clone git@github.com:ModelEngine-Group/DataMate.git +cd DataMate +``` + +### 2. 启动基础服务 +```bash +make install +``` + +本项目支持 docker-compose 和 helm 两种方式部署,请在执行命令后输入部署方式的对应编号,命令回显如下所示: +```shell +Choose a deployment method: +1. Docker/Docker-Compose +2. Kubernetes/Helm +Enter choice: +``` + +若您使用的机器没有 make,您也可以执行如下命令部署: +```bash +REGISTRY=ghcr.io/modelengine-group/ docker compose -f deployment/docker/datamate/docker-compose.yml --profile milvus up -d +``` + +当容器运行后,请在浏览器打开 http://localhost:30000 查看前端界面。 + +### 3. 本地开发部署 +本地代码修改后,请执行以下命令构建镜像并使用本地镜像部署: +```bash +make build +make install dev=true +``` + +### 4. 
卸载服务 +```bash +make uninstall +``` + +在运行 `make uninstall` 时,卸载流程会只询问一次是否删除卷(数据),该选择会应用到所有组件。卸载顺序为:milvus -> label-studio -> datamate,确保在移除 datamate 网络前,所有使用该网络的服务已先停止。 + +## 项目结构 + +``` +DataMate/ +├── backend/ # Java 后端 +│ ├── api-gateway/ # API Gateway +│ ├── services/ # 核心服务 +│ └── shared/ # 共享库 +├── runtime/ # Python 运行时 +│ ├── datamate-python/ # FastAPI 后端 +│ ├── python-executor/ # Ray 执行器 +│ ├── ops/ # 算子生态 +│ ├── datax/ # DataX 框架 +│ └── deer-flow # DeerFlow 服务 +├── frontend/ # React 前端 +├── deployment/ # 部署配置 +└── docs/ # 文档 +``` + +## 开发工作流程 + +### Java 后端开发 +```bash +# 构建 +cd backend +mvn clean install + +# 运行测试 +mvn test + +# 运行特定服务 +cd backend/services/main-application +mvn spring-boot:run +``` + +### Python 运行时开发 +```bash +# 安装依赖 +cd runtime/datamate-python +poetry install + +# 运行服务 +poetry run uvicorn app.main:app --reload --port 18000 + +# 运行测试 +poetry run pytest +``` + +### React 前端开发 +```bash +# 安装依赖 +cd frontend +npm ci + +# 运行开发服务器 +npm run dev + +# 构建生产版本 +npm run build +``` + +### Docker Compose 开发 +```bash +# 启动所有服务 +docker compose up -d + +# 查看日志 +docker compose logs -f [service-name] + +# 停止所有服务 +docker compose down +``` + +## 环境配置 + +每个组件可以有自己的环境变量文件。不要提交包含密钥的 .env 文件。 + +### 后端(Java) +- **路径**: `backend/.env` +- **典型密钥**: + - `DB_URL`: 数据库连接字符串 + - `DB_USER`: 数据库用户名 + - `DB_PASSWORD`: 数据库密码 + - `REDIS_URL`: Redis 连接字符串 + - `REDIS_PASSWORD`: Redis 密码 + - `JWT_SECRET`: JWT 密钥 + +### 运行时(Python) +- **路径**: `runtime/datamate-python/.env` +- **典型密钥**: + - `DATABASE_URL`: PostgreSQL 连接字符串 + - `RAY_ENABLED`: 是否启用 Ray 执行器 + - `RAY_ADDRESS`: Ray 集群地址 + - `LABEL_STUDIO_BASE_URL`: Label Studio 基础 URL + +### 前端(React) +- **路径**: `frontend/.env` +- **典型密钥**: + - `VITE_API_BASE_URL`: API 基础 URL + - `VITE_RUNTIME_API_URL`: 运行时 API 基础 URL + +## 测试 + +### Java(JUnit 5) +```bash +cd backend +mvn test +``` + +### Python(pytest) +```bash +cd runtime/datamate-python +poetry run pytest +``` + +### 前端 +当前未配置测试框架。 + +## 调试 + +### Java 后端 +```bash +# 启用 JDWP 调试端口 
5005 +export JAVA_TOOL_OPTIONS='-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5005' +java -jar backend/services/main-application/target/*.jar +``` + +### Python 运行时 +```bash +# 启用 debugpy 监听端口 5678 +cd runtime/datamate-python +python -m debugpy --listen 5678 --wait-for-client -m uvicorn app.main:app --reload --port 18000 --host 0.0.0.0 +``` + +### React 前端 +使用浏览器开发者工具或 VS Code 调试器。 + +## 常见问题 + +### 端口冲突 +检查哪个进程正在使用端口: +```bash +lsof -i TCP:8080 +lsof -i TCP:18000 +lsof -i TCP:5173 +``` +停止或重新配置冲突的服务。 + +### 数据库连接失败 +确保 `.env` 包含正确的 `DATABASE_URL` 和凭据;确保数据库服务在 Docker Compose 中已启动。 + +### Ray 集群问题 +确保 Ray 已正确启动;检查 Ray 工作进程日志;确保 `RAY_ADDRESS` 配置正确。 + +## 文档 + +- **核心文档**: + - [ARCHITECTURE.md](./ARCHITECTURE.md) - 系统架构、微服务通信、数据流 + - [DEVELOPMENT.md](./DEVELOPMENT.md) - 本地开发环境搭建和工作流程 + - [AGENTS.md](./AGENTS.md) - AI 助手指南和代码规范 + +- **后端文档**: + - [backend/README.md](./backend/README.md) - 后端架构、服务和技术栈 + - [backend/api-gateway/README.md](./backend/api-gateway/README.md) - API Gateway 配置和路由 + - [backend/services/main-application/README.md](./backend/services/main-application/README.md) - 主应用模块 + - [backend/shared/README.md](./backend/shared/README.md) - 共享库(domain-common, security-common) + +- **运行时文档**: + - [runtime/README.md](./runtime/README.md) - 运行时架构和组件 + - [runtime/datamate-python/README.md](./runtime/datamate-python/README.md) - FastAPI 后端服务 + - [runtime/python-executor/README.md](./runtime/python-executor/README.md) - Ray 执行器框架 + - [runtime/ops/README.md](./runtime/ops/README.md) - 算子生态 + - [runtime/datax/README.md](./runtime/datax/README.md) - DataX 数据框架 + - [runtime/deer-flow/README.md](./runtime/deer-flow/README.md) - DeerFlow LLM 服务 + +- **前端文档**: + - [frontend/README.md](./frontend/README.md) - React 前端应用 + +## 贡献指南 + +感谢您对本项目的关注!我们非常欢迎社区的贡献,无论是提交 Bug 报告、提出功能建议,还是直接参与代码开发,都能帮助项目变得更好。 + +• 📮 [GitHub Issues](../../issues):提交 Bug 或功能建议。 +• 🔧 [GitHub Pull Requests](../../pulls):贡献代码改进。 + +## 许可证 + +DataMate 基于 [MIT](LICENSE) 
开源,您可以在遵守许可证条款的前提下自由使用、修改和分发本项目的代码。 diff --git a/backend/README-zh.md b/backend/README-zh.md new file mode 100644 index 000000000..cdf749b63 --- /dev/null +++ b/backend/README-zh.md @@ -0,0 +1,137 @@ +# DataMate 后端 + +## 概述 + +DataMate 后端是基于 Spring Boot 3.5 + Java 21 的微服务架构,提供数据管理、RAG 索引、API 网关等核心功能。 + +## 架构 + +``` +backend/ +├── api-gateway/ # API Gateway + 认证 +├── services/ +│ ├── data-management-service/ # 数据集管理 +│ ├── rag-indexer-service/ # RAG 索引 +│ └── main-application/ # 主应用入口 +└── shared/ + ├── domain-common/ # DDD 构建块、异常处理 + └── security-common/ # JWT 工具 +``` + +## 服务 + +| 服务 | 端口 | 描述 | +|---------|-------|-------------| +| **main-application** | 8080 | 主应用,包含数据管理、数据清洗、算子市场等模块 | +| **api-gateway** | 8080 | API Gateway,路由转发和认证 | + +## 技术栈 + +- **框架**: Spring Boot 3.5.6, Spring Cloud 2025.0.0 +- **语言**: Java 21 +- **数据库**: PostgreSQL 8.0.33 + MyBatis-Plus 3.5.14 +- **缓存**: Redis 3.2.0 +- **向量数据库**: Milvus (via SDK 2.6.6) +- **文档**: SpringDoc OpenAPI 2.2.0 +- **构建**: Maven + +## 依赖 + +### 外部服务 +- **PostgreSQL**: `datamate-database:5432` +- **Redis**: `datamate-redis:6379` +- **Milvus**: 向量数据库(RAG 索引) + +### 共享库 +- **domain-common**: 业务异常、系统参数、领域实体基类 +- **security-common**: JWT 工具、认证辅助 + +## 快速开始 + +### 前置条件 +- JDK 21+ +- Maven 3.8+ +- PostgreSQL 12+ +- Redis 6+ + +### 构建 +```bash +cd backend +mvn clean install +``` + +### 运行主应用 +```bash +cd backend/services/main-application +mvn spring-boot:run +``` + +### 运行 API Gateway +```bash +cd backend/api-gateway +mvn spring-boot:run +``` + +## 开发 + +### 模块结构 (DDD) +``` +com.datamate.{module}/ +├── interfaces/ +│ ├── rest/ # Controllers +│ ├── dto/ # Request/Response DTOs +│ ├── converter/ # MapStruct converters +│ └── validation/ # Custom validators +├── application/ # Application services +├── domain/ +│ ├── model/ # Entities +│ └── repository/ # Repository interfaces +└── infrastructure/ + ├── persistence/ # Repository implementations + ├── client/ # External API clients + └── config/ # Service configuration +``` 
+ +### 代码约定 +- **实体**: Extend `BaseEntity`, use `@TableName("t_*")` +- **控制器**: `@RestController` + `@RequiredArgsConstructor` +- **服务**: `@Service` + `@Transactional` +- **错误处理**: `throw BusinessException.of(ErrorCode.XXX)` +- **MapStruct**: `@Mapper(componentModel = "spring")` + +## 测试 + +```bash +# 运行所有测试 +mvn test + +# 运行特定测试 +mvn test -Dtest=ClassName#methodName + +# 运行特定模块测试 +mvn -pl services/data-management-service -am test +``` + +## 配置 + +### 环境变量 +- `DB_USERNAME`: 数据库用户名 +- `DB_PASSWORD`: 数据库密码 +- `REDIS_PASSWORD`: Redis 密码 +- `JWT_SECRET`: JWT 密钥 + +### 配置文件 +- `application.yml`: 默认配置 +- `application-dev.yml`: 开发环境覆盖 + +## 文档 + +- **API 文档**: http://localhost:8080/api/swagger-ui.html +- **AGENTS.md**: 见 `backend/shared/AGENTS.md` 获取共享库文档 +- **服务文档**: 见各服务 README + +## 相关链接 + +- [Spring Boot 文档](https://docs.spring.io/spring-boot/) +- [MyBatis-Plus 文档](https://baomidou.com/) +- [PostgreSQL 文档](https://www.postgresql.org/docs/) diff --git a/backend/api-gateway/README-zh.md b/backend/api-gateway/README-zh.md new file mode 100644 index 000000000..a300f7f74 --- /dev/null +++ b/backend/api-gateway/README-zh.md @@ -0,0 +1,130 @@ +# API Gateway + +## 概述 + +API Gateway 是 DataMate 的统一入口,基于 Spring Cloud Gateway 实现,负责路由转发、JWT 认证和限流。 + +## 架构 + +``` +backend/api-gateway/ +├── src/main/java/com/datamate/gateway/ +│ ├── config/ # Gateway 配置 +│ ├── filter/ # JWT 认证过滤器 +│ └── route/ # 路由定义 +└── src/main/resources/ + └── application.yml # Gateway 配置 +``` + +## 配置 + +### 端口 +- **默认**: 8080 +- **Nacos 发现端口**: 30000 + +### 关键配置 +```yaml +spring: + application: + name: datamate-gateway + cloud: + nacos: + discovery: + port: 30000 + server-addr: ${NACOS_ADDR} + username: consul + password: +datamate: + jwt: + secret: ${JWT_SECRET} + expiration-seconds: 3600 +``` + +## 功能 + +### 1. 路由转发 +- 将前端请求转发到对应的后端服务 +- 支持负载均衡 +- 路径重写 + +### 2. JWT 认证 +- 基于 JWT Token 的认证 +- Token 验证和过期检查 +- 用户上下文传递 + +### 3. 
限流 +- (如配置)请求频率限制 +- 防止 API 滥用 + +## 快速开始 + +### 前置条件 +- JDK 21+ +- Maven 3.8+ +- Nacos 服务(如果使用服务发现) + +### 构建 +```bash +cd backend/api-gateway +mvn clean install +``` + +### 运行 +```bash +cd backend/api-gateway +mvn spring-boot:run +``` + +## 开发 + +### 添加新路由 +在 `application.yml` 或通过 Nacos 配置路由规则: + +```yaml +spring: + cloud: + gateway: + routes: + - id: data-management + uri: lb://data-management-service + predicates: + - Path=/api/data-management/** + filters: + - StripPrefix=3 +``` + +### 添加自定义过滤器 +创建 `GlobalFilter` 或 `GatewayFilter`: + +```java +@Component +public class AuthFilter implements GlobalFilter { + @Override + public Mono<Void> filter(ServerWebExchange exchange, GatewayFilterChain chain) { + // 过滤逻辑 + return chain.filter(exchange); + } +} +``` + +## 测试 + +### 测试路由转发 +```bash +curl http://localhost:8080/api/data-management/datasets +``` + +### 测试 JWT 认证 +```bash +curl -H "Authorization: Bearer <token>" http://localhost:8080/api/protected-endpoint +``` + +## 文档 + +- **Spring Cloud Gateway 文档**: https://docs.spring.io/spring-cloud-gateway/ +- **Nacos 发现**: https://nacos.io/ + +## 相关链接 + +- [后端 README](../README.md) +- [主应用 README](../services/main-application/README.md) diff --git a/backend/services/main-application/README-zh.md b/backend/services/main-application/README-zh.md new file mode 100644 index 000000000..4bc298878 --- /dev/null +++ b/backend/services/main-application/README-zh.md @@ -0,0 +1,138 @@ +# 主应用 + +## 概述 + +主应用是 DataMate 的核心 Spring Boot 服务,包含数据管理、数据清洗、算子市场、数据收集等主要功能模块。 + +## 架构 + +``` +backend/services/main-application/ +├── src/main/java/com/datamate/main/ +│ ├── interfaces/ +│ │ ├── rest/ # Controllers +│ │ ├── dto/ # Request/Response DTOs +│ │ └── converter/ # MapStruct converters +│ ├── application/ # Application services +│ ├── domain/ +│ │ ├── model/ # Entities +│ │ └── repository/ # Repository interfaces +│ └── infrastructure/ +│ ├── persistence/ # Repository implementations +│ ├── client/ # External API clients +│ └── config/ # Service 
configuration +└── src/main/resources/ + ├── application.yml # 主配置 + ├── config/application-datamanagement.yml # 数据管理配置 + └── config/application-datacollection.yml # 数据收集配置 +``` + +## 模块 + +### 1. 数据管理 +- 数据集 CRUD 操作 +- 文件上传/下载 +- 标签管理 +- 数据集版本控制 + +### 2. 数据清洗 +- 数据清洗管道 +- 数据质量检查 +- 数据去重 +- 数据格式转换 + +### 3. 算子市场 +- 算子上传/下载 +- 算子版本管理 +- 算子分类和搜索 +- 算子执行配置 + +### 4. 数据收集 +- 数据源配置 +- 定时数据收集任务 +- 数据同步 +- 数据导入/导出 + +## 配置 + +### 端口 +- **默认**: 8080 +- **上下文路径**: `/api` + +### 关键配置 +```yaml +server: + port: 8080 + servlet: + context-path: /api + +datamate: + data-management: + base-path: /dataset + operator-market: + repository-path: ./runtime/operators + max-upload-size: 50MB + ray: + enabled: false + address: ray://localhost:10001 +``` + +## 快速开始 + +### 前置条件 +- JDK 21+ +- Maven 3.8+ +- PostgreSQL 12+ +- Redis 6+ + +### 构建 +```bash +cd backend/services/main-application +mvn clean install +``` + +### 运行 +```bash +cd backend/services/main-application +mvn spring-boot:run +``` + +## 开发 + +### 添加新模块 +1. 在 `domain/model/` 创建实体类 +2. 在 `domain/repository/` 创建 repository 接口 +3. 在 `infrastructure/persistence/` 实现 repository +4. 在 `application/` 创建 application service +5. 
在 `interfaces/rest/` 创建 controller + +### 集成 Ray 执行器 +```yaml +datamate: + ray: + enabled: true + address: ray://localhost:10001 +``` + +## 测试 + +### 运行测试 +```bash +cd backend/services/main-application +mvn test +``` + +### 运行特定测试 +```bash +mvn test -Dtest=DatasetControllerTest +``` + +## 文档 + +- **Spring Boot 文档**: https://docs.spring.io/spring-boot/ +- [AGENTS.md](../../shared/AGENTS.md) + +## 相关链接 + +- [后端 README](../../README.md) +- [API Gateway README](../../api-gateway/README.md) diff --git a/backend/shared/README-zh.md b/backend/shared/README-zh.md new file mode 100644 index 000000000..d2dc48abf --- /dev/null +++ b/backend/shared/README-zh.md @@ -0,0 +1,144 @@ +# 共享库 + +## 概述 + +共享库包含所有后端服务共用的代码和工具,包括领域构建块、异常处理、JWT 工具等。 + +## 架构 + +``` +backend/shared/ +├── domain-common/ # DDD 构建块、异常处理 +│ └── src/main/java/com/datamate/common/ +│ ├── infrastructure/exception/ # BusinessException, ErrorCode +│ ├── setting/ # 系统参数、模型配置 +│ └── domain/ # Base entities, repositories +└── security-common/ # JWT 工具、认证辅助 + └── src/main/java/com/datamate/security/ +``` + +## 库 + +### 1. 
domain-common + +#### BusinessException +统一的业务异常处理机制: + +```java +// 抛出业务异常 +throw BusinessException.of(ErrorCode.DATASET_NOT_FOUND) + .withDetail("dataset_id", datasetId); + +// 带上下文的异常 +throw BusinessException.of(ErrorCode.VALIDATION_FAILED) + .withDetail("field", "email") + .withDetail("reason", "Invalid format"); +``` + +#### ErrorCode +错误码枚举接口: + +```java +public interface ErrorCode { + String getCode(); + String getMessage(); + HttpStatus getHttpStatus(); +} + +// 示例 +public enum CommonErrorCode implements ErrorCode { + SUCCESS("0000", "Success", HttpStatus.OK), + DATABASE_NOT_FOUND("4001", "Database not found", HttpStatus.NOT_FOUND); +} +``` + +#### BaseEntity +所有实体的基类,包含审计字段: + +```java +@Data +@EqualsAndHashCode(callSuper = true) +public class BaseEntity implements Serializable { + @TableId(type = IdType.ASSIGN_ID) + private String id; + + @TableField(fill = FieldFill.INSERT) + private LocalDateTime createdAt; + + @TableField(fill = FieldFill.INSERT_UPDATE) + private LocalDateTime updatedAt; + + @TableField(fill = FieldFill.INSERT) + private String createdBy; + + @TableField(fill = FieldFill.INSERT_UPDATE) + private String updatedBy; +} +``` + +### 2. 
security-common + +#### JWT 工具 +JWT Token 生成和验证: + +```java +// 生成 Token +String token = JwtUtil.generateToken(userId, secret, expiration); + +// 验证 Token +Claims claims = JwtUtil.validateToken(token, secret); +String userId = claims.getSubject(); +``` + +## 使用 + +### 在服务中使用共享库 + +#### Maven 依赖 +```xml +<dependency> +    <groupId>com.datamate</groupId> +    <artifactId>domain-common</artifactId> +    <version>1.0.0-SNAPSHOT</version> +</dependency> +<dependency> +    <groupId>com.datamate</groupId> +    <artifactId>security-common</artifactId> +    <version>1.0.0-SNAPSHOT</version> +</dependency> +``` + +#### 使用 BusinessException +```java +@RestController +@RequiredArgsConstructor +public class DatasetController { + + public ResponseEntity<DatasetResponse> getDataset(String id) { + Dataset dataset = datasetService.findById(id); + if (dataset == null) { + throw BusinessException.of(ErrorCode.DATASET_NOT_FOUND); + } + return ResponseEntity.ok(DatasetResponse.from(dataset)); + } +} +``` + +## 快速开始 + +### 构建共享库 +```bash +cd backend +mvn clean install +``` + +### 在服务中使用 +共享库会自动被所有后端服务继承。 + +## 文档 + +- [AGENTS.md](./AGENTS.md) + +## 相关链接 + +- [后端 README](../README.md) diff --git a/runtime/README-zh.md b/runtime/README-zh.md new file mode 100644 index 000000000..34528cd3e --- /dev/null +++ b/runtime/README-zh.md @@ -0,0 +1,147 @@ +# DataMate 运行时 + +## 概述 + +DataMate 运行时提供数据处理、算子执行、数据收集等核心功能,基于 Python 3.12+ 和 FastAPI 框架。 + +## 架构 + +``` +runtime/ +├── datamate-python/ # FastAPI 后端服务(端口 18000) +├── python-executor/ # Ray 分布式执行器 +├── ops/ # 算子生态 +├── datax/ # DataX 数据读写框架 +└── deer-flow/ # DeerFlow 服务 +``` + +## 组件 + +### 1. datamate-python (FastAPI 后端) +**端口**: 18000 + +核心 Python 服务,提供以下功能: +- **数据合成**: QA 生成、文档处理 +- **数据标注**: Label Studio 集成、自动标注 +- **数据评估**: 模型评估、质量检查 +- **数据清洗**: 数据清洗管道 +- **算子市场**: 算子管理、上传 +- **RAG 索引**: 向量索引、知识库管理 +- **数据收集**: 定时任务、数据源集成 + +**技术栈**: +- FastAPI 0.124+ +- SQLAlchemy 2.0+ (async) +- Pydantic 2.12+ +- PostgreSQL (via asyncpg) +- Milvus (via pymilvus) +- APScheduler (定时任务) + +### 2. 
python-executor (Ray 执行器) +Ray 分布式执行框架,负责: +- **算子执行**: 执行数据处理算子 +- **任务调度**: 异步任务管理 +- **分布式计算**: 多节点并行处理 + +**技术栈**: +- Ray 2.7.0 +- FastAPI (执行器 API) +- Data-Juicer (数据处理) + +### 3. ops (算子生态) +算子生态,包含: +- **filter**: 数据过滤(去重、敏感内容、质量过滤) +- **mapper**: 数据转换(清洗、归一化) +- **slicer**: 数据切片(文本分割、幻灯片提取) +- **formatter**: 格式转换(PDF → text, slide → JSON) +- **llms**: LLM 算子(质量评估、条件检查) +- **annotation**: 标注算子(目标检测、分割) + +**见**: `runtime/ops/README.md` 获取算子开发指南 + +### 4. datax (DataX 框架) +DataX 数据读写框架,支持多种数据源: +- **Readers**: MySQL, PostgreSQL, Oracle, MongoDB, Elasticsearch, HDFS, S3, NFS, GlusterFS, API, 等 +- **Writers**: 同上,支持写入目标 + +**技术栈**: Java (Maven 构建) + +### 5. deer-flow (DeerFlow 服务) +DeerFlow 服务(配置见 `conf.yaml`)。 + +## 快速开始 + +### 前置条件 +- Python 3.12+ +- Poetry (for datamate-python) +- Ray 2.7.0+ (for python-executor) + +### 运行 datamate-python +```bash +cd runtime/datamate-python +poetry install +poetry run uvicorn app.main:app --reload --port 18000 +``` + +### 运行 python-executor +```bash +cd runtime/python-executor +poetry install +ray start --head +``` + +## 开发 + +### datamate-python 模块结构 +``` +app/ +├── core/ # 日志、异常、配置 +├── db/ +│ ├── models/ # SQLAlchemy 模型 +│ └── session.py # 异步会话 +├── module/ +│ ├── annotation/ # Label Studio 集成 +│ ├── collection/ # 数据收集 +│ ├── cleaning/ # 数据清洗 +│ ├── dataset/ # 数据集管理 +│ ├── evaluation/ # 模型评估 +│ ├── generation/ # QA 合成 +│ ├── operator/ # 算子市场 +│ ├── rag/ # RAG 索引 +│ └── shared/ # 共享 schemas +└── main.py # FastAPI 入口 +``` + +### 代码约定 +- **路由**: `APIRouter` 在 `interface/*.py` +- **依赖注入**: `Depends(get_db)` 获取会话 +- **错误**: `raise BusinessError(ErrorCode.XXX, context)` +- **事务**: `async with transaction(db):` +- **模型**: Extend `BaseEntity` (审计字段自动填充) + +## 测试 + +```bash +cd runtime/datamate-python +poetry run pytest +``` + +## 配置 + +### 环境变量 +- `DATABASE_URL`: PostgreSQL 连接字符串 +- `LABEL_STUDIO_BASE_URL`: Label Studio URL +- `RAY_ENABLED`: 启用 Ray 执行器 +- `RAY_ADDRESS`: Ray 集群地址 + +## 文档 + +- **API 文档**: 
http://localhost:18000/redoc +- **AGENTS.md**: 见 `runtime/datamate-python/app/AGENTS.md` 获取详细模块文档 +- **算子指南**: 见 `runtime/ops/README.md` 获取算子开发 + +## 相关链接 + +- [FastAPI 文档](https://fastapi.tiangolo.com/) +- [Ray 文档](https://docs.ray.io/) +- [SQLAlchemy 文档](https://docs.sqlalchemy.org/) diff --git a/runtime/datax/README-zh.md b/runtime/datax/README-zh.md new file mode 100644 index 000000000..40d3c8e0a --- /dev/null +++ b/runtime/datax/README-zh.md @@ -0,0 +1,151 @@ +# DataX 框架 + +## 概述 + +DataX 是一个数据传输框架,支持多种数据源和数据目标之间的数据传输,用于数据收集和同步。 + +## 架构 + +``` +runtime/datax/ +├── core/ # DataX 核心组件 +├── transformer/ # 数据转换器 +├── readers/ # 数据读取器 +│ ├── mysqlreader/ +│ ├── postgresqlreader/ +│ ├── oracleReader/ +│ ├── mongodbreader/ +│ ├── hdfsreader/ +│ ├── s3reader/ +│ ├── nfsreader/ +│ ├── glusterfsreader/ +│ └── apireader/ +└── writers/ # 数据写入器 + ├── mysqlwriter/ + ├── postgresqlwriter/ + ├── oraclewriter/ + ├── mongodbwriter/ + ├── hdfswriter/ + ├── s3writer/ + ├── nfswriter/ + ├── glusterfswriter/ + └── txtfilewriter/ +``` + +## 支持的数据源 + +### 关系型数据库 +- MySQL +- PostgreSQL +- Oracle +- SQL Server +- DB2 +- KingbaseES +- GaussDB + +### NoSQL 数据库 +- MongoDB +- Elasticsearch +- Cassandra +- HBase +- Redis + +### 文件系统 +- HDFS +- S3 (AWS S3, MinIO, 阿里云 OSS) +- NFS +- GlusterFS +- 本地文件系统 + +### 其他 +- API 接口 +- Kafka +- Pulsar +- DataHub +- LogHub + +## 使用 + +### 基本配置 +```json +{ + "job": { + "content": [ + { + "reader": { + "name": "mysqlreader", + "parameter": { + "username": "root", + "password": "password", + "column": ["id", "name", "email"], + "connection": [ + { + "jdbcUrl": "jdbc:mysql://localhost:3306/database", + "table": ["users"] + } + ] + } + }, + "writer": { + "name": "txtfilewriter", + "parameter": { + "path": "/output/users.txt", + "fileName": "users", + "writeMode": "truncate" + } + } + } + ] + } +} +``` + +### 运行 DataX +```bash +# 构建 DataX +cd runtime/datax +mvn clean package + +# 运行 +python datax.py -j job.json +``` + +## 快速开始 + +### 前置条件 +- JDK 8+ +- Maven 
3.8+ +- Python 3.6+ + +### 构建 +```bash +cd runtime/datax +mvn clean package +``` + +### 运行示例 +```bash +python datax.py -j examples/mysql2text.json +``` + +## 开发 + +### 添加新的 Reader +1. 在 `readers/` 创建新模块 +2. 实现 Reader 接口 +3. 配置 reader 参数 +4. 添加到 package.xml + +### 添加新的 Writer +1. 在 `writers/` 创建新模块 +2. 实现 Writer 接口 +3. 配置 writer 参数 +4. 添加到 package.xml + +## 文档 + +- [DataX 官方文档](https://github.com/alibaba/DataX) + +## 相关链接 + +- [运行时 README](../README.md) diff --git a/runtime/deer-flow/README-zh.md b/runtime/deer-flow/README-zh.md new file mode 100644 index 000000000..5cfe2d54f --- /dev/null +++ b/runtime/deer-flow/README-zh.md @@ -0,0 +1,148 @@ +# DeerFlow 服务 + +## 概述 + +DeerFlow 是一个 LLM 驱动的服务,用于规划和推理任务,支持多种 LLM 提供商。 + +## 架构 + +``` +runtime/deer-flow/ +├── conf.yaml # DeerFlow 配置文件 +├── .env # 环境变量 +└── (其他源代码) +``` + +## 配置 + +### 基本配置 (conf.yaml) + +```yaml +# 基础模型配置 +BASIC_MODEL: + base_url: https://api.example.com/v1 + model: "model-name" + api_key: your_api_key + max_retries: 3 + verify_ssl: false # 如果使用自签名证书,设为 false + +# 推理模型配置(可选) +REASONING_MODEL: + base_url: https://api.example.com/v1 + model: "reasoning-model-name" + api_key: your_api_key + max_retries: 3 + +# 搜索引擎配置(可选) +SEARCH_ENGINE: + engine: tavily + include_domains: + - example.com + - trusted-news.com + exclude_domains: + - spam-site.com + search_depth: "advanced" + include_raw_content: true + include_images: true + include_image_descriptions: true + min_score_threshold: 0.0 + max_content_length_per_page: 4000 +``` + +## 支持的 LLM 提供商 + +#### OpenAI +```yaml +BASIC_MODEL: + base_url: https://api.openai.com/v1 + model: "gpt-4" + api_key: sk-... 
+``` + +#### Ollama (本地部署) +```yaml +BASIC_MODEL: + base_url: "http://localhost:11434/v1" + model: "qwen2:7b" + api_key: "ollama" + verify_ssl: false +``` + +#### Google AI Studio +```yaml +BASIC_MODEL: + platform: "google_aistudio" + model: "gemini-2.5-flash" + api_key: your_gemini_api_key +``` + +#### 火山引擎(豆包) +```yaml +BASIC_MODEL: + base_url: https://ark.cn-beijing.volces.com/api/v3 + model: "doubao-1.5-pro-32k-250115" + api_key: your_api_key +``` + +## 快速开始 + +### 前置条件 +- Python 3.8+ +- LLM API Key 或本地 LLM + +### 配置 +1. 复制 `conf.yaml.example` 为 `conf.yaml` +2. 配置 LLM 提供商和 API Key +3. (可选)配置推理模型和搜索引擎 + +### 运行 +```bash +cd runtime/deer-flow +python -m deerflow +``` + +## 使用 + +### 基本规划 +```python +from deerflow import DeerFlow + +flow = DeerFlow() +result = flow.plan( + task="设计一个数据处理流程", + context="需要处理CSV文件,进行数据清洗和转换" +) +print(result) +``` + +### 推理任务 +```python +from deerflow import DeerFlow + +flow = DeerFlow() +result = flow.reason( + task="分析数据质量", + context="数据包含缺失值和异常值" +) +print(result) +``` + +## 开发 + +### 添加新的 LLM 提供商 +1. 在 `conf.yaml` 添加新的模型配置 +2. 实现对应的 API 调用逻辑 +3. 测试连接和推理 + +### 自定义提示词模板 +1. 创建提示词模板文件 +2. 在 `conf.yaml` 引用模板 +3. 
测试提示词效果 + +## 文档 + +- [DeerFlow 官方文档](https://github.com/ModelEngine-Group/DeerFlow) + +## 相关链接 + +- [运行时 README](../README.md) diff --git a/runtime/python-executor/README-zh.md b/runtime/python-executor/README-zh.md new file mode 100644 index 000000000..b833aece6 --- /dev/null +++ b/runtime/python-executor/README-zh.md @@ -0,0 +1,221 @@ +# Ray 执行器 + +## 概述 + +Ray 执行器是基于 Ray 的分布式执行框架,负责执行数据处理算子、任务调度和分布式计算。 + +## 架构 + +``` +runtime/python-executor/ +└── datamate/ + ├── core/ + │ ├── base_op.py # BaseOp, Mapper, Filter, Slicer, LLM + │ ├── dataset.py # Dataset 处理 + │ └── constant.py # 常量定义 + ├── scheduler/ + │ ├── scheduler.py # TaskScheduler, Task, TaskStatus + │ ├── func_task_scheduler.py # 函数任务调度 + │ └── cmd_task_scheduler.py # 命令任务调度 + ├── wrappers/ + │ ├── executor.py # Ray 执行器入口 + │ ├── datamate_wrapper.py # DataMate 任务包装 + │ └── data_juicer_wrapper.py # DataJuicer 集成 + └── common/utils/ # 工具函数 + ├── bytes_transform.py + ├── file_scanner.py + ├── lazy_loader.py + └── text_splitter.py +``` + +## 组件 + +### 1. Base 类 + +#### BaseOp +所有算子的基类: + +```python +class BaseOp: + def __init__(self, *args, **kwargs): + self.accelerator = kwargs.get('accelerator', "cpu") + self.text_key = kwargs.get('text_key', "text") + # ... 其他配置 + + def execute(self, sample): + raise NotImplementedError +``` + +#### Mapper +数据转换算子基类(1:1): + +```python +class Mapper(BaseOp): + def execute(self, sample: Dict) -> Dict: + # 转换逻辑 + return processed_sample +``` + +#### Filter +数据过滤算子基类(返回 bool): + +```python +class Filter(BaseOp): + def execute(self, sample: Dict) -> bool: + # 过滤逻辑 + return True # 保留或过滤 +``` + +#### Slicer +数据切片算子基类(1:N): + +```python +class Slicer(BaseOp): + def execute(self, sample: Dict) -> List[Dict]: + # 切片逻辑 + return [sample1, sample2, ...] 
+``` + +#### LLM +LLM 算子基类: + +```python +class LLM(Mapper): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.llm = self.get_llm(*args, **kwargs) + + def build_llm_prompt(self, *args, **kwargs): + raise NotImplementedError +``` + +### 2. Task Scheduler + +异步任务调度器: + +```python +class TaskScheduler: + def __init__(self, max_concurrent: int = 10): + self.tasks: Dict[str, Task] = {} + self.semaphore = asyncio.Semaphore(max_concurrent) + + async def submit(self, task_id, task, *args, **kwargs): + # 提交任务 + pass + + def get_task_status(self, task_id: str) -> Optional[TaskResult]: + # 获取任务状态 + pass + + def cancel_task(self, task_id: str) -> bool: + # 取消任务 + pass +``` + +### 3. 算子执行 + +#### 算子注册 +```python +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module( + module_name='YourOperatorName', + module_path="ops.user.operator_package.process" +) +``` + +#### 执行算子 +```python +from datamate.core.base_op import Mapper + +class MyMapper(Mapper): + def execute(self, sample): + text = sample.get('text', '') + processed = text.upper() + sample['text'] = processed + return sample +``` + +## 快速开始 + +### 前置条件 +- Python 3.11+ +- Ray 2.7.0+ +- Poetry + +### 安装 +```bash +cd runtime/python-executor +poetry install +``` + +### 启动 Ray Head +```bash +ray start --head +``` + +### 启动 Ray Worker +```bash +ray start --address=<head-ip>:6379 +``` + +## 使用 + +### 提交任务到 Ray +```python +import ray + +@ray.remote +def execute_operator(sample, operator_config): + # 执行算子逻辑 + return result + +# 提交任务 +result_ref = execute_operator.remote(sample, config) +result = ray.get(result_ref) +``` + +### 使用 Task Scheduler +```python +from datamate.scheduler.scheduler import TaskScheduler + +scheduler = TaskScheduler(max_concurrent=10) +task_id = "task-001" +await scheduler.submit(task_id, my_function, arg1, arg2) +status = scheduler.get_task_status(task_id) +``` + +## 开发 + +### 添加新算子 +1. 在 `runtime/ops/` 创建算子目录 +2. 实现 `process.py` 和 `__init__.py` +3. 
在 `__init__.py` 注册算子 +4. 测试算子 + +### 调试算子 +```bash +# 本地测试 +python -c "from ops.user.operator_package.process import YourOperatorName; op = YourOperatorName(); print(op.execute({'text': 'test'}))" +``` + +## 性能 + +### 并行执行 +Ray 自动处理并行执行和资源分配。 + +### 容错 +Ray 提供自动任务重试和故障转移。 + +### 资源管理 +Ray 动态分配 CPU、GPU、内存资源。 + +## 文档 + +- [Ray 文档](https://docs.ray.io/) +- [AGENTS.md](./AGENTS.md) + +## 相关链接 + +- [运行时 README](../README.md) +- [算子生态](../ops/README.md) From 29763f2ba4b7d82860815e3a2becf321bfb5e967 Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Tue, 24 Mar 2026 19:08:04 +0800 Subject: [PATCH 4/6] =?UTF-8?q?=E4=B8=B0=E5=AF=8C=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README-zh.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README-zh.md b/README-zh.md index 3afb2ee99..904cc3140 100644 --- a/README-zh.md +++ b/README-zh.md @@ -118,21 +118,21 @@ make uninstall - **[AGENTS.md](./AGENTS.md)** - AI 助手指南和代码规范 ### 后端文档 -- **[backend/README.md](./backend/README.md)** - 后端架构、服务和技术栈 -- **[backend/api-gateway/README.md](./backend/api-gateway/README.md)** - API Gateway 配置和路由 -- **[backend/services/main-application/README.md](./backend/services/main-application/README.md)** - 主应用模块 -- **[backend/shared/README.md](./backend/shared/README.md)** - 共享库(domain-common, security-common) +- **[backend/README-zh.md](./backend/README-zh.md)** - 后端架构、服务和技术栈 +- **[backend/api-gateway/README-zh.md](./backend/api-gateway/README-zh.md)** - API Gateway 配置和路由 +- **[backend/services/main-application/README-zh.md](./backend/services/main-application/README-zh.md)** - 主应用模块 +- **[backend/shared/README-zh.md](./backend/shared/README-zh.md)** - 共享库(domain-common, security-common) ### 运行时文档 -- **[runtime/README.md](./runtime/README.md)** - 运行时架构和组件 -- **[runtime/datamate-python/README.md](./runtime/datamate-python/README.md)** - FastAPI 后端服务 -- 
**[runtime/python-executor/README.md](./runtime/python-executor/README.md)** - Ray 执行器框架 +- **[runtime/README-zh.md](./runtime/README-zh.md)** - 运行时架构和组件 +- **[runtime/datamate-python/README-zh.md](./runtime/datamate-python/README-zh.md)** - FastAPI 后端服务 +- **[runtime/python-executor/README-zh.md](./runtime/python-executor/README-zh.md)** - Ray 执行器框架 - **[runtime/ops/README.md](./runtime/ops/README.md)** - 算子生态 -- **[runtime/datax/README.md](./runtime/datax/README.md)** - DataX 数据框架 -- **[runtime/deer-flow/README.md](./runtime/deer-flow/README.md)** - DeerFlow LLM 服务 +- **[runtime/datax/README-zh.md](./runtime/datax/README-zh.md)** - DataX 数据框架 +- **[runtime/deer-flow/README-zh.md](./runtime/deer-flow/README-zh.md)** - DeerFlow LLM 服务 ### 前端文档 -- **[frontend/README.md](./frontend/README.md)** - React 前端应用 +- **[frontend/README-zh.md](./frontend/README-zh.md)** - React 前端应用 ## 🤝 贡献指南 From 6aed26b497a9ce43f4d0c01ecf10bebdd0cf77b6 Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Tue, 24 Mar 2026 19:08:38 +0800 Subject: [PATCH 5/6] =?UTF-8?q?=E4=B8=B0=E5=AF=8C=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .sisyphus/boulder.json | 19 - .sisyphus/plans/add-documentation.md | 655 --------------------------- 2 files changed, 674 deletions(-) delete mode 100644 .sisyphus/boulder.json delete mode 100644 .sisyphus/plans/add-documentation.md diff --git a/.sisyphus/boulder.json b/.sisyphus/boulder.json deleted file mode 100644 index 54d1434b2..000000000 --- a/.sisyphus/boulder.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "active_plan": "/Users/hsc/Applications/opensource/DataMate/.sisyphus/plans/add-documentation.md", - "started_at": "2026-03-24T07:53:28.467Z", - "session_ids": [ - "ses_2e1786466ffebN7bw1pqZhHWwD", - "ses_2e1236f1affeab7qN32dYM1ZFB", - "ses_2e1234d66ffeEgvMQ5SVS2y6uo", - "ses_2e1230e90ffe3uKb0MBfgLgHFC", - "ses_2e122f9fcffeMh5lU9ipSmz0hq", - "ses_2e122e4cfffe0GGn3fQXMGnUm8", - 
"ses_2e122cd8effe9fkKbT7Od5cx5B", - "ses_2e122b2ccffeHHEWypTyHV4lLq", - "ses_2e1229ca2ffe5toRFk1auHa8bc", - "ses_2e12288b7ffeq30gRXccrzmAdU", - "ses_2e1232d73ffe8ibloZlm0mzG02" - ], - "plan_name": "add-documentation", - "agent": "atlas" -} \ No newline at end of file diff --git a/.sisyphus/plans/add-documentation.md b/.sisyphus/plans/add-documentation.md deleted file mode 100644 index 0e26eccb3..000000000 --- a/.sisyphus/plans/add-documentation.md +++ /dev/null @@ -1,655 +0,0 @@ -# Add High and Medium Priority Documentation - -## TL;DR - -> **Quick Summary**: Add 8 missing documentation files (4 high priority, 4 medium priority) to improve project coverage from 60% to ~85%. -> -> **Deliverables**: 8 new README.md files with comprehensive content -> - backend/README.md -> - runtime/README.md -> - DEVELOPMENT.md -> - ARCHITECTURE.md -> - backend/api-gateway/README.md -> - backend/services/main-application/README.md -> - backend/shared/README.md -> - runtime/datax/README.md -> - runtime/deer-flow/README.md -> - runtime/python-executor/README.md -> -> **Estimated Effort**: Short -> **Parallel Execution**: YES - 10 parallel tasks -> **Critical Path**: None (all independent) - ---- - -## Context - -### Original Request -User requested to add high and medium priority documentation files to DataMate project. - -### Analysis Summary -**Current Documentation Coverage**: ~60% -- Existing: 23 README.md + 8 AGENTS.md -- Missing: 15+ critical documentation files - -**Key Findings**: -- Backend has no overall README -- Runtime has no overall README -- No development guide for local setup -- No architecture documentation -- Individual service READMEs missing - ---- - -## Work Objectives - -### Core Objective -Create comprehensive documentation for high and medium priority modules to improve project maintainability and onboarding experience. 
- -### Concrete Deliverables -- 4 high-priority docs: backend/README.md, runtime/README.md, DEVELOPMENT.md, ARCHITECTURE.md -- 6 medium-priority docs: service and component READMEs - -### Definition of Done -- [ ] All 10 documentation files created -- [ ] Each file has proper structure (Overview, Quick Start, Development) -- [ ] Links to related documentation included -- [ ] Code examples where applicable - -### Must Have -- Clear overview of each component -- Quick start instructions -- Technology stack information -- Development guidelines -- Links to related docs - -### Must NOT Have (Guardrails) -- Generic "placeholder" content -- Outdated information -- Broken internal links -- Duplicate content from other docs - ---- - -## Verification Strategy - -> **ZERO HUMAN INTERVENTION** — ALL verification is agent-executed. - -### Test Decision -- **Infrastructure exists**: NO -- **Automated tests**: None -- **Framework**: None - -### QA Policy -Every task MUST include agent-executed QA scenarios: -- Verify file exists -- Verify file is not empty -- Verify markdown syntax -- Verify internal links work - ---- - -## Execution Strategy - -### Parallel Execution Waves - -``` -Wave 1 (Start Immediately — all docs independent): -├── Task 1: Create backend/README.md [quick] -├── Task 2: Create runtime/README.md [quick] -├── Task 3: Create DEVELOPMENT.md [quick] -├── Task 4: Create ARCHITECTURE.md [quick] -├── Task 5: Create backend/api-gateway/README.md [quick] -├── Task 6: Create backend/services/main-application/README.md [quick] -├── Task 7: Create backend/shared/README.md [quick] -├── Task 8: Create runtime/datax/README.md [quick] -├── Task 9: Create runtime/deer-flow/README.md [quick] -└── Task 10: Create runtime/python-executor/README.md [quick] - -Wave FINAL (After ALL tasks): -├── Task F1: Verify all files exist [quick] -└── Task F2: Verify no broken links [quick] - -Critical Path: None (all independent) -Parallel Speedup: ~90% faster than sequential -Max Concurrent: 
10 -``` - -### Dependency Matrix - -- **1-10**: — — F1, F2, 1 -- **F1**: 1-10 — F2, 2 -- **F2**: 1-10, F1 — 3 - -### Agent Dispatch Summary - -- **1**: **10** — T1-T10 → `quick` -- **2**: **2** — F1 → `quick`, F2 → `quick` - ---- - -## TODOs - -- [ ] 1. Create backend/README.md - - **What to do**: - - Create comprehensive README for backend module - - Include: Overview, Architecture, Services, Tech Stack, Quick Start, Development, Testing - - Reference: backend/pom.xml, services/pom.xml, AGENTS.md - - **Must NOT do**: - - Duplicate content from individual service READMEs - - Include outdated configuration examples - - **Recommended Agent Profile**: - - **Category**: `quick` - - Reason: Simple file creation with well-defined structure - - **Skills**: `[]` - - **Skills Evaluated but Omitted**: None needed - - **Parallelization**: - - **Can Run In Parallel**: YES - - **Parallel Group**: Wave 1 (with Tasks 2-10) - - **Blocks**: F1, F2 - - **Blocked By**: None - - **References**: - - `backend/pom.xml` - Module structure and dependencies - - `backend/services/pom.xml` - Service modules - - `backend/shared/AGENTS.md` - Shared libraries documentation - - **Acceptance Criteria**: - - [ ] File created: backend/README.md - - [ ] File is valid markdown (can be parsed) - - [ ] Contains all required sections - - **QA Scenarios**: - ``` - Scenario: Verify backend/README.md exists and is valid - Tool: Bash - Preconditions: None - Steps: - 1. Check file exists: test -f backend/README.md - 2. Check file is not empty: test -s backend/README.md - 3. Check line count > 50: wc -l backend/README.md - Expected Result: File exists, not empty, >50 lines - Failure Indicators: File not found, empty file, too short - Evidence: .sisyphus/evidence/task-1-backend-readme-verify.txt - - Scenario: Verify markdown syntax - Tool: Bash - Preconditions: File exists - Steps: - 1. 
Check for proper markdown headers: grep -c "^#" backend/README.md - Expected Result: At least 5 markdown headers found - Failure Indicators: No headers found - Evidence: .sisyphus/evidence/task-1-markdown-syntax.txt - ``` - - **Commit**: NO (group with final task) - -- [ ] 2. Create runtime/README.md - - **What to do**: - - Create comprehensive README for runtime module - - Include: Overview, Components (datamate-python, python-executor, ops, datax, deer-flow), Tech Stack, Quick Start, Development - - Reference: runtime/datamate-python/pyproject.toml, AGENTS.md files - - **Must NOT do**: - - Duplicate content from individual component READMEs - - Include outdated Ray configuration - - **Recommended Agent Profile**: - - **Category**: `quick` - - Reason: Simple file creation with well-defined structure - - **Skills**: `[]` - - **Parallelization**: - - **Can Run In Parallel**: YES - - **Parallel Group**: Wave 1 (with Tasks 1, 3-10) - - **Blocks**: F1, F2 - - **Blocked By**: None - - **References**: - - `runtime/datamate-python/pyproject.toml` - Python dependencies and project info - - `runtime/datamate-python/app/AGENTS.md` - Python backend docs - - `runtime/ops/AGENTS.md` - Operator ecosystem docs - - `runtime/python-executor/AGENTS.md` - Ray executor docs - - **Acceptance Criteria**: - - [ ] File created: runtime/README.md - - [ ] File is valid markdown - - [ ] Contains all component descriptions - - **QA Scenarios**: - ``` - Scenario: Verify runtime/README.md exists and is valid - Tool: Bash - Preconditions: None - Steps: - 1. Check file exists: test -f runtime/README.md - 2. Check file is not empty: test -s runtime/README.md - 3. Check line count > 50: wc -l runtime/README.md - Expected Result: File exists, not empty, >50 lines - Failure Indicators: File not found, empty file, too short - Evidence: .sisyphus/evidence/task-2-runtime-readme-verify.txt - ``` - - **Commit**: NO (group with final task) - -- [ ] 3. 
Create DEVELOPMENT.md - - **What to do**: - - Create comprehensive development guide - - Include: Prerequisites, Quick Start, Project Structure, Development Workflow, Environment Config, Testing, Debugging, Common Issues - - Cover Java, Python, and React development - - **Must NOT do**: - - Include environment-specific secrets - - Duplicate content from individual READMEs - - **Recommended Agent Profile**: - - **Category**: `quick` - - Reason: Simple file creation with well-defined structure - - **Skills**: `[]` - - **Parallelization**: - - **Can Run In Parallel**: YES - - **Parallel Group**: Wave 1 (with Tasks 1-2, 4-10) - - **Blocks**: F1, F2 - - **Blocked By**: None - - **References**: - - `AGENTS.md` - Code style guidelines - - `backend/pom.xml` - Java dependencies - - `frontend/package.json` - Node dependencies - - `runtime/datamate-python/pyproject.toml` - Python dependencies - - **Acceptance Criteria**: - - [ ] File created: DEVELOPMENT.md - - [ ] File is valid markdown - - [ ] Covers all three languages (Java, Python, React) - - **QA Scenarios**: - ``` - Scenario: Verify DEVELOPMENT.md exists and is valid - Tool: Bash - Preconditions: None - Steps: - 1. Check file exists: test -f DEVELOPMENT.md - 2. Check file is not empty: test -s DEVELOPMENT.md - 3. Check line count > 100: wc -l DEVELOPMENT.md - Expected Result: File exists, not empty, >100 lines - Failure Indicators: File not found, empty file, too short - Evidence: .sisyphus/evidence/task-3-development-verify.txt - ``` - - **Commit**: NO (group with final task) - -- [ ] 4. 
Create ARCHITECTURE.md - - **What to do**: - - Create comprehensive architecture documentation - - Include: High-level architecture diagram, Components, Data Flow, Technology Stack, Communication Patterns, Security, Scalability, Deployment, Monitoring - - Include ASCII art diagram - - **Must NOT do**: - - Include outdated diagrams - - Duplicate content from other docs - - **Recommended Agent Profile**: - - **Category**: `quick` - - Reason: Simple file creation with well-defined structure - - **Skills**: `[]` - - **Parallelization**: - - **Can Run In Parallel**: YES - - **Parallel Group**: Wave 1 (with Tasks 1-3, 5-10) - - **Blocks**: F1, F2 - - **Blocked By**: None - - **References**: - - `backend/services/main-application/src/main/resources/application.yml` - Service configuration - - `backend/api-gateway/src/main/resources/application.yml` - Gateway configuration - - `runtime/datamate-python/app/main.py` - Python entry point - - **Acceptance Criteria**: - - [ ] File created: ARCHITECTURE.md - - [ ] File is valid markdown - - [ ] Contains architecture diagram - - [ ] Contains all major sections - - **QA Scenarios**: - ``` - Scenario: Verify ARCHITECTURE.md exists and is valid - Tool: Bash - Preconditions: None - Steps: - 1. Check file exists: test -f ARCHITECTURE.md - 2. Check file is not empty: test -s ARCHITECTURE.md - 3. Check line count > 100: wc -l ARCHITECTURE.md - Expected Result: File exists, not empty, >100 lines - Failure Indicators: File not found, empty file, too short - Evidence: .sisyphus/evidence/task-4-architecture-verify.txt - ``` - - **Commit**: NO (group with final task) - -- [ ] 5. 
Create backend/api-gateway/README.md - - **What to do**: - - Create README for API Gateway - - Include: Overview, Configuration (ports, routes, auth), Development, Testing - - Reference: backend/api-gateway/src/main/resources/application.yml - - **Must NOT do**: - - Include JWT secrets - - Duplicate backend/README.md content - - **Recommended Agent Profile**: - - **Category**: `quick` - - Reason: Simple file creation with well-defined structure - - **Skills**: `[]` - - **Parallelization**: - - **Can Run In Parallel**: YES - - **Parallel Group**: Wave 1 (with Tasks 1-4, 6-10) - - **Blocks**: F1, F2 - - **Blocked By**: None - - **References**: - - `backend/api-gateway/src/main/resources/application.yml` - Gateway configuration - - `backend/api-gateway/pom.xml` - Dependencies - - **Acceptance Criteria**: - - [ ] File created: backend/api-gateway/README.md - - [ ] File is valid markdown - - [ ] Contains configuration details - - **QA Scenarios**: - ``` - Scenario: Verify api-gateway/README.md exists - Tool: Bash - Preconditions: None - Steps: - 1. Check file exists: test -f backend/api-gateway/README.md - 2. Check file is not empty: test -s backend/api-gateway/README.md - Expected Result: File exists, not empty - Failure Indicators: File not found, empty file - Evidence: .sisyphus/evidence/task-5-api-gateway-verify.txt - ``` - - **Commit**: NO (group with final task) - -- [ ] 6. 
Create backend/services/main-application/README.md - - **What to do**: - - Create README for Main Application - - Include: Overview, Modules (data management, data cleaning, operator market), Configuration, Development - - Reference: backend/services/main-application/src/main/resources/application.yml - - **Must NOT do**: - - Duplicate backend/README.md content - - Include database credentials - - **Recommended Agent Profile**: - - **Category**: `quick` - - Reason: Simple file creation with well-defined structure - - **Skills**: `[]` - - **Parallelization**: - - **Can Run In Parallel**: YES - - **Parallel Group**: Wave 1 (with Tasks 1-5, 7-10) - - **Blocks**: F1, F2 - - **Blocked By**: None - - **References**: - - `backend/services/main-application/src/main/resources/application.yml` - Application configuration - - `backend/services/main-application/pom.xml` - Dependencies - - **Acceptance Criteria**: - - [ ] File created: backend/services/main-application/README.md - - [ ] File is valid markdown - - [ ] Contains module descriptions - - **QA Scenarios**: - ``` - Scenario: Verify main-application/README.md exists - Tool: Bash - Preconditions: None - Steps: - 1. Check file exists: test -f backend/services/main-application/README.md - 2. Check file is not empty: test -s backend/services/main-application/README.md - Expected Result: File exists, not empty - Failure Indicators: File not found, empty file - Evidence: .sisyphus/evidence/task-6-main-app-verify.txt - ``` - - **Commit**: NO (group with final task) - -- [ ] 7. 
Create backend/shared/README.md - - **What to do**: - - Create README for shared libraries - - Include: Overview, domain-common (exceptions, entities), security-common (JWT), Usage examples - - Reference: backend/shared/AGENTS.md - - **Must NOT do**: - - Duplicate AGENTS.md content - - Include internal implementation details - - **Recommended Agent Profile**: - - **Category**: `quick` - - Reason: Simple file creation with well-defined structure - - **Skills**: `[]` - - **Parallelization**: - - **Can Run In Parallel**: YES - - **Parallel Group**: Wave 1 (with Tasks 1-6, 8-10) - - **Blocks**: F1, F2 - - **Blocked By**: None - - **References**: - - `backend/shared/AGENTS.md` - Shared libraries documentation - - `backend/shared/domain-common/pom.xml` - Dependencies - - `backend/shared/security-common/pom.xml` - Dependencies - - **Acceptance Criteria**: - - [ ] File created: backend/shared/README.md - - [ ] File is valid markdown - - [ ] Contains library descriptions - - **QA Scenarios**: - ``` - Scenario: Verify shared/README.md exists - Tool: Bash - Preconditions: None - Steps: - 1. Check file exists: test -f backend/shared/README.md - 2. Check file is not empty: test -s backend/shared/README.md - Expected Result: File exists, not empty - Failure Indicators: File not found, empty file - Evidence: .sisyphus/evidence/task-7-shared-verify.txt - ``` - - **Commit**: NO (group with final task) - -- [ ] 8. 
Create runtime/datax/README.md - - **What to do**: - - Create README for DataX framework - - Include: Overview, Supported readers/writers (MySQL, PostgreSQL, Oracle, MongoDB, HDFS, S3, NFS, etc.), Usage examples - - Reference: runtime/datax/package.xml - - **Must NOT do**: - - Include database credentials - - Duplicate runtime/README.md content - - **Recommended Agent Profile**: - - **Category**: `quick` - - Reason: Simple file creation with well-defined structure - - **Skills**: `[]` - - **Parallelization**: - - **Can Run In Parallel**: YES - - **Parallel Group**: Wave 1 (with Tasks 1-7, 9-10) - - **Blocks**: F1, F2 - - **Blocked By**: None - - **References**: - - `runtime/datax/package.xml` - DataX assembly configuration - - **Acceptance Criteria**: - - [ ] File created: runtime/datax/README.md - - [ ] File is valid markdown - - [ ] Contains reader/writer list - - **QA Scenarios**: - ``` - Scenario: Verify datax/README.md exists - Tool: Bash - Preconditions: None - Steps: - 1. Check file exists: test -f runtime/datax/README.md - 2. Check file is not empty: test -s runtime/datax/README.md - Expected Result: File exists, not empty - Failure Indicators: File not found, empty file - Evidence: .sisyphus/evidence/task-8-datax-verify.txt - ``` - - **Commit**: NO (group with final task) - -- [ ] 9. 
Create runtime/deer-flow/README.md - - **What to do**: - - Create README for DeerFlow service - - Include: Overview, Configuration (conf.yaml), Usage, LLM integration - - Reference: runtime/deer-flow/conf.yaml - - **Must NOT do**: - - Include API keys - - Duplicate runtime/README.md content - - **Recommended Agent Profile**: - - **Category**: `quick` - - Reason: Simple file creation with well-defined structure - - **Skills**: `[]` - - **Parallelization**: - - **Can Run In Parallel**: YES - - **Parallel Group**: Wave 1 (with Tasks 1-8, 10) - - **Blocks**: F1, F2 - - **Blocked By**: None - - **References**: - - `runtime/deer-flow/conf.yaml` - DeerFlow configuration - - `runtime/deer-flow/.env` - Environment variables - - **Acceptance Criteria**: - - [ ] File created: runtime/deer-flow/README.md - - [ ] File is valid markdown - - [ ] Contains configuration guide - - **QA Scenarios**: - ``` - Scenario: Verify deer-flow/README.md exists - Tool: Bash - Preconditions: None - Steps: - 1. Check file exists: test -f runtime/deer-flow/README.md - 2. Check file is not empty: test -s runtime/deer-flow/README.md - Expected Result: File exists, not empty - Failure Indicators: File not found, empty file - Evidence: .sisyphus/evidence/task-9-deer-flow-verify.txt - ``` - - **Commit**: NO (group with final task) - -- [ ] 10. 
Create runtime/python-executor/README.md - - **What to do**: - - Create README for Ray executor - - Include: Overview, Architecture (scheduler, wrappers, core), Operator execution, Quick start - - Reference: runtime/python-executor/AGENTS.md, pyproject.toml - - **Must NOT do**: - - Duplicate AGENTS.md content - - Include Ray cluster credentials - - **Recommended Agent Profile**: - - **Category**: `quick` - - Reason: Simple file creation with well-defined structure - - **Skills**: `[]` - - **Parallelization**: - - **Can Run In Parallel**: YES - - **Parallel Group**: Wave 1 (with Tasks 1-9) - - **Blocks**: F1, F2 - - **Blocked By**: None - - **References**: - - `runtime/python-executor/AGENTS.md` - Ray executor documentation - - `runtime/python-executor/pyproject.toml` - Dependencies - - **Acceptance Criteria**: - - [ ] File created: runtime/python-executor/README.md - - [ ] File is valid markdown - - [ ] Contains architecture description - - **QA Scenarios**: - ``` - Scenario: Verify python-executor/README.md exists - Tool: Bash - Preconditions: None - Steps: - 1. Check file exists: test -f runtime/python-executor/README.md - 2. Check file is not empty: test -s runtime/python-executor/README.md - Expected Result: File exists, not empty - Failure Indicators: File not found, empty file - Evidence: .sisyphus/evidence/task-10-executor-verify.txt - ``` - - **Commit**: NO (group with final task) - ---- - -## Final Verification Wave - -- [ ] F1. **Verify All Files Exist** — `quick` - Check that all 10 documentation files were created successfully. - - Verify each file exists - - Verify each file is not empty - - Verify each file has valid markdown syntax - Output: `Files [10/10] | VERDICT: APPROVE/`REJECT` - -- [ ] F2. **Verify No Broken Links** — `quick` - Check internal links in documentation files. 
- - Search for markdown links `[text](path)` - - Verify referenced files exist - - Report any broken links - Output: `Links [N/N valid] | VERDICT: APPROVE/REJECT` - ---- - -## Commit Strategy - -- **10**: `docs: add high and medium priority documentation` — backend/README.md, runtime/README.md, DEVELOPMENT.md, ARCHITECTURE.md, backend/api-gateway/README.md, backend/services/main-application/README.md, backend/shared/README.md, runtime/datax/README.md, runtime/deer-flow/README.md, runtime/python-executor/README.md - ---- - -## Success Criteria - -### Verification Commands -```bash -# Check all files exist -test -f backend/README.md && test -f runtime/README.md && test -f DEVELOPMENT.md && test -f ARCHITECTURE.md - -# Count files -find . -name "README.md" -not -path "*/node_modules/*" -not -path "*/.venv/*" | wc -l -``` - -### Final Checklist -- [ ] All 10 documentation files created -- [ ] Each file has proper structure -- [ ] No broken internal links -- [ ] Documentation coverage improved to ~85% From 218abe4cb3f12349134f03bdcda5465f7583fa6d Mon Sep 17 00:00:00 2001 From: hhhhsc <1710496817@qq.com> Date: Thu, 26 Mar 2026 17:41:31 +0800 Subject: [PATCH 6/6] =?UTF-8?q?=E4=B8=B0=E5=AF=8C=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ARCHITECTURE-zh.md | 238 ----------------- ARCHITECTURE.md | 239 ------------------ README-zh.md | 1 - README.md | 1 - backend/README.md | 26 +- backend/api-gateway/README.md | 34 +-- .../services/main-application/README-zh.md | 28 +- backend/services/main-application/README.md | 64 ++--- backend/shared/README.md | 36 +-- runtime/README-zh.md | 1 - runtime/README.md | 75 +++--- runtime/datax/README.md | 58 ++--- runtime/deer-flow/README-zh.md | 51 ---- runtime/deer-flow/README.md | 91 ++----- runtime/python-executor/README.md | 94 +++---- 15 files changed, 201 insertions(+), 836 deletions(-) delete mode 100644 ARCHITECTURE-zh.md delete mode 100644 ARCHITECTURE.md diff 
--git a/ARCHITECTURE-zh.md b/ARCHITECTURE-zh.md deleted file mode 100644 index 355fcf60a..000000000 --- a/ARCHITECTURE-zh.md +++ /dev/null @@ -1,238 +0,0 @@ -# DataMate 架构 - -## 概述 - -DataMate 是一个基于微服务的数据管理平台,用于模型微调和 RAG 检索。它采用多语言架构,包含 Java 后端、Python 运行时和 React 前端。 - -## 高层架构 - -``` -┌─────────────────────────────────────────────────────────┐ -│ 前端 (React) │ -│ localhost:5173 │ -└────────────────┬────────────────────────────────────────┘ - │ HTTP/REST - ▼ -┌─────────────────────────────────────────────────────────┐ -│ API Gateway │ -│ (Spring Cloud) │ -│ localhost:8080 │ -│ ┌──────────────────────────────────────────────────┐ │ -│ │ 认证 (JWT) │ │ -│ │ 路由转发 │ │ -│ │ 限流 │ │ -│ └──────────────────────────────────────────────────┘ │ -└────────────────┬────────────────────────────────────────┘ - │ - ├─────────────────┬─────────────────┐ - ▼ ▼ ▼ -┌─────────────────────────┐ ┌─────────────────────────┐ ┌─────────────────────────┐ -│ 主应用 │ │ 数据管理服务 │ │ RAG 索引器 │ -│ (Spring Boot) │ │ Service │ │ Service │ -│ - 数据清洗 │ │ - 数据集管理 │ │ - 知识库管理 │ │ -│ - 算子市场 │ │ - 文件操作 │ │ - 向量搜索 │ │ -│ - 数据收集 │ │ - 标签管理 │ │ - Milvus 集成 │ │ -└─────────┬───────────┘ └─────────┬───────────┘ └─────────┬───────────┘ - │ │ │ - ▼ ▼ ▼ -┌─────────────────────────────────────────────────────────────────────────────────┐ -│ PostgreSQL (元数据) │ -│ Redis (缓存) │ -│ Milvus (向量) │ -│ MinIO (文件) │ -└─────────────────────────────────────────────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────┐ -│ Python 运行时 (FastAPI) │ -│ localhost:18000 │ -│ ┌──────────────────────────────────────────────────┐ │ -│ │ 数据合成 │ │ -│ │ 数据标注 (Label Studio 集成) │ │ -│ │ 数据评估 │ │ -│ │ RAG 索引 │ │ -│ └──────────────────────────────────────────────────┘ │ -└────────────────┬────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────┐ -│ Ray 执行器 (分布式) │ -│ ┌──────────────────────────────────────────────────┐ │ -│ │ 算子执行 │ │ -│ │ 任务调度 │ │ 
-│ │ 分布式计算 │ │ -│ └──────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────┘ -``` - -## 组件 - -### 前端层 -- **框架**: React 18 + TypeScript + Vite -- **UI 库**: Ant Design -- **样式**: TailwindCSS v4 -- **状态管理**: Redux Toolkit -- **路由**: React Router v7 - -### 后端层(Java) -- **API Gateway**: Spring Cloud Gateway - - 路由转发 - - JWT 认证 - - 限流 - -- **主应用**: Spring Boot 3.5 - - 数据清洗管道 - - 算子市场 - - 数据收集任务 - -- **数据管理服务**: Spring Boot 3.5 - - 数据集 CRUD - - 文件操作 - - 标签管理 - -- **RAG 索引器服务**: Spring Boot 3.5 - - 知识库管理 - - 向量搜索 - - Milvus 集成 - -### 运行时层(Python) -- **FastAPI 后端**: 端口 18000 - - 数据合成(QA 生成) - - 数据标注(Label Studio 集成) - - 模型评估 - - RAG 索引 - -- **Ray 执行器**: 分布式执行 - - 算子执行 - - 任务调度 - - 多节点并行性 - -### 算子生态 -- **filter**: 数据过滤(去重、敏感内容、质量) -- **mapper**: 数据转换(清洗、归一化) -- **slicer**: 数据切片(文本分割、幻灯片提取) -- **formatter**: 格式转换(PDF → text, slide → JSON) -- **llms**: LLM 算子(质量评估、条件检查) - -## 数据流 - -### 1. 数据摄入 -``` -用户上传 → 前端 → API Gateway → 数据管理服务 → PostgreSQL/MinIO -``` - -### 2. 数据处理 -``` -数据集 → 前端 → API Gateway → 主应用 → Python 运行时 -→ Ray 执行器 → 算子 → 处理后的数据 → PostgreSQL/MinIO -``` - -### 3. RAG 索引 -``` -处理后的数据 → Python 运行时 → RAG 索引器服务 → Milvus (向量) -``` - -### 4. 
RAG 检索 -``` -查询 → 前端 → API Gateway → RAG 索引器服务 → Milvus → 结果 -``` - -## 技术栈 - -| 层级 | 技术 | -|--------|------| -| **前端** | React 18, TypeScript, Vite, Ant Design, TailwindCSS | -| **后端** | Spring Boot 3.5, Java 21, MyBatis-Plus, PostgreSQL | -| **运行时** | FastAPI, Python 3.12, Ray, SQLAlchemy | -| **向量数据库** | Milvus | -| **缓存** | Redis | -| **对象存储** | MinIO | -| **部署** | Docker Compose, Kubernetes/Helm | - -## 通信模式 - -### 服务间通信 -- **REST API**: 前端和后端之间的 HTTP/JSON -- **gRPC**: (如有)后端服务之间 -- **消息队列**: (如有)用于异步任务 - -### 后端到运行时 -- **HTTP/REST**: Java 后端调用 Python 运行时 runtime APIs -- **Ray**: Python 运行时提交任务到 Ray 执行器 - -## 安全 - -### 认证 -- **JWT**: 基于 Token 的认证,通过 API Gateway -- **会话**: (如有)会话管理 - -### 授权 -- **基于角色的**: (如有)RBAC -- **基于资源的**: (如有)资源级访问控制 - -## 可扩展性 - -### 水平扩展 -- **后端服务**: Kubernetes pod 扩展(通过 Helm) -- **Ray 执行器**: 多节点 Ray 集群 -- **前端**: 静态资源服务 + CDN - -### 垂直扩展 -- **数据库**: PostgreSQL 连接池 -- **缓存**: Redis 集群 -- **向量数据库**: Milvus 集群 - -## 部署 - -### Docker Compose -```bash -make install INSTALLER=docker -``` - -### Kubernetes/Helm -```bash -make install INSTALLER=k8s -``` - -## 监控 - -### 指标 -- **Spring Boot Actuator**: `/actuator/metrics` -- **Prometheus**: (如已配置)指标收集 -- **Ray**: Ray dashboard 用于执行器监控 - -### 日志 -- **Java**: Log4j2 -- **Python**: Ray dashboard for executor monitoring - -## 架构决策 - -### 为什么选择多语言? -- **Java 后端**: 企业级、成熟生态系统、强类型 -- **Python 运行时**: 丰富的 ML/AI 生态系统、灵活、快速原型开发 -- **React 前端**: 现代 UI、组件化、大型生态系统 - -### 为什么选择微服务? -- **可扩展性**: 服务独立扩展 -- **可维护性**: 清晰的服务边界 -- **技术多样性**: 为每个任务使用最佳工具 - -### 为什么选择 Ray? 
-- **分布式计算**: 无缝多节点执行 -- **容错**: 自动任务重试和恢复 -- **资源管理**: 动态资源分配 - -## 未来增强 - -- [ ] 服务网格(Istio/Linkerd) -- [ ] 事件总线(Kafka/Pulsar) -- [ ] GraphQL API -- [ ] 实时更新(WebSocket) -- [ ] 高级监控(Grafana, Loki) - -## 引用 - -- [后端架构](./backend/README.md) -- [运行时架构](./runtime/README.md) -- [前端架构](./frontend/README.md) -- [AGENTS.md](./AGENTS.md) diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md deleted file mode 100644 index 394f6100e..000000000 --- a/ARCHITECTURE.md +++ /dev/null @@ -1,239 +0,0 @@ -# DataMate Architecture - -## Overview - -DataMate is a microservices-based data management platform for model fine-tuning and RAG retrieval. It follows a polyglot architecture with Java backend, Python runtime, and React frontend. - -## High-Level Architecture - -``` -┌─────────────────────────────────────────────────────────────────┐ -│ Frontend (React) │ -│ localhost:5173 │ -└────────────────────────┬────────────────────────────────────────┘ - │ HTTP/REST - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ API Gateway │ -│ (Spring Cloud) │ -│ localhost:8080 │ -│ ┌──────────────────────────────────────────────────────────┐ │ -│ │ Authentication (JWT) │ │ -│ │ Route Forwarding │ │ -│ │ Rate Limiting │ │ -│ └──────────────────────────────────────────────────────────┘ │ -└────────────────┬────────────────────────────────────────────────┘ - │ - ├─────────────────┬─────────────────┐ - ▼ ▼ ▼ -┌─────────────────────────┐ ┌─────────────────────────┐ ┌─────────────────────────┐ -│ Main Application │ │ Data Management │ │ RAG Indexer │ -│ (Spring Boot) │ │ Service │ │ Service │ -│ - Data Cleaning │ │ - Dataset Mgmt │ │ - Knowledge Base │ -│ - Operator Market │ │ - File Operations │ │ - Vector Search │ -│ - Data Collection │ │ - Tag Management │ │ - Milvus Integration │ -└─────────┬───────────┘ └─────────┬───────────┘ └─────────┬───────────┘ - │ │ │ - │ │ │ - ▼ ▼ ▼ -┌─────────────────────────────────────────────────────────────────────────────────┐ -│ PostgreSQL (Metadata) 
│ -│ Redis (Cache) │ -│ Milvus (Vectors) │ -│ MinIO (Files) │ -└─────────────────────────────────────────────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ Python Runtime (FastAPI) │ -│ localhost:18000 │ -│ ┌──────────────────────────────────────────────────────────┐ │ -│ │ Data Synthesis │ │ -│ │ Data Annotation (Label Studio) │ │ -│ │ Data Evaluation │ │ -│ │ RAG Indexing │ │ -│ └──────────────────────────────────────────────────────────┘ │ -└────────────────┬────────────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ Ray Executor (Distributed) │ -│ ┌──────────────────────────────────────────────────────────┐ │ -│ │ Operator Execution │ │ -│ │ Task Scheduling │ │ -│ │ Distributed Computing │ │ -│ └──────────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────┘ -``` - -## Components - -### Frontend Layer -- **Framework**: React 18 + TypeScript + Vite -- **UI Library**: Ant Design -- **Styling**: TailwindCSS v4 -- **State Management**: Redux Toolkit -- **Routing**: React Router v7 - -### Backend Layer (Java) -- **API Gateway**: Spring Cloud Gateway - - Route forwarding - - JWT authentication - - Rate limiting - -- **Main Application**: Spring Boot 3.5 - - Data cleaning pipeline - - Operator marketplace - - Data collection tasks - -- **Data Management Service**: Spring Boot 3.5 - - Dataset CRUD - - File operations - - Tag management - -- **RAG Indexer Service**: Spring Boot 3.5 - - Knowledge base management - - Vector search - - Milvus integration - -### Runtime Layer (Python) -- **FastAPI Backend**: Port 18000 - - Data synthesis (QA generation) - - Data annotation (Label Studio integration) - - Model evaluation - - RAG indexing - -- **Ray Executor**: Distributed execution - - Operator execution - - Task scheduling - - Multi-node parallelism - -### Operator 
Ecosystem -- **filter**: Data filtering (duplicates, sensitive content, quality) -- **mapper**: Data transformation (cleaning, normalization) -- **slicer**: Data segmentation (text splitting, slide extraction) -- **formatter**: Format conversion (PDF → text, slide → JSON) -- **llms**: LLM-based operators (quality evaluation, condition checking) - -## Data Flow - -### 1. Data Ingestion -``` -User Upload → Frontend → API Gateway → Data Management Service → PostgreSQL/MinIO -``` - -### 2. Data Processing -``` -Dataset → Frontend → API Gateway → Main Application → Python Runtime -→ Ray Executor → Operators → Processed Data → PostgreSQL/MinIO -``` - -### 3. RAG Indexing -``` -Processed Data → Python Runtime → RAG Indexer Service → Milvus (Vectors) -``` - -### 4. RAG Retrieval -``` -Query → Frontend → API Gateway → RAG Indexer Service → Milvus → Results -``` - -## Technology Stack - -| Layer | Technology | -|--------|-----------| -| **Frontend** | React 18, TypeScript, Vite, Ant Design, TailwindCSS | -| **Backend** | Spring Boot 3.5, Java 21, MyBatis-Plus, PostgreSQL | -| **Runtime** | FastAPI, Python 3.12, Ray, SQLAlchemy | -| **Vector DB** | Milvus | -| **Cache** | Redis | -| **Object Storage** | MinIO | -| **Deployment** | Docker Compose, Kubernetes/Helm | - -## Communication Patterns - -### Service-to-Service -- **REST API**: HTTP/JSON between frontend and backend -- **gRPC**: (if any) between backend services -- **Message Queue**: (if any) for async tasks - -### Backend-to-Runtime -- **HTTP/REST**: Java backend calls Python runtime runtime APIs -- **Ray**: Python runtime submits tasks to Ray executor - -## Security - -### Authentication -- **JWT**: Token-based authentication via API Gateway -- **Session**: (if any) session management - -### Authorization -- **Role-based**: (if any) RBAC -- **Resource-based**: (if any) resource-level access control - -## Scalability - -### Horizontal Scaling -- **Backend Services**: Kubernetes pod scaling via Helm -- **Ray 
Executor**: Multi-node Ray cluster -- **Frontend**: Static asset serving + CDN - -### Vertical Scaling -- **Database**: PostgreSQL connection pooling -- **Cache**: Redis clustering -- **Vector DB**: Milvus cluster - -## Deployment - -### Docker Compose -```bash -make install INSTALLER=docker -``` - -### Kubernetes/Helm -```bash -make install INSTALLER=k8s -``` - -## Monitoring - -### Metrics -- **Spring Boot Actuator**: `/actuator/metrics` -- **Prometheus**: (if configured) metrics collection -- **Ray**: Ray dashboard for executor monitoring - -### Logging -- **Java**: Log4j2 -- **Python**: Ray dashboard for executor monitoring - -## Architecture Decisions - -### Why Polyglot? -- **Java Backend**: Enterprise-grade, mature ecosystem, strong typing -- **Python Runtime**: Rich ML/AI ecosystem, flexible, fast prototyping -- **React Frontend**: Modern UI, component-based, large ecosystem - -### Why Microservices? -- **Scalability**: Independent scaling of services -- **Maintainability**: Clear service boundaries -- **Technology Diversity**: Use best tool for each job - -### Why Ray? 
-- **Distributed Computing**: Seamless multi-node execution -- **Fault Tolerance**: Automatic task retry and recovery -- **Resource Management**: Dynamic resource allocation - -## Future Enhancements - -- [ ] Service Mesh (Istio/Linkerd) -- [ ] Event Bus (Kafka/Pulsar) -- [ ] GraphQL API -- [ ] Real-time-Updates (WebSocket) -- [ ] Advanced Monitoring (Grafana, Loki) - -## References - -- [Backend Architecture](./backend/README.md) -- [Runtime Architecture](./runtime/README.md) -- [Frontend Architecture](./frontend/README.md) -- [AGENTS.md](./AGENTS.md) diff --git a/README-zh.md b/README-zh.md index 904cc3140..1d4987ed0 100644 --- a/README-zh.md +++ b/README-zh.md @@ -113,7 +113,6 @@ make uninstall ## 📚 文档 ### 核心文档 -- **[ARCHITECTURE.md](./ARCHITECTURE.md)** - 系统架构、微服务通信、数据流 - **[DEVELOPMENT.md](./DEVELOPMENT.md)** - 本地开发环境搭建和工作流程 - **[AGENTS.md](./AGENTS.md)** - AI 助手指南和代码规范 diff --git a/README.md b/README.md index 3116fa8f7..97ee80593 100644 --- a/README.md +++ b/README.md @@ -116,7 +116,6 @@ When running make uninstall, the installer will prompt once whether to delete vo ## 📚 Documentation ### Core Documentation -- **[ARCHITECTURE.md](./ARCHITECTURE.md)** - System architecture, microservices communication, data flow - **[DEVELOPMENT.md](./DEVELOPMENT.md)** - Local development environment setup and workflow - **[AGENTS.md](./AGENTS.md)** - AI assistant guidelines and code style diff --git a/backend/README.md b/backend/README.md index 6b4c42fd4..fb5bb4727 100644 --- a/backend/README.md +++ b/backend/README.md @@ -2,28 +2,28 @@ ## Overview -DataMate Backend 是基于 Spring Boot 3.5 + Java 21 的微服务架构,提供数据管理、RAG 索引、API 网关等核心功能。 +DataMate Backend is a microservices architecture based on Spring Boot 3.5 + Java 21, providing core functions such as data management, RAG indexing, and API gateway. 
## Architecture ``` backend/ -├── api-gateway/ # API Gateway + 认证 +├── api-gateway/ # API Gateway + Authentication ├── services/ -│ ├── data-management-service/ # 数据集管理 -│ ├── rag-indexer-service/ # RAG 索引 -│ └── main-application/ # 主应用入口 +│ ├── data-management-service/ # Dataset management +│ ├── rag-indexer-service/ # RAG indexing +│ └── main-application/ # Main application entry └── shared/ - ├── domain-common/ # DDD 构建块、异常处理 - └── security-common/ # JWT 工具 + ├── domain-common/ # DDD building blocks, exception handling + └── security-common/ # JWT utilities ``` ## Services | Service | Port | Description | |---------|-------|-------------| -| **main-application** | 8080 | 主应用,包含数据管理、数据清洗、算子市场等模块 | -| **api-gateway** | 8080 | API Gateway,路由转发和认证 | +| **main-application** | 8080 | Main application, includes data management, data cleaning, operator marketplace modules | +| **api-gateway** | 8080 | API Gateway, route forwarding and authentication | ## Technology Stack @@ -40,11 +40,11 @@ backend/ ### External Services - **PostgreSQL**: `datamate-database:5432` - **Redis**: `datamate-redis:6379` -- **Milvus**: 向量数据库(RAG 索引) +- **Milvus**: Vector database (RAG indexing) ### Shared Libraries -- **domain-common**: 业务异常、系统参数、领域实体基类 -- **security-common**: JWT 工具、认证辅助 +- **domain-common**: Business exceptions, system parameters, domain entity base classes +- **security-common**: JWT utilities, auth helpers ## Quick Start @@ -127,7 +127,7 @@ mvn -pl services/data-management-service -am test ## Documentation - **API Docs**: http://localhost:8080/api/swagger-ui.html -- **AGENTS.md**: See `backend/shared/AGENTS.md` for shared libraries +- **AGENTS.md**: See `backend/shared/AGENTS.md` for shared libraries documentation - **Service Docs**: See individual service READMEs ## Related Links diff --git a/backend/api-gateway/README.md b/backend/api-gateway/README.md index afba70c3d..23ef8fbf5 100644 --- a/backend/api-gateway/README.md +++ b/backend/api-gateway/README.md @@ -2,7 +2,7 
@@ ## Overview -API Gateway 是 DataMate 的统一入口,基于 Spring Cloud Gateway 实现,负责路由转发、JWT 认证和限流。 +API Gateway is DataMate's unified entry point, built on Spring Cloud Gateway, responsible for route forwarding, JWT authentication, and rate limiting. ## Architecture @@ -12,7 +12,7 @@ backend/api-gateway/ │ ├── config/ # Gateway configuration │ ├── filter/ # JWT authentication filter │ └── route/ # Route definitions -└免 src/main/resources/ +└── src/main/resources/ └── application.yml # Gateway configuration ``` @@ -43,25 +43,25 @@ datamate: ## Features ### 1. Route Forwarding -- 将前端请求转发到对应的后端服务 -- 支持负载均衡 -- 路径重写 +- Forward frontend requests to corresponding backend services +- Support for load balancing +- Path rewriting ### 2. JWT Authentication -- 基于 JWT Token 的认证 -- Token 验证和过期检查 -- 用户上下文传递 +- JWT Token-based authentication +- Token validation and expiration checking +- User context propagation ### 3. Rate Limiting -- (如果配置)请求频率限制 -- 防止 API 滥用 +- Request rate limiting (if configured) +- Prevent API abuse ## Quick Start ### Prerequisites - JDK 21+ - Maven 3.8+ -- Nacos 服务(如果使用服务发现) +- Nacos service (if using service discovery) ### Build ```bash @@ -77,8 +77,8 @@ mvn spring-boot:run ## Development -### 添加新路由 -在 `application.yml` 或通过 Nacos 配置路由规则: +### Adding New Routes +Configure route rules in `application.yml` or via Nacos: ```yaml spring: @@ -93,8 +93,8 @@ spring: - StripPrefix=3 ``` -### 添加自定义过滤器 -创建 `GlobalFilter` 或 `GatewayFilter`: +### Adding Custom Filters +Create a `GlobalFilter` or `GatewayFilter`: ```java @Component @@ -109,12 +109,12 @@ public class AuthFilter implements GlobalFilter { ## Testing -### 测试路由转发 +### Test Route Forwarding ```bash curl http://localhost:8080/api/data-management/datasets ``` -### 测试 JWT 认证 +### Test JWT Authentication ```bash curl -H "Authorization: Bearer " http://localhost:8080/api/protected-endpoint ``` diff --git a/backend/services/main-application/README-zh.md b/backend/services/main-application/README-zh.md index 
4bc298878..1568c5a20 100644 --- a/backend/services/main-application/README-zh.md +++ b/backend/services/main-application/README-zh.md @@ -35,19 +35,7 @@ backend/services/main-application/ - 标签管理 - 数据集版本控制 -### 2. 数据清洗 -- 数据清洗管道 -- 数据质量检查 -- 数据去重 -- 数据格式转换 - -### 3. 算子市场 -- 算子上传/下载 -- 算子版本管理 -- 算子分类和搜索 -- 算子执行配置 - -### 4. 数据收集 +### 2. 数据收集 - 数据源配置 - 定时数据收集任务 - 数据同步 @@ -69,12 +57,6 @@ server: datamate: data-management: base-path: /dataset - operator-market: - repository-path: ./runtime/operators - max-upload-size: 50MB - ray: - enabled: false - address: ray://localhost:10001 ``` ## 快速开始 @@ -106,14 +88,6 @@ mvn spring-boot:run 4. 在 `application/` 创建 application service 5. 在 `interfaces/rest/` 创建 controller -### 集成 Ray 执行器 -```yaml -datamate: - ray: - enabled: true - address: ray://localhost:10001 -``` - ## 测试 ### 运行测试 diff --git a/backend/services/main-application/README.md b/backend/services/main-application/README.md index be122675d..51b4c65c5 100644 --- a/backend/services/main-application/README.md +++ b/backend/services/main-application/README.md @@ -2,7 +2,7 @@ ## Overview -Main Application 是 DataMate 的核心 Spring Boot 服务,包含数据管理、数据清洗、算子市场、数据收集等主要功能模块。 +The Main Application is DataMate's core Spring Boot service, containing major functional modules including data management, data cleaning, operator marketplace, and data collection. ## Architecture @@ -30,28 +30,16 @@ backend/services/main-application/ ## Modules ### 1. Data Management -- 数据集 CRUD 操作 -- 文件上传/下载 -- 标签管理 -- 数据集版本控制 - -### 2. Data Cleaning -- 数据清洗管道 -- 数据质量检查 -- 数据去重 -- 数据格式转换 - -### 3. Operator Market -- 算子上传/下载 -- 算子版本管理 -- 算子分类和搜索 -- 算子执行配置 - -### 4. Data Collection -- 数据源配置 -- 定时数据收集任务 -- 数据同步 -- 数据导入/导出 +- Dataset CRUD operations +- File upload/download +- Tag management +- Dataset versioning + +### 2. 
Data Collection +- Data source configuration +- Scheduled data collection tasks +- Data synchronization +- Data import/export ## Configuration @@ -69,12 +57,6 @@ server: datamate: data-management: base-path: /dataset - operator-market: - repository-path: ./runtime/operators - max-upload-size: 50MB - ray: - enabled: false - address: ray://localhost:10001 ``` ## Quick Start @@ -99,30 +81,22 @@ mvn spring-boot:run ## Development -### 添加新模块 -1. 在 `domain/model/` 创建实体类 -2. 在 `domain/repository/` 创建 repository 接口 -3. 在 `infrastructure/persistence/` 实现 repository -4. 在 `application/` 创建 application service -5. 在 `interfaces/rest/` 创建 controller - -### 集成 Ray Executor -```yaml -datamate: - ray: - enabled: true - address: ray://localhost:10001 -``` +### Adding a New Module +1. Create entity class in `domain/model/` +2. Create repository interface in `domain/repository/` +3. Implement repository in `infrastructure/persistence/` +4. Create application service in `application/` +5. Create controller in `interfaces/rest/` ## Testing -### 运行测试 +### Run Tests ```bash cd backend/services/main-application mvn test ``` -### 运行特定测试 +### Run Specific Test ```bash mvn test -Dtest=DatasetControllerTest ``` diff --git a/backend/shared/README.md b/backend/shared/README.md index 69b17856d..eb8c13630 100644 --- a/backend/shared/README.md +++ b/backend/shared/README.md @@ -2,18 +2,18 @@ ## Overview -Shared Libraries 包含所有后端服务共用的代码和工具,包括领域构建块、异常处理、JWT 工具等。 +Shared Libraries contain code and utilities shared across all backend services, including domain building blocks, exception handling, JWT utilities, and more. 
## Architecture ``` backend/shared/ -├── domain-common/ # DDD 构建块、异常处理 +├── domain-common/ # DDD building blocks, exception handling │ └── src/main/java/com/datamate/common/ │ ├── infrastructure/exception/ # BusinessException, ErrorCode │ ├── setting/ # System params, model configs │ └── domain/ # Base entities, repositories -└── security-common/ # JWT 工具、认证辅助 +└── security-common/ # JWT utilities, auth helpers └── src/main/java/com/datamate/security/ ``` @@ -22,21 +22,21 @@ backend/shared/ ### 1. domain-common #### BusinessException -统一的业务异常处理机制: +Unified business exception handling mechanism: ```java -// 抛出业务异常 +// Throw business exception throw BusinessException.of(ErrorCode.DATASET_NOT_FOUND) .withDetail("dataset_id", datasetId); -// 带上下文的异常 +// Exception with context throw BusinessException.of(ErrorCode.VALIDATION_FAILED) .withDetail("field", "email") .withDetail("reason", "Invalid format"); ``` #### ErrorCode -错误码枚举接口: +Error code enumeration interface: ```java public interface ErrorCode { @@ -45,7 +45,7 @@ public interface ErrorCode { HttpStatus getHttpStatus(); } -// 示例 +// Example public enum CommonErrorCode implements ErrorCode { SUCCESS("0000", "Success", HttpStatus.OK), DATABASE_NOT_FOUND("4001", "Database not found", HttpStatus.NOT_FOUND); @@ -53,7 +53,7 @@ public enum CommonErrorCode implements ErrorCode { ``` #### BaseEntity -所有实体的基类,包含审计字段: +Base class for all entities, including audit fields: ```java @Data @@ -79,22 +79,22 @@ public class BaseEntity implements Serializable { ### 2. 
security-common #### JWT Utilities -JWT Token 生成和验证: +JWT Token generation and validation: ```java -// 生成 Token +// Generate Token String token = JwtUtil.generateToken(userId, secret, expiration); -// 验证 Token +// Validate Token Claims claims = JwtUtil.validateToken(token, secret); String userId = claims.getSubject(); ``` ## Usage -### 在服务中使用共享库 +### Using Shared Libraries in Services -#### Maven 依赖 +#### Maven Dependencies ```xml com.datamate @@ -108,7 +108,7 @@ String userId = claims.getSubject(); ``` -#### 使用 BusinessException +#### Using BusinessException ```java @RestController @RequiredArgsConstructor @@ -126,14 +126,14 @@ public class DatasetController { ## Quick Start -### 构建共享库 +### Build Shared Libraries ```bash cd backend mvn clean install ``` -### 在服务中使用 -共享库会自动被所有后端服务继承。 +### Use in Services +Shared libraries are automatically inherited by all backend services. ## Documentation diff --git a/runtime/README-zh.md b/runtime/README-zh.md index 34528cd3e..5aa180ddd 100644 --- a/runtime/README-zh.md +++ b/runtime/README-zh.md @@ -137,7 +137,6 @@ poetry run pytest ## 文档 - **API 文档**: http://localhost:18000/redoc -- **AGENTS.md**: 见 `runtime/datamate-python/app/AGENTS.md` 获取详细模块文档 - **算子指南**: 见 `runtime/ops/README.md` 获取算子开发 ## 相关链接 diff --git a/runtime/README.md b/runtime/README.md index da1a53d77..8d3a5621c 100644 --- a/runtime/README.md +++ b/runtime/README.md @@ -2,17 +2,17 @@ ## Overview -DataMate Runtime 提供数据处理、算子执行、数据收集等核心功能,基于 Python 3.12+ 和 FastAPI 框架。 +DataMate Runtime provides core functionality for data processing, operator execution, and data collection, built on Python 3.12+ and the FastAPI framework. 
## Architecture ``` runtime/ -├── datamate-python/ # FastAPI 后端服务(port 18000) -├── python-executor/ # Ray 分布式执行器 -├── ops/ # 算子生态 -├── datax/ # DataX 数据读写框架 -└── deer-flow/ # DeerFlow 服务 +├── datamate-python/ # FastAPI backend service (port 18000) +├── python-executor/ # Ray distributed executor +├── ops/ # Operator ecosystem +├── datax/ # DataX data read/write framework +└── deer-flow/ # DeerFlow service ``` ## Components @@ -20,14 +20,14 @@ runtime/ ### 1. datamate-python (FastAPI Backend) **Port**: 18000 -核心 Python 服务,提供以下功能: -- **数据合成**: QA 生成、文档处理 -- **数据标注**: Label Studio 集成、自动标注 -- **数据评估**: 模型评估、质量检查 -- **数据清洗**: 数据清洗管道 -- **算子市场**: 算子管理、上传 -- **RAG 索引**: 向量索引、知识库管理 -- **数据收集**: 定时任务、数据源集成 +Core Python service providing: +- **Data Synthesis**: QA generation, document processing +- **Data Annotation**: Label Studio integration, auto-annotation +- **Data Evaluation**: Model evaluation, quality checks +- **Data Cleaning**: Data cleaning pipelines +- **Operator Marketplace**: Operator management, upload +- **RAG Indexing**: Vector indexing, knowledge base management +- **Data Collection**: Scheduled tasks, data source integration **Technology Stack**: - FastAPI 0.124+ @@ -35,39 +35,39 @@ runtime/ - Pydantic 2.12+ - PostgreSQL (via asyncpg) - Milvus (via pymilvus) -- APScheduler (定时任务) +- APScheduler (scheduled tasks) ### 2. python-executor (Ray Executor) -Ray 分布式执行框架,负责: -- **算子执行**: 执行数据处理算子 -- **任务调度**: 异步任务管理 -- **分布式计算**: 多节点并行处理 +Ray distributed execution framework responsible for: +- **Operator Execution**: Execute data processing operators +- **Task Scheduling**: Async task management +- **Distributed Computing**: Multi-node parallel processing **Technology Stack**: - Ray 2.7.0 -- FastAPI (执行器 API) -- Data-Juicer (数据处理) +- FastAPI (executor API) +- Data-Juicer (data processing) ### 3. 
ops (Operator Ecosystem) -算子生态,包含: -- **filter**: 数据过滤(去重、敏感内容、质量过滤) -- **mapper**: 数据转换(清洗、归一化) -- **slicer**: 数据切片(文本分割、幻灯片提取) -- **formatter**: 格式转换(PDF → text, slide → JSON) -- **llms**: LLM 算子(质量评估、条件检查) -- **annotation**: 标注算子(目标检测、分割) +Operator ecosystem including: +- **filter**: Data filtering (deduplication, sensitive content, quality filtering) +- **mapper**: Data transformation (cleaning, normalization) +- **slicer**: Data slicing (text splitting, slide extraction) +- **formatter**: Format conversion (PDF → text, slide → JSON) +- **llms**: LLM operators (quality evaluation, condition checking) +- **annotation**: Annotation operators (object detection, segmentation) -**See**: `runtime/ops/README.md` for operator development guide. +**See**: `runtime/ops/README.md` for operator development guide ### 4. datax (DataX Framework) -DataX 数据读写框架,支持多种数据源: -- **Readers**: MySQL, PostgreSQL, Oracle, MongoDB, Elasticsearch, HDFS, S3, NFS, GlusterFS, API, 等 -- **Writers**: 同上,支持写入目标 +DataX data read/write framework supporting multiple data sources: +- **Readers**: MySQL, PostgreSQL, Oracle, MongoDB, Elasticsearch, HDFS, S3, NFS, GlusterFS, API, etc. +- **Writers**: Same as above, supports writing to targets -**Technology Stack**: Java (Maven 构建) +**Technology Stack**: Java (Maven build) ### 5. deer-flow (DeerFlow Service) -DeerFlowService(配置见 `conf.yaml`)。 +DeerFlow service (see `conf.yaml` for configuration). 
## Quick Start @@ -114,9 +114,9 @@ app/ ### Code Conventions - **Routes**: `APIRouter` in `interface/*.py` -- **DI**: `Depends(get_db)` for session -- **Error**: `raise BusinessError(ErrorCodes.XXX, context)` -- **Transaction**: `async with transaction(db):` +- **Dependency Injection**: `Depends(get_db)` for session +- **Error Handling**: `raise BusinessError(ErrorCodes.XXX, context)` +- **Transactions**: `async with transaction(db):` - **Models**: Extend `BaseEntity` (audit fields auto-filled) ## Testing @@ -137,7 +137,6 @@ poetry run pytest ## Documentation - **API Docs**: http://localhost:18000/redoc -- **AGENTS.md**: See `runtime/datamate-python/app/AGENTS.md` for detailed module docs - **Operator Guide**: See `runtime/ops/README.md` for operator development ## Related Links diff --git a/runtime/datax/README.md b/runtime/datax/README.md index 85ac02135..af2366255 100644 --- a/runtime/datax/README.md +++ b/runtime/datax/README.md @@ -2,15 +2,15 @@ ## Overview -DataX 是一个数据传输框架,支持多种数据源和数据目标之间的数据传输,用于数据收集和同步。 +DataX is a data transfer framework that supports data transmission between various data sources and targets, used for data collection and synchronization. 
## Architecture ``` runtime/datax/ -├── core/ # DataX 核心组件 -├── transformer/ # 数据转换器 -├── readers/ # 数据读取器 +├── core/ # DataX core components +├── transformer/ # Data transformers +├── readers/ # Data readers │ ├── mysqlreader/ │ ├── postgresqlreader/ │ ├── oracleReader/ @@ -20,7 +20,7 @@ runtime/datax/ │ ├── nfsreader/ │ ├── glusterfsreader/ │ └── apireader/ -└── writers/ # 数据写入器 +└── writers/ # Data writers ├── mysqlwriter/ ├── postgresqlwriter/ ├── oraclewriter/ @@ -34,7 +34,7 @@ runtime/datax/ ## Supported Data Sources -### 关系型数据库 +### Relational Databases - MySQL - PostgreSQL - Oracle @@ -43,22 +43,22 @@ runtime/datax/ - KingbaseES - GaussDB -### NoSQL 数据库 +### NoSQL Databases - MongoDB - Elasticsearch - Cassandra - HBase - Redis -### 文件系统 +### File Systems - HDFS -- S3 (AWS S3, MinIO, 阿里云 OSS) +- S3 (AWS S3, MinIO, Alibaba Cloud OSS) - NFS - GlusterFS -- 本地文件系统 +- Local file system -### 其他 -- API 接口 +### Others +- API interfaces - Kafka - Pulsar - DataHub @@ -66,7 +66,7 @@ runtime/datax/ ## Usage -### 基本配置 +### Basic Configuration ```json { "job": { @@ -100,13 +100,13 @@ runtime/datax/ } ``` -### 运行 DataX +### Run DataX ```bash -# 构建 DataX +# Build DataX cd runtime/datax mvn clean package -# 运行 +# Run python datax.py -j job.json ``` @@ -117,34 +117,34 @@ python datax.py -j job.json - Maven 3.8+ - Python 3.6+ -### 构建 +### Build ```bash cd runtime/datax mvn clean package ``` -### 运行示例 +### Run Example ```bash python datax.py -j examples/mysql2text.json ``` ## Development -### 添加新的 Reader -1. 在 `readers/` 创建新模块 -2. 实现 Reader 接口 -3. 配置 reader 参数 -4. 添加到 package.xml +### Adding a New Reader +1. Create new module in `readers/` +2. Implement Reader interface +3. Configure reader parameters +4. Add to package.xml -### 添加新的 Writer -1. 在 `writers/` 创建新模块 -2. 实现 Writer 接口 -3. 配置 writer 参数 -4. 添加到 package.xml +### Adding a New Writer +1. Create new module in `writers/` +2. Implement Writer interface +3. Configure writer parameters +4. 
Add to package.xml ## Documentation -- [DataX 官方文档](https://github.com/alibaba/DataX) +- [DataX Official Documentation](https://github.com/alibaba/DataX) ## Related Links diff --git a/runtime/deer-flow/README-zh.md b/runtime/deer-flow/README-zh.md index 5cfe2d54f..209d436de 100644 --- a/runtime/deer-flow/README-zh.md +++ b/runtime/deer-flow/README-zh.md @@ -76,57 +76,6 @@ BASIC_MODEL: api_key: your_gemini_api_key ``` -#### 华为云 -```yaml -BASIC_MODEL: - base_url: https://ark.cn-beijing.volces.com/api/v3 - model: "doubao-1.5-pro-32k-250115" - api_key: your_api_key -``` - -## 快速开始 - -### 前置条件 -- Python 3.8+ -- LLM API Key 或本地 LLM - -### 配置 -1. 复制 `conf.yaml.example` 为 `conf.yaml` -2. 配置 LLM 提供商和 API Key -3. (可选)配置推理模型和搜索引擎 - -### 运行 -```bash -cd runtime/deer-flow -python -m deerflow -``` - -## 使用 - -### 基本规划 -```python -from deerflow import DeerFlow - -flow = DeerFlow() -result = flow.plan( - task="设计一个数据处理流程", - context="需要处理CSV文件,进行数据清洗和转换" -) -print(result) -``` - -### 推理任务 -```python -from deerflow import DeerFlow - -flow = DeerFlow() -result = flow.reason( - task="分析数据质量", - context="数据包含缺失了值和异常值" -) -print(result) -``` - ## 开发 ### 添加新的 LLM 提供商 diff --git a/runtime/deer-flow/README.md b/runtime/deer-flow/README.md index 5870ec44d..ee7642ab2 100644 --- a/runtime/deer-flow/README.md +++ b/runtime/deer-flow/README.md @@ -2,38 +2,38 @@ ## Overview -DeerFlow 是一个 LLM 驱动的服务,用于规划和推理任务,支持多种 LLM 提供商。 +DeerFlow is an LLM-driven service for planning and reasoning tasks, supporting multiple LLM providers. 
## Architecture ``` runtime/deer-flow/ -├── conf.yaml # DeerFlow 配置文件 -├── .env # 环境变量 -└── (其他源代码) +├── conf.yaml # DeerFlow configuration file +├── .env # Environment variables +└── (other source code) ``` ## Configuration -### 基本配置 (conf.yaml) +### Basic Configuration (conf.yaml) ```yaml -# 基础模型配置 +# Basic model configuration BASIC_MODEL: base_url: https://api.example.com/v1 model: "model-name" api_key: your_api_key max_retries: 3 - verify_ssl: false # 如果使用自签名证书,设为 false + verify_ssl: false # Set to false if using self-signed certificates -# 推理模型配置(可选) +# Reasoning model configuration (optional) REASONING_MODEL: base_url: https://api.example.com/v1 model: "reasoning-model-name" api_key: your_api_key max_retries: 3 -# 搜索引擎配置(可选) +# Search engine configuration (optional) SEARCH_ENGINE: engine: tavily include_domains: @@ -49,7 +49,7 @@ SEARCH_ENGINE: max_content_length_per_page: 4000 ``` -### 支持的 LLM 提供商 +### Supported LLM Providers #### OpenAI ```yaml @@ -59,7 +59,7 @@ BASIC_MODEL: api_key: sk-... ``` -#### Ollama (本地部署) +#### Ollama (Local Deployment) ```yaml BASIC_MODEL: base_url: "http://localhost:11434/v1" @@ -76,72 +76,21 @@ BASIC_MODEL: api_key: your_gemini_api_key ``` -#### 华为云 -```yaml -BASIC_MODEL: - base_url: https://ark.cn-beijing.volces.com/api/v3 - model: "doubao-1-5-pro-32k-250115" - api_key: your_api_key -``` - -## Quick Start - -### Prerequisites -- Python 3.8+ -- LLM API Key 或本地 LLM - -### 配置 -1. 复制 `conf.yaml.example` 为 `conf.yaml` -2. 配置 LLM 提供商和 API Key -3. (可选)配置推理模型和搜索引擎 - -### 运行 -```bash -cd runtime/deer-flow -python -m deerflow -``` - -## Usage - -### 基本规划 -```python -from deerflow import DeerFlow - -flow = DeerFlow() -result = flow.plan( - task="设计一个数据处理流程", - context="需要处理CSV文件,进行数据清洗和转换" -) -print(result) -``` - -### 推理任务 -```python -from deerflow import DeerFlow - -flow = DeerFlow() -result = flow.reason( - task="分析数据质量", - context="数据包含缺失值和异常值" -) -print(result) -``` - ## Development -### 添加新的 LLM 提供商 -1. 在 `conf.yaml` 添加新的模型配置 -2.
实现对应的 API 调用逻辑 -3. 测试连接和推理 +### Adding a New LLM Provider +1. Add new model configuration in `conf.yaml` +2. Implement corresponding API call logic +3. Test connection and inference -### 自定义提示词模板 -1. 创建提示词模板文件 -2. 在 `conf.yaml` 引用模板 -3. 测试提示词效果 +### Customizing Prompt Templates +1. Create a prompt template file +2. Reference the template in `conf.yaml` +3. Test prompt effectiveness ## Documentation -- [DeerFlow 官方文档](https://github.com/ModelEngine-Group/DeerFlow) +- [DeerFlow Official Documentation](https://github.com/ModelEngine-Group/DeerFlow) ## Related Links diff --git a/runtime/python-executor/README.md b/runtime/python-executor/README.md index b9580d915..9cee6c708 100644 --- a/runtime/python-executor/README.md +++ b/runtime/python-executor/README.md @@ -2,7 +2,7 @@ ## Overview -Ray Executor 是基于 Ray 的分布式执行框架,负责执行数据处理算子、任务调度和分布式计算。 +Ray Executor is a Ray-based distributed execution framework responsible for executing data processing operators, task scheduling, and distributed computing. 
## Architecture @@ -11,17 +11,17 @@ runtime/python-executor/ └── datamate/ ├── core/ │ ├── base_op.py # BaseOp, Mapper, Filter, Slicer, LLM - │ ├── dataset.py # Dataset 处理 - │ └── constant.py # 常量定义 + │ ├── dataset.py # Dataset processing + │ └── constant.py # Constant definitions ├── scheduler/ │ ├── scheduler.py # TaskScheduler, Task, TaskStatus - │ ├── func_task_scheduler.py # 函数任务调度 - │ └── cmd_task_scheduler.py # 命令任务调度 + │ ├── func_task_scheduler.py # Function task scheduling + │ └── cmd_task_scheduler.py # Command task scheduling ├── wrappers/ - │ ├── executor.py # Ray 执行器入口 - │ ├── datamate_wrapper.py # DataMate 任务包装 - │ └── data_juicer_wrapper.py # DataJuicer 集成 - └── common/utils/ # 工具函数 + │ ├── executor.py # Ray executor entry point + │ ├── datamate_wrapper.py # DataMate task wrapper + │ └── data_juicer_wrapper.py # DataJuicer integration + └── common/utils/ # Utility functions ├── bytes_transform.py ├── file_scanner.py ├── lazy_loader.py @@ -33,51 +33,51 @@ runtime/python-executor/ ### 1. Base Classes #### BaseOp -所有算子的基类: +Base class for all operators: ```python -class Base:Op: +class BaseOp: def __init__(self, *args, **kwargs): self.accelerator = kwargs.get('accelerator', "cpu") self.text_key = kwargs.get('text_key', "text") - # ... 其他配置 + # ... 
other configuration def execute(self, sample): raise NotImplementedError ``` #### Mapper -数据转换算子基类(1:1): +Base class for data transformation operators (1:1): ```python class Mapper(BaseOp): def execute(self, sample: Dict) -> Dict: - # 转换逻辑 + # Transformation logic return processed_sample ``` #### Filter -数据过滤算子基类(返回 bool): +Base class for data filtering operators (returns bool): ```python class Filter(BaseOp): def execute(self, sample: Dict) -> bool: - # 过滤逻辑 - return True # 保留或过滤 + # Filtering logic + return True # Keep or filter out ``` #### Slicer -数据切片算子基类(1:N): +Base class for data slicing operators (1:N): ```python class Slicer(BaseOp): def execute(self, sample: Dict) -> List[Dict]: - # 切片逻辑 + # Slicing logic return [sample1, sample2, ...] ``` #### LLM -LLM 算子基类: +Base class for LLM operators: ```python class LLM(Mapper): @@ -91,7 +91,7 @@ class LLM(Mapper): ### 2. Task Scheduler -异步任务调度器: +Async task scheduler: ```python class TaskScheduler: @@ -100,21 +100,21 @@ class TaskScheduler: self.semaphore = asyncio.Semaphore(max_concurrent) async def submit(self, task_id, task, *args, **kwargs): - # 提交任务 + # Submit task pass def get_task_status(self, task_id: str) -> Optional[TaskResult]: - # 获取任务状态 + # Get task status pass def cancel_task(self, task_id: str) -> bool: - # 取消任务 + # Cancel task pass ``` ### 3. 
Operator Execution -#### 算子注册 +#### Operator Registration ```python from datamate.core.base_op import OPERATORS @@ -124,7 +124,7 @@ OPERATORS.register_module( ) ``` -#### 执行算子 +#### Execute Operator ```python from datamate.core.base_op import Mapper @@ -143,39 +143,39 @@ class MyMapper(Mapper): - Ray 2.7.0+ - Poetry -### 安装 +### Installation ```bash cd runtime/python-executor poetry install ``` -### 启动 Ray Head +### Start Ray Head ```bash ray start --head ``` -### 启动 Ray Worker +### Start Ray Worker ```bash -ray start --head-address=:6379 +ray start --address=<head-node-ip>:6379 ``` ## Usage -### 提交任务到 Ray +### Submit Task to Ray ```python from ray import remote @remote def execute_operator(sample, operator_config): - # 执行算子逻辑 + # Execute operator logic return result -# 提交任务 +# Submit task result_ref = execute_operator.remote(sample, config) result = ray.get(result_ref) ``` -### 使用 Task Scheduler +### Use Task Scheduler ```python from datamate.scheduler.scheduler import TaskScheduler @@ -187,32 +187,32 @@ status = scheduler.get_task_status(task_id) ## Development -### 添加新算子 -1. 在 `runtime/ops/` 创建算子目录 -2. 实现 `process.py` 和 `__init__.py` -3. 在 `__init__.py` 注册算子 -4. 测试算子 +### Adding a New Operator +1. Create operator directory in `runtime/ops/` +2. Implement `process.py` and `__init__.py` +3. Register operator in `__init__.py` +4. Test the operator -### 调试算子 +### Debugging Operators ```bash -# 本地测试 +# Local test python -c "from ops.user.operator_package.process import YourOperatorName; op = YourOperatorName(); print(op.execute({'text': 'test'}))" ``` ## Performance -### 并行执行 -Ray 自动处理并行执行和资源分配。 +### Parallel Execution +Ray automatically handles parallel execution and resource allocation. -### 容错 -Ray 提供自动任务重试和故障转移。 +### Fault Tolerance +Ray provides automatic task retry and failover. -### 资源管理 -Ray 动态分配 CPU、GPU、内存资源。 +### Resource Management +Ray dynamically allocates CPU, GPU, and memory resources. 
## Documentation -- [Ray 文档](https://docs.ray.io/) +- [Ray Documentation](https://docs.ray.io/) - [AGENTS.md](./AGENTS.md) ## Related Links