From fe116314aa692558aad9fa398d4d288637edae43 Mon Sep 17 00:00:00 2001
From: tribbianij314-hub <tribbianij314@gmail.com>
Date: Tue, 24 Mar 2026 15:14:37 +0800
Subject: [PATCH] =?UTF-8?q?docs:=20rail-workflow=20=E2=80=93=20add=20begin?=
 =?UTF-8?q?ner=20end-to-end=20collection=20playbook?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../rail-knowledge-workflow/001_init.sql      |  41 ++++
 .../00_goal_non_goal_template.md              |  40 ++++
 .../01_source_registry_template.md            |  15 ++
 .../03_run_log_template.md                    |  44 ++++
 .../rail-knowledge-workflow/README.md         | 225 ++++++++++++++++++
 .../crossref_minimal_collector.py             |  85 +++++++
 .../prompt_collect_template.md                |  20 ++
 .../prompt_review_template.md                 |  17 ++
 8 files changed, 487 insertions(+)
 create mode 100644 i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/001_init.sql
 create mode 100644 i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/00_goal_non_goal_template.md
 create mode 100644 i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/01_source_registry_template.md
 create mode 100644 i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/03_run_log_template.md
 create mode 100644 i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/README.md
 create mode 100644 i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/crossref_minimal_collector.py
 create mode 100644 i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/prompt_collect_template.md
 create mode 100644 i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/prompt_review_template.md

diff --git a/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/001_init.sql b/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/001_init.sql
new file mode 100644
index 0000000..83a5449
--- /dev/null
+++ b/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/001_init.sql	
@@ -0,0 +1,41 @@
+-- PostgreSQL 初始化脚本（最小可用）
+
+CREATE TABLE IF NOT EXISTS sources (  -- registry of data sources that documents.source_id refers to
+  id            TEXT PRIMARY KEY,  -- caller-chosen text key, e.g. 'src_crossref'
+  name          TEXT NOT NULL,
+  source_type   TEXT NOT NULL,  -- free text: no CHECK constraint limits the values
+  entry_url     TEXT NOT NULL,
+  access_mode   TEXT NOT NULL,  -- free text: no CHECK constraint limits the values
+  status        TEXT NOT NULL DEFAULT 'active',
+  created_at    TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+CREATE TABLE IF NOT EXISTS documents (  -- collected document metadata, one row per document
+  id                BIGSERIAL PRIMARY KEY,
+  source_id         TEXT NOT NULL REFERENCES sources(id),  -- the source row must exist first
+  source_record_id  TEXT,
+  doi               TEXT,  -- nullable; uniqueness is enforced by idx_documents_doi_unique, not here
+  title             TEXT NOT NULL,
+  authors           JSONB NOT NULL DEFAULT '[]'::jsonb,  -- defaults to an empty JSON array
+  abstract          TEXT,
+  doc_type          TEXT NOT NULL,
+  language          TEXT,
+  published_at      DATE,
+  source_url        TEXT NOT NULL,
+  file_url          TEXT,
+  quality_status    TEXT NOT NULL DEFAULT 'pending',
+  collected_at      TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+  updated_at        TIMESTAMPTZ NOT NULL DEFAULT NOW()  -- insert-time default only; this script defines no trigger to refresh it on UPDATE
+);
+
+CREATE UNIQUE INDEX IF NOT EXISTS idx_documents_doi_unique  -- rejects a second row with the same DOI
+  ON documents ((LOWER(doi)))  -- DOIs are compared case-insensitively
+  WHERE doi IS NOT NULL;  -- partial index: rows without a DOI are not constrained
+
+CREATE INDEX IF NOT EXISTS idx_documents_title ON documents (title);  -- plain B-tree index; it does not use pg_trgm
+CREATE INDEX IF NOT EXISTS idx_documents_published_at ON documents (published_at);
+CREATE INDEX IF NOT EXISTS idx_documents_doc_type ON documents (doc_type);
+
+INSERT INTO sources (id, name, source_type, entry_url, access_mode, status)
+VALUES ('src_crossref', 'Crossref', 'api', 'https://api.crossref.org/works', 'api', 'active')
+ON CONFLICT (id) DO NOTHING;
diff --git a/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/00_goal_non_goal_template.md b/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/00_goal_non_goal_template.md
new file mode 100644
index 0000000..38230ef
--- /dev/null
+++ b/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/00_goal_non_goal_template.md	
@@ -0,0 +1,40 @@
+# 目标与非目标模板（请先填写）
+
+## 一句话目标
+
+在 [时间范围] 内，构建一个可持续更新的铁路知识数据库，覆盖 [范围] 并支持 [检索能力]。
+
+## 非目标
+
+- 本阶段不做：
+  - [ ] 全网覆盖
+  - [ ] 复杂前端
+  - [ ] 高并发分布式架构
+
+## 可改动范围
+
+- 允许改动：
+- 禁止改动：
+
+## 成功标准（可量化）
+
+- 数据源数量：
+- 入库数量：
+- 去重准确率：
+- 每日稳定抓取成功率：
+
+## 失败判定
+
+满足以下任一条件即判定失败：
+
+1.
+2.
+3.
+
+## 本周计划
+
+- Day 1:
+- Day 2:
+- Day 3:
+- Day 4:
+- Day 5:
diff --git a/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/01_source_registry_template.md b/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/01_source_registry_template.md
new file mode 100644
index 0000000..8fd9c1b
--- /dev/null
+++ b/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/01_source_registry_template.md	
@@ -0,0 +1,15 @@
+# 数据源登记模板
+
+> 规则：新增数据源前先登记；每次抓取后更新状态。
+
+| source_id | source_name | source_type | entry_url | access_mode | robots_ok | auth_required | priority | status | notes |
+|---|---|---|---|---|---|---|---|---|---|
+| src_crossref | Crossref | api | https://api.crossref.org/works | api | yes | no | p0 | active | 先做这个 |
+| src_openalex | OpenAlex | api | https://api.openalex.org/works | api | yes | no | p1 | pending | 第二阶段 |
+| src_manual_example | Railway Manual Site | html | https://example.com/manuals | crawler | pending | pending | p2 | pending | 需确认条款 |
+
+## 字段说明
+
+- `source_type`: api / html / pdf_index / rss
+- `access_mode`: api / crawler / browser
+- `status`: active / pending / blocked / retired
diff --git a/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/03_run_log_template.md b/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/03_run_log_template.md
new file mode 100644
index 0000000..4d3a6b2
--- /dev/null
+++ b/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/03_run_log_template.md	
@@ -0,0 +1,44 @@
+# 每日运行与复盘模板
+
+## 基本信息
+
+- 日期：
+- 负责人：
+- 本次目标：
+
+## 执行步骤
+
+1.
+2.
+3.
+
+## 结果摘要
+
+- 抓取总数：
+- 入库总数：
+- 去重后数量：
+- 错误数量：
+
+## 预期 vs 实际
+
+- 预期：
+- 实际：
+- 偏差原因：
+
+## 失败样本（最小复现）
+
+- 样本链接：
+- 错误日志：
+- 复现命令：
+
+## 明日计划
+
+1.
+2.
+3.
+
+## Prompt 资产沉淀
+
+- 本次有效 Prompt：
+- 需要淘汰 Prompt：
+- 下次优化方向：
diff --git a/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/README.md b/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/README.md
new file mode 100644
index 0000000..fa1a7ce
--- /dev/null
+++ b/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/README.md	
@@ -0,0 +1,225 @@
+# 铁路知识数据库：从 0 到 1 实战流程（单人版）
+
+> 目标用户：完全没有经验、但希望搭建一个可持续更新的铁路知识数据库的人。
+
+---
+
+## 1. 你将做出的成果
+
+完成本流程后，你将拥有：
+
+1. 一个可执行的目标与范围文档。
+2. 一个可扩展的数据源登记表。
+3. 一套统一数据库表结构（PostgreSQL）。
+4. 一个可运行的采集脚本（以 Crossref 为起点）。
+5. 一套去重、质检、复盘流程。
+6. 一组可复用 Prompt 模板。
+
+---
+
+## 2. 先理解方法：为什么要这样做
+
+本流程遵循以下原则：
+
+- 先规划、后编码。
+- 先跑通一个数据源，再逐步扩展。
+- 文档与提示词是资产，不是临时记录。
+- 每天都要有可验证产物和复盘。
+
+一句话：**把一次抓数据，做成可持续的数据工程能力。**
+
+---
+
+## 3. 总体架构（先看全貌）
+
+```text
+数据源（API/网页/PDF）
+  -> 采集层（fetchers）
+  -> 解析层（parsers）
+  -> 标准化层（normalizers）
+  -> 去重层（dedup）
+  -> 存储层（PostgreSQL + 原文文件）
+  -> 检索层（SQL / 全文检索）
+  -> 复盘层（日报 + Prompt 资产）
+```
+
+---
+
+## 4. 工具清单（最小可用）
+
+### 4.1 必需工具
+
+- Python 3.11+
+- PostgreSQL 15+
+- pip / venv
+- Git
+
+### 4.2 Python 依赖（第一阶段）
+
+- `httpx`：请求 API 与网页
+- `beautifulsoup4`：解析 HTML
+- `pydantic`：数据模型校验
+- `psycopg[binary]`：写入 PostgreSQL
+- `python-dotenv`：环境变量
+- `rapidfuzz`：标题相似度去重
+
+### 4.3 第二阶段再加
+
+- `playwright`：动态网页采集
+- `pymupdf`：PDF 文本提取
+- `prefect`/`apscheduler`：定时调度
+
+---
+
+## 5. 项目结构（直接照抄）
+
+```text
+rail-knowledge/
+  docs/
+    00_goal_non_goal.md
+    01_source_registry.md
+    02_data_quality.md
+    03_run_log.md
+  prompts/
+    prompt_collect.md
+    prompt_parse.md
+    prompt_dedup.md
+    prompt_review.md
+  sql/
+    001_init.sql
+  src/
+    main.py
+    settings.py
+    models.py
+    db.py
+    sources/
+      crossref.py
+    pipeline/
+      ingest.py
+      dedup.py
+  data/
+    raw/
+    exports/
+```
+
+---
+
+## 6. 8 天入门执行计划（建议）
+
+### Day 1：目标与边界
+
+- 填写 `docs/00_goal_non_goal.md`。
+- 确定第一批只抓 1 个来源：Crossref。
+
+验收：你可以一口气说清“抓什么、不抓什么”。
+
+### Day 2：搭建环境
+
+- 创建虚拟环境。
+- 安装依赖。
+- 建立 PostgreSQL 数据库。
+
+验收：`python -V` 显示 3.11 及以上版本，且能成功连接数据库。
+
+### Day 3：创建数据表
+
+- 执行 `sql/001_init.sql`。
+- 手动插入 1 条测试数据。
+
+验收：能查到该记录。
+
+### Day 4：写第一个采集器
+
+- 在 `src/sources/crossref.py` 实现关键词检索。
+- 保存原始响应到 `data/raw/`。
+
+验收：成功抓取 50 条记录。
+
+### Day 5：写标准化入库
+
+- 将原始字段映射到统一表结构。
+- 写入 `documents` 表。
+
+验收：数据库中至少 50 条结构化记录。
+
+### Day 6：做去重
+
+- DOI 精确去重。
+- 标题模糊匹配去重。
+
+验收：重复数据明显下降并可解释。
+
+### Day 7：做检索与导出
+
+- SQL 按关键词/年份/类型查询。
+- 导出 CSV。
+
+验收：可稳定导出查询结果。
+
+### Day 8：复盘与模板化
+
+- 写 `docs/03_run_log.md`。
+- 优化 Prompt 模板。
+
+验收：下一轮可直接复用。
+
+---
+
+## 7. 每日工作模板（固定动作）
+
+每天只做四件事：
+
+1. 今日目标（10 分钟）
+2. 单模块开发（90~120 分钟）
+3. 最小测试（30 分钟）
+4. 复盘沉淀（20 分钟）
+
+如果你时间有限，宁可减少功能，也不要省略复盘。
+
+---
+
+## 8. 数据源扩展顺序（建议）
+
+1. Crossref（元数据）
+2. OpenAlex（关联关系）
+3. 各铁路机构公开技术文档页面
+4. 公开标准目录页面
+5. 期刊官网公开摘要页面
+
+说明：每增加一个来源，必须先更新 `01_source_registry.md`。
+
+---
+
+## 9. 质量规则（最小版）
+
+每条记录至少包含：
+
+- `title`
+- `source_url`
+- `source_id`（来源标识，对应 `sources.id`）
+- `doc_type`
+- `collected_at`
+
+若字段缺失则标记 `quality_status = pending`，不得默默丢弃。
+
+---
+
+## 10. 风险与合规
+
+- 优先采集公开元数据，谨慎处理版权全文。
+- 遵守网站 robots、ToS 与频率限制。
+- 对每条记录保留来源链接与采集时间。
+
+---
+
+## 11. 你下一步立刻执行什么
+
+按顺序执行：
+
+1. 复制本目录中的模板文件，去掉文件名中的 `_template` 后缀，放入第 5 节对应的目录。
+2. 填写目标与数据源登记表。
+3. 执行 SQL 初始化数据库。
+4. 运行最小采集脚本抓取第一批数据。
+5. 做一次复盘并记录问题。
+
+先跑通，再做强。
diff --git a/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/crossref_minimal_collector.py b/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/crossref_minimal_collector.py
new file mode 100644
index 0000000..d6e7778
--- /dev/null
+++ b/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/crossref_minimal_collector.py	
@@ -0,0 +1,85 @@
+"""Crossref 最小采集器示例。
+
+运行方式：
+  python crossref_minimal_collector.py --query "railway signaling" --rows 20
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+
+import httpx
+
+
+def parse_args() -> argparse.Namespace:  # define the CLI flags, then parse the command line
+    parser = argparse.ArgumentParser(description="Crossref minimal collector")
+    parser.add_argument("--query", required=True, help="检索关键词")  # search keywords (mandatory)
+    parser.add_argument("--rows", type=int, default=20, help="返回条数")  # number of results to request
+    parser.add_argument("--out", default="./data/raw", help="输出目录")  # output directory
+    return parser.parse_args()
+
+
+def normalize_item(item: dict) -> dict:
+    authors = []
+    for author in item.get("author", []):
+        given = author.get("given", "")
+        family = author.get("family", "")
+        full_name = f"{given} {family}".strip()
+        if full_name:
+            authors.append(full_name)
+
+    published_at = None
+    issued = item.get("issued", {}).get("date-parts", [])
+    if issued and issued[0]:
+        parts = issued[0]
+        year = parts[0]
+        month = parts[1] if len(parts) > 1 else 1
+        day = parts[2] if len(parts) > 2 else 1
+        published_at = f"{year:04d}-{month:02d}-{day:02d}"
+
+    return {
+        "source_id": "src_crossref",
+        "source_record_id": item.get("DOI"),
+        "doi": item.get("DOI"),
+        "title": (item.get("title") or [""])[0],
+        "authors": authors,
+        "abstract": item.get("abstract"),
+        "doc_type": item.get("type", "unknown"),
+        "published_at": published_at,
+        "source_url": item.get("URL") or "",
+        "collected_at": datetime.now(timezone.utc).isoformat(),
+    }
+
+
+def collect_crossref(query: str, rows: int) -> list[dict]:
+    url = "https://api.crossref.org/works"
+    params = {"query": query, "rows": rows}
+
+    with httpx.Client(timeout=20) as client:
+        response = client.get(url, params=params)
+        response.raise_for_status()
+        payload = response.json()
+
+    items = payload.get("message", {}).get("items", [])
+    return [normalize_item(item) for item in items]
+
+
+def main() -> None:
+    args = parse_args()
+    records = collect_crossref(args.query, args.rows)
+
+    out_dir = Path(args.out)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
+    output_path = out_dir / f"crossref_{timestamp}.json"
+
+    output_path.write_text(json.dumps(records, ensure_ascii=False, indent=2), encoding="utf-8")
+    print(f"采集完成，记录数: {len(records)}")
+    print(f"输出文件: {output_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/prompt_collect_template.md b/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/prompt_collect_template.md
new file mode 100644
index 0000000..96664c4
--- /dev/null
+++ b/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/prompt_collect_template.md	
@@ -0,0 +1,20 @@
+# Prompt 模板：采集器生成
+
+你是资深 Python 数据工程师。请基于以下约束生成一个可运行的采集器：
+
+- 目标来源：{{source_name}}
+- 入口：{{entry_url}}
+- 输出：标准 JSON 列表
+- 字段：title, authors, doi, abstract, published_at, source_url, doc_type
+- 要求：
+  1) 失败重试 3 次
+  2) 请求间隔可配置
+  3) 记录结构化日志
+  4) 不使用过度抽象
+  5) 代码附最小运行示例
+
+请按以下结构输出：
+1. 文件结构
+2. 完整代码
+3. 运行命令
+4. 常见错误与修复
diff --git a/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/prompt_review_template.md b/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/prompt_review_template.md
new file mode 100644
index 0000000..0d4c67d
--- /dev/null
+++ b/i18n/zh/documents/Tutorials and Guides/rail-knowledge-workflow/prompt_review_template.md	
@@ -0,0 +1,17 @@
+# Prompt 模板：运行复盘
+
+你是我的技术复盘助手。请根据以下运行记录输出结构化复盘：
+
+输入：
+- 目标：{{goal}}
+- 预期：{{expected}}
+- 实际：{{actual}}
+- 错误日志：{{error_logs}}
+- 变更文件：{{changed_files}}
+
+输出格式：
+1. 是否达成目标（是/否 + 原因）
+2. 根因分析（最多 3 条）
+3. 修复优先级（P0/P1/P2）
+4. 明日最小可执行任务（最多 3 条）
+5. 可沉淀 Prompt 资产（新增/淘汰）