Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Required API keys
OPENAI_API_KEY=sk-your-openai-key
TAVILY_API_KEY=tvly-your-tavily-key

# Optional observability
LANGSMITH_API_KEY=lsv2-your-langsmith-key

# Model and data controls
HUGGINGFACE_MODEL=sentence-transformers/all-MiniLM-L6-v2
SOURCE_URLS=https://zh.wikipedia.org/wiki/%E5%A4%A7%E7%86%8A%E7%8C%AB,https://zh.wikipedia.org/wiki/%E7%AB%B9,https://zh.wikipedia.org/wiki/%E7%86%8A%E7%8C%AB%E4%B8%83%E5%9B%BD%E8%B5%A0%E9%80%81
# (可选)单一 URL 兜底,未提供 SOURCE_URLS 时启用
# SOURCE_URL=https://zh.wikipedia.org/wiki/%E5%A4%A7%E7%86%8A%E7%8C%AB
CHUNK_SIZE=1000
CHUNK_OVERLAP=200
CHAT_MODEL=gpt-5-nano
TEMPERATURE=0.0
TAVILY_RESULTS=3
29 changes: 29 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,35 @@

_统一图片、文本、结构化数据的预处理与对齐流程,作为后续 LoRA + RAG 方案的第一步。_

## 智能对话助手(企业级样板)

`assistant_app/` 目录提供了将示例代码演进为企业级智能对话助手的骨架:

- `config.py`:集中式配置,支持 `.env`/环境变量读取与必需密钥校验。
- `data.py`:网页文档加载、分块与 FAISS 检索器构建。
- `tools.py`:Tavily 搜索与基于向量库的维基百科检索工具注册。
- `agent.py`:基于 LangGraph 的有记忆代理构建,可选接入 LangSmith 提示与追踪。
- `main.py`:演示入口,展示工具绑定与多线程(多会话)记忆隔离的用法。
- `mcp_server.py`:可选的 Model Context Protocol (MCP) 服务,将检索与搜索工具以标准化接口暴露给外部客户端。

快速开始:

```bash
cp .env.example .env # 填写 OpenAI/Tavily/LangSmith 等密钥
python -m assistant_app.main

# (可选)启动 MCP 服务,将工具暴露给兼容客户端
python -m assistant_app.mcp_server
```

以上脚本会自动完成网页抓取、向量索引构建、工具装配,并输出多个 thread_id 的独立回答,方便接入企业客服、运营等场景。

### 扩充维基百科知识库

- 支持通过逗号分隔的 `SOURCE_URLS` 一次性抓取多篇维基百科文章(默认包含大熊猫、竹类、熊猫外交、栖息地保护等页面),自动合并到同一个向量检索库。
- 兼容单一 `SOURCE_URL` 环境变量作为兜底配置,便于按需切换主题。
- 所有页面会被自动分块(默认 1000/200)并写入 FAISS 索引,`wiki_search` 工具即可检索更丰富的百科知识。

## 目录

1. [多模态预处理概览](#多模态预处理概览)
Expand Down
1 change: 1 addition & 0 deletions assistant_app/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Enterprise-ready intelligent assistant package."""
49 changes: 49 additions & 0 deletions assistant_app/agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""
Factory for the LangGraph-powered conversational agent.
"""
from __future__ import annotations

from langchain.agents import create_agent
from langchain_openai import ChatOpenAI
from langgraph.checkpoint.memory import MemorySaver
from langsmith import Client

from .config import Settings


def build_llm(settings: Settings) -> ChatOpenAI:
    """Construct the shared chat model from runtime settings."""

    model_kwargs = {
        "model": settings.chat_model,
        "temperature": settings.temperature,
    }
    return ChatOpenAI(**model_kwargs)


def build_agent(settings: Settings, tools):
    """Create a tool-aware agent with conversational memory.

    Args:
        settings: Runtime configuration; required API keys are validated here.
        tools: Sequence of LangChain tools the agent may call.

    Returns:
        A LangGraph agent backed by an in-memory checkpointer, so distinct
        ``thread_id`` values keep independent conversation histories.

    Raises:
        RuntimeError: If required API keys are missing (from ``validate()``).
    """

    settings.validate()
    llm = build_llm(settings)

    # Pull a hub prompt only when LangSmith is configured; otherwise pass
    # None so create_agent falls back to its default prompt.
    prompt = None
    if settings.langsmith_api_key:
        client = Client(api_key=settings.langsmith_api_key)
        prompt = client.pull_prompt("hwchase17/openai-functions-agent", include_model=True)

    # Fix: the original `agent_prompt = prompt if prompt is not None else None`
    # was a no-op re-assignment; pass `prompt` directly.
    return create_agent(
        model=llm,
        tools=tools,
        prompt=prompt,
        checkpointer=MemorySaver(),
    )


def run_sample_sessions(agent, thread_ids, question: str):
    """Demonstrate how different threads maintain context independently.

    Invokes the agent once per thread id with the same question; each call
    carries its own ``configurable.thread_id`` so the checkpointer keeps the
    conversations isolated. Returns a mapping of thread id -> raw response.
    """

    return {
        tid: agent.invoke(
            {"messages": [{"role": "user", "content": question}]},
            {"configurable": {"thread_id": tid}},
        )
        for tid in thread_ids
    }
84 changes: 84 additions & 0 deletions assistant_app/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""
Configuration helpers for the intelligent assistant service.
"""
from __future__ import annotations

import os
from dataclasses import dataclass, field
from typing import List, Optional


# Default corpus: related Chinese Wikipedia articles used when no
# SOURCE_URLS / SOURCE_URL environment override is provided.
_DEFAULT_SOURCE_URLS = [
    "https://zh.wikipedia.org/wiki/%E5%A4%A7%E7%86%8A%E7%8C%AB",  # Giant panda
    "https://zh.wikipedia.org/wiki/%E7%AB%B9",  # Bamboo
    "https://zh.wikipedia.org/wiki/%E7%86%8A%E7%8C%AB%E4%B8%83%E5%9B%BD%E8%B5%A0%E9%80%81",  # Panda diplomacy
    "https://zh.wikipedia.org/wiki/%E4%B8%AD%E5%9B%BD%E7%86%8A%E7%8C%AB%E5%8E%9F%E5%9C%B0",  # Panda habitat & conservation
]


@dataclass
class Settings:
    """Runtime configuration sourced from environment variables.

    Every field has a sensible default so the object can be constructed
    without any environment set up; :meth:`validate` enforces the required
    API keys before the assistant actually runs.
    """

    # Required credential for the chat model (checked in validate()).
    openai_api_key: Optional[str] = None
    # Optional: enables LangSmith prompt pulling / tracing when present.
    langsmith_api_key: Optional[str] = None
    # Required credential for Tavily web search (checked in validate()).
    tavily_api_key: Optional[str] = None
    # Embedding model name passed to HuggingFaceEmbeddings.
    huggingface_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    # Pages ingested into the vector store; copied so instances never share
    # (or mutate) the module-level default list.
    source_urls: List[str] = field(default_factory=lambda: list(_DEFAULT_SOURCE_URLS))
    chunk_size: int = 1000
    chunk_overlap: int = 200
    chat_model: str = "gpt-5-nano"
    temperature: float = 0.0
    tavily_results: int = 3
    default_thread_ids: List[str] = field(default_factory=lambda: ["customer-care", "ops-monitoring"])

    @classmethod
    def from_env(cls) -> "Settings":
        """Create settings using environment variables with sensible defaults."""

        return cls(
            openai_api_key=os.getenv("OPENAI_API_KEY"),
            langsmith_api_key=os.getenv("LANGSMITH_API_KEY"),
            tavily_api_key=os.getenv("TAVILY_API_KEY"),
            huggingface_model=os.getenv(
                "HUGGINGFACE_MODEL", "sentence-transformers/all-MiniLM-L6-v2"
            ),
            source_urls=cls._parse_source_urls(),
            # getenv defaults are given as strings so int()/float() always
            # receive a consistent type, whichever branch is taken.
            chunk_size=int(os.getenv("CHUNK_SIZE", "1000")),
            chunk_overlap=int(os.getenv("CHUNK_OVERLAP", "200")),
            chat_model=os.getenv("CHAT_MODEL", "gpt-5-nano"),
            temperature=float(os.getenv("TEMPERATURE", "0.0")),
            tavily_results=int(os.getenv("TAVILY_RESULTS", "3")),
        )

    def validate(self) -> None:
        """Raise ``RuntimeError`` if any required API key is absent."""

        missing: List[str] = []
        if not self.openai_api_key:
            missing.append("OPENAI_API_KEY")
        if not self.tavily_api_key:
            missing.append("TAVILY_API_KEY")

        if missing:
            raise RuntimeError(
                "Missing required environment variables: " + ", ".join(sorted(missing))
            )

    @classmethod
    def _parse_source_urls(cls) -> List[str]:
        """Support comma-separated SOURCE_URLS or a single SOURCE_URL fallback.

        Fix: the original fallback instantiated a throwaway ``cls()`` just to
        read the field default; return a copy of the module constant instead.
        """

        urls = os.getenv("SOURCE_URLS")
        if urls:
            parsed = [url.strip() for url in urls.split(",") if url.strip()]
            if parsed:
                return parsed

        single = os.getenv("SOURCE_URL")
        if single:
            return [single]

        return list(_DEFAULT_SOURCE_URLS)


settings = Settings.from_env()
42 changes: 42 additions & 0 deletions assistant_app/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""
Document ingestion and retriever construction.
"""
from __future__ import annotations

from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from .config import Settings


def load_documents(settings: Settings):
    """Fetch every configured Wikipedia page as LangChain documents."""

    return [
        doc
        for url in settings.source_urls
        for doc in WebBaseLoader(url).load()
    ]


def split_documents(docs, settings: Settings):
    """Break documents into overlapping chunks sized for retrieval."""

    chunker = RecursiveCharacterTextSplitter(
        chunk_size=settings.chunk_size,
        chunk_overlap=settings.chunk_overlap,
        # Prefer paragraph, then line, then word boundaries before
        # falling back to hard character splits.
        separators=["\n\n", "\n", " ", ""],
    )
    return chunker.split_documents(docs)


def build_retriever(settings: Settings):
    """Build a FAISS-backed retriever over the configured web corpus."""

    embeddings = HuggingFaceEmbeddings(model_name=settings.huggingface_model)
    chunks = split_documents(load_documents(settings), settings)
    return FAISS.from_documents(chunks, embeddings).as_retriever()
45 changes: 45 additions & 0 deletions assistant_app/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""
Command-line entry point to run the intelligent assistant locally.
"""
from __future__ import annotations

from langchain_core.messages import HumanMessage, SystemMessage

from .agent import build_agent, build_llm, run_sample_sessions
from .config import settings
from .data import build_retriever
from .tools import build_tools


def build_conversational_agent():
    """Assemble retriever, tools, and agent into a ready-to-use pipeline."""

    tools = build_tools(build_retriever(settings), settings)
    return build_agent(settings, tools), tools


def run_demo():
    """Run a short demo showing tool binding and per-thread memory isolation."""

    agent, tools = build_conversational_agent()

    # Fix: `create_agent` returns a compiled LangGraph application which does
    # not expose the underlying chat model as `.model` (the original
    # `agent.model.bind_tools(...)` would raise AttributeError). Build the
    # model directly and bind the tools to it.
    model_with_tools = build_llm(settings).bind_tools(tools=tools)
    messages = [
        SystemMessage("你是一个乐于助人的企业级智能对话助手。"),
        HumanMessage("2025年12月10日天津天气怎么样?"),
    ]
    _ = model_with_tools.invoke(messages)

    thread_results = run_sample_sessions(
        agent=agent, thread_ids=settings.default_thread_ids, question="熊猫的特征"
    )

    # Each thread_id keeps an isolated conversation history via the checkpointer.
    for thread_id, result in thread_results.items():
        content = result["messages"][-1].content
        print(f"[Thread: {thread_id}] {content}")


if __name__ == "__main__":
    run_demo()
70 changes: 70 additions & 0 deletions assistant_app/mcp_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""
Minimal Model Context Protocol (MCP) server exposing the assistant tools.

This module wires the existing retrieval and search capabilities into an MCP
server so external clients can call them as standardized tools.
"""
from __future__ import annotations

from langchain_community.tools.tavily_search import TavilySearchResults
from mcp.server.fastmcp import FastMCP
from mcp.types import Resource

from .config import settings
from .data import build_retriever


async def _panda_resource() -> Resource:
    """Provide metadata describing the underlying panda corpus.

    Fix: `Settings` defines `source_urls` (a list); the original
    `settings.source_url` attribute does not exist and would raise
    AttributeError. Use the first configured URL as the primary source,
    and supply the `name` field that `mcp.types.Resource` requires.
    """

    return Resource(
        uri=settings.source_urls[0],
        name="panda-wiki",
        mimeType="text/html",
        description="Primary source describing panda characteristics.",
    )


def create_mcp_server():
    """Create an MCP server exposing internet search and panda retrieval.

    Returns:
        A FastMCP server with two tools (Tavily web search and vector-store
        retrieval) and one resource describing the panda corpus.
    """

    retriever = build_retriever(settings)
    search = TavilySearchResults(max_results=settings.tavily_results)

    server = FastMCP("assistant-mcp")

    @server.tool()
    async def tavily_search(query: str) -> str:
        """进行互联网搜索并返回结果"""

        return search.run(query)

    @server.tool()
    async def wiki_panda_search(query: str) -> str:
        """根据向量数据库内容检索大熊猫相关信息。"""

        results = retriever.get_relevant_documents(query)
        return "\n\n".join([doc.page_content for doc in results])

    # Fix: FastMCP's resource decorator requires an explicit URI argument;
    # the original bare `@server.resource()` raises TypeError at startup.
    @server.resource("resource://panda-wiki")
    async def panda_wiki_source() -> Resource:
        """Expose the configured panda wiki URL as a resource for MCP clients."""

        return await _panda_resource()

    return server


async def serve(host: str = "0.0.0.0", port: int = 8001) -> None:
    """Run the MCP server over SSE on the given host/port.

    Fix: FastMCP has no `serve()` method; configure the bind address via
    its settings object and start the SSE transport runner instead.
    NOTE(review): verify the transport choice against the installed
    `mcp` SDK version — stdio transport uses `run_stdio_async()` instead.
    """

    server = create_mcp_server()
    server.settings.host = host
    server.settings.port = port
    await server.run_sse_async()


__all__ = ["create_mcp_server", "serve"]


if __name__ == "__main__":
    import asyncio

    asyncio.run(serve())
37 changes: 37 additions & 0 deletions assistant_app/tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""
Tooling used by the conversational agent.
"""
from __future__ import annotations

from langchain.tools import tool
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.tools import create_retriever_tool

from .config import Settings


def build_tools(retriever, settings: Settings):
    """Create reusable tools for the agent.

    Args:
        retriever: Vector-store retriever over the ingested wiki corpus.
        settings: Runtime configuration (controls the Tavily result count).

    Returns:
        A list of two LangChain tools: internet search and wiki retrieval.
    """

    search = TavilySearchResults(max_results=settings.tavily_results)

    @tool
    def search_tool(query: str) -> str:
        """进行互联网搜索并返回结果"""

        return search.run(query)

    @tool
    def wiki_panda_search(query: str) -> str:
        """根据向量数据库内容检索相关的维基百科知识。"""

        # Fix: the original called `retriever_tool.get_relevant_documents`,
        # but the Tool returned by `create_retriever_tool` has no such
        # method (AttributeError at call time). Query the retriever
        # directly; the unused `create_retriever_tool` wrapper is dropped.
        results = retriever.get_relevant_documents(query)
        return "\n\n".join(doc.page_content for doc in results)

    return [search_tool, wiki_panda_search]