diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..f23d980 --- /dev/null +++ b/.env.example @@ -0,0 +1,17 @@ +# Required API keys +OPENAI_API_KEY=sk-your-openai-key +TAVILY_API_KEY=tvly-your-tavily-key + +# Optional observability +LANGSMITH_API_KEY=lsv2-your-langsmith-key + +# Model and data controls +HUGGINGFACE_MODEL=sentence-transformers/all-MiniLM-L6-v2 +SOURCE_URLS=https://zh.wikipedia.org/wiki/%E5%A4%A7%E7%86%8A%E7%8C%AB,https://zh.wikipedia.org/wiki/%E7%AB%B9,https://zh.wikipedia.org/wiki/%E7%86%8A%E7%8C%AB%E4%B8%83%E5%9B%BD%E8%B5%A0%E9%80%81 +# (可选)单一 URL 兜底,未提供 SOURCE_URLS 时启用 +# SOURCE_URL=https://zh.wikipedia.org/wiki/%E5%A4%A7%E7%86%8A%E7%8C%AB +CHUNK_SIZE=1000 +CHUNK_OVERLAP=200 +CHAT_MODEL=gpt-5-nano +TEMPERATURE=0.0 +TAVILY_RESULTS=3 diff --git a/README.md b/README.md index e86652c..c2af821 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,35 @@ _统一图片、文本、结构化数据的预处理与对齐流程,作为后续 LoRA + RAG 方案的第一步。_ +## 智能对话助手(企业级样板) + +`assistant_app/` 目录提供了将示例代码演进为企业级智能对话助手的骨架: + +- `config.py`:集中式配置,支持 `.env`/环境变量读取与必需密钥校验。 +- `data.py`:网页文档加载、分块与 FAISS 检索器构建。 +- `tools.py`:Tavily 搜索与基于向量库的维基百科检索工具注册。 +- `agent.py`:基于 LangGraph 的有记忆代理构建,可选接入 LangSmith 提示与追踪。 +- `main.py`:演示入口,展示工具绑定与多线程(多会话)记忆隔离的用法。 +- `mcp_server.py`:可选的 Model Context Protocol (MCP) 服务,将检索与搜索工具以标准化接口暴露给外部客户端。 + +快速开始: + +```bash +cp .env.example .env # 填写 OpenAI/Tavily/LangSmith 等密钥 +python -m assistant_app.main + +# (可选)启动 MCP 服务,将工具暴露给兼容客户端 +python -m assistant_app.mcp_server +``` + +以上脚本会自动完成网页抓取、向量索引构建、工具装配,并输出多个 thread_id 的独立回答,方便接入企业客服、运营等场景。 + +### 扩充维基百科知识库 + +- 支持通过逗号分隔的 `SOURCE_URLS` 一次性抓取多篇维基百科文章(默认包含大熊猫、竹类、熊猫外交、栖息地保护等页面),自动合并到同一个向量检索库。 +- 兼容单一 `SOURCE_URL` 环境变量作为兜底配置,便于按需切换主题。 +- 所有页面会被自动分块(默认 1000/200)并写入 FAISS 索引,`wiki_search` 工具即可检索更丰富的百科知识。 + ## 目录 1. 
[多模态预处理概览](#多模态预处理概览) diff --git a/assistant_app/__init__.py b/assistant_app/__init__.py new file mode 100644 index 0000000..eadc26c --- /dev/null +++ b/assistant_app/__init__.py @@ -0,0 +1 @@ +"""Enterprise-ready intelligent assistant package.""" diff --git a/assistant_app/agent.py b/assistant_app/agent.py new file mode 100644 index 0000000..624b074 --- /dev/null +++ b/assistant_app/agent.py @@ -0,0 +1,49 @@ +""" +Factory for the LangGraph-powered conversational agent. +""" +from __future__ import annotations + +from langchain.agents import create_agent +from langchain_openai import ChatOpenAI +from langgraph.checkpoint.memory import MemorySaver +from langsmith import Client + +from .config import Settings + + +def build_llm(settings: Settings) -> ChatOpenAI: + """Initialize the chat model used throughout the system.""" + + return ChatOpenAI(model=settings.chat_model, temperature=settings.temperature) + + +def build_agent(settings: Settings, tools): + """Create a tool-aware agent with memory.""" + + settings.validate() + llm = build_llm(settings) + prompt = None + if settings.langsmith_api_key: + client = Client(api_key=settings.langsmith_api_key) + prompt = client.pull_prompt("hwchase17/openai-functions-agent", include_model=True) + + agent_prompt = prompt if prompt is not None else None + memory = MemorySaver() + + return create_agent( + model=llm, + tools=tools, + prompt=agent_prompt, + checkpointer=memory, + ) + + +def run_sample_sessions(agent, thread_ids, question: str): + """Demonstrate how different threads maintain context independently.""" + + results = {} + for thread_id in thread_ids: + config = {"configurable": {"thread_id": thread_id}} + response = agent.invoke({"messages": [{"role": "user", "content": question}]}, config) + results[thread_id] = response + return results diff --git a/assistant_app/config.py b/assistant_app/config.py new file mode 100644 index 0000000..828610d --- /dev/null +++ b/assistant_app/config.py @@ -0,0 +1,84 @@ +""" 
+Configuration helpers for the intelligent assistant service. +""" +from __future__ import annotations + +import os +from dataclasses import dataclass, field +from typing import List, Optional + + +@dataclass +class Settings: + """Runtime configuration sourced from environment variables.""" + + openai_api_key: Optional[str] = None + langsmith_api_key: Optional[str] = None + tavily_api_key: Optional[str] = None + huggingface_model: str = "sentence-transformers/all-MiniLM-L6-v2" + source_urls: List[str] = field( + default_factory=lambda: [ + "https://zh.wikipedia.org/wiki/%E5%A4%A7%E7%86%8A%E7%8C%AB", # 大熊猫 + "https://zh.wikipedia.org/wiki/%E7%AB%B9", # 竹类 + "https://zh.wikipedia.org/wiki/%E7%86%8A%E7%8C%AB%E4%B8%83%E5%9B%BD%E8%B5%A0%E9%80%81", # 熊猫外交 + "https://zh.wikipedia.org/wiki/%E4%B8%AD%E5%9B%BD%E7%86%8A%E7%8C%AB%E5%8E%9F%E5%9C%B0", # 熊猫栖息地与保护 + ] + ) + chunk_size: int = 1000 + chunk_overlap: int = 200 + chat_model: str = "gpt-5-nano" + temperature: float = 0.0 + tavily_results: int = 3 + default_thread_ids: List[str] = field(default_factory=lambda: ["customer-care", "ops-monitoring"]) + + @classmethod + def from_env(cls) -> "Settings": + """Create settings using environment variables with sensible defaults.""" + + return cls( + openai_api_key=os.getenv("OPENAI_API_KEY"), + langsmith_api_key=os.getenv("LANGSMITH_API_KEY"), + tavily_api_key=os.getenv("TAVILY_API_KEY"), + huggingface_model=os.getenv( + "HUGGINGFACE_MODEL", "sentence-transformers/all-MiniLM-L6-v2" + ), + source_urls=cls._parse_source_urls(), + chunk_size=int(os.getenv("CHUNK_SIZE", 1000)), + chunk_overlap=int(os.getenv("CHUNK_OVERLAP", 200)), + chat_model=os.getenv("CHAT_MODEL", "gpt-5-nano"), + temperature=float(os.getenv("TEMPERATURE", 0.0)), + tavily_results=int(os.getenv("TAVILY_RESULTS", 3)), + ) + + def validate(self) -> None: + """Validate that required keys are present.""" + + missing: list[str] = [] + if not self.openai_api_key: + missing.append("OPENAI_API_KEY") + if not 
self.tavily_api_key: + missing.append("TAVILY_API_KEY") + + if missing: + raise RuntimeError( + "Missing required environment variables: " + ", ".join(sorted(missing)) + ) + + @classmethod + def _parse_source_urls(cls) -> List[str]: + """Support comma-separated SOURCE_URLS or a single SOURCE_URL fallback.""" + + urls = os.getenv("SOURCE_URLS") + if urls: + parsed = [url.strip() for url in urls.split(",") if url.strip()] + if parsed: + return parsed + + single = os.getenv("SOURCE_URL") + if single: + return [single] + + return cls().source_urls + + +settings = Settings.from_env() diff --git a/assistant_app/data.py b/assistant_app/data.py new file mode 100644 index 0000000..d661b88 --- /dev/null +++ b/assistant_app/data.py @@ -0,0 +1,42 @@ +""" +Document ingestion and retriever construction. +""" +from __future__ import annotations + +from langchain_community.document_loaders import WebBaseLoader +from langchain_community.vectorstores import FAISS +from langchain_huggingface.embeddings import HuggingFaceEmbeddings +from langchain_text_splitters import RecursiveCharacterTextSplitter + +from .config import Settings + + +def load_documents(settings: Settings): + """Load documents from the configured Wikipedia sources.""" + + documents = [] + for url in settings.source_urls: + loader = WebBaseLoader(url) + documents.extend(loader.load()) + return documents + + +def split_documents(docs, settings: Settings): + """Split documents into overlapping chunks for retrieval.""" + + splitter = RecursiveCharacterTextSplitter( + chunk_size=settings.chunk_size, + chunk_overlap=settings.chunk_overlap, + separators=["\n\n", "\n", " ", ""], + ) + return splitter.split_documents(docs) + + +def build_retriever(settings: Settings): + """Construct a FAISS-backed retriever from web documents.""" + + docs = load_documents(settings) + chunks = split_documents(docs, settings) + embeddings = HuggingFaceEmbeddings(model_name=settings.huggingface_model) + vectorstore = FAISS.from_documents(chunks, 
embeddings) + return vectorstore.as_retriever() diff --git a/assistant_app/main.py b/assistant_app/main.py new file mode 100644 index 0000000..55416d4 --- /dev/null +++ b/assistant_app/main.py @@ -0,0 +1,45 @@ +""" +Command-line entry point to run the intelligent assistant locally. +""" +from __future__ import annotations + +from langchain_core.messages import HumanMessage, SystemMessage + +from .agent import build_agent, run_sample_sessions +from .config import settings +from .data import build_retriever +from .tools import build_tools + + +def build_conversational_agent(): + """Build the end-to-end agent with tools and retrieval.""" + + retriever = build_retriever(settings) + tools = build_tools(retriever, settings) + agent = build_agent(settings, tools) + return agent, tools + + +def run_demo(): + """Run a short demo showing tool binding and memory.""" + + agent, tools = build_conversational_agent() + + model_with_tools = agent.model.bind_tools(tools=tools) + messages = [ + SystemMessage("你是一个乐于助人的企业级智能对话助手。"), + HumanMessage("2025年12月10日天津天气怎么样?"), + ] + _ = model_with_tools.invoke(messages) + + thread_results = run_sample_sessions( + agent=agent, thread_ids=settings.default_thread_ids, question="熊猫的特征" + ) + + for thread_id, result in thread_results.items(): + content = result["messages"][-1].content + print(f"[Thread: {thread_id}] {content}") + + +if __name__ == "__main__": + run_demo() diff --git a/assistant_app/mcp_server.py b/assistant_app/mcp_server.py new file mode 100644 index 0000000..e6d4ca4 --- /dev/null +++ b/assistant_app/mcp_server.py @@ -0,0 +1,70 @@ +""" +Minimal Model Context Protocol (MCP) server exposing the assistant tools. + +This module wires the existing retrieval and search capabilities into an MCP +server so external clients can call them as standardized tools. 
+""" +from __future__ import annotations + +from langchain_community.tools.tavily_search import TavilySearchResults +from mcp.server.fastmcp import FastMCP +from mcp.types import Resource + +from .config import settings +from .data import build_retriever + + +async def _panda_resource() -> Resource: + """Provide metadata describing the underlying panda corpus.""" + + return Resource( + uri=settings.source_url, + mimeType="text/html", + description="Primary source describing panda characteristics.", + ) + + +def create_mcp_server(): + """Create an MCP server exposing internet search and panda retrieval.""" + + retriever = build_retriever(settings) + search = TavilySearchResults(max_results=settings.tavily_results) + + server = FastMCP("assistant-mcp") + + @server.tool() + async def tavily_search(query: str) -> str: + """进行互联网搜索并返回结果""" + + return search.run(query) + + @server.tool() + async def wiki_panda_search(query: str) -> str: + """根据向量数据库内容检索大熊猫相关信息。""" + + results = retriever.get_relevant_documents(query) + return "\n\n".join([doc.page_content for doc in results]) + + @server.resource() + async def panda_wiki_source() -> Resource: + """Expose the configured panda wiki URL as a resource for MCP clients.""" + + return await _panda_resource() + + return server + + +async def serve(host: str = "0.0.0.0", port: int = 8001) -> None: + """Run the MCP server.""" + + server = create_mcp_server() + await server.serve(host=host, port=port) + + +__all__ = ["create_mcp_server", "serve"] + + +if __name__ == "__main__": + import asyncio + + asyncio.run(serve()) diff --git a/assistant_app/tools.py b/assistant_app/tools.py new file mode 100644 index 0000000..93c7efb --- /dev/null +++ b/assistant_app/tools.py @@ -0,0 +1,37 @@ +""" +Tooling used by the conversational agent. 
# --- assistant_app/tools.py ---
"""Tooling used by the conversational agent."""
from __future__ import annotations

from langchain.tools import tool
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.tools import create_retriever_tool

from .config import Settings


def build_tools(retriever, settings: Settings):
    """Create the reusable tools handed to the agent.

    Args:
        retriever: a LangChain retriever (e.g. from ``build_retriever``).
        settings: runtime configuration (Tavily result count, etc.).

    Returns:
        A list of two tools: an internet search tool and a vector-store
        backed Wikipedia lookup tool.
    """
    search = TavilySearchResults(max_results=settings.tavily_results)

    @tool
    def search_tool(query: str) -> str:
        """进行互联网搜索并返回结果"""
        return search.run(query)

    retriever_tool = create_retriever_tool(
        retriever=retriever,
        name="wiki_search",
        description="搜索维基百科",
    )

    @tool
    def wiki_panda_search(query: str) -> str:
        """根据向量数据库内容检索相关的维基百科知识。"""
        # BUG FIX: the original called `get_relevant_documents` on the Tool
        # returned by create_retriever_tool, which has no such method and
        # raised AttributeError.  Invoke the tool instead — it retrieves and
        # returns the matching page contents joined by blank lines.
        return retriever_tool.invoke(query)

    return [search_tool, wiki_panda_search]