Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Required API keys
OPENAI_API_KEY=sk-your-openai-key
TAVILY_API_KEY=tvly-your-tavily-key

# Optional observability
LANGSMITH_API_KEY=lsv2-your-langsmith-key

# Model and data controls
HUGGINGFACE_MODEL=sentence-transformers/all-MiniLM-L6-v2
SOURCE_URLS=https://zh.wikipedia.org/wiki/%E5%A4%A7%E7%86%8A%E7%8C%AB,https://zh.wikipedia.org/wiki/%E7%AB%B9,https://zh.wikipedia.org/wiki/%E7%86%8A%E7%8C%AB%E4%B8%83%E5%9B%BD%E8%B5%A0%E9%80%81
# (可选)单一 URL 兜底,未提供 SOURCE_URLS 时启用
# SOURCE_URL=https://zh.wikipedia.org/wiki/%E5%A4%A7%E7%86%8A%E7%8C%AB
CHUNK_SIZE=1000
CHUNK_OVERLAP=200
CHAT_MODEL=gpt-5-nano
TEMPERATURE=0.0
TAVILY_RESULTS=3
29 changes: 29 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,35 @@

_统一图片、文本、结构化数据的预处理与对齐流程,作为后续 LoRA + RAG 方案的第一步。_

## 智能对话助手(企业级样板)

`assistant_app/` 目录提供了将示例代码演进为企业级智能对话助手的骨架:

- `config.py`:集中式配置,支持 `.env`/环境变量读取与必需密钥校验。
- `data.py`:网页文档加载、分块与 FAISS 检索器构建。
- `tools.py`:Tavily 搜索与基于向量库的维基百科检索工具注册。
- `agent.py`:基于 LangGraph 的有记忆代理构建,可选接入 LangSmith 提示与追踪。
- `main.py`:演示入口,展示工具绑定与多线程(多会话)记忆隔离的用法。
- `mcp_server.py`:可选的 Model Context Protocol (MCP) 服务,将检索与搜索工具以标准化接口暴露给外部客户端。

快速开始:

```bash
cp .env.example .env # 填写 OpenAI/Tavily/LangSmith 等密钥
python -m assistant_app.main

# (可选)启动 MCP 服务,将工具暴露给兼容客户端
python -m assistant_app.mcp_server
```

以上脚本会自动完成网页抓取、向量索引构建、工具装配,并输出多个 thread_id 的独立回答,方便接入企业客服、运营等场景。

### 扩充维基百科知识库

- 支持通过逗号分隔的 `SOURCE_URLS` 一次性抓取多篇维基百科文章(默认包含大熊猫、竹类、熊猫外交、栖息地保护等页面),自动合并到同一个向量检索库。
- 兼容单一 `SOURCE_URL` 环境变量作为兜底配置,便于按需切换主题。
- 所有页面会被自动分块(默认 1000/200)并写入 FAISS 索引,`wiki_search` 工具即可检索更丰富的百科知识。

## 目录

1. [多模态预处理概览](#多模态预处理概览)
Expand Down
1 change: 1 addition & 0 deletions assistant_app/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Enterprise-ready intelligent assistant package."""
49 changes: 49 additions & 0 deletions assistant_app/agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""
Factory for the LangGraph-powered conversational agent.
"""
from __future__ import annotations

from langchain.agents import create_agent
from langchain_openai import ChatOpenAI
from langgraph.checkpoint.memory import MemorySaver
from langsmith import Client

from .config import Settings


def build_llm(settings: Settings) -> ChatOpenAI:
    """Construct the shared chat model from runtime settings."""

    model_kwargs = {
        "model": settings.chat_model,
        "temperature": settings.temperature,
    }
    return ChatOpenAI(**model_kwargs)


def build_agent(settings: Settings, tools):
    """Create a tool-aware agent with conversational memory.

    Args:
        settings: Runtime configuration; required API keys are validated here.
        tools: Sequence of LangChain tools the agent may call.

    Returns:
        A LangGraph agent backed by an in-memory checkpointer, so distinct
        ``thread_id`` values keep independent conversation histories.

    Raises:
        RuntimeError: If required API keys are missing (from ``validate()``).
    """

    settings.validate()
    llm = build_llm(settings)

    # Pull a hub prompt only when LangSmith is configured; otherwise pass
    # None so create_agent falls back to its default prompt.
    prompt = None
    if settings.langsmith_api_key:
        client = Client(api_key=settings.langsmith_api_key)
        prompt = client.pull_prompt("hwchase17/openai-functions-agent", include_model=True)

    # Fix: the original `agent_prompt = prompt if prompt is not None else None`
    # was a no-op re-assignment; pass `prompt` directly.
    return create_agent(
        model=llm,
        tools=tools,
        prompt=prompt,
        checkpointer=MemorySaver(),
    )


def run_sample_sessions(agent, thread_ids, question: str):
    """Demonstrate how different threads maintain context independently.

    Invokes the agent once per thread id with the same question; each call
    carries its own ``configurable.thread_id`` so the checkpointer keeps the
    conversations isolated. Returns a mapping of thread id -> raw response.
    """

    return {
        tid: agent.invoke(
            {"messages": [{"role": "user", "content": question}]},
            {"configurable": {"thread_id": tid}},
        )
        for tid in thread_ids
    }
84 changes: 84 additions & 0 deletions assistant_app/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""
Configuration helpers for the intelligent assistant service.
"""
from __future__ import annotations

import os
from dataclasses import dataclass, field
from typing import List, Optional


# Default corpus: related Chinese Wikipedia articles used when no
# SOURCE_URLS / SOURCE_URL environment override is provided.
_DEFAULT_SOURCE_URLS = [
    "https://zh.wikipedia.org/wiki/%E5%A4%A7%E7%86%8A%E7%8C%AB",  # Giant panda
    "https://zh.wikipedia.org/wiki/%E7%AB%B9",  # Bamboo
    "https://zh.wikipedia.org/wiki/%E7%86%8A%E7%8C%AB%E4%B8%83%E5%9B%BD%E8%B5%A0%E9%80%81",  # Panda diplomacy
    "https://zh.wikipedia.org/wiki/%E4%B8%AD%E5%9B%BD%E7%86%8A%E7%8C%AB%E5%8E%9F%E5%9C%B0",  # Panda habitat & conservation
]


@dataclass
class Settings:
    """Runtime configuration sourced from environment variables.

    Every field has a sensible default so the object can be constructed
    without any environment set up; :meth:`validate` enforces the required
    API keys before the assistant actually runs.
    """

    # Required credential for the chat model (checked in validate()).
    openai_api_key: Optional[str] = None
    # Optional: enables LangSmith prompt pulling / tracing when present.
    langsmith_api_key: Optional[str] = None
    # Required credential for Tavily web search (checked in validate()).
    tavily_api_key: Optional[str] = None
    # Embedding model name passed to HuggingFaceEmbeddings.
    huggingface_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    # Pages ingested into the vector store; copied so instances never share
    # (or mutate) the module-level default list.
    source_urls: List[str] = field(default_factory=lambda: list(_DEFAULT_SOURCE_URLS))
    chunk_size: int = 1000
    chunk_overlap: int = 200
    chat_model: str = "gpt-5-nano"
    temperature: float = 0.0
    tavily_results: int = 3
    default_thread_ids: List[str] = field(default_factory=lambda: ["customer-care", "ops-monitoring"])

    @classmethod
    def from_env(cls) -> "Settings":
        """Create settings using environment variables with sensible defaults."""

        return cls(
            openai_api_key=os.getenv("OPENAI_API_KEY"),
            langsmith_api_key=os.getenv("LANGSMITH_API_KEY"),
            tavily_api_key=os.getenv("TAVILY_API_KEY"),
            huggingface_model=os.getenv(
                "HUGGINGFACE_MODEL", "sentence-transformers/all-MiniLM-L6-v2"
            ),
            source_urls=cls._parse_source_urls(),
            # getenv defaults are given as strings so int()/float() always
            # receive a consistent type, whichever branch is taken.
            chunk_size=int(os.getenv("CHUNK_SIZE", "1000")),
            chunk_overlap=int(os.getenv("CHUNK_OVERLAP", "200")),
            chat_model=os.getenv("CHAT_MODEL", "gpt-5-nano"),
            temperature=float(os.getenv("TEMPERATURE", "0.0")),
            tavily_results=int(os.getenv("TAVILY_RESULTS", "3")),
        )

    def validate(self) -> None:
        """Raise ``RuntimeError`` if any required API key is absent."""

        missing: List[str] = []
        if not self.openai_api_key:
            missing.append("OPENAI_API_KEY")
        if not self.tavily_api_key:
            missing.append("TAVILY_API_KEY")

        if missing:
            raise RuntimeError(
                "Missing required environment variables: " + ", ".join(sorted(missing))
            )

    @classmethod
    def _parse_source_urls(cls) -> List[str]:
        """Support comma-separated SOURCE_URLS or a single SOURCE_URL fallback.

        Fix: the original fallback instantiated a throwaway ``cls()`` just to
        read the field default; return a copy of the module constant instead.
        """

        urls = os.getenv("SOURCE_URLS")
        if urls:
            parsed = [url.strip() for url in urls.split(",") if url.strip()]
            if parsed:
                return parsed

        single = os.getenv("SOURCE_URL")
        if single:
            return [single]

        return list(_DEFAULT_SOURCE_URLS)


settings = Settings.from_env()
42 changes: 42 additions & 0 deletions assistant_app/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""
Document ingestion and retriever construction.
"""
from __future__ import annotations

from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from .config import Settings


def load_documents(settings: Settings):
    """Fetch every configured Wikipedia page as LangChain documents."""

    return [
        doc
        for url in settings.source_urls
        for doc in WebBaseLoader(url).load()
    ]


def split_documents(docs, settings: Settings):
    """Break documents into overlapping chunks sized for retrieval."""

    chunker = RecursiveCharacterTextSplitter(
        chunk_size=settings.chunk_size,
        chunk_overlap=settings.chunk_overlap,
        # Prefer paragraph, then line, then word boundaries before
        # falling back to hard character splits.
        separators=["\n\n", "\n", " ", ""],
    )
    return chunker.split_documents(docs)


def build_retriever(settings: Settings):
    """Build a FAISS-backed retriever over the configured web corpus."""

    embeddings = HuggingFaceEmbeddings(model_name=settings.huggingface_model)
    chunks = split_documents(load_documents(settings), settings)
    return FAISS.from_documents(chunks, embeddings).as_retriever()
45 changes: 45 additions & 0 deletions assistant_app/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""
Command-line entry point to run the intelligent assistant locally.
"""
from __future__ import annotations

from langchain_core.messages import HumanMessage, SystemMessage

from .agent import build_agent, build_llm, run_sample_sessions
from .config import settings
from .data import build_retriever
from .tools import build_tools


def build_conversational_agent():
    """Assemble retriever, tools, and agent into a ready-to-use pipeline."""

    tools = build_tools(build_retriever(settings), settings)
    return build_agent(settings, tools), tools


def run_demo():
    """Run a short demo showing tool binding and per-thread memory isolation."""

    agent, tools = build_conversational_agent()

    # Fix: `create_agent` returns a compiled LangGraph application which does
    # not expose the underlying chat model as `.model` (the original
    # `agent.model.bind_tools(...)` would raise AttributeError). Build the
    # model directly and bind the tools to it.
    model_with_tools = build_llm(settings).bind_tools(tools=tools)
    messages = [
        SystemMessage("你是一个乐于助人的企业级智能对话助手。"),
        HumanMessage("2025年12月10日天津天气怎么样?"),
    ]
    _ = model_with_tools.invoke(messages)

    thread_results = run_sample_sessions(
        agent=agent, thread_ids=settings.default_thread_ids, question="熊猫的特征"
    )

    # Each thread_id keeps an isolated conversation history via the checkpointer.
    for thread_id, result in thread_results.items():
        content = result["messages"][-1].content
        print(f"[Thread: {thread_id}] {content}")


if __name__ == "__main__":
    run_demo()
70 changes: 70 additions & 0 deletions assistant_app/mcp_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""
Minimal Model Context Protocol (MCP) server exposing the assistant tools.

This module wires the existing retrieval and search capabilities into an MCP
server so external clients can call them as standardized tools.
"""
from __future__ import annotations

from langchain_community.tools.tavily_search import TavilySearchResults
from mcp.server.fastmcp import FastMCP
from mcp.types import Resource

from .config import settings
from .data import build_retriever


async def _panda_resource() -> Resource:
    """Provide metadata describing the underlying panda corpus.

    Fix: `Settings` defines `source_urls` (a list); the original
    `settings.source_url` attribute does not exist and would raise
    AttributeError. Use the first configured URL as the primary source,
    and supply the `name` field that `mcp.types.Resource` requires.
    """

    return Resource(
        uri=settings.source_urls[0],
        name="panda-wiki",
        mimeType="text/html",
        description="Primary source describing panda characteristics.",
    )


def create_mcp_server():
    """Create an MCP server exposing internet search and panda retrieval.

    Returns:
        A FastMCP server with two tools (Tavily web search and vector-store
        retrieval) and one resource describing the panda corpus.
    """

    retriever = build_retriever(settings)
    search = TavilySearchResults(max_results=settings.tavily_results)

    server = FastMCP("assistant-mcp")

    @server.tool()
    async def tavily_search(query: str) -> str:
        """进行互联网搜索并返回结果"""

        return search.run(query)

    @server.tool()
    async def wiki_panda_search(query: str) -> str:
        """根据向量数据库内容检索大熊猫相关信息。"""

        results = retriever.get_relevant_documents(query)
        return "\n\n".join([doc.page_content for doc in results])

    # Fix: FastMCP's resource decorator requires an explicit URI argument;
    # the original bare `@server.resource()` raises TypeError at startup.
    @server.resource("resource://panda-wiki")
    async def panda_wiki_source() -> Resource:
        """Expose the configured panda wiki URL as a resource for MCP clients."""

        return await _panda_resource()

    return server


async def serve(host: str = "0.0.0.0", port: int = 8001) -> None:
    """Run the MCP server over SSE on the given host/port.

    Fix: FastMCP has no `serve()` method; configure the bind address via
    its settings object and start the SSE transport runner instead.
    NOTE(review): verify the transport choice against the installed
    `mcp` SDK version — stdio transport uses `run_stdio_async()` instead.
    """

    server = create_mcp_server()
    server.settings.host = host
    server.settings.port = port
    await server.run_sse_async()


__all__ = ["create_mcp_server", "serve"]


if __name__ == "__main__":
    import asyncio

    asyncio.run(serve())
37 changes: 37 additions & 0 deletions assistant_app/tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""
Tooling used by the conversational agent.
"""
from __future__ import annotations

from langchain.tools import tool
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.tools import create_retriever_tool

from .config import Settings


def build_tools(retriever, settings: Settings):
    """Create reusable tools for the agent.

    Args:
        retriever: Vector-store retriever over the ingested wiki corpus.
        settings: Runtime configuration (controls the Tavily result count).

    Returns:
        A list of two LangChain tools: internet search and wiki retrieval.
    """

    search = TavilySearchResults(max_results=settings.tavily_results)

    @tool
    def search_tool(query: str) -> str:
        """进行互联网搜索并返回结果"""

        return search.run(query)

    @tool
    def wiki_panda_search(query: str) -> str:
        """根据向量数据库内容检索相关的维基百科知识。"""

        # Fix: the original called `retriever_tool.get_relevant_documents`,
        # but the Tool returned by `create_retriever_tool` has no such
        # method (AttributeError at call time). Query the retriever
        # directly; the unused `create_retriever_tool` wrapper is dropped.
        results = retriever.get_relevant_documents(query)
        return "\n\n".join(doc.page_content for doc in results)

    return [search_tool, wiki_panda_search]