diff --git a/api/main.py b/api/main.py index f2445de3..26910d7b 100644 --- a/api/main.py +++ b/api/main.py @@ -28,6 +28,7 @@ sources, speaker_profiles, transformations, + agent, # New Agent Router ) from api.routers import commands as commands_router from open_notebook.database.async_migrate import AsyncMigrationManager @@ -116,6 +117,7 @@ async def lifespan(app: FastAPI): app.include_router(speaker_profiles.router, prefix="/api", tags=["speaker-profiles"]) app.include_router(chat.router, prefix="/api", tags=["chat"]) app.include_router(source_chat.router, prefix="/api", tags=["source-chat"]) +app.include_router(agent.router, prefix="/api", tags=["agent"]) # Register Agent Endpoints @app.get("/") diff --git a/api/routers/agent.py b/api/routers/agent.py new file mode 100644 index 00000000..fd142b53 --- /dev/null +++ b/api/routers/agent.py @@ -0,0 +1,195 @@ +from typing import Any, Dict, List, Optional +import os +import httpx +from pathlib import Path + +from fastapi import APIRouter, HTTPException, Query +from pydantic import BaseModel, Field +from loguru import logger + +from open_notebook.acm_agent_service import get_research_agent +from open_notebook.config import UPLOADS_FOLDER +from open_notebook.domain.notebook import Notebook, Source +from open_notebook.database.repository import ensure_record_id +from api.command_service import CommandService +from commands.source_commands import SourceProcessingInput + +router = APIRouter() + +# --- Helpers --- + +def generate_unique_filename(original_filename: str, upload_folder: str) -> str: + """Generate unique filename to avoid overwrites.""" + file_path = Path(upload_folder) + file_path.mkdir(parents=True, exist_ok=True) + + # Split filename and extension + stem = Path(original_filename).stem + suffix = Path(original_filename).suffix + + # Check if file exists and generate unique name + counter = 0 + while True: + if counter == 0: + new_filename = original_filename + else: + new_filename = f"{stem} ({counter}){suffix}" + + full_path = file_path / new_filename + if not full_path.exists(): + return str(full_path) + counter += 1 + +# --- Data Models --- + +class PaperResult(BaseModel): + title: str + year: Optional[int] = None + venue: str + citations: Optional[int] = None + pdf_url: str + openalex_id: Optional[str] = None + abstract_index: bool = False + +class SearchPapersResponse(BaseModel): + count: int + results: List[PaperResult] + +class IngestPaperRequest(BaseModel): + pdf_url: str = Field(..., description="Direct URL to the paper PDF") + notebook_id: str = Field(..., description="Target notebook ID to ingest into") + title: Optional[str] = Field(None, description="Title of the paper") + +class IngestPaperResponse(BaseModel): + success: bool + message: str + source_id: Optional[str] = None + command_id: Optional[str] = None + +# --- Endpoints --- + +@router.get("/agent/acm/search", response_model=SearchPapersResponse) +async def search_acm_papers( + query: str = Query(..., min_length=3, description="Search query for ACM papers"), + limit: int = Query(5, ge=1, le=20, description="Max results to return") +): + """ + Search for Open Access papers in ACM Digital Library via OpenAlex. 
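+
+    Example request (a sketch only; assumes the API is served locally, e.g.
+    http://localhost:5055 as configured in the testing guide):
+
+        GET /api/agent/acm/search?query=large%20language%20models&limit=5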
+ """ + try: + agent = get_research_agent() + results = agent.search_papers(query, limit=limit) + + # Convert dicts to Pydantic models + papers = [] + for r in results: + papers.append(PaperResult( + title=r.get("title", "Untitled"), + year=r.get("year"), + venue=r.get("venue", "Unknown"), + citations=r.get("citations"), + pdf_url=r.get("pdf_url"), + openalex_id=r.get("openalex_id"), + abstract_index=r.get("abstract_index", False) + )) + + return SearchPapersResponse( + count=len(papers), + results=papers + ) + except Exception as e: + logger.error(f"Error searching ACM papers: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@router.post("/agent/acm/ingest", response_model=IngestPaperResponse) +async def ingest_acm_paper(request: IngestPaperRequest): + """ + Download a paper from URL and ingest it into the specified notebook. + This triggers the standard source processing pipeline. + """ + file_path = None + try: + # 1. Validate Notebook + notebook = await Notebook.get(request.notebook_id) + if not notebook: + raise HTTPException(status_code=404, detail="Notebook not found") + + # 2. Download the PDF + logger.info(f"Downloading paper from: {request.pdf_url}") + + # Extract filename from URL or use title + filename = request.pdf_url.split('/')[-1] + if not filename.lower().endswith('.pdf'): + filename += ".pdf" + + # Use httpx for async download + async with httpx.AsyncClient(follow_redirects=True, timeout=30.0) as client: + response = await client.get(request.pdf_url) + response.raise_for_status() + + # Save to UPLOADS_FOLDER with unique name + file_path = generate_unique_filename(filename, UPLOADS_FOLDER) + + # Write file (sync I/O is okay for small files, or could use aiofiles if strictly needed) + with open(file_path, 'wb') as f: + f.write(response.content) + + logger.info(f"Paper saved to: {file_path}") + + # 3. Create Source Record + source_title = request.title or filename + source = Source( + title=source_title, + topics=[], + ) + await source.save() + + # Link to Notebook + await source.add_to_notebook(request.notebook_id) + + # 4. 
Trigger Processing Command (Async) + # Import command modules to ensure they're registered + import commands.source_commands # noqa: F401 + + content_state = { + "file_path": file_path, + "delete_source": False # Keep file after processing + } + + command_input = SourceProcessingInput( + source_id=str(source.id), + content_state=content_state, + notebook_ids=[request.notebook_id], + transformations=[], # No extra transformations for now + embed=True, # Always embed for RAG + ) + + command_id = await CommandService.submit_command_job( + "open_notebook", # app name + "process_source", # command name + command_input.model_dump(), + ) + + # Update source with command reference + source.command = ensure_record_id(command_id) + await source.save() + + return IngestPaperResponse( + success=True, + message="Paper downloaded and processing started", + source_id=str(source.id), + command_id=command_id + ) + + except httpx.HTTPError as e: + logger.error(f"Download failed: {e}") + if file_path and os.path.exists(file_path): + os.unlink(file_path) + raise HTTPException(status_code=400, detail=f"Failed to download paper: {str(e)}") + + except Exception as e: + logger.error(f"Error ingesting ACM paper: {e}") + if file_path and os.path.exists(file_path): + os.unlink(file_path) + raise HTTPException(status_code=500, detail=str(e)) + diff --git a/docs/ACM_AGENT_TESTING_GUIDE.md b/docs/ACM_AGENT_TESTING_GUIDE.md new file mode 100644 index 00000000..9d717b7e --- /dev/null +++ b/docs/ACM_AGENT_TESTING_GUIDE.md @@ -0,0 +1,205 @@ +# ACM Scholar Agent - User Testing Guide + +Welcome to the ACM Scholar Agent testing! This guide will help you get started with searching and adding academic papers to your Open Notebook. + +## ๐Ÿš€ Quick Start + +### What You Can Do (No API Key Required!) + +- **Search ACM Papers** - Find open access papers from ACM Digital Library +- **Add Papers to Notebook** - One-click download and import from arXiv and other trusted sources +- **View Paper Content** - See extracted text from PDFs + +### For Full Experience (Optional) + +To chat with your papers using AI, you'll need one of these: +- **DeepSeek API Key** (Recommended, affordable) +- **OpenAI API Key** +- **Ollama** (Free, runs locally) + +--- + +## ๐Ÿ“ฆ Installation + +### Prerequisites + +- Python 3.10+ +- Node.js 18+ +- SurrealDB + +### 1. Clone the Repository + +```bash +git clone https://github.com/hongping-zh/open-notebook.git +cd open-notebook +``` + +### 2. Backend Setup + +```bash +# Create virtual environment +python -m venv .venv + +# Activate (Windows) +.venv\Scripts\activate + +# Activate (Mac/Linux) +source .venv/bin/activate + +# Install dependencies +pip install -e . +``` + +### 3. Frontend Setup + +```bash +cd frontend +npm install +``` + +### 4. Database Setup + +Start SurrealDB: +```bash +surreal start --user root --pass root file:data/surreal +``` + +### 5. Environment Configuration + +Create `.env` file in the project root: + +```env +API_URL=http://localhost:5055 +INTERNAL_API_URL=http://localhost:5055 +SURREAL_URL="ws://127.0.0.1:8000/rpc" +SURREAL_USER="root" +SURREAL_PASSWORD="root" +SURREAL_NAMESPACE="open_notebook" +SURREAL_DATABASE="staging" +``` + +### 6. Start the Application + +**Terminal 1 - Backend:** +```bash +python run_api.py +``` + +**Terminal 2 - Frontend:** +```bash +cd frontend +npm run dev +``` + +**Open browser:** http://localhost:3000 + +--- + +## ๐Ÿ” Testing the ACM Agent + +### Step 1: Create a Notebook + +1. Click **"+ New Notebook"** +2. 
Give it a name (e.g., "AI Research") + +### Step 2: Search for Papers + +1. In your notebook, click **"+ Add"** button +2. Select **"Research Papers"** from the dropdown +3. Enter a search query (e.g., "Large Language Models", "Reinforcement Learning") +4. Click **Search** + +### Step 3: Add a Paper + +1. Browse the search results +2. Click **"+ Add"** on any paper you want +3. The paper will be downloaded and processed automatically + +### Step 4: View Your Paper + +- The paper appears in your Sources list +- Click to view extracted content +- Status shows: Processing โ†’ Embedded (ready for chat) + +--- + +## ๐Ÿ’ฌ Chat with Your Papers (Optional) + +To enable AI chat, configure a language model: + +### Option A: DeepSeek (Recommended) + +1. Get API key from https://platform.deepseek.com/ +2. Go to **Settings** โ†’ **Models** in Open Notebook +3. Add new model: + - Provider: `deepseek` + - Name: `deepseek-chat` + - API Key: Your key +4. Set as default chat model + +### Option B: Ollama (Free, Local) + +1. Install Ollama: https://ollama.ai/ +2. Pull a model: `ollama pull llama2` +3. Configure in Open Notebook Settings + +--- + +## ๐Ÿงช Test Scenarios + +### Scenario 1: Basic Search +- Query: "neural network optimization" +- Expected: 3-5 papers with arXiv links + +### Scenario 2: Specific Topic +- Query: "transformer attention mechanism" +- Expected: Papers about attention in transformers + +### Scenario 3: Add and Process +- Add any paper from search results +- Wait for processing (usually < 1 minute) +- Verify content is extracted + +### Scenario 4: Chat (if configured) +- Ask: "What is the main contribution of this paper?" +- Expected: AI response based on paper content + +--- + +## โ“ Troubleshooting + +### No Search Results? +- Try broader search terms +- ACM Agent only returns papers with accessible PDFs (arXiv, etc.) + +### Paper Stuck in "Processing"? +- Check backend logs for errors +- Ensure SurrealDB is running + +### Chat Not Working? +- Verify LLM model is configured +- Check API key is valid +- Look at backend logs for errors + +--- + +## ๐Ÿ“ Feedback + +We'd love to hear your feedback! 
Please report: +- Bugs or errors +- Feature requests +- User experience issues + +Contact: [Your contact info or GitHub Issues] + +--- + +## ๐Ÿ”— Links + +- **Repository**: https://github.com/hongping-zh/open-notebook +- **Original Project**: https://github.com/lfnovo/open-notebook +- **OpenAlex API**: https://openalex.org/ + +--- + +**Happy researching!** ๐Ÿ“š diff --git a/frontend/src/app/(dashboard)/notebooks/components/SourcesColumn.tsx b/frontend/src/app/(dashboard)/notebooks/components/SourcesColumn.tsx index 58f6addb..1422e869 100644 --- a/frontend/src/app/(dashboard)/notebooks/components/SourcesColumn.tsx +++ b/frontend/src/app/(dashboard)/notebooks/components/SourcesColumn.tsx @@ -15,6 +15,7 @@ import { LoadingSpinner } from '@/components/common/LoadingSpinner' import { EmptyState } from '@/components/common/EmptyState' import { AddSourceDialog } from '@/components/sources/AddSourceDialog' import { AddExistingSourceDialog } from '@/components/sources/AddExistingSourceDialog' +import { ResearchDialog } from '@/components/notebooks/ResearchDialog' import { SourceCard } from '@/components/sources/SourceCard' import { useDeleteSource, useRetrySource, useRemoveSourceFromNotebook } from '@/lib/hooks/use-sources' import { ConfirmDialog } from '@/components/common/ConfirmDialog' @@ -51,6 +52,7 @@ export function SourcesColumn({ const [dropdownOpen, setDropdownOpen] = useState(false) const [addDialogOpen, setAddDialogOpen] = useState(false) const [addExistingDialogOpen, setAddExistingDialogOpen] = useState(false) + const [researchDialogOpen, setResearchDialogOpen] = useState(false) const [deleteDialogOpen, setDeleteDialogOpen] = useState(false) const [sourceToDelete, setSourceToDelete] = useState(null) const [removeDialogOpen, setRemoveDialogOpen] = useState(false) @@ -173,6 +175,10 @@ export function SourcesColumn({ Add Existing Source + { setDropdownOpen(false); setResearchDialogOpen(true); }}> + + Research Papers + {collapseButton} @@ -235,6 +241,13 @@ export function SourcesColumn({ onSuccess={onRefresh} /> + + void + notebookId: string + onSuccess?: () => void +} + +export function ResearchDialog({ + open, + onOpenChange, + notebookId, + onSuccess +}: ResearchDialogProps) { + const [query, setQuery] = useState('') + const [results, setResults] = useState([]) + const [isSearching, setIsSearching] = useState(false) + const [ingestingUrls, setIngestingUrls] = useState>(new Set()) + const [ingestedUrls, setIngestedUrls] = useState>(new Set()) + + const handleSearch = async (e?: React.FormEvent) => { + e?.preventDefault() + if (!query.trim()) return + + setIsSearching(true) + try { + const response = await searchAcmPapers(query) + setResults(response.results) + } catch (error) { + console.error('Search failed:', error) + toast.error('Failed to search papers') + } finally { + setIsSearching(false) + } + } + + const handleIngest = async (paper: PaperResult) => { + if (ingestingUrls.has(paper.pdf_url) || ingestedUrls.has(paper.pdf_url)) return + + setIngestingUrls(prev => new Set(prev).add(paper.pdf_url)) + try { + await ingestAcmPaper({ + pdf_url: paper.pdf_url, + notebook_id: notebookId, + title: paper.title + }) + + setIngestedUrls(prev => new Set(prev).add(paper.pdf_url)) + toast.success('Paper added to notebook processing queue') + onSuccess?.() + } catch (error) { + console.error('Ingest failed:', error) + toast.error('Failed to add paper') + } finally { + setIngestingUrls(prev => { + const next = new Set(prev) + next.delete(paper.pdf_url) + return next + }) + } + } + + const 
handleClose = () => {
+    onOpenChange(false)
+    // Optional: clear results on close?
+    // setResults([])
+    // setQuery('')
+  }
+
+  return (
+    <Dialog open={open} onOpenChange={handleClose}>
+      <DialogContent className="sm:max-w-2xl">
+        <DialogHeader>
+          <DialogTitle className="flex items-center gap-2">
+            <BookOpen className="h-5 w-5" />
+            Research Papers (ACM)
+          </DialogTitle>
+          <DialogDescription>
+            Search and add open access papers from ACM Digital Library directly to your notebook.
+          </DialogDescription>
+        </DialogHeader>
+
+        <form onSubmit={handleSearch} className="flex gap-2">
+          <div className="relative flex-1">
+            <Search className="absolute left-3 top-1/2 h-4 w-4 -translate-y-1/2 text-muted-foreground" />
+            <Input
+              placeholder="Search ACM papers..."
+              value={query}
+              onChange={(e) => setQuery(e.target.value)}
+              className="pl-9"
+              autoFocus
+            />
+          </div>
+          <Button type="submit" disabled={isSearching || !query.trim()}>
+            {isSearching ? <Loader2 className="h-4 w-4 animate-spin" /> : 'Search'}
+          </Button>
+        </form>
+
+        <div className="max-h-[50vh] overflow-y-auto">
+          {results.length === 0 && !isSearching ? (
+            <div className="flex flex-col items-center justify-center py-10 text-center text-muted-foreground">
+              <BookOpen className="mb-2 h-8 w-8" />
+              <p className="text-sm">Search for papers to get started</p>
+            </div>
+          ) : (
+            <div className="space-y-3">
+              {results.map((paper, index) => (
+                <div key={index} className="flex items-start justify-between gap-3 rounded-md border p-3">
+                  <div className="min-w-0">
+                    <h4 className="text-sm font-medium leading-snug">
+                      {paper.title}
+                    </h4>
+                    <div className="mt-1 flex flex-wrap items-center gap-2 text-xs text-muted-foreground">
+                      <span className="flex items-center gap-1">
+                        <GraduationCap className="h-3 w-3" />
+                        {paper.venue}
+                      </span>
+                      {paper.year && <span>• {paper.year}</span>}
+                      {paper.citations !== undefined && <span>• {paper.citations} citations</span>}
+                      <Badge variant="secondary" className="text-xs">
+                        Open Access
+                      </Badge>
+                    </div>
+                  </div>
+                  <Button
+                    size="sm"
+                    variant="outline"
+                    onClick={() => handleIngest(paper)}
+                    disabled={ingestingUrls.has(paper.pdf_url) || ingestedUrls.has(paper.pdf_url)}
+                  >
+                    {ingestedUrls.has(paper.pdf_url) ? (
+                      <Check className="h-4 w-4" />
+                    ) : ingestingUrls.has(paper.pdf_url) ? (
+                      <Loader2 className="h-4 w-4 animate-spin" />
+                    ) : (
+                      <Plus className="h-4 w-4" />
+                    )}
+                  </Button>
+                </div>
+              ))}
+            </div>
+          )}
+        </div>
+
+        <div className="text-xs text-muted-foreground">
+          Powered by OpenAlex & ACM • Showing top {results.length} results
+        </div>
+      </DialogContent>
+    </Dialog>
+ ) +} diff --git a/frontend/src/lib/api/agent.ts b/frontend/src/lib/api/agent.ts new file mode 100644 index 00000000..6887181c --- /dev/null +++ b/frontend/src/lib/api/agent.ts @@ -0,0 +1,62 @@ +import { getApiUrl } from '../config' + +export interface PaperResult { + title: string + year?: number + venue: string + citations?: number + pdf_url: string + openalex_id?: string + abstract_index: boolean +} + +export interface SearchPapersResponse { + count: number + results: PaperResult[] +} + +export interface IngestPaperRequest { + pdf_url: string + notebook_id: string + title?: string +} + +export interface IngestPaperResponse { + success: boolean + message: string + source_id?: string + command_id?: string +} + +export async function searchAcmPapers(query: string, limit: number = 5): Promise { + const apiUrl = await getApiUrl() + const response = await fetch(`${apiUrl}/api/agent/acm/search?query=${encodeURIComponent(query)}&limit=${limit}`, { + method: 'GET', + headers: { + 'Content-Type': 'application/json', + }, + }) + + if (!response.ok) { + throw new Error(`Failed to search papers: ${response.statusText}`) + } + + return response.json() +} + +export async function ingestAcmPaper(data: IngestPaperRequest): Promise { + const apiUrl = await getApiUrl() + const response = await fetch(`${apiUrl}/api/agent/acm/ingest`, { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify(data), + }) + + if (!response.ok) { + throw new Error(`Failed to ingest paper: ${response.statusText}`) + } + + return response.json() +} diff --git a/open_notebook/acm_agent_service/README.md b/open_notebook/acm_agent_service/README.md new file mode 100644 index 00000000..ce386af6 --- /dev/null +++ b/open_notebook/acm_agent_service/README.md @@ -0,0 +1,29 @@ +# ACM Scholar Agent Module + +Search and discover ACM Digital Library papers via OpenAlex API. + +## Usage + +```python +from open_notebook.acm_agent_service import get_research_agent + +agent = get_research_agent() + +# Search for papers +results = agent.search_papers("Large Language Models") +for paper in results: + print(f"- {paper['title']} ({paper['year']})") +``` + +## API Endpoints + +| Endpoint | Method | Description | +|----------|--------|-------------| +| `/api/agent/acm/search` | GET | Search ACM papers | +| `/api/agent/acm/ingest` | POST | Download and ingest paper | + +## Technical Details + +- **Search Backend**: OpenAlex API +- **Filters**: ACM Publisher, Computer Science, Open Access +- **Dependencies**: `requests`, `loguru` diff --git a/open_notebook/acm_agent_service/__init__.py b/open_notebook/acm_agent_service/__init__.py new file mode 100644 index 00000000..11fa008b --- /dev/null +++ b/open_notebook/acm_agent_service/__init__.py @@ -0,0 +1,9 @@ +from .core import get_agent, ACMAgent +from .interfaces import ResearchAgentInterface + +# Public API for the package +def get_research_agent() -> ResearchAgentInterface: + """ + Main entry point. Returns the ACM Agent instance. + """ + return get_agent() diff --git a/open_notebook/acm_agent_service/core.py b/open_notebook/acm_agent_service/core.py new file mode 100644 index 00000000..aa38bea2 --- /dev/null +++ b/open_notebook/acm_agent_service/core.py @@ -0,0 +1,24 @@ +from typing import List, Dict, Any +from .interfaces import ResearchAgentInterface +from .tools import OpenAlexACMTool + + +class ACMAgent(ResearchAgentInterface): + """ + ACM Scholar Agent: Search and discover ACM papers via OpenAlex API. 
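+
+    Usage sketch (assumes outbound access to api.openalex.org):
+
+        agent = get_agent()
+        papers = agent.search_papers("large language models", limit=3)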
+ """ + def search_papers(self, query: str, limit: int = 5) -> List[Dict[str, Any]]: + return OpenAlexACMTool.search(query, limit) + + def ingest_paper(self, paper_url: str) -> Dict[str, Any]: + filename = paper_url.split('/')[-1] + if not filename.endswith('.pdf'): + filename += ".pdf" + return {"success": True, "message": f"Ready to download {filename}"} + + +def get_agent() -> ResearchAgentInterface: + """ + Returns the ACM Agent instance. + """ + return ACMAgent() diff --git a/open_notebook/acm_agent_service/interfaces.py b/open_notebook/acm_agent_service/interfaces.py new file mode 100644 index 00000000..9421d06c --- /dev/null +++ b/open_notebook/acm_agent_service/interfaces.py @@ -0,0 +1,32 @@ +from typing import List, Dict, Protocol, Any + + +class ResearchAgentInterface(Protocol): + """ + Standard interface for the Research Agent. + """ + + def search_papers(self, query: str, limit: int = 5) -> List[Dict[str, Any]]: + """ + Search for papers in the target knowledge base (e.g., ACM). + + Args: + query: The search string. + limit: Max number of results. + + Returns: + List of paper objects (title, url, year, etc.) + """ + ... + + def ingest_paper(self, paper_url: str) -> Dict[str, Any]: + """ + Download and ingest a paper into the system. + + Args: + paper_url: The direct URL to the PDF. + + Returns: + Status dictionary (e.g., {"success": True, "document_id": "..."}) + """ + ... diff --git a/open_notebook/acm_agent_service/tools.py b/open_notebook/acm_agent_service/tools.py new file mode 100644 index 00000000..f858a506 --- /dev/null +++ b/open_notebook/acm_agent_service/tools.py @@ -0,0 +1,69 @@ +import requests +from typing import List, Dict, Any +from loguru import logger + + +class OpenAlexACMTool: + """ + Tool for searching ACM papers via the OpenAlex API. + """ + + BASE_URL = "https://api.openalex.org/works" + ACM_PUBLISHER_ID = "P4310319798" + CS_CONCEPT_ID = "C41008148" + + @classmethod + def search(cls, query: str, limit: int = 5) -> List[Dict[str, Any]]: + """ + Search for ACM Open Access papers. + """ + filters = [ + f"primary_location.source.publisher_lineage:{cls.ACM_PUBLISHER_ID}", + f"concepts.id:{cls.CS_CONCEPT_ID}", + "is_oa:true", + "type:article", + ] + + params = { + "search": query, + "filter": ",".join(filters), + "per_page": min(limit, 20), + "sort": "cited_by_count:desc" + } + + logger.info(f"[ACM-Agent] Searching: '{query}'") + + try: + response = requests.get(cls.BASE_URL, params=params, timeout=10) + response.raise_for_status() + data = response.json() + except requests.exceptions.RequestException as e: + logger.error(f"[ACM-Agent] Search failed: {e}") + return [] + + results = [] + for work in data.get('results', []): + venue = work.get('primary_location', {}).get('source', {}).get('display_name', 'Unknown') + pdf_url = cls._get_pdf_url(work) + + if not pdf_url: + continue + + results.append({ + "title": work.get('title'), + "year": work.get('publication_year'), + "venue": venue, + "citations": work.get('cited_by_count'), + "pdf_url": pdf_url, + "openalex_id": work.get('id'), + }) + + return results[:limit] + + @classmethod + def _get_pdf_url(cls, work: Dict[str, Any]) -> str | None: + """Get PDF URL from OpenAlex work data.""" + best_oa = work.get('best_oa_location', {}) + if best_oa and best_oa.get('pdf_url'): + return best_oa.get('pdf_url') + return None