diff --git a/package.json b/package.json index 4d15272..641469d 100644 --- a/package.json +++ b/package.json @@ -64,7 +64,7 @@ "vitest": "^4.0.0" }, "dependencies": { - "@mendable/firecrawl-js": "^4.10.0", + "@mendable/firecrawl-js": "^4.12.0", "commander": "^14.0.2" } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index e3e21b6..f61e2fd 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -9,8 +9,8 @@ importers: .: dependencies: '@mendable/firecrawl-js': - specifier: ^4.10.0 - version: 4.10.0 + specifier: ^4.12.0 + version: 4.12.0 commander: specifier: ^14.0.2 version: 14.0.2 @@ -195,8 +195,8 @@ packages: '@jridgewell/sourcemap-codec@1.5.5': resolution: {integrity: sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==} - '@mendable/firecrawl-js@4.10.0': - resolution: {integrity: sha512-40qtKCVY3a1A4Y6t/m5Ar10HbzrWuyCNt7vR3uBh+j14GZC0JoxEkjaFRC00wBmPD9N5JMT4gmTXvzM/SI9enw==} + '@mendable/firecrawl-js@4.12.0': + resolution: {integrity: sha512-Jjawuumet+3htp39PwwhkZhSj2ORR8Tz/HqORoFGngrB3HadMBKoX6SSPIhayRTXwNnDkaU0PWm1SUtRvPdoPw==} engines: {node: '>=22.0.0'} '@rollup/rollup-android-arm-eabi@4.55.1': @@ -972,7 +972,7 @@ snapshots: '@jridgewell/sourcemap-codec@1.5.5': {} - '@mendable/firecrawl-js@4.10.0': + '@mendable/firecrawl-js@4.12.0': dependencies: axios: 1.13.2 typescript-event-target: 1.1.2 diff --git a/skills/firecrawl-cli/SKILL.md b/skills/firecrawl-cli/SKILL.md index 46eba9e..bd8c515 100644 --- a/skills/firecrawl-cli/SKILL.md +++ b/skills/firecrawl-cli/SKILL.md @@ -288,3 +288,66 @@ For many URLs, use xargs with `-P` for parallel execution: ```bash cat urls.txt | xargs -P 10 -I {} sh -c 'firecrawl scrape "{}" -o ".firecrawl/$(echo {} | md5).md"' ``` + +### Agent - AI-powered data extraction (use sparingly) + +**IMPORTANT:** Only use `agent` for complex multi-site data enrichment tasks. It takes 1-5 minutes to complete. For most tasks, use `scrape`, `search`, or `map` instead. 
+ +**When to use agent:** + +- Gathering data about entities (companies, products, people) from multiple unknown sources +- Competitive analysis comparing features/pricing across several websites +- Research requiring data synthesis from various sources +- Building lists of entities matching specific criteria +- Single page with very specific extraction needs (e.g., "find the CEO's email and LinkedIn") +- Single page discovery requiring navigation (e.g., "find the pricing for enterprise plan" when buried in subpages) + +**When NOT to use agent:** + +- Basic single page scraping → use `scrape` +- Known website crawling → use `crawl` +- URL discovery → use `map` +- Web search → use `search` +- Any time-sensitive task +- Simple content extraction that `scrape` can handle + +```bash +# Multi-site company research +firecrawl agent "Find Series A fintech startups from YC W24 with funding amounts" --wait -o .firecrawl/yc-fintech.json + +# Competitive pricing analysis +firecrawl agent "Compare pricing plans for Vercel, Netlify, and Cloudflare Pages" --wait -o .firecrawl/pricing.json + +# Focused extraction from specific URLs +firecrawl agent "Extract feature comparison" --urls https://a.com,https://b.com --wait -o .firecrawl/features.json + +# Structured output with schema +firecrawl agent "Find top 10 headless CMS options with pricing" --schema-file schema.json --wait -o .firecrawl/cms.json + +# Higher accuracy for complex tasks +firecrawl agent "Research AI coding assistants market" --model spark-1-pro --wait -o .firecrawl/research.json +``` + +**Agent Options:** + +- `--wait` - Wait for completion (recommended, otherwise returns job ID) +- `--urls <urls>` - Comma-separated URLs to focus extraction on +- `--model <model>` - spark-1-mini (default, faster) or spark-1-pro (higher accuracy) +- `--schema <json>` - Inline JSON schema for structured output +- `--schema-file <path>` - Path to JSON schema file +- `--max-credits <number>` - Maximum credits to spend +- `--timeout <seconds>` - Timeout when waiting +- `-o, 
/**
 * Extract the most specific human-readable message available from a thrown value.
 *
 * Lookup order for `Error` instances:
 * 1. a `details` array (each entry's `message`, or the entry itself, joined with `; `),
 * 2. `response.data.error`,
 * 3. `response.data.message`,
 * 4. the whole `response.data` payload,
 * 5. `error.message`.
 *
 * A thrown non-empty string is returned as-is; anything else yields a generic message.
 *
 * @param error - The value caught in a `catch` clause.
 * @returns A message that is always a string, even when the payload fields are objects.
 */
function extractErrorMessage(error: unknown): string {
  // Render any payload as text without ever throwing from inside error handling.
  const toText = (value: unknown): string => {
    if (typeof value === 'string') return value;
    try {
      // JSON.stringify yields undefined for functions/symbols and throws on cycles/BigInt.
      return JSON.stringify(value) ?? String(value);
    } catch {
      return String(value);
    }
  };

  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null;

  if (typeof error === 'string' && error.length > 0) {
    return error;
  }
  if (!(error instanceof Error)) {
    return 'Unknown error occurred';
  }

  const { details, response } = error as Error & {
    details?: unknown;
    response?: { data?: unknown } | null;
  };

  if (Array.isArray(details)) {
    const messages = details
      // Entries may be null or primitives, so guard before reading `.message`.
      .map((detail) =>
        isRecord(detail) && detail.message
          ? toText(detail.message)
          : toText(detail)
      )
      .join('; ');
    return messages || error.message;
  }

  const data = response?.data;
  if (isRecord(data)) {
    if (data.error) return toText(data.error);
    if (data.message) return toText(data.message);
  }
  if (data) {
    return toText(data);
  }

  return error.message;
}
${filePath}`); + } + if (error instanceof SyntaxError) { + throw new Error(`Invalid JSON in schema file: ${filePath}`); + } + throw error; + } +} + +/** + * Execute agent status check (with optional wait/polling) + */ +async function checkAgentStatus( + jobId: string, + options: AgentOptions +): Promise { + const app = getClient({ apiKey: options.apiKey, apiUrl: options.apiUrl }); + + // If not waiting, just return current status + if (!options.wait) { + try { + const status = await app.getAgentStatus(jobId); + return { + success: status.success, + data: { + id: jobId, + status: status.status, + data: status.data, + creditsUsed: status.creditsUsed, + expiresAt: status.expiresAt, + }, + }; + } catch (error) { + return { + success: false, + error: extractErrorMessage(error), + }; + } + } + + // Wait mode: poll until completion + const spinner = createSpinner(`Checking agent status...`); + spinner.start(); + + // Handle Ctrl+C gracefully + const handleInterrupt = () => { + spinner.stop(); + process.stderr.write('\n\nInterrupted. Agent may still be running.\n'); + process.stderr.write(`Check status with: firecrawl agent ${jobId}\n\n`); + process.exit(0); + }; + process.on('SIGINT', handleInterrupt); + + const pollMs = options.pollInterval ? options.pollInterval * 1000 : 5000; + const startTime = Date.now(); + const timeoutMs = options.timeout ? options.timeout * 1000 : undefined; + + try { + // Check initial status + let agentStatus = await app.getAgentStatus(jobId); + spinner.update(`Agent ${agentStatus.status}... 
(Job ID: ${jobId})`); + + while (true) { + if (agentStatus.status === 'completed') { + spinner.succeed('Agent completed'); + return { + success: agentStatus.success, + data: { + id: jobId, + status: agentStatus.status, + data: agentStatus.data, + creditsUsed: agentStatus.creditsUsed, + expiresAt: agentStatus.expiresAt, + }, + }; + } + + if (agentStatus.status === 'failed') { + spinner.fail('Agent failed'); + return { + success: false, + data: { + id: jobId, + status: agentStatus.status, + data: agentStatus.data, + creditsUsed: agentStatus.creditsUsed, + expiresAt: agentStatus.expiresAt, + }, + error: agentStatus.error, + }; + } + + // Check timeout + if (timeoutMs && Date.now() - startTime > timeoutMs) { + spinner.fail(`Timeout after ${options.timeout}s`); + return { + success: false, + error: `Timeout after ${options.timeout} seconds. Agent still processing.`, + }; + } + + await new Promise((resolve) => setTimeout(resolve, pollMs)); + agentStatus = await app.getAgentStatus(jobId); + spinner.update(`Agent ${agentStatus.status}... 
/**
 * Execute the agent command.
 *
 * Behaviour depends on the options:
 * - `status` set, or `prompt` recognised by `isJobId`: delegates to
 *   `checkAgentStatus`, passing `prompt` as the job ID.
 * - `wait` set: starts an agent job, then polls `getAgentStatus` until the job
 *   reports `completed` or `failed`, or the optional timeout elapses.
 * - otherwise: starts an agent job and returns its ID immediately.
 *
 * Failures are reported as `{ success: false, error }` rather than thrown.
 *
 * @param options - Parsed CLI options for the agent command.
 * @returns The job-start result or the job-status result.
 */
export async function executeAgent(
  options: AgentOptions
): Promise<AgentResult | AgentStatusResult> {
  try {
    const app = getClient({ apiKey: options.apiKey, apiUrl: options.apiUrl });
    const { prompt, status, wait, pollInterval, timeout } = options;

    // If status flag is set or input looks like a job ID, check status
    if (status || isJobId(prompt)) {
      return await checkAgentStatus(prompt, options);
    }

    // A schema file, when given, takes precedence over an inline schema
    let schema = options.schema as Record<string, unknown> | undefined;
    if (options.schemaFile) {
      schema = loadSchemaFromFile(options.schemaFile);
    }

    // Only include parameters the caller actually supplied
    const agentParams: {
      prompt: string;
      urls?: string[];
      schema?: Record<string, unknown>;
      model?: 'spark-1-pro' | 'spark-1-mini';
      maxCredits?: number;
    } = { prompt };

    if (options.urls && options.urls.length > 0) {
      agentParams.urls = options.urls;
    }
    if (schema) {
      agentParams.schema = schema;
    }
    if (options.model) {
      agentParams.model = options.model;
    }
    if (options.maxCredits !== undefined) {
      agentParams.maxCredits = options.maxCredits;
    }

    const spinner = createSpinner('Starting agent...');
    spinner.start();

    let response;
    try {
      response = await app.startAgent(agentParams);
    } catch (error) {
      spinner.fail('Failed to start agent');
      return {
        success: false,
        error: extractErrorMessage(error),
      };
    }
    const jobId = response.id;

    // Fire-and-forget mode: hand the job ID back to the caller
    if (!wait) {
      spinner.succeed(`Agent started (Job ID: ${jobId})`);
      return {
        success: response.success,
        data: {
          jobId,
          status: 'processing',
        },
      };
    }

    // Handle Ctrl+C gracefully: the remote job is not cancelled, so tell the
    // user how to pick it up again.
    const handleInterrupt = () => {
      spinner.stop();
      process.stderr.write('\n\nInterrupted. Agent is still running.\n');
      process.stderr.write(`Check status with: firecrawl agent ${jobId}\n\n`);
      process.exit(0);
    };
    process.on('SIGINT', handleInterrupt);

    spinner.update(`Agent running... (Job ID: ${jobId})`);

    // A missing, zero, NaN or negative interval falls back to 5s; a negative
    // delay would otherwise make setTimeout fire almost immediately.
    const pollMs = pollInterval && pollInterval > 0 ? pollInterval * 1000 : 5000;
    const startTime = Date.now();
    const timeoutMs = timeout ? timeout * 1000 : undefined;

    try {
      while (true) {
        await new Promise((resolve) => setTimeout(resolve, pollMs));

        const agentStatus = await app.getAgentStatus(jobId);

        if (agentStatus.status === 'completed') {
          spinner.succeed('Agent completed');
          return {
            success: agentStatus.success,
            data: {
              id: jobId,
              status: agentStatus.status,
              data: agentStatus.data,
              creditsUsed: agentStatus.creditsUsed,
              expiresAt: agentStatus.expiresAt,
            },
          };
        }

        if (agentStatus.status === 'failed') {
          spinner.fail('Agent failed');
          return {
            success: false,
            data: {
              id: jobId,
              status: agentStatus.status,
              data: agentStatus.data,
              creditsUsed: agentStatus.creditsUsed,
              expiresAt: agentStatus.expiresAt,
            },
            error: agentStatus.error,
          };
        }

        // Check timeout
        if (timeoutMs && Date.now() - startTime > timeoutMs) {
          spinner.fail(`Timeout after ${timeout}s (Job ID: ${jobId})`);
          return {
            success: false,
            error: `Timeout after ${timeout} seconds. Agent still processing. Job ID: ${jobId}`,
          };
        }
      }
    } catch (error) {
      // Stop the spinner here; otherwise its interval keeps running and the
      // animated line is left on stderr when a status request fails.
      spinner.fail(`Failed to check agent status (Job ID: ${jobId})`);
      return {
        success: false,
        error: extractErrorMessage(error),
      };
    } finally {
      // Single cleanup point for every exit path of the polling loop
      process.removeListener('SIGINT', handleInterrupt);
    }
  } catch (error) {
    return {
      success: false,
      error: extractErrorMessage(error),
    };
  }
}
/**
 * Create and configure the agent command.
 *
 * The single positional argument is either a natural-language prompt or the
 * ID of an existing job; `isJobId` (or `--status`) decides which.
 * Invalid `--schema`, `--model` or numeric option values print an error and
 * exit with code 1 before any request is made.
 *
 * @returns The configured `agent` sub-command.
 */
function createAgentCommand(): Command {
  // Commander calls option parsers as (value, previous). Passing parseInt
  // directly would let `previous` be used as the radix, so pin base 10.
  const toInteger = (value: string): number => Number.parseInt(value, 10);
  const toNumber = (value: string): number => Number.parseFloat(value);

  const agentCmd = new Command('agent')
    .description('Run an AI agent to extract data from the web')
    .argument(
      '<prompt-or-job-id>',
      'Natural language prompt describing data to extract, or job ID to check status'
    )
    .option('--urls <urls>', 'Comma-separated URLs to focus extraction on')
    .option(
      '--model <model>',
      'Model to use: spark-1-mini (default, cheaper) or spark-1-pro (higher accuracy)'
    )
    .option(
      '--schema <json>',
      'JSON schema for structured output (inline JSON string)'
    )
    .option(
      '--schema-file <path>',
      'Path to JSON schema file for structured output'
    )
    .option(
      '--max-credits <number>',
      'Maximum credits to spend (job fails if exceeded)',
      toInteger
    )
    .option('--status', 'Check status of existing agent job', false)
    .option(
      '--wait',
      'Wait for agent to complete before returning results',
      false
    )
    .option(
      '--poll-interval <seconds>',
      'Polling interval in seconds when waiting (default: 5)',
      toNumber
    )
    .option(
      '--timeout <seconds>',
      'Timeout in seconds when waiting (default: no timeout)',
      toNumber
    )
    .option(
      '-k, --api-key <key>',
      'Firecrawl API key (overrides global --api-key)'
    )
    .option('--api-url <url>', 'API URL (overrides global --api-url)')
    .option('-o, --output <path>', 'Output file path (default: stdout)')
    .option('--json', 'Output as JSON format', false)
    .option('--pretty', 'Pretty print JSON output', false)
    .action(async (promptOrJobId, options) => {
      // Auto-detect if it's a job ID
      const isStatusCheck = options.status || isJobId(promptOrJobId);

      // Parse URLs, dropping empty entries such as a trailing comma
      let urls: string[] | undefined;
      if (options.urls) {
        urls = options.urls
          .split(',')
          .map((u: string) => u.trim())
          .filter((u: string) => u.length > 0);
      }

      // Parse inline schema
      let schema: Record<string, unknown> | undefined;
      if (options.schema) {
        try {
          schema = JSON.parse(options.schema) as Record<string, unknown>;
        } catch {
          console.error('Error: Invalid JSON in --schema option');
          process.exit(1);
        }
      }

      // Validate model
      const validModels = ['spark-1-pro', 'spark-1-mini'];
      if (options.model && !validModels.includes(options.model)) {
        console.error(
          `Error: Invalid model "${options.model}". Valid models: ${validModels.join(', ')}`
        );
        process.exit(1);
      }

      // Reject unparseable or negative numbers instead of silently sending
      // NaN or falling back to a default.
      const numericOptions: Array<[string, number | undefined]> = [
        ['--max-credits', options.maxCredits],
        ['--poll-interval', options.pollInterval],
        ['--timeout', options.timeout],
      ];
      for (const [flag, value] of numericOptions) {
        if (value !== undefined && (!Number.isFinite(value) || value < 0)) {
          console.error(`Error: ${flag} must be a non-negative number`);
          process.exit(1);
        }
      }

      const agentOptions = {
        prompt: promptOrJobId,
        urls,
        schema,
        schemaFile: options.schemaFile,
        model: options.model,
        maxCredits: options.maxCredits,
        status: isStatusCheck,
        wait: options.wait,
        pollInterval: options.pollInterval,
        timeout: options.timeout,
        apiKey: options.apiKey,
        apiUrl: options.apiUrl,
        output: options.output,
        json: options.json,
        pretty: options.pretty,
      };

      await handleAgentCommand(agentOptions);
    });

  return agentCmd;
}
/**
 * Simple spinner utility for CLI feedback
 */

const SPINNER_FRAMES = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];

export interface Spinner {
  /** Begin showing progress, optionally replacing the current message. */
  start: (message?: string) => void;
  /** Change the message shown on the next animation frame. */
  update: (message: string) => void;
  /** Stop showing progress and optionally print a final line. */
  stop: (finalMessage?: string) => void;
  /** Stop and print a `✓` line (falls back to the current message). */
  succeed: (message?: string) => void;
  /** Stop and print a `✗` line (falls back to the current message). */
  fail: (message?: string) => void;
}

/**
 * Create a spinner that reports progress on stderr.
 *
 * When stderr is a TTY the spinner animates in place, redrawing every 80ms.
 * When it is not (piped or redirected), no escape sequences are written: the
 * message is printed once on start and the final message once on stop.
 *
 * The returned methods do not rely on `this`, so they may be destructured or
 * passed around as callbacks.
 *
 * @param initialMessage - Message shown until `start` or `update` replaces it.
 * @returns A spinner controller.
 */
export function createSpinner(initialMessage: string = ''): Spinner {
  let frameIndex = 0;
  let interval: ReturnType<typeof setInterval> | null = null;
  let running = false;
  let currentMessage = initialMessage;

  // Cursor control only makes sense on an interactive terminal
  const animated = Boolean(process.stderr.isTTY);

  const clearLine = (): void => {
    if (animated) process.stderr.write('\r\x1b[K');
  };

  const render = (): void => {
    const frame = SPINNER_FRAMES[frameIndex];
    clearLine();
    process.stderr.write(`${frame} ${currentMessage}`);
    frameIndex = (frameIndex + 1) % SPINNER_FRAMES.length;
  };

  const start = (message?: string): void => {
    if (message) currentMessage = message;
    if (running) return;
    running = true;

    if (animated) {
      render();
      interval = setInterval(render, 80);
    } else if (currentMessage) {
      process.stderr.write(`${currentMessage}\n`);
    }
  };

  const update = (message: string): void => {
    currentMessage = message;
  };

  const stop = (finalMessage?: string): void => {
    if (interval) {
      clearInterval(interval);
      interval = null;
    }
    running = false;
    clearLine();
    if (finalMessage) {
      process.stderr.write(`${finalMessage}\n`);
    }
  };

  const succeed = (message?: string): void => {
    stop(`✓ ${message || currentMessage}`);
  };

  const fail = (message?: string): void => {
    stop(`✗ ${message || currentMessage}`);
  };

  return { start, update, stop, succeed, fail };
}