diff --git a/.gitignore b/.gitignore index 40463502..a0716929 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,6 @@ core-ingestion/node_modules/* core-ingestion/dist/ ix-cli/dist/ .vscode/ -.mcp.json \ No newline at end of file +.mcp.json +# Ix agent worktrees +.ix-worktrees/ diff --git a/docker-compose.standalone.yml b/docker-compose.standalone.yml index a6734a18..63d285d2 100644 --- a/docker-compose.standalone.yml +++ b/docker-compose.standalone.yml @@ -14,6 +14,7 @@ services: - "127.0.0.1:8529:8529" environment: ARANGO_NO_AUTH: "1" + command: ["arangod", "--server.endpoint", "tcp://0.0.0.0:8529", "--experimental-vector-index"] volumes: - arangodb-data:/var/lib/arangodb3 healthcheck: @@ -24,6 +25,42 @@ services: retries: 15 restart: unless-stopped + ollama: + image: ollama/ollama:latest + networks: + - backend + ports: + - "127.0.0.1:11434:11434" + volumes: + - ollama-data:/root/.ollama + restart: unless-stopped + profiles: + - semantic + + semantic-extraction: + image: ghcr.io/ix-infrastructure/ix-semantic-extraction:latest + networks: + - backend + ports: + - "127.0.0.1:11400:11400" + environment: + OLLAMA_BASE_URL: http://ollama:11434 + IX_EXTRACTION_MODEL: qwen2.5vl:7b + IX_EMBEDDING_MODEL: "nomic-embed-text:v2-moe" + IX_AUTO_PULL: "true" + PORT: "11400" + depends_on: + - ollama + healthcheck: + test: ["CMD", "curl", "-sf", "http://localhost:11400/v1/health"] + interval: 10s + timeout: 5s + start_period: 120s + retries: 10 + restart: unless-stopped + profiles: + - semantic + memory-layer: image: ghcr.io/ix-infrastructure/ix-memory-layer:latest networks: @@ -41,6 +78,7 @@ services: ARANGO_DATABASE: ix_memory ARANGO_USER: root ARANGO_PASSWORD: "" + IX_EXTRACTION_URL: http://semantic-extraction:11400 PORT: "8090" depends_on: arangodb: @@ -55,6 +93,7 @@ services: volumes: arangodb-data: + ollama-data: networks: backend: diff --git a/ix-cli/src/cli/commands/load.ts b/ix-cli/src/cli/commands/load.ts new file mode 100644 index 00000000..bc3c4e0a --- /dev/null 
/**
 * `ix load <source>` — Multi-source semantic ingestion command.
 *
 * Thin client: detects source type, fetches content, and POSTs to the
 * backend's /v1/load endpoint which handles LLM extraction, embedding,
 * and patch commit server-side.
 *
 * Supports: tweets, arXiv papers, PDFs, images/screenshots, webpages,
 * chat exports, and generic local files.
 */

import type { Command } from "commander";
import chalk from "chalk";
import { IxClient } from "../../client/api.js";
import { getEndpoint } from "../config.js";
import { detectSource } from "../sources/detect.js";
import { fetchContent } from "../sources/fetch.js";

/** Options commander parses for `ix load`. */
interface LoadOptions {
  /** `"json"` selects machine-readable output; any other value is treated as text. */
  format: string;
  /** Parsed from `--verbose`; not read anywhere in this module. */
  verbose: boolean;
}

/** Fields of a successful `/v1/load` response that this command reports. */
interface LoadResult {
  status: string;
  rev: number;
  patchId: string;
  nodes: number;
  edges: number;
  claims: number;
}

/** Upper bound for the backend call; LLM extraction can take minutes. */
const LOAD_TIMEOUT_MS = 5 * 60 * 1000;

// NOTE(review): this message ends in a space, so it looks like it lost a
// trailing usage placeholder — confirm the intended text.
const GITHUB_HINT = "GitHub repos should be ingested with: ix ingest --github ";

/**
 * Registers the `load` sub-command on the given commander program.
 *
 * Any error thrown while loading is reported in the selected output format
 * and turns the process exit code into 1; it is never re-thrown.
 *
 * @param program - The commander program to attach the command to.
 */
export function registerLoadCommand(program: Command): void {
  program
    .command("load")
    .description("Ingest a URL or file into the knowledge graph (papers, tweets, screenshots, etc.)")
    // Commander only treats these as a required argument / a value-taking
    // option when the placeholder is written in angle brackets.
    .argument("<source>", "URL or local file path to ingest")
    .option("--format <format>", "Output format (text or json)", "text")
    .option("--verbose", "Show detailed output", false)
    .action(async (source: string, opts: LoadOptions) => {
      try {
        await runLoad(source, opts);
      } catch (err: unknown) {
        const msg = err instanceof Error ? err.message : String(err);
        if (opts.format === "json") {
          console.log(JSON.stringify({ error: msg }));
        } else {
          console.error(chalk.red(`Error: ${msg}`));
        }
        process.exitCode = 1;
      }
    });
}

/**
 * Detects the source type, fetches its content, and submits it to the
 * backend's `/v1/load` endpoint, printing progress (text mode) or a single
 * JSON document (json mode).
 *
 * GitHub URLs are rejected with exit code 1 and a pointer to `ix ingest`.
 *
 * @param source - URL or local file path given on the command line.
 * @param opts - Parsed command options.
 * @throws Error when fetching fails or the backend answers with a non-2xx status.
 */
async function runLoad(source: string, opts: LoadOptions): Promise<void> {
  const isJson = opts.format === "json";

  // 1. Detect source type
  const detected = detectSource(source);

  if (detected.kind === "github") {
    if (isJson) {
      // JSON consumers previously got an exit code of 1 with no output at all.
      console.log(JSON.stringify({ error: GITHUB_HINT }));
    } else {
      console.error(chalk.yellow(GITHUB_HINT));
    }
    process.exitCode = 1;
    return;
  }

  if (!isJson) {
    console.log(
      chalk.dim(`Detected source type: `) +
        chalk.cyan(detected.kind) +
        chalk.dim(` (${detected.uri})`)
    );
  }

  // 2. Fetch content
  if (!isJson) process.stdout.write(chalk.dim("Fetching content... "));
  const content = await fetchContent(detected);
  if (!isJson) {
    const size = content.text
      ? `${content.text.length} chars`
      : content.binary
        ? `${(content.binary.length / 1024).toFixed(1)} KB`
        : "empty";
    console.log(chalk.green(`done`) + chalk.dim(` (${size})`));
  }

  // 3. Build request payload for backend
  const payload: Record<string, unknown> = {
    uri: detected.uri,
    kind: detected.kind,
    // Fetch-time metadata wins over detection-time metadata on key clashes.
    meta: { ...detected.meta, ...content.meta },
  };

  if (content.text) {
    payload.text = content.text;
  }

  if (content.binary) {
    payload.binaryBase64 = content.binary.toString("base64");
    const contentType = content.meta.content_type;
    if (typeof contentType === "string") {
      payload.contentType = contentType;
    }
  }

  // 4. POST to backend — extraction, embedding, and commit happen server-side
  if (!isJson) process.stdout.write(chalk.dim("Extracting and committing... "));

  // NOTE(review): this request carries no Authorization header, unlike the
  // Bearer-token requests in client/api.ts — confirm /v1/load is meant to be
  // reachable without auth, or route this through IxClient.
  const endpoint = getEndpoint();
  const resp = await fetch(`${endpoint}/v1/load`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
    signal: AbortSignal.timeout(LOAD_TIMEOUT_MS),
  });

  if (!resp.ok) {
    const text = await resp.text();
    throw new Error(`Backend error (${resp.status}): ${text}`);
  }

  const result = (await resp.json()) as LoadResult;

  if (isJson) {
    console.log(
      JSON.stringify({
        status: result.status,
        kind: detected.kind,
        uri: detected.uri,
        nodes: result.nodes,
        edges: result.edges,
        claims: result.claims,
        rev: result.rev,
        patchId: result.patchId,
      })
    );
    return;
  }

  console.log(chalk.green("done") + chalk.dim(` (rev ${result.rev})`));
  console.log();
  console.log(
    chalk.bold("Ingested: ") +
      chalk.cyan(detected.kind) +
      chalk.dim(" → ") +
      `${result.nodes} nodes, ${result.edges} edges, ${result.claims} claims` +
      chalk.dim(` (rev ${result.rev})`)
  );
}
/**
 * URL / path type detection for multi-source ingestion.
 *
 * Auto-classifies a user-provided string (URL or local path) into a
 * SourceKind that drives the fetch → extract → transform pipeline.
 */

export type SourceKind =
  | "tweet"
  | "arxiv"
  | "pdf"
  | "image"
  | "webpage"
  | "chat_export"
  | "github"
  | "local_file";

export interface DetectedSource {
  kind: SourceKind;
  /** The input string (URL or file path) after trimming surrounding whitespace. */
  raw: string;
  /** The URL for URL inputs; otherwise the file path exactly as given (it is not resolved). */
  uri: string;
  /** Extra metadata extracted during detection (e.g. arxiv paper ID). */
  meta: Record<string, string>;
}

const IMAGE_EXTENSIONS = new Set([
  ".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".bmp", ".tiff",
]);

const CHAT_EXTENSIONS = new Set([".json", ".csv", ".jsonl"]);

/** Hosts (after stripping a leading `www.`) that serve tweets, including the mobile front-ends. */
const TWEET_HOSTS = new Set([
  "twitter.com", "x.com", "mobile.twitter.com", "m.twitter.com", "mobile.x.com",
]);

/** Substrings in a file name that mark a JSON/CSV/JSONL file as a chat export. */
const CHAT_NAME_HINT = /chat|slack|discord|messages|conversation/i;

/**
 * Detect the source type from a URL or file path.
 *
 * Anything starting with `http://` or `https://` (case-insensitive) is
 * classified as a URL; everything else is treated as a local path.
 *
 * @param input - User-provided URL or path; surrounding whitespace is ignored.
 * @returns The classification plus any metadata found while classifying.
 * @throws TypeError when the input looks like a URL but cannot be parsed by `URL`.
 */
export function detectSource(input: string): DetectedSource {
  const trimmed = input.trim();
  return /^https?:\/\//i.test(trimmed) ? detectUrl(trimmed) : detectLocalFile(trimmed);
}

/**
 * Classify an http(s) URL. Detection order matters — host-specific rules
 * (Twitter/X, arXiv, GitHub) are checked before extension-based ones.
 */
function detectUrl(url: string): DetectedSource {
  const parsed = new URL(url);
  const host = parsed.hostname.replace(/^www\./, "");
  const path = parsed.pathname.toLowerCase();
  const as = (kind: SourceKind, meta: Record<string, string> = {}): DetectedSource => ({
    kind,
    raw: url,
    uri: url,
    meta,
  });

  // Twitter / X — every path on these hosts counts as a tweet; author and id
  // are only filled in when the path has the /<author>/status/<id> shape.
  if (TWEET_HOSTS.has(host)) {
    const match = /\/([^/]+)\/status\/(\d+)/.exec(parsed.pathname);
    const author = match?.[1];
    const tweetId = match?.[2];
    return as("tweet", author && tweetId ? { author, tweetId } : {});
  }

  // arXiv — new-style identifiers only; any version suffix (v2, …) is dropped.
  if (host === "arxiv.org" || host.endsWith(".arxiv.org")) {
    const paperId = /\/(?:abs|pdf|html)\/(\d{4}\.\d{4,5})/.exec(parsed.pathname)?.[1];
    return as("arxiv", paperId ? { paperId } : {});
  }

  // GitHub — delegate to existing GitHub ingestion
  if (host === "github.com") {
    return as("github");
  }

  // Direct PDF link, or a URL that advertises a PDF content type in its text
  if (path.endsWith(".pdf") || url.toLowerCase().includes("content-type=application/pdf")) {
    return as("pdf");
  }

  // Direct image link
  const ext = extFrom(path);
  if (ext && IMAGE_EXTENSIONS.has(ext)) {
    return as("image");
  }

  // Default: webpage
  return as("webpage");
}

/** Classify a local path purely from its file name; the file is not opened. */
function detectLocalFile(filePath: string): DetectedSource {
  const as = (kind: SourceKind): DetectedSource => ({
    kind,
    raw: filePath,
    uri: filePath,
    meta: {},
  });
  const ext = extFrom(filePath);

  if (ext === ".pdf") return as("pdf");
  if (ext && IMAGE_EXTENSIONS.has(ext)) return as("image");

  // Chat export heuristic: JSON/CSV/JSONL files whose *file name* contains
  // "chat", "slack", "discord", "messages" or "conversation". Split on both
  // separators so that a Windows directory name cannot trigger the match.
  if (ext && CHAT_EXTENSIONS.has(ext)) {
    const name = filePath.split(/[\\/]/).pop() ?? "";
    if (CHAT_NAME_HINT.test(name)) return as("chat_export");
  }

  return as("local_file");
}

/**
 * Lower-cased extension (including the dot) of the last path segment, or
 * `null` when that segment contains no dot.
 *
 * Only the final segment is inspected, so a dot in a directory name
 * (`/v1.2/readme`) is not mistaken for an extension.
 */
function extFrom(path: string): string | null {
  const lastSeparator = Math.max(path.lastIndexOf("/"), path.lastIndexOf("\\"));
  const base = path.slice(lastSeparator + 1);
  const dot = base.lastIndexOf(".");
  if (dot === -1) return null;
  return base.slice(dot).toLowerCase();
}
export interface FetchedContent {
  kind: SourceKind;
  /** Source URI for provenance tracking. */
  uri: string;
  /** Text content (for text-based sources). */
  text?: string;
  /** Binary content (for images, PDFs). */
  binary?: Buffer;
  /** Structured metadata extracted during fetch. */
  meta: Record<string, unknown>;
}

type Fetcher = (source: DetectedSource) => Promise<FetchedContent>;

// "github" deliberately has no fetcher: fetchContent rejects it.
const fetchers: Partial<Record<SourceKind, Fetcher>> = {
  tweet: fetchTweet,
  arxiv: fetchArxiv,
  pdf: fetchPdf,
  image: fetchImage,
  webpage: fetchWebpage,
  chat_export: fetchChatExport,
  local_file: fetchLocalFile,
};

/**
 * Fetches the content of a detected source with the fetcher registered for
 * its kind.
 *
 * @param source - Result of source detection.
 * @returns Text or binary content plus fetch-time metadata.
 * @throws Error when no fetcher exists for `source.kind`, or when the fetcher fails.
 */
export async function fetchContent(source: DetectedSource): Promise<FetchedContent> {
  const fetcher = fetchers[source.kind];
  if (!fetcher) {
    throw new Error(`No fetcher for source kind: ${source.kind}`);
  }
  return fetcher(source);
}

// ─── Tweet ──────────────────────────────────────────────────────────

/** Fetches a tweet through the oEmbed endpoint and reduces its HTML to plain text. */
async function fetchTweet(source: DetectedSource): Promise<FetchedContent> {
  // Use Twitter oEmbed API — no auth required, returns HTML + metadata
  const oembedUrl = `https://publish.twitter.com/oembed?url=${encodeURIComponent(source.uri)}&omit_script=true`;

  const resp = await fetch(oembedUrl, {
    signal: AbortSignal.timeout(15_000),
  });

  if (!resp.ok) {
    throw new Error(`Twitter oEmbed failed (${resp.status}): ${await resp.text()}`);
  }

  const data = (await resp.json()) as {
    html?: unknown;
    author_name?: unknown;
    author_url?: unknown;
    url?: unknown;
  };
  if (typeof data.html !== "string") {
    throw new Error("Twitter oEmbed response did not contain an html field");
  }

  // Strip tags first, decode entities once, then normalise whitespace.
  const plainText = decodeEntities(data.html.replace(/<[^>]+>/g, " "))
    .replace(/\s+/g, " ")
    .trim();

  return {
    kind: "tweet",
    uri: source.uri,
    text: plainText,
    meta: {
      author: data.author_name,
      author_url: data.author_url,
      tweet_url: data.url,
      tweet_id: source.meta.tweetId,
    },
  };
}

// ─── arXiv ──────────────────────────────────────────────────────────

/** Fetches the arXiv abstract page and builds a Markdown summary from its citation metadata. */
async function fetchArxiv(source: DetectedSource): Promise<FetchedContent> {
  const paperId = source.meta.paperId;
  // Use arXiv abstract page for metadata extraction. Without a detected id,
  // rewrite the given URL to its /abs/ form and drop a trailing ".pdf".
  const absUrl = paperId
    ? `https://arxiv.org/abs/${paperId}`
    : source.uri
        .replace("/pdf/", "/abs/")
        .replace("/html/", "/abs/")
        .replace(/\.pdf(?=$|[?#])/i, "");

  const resp = await fetch(absUrl, {
    signal: AbortSignal.timeout(30_000),
  });

  if (!resp.ok) {
    throw new Error(`arXiv fetch failed (${resp.status})`);
  }

  const html = await resp.text();

  const title = extractMeta(html, "citation_title") ?? extractTag(html, "title") ?? "Unknown";
  const abstract =
    extractMetaContent(html, "citation_abstract") ?? extractByClass(html, "abstract") ?? "";
  const authors = extractAllMeta(html, "citation_author");
  const date = extractMeta(html, "citation_date") ?? "";
  const doi = extractMeta(html, "citation_doi") ?? "";

  const text = `# ${title}\n\n**Authors:** ${authors.join(", ")}\n**Date:** ${date}\n\n## Abstract\n\n${abstract.trim()}`;

  return {
    kind: "arxiv",
    uri: source.uri,
    text,
    meta: {
      paper_id: paperId ?? "",
      title,
      authors,
      date,
      doi,
      abstract_url: absUrl,
    },
  };
}

// ─── PDF ────────────────────────────────────────────────────────────

/** Loads a PDF from a URL (60 s timeout) or from the local file system. */
async function fetchPdf(source: DetectedSource): Promise<FetchedContent> {
  let buffer: Buffer;
  if (isUrl(source.uri)) {
    const resp = await fetch(source.uri, {
      signal: AbortSignal.timeout(60_000),
    });
    if (!resp.ok) {
      throw new Error(`PDF download failed (${resp.status})`);
    }
    buffer = Buffer.from(await resp.arrayBuffer());
  } else {
    buffer = readFileSync(source.uri);
  }

  return {
    kind: "pdf",
    uri: source.uri,
    binary: buffer,
    // content_type lets the caller label the base64 payload it sends on.
    meta: { size: buffer.length, content_type: "application/pdf" },
  };
}

// ─── Image ──────────────────────────────────────────────────────────

/** Content types for the image extensions the detector recognises. */
const IMAGE_CONTENT_TYPES: ReadonlyMap<string, string> = new Map([
  [".png", "image/png"],
  [".jpg", "image/jpeg"],
  [".jpeg", "image/jpeg"],
  [".gif", "image/gif"],
  [".webp", "image/webp"],
  [".svg", "image/svg+xml"],
  [".bmp", "image/bmp"],
  [".tiff", "image/tiff"],
]);

/** Content type implied by the extension of the last path segment, if it is a known image type. */
function imageContentTypeFromPath(path: string): string | undefined {
  const base = path.slice(Math.max(path.lastIndexOf("/"), path.lastIndexOf("\\")) + 1);
  const dot = base.lastIndexOf(".");
  return dot === -1 ? undefined : IMAGE_CONTENT_TYPES.get(base.slice(dot).toLowerCase());
}

/** Loads an image from a URL (30 s timeout) or from the local file system. */
async function fetchImage(source: DetectedSource): Promise<FetchedContent> {
  if (isUrl(source.uri)) {
    const resp = await fetch(source.uri, {
      signal: AbortSignal.timeout(30_000),
    });
    if (!resp.ok) {
      throw new Error(`Image download failed (${resp.status})`);
    }
    const buffer = Buffer.from(await resp.arrayBuffer());
    // Prefer the server's header; fall back to the URL's extension before
    // giving up with the historical "unknown" marker.
    const contentType =
      resp.headers.get("content-type") ??
      imageContentTypeFromPath(new URL(source.uri).pathname) ??
      "unknown";
    return {
      kind: "image",
      uri: source.uri,
      binary: buffer,
      meta: { size: buffer.length, content_type: contentType },
    };
  }

  const buffer = readFileSync(source.uri);
  const contentType = imageContentTypeFromPath(source.uri);
  return {
    kind: "image",
    uri: source.uri,
    binary: buffer,
    meta: contentType
      ? { size: buffer.length, content_type: contentType }
      : { size: buffer.length },
  };
}

// ─── Webpage ────────────────────────────────────────────────────────

/** Downloads a web page and converts it to plain text, keeping title and description as metadata. */
async function fetchWebpage(source: DetectedSource): Promise<FetchedContent> {
  const resp = await fetch(source.uri, {
    signal: AbortSignal.timeout(30_000),
    headers: {
      "User-Agent": "Mozilla/5.0 (compatible; Ix/1.0; +https://github.com/ix-infrastructure)",
    },
  });

  if (!resp.ok) {
    throw new Error(`Webpage fetch failed (${resp.status})`);
  }

  const html = await resp.text();

  return {
    kind: "webpage",
    uri: source.uri,
    text: htmlToText(html),
    meta: {
      title: extractTag(html, "title") ?? source.uri,
      description: extractMeta(html, "description") ?? "",
      url: source.uri,
    },
  };
}

// ─── Chat export ────────────────────────────────────────────────────

/** Parses JSONL text, ignoring blank lines and tolerating CRLF line endings. */
function parseJsonLines(raw: string): unknown[] {
  return raw
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line !== "")
    .map((line) => JSON.parse(line) as unknown);
}

/**
 * Picks the message list out of a parsed JSON document: the document itself
 * when it is an array, its `messages` property when that is an array,
 * otherwise the document wrapped in a one-element array.
 */
function messagesFromJson(parsed: unknown): unknown[] {
  if (Array.isArray(parsed)) return parsed;
  if (typeof parsed === "object" && parsed !== null) {
    const inner = (parsed as { messages?: unknown }).messages;
    if (Array.isArray(inner)) return inner;
  }
  return [parsed];
}

/**
 * Reads a local chat export. JSON and JSONL files are parsed so the message
 * count can be reported; anything else is passed through as CSV text.
 *
 * @throws SyntaxError when a .json/.jsonl file does not contain valid JSON.
 */
async function fetchChatExport(source: DetectedSource): Promise<FetchedContent> {
  const raw = readFileSync(source.uri, "utf-8");
  const ext = source.uri.split(".").pop()?.toLowerCase();

  if (ext !== "json" && ext !== "jsonl") {
    // CSV — return as raw text, let extraction handle it
    return {
      kind: "chat_export",
      uri: source.uri,
      text: raw,
      meta: { format: "csv" },
    };
  }

  const messages =
    ext === "jsonl" ? parseJsonLines(raw) : messagesFromJson(JSON.parse(raw) as unknown);

  return {
    kind: "chat_export",
    uri: source.uri,
    text: raw,
    // NOTE(review): `messages` repeats the whole export that is already in
    // `text`, so the content travels twice — confirm the backend needs both.
    meta: {
      format: ext,
      message_count: messages.length,
      messages,
    },
  };
}

// ─── Local file (generic) ───────────────────────────────────────────

/** Reads a local file as UTF-8 text. */
async function fetchLocalFile(source: DetectedSource): Promise<FetchedContent> {
  return {
    kind: "local_file",
    uri: source.uri,
    text: readFileSync(source.uri, "utf-8"),
    meta: {},
  };
}

// ─── Helpers ────────────────────────────────────────────────────────

/** True when the string starts with `http://` or `https://` (case-insensitive). */
function isUrl(s: string): boolean {
  return /^https?:\/\//i.test(s);
}

/** The named character references this module decodes. */
const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map([
  ["amp", "&"],
  ["lt", "<"],
  ["gt", ">"],
  ["quot", '"'],
  ["apos", "'"],
  ["nbsp", " "],
]);

/**
 * Decodes numeric character references and the named ones listed in
 * NAMED_ENTITIES; anything else is left untouched.
 *
 * Decoding happens in a single pass, so text that is itself escaped
 * (`&amp;lt;`) is unescaped exactly once (`&lt;`) rather than twice.
 */
function decodeEntities(text: string): string {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi, (entity: string, ref: string) => {
    if (ref.startsWith("#")) {
      const isHex = ref.charAt(1).toLowerCase() === "x";
      const code = Number.parseInt(ref.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : entity;
    }
    return NAMED_ENTITIES.get(ref) ?? entity;
  });
}

/** Replaces every tag with a space, collapses whitespace runs and trims. */
function stripTags(html: string): string {
  return html.replace(/<[^>]+>/g, " ").replace(/\s+/g, " ").trim();
}

const META_TAG = /<meta\b[^>]*>/gi;
const TAG_ATTRIBUTE = /([^\s"'=<>/]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/g;

/** Parses the attributes of one tag into a map keyed by lower-cased attribute name. */
function parseAttributes(tag: string): Map<string, string> {
  const attrs = new Map<string, string>();
  for (const m of tag.matchAll(TAG_ATTRIBUTE)) {
    const key = (m[1] ?? "").toLowerCase();
    if (!attrs.has(key)) attrs.set(key, m[2] ?? m[3] ?? m[4] ?? "");
  }
  return attrs;
}

/**
 * Collects the decoded, non-empty `content` values of all meta tags whose
 * `keyAttrs` attribute (e.g. `name`, `property`) equals `name`, ignoring
 * case and attribute order. Values are taken between their own quote
 * characters, so an apostrophe inside a double-quoted value is preserved.
 */
function metaContents(html: string, name: string, keyAttrs: readonly string[]): string[] {
  const wanted = name.toLowerCase();
  const found: string[] = [];
  for (const tag of html.match(META_TAG) ?? []) {
    const attrs = parseAttributes(tag);
    const content = attrs.get("content");
    if (!content) continue;
    if (keyAttrs.some((key) => attrs.get(key)?.toLowerCase() === wanted)) {
      found.push(decodeEntities(content));
    }
  }
  return found;
}

/** Extract content from a meta tag by name; `null` when there is none. */
function extractMeta(html: string, name: string): string | null {
  return metaContents(html, name, ["name"])[0] ?? null;
}

/** Same as extractMeta but also matches property-based meta tags (og:, etc). */
function extractMetaContent(html: string, name: string): string | null {
  return metaContents(html, name, ["name", "property"])[0] ?? null;
}

/** Extract all values for a repeated meta tag, in document order. */
function extractAllMeta(html: string, name: string): string[] {
  return metaContents(html, name, ["name"]);
}

/**
 * Extract the leading text of the first `<tag>` element: everything up to
 * the next `<`, entity-decoded and trimmed. `null` when the tag is absent
 * or starts with a child element.
 */
function extractTag(html: string, tag: string): string | null {
  const re = new RegExp(`<${tag}\\b[^>]*>([^<]+)`, "i");
  const inner = html.match(re)?.[1];
  return inner === undefined ? null : decodeEntities(inner).trim();
}

/**
 * Extract the text of the first element whose class attribute contains
 * `className`, up to the first closing tag of the same element name.
 * Nested tags are stripped and entities decoded.
 */
function extractByClass(html: string, className: string): string | null {
  const re = new RegExp(
    `<([a-z][a-z0-9]*)\\b[^>]*\\bclass=(["'])[^"']*\\b${className}\\b[^"']*\\2[^>]*>([\\s\\S]*?)<\\/\\1\\s*>`,
    "i"
  );
  const inner = html.match(re)?.[3];
  return inner === undefined ? null : decodeEntities(stripTags(inner));
}

// NOTE(review): confirm this element list against the intended behaviour;
// it covers elements whose contents are not rendered as text.
const NON_CONTENT_BLOCKS: readonly RegExp[] = ["script", "style", "noscript", "template", "svg"].map(
  (element) => new RegExp(`<${element}\\b[^>]*>[\\s\\S]*?<\\/${element}\\s*>`, "gi")
);

/**
 * Simple HTML to text conversion: drops comments and non-content elements,
 * turns block-level tags into newlines and other tags into spaces, decodes
 * entities once, and collapses runs of spaces and blank lines.
 */
function htmlToText(html: string): string {
  let body = html.replace(/<!--[\s\S]*?-->/g, "");
  for (const block of NON_CONTENT_BLOCKS) {
    body = body.replace(block, "");
  }

  const text = body
    // Convert block elements to newlines
    .replace(/<\/?(?:div|p|br|h[1-6]|li|tr|blockquote)\b[^>]*>/gi, "\n")
    // Strip remaining tags
    .replace(/<[^>]+>/g, " ");

  return decodeEntities(text)
    // Collapse whitespace
    .replace(/[ \t]+/g, " ")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
false }); + } + + async compareBranches(branchIds: string[]): Promise<{ + conflicts: Array<{ logicalId: string; touchedBy: string[] }>; + safeToMerge: boolean; + }> { + return this.post("/v1/branches/compare", { branches: branchIds }); + } + + async abandonBranch(id: string): Promise<{ status: string }> { + const resp = await fetch(`${this.endpoint}/v1/branches/${id}`, { + method: "DELETE", + headers: this.authToken ? { Authorization: `Bearer ${this.authToken}` } : {}, + signal: AbortSignal.timeout(30_000), + }); + if (!resp.ok) { + const text = await resp.text(); + throw new Error(`${resp.status}: ${text}`); + } + return resp.json() as Promise<{ status: string }>; + } + private async post(path: string, body: unknown): Promise { const resp = await fetch(`${this.endpoint}${path}`, { method: "POST",