Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,6 @@ core-ingestion/node_modules/*
core-ingestion/dist/
ix-cli/dist/
.vscode/
.mcp.json
.mcp.json
# Ix agent worktrees
.ix-worktrees/
39 changes: 39 additions & 0 deletions docker-compose.standalone.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ services:
- "127.0.0.1:8529:8529"
environment:
ARANGO_NO_AUTH: "1"
command: ["arangod", "--server.endpoint", "tcp://0.0.0.0:8529", "--experimental-vector-index"]
volumes:
- arangodb-data:/var/lib/arangodb3
healthcheck:
Expand All @@ -24,6 +25,42 @@ services:
retries: 15
restart: unless-stopped

ollama:
image: ollama/ollama:latest
networks:
- backend
ports:
- "127.0.0.1:11434:11434"
volumes:
- ollama-data:/root/.ollama
restart: unless-stopped
profiles:
- semantic

semantic-extraction:
image: ghcr.io/ix-infrastructure/ix-semantic-extraction:latest
networks:
- backend
ports:
- "127.0.0.1:11400:11400"
environment:
OLLAMA_BASE_URL: http://ollama:11434
IX_EXTRACTION_MODEL: qwen2.5vl:7b
IX_EMBEDDING_MODEL: "nomic-embed-text:v2-moe"
IX_AUTO_PULL: "true"
PORT: "11400"
depends_on:
- ollama
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:11400/v1/health"]
interval: 10s
timeout: 5s
start_period: 120s
retries: 10
restart: unless-stopped
profiles:
- semantic

memory-layer:
image: ghcr.io/ix-infrastructure/ix-memory-layer:latest
networks:
Expand All @@ -41,6 +78,7 @@ services:
ARANGO_DATABASE: ix_memory
ARANGO_USER: root
ARANGO_PASSWORD: ""
IX_EXTRACTION_URL: http://semantic-extraction:11400
PORT: "8090"
depends_on:
arangodb:
Expand All @@ -55,6 +93,7 @@ services:

volumes:
arangodb-data:
ollama-data:

networks:
backend:
144 changes: 144 additions & 0 deletions ix-cli/src/cli/commands/load.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
/**
* `ix load <url|path>` — Multi-source semantic ingestion command.
*
* Thin client: detects source type, fetches content, and POSTs to the
* backend's /v1/load endpoint which handles LLM extraction, embedding,
* and patch commit server-side.
*
* Supports: tweets, arXiv papers, PDFs, images/screenshots, webpages,
* chat exports, and generic local files.
*/

import type { Command } from "commander";
import chalk from "chalk";
import { IxClient } from "../../client/api.js";
import { getEndpoint } from "../config.js";
import { detectSource } from "../sources/detect.js";
import { fetchContent } from "../sources/fetch.js";

/** Parsed CLI flags for the `ix load` command. */
interface LoadOptions {
  /** Output format: "text" (human-readable, default) or "json" (machine-readable). */
  format: string;
  /** When true, show detailed progress output. */
  verbose: boolean;
}

/**
 * Register the `ix load <source>` command on the given commander program.
 *
 * Errors thrown by the pipeline are caught here and reported in the
 * requested output format; the process exit code is set to 1 on failure.
 */
export function registerLoadCommand(program: Command): void {
  program
    .command("load")
    .description("Ingest a URL or file into the knowledge graph (papers, tweets, screenshots, etc.)")
    .argument("<source>", "URL or local file path to ingest")
    .option("--format <format>", "Output format (text or json)", "text")
    .option("--verbose", "Show detailed output", false)
    .action(async (source: string, opts: LoadOptions) => {
      try {
        await runLoad(source, opts);
        return;
      } catch (err: unknown) {
        const message = err instanceof Error ? err.message : String(err);
        // JSON mode writes a machine-readable error to stdout; text mode
        // writes a human-readable one to stderr.
        if (opts.format === "json") {
          console.log(JSON.stringify({ error: message }));
        } else {
          console.error(chalk.red(`Error: ${message}`));
        }
        process.exitCode = 1;
      }
    });
}

/**
 * Execute the load pipeline: detect the source type, fetch its content
 * locally, then POST everything to the backend's /v1/load endpoint, which
 * performs LLM extraction, embedding, and patch commit server-side.
 *
 * @param source - URL or local file path supplied by the user.
 * @param opts - Parsed CLI flags (output format, verbosity).
 * @throws Error when the backend responds with a non-2xx status or the
 *   fetch times out (5-minute abort signal).
 */
async function runLoad(source: string, opts: LoadOptions): Promise<void> {
  const isJson = opts.format === "json";

  // 1. Detect source type
  const detected = detectSource(source);

  if (detected.kind === "github") {
    // GitHub repos go through the dedicated `ix ingest` flow instead.
    const msg = "GitHub repos should be ingested with: ix ingest --github <owner/repo>";
    if (isJson) {
      // Fix: previously JSON mode exited with code 1 but printed nothing,
      // leaving scripted callers with no diagnostic. Emit the same
      // machine-readable error shape as the top-level error handler.
      console.log(JSON.stringify({ error: msg }));
    } else {
      console.error(chalk.yellow(msg));
    }
    process.exitCode = 1;
    return;
  }

  if (!isJson) {
    console.log(
      chalk.dim(`Detected source type: `) + chalk.cyan(detected.kind) +
      chalk.dim(` (${detected.uri})`)
    );
  }

  // 2. Fetch content locally (text for webpages/papers, binary for
  //    PDFs/images). fetchContent may return either or neither.
  if (!isJson) process.stdout.write(chalk.dim("Fetching content... "));
  const content = await fetchContent(detected);
  if (!isJson) {
    const size = content.text
      ? `${content.text.length} chars`
      : content.binary
        ? `${(content.binary.length / 1024).toFixed(1)} KB`
        : "empty";
    console.log(chalk.green(`done`) + chalk.dim(` (${size})`));
  }

  // 3. Build request payload for the backend. Detection metadata and
  //    fetch metadata are merged, with fetch metadata winning on conflict.
  const payload: Record<string, unknown> = {
    uri: detected.uri,
    kind: detected.kind,
    meta: { ...detected.meta, ...content.meta },
  };

  if (content.text) {
    payload.text = content.text;
  }

  if (content.binary) {
    // Binary payloads (PDFs, images) travel as base64 inside the JSON body.
    payload.binaryBase64 = content.binary.toString("base64");
    payload.contentType = (content.meta.content_type as string) ?? undefined;
  }

  // 4. POST to backend — extraction, embedding, and commit happen server-side
  if (!isJson) process.stdout.write(chalk.dim("Extracting and committing... "));

  const endpoint = getEndpoint();
  const resp = await fetch(`${endpoint}/v1/load`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
    signal: AbortSignal.timeout(5 * 60 * 1000), // 5 min for LLM extraction
  });

  if (!resp.ok) {
    const text = await resp.text();
    throw new Error(`Backend error (${resp.status}): ${text}`);
  }

  // Backend response contract for /v1/load.
  const result = (await resp.json()) as {
    status: string;
    rev: number;
    patchId: string;
    nodes: number;
    edges: number;
    claims: number;
  };

  if (!isJson) {
    console.log(chalk.green("done") + chalk.dim(` (rev ${result.rev})`));
    console.log();
    console.log(
      chalk.bold("Ingested: ") +
      chalk.cyan(detected.kind) +
      chalk.dim(" → ") +
      `${result.nodes} nodes, ${result.edges} edges, ${result.claims} claims` +
      chalk.dim(` (rev ${result.rev})`)
    );
  } else {
    console.log(JSON.stringify({
      status: result.status,
      kind: detected.kind,
      uri: detected.uri,
      nodes: result.nodes,
      edges: result.edges,
      claims: result.claims,
      rev: result.rev,
      patchId: result.patchId,
    }));
  }
}
2 changes: 2 additions & 0 deletions ix-cli/src/cli/register/oss.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import { registerSubsystemsCommand } from "../commands/subsystems.js";
import { registerUpgradeCommand } from "../commands/upgrade.js";
import { registerViewCommand } from "../commands/view.js";
import { registerSavingsCommand } from "../commands/savings.js";
import { registerLoadCommand } from "../commands/load.js";

const PRO_COMMANDS: { name: string; desc: string }[] = [
{ name: "briefing", desc: "Session-resume briefing" },
Expand Down Expand Up @@ -96,6 +97,7 @@ export function registerOssCommands(program: Command): void {
registerUpgradeCommand(program);
registerViewCommand(program);
registerSavingsCommand(program);
registerLoadCommand(program);

// Hide advanced commands from default help
const advancedSet = new Set(ADVANCED_COMMANDS);
Expand Down
151 changes: 151 additions & 0 deletions ix-cli/src/cli/sources/detect.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
/**
* URL / path type detection for multi-source ingestion.
*
* Auto-classifies a user-provided string (URL or local path) into a
* SourceKind that drives the fetch → extract → transform pipeline.
*/

/** Classification of an ingestion source; drives the fetch/extract pipeline. */
export type SourceKind =
  | "tweet"
  | "arxiv"
  | "pdf"
  | "image"
  | "webpage"
  | "chat_export"
  | "github"
  | "local_file";

/** Result of classifying a user-supplied URL or file path. */
export interface DetectedSource {
  kind: SourceKind;
  /** Original input string (URL or file path). */
  raw: string;
  /** Normalized URL if applicable, otherwise the absolute file path. */
  uri: string;
  /** Extra metadata extracted during detection (e.g. arxiv paper ID). */
  meta: Record<string, string>;
}

// Lowercase extensions (with leading dot) treated as images.
const IMAGE_EXTENSIONS = new Set([
  ".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".bmp", ".tiff",
]);

// Extensions that MAY be chat exports; confirmed by a filename keyword check.
const CHAT_EXTENSIONS = new Set([".json", ".csv", ".jsonl"]);

/**
* Detect the source type from a URL or file path.
*
* Detection order matters — more specific patterns are checked first.
*/
/**
 * Classify a user-provided string (URL or local path) into a SourceKind.
 *
 * Anything that starts with an http(s) scheme is routed through URL
 * detection; everything else is treated as a local file path.
 */
export function detectSource(input: string): DetectedSource {
  const candidate = input.trim();
  const looksLikeUrl = /^https?:\/\//i.test(candidate);
  return looksLikeUrl ? detectUrl(candidate) : detectLocalFile(candidate);
}

function detectUrl(url: string): DetectedSource {
const lower = url.toLowerCase();
const parsed = new URL(url);
const host = parsed.hostname.replace(/^www\./, "");
const path = parsed.pathname.toLowerCase();

// Twitter / X
if (host === "twitter.com" || host === "x.com") {
const tweetMatch = parsed.pathname.match(/\/([^/]+)\/status\/(\d+)/);
return {
kind: "tweet",
raw: url,
uri: url,
meta: tweetMatch
? { author: tweetMatch[1], tweetId: tweetMatch[2] }
: {},
};
}

// arXiv
if (host === "arxiv.org" || host.endsWith(".arxiv.org")) {
const paperMatch = parsed.pathname.match(
/\/(?:abs|pdf|html)\/(\d{4}\.\d{4,5})/
);
return {
kind: "arxiv",
raw: url,
uri: url,
meta: paperMatch ? { paperId: paperMatch[1] } : {},
};
}

// GitHub — delegate to existing GitHub ingestion
if (host === "github.com") {
return {
kind: "github",
raw: url,
uri: url,
meta: {},
};
}

// Direct PDF link
if (path.endsWith(".pdf") || lower.includes("content-type=application/pdf")) {
return {
kind: "pdf",
raw: url,
uri: url,
meta: {},
};
}

// Direct image link
const ext = extFrom(path);
if (ext && IMAGE_EXTENSIONS.has(ext)) {
return {
kind: "image",
raw: url,
uri: url,
meta: {},
};
}

// Default: webpage
return {
kind: "webpage",
raw: url,
uri: url,
meta: {},
};
}

/**
 * Classify a local file path by extension.
 *
 * PDFs and images are recognized directly; JSON/CSV/JSONL files are
 * classified as chat exports only when the filename contains a chat-related
 * keyword; everything else falls back to "local_file".
 */
function detectLocalFile(filePath: string): DetectedSource {
  const lower = filePath.toLowerCase();
  const ext = extFrom(lower);

  if (ext === ".pdf") {
    return { kind: "pdf", raw: filePath, uri: filePath, meta: {} };
  }

  if (ext && IMAGE_EXTENSIONS.has(ext)) {
    return { kind: "image", raw: filePath, uri: filePath, meta: {} };
  }

  // Chat export heuristic: JSON/CSV/JSONL files with "chat", "slack",
  // "discord", or "messages" in the filename
  if (ext && CHAT_EXTENSIONS.has(ext)) {
    // Fix: split on both "/" and "\" so Windows-style paths
    // (e.g. C:\exports\slack-dump.json) still expose the bare filename
    // to the keyword test below.
    const name = filePath.split(/[\\/]/).pop() ?? "";
    if (/chat|slack|discord|messages|conversation/i.test(name)) {
      return { kind: "chat_export", raw: filePath, uri: filePath, meta: {} };
    }
  }

  return { kind: "local_file", raw: filePath, uri: filePath, meta: {} };
}

/**
 * Extract the lowercase file extension (including the leading dot) from a
 * path, or null when the final path segment has none.
 *
 * Fix: the previous implementation scanned the whole path for the last ".",
 * so a dot inside a directory component (e.g. "/api/v1.2/resource") produced
 * a bogus "extension" containing slashes. Only dots within the final segment
 * (after the last "/" or "\") count now.
 */
function extFrom(path: string): string | null {
  const lastSep = Math.max(path.lastIndexOf("/"), path.lastIndexOf("\\"));
  const dot = path.lastIndexOf(".");
  // No dot at all, or the last dot belongs to a directory component.
  if (dot === -1 || dot <= lastSep) return null;
  return path.slice(dot).toLowerCase();
}
Loading
Loading