Skip to content

Commit f771abf

Browse files
feat: embedding service, semantic context engine, and branch API methods
* feat: add embedding service and patch enrichment for semantic context engine

  Phase 1 client-side support for the semantic context engine:
  - EmbeddingService wrapping Voyage AI API with batch support and per-NodeKind content strategies
  - enrichPatchWithEmbeddings() post-processor that injects vector embeddings into UpsertNode ops
  - Enable ArangoDB experimental vector index flag in standalone docker-compose
  - Add .ix-worktrees/ to gitignore

  Embeddings are computed at ingestion time and stored in node attrs. Gracefully degrades when VOYAGE_API_KEY is not set — patches pass through unchanged.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* feat: add ix load command and multi-source content detection

  Thin CLI client for semantic ingestion: detect source type → fetch content → POST to backend /v1/load. Supports tweets, arxiv papers, PDFs, images, webpages, chat exports, and local files. Remove client-side embedding code (moved to ix-semantic-extraction service). Update docker-compose with ollama + semantic-extraction services.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: add auth header to abandonBranch fetch call

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent ee9dc14 commit f771abf

File tree

7 files changed

+744
-1
lines changed

7 files changed

+744
-1
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,6 @@ core-ingestion/node_modules/*
1111
core-ingestion/dist/
1212
ix-cli/dist/
1313
.vscode/
14-
.mcp.json
14+
.mcp.json
15+
# Ix agent worktrees
16+
.ix-worktrees/

docker-compose.standalone.yml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ services:
1414
- "127.0.0.1:8529:8529"
1515
environment:
1616
ARANGO_NO_AUTH: "1"
17+
command: ["arangod", "--server.endpoint", "tcp://0.0.0.0:8529", "--experimental-vector-index"]
1718
volumes:
1819
- arangodb-data:/var/lib/arangodb3
1920
healthcheck:
@@ -24,6 +25,42 @@ services:
2425
retries: 15
2526
restart: unless-stopped
2627

28+
ollama:
29+
image: ollama/ollama:latest
30+
networks:
31+
- backend
32+
ports:
33+
- "127.0.0.1:11434:11434"
34+
volumes:
35+
- ollama-data:/root/.ollama
36+
restart: unless-stopped
37+
profiles:
38+
- semantic
39+
40+
semantic-extraction:
41+
image: ghcr.io/ix-infrastructure/ix-semantic-extraction:latest
42+
networks:
43+
- backend
44+
ports:
45+
- "127.0.0.1:11400:11400"
46+
environment:
47+
OLLAMA_BASE_URL: http://ollama:11434
48+
IX_EXTRACTION_MODEL: qwen2.5vl:7b
49+
IX_EMBEDDING_MODEL: "nomic-embed-text:v2-moe"
50+
IX_AUTO_PULL: "true"
51+
PORT: "11400"
52+
depends_on:
53+
- ollama
54+
healthcheck:
55+
test: ["CMD", "curl", "-sf", "http://localhost:11400/v1/health"]
56+
interval: 10s
57+
timeout: 5s
58+
start_period: 120s
59+
retries: 10
60+
restart: unless-stopped
61+
profiles:
62+
- semantic
63+
2764
memory-layer:
2865
image: ghcr.io/ix-infrastructure/ix-memory-layer:latest
2966
networks:
@@ -41,6 +78,7 @@ services:
4178
ARANGO_DATABASE: ix_memory
4279
ARANGO_USER: root
4380
ARANGO_PASSWORD: ""
81+
IX_EXTRACTION_URL: http://semantic-extraction:11400
4482
PORT: "8090"
4583
depends_on:
4684
arangodb:
@@ -55,6 +93,7 @@ services:
5593

5694
volumes:
5795
arangodb-data:
96+
ollama-data:
5897

5998
networks:
6099
backend:

ix-cli/src/cli/commands/load.ts

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
/**
 * `ix load <url|path>` — Multi-source semantic ingestion command.
 *
 * Thin client: detects source type, fetches content, and POSTs to the
 * backend's /v1/load endpoint which handles LLM extraction, embedding,
 * and patch commit server-side.
 *
 * Supports: tweets, arXiv papers, PDFs, images/screenshots, webpages,
 * chat exports, and generic local files.
 */
11+
12+
import type { Command } from "commander";
13+
import chalk from "chalk";
14+
import { IxClient } from "../../client/api.js";
15+
import { getEndpoint } from "../config.js";
16+
import { detectSource } from "../sources/detect.js";
17+
import { fetchContent } from "../sources/fetch.js";
18+
19+
/** Options object that commander passes to the `ix load` action handler. */
interface LoadOptions {
  /** Output format. `"json"` selects machine-readable output; any other value is treated as text. */
  format: string;
  /** Value of the `--verbose` flag. */
  verbose: boolean;
}
23+
24+
/**
 * Register the `ix load <source>` subcommand on a commander program.
 *
 * The action delegates to `runLoad` and converts any thrown error into a
 * non-zero exit code instead of letting it escape:
 * - with `--format json` the error is printed to stdout as `{"error": "..."}`;
 * - otherwise it is printed to stderr in red.
 *
 * @param program Commander program (or parent command) to attach `load` to.
 */
export function registerLoadCommand(program: Command): void {
  program
    .command("load")
    .description("Ingest a URL or file into the knowledge graph (papers, tweets, screenshots, etc.)")
    .argument("<source>", "URL or local file path to ingest")
    .option("--format <format>", "Output format (text or json)", "text")
    .option("--verbose", "Show detailed output", false)
    .action(async (source: string, opts: LoadOptions) => {
      try {
        await runLoad(source, opts);
      } catch (err) {
        // Thrown values are not guaranteed to be Error instances.
        const msg = err instanceof Error ? err.message : String(err);
        if (opts.format === "json") {
          console.log(JSON.stringify({ error: msg }));
        } else {
          console.error(chalk.red(`Error: ${msg}`));
        }
        // Set exitCode rather than calling process.exit() so pending output can flush.
        process.exitCode = 1;
      }
    });
}
45+
46+
/**
 * Execute one `ix load` run: detect the source type, fetch its content, and
 * POST it to the backend's `/v1/load` endpoint.
 *
 * In text mode, progress and a summary line are written to the console. In
 * JSON mode (`opts.format === "json"`) a single JSON object is printed to
 * stdout: the ingestion summary on success, or `{ error }` when the source is
 * a GitHub URL (which this command refuses to handle).
 *
 * @param source Raw URL or file path as given on the command line.
 * @param opts   Parsed CLI options; only `format` is read here.
 * @throws Error when the backend answers with a non-2xx status. Errors raised
 *         by `detectSource`, `fetchContent`, or `fetch` propagate unchanged.
 */
async function runLoad(source: string, opts: LoadOptions): Promise<void> {
  const isJson = opts.format === "json";

  // 1. Detect source type
  const detected = detectSource(source);

  if (detected.kind === "github") {
    const hint = "GitHub repos should be ingested with: ix ingest --github <owner/repo>";
    if (isJson) {
      // JSON consumers must also learn why the exit code is non-zero.
      console.log(JSON.stringify({ error: hint }));
    } else {
      console.error(chalk.yellow(hint));
    }
    process.exitCode = 1;
    return;
  }

  if (!isJson) {
    console.log(
      chalk.dim(`Detected source type: `) + chalk.cyan(detected.kind) +
        chalk.dim(` (${detected.uri})`)
    );
  }

  // 2. Fetch content
  if (!isJson) process.stdout.write(chalk.dim("Fetching content... "));
  const content = await fetchContent(detected);
  if (!isJson) {
    // Report text length when text is present, otherwise the binary size.
    const size = content.text
      ? `${content.text.length} chars`
      : content.binary
        ? `${(content.binary.length / 1024).toFixed(1)} KB`
        : "empty";
    console.log(chalk.green(`done`) + chalk.dim(` (${size})`));
  }

  // 3. Build request payload for backend.
  // Metadata from the fetch step overrides detection metadata on key collisions.
  const payload: Record<string, unknown> = {
    uri: detected.uri,
    kind: detected.kind,
    meta: { ...detected.meta, ...content.meta },
  };

  if (content.text) {
    payload.text = content.text;
  }

  if (content.binary) {
    payload.binaryBase64 = content.binary.toString("base64");
    // `?? undefined` maps a null content_type to undefined so JSON.stringify omits the key.
    payload.contentType = (content.meta.content_type as string) ?? undefined;
  }

  // 4. POST to backend — extraction, embedding, and commit happen server-side
  if (!isJson) process.stdout.write(chalk.dim("Extracting and committing... "));

  const endpoint = getEndpoint();
  // NOTE(review): only a Content-Type header is sent here (no auth header) —
  // confirm whether /v1/load requires authentication.
  const resp = await fetch(`${endpoint}/v1/load`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
    signal: AbortSignal.timeout(5 * 60 * 1000), // 5 min for LLM extraction
  });

  if (!resp.ok) {
    const text = await resp.text();
    throw new Error(`Backend error (${resp.status}): ${text}`);
  }

  // The response shape is asserted, not validated at runtime.
  const result = (await resp.json()) as {
    status: string;
    rev: number;
    patchId: string;
    nodes: number;
    edges: number;
    claims: number;
  };

  if (!isJson) {
    console.log(chalk.green("done") + chalk.dim(` (rev ${result.rev})`));
    console.log();
    console.log(
      chalk.bold("Ingested: ") +
        chalk.cyan(detected.kind) +
        chalk.dim(" → ") +
        `${result.nodes} nodes, ${result.edges} edges, ${result.claims} claims` +
        chalk.dim(` (rev ${result.rev})`)
    );
  } else {
    console.log(JSON.stringify({
      status: result.status,
      kind: detected.kind,
      uri: detected.uri,
      nodes: result.nodes,
      edges: result.edges,
      claims: result.claims,
      rev: result.rev,
      patchId: result.patchId,
    }));
  }
}

ix-cli/src/cli/register/oss.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import { registerSubsystemsCommand } from "../commands/subsystems.js";
3434
import { registerUpgradeCommand } from "../commands/upgrade.js";
3535
import { registerViewCommand } from "../commands/view.js";
3636
import { registerSavingsCommand } from "../commands/savings.js";
37+
import { registerLoadCommand } from "../commands/load.js";
3738

3839
const PRO_COMMANDS: { name: string; desc: string }[] = [
3940
{ name: "briefing", desc: "Session-resume briefing" },
@@ -96,6 +97,7 @@ export function registerOssCommands(program: Command): void {
9697
registerUpgradeCommand(program);
9798
registerViewCommand(program);
9899
registerSavingsCommand(program);
100+
registerLoadCommand(program);
99101

100102
// Hide advanced commands from default help
101103
const advancedSet = new Set(ADVANCED_COMMANDS);

ix-cli/src/cli/sources/detect.ts

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
/**
 * URL / path type detection for multi-source ingestion.
 *
 * Auto-classifies a user-provided string (URL or local path) into a
 * SourceKind that drives the fetch → extract → transform pipeline.
 */
7+
8+
/** Category assigned to an ingestion input by source detection. */
export type SourceKind =
  | "tweet"
  | "arxiv"
  | "pdf"
  | "image"
  | "webpage"
  | "chat_export"
  | "github"
  | "local_file";
17+
18+
/** Result of classifying a user-supplied URL or file path. */
export interface DetectedSource {
  /** Category the input was classified as. */
  kind: SourceKind;
  /** Original input string (URL or file path). */
  raw: string;
  /**
   * Normalized URL if applicable, otherwise the absolute file path.
   *
   * NOTE(review): the detectors in this file currently store the trimmed
   * input unchanged (no URL normalization, no path resolution) — confirm
   * which behavior is intended.
   */
  uri: string;
  /** Extra metadata extracted during detection (e.g. arxiv paper ID). */
  meta: Record<string, string>;
}
27+
28+
/** Lower-case file extensions (with leading dot) that are classified as images. */
const IMAGE_EXTENSIONS = new Set([
  ".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".bmp", ".tiff",
]);

/** Lower-case file extensions that make a local file a chat-export candidate. */
const CHAT_EXTENSIONS = new Set([".json", ".csv", ".jsonl"]);
33+
34+
/**
 * Detect the source type from a URL or file path.
 *
 * The input is trimmed first. Anything starting with `http://` or `https://`
 * (case-insensitive) is classified as a URL; every other string is treated as
 * a local file path.
 *
 * @param input Raw user-provided URL or path.
 * @returns The classification, with `raw` and `uri` set to the trimmed input.
 */
export function detectSource(input: string): DetectedSource {
  const trimmed = input.trim();
  const isHttpUrl = /^https?:\/\//i.test(trimmed);
  return isHttpUrl ? detectUrl(trimmed) : detectLocalFile(trimmed);
}
50+
51+
/**
 * Classify an http(s) URL.
 *
 * Checks run from most to least specific: Twitter/X, arXiv, GitHub, direct
 * PDF link, direct image link, and finally a generic webpage.
 *
 * @param url Trimmed URL string; expected to start with `http://` or `https://`.
 * @returns The classification, with `raw` and `uri` both set to `url`.
 * @throws TypeError if `url` cannot be parsed by the `URL` constructor.
 */
function detectUrl(url: string): DetectedSource {
  const parsed = new URL(url);
  const host = parsed.hostname.replace(/^www\./, "");
  const path = parsed.pathname.toLowerCase();

  // Every result shares raw/uri; only kind and meta vary.
  const build = (
    kind: DetectedSource["kind"],
    meta: DetectedSource["meta"] = {},
  ): DetectedSource => ({ kind, raw: url, uri: url, meta });

  // Twitter / X, including the mobile.* hosts that serve the same status URLs.
  if (/^(?:mobile\.)?(?:twitter|x)\.com$/.test(host)) {
    const tweetMatch = parsed.pathname.match(/\/([^/]+)\/status\/(\d+)/);
    return build(
      "tweet",
      tweetMatch ? { author: tweetMatch[1], tweetId: tweetMatch[2] } : {},
    );
  }

  // arXiv (new-style identifiers such as 2301.12345)
  if (host === "arxiv.org" || host.endsWith(".arxiv.org")) {
    const paperMatch = parsed.pathname.match(
      /\/(?:abs|pdf|html)\/(\d{4}\.\d{4,5})/
    );
    return build("arxiv", paperMatch ? { paperId: paperMatch[1] } : {});
  }

  // GitHub — delegate to existing GitHub ingestion
  if (host === "github.com") {
    return build("github");
  }

  // Direct PDF link: a .pdf path, or a content-type hint in the URL. The hint
  // is checked both raw and percent-decoded, because query strings commonly
  // encode the slash (application%2Fpdf).
  const pdfHint = "content-type=application/pdf";
  const lower = url.toLowerCase();
  let decoded = lower;
  try {
    decoded = decodeURIComponent(lower);
  } catch {
    // Malformed percent-escape: fall back to the raw lower-cased URL.
  }
  if (path.endsWith(".pdf") || lower.includes(pdfHint) || decoded.includes(pdfHint)) {
    return build("pdf");
  }

  // Direct image link
  const ext = extFrom(path);
  if (ext && IMAGE_EXTENSIONS.has(ext)) {
    return build("image");
  }

  // Default: webpage
  return build("webpage");
}
122+
123+
/**
 * Classify a local file path by its extension and, for chat exports, by its
 * file name.
 *
 * @param filePath Trimmed path string; it is not checked for existence.
 * @returns The classification, with `raw` and `uri` both set to `filePath`
 *          and an empty `meta`.
 */
function detectLocalFile(filePath: string): DetectedSource {
  const ext = extFrom(filePath);
  const build = (kind: DetectedSource["kind"]): DetectedSource => ({
    kind,
    raw: filePath,
    uri: filePath,
    meta: {},
  });

  if (ext === ".pdf") {
    return build("pdf");
  }

  if (ext !== null && IMAGE_EXTENSIONS.has(ext)) {
    return build("image");
  }

  // Chat export heuristic: JSON/CSV/JSONL files with "chat", "slack",
  // "discord", "messages", or "conversation" in the filename.
  if (ext !== null && CHAT_EXTENSIONS.has(ext)) {
    // Split on both separators so that on Windows-style paths only the file
    // name, not a parent directory such as `slack\`, is tested.
    const name = filePath.split(/[\\/]/).pop() ?? "";
    if (/chat|slack|discord|messages|conversation/i.test(name)) {
      return build("chat_export");
    }
  }

  return build("local_file");
}
146+
147+
/**
 * Return the lower-cased extension (including the leading dot) of the final
 * path segment, or `null` when that segment contains no dot.
 *
 * Only the last segment is inspected, so a dot in a parent directory
 * (e.g. `/data/v1.2/readme`) is not mistaken for an extension. Both `/` and
 * `\` are treated as segment separators.
 *
 * @param path URL pathname or file-system path.
 */
function extFrom(path: string): string | null {
  const lastSep = Math.max(path.lastIndexOf("/"), path.lastIndexOf("\\"));
  const base = path.slice(lastSep + 1);
  const dot = base.lastIndexOf(".");
  if (dot === -1) return null;
  return base.slice(dot).toLowerCase();
}

0 commit comments

Comments
 (0)