Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 108 additions & 0 deletions Packages/OsaurusCore/Managers/Documents/DocumentFormatRegistry.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
//
// DocumentFormatRegistry.swift
// osaurus
//
// Process-wide routing table from a URL (or a `StructuredDocument`) to the
// adapter / emitter / streamer responsible for it. Adapters are registered
// once — at app launch for in-tree formats, at plugin load for plugin-
// provided formats — and looked up every time a file is ingested or an
// artifact is emitted, so the hot path here is the lookup, not registration.
//
// Thread safety: the registry guards its internal state with an `NSLock`
// rather than `@MainActor` isolation. Attachment ingress happens on the
// main actor today, but the agent tool surface (PR 7 in the stage-4
// roadmap) runs off the main actor, and we don't want every tool call to
// pay an `await`-hop just to look up an adapter.
//

import Foundation

/// Process-wide routing table that maps a URL or a `StructuredDocument` to
/// the adapter, emitter, or streamer registered for it.
///
/// All mutable state is guarded by `lock`, which is what backs the
/// `@unchecked Sendable` conformance. Lookups copy the candidate list while
/// holding the lock and evaluate the `canHandle` / `canEmit` predicates only
/// after releasing it: `NSLock` is not recursive, so a predicate that called
/// back into the registry would otherwise deadlock, and a slow predicate
/// would stall every other thread waiting on the registry.
public final class DocumentFormatRegistry: @unchecked Sendable {
    public static let shared = DocumentFormatRegistry()

    private let lock = NSLock()

    // Adapters and emitters keep insertion order; lookup walks them from the
    // end so the most recently-registered claimant wins ties. That lets a
    // plugin override a built-in for a specific URL without having to
    // unregister first.
    private var adapters: [any DocumentFormatAdapter] = []
    private var emitters: [any DocumentFormatEmitter] = []

    // Streamers are keyed by format id: registering a second streamer for
    // the same id replaces the first rather than stacking on top of it.
    private var streamersByFormatId: [String: any DocumentFormatStreamer] = [:]

    /// `public` so tests can spin up an isolated registry without touching
    /// `shared`. Production code should always use `shared`.
    public init() {}

    /// Runs `body` while holding `lock` and returns its result. The lock is
    /// released on every exit path, including a thrown error.
    private func synchronized<T>(_ body: () throws -> T) rethrows -> T {
        lock.lock()
        defer { lock.unlock() }
        return try body()
    }

    // MARK: - Registration

    /// Appends `adapter` to the adapter list. Later registrations are
    /// consulted before earlier ones during lookup.
    public func register(adapter: any DocumentFormatAdapter) {
        synchronized { adapters.append(adapter) }
    }

    /// Appends `emitter` to the emitter list. Later registrations are
    /// consulted before earlier ones during lookup.
    public func register(emitter: any DocumentFormatEmitter) {
        synchronized { emitters.append(emitter) }
    }

    /// Stores `streamer` under its `formatId`, replacing any streamer that
    /// was previously registered for the same id.
    public func register(streamer: any DocumentFormatStreamer) {
        synchronized { streamersByFormatId[streamer.formatId] = streamer }
    }

    /// Removes every registration (adapter, emitter, streamer) whose
    /// `formatId` matches. Used by plugin unload and by tests that want a
    /// clean slate.
    ///
    /// - Parameter formatId: The format id to purge. Compared with `==`, so
    ///   the match is case-sensitive.
    /// - Returns: `true` if anything was actually removed.
    @discardableResult
    public func unregisterAll(formatId: String) -> Bool {
        lock.lock()
        defer { lock.unlock() }

        let adapterCountBefore = adapters.count
        let emitterCountBefore = emitters.count

        adapters.removeAll { $0.formatId == formatId }
        emitters.removeAll { $0.formatId == formatId }
        let removedStreamer = streamersByFormatId.removeValue(forKey: formatId) != nil

        // Counts can only shrink here, so any difference means a removal.
        return removedStreamer
            || adapters.count != adapterCountBefore
            || emitters.count != emitterCountBefore
    }

    // MARK: - Lookup

    /// Returns the most-recently-registered adapter whose `canHandle`
    /// accepts the URL. `nil` when nothing claims it — callers can
    /// decide whether to fall through to a legacy path or throw
    /// `DocumentAdapterError.unsupportedFormat`.
    ///
    /// `canHandle` is invoked outside the registry lock, against a snapshot
    /// of the adapters registered at the moment of the call.
    ///
    /// - Parameters:
    ///   - url: Location of the file to be ingested.
    ///   - uti: Optional type identifier, forwarded verbatim to `canHandle`.
    public func adapter(for url: URL, uti: String? = nil) -> (any DocumentFormatAdapter)? {
        // Copying the array is a cheap copy-on-write reference bump.
        let candidates = synchronized { adapters }
        return candidates.last(where: { $0.canHandle(url: url, uti: uti) })
    }

    /// Returns the most-recently-registered emitter whose `canEmit` accepts
    /// `document`, or `nil` when no emitter claims it.
    ///
    /// `canEmit` is invoked outside the registry lock, against a snapshot of
    /// the emitters registered at the moment of the call.
    public func emitter(for document: StructuredDocument) -> (any DocumentFormatEmitter)? {
        let candidates = synchronized { emitters }
        return candidates.last(where: { $0.canEmit(document) })
    }

    /// Returns the streamer registered under exactly `id`, or `nil` when
    /// there is none.
    public func streamer(forFormatId id: String) -> (any DocumentFormatStreamer)? {
        synchronized { streamersByFormatId[id] }
    }

    // MARK: - Introspection

    /// Union of format ids currently registered across adapters, emitters,
    /// and streamers. Useful for plugin-host diagnostics and for tests.
    public func registeredFormatIds() -> Set<String> {
        // Snapshot under the lock, then read `formatId` without holding it.
        let (adapterSnapshot, emitterSnapshot, streamerIds) = synchronized {
            (adapters, emitters, Array(streamersByFormatId.keys))
        }
        var ids = Set(streamerIds)
        ids.formUnion(adapterSnapshot.map { $0.formatId })
        ids.formUnion(emitterSnapshot.map { $0.formatId })
        return ids
    }
}
37 changes: 37 additions & 0 deletions Packages/OsaurusCore/Models/Documents/DocumentAdapterError.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
//
// DocumentAdapterError.swift
// osaurus
//
// Shared error surface for every document format adapter, emitter, and
// streamer. Callers that don't care about the specific failure can still
// catch the common cases (size, cancellation) at the protocol boundary
// without per-format knowledge.
//

import Foundation

/// Error surface shared by document adapters, emitters, and streamers.
/// Each case maps to one human-readable message via `errorDescription`.
public enum DocumentAdapterError: LocalizedError, Sendable {
    /// Nothing is registered for the given format id.
    case unsupportedFormat(formatId: String)
    /// The input is larger than the permitted cap; both values are in bytes.
    case sizeLimitExceeded(actual: Int64, limit: Int64)
    /// Reading failed; `underlying` carries the reason as text.
    case readFailed(underlying: String)
    /// Writing failed; `underlying` carries the reason as text.
    case writeFailed(underlying: String)
    /// The document was read but held no readable content.
    case emptyContent
    /// The operation was cancelled before it finished.
    case cancelled

    /// Human-readable description of the failure. Never `nil` for the cases
    /// declared here.
    public var errorDescription: String? {
        let message: String
        switch self {
        case let .unsupportedFormat(id):
            message = "No registered adapter for format '\(id)'"
        case let .sizeLimitExceeded(actual, limit):
            message = "File exceeds size limit (\(actual) bytes > \(limit) bytes)"
        case let .readFailed(reason):
            message = "Document read failed: \(reason)"
        case let .writeFailed(reason):
            message = "Document write failed: \(reason)"
        case .emptyContent:
            message = "Document contains no readable content"
        case .cancelled:
            message = "Document parse was cancelled"
        }
        return message
    }
}
34 changes: 34 additions & 0 deletions Packages/OsaurusCore/Models/Documents/DocumentFormatAdapter.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
//
// DocumentFormatAdapter.swift
// osaurus
//
// Read-side handler for a single file format. Adapters are registered
// with `DocumentFormatRegistry` at app launch (for in-tree adapters) or
// at plugin load (for plugin-provided adapters), then looked up every
// time a file is ingested. `canHandle` is kept separate from `parse` so
// the registry can iterate candidates without paying parse cost on every
// file type that doesn't match.
//

import Foundation

/// Read-side handler for a single file format. Conforming types must be
/// `Sendable`.
public protocol DocumentFormatAdapter: Sendable {
/// Stable identifier used for logging and as the plugin registration key.
/// `DocumentFormatRegistry` also uses it to remove registrations
/// (`unregisterAll(formatId:)`) and to report what is registered. Ties
/// between adapters that accept the same URL are resolved by registration
/// order, not by this id. Examples: "xlsx", "docx", "pdf", "csv".
var formatId: String { get }

/// Lightweight precondition check. Must NOT open the file; it is called
/// before `parse` as the registry enumerates adapters. The registry walks
/// its adapters newest-first and stops at the first one that returns
/// `true`, so an adapter that narrows (e.g. "I only handle PDFs with a
/// tagged text layer") returns `false` here to let the lookup fall through
/// to a more permissive adapter that was registered before it.
///
/// - Parameters:
///   - url: Location of the candidate file.
///   - uti: Optional type identifier supplied by the caller; may be `nil`.
/// - Returns: `true` if this adapter claims the file.
func canHandle(url: URL, uti: String?) -> Bool

/// Parse a file into its typed representation. Adapters that read the
/// whole file into memory must throw
/// `DocumentAdapterError.sizeLimitExceeded` when the file exceeds
/// `sizeLimit`. Streaming adapters may return a `StructuredDocument`
/// whose representation carries an async stream (see CSVTable in
/// stage-4 PR 4).
///
/// - Parameters:
///   - url: Location of the file to parse.
///   - sizeLimit: Byte ceiling for in-memory parsing. Expected to be a
///     per-format cap taken from `DocumentLimits`.
///     NOTE(review): the original comment says the registry supplies this
///     value, but `DocumentFormatRegistry` only performs lookup and never
///     calls `parse` — confirm which caller is responsible.
/// - Returns: The parsed document.
/// - Throws: `DocumentAdapterError.sizeLimitExceeded` as described above;
///   other errors are adapter-specific.
func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument
}
25 changes: 25 additions & 0 deletions Packages/OsaurusCore/Models/Documents/DocumentFormatEmitter.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
//
// DocumentFormatEmitter.swift
// osaurus
//
// Write-side handler for a single file format. Deliberately split from
// `DocumentFormatAdapter` so read-only formats (XLS, OFX 1.x, PPTX in
// the stage-2 priority list) don't have to carry a stub writer. Sandbox
// containment and destination checking live in the caller
// (`ShareArtifactTool` today); emitters are just byte producers.
//

import Foundation

/// Write-side handler for a single file format. Conforming types must be
/// `Sendable`.
public protocol DocumentFormatEmitter: Sendable {
/// Identifier of the format this emitter writes. `DocumentFormatRegistry`
/// matches on it when removing registrations and when listing registered
/// format ids.
var formatId: String { get }

/// Keyed on the concrete shape of `document.representation`, not on
/// the file extension — an emitter produces exactly one representation
/// shape, so the registry uses this hook to pick the right writer.
///
/// - Parameter document: The document a caller wants to write out.
/// - Returns: `true` if this emitter can write `document`.
func canEmit(_ document: StructuredDocument) -> Bool

/// Write the document to `url`. The caller is responsible for having
/// already resolved and contained `url`; the emitter writes raw bytes.
///
/// - Parameters:
///   - document: The document to serialise.
///   - url: Destination to write to; not validated by the emitter.
/// - Throws: Emitter-specific errors when the write cannot be completed.
func emit(_ document: StructuredDocument, to url: URL) async throws
}
25 changes: 25 additions & 0 deletions Packages/OsaurusCore/Models/Documents/DocumentFormatStreamer.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
//
// DocumentFormatStreamer.swift
// osaurus
//
// Optional streaming read for formats where whole-file-into-memory is
// not viable — multi-GB CSVs, page-by-page PDFs, row-by-row XLSX.
// Orthogonal to `DocumentFormatAdapter` so small formats (QIF, MT940)
// can skip the streaming surface entirely. Callers that can back-
// pressure (the agent tool surface in particular) prefer streaming when
// both an adapter and a streamer are registered for the same format.
//

import Foundation

/// Optional streaming reader for a single file format. Conforming types
/// must be `Sendable`.
public protocol DocumentFormatStreamer: Sendable {
/// The format-native record type produced by `stream(url:)`.
associatedtype Element: Sendable

/// Identifier of the format this streamer reads. `DocumentFormatRegistry`
/// keys streamers by this value, so it holds one streamer per id.
var formatId: String { get }

/// Stream format-native records out of the file. `AsyncThrowingStream`
/// gives callers cancellation and back-pressure without bespoke
/// plumbing; adapters that can't produce records incrementally should
/// not conform to this protocol at all.
///
/// NOTE(review): back-pressure only holds when the stream is pull-based
/// (e.g. built with `AsyncThrowingStream(unfolding:)`). A stream built
/// with the continuation-based initializer buffers according to its
/// `bufferingPolicy`, which is unbounded by default — implementers should
/// pick the construction deliberately.
///
/// - Parameter url: Location of the file to read.
/// - Returns: A stream of records that finishes, or throws, when reading
///   ends.
func stream(url: URL) -> AsyncThrowingStream<Element, Error>
}
38 changes: 38 additions & 0 deletions Packages/OsaurusCore/Models/Documents/DocumentLimits.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
//
// DocumentLimits.swift
// osaurus
//
// Per-format byte ceilings applied to in-memory parsing. Streaming
// adapters are not bound by these caps — they negotiate back-pressure
// with their caller — but any adapter that reads the whole file into
// memory must honour the limit returned by `limit(forFormatId:)`.
//
// The numeric defaults here are intentionally generous compared to the
// 500 KB text cap on the legacy `DocumentParser`. They exist to prevent
// OOM under adversarial input, not to shape the user-facing attachment
// experience; the chat attachment flow keeps its own smaller caps.
//

import Foundation

/// Namespace for the per-format byte ceilings applied to in-memory parsing.
public enum DocumentLimits {
    /// Bytes in one mebibyte; every cap below is a whole multiple of it.
    private static let mebibyte: Int64 = 1024 * 1024

    /// Cap for plain-text and Markdown input (5 MiB).
    public static let plainText: Int64 = 5 * mebibyte
    /// Cap for delimiter-separated tables (25 MiB).
    public static let csv: Int64 = 25 * mebibyte
    /// Cap for spreadsheet workbooks (50 MiB).
    public static let xlsx: Int64 = 50 * mebibyte
    /// Cap for PDF files (100 MiB).
    public static let pdf: Int64 = 100 * mebibyte
    /// Cap for word-processor documents (50 MiB).
    public static let docx: Int64 = 50 * mebibyte

    /// Fallback for formats that haven't been assigned a tuned cap (10 MiB).
    public static let defaultLimit: Int64 = 10 * mebibyte

    /// Lower-cased format id → tuned cap. Ids missing from this table get
    /// `defaultLimit`.
    private static let tunedLimits: [String: Int64] = [
        "plaintext": plainText,
        "text": plainText,
        "txt": plainText,
        "md": plainText,
        "markdown": plainText,
        "csv": csv,
        "tsv": csv,
        "xlsx": xlsx,
        "xls": xlsx,
        "ods": xlsx,
        "pdf": pdf,
        "docx": docx,
        "doc": docx,
        "rtf": docx,
    ]

    /// Returns the byte ceiling for the given format id.
    ///
    /// - Parameter id: Format identifier; matched case-insensitively.
    /// - Returns: The tuned cap for a known id, otherwise `defaultLimit`.
    public static func limit(forFormatId id: String) -> Int64 {
        guard let tuned = tunedLimits[id.lowercased()] else {
            return defaultLimit
        }
        return tuned
    }
}
56 changes: 56 additions & 0 deletions Packages/OsaurusCore/Models/Documents/StructuredDocument.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
//
// StructuredDocument.swift
// osaurus
//
// Typed parse result that carries both a format-native representation
// AND a plain-text fallback. The fallback is load-bearing because the
// existing chat attachment flow consumes
// `Attachment.Kind.document(content: String, …)`; keeping a text view
// on every parsed document lets adapters migrate onto the typed surface
// one at a time without breaking that contract.
//

import Foundation

/// Marker protocol for per-format typed representations (`Workbook`,
/// `WordDocument`, `PDFDocument`, …). Concrete types live next to their
/// adapter under `Packages/OsaurusCore/Models/Documents/<Format>/`.
///
/// The protocol declares no requirements of its own; it refines `Sendable`,
/// so every conforming representation must itself be `Sendable`.
public protocol StructuredRepresentation: Sendable {}

/// Type-erasing container so a `StructuredDocument` can cross layers
/// (registry, tool surface, artifact pipeline) without leaking the
/// concrete representation type into every caller.
///
/// The conformance is plain, compiler-checked `Sendable`: both stored
/// properties are immutable and `Sendable` (`String`, and an existential of
/// `StructuredRepresentation`, which refines `Sendable`), so `@unchecked`
/// is not needed and would only switch the check off.
public struct AnyStructuredRepresentation: Sendable {
    /// Identifier of the format that `underlying` belongs to.
    public let formatId: String
    /// The wrapped representation; callers downcast it to the concrete
    /// type they expect.
    public let underlying: any StructuredRepresentation

    /// Wraps `underlying` together with the id of the format it came from.
    ///
    /// - Parameters:
    ///   - formatId: Identifier of the source format.
    ///   - underlying: The concrete representation to erase.
    public init(formatId: String, underlying: any StructuredRepresentation) {
        self.formatId = formatId
        self.underlying = underlying
    }
}

/// Typed parse result: a format-native representation plus a plain-text
/// fallback of the same document.
///
/// The conformance is plain, compiler-checked `Sendable`: every stored
/// property is an immutable `let` of a `Sendable` type, so `@unchecked` is
/// not needed and would only switch the check off.
public struct StructuredDocument: Sendable {
    /// Identifier of the format the document was parsed from.
    public let formatId: String
    /// Name of the source file. NOTE(review): presumably the last path
    /// component rather than a full path — confirm against the adapters.
    public let filename: String
    /// Size of the source file. NOTE(review): presumably in bytes, matching
    /// the `Int64` byte caps in `DocumentLimits` — confirm.
    public let fileSize: Int64
    /// Type-erased, format-native representation of the content.
    public let representation: AnyStructuredRepresentation
    /// Plain-text view of the content, kept so callers that only consume
    /// text keep working while adapters move to typed representations.
    public let textFallback: String
    /// Timestamp attached to this value; defaults to the moment the
    /// initializer runs.
    public let createdAt: Date

    /// Memberwise initializer.
    ///
    /// - Parameters:
    ///   - formatId: Identifier of the source format.
    ///   - filename: Name of the source file.
    ///   - fileSize: Size of the source file.
    ///   - representation: Type-erased, format-native representation.
    ///   - textFallback: Plain-text view of the content.
    ///   - createdAt: Timestamp for the document; defaults to `Date()`
    ///     evaluated at the call.
    public init(
        formatId: String,
        filename: String,
        fileSize: Int64,
        representation: AnyStructuredRepresentation,
        textFallback: String,
        createdAt: Date = Date()
    ) {
        self.formatId = formatId
        self.filename = filename
        self.fileSize = fileSize
        self.representation = representation
        self.textFallback = textFallback
        self.createdAt = createdAt
    }
}
Loading
Loading