Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions Packages/OsaurusCore/AppDelegate.swift
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ public final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelega
// the specific crash class this prevents.
MLXErrorRecovery.installGlobalHandler()

// Register in-tree document format adapters before any file-ingress
// path can run. Idempotent; safe if a future migration moves this.
DocumentAdaptersBootstrap.registerBuiltIns()

// Detect repeated startup crashes and enter safe mode if needed
LaunchGuard.checkOnLaunch()

Expand Down Expand Up @@ -175,7 +179,7 @@ public final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelega
#endif

// Initialize directory access early so security-scoped bookmark is active
let _ = DirectoryPickerService.shared
_ = DirectoryPickerService.shared

if LaunchGuard.isSafeMode {
NotificationService.shared.postSafeModeActive()
Expand Down Expand Up @@ -876,7 +880,7 @@ extension AppDelegate {
}

@objc private func handleServeCommand(_ note: Notification) {
var desiredPort: Int? = nil
var desiredPort: Int?
var exposeFlag: Bool = false
if let ui = note.userInfo {
if let p = ui["port"] as? Int {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
//
// DocumentAdaptersBootstrap.swift
// osaurus
//
// Registers the in-tree document adapters with `DocumentFormatRegistry.shared`
// exactly once, at app launch. Kept separate from `AppDelegate` so tests can
// opt into the same registration (or opt out of it entirely) without dragging
// in `NSApplication`.
//

import Foundation

/// Namespace for registering the in-tree document format adapters.
public enum DocumentAdaptersBootstrap {
    /// Serialises every access to `didRegisterShared`.
    private static let lock = NSLock()

    // Only read or written while `lock` is held; `nonisolated(unsafe)` follows
    // the project pattern for lock-protected process-global state
    // (see `OsaurusPaths.overrideRoot`).
    nonisolated(unsafe) private static var didRegisterShared = false

    /// Registers `PlainTextAdapter`, `PDFAdapter` and `RichDocumentAdapter`
    /// with `registry`.
    ///
    /// For `DocumentFormatRegistry.shared` the registration happens at most
    /// once per process; later calls return without touching the registry.
    /// Any other registry instance receives the three adapters on every call.
    ///
    /// - Parameter registry: The registry to populate. Defaults to the shared
    ///   instance.
    public static func registerBuiltIns(registry: DocumentFormatRegistry = .shared) {
        lock.lock()
        defer { lock.unlock() }

        let targetsSharedRegistry = registry === DocumentFormatRegistry.shared
        guard !(targetsSharedRegistry && didRegisterShared) else { return }

        registry.register(adapter: PlainTextAdapter())
        registry.register(adapter: PDFAdapter())
        registry.register(adapter: RichDocumentAdapter())

        // Remember the one-time registration only for the shared registry.
        if targetsSharedRegistry {
            didRegisterShared = true
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
//
// PlainTextRepresentation.swift
// osaurus
//
// Default representation for adapters that extract a single text string.
// Every adapter has to publish *some* `StructuredRepresentation`; the
// wrappers around `PDFKit` text extraction and `NSAttributedString` don't
// preserve any format-native structure, so they emit this shape. The
// real typed representations (`Workbook`, `WordDocument`, …) replace it
// per-format as higher-fidelity adapters land.
//

import Foundation

/// A `StructuredRepresentation` that carries nothing but a single string.
public struct PlainTextRepresentation: StructuredRepresentation, Sendable {
    /// The text held by this representation.
    public let text: String

    /// Creates a representation wrapping `text` unchanged.
    ///
    /// - Parameter text: The string to store.
    public init(text: String) {
        self.text = text
    }
}
68 changes: 68 additions & 0 deletions Packages/OsaurusCore/Services/Documents/PDFAdapter.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
//
// PDFAdapter.swift
// osaurus
//
// Wraps the text-layer extraction path in `DocumentParser.parsePDFWithFallback`.
// Intentionally does NOT cover the image-rendering fallback — when a PDF has
// no extractable text, this adapter throws `.emptyContent` and the
// `DocumentParser` shim falls through to the legacy switch, which still
// renders each page as PNG. Moving that path onto the adapter surface is
// deferred to stage-4 PR 8 (layout-aware table extraction), where the
// typed `PDFDocument` representation gets introduced.
//

import Foundation
import PDFKit

/// Document adapter that extracts the text layer of a PDF through PDFKit.
public struct PDFAdapter: DocumentFormatAdapter {
    public let formatId = "pdf"

    public init() {}

    /// Claims any URL whose path extension is `pdf`, compared case-insensitively.
    /// The `uti` argument is not consulted.
    public func canHandle(url: URL, uti: String?) -> Bool {
        let ext = url.pathExtension.lowercased()
        return ext == "pdf"
    }

    /// Reads the PDF at `url` and returns its text as a `PlainTextRepresentation`.
    ///
    /// - Parameters:
    ///   - url: Location of the PDF file.
    ///   - sizeLimit: Maximum accepted file size in bytes; values `<= 0`
    ///     disable the check.
    /// - Returns: A `StructuredDocument` whose representation and
    ///   `textFallback` both hold the (possibly truncated) extracted text.
    /// - Throws: `DocumentAdapterError.sizeLimitExceeded` when the file is
    ///   larger than `sizeLimit`, `.readFailed` when PDFKit cannot open the
    ///   file, and `.emptyContent` when no page yields non-whitespace text.
    public func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument {
        // A size that cannot be read is treated as 0, so it never trips the limit.
        let resourceValues = try? url.resourceValues(forKeys: [.fileSizeKey])
        let fileSize = Int64(resourceValues?.fileSize ?? 0)
        guard sizeLimit <= 0 || fileSize <= sizeLimit else {
            throw DocumentAdapterError.sizeLimitExceeded(actual: fileSize, limit: sizeLimit)
        }

        guard let pdf = PDFDocument(url: url) else {
            throw DocumentAdapterError.readFailed(underlying: "PDFKit could not open document")
        }

        let pageText = Self.extractText(from: pdf)
        let hasVisibleText = !pageText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
        guard hasVisibleText else {
            // No text layer: report that nothing was produced rather than
            // returning an empty document, so the caller can try another path.
            throw DocumentAdapterError.emptyContent
        }

        let cappedText = PlainTextAdapter.applyCharacterCap(pageText)
        let representation = AnyStructuredRepresentation(
            formatId: formatId,
            underlying: PlainTextRepresentation(text: cappedText)
        )

        return StructuredDocument(
            formatId: formatId,
            filename: url.lastPathComponent,
            fileSize: fileSize,
            representation: representation,
            textFallback: cappedText
        )
    }

    /// Joins the text of every page that has non-whitespace content, in page
    /// order, separated by a blank line. Pages that are missing or blank are
    /// skipped.
    private static func extractText(from document: PDFDocument) -> String {
        let pageIndices = 0 ..< document.pageCount
        let nonBlankPages: [String] = pageIndices.compactMap { pageIndex in
            guard let pageString = document.page(at: pageIndex)?.string else { return nil }
            let isBlank = pageString.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
            return isBlank ? nil : pageString
        }
        return nonBlankPages.joined(separator: "\n\n")
    }
}
90 changes: 90 additions & 0 deletions Packages/OsaurusCore/Services/Documents/PlainTextAdapter.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
//
// PlainTextAdapter.swift
// osaurus
//
// Wraps the existing plain-text ingress path in `DocumentParser`. Claims
// roughly the 60 extensions that were previously handled by the inline
// `case _ where isPlainText(ext:)` branch — `.txt`, `.md`, source code,
// config files, etc. Behaviour is intentionally identical to the legacy
// switch: UTF-8 first, ISO-Latin-1 retry, post-read character-count
// truncation marker. This adapter is a migration bridge, not a fidelity
// improvement.
//

import Foundation

/// Document adapter for files that are read directly as text.
public struct PlainTextAdapter: DocumentFormatAdapter {
    public let formatId = "plaintext"

    public init() {}

    /// Claims a URL when its lowercased path extension is listed in
    /// `plainTextExtensions`. The `uti` argument is not consulted.
    public func canHandle(url: URL, uti: String?) -> Bool {
        let ext = url.pathExtension.lowercased()
        return Self.plainTextExtensions.contains(ext)
    }

    /// Reads the file at `url` as text and wraps it in a `StructuredDocument`.
    ///
    /// - Parameters:
    ///   - url: Location of the file to read.
    ///   - sizeLimit: Maximum accepted file size in bytes; values `<= 0`
    ///     disable the check.
    /// - Returns: A document whose representation and `textFallback` both
    ///   hold the (possibly truncated) file contents.
    /// - Throws: `DocumentAdapterError.sizeLimitExceeded` when the file is
    ///   larger than `sizeLimit`, `.readFailed` when the contents cannot be
    ///   read, and `.emptyContent` when the file holds only whitespace.
    public func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument {
        // A size that cannot be read is treated as 0, so it never trips the limit.
        let resourceValues = try? url.resourceValues(forKeys: [.fileSizeKey])
        let fileSize = Int64(resourceValues?.fileSize ?? 0)
        guard sizeLimit <= 0 || fileSize <= sizeLimit else {
            throw DocumentAdapterError.sizeLimitExceeded(actual: fileSize, limit: sizeLimit)
        }

        let contents = try Self.readContents(of: url)
        let hasVisibleText = !contents.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
        guard hasVisibleText else {
            throw DocumentAdapterError.emptyContent
        }

        let cappedText = Self.applyCharacterCap(contents)
        let representation = AnyStructuredRepresentation(
            formatId: formatId,
            underlying: PlainTextRepresentation(text: cappedText)
        )

        return StructuredDocument(
            formatId: formatId,
            filename: url.lastPathComponent,
            fileSize: fileSize,
            representation: representation,
            textFallback: cappedText
        )
    }

    // MARK: - Helpers

    /// Decodes the file as UTF-8 and, if that throws, retries by reading the
    /// raw bytes and decoding them as ISO Latin-1.
    ///
    /// - Throws: `DocumentAdapterError.readFailed` carrying the description
    ///   of the original UTF-8 error when the retry also yields nothing.
    private static func readContents(of url: URL) throws -> String {
        do {
            return try String(contentsOf: url, encoding: .utf8)
        } catch let utf8Error {
            // Second attempt for files that are mostly text with a few
            // non-UTF-8 bytes.
            if let bytes = try? Data(contentsOf: url),
                let latin1Text = String(data: bytes, encoding: .isoLatin1)
            {
                return latin1Text
            }
            throw DocumentAdapterError.readFailed(underlying: utf8Error.localizedDescription)
        }
    }

    /// Limits `text` to 500,000 characters.
    ///
    /// Text at or under the cap is returned untouched. Longer text is cut to
    /// its first 500,000 characters and followed by a blank line and a
    /// bracketed truncation notice. This check is separate from the byte
    /// `sizeLimit` applied in `parse`.
    static func applyCharacterCap(_ text: String) -> String {
        let cap = 500_000
        if text.count <= cap {
            return text
        }
        let head = String(text.prefix(cap))
        let notice = "\n\n[Document truncated — exceeded \(cap) character limit]"
        return head + notice
    }

    /// Lowercased path extensions this adapter accepts.
    ///
    /// NOTE(review): entries such as `dockerfile` are matched against
    /// `url.pathExtension` only, so they apply to names like
    /// `build.dockerfile`; a bare `Dockerfile` presumably has an empty
    /// extension and would not match — confirm this is the intended (legacy)
    /// behaviour.
    static let plainTextExtensions: Set<String> = [
        // Prose and tabular text
        "txt", "md", "markdown", "csv", "tsv",
        // Structured data and configuration
        "json", "xml", "yaml", "yml", "toml",
        "log", "ini", "cfg", "conf", "env",
        // Source code
        "swift", "py", "js", "ts", "tsx", "jsx",
        "rs", "go", "java", "kt", "c", "cpp", "h", "hpp",
        "rb", "php", "sh", "bash", "zsh", "fish",
        "css", "scss", "less", "sql",
        "r", "m", "mm", "lua", "pl", "ex", "exs",
        "zig", "nim", "dart", "scala", "groovy",
        // Infrastructure and tooling
        "tf", "hcl", "dockerfile",
        "gitignore", "editorconfig", "prettierrc",
    ]
}
84 changes: 84 additions & 0 deletions Packages/OsaurusCore/Services/Documents/RichDocumentAdapter.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
//
// RichDocumentAdapter.swift
// osaurus
//
// Wraps the `NSAttributedString(url:documentType:)` path in
// `DocumentParser.parseRichDocument`. A single adapter covers DOCX, DOC,
// RTF, RTFD, and HTML today because they share the same underlying
// framework call and produce the same plain-text output. When stage-4
// PR 11 lands a high-fidelity DOCX reader (tables, tracked changes,
// comments) this adapter splits along format lines and this one becomes
// the RTF/HTML-only path.
//

import AppKit
import Foundation

/// Document adapter for rich-text formats that are read through
/// `NSAttributedString(url:options:documentAttributes:)`.
///
/// Only the plain string of the attributed result is kept; formatting and
/// attachments are discarded.
public struct RichDocumentAdapter: DocumentFormatAdapter {
    public let formatId = "richdoc"

    public init() {}

    /// Claims a URL when its lowercased path extension is listed in
    /// `supportedExtensions`. The `uti` argument is not consulted.
    public func canHandle(url: URL, uti: String?) -> Bool {
        Self.supportedExtensions.contains(url.pathExtension.lowercased())
    }

    /// Reads the document at `url` and returns its plain text.
    ///
    /// - Parameters:
    ///   - url: Location of the document.
    ///   - sizeLimit: Maximum accepted file size in bytes; values `<= 0`
    ///     disable the check.
    /// - Returns: A `StructuredDocument` whose representation and
    ///   `textFallback` both hold the (possibly truncated) extracted text.
    /// - Throws: `DocumentAdapterError.sizeLimitExceeded` when the file is
    ///   larger than `sizeLimit`, `.readFailed` when `NSAttributedString`
    ///   cannot read the document, and `.emptyContent` when the extracted
    ///   text is only whitespace.
    public func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument {
        // A size that cannot be read is treated as 0, so it never trips the limit.
        // NOTE(review): `.rtfd` documents can be directory packages, for which
        // `.fileSizeKey` may not describe the whole package — confirm the
        // limit behaves as intended for them.
        let fileSize = Int64((try? url.resourceValues(forKeys: [.fileSizeKey]))?.fileSize ?? 0)
        if sizeLimit > 0, fileSize > sizeLimit {
            throw DocumentAdapterError.sizeLimitExceeded(actual: fileSize, limit: sizeLimit)
        }

        let documentType = Self.documentType(forExtension: url.pathExtension.lowercased())
        let extracted: String
        do {
            // Pass an explicit document type only when one is known;
            // otherwise the reader is left to detect the format itself.
            var options: [NSAttributedString.DocumentReadingOptionKey: Any] = [:]
            if let documentType {
                options[.documentType] = documentType
            }
            // NOTE(review): Apple's documentation says the HTML importer
            // should not be called from a background thread. This method is
            // `async` with no actor isolation visible here — confirm which
            // executor `.html` reads end up on.
            let attributed = try NSAttributedString(
                url: url,
                options: options,
                documentAttributes: nil
            )
            extracted = attributed.string
        } catch {
            throw DocumentAdapterError.readFailed(underlying: error.localizedDescription)
        }

        guard !extracted.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
            throw DocumentAdapterError.emptyContent
        }

        let truncated = PlainTextAdapter.applyCharacterCap(extracted)

        return StructuredDocument(
            formatId: formatId,
            filename: url.lastPathComponent,
            fileSize: fileSize,
            representation: AnyStructuredRepresentation(
                formatId: formatId,
                underlying: PlainTextRepresentation(text: truncated)
            ),
            textFallback: truncated
        )
    }

    // MARK: - Helpers

    /// Lowercased path extensions this adapter accepts.
    static let supportedExtensions: Set<String> = [
        "docx", "doc", "rtf", "rtfd", "html", "htm",
    ]

    /// Maps a lowercased path extension to the document type handed to
    /// `NSAttributedString`, or `nil` to leave the format to auto-detection.
    ///
    /// `rtfd` maps to `.rtfd` rather than `.rtf`: RTFD is RTF with
    /// attachments and has its own document type, so forcing the plain RTF
    /// reader onto it would select the wrong importer.
    private static func documentType(
        forExtension ext: String
    ) -> NSAttributedString.DocumentType? {
        switch ext {
        case "docx": return nil  // NSAttributedString auto-detects OOXML
        case "doc": return .docFormat
        case "rtf": return .rtf
        case "rtfd": return .rtfd
        case "html", "htm": return .html
        default: return nil
        }
    }
}
Loading
Loading