diff --git a/Packages/OsaurusCore/AppDelegate.swift b/Packages/OsaurusCore/AppDelegate.swift index 175a2d095..780438e0d 100644 --- a/Packages/OsaurusCore/AppDelegate.swift +++ b/Packages/OsaurusCore/AppDelegate.swift @@ -32,6 +32,10 @@ public final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelega // the specific crash class this prevents. MLXErrorRecovery.installGlobalHandler() + // Register in-tree document format adapters before any file-ingress + // path can run. Idempotent; safe if a future migration moves this. + DocumentAdaptersBootstrap.registerBuiltIns() + // Detect repeated startup crashes and enter safe mode if needed LaunchGuard.checkOnLaunch() @@ -175,7 +179,7 @@ public final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelega #endif // Initialize directory access early so security-scoped bookmark is active - let _ = DirectoryPickerService.shared + _ = DirectoryPickerService.shared if LaunchGuard.isSafeMode { NotificationService.shared.postSafeModeActive() @@ -876,7 +880,7 @@ extension AppDelegate { } @objc private func handleServeCommand(_ note: Notification) { - var desiredPort: Int? = nil + var desiredPort: Int? var exposeFlag: Bool = false if let ui = note.userInfo { if let p = ui["port"] as? Int { diff --git a/Packages/OsaurusCore/Managers/Documents/DocumentAdaptersBootstrap.swift b/Packages/OsaurusCore/Managers/Documents/DocumentAdaptersBootstrap.swift new file mode 100644 index 000000000..03d4e14e5 --- /dev/null +++ b/Packages/OsaurusCore/Managers/Documents/DocumentAdaptersBootstrap.swift @@ -0,0 +1,34 @@ +// +// DocumentAdaptersBootstrap.swift +// osaurus +// +// Registers the in-tree document adapters with `DocumentFormatRegistry.shared` +// exactly once, at app launch. Kept separate from `AppDelegate` so tests can +// opt into the same registration (or opt out of it entirely) without dragging +// in `NSApplication`. +// + +import Foundation + +public enum DocumentAdaptersBootstrap { + private static let lock = NSLock() + // Guarded by `lock`; the `nonisolated(unsafe)` matches the project pattern + // for lock-protected process-global state (see `OsaurusPaths.overrideRoot`). + nonisolated(unsafe) private static var didRegisterShared = false + + /// Idempotent against the shared registry: safe to call from multiple + /// launch paths without producing duplicate adapter registrations. + /// Non-shared registries (tests, isolated instances) are re-registered on + /// every call so each test gets a clean baseline. + public static func registerBuiltIns(registry: DocumentFormatRegistry = .shared) { + lock.lock() + defer { lock.unlock() } + if registry === DocumentFormatRegistry.shared, didRegisterShared { return } + registry.register(adapter: PlainTextAdapter()) + registry.register(adapter: PDFAdapter()) + registry.register(adapter: RichDocumentAdapter()) + if registry === DocumentFormatRegistry.shared { + didRegisterShared = true + } + } +} diff --git a/Packages/OsaurusCore/Models/Documents/PlainTextRepresentation.swift b/Packages/OsaurusCore/Models/Documents/PlainTextRepresentation.swift new file mode 100644 index 000000000..f53c2348c --- /dev/null +++ b/Packages/OsaurusCore/Models/Documents/PlainTextRepresentation.swift @@ -0,0 +1,18 @@ +// +// PlainTextRepresentation.swift +// osaurus +// +// Default representation for adapters that extract a single text string. +// Every adapter has to publish *some* `StructuredRepresentation`; the +// wrappers around `PDFKit` text extraction and `NSAttributedString` don't +// preserve any format-native structure, so they emit this shape. The +// real typed representations (`Workbook`, `WordDocument`, …) replace it +// per-format as higher-fidelity adapters land. +// + +import Foundation + +public struct PlainTextRepresentation: StructuredRepresentation, Sendable { + public let text: String + public init(text: String) { self.text = text } +} diff --git a/Packages/OsaurusCore/Services/Documents/PDFAdapter.swift b/Packages/OsaurusCore/Services/Documents/PDFAdapter.swift new file mode 100644 index 000000000..122006ad4 --- /dev/null +++ b/Packages/OsaurusCore/Services/Documents/PDFAdapter.swift @@ -0,0 +1,68 @@ +// +// PDFAdapter.swift +// osaurus +// +// Wraps the text-layer extraction path in `DocumentParser.parsePDFWithFallback`. +// Intentionally does NOT cover the image-rendering fallback — when a PDF has +// no extractable text, this adapter throws `.emptyContent` and the +// `DocumentParser` shim falls through to the legacy switch, which still +// renders each page as PNG. Moving that path onto the adapter surface is +// deferred to stage-4 PR 8 (layout-aware table extraction), where the +// typed `PDFDocument` representation gets introduced. +// + +import Foundation +import PDFKit + +public struct PDFAdapter: DocumentFormatAdapter { + public let formatId = "pdf" + + public init() {} + + public func canHandle(url: URL, uti: String?) -> Bool { + url.pathExtension.lowercased() == "pdf" + } + + public func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument { + let fileSize = Int64((try? url.resourceValues(forKeys: [.fileSizeKey]))?.fileSize ?? 0) + if sizeLimit > 0, fileSize > sizeLimit { + throw DocumentAdapterError.sizeLimitExceeded(actual: fileSize, limit: sizeLimit) + } + + guard let document = PDFDocument(url: url) else { + throw DocumentAdapterError.readFailed(underlying: "PDFKit could not open document") + } + + let extracted = Self.extractText(from: document) + guard !extracted.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { + // No text layer — let the shim fall through to the legacy image- + // render fallback. Don't claim a result we can't produce. + throw DocumentAdapterError.emptyContent + } + + let truncated = PlainTextAdapter.applyCharacterCap(extracted) + + return StructuredDocument( + formatId: formatId, + filename: url.lastPathComponent, + fileSize: fileSize, + representation: AnyStructuredRepresentation( + formatId: formatId, + underlying: PlainTextRepresentation(text: truncated) + ), + textFallback: truncated + ) + } + + private static func extractText(from document: PDFDocument) -> String { + var pages: [String] = [] + for index in 0 ..< document.pageCount { + guard let page = document.page(at: index), + let text = page.string, + !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty + else { continue } + pages.append(text) + } + return pages.joined(separator: "\n\n") + } +} diff --git a/Packages/OsaurusCore/Services/Documents/PlainTextAdapter.swift b/Packages/OsaurusCore/Services/Documents/PlainTextAdapter.swift new file mode 100644 index 000000000..1df764eb0 --- /dev/null +++ b/Packages/OsaurusCore/Services/Documents/PlainTextAdapter.swift @@ -0,0 +1,90 @@ +// +// PlainTextAdapter.swift +// osaurus +// +// Wraps the existing plain-text ingress path in `DocumentParser`. Claims +// roughly the 60 extensions that were previously handled by the inline +// `case _ where isPlainText(ext:)` branch — `.txt`, `.md`, source code, +// config files, etc. Behaviour is intentionally identical to the legacy +// switch: UTF-8 first, ISO-Latin-1 retry, post-read character-count +// truncation marker. This adapter is a migration bridge, not a fidelity +// improvement. +// + +import Foundation + +public struct PlainTextAdapter: DocumentFormatAdapter { + public let formatId = "plaintext" + + public init() {} + + public func canHandle(url: URL, uti: String?) -> Bool { + Self.plainTextExtensions.contains(url.pathExtension.lowercased()) + } + + public func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument { + let fileSize = Int64((try? url.resourceValues(forKeys: [.fileSizeKey]))?.fileSize ?? 0) + if sizeLimit > 0, fileSize > sizeLimit { + throw DocumentAdapterError.sizeLimitExceeded(actual: fileSize, limit: sizeLimit) + } + + let rawContent: String + do { + rawContent = try String(contentsOf: url, encoding: .utf8) + } catch { + // Fall back to latin-1 for files that are "mostly text" with a few + // non-UTF-8 bytes — same behaviour as the legacy path. + guard let data = try? Data(contentsOf: url), + let decoded = String(data: data, encoding: .isoLatin1) + else { + throw DocumentAdapterError.readFailed(underlying: error.localizedDescription) + } + rawContent = decoded + } + + guard !rawContent.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { + throw DocumentAdapterError.emptyContent + } + + let truncated = Self.applyCharacterCap(rawContent) + + return StructuredDocument( + formatId: formatId, + filename: url.lastPathComponent, + fileSize: fileSize, + representation: AnyStructuredRepresentation( + formatId: formatId, + underlying: PlainTextRepresentation(text: truncated) + ), + textFallback: truncated + ) + } + + // MARK: - Helpers + + /// Preserves the legacy 500K-character UX — consumers already expect the + /// trailing marker when a document is truncated mid-read. The cap on + /// bytes-read is higher (see `DocumentLimits.plainText`), so the two + /// interact: oversized files are refused outright; merely long files + /// are surfaced with a truncation note. + static func applyCharacterCap(_ text: String) -> String { + let cap = 500_000 + guard text.count > cap else { return text } + return String(text.prefix(cap)) + + "\n\n[Document truncated — exceeded \(cap) character limit]" + } + + static let plainTextExtensions: Set = [ + "txt", "md", "markdown", "csv", "tsv", + "json", "xml", "yaml", "yml", "toml", + "log", "ini", "cfg", "conf", "env", + "swift", "py", "js", "ts", "tsx", "jsx", + "rs", "go", "java", "kt", "c", "cpp", "h", "hpp", + "rb", "php", "sh", "bash", "zsh", "fish", + "css", "scss", "less", "sql", + "r", "m", "mm", "lua", "pl", "ex", "exs", + "zig", "nim", "dart", "scala", "groovy", + "tf", "hcl", "dockerfile", + "gitignore", "editorconfig", "prettierrc", + ] +} diff --git a/Packages/OsaurusCore/Services/Documents/RichDocumentAdapter.swift b/Packages/OsaurusCore/Services/Documents/RichDocumentAdapter.swift new file mode 100644 index 000000000..33afe2f86 --- /dev/null +++ b/Packages/OsaurusCore/Services/Documents/RichDocumentAdapter.swift @@ -0,0 +1,84 @@ +// +// RichDocumentAdapter.swift +// osaurus +// +// Wraps the `NSAttributedString(url:documentType:)` path in +// `DocumentParser.parseRichDocument`. A single adapter covers DOCX, DOC, +// RTF, RTFD, and HTML today because they share the same underlying +// framework call and produce the same plain-text output. When stage-4 +// PR 11 lands a high-fidelity DOCX reader (tables, tracked changes, +// comments) this adapter splits along format lines and this one becomes +// the RTF/HTML-only path. +// + +import AppKit +import Foundation + +public struct RichDocumentAdapter: DocumentFormatAdapter { + public let formatId = "richdoc" + + public init() {} + + public func canHandle(url: URL, uti: String?) -> Bool { + Self.supportedExtensions.contains(url.pathExtension.lowercased()) + } + + public func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument { + let fileSize = Int64((try? url.resourceValues(forKeys: [.fileSizeKey]))?.fileSize ?? 0) + if sizeLimit > 0, fileSize > sizeLimit { + throw DocumentAdapterError.sizeLimitExceeded(actual: fileSize, limit: sizeLimit) + } + + let documentType = Self.documentType(forExtension: url.pathExtension.lowercased()) + let extracted: String + do { + var options: [NSAttributedString.DocumentReadingOptionKey: Any] = [:] + if let documentType { + options[.documentType] = documentType + } + let attributed = try NSAttributedString( + url: url, + options: options, + documentAttributes: nil + ) + extracted = attributed.string + } catch { + throw DocumentAdapterError.readFailed(underlying: error.localizedDescription) + } + + guard !extracted.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { + throw DocumentAdapterError.emptyContent + } + + let truncated = PlainTextAdapter.applyCharacterCap(extracted) + + return StructuredDocument( + formatId: formatId, + filename: url.lastPathComponent, + fileSize: fileSize, + representation: AnyStructuredRepresentation( + formatId: formatId, + underlying: PlainTextRepresentation(text: truncated) + ), + textFallback: truncated + ) + } + + // MARK: - Helpers + + static let supportedExtensions: Set = [ + "docx", "doc", "rtf", "rtfd", "html", "htm", + ] + + private static func documentType( + forExtension ext: String + ) -> NSAttributedString.DocumentType? { + switch ext { + case "docx": return nil // NSAttributedString auto-detects OOXML + case "doc": return .docFormat + case "rtf", "rtfd": return .rtf + case "html", "htm": return .html + default: return nil + } + } +} diff --git a/Packages/OsaurusCore/Tests/Documents/DocumentParserShimTests.swift b/Packages/OsaurusCore/Tests/Documents/DocumentParserShimTests.swift new file mode 100644 index 000000000..6bc3bd938 --- /dev/null +++ b/Packages/OsaurusCore/Tests/Documents/DocumentParserShimTests.swift @@ -0,0 +1,124 @@ +// +// DocumentParserShimTests.swift +// osaurusTests +// +// Integration tests for the `DocumentParser.parseAll` shim: verifies that +// the registry is consulted first, that `.emptyContent` from a registered +// adapter falls through to the legacy switch, and that errors bubble up +// translated into the legacy `ParseError` surface. Uses the shared +// registry (register + `unregisterAll` in teardown) so the shim's call +// site is exactly the one reached from production. +// + +import Foundation +import Testing + +@testable import OsaurusCore + +@Suite("DocumentParser.parseAll registry shim", .serialized) +struct DocumentParserShimTests { + + // A fixture-extension adapter so tests don't collide with built-ins. + private static let fixtureFormatId = "test-fixture-shim" + private static let fixtureExtension = "fixtureshim" + + private func registerFixture(content: String) { + DocumentFormatRegistry.shared.register( + adapter: FixtureAdapter( + formatId: Self.fixtureFormatId, + extensions: [Self.fixtureExtension], + produce: content + ) + ) + } + + private func cleanUp() { + DocumentFormatRegistry.shared.unregisterAll(formatId: Self.fixtureFormatId) + } + + // MARK: - Routing + + @Test func parseAll_routesThroughRegistry_whenAdapterClaims() throws { + registerFixture(content: "routed-through-registry") + defer { cleanUp() } + + let url = try writeFile(content: "ignored", ext: Self.fixtureExtension) + defer { try? FileManager.default.removeItem(at: url) } + + let attachments = try DocumentParser.parseAll(url: url) + #expect(attachments.count == 1) + #expect(attachments.first?.documentContent == "routed-through-registry") + } + + @Test func parseAll_fallsThroughOnEmptyContent() throws { + // Fixture adapter with empty payload → adapter throws .emptyContent → + // shim should try the legacy switch, which for an unknown extension + // surfaces `ParseError.unsupportedFormat`. + registerFixture(content: "") + defer { cleanUp() } + + let url = try writeFile(content: "ignored", ext: Self.fixtureExtension) + defer { try? FileManager.default.removeItem(at: url) } + + #expect(throws: DocumentParser.ParseError.self) { + _ = try DocumentParser.parseAll(url: url) + } + } + + @Test func parseAll_preservesLegacyPath_whenNoAdapterMatches() throws { + // No fixture registered. A plain .txt file still flows through the + // legacy switch and produces exactly one document attachment. + let url = try writeFile(content: "legacy path still works", ext: "txt") + defer { try? FileManager.default.removeItem(at: url) } + + let attachments = try DocumentParser.parseAll(url: url) + #expect(attachments.count == 1) + #expect(attachments.first?.documentContent == "legacy path still works") + } + + // MARK: - Bootstrap + + @Test func bootstrap_registersExpectedBuiltInsOnIsolatedRegistry() { + let registry = DocumentFormatRegistry() + DocumentAdaptersBootstrap.registerBuiltIns(registry: registry) + let ids = registry.registeredFormatIds() + #expect(ids.contains("plaintext")) + #expect(ids.contains("pdf")) + #expect(ids.contains("richdoc")) + } + + // MARK: - Fixtures + + private func writeFile(content: String, ext: String) throws -> URL { + let url = FileManager.default.temporaryDirectory + .appendingPathComponent("osaurus-shim-\(UUID().uuidString).\(ext)") + try content.write(to: url, atomically: true, encoding: .utf8) + return url + } + + private struct FixtureAdapter: DocumentFormatAdapter { + let formatId: String + let extensions: Set + let produce: String + + func canHandle(url: URL, uti: String?) -> Bool { + extensions.contains(url.pathExtension.lowercased()) + } + + func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument { + guard !produce.isEmpty else { + throw DocumentAdapterError.emptyContent + } + return StructuredDocument( + formatId: formatId, + filename: url.lastPathComponent, + fileSize: 0, + representation: AnyStructuredRepresentation( + formatId: formatId, + underlying: PlainTextRepresentation(text: produce) + ), + textFallback: produce + ) + } + } +} diff --git a/Packages/OsaurusCore/Tests/Documents/PDFAdapterTests.swift b/Packages/OsaurusCore/Tests/Documents/PDFAdapterTests.swift new file mode 100644 index 000000000..ed8033d38 --- /dev/null +++ b/Packages/OsaurusCore/Tests/Documents/PDFAdapterTests.swift @@ -0,0 +1,96 @@ +// +// PDFAdapterTests.swift +// osaurusTests +// +// Exercises the text-layer PDF adapter. Synthesises tiny PDFs via Core +// Graphics so the test bundle doesn't carry binary fixtures. The +// image-only fallback path stays in the legacy `DocumentParser` switch +// for now; the adapter intentionally throws `.emptyContent` when there's +// no text layer so the shim can fall through. +// + +import AppKit +import CoreGraphics +import Foundation +import Testing + +@testable import OsaurusCore + +@Suite("PDFAdapter") +struct PDFAdapterTests { + + @Test func canHandle_acceptsPDFExtensionOnly() { + let adapter = PDFAdapter() + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.pdf"), uti: nil)) + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.PDF"), uti: nil)) + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.txt"), uti: nil) == false) + } + + @Test func parse_readsTextLayer() async throws { + let url = try Self.writePDF(text: "Hello PDF body content") + defer { try? FileManager.default.removeItem(at: url) } + + let doc = try await PDFAdapter().parse(url: url, sizeLimit: 0) + #expect(doc.formatId == "pdf") + #expect(doc.textFallback.contains("Hello PDF body content")) + } + + @Test func parse_throwsEmptyContentForPDFWithNoTextLayer() async throws { + let url = try Self.writeBlankPDF() + defer { try? FileManager.default.removeItem(at: url) } + + await #expect(throws: DocumentAdapterError.self) { + _ = try await PDFAdapter().parse(url: url, sizeLimit: 0) + } + } + + @Test func parse_throwsSizeLimitExceededAboveCap() async throws { + let url = try Self.writePDF(text: "tiny") + defer { try? FileManager.default.removeItem(at: url) } + + await #expect(throws: DocumentAdapterError.self) { + _ = try await PDFAdapter().parse(url: url, sizeLimit: 1) + } + } + + // MARK: - Fixtures + + private static func writePDF(text: String) throws -> URL { + let url = FileManager.default.temporaryDirectory + .appendingPathComponent("osaurus-pdf-\(UUID().uuidString).pdf") + var mediaBox = CGRect(x: 0, y: 0, width: 300, height: 200) + guard let ctx = CGContext(url as CFURL, mediaBox: &mediaBox, nil) else { + throw FixtureError.contextCreationFailed + } + ctx.beginPDFPage(nil) + + // Draw the text into the PDF context via NSAttributedString so PDFKit + // can recover it from the text layer on read-back. + let gc = NSGraphicsContext(cgContext: ctx, flipped: false) + NSGraphicsContext.saveGraphicsState() + NSGraphicsContext.current = gc + let font = NSFont.systemFont(ofSize: 14) + NSAttributedString(string: text, attributes: [.font: font]) + .draw(at: NSPoint(x: 20, y: 100)) + NSGraphicsContext.restoreGraphicsState() + + ctx.endPDFPage() + ctx.closePDF() + return url + } + + private static func writeBlankPDF() throws -> URL { + let url = FileManager.default.temporaryDirectory + .appendingPathComponent("osaurus-pdf-blank-\(UUID().uuidString).pdf") + var mediaBox = CGRect(x: 0, y: 0, width: 100, height: 100) + guard let ctx = CGContext(url as CFURL, mediaBox: &mediaBox, nil) else { + throw FixtureError.contextCreationFailed + } + ctx.beginPDFPage(nil) + ctx.endPDFPage() + ctx.closePDF() + return url + } + + private enum FixtureError: Error { case contextCreationFailed } +} diff --git a/Packages/OsaurusCore/Tests/Documents/PlainTextAdapterTests.swift b/Packages/OsaurusCore/Tests/Documents/PlainTextAdapterTests.swift new file mode 100644 index 000000000..bc18a3448 --- /dev/null +++ b/Packages/OsaurusCore/Tests/Documents/PlainTextAdapterTests.swift @@ -0,0 +1,84 @@ +// +// PlainTextAdapterTests.swift +// osaurusTests +// +// Covers the plain-text migration adapter. Same behavioural contract as +// the legacy `DocumentParser.parsePlainText` — UTF-8, ISO-Latin-1 retry, +// character-cap truncation — plus the size-limit contract from the new +// adapter protocol. +// + +import Foundation +import Testing + +@testable import OsaurusCore + +@Suite("PlainTextAdapter") +struct PlainTextAdapterTests { + + @Test func canHandle_acceptsCommonTextExtensions() { + let adapter = PlainTextAdapter() + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.txt"), uti: nil)) + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.MD"), uti: nil)) + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.swift"), uti: nil)) + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.pdf"), uti: nil) == false) + } + + @Test func parse_readsUtf8Content() async throws { + let url = try Self.write("hello\nutf8\n", filename: "hello.txt") + defer { try? FileManager.default.removeItem(at: url) } + + let doc = try await PlainTextAdapter().parse(url: url, sizeLimit: 0) + #expect(doc.formatId == "plaintext") + #expect(doc.filename.hasSuffix("hello.txt")) + #expect(doc.textFallback.contains("hello")) + #expect(doc.textFallback.contains("utf8")) + } + + @Test func parse_fallsBackToLatin1ForNonUtf8Bytes() async throws { + // A single 0xE9 byte (`é` in latin-1) is illegal standalone UTF-8. + let url = FileManager.default.temporaryDirectory + .appendingPathComponent("latin-\(UUID().uuidString).txt") + try Data([0xE9, 0x0A]).write(to: url) + defer { try? FileManager.default.removeItem(at: url) } + + let doc = try await PlainTextAdapter().parse(url: url, sizeLimit: 0) + #expect(doc.textFallback.contains("é")) + } + + @Test func parse_throwsEmptyContentForWhitespaceOnly() async throws { + let url = try Self.write(" \n\t\n", filename: "empty.txt") + defer { try? FileManager.default.removeItem(at: url) } + + await #expect(throws: DocumentAdapterError.self) { + _ = try await PlainTextAdapter().parse(url: url, sizeLimit: 0) + } + } + + @Test func parse_throwsSizeLimitExceededAboveCap() async throws { + let url = try Self.write("hello world", filename: "big.txt") + defer { try? FileManager.default.removeItem(at: url) } + + await #expect(throws: DocumentAdapterError.self) { + _ = try await PlainTextAdapter().parse(url: url, sizeLimit: 1) + } + } + + @Test func parse_truncatesLongContentWithMarker() async throws { + let payload = String(repeating: "a", count: 500_002) + let url = try Self.write(payload, filename: "long.txt") + defer { try? FileManager.default.removeItem(at: url) } + + let doc = try await PlainTextAdapter().parse(url: url, sizeLimit: 0) + #expect(doc.textFallback.hasSuffix("character limit]")) + } + + // MARK: - Helpers + + private static func write(_ content: String, filename: String) throws -> URL { + let url = FileManager.default.temporaryDirectory + .appendingPathComponent("\(UUID().uuidString)-\(filename)") + try content.write(to: url, atomically: true, encoding: .utf8) + return url + } +} diff --git a/Packages/OsaurusCore/Tests/Documents/RichDocumentAdapterTests.swift b/Packages/OsaurusCore/Tests/Documents/RichDocumentAdapterTests.swift new file mode 100644 index 000000000..5fb4f6631 --- /dev/null +++ b/Packages/OsaurusCore/Tests/Documents/RichDocumentAdapterTests.swift @@ -0,0 +1,70 @@ +// +// RichDocumentAdapterTests.swift +// osaurusTests +// +// Covers the NSAttributedString-backed migration adapter across the +// extensions it claims today (DOCX, RTF, HTML). Uses HTML and RTF +// fixtures authored inline; the DOCX path is exercised indirectly +// through `canHandle` — building a real DOCX on the fly requires ZIP +// plumbing that will come with the high-fidelity DOCX reader in stage-4 +// PR 11. +// + +import Foundation +import Testing + +@testable import OsaurusCore + +@Suite("RichDocumentAdapter") +struct RichDocumentAdapterTests { + + @Test func canHandle_acceptsAllRichDocumentExtensions() { + let adapter = RichDocumentAdapter() + for ext in ["docx", "doc", "rtf", "rtfd", "html", "htm"] { + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.\(ext)"), uti: nil)) + } + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.txt"), uti: nil) == false) + } + + @Test func parse_readsHTMLBodyAsPlainText() async throws { + let url = try Self.write( + "

Title

Body text

", + filename: "page.html" + ) + defer { try? FileManager.default.removeItem(at: url) } + + let doc = try await RichDocumentAdapter().parse(url: url, sizeLimit: 0) + #expect(doc.formatId == "richdoc") + #expect(doc.textFallback.contains("Title")) + #expect(doc.textFallback.contains("Body text")) + #expect(doc.textFallback.contains("

") == false) + } + + @Test func parse_readsRTFAsPlainText() async throws { + let rtf = "{\\rtf1\\ansi Hello {\\b bold} world}" + let url = try Self.write(rtf, filename: "page.rtf") + defer { try? FileManager.default.removeItem(at: url) } + + let doc = try await RichDocumentAdapter().parse(url: url, sizeLimit: 0) + #expect(doc.textFallback.contains("Hello")) + #expect(doc.textFallback.contains("bold")) + } + + @Test func parse_throwsSizeLimitExceededAboveCap() async throws { + let url = try Self.write("hi", filename: "big.html") + defer { try? FileManager.default.removeItem(at: url) } + + await #expect(throws: DocumentAdapterError.self) { + _ = try await RichDocumentAdapter().parse(url: url, sizeLimit: 1) + } + } + + // MARK: - Helpers + + private static func write(_ content: String, filename: String) throws -> URL { + let url = FileManager.default.temporaryDirectory + .appendingPathComponent("\(UUID().uuidString)-\(filename)") + try content.write(to: url, atomically: true, encoding: .utf8) + return url + } +} diff --git a/Packages/OsaurusCore/Utils/DocumentParser.swift b/Packages/OsaurusCore/Utils/DocumentParser.swift index 2bce81665..17140495f 100644 --- a/Packages/OsaurusCore/Utils/DocumentParser.swift +++ b/Packages/OsaurusCore/Utils/DocumentParser.swift @@ -57,6 +57,14 @@ enum DocumentParser { let ext = url.pathExtension.lowercased() let filename = url.lastPathComponent + // Registry-routed path. Returns nil when no adapter claims the file + // OR when the claiming adapter surfaces `.emptyContent` / + // `.unsupportedFormat`, so the legacy switch below still handles + // e.g. image-only PDFs and any format an adapter hasn't taken over. + if let attachments = try routeThroughRegistry(url: url, fileSize: fileSize) { + return attachments + } + // PDF may fall back to image rendering if text extraction yields nothing if ext == "pdf" { return try parsePDFWithFallback(url: url, filename: filename, fileSize: fileSize) @@ -147,10 +155,10 @@ enum DocumentParser { return try String(contentsOf: url, encoding: .utf8) } catch { // Retry with latin1 for binary-ish text files - if let data = try? Data(contentsOf: url), - let str = String(data: data, encoding: .isoLatin1) - { - return str + if let data = try? Data(contentsOf: url) { + if let str = String(data: data, encoding: .isoLatin1) { + return str + } } throw ParseError.readFailed(error.localizedDescription) } @@ -189,10 +197,10 @@ enum DocumentParser { private static func extractPDFText(from document: PDFDocument) -> String { var pages: [String] = [] for i in 0 ..< document.pageCount { - if let page = document.page(at: i), let text = page.string, - !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty - { - pages.append(text) + if let page = document.page(at: i), let text = page.string { + if !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + pages.append(text) + } } } return pages.joined(separator: "\n\n") @@ -261,4 +269,86 @@ enum DocumentParser { throw ParseError.readFailed(error.localizedDescription) } } + + // MARK: - Registry shim + + /// Tries the document format registry before the legacy switch. The + /// registry runs async; we block on a dedicated dispatch queue so the + /// synchronous `parseAll` contract is preserved during the migration + /// window. Once every caller is async (stage-4 PR 10), this shim goes + /// away. + /// + /// Return value conventions: + /// - `nil` — no adapter is registered, or an adapter declined the file + /// via `.emptyContent` / `.unsupportedFormat`; legacy path handles it. + /// - non-nil — the adapter produced a text view; convert to + /// `[Attachment]` by wrapping `textFallback`. + /// - throws — adapter produced a non-recoverable error (size / read / + /// write); surface as `ParseError`. + private static func routeThroughRegistry(url: URL, fileSize: Int) throws -> [Attachment]? { + let registry = DocumentFormatRegistry.shared + guard let adapter = registry.adapter(for: url) else { return nil } + + let sizeLimit = DocumentLimits.limit(forFormatId: adapter.formatId) + do { + let document = try runBlocking { + try await adapter.parse(url: url, sizeLimit: sizeLimit) + } + return [ + .document( + filename: document.filename, + content: document.textFallback, + fileSize: Int(document.fileSize) + ) + ] + } catch DocumentAdapterError.emptyContent, DocumentAdapterError.unsupportedFormat { + // Fall through so the legacy switch (image-only PDFs, formats + // without an adapter yet) still gets a shot. + return nil + } catch DocumentAdapterError.sizeLimitExceeded { + throw ParseError.fileTooLarge + } catch let DocumentAdapterError.readFailed(reason) { + throw ParseError.readFailed(reason) + } catch DocumentAdapterError.writeFailed, DocumentAdapterError.cancelled { + throw ParseError.readFailed("Adapter emitted non-read error for ingress") + } catch { + throw ParseError.readFailed(error.localizedDescription) + } + } + + /// Synchronously awaits an async body. The shim is called from + /// `parseAll` which is itself invoked from UI callbacks that are still + /// synchronous — see `FloatingInputCard`. Dropping the semaphore means + /// reworking every ingress call site, which isn't in scope for PR 3. + private static func runBlocking(_ body: @escaping @Sendable () async throws -> T) throws -> T { + let semaphore = DispatchSemaphore(value: 0) + let resultBox = UnfairLockedBox?>(nil) + + Task.detached { + let result: Result + do { + result = .success(try await body()) + } catch { + result = .failure(error) + } + resultBox.set(result) + semaphore.signal() + } + + semaphore.wait() + switch resultBox.get()! { + case .success(let value): return value + case .failure(let error): throw error + } + } +} + +/// Tiny lock-box so the blocking-await shim above can hand a value back +/// across the actor/thread boundary without tripping Swift 6 sendability. +private final class UnfairLockedBox: @unchecked Sendable { + private var value: Value + private let lock = NSLock() + init(_ value: Value) { self.value = value } + func get() -> Value { lock.lock(); defer { lock.unlock() }; return value } + func set(_ newValue: Value) { lock.lock(); defer { lock.unlock() }; value = newValue } }