From a6913cbccb5830a7f81845a1a83e49791afc3a2c Mon Sep 17 00:00:00 2001 From: Michael Meding <264272563+mimeding@users.noreply.github.com> Date: Thu, 23 Apr 2026 16:05:45 -0300 Subject: [PATCH 1/7] feat(documents): wrap PlainText/PDF/DOCX as adapters and route DocumentParser through the registry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrates the three ingress paths already handled by DocumentParser onto the adapter surface introduced in the foundations PR, without changing any user-observable behaviour. parseAll now consults the registry first and falls back to its existing switch for anything an adapter hasn't claimed or has declined — specifically image-only PDFs, which continue to render via the legacy fallback until the layout-aware PDF rework lands. - PlainTextAdapter wraps the existing UTF-8 / ISO-Latin-1 retry path and the 500K-character truncation marker so the legacy behaviour stays byte-identical. - PDFAdapter wraps PDFKit text extraction; it throws emptyContent when there is no text layer so the shim falls through to the legacy image-render path rather than claiming a result it cannot produce. - RichDocumentAdapter wraps NSAttributedString across docx/doc/rtf/html; a single adapter for all four because they share the framework call today, splitting when high-fidelity DOCX lands. - DocumentAdaptersBootstrap registers the three on the shared registry from AppDelegate.applicationDidFinishLaunching exactly once so the shim sees adapters on the first file ingress. - PlainTextRepresentation is the neutral text shape for adapters that cannot yet publish a format-native representation; replaced per-format by Workbook / WordDocument / etc. in later PRs. 
--- Packages/OsaurusCore/AppDelegate.swift | 4 + .../Documents/DocumentAdaptersBootstrap.swift | 34 +++++ .../Documents/PlainTextRepresentation.swift | 18 +++ .../Services/Documents/PDFAdapter.swift | 68 ++++++++++ .../Services/Documents/PlainTextAdapter.swift | 90 +++++++++++++ .../Documents/RichDocumentAdapter.swift | 84 ++++++++++++ .../Documents/DocumentParserShimTests.swift | 124 ++++++++++++++++++ .../Tests/Documents/PDFAdapterTests.swift | 96 ++++++++++++++ .../Documents/PlainTextAdapterTests.swift | 84 ++++++++++++ .../Documents/RichDocumentAdapterTests.swift | 70 ++++++++++ .../OsaurusCore/Utils/DocumentParser.swift | 90 +++++++++++++ 11 files changed, 762 insertions(+) create mode 100644 Packages/OsaurusCore/Managers/Documents/DocumentAdaptersBootstrap.swift create mode 100644 Packages/OsaurusCore/Models/Documents/PlainTextRepresentation.swift create mode 100644 Packages/OsaurusCore/Services/Documents/PDFAdapter.swift create mode 100644 Packages/OsaurusCore/Services/Documents/PlainTextAdapter.swift create mode 100644 Packages/OsaurusCore/Services/Documents/RichDocumentAdapter.swift create mode 100644 Packages/OsaurusCore/Tests/Documents/DocumentParserShimTests.swift create mode 100644 Packages/OsaurusCore/Tests/Documents/PDFAdapterTests.swift create mode 100644 Packages/OsaurusCore/Tests/Documents/PlainTextAdapterTests.swift create mode 100644 Packages/OsaurusCore/Tests/Documents/RichDocumentAdapterTests.swift diff --git a/Packages/OsaurusCore/AppDelegate.swift b/Packages/OsaurusCore/AppDelegate.swift index 175a2d095..d7e8cc9e7 100644 --- a/Packages/OsaurusCore/AppDelegate.swift +++ b/Packages/OsaurusCore/AppDelegate.swift @@ -32,6 +32,10 @@ public final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelega // the specific crash class this prevents. MLXErrorRecovery.installGlobalHandler() + // Register in-tree document format adapters before any file-ingress + // path can run. Idempotent; safe if a future migration moves this. 
+ DocumentAdaptersBootstrap.registerBuiltIns() + // Detect repeated startup crashes and enter safe mode if needed LaunchGuard.checkOnLaunch() diff --git a/Packages/OsaurusCore/Managers/Documents/DocumentAdaptersBootstrap.swift b/Packages/OsaurusCore/Managers/Documents/DocumentAdaptersBootstrap.swift new file mode 100644 index 000000000..03d4e14e5 --- /dev/null +++ b/Packages/OsaurusCore/Managers/Documents/DocumentAdaptersBootstrap.swift @@ -0,0 +1,34 @@ +// +// DocumentAdaptersBootstrap.swift +// osaurus +// +// Registers the in-tree document adapters with `DocumentFormatRegistry.shared` +// exactly once, at app launch. Kept separate from `AppDelegate` so tests can +// opt into the same registration (or opt out of it entirely) without dragging +// in `NSApplication`. +// + +import Foundation + +public enum DocumentAdaptersBootstrap { + private static let lock = NSLock() + // Guarded by `lock`; the `nonisolated(unsafe)` matches the project pattern + // for lock-protected process-global state (see `OsaurusPaths.overrideRoot`). + nonisolated(unsafe) private static var didRegisterShared = false + + /// Idempotent against the shared registry: safe to call from multiple + /// launch paths without producing duplicate adapter registrations. + /// Non-shared registries (tests, isolated instances) are re-registered on + /// every call so each test gets a clean baseline. 
+ public static func registerBuiltIns(registry: DocumentFormatRegistry = .shared) { + lock.lock() + defer { lock.unlock() } + if registry === DocumentFormatRegistry.shared, didRegisterShared { return } + registry.register(adapter: PlainTextAdapter()) + registry.register(adapter: PDFAdapter()) + registry.register(adapter: RichDocumentAdapter()) + if registry === DocumentFormatRegistry.shared { + didRegisterShared = true + } + } +} diff --git a/Packages/OsaurusCore/Models/Documents/PlainTextRepresentation.swift b/Packages/OsaurusCore/Models/Documents/PlainTextRepresentation.swift new file mode 100644 index 000000000..f53c2348c --- /dev/null +++ b/Packages/OsaurusCore/Models/Documents/PlainTextRepresentation.swift @@ -0,0 +1,18 @@ +// +// PlainTextRepresentation.swift +// osaurus +// +// Default representation for adapters that extract a single text string. +// Every adapter has to publish *some* `StructuredRepresentation`; the +// wrappers around `PDFKit` text extraction and `NSAttributedString` don't +// preserve any format-native structure, so they emit this shape. The +// real typed representations (`Workbook`, `WordDocument`, …) replace it +// per-format as higher-fidelity adapters land. +// + +import Foundation + +public struct PlainTextRepresentation: StructuredRepresentation, Sendable { + public let text: String + public init(text: String) { self.text = text } +} diff --git a/Packages/OsaurusCore/Services/Documents/PDFAdapter.swift b/Packages/OsaurusCore/Services/Documents/PDFAdapter.swift new file mode 100644 index 000000000..122006ad4 --- /dev/null +++ b/Packages/OsaurusCore/Services/Documents/PDFAdapter.swift @@ -0,0 +1,68 @@ +// +// PDFAdapter.swift +// osaurus +// +// Wraps the text-layer extraction path in `DocumentParser.parsePDFWithFallback`. 
+// Intentionally does NOT cover the image-rendering fallback — when a PDF has +// no extractable text, this adapter throws `.emptyContent` and the +// `DocumentParser` shim falls through to the legacy switch, which still +// renders each page as PNG. Moving that path onto the adapter surface is +// deferred to stage-4 PR 8 (layout-aware table extraction), where the +// typed `PDFDocument` representation gets introduced. +// + +import Foundation +import PDFKit + +public struct PDFAdapter: DocumentFormatAdapter { + public let formatId = "pdf" + + public init() {} + + public func canHandle(url: URL, uti: String?) -> Bool { + url.pathExtension.lowercased() == "pdf" + } + + public func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument { + let fileSize = Int64((try? url.resourceValues(forKeys: [.fileSizeKey]))?.fileSize ?? 0) + if sizeLimit > 0, fileSize > sizeLimit { + throw DocumentAdapterError.sizeLimitExceeded(actual: fileSize, limit: sizeLimit) + } + + guard let document = PDFDocument(url: url) else { + throw DocumentAdapterError.readFailed(underlying: "PDFKit could not open document") + } + + let extracted = Self.extractText(from: document) + guard !extracted.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { + // No text layer — let the shim fall through to the legacy image- + // render fallback. Don't claim a result we can't produce. 
+ throw DocumentAdapterError.emptyContent + } + + let truncated = PlainTextAdapter.applyCharacterCap(extracted) + + return StructuredDocument( + formatId: formatId, + filename: url.lastPathComponent, + fileSize: fileSize, + representation: AnyStructuredRepresentation( + formatId: formatId, + underlying: PlainTextRepresentation(text: truncated) + ), + textFallback: truncated + ) + } + + private static func extractText(from document: PDFDocument) -> String { + var pages: [String] = [] + for index in 0 ..< document.pageCount { + guard let page = document.page(at: index), + let text = page.string, + !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty + else { continue } + pages.append(text) + } + return pages.joined(separator: "\n\n") + } +} diff --git a/Packages/OsaurusCore/Services/Documents/PlainTextAdapter.swift b/Packages/OsaurusCore/Services/Documents/PlainTextAdapter.swift new file mode 100644 index 000000000..1df764eb0 --- /dev/null +++ b/Packages/OsaurusCore/Services/Documents/PlainTextAdapter.swift @@ -0,0 +1,90 @@ +// +// PlainTextAdapter.swift +// osaurus +// +// Wraps the existing plain-text ingress path in `DocumentParser`. Claims +// roughly the 60 extensions that were previously handled by the inline +// `case _ where isPlainText(ext:)` branch — `.txt`, `.md`, source code, +// config files, etc. Behaviour is intentionally identical to the legacy +// switch: UTF-8 first, ISO-Latin-1 retry, post-read character-count +// truncation marker. This adapter is a migration bridge, not a fidelity +// improvement. +// + +import Foundation + +public struct PlainTextAdapter: DocumentFormatAdapter { + public let formatId = "plaintext" + + public init() {} + + public func canHandle(url: URL, uti: String?) -> Bool { + Self.plainTextExtensions.contains(url.pathExtension.lowercased()) + } + + public func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument { + let fileSize = Int64((try? url.resourceValues(forKeys: [.fileSizeKey]))?.fileSize ?? 
0) + if sizeLimit > 0, fileSize > sizeLimit { + throw DocumentAdapterError.sizeLimitExceeded(actual: fileSize, limit: sizeLimit) + } + + let rawContent: String + do { + rawContent = try String(contentsOf: url, encoding: .utf8) + } catch { + // Fall back to latin-1 for files that are "mostly text" with a few + // non-UTF-8 bytes — same behaviour as the legacy path. + guard let data = try? Data(contentsOf: url), + let decoded = String(data: data, encoding: .isoLatin1) + else { + throw DocumentAdapterError.readFailed(underlying: error.localizedDescription) + } + rawContent = decoded + } + + guard !rawContent.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { + throw DocumentAdapterError.emptyContent + } + + let truncated = Self.applyCharacterCap(rawContent) + + return StructuredDocument( + formatId: formatId, + filename: url.lastPathComponent, + fileSize: fileSize, + representation: AnyStructuredRepresentation( + formatId: formatId, + underlying: PlainTextRepresentation(text: truncated) + ), + textFallback: truncated + ) + } + + // MARK: - Helpers + + /// Preserves the legacy 500K-character UX — consumers already expect the + /// trailing marker when a document is truncated mid-read. The cap on + /// bytes-read is higher (see `DocumentLimits.plainText`), so the two + /// interact: oversized files are refused outright; merely long files + /// are surfaced with a truncation note. 
+ static func applyCharacterCap(_ text: String) -> String { + let cap = 500_000 + guard text.count > cap else { return text } + return String(text.prefix(cap)) + + "\n\n[Document truncated — exceeded \(cap) character limit]" + } + + static let plainTextExtensions: Set = [ + "txt", "md", "markdown", "csv", "tsv", + "json", "xml", "yaml", "yml", "toml", + "log", "ini", "cfg", "conf", "env", + "swift", "py", "js", "ts", "tsx", "jsx", + "rs", "go", "java", "kt", "c", "cpp", "h", "hpp", + "rb", "php", "sh", "bash", "zsh", "fish", + "css", "scss", "less", "sql", + "r", "m", "mm", "lua", "pl", "ex", "exs", + "zig", "nim", "dart", "scala", "groovy", + "tf", "hcl", "dockerfile", + "gitignore", "editorconfig", "prettierrc", + ] +} diff --git a/Packages/OsaurusCore/Services/Documents/RichDocumentAdapter.swift b/Packages/OsaurusCore/Services/Documents/RichDocumentAdapter.swift new file mode 100644 index 000000000..33afe2f86 --- /dev/null +++ b/Packages/OsaurusCore/Services/Documents/RichDocumentAdapter.swift @@ -0,0 +1,84 @@ +// +// RichDocumentAdapter.swift +// osaurus +// +// Wraps the `NSAttributedString(url:documentType:)` path in +// `DocumentParser.parseRichDocument`. A single adapter covers DOCX, DOC, +// RTF, RTFD, and HTML today because they share the same underlying +// framework call and produce the same plain-text output. When stage-4 +// PR 11 lands a high-fidelity DOCX reader (tables, tracked changes, +// comments) this adapter splits along format lines and this one becomes +// the RTF/HTML-only path. +// + +import AppKit +import Foundation + +public struct RichDocumentAdapter: DocumentFormatAdapter { + public let formatId = "richdoc" + + public init() {} + + public func canHandle(url: URL, uti: String?) -> Bool { + Self.supportedExtensions.contains(url.pathExtension.lowercased()) + } + + public func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument { + let fileSize = Int64((try? url.resourceValues(forKeys: [.fileSizeKey]))?.fileSize ?? 
0) + if sizeLimit > 0, fileSize > sizeLimit { + throw DocumentAdapterError.sizeLimitExceeded(actual: fileSize, limit: sizeLimit) + } + + let documentType = Self.documentType(forExtension: url.pathExtension.lowercased()) + let extracted: String + do { + var options: [NSAttributedString.DocumentReadingOptionKey: Any] = [:] + if let documentType { + options[.documentType] = documentType + } + let attributed = try NSAttributedString( + url: url, + options: options, + documentAttributes: nil + ) + extracted = attributed.string + } catch { + throw DocumentAdapterError.readFailed(underlying: error.localizedDescription) + } + + guard !extracted.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { + throw DocumentAdapterError.emptyContent + } + + let truncated = PlainTextAdapter.applyCharacterCap(extracted) + + return StructuredDocument( + formatId: formatId, + filename: url.lastPathComponent, + fileSize: fileSize, + representation: AnyStructuredRepresentation( + formatId: formatId, + underlying: PlainTextRepresentation(text: truncated) + ), + textFallback: truncated + ) + } + + // MARK: - Helpers + + static let supportedExtensions: Set = [ + "docx", "doc", "rtf", "rtfd", "html", "htm", + ] + + private static func documentType( + forExtension ext: String + ) -> NSAttributedString.DocumentType? 
{ + switch ext { + case "docx": return nil // NSAttributedString auto-detects OOXML + case "doc": return .docFormat + case "rtf", "rtfd": return .rtf + case "html", "htm": return .html + default: return nil + } + } +} diff --git a/Packages/OsaurusCore/Tests/Documents/DocumentParserShimTests.swift b/Packages/OsaurusCore/Tests/Documents/DocumentParserShimTests.swift new file mode 100644 index 000000000..6bc3bd938 --- /dev/null +++ b/Packages/OsaurusCore/Tests/Documents/DocumentParserShimTests.swift @@ -0,0 +1,124 @@ +// +// DocumentParserShimTests.swift +// osaurusTests +// +// Integration tests for the `DocumentParser.parseAll` shim: verifies that +// the registry is consulted first, that `.emptyContent` from a registered +// adapter falls through to the legacy switch, and that errors bubble up +// translated into the legacy `ParseError` surface. Uses the shared +// registry (register + `unregisterAll` in teardown) so the shim's call +// site is exactly the one reached from production. +// + +import Foundation +import Testing + +@testable import OsaurusCore + +@Suite("DocumentParser.parseAll registry shim", .serialized) +struct DocumentParserShimTests { + + // A fixture-extension adapter so tests don't collide with built-ins. + private static let fixtureFormatId = "test-fixture-shim" + private static let fixtureExtension = "fixtureshim" + + private func registerFixture(content: String) { + DocumentFormatRegistry.shared.register( + adapter: FixtureAdapter( + formatId: Self.fixtureFormatId, + extensions: [Self.fixtureExtension], + produce: content + ) + ) + } + + private func cleanUp() { + DocumentFormatRegistry.shared.unregisterAll(formatId: Self.fixtureFormatId) + } + + // MARK: - Routing + + @Test func parseAll_routesThroughRegistry_whenAdapterClaims() throws { + registerFixture(content: "routed-through-registry") + defer { cleanUp() } + + let url = try writeFile(content: "ignored", ext: Self.fixtureExtension) + defer { try? 
FileManager.default.removeItem(at: url) } + + let attachments = try DocumentParser.parseAll(url: url) + #expect(attachments.count == 1) + #expect(attachments.first?.documentContent == "routed-through-registry") + } + + @Test func parseAll_fallsThroughOnEmptyContent() throws { + // Fixture adapter with empty payload → adapter throws .emptyContent → + // shim should try the legacy switch, which for an unknown extension + // surfaces `ParseError.unsupportedFormat`. + registerFixture(content: "") + defer { cleanUp() } + + let url = try writeFile(content: "ignored", ext: Self.fixtureExtension) + defer { try? FileManager.default.removeItem(at: url) } + + #expect(throws: DocumentParser.ParseError.self) { + _ = try DocumentParser.parseAll(url: url) + } + } + + @Test func parseAll_preservesLegacyPath_whenNoAdapterMatches() throws { + // No fixture registered. A plain .txt file still flows through the + // legacy switch and produces exactly one document attachment. + let url = try writeFile(content: "legacy path still works", ext: "txt") + defer { try? 
FileManager.default.removeItem(at: url) } + + let attachments = try DocumentParser.parseAll(url: url) + #expect(attachments.count == 1) + #expect(attachments.first?.documentContent == "legacy path still works") + } + + // MARK: - Bootstrap + + @Test func bootstrap_registersExpectedBuiltInsOnIsolatedRegistry() { + let registry = DocumentFormatRegistry() + DocumentAdaptersBootstrap.registerBuiltIns(registry: registry) + let ids = registry.registeredFormatIds() + #expect(ids.contains("plaintext")) + #expect(ids.contains("pdf")) + #expect(ids.contains("richdoc")) + } + + // MARK: - Fixtures + + private func writeFile(content: String, ext: String) throws -> URL { + let url = FileManager.default.temporaryDirectory + .appendingPathComponent("osaurus-shim-\(UUID().uuidString).\(ext)") + try content.write(to: url, atomically: true, encoding: .utf8) + return url + } + + private struct FixtureAdapter: DocumentFormatAdapter { + let formatId: String + let extensions: Set + let produce: String + + func canHandle(url: URL, uti: String?) -> Bool { + extensions.contains(url.pathExtension.lowercased()) + } + + func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument { + guard !produce.isEmpty else { + throw DocumentAdapterError.emptyContent + } + return StructuredDocument( + formatId: formatId, + filename: url.lastPathComponent, + fileSize: 0, + representation: AnyStructuredRepresentation( + formatId: formatId, + underlying: PlainTextRepresentation(text: produce) + ), + textFallback: produce + ) + } + } +} diff --git a/Packages/OsaurusCore/Tests/Documents/PDFAdapterTests.swift b/Packages/OsaurusCore/Tests/Documents/PDFAdapterTests.swift new file mode 100644 index 000000000..ed8033d38 --- /dev/null +++ b/Packages/OsaurusCore/Tests/Documents/PDFAdapterTests.swift @@ -0,0 +1,96 @@ +// +// PDFAdapterTests.swift +// osaurusTests +// +// Exercises the text-layer PDF adapter. Synthesises tiny PDFs via Core +// Graphics so the test bundle doesn't carry binary fixtures. 
The +// image-only fallback path stays in the legacy `DocumentParser` switch +// for now; the adapter intentionally throws `.emptyContent` when there's +// no text layer so the shim can fall through. +// + +import AppKit +import CoreGraphics +import Foundation +import Testing + +@testable import OsaurusCore + +@Suite("PDFAdapter") +struct PDFAdapterTests { + + @Test func canHandle_acceptsPDFExtensionOnly() { + let adapter = PDFAdapter() + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.pdf"), uti: nil)) + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.PDF"), uti: nil)) + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.txt"), uti: nil) == false) + } + + @Test func parse_readsTextLayer() async throws { + let url = try Self.writePDF(text: "Hello PDF body content") + defer { try? FileManager.default.removeItem(at: url) } + + let doc = try await PDFAdapter().parse(url: url, sizeLimit: 0) + #expect(doc.formatId == "pdf") + #expect(doc.textFallback.contains("Hello PDF body content")) + } + + @Test func parse_throwsEmptyContentForPDFWithNoTextLayer() async throws { + let url = try Self.writeBlankPDF() + defer { try? FileManager.default.removeItem(at: url) } + + await #expect(throws: DocumentAdapterError.self) { + _ = try await PDFAdapter().parse(url: url, sizeLimit: 0) + } + } + + @Test func parse_throwsSizeLimitExceededAboveCap() async throws { + let url = try Self.writePDF(text: "tiny") + defer { try? 
FileManager.default.removeItem(at: url) } + + await #expect(throws: DocumentAdapterError.self) { + _ = try await PDFAdapter().parse(url: url, sizeLimit: 1) + } + } + + // MARK: - Fixtures + + private static func writePDF(text: String) throws -> URL { + let url = FileManager.default.temporaryDirectory + .appendingPathComponent("osaurus-pdf-\(UUID().uuidString).pdf") + var mediaBox = CGRect(x: 0, y: 0, width: 300, height: 200) + guard let ctx = CGContext(url as CFURL, mediaBox: &mediaBox, nil) else { + throw FixtureError.contextCreationFailed + } + ctx.beginPDFPage(nil) + + // Draw the text into the PDF context via NSAttributedString so PDFKit + // can recover it from the text layer on read-back. + let gc = NSGraphicsContext(cgContext: ctx, flipped: false) + NSGraphicsContext.saveGraphicsState() + NSGraphicsContext.current = gc + let font = NSFont.systemFont(ofSize: 14) + NSAttributedString(string: text, attributes: [.font: font]) + .draw(at: NSPoint(x: 20, y: 100)) + NSGraphicsContext.restoreGraphicsState() + + ctx.endPDFPage() + ctx.closePDF() + return url + } + + private static func writeBlankPDF() throws -> URL { + let url = FileManager.default.temporaryDirectory + .appendingPathComponent("osaurus-pdf-blank-\(UUID().uuidString).pdf") + var mediaBox = CGRect(x: 0, y: 0, width: 100, height: 100) + guard let ctx = CGContext(url as CFURL, mediaBox: &mediaBox, nil) else { + throw FixtureError.contextCreationFailed + } + ctx.beginPDFPage(nil) + ctx.endPDFPage() + ctx.closePDF() + return url + } + + private enum FixtureError: Error { case contextCreationFailed } +} diff --git a/Packages/OsaurusCore/Tests/Documents/PlainTextAdapterTests.swift b/Packages/OsaurusCore/Tests/Documents/PlainTextAdapterTests.swift new file mode 100644 index 000000000..bc18a3448 --- /dev/null +++ b/Packages/OsaurusCore/Tests/Documents/PlainTextAdapterTests.swift @@ -0,0 +1,84 @@ +// +// PlainTextAdapterTests.swift +// osaurusTests +// +// Covers the plain-text migration adapter. 
Same behavioural contract as +// the legacy `DocumentParser.parsePlainText` — UTF-8, ISO-Latin-1 retry, +// character-cap truncation — plus the size-limit contract from the new +// adapter protocol. +// + +import Foundation +import Testing + +@testable import OsaurusCore + +@Suite("PlainTextAdapter") +struct PlainTextAdapterTests { + + @Test func canHandle_acceptsCommonTextExtensions() { + let adapter = PlainTextAdapter() + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.txt"), uti: nil)) + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.MD"), uti: nil)) + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.swift"), uti: nil)) + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.pdf"), uti: nil) == false) + } + + @Test func parse_readsUtf8Content() async throws { + let url = try Self.write("hello\nutf8\n", filename: "hello.txt") + defer { try? FileManager.default.removeItem(at: url) } + + let doc = try await PlainTextAdapter().parse(url: url, sizeLimit: 0) + #expect(doc.formatId == "plaintext") + #expect(doc.filename.hasSuffix("hello.txt")) + #expect(doc.textFallback.contains("hello")) + #expect(doc.textFallback.contains("utf8")) + } + + @Test func parse_fallsBackToLatin1ForNonUtf8Bytes() async throws { + // A single 0xE9 byte (`é` in latin-1) is illegal standalone UTF-8. + let url = FileManager.default.temporaryDirectory + .appendingPathComponent("latin-\(UUID().uuidString).txt") + try Data([0xE9, 0x0A]).write(to: url) + defer { try? FileManager.default.removeItem(at: url) } + + let doc = try await PlainTextAdapter().parse(url: url, sizeLimit: 0) + #expect(doc.textFallback.contains("é")) + } + + @Test func parse_throwsEmptyContentForWhitespaceOnly() async throws { + let url = try Self.write(" \n\t\n", filename: "empty.txt") + defer { try? 
FileManager.default.removeItem(at: url) } + + await #expect(throws: DocumentAdapterError.self) { + _ = try await PlainTextAdapter().parse(url: url, sizeLimit: 0) + } + } + + @Test func parse_throwsSizeLimitExceededAboveCap() async throws { + let url = try Self.write("hello world", filename: "big.txt") + defer { try? FileManager.default.removeItem(at: url) } + + await #expect(throws: DocumentAdapterError.self) { + _ = try await PlainTextAdapter().parse(url: url, sizeLimit: 1) + } + } + + @Test func parse_truncatesLongContentWithMarker() async throws { + let payload = String(repeating: "a", count: 500_002) + let url = try Self.write(payload, filename: "long.txt") + defer { try? FileManager.default.removeItem(at: url) } + + let doc = try await PlainTextAdapter().parse(url: url, sizeLimit: 0) + #expect(doc.textFallback.hasSuffix("character limit]")) + } + + // MARK: - Helpers + + private static func write(_ content: String, filename: String) throws -> URL { + let url = FileManager.default.temporaryDirectory + .appendingPathComponent("\(UUID().uuidString)-\(filename)") + try content.write(to: url, atomically: true, encoding: .utf8) + return url + } +} diff --git a/Packages/OsaurusCore/Tests/Documents/RichDocumentAdapterTests.swift b/Packages/OsaurusCore/Tests/Documents/RichDocumentAdapterTests.swift new file mode 100644 index 000000000..5fb4f6631 --- /dev/null +++ b/Packages/OsaurusCore/Tests/Documents/RichDocumentAdapterTests.swift @@ -0,0 +1,70 @@ +// +// RichDocumentAdapterTests.swift +// osaurusTests +// +// Covers the NSAttributedString-backed migration adapter across the +// extensions it claims today (DOCX, RTF, HTML). Uses HTML and RTF +// fixtures authored inline; the DOCX path is exercised indirectly +// through `canHandle` — building a real DOCX on the fly requires ZIP +// plumbing that will come with the high-fidelity DOCX reader in stage-4 +// PR 11. 
+// + +import Foundation +import Testing + +@testable import OsaurusCore + +@Suite("RichDocumentAdapter") +struct RichDocumentAdapterTests { + + @Test func canHandle_acceptsAllRichDocumentExtensions() { + let adapter = RichDocumentAdapter() + for ext in ["docx", "doc", "rtf", "rtfd", "html", "htm"] { + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.\(ext)"), uti: nil)) + } + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.txt"), uti: nil) == false) + } + + @Test func parse_readsHTMLBodyAsPlainText() async throws { + let url = try Self.write( + "

Title

Body text

", + filename: "page.html" + ) + defer { try? FileManager.default.removeItem(at: url) } + + let doc = try await RichDocumentAdapter().parse(url: url, sizeLimit: 0) + #expect(doc.formatId == "richdoc") + #expect(doc.textFallback.contains("Title")) + #expect(doc.textFallback.contains("Body text")) + #expect(doc.textFallback.contains("

") == false) + } + + @Test func parse_readsRTFAsPlainText() async throws { + let rtf = "{\\rtf1\\ansi Hello {\\b bold} world}" + let url = try Self.write(rtf, filename: "page.rtf") + defer { try? FileManager.default.removeItem(at: url) } + + let doc = try await RichDocumentAdapter().parse(url: url, sizeLimit: 0) + #expect(doc.textFallback.contains("Hello")) + #expect(doc.textFallback.contains("bold")) + } + + @Test func parse_throwsSizeLimitExceededAboveCap() async throws { + let url = try Self.write("hi", filename: "big.html") + defer { try? FileManager.default.removeItem(at: url) } + + await #expect(throws: DocumentAdapterError.self) { + _ = try await RichDocumentAdapter().parse(url: url, sizeLimit: 1) + } + } + + // MARK: - Helpers + + private static func write(_ content: String, filename: String) throws -> URL { + let url = FileManager.default.temporaryDirectory + .appendingPathComponent("\(UUID().uuidString)-\(filename)") + try content.write(to: url, atomically: true, encoding: .utf8) + return url + } +} diff --git a/Packages/OsaurusCore/Utils/DocumentParser.swift b/Packages/OsaurusCore/Utils/DocumentParser.swift index 2bce81665..7a6c2e8c1 100644 --- a/Packages/OsaurusCore/Utils/DocumentParser.swift +++ b/Packages/OsaurusCore/Utils/DocumentParser.swift @@ -57,6 +57,14 @@ enum DocumentParser { let ext = url.pathExtension.lowercased() let filename = url.lastPathComponent + // Registry-routed path. Returns nil when no adapter claims the file + // OR when the claiming adapter surfaces `.emptyContent` / + // `.unsupportedFormat`, so the legacy switch below still handles + // e.g. image-only PDFs and any format an adapter hasn't taken over. 
+        if let attachments = try routeThroughRegistry(url: url, fileSize: fileSize) {
+            return attachments
+        }
+
         // PDF may fall back to image rendering if text extraction yields nothing
         if ext == "pdf" {
             return try parsePDFWithFallback(url: url, filename: filename, fileSize: fileSize)
@@ -261,4 +269,86 @@ enum DocumentParser {
                 throw ParseError.readFailed(error.localizedDescription)
             }
         }
     }
+
+    // MARK: - Registry shim
+
+    /// Tries the document format registry before the legacy switch. The
+    /// adapter API is async; `runBlocking` parks the calling thread until
+    /// the parse finishes so the synchronous `parseAll` contract is
+    /// preserved during the migration window. Once every caller is async
+    /// (stage-4 PR 10), this shim goes away.
+    ///
+    /// Return value conventions:
+    /// - `nil` — no adapter is registered, or an adapter declined the file
+    ///   via `.emptyContent` / `.unsupportedFormat`; legacy path handles it.
+    /// - non-nil — the adapter produced a text view; convert to
+    ///   `[Attachment]` by wrapping `textFallback`.
+    /// - throws — adapter produced a non-recoverable error (size / read /
+    ///   write); surface as `ParseError`.
+    private static func routeThroughRegistry(url: URL, fileSize: Int) throws -> [Attachment]? {
+        let registry = DocumentFormatRegistry.shared
+        guard let adapter = registry.adapter(for: url) else { return nil }
+
+        let sizeLimit = DocumentLimits.limit(forFormatId: adapter.formatId)
+        do {
+            let document = try runBlocking {
+                try await adapter.parse(url: url, sizeLimit: sizeLimit)
+            }
+            return [
+                .document(
+                    filename: document.filename,
+                    content: document.textFallback,
+                    fileSize: Int(document.fileSize)
+                )
+            ]
+        } catch DocumentAdapterError.emptyContent, DocumentAdapterError.unsupportedFormat {
+            // Fall through so the legacy switch (image-only PDFs, formats
+            // without an adapter yet) still gets a shot.
+            return nil
+        } catch DocumentAdapterError.sizeLimitExceeded {
+            throw ParseError.fileTooLarge
+        } catch let DocumentAdapterError.readFailed(reason) {
+            throw ParseError.readFailed(reason)
+        } catch DocumentAdapterError.writeFailed, DocumentAdapterError.cancelled {
+            throw ParseError.readFailed("Adapter emitted non-read error for ingress")
+        } catch {
+            throw ParseError.readFailed(error.localizedDescription)
+        }
+    }
+
+    /// Synchronously awaits an async body. The shim is called from
+    /// `parseAll` which is itself invoked from UI callbacks that are still
+    /// synchronous — see `FloatingInputCard`. Dropping the semaphore means
+    /// reworking every ingress call site, which isn't in scope for PR 3.
+    private static func runBlocking<T>(_ body: @escaping @Sendable () async throws -> T) throws -> T {
+        let semaphore = DispatchSemaphore(value: 0)
+        let resultBox = UnfairLockedBox<Result<T, Error>?>(nil)
+
+        Task.detached {
+            let result: Result<T, Error>
+            do {
+                result = .success(try await body())
+            } catch {
+                result = .failure(error)
+            }
+            resultBox.set(result)
+            semaphore.signal()
+        }
+
+        semaphore.wait()
+        switch resultBox.get()! {  // non-nil: `set` always runs before `signal()`
+        case .success(let value): return value
+        case .failure(let error): throw error
+        }
+    }
+}
+
+/// Tiny lock-box so the blocking-await shim above can hand a value back
+/// across the actor/thread boundary without tripping Swift 6 sendability.
+private final class UnfairLockedBox<Value>: @unchecked Sendable {
+    private var value: Value
+    private let lock = NSLock()
+    init(_ value: Value) { self.value = value }
+    func get() -> Value { lock.lock(); defer { lock.unlock() }; return value }
+    func set(_ newValue: Value) { lock.lock(); defer { lock.unlock() }; value = newValue }
 }
From 8a01d2a7c555b82a83050d33eea56dfd7c2c1a77 Mon Sep 17 00:00:00 2001
From: Michael Meding <264272563+mimeding@users.noreply.github.com>
Date: Fri, 24 Apr 2026 12:46:50 -0300
Subject: [PATCH 2/7] feat(documents): XLSX read via CoreXLSX into a typed Workbook
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

First real-fidelity document adapter. Reads .xlsx into a typed Workbook
representation carrying sheet names, cells with formula source strings,
merged-range references, shared strings, and cell types (number, shared
string, inline string, boolean). The text fallback renders each sheet as
a tab-separated table so callers still on the legacy
Attachment.Kind.document path see something readable.

The adapter deliberately does NOT call CoreXLSX's parseStyles() — that
entry point crashes on openpyxl-generated workbooks because the
library's PatternFill.patternType is non-optional while Excel's default
empty pattern omits the attribute. Everything we surface today is
style-independent; lifting that limitation (number formats, column
widths, dates stored as styled numbers) lives in a follow-up slice
behind a hand-rolled styles fallback.

- Package.swift: CoreXLSX 0.14.2 dependency for the core target,
  testTarget resource declaration for the xlsxwriter-produced fixture.
- Workbook / Sheet / Row / Cell / CellValue / CellRange: the typed
  intermediate that both the XLSX read path and the eventual XLSX write
  emitter round-trip through.
- XLSXAdapter: the actual CoreXLSX → Workbook translator +
  markdown-style text fallback.
- DocumentAdaptersBootstrap: registers XLSXAdapter alongside PlainText / PDF / RichDocument, so DocumentParser.parseAll now routes .xlsx through the registry instead of throwing unsupportedFormat. - Tests/Documents/Fixtures/xlsx/sample.xlsx: 5.9 KB fixture with two sheets, a SUM formula, a merged range (A5:B5), shared strings, and explicit booleans. Exercises the parse paths for each fidelity feature. - XLSXAdapterTests: 7 tests pinning format routing, sheet/cell structure, formulas, merged ranges, shared strings, booleans, text fallback formatting, and size-limit refusal. - DocumentParserShimTests: expands the bootstrap assertion to include "xlsx" alongside the three existing adapter ids. --- .../Documents/DocumentAdaptersBootstrap.swift | 1 + .../Models/Documents/Workbook.swift | 87 ++++++++ Packages/OsaurusCore/Package.swift | 5 +- .../Services/Documents/XLSXAdapter.swift | 211 ++++++++++++++++++ .../Documents/DocumentParserShimTests.swift | 1 + .../Tests/Documents/Fixtures/xlsx/sample.xlsx | Bin 0 -> 5937 bytes .../Tests/Documents/XLSXAdapterTests.swift | 143 ++++++++++++ 7 files changed, 447 insertions(+), 1 deletion(-) create mode 100644 Packages/OsaurusCore/Models/Documents/Workbook.swift create mode 100644 Packages/OsaurusCore/Services/Documents/XLSXAdapter.swift create mode 100644 Packages/OsaurusCore/Tests/Documents/Fixtures/xlsx/sample.xlsx create mode 100644 Packages/OsaurusCore/Tests/Documents/XLSXAdapterTests.swift diff --git a/Packages/OsaurusCore/Managers/Documents/DocumentAdaptersBootstrap.swift b/Packages/OsaurusCore/Managers/Documents/DocumentAdaptersBootstrap.swift index 03d4e14e5..39643f0ba 100644 --- a/Packages/OsaurusCore/Managers/Documents/DocumentAdaptersBootstrap.swift +++ b/Packages/OsaurusCore/Managers/Documents/DocumentAdaptersBootstrap.swift @@ -27,6 +27,7 @@ public enum DocumentAdaptersBootstrap { registry.register(adapter: PlainTextAdapter()) registry.register(adapter: PDFAdapter()) registry.register(adapter: RichDocumentAdapter()) + 
+        registry.register(adapter: XLSXAdapter())
         if registry === DocumentFormatRegistry.shared {
             didRegisterShared = true
         }
diff --git a/Packages/OsaurusCore/Models/Documents/Workbook.swift b/Packages/OsaurusCore/Models/Documents/Workbook.swift
new file mode 100644
index 000000000..a24031dbb
--- /dev/null
+++ b/Packages/OsaurusCore/Models/Documents/Workbook.swift
@@ -0,0 +1,94 @@
+//
+//  Workbook.swift
+//  osaurus
+//
+//  Typed representation for parsed XLSX workbooks. Designed as the
+//  round-trip target for both the read side (`XLSXAdapter`, this PR) and
+//  the write side (`XLSXEmitter`, landing in the next slice). Fields are
+//  chosen to match what CoreXLSX surfaces cleanly today — sheet names,
+//  merged ranges, raw cell values, formula source strings — plus the
+//  shared-string table so repeated strings round-trip without being
+//  re-interned on write. Style-derived fidelity (number formats, column
+//  widths) is deliberately out of scope for this PR; see the comment on
+//  `CellValue` for why.
+//
+
+import Foundation
+
+/// Typed, in-memory form of an `.xlsx` workbook: its sheets plus the
+/// shared-string table.
+public struct Workbook: StructuredRepresentation, Sendable, Equatable {
+    public let sheets: [Sheet]
+    public let sharedStrings: [String]
+
+    public init(sheets: [Sheet], sharedStrings: [String]) {
+        self.sheets = sheets
+        self.sharedStrings = sharedStrings
+    }
+}
+
+/// One worksheet: its name, its rows, and its merged ranges.
+public struct Sheet: Sendable, Equatable {
+    public let name: String
+    public let rows: [Row]
+    public let mergedRanges: [CellRange]
+
+    public init(name: String, rows: [Row], mergedRanges: [CellRange]) {
+        self.name = name
+        self.rows = rows
+        self.mergedRanges = mergedRanges
+    }
+}
+
+/// One row of cells.
+public struct Row: Sendable, Equatable {
+    /// 1-based row number matching the on-wire `r` attribute.
+    public let index: Int
+    public let cells: [Cell]
+
+    public init(index: Int, cells: [Cell]) {
+        self.index = index
+        self.cells = cells
+    }
+}
+
+/// One cell: its A1 reference, scalar value, and optional formula.
+public struct Cell: Sendable, Equatable {
+    /// A1-style reference on-wire, e.g. "B3".
+    public let reference: String
+    public let value: CellValue
+    /// Formula source when the cell carries one, e.g. `SUM(A1:A3)`.
+    /// NOTE(review): `XLSXAdapter`'s text fallback prepends `=` itself, so
+    /// the stored string presumably has no leading `=` — confirm. Excel
+    /// stores both the formula and its cached result; we preserve both.
+    public let formula: String?
+
+    public init(reference: String, value: CellValue, formula: String? = nil) {
+        self.reference = reference
+        self.value = value
+        self.formula = formula
+    }
+}
+
+/// Scalar cell payload. Excel dates are stored as numbers with a style
+/// attached — without parsing the style table we can't distinguish a date
+/// from a plain number, so dates that aren't explicitly typed (`t="d"`)
+/// surface as `.number`. Lifting that limitation means shipping a style
+/// parser that tolerates the CoreXLSX `patternType` crash on
+/// openpyxl-generated files; that work lives in a separate slice.
+public enum CellValue: Sendable, Equatable {
+    case empty
+    case number(Double)
+    case string(String)
+    case bool(Bool)
+    case inlineString(String)
+}
+
+/// A1-style cell range, e.g. "A1:C3".
+public struct CellRange: Sendable, Equatable {
+    public let reference: String
+
+    public init(reference: String) {
+        self.reference = reference
+    }
+}
diff --git a/Packages/OsaurusCore/Package.swift b/Packages/OsaurusCore/Package.swift
index 51b3073d9..49e776152 100644
--- a/Packages/OsaurusCore/Package.swift
+++ b/Packages/OsaurusCore/Package.swift
@@ -148,6 +148,7 @@ let package = Package(
         .package(url: "https://github.com/mgriebling/SwiftMath", from: "1.7.3"),
         .package(url: "https://github.com/raspu/Highlightr", from: "2.3.0"),
         .package(url: "https://github.com/AAChartModel/AAChartKit-Swift.git", from: "9.5.0"),
+        .package(url: "https://github.com/CoreOffice/CoreXLSX.git", from: "0.14.2"),
     ],
     targets: [
         // Vendored SQLCipher 4.6.1 amalgamation (CommonCrypto
@@ -271,6 +272,7 @@ let package = Package(
                 .product(name: "ContainerizationExtras", package: "containerization"),
                 .product(name: "Highlightr", package: "Highlightr"),
                 .product(name: "AAInfographics", package: "AAChartKit-Swift"),
+                .product(name: "CoreXLSX", package: "CoreXLSX"),
             ],
             path: ".",
             exclude: ["Tests", "SQLCipher"],
@@ -284,7 +286,8 @@ let package = Package(
                 .product(name: "NIOEmbedded", package: "swift-nio"),
                 .product(name: "VecturaKit", package: "VecturaKit"),
             ],
-            path: "Tests"
+            path: "Tests",
+            resources: [.copy("Documents/Fixtures")]
         ),
     ]
 )
diff --git a/Packages/OsaurusCore/Services/Documents/XLSXAdapter.swift b/Packages/OsaurusCore/Services/Documents/XLSXAdapter.swift
new file mode 100644
index 000000000..e91624ee5
--- /dev/null
+++ b/Packages/OsaurusCore/Services/Documents/XLSXAdapter.swift
@@ -0,0 +1,211 @@
+//
+//  XLSXAdapter.swift
+//  osaurus
+//
+//  First real-fidelity adapter: reads `.xlsx` into a typed `Workbook`
+//  rather than flattening it to markdown the way the legacy text path
+//  would. Backed by CoreXLSX.
+//  The adapter intentionally does NOT call
+//  `parseStyles()` — that entry point crashes on openpyxl-generated
+//  workbooks because CoreXLSX's `PatternFill.patternType` is non-optional
+//  while Excel's default empty pattern omits that attribute. Style-
+//  dependent fidelity (number formats, column widths, dates that aren't
+//  explicitly typed) is deferred to a follow-up slice so this PR can ship
+//  behaviour that works against every current-style XLSX writer.
+//
+
+import CoreXLSX
+import Foundation
+
+/// Reads `.xlsx` files into a typed `Workbook` plus a tab-separated text
+/// fallback, using CoreXLSX for the container and XML parsing.
+public struct XLSXAdapter: DocumentFormatAdapter {
+    public let formatId = "xlsx"
+
+    public init() {}
+
+    /// Claims a file purely by its (case-insensitive) `.xlsx` extension;
+    /// `uti` is not consulted.
+    public func canHandle(url: URL, uti: String?) -> Bool {
+        url.pathExtension.lowercased() == "xlsx"
+    }
+
+    /// Parses the workbook at `url`.
+    ///
+    /// - Parameters:
+    ///   - url: Location of the `.xlsx` file on disk.
+    ///   - sizeLimit: Maximum accepted file size in bytes; values `<= 0`
+    ///     disable the check.
+    /// - Returns: A `StructuredDocument` wrapping the typed `Workbook` and
+    ///   its text fallback.
+    /// - Throws: `DocumentAdapterError.sizeLimitExceeded` when the file is
+    ///   larger than `sizeLimit`, `.readFailed` when CoreXLSX cannot open
+    ///   or decode a part, and `.emptyContent` when no worksheet is found.
+    public func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument {
+        // A failed size lookup reads as 0 bytes, which passes the limit check.
+        let fileSize = Int64((try? url.resourceValues(forKeys: [.fileSizeKey]))?.fileSize ?? 0)
+        if sizeLimit > 0, fileSize > sizeLimit {
+            throw DocumentAdapterError.sizeLimitExceeded(actual: fileSize, limit: sizeLimit)
+        }
+
+        guard let file = XLSXFile(filepath: url.path) else {
+            throw DocumentAdapterError.readFailed(underlying: "XLSXFile could not open \(url.path)")
+        }
+
+        let sharedStrings: [String]
+        do {
+            // `parseSharedStrings` is nil on workbooks with no text cells,
+            // which is legal for a pure-numeric sheet. Treat that as empty.
+            let parsed = try file.parseSharedStrings()
+            // Rich-text entries carry their content in formatted runs and
+            // leave `text` nil; join the runs so the string is not lost.
+            sharedStrings = parsed?.items.map { item in
+                item.text ?? item.richText.compactMap(\.text).joined()
+            } ?? []
+        } catch {
+            throw DocumentAdapterError.readFailed(underlying: "shared strings: \(error.localizedDescription)")
+        }
+
+        let coreWorkbooks: [CoreXLSX.Workbook]
+        do {
+            coreWorkbooks = try file.parseWorkbooks()
+        } catch {
+            throw DocumentAdapterError.readFailed(underlying: "workbook index: \(error.localizedDescription)")
+        }
+
+        var sheets: [Sheet] = []
+        for coreWorkbook in coreWorkbooks {
+            let pathsAndNames: [(name: String?, path: String)]
+            do {
+                pathsAndNames = try file.parseWorksheetPathsAndNames(workbook: coreWorkbook)
+            } catch {
+                throw DocumentAdapterError.readFailed(underlying: "worksheet index: \(error.localizedDescription)")
+            }
+
+            for pair in pathsAndNames {
+                let coreSheet: Worksheet
+                do {
+                    coreSheet = try file.parseWorksheet(at: pair.path)
+                } catch {
+                    throw DocumentAdapterError.readFailed(
+                        underlying: "worksheet \(pair.name ?? pair.path): \(error.localizedDescription)"
+                    )
+                }
+                sheets.append(
+                    Self.makeSheet(
+                        // Unnamed sheets fall back to their archive path.
+                        name: pair.name ?? pair.path,
+                        coreSheet: coreSheet,
+                        sharedStrings: sharedStrings
+                    )
+                )
+            }
+        }
+
+        guard !sheets.isEmpty else {
+            throw DocumentAdapterError.emptyContent
+        }
+
+        let workbook = Workbook(sheets: sheets, sharedStrings: sharedStrings)
+        let textFallback = Self.renderTextFallback(workbook)
+
+        return StructuredDocument(
+            formatId: formatId,
+            filename: url.lastPathComponent,
+            fileSize: fileSize,
+            representation: AnyStructuredRepresentation(
+                formatId: formatId,
+                underlying: workbook
+            ),
+            textFallback: textFallback
+        )
+    }
+
+    // MARK: - CoreXLSX → Workbook
+
+    /// Translates one CoreXLSX worksheet into the typed `Sheet`, resolving
+    /// shared-string indices against `sharedStrings`.
+    private static func makeSheet(
+        name: String,
+        coreSheet: Worksheet,
+        sharedStrings: [String]
+    ) -> Sheet {
+        let rows: [Row] = (coreSheet.data?.rows ?? []).map { coreRow in
+            let cells: [Cell] = coreRow.cells.map { coreCell in
+                Cell(
+                    reference: coreCell.reference.description,
+                    value: mapCellValue(coreCell, sharedStrings: sharedStrings),
+                    formula: coreCell.formula?.value
+                )
+            }
+            return Row(index: Int(coreRow.reference), cells: cells)
+        }
+
+        let mergedRanges: [CellRange] = (coreSheet.mergeCells?.items ?? []).map {
+            CellRange(reference: $0.reference)
+        }
+
+        return Sheet(name: name, rows: rows, mergedRanges: mergedRanges)
+    }
+
+    /// Maps a CoreXLSX cell onto `CellValue`. Unparseable numbers and
+    /// out-of-range shared-string indices surface as `.empty`.
+    private static func mapCellValue(
+        _ coreCell: CoreXLSX.Cell,
+        sharedStrings: [String]
+    ) -> CellValue {
+        // Inline strings keep their text in an `<is>` child rather than
+        // `<v>`, so `value` is normally nil for them. Resolve them before
+        // the empty-value guard below, which would otherwise drop them.
+        if coreCell.type == .inlineStr, let inline = coreCell.inlineString {
+            // CoreXLSX's `InlineString` concatenates all runs for us.
+            return .inlineString(inline.text ?? "")
+        }
+
+        // CoreXLSX's `Cell.type` is an optional enum; `Cell.value` is a
+        // raw string. The interpretation depends on `type`.
+        guard let rawValue = coreCell.value, !rawValue.isEmpty else {
+            return .empty
+        }
+
+        switch coreCell.type {
+        case .bool:
+            return .bool(rawValue == "1")
+        case .sharedString:
+            if let index = Int(rawValue), index >= 0, index < sharedStrings.count {
+                return .string(sharedStrings[index])
+            }
+            return .empty
+        case .inlineStr:
+            // No `<is>` payload was decoded; keep whatever `<v>` carried.
+            return .inlineString(rawValue)
+        case .string:
+            return .string(rawValue)
+        case .number, .none:
+            if let number = Double(rawValue) {
+                return .number(number)
+            }
+            return .empty
+        case .date:
+            // Explicitly-typed dates are rare in the wild — Excel writers
+            // almost always store dates as numbers plus a style. Preserve
+            // the raw string so callers that know the style table can
+            // reconstruct; callers that don't still see a string.
+            return .string(rawValue)
+        case .error:
+            return .string(rawValue)
+        }
+    }
+
+    // MARK: - Text fallback
+
+    /// Renders every sheet as a `## Sheet:` heading followed by one
+    /// tab-separated line per row (row index first) and an optional
+    /// `Merged:` line listing the merged ranges.
+    private static func renderTextFallback(_ workbook: Workbook) -> String {
+        var out: [String] = []
+        for sheet in workbook.sheets {
+            out.append("## Sheet: \(sheet.name)")
+            for row in sheet.rows {
+                let cellText = row.cells.map { describeCell($0) }.joined(separator: "\t")
+                out.append("\(row.index)\t\(cellText)")
+            }
+            if !sheet.mergedRanges.isEmpty {
+                let ranges = sheet.mergedRanges.map { $0.reference }.joined(separator: ", ")
+                out.append("Merged: \(ranges)")
+            }
+            out.append("")
+        }
+        return out.joined(separator: "\n").trimmingCharacters(in: .whitespacesAndNewlines)
+    }
+
+    /// Text form of one cell: the value, followed by the formula in
+    /// brackets when the cell has one.
+    private static func describeCell(_ cell: Cell) -> String {
+        let base: String
+        switch cell.value {
+        case .empty: base = ""
+        case .number(let value):
+            // `Int(exactly:)` is nil for fractional, non-finite, and
+            // out-of-range values, so huge whole numbers no longer trap.
+            if let whole = Int(exactly: value) {
+                base = String(whole)
+            } else {
+                base = String(value)
+            }
+        case .string(let text), .inlineString(let text):
+            base = text
+        case .bool(let flag):
+            base = flag ? "TRUE" : "FALSE"
+        }
+        if let formula = cell.formula {
+            return base.isEmpty ?
"=\(formula)" : "\(base) [=\(formula)]" + } + return base + } +} diff --git a/Packages/OsaurusCore/Tests/Documents/DocumentParserShimTests.swift b/Packages/OsaurusCore/Tests/Documents/DocumentParserShimTests.swift index 6bc3bd938..580383bb2 100644 --- a/Packages/OsaurusCore/Tests/Documents/DocumentParserShimTests.swift +++ b/Packages/OsaurusCore/Tests/Documents/DocumentParserShimTests.swift @@ -85,6 +85,7 @@ struct DocumentParserShimTests { #expect(ids.contains("plaintext")) #expect(ids.contains("pdf")) #expect(ids.contains("richdoc")) + #expect(ids.contains("xlsx")) } // MARK: - Fixtures diff --git a/Packages/OsaurusCore/Tests/Documents/Fixtures/xlsx/sample.xlsx b/Packages/OsaurusCore/Tests/Documents/Fixtures/xlsx/sample.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..f2a42dd329c4c7b8c3ea897f84d3a6aa917ac848 GIT binary patch literal 5937 zcmZ`-1ymE@8XX`pxT7txYcfbcsxQH88zeTdXG8pNR z$bOvsRc5GCi9WL_N#YThT5w_iNL~RFFC}{Pe5@c)98V@-P`vVu43cZ&d1YLLJ)gIi zl;U~RmpB3$D>V1BkHl0mVFZBjc(**{ud#c0&I`BGHxuiCgO2XYhVx^2ENF3Kd>46} zT_@el+D$9&TPTNmxl)c z&|oLT(bU?Bh56@uNpzc12P?YBzPIG+b&{@NX(`yr7c z=wX(~;SS$7%=>l*1j4q%RL{uJ8H-5p5&H(;4z~8)S~_6SqanS0g3NB8gPy!@_Cw)` zTtQD$gf@Z9S0vAaY`783Pm>wW%&1PW zz)>quTJFP+X6I;SWM^mfGcbP%)D%7NvV#@-&HgLYigTUn(Rfv=4ORHM>MaBmE1Sfr z=wVymn==BSA~{MP@@Px~0snDDzvnU6$uhzdmw6CfCE0`endHIGsPkt~s5JJMnTK;; zZ&9VFr__0Nh`w{>lUM3fMk$bi%KMjOMID7goQM(`eY6_cIpZ68h>Iy}PiVkOLY>sA@P~I_G^G^j_!G^EQO zJ{yU#B3(j4tE7-@B6Prih1>KUT^O7FS{PluuxOF6$OfzH5hY>O$yl>hL)#&H8Gmn} zM*^b8L9%JJQXSoK#nBMvnXM5A3y*R}<`H=GCMB?qJDBD1WvZA;WN1s3o3R>J{CLKy z=C$;!Z$PP~py+(zoE%Hv>~h*P&K}Z3*U$Y%+n-R|NJFD(PMR39^}c_6?$Te9U$+LL z4R-VI`ZVlL^?oSh5Kg)%nU?0$7ZWjs!QdX^AJMl6wMmGjG_B(g?Il$-gaj8L#VgjS zdh23MkRg5_zCLBzn-c{(nh&D^Hj4}vyUzo4@%AE=v$}a=DHbB#`3=?stQQSt@;JgR z!PH@dS}D^V9K<#e9TqA)q$d>DT4lLq0B-jx?Z8b)DtPszu2=YPXH`(z#dKiKcEEDN z|8bW6m$T7*uml0i&fbA*f-cd8SXtB>MTP!|xMHpIiZmhPU+|`F5qyW91=7>&mnR>d z3LclojGIHt@raAS`2r~qQv3!h!1Z#M*U&wmoeiA*0laeHt!TkD;vB; 
z^3hBkGhzEWSJjA8If9~GNM2mHsu~dbFtXF|B`FSKn_iWwr4P-QbzyWy`M3_kV`H?L z)W=99TOVKP77ZRTg77VUD9kbxF8SW`s4-WXe7wobsL4*a6g>+@P8T2mVclpPZtU4w zEO(Z}%XwJOGMq(Z6wZ7bF(bOttzp0gQ?c+hDVl-EX|szMqnO(1_DjKI4A-L38WJe8+N)^tT^}+TBl{!~9T( z`GNK4ru{1yg{k~1l8vzYwIEL+m*p=Ld8HafucLuKXxj)YlH}8vel7cabRu?w(;R83 zbJ{~h-BQsA#0*uvmYhHkoW@J$G3NY*oylutxnko`jsk+!lLzt4p$#5o1GmyUf^D9r zPCERPgZ{1YT%7$%&S?D0dU4-{gl2$JE;0{(a5*mC#{{DVn0cVX6+%%VI zu8i5~FRx87^J_j^FfpyiMV9j-Qx)526}MGHMC0}pdo8uY4O#kw&kx?R>R(R*w?s+u zIO&)l3u7#SE0pTF!nx~L>X)g*Im%87!(PuDyc3}AE&=PE^M>Kgez%lRk&_jkt?_Rl z31LhzsO7yq4wJ>urE=Z;U)T_KT#%)_2-RUv+1DwPu!R z4LT3Qo>5$dDaz5v_wPltaVd>h$jOe&UcQ-q>p5w?x&E$ewv**=6L1|tPK!O7x_Jl|&L9 zVQ{dA?EitdF(_Yw&m+IBfuQ-2yTqkL2tcr?*2nL?cITZ=uuixi6)Eb#c?yU@L?vPi z%^HapT27B*VI{JTIwU?ss5`~SEA_o~(P|-SzE>eh8cBL0U@{oPGC#&YrEvDJHvOeR zZj6f6EGv5V!a3;u0(6$VxcG_eBUI!-8+6aWrYobT=H+XK4Be*bK8`6gZmgT)>5O+z ze^BeXUOGEyraYEdsfdPNhvB~tn?!1mYbHN)TgVB{0W<92(DRDLDxZE}AF)_EeIViw zBxdlS+HN6|v{qVFZ#);I^wN2hLNy;$8vF{7Yb@tjJ!Qj<6{v<}s$We33hly@nf<~z zW_rUg*jrf{C#lape?=L580r69bd6G4fWa!fjk-c(aEh(3fm(rlz+o^0OP0|w&@v>& zq*$M1ss{!w3#MF5<1wnnG9FrsVtHbmSBsU^kkPffx1-TsWj~jVmBey)wW(bwkH+L6 z#ySToS-qy>4QVtMCmr`t=YG7vVHQw<@%%c8<|M!A1w_X$0Lt(AEpAmMf3ziecG%iX zVV?Syi|Lba7{Q0L_?C=(F`g4i0?YZho4B-CeVpZI#PUdYqpTdKrn@dj(R@dJO$F`f z7$qc(u=wXSI`IBL6}ZrAX7C1u0PQPSAii*3Un6sPwxJMop2dO57!Rwe!2a>Ihy z*}~Mu^!NL(a=ojq7y*?aZa!v^YOV>#;~ywZD&d~a3EZtNnydQQ9U@qf|In*AvB%a) zd>01@RKX^qwk5*`7sS=ZLOy{XNNz(GPo<*|%_;|~-vK`JQl}jr`}kgvZ)|@?W)qi| zkm-%7Uehbp4yG2uvcRn_;SadYjStLemwo+ESq7SfYpD;tTBM&>#|Z(#A4%9QR}+BZ zY-g%ZUXb?n2|NYPk~-iRBg*5<__cPoa+|p}Xl^fKdZqI{}GbcFX2pu~y|B#uY5tt!#Et@vJ2be0la zKu|@ZFMJp4{8W^+bimQY_c6nJnn;=I&V`o2kxBdDw%hw zArO4%Yi!^uTh}}!x$DC9$sXNuCnfMU_H+AiP@*$BSE_Ze2fxp>voO8qnoQ7G)5`PV z;0FV#Vom6fC{A{dJ217=i6W8L3a~Z#ft6b*6NK5uv@-GpQBebWMb`<&LHk;?A>8PBi{IpVySlpc zDdw$kx`3zi(E%;O5=P1!zNNl>p}U=A<>7SYmRFY-@*D=+7YjZ&CzFt7R)jcA@5Aw! 
zg>*rW!&7r#@4G3$0-^iI4rERm+&QZ5&DY(A$)bRVp~oudR0<6E&){?5rqomr-K|0s zfs>ClHs%P10Sc`9t?Tl}Ifs-XCUVr z`6UBJX|PP7I{$MiL=7LSvu$eaP00CdKNgYk>rB#;e97#!n|C@>`Qq|YPnL@a_}`g& zL@49qyj8J(A7o91$-22qI86Ws4!ys!ew-mT$9`BC2hnWX(&99AF^re0{VvPIbcQI< zlC~3B#(DakpuQy&6o$sx>E))_!%oySAr<-Pqaivru_^CNbuAd@Epx2Y*ghIDnWJ_F zk$>e^bumV-g_JG1aKcwbiz=LTtg6VI68>WOYEhIXkT}=ophUIF^fC@9)AYn5hS6m= znQ#AY={pdWV!t1G1m-h&iag#NR~{ZyqBC!eL;x}XL2 zgVHv0)(&&J*DA^FO9-Zr;o5?OX`^QH1B76qB;{BX+(XqO#+J)pWX!MRYQz6%~GJTr>6A)+6UkXQ(y#*h0A2)YQa5-ueN_#0b@`e9ORXos`R z1C%a%f<)^a=^uKc#Y}v9R)1~~-c`C&Y0LfG_?FDK6fz&Ap;myEzx;Yz)xMX!+)z({ zd54Ok)KsW=C4>j}i}9n=keX4dZXs-0bdd;$q2BKATXou*^^~1dPmik-xDYdQGLEmy zI=&{~qJdL`Zj#-R$eb2@{I9V7?)*qxdaZq7nlixJ9qj+q^o70sZ%LKypseVf``7qE zdttMN#6cpOY&oQ!Q|X4XP!8JB3a~b=M3>Cj7A=49PVrl~H_p8LLdVbf^O|~eE zX55;0s`#6VA4?XrmL1O5*0bq?+a?$NyX8_Mzoh!)bAu!ltfN9x77K8hob+U6^Expt zqM$-8V!za ztGyp2S-gq4O6CHAEov{kIX7cdcr-9)vUsWcX!YrNg$qmI-tLeUliI(LYbG2+XlMO| zF(sjM+PMAAUIuY`!Q-X8$_;!wTt2QKHCkHdw5!nj8Oi0GbaBs2!YJ9Ef@i&TZl$|k zO=K9_dZcR)qEES&2P?&|zFAWqwggKfu^Hcl2wou!!qJ`zvNlV$h}^jB@;T#pr8nEtaVIVv@;)S>Za;oP%X2ZwbRjrFGz9G`CsN*+o2==ROl)x= zXXbhenbf<31Ixp~<0Jh4wIa;1Uys**Kac;hWV}yse zmVZs6_rdpPra$0P*a{f-7yfI4y3cTb)cC_t2mH6i w;C?RsL!b^zt^bu-@5Aq>jX&^@xc>qFXA%L+BO?9$1{& URL { + // `.copy("Documents/Fixtures")` in `Package.swift` drops the parent + // `Documents/` segment inside the test bundle, so resources live + // under `Fixtures/xlsx/...` at the bundle root. 
+ guard + let url = Bundle.module.url( + forResource: "sample", + withExtension: "xlsx", + subdirectory: "Fixtures/xlsx" + ) + else { + throw FixtureError.missing + } + return url + } + + private enum FixtureError: Error { case missing } +} From 565cb67b83326d663b3bd1d73ab86ff0a5ae14f8 Mon Sep 17 00:00:00 2001 From: mimeding <264272563+mimeding@users.noreply.github.com> Date: Tue, 28 Apr 2026 18:41:49 -0300 Subject: [PATCH 3/7] Resolve CoreXLSX workspace packages --- .../xcshareddata/swiftpm/Package.resolved | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/osaurus.xcworkspace/xcshareddata/swiftpm/Package.resolved b/osaurus.xcworkspace/xcshareddata/swiftpm/Package.resolved index 55fbfe785..18f15f515 100644 --- a/osaurus.xcworkspace/xcshareddata/swiftpm/Package.resolved +++ b/osaurus.xcworkspace/xcshareddata/swiftpm/Package.resolved @@ -1,5 +1,5 @@ { - "originHash" : "8b2c8aee839bee68c488f430fa30e77681adea30962ae12dd392a5b5cd847ae2", + "originHash" : "0fe21e9e78c9dca9c9c9090eaa922b8b3f63117b6f76e3f6d15c36acf5f71a72", "pins" : [ { "identity" : "aachartkit-swift", @@ -28,6 +28,15 @@ "version" : "0.31.0" } }, + { + "identity" : "corexlsx", + "kind" : "remoteSourceControl", + "location" : "https://github.com/CoreOffice/CoreXLSX.git", + "state" : { + "revision" : "1391f3832ea2eeee5186ea8abb81ea49ed0609cc", + "version" : "0.14.2" + } + }, { "identity" : "eventsource", "kind" : "remoteSourceControl", @@ -420,6 +429,15 @@ "revision" : "2e61c12a1573d073618ee2f98f39149ea36068e1" } }, + { + "identity" : "xmlcoder", + "kind" : "remoteSourceControl", + "location" : "https://github.com/maxdesiatov/XMLCoder.git", + "state" : { + "revision" : "ca932442d7481700f5434a7b138c47dd42d9902b", + "version" : "0.14.0" + } + }, { "identity" : "yyjson", "kind" : "remoteSourceControl", @@ -429,6 +447,15 @@ "version" : "0.12.0" } }, + { + "identity" : "zipfoundation", + "kind" : "remoteSourceControl", + "location" : 
"https://github.com/weichsel/ZIPFoundation.git", + "state" : { + "revision" : "22787ffb59de99e5dc1fbfe80b19c97a904ad48d", + "version" : "0.9.20" + } + }, { "identity" : "zstd", "kind" : "remoteSourceControl", From 2ddfe545cfefc1705199245ff6cba4a1a4500d37 Mon Sep 17 00:00:00 2001 From: Michael Meding <264272563+mimeding@users.noreply.github.com> Date: Thu, 23 Apr 2026 17:44:01 -0300 Subject: [PATCH 4/7] feat(documents): XLSX write via libxlsxwriter closes the round trip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pairs with XLSXAdapter so agents can ingest a workbook, modify the typed Workbook in-process, and emit it back as a fresh .xlsx attachment. libxlsxwriter ships a first-party Swift Package as a pure C SwiftPM target, so no XCFramework / vendored C source is needed in osaurus itself — it's just a dependency add. - Package.swift: libxlsxwriter 1.2.4 dependency for the core target. - XLSXEmitter: Workbook -> .xlsx via libxlsxwriter. Parses A1 cell references into 0-indexed row/col, dispatches strings / numbers / booleans / formulas to the right write_* function, handles merged ranges via worksheet_merge_range with a nil string so the top-left cell's already-written content is preserved. Cleans up a partial .xlsx on any emit error so a failed round trip never masquerades as a readable file. - DocumentAdaptersBootstrap: registers XLSXEmitter alongside XLSXAdapter. - XLSXEmitterTests: 7 tests pinning the round trip end-to-end. Builds a Workbook in memory, writes via XLSXEmitter, reads via XLSXAdapter, asserts sheet names / formulas / merged ranges / strings / numbers / booleans all survive. Licensing footnote: libxlsxwriter is BSD-2-Clause, but bundles third_party/tmpfileplus/tmpfileplus.c under MPL 2.0. Statically linking is permitted. A follow-up to AcknowledgementsView should list both; deliberately out of scope for this PR. 
--- .../Documents/DocumentAdaptersBootstrap.swift | 1 + Packages/OsaurusCore/Package.swift | 2 + .../Services/Documents/XLSXEmitter.swift | 222 ++++++++++++++ .../Tests/Documents/XLSXEmitterTests.swift | 273 ++++++++++++++++++ 4 files changed, 498 insertions(+) create mode 100644 Packages/OsaurusCore/Services/Documents/XLSXEmitter.swift create mode 100644 Packages/OsaurusCore/Tests/Documents/XLSXEmitterTests.swift diff --git a/Packages/OsaurusCore/Managers/Documents/DocumentAdaptersBootstrap.swift b/Packages/OsaurusCore/Managers/Documents/DocumentAdaptersBootstrap.swift index 39643f0ba..e32e8481e 100644 --- a/Packages/OsaurusCore/Managers/Documents/DocumentAdaptersBootstrap.swift +++ b/Packages/OsaurusCore/Managers/Documents/DocumentAdaptersBootstrap.swift @@ -28,6 +28,7 @@ public enum DocumentAdaptersBootstrap { registry.register(adapter: PDFAdapter()) registry.register(adapter: RichDocumentAdapter()) registry.register(adapter: XLSXAdapter()) + registry.register(emitter: XLSXEmitter()) if registry === DocumentFormatRegistry.shared { didRegisterShared = true } diff --git a/Packages/OsaurusCore/Package.swift b/Packages/OsaurusCore/Package.swift index 49e776152..b98431576 100644 --- a/Packages/OsaurusCore/Package.swift +++ b/Packages/OsaurusCore/Package.swift @@ -149,6 +149,7 @@ let package = Package( .package(url: "https://github.com/raspu/Highlightr", from: "2.3.0"), .package(url: "https://github.com/AAChartModel/AAChartKit-Swift.git", from: "9.5.0"), .package(url: "https://github.com/CoreOffice/CoreXLSX.git", from: "0.14.2"), + .package(url: "https://github.com/jmcnamara/libxlsxwriter.git", from: "1.2.4"), ], targets: [ // Vendored SQLCipher 4.6.1 amalgamation (CommonCrypto @@ -273,6 +274,7 @@ let package = Package( .product(name: "Highlightr", package: "Highlightr"), .product(name: "AAInfographics", package: "AAChartKit-Swift"), .product(name: "CoreXLSX", package: "CoreXLSX"), + .product(name: "libxlsxwriter", package: "libxlsxwriter"), ], path: ".", exclude: 
["Tests", "SQLCipher"],
diff --git a/Packages/OsaurusCore/Services/Documents/XLSXEmitter.swift b/Packages/OsaurusCore/Services/Documents/XLSXEmitter.swift
new file mode 100644
index 000000000..479c75f73
--- /dev/null
+++ b/Packages/OsaurusCore/Services/Documents/XLSXEmitter.swift
@@ -0,0 +1,228 @@
+//
+//  XLSXEmitter.swift
+//  osaurus
+//
+//  Writes a typed `Workbook` back out to `.xlsx` using libxlsxwriter.
+//  Pairs with `XLSXAdapter` to close the read-emit-read round trip that
+//  makes Excel a first-class format for osaurus agents: an agent can
+//  ingest a workbook, edit a `Workbook` in-process, and emit it back to
+//  the user as an attachable artifact.
+//
+//  Licensing notes, surfaced for whoever owns acknowledgements:
+//  - libxlsxwriter itself is BSD-2-Clause.
+//  - It vendors `third_party/tmpfileplus/tmpfileplus.c` which is
+//    MPL 2.0. Statically linking it is permitted; the MPL only
+//    requires that the source of the covered file remain available.
+//    A follow-up to `AcknowledgementsView` should list both.
+//
+
+import Foundation
+import libxlsxwriter
+
+public struct XLSXEmitter: DocumentFormatEmitter {
+    public let formatId = "xlsx"
+
+    public init() {}
+
+    public func canEmit(_ document: StructuredDocument) -> Bool {
+        document.representation.underlying is Workbook
+    }
+
+    public func emit(_ document: StructuredDocument, to url: URL) async throws {
+        guard let workbook = document.representation.underlying as? Workbook else {
+            throw DocumentAdapterError.writeFailed(
+                underlying: "emit called with non-Workbook representation"
+            )
+        }
+
+        // libxlsxwriter operates on a filename — it writes directly to the
+        // destination during `workbook_close` rather than handing back
+        // bytes. The caller has already resolved/contained `url` per the
+        // emitter contract.
+        let workbookHandle: UnsafeMutablePointer<lxw_workbook>? = url.path.withCString {
+            workbook_new($0)
+        }
+        guard let lxwWorkbook = workbookHandle else {
+            throw DocumentAdapterError.writeFailed(
+                underlying: "workbook_new failed for \(url.path)"
+            )
+        }
+
+        var pendingError: DocumentAdapterError?
+
+        for sheet in workbook.sheets {
+            let sheetHandle: UnsafeMutablePointer<lxw_worksheet>? = sheet.name.withCString {
+                workbook_add_worksheet(lxwWorkbook, $0)
+            }
+            guard let lxwSheet = sheetHandle else {
+                pendingError = .writeFailed(
+                    underlying: "workbook_add_worksheet failed for '\(sheet.name)'"
+                )
+                break
+            }
+            if let err = Self.writeSheet(sheet, to: lxwSheet) {
+                pendingError = err
+                break
+            }
+        }
+
+        // `workbook_close` is ALWAYS called, even on earlier errors, so
+        // libxlsxwriter can release its buffers and temp files.
+        let closeError = workbook_close(lxwWorkbook)
+        if pendingError == nil, closeError.rawValue != 0 {
+            pendingError = .writeFailed(underlying: "workbook_close error \(closeError.rawValue)")
+        }
+
+        if let error = pendingError {
+            // Best-effort cleanup — leaving a partial .xlsx behind would
+            // masquerade as a successful emit to any later reader.
+            try? FileManager.default.removeItem(at: url)
+            throw error
+        }
+    }
+
+    // MARK: - Internals
+
+    private static func writeSheet(
+        _ sheet: Sheet,
+        to lxwSheet: UnsafeMutablePointer<lxw_worksheet>
+    ) -> DocumentAdapterError? {
+        for row in sheet.rows {
+            for cell in row.cells {
+                if let error = writeCell(cell, to: lxwSheet) {
+                    return error
+                }
+            }
+        }
+        for range in sheet.mergedRanges {
+            guard let coords = parseRange(range.reference) else {
+                return .writeFailed(underlying: "Bad merge range '\(range.reference)'")
+            }
+            // Passing a nil string tells libxlsxwriter to preserve whatever
+            // was already written at the top-left cell of the range; our
+            // top-left cell was emitted by the loop above.
+            let err = worksheet_merge_range(
+                lxwSheet,
+                coords.firstRow,
+                coords.firstCol,
+                coords.lastRow,
+                coords.lastCol,
+                nil,
+                nil
+            )
+            if err.rawValue != 0 {
+                return .writeFailed(
+                    underlying: "worksheet_merge_range \(range.reference) → \(err.rawValue)"
+                )
+            }
+        }
+        return nil
+    }
+
+    private static func writeCell(
+        _ cell: Cell,
+        to lxwSheet: UnsafeMutablePointer<lxw_worksheet>
+    ) -> DocumentAdapterError? {
+        guard let coords = parseA1(cell.reference) else {
+            return .writeFailed(underlying: "Bad cell reference '\(cell.reference)'")
+        }
+        let row = coords.row
+        let col = coords.col
+
+        if let formula = cell.formula {
+            // Only the formula text is written; `cell.value` (the cached
+            // result) is not emitted for formula cells.
+            let err = formula.withCString {
+                worksheet_write_formula(lxwSheet, row, col, $0, nil)
+            }
+            if err.rawValue != 0 {
+                return .writeFailed(
+                    underlying: "worksheet_write_formula \(cell.reference) → \(err.rawValue)"
+                )
+            }
+            return nil
+        }
+
+        switch cell.value {
+        case .empty:
+            return nil
+        case .number(let value):
+            let err = worksheet_write_number(lxwSheet, row, col, value, nil)
+            if err.rawValue != 0 {
+                return .writeFailed(
+                    underlying: "worksheet_write_number \(cell.reference) → \(err.rawValue)"
+                )
+            }
+        case .string(let text), .inlineString(let text):
+            let err = text.withCString {
+                worksheet_write_string(lxwSheet, row, col, $0, nil)
+            }
+            if err.rawValue != 0 {
+                return .writeFailed(
+                    underlying: "worksheet_write_string \(cell.reference) → \(err.rawValue)"
+                )
+            }
+        case .bool(let flag):
+            let err = worksheet_write_boolean(lxwSheet, row, col, flag ? 1 : 0, nil)
+            if err.rawValue != 0 {
+                return .writeFailed(
+                    underlying: "worksheet_write_boolean \(cell.reference) → \(err.rawValue)"
+                )
+            }
+        }
+        return nil
+    }
+
+    // MARK: - A1 parsing
+
+    /// Parses an A1-style cell reference ("B3", "AA10") into the 0-indexed
+    /// row and column that libxlsxwriter expects. Returns nil for anything
+    /// that doesn't match `[A-Za-z]+[0-9]+` or exceeds the column cap.
+    private static func parseA1(_ reference: String) -> (row: UInt32, col: UInt16)? {
+        var letters: [UInt8] = []
+        var digits: [UInt8] = []
+        for scalar in reference.unicodeScalars {
+            guard scalar.isASCII, let byte = UInt8(exactly: scalar.value) else { return nil }
+            switch byte {
+            case 0x41 ... 0x5A:  // A-Z
+                // Column letters must all precede the row digits ("A1B2" is not a reference).
+                guard digits.isEmpty else { return nil }
+                letters.append(byte)
+            case 0x61 ... 0x7A:  // a-z
+                guard digits.isEmpty else { return nil }
+                letters.append(byte - 32)
+            case 0x30 ... 0x39:  // 0-9
+                digits.append(byte)
+            default:
+                return nil
+            }
+        }
+        guard !letters.isEmpty, !digits.isEmpty else { return nil }
+
+        let rowOneBasedString = String(bytes: digits, encoding: .ascii) ?? ""
+        guard let rowOneBased = UInt32(rowOneBasedString), rowOneBased > 0 else { return nil }
+
+        var col: Int = 0
+        let base = Int(UInt8(ascii: "A"))
+        for byte in letters {
+            col = col * 26 + (Int(byte) - base + 1)
+            // Checked per letter so a long run of letters cannot overflow `Int`.
+            guard col <= 16_384 else { return nil }  // Excel col cap
+        }
+
+        return (row: rowOneBased - 1, col: UInt16(col - 1))
+    }
+
+    /// Parses an A1:A1 range ("A5:B5") into the four 0-indexed coordinates
+    /// libxlsxwriter's merge call wants.
+    private static func parseRange(
+        _ reference: String
+    ) -> (firstRow: UInt32, firstCol: UInt16, lastRow: UInt32, lastCol: UInt16)? {
+        let parts = reference.split(separator: ":", maxSplits: 1)
+        guard parts.count == 2,
+            let first = parseA1(String(parts[0])),
+            let last = parseA1(String(parts[1]))
+        else { return nil }
+        return (first.row, first.col, last.row, last.col)
+    }
+}
diff --git a/Packages/OsaurusCore/Tests/Documents/XLSXEmitterTests.swift b/Packages/OsaurusCore/Tests/Documents/XLSXEmitterTests.swift
new file mode 100644
index 000000000..8d29e456a
--- /dev/null
+++ b/Packages/OsaurusCore/Tests/Documents/XLSXEmitterTests.swift
@@ -0,0 +1,273 @@
+//
+//  XLSXEmitterTests.swift
+//  osaurusTests
+//
+//  Proves the XLSX round trip: build a `Workbook` in memory, emit it
+//  through `XLSXEmitter`, re-parse the resulting file through
+//  `XLSXAdapter`, and assert that every fidelity feature we care about
+//  — sheet names, cell values, formula source strings, merged ranges,
+//  booleans — survives. Libxlsxwriter's output is strictly standards-
+//  conforming so the re-parse exercises the same CoreXLSX paths the
+//  read-side tests already pin.
+//
+
+import Foundation
+import Testing
+
+@testable import OsaurusCore
+
+/// Round-trip suite for `XLSXEmitter`: each test emits an in-memory
+/// `Workbook`, re-parses the file with `XLSXAdapter`, and checks one
+/// fidelity feature.
+@Suite("XLSXEmitter round trip")
+struct XLSXEmitterTests {
+
+    @Test func canEmit_onlyAcceptsWorkbookRepresentations() {
+        let workbookDoc = StructuredDocument(
+            formatId: "xlsx",
+            filename: "a.xlsx",
+            fileSize: 0,
+            representation: AnyStructuredRepresentation(
+                formatId: "xlsx",
+                underlying: Workbook(sheets: [], sharedStrings: [])
+            ),
+            textFallback: ""
+        )
+        let plainDoc = Self.makePlainTextDocument()
+        let emitter = XLSXEmitter()
+
+        #expect(emitter.canEmit(workbookDoc))
+        #expect(emitter.canEmit(plainDoc) == false)
+    }
+
+    // MARK: - Round trip
+
+    @Test func emit_thenReparse_preservesSheetsAndCells() async throws {
+        let fixture = Self.makeRoundTripFixture()
+        let dest = Self.tempURL()
+        defer { try? FileManager.default.removeItem(at: dest) }
+
+        // Kept inline (not via `roundTrip`) so the file-exists check runs
+        // between the emit and the re-parse.
+        try await XLSXEmitter().emit(Self.wrap(fixture), to: dest)
+        #expect(FileManager.default.fileExists(atPath: dest.path))
+
+        let document = try await XLSXAdapter().parse(url: dest, sizeLimit: 0)
+        guard let output = document.representation.underlying as? Workbook else {
+            Issue.record("Re-parsed representation was not a Workbook")
+            return
+        }
+
+        #expect(output.sheets.count == fixture.sheets.count)
+        for (expected, actual) in zip(fixture.sheets, output.sheets) {
+            #expect(expected.name == actual.name, "sheet name mismatch")
+        }
+    }
+
+    @Test func emit_preservesFormulaSourceStrings() async throws {
+        let dest = Self.tempURL()
+        defer { try? FileManager.default.removeItem(at: dest) }
+
+        guard let output = try await Self.roundTrip(Self.makeRoundTripFixture(), via: dest) else {
+            Issue.record("Re-parsed representation was not a Workbook")
+            return
+        }
+
+        var formulas: [String] = []
+        for sheet in output.sheets {
+            for row in sheet.rows {
+                formulas.append(contentsOf: row.cells.compactMap(\.formula))
+            }
+        }
+        #expect(formulas.contains("SUM(B2:B3)"))
+    }
+
+    @Test func emit_preservesMergedRanges() async throws {
+        let dest = Self.tempURL()
+        defer { try? FileManager.default.removeItem(at: dest) }
+
+        guard let output = try await Self.roundTrip(Self.makeRoundTripFixture(), via: dest) else {
+            Issue.record("Re-parsed representation was not a Workbook")
+            return
+        }
+
+        let mergedRefs = output.sheets.flatMap(\.mergedRanges).map(\.reference)
+        #expect(mergedRefs.contains("A5:B5"))
+    }
+
+    @Test func emit_preservesStringAndNumberCells() async throws {
+        let dest = Self.tempURL()
+        defer { try? FileManager.default.removeItem(at: dest) }
+
+        guard let output = try await Self.roundTrip(Self.makeRoundTripFixture(), via: dest),
+            let revenue = output.sheets.first(where: { $0.name == "Revenue" })
+        else {
+            Issue.record("Revenue sheet missing after round trip")
+            return
+        }
+        let revenueCells = revenue.rows.flatMap(\.cells)
+
+        // "Month" string header lands in A1.
+        let a1 = revenueCells.first { $0.reference == "A1" }
+        if case .string(let value) = a1?.value {
+            #expect(value == "Month")
+        } else {
+            Issue.record("A1 after round trip was \(String(describing: a1?.value))")
+        }
+
+        // 1200 number lands in B2.
+        let b2 = revenueCells.first { $0.reference == "B2" }
+        if case .number(let value) = b2?.value {
+            #expect(value == 1200)
+        } else {
+            Issue.record("B2 after round trip was \(String(describing: b2?.value))")
+        }
+    }
+
+    @Test func emit_preservesBooleans() async throws {
+        let dest = Self.tempURL()
+        defer { try? FileManager.default.removeItem(at: dest) }
+
+        guard let output = try await Self.roundTrip(Self.makeRoundTripFixture(), via: dest),
+            let notes = output.sheets.first(where: { $0.name == "Notes" })
+        else {
+            Issue.record("Notes sheet missing")
+            return
+        }
+
+        var bools: [Bool] = []
+        for cell in notes.rows.flatMap(\.cells) {
+            if case .bool(let flag) = cell.value {
+                bools.append(flag)
+            }
+        }
+        #expect(bools.count == 2)
+        #expect(bools.contains(true))
+        #expect(bools.contains(false))
+    }
+
+    @Test func emit_rejectsNonWorkbookRepresentation() async throws {
+        let dest = Self.tempURL()
+        defer { try? FileManager.default.removeItem(at: dest) }
+        let plain = Self.makePlainTextDocument()
+        await #expect(throws: DocumentAdapterError.self) {
+            try await XLSXEmitter().emit(plain, to: dest)
+        }
+    }
+
+    // MARK: - Helpers
+
+    /// Emits `workbook` to `dest` and parses the file back. Returns nil when
+    /// the re-parsed representation is not a `Workbook`. The caller owns
+    /// `dest` and is responsible for removing it.
+    private static func roundTrip(_ workbook: Workbook, via dest: URL) async throws -> Workbook? {
+        try await XLSXEmitter().emit(wrap(workbook), to: dest)
+        let document = try await XLSXAdapter().parse(url: dest, sizeLimit: 0)
+        return document.representation.underlying as? Workbook
+    }
+
+    /// A document carrying an empty `PlainTextRepresentation`, used to
+    /// check that the emitter refuses non-workbook input.
+    private static func makePlainTextDocument() -> StructuredDocument {
+        StructuredDocument(
+            formatId: "plaintext",
+            filename: "a.txt",
+            fileSize: 0,
+            representation: AnyStructuredRepresentation(
+                formatId: "plaintext",
+                underlying: PlainTextRepresentation(text: "")
+            ),
+            textFallback: ""
+        )
+    }
+
+    // MARK: - Fixture builder
+
+    /// Wraps `workbook` in the `StructuredDocument` shape the emitter takes.
+    private static func wrap(_ workbook: Workbook) -> StructuredDocument {
+        let representation = AnyStructuredRepresentation(formatId: "xlsx", underlying: workbook)
+        return StructuredDocument(
+            formatId: "xlsx",
+            filename: "fixture.xlsx",
+            fileSize: 0,
+            representation: representation,
+            textFallback: ""
+        )
+    }
+
+    /// Matches the shape of the checked-in `sample.xlsx` fixture so the
+    /// emitter and adapter exercise the same fidelity checklist.
+    private static func makeRoundTripFixture() -> Workbook {
+        let revenueRows = [
+            Row(
+                index: 1,
+                cells: [
+                    Cell(reference: "A1", value: .string("Month")),
+                    Cell(reference: "B1", value: .string("Amount")),
+                ]
+            ),
+            Row(
+                index: 2,
+                cells: [
+                    Cell(reference: "A2", value: .string("January")),
+                    Cell(reference: "B2", value: .number(1200)),
+                ]
+            ),
+            Row(
+                index: 3,
+                cells: [
+                    Cell(reference: "A3", value: .string("February")),
+                    Cell(reference: "B3", value: .number(950)),
+                ]
+            ),
+            Row(
+                index: 4,
+                cells: [
+                    Cell(reference: "A4", value: .string("Total")),
+                    Cell(reference: "B4", value: .empty, formula: "SUM(B2:B3)"),
+                ]
+            ),
+            Row(
+                index: 5,
+                cells: [
+                    Cell(reference: "A5", value: .string("Generated for osaurus tests"))
+                ]
+            ),
+        ]
+        let notesRows = [
+            Row(
+                index: 1,
+                cells: [
+                    Cell(reference: "A1", value: .string("Key")),
+                    Cell(reference: "B1", value: .string("Value")),
+                ]
+            ),
+            Row(
+                index: 2,
+                cells: [
+                    Cell(reference: "A2", value: .string("reviewer")),
+                    Cell(reference: "B2", value: .string("mimeding")),
+                ]
+            ),
+            Row(
+                index: 3,
+                cells: [
+                    Cell(reference: "A3", value: .bool(true)),
+                    Cell(reference: "B3", value: .bool(false)),
+                ]
+            ),
+        ]
+
+        let revenue = Sheet(
+            name: "Revenue",
+            rows: revenueRows,
+            mergedRanges: [CellRange(reference: "A5:B5")]
+        )
+        let notes = Sheet(name: "Notes", rows: notesRows, mergedRanges: [])
+        return Workbook(sheets: [revenue, notes], sharedStrings: [])
+    }
+
+    /// A unique destination path in the temporary directory.
+    private static func tempURL() -> URL {
+        let filename = "osaurus-xlsx-roundtrip-\(UUID().uuidString).xlsx"
+        return FileManager.default.temporaryDirectory.appendingPathComponent(filename)
+    }
+}
From ba66d124edde2723b419c3fe1f3103bfd2ec263c Mon Sep 17 00:00:00 2001
From: mimeding <264272563+mimeding@users.noreply.github.com>
Date: Tue, 28 Apr 2026 18:42:24 -0300
Subject: [PATCH 5/7] Resolve libxlsxwriter workspace package

---
 .../xcshareddata/swiftpm/Package.resolved | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/osaurus.xcworkspace/xcshareddata/swiftpm/Package.resolved
b/osaurus.xcworkspace/xcshareddata/swiftpm/Package.resolved index 18f15f515..6c8c81f72 100644 --- a/osaurus.xcworkspace/xcshareddata/swiftpm/Package.resolved +++ b/osaurus.xcworkspace/xcshareddata/swiftpm/Package.resolved @@ -1,5 +1,5 @@ { - "originHash" : "0fe21e9e78c9dca9c9c9090eaa922b8b3f63117b6f76e3f6d15c36acf5f71a72", + "originHash" : "c1188a7167ae42da4fce56bed21b6f0ad5f487ffcd6ff8a2788d7af08befc3a2", "pins" : [ { "identity" : "aachartkit-swift", @@ -100,6 +100,15 @@ "version" : "2.4.3" } }, + { + "identity" : "libxlsxwriter", + "kind" : "remoteSourceControl", + "location" : "https://github.com/jmcnamara/libxlsxwriter.git", + "state" : { + "revision" : "2894634d65cee6021901a165bfc2bb0fad6da193", + "version" : "1.2.4" + } + }, { "identity" : "mlx-swift", "kind" : "remoteSourceControl", From 82b1d2012bebf4b4ebd88069d4982b42a7117b29 Mon Sep 17 00:00:00 2001 From: Michael Meding <264272563+mimeding@users.noreply.github.com> Date: Fri, 24 Apr 2026 13:14:48 -0300 Subject: [PATCH 6/7] feat(documents): agent tools read_workbook / read_workbook_cell / write_workbook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Exposes the typed Workbook surface to folder-mode agents. Stacks on top of the XLSX read (#929) + write (#936) PRs and completes the stage-4 round-trip goal: an agent can now ingest a spreadsheet, reason about cells and formulas in their native types, and emit a modified workbook — all without the model having to handroll XML. - read_workbook: returns a compact JSON summary of every sheet (names, row counts, merged ranges, truncated cell sample). Capped at 200 cells per sheet so large workbooks don't blow the context window; agents drop to read_workbook_cell for specific values. - read_workbook_cell: single-cell lookup by (path, sheet, A1 ref). Returns value, formula source, and type in a one-line JSON payload. - write_workbook: accepts a structured sheets array and emits the file via XLSXEmitter. 
Each cell carries its A1 ref, typed value, and optional formula; the schema enum guards against unknown types. write_workbook creates parent directories and surfaces a sheetCount / totalCells summary on success. - All three plug into FolderToolFactory.buildCoreTools alongside file_read / file_write, so they're registered the moment a working folder is selected and go away when it's cleared. - Tests: 8 tests covering sheet summary rendering, missing-file and out-of-root rejection, formula preservation on cell lookup, missing- sheet error, end-to-end write + re-parse fidelity, non-xlsx path refusal, and empty-sheets validation. Tests reuse the sample.xlsx fixture from the XLSX read PR. --- Packages/OsaurusCore/Folder/FolderTools.swift | 3 + .../OsaurusCore/Folder/WorkbookTools.swift | 573 ++++++++++++++++++ .../Tests/Documents/WorkbookToolsTests.swift | 181 ++++++ 3 files changed, 757 insertions(+) create mode 100644 Packages/OsaurusCore/Folder/WorkbookTools.swift create mode 100644 Packages/OsaurusCore/Tests/Documents/WorkbookToolsTests.swift diff --git a/Packages/OsaurusCore/Folder/FolderTools.swift b/Packages/OsaurusCore/Folder/FolderTools.swift index 62102188d..0c785ac3a 100644 --- a/Packages/OsaurusCore/Folder/FolderTools.swift +++ b/Packages/OsaurusCore/Folder/FolderTools.swift @@ -1016,6 +1016,9 @@ enum FolderToolFactory { FileWriteTool(rootPath: rootPath), FileEditTool(rootPath: rootPath), FileSearchTool(rootPath: rootPath), + ReadWorkbookTool(rootPath: rootPath), + ReadWorkbookCellTool(rootPath: rootPath), + WriteWorkbookTool(rootPath: rootPath), ] } diff --git a/Packages/OsaurusCore/Folder/WorkbookTools.swift b/Packages/OsaurusCore/Folder/WorkbookTools.swift new file mode 100644 index 000000000..c32490316 --- /dev/null +++ b/Packages/OsaurusCore/Folder/WorkbookTools.swift @@ -0,0 +1,573 @@ +// +// WorkbookTools.swift +// osaurus +// +// Folder-scoped agent tools for reading and writing XLSX workbooks +// through the typed `Workbook` surface. 
Installed by
+//  `FolderToolFactory.buildCoreTools` when a working folder is active.
+//
+//  These tools let an agent ingest a spreadsheet, reason about cells and
+//  formulas in their native types, and emit a modified workbook without
+//  ever dropping to markdown-as-text serialisation. They call
+//  `XLSXAdapter` (read) and `XLSXEmitter` (write) directly.
+//
+//  Path resolution matches `FileReadTool` / `FileWriteTool` — paths are
+//  contained under `rootPath` and `..`-traversal is rejected.
+//
+
+import Foundation
+
+// MARK: - Shared helpers
+
+/// Logic shared by the workbook tools in this file.
+private enum WorkbookToolSupport {
+
+    /// Parses the XLSX file at `url` and unwraps its typed `Workbook`.
+    ///
+    /// Returns `.failure` carrying a ready-to-return `executionError`
+    /// envelope when the adapter throws or hands back a representation
+    /// that is not a `Workbook`.
+    static func loadWorkbook(at url: URL, tool: String) async -> ArgumentRequirement<Workbook> {
+        do {
+            let document = try await XLSXAdapter().parse(
+                url: url,
+                sizeLimit: DocumentLimits.limit(forFormatId: "xlsx")
+            )
+            guard let workbook = document.representation.underlying as? Workbook else {
+                return .failure(
+                    ToolEnvelope.failure(
+                        kind: .executionError,
+                        message: "XLSX adapter returned unexpected representation.",
+                        tool: tool
+                    )
+                )
+            }
+            return .value(workbook)
+        } catch {
+            return .failure(
+                ToolEnvelope.failure(
+                    kind: .executionError,
+                    message: "Failed to read workbook: \(error.localizedDescription)",
+                    tool: tool
+                )
+            )
+        }
+    }
+
+    /// Adds the `type`, `value` (when the cell has one) and `formula`
+    /// (when present) keys for `cell` to `payload`.
+    static func addValue(of cell: Cell, to payload: inout [String: Any]) {
+        switch cell.value {
+        case .empty:
+            payload["type"] = "empty"
+        case .number(let value):
+            payload["type"] = "number"
+            payload["value"] = value
+        case .string(let text):
+            payload["type"] = "string"
+            payload["value"] = text
+        case .inlineString(let text):
+            payload["type"] = "inlineString"
+            payload["value"] = text
+        case .bool(let flag):
+            payload["type"] = "bool"
+            payload["value"] = flag
+        }
+        if let formula = cell.formula {
+            payload["formula"] = formula
+        }
+    }
+}
+
+// MARK: - read_workbook
+
+/// Agent tool that returns a JSON summary of every sheet in an XLSX file
+/// under the working folder.
+struct ReadWorkbookTool: OsaurusTool {
+    let name = "read_workbook"
+    let description =
+        "Read an XLSX spreadsheet into a structured summary. Returns sheet "
+        + "names, row counts, merged ranges, and a truncated cell sample per "
+        + "sheet so the response stays in-context. For a specific cell's value "
+        + "or formula use `read_workbook_cell`. To write a modified workbook, "
+        + "use `write_workbook`."
+
+    let parameters: JSONValue? = .object([
+        "type": .string("object"),
+        "additionalProperties": .bool(false),
+        "properties": .object([
+            "path": .object([
+                "type": .string("string"),
+                "description": .string("Relative path to an .xlsx file under the working folder."),
+            ])
+        ]),
+        "required": .array([.string("path")]),
+    ])
+
+    private let rootPath: URL
+
+    /// Cap on cells returned per sheet. Agents that need more should
+    /// switch to `read_workbook_cell` for the specific reference.
+    private static let maxCellsPerSheet = 200
+
+    init(rootPath: URL) {
+        self.rootPath = rootPath
+    }
+
+    /// Resolves `path` under the root, parses the workbook and returns the
+    /// summary as a success envelope; every failure is returned as a
+    /// failure envelope rather than thrown.
+    func execute(argumentsJSON: String) async throws -> String {
+        let argsReq = requireArgumentsDictionary(argumentsJSON, tool: name)
+        guard case .value(let args) = argsReq else { return argsReq.failureEnvelope ?? "" }
+        let pathReq = requireString(args, "path", expected: "relative path to an .xlsx file", tool: name)
+        guard case .value(let relativePath) = pathReq else { return pathReq.failureEnvelope ?? "" }
+
+        let fileURL: URL
+        do {
+            fileURL = try FolderToolHelpers.resolvePath(relativePath, rootPath: rootPath)
+        } catch {
+            return ToolEnvelope.failure(kind: .invalidArgs, message: error.localizedDescription, tool: name)
+        }
+
+        let workbook: Workbook
+        switch await WorkbookToolSupport.loadWorkbook(at: fileURL, tool: name) {
+        case .value(let loaded): workbook = loaded
+        case .failure(let envelope): return envelope
+        }
+
+        let payload = renderSummary(path: relativePath, workbook: workbook)
+        guard let data = try? JSONSerialization.data(withJSONObject: payload, options: [.sortedKeys]),
+            let text = String(data: data, encoding: .utf8)
+        else {
+            return ToolEnvelope.failure(
+                kind: .executionError,
+                message: "Could not serialise workbook summary.",
+                tool: name
+            )
+        }
+        return ToolEnvelope.success(tool: name, text: text)
+    }
+
+    // MARK: - Summary rendering
+
+    /// Builds the JSON-serialisable summary: one entry per sheet with its
+    /// name, row count, total cell count and at most `maxCellsPerSheet`
+    /// rendered cells. `truncated` is set only when cells were left out and
+    /// `mergedRanges` only when the sheet has any.
+    private func renderSummary(path: String, workbook: Workbook) -> [String: Any] {
+        let sheets: [[String: Any]] = workbook.sheets.map { sheet in
+            let cellCount = sheet.rows.reduce(0) { $0 + $1.cells.count }
+
+            // Render only the cells that will be returned instead of
+            // rendering the whole sheet and discarding the overflow.
+            var sample: [[String: Any]] = []
+            sample.reserveCapacity(min(cellCount, Self.maxCellsPerSheet))
+            rows: for row in sheet.rows {
+                for cell in row.cells {
+                    guard sample.count < Self.maxCellsPerSheet else { break rows }
+                    sample.append(renderCell(row: row.index, cell: cell))
+                }
+            }
+
+            var sheetPayload: [String: Any] = [
+                "name": sheet.name,
+                "rowCount": sheet.rows.count,
+                "cellCount": cellCount,
+                "cells": sample,
+            ]
+            if cellCount > sample.count {
+                sheetPayload["truncated"] = true
+            }
+            if !sheet.mergedRanges.isEmpty {
+                sheetPayload["mergedRanges"] = sheet.mergedRanges.map { $0.reference }
+            }
+            return sheetPayload
+        }
+        return [
+            "path": path,
+            "sheets": sheets,
+        ]
+    }
+
+    /// Renders one cell as `ref`, `row`, `type`, and optional `value` /
+    /// `formula`.
+    private func renderCell(row: Int, cell: Cell) -> [String: Any] {
+        var payload: [String: Any] = ["ref": cell.reference, "row": row]
+        WorkbookToolSupport.addValue(of: cell, to: &payload)
+        return payload
+    }
+}
+
+// MARK: - read_workbook_cell
+
+/// Agent tool that looks up a single cell by sheet name and A1 reference.
+struct ReadWorkbookCellTool: OsaurusTool {
+    let name = "read_workbook_cell"
+    let description =
+        "Read a single cell from an XLSX spreadsheet. Returns value, formula, "
+        + "and type for the referenced cell. Use after `read_workbook` has "
+        + "shown the structure and you need a specific value that was "
+        + "truncated out of the summary."
+
+    let parameters: JSONValue? = .object([
+        "type": .string("object"),
+        "additionalProperties": .bool(false),
+        "properties": .object([
+            "path": .object([
+                "type": .string("string"),
+                "description": .string("Relative path to an .xlsx file under the working folder."),
+            ]),
+            "sheet": .object([
+                "type": .string("string"),
+                "description": .string("Sheet name, e.g. `Revenue`."),
+            ]),
+            "cell": .object([
+                "type": .string("string"),
+                "description": .string("A1-style cell reference, e.g. `B3` or `AA10`."),
+            ]),
+        ]),
+        "required": .array([.string("path"), .string("sheet"), .string("cell")]),
+    ])
+
+    private let rootPath: URL
+
+    init(rootPath: URL) {
+        self.rootPath = rootPath
+    }
+
+    /// Resolves `path`, parses the workbook and returns the requested cell
+    /// as a one-object JSON payload. The sheet name is matched exactly; the
+    /// cell reference is matched ignoring letter case and surrounding
+    /// whitespace.
+    func execute(argumentsJSON: String) async throws -> String {
+        let argsReq = requireArgumentsDictionary(argumentsJSON, tool: name)
+        guard case .value(let args) = argsReq else { return argsReq.failureEnvelope ?? "" }
+        let pathReq = requireString(args, "path", expected: "relative path to an .xlsx file", tool: name)
+        guard case .value(let relativePath) = pathReq else { return pathReq.failureEnvelope ?? "" }
+        let sheetReq = requireString(args, "sheet", expected: "sheet name", tool: name)
+        guard case .value(let sheetName) = sheetReq else { return sheetReq.failureEnvelope ?? "" }
+        let cellReq = requireString(args, "cell", expected: "A1-style cell reference", tool: name)
+        guard case .value(let cellRef) = cellReq else { return cellReq.failureEnvelope ?? "" }
+
+        let fileURL: URL
+        do {
+            fileURL = try FolderToolHelpers.resolvePath(relativePath, rootPath: rootPath)
+        } catch {
+            return ToolEnvelope.failure(kind: .invalidArgs, message: error.localizedDescription, tool: name)
+        }
+
+        let workbook: Workbook
+        switch await WorkbookToolSupport.loadWorkbook(at: fileURL, tool: name) {
+        case .value(let loaded): workbook = loaded
+        case .failure(let envelope): return envelope
+        }
+
+        guard let sheet = workbook.sheets.first(where: { $0.name == sheetName }) else {
+            let available = workbook.sheets.map(\.name).joined(separator: ", ")
+            return ToolEnvelope.failure(
+                kind: .invalidArgs,
+                message:
+                    "Sheet '\(sheetName)' not found. Available sheets: \(available).",
+                field: "sheet",
+                expected: "an existing sheet name",
+                tool: name
+            )
+        }
+
+        // Normalise both sides so a reference typed as "b4" still finds the
+        // cell stored as "B4".
+        let wantedRef = cellRef.trimmingCharacters(in: .whitespacesAndNewlines).uppercased()
+        guard
+            let cell = sheet.rows.flatMap(\.cells).first(where: {
+                $0.reference.uppercased() == wantedRef
+            })
+        else {
+            return ToolEnvelope.failure(
+                kind: .invalidArgs,
+                message: "Cell '\(cellRef)' not found on sheet '\(sheetName)'.",
+                field: "cell",
+                expected: "an occupied cell on the sheet",
+                tool: name
+            )
+        }
+
+        var payload: [String: Any] = ["ref": cell.reference]
+        WorkbookToolSupport.addValue(of: cell, to: &payload)
+
+        guard let data = try? JSONSerialization.data(withJSONObject: payload, options: [.sortedKeys]),
+            let text = String(data: data, encoding: .utf8)
+        else {
+            return ToolEnvelope.failure(
+                kind: .executionError,
+                message: "Could not serialise cell payload.",
+                tool: name
+            )
+        }
+        return ToolEnvelope.success(tool: name, text: text)
+    }
+}
+
+// MARK: - write_workbook
+
+/// Agent tool that builds a `Workbook` from a structured `sheets` argument
+/// and writes it to an `.xlsx` file under the working folder.
+struct WriteWorkbookTool: OsaurusTool {
+    let name = "write_workbook"
+    let description =
+        "Write an XLSX spreadsheet to disk. Accepts a structured `sheets` "
+        + "array so the model never has to format raw XML. Each cell carries "
+        + "its A1 reference, a typed value, and an optional formula. "
+        + "Call `share_artifact` afterwards if you want the file to appear in "
+        + "the chat thread."
+
+    let parameters: JSONValue? = .object([
+        "type": .string("object"),
+        "additionalProperties": .bool(false),
+        "properties": .object([
+            "path": .object([
+                "type": .string("string"),
+                "description": .string("Relative output path, e.g. `report.xlsx`."),
+            ]),
+            "sheets": .object([
+                "type": .string("array"),
+                "description": .string("One or more sheets in display order."),
+                "items": .object([
+                    "type": .string("object"),
+                    "additionalProperties": .bool(false),
+                    "required": .array([.string("name")]),
+                    "properties": .object([
+                        "name": .object([
+                            "type": .string("string"),
+                            "description": .string("Sheet display name."),
+                        ]),
+                        "cells": .object([
+                            "type": .string("array"),
+                            "description": .string(
+                                "Cells to write. Omit to create an empty sheet."
+                            ),
+                            "items": .object([
+                                "type": .string("object"),
+                                "additionalProperties": .bool(false),
+                                "required": .array([.string("ref")]),
+                                "properties": .object([
+                                    "ref": .object([
+                                        "type": .string("string"),
+                                        "description": .string("A1 reference, e.g. `B3`."),
+                                    ]),
+                                    "type": .object([
+                                        "type": .string("string"),
+                                        "description": .string(
+                                            "`string`, `number`, `bool`, or `formula`. "
+                                                + "`inlineString` and `empty` are also accepted so "
+                                                + "cells returned by `read_workbook` can be passed back unchanged."
+                                        ),
+                                        "enum": .array([
+                                            .string("string"),
+                                            .string("number"),
+                                            .string("bool"),
+                                            .string("formula"),
+                                            .string("inlineString"),
+                                            .string("empty"),
+                                        ]),
+                                    ]),
+                                    "value": .object([
+                                        "description": .string(
+                                            "Cell value — string/number/bool. Ignored for `formula` cells; use `formula` instead."
+                                        )
+                                    ]),
+                                    "formula": .object([
+                                        "type": .string("string"),
+                                        "description": .string(
+                                            "Formula source without the leading `=`, e.g. `SUM(A1:A3)`."
+                                        ),
+                                    ]),
+                                ]),
+                            ]),
+                        ]),
+                        "mergedRanges": .object([
+                            "type": .string("array"),
+                            "items": .object(["type": .string("string")]),
+                            "description": .string("Optional A1:A1 merge ranges, e.g. `A1:B1`."),
+                        ]),
+                    ]),
+                ]),
+            ]),
+        ]),
+        "required": .array([.string("path"), .string("sheets")]),
+    ])
+
+    private let rootPath: URL
+
+    init(rootPath: URL) {
+        self.rootPath = rootPath
+    }
+
+    /// Validates the arguments, builds the workbook and emits it through
+    /// `XLSXEmitter`. On success returns a JSON payload with `path`,
+    /// `sheetCount` and `totalCells`; every failure is returned as a
+    /// failure envelope rather than thrown.
+    func execute(argumentsJSON: String) async throws -> String {
+        let argsReq = requireArgumentsDictionary(argumentsJSON, tool: name)
+        guard case .value(let args) = argsReq else { return argsReq.failureEnvelope ?? "" }
+        let pathReq = requireString(args, "path", expected: "relative output path ending in .xlsx", tool: name)
+        guard case .value(let relativePath) = pathReq else { return pathReq.failureEnvelope ?? "" }
+
+        guard let rawSheets = args["sheets"] as? [[String: Any]], !rawSheets.isEmpty else {
+            return ToolEnvelope.failure(
+                kind: .invalidArgs,
+                message: "`sheets` must be a non-empty array of sheet objects.",
+                field: "sheets",
+                expected: "non-empty array",
+                tool: name
+            )
+        }
+
+        let destURL: URL
+        do {
+            destURL = try FolderToolHelpers.resolvePath(relativePath, rootPath: rootPath)
+        } catch {
+            return ToolEnvelope.failure(kind: .invalidArgs, message: error.localizedDescription, tool: name)
+        }
+
+        guard destURL.pathExtension.lowercased() == "xlsx" else {
+            return ToolEnvelope.failure(
+                kind: .invalidArgs,
+                message: "`path` must end in `.xlsx`; got '\(relativePath)'.",
+                field: "path",
+                expected: ".xlsx file path",
+                tool: name
+            )
+        }
+
+        var sheets: [Sheet] = []
+        sheets.reserveCapacity(rawSheets.count)
+        for (index, raw) in rawSheets.enumerated() {
+            switch parseSheet(raw, at: index) {
+            case .value(let sheet): sheets.append(sheet)
+            case .failure(let envelope): return envelope
+            }
+        }
+
+        let workbook = Workbook(sheets: sheets, sharedStrings: [])
+        let document = StructuredDocument(
+            formatId: "xlsx",
+            filename: destURL.lastPathComponent,
+            fileSize: 0,
+            representation: AnyStructuredRepresentation(formatId: "xlsx", underlying: workbook),
+            textFallback: ""
+        )
+
+        // Ensure parent exists so relative writes like `reports/q4.xlsx`
+        // work without a separate `dir_create` round-trip. Best-effort: if
+        // this fails, the emit below fails and reports the error.
+        try? FileManager.default.createDirectory(
+            at: destURL.deletingLastPathComponent(),
+            withIntermediateDirectories: true
+        )
+
+        do {
+            try await XLSXEmitter().emit(document, to: destURL)
+        } catch {
+            return ToolEnvelope.failure(
+                kind: .executionError,
+                message: "Failed to write workbook: \(error.localizedDescription)",
+                tool: name
+            )
+        }
+
+        let payload: [String: Any] = [
+            "path": relativePath,
+            "sheetCount": sheets.count,
+            "totalCells": sheets.reduce(0) { $0 + $1.rows.flatMap(\.cells).count },
+        ]
+        guard let data = try? JSONSerialization.data(withJSONObject: payload, options: [.sortedKeys]),
+            let text = String(data: data, encoding: .utf8)
+        else {
+            return ToolEnvelope.success(tool: name, text: "Wrote workbook to \(relativePath)")
+        }
+        return ToolEnvelope.success(tool: name, text: text)
+    }
+
+    // MARK: - Parsing
+
+    /// Converts one raw sheet object into a `Sheet`, grouping its cells
+    /// into rows by the row number of each reference. Returns `.failure`
+    /// with an `invalidArgs` envelope when the name is missing or empty, or
+    /// when any cell fails to parse.
+    private func parseSheet(
+        _ raw: [String: Any],
+        at index: Int
+    ) -> ArgumentRequirement<Sheet> {
+        guard let sheetName = raw["name"] as? String, !sheetName.isEmpty else {
+            return .failure(
+                ToolEnvelope.failure(
+                    kind: .invalidArgs,
+                    message: "Sheet at index \(index) is missing a non-empty `name`.",
+                    field: "sheets[\(index)].name",
+                    expected: "non-empty string",
+                    tool: name
+                )
+            )
+        }
+
+        let rawCells = raw["cells"] as? [[String: Any]] ?? []
+        var cellsByRow: [Int: [Cell]] = [:]
+        for (cellIndex, rawCell) in rawCells.enumerated() {
+            switch parseCell(rawCell, sheetIndex: index, cellIndex: cellIndex) {
+            case .value(let (row, cell)):
+                cellsByRow[row, default: []].append(cell)
+            case .failure(let envelope): return .failure(envelope)
+            }
+        }
+        let rows = cellsByRow.keys.sorted().map { rowIndex in
+            Row(index: rowIndex, cells: cellsByRow[rowIndex] ?? [])
+        }
+
+        let mergedRanges: [CellRange] =
+            (raw["mergedRanges"] as? [String])?
+            .map { CellRange(reference: $0) } ?? []
+
+        return .value(Sheet(name: sheetName, rows: rows, mergedRanges: mergedRanges))
+    }
+
+    /// Converts one raw cell object into its 1-based row number and `Cell`.
+    ///
+    /// `type` is matched case-insensitively. When it is omitted the value's
+    /// own JSON type decides between string, number and bool. A non-empty
+    /// `formula` is kept on the cell whatever its type, so cells returned
+    /// by `read_workbook` (typed value plus formula) survive being written
+    /// back. Returns `.failure` with an `invalidArgs` envelope for a missing
+    /// or malformed `ref`, a `formula`-typed cell without a formula, or an
+    /// unknown `type`.
+    private func parseCell(
+        _ raw: [String: Any],
+        sheetIndex: Int,
+        cellIndex: Int
+    ) -> ArgumentRequirement<(Int, Cell)> {
+        guard let ref = raw["ref"] as? String, !ref.isEmpty else {
+            return .failure(
+                ToolEnvelope.failure(
+                    kind: .invalidArgs,
+                    message: "Cell \(cellIndex) on sheet \(sheetIndex) is missing `ref`.",
+                    field: "sheets[\(sheetIndex)].cells[\(cellIndex)].ref",
+                    expected: "A1-style reference",
+                    tool: name
+                )
+            )
+        }
+        guard let rowOneBased = rowComponent(of: ref) else {
+            return .failure(
+                ToolEnvelope.failure(
+                    kind: .invalidArgs,
+                    message: "Cell reference '\(ref)' is not valid A1.",
+                    field: "sheets[\(sheetIndex)].cells[\(cellIndex)].ref",
+                    expected: "A1-style reference",
+                    tool: name
+                )
+            )
+        }
+
+        // An empty formula string is treated the same as no formula.
+        let formula = (raw["formula"] as? String).flatMap { $0.isEmpty ? nil : $0 }
+
+        let typeHint = (raw["type"] as? String)?.lowercased()
+        let value: CellValue
+        switch typeHint {
+        case "formula":
+            guard formula != nil else {
+                return .failure(
+                    ToolEnvelope.failure(
+                        kind: .invalidArgs,
+                        message: "Cell '\(ref)' is typed as `formula` but has no `formula` string.",
+                        field: "sheets[\(sheetIndex)].cells[\(cellIndex)].formula",
+                        expected: "non-empty formula string",
+                        tool: name
+                    )
+                )
+            }
+            value = .empty
+        case "empty":
+            value = .empty
+        case "bool":
+            value = .bool((raw["value"] as? Bool) ?? false)
+        case "number":
+            if let n = raw["value"] as? Double {
+                value = .number(n)
+            } else if let n = (raw["value"] as? NSNumber)?.doubleValue {
+                value = .number(n)
+            } else if let s = raw["value"] as? String, let n = Double(s) {
+                value = .number(n)
+            } else {
+                value = .empty
+            }
+        case "inlinestring":
+            if let s = raw["value"] as? String {
+                value = .inlineString(s)
+            } else {
+                value = .empty
+            }
+        case "string", nil:
+            if let s = raw["value"] as? String {
+                value = .string(s)
+            } else if let n = raw["value"] as? NSNumber {
+                // JSON `true` / `false` also cast to NSNumber, so telling
+                // them apart from 1 / 0 needs the CFBoolean type check; a
+                // plain `as? Bool` would also match the numbers 0 and 1.
+                if CFGetTypeID(n) == CFBooleanGetTypeID() {
+                    value = .bool(n.boolValue)
+                } else {
+                    value = .number(n.doubleValue)
+                }
+            } else {
+                value = .empty
+            }
+        default:
+            return .failure(
+                ToolEnvelope.failure(
+                    kind: .invalidArgs,
+                    message: "Cell '\(ref)' has unknown type '\(typeHint ?? "?")'.",
+                    field: "sheets[\(sheetIndex)].cells[\(cellIndex)].type",
+                    expected: "string / number / bool / formula",
+                    tool: name
+                )
+            )
+        }
+        return .value((rowOneBased, Cell(reference: ref, value: value, formula: formula)))
+    }
+
+    /// Returns the 1-based row number of an A1 reference, or nil when the
+    /// reference is not one or more ASCII letters followed by one or more
+    /// ASCII digits naming a row of at least 1.
+    private func rowComponent(of reference: String) -> Int? {
+        var sawLetter = false
+        var digits = ""
+        for scalar in reference.unicodeScalars {
+            switch scalar.value {
+            case 0x41 ... 0x5A, 0x61 ... 0x7A:  // A-Z, a-z
+                // A letter after the first digit ("A1B") is not A1 notation.
+                guard digits.isEmpty else { return nil }
+                sawLetter = true
+            case 0x30 ... 0x39:  // 0-9
+                // A digit before any letter ("1A") is not A1 notation.
+                guard sawLetter else { return nil }
+                digits.unicodeScalars.append(scalar)
+            default:
+                return nil
+            }
+        }
+        guard let row = Int(digits), row > 0 else { return nil }
+        return row
+    }
+}
diff --git a/Packages/OsaurusCore/Tests/Documents/WorkbookToolsTests.swift b/Packages/OsaurusCore/Tests/Documents/WorkbookToolsTests.swift
new file mode 100644
index 000000000..3b754c43f
--- /dev/null
+++ b/Packages/OsaurusCore/Tests/Documents/WorkbookToolsTests.swift
@@ -0,0 +1,181 @@
+//
+//  WorkbookToolsTests.swift
+//  osaurusTests
+//
+//  End-to-end tests for the `read_workbook` / `read_workbook_cell` /
+//  `write_workbook` agent tools. Uses the checked-in sample.xlsx fixture
+//  for the read paths and a temp directory for the write path so the
+//  three tools exercise the same XLSXAdapter / XLSXEmitter pair that
+//  agents see in production.
+//
+
+import Foundation
+import Testing
+
+@testable import OsaurusCore
+
+@Suite("Workbook agent tools")
+struct WorkbookToolsTests {
+
+    private let rootPath: URL
+    private let fixturePath: URL
+
+    init() throws {
+        let tmp = FileManager.default.temporaryDirectory
+            .appendingPathComponent("osaurus-wb-tools-\(UUID().uuidString)", isDirectory: true)
+        try FileManager.default.createDirectory(at: tmp, withIntermediateDirectories: true)
+        rootPath = tmp
+
+        // Copy the fixture into the temp root so the tools can resolve
+        // "sample.xlsx" as a relative path under the working folder.
+ guard + let bundled = Bundle.module.url( + forResource: "sample", + withExtension: "xlsx", + subdirectory: "Fixtures/xlsx" + ) + else { + throw FixtureError.missing + } + fixturePath = tmp.appendingPathComponent("sample.xlsx") + try FileManager.default.copyItem(at: bundled, to: fixturePath) + } + + // MARK: - read_workbook + + @Test func readWorkbook_returnsSheetSummaries() async throws { + let tool = ReadWorkbookTool(rootPath: rootPath) + let envelope = try await tool.execute(argumentsJSON: #"{"path":"sample.xlsx"}"#) + let payload = try Self.successTextAsDict(envelope) + + #expect(payload["path"] as? String == "sample.xlsx") + let sheets = payload["sheets"] as? [[String: Any]] ?? [] + #expect(sheets.count == 2) + #expect(sheets.map { $0["name"] as? String }.contains { $0 == "Revenue" }) + #expect(sheets.map { $0["name"] as? String }.contains { $0 == "Notes" }) + + let revenue = sheets.first { $0["name"] as? String == "Revenue" } ?? [:] + let merged = revenue["mergedRanges"] as? [String] ?? [] + #expect(merged.contains("A5:B5")) + } + + @Test func readWorkbook_rejectsMissingFile() async throws { + let tool = ReadWorkbookTool(rootPath: rootPath) + let envelope = try await tool.execute(argumentsJSON: #"{"path":"nope.xlsx"}"#) + #expect(envelope.contains("\"kind\":\"execution_error\"") || envelope.contains("\"ok\":false")) + } + + @Test func readWorkbook_rejectsPathOutsideRoot() async throws { + let tool = ReadWorkbookTool(rootPath: rootPath) + let envelope = try await tool.execute(argumentsJSON: #"{"path":"../outside.xlsx"}"#) + #expect(envelope.contains("outside") || envelope.contains("invalid")) + } + + // MARK: - read_workbook_cell + + @Test func readWorkbookCell_returnsFormulaAndValue() async throws { + let tool = ReadWorkbookCellTool(rootPath: rootPath) + let envelope = try await tool.execute( + argumentsJSON: #"{"path":"sample.xlsx","sheet":"Revenue","cell":"B4"}"# + ) + let payload = try Self.successTextAsDict(envelope) + #expect(payload["ref"] as? 
String == "B4") + #expect(payload["formula"] as? String == "SUM(B2:B3)") + } + + @Test func readWorkbookCell_rejectsMissingSheet() async throws { + let tool = ReadWorkbookCellTool(rootPath: rootPath) + let envelope = try await tool.execute( + argumentsJSON: #"{"path":"sample.xlsx","sheet":"Ghost","cell":"A1"}"# + ) + #expect(envelope.contains("not found")) + } + + // MARK: - write_workbook + + @Test func writeWorkbook_emitsAndRoundTrips() async throws { + let tool = WriteWorkbookTool(rootPath: rootPath) + let input = #""" + { + "path": "output.xlsx", + "sheets": [ + { + "name": "Numbers", + "cells": [ + {"ref": "A1", "type": "string", "value": "Label"}, + {"ref": "B1", "type": "number", "value": 42}, + {"ref": "A2", "type": "bool", "value": true}, + {"ref": "C1", "type": "formula", "formula": "B1*2"} + ], + "mergedRanges": ["A3:B3"] + } + ] + } + """# + let envelope = try await tool.execute(argumentsJSON: input) + let payload = try Self.successTextAsDict(envelope) + #expect(payload["sheetCount"] as? Int == 1) + + let outURL = rootPath.appendingPathComponent("output.xlsx") + #expect(FileManager.default.fileExists(atPath: outURL.path)) + + // Round-trip through XLSXAdapter to confirm the cells the agent + // requested actually landed in the file. + let reparsed = try await XLSXAdapter().parse(url: outURL, sizeLimit: 0) + guard let workbook = reparsed.representation.underlying as? Workbook else { + Issue.record("re-parsed representation was not a Workbook") + return + } + #expect(workbook.sheets.first?.name == "Numbers") + let cells = workbook.sheets.first?.rows.flatMap(\.cells) ?? 
[] + #expect(cells.contains { $0.reference == "A1" }) + #expect(cells.contains { $0.reference == "B1" }) + #expect(cells.contains { $0.reference == "C1" && $0.formula == "B1*2" }) + } + + @Test func writeWorkbook_rejectsNonXLSXPath() async throws { + let tool = WriteWorkbookTool(rootPath: rootPath) + let envelope = try await tool.execute( + argumentsJSON: #"{"path":"report.txt","sheets":[{"name":"Sheet1","cells":[]}]}"# + ) + #expect(envelope.contains("must end in")) + } + + @Test func writeWorkbook_rejectsEmptySheets() async throws { + let tool = WriteWorkbookTool(rootPath: rootPath) + let envelope = try await tool.execute( + argumentsJSON: #"{"path":"out.xlsx","sheets":[]}"# + ) + #expect(envelope.contains("non-empty")) + } + + // MARK: - Helpers + + /// Extracts the inner `result.text` from a `ToolEnvelope.success` JSON + /// and parses it as a dictionary — the envelope wraps every tool + /// response so tests have to peel one layer. + private static func successTextAsDict(_ envelope: String) throws -> [String: Any] { + let data = envelope.data(using: .utf8) ?? Data() + guard let obj = try JSONSerialization.jsonObject(with: data) as? [String: Any], + let result = obj["result"] as? [String: Any], + let text = result["text"] as? String, + let innerData = text.data(using: .utf8), + let inner = try JSONSerialization.jsonObject(with: innerData) as? 
[String: Any] + else { + throw FixtureError.notSuccessEnvelope(envelope) + } + return inner + } + + private enum FixtureError: Error, CustomStringConvertible { + case missing + case notSuccessEnvelope(String) + + var description: String { + switch self { + case .missing: return "Bundle.module lost the sample.xlsx fixture" + case .notSuccessEnvelope(let raw): return "Not a success envelope: \(raw)" + } + } + } +} From 09dc52eff1c9e0233b8e69a7ff19c45f94fe2dcf Mon Sep 17 00:00:00 2001 From: Michael Meding Date: Sun, 3 May 2026 19:59:14 -0300 Subject: [PATCH 7/7] fix(documents): satisfy strict lint for workbook tools rebase Business rationale: Workbook tools expose the spreadsheet round trip to agents, and the rebased branch needs to stay CI-clean before review. Coding rationale: This keeps cleanup scoped to lint-only shape fixes inherited from lower stacked branches while preserving behavior and main's lean folder-tool activation model. Co-authored-by: Codex --- Packages/OsaurusCore/AppDelegate.swift | 4 ++-- Packages/OsaurusCore/Folder/FolderTools.swift | 11 ++++------- Packages/OsaurusCore/Utils/DocumentParser.swift | 16 ++++++++-------- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/Packages/OsaurusCore/AppDelegate.swift b/Packages/OsaurusCore/AppDelegate.swift index d7e8cc9e7..780438e0d 100644 --- a/Packages/OsaurusCore/AppDelegate.swift +++ b/Packages/OsaurusCore/AppDelegate.swift @@ -179,7 +179,7 @@ public final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelega #endif // Initialize directory access early so security-scoped bookmark is active - let _ = DirectoryPickerService.shared + _ = DirectoryPickerService.shared if LaunchGuard.isSafeMode { NotificationService.shared.postSafeModeActive() @@ -880,7 +880,7 @@ extension AppDelegate { } @objc private func handleServeCommand(_ note: Notification) { - var desiredPort: Int? = nil + var desiredPort: Int? 
var exposeFlag: Bool = false if let ui = note.userInfo { if let p = ui["port"] as? Int { diff --git a/Packages/OsaurusCore/Folder/FolderTools.swift b/Packages/OsaurusCore/Folder/FolderTools.swift index 0c785ac3a..c681d5b32 100644 --- a/Packages/OsaurusCore/Folder/FolderTools.swift +++ b/Packages/OsaurusCore/Folder/FolderTools.swift @@ -82,10 +82,9 @@ enum FolderToolHelpers { static func detectProjectType(_ url: URL) -> ProjectType { let fm = FileManager.default for projectType in ProjectType.allCases where projectType != .unknown { - for manifestFile in projectType.manifestFiles { - if fm.fileExists(atPath: url.appendingPathComponent(manifestFile).path) { - return projectType - } + for manifestFile in projectType.manifestFiles + where fm.fileExists(atPath: url.appendingPathComponent(manifestFile).path) { + return projectType } } return .unknown @@ -638,9 +637,7 @@ struct FileSearchTool: OsaurusTool { if let pattern = filePattern { let regex = pattern.replacingOccurrences(of: ".", with: "\\.") .replacingOccurrences(of: "*", with: ".*") - if fileURL.lastPathComponent.range(of: "^\(regex)$", options: .regularExpression) - == nil - { + if fileURL.lastPathComponent.range(of: "^\(regex)$", options: .regularExpression) == nil { continue } } diff --git a/Packages/OsaurusCore/Utils/DocumentParser.swift b/Packages/OsaurusCore/Utils/DocumentParser.swift index 7a6c2e8c1..17140495f 100644 --- a/Packages/OsaurusCore/Utils/DocumentParser.swift +++ b/Packages/OsaurusCore/Utils/DocumentParser.swift @@ -155,10 +155,10 @@ enum DocumentParser { return try String(contentsOf: url, encoding: .utf8) } catch { // Retry with latin1 for binary-ish text files - if let data = try? Data(contentsOf: url), - let str = String(data: data, encoding: .isoLatin1) - { - return str + if let data = try? 
Data(contentsOf: url) { + if let str = String(data: data, encoding: .isoLatin1) { + return str + } } throw ParseError.readFailed(error.localizedDescription) } @@ -197,10 +197,10 @@ enum DocumentParser { private static func extractPDFText(from document: PDFDocument) -> String { var pages: [String] = [] for i in 0 ..< document.pageCount { - if let page = document.page(at: i), let text = page.string, - !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty - { - pages.append(text) + if let page = document.page(at: i), let text = page.string { + if !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { + pages.append(text) + } } } return pages.joined(separator: "\n\n")