diff --git a/Packages/OsaurusCore/AppDelegate.swift b/Packages/OsaurusCore/AppDelegate.swift index 175a2d095..780438e0d 100644 --- a/Packages/OsaurusCore/AppDelegate.swift +++ b/Packages/OsaurusCore/AppDelegate.swift @@ -32,6 +32,10 @@ public final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelega // the specific crash class this prevents. MLXErrorRecovery.installGlobalHandler() + // Register in-tree document format adapters before any file-ingress + // path can run. Idempotent; safe if a future migration moves this. + DocumentAdaptersBootstrap.registerBuiltIns() + // Detect repeated startup crashes and enter safe mode if needed LaunchGuard.checkOnLaunch() @@ -175,7 +179,7 @@ public final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelega #endif // Initialize directory access early so security-scoped bookmark is active - let _ = DirectoryPickerService.shared + _ = DirectoryPickerService.shared if LaunchGuard.isSafeMode { NotificationService.shared.postSafeModeActive() @@ -876,7 +880,7 @@ extension AppDelegate { } @objc private func handleServeCommand(_ note: Notification) { - var desiredPort: Int? = nil + var desiredPort: Int? var exposeFlag: Bool = false if let ui = note.userInfo { if let p = ui["port"] as? 
Int { diff --git a/Packages/OsaurusCore/Folder/FolderTools.swift b/Packages/OsaurusCore/Folder/FolderTools.swift index 62102188d..cee0dba48 100644 --- a/Packages/OsaurusCore/Folder/FolderTools.swift +++ b/Packages/OsaurusCore/Folder/FolderTools.swift @@ -82,10 +82,11 @@ enum FolderToolHelpers { static func detectProjectType(_ url: URL) -> ProjectType { let fm = FileManager.default for projectType in ProjectType.allCases where projectType != .unknown { - for manifestFile in projectType.manifestFiles { - if fm.fileExists(atPath: url.appendingPathComponent(manifestFile).path) { - return projectType - } + let hasManifest = projectType.manifestFiles.contains { manifestFile in + fm.fileExists(atPath: url.appendingPathComponent(manifestFile).path) + } + if hasManifest { + return projectType } } return .unknown @@ -638,9 +639,10 @@ struct FileSearchTool: OsaurusTool { if let pattern = filePattern { let regex = pattern.replacingOccurrences(of: ".", with: "\\.") .replacingOccurrences(of: "*", with: ".*") - if fileURL.lastPathComponent.range(of: "^\(regex)$", options: .regularExpression) - == nil - { + if fileURL.lastPathComponent.range( + of: "^\(regex)$", + options: .regularExpression + ) == nil { continue } } @@ -1016,6 +1018,9 @@ enum FolderToolFactory { FileWriteTool(rootPath: rootPath), FileEditTool(rootPath: rootPath), FileSearchTool(rootPath: rootPath), + ReadWorkbookTool(rootPath: rootPath), + ReadWorkbookCellTool(rootPath: rootPath), + WriteWorkbookTool(rootPath: rootPath), ] } diff --git a/Packages/OsaurusCore/Folder/WorkbookTools.swift b/Packages/OsaurusCore/Folder/WorkbookTools.swift new file mode 100644 index 000000000..c32490316 --- /dev/null +++ b/Packages/OsaurusCore/Folder/WorkbookTools.swift @@ -0,0 +1,573 @@ +// +// WorkbookTools.swift +// osaurus +// +// Folder-scoped agent tools for reading and writing XLSX workbooks +// through the typed `Workbook` surface. Installed by +// `FolderToolFactory.buildCoreTools` when a working folder is active. 
//
// These tools let an agent ingest a spreadsheet, reason about cells and
// formulas in their native types, and emit a modified workbook without
// ever dropping to markdown-as-text serialisation. They instantiate
// `XLSXAdapter` (read) and `XLSXEmitter` (write) directly rather than
// looking them up through `DocumentFormatRegistry`.
//
// Path resolution goes through `FolderToolHelpers.resolvePath`, the same
// helper the other folder tools use, so paths stay under `rootPath`.
//

import Foundation

// MARK: - Shared helpers

/// Result of loading a workbook for one of the read tools: either the typed
/// workbook or a ready-to-return failure envelope.
private enum WorkbookLoadOutcome {
    case loaded(Workbook)
    case failed(envelope: String)
}

/// Code shared by the workbook tools so both read paths load, render and
/// serialise identically.
private enum WorkbookToolSupport {
    /// Parses the XLSX file at `fileURL` and unwraps the typed `Workbook`.
    ///
    /// - Parameters:
    ///   - fileURL: Already-resolved location of the workbook.
    ///   - tool: Tool name stamped on any failure envelope.
    /// - Returns: `.loaded` on success; `.failed` carrying an
    ///   `executionError` envelope when parsing throws or the adapter hands
    ///   back a representation that is not a `Workbook`.
    static func loadWorkbook(at fileURL: URL, tool: String) async -> WorkbookLoadOutcome {
        do {
            let document = try await XLSXAdapter().parse(
                url: fileURL,
                sizeLimit: DocumentLimits.limit(forFormatId: "xlsx")
            )
            guard let workbook = document.representation.underlying as? Workbook else {
                return .failed(
                    envelope: ToolEnvelope.failure(
                        kind: .executionError,
                        message: "XLSX adapter returned unexpected representation.",
                        tool: tool
                    )
                )
            }
            return .loaded(workbook)
        } catch {
            return .failed(
                envelope: ToolEnvelope.failure(
                    kind: .executionError,
                    message: "Failed to read workbook: \(error.localizedDescription)",
                    tool: tool
                )
            )
        }
    }

    /// JSON-ready dictionary for one cell: `ref`, `type`, `value` (absent for
    /// empty cells) and `formula` (only when the cell carries one).
    static func cellPayload(for cell: Cell) -> [String: Any] {
        var payload: [String: Any] = ["ref": cell.reference]
        switch cell.value {
        case .empty:
            payload["type"] = "empty"
        case .number(let number):
            payload["type"] = "number"
            // NaN / infinity are not representable in JSON; surface them as
            // text so one odd cell cannot take down the whole response.
            payload["value"] = number.isFinite ? number as Any : "\(number)" as Any
        case .string(let text):
            payload["type"] = "string"
            payload["value"] = text
        case .inlineString(let text):
            payload["type"] = "inlineString"
            payload["value"] = text
        case .bool(let flag):
            payload["type"] = "bool"
            payload["value"] = flag
        }
        if let formula = cell.formula {
            payload["formula"] = formula
        }
        return payload
    }

    /// Serialises `payload` with sorted keys, or returns `nil` when it cannot
    /// be encoded.
    ///
    /// `JSONSerialization.data(withJSONObject:)` raises an Objective-C
    /// exception (not a Swift error) for invalid objects, so validity is
    /// checked first instead of relying on `try?`.
    static func encodeJSON(_ payload: [String: Any]) -> String? {
        guard JSONSerialization.isValidJSONObject(payload),
            let data = try? JSONSerialization.data(withJSONObject: payload, options: [.sortedKeys])
        else {
            return nil
        }
        return String(data: data, encoding: .utf8)
    }
}

// MARK: - read_workbook

/// Agent tool that summarises an XLSX file: per-sheet name, row and cell
/// counts, merged ranges and a capped sample of cells.
struct ReadWorkbookTool: OsaurusTool {
    let name = "read_workbook"
    let description =
        "Read an XLSX spreadsheet into a structured summary. Returns sheet "
        + "names, row counts, merged ranges, and a truncated cell sample per "
        + "sheet so the response stays in-context. For a specific cell's value "
        + "or formula use `read_workbook_cell`. To write a modified workbook, "
        + "use `write_workbook`."

    let parameters: JSONValue? = .object([
        "type": .string("object"),
        "additionalProperties": .bool(false),
        "properties": .object([
            "path": .object([
                "type": .string("string"),
                "description": .string("Relative path to an .xlsx file under the working folder."),
            ])
        ]),
        "required": .array([.string("path")]),
    ])

    private let rootPath: URL

    /// Cap on cells returned per sheet. Agents that need more should
    /// switch to `read_workbook_cell` for the specific reference.
    private static let maxCellsPerSheet = 200

    init(rootPath: URL) {
        self.rootPath = rootPath
    }

    /// Runs the tool.
    ///
    /// - Parameter argumentsJSON: JSON object with a required `path` string.
    /// - Returns: A success envelope whose text is the JSON summary, or a
    ///   failure envelope (`invalidArgs` for a bad path, `executionError`
    ///   when the workbook cannot be read or serialised).
    func execute(argumentsJSON: String) async throws -> String {
        let argsReq = requireArgumentsDictionary(argumentsJSON, tool: name)
        guard case .value(let args) = argsReq else { return argsReq.failureEnvelope ?? "" }
        let pathReq = requireString(args, "path", expected: "relative path to an .xlsx file", tool: name)
        guard case .value(let relativePath) = pathReq else { return pathReq.failureEnvelope ?? "" }

        let fileURL: URL
        do {
            fileURL = try FolderToolHelpers.resolvePath(relativePath, rootPath: rootPath)
        } catch {
            return ToolEnvelope.failure(kind: .invalidArgs, message: error.localizedDescription, tool: name)
        }

        let workbook: Workbook
        switch await WorkbookToolSupport.loadWorkbook(at: fileURL, tool: name) {
        case .loaded(let loaded): workbook = loaded
        case .failed(let envelope): return envelope
        }

        let payload = renderSummary(path: relativePath, workbook: workbook)
        guard let text = WorkbookToolSupport.encodeJSON(payload) else {
            return ToolEnvelope.failure(
                kind: .executionError,
                message: "Could not serialise workbook summary.",
                tool: name
            )
        }
        return ToolEnvelope.success(tool: name, text: text)
    }

    // MARK: - Summary rendering

    /// Builds the summary dictionary. `cellCount` is the true total for the
    /// sheet; `cells` holds at most `maxCellsPerSheet` entries and
    /// `truncated` is set only when some were left out.
    private func renderSummary(path: String, workbook: Workbook) -> [String: Any] {
        let sheets: [[String: Any]] = workbook.sheets.map { sheet in
            let cellCount = sheet.rows.reduce(0) { $0 + $1.cells.count }

            // Render only the cells that make it into the sample; a large
            // sheet would otherwise build a payload for every cell just to
            // throw most of them away.
            var sample: [[String: Any]] = []
            sample.reserveCapacity(min(cellCount, Self.maxCellsPerSheet))
            collect: for row in sheet.rows {
                for cell in row.cells {
                    guard sample.count < Self.maxCellsPerSheet else { break collect }
                    sample.append(renderCell(row: row.index, cell: cell))
                }
            }

            var sheetPayload: [String: Any] = [
                "name": sheet.name,
                "rowCount": sheet.rows.count,
                "cellCount": cellCount,
                "cells": sample,
            ]
            if cellCount > sample.count {
                sheetPayload["truncated"] = true
            }
            if !sheet.mergedRanges.isEmpty {
                sheetPayload["mergedRanges"] = sheet.mergedRanges.map { $0.reference }
            }
            return sheetPayload
        }
        return [
            "path": path,
            "sheets": sheets,
        ]
    }

    /// Cell payload plus the owning row's index under `row`.
    private func renderCell(row: Int, cell: Cell) -> [String: Any] {
        var payload = WorkbookToolSupport.cellPayload(for: cell)
        payload["row"] = row
        return payload
    }
}

// MARK: - read_workbook_cell

/// Agent tool that returns the type, value and formula of one cell.
struct ReadWorkbookCellTool: OsaurusTool {
    let name = "read_workbook_cell"
    let description =
        "Read a single cell from an XLSX spreadsheet. Returns value, formula, "
        + "and type for the referenced cell. Use after `read_workbook` has "
        + "shown the structure and you need a specific value that was "
        + "truncated out of the summary."

    let parameters: JSONValue? = .object([
        "type": .string("object"),
        "additionalProperties": .bool(false),
        "properties": .object([
            "path": .object([
                "type": .string("string"),
                "description": .string("Relative path to an .xlsx file under the working folder."),
            ]),
            "sheet": .object([
                "type": .string("string"),
                "description": .string("Sheet name, e.g. `Revenue`."),
            ]),
            "cell": .object([
                "type": .string("string"),
                "description": .string("A1-style cell reference, e.g. `B3` or `AA10`."),
            ]),
        ]),
        "required": .array([.string("path"), .string("sheet"), .string("cell")]),
    ])

    private let rootPath: URL

    init(rootPath: URL) {
        self.rootPath = rootPath
    }

    /// Runs the tool.
    ///
    /// - Parameter argumentsJSON: JSON object with `path`, `sheet`, `cell`.
    /// - Returns: A success envelope with the cell as JSON, or a failure
    ///   envelope naming the offending field. The sheet name is matched
    ///   exactly first and case-insensitively as a fallback; the cell
    ///   reference is compared case-insensitively after trimming whitespace.
    func execute(argumentsJSON: String) async throws -> String {
        let argsReq = requireArgumentsDictionary(argumentsJSON, tool: name)
        guard case .value(let args) = argsReq else { return argsReq.failureEnvelope ?? "" }
        let pathReq = requireString(args, "path", expected: "relative path to an .xlsx file", tool: name)
        guard case .value(let relativePath) = pathReq else { return pathReq.failureEnvelope ?? "" }
        let sheetReq = requireString(args, "sheet", expected: "sheet name", tool: name)
        guard case .value(let sheetName) = sheetReq else { return sheetReq.failureEnvelope ?? "" }
        let cellReq = requireString(args, "cell", expected: "A1-style cell reference", tool: name)
        guard case .value(let cellRef) = cellReq else { return cellReq.failureEnvelope ?? "" }

        let fileURL: URL
        do {
            fileURL = try FolderToolHelpers.resolvePath(relativePath, rootPath: rootPath)
        } catch {
            return ToolEnvelope.failure(kind: .invalidArgs, message: error.localizedDescription, tool: name)
        }

        let workbook: Workbook
        switch await WorkbookToolSupport.loadWorkbook(at: fileURL, tool: name) {
        case .loaded(let loaded): workbook = loaded
        case .failed(let envelope): return envelope
        }

        let exactSheet = workbook.sheets.first(where: { $0.name == sheetName })
        let looseSheet = workbook.sheets.first(where: {
            $0.name.caseInsensitiveCompare(sheetName) == .orderedSame
        })
        guard let sheet = exactSheet ?? looseSheet else {
            let available = workbook.sheets.map(\.name).joined(separator: ", ")
            return ToolEnvelope.failure(
                kind: .invalidArgs,
                message:
                    "Sheet '\(sheetName)' not found. Available sheets: \(available).",
                field: "sheet",
                expected: "an existing sheet name",
                tool: name
            )
        }

        let wantedRef = cellRef.trimmingCharacters(in: .whitespacesAndNewlines).uppercased()
        guard let cell = findCell(wantedRef, in: sheet) else {
            return ToolEnvelope.failure(
                kind: .invalidArgs,
                message: "Cell '\(cellRef)' not found on sheet '\(sheetName)'.",
                field: "cell",
                expected: "an occupied cell on the sheet",
                tool: name
            )
        }

        let payload = WorkbookToolSupport.cellPayload(for: cell)
        guard let text = WorkbookToolSupport.encodeJSON(payload) else {
            return ToolEnvelope.failure(
                kind: .executionError,
                message: "Could not serialise cell payload.",
                tool: name
            )
        }
        return ToolEnvelope.success(tool: name, text: text)
    }

    /// First cell on `sheet` whose reference equals `uppercasedReference`
    /// ignoring case; stops at the first hit instead of flattening all rows.
    private func findCell(_ uppercasedReference: String, in sheet: Sheet) -> Cell? {
        for row in sheet.rows {
            if let hit = row.cells.first(where: { $0.reference.uppercased() == uppercasedReference }) {
                return hit
            }
        }
        return nil
    }
}

// MARK: - write_workbook

/// Agent tool that builds a `Workbook` from structured arguments and writes
/// it to an `.xlsx` file through `XLSXEmitter`.
struct WriteWorkbookTool: OsaurusTool {
    let name = "write_workbook"
    let description =
        "Write an XLSX spreadsheet to disk. Accepts a structured `sheets` "
        + "array so the model never has to format raw XML. Each cell carries "
        + "its A1 reference, a typed value, and an optional formula. "
        + "Call `share_artifact` afterwards if you want the file to appear in "
        + "the chat thread."

    let parameters: JSONValue? = .object([
        "type": .string("object"),
        "additionalProperties": .bool(false),
        "properties": .object([
            "path": .object([
                "type": .string("string"),
                "description": .string("Relative output path, e.g. `report.xlsx`."),
            ]),
            "sheets": .object([
                "type": .string("array"),
                "description": .string("One or more sheets in display order."),
                "items": .object([
                    "type": .string("object"),
                    "additionalProperties": .bool(false),
                    "required": .array([.string("name")]),
                    "properties": .object([
                        "name": .object([
                            "type": .string("string"),
                            "description": .string("Sheet display name."),
                        ]),
                        "cells": .object([
                            "type": .string("array"),
                            "description": .string(
                                "Cells to write. Omit to create an empty sheet."
                            ),
                            "items": .object([
                                "type": .string("object"),
                                "additionalProperties": .bool(false),
                                "required": .array([.string("ref")]),
                                "properties": .object([
                                    "ref": .object([
                                        "type": .string("string"),
                                        "description": .string("A1 reference, e.g. `B3`."),
                                    ]),
                                    "type": .object([
                                        "type": .string("string"),
                                        "description": .string(
                                            "`string`, `number`, `bool`, or `formula`."
                                        ),
                                        "enum": .array([
                                            .string("string"),
                                            .string("number"),
                                            .string("bool"),
                                            .string("formula"),
                                        ]),
                                    ]),
                                    "value": .object([
                                        "description": .string(
                                            "Cell value — string/number/bool. Ignored for `formula` cells; use `formula` instead."
                                        )
                                    ]),
                                    "formula": .object([
                                        "type": .string("string"),
                                        "description": .string(
                                            "Formula source without the leading `=`, e.g. `SUM(A1:A3)`."
                                        ),
                                    ]),
                                ]),
                            ]),
                        ]),
                        "mergedRanges": .object([
                            "type": .string("array"),
                            "items": .object(["type": .string("string")]),
                            "description": .string("Optional A1:A1 merge ranges, e.g. `A1:B1`."),
                        ]),
                    ]),
                ]),
            ]),
        ]),
        "required": .array([.string("path"), .string("sheets")]),
    ])

    private let rootPath: URL

    /// Worksheet bounds from the published Excel limits; references outside
    /// them are rejected up front rather than left to the emitter.
    private static let maxRow = 1_048_576
    private static let maxColumn = 16_384

    init(rootPath: URL) {
        self.rootPath = rootPath
    }

    /// Runs the tool.
    ///
    /// - Parameter argumentsJSON: JSON object with `path` (must end in
    ///   `.xlsx`) and a non-empty `sheets` array.
    /// - Returns: A success envelope with `path`, `sheetCount` and
    ///   `totalCells`, or a failure envelope: `invalidArgs` for malformed
    ///   input, `executionError` when the parent directory cannot be created
    ///   or the emitter throws.
    func execute(argumentsJSON: String) async throws -> String {
        let argsReq = requireArgumentsDictionary(argumentsJSON, tool: name)
        guard case .value(let args) = argsReq else { return argsReq.failureEnvelope ?? "" }
        let pathReq = requireString(args, "path", expected: "relative output path ending in .xlsx", tool: name)
        guard case .value(let relativePath) = pathReq else { return pathReq.failureEnvelope ?? "" }

        guard let rawSheets = args["sheets"] as? [[String: Any]], !rawSheets.isEmpty else {
            return ToolEnvelope.failure(
                kind: .invalidArgs,
                message: "`sheets` must be a non-empty array of sheet objects.",
                field: "sheets",
                expected: "non-empty array",
                tool: name
            )
        }

        let destURL: URL
        do {
            destURL = try FolderToolHelpers.resolvePath(relativePath, rootPath: rootPath)
        } catch {
            return ToolEnvelope.failure(kind: .invalidArgs, message: error.localizedDescription, tool: name)
        }

        guard destURL.pathExtension.lowercased() == "xlsx" else {
            return ToolEnvelope.failure(
                kind: .invalidArgs,
                message: "`path` must end in `.xlsx`; got '\(relativePath)'.",
                field: "path",
                expected: ".xlsx file path",
                tool: name
            )
        }

        var sheets: [Sheet] = []
        sheets.reserveCapacity(rawSheets.count)
        for (index, raw) in rawSheets.enumerated() {
            switch parseSheet(raw, at: index) {
            case .value(let sheet): sheets.append(sheet)
            case .failure(let envelope): return envelope
            }
        }

        let workbook = Workbook(sheets: sheets, sharedStrings: [])
        let document = StructuredDocument(
            formatId: "xlsx",
            filename: destURL.lastPathComponent,
            fileSize: 0,
            representation: AnyStructuredRepresentation(formatId: "xlsx", underlying: workbook),
            textFallback: ""
        )

        // Ensure parent exists so relative writes like `reports/q4.xlsx`
        // work without a separate `dir_create` round-trip. A failure here is
        // reported directly: swallowing it would only resurface as a vaguer
        // emitter error.
        do {
            try FileManager.default.createDirectory(
                at: destURL.deletingLastPathComponent(),
                withIntermediateDirectories: true
            )
        } catch {
            return ToolEnvelope.failure(
                kind: .executionError,
                message: "Failed to create parent directory: \(error.localizedDescription)",
                tool: name
            )
        }

        do {
            try await XLSXEmitter().emit(document, to: destURL)
        } catch {
            return ToolEnvelope.failure(
                kind: .executionError,
                message: "Failed to write workbook: \(error.localizedDescription)",
                tool: name
            )
        }

        let totalCells = sheets.reduce(0) { partial, sheet in
            partial + sheet.rows.reduce(0) { $0 + $1.cells.count }
        }
        let payload: [String: Any] = [
            "path": relativePath,
            "sheetCount": sheets.count,
            "totalCells": totalCells,
        ]
        guard let text = WorkbookToolSupport.encodeJSON(payload) else {
            return ToolEnvelope.success(tool: name, text: "Wrote workbook to \(relativePath)")
        }
        return ToolEnvelope.success(tool: name, text: text)
    }

    // MARK: - Parsing

    /// Converts one raw sheet object into a `Sheet`.
    ///
    /// `cells` and `mergedRanges` may be omitted (or `null`), but when
    /// present they must have the documented shape; a mismatch is reported
    /// instead of silently producing an empty sheet. Cells are grouped into
    /// rows by the row number in their reference, rows sorted ascending.
    private func parseSheet(
        _ raw: [String: Any],
        at index: Int
    ) -> ArgumentRequirement<Sheet> {
        guard let sheetName = raw["name"] as? String, !sheetName.isEmpty else {
            return .failure(
                ToolEnvelope.failure(
                    kind: .invalidArgs,
                    message: "Sheet at index \(index) is missing a non-empty `name`.",
                    field: "sheets[\(index)].name",
                    expected: "non-empty string",
                    tool: name
                )
            )
        }

        let cellsField = raw["cells"]
        let rawCells: [[String: Any]]
        if Self.isAbsent(cellsField) {
            rawCells = []
        } else if let cells = cellsField as? [[String: Any]] {
            rawCells = cells
        } else {
            return .failure(
                ToolEnvelope.failure(
                    kind: .invalidArgs,
                    message: "`cells` on sheet \(index) must be an array of cell objects.",
                    field: "sheets[\(index)].cells",
                    expected: "array of cell objects",
                    tool: name
                )
            )
        }

        var cellsByRow: [Int: [Cell]] = [:]
        for (cellIndex, rawCell) in rawCells.enumerated() {
            switch parseCell(rawCell, sheetIndex: index, cellIndex: cellIndex) {
            case .value(let (row, cell)):
                cellsByRow[row, default: []].append(cell)
            case .failure(let envelope): return .failure(envelope)
            }
        }
        let rows = cellsByRow.keys.sorted().map { rowIndex in
            Row(index: rowIndex, cells: cellsByRow[rowIndex] ?? [])
        }

        let mergedField = raw["mergedRanges"]
        let mergedRanges: [CellRange]
        if Self.isAbsent(mergedField) {
            mergedRanges = []
        } else if let references = mergedField as? [String] {
            mergedRanges = references.map { CellRange(reference: $0) }
        } else {
            return .failure(
                ToolEnvelope.failure(
                    kind: .invalidArgs,
                    message: "`mergedRanges` on sheet \(index) must be an array of strings.",
                    field: "sheets[\(index)].mergedRanges",
                    expected: "array of A1:A1 range strings",
                    tool: name
                )
            )
        }

        return .value(Sheet(name: sheetName, rows: rows, mergedRanges: mergedRanges))
    }

    /// Converts one raw cell object into `(row, Cell)`.
    ///
    /// The reference is validated as strict A1 and stored upper-cased.
    /// Value handling by `type`:
    /// - `formula`: requires a non-empty `formula`; the value is left empty.
    /// - `bool` / `number`: a missing value keeps the lenient defaults
    ///   (`false` / empty); a value that is present but not convertible is
    ///   rejected rather than written as a wrong cell.
    /// - `string` or no type: the value's own JSON type decides. Booleans
    ///   are told apart from numbers explicitly (see `isJSONBoolean`).
    private func parseCell(
        _ raw: [String: Any],
        sheetIndex: Int,
        cellIndex: Int
    ) -> ArgumentRequirement<(Int, Cell)> {
        let fieldPrefix = "sheets[\(sheetIndex)].cells[\(cellIndex)]"

        guard let rawRef = raw["ref"] as? String, !rawRef.isEmpty else {
            return .failure(
                ToolEnvelope.failure(
                    kind: .invalidArgs,
                    message: "Cell \(cellIndex) on sheet \(sheetIndex) is missing `ref`.",
                    field: "\(fieldPrefix).ref",
                    expected: "A1-style reference",
                    tool: name
                )
            )
        }
        guard let parsedRef = Self.parseA1Reference(rawRef) else {
            return .failure(
                ToolEnvelope.failure(
                    kind: .invalidArgs,
                    message: "Cell reference '\(rawRef)' is not valid A1.",
                    field: "\(fieldPrefix).ref",
                    expected: "A1-style reference",
                    tool: name
                )
            )
        }
        let ref = parsedRef.reference

        let typeHint = (raw["type"] as? String)?.lowercased()
        let rawValue = raw["value"]
        let value: CellValue
        var formula: String?
        switch typeHint {
        case "formula":
            guard let f = raw["formula"] as? String, !f.isEmpty else {
                return .failure(
                    ToolEnvelope.failure(
                        kind: .invalidArgs,
                        message: "Cell '\(ref)' is typed as `formula` but has no `formula` string.",
                        field: "\(fieldPrefix).formula",
                        expected: "non-empty formula string",
                        tool: name
                    )
                )
            }
            formula = f
            value = .empty
        case "bool":
            if Self.isAbsent(rawValue) {
                value = .bool(false)
            } else if let present = rawValue, let flag = Self.coerceBool(present) {
                value = .bool(flag)
            } else {
                return .failure(
                    invalidValue(ref: ref, typeName: "bool", expected: "true or false", field: "\(fieldPrefix).value")
                )
            }
        case "number":
            if Self.isAbsent(rawValue) {
                value = .empty
            } else if let present = rawValue, let number = Self.coerceNumber(present) {
                value = .number(number)
            } else {
                return .failure(
                    invalidValue(ref: ref, typeName: "number", expected: "finite number", field: "\(fieldPrefix).value")
                )
            }
        case "string", nil:
            if let text = rawValue as? String {
                value = .string(text)
            } else if let number = rawValue as? NSNumber {
                // Booleans arrive bridged to NSNumber too, so they must be
                // recognised here or `true` would be written as 1.0.
                value = Self.isJSONBoolean(number) ? .bool(number.boolValue) : .number(number.doubleValue)
            } else if let f = raw["formula"] as? String {
                formula = f
                value = .empty
            } else {
                value = .empty
            }
        default:
            return .failure(
                ToolEnvelope.failure(
                    kind: .invalidArgs,
                    message: "Cell '\(ref)' has unknown type '\(typeHint ?? "?")'.",
                    field: "\(fieldPrefix).type",
                    expected: "string / number / bool / formula",
                    tool: name
                )
            )
        }
        return .value((parsedRef.row, Cell(reference: ref, value: value, formula: formula)))
    }

    /// Failure envelope for a `value` that does not fit the declared `type`.
    private func invalidValue(ref: String, typeName: String, expected: String, field: String) -> String {
        ToolEnvelope.failure(
            kind: .invalidArgs,
            message: "Cell '\(ref)' is typed as `\(typeName)` but its `value` is not a valid \(typeName).",
            field: field,
            expected: expected,
            tool: name
        )
    }

    // MARK: - Value coercion

    /// `true` for a missing key or an explicit JSON `null`.
    private static func isAbsent(_ value: Any?) -> Bool {
        value == nil || value is NSNull
    }

    /// Whether `number` is a boolean rather than a numeric `NSNumber`.
    /// Both cast to `NSNumber`, so the CoreFoundation type id is the only
    /// reliable discriminator.
    private static func isJSONBoolean(_ number: NSNumber) -> Bool {
        CFGetTypeID(number) == CFBooleanGetTypeID()
    }

    /// Accepts booleans, numbers that bridge to `Bool`, and the strings
    /// `true` / `false` in any case.
    private static func coerceBool(_ value: Any) -> Bool? {
        if let flag = value as? Bool { return flag }
        guard let text = value as? String else { return nil }
        switch text.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() {
        case "true": return true
        case "false": return false
        default: return nil
        }
    }

    /// Accepts numbers and numeric strings; NaN and infinity are refused.
    private static func coerceNumber(_ value: Any) -> Double? {
        let candidate: Double?
        if let number = value as? NSNumber {
            candidate = number.doubleValue
        } else if let text = value as? String {
            candidate = Double(text.trimmingCharacters(in: .whitespacesAndNewlines))
        } else {
            candidate = nil
        }
        guard let number = candidate, number.isFinite else { return nil }
        return number
    }

    // MARK: - A1 references

    /// Strictly parses an A1 reference: one to three letters followed by a
    /// row number with no leading zero, both within worksheet bounds.
    ///
    /// - Returns: The upper-cased reference and its 1-based row, or `nil`
    ///   for anything else (`1A`, `A0`, `A1B2`, `A`, `12`, …).
    private static func parseA1Reference(_ reference: String) -> (reference: String, row: Int)? {
        let normalized = reference.trimmingCharacters(in: .whitespacesAndNewlines).uppercased()
        var letters = ""
        var digits = ""
        var column = 0
        for scalar in normalized.unicodeScalars {
            switch scalar {
            case "A"..."Z" where digits.isEmpty:
                letters.unicodeScalars.append(scalar)
                column = column * 26 + Int(scalar.value - 0x40)
                // Bail out early so an absurdly long letter run cannot overflow.
                guard column <= maxColumn else { return nil }
            case "0"..."9" where !letters.isEmpty:
                digits.unicodeScalars.append(scalar)
            default:
                return nil
            }
        }
        guard !letters.isEmpty, !digits.isEmpty, !digits.hasPrefix("0"),
            let row = Int(digits), (1...maxRow).contains(row)
        else {
            return nil
        }
        return (letters + digits, row)
    }
}

//
// DocumentAdaptersBootstrap.swift
// osaurus
//
// Registers the in-tree document adapters with `DocumentFormatRegistry.shared`
// exactly once, at app launch. Kept separate from `AppDelegate` so tests can
// opt into the same registration (or opt out of it entirely) without dragging
// in `NSApplication`.
//

import Foundation

/// Namespace for registering the built-in document adapters, emitters and
/// streamers with a `DocumentFormatRegistry`.
public enum DocumentAdaptersBootstrap {
    private static let lock = NSLock()
    // Guarded by `lock`; the `nonisolated(unsafe)` matches the project pattern
    // for lock-protected process-global state (see `OsaurusPaths.overrideRoot`).
    nonisolated(unsafe) private static var didRegisterShared = false

    /// Registers the built-in adapters on `registry`.
    ///
    /// Idempotent against the shared registry: safe to call from multiple
    /// launch paths without producing duplicate adapter registrations.
    /// Non-shared registries (tests, isolated instances) are re-registered on
    /// every call so each test gets a clean baseline.
    ///
    /// - Parameter registry: Target registry; defaults to the shared one.
    public static func registerBuiltIns(registry: DocumentFormatRegistry = .shared) {
        lock.lock()
        defer { lock.unlock() }

        let isShared = registry === DocumentFormatRegistry.shared
        if isShared, didRegisterShared { return }

        registry.register(adapter: PlainTextAdapter())
        registry.register(adapter: PDFAdapter())
        registry.register(adapter: RichDocumentAdapter())
        registry.register(adapter: XLSXAdapter())
        registry.register(emitter: XLSXEmitter())
        // CSV registered after PlainText so later-wins routes .csv / .tsv
        // through the typed adapter; PlainText still claims everything else.
        registry.register(adapter: CSVAdapter())
        registry.register(streamer: CSVStreamer())

        if isShared {
            didRegisterShared = true
        }
    }
}

//
// CSVTable.swift
// osaurus
//
// Typed representation for CSV / TSV files. Replaces the flat
// "CSV-as-text" ingestion the legacy `DocumentParser` did by preserving
// encoding, delimiter, line-ending style, and per-row cell boundaries.
// Pairs with `CSVAdapter` (in-memory) and `CSVStreamer` (row-at-a-time).
//
// High-fidelity fields — the ones that actually matter to business
// users on round-trip — are deliberate:
// - `delimiter`: comma / tab / semicolon, honoured on re-emit.
// - `encoding`: UTF-8 / UTF-16 / ISO-Latin-1, preserved so an export
//   back to the same locale doesn't silently widen the file.
// - `header`: optional first row, detected by the adapter.
// - `records`: raw string cells; numeric / date coercion is the
//   caller's job (agents sometimes want the text literal).
//
// Out of scope: style-level XLSX features (number formats, fills).
// Those live in the Workbook representation, not here.
//

import Foundation

/// In-memory model of a parsed CSV / TSV file.
///
/// Besides the cell text it records how the source was written (separator,
/// byte encoding, newline convention) so a caller can reproduce those
/// choices when writing the table back out.
public struct CSVTable: StructuredRepresentation, Sendable {
    /// Newline convention observed in the source bytes, kept so that a
    /// CRLF file is not silently rewritten with LF on the way back out.
    public enum LineEnding: String, Sendable {
        /// `\n`
        case lf = "lf"
        /// `\r\n`
        case crlf = "crlf"
        /// `\r` — rare, classic Mac
        case cr = "cr"
    }

    /// Character that separates fields; usually `,` for `.csv` files and
    /// `\t` for `.tsv` files.
    public let delimiter: Character

    /// Encoding the bytes were decoded with.
    public let encoding: String.Encoding

    /// Newline style found in the source.
    public let lineEnding: LineEnding

    /// The first row, when the adapter judged it to be a header; `nil`
    /// otherwise. The intended heuristic is "there is at least one data row
    /// and no cell of the first row parses as a number".
    // NOTE(review): the heuristic lives in the adapter's `detectHeader`,
    // which is not visible from here — confirm the wording still matches.
    public let header: [String]?

    /// Data rows, header excluded. Each inner array is one row of cell
    /// strings with quoted-field expansion already applied.
    public let records: [[String]]

    /// Row index at which parsing stopped because the size limit was hit;
    /// `nil` when the whole file was read.
    // NOTE(review): the visible part of `CSVAdapter.parse` throws on an
    // oversized file and never passes this argument — confirm who sets it.
    public let truncatedAt: Int?

    /// Memberwise initialiser; `truncatedAt` defaults to `nil` (not truncated).
    public init(
        delimiter: Character,
        encoding: String.Encoding,
        lineEnding: LineEnding,
        header: [String]?,
        records: [[String]],
        truncatedAt: Int? = nil
    ) {
        self.delimiter = delimiter
        self.encoding = encoding
        self.lineEnding = lineEnding
        self.header = header
        self.records = records
        self.truncatedAt = truncatedAt
    }
}

/// One streamed row emitted by `CSVStreamer`. `lineNumber` is 1-based and
/// matches the on-wire row number so callers can attribute errors.
public struct CSVRecord: Sendable, Equatable {
    // Row number of this record within the source.
    public let lineNumber: Int
    // Cell strings of the row, in column order.
    public let cells: [String]

    public init(lineNumber: Int, cells: [String]) {
        self.lineNumber = lineNumber
        self.cells = cells
    }
}

//
// PDFDocumentRepresentation.swift
// osaurus
//
// Typed representation for parsed PDFs. Replaces the `PlainTextRepresentation`
// that PR 3's PDFAdapter emitted — PDFs that look tabular (invoices, bank
// statements, periodic reports) now surface the table structure alongside
// the flat text fallback, while narrative PDFs keep working exactly as
// before.
//
// Table detection lives in `PDFAdapter` and is intentionally permissive:
// a "table" here is a run of consecutive text rows that the layout
// heuristic split into at least two cells each. It won't perfectly match
// the author's semantic intent in every document — but for the files
// osaurus users actually attach (invoices, bank statements, financial
// tables), the heuristic is good enough to turn numeric columns from
// `1,234.56 1,920.00 ...` concatenation into proper cells.
//

import Foundation

/// A parsed PDF as an ordered list of pages.
public struct PDFDocumentRepresentation: StructuredRepresentation, Sendable {
    /// Pages in the order they were supplied.
    public let pages: [PDFPageRepresentation]

    /// Number of pages held by this representation.
    public var pageCount: Int {
        return pages.count
    }

    public init(pages: [PDFPageRepresentation]) {
        self.pages = pages
    }
}

/// Text and detected tables for a single PDF page.
public struct PDFPageRepresentation: Sendable {
    /// Page number, starting at 1, matching the PDF's own display numbering.
    public let pageNumber: Int

    /// Plain text of the page. Always populated, including for pages with
    /// no detected tables, so the text-fallback path does not depend on
    /// table detection.
    public let text: String

    /// Tables found on the page; empty for flowing text. One page may hold
    /// several (for example invoice line items followed by a summary block).
    public let tables: [PDFTable]

    public init(pageNumber: Int, text: String, tables: [PDFTable]) {
        self.pageNumber = pageNumber
        self.text = text
        self.tables = tables
    }
}

/// A tabular region reduced to rows of cell strings. Page coordinates are
/// deliberately not kept, so consumers never need to understand PDF geometry.
public struct PDFTable: Sendable, Equatable {
    /// Rows top to bottom; rows are not required to have equal length.
    public let rows: [[String]]

    /// Number of rows.
    public var rowCount: Int {
        return rows.count
    }

    /// Width of the widest row, or 0 when the table has no rows.
    public var columnCount: Int {
        return rows.reduce(0) { widest, row in max(widest, row.count) }
    }

    public init(rows: [[String]]) {
        self.rows = rows
    }
}

//
// PlainTextRepresentation.swift
// osaurus
//
// Default representation for adapters that extract a single text string.
// Every adapter has to publish *some* `StructuredRepresentation`; the
// wrappers around `PDFKit` text extraction and `NSAttributedString` don't
// preserve any format-native structure, so they emit this shape. The
// real typed representations (`Workbook`, `WordDocument`, …) replace it
// per-format as higher-fidelity adapters land.
//

import Foundation

/// Minimal representation: the document reduced to one text string.
public struct PlainTextRepresentation: StructuredRepresentation, Sendable {
    /// The extracted text.
    public let text: String

    public init(text: String) {
        self.text = text
    }
}

//
// Workbook.swift
// osaurus
//
// Typed representation for parsed XLSX workbooks. Designed as the
// round-trip target for both the read side (`XLSXAdapter`) and the write
// side (`XLSXEmitter`). Fields are chosen to match what CoreXLSX surfaces
// cleanly today — sheet names, merged ranges, raw cell values, formula
// source strings — plus the shared-string table so repeated strings
// round-trip without being re-interned on write. Style-derived fidelity
// (number formats, column widths) is deliberately out of scope; see the
// comment on `CellValue` for why.
//

import Foundation

/// A whole workbook: its sheets plus the shared-string table.
public struct Workbook: StructuredRepresentation, Sendable {
    /// Sheets in workbook order.
    public let sheets: [Sheet]
    /// Shared-string table carried alongside the sheets.
    public let sharedStrings: [String]

    public init(sheets: [Sheet], sharedStrings: [String]) {
        self.sheets = sheets
        self.sharedStrings = sharedStrings
    }
}

/// One worksheet: a name, its populated rows and any merged ranges.
public struct Sheet: Sendable {
    /// Sheet display name.
    public let name: String
    /// Rows that carry at least the cells recorded for them.
    public let rows: [Row]
    /// Merged cell ranges declared on the sheet.
    public let mergedRanges: [CellRange]

    public init(name: String, rows: [Row], mergedRanges: [CellRange]) {
        self.name = name
        self.rows = rows
        self.mergedRanges = mergedRanges
    }
}

/// One worksheet row.
public struct Row: Sendable {
    /// Row number, starting at 1, matching the on-wire `r` attribute.
    public let index: Int
    /// Cells recorded for this row.
    public let cells: [Cell]

    public init(index: Int, cells: [Cell]) {
        self.index = index
        self.cells = cells
    }
}

/// One cell: where it sits, what it holds, and the formula behind it if any.
public struct Cell: Sendable {
    /// A1-style reference as stored on the wire, e.g. "B3".
    public let reference: String

    /// The cell's scalar payload.
    public let value: CellValue

    /// Formula source when the cell carries one. Excel stores both the
    /// formula and its cached result; both are preserved here.
    // NOTE(review): the original comment showed the formula with a leading
    // `=` (`=SUM(A1:A3)`), while the `write_workbook` schema asks for the
    // source *without* it — confirm which form the adapter and emitter use.
    public let formula: String?

    public init(reference: String, value: CellValue, formula: String? = nil) {
        self.reference = reference
        self.value = value
        self.formula = formula
    }
}

/// Scalar cell payload. Excel dates are stored as numbers with a style
/// attached — without parsing the style table we can't distinguish a date
/// from a plain number, so dates that aren't explicitly typed (`t="d"`)
/// surface as `.number`. Lifting that limitation means shipping a style
/// parser that tolerates the CoreXLSX `patternType` crash on
/// openpyxl-generated files; that work lives in a separate slice.
public enum CellValue: Sendable, Equatable {
    /// No value.
    case empty
    /// Numeric value (also covers dates that are not explicitly typed).
    case number(Double)
    /// String value.
    case string(String)
    /// Boolean value.
    case bool(Bool)
    /// String value kept distinct from `.string` as an inline string.
    case inlineString(String)
}

/// A1-style cell range, e.g. "A1:C3".
public struct CellRange: Sendable, Equatable {
    // Range text exactly as supplied (e.g. "A1:C3"); stored verbatim, no
    // parsing or validation happens in this type.
    public let reference: String

    public init(reference: String) {
        self.reference = reference
    }
}

// Package.swift hunks carried on this line of the patch (recorded here as
// comments because they are fragments of a larger `Package(...)` expression):
//   dependencies:
//     .package(url: "https://github.com/CoreOffice/CoreXLSX.git", from: "0.14.2"),
//     .package(url: "https://github.com/jmcnamara/libxlsxwriter.git", from: "1.2.4"),
//   OsaurusCore target dependencies:
//     .product(name: "CoreXLSX", package: "CoreXLSX"),
//     .product(name: "libxlsxwriter", package: "libxlsxwriter"),
//   test target:
//     path: "Tests",
//     resources: [.copy("Documents/Fixtures")]

//
// CSVAdapter.swift
// osaurus
//
// RFC-4180-ish CSV / TSV parser that produces a typed `CSVTable`.
// Replaces the legacy "CSV as plain text" path — the adapter still
// returns a text fallback for chat attachment display, but the typed
// representation exposes delimiter, encoding, line-ending, and per-row
// cell boundaries so the downstream tooling (agent tools, CSV streamer)
// can reason about columns rather than raw bytes.
//
// What's handled:
//   - Delimiter defaults: `,` for `.csv`, `\t` for `.tsv`.
//   - Double-quoted fields, including `""` escape sequences.
//   - Embedded newlines inside quoted fields.
//   - UTF-8 BOM stripping; UTF-8 first, ISO-Latin-1 fallback.
//   - Header detection via a conservative "first row is non-numeric"
//     heuristic that's easy to override once the agent has context.
//
// What's NOT handled yet:
//   - Encoding detection beyond BOM — a Windows-1252 file with no BOM
//     decodes as ISO-Latin-1 and may replace some byte sequences.
//   - Escaping via backslashes (non-standard but common in hand-rolled
//     CSVs) — quotes only.
//   - Skipping comment lines (`#foo`) — not in the format.
//

import Foundation

/// Eager CSV / TSV adapter: reads the whole file, parses it with
/// `CSVParser`, and wraps the result in a `CSVTable` plus a flat text
/// fallback.
public struct CSVAdapter: DocumentFormatAdapter {
    public let formatId = "csv"

    public init() {}

    /// Claims `.csv` and `.tsv` by path extension (case-insensitive).
    /// `uti` is not consulted.
    public func canHandle(url: URL, uti: String?) -> Bool {
        let ext = url.pathExtension.lowercased()
        return ext == "csv" || ext == "tsv"
    }

    /// Reads and parses the file at `url`.
    ///
    /// - Parameters:
    ///   - url: File to read; a `.tsv` extension selects tab as the delimiter.
    ///   - sizeLimit: Maximum accepted file size in bytes; values `<= 0`
    ///     disable the check.
    /// - Throws: `DocumentAdapterError.sizeLimitExceeded`, `.readFailed`,
    ///   or `.emptyContent` when the decoded text is blank.
    public func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument {
        // A missing size attribute is treated as 0, which passes the cap.
        let fileSize = Int64((try? url.resourceValues(forKeys: [.fileSizeKey]))?.fileSize ?? 0)
        if sizeLimit > 0, fileSize > sizeLimit {
            throw DocumentAdapterError.sizeLimitExceeded(actual: fileSize, limit: sizeLimit)
        }

        let data: Data
        do {
            data = try Data(contentsOf: url)
        } catch {
            throw DocumentAdapterError.readFailed(underlying: error.localizedDescription)
        }

        let decoded = Self.decode(data)
        guard !decoded.text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
            throw DocumentAdapterError.emptyContent
        }

        let delimiter: Character = url.pathExtension.lowercased() == "tsv" ? "\t" : ","
        let parsed = CSVParser.parseAll(text: decoded.text, delimiter: delimiter)
        let lineEnding = Self.detectLineEnding(decoded.text)
        let (header, body) = Self.detectHeader(parsed)

        let table = CSVTable(
            delimiter: delimiter,
            encoding: decoded.encoding,
            lineEnding: lineEnding,
            header: header,
            records: body
        )

        return StructuredDocument(
            formatId: formatId,
            filename: url.lastPathComponent,
            fileSize: fileSize,
            representation: AnyStructuredRepresentation(formatId: formatId, underlying: table),
            textFallback: Self.renderTextFallback(table: table)
        )
    }

    // MARK: - Decode

    /// Decodes the raw file bytes as a `String`, stripping any UTF-8 BOM,
    /// and reports which encoding actually worked so callers can preserve
    /// it on re-emit.
    ///
    /// The BOM is only a hint: if the bytes after it are not valid UTF-8
    /// the payload still goes through the ISO-Latin-1 fallback instead of
    /// collapsing to an empty string (which `parse` would misreport as
    /// `.emptyContent`).
    static func decode(_ data: Data) -> (text: String, encoding: String.Encoding) {
        let utf8BOM: [UInt8] = [0xEF, 0xBB, 0xBF]
        // `starts(with:)` / `dropFirst` are index-agnostic, so this is
        // also safe when `data` is a slice with a non-zero start index.
        let payload = data.starts(with: utf8BOM) ? data.dropFirst(utf8BOM.count) : data

        if let utf8 = String(data: payload, encoding: .utf8) {
            return (utf8, .utf8)
        }
        if let latin1 = String(data: payload, encoding: .isoLatin1) {
            return (latin1, .isoLatin1)
        }
        return ("", .utf8)
    }

    // MARK: - Line ending + header

    /// Classifies the file's line ending from the first `\r` or `\n`
    /// found in `text`; defaults to `.lf` when neither occurs.
    ///
    /// NOTE(review): a `\r` is reported as `.crlf` without checking that
    /// a `\n` follows, so bare-CR files are labelled CRLF. The scan also
    /// does not skip terminators inside quoted fields.
    static func detectLineEnding(_ text: String) -> CSVTable.LineEnding {
        for scalar in text.unicodeScalars {
            if scalar == "\r" { return .crlf }
            if scalar == "\n" { return .lf }
        }
        return .lf
    }

    /// Heuristic: treat the first row as a header when at least one of
    /// its cells contains non-numeric text and there is at least one more
    /// row after it. Empty input returns `(nil, [])`.
    static func detectHeader(_ rows: [[String]]) -> (header: [String]?, body: [[String]]) {
        guard let first = rows.first else { return (nil, []) }
        let anyNonNumeric = first.contains { cell in
            !cell.isEmpty && Double(cell.trimmingCharacters(in: .whitespaces)) == nil
        }
        if anyNonNumeric, rows.count > 1 {
            return (first, Array(rows.dropFirst()))
        }
        return (nil, rows)
    }

    // MARK: - Text fallback

    /// Renders a pipe-separated preview: optional header plus a dash rule
    /// (capped at 80 characters), then at most 200 records, then a
    /// "more rows" marker when records were omitted.
    static func renderTextFallback(table: CSVTable) -> String {
        let previewLimit = 200
        var out: [String] = []
        if let header = table.header {
            let headerLine = header.joined(separator: " | ")
            out.append(headerLine)
            out.append(String(repeating: "-", count: min(headerLine.count, 80)))
        }
        for row in table.records.prefix(previewLimit) {
            out.append(row.joined(separator: " | "))
        }
        if table.records.count > previewLimit {
            out.append("… (\(table.records.count - previewLimit) more rows)")
        }
        return out.joined(separator: "\n")
    }
}
//
//  CSVParser.swift
//  osaurus
//
//  Shared state-machine parser for CSV / TSV content. Consumers:
//    - `CSVAdapter` drains the whole file into `[[String]]`.
//    - `CSVStreamer` feeds bytes in chunks and pulls rows out as they
//      complete — the critical path for large files.
//
//  Grammar (RFC 4180 with the two common extensions noted inline):
//    - Fields are separated by the caller-specified `delimiter`.
//    - Rows are separated by `\r\n`, `\n`, or a bare `\r`.
//    - A field wrapped in `"` may contain delimiters and newlines;
//      a literal `"` inside is escaped as `""`.
//    - A stray character that follows the closing quote (unexpected per
//      RFC) is tolerated: we append it and stay in text mode, matching
//      real-world behaviour of Excel and Numbers exports.
//

import Foundation

enum CSVParser {

    /// One-shot parse of `text` into rows of cell strings. Newlines
    /// inside quoted fields stay inside their cell, so the result is not
    /// equivalent to splitting `text` on line breaks.
    static func parseAll(text: String, delimiter: Character) -> [[String]] {
        var parser = Machine(delimiter: delimiter)
        for unit in text.unicodeScalars {
            parser.consume(unit)
        }
        parser.finish()
        return parser.rows
    }

    /// Incremental parser used by the streamer. Feed it scalars with
    /// `consume`, collect completed rows with `drainRows`, and call
    /// `finish` once at end of input to flush an unterminated last row.
    struct Machine {
        let delimiter: Character
        private(set) var rows: [[String]] = []
        private var currentRow: [String] = []
        private var currentCell: String = ""
        private var state: State = .fieldStart
        /// Set after a `\r` has ended a row, so an immediately following
        /// `\n` can be dropped as the second half of a CRLF pair.
        private var pendingCR: Bool = false

        init(delimiter: Character) {
            self.delimiter = delimiter
        }

        /// Returns every row completed so far and clears the buffer.
        mutating func drainRows() -> [[String]] {
            defer { rows = [] }
            return rows
        }

        /// Advances the state machine by one Unicode scalar.
        mutating func consume(_ scalar: Unicode.Scalar) {
            let followsCarriageReturn = pendingCR
            pendingCR = false
            if followsCarriageReturn, scalar == "\n" {
                // The `\r` already closed the row; this is just the LF
                // of a CRLF pair.
                return
            }
            // Otherwise a bare `\r` ended the previous row and this
            // scalar is handled normally as the start of the next one.

            let character = Character(scalar)

            switch state {
            case .inQuotedField:
                // Everything is literal inside quotes except `"` itself.
                if character == "\"" {
                    state = .afterQuote
                } else {
                    currentCell.append(character)
                }

            case .fieldStart:
                if character == "\"" {
                    state = .inQuotedField
                } else if !closeFieldOrRow(on: character, scalar: scalar) {
                    currentCell.append(character)
                    state = .inField
                }

            case .inField:
                if !closeFieldOrRow(on: character, scalar: scalar) {
                    currentCell.append(character)
                }

            case .afterQuote:
                if character == "\"" {
                    // `""` is an escaped quote; keep reading quoted text.
                    currentCell.append(character)
                    state = .inQuotedField
                } else if !closeFieldOrRow(on: character, scalar: scalar) {
                    // Tolerate a stray character after a closing quote
                    // rather than bailing: keep it and continue as an
                    // unquoted field.
                    currentCell.append(character)
                    state = .inField
                }
            }
        }

        /// Flushes a pending row at end of input. Does nothing when the
        /// input ended exactly on a row terminator.
        mutating func finish() {
            let hasPendingContent = state != .fieldStart || !currentCell.isEmpty || !currentRow.isEmpty
            guard hasPendingContent else { return }
            finishRow()
        }

        /// Handles the structural characters shared by the unquoted
        /// states: the delimiter closes the current cell; `\n` or `\r`
        /// closes the current row. Returns `false` when `character` is
        /// ordinary content the caller should append.
        private mutating func closeFieldOrRow(on character: Character, scalar: Unicode.Scalar) -> Bool {
            if character == delimiter {
                currentRow.append(currentCell)
                currentCell = ""
                state = .fieldStart
                return true
            }
            if scalar == "\n" {
                finishRow()
                return true
            }
            if scalar == "\r" {
                pendingCR = true
                finishRow()
                return true
            }
            return false
        }

        /// Closes the current cell and row and resets to `.fieldStart`.
        private mutating func finishRow() {
            currentRow.append(currentCell)
            rows.append(currentRow)
            currentCell = ""
            currentRow = []
            state = .fieldStart
        }

        private enum State {
            case fieldStart
            case inField
            case inQuotedField
            case afterQuote
        }
    }
}
//
//  CSVStreamer.swift
//  osaurus
//
//  Streaming variant of `CSVAdapter` for files that don't fit in the
//  in-memory cap — multi-GB bank exports, long-running event logs, etc.
//  Emits one `CSVRecord` per row via an `AsyncThrowingStream` so callers
//  can cancel rather than paying for the whole file up front. The agent
//  tool surface is the obvious consumer; the chat attachment path stays
//  on the eager `CSVAdapter` because it needs the whole table to render.
//
//  The byte -> scalar -> parser pipeline reuses `CSVParser.Machine`, so
//  quoted-field / embedded-newline / escape semantics match the batch
//  adapter exactly. The only difference is that rows flush out as soon
//  as they complete rather than waiting for the file to end.
//

import Foundation

public struct CSVStreamer: DocumentFormatStreamer {
    public let formatId = "csv"

    public init() {}

    /// Streams the rows of the file at `url` as `CSVRecord`s.
    ///
    /// A `.tsv` extension selects tab as the delimiter; anything else
    /// uses a comma. The file is read on an unstructured `Task`; when the
    /// stream terminates (consumer stops iterating, is cancelled, or the
    /// stream finishes) that task is cancelled so the read loop stops.
    public func stream(url: URL) -> AsyncThrowingStream<CSVRecord, Error> {
        let delimiter: Character = url.pathExtension.lowercased() == "tsv" ? "\t" : ","

        return AsyncThrowingStream { continuation in
            let reader = Task {
                do {
                    try Self.drain(url: url, delimiter: delimiter, into: continuation)
                    continuation.finish()
                } catch {
                    continuation.finish(throwing: error)
                }
            }
            // Without this hook nothing ever cancels `reader`, so the
            // cancellation checks inside `drain` could never fire.
            continuation.onTermination = { _ in reader.cancel() }
        }
    }

    // MARK: - Internals

    private static let chunkSize = 64 * 1024
    private static let utf8BOM: [UInt8] = [0xEF, 0xBB, 0xBF]

    /// Reads `url` in 64 KB chunks, feeds the decoded scalars through the
    /// shared CSV state machine, and yields completed rows one at a time.
    ///
    /// - Throws: `DocumentAdapterError.readFailed` on open/read/decode
    ///   failure, or `CancellationError` when the surrounding task has
    ///   been cancelled.
    private static func drain(
        url: URL,
        delimiter: Character,
        into continuation: AsyncThrowingStream<CSVRecord, Error>.Continuation
    ) throws {
        let handle: FileHandle
        do {
            handle = try FileHandle(forReadingFrom: url)
        } catch {
            throw DocumentAdapterError.readFailed(underlying: error.localizedDescription)
        }
        defer { try? handle.close() }

        var machine = CSVParser.Machine(delimiter: delimiter)
        var leftoverBytes = Data()
        var didStripBOM = false
        var lineNumber = 0

        while true {
            // Checked per chunk as well as per row, so a chunk that
            // completes no rows still notices cancellation.
            try Task.checkCancellation()

            // `read(upToCount:)` throws on I/O failure; the legacy
            // `readData(ofLength:)` raises an Objective-C exception.
            let next: Data?
            do {
                next = try handle.read(upToCount: chunkSize)
            } catch {
                throw DocumentAdapterError.readFailed(underlying: error.localizedDescription)
            }
            guard let chunk = next, !chunk.isEmpty else { break }

            var buffer = leftoverBytes
            buffer.append(chunk)

            if !didStripBOM {
                didStripBOM = true
                if buffer.starts(with: utf8BOM) {
                    buffer = Data(buffer.dropFirst(utf8BOM.count))
                }
            }

            // Split on the last valid UTF-8 boundary so we don't feed a
            // partial multi-byte scalar into the parser. Anything after
            // the boundary is carried into the next chunk.
            let (decodable, tail) = splitAtUTF8Boundary(buffer)
            leftoverBytes = tail

            // UTF-8 first, ISO-Latin-1 as the per-chunk fallback.
            guard
                let text = String(data: decodable, encoding: .utf8)
                    ?? String(data: decodable, encoding: .isoLatin1)
            else {
                throw DocumentAdapterError.readFailed(underlying: "could not decode CSV chunk")
            }
            for scalar in text.unicodeScalars {
                machine.consume(scalar)
            }

            for row in machine.drainRows() {
                lineNumber += 1
                continuation.yield(CSVRecord(lineNumber: lineNumber, cells: row))
                try Task.checkCancellation()
            }
        }

        // Flush trailing data: any carried-over bytes plus the final row
        // when the file has no trailing newline.
        if !leftoverBytes.isEmpty {
            let tail =
                String(data: leftoverBytes, encoding: .utf8)
                ?? String(data: leftoverBytes, encoding: .isoLatin1)
            if let text = tail {
                for scalar in text.unicodeScalars { machine.consume(scalar) }
            }
        }
        machine.finish()
        for row in machine.drainRows() {
            lineNumber += 1
            continuation.yield(CSVRecord(lineNumber: lineNumber, cells: row))
        }
    }

    /// Splits `data` so that `decodable` does not end in the middle of a
    /// multi-byte UTF-8 sequence; the incomplete bytes, if any, come back
    /// as `tail` for the caller to prepend to the next read.
    ///
    /// Scans backwards over at most the last 4 bytes. If the final byte
    /// is ASCII, or the last lead byte already has all the continuation
    /// bytes it claims, the whole buffer is returned as decodable.
    /// Indexing is relative to `startIndex`/`endIndex`, so slices are safe.
    static func splitAtUTF8Boundary(_ data: Data) -> (decodable: Data, tail: Data) {
        guard !data.isEmpty else { return (data, Data()) }
        let maxScan = min(data.count, 4)
        for offset in 1 ... maxScan {
            let position = data.index(data.endIndex, offsetBy: -offset)
            let byte = data[position]
            // `10xxxxxx` is a continuation byte; keep scanning backwards.
            if byte & 0b1100_0000 == 0b1000_0000 {
                continue
            }
            // `0xxxxxxx` is ASCII, so whatever precedes it is complete.
            if byte & 0b1000_0000 == 0 {
                return (data, Data())
            }
            // Lead byte: `110x` starts a 2-byte scalar, `1110` a 3-byte
            // one, `1111` is treated as 4-byte. `offset` is how many
            // bytes of that scalar are already in the buffer.
            let expected: Int
            if byte & 0b1111_0000 == 0b1111_0000 {
                expected = 4
            } else if byte & 0b1110_0000 == 0b1110_0000 {
                expected = 3
            } else {
                expected = 2
            }
            if expected <= offset {
                return (data, Data())  // full scalar is inside the buffer
            }
            // Short: carry the partial scalar to the next read.
            return (
                data.subdata(in: data.startIndex ..< position),
                data.subdata(in: position ..< data.endIndex)
            )
        }
        // Only continuation bytes in the scanned window; let the decoder
        // (and its Latin-1 fallback) deal with it.
        return (data, Data())
    }
}
//
//  PDFAdapter.swift
//  osaurus
//
//  Text-layer extraction plus layout-aware table detection for PDFs.
//
//  The adapter produces a `PDFDocumentRepresentation` carrying one
//  `PDFPageRepresentation` per text-bearing page. Each page retains the
//  plain extracted text (byte-identical to the legacy PR-3 behaviour)
//  and — when the layout heuristic finds them — a list of `PDFTable`
//  regions. The `textFallback` on the returned `StructuredDocument`
//  stays flat for chat-attachment display.
//
//  Image-only PDFs still throw `.emptyContent` so the `DocumentParser`
//  shim can fall through to the legacy image-render path; moving that
//  path onto the adapter surface is deliberately out of scope here.
//
//  Detection strategy (`PDFTableDetector` below):
//    1. Enumerate each page's characters, capturing `(char, x, y, width)`
//       from `PDFPage.characterBounds(at:)`.
//    2. Cluster glyphs into rows by y-coordinate tolerance.
//    3. Within each row, split into cells wherever the inter-glyph gap
//       exceeds the configured threshold.
//    4. Collect consecutive multi-cell rows as a table; single-cell rows
//       are treated as prose and end the current table.
//

import Foundation
import PDFKit

public struct PDFAdapter: DocumentFormatAdapter {
    public let formatId = "pdf"

    public init() {}

    /// Claims `.pdf` by path extension (case-insensitive); `uti` is not consulted.
    public func canHandle(url: URL, uti: String?) -> Bool {
        url.pathExtension.lowercased() == "pdf"
    }

    /// Extracts the text layer and detected tables of every page that
    /// has non-blank text.
    ///
    /// - Parameters:
    ///   - url: PDF file to open with PDFKit.
    ///   - sizeLimit: Maximum accepted file size in bytes; values `<= 0`
    ///     disable the check.
    /// - Throws: `DocumentAdapterError.sizeLimitExceeded`, `.readFailed`
    ///   when PDFKit cannot open the file, or `.emptyContent` when no
    ///   page carries text.
    public func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument {
        let fileSize = Int64((try? url.resourceValues(forKeys: [.fileSizeKey]))?.fileSize ?? 0)
        if sizeLimit > 0, fileSize > sizeLimit {
            throw DocumentAdapterError.sizeLimitExceeded(actual: fileSize, limit: sizeLimit)
        }

        guard let document = PDFDocument(url: url) else {
            throw DocumentAdapterError.readFailed(underlying: "PDFKit could not open document")
        }

        var pages: [PDFPageRepresentation] = []
        for index in 0 ..< document.pageCount {
            guard let page = document.page(at: index), let text = page.string else { continue }
            if text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { continue }
            let tables = PDFTableDetector.detect(on: page, text: text)
            pages.append(
                PDFPageRepresentation(pageNumber: index + 1, text: text, tables: tables)
            )
        }

        guard !pages.isEmpty else {
            // No text layer on any page — let the shim fall through to
            // the legacy image-render fallback. Don't claim a result we
            // can't produce.
            throw DocumentAdapterError.emptyContent
        }

        let flatText = pages.map(\.text).joined(separator: "\n\n")
        let truncated = PlainTextAdapter.applyCharacterCap(flatText)

        return StructuredDocument(
            formatId: formatId,
            filename: url.lastPathComponent,
            fileSize: fileSize,
            representation: AnyStructuredRepresentation(
                formatId: formatId,
                underlying: PDFDocumentRepresentation(pages: pages)
            ),
            textFallback: truncated
        )
    }
}

// MARK: - Table detector

/// Glyph record used by the detector. Exposed internally so tests can
/// feed synthetic character grids in without going through PDFKit —
/// Core Graphics-generated PDFs report character bounds that span
/// trailing whitespace, which masks the real column gaps, so relying
/// only on end-to-end PDF fixtures makes the algorithm hard to pin.
struct PDFGlyph: Sendable, Equatable {
    let scalar: Character
    let rect: CGRect
}

/// Layout-aware table detector. Pure function over a `PDFPage` at the
/// top, with the inner stages (`clusterRows`, `cellsForRow`, grouping)
/// internal so the heuristic can be unit-tested without PDFKit.
enum PDFTableDetector {

    /// Tunables chosen for common business PDFs (10-12pt body text).
    /// `rowTolerance` is how far two glyphs' vertical midpoints can
    /// differ and still be on the same row; `columnGap` is the
    /// inter-glyph distance that must be exceeded before we declare a
    /// cell boundary. Both are in PDF points. `columnGap` is set above
    /// typical single-space width (~3-4pt at 12pt body) so "Widget Pro"
    /// stays one cell but "Widget     7" splits — the heuristic trades
    /// some recall on very tightly-spaced tables for precision on prose.
    static let rowTolerance: CGFloat = 3.0
    static let columnGap: CGFloat = 8.0

    /// Detects tables on `page`. `text` must be the page's own string,
    /// because glyph bounds are looked up by offset into it.
    static func detect(on page: PDFPage, text: String) -> [PDFTable] {
        let glyphs = collectGlyphs(from: page, text: text)
        return detect(glyphs: glyphs)
    }

    /// Pure-function variant used by tests and reachable without PDFKit.
    static func detect(glyphs: [PDFGlyph]) -> [PDFTable] {
        let rows = clusterRows(glyphs)
        let cellRows = rows.map { cellsForRow($0) }
        return groupConsecutiveTabularRows(cellRows)
    }

    // MARK: Glyph collection

    /// A row under construction: `y` is the vertical midpoint of the
    /// first glyph placed in the row and is not updated afterwards.
    private struct RowCluster {
        var y: CGFloat
        var glyphs: [PDFGlyph]
    }

    private static func collectGlyphs(from page: PDFPage, text: String) -> [PDFGlyph] {
        // `characterBounds(at:)` uses UTF-16 offsets, so we track the
        // running UTF-16 offset while walking `text` by character.
        //
        // Space / tab / newline characters carry bounds that span the
        // whitespace they introduce — including column gaps many points
        // wide — so including them in the glyph stream hides the gap
        // between "Item" and "Qty" (see e.g. PDFKit on a 3-column PDF,
        // where the space glyph between columns reports width ≈ 95pt).
        // Dropping whitespace glyphs turns "gap between meaningful
        // characters" into the real signal we cluster on.
        var glyphs: [PDFGlyph] = []
        glyphs.reserveCapacity(text.utf16.count)
        var utf16Offset = 0
        for character in text {
            defer { utf16Offset += character.utf16.count }
            if character.isWhitespace { continue }
            let bounds = page.characterBounds(at: utf16Offset)
            // Skip glyphs for which PDFKit reports an empty rect.
            if bounds.width > 0 || bounds.height > 0 {
                glyphs.append(PDFGlyph(scalar: character, rect: bounds))
            }
        }
        return glyphs
    }

    // MARK: Row clustering

    /// Groups glyphs into rows, top of page first, each row ordered left
    /// to right.
    ///
    /// A glyph joins the current row when its vertical midpoint is
    /// within `rowTolerance` of the row's first glyph; otherwise it
    /// starts a new row.
    static func clusterRows(_ glyphs: [PDFGlyph]) -> [[PDFGlyph]] {
        // Sort by y descending — PDF coordinates have origin at bottom-
        // left, so top-of-page rows carry the highest y values.
        //
        // The predicate compares `midY` only. A tolerance-based "same
        // row, so order by x" rule is not transitive and therefore not
        // the strict weak ordering `sorted(by:)` requires; left-to-right
        // order is established per row below instead.
        let topDown = glyphs.sorted { $0.rect.midY > $1.rect.midY }

        var clusters: [RowCluster] = []
        for glyph in topDown {
            let y = glyph.rect.midY
            if let current = clusters.indices.last, abs(clusters[current].y - y) < rowTolerance {
                clusters[current].glyphs.append(glyph)
            } else {
                clusters.append(RowCluster(y: y, glyphs: [glyph]))
            }
        }

        return clusters.map { cluster in
            cluster.glyphs.sorted { $0.rect.minX < $1.rect.minX }
        }
    }

    // MARK: Row → cells

    /// Splits one left-to-right row into cell strings, starting a new
    /// cell whenever the horizontal gap to the previous glyph exceeds
    /// `columnGap`. Cells that are empty after trimming are dropped.
    static func cellsForRow(_ row: [PDFGlyph]) -> [String] {
        guard let first = row.first else { return [] }

        var cells: [String] = []
        var buffer = String(first.scalar)
        var cursor = first.rect.maxX

        for glyph in row.dropFirst() {
            let gap = glyph.rect.minX - cursor
            if gap > columnGap {
                cells.append(buffer.trimmingCharacters(in: .whitespaces))
                buffer = String(glyph.scalar)
            } else {
                buffer.append(glyph.scalar)
            }
            cursor = glyph.rect.maxX
        }
        cells.append(buffer.trimmingCharacters(in: .whitespaces))
        return cells.filter { !$0.isEmpty }
    }

    // MARK: Tabular row grouping

    /// Collects runs of consecutive rows that have two or more cells.
    /// A row with fewer cells ends the current run.
    static func groupConsecutiveTabularRows(_ rows: [[String]]) -> [PDFTable] {
        var tables: [PDFTable] = []
        var current: [[String]] = []

        for row in rows {
            if row.count >= 2 {
                current.append(row)
            } else if !current.isEmpty {
                tables.append(PDFTable(rows: current))
                current = []
            }
        }
        if !current.isEmpty {
            tables.append(PDFTable(rows: current))
        }
        // Single-row "tables" are almost always form lines ("Invoice: 1234"),
        // not real tables — drop them so downstream consumers don't have to.
        return tables.filter { $0.rows.count >= 2 }
    }
}
//
//  PlainTextAdapter.swift
//  osaurus
//
//  Wraps the existing plain-text ingress path in `DocumentParser`. Claims
//  roughly the 60 extensions that were previously handled by the inline
//  `case _ where isPlainText(ext:)` branch — `.txt`, `.md`, source code,
//  config files, etc. Behaviour is intentionally identical to the legacy
//  switch: UTF-8 first, ISO-Latin-1 retry, post-read character-count
//  truncation marker. This adapter is a migration bridge, not a fidelity
//  improvement.
//

import Foundation

public struct PlainTextAdapter: DocumentFormatAdapter {
    public let formatId = "plaintext"

    public init() {}

    /// Claims any file whose lowercased path extension is listed in
    /// `plainTextExtensions`; `uti` is not consulted.
    public func canHandle(url: URL, uti: String?) -> Bool {
        let ext = url.pathExtension.lowercased()
        return Self.plainTextExtensions.contains(ext)
    }

    /// Reads the file as text and returns it, capped by
    /// `applyCharacterCap`, as both the representation and the fallback.
    ///
    /// - Throws: `DocumentAdapterError.sizeLimitExceeded` when
    ///   `sizeLimit > 0` and the file is larger, `.readFailed` when
    ///   neither decoding attempt succeeds, `.emptyContent` when the
    ///   text is blank.
    public func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument {
        let fileSize = Int64((try? url.resourceValues(forKeys: [.fileSizeKey]))?.fileSize ?? 0)
        if sizeLimit > 0, fileSize > sizeLimit {
            throw DocumentAdapterError.sizeLimitExceeded(actual: fileSize, limit: sizeLimit)
        }

        let contents = try Self.readText(at: url)
        let isBlank = contents.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
        if isBlank {
            throw DocumentAdapterError.emptyContent
        }

        let capped = Self.applyCharacterCap(contents)
        let representation = AnyStructuredRepresentation(
            formatId: formatId,
            underlying: PlainTextRepresentation(text: capped)
        )

        return StructuredDocument(
            formatId: formatId,
            filename: url.lastPathComponent,
            fileSize: fileSize,
            representation: representation,
            textFallback: capped
        )
    }

    // MARK: - Helpers

    /// Strict UTF-8 first; on failure, re-read the bytes and decode them
    /// as ISO-Latin-1 for files that are "mostly text" with a few
    /// non-UTF-8 bytes — same behaviour as the legacy path. If that also
    /// fails, the error reported is the one from the UTF-8 attempt.
    private static func readText(at url: URL) throws -> String {
        do {
            return try String(contentsOf: url, encoding: .utf8)
        } catch {
            if let bytes = try? Data(contentsOf: url),
                let latin1 = String(data: bytes, encoding: .isoLatin1)
            {
                return latin1
            }
            throw DocumentAdapterError.readFailed(underlying: error.localizedDescription)
        }
    }

    /// Preserves the legacy 500K-character UX — consumers already expect the
    /// trailing marker when a document is truncated mid-read. The cap on
    /// bytes-read is higher (see `DocumentLimits.plainText`), so the two
    /// interact: oversized files are refused outright; merely long files
    /// are surfaced with a truncation note.
    static func applyCharacterCap(_ text: String) -> String {
        let cap = 500_000
        if text.count <= cap {
            return text
        }
        let head = String(text.prefix(cap))
        return head + "\n\n[Document truncated — exceeded \(cap) character limit]"
    }

    static let plainTextExtensions: Set<String> = [
        "txt", "md", "markdown", "csv", "tsv",
        "json", "xml", "yaml", "yml", "toml",
        "log", "ini", "cfg", "conf", "env",
        "swift", "py", "js", "ts", "tsx", "jsx",
        "rs", "go", "java", "kt", "c", "cpp", "h", "hpp",
        "rb", "php", "sh", "bash", "zsh", "fish",
        "css", "scss", "less", "sql",
        "r", "m", "mm", "lua", "pl", "ex", "exs",
        "zig", "nim", "dart", "scala", "groovy",
        "tf", "hcl", "dockerfile",
        "gitignore", "editorconfig", "prettierrc",
    ]
}
//
//  RichDocumentAdapter.swift
//  osaurus
//
//  Wraps the `NSAttributedString(url:documentType:)` path in
//  `DocumentParser.parseRichDocument`. A single adapter covers DOCX, DOC,
//  RTF, RTFD, and HTML today because they share the same underlying
//  framework call and produce the same plain-text output. When stage-4
//  PR 11 lands a high-fidelity DOCX reader (tables, tracked changes,
//  comments) this adapter splits along format lines and this one becomes
//  the RTF/HTML-only path.
//

import AppKit
import Foundation

public struct RichDocumentAdapter: DocumentFormatAdapter {
    public let formatId = "richdoc"

    public init() {}

    /// Claims any file whose lowercased path extension is listed in
    /// `supportedExtensions`; `uti` is not consulted.
    public func canHandle(url: URL, uti: String?) -> Bool {
        Self.supportedExtensions.contains(url.pathExtension.lowercased())
    }

    /// Loads the document through `NSAttributedString` and returns its
    /// plain string, capped by `PlainTextAdapter.applyCharacterCap`.
    ///
    /// - Throws: `DocumentAdapterError.sizeLimitExceeded` when
    ///   `sizeLimit > 0` and the file is larger, `.readFailed` when the
    ///   framework cannot read the document, `.emptyContent` when the
    ///   extracted text is blank.
    ///
    /// NOTE(review): this runs from an `async` context; AppKit's HTML
    /// importer is documented as main-thread-only — confirm the caller
    /// hops to the main actor for `.html` / `.htm`.
    /// NOTE(review): a missing `fileSize` resource value is read as 0,
    /// which passes the size cap; presumably that is the case for
    /// `.rtfd` packages (directories) — verify.
    public func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument {
        let fileSize = Int64((try? url.resourceValues(forKeys: [.fileSizeKey]))?.fileSize ?? 0)
        if sizeLimit > 0, fileSize > sizeLimit {
            throw DocumentAdapterError.sizeLimitExceeded(actual: fileSize, limit: sizeLimit)
        }

        let documentType = Self.documentType(forExtension: url.pathExtension.lowercased())
        let extracted: String
        do {
            var options: [NSAttributedString.DocumentReadingOptionKey: Any] = [:]
            if let documentType {
                options[.documentType] = documentType
            }
            let attributed = try NSAttributedString(
                url: url,
                options: options,
                documentAttributes: nil
            )
            extracted = attributed.string
        } catch {
            throw DocumentAdapterError.readFailed(underlying: error.localizedDescription)
        }

        guard !extracted.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
            throw DocumentAdapterError.emptyContent
        }

        let truncated = PlainTextAdapter.applyCharacterCap(extracted)

        return StructuredDocument(
            formatId: formatId,
            filename: url.lastPathComponent,
            fileSize: fileSize,
            representation: AnyStructuredRepresentation(
                formatId: formatId,
                underlying: PlainTextRepresentation(text: truncated)
            ),
            textFallback: truncated
        )
    }

    // MARK: - Helpers

    static let supportedExtensions: Set<String> = [
        "docx", "doc", "rtf", "rtfd", "html", "htm",
    ]

    /// Maps a lowercased path extension to the explicit
    /// `NSAttributedString` document type to request. `nil` means no
    /// `.documentType` option is set and the framework picks the reader.
    private static func documentType(
        forExtension ext: String
    ) -> NSAttributedString.DocumentType? {
        switch ext {
        case "docx": return nil  // NSAttributedString auto-detects OOXML
        case "doc": return .docFormat
        case "rtf": return .rtf
        // RTFD is a package (RTF plus attachments) with its own document
        // type; asking for the flat `.rtf` reader is the wrong format.
        case "rtfd": return .rtfd
        case "html", "htm": return .html
        default: return nil
        }
    }
}
//
//  XLSXAdapter.swift
//  osaurus
//
//  First real-fidelity adapter: reads `.xlsx` into a typed `Workbook`
//  rather than flattening it to markdown the way the legacy text path
//  would. Backed by CoreXLSX. The adapter intentionally does NOT call
//  `parseStyles()` — that entry point crashes on openpyxl-generated
//  workbooks because CoreXLSX's `PatternFill.patternType` is non-optional
//  while Excel's default empty pattern omits that attribute. Style-
//  dependent fidelity (number formats, column widths, dates that aren't
//  explicitly typed) is deferred to a follow-up slice so this PR can ship
//  behaviour that works against every current-style XLSX writer.
//

import CoreXLSX
import Foundation

public struct XLSXAdapter: DocumentFormatAdapter {
    public let formatId = "xlsx"

    public init() {}

    /// Claims `.xlsx` by path extension (case-insensitive); `uti` is not consulted.
    public func canHandle(url: URL, uti: String?) -> Bool {
        url.pathExtension.lowercased() == "xlsx"
    }

    /// Reads every worksheet of the file at `url` into a typed `Workbook`.
    ///
    /// - Parameters:
    ///   - url: Workbook to open with CoreXLSX.
    ///   - sizeLimit: Maximum accepted file size in bytes; values `<= 0`
    ///     disable the check.
    /// - Throws: `DocumentAdapterError.sizeLimitExceeded`, `.readFailed`
    ///   (the message names the stage that failed), or `.emptyContent`
    ///   when the file contains no worksheets.
    public func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument {
        let fileSize = Int64((try? url.resourceValues(forKeys: [.fileSizeKey]))?.fileSize ?? 0)
        if sizeLimit > 0, fileSize > sizeLimit {
            throw DocumentAdapterError.sizeLimitExceeded(actual: fileSize, limit: sizeLimit)
        }

        guard let file = XLSXFile(filepath: url.path) else {
            throw DocumentAdapterError.readFailed(underlying: "XLSXFile could not open \(url.path)")
        }

        let sharedStrings: [String]
        do {
            // `parseSharedStrings` is nil on workbooks with no text cells,
            // which is legal for a pure-numeric sheet. Treat that as empty.
            let parsed = try file.parseSharedStrings()
            sharedStrings = parsed?.items.map { $0.text ?? "" } ?? []
        } catch {
            throw DocumentAdapterError.readFailed(underlying: "shared strings: \(error.localizedDescription)")
        }

        let coreWorkbooks: [CoreXLSX.Workbook]
        do {
            coreWorkbooks = try file.parseWorkbooks()
        } catch {
            throw DocumentAdapterError.readFailed(underlying: "workbook index: \(error.localizedDescription)")
        }

        var sheets: [Sheet] = []
        for coreWorkbook in coreWorkbooks {
            let pathsAndNames: [(name: String?, path: String)]
            do {
                pathsAndNames = try file.parseWorksheetPathsAndNames(workbook: coreWorkbook)
            } catch {
                throw DocumentAdapterError.readFailed(underlying: "worksheet index: \(error.localizedDescription)")
            }

            for pair in pathsAndNames {
                let coreSheet: Worksheet
                do {
                    coreSheet = try file.parseWorksheet(at: pair.path)
                } catch {
                    throw DocumentAdapterError.readFailed(
                        underlying: "worksheet \(pair.name ?? pair.path): \(error.localizedDescription)"
                    )
                }
                sheets.append(
                    Self.makeSheet(
                        // Unnamed sheets are labelled with their archive path.
                        name: pair.name ?? pair.path,
                        coreSheet: coreSheet,
                        sharedStrings: sharedStrings
                    )
                )
            }
        }

        guard !sheets.isEmpty else {
            throw DocumentAdapterError.emptyContent
        }

        let workbook = Workbook(sheets: sheets, sharedStrings: sharedStrings)
        let textFallback = Self.renderTextFallback(workbook)

        return StructuredDocument(
            formatId: formatId,
            filename: url.lastPathComponent,
            fileSize: fileSize,
            representation: AnyStructuredRepresentation(
                formatId: formatId,
                underlying: workbook
            ),
            textFallback: textFallback
        )
    }

    // MARK: - CoreXLSX → Workbook

    /// Converts one CoreXLSX worksheet into the typed `Sheet`, keeping
    /// cell references, formulas, and merged ranges.
    private static func makeSheet(
        name: String,
        coreSheet: Worksheet,
        sharedStrings: [String]
    ) -> Sheet {
        let rows: [Row] = (coreSheet.data?.rows ?? []).map { coreRow in
            let cells: [Cell] = coreRow.cells.map { coreCell in
                Cell(
                    reference: coreCell.reference.description,
                    value: mapCellValue(coreCell, sharedStrings: sharedStrings),
                    formula: coreCell.formula?.value
                )
            }
            return Row(index: Int(coreRow.reference), cells: cells)
        }

        let mergedRanges: [CellRange] = (coreSheet.mergeCells?.items ?? []).map {
            CellRange(reference: $0.reference)
        }

        return Sheet(name: name, rows: rows, mergedRanges: mergedRanges)
    }

    /// Interprets a CoreXLSX cell's raw string according to its declared
    /// type. Missing or empty raw values, out-of-range shared-string
    /// indices, and unparseable numbers all map to `.empty`.
    private static func mapCellValue(
        _ coreCell: CoreXLSX.Cell,
        sharedStrings: [String]
    ) -> CellValue {
        // CoreXLSX's `Cell.type` is an optional enum; `Cell.value` is a
        // raw string. The interpretation depends on `type`.
        guard let rawValue = coreCell.value, !rawValue.isEmpty else {
            return .empty
        }

        switch coreCell.type {
        case .bool:
            return .bool(rawValue == "1")
        case .sharedString:
            // The raw value is an index into the shared-string table.
            if let index = Int(rawValue), index >= 0, index < sharedStrings.count {
                return .string(sharedStrings[index])
            }
            return .empty
        case .inlineStr:
            if let inline = coreCell.inlineString {
                // CoreXLSX's `InlineString` concatenates all runs for us.
                return .inlineString(inline.text ?? "")
            }
            return .inlineString(rawValue)
        case .string:
            return .string(rawValue)
        case .number, .none:
            // Untyped cells are treated as numbers.
            if let number = Double(rawValue) {
                return .number(number)
            }
            return .empty
        case .date:
            // Explicitly-typed dates are rare in the wild — Excel writers
            // almost always store dates as numbers plus a style. Preserve
            // the raw string so callers that know the style table can
            // reconstruct; callers that don't still see a string.
            return .string(rawValue)
        case .error:
            return .string(rawValue)
        }
    }

    // MARK: - Text fallback

    /// Renders each sheet as a `## Sheet:` heading followed by one
    /// tab-separated line per row (row index first) and an optional
    /// `Merged:` line listing merged ranges.
    private static func renderTextFallback(_ workbook: Workbook) -> String {
        var out: [String] = []
        for sheet in workbook.sheets {
            out.append("## Sheet: \(sheet.name)")
            for row in sheet.rows {
                let cellText = row.cells.map { describeCell($0) }.joined(separator: "\t")
                out.append("\(row.index)\t\(cellText)")
            }
            if !sheet.mergedRanges.isEmpty {
                let ranges = sheet.mergedRanges.map { $0.reference }.joined(separator: ", ")
                out.append("Merged: \(ranges)")
            }
            out.append("")
        }
        return out.joined(separator: "\n").trimmingCharacters(in: .whitespacesAndNewlines)
    }

    /// Formats one cell for the text fallback: the value, followed by
    /// the formula in brackets when the cell carries one (or just
    /// `=formula` when the value renders empty).
    private static func describeCell(_ cell: Cell) -> String {
        let base: String
        switch cell.value {
        case .empty:
            base = ""
        case .number(let value):
            // Whole numbers print without a trailing ".0".
            // `Int(exactly:)` is nil for fractional, non-finite, and
            // out-of-range values, so a cell such as 1e19 falls back to
            // the `Double` description instead of trapping in `Int(_:)`.
            if let whole = Int(exactly: value) {
                base = String(whole)
            } else {
                base = String(value)
            }
        case .string(let text), .inlineString(let text):
            base = text
        case .bool(let flag):
            base = flag ? "TRUE" : "FALSE"
        }
        if let formula = cell.formula {
            return base.isEmpty ? "=\(formula)" : "\(base) [=\(formula)]"
        }
        return base
    }
}
"=\(formula)" : "\(base) [=\(formula)]" + } + return base + } +} diff --git a/Packages/OsaurusCore/Services/Documents/XLSXEmitter.swift b/Packages/OsaurusCore/Services/Documents/XLSXEmitter.swift new file mode 100644 index 000000000..479c75f73 --- /dev/null +++ b/Packages/OsaurusCore/Services/Documents/XLSXEmitter.swift @@ -0,0 +1,222 @@ +// +// XLSXEmitter.swift +// osaurus +// +// Writes a typed `Workbook` back out to `.xlsx` using libxlsxwriter. +// Pairs with `XLSXAdapter` to close the read-emit-read round trip that +// makes Excel a first-class format for osaurus agents: an agent can +// ingest a workbook, edit a `Workbook` in-process, and emit it back to +// the user as an attachable artifact. +// +// Licensing notes, surfaced for whoever owns acknowledgements: +// - libxlsxwriter itself is BSD-2-Clause. +// - It vendors `third_party/tmpfileplus/tmpfileplus.c` which is +// MPL 2.0. Statically linking it is permitted; the MPL only +// requires that the source of the covered file remain available. +// A follow-up to `AcknowledgementsView` should list both. +// + +import Foundation +import libxlsxwriter + +public struct XLSXEmitter: DocumentFormatEmitter { + public let formatId = "xlsx" + + public init() {} + + public func canEmit(_ document: StructuredDocument) -> Bool { + document.representation.underlying is Workbook + } + + public func emit(_ document: StructuredDocument, to url: URL) async throws { + guard let workbook = document.representation.underlying as? Workbook else { + throw DocumentAdapterError.writeFailed( + underlying: "emit called with non-Workbook representation" + ) + } + + // libxlsxwriter operates on a filename — it writes directly to the + // destination during `workbook_close` rather than handing back + // bytes. The caller has already resolved/contained `url` per the + // emitter contract. + let workbookHandle: UnsafeMutablePointer? 
= url.path.withCString { + workbook_new($0) + } + guard let lxwWorkbook = workbookHandle else { + throw DocumentAdapterError.writeFailed( + underlying: "workbook_new failed for \(url.path)" + ) + } + + var pendingError: DocumentAdapterError? + + for sheet in workbook.sheets { + let sheetHandle: UnsafeMutablePointer? = sheet.name.withCString { + workbook_add_worksheet(lxwWorkbook, $0) + } + guard let lxwSheet = sheetHandle else { + pendingError = .writeFailed( + underlying: "workbook_add_worksheet failed for '\(sheet.name)'" + ) + break + } + if let err = Self.writeSheet(sheet, to: lxwSheet) { + pendingError = err + break + } + } + + // `workbook_close` is ALWAYS called, even on earlier errors, so + // libxlsxwriter can release its buffers and temp files. + let closeError = workbook_close(lxwWorkbook) + if pendingError == nil, closeError.rawValue != 0 { + pendingError = .writeFailed(underlying: "workbook_close error \(closeError.rawValue)") + } + + if let error = pendingError { + // Best-effort cleanup — leaving a partial .xlsx behind would + // masquerade as a successful emit to any later reader. + try? FileManager.default.removeItem(at: url) + throw error + } + } + + // MARK: - Internals + + private static func writeSheet( + _ sheet: Sheet, + to lxwSheet: UnsafeMutablePointer + ) -> DocumentAdapterError? { + for row in sheet.rows { + for cell in row.cells { + if let error = writeCell(cell, to: lxwSheet) { + return error + } + } + } + for range in sheet.mergedRanges { + guard let coords = parseRange(range.reference) else { + return .writeFailed(underlying: "Bad merge range '\(range.reference)'") + } + // Passing a nil string tells libxlsxwriter to preserve whatever + // was already written at the top-left cell of the range; our + // top-left cell was emitted by the loop above. 
+ let err = worksheet_merge_range( + lxwSheet, + coords.firstRow, + coords.firstCol, + coords.lastRow, + coords.lastCol, + nil, + nil + ) + if err.rawValue != 0 { + return .writeFailed( + underlying: "worksheet_merge_range \(range.reference) → \(err.rawValue)" + ) + } + } + return nil + } + + private static func writeCell( + _ cell: Cell, + to lxwSheet: UnsafeMutablePointer + ) -> DocumentAdapterError? { + guard let coords = parseA1(cell.reference) else { + return .writeFailed(underlying: "Bad cell reference '\(cell.reference)'") + } + let row = coords.row + let col = coords.col + + if let formula = cell.formula { + let err = formula.withCString { + worksheet_write_formula(lxwSheet, row, col, $0, nil) + } + if err.rawValue != 0 { + return .writeFailed( + underlying: "worksheet_write_formula \(cell.reference) → \(err.rawValue)" + ) + } + return nil + } + + switch cell.value { + case .empty: + return nil + case .number(let value): + let err = worksheet_write_number(lxwSheet, row, col, value, nil) + if err.rawValue != 0 { + return .writeFailed( + underlying: "worksheet_write_number \(cell.reference) → \(err.rawValue)" + ) + } + case .string(let text), .inlineString(let text): + let err = text.withCString { + worksheet_write_string(lxwSheet, row, col, $0, nil) + } + if err.rawValue != 0 { + return .writeFailed( + underlying: "worksheet_write_string \(cell.reference) → \(err.rawValue)" + ) + } + case .bool(let flag): + let err = worksheet_write_boolean(lxwSheet, row, col, flag ? 1 : 0, nil) + if err.rawValue != 0 { + return .writeFailed( + underlying: "worksheet_write_boolean \(cell.reference) → \(err.rawValue)" + ) + } + } + return nil + } + + // MARK: - A1 parsing + + /// Parses an A1-style cell reference ("B3", "AA10") into the 0-indexed + /// row and column that libxlsxwriter expects. Returns nil for anything + /// that doesn't match `[A-Z]+[0-9]+`. + private static func parseA1(_ reference: String) -> (row: UInt32, col: UInt16)? 
{ + var letters: [UInt8] = [] + var digits: [UInt8] = [] + for scalar in reference.unicodeScalars { + guard scalar.isASCII, let byte = UInt8(exactly: scalar.value) else { return nil } + switch byte { + case 0x41 ... 0x5A: // A-Z + letters.append(byte) + case 0x61 ... 0x7A: // a-z + letters.append(byte - 32) + case 0x30 ... 0x39: // 0-9 + digits.append(byte) + default: + return nil + } + } + guard !letters.isEmpty, !digits.isEmpty else { return nil } + + let rowOneBasedString = String(bytes: digits, encoding: .ascii) ?? "" + guard let rowOneBased = UInt32(rowOneBasedString), rowOneBased > 0 else { return nil } + + var col: Int = 0 + let base = Int(UInt8(ascii: "A")) + for byte in letters { + col = col * 26 + (Int(byte) - base + 1) + } + guard col > 0, col <= 16_384 else { return nil } // Excel col cap + + return (row: rowOneBased - 1, col: UInt16(col - 1)) + } + + /// Parses an A1:A1 range ("A5:B5") into the four 0-indexed coordinates + /// libxlsxwriter's merge call wants. + private static func parseRange( + _ reference: String + ) -> (firstRow: UInt32, firstCol: UInt16, lastRow: UInt32, lastCol: UInt16)? { + let parts = reference.split(separator: ":", maxSplits: 1) + guard parts.count == 2, + let first = parseA1(String(parts[0])), + let last = parseA1(String(parts[1])) + else { return nil } + return (first.row, first.col, last.row, last.col) + } +} diff --git a/Packages/OsaurusCore/Tests/Documents/CSVAdapterTests.swift b/Packages/OsaurusCore/Tests/Documents/CSVAdapterTests.swift new file mode 100644 index 000000000..31445a35b --- /dev/null +++ b/Packages/OsaurusCore/Tests/Documents/CSVAdapterTests.swift @@ -0,0 +1,175 @@ +// +// CSVAdapterTests.swift +// osaurusTests +// +// Covers the in-memory CSV / TSV adapter. Pins the fields business users +// expect to survive an ingest: delimiter auto-pick per extension, quoted +// cells with commas and newlines, `""` quote escapes, UTF-8 BOM handling, +// header-row detection, and size-limit refusal. 
//  The streaming variant
//  has its own suite — the parser state machine is shared, so this one
//  focuses on the eager path + the typed CSVTable output.
//

import Foundation
import Testing

@testable import OsaurusCore

/// Eager-path coverage for `CSVAdapter`: each test writes a small fixture
/// to the temporary directory, parses it, and checks the typed `CSVTable`.
@Suite("CSVAdapter")
struct CSVAdapterTests {

    @Test func canHandle_claimsCSVAndTSV() {
        let adapter = CSVAdapter()
        #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.csv"), uti: nil))
        #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.TSV"), uti: nil))
        #expect(!adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.txt"), uti: nil))
    }

    @Test func parse_splitsHeaderFromRecords() async throws {
        let url = try Self.write(
            """
            Month,Revenue,Status
            January,1200,closed
            February,950,closed
            March,1400,open
            """,
            ext: "csv"
        )
        defer { try? FileManager.default.removeItem(at: url) }

        let document = try await CSVAdapter().parse(url: url, sizeLimit: 0)
        // `#require` fails the test and stops here when the cast fails.
        let table = try #require(document.representation.underlying as? CSVTable, "not a CSVTable")
        #expect(table.header == ["Month", "Revenue", "Status"])
        #expect(table.records.count == 3)
        #expect(table.records.first == ["January", "1200", "closed"])
        #expect(table.delimiter == ",")
    }

    @Test func parse_tsvUsesTabDelimiter() async throws {
        let url = try Self.write("Col1\tCol2\nA\t1\nB\t2\n", ext: "tsv")
        defer { try? FileManager.default.removeItem(at: url) }

        let document = try await CSVAdapter().parse(url: url, sizeLimit: 0)
        let table = try #require(document.representation.underlying as? CSVTable, "not a CSVTable")
        #expect(table.delimiter == "\t")
        #expect(table.header == ["Col1", "Col2"])
        #expect(table.records == [["A", "1"], ["B", "2"]])
    }

    @Test func parse_preservesQuotedCommasAndNewlines() async throws {
        // Row 1 has a comma inside the quoted first field and a newline
        // inside the quoted second field. Both must end up as single cells.
        let url = try Self.write(
            """
            name,note
            "Smith, John","note line 1
            note line 2"
            Doe,plain
            """,
            ext: "csv"
        )
        defer { try? FileManager.default.removeItem(at: url) }

        let document = try await CSVAdapter().parse(url: url, sizeLimit: 0)
        let table = try #require(document.representation.underlying as? CSVTable, "not a CSVTable")
        #expect(table.records.count == 2)
        #expect(table.records[0] == ["Smith, John", "note line 1\nnote line 2"])
        #expect(table.records[1] == ["Doe", "plain"])
    }

    @Test func parse_expandsDoubleQuoteEscape() async throws {
        // Raw `#"""` so the embedded `""` escapes don't fight the compiler.
        let url = try Self.write(
            #"""
            code,label
            A,"He said ""yes"""
            """#,
            ext: "csv"
        )
        defer { try? FileManager.default.removeItem(at: url) }

        let document = try await CSVAdapter().parse(url: url, sizeLimit: 0)
        let table = try #require(document.representation.underlying as? CSVTable, "not a CSVTable")
        #expect(table.records.first == ["A", #"He said "yes""#])
    }

    @Test func parse_stripsUTF8BOM() async throws {
        let bom = Data([0xEF, 0xBB, 0xBF])
        // `Data(_.utf8)` cannot fail, so no force-unwrap is needed.
        let body = Data("Name,Value\nAlpha,1\n".utf8)
        var combined = bom
        combined.append(body)
        let url = FileManager.default.temporaryDirectory
            .appendingPathComponent("osaurus-csv-bom-\(UUID().uuidString).csv")
        try combined.write(to: url)
        defer { try? FileManager.default.removeItem(at: url) }

        let document = try await CSVAdapter().parse(url: url, sizeLimit: 0)
        let table = try #require(document.representation.underlying as? CSVTable, "not a CSVTable")
        #expect(table.header == ["Name", "Value"])
        #expect(table.records.first == ["Alpha", "1"])
    }

    @Test func parse_numericOnlyFirstRowIsNotHeader() async throws {
        // All-numeric first row → no header detection; the whole file
        // should surface as records.
        let url = try Self.write("1,2,3\n4,5,6\n", ext: "csv")
        defer { try? FileManager.default.removeItem(at: url) }

        let document = try await CSVAdapter().parse(url: url, sizeLimit: 0)
        let table = try #require(document.representation.underlying as? CSVTable, "not a CSVTable")
        #expect(table.header == nil)
        #expect(table.records == [["1", "2", "3"], ["4", "5", "6"]])
    }

    @Test func parse_rejectsOversizedFile() async throws {
        let url = try Self.write("a,b\n1,2\n", ext: "csv")
        defer { try? FileManager.default.removeItem(at: url) }

        await #expect(throws: DocumentAdapterError.self) {
            _ = try await CSVAdapter().parse(url: url, sizeLimit: 1)
        }
    }

    @Test func parse_emptyFileThrowsEmptyContent() async throws {
        let url = try Self.write("", ext: "csv")
        defer { try? FileManager.default.removeItem(at: url) }

        await #expect(throws: DocumentAdapterError.self) {
            _ = try await CSVAdapter().parse(url: url, sizeLimit: 0)
        }
    }

    @Test func parse_crlfLineEndingsAreRecognised() async throws {
        let url = FileManager.default.temporaryDirectory
            .appendingPathComponent("osaurus-csv-crlf-\(UUID().uuidString).csv")
        try Data("a,b\r\n1,2\r\n3,4\r\n".utf8).write(to: url)
        defer { try? FileManager.default.removeItem(at: url) }

        let document = try await CSVAdapter().parse(url: url, sizeLimit: 0)
        let table = try #require(document.representation.underlying as? CSVTable, "not a CSVTable")
        #expect(table.records == [["1", "2"], ["3", "4"]])
    }

    // MARK: - Fixtures

    /// Writes `content` as UTF-8 to a uniquely named temp file with the
    /// given extension and returns its URL. Callers remove the file.
    private static func write(_ content: String, ext: String) throws -> URL {
        let url = FileManager.default.temporaryDirectory
            .appendingPathComponent("osaurus-csv-\(UUID().uuidString).\(ext)")
        try content.write(to: url, atomically: true, encoding: .utf8)
        return url
    }
}
diff --git a/Packages/OsaurusCore/Tests/Documents/CSVStreamerTests.swift b/Packages/OsaurusCore/Tests/Documents/CSVStreamerTests.swift
new file mode 100644
index 000000000..6f13097a4
--- /dev/null
+++ b/Packages/OsaurusCore/Tests/Documents/CSVStreamerTests.swift
@@ -0,0 +1,147 @@
//
//  CSVStreamerTests.swift
//  osaurusTests
//
//  Streaming-side coverage for the CSV pipeline. The parser state machine
//  is shared with `CSVAdapter`, so these tests focus on the streaming
//  contract: back-pressure via AsyncThrowingStream, row-by-row emission,
//  UTF-8 boundary handling across chunk edges, and cancellation.
//

import Foundation
import Testing

@testable import OsaurusCore

@Suite("CSVStreamer")
struct CSVStreamerTests {

    @Test func stream_yieldsRowsInOrder() async throws {
        let url = try Self.write(
            """
            name,score
            Alice,10
            Bob,20
            Carol,30
            """,
            ext: "csv"
        )
        defer { try? FileManager.default.removeItem(at: url) }

        var collected: [[String]] = []
        for try await record in CSVStreamer().stream(url: url) {
            collected.append(record.cells)
        }
        #expect(
            collected == [
                ["name", "score"],
                ["Alice", "10"],
                ["Bob", "20"],
                ["Carol", "30"],
            ]
        )
    }

    @Test func stream_numbersRowsFromOne() async throws {
        let url = try Self.write("a,b\n1,2\n3,4\n", ext: "csv")
        defer { try?
FileManager.default.removeItem(at: url) }

        var lineNumbers: [Int] = []
        for try await record in CSVStreamer().stream(url: url) {
            lineNumbers.append(record.lineNumber)
        }
        #expect(lineNumbers == [1, 2, 3])
    }

    @Test func stream_tsvSplitsOnTab() async throws {
        let fixture = try Self.write("col1\tcol2\nA\t1\nB\t2\n", ext: "tsv")
        defer { try? FileManager.default.removeItem(at: fixture) }

        var rows: [[String]] = []
        for try await record in CSVStreamer().stream(url: fixture) {
            rows.append(record.cells)
        }
        #expect(rows == [["col1", "col2"], ["A", "1"], ["B", "2"]])
    }

    @Test func stream_preservesQuotedNewlinesAcrossChunks() async throws {
        // The 70,000-character filler makes the quoted field large, with
        // the intent that it spans more than one read of the file.
        // NOTE(review): the streamer's chunk size is not visible from this
        // test — confirm 70 KB really exceeds it.
        let filler = String(repeating: "x", count: 70_000)
        let fixture = try Self.write(
            """
            id,note
            1,"line one
            line two \(filler) end"
            2,plain
            """,
            ext: "csv"
        )
        defer { try? FileManager.default.removeItem(at: fixture) }

        var rows: [[String]] = []
        for try await record in CSVStreamer().stream(url: fixture) {
            rows.append(record.cells)
        }
        #expect(rows.count == 3)

        // Row 1 is the record whose second field embeds the newline.
        let quotedRow = rows[1]
        #expect(quotedRow.count == 2)
        #expect(quotedRow[0] == "1")
        #expect(quotedRow[1].hasPrefix("line one\nline two "))
        #expect(quotedRow[1].hasSuffix(" end"))
        #expect(rows[2] == ["2", "plain"])
    }

    @Test func stream_cancellationStopsMidFile() async throws {
        // Header plus 10,000 data rows, so the consumer below stops long
        // before the file has been fully drained.
        let dataRows = (0 ..< 10_000).map { "\($0),\($0 * 2)\n" }.joined()
        let fixture = try Self.write("id,v\n" + dataRows, ext: "csv")
        defer { try? FileManager.default.removeItem(at: fixture) }

        let consumer = Task { () -> Int in
            var seen = 0
            for try await _ in CSVStreamer().stream(url: fixture) {
                seen += 1
                // The consumer stops by returning out of the loop after the
                // third row. Nothing here calls `Task.cancel()`, so this
                // test covers an early exit by the consumer, not explicit
                // task cancellation.
                guard seen < 3 else { return seen }
            }
            return seen
        }
        let delivered = try await consumer.value
        #expect(delivered == 3)
    }

    // MARK: - UTF-8 boundary split helper

    @Test func splitAtUTF8Boundary_keepsCompleteScalars() {
        // `é` is C3 A9 in UTF-8. The buffer ends on a lone lead byte, so
        // that byte must come back as the tail while the first three bytes
        // stay decodable.
        let buffer = Data([0x61, 0xC3, 0xA9, 0xC3])  // "a", "é", then a lead byte C3 with no continuation
        let (decodable, tail) = CSVStreamer.splitAtUTF8Boundary(buffer)
        #expect(decodable.count == 3)
        #expect(tail == Data([0xC3]))
    }

    @Test func splitAtUTF8Boundary_shortBufferReturnedAsIs() {
        let buffer = Data([0x61, 0x62])
        let (decodable, tail) = CSVStreamer.splitAtUTF8Boundary(buffer)
        #expect(decodable == buffer)
        #expect(tail.isEmpty)
    }

    // MARK: - Fixtures

    /// Writes `content` as UTF-8 to a uniquely named temp file with the
    /// given extension and returns its URL. Callers remove the file.
    private static func write(_ content: String, ext: String) throws -> URL {
        let destination = FileManager.default.temporaryDirectory
            .appendingPathComponent("osaurus-csvstream-\(UUID().uuidString).\(ext)")
        try content.write(to: destination, atomically: true, encoding: .utf8)
        return destination
    }
}
diff --git a/Packages/OsaurusCore/Tests/Documents/DocumentParserShimTests.swift b/Packages/OsaurusCore/Tests/Documents/DocumentParserShimTests.swift
new file mode 100644
index 000000000..580383bb2
--- /dev/null
+++ b/Packages/OsaurusCore/Tests/Documents/DocumentParserShimTests.swift
@@ -0,0 +1,125 @@
//
//  DocumentParserShimTests.swift
//  osaurusTests
//
//  Integration tests for the `DocumentParser.parseAll` shim: verifies that
//  the registry is consulted first, that `.emptyContent` from a registered
//  adapter falls through to the legacy switch, and
//  that errors bubble up
//  translated into the legacy `ParseError` surface. Uses the shared
//  registry (register + `unregisterAll` in teardown) so the shim's call
//  site is exactly the one reached from production.
//

import Foundation
import Testing

@testable import OsaurusCore

/// Serialized because every test registers into and cleans up the shared
/// `DocumentFormatRegistry`.
@Suite("DocumentParser.parseAll registry shim", .serialized)
struct DocumentParserShimTests {

    // A fixture-extension adapter so tests don't collide with built-ins.
    private static let fixtureFormatId = "test-fixture-shim"
    private static let fixtureExtension = "fixtureshim"

    /// Registers a fixture adapter on the shared registry that returns
    /// `content` for any file with the fixture extension.
    private func registerFixture(content: String) {
        DocumentFormatRegistry.shared.register(
            adapter: FixtureAdapter(
                formatId: Self.fixtureFormatId,
                extensions: [Self.fixtureExtension],
                produce: content
            )
        )
    }

    /// Removes every registration made under the fixture format id.
    private func cleanUp() {
        DocumentFormatRegistry.shared.unregisterAll(formatId: Self.fixtureFormatId)
    }

    // MARK: - Routing

    @Test func parseAll_routesThroughRegistry_whenAdapterClaims() throws {
        registerFixture(content: "routed-through-registry")
        defer { cleanUp() }

        let url = try writeFile(content: "ignored", ext: Self.fixtureExtension)
        defer { try? FileManager.default.removeItem(at: url) }

        let attachments = try DocumentParser.parseAll(url: url)
        #expect(attachments.count == 1)
        #expect(attachments.first?.documentContent == "routed-through-registry")
    }

    @Test func parseAll_fallsThroughOnEmptyContent() throws {
        // Fixture adapter with empty payload → adapter throws .emptyContent →
        // shim should try the legacy switch, which for an unknown extension
        // surfaces `ParseError.unsupportedFormat`.
        registerFixture(content: "")
        defer { cleanUp() }

        let url = try writeFile(content: "ignored", ext: Self.fixtureExtension)
        defer { try? FileManager.default.removeItem(at: url) }

        #expect(throws: DocumentParser.ParseError.self) {
            _ = try DocumentParser.parseAll(url: url)
        }
    }

    @Test func parseAll_preservesLegacyPath_whenNoAdapterMatches() throws {
        // No fixture registered. A plain .txt file still flows through the
        // legacy switch and produces exactly one document attachment.
        let url = try writeFile(content: "legacy path still works", ext: "txt")
        defer { try? FileManager.default.removeItem(at: url) }

        let attachments = try DocumentParser.parseAll(url: url)
        #expect(attachments.count == 1)
        #expect(attachments.first?.documentContent == "legacy path still works")
    }

    // MARK: - Bootstrap

    @Test func bootstrap_registersExpectedBuiltInsOnIsolatedRegistry() {
        // A private registry instance keeps this test off the shared one.
        let registry = DocumentFormatRegistry()
        DocumentAdaptersBootstrap.registerBuiltIns(registry: registry)
        let ids = registry.registeredFormatIds()
        #expect(ids.contains("plaintext"))
        #expect(ids.contains("pdf"))
        #expect(ids.contains("richdoc"))
        #expect(ids.contains("xlsx"))
    }

    // MARK: - Fixtures

    /// Writes `content` as UTF-8 to a uniquely named temp file with the
    /// given extension and returns its URL. Callers remove the file.
    private func writeFile(content: String, ext: String) throws -> URL {
        let url = FileManager.default.temporaryDirectory
            .appendingPathComponent("osaurus-shim-\(UUID().uuidString).\(ext)")
        try content.write(to: url, atomically: true, encoding: .utf8)
        return url
    }

    /// Minimal adapter that claims files by extension and returns a fixed
    /// plain-text payload, or throws `.emptyContent` when that payload is
    /// empty.
    private struct FixtureAdapter: DocumentFormatAdapter {
        let formatId: String
        // Lowercased path extensions this adapter claims. The element type
        // is `String`: the set is built from `fixtureExtension` and queried
        // with `url.pathExtension.lowercased()`.
        let extensions: Set<String>
        let produce: String

        func canHandle(url: URL, uti: String?) -> Bool {
            extensions.contains(url.pathExtension.lowercased())
        }

        func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument {
            guard !produce.isEmpty else {
                throw DocumentAdapterError.emptyContent
            }
            return StructuredDocument(
                formatId: formatId,
                filename: url.lastPathComponent,
                fileSize: 0,
                representation: AnyStructuredRepresentation(
                    formatId: formatId,
                    underlying: PlainTextRepresentation(text: produce)
                ),
                textFallback: produce
            )
        }
    }
}
diff --git a/Packages/OsaurusCore/Tests/Documents/Fixtures/xlsx/sample.xlsx b/Packages/OsaurusCore/Tests/Documents/Fixtures/xlsx/sample.xlsx
new file mode 100644
index 000000000..f2a42dd32
Binary files /dev/null and b/Packages/OsaurusCore/Tests/Documents/Fixtures/xlsx/sample.xlsx differ
diff --git a/Packages/OsaurusCore/Tests/Documents/PDFAdapterTests.swift b/Packages/OsaurusCore/Tests/Documents/PDFAdapterTests.swift
new file mode 100644
index 000000000..ed8033d38
--- /dev/null
+++ b/Packages/OsaurusCore/Tests/Documents/PDFAdapterTests.swift
@@ -0,0 +1,96 @@
//
//  PDFAdapterTests.swift
//  osaurusTests
//
//  Exercises the text-layer PDF adapter. Synthesises tiny PDFs via Core
//  Graphics so the test bundle doesn't carry binary fixtures. The
//  image-only fallback path stays in the legacy `DocumentParser` switch
//  for now; the adapter intentionally throws `.emptyContent` when there's
//  no text layer so the shim can fall through.
//

import AppKit
import CoreGraphics
import Foundation
import Testing

@testable import OsaurusCore

/// Text-layer coverage for `PDFAdapter`, driven by PDFs drawn on the fly.
@Suite("PDFAdapter")
struct PDFAdapterTests {

    @Test func canHandle_acceptsPDFExtensionOnly() {
        let adapter = PDFAdapter()
        let lowercase = URL(fileURLWithPath: "/tmp/a.pdf")
        let uppercase = URL(fileURLWithPath: "/tmp/a.PDF")
        let plainText = URL(fileURLWithPath: "/tmp/a.txt")
        #expect(adapter.canHandle(url: lowercase, uti: nil))
        #expect(adapter.canHandle(url: uppercase, uti: nil))
        #expect(!adapter.canHandle(url: plainText, uti: nil))
    }

    @Test func parse_readsTextLayer() async throws {
        let pdfURL = try Self.writePDF(text: "Hello PDF body content")
        defer { try? FileManager.default.removeItem(at: pdfURL) }

        let parsed = try await PDFAdapter().parse(url: pdfURL, sizeLimit: 0)
        #expect(parsed.formatId == "pdf")
        #expect(parsed.textFallback.contains("Hello PDF body content"))
    }

    @Test func parse_throwsEmptyContentForPDFWithNoTextLayer() async throws {
        let pdfURL = try Self.writeBlankPDF()
        defer { try? FileManager.default.removeItem(at: pdfURL) }

        await #expect(throws: DocumentAdapterError.self) {
            _ = try await PDFAdapter().parse(url: pdfURL, sizeLimit: 0)
        }
    }

    @Test func parse_throwsSizeLimitExceededAboveCap() async throws {
        let pdfURL = try Self.writePDF(text: "tiny")
        defer { try? FileManager.default.removeItem(at: pdfURL) }

        // A one-byte limit is smaller than any PDF the fixture can produce.
        await #expect(throws: DocumentAdapterError.self) {
            _ = try await PDFAdapter().parse(url: pdfURL, sizeLimit: 1)
        }
    }

    // MARK: - Fixtures

    /// Writes a single-page 300×200 PDF containing `text` to a uniquely
    /// named temp file and returns its URL. Callers remove the file.
    private static func writePDF(text: String) throws -> URL {
        let destination = FileManager.default.temporaryDirectory
            .appendingPathComponent("osaurus-pdf-\(UUID().uuidString).pdf")
        var pageBounds = CGRect(x: 0, y: 0, width: 300, height: 200)
        guard let pdfContext = CGContext(destination as CFURL, mediaBox: &pageBounds, nil) else {
            throw FixtureError.contextCreationFailed
        }
        pdfContext.beginPDFPage(nil)

        // Draw the text into the PDF context via NSAttributedString so PDFKit
        // can recover it from the text layer on read-back.
        let graphicsContext = NSGraphicsContext(cgContext: pdfContext, flipped: false)
        NSGraphicsContext.saveGraphicsState()
        NSGraphicsContext.current = graphicsContext
        let attributed = NSAttributedString(
            string: text,
            attributes: [.font: NSFont.systemFont(ofSize: 14)]
        )
        attributed.draw(at: NSPoint(x: 20, y: 100))
        NSGraphicsContext.restoreGraphicsState()

        pdfContext.endPDFPage()
        pdfContext.closePDF()
        return destination
    }

    /// Writes a single-page 100×100 PDF with nothing drawn on it, so it
    /// has no text layer. Callers remove the file.
    private static func writeBlankPDF() throws -> URL {
        let destination = FileManager.default.temporaryDirectory
            .appendingPathComponent("osaurus-pdf-blank-\(UUID().uuidString).pdf")
        var pageBounds = CGRect(x: 0, y: 0, width: 100, height: 100)
        guard let pdfContext = CGContext(destination as CFURL, mediaBox: &pageBounds, nil) else {
            throw FixtureError.contextCreationFailed
        }
        pdfContext.beginPDFPage(nil)
        pdfContext.endPDFPage()
        pdfContext.closePDF()
        return destination
    }

    private enum FixtureError: Error { case contextCreationFailed }
}
diff --git a/Packages/OsaurusCore/Tests/Documents/PDFTableDetectorTests.swift b/Packages/OsaurusCore/Tests/Documents/PDFTableDetectorTests.swift
new file mode 100644
index 000000000..a29756e7d
--- /dev/null
+++ b/Packages/OsaurusCore/Tests/Documents/PDFTableDetectorTests.swift
@@ -0,0 +1,231 @@
//
//  PDFTableDetectorTests.swift
//  osaurusTests
//
//  Layout-aware table extraction coverage for `PDFAdapter`. The detector
//  stages (`clusterRows`, `cellsForRow`, `groupConsecutiveTabularRows`,
//  and the pure-function `detect(glyphs:)`) are exercised with
//  synthesised `PDFGlyph` grids — Core Graphics-generated test PDFs
//  report character bounds that span trailing whitespace, which hides
//  the real column gaps and would make end-to-end fixtures unreliable.
//  An integration test below still verifies the adapter wraps everything
//  into a `PDFDocumentRepresentation` and preserves the flat text fallback.
//

import AppKit
import CoreGraphics
import Foundation
import Testing

@testable import OsaurusCore

@Suite("PDFAdapter table extraction")
struct PDFTableDetectorTests {

    // MARK: - Algorithm: row clustering

    @Test func clusterRows_groupsByYTolerance() {
        // Two rows: one near y=120 (higher on the page, expected first)
        // and one near y=100. Glyphs within a row differ by at most 0.5pt
        // vertically, inside the detector's row tolerance.
        let input: [PDFGlyph] = [
            Self.glyph("A", x: 10, y: 100),
            Self.glyph("B", x: 30, y: 100.5),
            Self.glyph("C", x: 50, y: 100),
            Self.glyph("D", x: 10, y: 120),
            Self.glyph("E", x: 30, y: 120.5),
        ]
        let clustered = PDFTableDetector.clusterRows(input)
        #expect(clustered.count == 2)
        // Top of page first (higher PDF y).
        #expect(clustered.first?.map(\.scalar) == ["D", "E"])
        #expect(clustered.last?.map(\.scalar) == ["A", "B", "C"])
    }

    @Test func clusterRows_sortsWithinRowByX() {
        // Glyphs arrive in scrambled x order — the clusterer must sort.
        let input: [PDFGlyph] = [
            Self.glyph("C", x: 100, y: 50),
            Self.glyph("A", x: 10, y: 50),
            Self.glyph("B", x: 60, y: 50),
        ]
        let clustered = PDFTableDetector.clusterRows(input)
        #expect(clustered.first?.map(\.scalar) == ["A", "B", "C"])
    }

    // MARK: - Algorithm: row → cells

    @Test func cellsForRow_splitsOnWideGap() {
        // Columns start at x = 10 / 60 / 110. The first two hold two
        // touching 6pt-wide glyphs each and the third holds one, which
        // leaves 38pt of empty space between columns — well above the 8pt
        // column threshold.
        let glyphRow: [PDFGlyph] = [
            Self.glyph("A", x: 10, y: 0, width: 6),
            Self.glyph("a", x: 16, y: 0, width: 6),
            Self.glyph("B", x: 60, y: 0, width: 6),
            Self.glyph("b", x: 66, y: 0, width: 6),
            Self.glyph("C", x: 110, y: 0, width: 6),
        ]
        let cells = PDFTableDetector.cellsForRow(glyphRow)
        #expect(cells == ["Aa", "Bb", "C"])
    }

    @Test func cellsForRow_wordsInSameCellStayTogether() {
        // "Net Rev" inside a single cell, with the space glyph itself
        // absent from the input. The 2pt gap it leaves behind is below
        // the 8pt column threshold, so the two words must not be split.
        let glyphRow: [PDFGlyph] = [
            Self.glyph("N", x: 10, y: 0, width: 6),
            Self.glyph("e", x: 16, y: 0, width: 6),
            Self.glyph("t", x: 22, y: 0, width: 6),
            Self.glyph("R", x: 30, y: 0, width: 6),  // 2pt gap where the space was
            Self.glyph("e", x: 36, y: 0, width: 6),
            Self.glyph("v", x: 42, y: 0, width: 6),
        ]
        let cells = PDFTableDetector.cellsForRow(glyphRow)
        #expect(cells == ["NetRev"])
    }

    @Test func cellsForRow_singleGlyphReturnsSingleCell() {
        let glyphRow: [PDFGlyph] = [Self.glyph("X", x: 0, y: 0)]
        #expect(PDFTableDetector.cellsForRow(glyphRow) == ["X"])
    }

    // MARK: - Algorithm: tabular grouping

    @Test func groupConsecutive_collectsMultiCellRunsIntoOneTable() {
        let cellRows: [[String]] = [
            ["Header1", "Header2"],
            ["a", "1"],
            ["b", "2"],
        ]
        let tables = PDFTableDetector.groupConsecutiveTabularRows(cellRows)
        #expect(tables.count == 1)
        #expect(tables.first?.rowCount == 3)
    }

    @Test func groupConsecutive_splitsAcrossSingleCellBreaks() {
        // Prose row in the middle cuts the table in two.
        let cellRows: [[String]] = [
            ["A", "1"],
            ["B", "2"],
            ["paragraph"],
            ["C", "3"],
            ["D", "4"],
        ]
        let tables = PDFTableDetector.groupConsecutiveTabularRows(cellRows)
        #expect(tables.count == 2)
        #expect(tables[0].rowCount == 2)
        #expect(tables[1].rowCount == 2)
    }

    @Test func groupConsecutive_dropsIsolatedSingleTabularRows() {
        // One tabular row on its own (form-field style: "Invoice No. 1234")
        // shouldn't surface as a "table".
        let cellRows: [[String]] = [
            ["Invoice", "No.", "1234"]
        ]
        let tables = PDFTableDetector.groupConsecutiveTabularRows(cellRows)
        #expect(tables.isEmpty)
    }

    @Test func groupConsecutive_emptyInputProducesEmptyOutput() {
        #expect(PDFTableDetector.groupConsecutiveTabularRows([]).isEmpty)
    }

    // MARK: - Algorithm: end-to-end on synthetic glyphs

    @Test func detect_endToEndSyntheticGrid() {
        // 3×3 grid: rows at y = 100 / 80 / 60, columns at x = 10 / 60 / 110.
        let rowOrigins: [CGFloat] = [100, 80, 60]
        let columnOrigins: [CGFloat] = [10, 60, 110]
        let contents = [
            ["I", "Q", "P"],  // Header
            ["a", "1", "9"],
            ["b", "2", "8"],
        ]
        var glyphs: [PDFGlyph] = []
        for (y, rowContents) in zip(rowOrigins, contents) {
            for (x, text) in zip(columnOrigins, rowContents) {
                glyphs.append(Self.glyph(Character(text), x: x, y: y, width: 6))
            }
        }
        let tables = PDFTableDetector.detect(glyphs: glyphs)
        #expect(tables.count == 1)
        #expect(tables.first?.rowCount == 3)
        #expect(tables.first?.columnCount == 3)
        #expect(tables.first?.rows.first == ["I", "Q", "P"])
    }

    // MARK: - Integration: adapter contract

    @Test func parse_emitsPDFDocumentRepresentationWithTextFallback() async throws {
        let pdfURL = try Self.writeHelloPDF()
        defer { try? FileManager.default.removeItem(at: pdfURL) }

        let document = try await PDFAdapter().parse(url: pdfURL, sizeLimit: 0)
        guard let repr = document.representation.underlying as? PDFDocumentRepresentation else {
            Issue.record("representation was not a PDFDocumentRepresentation")
            return
        }
        #expect(repr.pageCount == 1)
        #expect(repr.pages.first?.pageNumber == 1)
        #expect(document.textFallback.contains("Hello"))
    }

    @Test func parse_propagatesEmptyContentForBlankPDF() async throws {
        let pdfURL = try Self.writeBlankPDF()
        defer { try? FileManager.default.removeItem(at: pdfURL) }

        await #expect(throws: DocumentAdapterError.self) {
            _ = try await PDFAdapter().parse(url: pdfURL, sizeLimit: 0)
        }
    }

    // MARK: - Fixtures

    /// Builds a synthetic glyph whose rect has origin (`x`, `y`) and the
    /// given size (5×10 by default).
    private static func glyph(
        _ scalar: Character,
        x: CGFloat,
        y: CGFloat,
        width: CGFloat = 5,
        height: CGFloat = 10
    ) -> PDFGlyph {
        let bounds = CGRect(x: x, y: y, width: width, height: height)
        return PDFGlyph(scalar: scalar, rect: bounds)
    }

    /// Writes a single-page 300×200 PDF containing "Hello PDF" to a
    /// uniquely named temp file. Callers remove the file.
    private static func writeHelloPDF() throws -> URL {
        let destination = FileManager.default.temporaryDirectory
            .appendingPathComponent("osaurus-pdftable-hello-\(UUID().uuidString).pdf")
        var pageBounds = CGRect(x: 0, y: 0, width: 300, height: 200)
        guard let pdfContext = CGContext(destination as CFURL, mediaBox: &pageBounds, nil) else {
            throw WriterError.contextCreationFailed
        }
        pdfContext.beginPDFPage(nil)
        let graphicsContext = NSGraphicsContext(cgContext: pdfContext, flipped: false)
        NSGraphicsContext.saveGraphicsState()
        NSGraphicsContext.current = graphicsContext
        let attributed = NSAttributedString(
            string: "Hello PDF",
            attributes: [.font: NSFont.systemFont(ofSize: 14)]
        )
        attributed.draw(at: NSPoint(x: 20, y: 100))
        NSGraphicsContext.restoreGraphicsState()
        pdfContext.endPDFPage()
        pdfContext.closePDF()
        return destination
    }

    /// Writes a single-page 100×100 PDF with nothing drawn on it.
    /// Callers remove the file.
    private static func writeBlankPDF() throws -> URL {
        let destination = FileManager.default.temporaryDirectory
            .appendingPathComponent("osaurus-pdftable-blank-\(UUID().uuidString).pdf")
        var pageBounds = CGRect(x: 0, y: 0, width: 100, height: 100)
        guard let pdfContext = CGContext(destination as CFURL, mediaBox: &pageBounds, nil) else {
            throw WriterError.contextCreationFailed
        }
        pdfContext.beginPDFPage(nil)
        pdfContext.endPDFPage()
        pdfContext.closePDF()
        return destination
    }

    private enum WriterError: Error { case contextCreationFailed }
}
diff --git a/Packages/OsaurusCore/Tests/Documents/PlainTextAdapterTests.swift b/Packages/OsaurusCore/Tests/Documents/PlainTextAdapterTests.swift
new file mode 100644
index 000000000..bc18a3448
--- /dev/null
+++ b/Packages/OsaurusCore/Tests/Documents/PlainTextAdapterTests.swift
@@ -0,0 +1,84 @@
//
PlainTextAdapterTests.swift +// osaurusTests +// +// Covers the plain-text migration adapter. Same behavioural contract as +// the legacy `DocumentParser.parsePlainText` — UTF-8, ISO-Latin-1 retry, +// character-cap truncation — plus the size-limit contract from the new +// adapter protocol. +// + +import Foundation +import Testing + +@testable import OsaurusCore + +@Suite("PlainTextAdapter") +struct PlainTextAdapterTests { + + @Test func canHandle_acceptsCommonTextExtensions() { + let adapter = PlainTextAdapter() + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.txt"), uti: nil)) + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.MD"), uti: nil)) + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.swift"), uti: nil)) + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.pdf"), uti: nil) == false) + } + + @Test func parse_readsUtf8Content() async throws { + let url = try Self.write("hello\nutf8\n", filename: "hello.txt") + defer { try? FileManager.default.removeItem(at: url) } + + let doc = try await PlainTextAdapter().parse(url: url, sizeLimit: 0) + #expect(doc.formatId == "plaintext") + #expect(doc.filename.hasSuffix("hello.txt")) + #expect(doc.textFallback.contains("hello")) + #expect(doc.textFallback.contains("utf8")) + } + + @Test func parse_fallsBackToLatin1ForNonUtf8Bytes() async throws { + // A single 0xE9 byte (`é` in latin-1) is illegal standalone UTF-8. + let url = FileManager.default.temporaryDirectory + .appendingPathComponent("latin-\(UUID().uuidString).txt") + try Data([0xE9, 0x0A]).write(to: url) + defer { try? FileManager.default.removeItem(at: url) } + + let doc = try await PlainTextAdapter().parse(url: url, sizeLimit: 0) + #expect(doc.textFallback.contains("é")) + } + + @Test func parse_throwsEmptyContentForWhitespaceOnly() async throws { + let url = try Self.write(" \n\t\n", filename: "empty.txt") + defer { try? 
FileManager.default.removeItem(at: url) } + + await #expect(throws: DocumentAdapterError.self) { + _ = try await PlainTextAdapter().parse(url: url, sizeLimit: 0) + } + } + + @Test func parse_throwsSizeLimitExceededAboveCap() async throws { + let url = try Self.write("hello world", filename: "big.txt") + defer { try? FileManager.default.removeItem(at: url) } + + await #expect(throws: DocumentAdapterError.self) { + _ = try await PlainTextAdapter().parse(url: url, sizeLimit: 1) + } + } + + @Test func parse_truncatesLongContentWithMarker() async throws { + let payload = String(repeating: "a", count: 500_002) + let url = try Self.write(payload, filename: "long.txt") + defer { try? FileManager.default.removeItem(at: url) } + + let doc = try await PlainTextAdapter().parse(url: url, sizeLimit: 0) + #expect(doc.textFallback.hasSuffix("character limit]")) + } + + // MARK: - Helpers + + private static func write(_ content: String, filename: String) throws -> URL { + let url = FileManager.default.temporaryDirectory + .appendingPathComponent("\(UUID().uuidString)-\(filename)") + try content.write(to: url, atomically: true, encoding: .utf8) + return url + } +} diff --git a/Packages/OsaurusCore/Tests/Documents/RichDocumentAdapterTests.swift b/Packages/OsaurusCore/Tests/Documents/RichDocumentAdapterTests.swift new file mode 100644 index 000000000..5fb4f6631 --- /dev/null +++ b/Packages/OsaurusCore/Tests/Documents/RichDocumentAdapterTests.swift @@ -0,0 +1,70 @@ +// +// RichDocumentAdapterTests.swift +// osaurusTests +// +// Covers the NSAttributedString-backed migration adapter across the +// extensions it claims today (DOCX, RTF, HTML). Uses HTML and RTF +// fixtures authored inline; the DOCX path is exercised indirectly +// through `canHandle` — building a real DOCX on the fly requires ZIP +// plumbing that will come with the high-fidelity DOCX reader in stage-4 +// PR 11. 
+// + +import Foundation +import Testing + +@testable import OsaurusCore + +@Suite("RichDocumentAdapter") +struct RichDocumentAdapterTests { + + @Test func canHandle_acceptsAllRichDocumentExtensions() { + let adapter = RichDocumentAdapter() + for ext in ["docx", "doc", "rtf", "rtfd", "html", "htm"] { + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.\(ext)"), uti: nil)) + } + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.txt"), uti: nil) == false) + } + + @Test func parse_readsHTMLBodyAsPlainText() async throws { + let url = try Self.write( + "

<html><body><h1>Title</h1><p>Body text</p></body></html>", + filename: "page.html" + ) + defer { try? FileManager.default.removeItem(at: url) } + + let doc = try await RichDocumentAdapter().parse(url: url, sizeLimit: 0) + #expect(doc.formatId == "richdoc") + #expect(doc.textFallback.contains("Title")) + #expect(doc.textFallback.contains("Body text")) + #expect(doc.textFallback.contains("<h1>
") == false) + } + + @Test func parse_readsRTFAsPlainText() async throws { + let rtf = "{\\rtf1\\ansi Hello {\\b bold} world}" + let url = try Self.write(rtf, filename: "page.rtf") + defer { try? FileManager.default.removeItem(at: url) } + + let doc = try await RichDocumentAdapter().parse(url: url, sizeLimit: 0) + #expect(doc.textFallback.contains("Hello")) + #expect(doc.textFallback.contains("bold")) + } + + @Test func parse_throwsSizeLimitExceededAboveCap() async throws { + let url = try Self.write("hi", filename: "big.html") + defer { try? FileManager.default.removeItem(at: url) } + + await #expect(throws: DocumentAdapterError.self) { + _ = try await RichDocumentAdapter().parse(url: url, sizeLimit: 1) + } + } + + // MARK: - Helpers + + private static func write(_ content: String, filename: String) throws -> URL { + let url = FileManager.default.temporaryDirectory + .appendingPathComponent("\(UUID().uuidString)-\(filename)") + try content.write(to: url, atomically: true, encoding: .utf8) + return url + } +} diff --git a/Packages/OsaurusCore/Tests/Documents/WorkbookToolsTests.swift b/Packages/OsaurusCore/Tests/Documents/WorkbookToolsTests.swift new file mode 100644 index 000000000..3b754c43f --- /dev/null +++ b/Packages/OsaurusCore/Tests/Documents/WorkbookToolsTests.swift @@ -0,0 +1,181 @@ +// +// WorkbookToolsTests.swift +// osaurusTests +// +// End-to-end tests for the `read_workbook` / `read_workbook_cell` / +// `write_workbook` agent tools. Uses the checked-in sample.xlsx fixture +// for the read paths and a temp directory for the write path so the +// three tools exercise the same XLSXAdapter / XLSXEmitter pair that +// agents see in production. 
+// + +import Foundation +import Testing + +@testable import OsaurusCore + +@Suite("Workbook agent tools") +struct WorkbookToolsTests { + + private let rootPath: URL + private let fixturePath: URL + + init() throws { + let tmp = FileManager.default.temporaryDirectory + .appendingPathComponent("osaurus-wb-tools-\(UUID().uuidString)", isDirectory: true) + try FileManager.default.createDirectory(at: tmp, withIntermediateDirectories: true) + rootPath = tmp + + // Copy the fixture into the temp root so the tools can resolve + // "sample.xlsx" as a relative path under the working folder. + guard + let bundled = Bundle.module.url( + forResource: "sample", + withExtension: "xlsx", + subdirectory: "Fixtures/xlsx" + ) + else { + throw FixtureError.missing + } + fixturePath = tmp.appendingPathComponent("sample.xlsx") + try FileManager.default.copyItem(at: bundled, to: fixturePath) + } + + // MARK: - read_workbook + + @Test func readWorkbook_returnsSheetSummaries() async throws { + let tool = ReadWorkbookTool(rootPath: rootPath) + let envelope = try await tool.execute(argumentsJSON: #"{"path":"sample.xlsx"}"#) + let payload = try Self.successTextAsDict(envelope) + + #expect(payload["path"] as? String == "sample.xlsx") + let sheets = payload["sheets"] as? [[String: Any]] ?? [] + #expect(sheets.count == 2) + #expect(sheets.map { $0["name"] as? String }.contains { $0 == "Revenue" }) + #expect(sheets.map { $0["name"] as? String }.contains { $0 == "Notes" }) + + let revenue = sheets.first { $0["name"] as? String == "Revenue" } ?? [:] + let merged = revenue["mergedRanges"] as? [String] ?? 
[] + #expect(merged.contains("A5:B5")) + } + + @Test func readWorkbook_rejectsMissingFile() async throws { + let tool = ReadWorkbookTool(rootPath: rootPath) + let envelope = try await tool.execute(argumentsJSON: #"{"path":"nope.xlsx"}"#) + #expect(envelope.contains("\"kind\":\"execution_error\"") || envelope.contains("\"ok\":false")) + } + + @Test func readWorkbook_rejectsPathOutsideRoot() async throws { + let tool = ReadWorkbookTool(rootPath: rootPath) + let envelope = try await tool.execute(argumentsJSON: #"{"path":"../outside.xlsx"}"#) + #expect(envelope.contains("outside") || envelope.contains("invalid")) + } + + // MARK: - read_workbook_cell + + @Test func readWorkbookCell_returnsFormulaAndValue() async throws { + let tool = ReadWorkbookCellTool(rootPath: rootPath) + let envelope = try await tool.execute( + argumentsJSON: #"{"path":"sample.xlsx","sheet":"Revenue","cell":"B4"}"# + ) + let payload = try Self.successTextAsDict(envelope) + #expect(payload["ref"] as? String == "B4") + #expect(payload["formula"] as? String == "SUM(B2:B3)") + } + + @Test func readWorkbookCell_rejectsMissingSheet() async throws { + let tool = ReadWorkbookCellTool(rootPath: rootPath) + let envelope = try await tool.execute( + argumentsJSON: #"{"path":"sample.xlsx","sheet":"Ghost","cell":"A1"}"# + ) + #expect(envelope.contains("not found")) + } + + // MARK: - write_workbook + + @Test func writeWorkbook_emitsAndRoundTrips() async throws { + let tool = WriteWorkbookTool(rootPath: rootPath) + let input = #""" + { + "path": "output.xlsx", + "sheets": [ + { + "name": "Numbers", + "cells": [ + {"ref": "A1", "type": "string", "value": "Label"}, + {"ref": "B1", "type": "number", "value": 42}, + {"ref": "A2", "type": "bool", "value": true}, + {"ref": "C1", "type": "formula", "formula": "B1*2"} + ], + "mergedRanges": ["A3:B3"] + } + ] + } + """# + let envelope = try await tool.execute(argumentsJSON: input) + let payload = try Self.successTextAsDict(envelope) + #expect(payload["sheetCount"] as? 
Int == 1) + + let outURL = rootPath.appendingPathComponent("output.xlsx") + #expect(FileManager.default.fileExists(atPath: outURL.path)) + + // Round-trip through XLSXAdapter to confirm the cells the agent + // requested actually landed in the file. + let reparsed = try await XLSXAdapter().parse(url: outURL, sizeLimit: 0) + guard let workbook = reparsed.representation.underlying as? Workbook else { + Issue.record("re-parsed representation was not a Workbook") + return + } + #expect(workbook.sheets.first?.name == "Numbers") + let cells = workbook.sheets.first?.rows.flatMap(\.cells) ?? [] + #expect(cells.contains { $0.reference == "A1" }) + #expect(cells.contains { $0.reference == "B1" }) + #expect(cells.contains { $0.reference == "C1" && $0.formula == "B1*2" }) + } + + @Test func writeWorkbook_rejectsNonXLSXPath() async throws { + let tool = WriteWorkbookTool(rootPath: rootPath) + let envelope = try await tool.execute( + argumentsJSON: #"{"path":"report.txt","sheets":[{"name":"Sheet1","cells":[]}]}"# + ) + #expect(envelope.contains("must end in")) + } + + @Test func writeWorkbook_rejectsEmptySheets() async throws { + let tool = WriteWorkbookTool(rootPath: rootPath) + let envelope = try await tool.execute( + argumentsJSON: #"{"path":"out.xlsx","sheets":[]}"# + ) + #expect(envelope.contains("non-empty")) + } + + // MARK: - Helpers + + /// Extracts the inner `result.text` from a `ToolEnvelope.success` JSON + /// and parses it as a dictionary — the envelope wraps every tool + /// response so tests have to peel one layer. + private static func successTextAsDict(_ envelope: String) throws -> [String: Any] { + let data = envelope.data(using: .utf8) ?? Data() + guard let obj = try JSONSerialization.jsonObject(with: data) as? [String: Any], + let result = obj["result"] as? [String: Any], + let text = result["text"] as? String, + let innerData = text.data(using: .utf8), + let inner = try JSONSerialization.jsonObject(with: innerData) as? 
[String: Any] + else { + throw FixtureError.notSuccessEnvelope(envelope) + } + return inner + } + + private enum FixtureError: Error, CustomStringConvertible { + case missing + case notSuccessEnvelope(String) + + var description: String { + switch self { + case .missing: return "Bundle.module lost the sample.xlsx fixture" + case .notSuccessEnvelope(let raw): return "Not a success envelope: \(raw)" + } + } + } +} diff --git a/Packages/OsaurusCore/Tests/Documents/XLSXAdapterTests.swift b/Packages/OsaurusCore/Tests/Documents/XLSXAdapterTests.swift new file mode 100644 index 000000000..e0ea7d123 --- /dev/null +++ b/Packages/OsaurusCore/Tests/Documents/XLSXAdapterTests.swift @@ -0,0 +1,143 @@ +// +// XLSXAdapterTests.swift +// osaurusTests +// +// Validates the first real-fidelity document adapter end-to-end against +// a checked-in XLSX fixture (produced by xlsxwriter, matching what most +// business users ship). Ensures we surface sheet names, shared strings, +// numeric cells, formulas as source strings, merged ranges, and booleans +// — the round-trip checklist from the stage-2 business catalog. +// + +import Foundation +import Testing + +@testable import OsaurusCore + +@Suite("XLSXAdapter") +struct XLSXAdapterTests { + + // MARK: - canHandle + + @Test func canHandle_acceptsXLSXOnly() { + let adapter = XLSXAdapter() + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.xlsx"), uti: nil)) + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.XLSX"), uti: nil)) + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.xls"), uti: nil) == false) + #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/a.csv"), uti: nil) == false) + } + + // MARK: - parse against fixture + + @Test func parse_surfacesSheetStructureAndValues() async throws { + let url = try Self.fixtureURL() + let adapter = XLSXAdapter() + let document = try await adapter.parse(url: url, sizeLimit: 0) + + guard let workbook = document.representation.underlying as? 
Workbook else { + Issue.record("representation was not a Workbook") + return + } + + #expect(workbook.sheets.count == 2) + let sheetNames = workbook.sheets.map(\.name) + #expect(sheetNames.contains("Revenue")) + #expect(sheetNames.contains("Notes")) + } + + @Test func parse_preservesFormulasAndMergedRanges() async throws { + let url = try Self.fixtureURL() + let document = try await XLSXAdapter().parse(url: url, sizeLimit: 0) + guard let workbook = document.representation.underlying as? Workbook, + let revenue = workbook.sheets.first(where: { $0.name == "Revenue" }) + else { + Issue.record("Revenue sheet missing") + return + } + + let formulaCells = revenue.rows + .flatMap(\.cells) + .filter { $0.formula != nil } + #expect(formulaCells.count == 1) + #expect(formulaCells.first?.formula == "SUM(B2:B3)") + + // Merged cells in the fixture cover A5:B5 (the footer note). + #expect(revenue.mergedRanges.map(\.reference).contains("A5:B5")) + } + + @Test func parse_preservesSharedStringsAndNumbers() async throws { + let url = try Self.fixtureURL() + let document = try await XLSXAdapter().parse(url: url, sizeLimit: 0) + guard let workbook = document.representation.underlying as? Workbook, + let revenue = workbook.sheets.first(where: { $0.name == "Revenue" }) + else { + Issue.record("Revenue sheet missing") + return + } + + // Shared strings include the header labels. + #expect(workbook.sharedStrings.contains("Month")) + #expect(workbook.sharedStrings.contains("Amount")) + #expect(workbook.sharedStrings.contains("January")) + + // B2 = 1200 (numeric, not rendered as a shared string). 
+ let b2 = revenue.rows.flatMap(\.cells).first { $0.reference == "B2" } + if case .number(let value) = b2?.value { + #expect(value == 1200) + } else { + Issue.record("B2 was not a number: \(String(describing: b2?.value))") + } + } + + @Test func parse_surfacesBooleansOnNotesSheet() async throws { + let url = try Self.fixtureURL() + let document = try await XLSXAdapter().parse(url: url, sizeLimit: 0) + guard let workbook = document.representation.underlying as? Workbook, + let notes = workbook.sheets.first(where: { $0.name == "Notes" }) + else { + Issue.record("Notes sheet missing") + return + } + let boolCells = notes.rows.flatMap(\.cells).filter { + if case .bool = $0.value { return true } else { return false } + } + #expect(boolCells.count == 2) + } + + @Test func parse_textFallback_containsHumanReadableTable() async throws { + let url = try Self.fixtureURL() + let document = try await XLSXAdapter().parse(url: url, sizeLimit: 0) + + #expect(document.textFallback.contains("## Sheet: Revenue")) + #expect(document.textFallback.contains("Total")) + #expect(document.textFallback.contains("=SUM(B2:B3)")) + #expect(document.textFallback.contains("Merged: A5:B5")) + } + + @Test func parse_rejectsOversizedFile() async throws { + let url = try Self.fixtureURL() + await #expect(throws: DocumentAdapterError.self) { + _ = try await XLSXAdapter().parse(url: url, sizeLimit: 64) + } + } + + // MARK: - Fixture plumbing + + private static func fixtureURL() throws -> URL { + // `.copy("Documents/Fixtures")` in `Package.swift` drops the parent + // `Documents/` segment inside the test bundle, so resources live + // under `Fixtures/xlsx/...` at the bundle root. 
+ guard + let url = Bundle.module.url( + forResource: "sample", + withExtension: "xlsx", + subdirectory: "Fixtures/xlsx" + ) + else { + throw FixtureError.missing + } + return url + } + + private enum FixtureError: Error { case missing } +} diff --git a/Packages/OsaurusCore/Tests/Documents/XLSXEmitterTests.swift b/Packages/OsaurusCore/Tests/Documents/XLSXEmitterTests.swift new file mode 100644 index 000000000..8d29e456a --- /dev/null +++ b/Packages/OsaurusCore/Tests/Documents/XLSXEmitterTests.swift @@ -0,0 +1,273 @@ +// +// XLSXEmitterTests.swift +// osaurusTests +// +// Proves the XLSX round trip: build a `Workbook` in memory, emit it +// through `XLSXEmitter`, re-parse the resulting file through +// `XLSXAdapter`, and assert that every fidelity feature we care about +// — sheet names, cell values, formula source strings, merged ranges, +// booleans — survives. Libxlsxwriter's output is strictly standards- +// conforming so the re-parse exercises the same CoreXLSX paths the +// read-side tests already pin. 
+// + +import Foundation +import Testing + +@testable import OsaurusCore + +@Suite("XLSXEmitter round trip") +struct XLSXEmitterTests { + + @Test func canEmit_onlyAcceptsWorkbookRepresentations() { + let emitter = XLSXEmitter() + let workbookDoc = StructuredDocument( + formatId: "xlsx", + filename: "a.xlsx", + fileSize: 0, + representation: AnyStructuredRepresentation( + formatId: "xlsx", + underlying: Workbook(sheets: [], sharedStrings: []) + ), + textFallback: "" + ) + let plainDoc = StructuredDocument( + formatId: "plaintext", + filename: "a.txt", + fileSize: 0, + representation: AnyStructuredRepresentation( + formatId: "plaintext", + underlying: PlainTextRepresentation(text: "") + ), + textFallback: "" + ) + + #expect(emitter.canEmit(workbookDoc)) + #expect(emitter.canEmit(plainDoc) == false) + } + + // MARK: - Round trip + + @Test func emit_thenReparse_preservesSheetsAndCells() async throws { + let input = Self.makeRoundTripFixture() + let dest = Self.tempURL() + defer { try? FileManager.default.removeItem(at: dest) } + + let emitter = XLSXEmitter() + try await emitter.emit(Self.wrap(input), to: dest) + + #expect(FileManager.default.fileExists(atPath: dest.path)) + + let reparsed = try await XLSXAdapter().parse(url: dest, sizeLimit: 0) + guard let output = reparsed.representation.underlying as? Workbook else { + Issue.record("Re-parsed representation was not a Workbook") + return + } + + #expect(output.sheets.count == input.sheets.count) + for (expected, actual) in zip(input.sheets, output.sheets) { + #expect(expected.name == actual.name, "sheet name mismatch") + } + } + + @Test func emit_preservesFormulaSourceStrings() async throws { + let input = Self.makeRoundTripFixture() + let dest = Self.tempURL() + defer { try? FileManager.default.removeItem(at: dest) } + + try await XLSXEmitter().emit(Self.wrap(input), to: dest) + let reparsed = try await XLSXAdapter().parse(url: dest, sizeLimit: 0) + guard let output = reparsed.representation.underlying as? 
Workbook else { + Issue.record("Re-parsed representation was not a Workbook") + return + } + + let formulas = output.sheets + .flatMap(\.rows) + .flatMap(\.cells) + .compactMap(\.formula) + #expect(formulas.contains("SUM(B2:B3)")) + } + + @Test func emit_preservesMergedRanges() async throws { + let input = Self.makeRoundTripFixture() + let dest = Self.tempURL() + defer { try? FileManager.default.removeItem(at: dest) } + + try await XLSXEmitter().emit(Self.wrap(input), to: dest) + let reparsed = try await XLSXAdapter().parse(url: dest, sizeLimit: 0) + guard let output = reparsed.representation.underlying as? Workbook else { + Issue.record("Re-parsed representation was not a Workbook") + return + } + + let mergedRefs = output.sheets.flatMap { $0.mergedRanges.map(\.reference) } + #expect(mergedRefs.contains("A5:B5")) + } + + @Test func emit_preservesStringAndNumberCells() async throws { + let input = Self.makeRoundTripFixture() + let dest = Self.tempURL() + defer { try? FileManager.default.removeItem(at: dest) } + + try await XLSXEmitter().emit(Self.wrap(input), to: dest) + let reparsed = try await XLSXAdapter().parse(url: dest, sizeLimit: 0) + guard let output = reparsed.representation.underlying as? Workbook, + let revenue = output.sheets.first(where: { $0.name == "Revenue" }) + else { + Issue.record("Revenue sheet missing after round trip") + return + } + + // "Month" string header lands in A1. + let a1 = revenue.rows.flatMap(\.cells).first { $0.reference == "A1" } + if case .string(let value) = a1?.value { + #expect(value == "Month") + } else { + Issue.record("A1 after round trip was \(String(describing: a1?.value))") + } + + // 1200 number lands in B2. 
+ let b2 = revenue.rows.flatMap(\.cells).first { $0.reference == "B2" } + if case .number(let value) = b2?.value { + #expect(value == 1200) + } else { + Issue.record("B2 after round trip was \(String(describing: b2?.value))") + } + } + + @Test func emit_preservesBooleans() async throws { + let input = Self.makeRoundTripFixture() + let dest = Self.tempURL() + defer { try? FileManager.default.removeItem(at: dest) } + + try await XLSXEmitter().emit(Self.wrap(input), to: dest) + let reparsed = try await XLSXAdapter().parse(url: dest, sizeLimit: 0) + guard let output = reparsed.representation.underlying as? Workbook, + let notes = output.sheets.first(where: { $0.name == "Notes" }) + else { + Issue.record("Notes sheet missing") + return + } + + let bools = notes.rows.flatMap(\.cells).compactMap { cell -> Bool? in + if case .bool(let flag) = cell.value { return flag } else { return nil } + } + #expect(bools.count == 2) + #expect(bools.contains(true)) + #expect(bools.contains(false)) + } + + @Test func emit_rejectsNonWorkbookRepresentation() async throws { + let dest = Self.tempURL() + defer { try? FileManager.default.removeItem(at: dest) } + let plain = StructuredDocument( + formatId: "plaintext", + filename: "a.txt", + fileSize: 0, + representation: AnyStructuredRepresentation( + formatId: "plaintext", + underlying: PlainTextRepresentation(text: "") + ), + textFallback: "" + ) + await #expect(throws: DocumentAdapterError.self) { + try await XLSXEmitter().emit(plain, to: dest) + } + } + + // MARK: - Fixture builder + + private static func wrap(_ workbook: Workbook) -> StructuredDocument { + StructuredDocument( + formatId: "xlsx", + filename: "fixture.xlsx", + fileSize: 0, + representation: AnyStructuredRepresentation( + formatId: "xlsx", + underlying: workbook + ), + textFallback: "" + ) + } + + /// Matches the shape of the checked-in `sample.xlsx` fixture so the + /// emitter and adapter exercise the same fidelity checklist. 
+ private static func makeRoundTripFixture() -> Workbook { + let revenue = Sheet( + name: "Revenue", + rows: [ + Row( + index: 1, + cells: [ + Cell(reference: "A1", value: .string("Month")), + Cell(reference: "B1", value: .string("Amount")), + ] + ), + Row( + index: 2, + cells: [ + Cell(reference: "A2", value: .string("January")), + Cell(reference: "B2", value: .number(1200)), + ] + ), + Row( + index: 3, + cells: [ + Cell(reference: "A3", value: .string("February")), + Cell(reference: "B3", value: .number(950)), + ] + ), + Row( + index: 4, + cells: [ + Cell(reference: "A4", value: .string("Total")), + Cell(reference: "B4", value: .empty, formula: "SUM(B2:B3)"), + ] + ), + Row( + index: 5, + cells: [ + Cell(reference: "A5", value: .string("Generated for osaurus tests")) + ] + ), + ], + mergedRanges: [CellRange(reference: "A5:B5")] + ) + + let notes = Sheet( + name: "Notes", + rows: [ + Row( + index: 1, + cells: [ + Cell(reference: "A1", value: .string("Key")), + Cell(reference: "B1", value: .string("Value")), + ] + ), + Row( + index: 2, + cells: [ + Cell(reference: "A2", value: .string("reviewer")), + Cell(reference: "B2", value: .string("mimeding")), + ] + ), + Row( + index: 3, + cells: [ + Cell(reference: "A3", value: .bool(true)), + Cell(reference: "B3", value: .bool(false)), + ] + ), + ], + mergedRanges: [] + ) + + return Workbook(sheets: [revenue, notes], sharedStrings: []) + } + + private static func tempURL() -> URL { + FileManager.default.temporaryDirectory + .appendingPathComponent("osaurus-xlsx-roundtrip-\(UUID().uuidString).xlsx") + } +} diff --git a/Packages/OsaurusCore/Utils/DocumentParser.swift b/Packages/OsaurusCore/Utils/DocumentParser.swift index 2bce81665..69b46ce55 100644 --- a/Packages/OsaurusCore/Utils/DocumentParser.swift +++ b/Packages/OsaurusCore/Utils/DocumentParser.swift @@ -57,6 +57,14 @@ enum DocumentParser { let ext = url.pathExtension.lowercased() let filename = url.lastPathComponent + // Registry-routed path. 
Returns nil when no adapter claims the file + // OR when the claiming adapter surfaces `.emptyContent` / + // `.unsupportedFormat`, so the legacy switch below still handles + // e.g. image-only PDFs and any format an adapter hasn't taken over. + if let attachments = try routeThroughRegistry(url: url, fileSize: fileSize) { + return attachments + } + // PDF may fall back to image rendering if text extraction yields nothing if ext == "pdf" { return try parsePDFWithFallback(url: url, filename: filename, fileSize: fileSize) @@ -147,9 +155,8 @@ enum DocumentParser { return try String(contentsOf: url, encoding: .utf8) } catch { // Retry with latin1 for binary-ish text files - if let data = try? Data(contentsOf: url), - let str = String(data: data, encoding: .isoLatin1) - { + let fallbackData = try? Data(contentsOf: url) + if let fallbackData, let str = String(data: fallbackData, encoding: .isoLatin1) { return str } throw ParseError.readFailed(error.localizedDescription) @@ -189,9 +196,8 @@ enum DocumentParser { private static func extractPDFText(from document: PDFDocument) -> String { var pages: [String] = [] for i in 0 ..< document.pageCount { - if let page = document.page(at: i), let text = page.string, - !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty - { + let text = document.page(at: i)?.string + if let text, !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { pages.append(text) } } @@ -261,4 +267,86 @@ enum DocumentParser { throw ParseError.readFailed(error.localizedDescription) } } + + // MARK: - Registry shim + + /// Tries the document format registry before the legacy switch. The + /// registry runs async; we block on a dedicated dispatch queue so the + /// synchronous `parseAll` contract is preserved during the migration + /// window. Once every caller is async (stage-4 PR 10), this shim goes + /// away. 
+ /// + /// Return value conventions: + /// - `nil` — no adapter is registered, or an adapter declined the file + /// via `.emptyContent` / `.unsupportedFormat`; legacy path handles it. + /// - non-nil — the adapter produced a text view; convert to + /// `[Attachment]` by wrapping `textFallback`. + /// - throws — adapter produced a non-recoverable error (size / read / + /// write); surface as `ParseError`. + private static func routeThroughRegistry(url: URL, fileSize: Int) throws -> [Attachment]? { + let registry = DocumentFormatRegistry.shared + guard let adapter = registry.adapter(for: url) else { return nil } + + let sizeLimit = DocumentLimits.limit(forFormatId: adapter.formatId) + do { + let document = try runBlocking { + try await adapter.parse(url: url, sizeLimit: sizeLimit) + } + return [ + .document( + filename: document.filename, + content: document.textFallback, + fileSize: Int(document.fileSize) + ) + ] + } catch DocumentAdapterError.emptyContent, DocumentAdapterError.unsupportedFormat { + // Fall through so the legacy switch (image-only PDFs, formats + // without an adapter yet) still gets a shot. + return nil + } catch DocumentAdapterError.sizeLimitExceeded { + throw ParseError.fileTooLarge + } catch let DocumentAdapterError.readFailed(reason) { + throw ParseError.readFailed(reason) + } catch DocumentAdapterError.writeFailed, DocumentAdapterError.cancelled { + throw ParseError.readFailed("Adapter emitted non-read error for ingress") + } catch { + throw ParseError.readFailed(error.localizedDescription) + } + } + + /// Synchronously awaits an async body. The shim is called from + /// `parseAll` which is itself invoked from UI callbacks that are still + /// synchronous — see `FloatingInputCard`. Dropping the semaphore means + /// reworking every ingress call site, which isn't in scope for PR 3. 
private static func runBlocking<T>(_ body: @escaping @Sendable () async throws -> T) throws -> T {
    // The semaphore starts at 0, so `wait()` below parks the calling thread
    // until the detached task signals completion.
    let semaphore = DispatchSemaphore(value: 0)
    // Holds the task's outcome; `nil` until the task has finished.
    let resultBox = UnfairLockedBox<Result<T, Error>?>(nil)

    // Detached so the work does not inherit the caller's actor context.
    // NOTE(review): if `body` ever needs the main actor while the caller is
    // blocking the main thread, this will deadlock — confirm adapters never
    // hop to the main actor.
    Task.detached {
        let result: Result<T, Error>
        do {
            result = .success(try await body())
        } catch {
            result = .failure(error)
        }
        // Publish the outcome before waking the waiter.
        resultBox.set(result)
        semaphore.signal()
    }

    semaphore.wait()
    // The task always stores a result before signalling, so an empty box is
    // a programmer error; trap with a message instead of a bare `!`.
    guard let outcome = resultBox.get() else {
        preconditionFailure("runBlocking: task signalled without storing a result")
    }
    return try outcome.get()
}
} // closes the enclosing type, which is opened before this chunk

/// Tiny lock-box so the blocking-await shim above can hand a value back
/// across the actor/thread boundary without tripping Swift 6 sendability.
///
/// Every read and write of `value` happens while `lock` is held, which is
/// the basis for the `@unchecked Sendable` conformance. Despite the name,
/// the box is backed by `NSLock`.
private final class UnfairLockedBox<Value>: @unchecked Sendable {
    private var value: Value
    private let lock = NSLock()

    /// Creates a box that initially holds `value`.
    init(_ value: Value) { self.value = value }

    /// Returns the current value, read under the lock.
    func get() -> Value {
        lock.lock()
        defer { lock.unlock() }
        return value
    }

    /// Replaces the stored value under the lock.
    func set(_ newValue: Value) {
        lock.lock()
        defer { lock.unlock() }
        value = newValue
    }
}
diff --git a/osaurus.xcworkspace/xcshareddata/swiftpm/Package.resolved b/osaurus.xcworkspace/xcshareddata/swiftpm/Package.resolved index 55fbfe785..6c8c81f72 100644 --- a/osaurus.xcworkspace/xcshareddata/swiftpm/Package.resolved +++ b/osaurus.xcworkspace/xcshareddata/swiftpm/Package.resolved @@ -1,5 +1,5 @@ { - "originHash" : "8b2c8aee839bee68c488f430fa30e77681adea30962ae12dd392a5b5cd847ae2", + "originHash" : "c1188a7167ae42da4fce56bed21b6f0ad5f487ffcd6ff8a2788d7af08befc3a2", "pins" : [ { "identity" : "aachartkit-swift", @@ -28,6 +28,15 @@ "version" : "0.31.0" } }, + { + "identity" : "corexlsx", + "kind" : "remoteSourceControl", + "location" : "https://github.com/CoreOffice/CoreXLSX.git", + "state" : { + "revision" : "1391f3832ea2eeee5186ea8abb81ea49ed0609cc", + "version" : "0.14.2" + } + }, { "identity" : "eventsource", "kind" : "remoteSourceControl", @@ -91,6 +100,15 @@ "version" : "2.4.3" } }, + { + "identity" : "libxlsxwriter", + "kind" : "remoteSourceControl", + "location" :
"https://github.com/jmcnamara/libxlsxwriter.git", + "state" : { + "revision" : "2894634d65cee6021901a165bfc2bb0fad6da193", + "version" : "1.2.4" + } + }, { "identity" : "mlx-swift", "kind" : "remoteSourceControl", @@ -420,6 +438,15 @@ "revision" : "2e61c12a1573d073618ee2f98f39149ea36068e1" } }, + { + "identity" : "xmlcoder", + "kind" : "remoteSourceControl", + "location" : "https://github.com/maxdesiatov/XMLCoder.git", + "state" : { + "revision" : "ca932442d7481700f5434a7b138c47dd42d9902b", + "version" : "0.14.0" + } + }, { "identity" : "yyjson", "kind" : "remoteSourceControl", @@ -429,6 +456,15 @@ "version" : "0.12.0" } }, + { + "identity" : "zipfoundation", + "kind" : "remoteSourceControl", + "location" : "https://github.com/weichsel/ZIPFoundation.git", + "state" : { + "revision" : "22787ffb59de99e5dc1fbfe80b19c97a904ad48d", + "version" : "0.9.20" + } + }, { "identity" : "zstd", "kind" : "remoteSourceControl",