Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions Packages/OsaurusCore/AppDelegate.swift
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ public final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelega
// the specific crash class this prevents.
MLXErrorRecovery.installGlobalHandler()

// Register in-tree document format adapters before any file-ingress
// path can run. Idempotent; safe if a future migration moves this.
DocumentAdaptersBootstrap.registerBuiltIns()

// Detect repeated startup crashes and enter safe mode if needed
LaunchGuard.checkOnLaunch()

Expand Down Expand Up @@ -175,7 +179,7 @@ public final class AppDelegate: NSObject, NSApplicationDelegate, NSPopoverDelega
#endif

// Initialize directory access early so security-scoped bookmark is active
let _ = DirectoryPickerService.shared
_ = DirectoryPickerService.shared

if LaunchGuard.isSafeMode {
NotificationService.shared.postSafeModeActive()
Expand Down Expand Up @@ -876,7 +880,7 @@ extension AppDelegate {
}

@objc private func handleServeCommand(_ note: Notification) {
var desiredPort: Int? = nil
var desiredPort: Int?
var exposeFlag: Bool = false
if let ui = note.userInfo {
if let p = ui["port"] as? Int {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
//
// DocumentAdaptersBootstrap.swift
// osaurus
//
// Registers the in-tree document adapters with `DocumentFormatRegistry.shared`
// exactly once, at app launch. Kept separate from `AppDelegate` so tests can
// opt into the same registration (or opt out of it entirely) without dragging
// in `NSApplication`.
//

import Foundation

public enum DocumentAdaptersBootstrap {
    /// Serialises every `registerBuiltIns` call, including all access to
    /// `didRegisterShared`.
    private static let lock = NSLock()

    // Only read or written while `lock` is held. `nonisolated(unsafe)` follows
    // the project pattern for lock-protected process-global state (see
    // `OsaurusPaths.overrideRoot`).
    nonisolated(unsafe) private static var didRegisterShared = false

    /// Registers the built-in adapters — plain text, PDF, rich document and
    /// XLSX, in that order — with `registry`.
    ///
    /// When `registry` is `DocumentFormatRegistry.shared`, the adapters are
    /// registered on the first call only; later calls return without touching
    /// the registry, so multiple launch paths cannot produce duplicate
    /// registrations. Any other registry instance (tests, isolated instances)
    /// is registered on every call.
    ///
    /// - Parameter registry: The registry to populate. Defaults to the shared
    ///   registry.
    public static func registerBuiltIns(registry: DocumentFormatRegistry = .shared) {
        lock.lock()
        defer { lock.unlock() }

        let targetsShared = registry === DocumentFormatRegistry.shared
        guard !(targetsShared && didRegisterShared) else { return }

        registry.register(adapter: PlainTextAdapter())
        registry.register(adapter: PDFAdapter())
        registry.register(adapter: RichDocumentAdapter())
        registry.register(adapter: XLSXAdapter())

        // Only the shared registry is tracked; other instances are never
        // remembered between calls.
        if targetsShared {
            didRegisterShared = true
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
//
// PlainTextRepresentation.swift
// osaurus
//
// Default representation for adapters that extract a single text string.
// Every adapter has to publish *some* `StructuredRepresentation`; the
// wrappers around `PDFKit` text extraction and `NSAttributedString` don't
// preserve any format-native structure, so they emit this shape. The
// real typed representations (`Workbook`, `WordDocument`, …) replace it
// per-format as higher-fidelity adapters land.
//

import Foundation

/// Representation that carries nothing but a single extracted text string.
public struct PlainTextRepresentation: StructuredRepresentation, Sendable {
    /// The extracted text, stored exactly as handed to the initializer.
    public let text: String

    /// Creates a representation wrapping `text`.
    ///
    /// - Parameter text: The extracted text to carry.
    public init(text: String) {
        self.text = text
    }
}
87 changes: 87 additions & 0 deletions Packages/OsaurusCore/Models/Documents/Workbook.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
//
// Workbook.swift
// osaurus
//
// Typed representation for parsed XLSX workbooks. Designed as the
// round-trip target for both the read side (`XLSXAdapter`, this PR) and
// the write side (`XLSXEmitter`, landing in the next slice). Fields are
// chosen to match what CoreXLSX surfaces cleanly today — sheet names,
// merged ranges, raw cell values, formula source strings — plus the
// shared-string table so repeated strings round-trip without being
// re-interned on write. Style-derived fidelity (number formats, column
// widths) is deliberately out of scope for this PR; see the comment on
// `CellValue` for why.
//

import Foundation

/// Typed representation of a parsed XLSX workbook: its sheets plus the
/// shared-string table.
public struct Workbook: StructuredRepresentation, Sendable {
    /// Sheets contained in the workbook.
    public let sheets: [Sheet]

    /// The workbook's shared-string table, carried along so repeated strings
    /// can round-trip without being re-interned on write.
    public let sharedStrings: [String]

    /// Creates a workbook from already-parsed parts.
    ///
    /// - Parameters:
    ///   - sheets: The workbook's sheets.
    ///   - sharedStrings: The shared-string table.
    public init(sheets: [Sheet], sharedStrings: [String]) {
        self.sheets = sheets
        self.sharedStrings = sharedStrings
    }
}

/// One worksheet: its name, its rows, and the ranges that are merged.
public struct Sheet: Sendable {
    /// Sheet name.
    public let name: String

    /// Rows that make up the sheet.
    public let rows: [Row]

    /// Merged cell ranges declared on the sheet.
    public let mergedRanges: [CellRange]

    /// Creates a sheet from already-parsed parts.
    ///
    /// - Parameters:
    ///   - name: Sheet name.
    ///   - rows: Rows of the sheet.
    ///   - mergedRanges: Merged cell ranges on the sheet.
    public init(name: String, rows: [Row], mergedRanges: [CellRange]) {
        self.name = name
        self.rows = rows
        self.mergedRanges = mergedRanges
    }
}

/// One worksheet row and the cells it carries.
public struct Row: Sendable {
    /// Row number, 1-based, matching the on-wire `r` attribute.
    public let index: Int

    /// Cells stored for this row.
    public let cells: [Cell]

    /// Creates a row.
    ///
    /// - Parameters:
    ///   - index: 1-based row number.
    ///   - cells: Cells belonging to the row.
    public init(index: Int, cells: [Cell]) {
        self.index = index
        self.cells = cells
    }
}

/// A single cell: where it sits, the value it holds, and its formula if any.
public struct Cell: Sendable {
    /// On-wire A1-style reference, e.g. "B3".
    public let reference: String

    /// The cell's value.
    public let value: CellValue

    /// Formula source (`=SUM(A1:A3)`) when the cell carries one, otherwise
    /// `nil`. Excel stores both the formula and its cached result; both are
    /// preserved here.
    public let formula: String?

    /// Creates a cell.
    ///
    /// - Parameters:
    ///   - reference: A1-style reference of the cell.
    ///   - value: The cell's value.
    ///   - formula: Formula source, if the cell has one. Defaults to `nil`.
    public init(reference: String, value: CellValue, formula: String? = nil) {
        self.reference = reference
        self.value = value
        self.formula = formula
    }
}

/// Scalar payload of a single cell.
///
/// Excel stores dates as numbers with a style attached. Without parsing the
/// style table a date cannot be told apart from a plain number, so any date
/// that isn't explicitly typed (`t="d"`) surfaces as `.number`. Lifting that
/// limitation means shipping a style parser that tolerates the CoreXLSX
/// `patternType` crash on openpyxl-generated files; that work lives in a
/// separate slice.
public enum CellValue: Sendable, Equatable {
    /// No value.
    case empty

    /// Numeric value. Also carries dates that are not explicitly typed.
    case number(Double)

    /// String value.
    /// NOTE(review): presumably the shared-string counterpart of
    /// `.inlineString` — confirm against the adapter that produces these.
    case string(String)

    /// Boolean value.
    case bool(Bool)

    /// String value kept distinct from `.string`.
    /// NOTE(review): presumably text stored inline in the cell — confirm.
    case inlineString(String)
}

/// A range of cells written in A1 style, e.g. "A1:C3".
public struct CellRange: Sendable, Equatable {
    /// The range reference string, stored exactly as given.
    public let reference: String

    /// Creates a range from its reference string.
    ///
    /// - Parameter reference: A1-style range reference.
    public init(reference: String) {
        self.reference = reference
    }
}
5 changes: 4 additions & 1 deletion Packages/OsaurusCore/Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ let package = Package(
.package(url: "https://github.com/mgriebling/SwiftMath", from: "1.7.3"),
.package(url: "https://github.com/raspu/Highlightr", from: "2.3.0"),
.package(url: "https://github.com/AAChartModel/AAChartKit-Swift.git", from: "9.5.0"),
.package(url: "https://github.com/CoreOffice/CoreXLSX.git", from: "0.14.2"),
],
targets: [
// Vendored SQLCipher 4.6.1 amalgamation (CommonCrypto
Expand Down Expand Up @@ -271,6 +272,7 @@ let package = Package(
.product(name: "ContainerizationExtras", package: "containerization"),
.product(name: "Highlightr", package: "Highlightr"),
.product(name: "AAInfographics", package: "AAChartKit-Swift"),
.product(name: "CoreXLSX", package: "CoreXLSX"),
],
path: ".",
exclude: ["Tests", "SQLCipher"],
Expand All @@ -284,7 +286,8 @@ let package = Package(
.product(name: "NIOEmbedded", package: "swift-nio"),
.product(name: "VecturaKit", package: "VecturaKit"),
],
path: "Tests"
path: "Tests",
resources: [.copy("Documents/Fixtures")]
),
]
)
68 changes: 68 additions & 0 deletions Packages/OsaurusCore/Services/Documents/PDFAdapter.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
//
// PDFAdapter.swift
// osaurus
//
// Wraps the text-layer extraction path in `DocumentParser.parsePDFWithFallback`.
// Intentionally does NOT cover the image-rendering fallback — when a PDF has
// no extractable text, this adapter throws `.emptyContent` and the
// `DocumentParser` shim falls through to the legacy switch, which still
// renders each page as PNG. Moving that path onto the adapter surface is
// deferred to stage-4 PR 8 (layout-aware table extraction), where the
// typed `PDFDocument` representation gets introduced.
//

import Foundation
import PDFKit

public struct PDFAdapter: DocumentFormatAdapter {
    /// Format identifier stamped on every document this adapter produces.
    public let formatId = "pdf"

    public init() {}

    /// Claims `url` when its path extension is `pdf`, compared
    /// case-insensitively. The `uti` argument is not consulted.
    public func canHandle(url: URL, uti: String?) -> Bool {
        let fileExtension = url.pathExtension.lowercased()
        return fileExtension == "pdf"
    }

    /// Extracts the PDF's text layer and wraps it as a plain-text document.
    ///
    /// - Parameters:
    ///   - url: Location of the PDF.
    ///   - sizeLimit: Maximum accepted file size in bytes; values `<= 0`
    ///     disable the check.
    /// - Returns: A document whose representation and `textFallback` both
    ///   carry the extracted text after the character cap is applied.
    /// - Throws: `DocumentAdapterError.sizeLimitExceeded` when the file is
    ///   larger than `sizeLimit`, `.readFailed` when PDFKit cannot open the
    ///   file, and `.emptyContent` when no page yields any text.
    public func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument {
        // A size that cannot be read is treated as 0, which passes the limit
        // check below.
        let resourceValues = try? url.resourceValues(forKeys: [.fileSizeKey])
        let byteCount = Int64(resourceValues?.fileSize ?? 0)
        if sizeLimit > 0 && byteCount > sizeLimit {
            throw DocumentAdapterError.sizeLimitExceeded(actual: byteCount, limit: sizeLimit)
        }

        guard let pdf = PDFDocument(url: url) else {
            throw DocumentAdapterError.readFailed(underlying: "PDFKit could not open document")
        }

        let pageText = Self.extractText(from: pdf)
        if pageText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty {
            // No text layer — throw so the caller can fall through to the
            // legacy image-render path instead of receiving an empty result.
            throw DocumentAdapterError.emptyContent
        }

        let cappedText = PlainTextAdapter.applyCharacterCap(pageText)
        let representation = AnyStructuredRepresentation(
            formatId: formatId,
            underlying: PlainTextRepresentation(text: cappedText)
        )

        return StructuredDocument(
            formatId: formatId,
            filename: url.lastPathComponent,
            fileSize: byteCount,
            representation: representation,
            textFallback: cappedText
        )
    }

    /// Joins the text of every page that has non-whitespace content, in page
    /// order, separated by a blank line. Pages without text are skipped.
    private static func extractText(from document: PDFDocument) -> String {
        let pageTexts = (0 ..< document.pageCount).compactMap { pageIndex -> String? in
            guard let text = document.page(at: pageIndex)?.string,
                  !text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
            else { return nil }
            return text
        }
        return pageTexts.joined(separator: "\n\n")
    }
}
90 changes: 90 additions & 0 deletions Packages/OsaurusCore/Services/Documents/PlainTextAdapter.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
//
// PlainTextAdapter.swift
// osaurus
//
// Wraps the existing plain-text ingress path in `DocumentParser`. Claims
// the roughly 60 extensions that were previously handled by the inline
// `case _ where isPlainText(ext:)` branch — `.txt`, `.md`, source code,
// config files, etc. Behaviour is intentionally identical to the legacy
// switch: UTF-8 first, ISO-Latin-1 retry, post-read character-count
// truncation marker. This adapter is a migration bridge, not a fidelity
// improvement.
//

import Foundation

public struct PlainTextAdapter: DocumentFormatAdapter {
    /// Format identifier stamped on every document this adapter produces.
    public let formatId = "plaintext"

    public init() {}

    /// Claims `url` when its lowercased path extension is listed in
    /// `plainTextExtensions`. The `uti` argument is not consulted.
    ///
    /// NOTE(review): entries such as `gitignore` or `dockerfile` only match
    /// when they appear as a path extension; a file named just `Dockerfile`
    /// or `.gitignore` presumably has an empty `pathExtension` and is not
    /// claimed — confirm this matches the intended legacy behaviour.
    public func canHandle(url: URL, uti: String?) -> Bool {
        Self.plainTextExtensions.contains(url.pathExtension.lowercased())
    }

    /// Reads the file as text and wraps it as a plain-text document.
    ///
    /// - Parameters:
    ///   - url: Location of the file.
    ///   - sizeLimit: Maximum accepted file size in bytes; values `<= 0`
    ///     disable the check.
    /// - Returns: A document whose representation and `textFallback` both
    ///   carry the file's text after the character cap is applied.
    /// - Throws: `DocumentAdapterError.sizeLimitExceeded` when the file is
    ///   larger than `sizeLimit`, `.readFailed` when the file cannot be read
    ///   as UTF-8 nor decoded as ISO Latin-1, and `.emptyContent` when the
    ///   text is empty or whitespace-only.
    public func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument {
        // A size that cannot be read is treated as 0, which passes the limit
        // check below.
        let fileSize = Int64((try? url.resourceValues(forKeys: [.fileSizeKey]))?.fileSize ?? 0)
        if sizeLimit > 0, fileSize > sizeLimit {
            throw DocumentAdapterError.sizeLimitExceeded(actual: fileSize, limit: sizeLimit)
        }

        let rawContent: String
        do {
            rawContent = try String(contentsOf: url, encoding: .utf8)
        } catch {
            // Fall back to latin-1 for files that are "mostly text" with a few
            // non-UTF-8 bytes — same behaviour as the legacy path. If even the
            // raw bytes cannot be read, report the original UTF-8 error.
            guard let data = try? Data(contentsOf: url),
                  let decoded = String(data: data, encoding: .isoLatin1)
            else {
                throw DocumentAdapterError.readFailed(underlying: error.localizedDescription)
            }
            rawContent = decoded
        }

        guard !rawContent.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
            throw DocumentAdapterError.emptyContent
        }

        let truncated = Self.applyCharacterCap(rawContent)

        return StructuredDocument(
            formatId: formatId,
            filename: url.lastPathComponent,
            fileSize: fileSize,
            representation: AnyStructuredRepresentation(
                formatId: formatId,
                underlying: PlainTextRepresentation(text: truncated)
            ),
            textFallback: truncated
        )
    }

    // MARK: - Helpers

    /// Preserves the legacy 500K-character UX — consumers already expect the
    /// trailing marker when a document is truncated mid-read. The cap on
    /// bytes-read is higher (see `DocumentLimits.plainText`), so the two
    /// interact: oversized files are refused outright; merely long files
    /// are surfaced with a truncation note.
    ///
    /// - Parameter text: The text to cap.
    /// - Returns: `text` unchanged when it holds at most 500,000 characters;
    ///   otherwise its first 500,000 characters followed by the truncation
    ///   marker.
    static func applyCharacterCap(_ text: String) -> String {
        let cap = 500_000
        // Walk at most `cap` characters rather than calling `text.count`,
        // which traverses the whole string even when it is far longer than
        // the cap. `index(_:offsetBy:limitedBy:)` returns nil when the text
        // is shorter than `cap`, and `endIndex` when it is exactly `cap`
        // characters long — neither case needs truncating.
        guard let cutoff = text.index(text.startIndex, offsetBy: cap, limitedBy: text.endIndex),
              cutoff < text.endIndex
        else { return text }
        return String(text[..<cutoff])
            + "\n\n[Document truncated — exceeded \(cap) character limit]"
    }

    /// Lowercased path extensions this adapter claims.
    static let plainTextExtensions: Set<String> = [
        "txt", "md", "markdown", "csv", "tsv",
        "json", "xml", "yaml", "yml", "toml",
        "log", "ini", "cfg", "conf", "env",
        "swift", "py", "js", "ts", "tsx", "jsx",
        "rs", "go", "java", "kt", "c", "cpp", "h", "hpp",
        "rb", "php", "sh", "bash", "zsh", "fish",
        "css", "scss", "less", "sql",
        "r", "m", "mm", "lua", "pl", "ex", "exs",
        "zig", "nim", "dart", "scala", "groovy",
        "tf", "hcl", "dockerfile",
        "gitignore", "editorconfig", "prettierrc",
    ]
}
Loading
Loading