// Patch header (as received):
//   diff --git a/Packages/OsaurusCore/Models/Documents/PresentationDocument.swift b/Packages/OsaurusCore/Models/Documents/PresentationDocument.swift
//   new file mode 100644
//   index 000000000..9b9cbad8f
//   --- /dev/null
//   +++ b/Packages/OsaurusCore/Models/Documents/PresentationDocument.swift
//   @@ -0,0 +1,235 @@

//
//  PresentationDocument.swift
//  osaurus
//
//  Typed read model for presentation formats. This is intentionally not
//  an OOXML AST; it captures the business-level shape that downstream
//  tools need while preserving source markers for traceability.
//

import Foundation

// MARK: - Document and slides

/// Root of the presentation read model: the slides, an optional theme, and
/// a marker describing where the document as a whole came from.
public struct PresentationDocument: StructuredRepresentation, Equatable, Sendable {
    /// Slides in the order the producer supplied them.
    public var slides: [PresentationSlide]
    /// Theme information, when the producer extracted any.
    public var theme: PresentationTheme?
    /// Where this document was read from.
    public var sourceProvenance: SourceProvenance

    public init(
        slides: [PresentationSlide],
        theme: PresentationTheme? = nil,
        sourceProvenance: SourceProvenance
    ) {
        self.slides = slides
        self.theme = theme
        self.sourceProvenance = sourceProvenance
    }
}

/// One slide: its number, a coarse layout classification, its content
/// elements, and any speaker notes attached to it.
public struct PresentationSlide: Equatable, Sendable {
    /// Slide number as assigned by the producer of this value.
    public var number: Int
    /// Coarse layout classification for the slide.
    public var layout: PresentationLayoutKind
    /// Content elements found on the slide.
    public var elements: [PresentationElement]
    /// Speaker notes for the slide, if any.
    public var speakerNotes: SpeakerNotes?
    /// Where this slide was read from.
    public var sourceProvenance: SourceProvenance

    public init(
        number: Int,
        layout: PresentationLayoutKind,
        elements: [PresentationElement],
        speakerNotes: SpeakerNotes? = nil,
        sourceProvenance: SourceProvenance
    ) {
        self.number = number
        self.layout = layout
        self.elements = elements
        self.speakerNotes = speakerNotes
        self.sourceProvenance = sourceProvenance
    }
}

/// Business-level slide layout categories.
public enum PresentationLayoutKind: Equatable, Sendable {
    case blank
    case title
    case titleAndContent
    case sectionHeader
    case twoContent
    case comparison
    case pictureWithCaption
    /// A layout outside the fixed set, identified by a free-form string.
    case custom(String)
}

// MARK: - Elements

/// A single piece of slide content.
public enum PresentationElement: Equatable, Sendable {
    case title(PresentationText)
    case bodyText(PresentationBulletList)
    case shape(PresentationShape)
    case table(PresentationTable)
    case chartReference(PresentationChartReference)
    case image(PresentationImage)
}

/// A run of plain text with an optional source marker.
public struct PresentationText: Equatable, Sendable {
    public var text: String
    public var sourceProvenance: SourceProvenance?

    public init(text: String, sourceProvenance: SourceProvenance? = nil) {
        self.text = text
        self.sourceProvenance = sourceProvenance
    }
}

/// An ordered list of text items, each carrying an indentation level.
public struct PresentationBulletList: Equatable, Sendable {
    public var items: [Item]
    public var sourceProvenance: SourceProvenance?

    public init(items: [Item], sourceProvenance: SourceProvenance? = nil) {
        self.items = items
        self.sourceProvenance = sourceProvenance
    }

    /// Convenience initializer that wraps each string in an `Item` at the
    /// default level (0).
    public init(_ texts: [String], sourceProvenance: SourceProvenance? = nil) {
        self.init(
            items: texts.map { text in Item(text: text) },
            sourceProvenance: sourceProvenance
        )
    }

    /// One entry of a bullet list.
    public struct Item: Equatable, Sendable {
        public var text: String
        /// Indentation level; defaults to 0.
        public var level: Int

        public init(text: String, level: Int = 0) {
            self.text = text
            self.level = level
        }
    }
}

/// A shape described by a free-form `kind` string, with optional text and
/// an optional frame.
public struct PresentationShape: Equatable, Sendable {
    public var kind: String
    public var text: PresentationText?
    public var frame: PresentationRect?

    public init(kind: String, text: PresentationText? = nil, frame: PresentationRect? = nil) {
        self.kind = kind
        self.text = text
        self.frame = frame
    }
}

/// A table stored as rows of cell strings.
public struct PresentationTable: Equatable, Sendable {
    /// Cell text, one inner array per row.
    public var rows: [[String]]
    /// Number of leading rows that are headers; defaults to 0.
    public var headerRowCount: Int
    public var caption: String?
    public var sourceProvenance: SourceProvenance?

    public init(
        rows: [[String]],
        headerRowCount: Int = 0,
        caption: String? = nil,
        sourceProvenance: SourceProvenance? = nil
    ) {
        self.rows = rows
        self.headerRowCount = headerRowCount
        self.caption = caption
        self.sourceProvenance = sourceProvenance
    }
}

/// A pointer to a chart: only a title and a relationship id are kept, not
/// the chart data itself.
public struct PresentationChartReference: Equatable, Sendable {
    public var title: String?
    public var relationshipId: String?
    public var sourceProvenance: SourceProvenance?

    public init(
        title: String? = nil,
        relationshipId: String? = nil,
        sourceProvenance: SourceProvenance? = nil
    ) {
        self.title = title
        self.relationshipId = relationshipId
        self.sourceProvenance = sourceProvenance
    }
}

/// Metadata describing an image placed on a slide. Every field is optional.
public struct PresentationImage: Equatable, Sendable {
    public var relationshipId: String?
    public var path: String?
    public var mimeType: String?
    public var altText: String?
    public var frame: PresentationRect?
    public var sourceProvenance: SourceProvenance?

    public init(
        relationshipId: String? = nil,
        path: String? = nil,
        mimeType: String? = nil,
        altText: String? = nil,
        frame: PresentationRect? = nil,
        sourceProvenance: SourceProvenance? = nil
    ) {
        self.relationshipId = relationshipId
        self.path = path
        self.mimeType = mimeType
        self.altText = altText
        self.frame = frame
        self.sourceProvenance = sourceProvenance
    }
}

/// A rectangle given by origin and size.
/// NOTE(review): the unit (points, EMU, ...) is not fixed by this type —
/// confirm with whichever producer fills it in.
public struct PresentationRect: Equatable, Sendable {
    public var x: Double
    public var y: Double
    public var width: Double
    public var height: Double

    public init(x: Double, y: Double, width: Double, height: Double) {
        self.x = x
        self.y = y
        self.width = width
        self.height = height
    }
}

// MARK: - Notes, theme, provenance

/// Speaker notes text together with the marker of the part it came from.
public struct SpeakerNotes: Equatable, Sendable {
    public var text: String
    public var sourceProvenance: SourceProvenance

    public init(text: String, sourceProvenance: SourceProvenance) {
        self.text = text
        self.sourceProvenance = sourceProvenance
    }
}

/// Theme information as string-keyed color and font tables.
public struct PresentationTheme: Equatable, Sendable {
    public var name: String?
    /// Color table; empty by default.
    public var colors: [String: String]
    /// Font table; empty by default.
    public var fonts: [String: String]
    public var sourceProvenance: SourceProvenance?

    public init(
        name: String? = nil,
        colors: [String: String] = [:],
        fonts: [String: String] = [:],
        sourceProvenance: SourceProvenance? = nil
    ) {
        self.name = name
        self.colors = colors
        self.fonts = fonts
        self.sourceProvenance = sourceProvenance
    }
}

/// Traceability marker: which part of the source a value was read from.
public struct SourceProvenance: Equatable, Sendable {
    public var origin: Origin
    /// Optional name of the source (for example a file name or part path).
    public var sourceName: String?

    public init(origin: Origin, sourceName: String? = nil) {
        self.origin = origin
        self.sourceName = sourceName
    }

    /// The kind of source location a value originated from.
    public enum Origin: Equatable, Sendable {
        case file
        case pptxPart(String)
        case pptxSlide(Int)
        case pptxNotesSlide(Int)
    }
}

// Patch header (as received):
//   diff --git a/Packages/OsaurusCore/Services/Documents/PPTXAdapter.swift b/Packages/OsaurusCore/Services/Documents/PPTXAdapter.swift
//   new file mode 100644
//   index 000000000..cddedec95
//   --- /dev/null
//   +++ b/Packages/OsaurusCore/Services/Documents/PPTXAdapter.swift
//   @@ -0,0 +1,349 @@

//
//  PPTXAdapter.swift
//  osaurus
//
//  Read-only PPTX adapter.
//  It extracts slide text and speaker notes from
//  the OpenXML package into `PresentationDocument`; richer media, layout,
//  and chart extraction can fill the existing typed slots later.
//

import Foundation

/// Read-only adapter that turns a `.pptx` package into a `StructuredDocument`
/// whose representation is a `PresentationDocument`.
///
/// The archive is read by shelling out to `/usr/bin/unzip`; nothing is
/// extracted to disk. Only text paragraphs of slides and notes slides are
/// collected.
public struct PPTXAdapter: DocumentFormatAdapter {
    public static let id = "pptx"

    public let formatId = PPTXAdapter.id

    public init() {}

    /// Returns `true` for a `pptx` path extension (case-insensitive) or the
    /// PresentationML uniform type identifier.
    public func canHandle(url: URL, uti: String?) -> Bool {
        url.pathExtension.lowercased() == "pptx"
            || uti == "org.openxmlformats.presentationml.presentation"
    }

    /// Parses the presentation at `url`.
    ///
    /// - Parameters:
    ///   - url: Location of the `.pptx` file.
    ///   - sizeLimit: Maximum accepted size of the file on disk, in bytes.
    ///     NOTE(review): this bounds the compressed archive only; the size of
    ///     individual decompressed entries is not checked here.
    /// - Throws: `DocumentAdapterError.sizeLimitExceeded` when the file is
    ///   larger than `sizeLimit`; `.emptyContent` when no text was found;
    ///   `.cancelled` when the task is cancelled during parsing;
    ///   `.readFailed` for any other failure. A cancellation detected by the
    ///   very first check is thrown as a plain `CancellationError`.
    public func parse(url: URL, sizeLimit: Int64) async throws -> StructuredDocument {
        try Task.checkCancellation()

        let fileSize = try Self.fileSize(for: url)
        guard fileSize <= sizeLimit else {
            throw DocumentAdapterError.sizeLimitExceeded(actual: fileSize, limit: sizeLimit)
        }

        do {
            let presentation = try await Self.parsePresentation(at: url)
            let textFallback = Self.textFallback(for: presentation)
            guard !textFallback.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
                throw DocumentAdapterError.emptyContent
            }

            return StructuredDocument(
                formatId: formatId,
                filename: url.lastPathComponent,
                fileSize: fileSize,
                representation: AnyStructuredRepresentation(formatId: formatId, underlying: presentation),
                textFallback: textFallback
            )
        } catch is CancellationError {
            throw DocumentAdapterError.cancelled
        } catch let error as DocumentAdapterError {
            throw error
        } catch {
            throw DocumentAdapterError.readFailed(underlying: error.localizedDescription)
        }
    }

    // MARK: - Parse

    /// Lists the archive, reads every `ppt/notesSlides/notesSlideN.xml` and
    /// `ppt/slides/slideN.xml` entry, and assembles the document.
    ///
    /// Slides are ordered by the number in their file name, and notes are
    /// attached to the slide with the same number.
    /// NOTE(review): OOXML links a slide to its notes through relationship
    /// parts rather than matching file numbers — confirm whether same-number
    /// pairing is good enough for decks where they diverge.
    private static func parsePresentation(at url: URL) async throws -> PresentationDocument {
        let entries = try zipEntryNames(in: url)
        try Task.checkCancellation()

        let slideEntries = numberedEntries(entries, directory: "ppt/slides", prefix: "slide")
        guard !slideEntries.isEmpty else {
            throw DocumentAdapterError.readFailed(underlying: "PPTX contains no slide XML files")
        }

        let notesEntries = numberedEntries(entries, directory: "ppt/notesSlides", prefix: "notesSlide")
        var notesByNumber: [Int: SpeakerNotes] = [:]
        for entry in notesEntries {
            try Task.checkCancellation()
            let noteText = try textParagraphs(in: entry.path, from: url)
                .joined(separator: "\n")
                .trimmingCharacters(in: .whitespacesAndNewlines)
            guard !noteText.isEmpty else { continue }

            notesByNumber[entry.number] = SpeakerNotes(
                text: noteText,
                sourceProvenance: SourceProvenance(
                    origin: .pptxNotesSlide(entry.number),
                    sourceName: entry.path
                )
            )
        }

        var slides: [PresentationSlide] = []
        slides.reserveCapacity(slideEntries.count)
        for entry in slideEntries {
            // Each slide costs one `unzip` launch, so honour cancellation
            // between slides just as the notes loop above does.
            try Task.checkCancellation()

            let provenance = SourceProvenance(origin: .pptxSlide(entry.number), sourceName: entry.path)
            let paragraphs = try textParagraphs(in: entry.path, from: url)
            let elements = slideElements(from: paragraphs, provenance: provenance)
            slides.append(
                PresentationSlide(
                    number: entry.number,
                    layout: layoutKind(for: elements),
                    elements: elements,
                    speakerNotes: notesByNumber[entry.number],
                    sourceProvenance: provenance
                )
            )
        }

        return PresentationDocument(
            slides: slides,
            sourceProvenance: SourceProvenance(origin: .file, sourceName: url.lastPathComponent)
        )
    }

    /// Treats the first paragraph as the slide title and all remaining
    /// paragraphs as one level-0 bullet list. Returns no elements when there
    /// are no paragraphs.
    private static func slideElements(
        from paragraphs: [String],
        provenance: SourceProvenance
    ) -> [PresentationElement] {
        guard let first = paragraphs.first else { return [] }

        var elements: [PresentationElement] = [
            .title(PresentationText(text: first, sourceProvenance: provenance))
        ]
        let body = Array(paragraphs.dropFirst())
        if !body.isEmpty {
            elements.append(.bodyText(PresentationBulletList(body, sourceProvenance: provenance)))
        }
        return elements
    }

    /// Derives a layout from the extracted elements: none → `.blank`, a lone
    /// title → `.title`, anything else → `.titleAndContent`.
    private static func layoutKind(for elements: [PresentationElement]) -> PresentationLayoutKind {
        guard let first = elements.first else { return .blank }
        if elements.count == 1, case .title = first {
            return .title
        }
        return .titleAndContent
    }

    /// Renders the document as plain text: one section per slide headed by
    /// "Slide N", followed by the slide text and, if present, its notes.
    private static func textFallback(for presentation: PresentationDocument) -> String {
        presentation.slides.map { slide in
            var parts = ["Slide \(slide.number)"]
            let slideText = plainText(for: slide.elements)
            if !slideText.isEmpty {
                parts.append(slideText)
            }
            if let noteText = slide.speakerNotes?.text, !noteText.isEmpty {
                parts.append("Speaker notes:\n\(noteText)")
            }
            return parts.joined(separator: "\n")
        }
        .joined(separator: "\n\n")
    }

    /// Flattens elements to newline-separated text. Table rows are joined
    /// with tabs; images contribute nothing.
    private static func plainText(for elements: [PresentationElement]) -> String {
        elements.flatMap { element -> [String] in
            switch element {
            case .title(let text):
                return [text.text]
            case .bodyText(let list):
                return list.items.map(\.text)
            case .shape(let shape):
                return [shape.text?.text].compactMap { $0 }
            case .table(let table):
                return table.rows.map { $0.joined(separator: "\t") }
            case .chartReference(let chart):
                return [chart.title].compactMap { $0 }
            case .image:
                return []
            }
        }
        .joined(separator: "\n")
    }

    // MARK: - XML

    /// Reads one archive entry and returns its non-empty text paragraphs.
    ///
    /// - Throws: `DocumentAdapterError.readFailed` when the entry cannot be
    ///   read or is not well-formed XML.
    private static func textParagraphs(in entryPath: String, from archiveURL: URL) throws -> [String] {
        let data = try zipEntryData(entryPath, from: archiveURL)
        let collector = OpenXMLTextCollector()
        let parser = XMLParser(data: data)
        parser.delegate = collector

        guard parser.parse() else {
            let message = parser.parserError?.localizedDescription ?? "Invalid XML in \(entryPath)"
            throw DocumentAdapterError.readFailed(underlying: message)
        }

        return collector.paragraphs
    }

    // MARK: - ZIP

    /// Lists entry names via `unzip -Z1` (one name per line).
    private static func zipEntryNames(in archiveURL: URL) throws -> [String] {
        // `split` already drops empty subsequences, so blank lines vanish.
        try runUnzip(arguments: ["-Z1", archiveURL.path])
            .split(whereSeparator: \.isNewline)
            .map(String.init)
    }

    /// Streams one entry's bytes to memory via `unzip -p`.
    private static func zipEntryData(_ entryPath: String, from archiveURL: URL) throws -> Data {
        try runUnzipData(arguments: ["-p", archiveURL.path, entryPath])
    }

    /// Selects entries named `<directory>/<prefix><digits>.xml` and returns
    /// them sorted by number, then by path.
    ///
    /// Only plain ASCII digits are accepted between the prefix and `.xml`, so
    /// names such as `slide-1.xml` or `slideLayout1.xml` are ignored.
    private static func numberedEntries(
        _ entries: [String],
        directory: String,
        prefix: String
    ) -> [NumberedEntry] {
        let expectedPrefix = "\(directory)/\(prefix)"
        let expectedSuffix = ".xml"

        return entries.compactMap { entry -> NumberedEntry? in
            guard entry.hasPrefix(expectedPrefix), entry.hasSuffix(expectedSuffix) else {
                return nil
            }

            let digits = entry.dropFirst(expectedPrefix.count).dropLast(expectedSuffix.count)
            guard !digits.isEmpty,
                digits.allSatisfy({ $0.isASCII && $0.isNumber }),
                let number = Int(digits)
            else {
                return nil
            }

            return NumberedEntry(number: number, path: entry)
        }
        .sorted { lhs, rhs in
            (lhs.number, lhs.path) < (rhs.number, rhs.path)
        }
    }

    /// Runs `unzip` and decodes its standard output as text.
    ///
    /// Decoding is lossy: a byte sequence that is not valid UTF-8 (for
    /// example a legacy-encoded media file name) becomes U+FFFD instead of
    /// discarding the entire listing.
    private static func runUnzip(arguments: [String]) throws -> String {
        let data = try runUnzipData(arguments: arguments)
        return String(decoding: data, as: UTF8.self)
    }

    /// Runs `/usr/bin/unzip` with `arguments` and returns its standard output.
    ///
    /// - Throws: `DocumentAdapterError.readFailed` when the tool cannot be
    ///   launched or exits with a non-zero status; the message is taken from
    ///   standard error, falling back to standard output, then to the status.
    private static func runUnzipData(arguments: [String]) throws -> Data {
        let process = Process()
        process.executableURL = URL(fileURLWithPath: "/usr/bin/unzip")
        process.arguments = arguments

        let output = Pipe()
        let errorOutput = Pipe()
        process.standardOutput = output
        process.standardError = errorOutput

        do {
            try process.run()
        } catch {
            throw DocumentAdapterError.readFailed(underlying: "Unable to run unzip: \(error.localizedDescription)")
        }

        // Drain the pipes BEFORE waiting for exit. A pipe has a bounded
        // buffer; if the child fills it while nobody is reading, the child
        // blocks in write() and `waitUntilExit()` never returns. Standard
        // output carries the (potentially large) entry data, so it is read
        // first; unzip's diagnostics on standard error are short.
        let outputData = output.fileHandleForReading.readDataToEndOfFile()
        let errorData = errorOutput.fileHandleForReading.readDataToEndOfFile()
        process.waitUntilExit()

        guard process.terminationStatus == 0 else {
            let diagnostics = errorData.isEmpty ? outputData : errorData
            let message = String(data: diagnostics, encoding: .utf8)?
                .trimmingCharacters(in: .whitespacesAndNewlines)

            let reason: String
            if let message, !message.isEmpty {
                reason = message
            } else {
                reason = "unzip exited with status \(process.terminationStatus)"
            }
            throw DocumentAdapterError.readFailed(underlying: reason)
        }

        return outputData
    }

    // MARK: - Files

    /// Returns the size of the file at `url` in bytes, trying URL resource
    /// values first and file attributes second; 0 when neither reports one.
    private static func fileSize(for url: URL) throws -> Int64 {
        do {
            let values = try url.resourceValues(forKeys: [.fileSizeKey])
            if let size = values.fileSize {
                return Int64(size)
            }

            let attributes = try FileManager.default.attributesOfItem(atPath: url.path)
            if let size = attributes[.size] as? NSNumber {
                return size.int64Value
            }
            return 0
        } catch {
            throw DocumentAdapterError.readFailed(underlying: error.localizedDescription)
        }
    }

    /// An archive entry path together with the number parsed from its name.
    private struct NumberedEntry {
        var number: Int
        var path: String
    }
}

/// `XMLParser` delegate that gathers text from `*:p` paragraphs and `*:t`
/// runs, matching elements by local name so any namespace prefix works.
///
/// Each finished paragraph is whitespace-normalized and kept only if it is
/// non-empty. A `*:t` run seen outside any paragraph becomes its own
/// paragraph.
private final class OpenXMLTextCollector: NSObject, XMLParserDelegate {
    private var isInParagraph = false
    private var isInTextRun = false
    private var currentParagraph = ""
    private var currentRun = ""

    private(set) var paragraphs: [String] = []

    func parser(
        _ parser: XMLParser,
        didStartElement elementName: String,
        namespaceURI: String?,
        qualifiedName qName: String?,
        attributes attributeDict: [String: String] = [:]
    ) {
        if Self.matches(elementName, qualifiedName: qName, suffix: "p") {
            isInParagraph = true
            currentParagraph = ""
        } else if Self.matches(elementName, qualifiedName: qName, suffix: "t") {
            isInTextRun = true
            currentRun = ""
        } else if isInParagraph, Self.matches(elementName, qualifiedName: qName, suffix: "br") {
            // A line break separates runs inside one paragraph. Without a
            // separator the words on either side would be glued together;
            // `normalize` later collapses this to a single space.
            currentParagraph.append(" ")
        }
    }

    func parser(
        _ parser: XMLParser,
        didEndElement elementName: String,
        namespaceURI: String?,
        qualifiedName qName: String?
    ) {
        if Self.matches(elementName, qualifiedName: qName, suffix: "t") {
            if isInParagraph {
                currentParagraph.append(currentRun)
            } else {
                appendParagraph(currentRun)
            }
            isInTextRun = false
            currentRun = ""
        } else if Self.matches(elementName, qualifiedName: qName, suffix: "p") {
            appendParagraph(currentParagraph)
            isInParagraph = false
            currentParagraph = ""
        }
    }

    func parser(_ parser: XMLParser, foundCharacters string: String) {
        // Characters outside a text run (indentation between tags, etc.) are
        // not content.
        guard isInTextRun else { return }
        currentRun.append(string)
    }

    /// Stores `text` as a paragraph after normalization, skipping empties.
    private func appendParagraph(_ text: String) {
        let normalized = Self.normalize(text)
        if !normalized.isEmpty {
            paragraphs.append(normalized)
        }
    }

    /// `true` when either supplied name equals `suffix` or ends in `:suffix`.
    private static func matches(_ elementName: String, qualifiedName: String?, suffix: String) -> Bool {
        let names = [elementName, qualifiedName].compactMap { $0 }
        return names.contains(suffix) || names.contains { $0.hasSuffix(":\(suffix)") }
    }

    /// Collapses every whitespace/newline run to one space and trims the ends.
    private static func normalize(_ text: String) -> String {
        text
            .components(separatedBy: .whitespacesAndNewlines)
            .filter { !$0.isEmpty }
            .joined(separator: " ")
    }
}

// Patch header (as received):
//   diff --git a/Packages/OsaurusCore/Tests/Documents/PPTXAdapterTests.swift b/Packages/OsaurusCore/Tests/Documents/PPTXAdapterTests.swift
//   new file mode 100644
//   index 000000000..32da54a7c
//   --- /dev/null
//   +++ b/Packages/OsaurusCore/Tests/Documents/PPTXAdapterTests.swift
//   @@ -0,0 +1,241 @@

//
//  PPTXAdapterTests.swift
//  osaurusTests
//
//  Generates tiny OpenXML packages in temp directories so the repository
//  does not carry binary PPTX fixtures.
//

import Foundation
import Testing

@testable import OsaurusCore

/// Exercises `PPTXAdapter` against small `.pptx` packages that are generated
/// on the fly with `/usr/bin/zip`.
@Suite("PPTXAdapter")
struct PPTXAdapterTests {
    @Test func canHandle_acceptsPPTXExtensionAndUTI() {
        let adapter = PPTXAdapter()

        #expect(adapter.canHandle(url: URL(fileURLWithPath: "/tmp/deck.pptx"), uti: nil))
        #expect(
            adapter.canHandle(
                url: URL(fileURLWithPath: "/tmp/deck"),
                uti: "org.openxmlformats.presentationml.presentation"
            )
        )
        #expect(!adapter.canHandle(url: URL(fileURLWithPath: "/tmp/deck.ppt"), uti: nil))
        #expect(!adapter.canHandle(url: URL(fileURLWithPath: "/tmp/deck.docx"), uti: nil))
    }

    @Test func parse_extractsSlideText() async throws {
        let fixture = try makePPTXFixture(
            slides: [
                1: ["Quarterly Review", "Revenue & retention", "Next steps"],
                2: ["Appendix", "Churn by segment"],
            ]
        )
        defer { try? FileManager.default.removeItem(at: fixture.root) }

        let document = try await PPTXAdapter().parse(url: fixture.url, sizeLimit: 50_000)
        let presentation = try #require(document.representation.underlying as? PresentationDocument)

        #expect(document.fileSize < 50_000)
        #expect(presentation.sourceProvenance.origin == .file)
        #expect(presentation.slides.count == 2)
        #expect(presentation.slides[0].number == 1)
        #expect(presentation.slides[0].layout == .titleAndContent)
        #expect(
            presentation.slides[0].elements.first
                == .title(
                    PresentationText(
                        text: "Quarterly Review",
                        sourceProvenance: SourceProvenance(
                            origin: .pptxSlide(1),
                            sourceName: "ppt/slides/slide1.xml"
                        )
                    )
                )
        )
        // The ampersand is escaped in the fixture XML and must come back
        // unescaped in the extracted text.
        #expect(document.textFallback.contains("Revenue & retention"))
        #expect(document.textFallback.contains("Churn by segment"))
    }

    @Test func parse_extractsSpeakerNotes() async throws {
        let fixture = try makePPTXFixture(
            slides: [1: ["Launch Plan", "Rollout phases"]],
            notes: [1: ["Mention pilot customers", "Pause for questions"]]
        )
        defer { try? FileManager.default.removeItem(at: fixture.root) }

        let document = try await PPTXAdapter().parse(url: fixture.url, sizeLimit: 50_000)
        let presentation = try #require(document.representation.underlying as? PresentationDocument)

        #expect(presentation.slides.first?.speakerNotes?.text == "Mention pilot customers\nPause for questions")
        #expect(presentation.slides.first?.speakerNotes?.sourceProvenance.origin == .pptxNotesSlide(1))
        #expect(document.textFallback.contains("Speaker notes:"))
        #expect(document.textFallback.contains("Pause for questions"))
    }

    @Test func parse_refusesFilesAboveSizeLimit() async throws {
        let root = try makeTempDirectory()
        defer { try? FileManager.default.removeItem(at: root) }

        let url = root.appendingPathComponent("too-large.pptx")
        try Data(repeating: 0x41, count: 16).write(to: url)

        do {
            _ = try await PPTXAdapter().parse(url: url, sizeLimit: 15)
            Issue.record("expected sizeLimitExceeded")
        } catch DocumentAdapterError.sizeLimitExceeded(let actual, let limit) {
            #expect(actual == 16)
            #expect(limit == 15)
        } catch {
            Issue.record("expected sizeLimitExceeded, got \(error)")
        }
    }

    @Test func parse_corruptZipThrowsReadFailed() async throws {
        let root = try makeTempDirectory()
        defer { try? FileManager.default.removeItem(at: root) }

        let url = root.appendingPathComponent("corrupt.pptx")
        try Data("not a zip archive".utf8).write(to: url)

        do {
            _ = try await PPTXAdapter().parse(url: url, sizeLimit: 50_000)
            Issue.record("expected readFailed")
        } catch DocumentAdapterError.readFailed(let underlying) {
            #expect(!underlying.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty)
        } catch {
            Issue.record("expected readFailed, got \(error)")
        }
    }

    // MARK: - Fixture generation

    /// Builds a minimal package on disk and zips it into `fixture.pptx`.
    ///
    /// - Parameters:
    ///   - slides: Paragraph texts keyed by slide number; written to
    ///     `ppt/slides/slide<N>.xml`.
    ///   - notes: Paragraph texts keyed by number; written to
    ///     `ppt/notesSlides/notesSlide<N>.xml`.
    /// - Returns: The temp directory to delete afterwards and the archive URL.
    private func makePPTXFixture(
        slides: [Int: [String]],
        notes: [Int: [String]] = [:]
    ) throws -> (root: URL, url: URL) {
        let root = try makeTempDirectory()
        let packageRoot = root.appendingPathComponent("package", isDirectory: true)
        let pptRoot = packageRoot.appendingPathComponent("ppt", isDirectory: true)
        let slidesRoot = pptRoot.appendingPathComponent("slides", isDirectory: true)
        let notesRoot = pptRoot.appendingPathComponent("notesSlides", isDirectory: true)

        try FileManager.default.createDirectory(at: slidesRoot, withIntermediateDirectories: true)
        try FileManager.default.createDirectory(at: notesRoot, withIntermediateDirectories: true)

        try contentTypesXML.write(
            to: packageRoot.appendingPathComponent("[Content_Types].xml"),
            atomically: true,
            encoding: .utf8
        )

        for (number, paragraphs) in slides {
            try slideXML(paragraphs).write(
                to: slidesRoot.appendingPathComponent("slide\(number).xml"),
                atomically: true,
                encoding: .utf8
            )
        }

        for (number, paragraphs) in notes {
            try notesXML(paragraphs).write(
                to: notesRoot.appendingPathComponent("notesSlide\(number).xml"),
                atomically: true,
                encoding: .utf8
            )
        }

        let url = root.appendingPathComponent("fixture.pptx")
        try zipDirectory(packageRoot, to: url)
        return (root, url)
    }

    // NOTE(review): the XML templates below were reconstructed as minimal
    // PresentationML/DrawingML; the adapter only relies on the `a:p` and
    // `a:t` elements being present and well-formed.

    /// Minimal `[Content_Types].xml` for the package.
    private var contentTypesXML: String {
        """
        <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
        <Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
        <Default Extension="xml" ContentType="application/xml"/>
        </Types>
        """
    }

    /// A slide part containing one text shape per paragraph.
    private func slideXML(_ paragraphs: [String]) -> String {
        """
        <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
        <p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
        <p:cSld>
        <p:spTree>
        \(paragraphs.map(textShapeXML).joined(separator: "\n"))
        </p:spTree>
        </p:cSld>
        </p:sld>
        """
    }

    /// A notes-slide part containing one text shape per paragraph.
    private func notesXML(_ paragraphs: [String]) -> String {
        """
        <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
        <p:notes xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
        <p:cSld>
        <p:spTree>
        \(paragraphs.map(textShapeXML).joined(separator: "\n"))
        </p:spTree>
        </p:cSld>
        </p:notes>
        """
    }

    /// A shape holding a single paragraph with a single text run.
    private func textShapeXML(_ text: String) -> String {
        """
        <p:sp>
        <p:txBody>
        <a:p>
        <a:r><a:t>\(escapeXML(text))</a:t></a:r>
        </a:p>
        </p:txBody>
        </p:sp>
        """
    }

    /// Escapes the five XML special characters. `&` must be replaced first
    /// so the ampersands introduced by later replacements are not re-escaped.
    private func escapeXML(_ text: String) -> String {
        text
            .replacingOccurrences(of: "&", with: "&amp;")
            .replacingOccurrences(of: "<", with: "&lt;")
            .replacingOccurrences(of: ">", with: "&gt;")
            .replacingOccurrences(of: "\"", with: "&quot;")
            .replacingOccurrences(of: "'", with: "&apos;")
    }

    /// Creates a uniquely named directory under the system temp directory.
    private func makeTempDirectory() throws -> URL {
        let url = FileManager.default.temporaryDirectory
            .appendingPathComponent("osaurus-pptx-tests-\(UUID().uuidString)", isDirectory: true)
        try FileManager.default.createDirectory(at: url, withIntermediateDirectories: true)
        return url
    }

    /// Zips the contents of `source` (recursively, quietly) into `destination`.
    ///
    /// - Throws: An `NSError` in the `PPTXAdapterTests` domain carrying zip's
    ///   output when the tool exits with a non-zero status.
    private func zipDirectory(_ source: URL, to destination: URL) throws {
        let process = Process()
        process.executableURL = URL(fileURLWithPath: "/usr/bin/zip")
        process.currentDirectoryURL = source
        process.arguments = ["-r", "-q", destination.path, "."]

        let output = Pipe()
        process.standardOutput = output
        process.standardError = output

        try process.run()
        // Read to EOF before waiting so a chatty child can never block on a
        // full pipe while this thread is parked in `waitUntilExit()`.
        let captured = output.fileHandleForReading.readDataToEndOfFile()
        process.waitUntilExit()

        guard process.terminationStatus == 0 else {
            let message = String(data: captured, encoding: .utf8) ?? ""
            throw NSError(
                domain: "PPTXAdapterTests",
                code: Int(process.terminationStatus),
                userInfo: [NSLocalizedDescriptionKey: "zip failed: \(message)"]
            )
        }
    }
}