From 3cd46ff35a07aa03ad7dea988e2bcf45c754fc5e Mon Sep 17 00:00:00 2001 From: Leandro Alonso Date: Mon, 1 Jul 2024 14:10:15 -0300 Subject: [PATCH 01/11] Ask for permission to use speech API right away --- podcasts/MainTabBarController.swift | 5 +++++ podcasts/podcasts-Info.plist | 2 ++ 2 files changed, 7 insertions(+) diff --git a/podcasts/MainTabBarController.swift b/podcasts/MainTabBarController.swift index 79913dc9b1..266fd9a7a3 100644 --- a/podcasts/MainTabBarController.swift +++ b/podcasts/MainTabBarController.swift @@ -1,4 +1,5 @@ import PocketCastsDataModel +import Speech import PocketCastsServer import SafariServices import UIKit @@ -103,6 +104,10 @@ class MainTabBarController: UITabBarController, NavigationProtocol { showInitialOnboardingIfNeeded() updateDatabaseIndexes() + + SFSpeechRecognizer.requestAuthorization { SFSpeechRecognizerAuthorizationStatus in + + } } /// Update database indexes and delete unused columns diff --git a/podcasts/podcasts-Info.plist b/podcasts/podcasts-Info.plist index 766a53e849..ad34d51814 100644 --- a/podcasts/podcasts-Info.plist +++ b/podcasts/podcasts-Info.plist @@ -1029,6 +1029,8 @@ Pocket Casts needs permission to save this image to your photo library. NSPhotoLibraryUsageDescription Pocket Casts needs permission to save this image to your photo library. + NSSpeechRecognitionUsageDescription + Pocket Casts needs permission to transcribe audio that is playing to text. NSUserActivityTypes ChapterIntent From 34efd262489a95cc4e563929e4608dff73340091 Mon Sep 17 00:00:00 2001 From: Leandro Alonso Date: Mon, 1 Jul 2024 14:11:32 -0300 Subject: [PATCH 02/11] Integrates audio read task with speech --- podcasts/AudioReadTask.swift | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/podcasts/AudioReadTask.swift b/podcasts/AudioReadTask.swift index 310cfa7857..39a052a350 100644 --- a/podcasts/AudioReadTask.swift +++ b/podcasts/AudioReadTask.swift @@ -2,6 +2,7 @@ import AVFoundation import PocketCastsDataModel import PocketCastsServer import PocketCastsUtils +import Speech class AudioReadTask { private let maxSilenceAmountToSave = 1000 @@ -33,6 +34,15 @@ class AudioReadTask { private var currentFramePosition: AVAudioFramePosition = 0 private let endOfFileSemaphore = DispatchSemaphore(value: 0) + private lazy var request: SFSpeechAudioBufferRecognitionRequest? = { + let request = SFSpeechAudioBufferRecognitionRequest() + request.shouldReportPartialResults = true + return request + }() + + private var task: SFSpeechRecognitionTask? + private let recognizer: SFSpeechRecognizer? 
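Patch 01 requests speech-recognition permission with an empty completion handler whose closure parameter shadows the SFSpeechRecognizerAuthorizationStatus type name. A minimal sketch of how the returned status could be handled (hypothetical helper, not part of the patch; the callback is not guaranteed to arrive on the main queue, so UI work is dispatched there):

import Speech

func requestSpeechRecognitionPermission(onAuthorized: @escaping () -> Void) {
    SFSpeechRecognizer.requestAuthorization { status in
        DispatchQueue.main.async {
            switch status {
            case .authorized:
                onAuthorized()   // safe to start recognition tasks
            case .denied, .restricted, .notDetermined:
                break            // leave the transcript-sync feature disabled
            @unknown default:
                break
            }
        }
    }
}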
= SFSpeechRecognizer() + init(trimSilence: TrimSilenceAmount, audioFile: AVAudioFile, outputFormat: AVAudioFormat, bufferManager: PlayBufferManager, playPositionHint: TimeInterval, frameCount: Int64) { self.trimSilence = trimSilence self.audioFile = audioFile @@ -54,6 +64,12 @@ class AudioReadTask { readQueue.async { [weak self] in guard let self = self else { return } + guard let recognizer, let request else { return } + + task = recognizer.recognitionTask(with: request, resultHandler: { [weak self] result, error in + self?.recognitionHandler(result: result, error: error) + }) + // there are some Core Audio errors that aren't marked as throws in the Swift code, so they'll crash the app // that's why we have an Objective-C try/catch block here to catch them (see https://github.com/shiftyjelly/pocketcasts-ios/issues/1493 for more details) do { @@ -84,6 +100,15 @@ class AudioReadTask { } } + nonisolated private func recognitionHandler(result: SFSpeechRecognitionResult?, error: Error?) { + let receivedFinalResult = result?.isFinal ?? false + let receivedError = error != nil + + if let result { + print("$$ \(result.bestTranscription.formattedString)") + } + } + func shutdown() { cancelled.value = true bufferManager.bufferSemaphore.signal() @@ -292,6 +317,8 @@ class AudioReadTask { if !cancelled.value { bufferManager.push(buffer) } + + request?.append(buffer.audioBuffer) } private func gapSizeForSilenceAmount() -> Int { From 5be9f863518e3d5e0e0239defee81035f5ba3c97 Mon Sep 17 00:00:00 2001 From: Leandro Alonso Date: Mon, 1 Jul 2024 14:27:38 -0300 Subject: [PATCH 03/11] Use a notification to send transcribed text --- podcasts/AudioReadTask.swift | 2 +- podcasts/Constants.swift | 3 +++ podcasts/TranscriptsViewController.swift | 8 ++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/podcasts/AudioReadTask.swift b/podcasts/AudioReadTask.swift index 39a052a350..8ed646431a 100644 --- a/podcasts/AudioReadTask.swift +++ b/podcasts/AudioReadTask.swift @@ -105,7 +105,7 @@ class AudioReadTask { let receivedError = error != nil if let result { - print("$$ \(result.bestTranscription.formattedString)") + NotificationCenter.postOnMainThread(notification: Constants.Notifications.speechToTextAvailable, userInfo: ["text": result.bestTranscription]) } } diff --git a/podcasts/Constants.swift b/podcasts/Constants.swift index 0fc3c028c2..e205ab4e6e 100644 --- a/podcasts/Constants.swift +++ b/podcasts/Constants.swift @@ -96,6 +96,9 @@ struct Constants { // End of Year static let profileSeen = NSNotification.Name(rawValue: "profileSeen") + + // Speech to Text + static let speechToTextAvailable = NSNotification.Name(rawValue: "speechToTextAvailable") } enum UserDefaults { diff --git a/podcasts/TranscriptsViewController.swift b/podcasts/TranscriptsViewController.swift index 31c07b5349..5603d3529c 100644 --- a/podcasts/TranscriptsViewController.swift +++ b/podcasts/TranscriptsViewController.swift @@ -1,4 +1,5 @@ import Foundation +import Speech import UIKit class TranscriptsViewController: PlayerItemViewController { @@ -158,6 +159,13 @@ class TranscriptsViewController: PlayerItemViewController { private func addObservers() { addCustomObserver(Constants.Notifications.playbackTrackChanged, selector: #selector(update)) addCustomObserver(Constants.Notifications.playbackProgress, selector: #selector(updateTranscriptPosition)) + addCustomObserver(Constants.Notifications.speechToTextAvailable, selector: #selector(updateTranscriptPositionn)) + } + + @objc private func updateTranscriptPositionn(notification: 
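Patch 02 feeds every scheduled buffer into the recognizer via request?.append(buffer.audioBuffer), but the hunks shown never call endAudio(). A minimal sketch of the usual SFSpeechAudioBufferRecognitionRequest lifecycle (assumed shape, not the patch's code): append PCM buffers as they are decoded, then close the request once the file is exhausted so the recognizer can deliver its final result.

import Speech
import AVFoundation

final class BufferRecognizerFeed {
    private let request = SFSpeechAudioBufferRecognitionRequest()

    func start(with recognizer: SFSpeechRecognizer,
               handler: @escaping (SFSpeechRecognitionResult?, Error?) -> Void) -> SFSpeechRecognitionTask {
        request.shouldReportPartialResults = false
        return recognizer.recognitionTask(with: request, resultHandler: handler)
    }

    func append(_ buffer: AVAudioPCMBuffer) {
        request.append(buffer)   // called from the read queue as buffers are decoded
    }

    func finish() {
        request.endAudio()       // no more audio is coming; allows the final (isFinal) result
    }
}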
NSNotification) { + guard let text = notification.userInfo?["text"] as? SFTranscription else { return } + + print("$$ \(text.formattedString)") } @objc private func updateTranscriptPosition() { From b1b403b851171aa640e4084e77a52e1be36f7041 Mon Sep 17 00:00:00 2001 From: Leandro Alonso Date: Thu, 4 Jul 2024 13:49:41 -0300 Subject: [PATCH 04/11] First algorithm iteration --- podcasts/AudioReadTask.swift | 26 ++- podcasts/PlaybackManager.swift | 2 +- podcasts/TranscriptModel.swift | 226 ++++++++++++++++++++++- podcasts/TranscriptsViewController.swift | 64 +++++-- 4 files changed, 297 insertions(+), 21 deletions(-) diff --git a/podcasts/AudioReadTask.swift b/podcasts/AudioReadTask.swift index 8ed646431a..1ddc21fe0c 100644 --- a/podcasts/AudioReadTask.swift +++ b/podcasts/AudioReadTask.swift @@ -36,15 +36,21 @@ class AudioReadTask { private lazy var request: SFSpeechAudioBufferRecognitionRequest? = { let request = SFSpeechAudioBufferRecognitionRequest() - request.shouldReportPartialResults = true + request.shouldReportPartialResults = false + request.requiresOnDeviceRecognition = true + request.taskHint = .dictation return request }() private var task: SFSpeechRecognitionTask? - private let recognizer: SFSpeechRecognizer? = SFSpeechRecognizer() + private lazy var recognizer: SFSpeechRecognizer? = { + let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US")) + recognizer?.defaultTaskHint = .dictation + return recognizer + }() init(trimSilence: TrimSilenceAmount, audioFile: AVAudioFile, outputFormat: AVAudioFormat, bufferManager: PlayBufferManager, playPositionHint: TimeInterval, frameCount: Int64) { - self.trimSilence = trimSilence + self.trimSilence = .off self.audioFile = audioFile self.outputFormat = outputFormat self.bufferManager = bufferManager @@ -60,12 +66,16 @@ class AudioReadTask { } } + var offset: TimeInterval = 0 + func startup() { readQueue.async { [weak self] in guard let self = self else { return } guard let recognizer, let request else { return } + offset = PlaybackManager.shared.currentTime() + print("$$ Starting task") task = recognizer.recognitionTask(with: request, resultHandler: { [weak self] result, error in self?.recognitionHandler(result: result, error: error) }) @@ -104,8 +114,16 @@ class AudioReadTask { let receivedFinalResult = result?.isFinal ?? 
false let receivedError = error != nil + if result?.isFinal == true { + print("$$ SFSpeechRecognition finished") + } + + if let error { + print("$$ SFSpeechRecognition error \(error)") + } + if let result { - NotificationCenter.postOnMainThread(notification: Constants.Notifications.speechToTextAvailable, userInfo: ["text": result.bestTranscription]) + NotificationCenter.postOnMainThread(notification: Constants.Notifications.speechToTextAvailable, userInfo: ["text": result.bestTranscription, "offset": offset]) } } diff --git a/podcasts/PlaybackManager.swift b/podcasts/PlaybackManager.swift index 9fdb507840..e48b6990f3 100644 --- a/podcasts/PlaybackManager.swift +++ b/podcasts/PlaybackManager.swift @@ -45,7 +45,7 @@ class PlaybackManager: ServerPlaybackDelegate { private let shouldDeactivateSession = AtomicBool() private var haveCalledPlayerLoad = false - private let updateTimerInterval = 1 as TimeInterval + private let updateTimerInterval = 0.01 as TimeInterval #if !os(watchOS) private var backgroundTask = UIBackgroundTaskIdentifier.invalid diff --git a/podcasts/TranscriptModel.swift b/podcasts/TranscriptModel.swift index 4b55d32ae1..ff673db6e9 100644 --- a/podcasts/TranscriptModel.swift +++ b/podcasts/TranscriptModel.swift @@ -1,5 +1,7 @@ import Foundation +import Speech import SwiftSubtitles +import NaturalLanguage enum TranscriptFormat: String { case srt = "application/srt" @@ -35,11 +37,20 @@ extension NSAttributedString: @unchecked Sendable { } -struct TranscriptModel: Sendable { +class TranscriptModel: @unchecked Sendable { let attributedText: NSAttributedString let cues: [TranscriptCue] + lazy var rawText: String = { + attributedText.string + }() + + init(attributedText: NSAttributedString, cues: [TranscriptCue]) { + self.attributedText = attributedText + self.cues = cues + } + static func makeModel(from transcriptText: String, format: TranscriptFormat) -> TranscriptModel? { if format == .textHTML { return TranscriptModel(attributedText: NSAttributedString(string: transcriptText), cues: []) @@ -67,4 +78,217 @@ struct TranscriptModel: Sendable { @inlinable public func firstCue(containing secondsValue: Double) -> TranscriptCue? { self.cues.first { $0.contains(timeInSeconds: secondsValue) } } + + var allSpeechToText: [String] = [] { + didSet { + print("$$ \(allSpeechToText.joined(separator: " "))") + print("$$") + } + } + var timestamps: [(TimeInterval, TimeInterval)] = [] + + var words: [Word] = [] + + public func firstWord(containing secondsValue: TimeInterval) -> Word? { + words +// .filter { $0.timestamp != nil } +// .sorted(by: { $0.timestamp!.seconds < $1.timestamp!.seconds }) + .first { $0.contains(timeInSeconds: secondsValue) } + } + + func wordByWord(speechToText: SFTranscription) { + // Define constants + let matchScore = 1 + let mismatchScore = -1 + let gapPenalty = -2 + + struct TimedWord { + let word: String + let timestamp: TimeInterval + let duration: TimeInterval + } + + // Tokenize the text while preserving punctuation + func tokenize(text: String) -> [String] { + var words = [String]() + let tokenizer = NLTokenizer(unit: .word) + tokenizer.string = text + tokenizer.enumerateTokens(in: text.startIndex.. [(String, NSRange)] { + var words = [(String, NSRange)]() + let tokenizer = NLTokenizer(unit: .word) + tokenizer.string = text + tokenizer.enumerateTokens(in: text.startIndex.. 
[(normalized: String, range: NSRange)] { + let words = tokenizeWithRange(text: text) + return words.map { ($0.lowercased(), $1) } + } + + // Preprocess timed words: tokenize and normalize, preserving timestamps + func preprocessTimedWords(text: [String], timestamps: [(timestamp: TimeInterval, duration: TimeInterval)]) -> [TimedWord] { + var timedWords = [TimedWord]() + for (index, word) in text.enumerated() { + timedWords.append(TimedWord(word: word.lowercased(), timestamp: timestamps[index].timestamp, duration: timestamps[index].duration)) + } + return timedWords + } + + // Define the scoring function + func score(word1: String, word2: String) -> Int { + return word1 == word2 ? matchScore : mismatchScore + } + + // Perform sequence alignment + func alignSequences(subtitle: String, transcript: [String], transcriptTimestamps: [(timestamp: TimeInterval, duration: TimeInterval)]) -> ([NSRange?], [String], [(timestamp: TimeInterval, duration: TimeInterval)]) { + let subtitleWords = preprocessSubtitleWords(text: subtitle) + let transcriptTimedWords = preprocessTimedWords(text: transcript, timestamps: transcriptTimestamps) + + let lenSub = subtitleWords.count + let lenTrans = transcriptTimedWords.count + + // Initialize the scoring matrix + var S = Array(repeating: Array(repeating: 0, count: lenTrans + 1), count: lenSub + 1) + + // Initialize first row and column with gap penalties + for i in 1...lenSub { + S[i][0] = S[i-1][0] + gapPenalty + } + for j in 1...lenTrans { + S[0][j] = S[0][j-1] + gapPenalty + } + + // Populate the scoring matrix + for i in 1...lenSub { + for j in 1...lenTrans { + let match = S[i-1][j-1] + score(word1: subtitleWords[i-1].normalized, word2: transcriptTimedWords[j-1].word) + let delete = S[i-1][j] + gapPenalty + let insert = S[i][j-1] + gapPenalty + S[i][j] = max(match, delete, insert) + } + } + + // Traceback to get the aligned sequences + var alignedSubtitle = [NSRange?]() + var alignedTranscript = [String]() + var alignedTimestamps = [(timestamp: TimeInterval, duration: TimeInterval)]() + var i = lenSub + var j = lenTrans + + while i > 0 && j > 0 { + if S[i][j] == S[i-1][j-1] + score(word1: subtitleWords[i-1].normalized, word2: transcriptTimedWords[j-1].word) { + alignedSubtitle.append(subtitleWords[i-1].range) + alignedTranscript.append(transcriptTimedWords[j-1].word) + alignedTimestamps.append((transcriptTimedWords[j-1].timestamp, transcriptTimedWords[j-1].duration)) + i -= 1 + j -= 1 + } else if S[i][j] == S[i-1][j] + gapPenalty { + alignedSubtitle.append(subtitleWords[i-1].range) + alignedTranscript.append("-") + alignedTimestamps.append((-1, -1)) // Indicate a gap with a negative timestamp + i -= 1 + } else { + alignedSubtitle.append(nil) + alignedTranscript.append(transcriptTimedWords[j-1].word) + alignedTimestamps.append((transcriptTimedWords[j-1].timestamp, transcriptTimedWords[j-1].duration)) + j -= 1 + } + } + + while i > 0 { + alignedSubtitle.append(subtitleWords[i-1].range) + alignedTranscript.append("-") + alignedTimestamps.append((-1, -1)) + i -= 1 + } + + while j > 0 { + alignedSubtitle.append(nil) + alignedTranscript.append(transcriptTimedWords[j-1].word) + alignedTimestamps.append((transcriptTimedWords[j-1].timestamp, transcriptTimedWords[j-1].duration)) + j -= 1 + } + + return (alignedSubtitle.reversed(), alignedTranscript.reversed(), alignedTimestamps.reversed()) + } + + allSpeechToText.append(contentsOf: speechToText.segments.map { $0.substring }) + + // Example usage + let subtitle = rawText + let transcript = allSpeechToText + 
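The tokenize helpers above rely on NLTokenizer word enumeration and on converting each Range<String.Index> into an NSRange so the aligned word can later be highlighted in the attributed transcript. A small standalone sketch of that conversion (illustrative input, not from the patch):

import Foundation
import NaturalLanguage

let text = "Hello, world"
let tokenizer = NLTokenizer(unit: .word)
tokenizer.string = text
tokenizer.enumerateTokens(in: text.startIndex..<text.endIndex) { range, _ in
    // Prints "Hello" {0, 5} and "world" {7, 5}; punctuation is skipped by the word unit.
    print(text[range], NSRange(range, in: text))
    return true
}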
timestamps.append(contentsOf: speechToText.segments.map { ($0.timestamp, $0.duration) }) + + let (alignedSubtitle, alignedTranscript, alignedTimestamps) = alignSequences(subtitle: subtitle, transcript: transcript, transcriptTimestamps: timestamps) + + words = alignedSubtitle.enumerated().compactMap { index, range in + guard let range else { return nil } + + return Word(timestamp: alignedTimestamps[index].timestamp, duration: alignedTimestamps[index].duration, characterRange: range) + } + } +} + +class Word { + var timestamp: TimeInterval + var duration: TimeInterval + var characterRange: NSRange + + init(timestamp: TimeInterval, duration: TimeInterval, characterRange: NSRange) { + self.timestamp = timestamp + self.duration = duration + self.characterRange = characterRange + } + + func contains(timeInSeconds seconds: TimeInterval) -> Bool { + seconds >= timestamp && seconds <= (timestamp + duration) + } +} + +class WordToAnalyze { + let text: String + var timestamp: TimeInterval + var duration: TimeInterval + var matched: Bool = false + + init(text: String, timestamp: TimeInterval, duration: TimeInterval, matched: Bool) { + self.text = text + self.timestamp = timestamp + self.duration = duration + self.matched = matched + } +} + +// Levenshtein distance algorithm +func levDis(_ w1: String, _ w2: String) -> Int { + let empty = [Int](repeating:0, count: w2.count) + var last = [Int](0...w2.count) + + for (i, char1) in w1.enumerated() { + var cur = [i + 1] + empty + for (j, char2) in w2.enumerated() { + cur[j + 1] = char1 == char2 ? last[j] : min(last[j], last[j + 1], cur[j]) + 1 + } + last = cur + } + return last.last! +} + +func levDisNormalized(_ w1: String, _ w2: String) -> Double { + let levDis = levDis(w1, w2) + return 1 - Double(levDis) / Double(max(w1.count, w2.count)) } diff --git a/podcasts/TranscriptsViewController.swift b/podcasts/TranscriptsViewController.swift index 5603d3529c..2a8930b2c5 100644 --- a/podcasts/TranscriptsViewController.swift +++ b/podcasts/TranscriptsViewController.swift @@ -1,6 +1,7 @@ import Foundation import Speech import UIKit +import Ifrit class TranscriptsViewController: PlayerItemViewController { @@ -139,7 +140,7 @@ class TranscriptsViewController: PlayerItemViewController { formattedText.addAttributes(normalStyle, range: NSRange(location: 0, length: formattedText.length)) - if let range = transcript.firstCue(containing: position)?.characterRange { + if let range = transcript.firstWord(containing: position)?.characterRange { formattedText.addAttributes(highlightStyle, range: range) } @@ -159,33 +160,66 @@ class TranscriptsViewController: PlayerItemViewController { private func addObservers() { addCustomObserver(Constants.Notifications.playbackTrackChanged, selector: #selector(update)) addCustomObserver(Constants.Notifications.playbackProgress, selector: #selector(updateTranscriptPosition)) - addCustomObserver(Constants.Notifications.speechToTextAvailable, selector: #selector(updateTranscriptPositionn)) + addCustomObserver(Constants.Notifications.speechToTextAvailable, selector: #selector(receivedSpeechToTextContent)) } - @objc private func updateTranscriptPositionn(notification: NSNotification) { - guard let text = notification.userInfo?["text"] as? SFTranscription else { return } + var offset: TimeInterval = 0 - print("$$ \(text.formattedString)") + @objc private func receivedSpeechToTextContent(notification: NSNotification) { + guard let text = notification.userInfo?["text"] as? SFTranscription, + let offset = notification.userInfo?["offset"] as? 
TimeInterval else { return } + + self.offset = offset + + transcript?.wordByWord(speechToText: text) } @objc private func updateTranscriptPosition() { - let position = playbackManager.currentTime() + let position = playbackManager.currentTime() - offset print("Transcript position: \(position)") guard let transcript else { return } - if let cue = transcript.firstCue(containing: position), cue.characterRange != previousRange { - let range = cue.characterRange - //Comment this line out if you want to check the player position and cues in range - //print("Transcript position: \(position) in [\(cue.startTime) <-> \(cue.endTime)]") - previousRange = range + + if let word = transcript.firstWord(containing: position) { transcriptView.attributedText = styleText(transcript: transcript, position: position) // adjusting the scroll to range so it shows more text - let scrollRange = NSRange(location: range.location, length: range.length * 5) + let scrollRange = NSRange(location: word.characterRange.location, length: word.characterRange.length) transcriptView.scrollRangeToVisible(scrollRange) - } else if let startTime = transcript.cues.first?.startTime, position < startTime { - previousRange = nil - transcriptView.scrollRangeToVisible(NSRange(location: 0, length: 0)) } + +// if let cue = transcript.firstCue(containing: position), cue.characterRange != previousRange { +// let range = cue.characterRange +// //Comment this line out if you want to check the player position and cues in range +// //print("Transcript position: \(position) in [\(cue.startTime) <-> \(cue.endTime)]") +// previousRange = range +// transcriptView.attributedText = styleText(transcript: transcript, position: position) +// // adjusting the scroll to range so it shows more text +// let scrollRange = NSRange(location: range.location, length: range.length * 5) +// transcriptView.scrollRangeToVisible(scrollRange) +// } else if let startTime = transcript.cues.first?.startTime, position < startTime { +// previousRange = nil +// transcriptView.scrollRangeToVisible(NSRange(location: 0, length: 0)) +// } + } +} + +extension String { + subscript (bounds: CountableClosedRange) -> String { + let start = index(startIndex, offsetBy: bounds.lowerBound) + let end = index(startIndex, offsetBy: bounds.upperBound) + return String(self[start...end]) + } + + subscript (bounds: CountableRange) -> String { + let start = index(startIndex, offsetBy: bounds.lowerBound) + let end = index(startIndex, offsetBy: bounds.upperBound) + return String(self[start..) -> ArraySlice { + indices.contains(bounds.upperBound) && indices.contains(bounds.lowerBound) ? 
self[bounds] : [] } } From 22e2d4b3085a978e4c36cf35f33a64718a31fd52 Mon Sep 17 00:00:00 2001 From: Leandro Alonso Date: Mon, 8 Jul 2024 14:09:12 -0300 Subject: [PATCH 05/11] Make word by word in a BG thread --- podcasts/TranscriptsViewController.swift | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/podcasts/TranscriptsViewController.swift b/podcasts/TranscriptsViewController.swift index 2a8930b2c5..ae4289f5d2 100644 --- a/podcasts/TranscriptsViewController.swift +++ b/podcasts/TranscriptsViewController.swift @@ -171,17 +171,19 @@ class TranscriptsViewController: PlayerItemViewController { self.offset = offset - transcript?.wordByWord(speechToText: text) + DispatchQueue.global().async { + self.transcript?.wordByWord(speechToText: text) + } } @objc private func updateTranscriptPosition() { let position = playbackManager.currentTime() - offset - print("Transcript position: \(position)") guard let transcript else { return } if let word = transcript.firstWord(containing: position) { +// print(transcript.rawText[word.characterRange.lowerBound.. Date: Mon, 8 Jul 2024 14:18:58 -0300 Subject: [PATCH 06/11] Use levenshtein distance --- podcasts/TranscriptModel.swift | 36 +++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/podcasts/TranscriptModel.swift b/podcasts/TranscriptModel.swift index ff673db6e9..e357599704 100644 --- a/podcasts/TranscriptModel.swift +++ b/podcasts/TranscriptModel.swift @@ -97,6 +97,39 @@ class TranscriptModel: @unchecked Sendable { } func wordByWord(speechToText: SFTranscription) { + // Calculate Levenshtein distance + func levenshtein(aStr: String, bStr: String) -> Int { + let a = Array(aStr) + let b = Array(bStr) + let m = a.count + let n = b.count + + var dist = [[Int]](repeating: [Int](repeating: 0, count: n + 1), count: m + 1) + + for i in 0...m { + dist[i][0] = i + } + for j in 0...n { + dist[0][j] = j + } + + for i in 1...m { + for j in 1...n { + if a[i-1] == b[j-1] { + dist[i][j] = dist[i-1][j-1] + } else { + dist[i][j] = min( + dist[i-1][j] + 1, + dist[i][j-1] + 1, + dist[i-1][j-1] + 1 + ) + } + } + } + + return dist[m][n] + } + // Define constants let matchScore = 1 let mismatchScore = -1 @@ -150,7 +183,8 @@ class TranscriptModel: @unchecked Sendable { // Define the scoring function func score(word1: String, word2: String) -> Int { - return word1 == word2 ? matchScore : mismatchScore + let distance = levenshtein(aStr: word1, bStr: word2) + return distance == 0 ? 
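Patch 06 changes the aligner's scoring so that a mismatch costs the Levenshtein distance between the two words instead of a flat -1. Using the file-scope levDis/levDisNormalized helpers that patch 04 added to TranscriptModel.swift, the effect looks like this (illustrative values only):

levDis("gonna", "going")                    // 2: a recognition near-miss stays cheap to align
levDis("hello", "world")                    // 4: unrelated words are penalised more heavily
levDisNormalized("recognise", "recognize")  // ≈ 0.89: values near 1 mean "treat as the same word"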
1 : -distance } // Perform sequence alignment From e7a25ea1e1bcf4c75ceae8def4e866a0348a7e91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9rgio=20Est=C3=AAv=C3=A3o?= Date: Wed, 21 May 2025 16:22:44 +0100 Subject: [PATCH 07/11] Update packages --- .../xcshareddata/swiftpm/Package.resolved | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/podcasts.xcworkspace/xcshareddata/swiftpm/Package.resolved b/podcasts.xcworkspace/xcshareddata/swiftpm/Package.resolved index ec2dfa978c..7d3d374e28 100644 --- a/podcasts.xcworkspace/xcshareddata/swiftpm/Package.resolved +++ b/podcasts.xcworkspace/xcshareddata/swiftpm/Package.resolved @@ -42,8 +42,8 @@ "repositoryURL": "https://github.com/Automattic/Automattic-Tracks-iOS", "state": { "branch": null, - "revision": "948c7642009237c74ef30dad59f03835416873eb", - "version": "3.4.2" + "revision": "437f586a62c07c6ceec9baf043237b2afe7ea1ac", + "version": "3.5.2" } }, { @@ -222,8 +222,8 @@ "repositoryURL": "https://github.com/getsentry/sentry-cocoa", "state": { "branch": null, - "revision": "08862789e1cbba7a9561bed69832a9306f339cd3", - "version": "8.29.1" + "revision": "11aee2d3efdd74db8ce51533118500d140210292", + "version": "8.51.0" } }, { @@ -249,8 +249,8 @@ "repositoryURL": "https://github.com/dagronf/SwiftSubtitles", "state": { "branch": null, - "revision": "ed8c19dab44e285b9463000c3bbb31dcea7790e9", - "version": "1.3.0" + "revision": "aaa309326c2b8bfb52b7fdfabb1a43820c2e26b2", + "version": "1.8.2" } }, { From 3247da71e2702cf114c209ca2243f433f2faddcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9rgio=20Est=C3=AAv=C3=A3o?= Date: Thu, 22 May 2025 15:49:31 +0100 Subject: [PATCH 08/11] Update pods --- Gemfile.lock | 65 ++++++++++--------- Podfile.lock | 2 +- podcasts.xcodeproj/project.pbxproj | 23 +++++++ .../xcshareddata/swiftpm/Package.resolved | 9 +++ 4 files changed, 68 insertions(+), 31 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index ed71441c0d..7da7ba9b6f 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -5,18 +5,20 @@ GEM base64 nkf rexml - activesupport (7.1.3.4) + activesupport (7.2.2.1) base64 + benchmark (>= 0.3) bigdecimal - concurrent-ruby (~> 1.0, >= 1.0.2) + concurrent-ruby (~> 1.0, >= 1.3.1) connection_pool (>= 2.2.5) drb i18n (>= 1.6, < 2) + logger (>= 1.4.2) minitest (>= 5.1) - mutex_m - tzinfo (~> 2.0) - addressable (2.8.6) - public_suffix (>= 2.0.2, < 6.0) + securerandom (>= 0.3) + tzinfo (~> 2.0, >= 2.0.5) + addressable (2.8.7) + public_suffix (>= 2.0.2, < 7.0) algoliasearch (1.27.5) httpclient (~> 2.8, >= 2.8.3) json (>= 1.5.1) @@ -41,7 +43,8 @@ GEM aws-eventstream (~> 1, >= 1.0.2) babosa (1.0.4) base64 (0.2.0) - bigdecimal (3.1.8) + benchmark (0.4.0) + bigdecimal (3.1.9) buildkit (1.6.0) sawyer (>= 0.6) chroma (0.2.0) @@ -50,12 +53,12 @@ GEM cork nap open4 (~> 1.3) - cocoapods (1.14.2) + cocoapods (1.16.2) addressable (~> 2.8) claide (>= 1.0.2, < 2.0) - cocoapods-core (= 1.14.2) + cocoapods-core (= 1.16.2) cocoapods-deintegrate (>= 1.0.3, < 2.0) - cocoapods-downloader (>= 2.0) + cocoapods-downloader (>= 2.1, < 3.0) cocoapods-plugins (>= 1.0.0, < 2.0) cocoapods-search (>= 1.0.0, < 2.0) cocoapods-trunk (>= 1.6.0, < 2.0) @@ -67,10 +70,10 @@ GEM molinillo (~> 0.8.0) nap (~> 1.0) ruby-macho (>= 2.3.0, < 3.0) - xcodeproj (>= 1.23.0, < 2.0) + xcodeproj (>= 1.27.0, < 2.0) cocoapods-check (1.1.0) cocoapods (~> 1.0) - cocoapods-core (1.14.2) + cocoapods-core (1.16.2) activesupport (>= 5.0, < 8) addressable (~> 2.8) algoliasearch (~> 1.0) @@ -81,7 +84,7 @@ GEM public_suffix (~> 4.0) typhoeus (~> 1.0) 
cocoapods-deintegrate (1.0.5) - cocoapods-downloader (2.0) + cocoapods-downloader (2.1) cocoapods-plugins (1.0.0) nap cocoapods-search (1.0.1) @@ -94,8 +97,8 @@ GEM commander (4.6.0) highline (~> 2.0.0) commonmarker (0.23.10) - concurrent-ruby (1.3.1) - connection_pool (2.4.1) + concurrent-ruby (1.3.5) + connection_pool (2.5.3) cork (0.3.0) colored2 (~> 3.1) danger (9.4.3) @@ -127,7 +130,7 @@ GEM rake (>= 12.0.0, < 14.0.0) domain_name (0.6.20240107) dotenv (2.8.1) - drb (2.2.1) + drb (2.2.3) emoji_regex (3.2.3) escape (0.0.4) ethon (0.16.0) @@ -225,7 +228,8 @@ GEM rake (>= 12.3, < 14.0) rake-compiler (~> 1.0) xcodeproj (~> 1.22) - ffi (1.16.3) + ffi (1.17.2) + ffi (1.17.2-arm64-darwin) fourflusher (2.3.1) fuzzy_match (2.0.4) gh_inspector (1.1.3) @@ -271,12 +275,13 @@ GEM highline (2.0.3) http-cookie (1.0.6) domain_name (~> 0.5) - httpclient (2.8.3) - i18n (1.14.5) + httpclient (2.9.0) + mutex_m + i18n (1.14.7) concurrent-ruby (~> 1.0) java-properties (0.3.0) jmespath (1.6.2) - json (2.7.2) + json (2.12.0) jwt (2.8.1) base64 kramdown (2.4.0) @@ -284,15 +289,16 @@ GEM kramdown-parser-gfm (1.1.0) kramdown (~> 2.0) language_server-protocol (3.17.0.3) + logger (1.7.0) mini_magick (4.12.0) mini_mime (1.1.5) mini_portile2 (2.8.7) - minitest (5.23.1) + minitest (5.25.5) molinillo (0.8.0) multi_json (1.15.0) multipart-post (2.4.1) - mutex_m (0.2.0) - nanaimo (0.3.0) + mutex_m (0.3.0) + nanaimo (0.4.0) nap (1.1.0) naturally (2.2.1) netrc (0.11.0) @@ -331,8 +337,7 @@ GEM trailblazer-option (>= 0.1.1, < 0.2.0) uber (< 0.2.0) retriable (3.1.2) - rexml (3.2.8) - strscan (>= 3.0.9) + rexml (3.4.1) rouge (2.0.7) rubocop (1.63.2) json (~> 2.3) @@ -354,6 +359,7 @@ GEM sawyer (0.9.2) addressable (>= 2.3.5) faraday (>= 0.17.3, < 3) + securerandom (0.4.1) security (0.1.5) signet (0.19.0) addressable (~> 2.8) @@ -363,7 +369,6 @@ GEM simctl (1.6.10) CFPropertyList naturally - strscan (3.1.0) terminal-notifier (2.0.0) terminal-table (3.0.2) unicode-display_width (>= 1.1.1, < 3) @@ -372,7 +377,7 @@ GEM tty-screen (0.8.2) tty-spinner (0.9.3) tty-cursor (~> 0.7) - typhoeus (1.4.0) + typhoeus (1.4.1) ethon (>= 0.9.0) tzinfo (2.0.6) concurrent-ruby (~> 1.0) @@ -382,13 +387,13 @@ GEM fastlane (>= 2.0.0, < 3.0.0) terminal-notifier word_wrap (1.0.0) - xcodeproj (1.24.0) + xcodeproj (1.27.0) CFPropertyList (>= 2.3.3, < 4.0) atomos (~> 0.1.3) claide (>= 1.0.2, < 2.0) colored2 (~> 3.1) - nanaimo (~> 0.3.0) - rexml (~> 3.2.4) + nanaimo (~> 0.4.0) + rexml (>= 3.3.6, < 4.0) xcpretty (0.3.0) rouge (~> 2.0.7) xcpretty-travis-formatter (1.0.1) diff --git a/Podfile.lock b/Podfile.lock index 51ae8722e2..6c57e898b9 100644 --- a/Podfile.lock +++ b/Podfile.lock @@ -48,4 +48,4 @@ SPEC CHECKSUMS: PODFILE CHECKSUM: 939ce570484d4733909d71d3cfa9a00c8f5ebc02 -COCOAPODS: 1.14.2 +COCOAPODS: 1.16.2 diff --git a/podcasts.xcodeproj/project.pbxproj b/podcasts.xcodeproj/project.pbxproj index 176301cd83..30e5f581ca 100644 --- a/podcasts.xcodeproj/project.pbxproj +++ b/podcasts.xcodeproj/project.pbxproj @@ -7867,6 +7867,7 @@ 8B1762752B6808F700F44450 /* XCRemoteSwiftPackageReference "JLRoutes" */, 8B1762782B684E7100F44450 /* XCRemoteSwiftPackageReference "Kingfisher" */, FF7F89EB2C2AF53C00FC0ED5 /* XCRemoteSwiftPackageReference "SwiftSubtitles" */, + FFF450222DDF6A8A00272AB8 /* XCRemoteSwiftPackageReference "Ifrit" */, ); productRefGroup = BDBD53ED17019B2A0048C8C5 /* Products */; projectDirPath = ""; @@ -8610,10 +8611,14 @@ inputFileListPaths = ( "${PODS_ROOT}/Target Support 
Files/Pods-podcasts/Pods-podcasts-frameworks-${CONFIGURATION}-input-files.xcfilelist", ); + inputPaths = ( + ); name = "[CP] Embed Pods Frameworks"; outputFileListPaths = ( "${PODS_ROOT}/Target Support Files/Pods-podcasts/Pods-podcasts-frameworks-${CONFIGURATION}-output-files.xcfilelist", ); + outputPaths = ( + ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-podcasts/Pods-podcasts-frameworks.sh\"\n"; @@ -8645,10 +8650,14 @@ inputFileListPaths = ( "${PODS_ROOT}/Target Support Files/Pods-PocketCastsTests/Pods-PocketCastsTests-frameworks-${CONFIGURATION}-input-files.xcfilelist", ); + inputPaths = ( + ); name = "[CP] Embed Pods Frameworks"; outputFileListPaths = ( "${PODS_ROOT}/Target Support Files/Pods-PocketCastsTests/Pods-PocketCastsTests-frameworks-${CONFIGURATION}-output-files.xcfilelist", ); + outputPaths = ( + ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-PocketCastsTests/Pods-PocketCastsTests-frameworks.sh\"\n"; @@ -10182,6 +10191,8 @@ ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; CLANG_ENABLE_MODULES = YES; CODE_SIGN_ENTITLEMENTS = podcasts/podcastsDebug.entitlements; + "CODE_SIGN_IDENTITY[sdk=iphoneos*]" = "iPhone Developer"; + "DEVELOPMENT_TEAM[sdk=iphoneos*]" = PZYM8XX95Q; ENABLE_BITCODE = NO; FRAMEWORK_SEARCH_PATHS = ( "$(inherited)", @@ -10196,6 +10207,7 @@ ); PRODUCT_NAME = "$(TARGET_NAME)"; PROVISIONING_PROFILE_SPECIFIER = "Pocket Casts Development"; + "PROVISIONING_PROFILE_SPECIFIER[sdk=iphoneos*]" = "Pocket Casts Development"; SWIFT_OBJC_BRIDGING_HEADER = "podcasts/podcasts-Bridging-Header.h"; WRAPPER_EXTENSION = app; }; @@ -11567,6 +11579,8 @@ ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; CLANG_ENABLE_MODULES = YES; CODE_SIGN_ENTITLEMENTS = podcasts/podcastsDebug.entitlements; + "CODE_SIGN_IDENTITY[sdk=iphoneos*]" = "iPhone Developer"; + "DEVELOPMENT_TEAM[sdk=iphoneos*]" = PZYM8XX95Q; ENABLE_BITCODE = NO; FRAMEWORK_SEARCH_PATHS = ( "$(inherited)", @@ -11581,6 +11595,7 @@ ); PRODUCT_NAME = "$(TARGET_NAME)"; PROVISIONING_PROFILE_SPECIFIER = "Pocket Casts Development"; + "PROVISIONING_PROFILE_SPECIFIER[sdk=iphoneos*]" = "Pocket Casts Development"; SWIFT_OBJC_BRIDGING_HEADER = "podcasts/podcasts-Bridging-Header.h"; WRAPPER_EXTENSION = app; }; @@ -11978,6 +11993,14 @@ minimumVersion = 1.3.0; }; }; + FFF450222DDF6A8A00272AB8 /* XCRemoteSwiftPackageReference "Ifrit" */ = { + isa = XCRemoteSwiftPackageReference; + repositoryURL = "https://github.com/ukushu/Ifrit.git"; + requirement = { + branch = main; + kind = branch; + }; + }; /* End XCRemoteSwiftPackageReference section */ /* Begin XCSwiftPackageProductDependency section */ diff --git a/podcasts.xcworkspace/xcshareddata/swiftpm/Package.resolved b/podcasts.xcworkspace/xcshareddata/swiftpm/Package.resolved index 7d3d374e28..0bcd6c9ee0 100644 --- a/podcasts.xcworkspace/xcshareddata/swiftpm/Package.resolved +++ b/podcasts.xcworkspace/xcshareddata/swiftpm/Package.resolved @@ -154,6 +154,15 @@ "version": "4.1.1" } }, + { + "package": "Ifrit", + "repositoryURL": "https://github.com/ukushu/Ifrit.git", + "state": { + "branch": "main", + "revision": "5cb8badc28c7a4b7f59a05b8f0da37c363d25b6a", + "version": null + } + }, { "package": "InteropForGoogle", "repositoryURL": "https://github.com/google/interop-ios-for-google-sdks.git", From 27866d2967066233a8967681772bf0c1d94a5806 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9rgio=20Est=C3=AAv=C3=A3o?= Date: Thu, 22 May 2025 15:49:38 +0100 Subject: 
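Patch 08 wires the Ifrit package into the Xcode project as a branch-tracking Swift Package reference (XCRemoteSwiftPackageReference with kind = branch), backing the import Ifrit added in patch 04. For reference, a hypothetical Package.swift manifest (not part of the patch) would declare the same dependency like this:

// swift-tools-version:5.9
import PackageDescription

let package = Package(
    name: "Example",
    dependencies: [
        .package(url: "https://github.com/ukushu/Ifrit.git", branch: "main")
    ]
)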
[PATCH 09/11] Remove unused import --- podcasts/TranscriptsViewController.swift | 1 - 1 file changed, 1 deletion(-) diff --git a/podcasts/TranscriptsViewController.swift b/podcasts/TranscriptsViewController.swift index ae4289f5d2..11450f9730 100644 --- a/podcasts/TranscriptsViewController.swift +++ b/podcasts/TranscriptsViewController.swift @@ -1,7 +1,6 @@ import Foundation import Speech import UIKit -import Ifrit class TranscriptsViewController: PlayerItemViewController { From db535e305d8d44cdce4cda018d253df92645fa84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9rgio=20Est=C3=AAv=C3=A3o?= Date: Fri, 23 May 2025 15:52:27 +0100 Subject: [PATCH 10/11] Update changes to separate files and extensions --- podcasts.xcodeproj/project.pbxproj | 20 +- podcasts/AudioReadSpeechRecognitionTask.swift | 386 ++++++++++++++++++ podcasts/AudioReadTask.swift | 49 +-- podcasts/TranscriptModel.swift | 10 +- podcasts/TranscriptSyncChanges.swift | 70 ++++ podcasts/TranscriptsViewController.swift | 226 ---------- 6 files changed, 473 insertions(+), 288 deletions(-) create mode 100644 podcasts/AudioReadSpeechRecognitionTask.swift create mode 100644 podcasts/TranscriptSyncChanges.swift delete mode 100644 podcasts/TranscriptsViewController.swift diff --git a/podcasts.xcodeproj/project.pbxproj b/podcasts.xcodeproj/project.pbxproj index 30e5f581ca..16cafe274b 100644 --- a/podcasts.xcodeproj/project.pbxproj +++ b/podcasts.xcodeproj/project.pbxproj @@ -1660,7 +1660,7 @@ FF4E93122BDBF0D800F370AA /* MiniPlayerGradientView.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF4E93112BDBF0D800F370AA /* MiniPlayerGradientView.swift */; }; FF5B2A442BB1859B009F3DC2 /* SourceInterfaceNavigationView.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF5B2A432BB1859B009F3DC2 /* SourceInterfaceNavigationView.swift */; }; FF5B2A462BB189C7009F3DC2 /* PocketCastsApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF5B2A452BB189C7009F3DC2 /* PocketCastsApp.swift */; }; - FF7F89EA2C2979D900FC0ED5 /* TranscriptsViewController.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF7F89E92C2979D900FC0ED5 /* TranscriptsViewController.swift */; }; + FF7F89EA2C2979D900FC0ED5 /* TranscriptSyncChanges.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF7F89E92C2979D900FC0ED5 /* TranscriptSyncChanges.swift */; }; FF7F89ED2C2AF6DE00FC0ED5 /* TranscriptManager.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF7F89EC2C2AF6DE00FC0ED5 /* TranscriptManager.swift */; }; FF7F89EF2C2AF7B600FC0ED5 /* SwiftSubtitles in Frameworks */ = {isa = PBXBuildFile; productRef = FF7F89EE2C2AF7B600FC0ED5 /* SwiftSubtitles */; }; FF7F89F12C2C0FD600FC0ED5 /* TranscriptModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF7F89F02C2C0FD600FC0ED5 /* TranscriptModel.swift */; }; @@ -1673,6 +1673,7 @@ FFAA063E2C086DEA00FBC38F /* InsetAdjuster.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFAA063D2C086DEA00FBC38F /* InsetAdjuster.swift */; }; FFC293992B6173400059F3BB /* IAPTypes.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFC293982B6173400059F3BB /* IAPTypes.swift */; }; FFD3AB8C2BD15E8F00C562CB /* CircleView.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFD3AB8B2BD15E8F00C562CB /* CircleView.swift */; }; + FFDB38302DE0B88200911B51 /* AudioReadSpeechRecognitionTask.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFDB382F2DE0B88200911B51 /* AudioReadSpeechRecognitionTask.swift */; }; FFF024CE2B62AC9400457373 /* IAPHelperTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFF024CD2B62AC9400457373 /* 
IAPHelperTests.swift */; }; FFF024CF2B62B13A00457373 /* Pocket Casts Configuration.storekit in Resources */ = {isa = PBXBuildFile; fileRef = C7547F77286F571900DC1C9E /* Pocket Casts Configuration.storekit */; }; FFF17EC42B97930700E116C8 /* BookmarksProfileListController.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFF17EC32B97930700E116C8 /* BookmarksProfileListController.swift */; }; @@ -3479,7 +3480,7 @@ FF57373C2B4EB5B100F511C7 /* README.md */ = {isa = PBXFileReference; lastKnownFileType = net.daringfireball.markdown; path = README.md; sourceTree = ""; }; FF5B2A432BB1859B009F3DC2 /* SourceInterfaceNavigationView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SourceInterfaceNavigationView.swift; sourceTree = ""; }; FF5B2A452BB189C7009F3DC2 /* PocketCastsApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PocketCastsApp.swift; sourceTree = ""; }; - FF7F89E92C2979D900FC0ED5 /* TranscriptsViewController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TranscriptsViewController.swift; sourceTree = ""; }; + FF7F89E92C2979D900FC0ED5 /* TranscriptSyncChanges.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TranscriptSyncChanges.swift; sourceTree = ""; }; FF7F89EC2C2AF6DE00FC0ED5 /* TranscriptManager.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TranscriptManager.swift; sourceTree = ""; }; FF7F89F02C2C0FD600FC0ED5 /* TranscriptModel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TranscriptModel.swift; sourceTree = ""; }; FF8970752B5FFC5E004ADB23 /* SubscriptionPriceAndOfferView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SubscriptionPriceAndOfferView.swift; sourceTree = ""; }; @@ -3491,6 +3492,7 @@ FFAA063D2C086DEA00FBC38F /* InsetAdjuster.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InsetAdjuster.swift; sourceTree = ""; }; FFC293982B6173400059F3BB /* IAPTypes.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = IAPTypes.swift; sourceTree = ""; }; FFD3AB8B2BD15E8F00C562CB /* CircleView.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = CircleView.swift; sourceTree = ""; }; + FFDB382F2DE0B88200911B51 /* AudioReadSpeechRecognitionTask.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = AudioReadSpeechRecognitionTask.swift; sourceTree = ""; }; FFF024CD2B62AC9400457373 /* IAPHelperTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = IAPHelperTests.swift; sourceTree = ""; }; FFF17EC32B97930700E116C8 /* BookmarksProfileListController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BookmarksProfileListController.swift; sourceTree = ""; }; FFF17EC52B979B5500E116C8 /* BookmarksProfileListView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BookmarksProfileListView.swift; sourceTree = ""; }; @@ -5109,6 +5111,7 @@ children = ( BDF15A411B54E088000EC323 /* EffectsPlayer.swift */, BD4098791B9ECA5F007F36BD /* AudioReadTask.swift */, + FFDB382F2DE0B88200911B51 /* AudioReadSpeechRecognitionTask.swift */, BD40987F1B9EFE3C007F36BD /* AudioPlayTask.swift */, BD4098811B9EFF6E007F36BD /* PlayBufferManager.swift */, BD40987B1B9ED731007F36BD /* SynchronizedAudioStack.swift */, @@ -7456,7 +7459,7 @@ FF7F89E82C2979C000FC0ED5 
/* Transcripts */ = { isa = PBXGroup; children = ( - FF7F89E92C2979D900FC0ED5 /* TranscriptsViewController.swift */, + FF7F89E92C2979D900FC0ED5 /* TranscriptSyncChanges.swift */, FF7F89EC2C2AF6DE00FC0ED5 /* TranscriptManager.swift */, FF7F89F02C2C0FD600FC0ED5 /* TranscriptModel.swift */, ); @@ -8611,14 +8614,10 @@ inputFileListPaths = ( "${PODS_ROOT}/Target Support Files/Pods-podcasts/Pods-podcasts-frameworks-${CONFIGURATION}-input-files.xcfilelist", ); - inputPaths = ( - ); name = "[CP] Embed Pods Frameworks"; outputFileListPaths = ( "${PODS_ROOT}/Target Support Files/Pods-podcasts/Pods-podcasts-frameworks-${CONFIGURATION}-output-files.xcfilelist", ); - outputPaths = ( - ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-podcasts/Pods-podcasts-frameworks.sh\"\n"; @@ -8650,14 +8649,10 @@ inputFileListPaths = ( "${PODS_ROOT}/Target Support Files/Pods-PocketCastsTests/Pods-PocketCastsTests-frameworks-${CONFIGURATION}-input-files.xcfilelist", ); - inputPaths = ( - ); name = "[CP] Embed Pods Frameworks"; outputFileListPaths = ( "${PODS_ROOT}/Target Support Files/Pods-PocketCastsTests/Pods-PocketCastsTests-frameworks-${CONFIGURATION}-output-files.xcfilelist", ); - outputPaths = ( - ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-PocketCastsTests/Pods-PocketCastsTests-frameworks.sh\"\n"; @@ -8932,7 +8927,7 @@ BD998AD327B3430700B38857 /* ColorPreviewFolderView.swift in Sources */, BD7C1FFD237D16C600B3353B /* PCAlwaysVisibleCastBtn.swift in Sources */, BD9324712398BB50004F19A1 /* TourView.swift in Sources */, - FF7F89EA2C2979D900FC0ED5 /* TranscriptsViewController.swift in Sources */, + FF7F89EA2C2979D900FC0ED5 /* TranscriptSyncChanges.swift in Sources */, BDC5360A27C88A5700EFCF31 /* FolderPreviewWrapper.swift in Sources */, 8B5AB48E2901A8BD0018C637 /* StoriesController.swift in Sources */, BD6D4187200C802900CA8993 /* PodcastViewController+NetworkLoad.swift in Sources */, @@ -9033,6 +9028,7 @@ C7318EA42A61E40800EAFA9C /* BookmarkEditTitleView.swift in Sources */, BDF4D30B2175A9E90086463E /* StarredViewController.swift in Sources */, C75BB06829B954B100F2DF63 /* EpisodeLoadingController.swift in Sources */, + FFDB38302DE0B88200911B51 /* AudioReadSpeechRecognitionTask.swift in Sources */, 40B118EB2153B8CA000932C9 /* RadioButtonCell.swift in Sources */, BDEAA1861BB144AD001097D9 /* DisclosureCell.swift in Sources */, BDD40A1A1FA1AF7900A53AE1 /* TintableImageView.swift in Sources */, diff --git a/podcasts/AudioReadSpeechRecognitionTask.swift b/podcasts/AudioReadSpeechRecognitionTask.swift new file mode 100644 index 0000000000..a4877e3c23 --- /dev/null +++ b/podcasts/AudioReadSpeechRecognitionTask.swift @@ -0,0 +1,386 @@ +import AVFoundation +import PocketCastsDataModel +import PocketCastsServer +import PocketCastsUtils +import Speech + +class AudioReadSpeechRecognitionTask { + private let maxSilenceAmountToSave = 1000 + + private var minRMS = 0.005 as Float32 + private var minGapSizeInFrames = 3 + private var amountOfSilentFramesToReInsert = 1 + + private let cancelled = AtomicBool() + + private let readQueue: DispatchQueue + private let lock = NSObject() + + private var trimSilence: TrimSilenceAmount = .off + + private var audioFile: AVAudioFile + private var outputFormat: AVAudioFormat + private var bufferManager: PlayBufferManager + + private let bufferLength = UInt32(Constants.Audio.defaultFrameSize) + private let bufferByteSize = Float32(MemoryLayout.size) + + 
private var foundGap = false + private var channelCount = 0 as UInt32 + private var buffersSavedDuringGap = SynchronizedAudioStack() + private var fadeInNextFrame = true + private var cachedFrameCount = 0 as Int64 + + private var currentFramePosition: AVAudioFramePosition = 0 + private let endOfFileSemaphore = DispatchSemaphore(value: 0) + + private lazy var request: SFSpeechAudioBufferRecognitionRequest? = { + let request = SFSpeechAudioBufferRecognitionRequest() + request.shouldReportPartialResults = false + request.requiresOnDeviceRecognition = true + request.taskHint = .dictation + return request + }() + + private var task: SFSpeechRecognitionTask? + private lazy var recognizer: SFSpeechRecognizer? = { + let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US")) + recognizer?.defaultTaskHint = .dictation + return recognizer + }() + + init(trimSilence: TrimSilenceAmount, audioFile: AVAudioFile, outputFormat: AVAudioFormat, bufferManager: PlayBufferManager, playPositionHint: TimeInterval, frameCount: Int64) { + self.trimSilence = .off + self.audioFile = audioFile + self.outputFormat = outputFormat + self.bufferManager = bufferManager + cachedFrameCount = frameCount + + readQueue = DispatchQueue(label: "au.com.pocketcasts.ReadQueue", qos: .default, attributes: [], autoreleaseFrequency: .never, target: nil) + + updateRemoveSilenceNumbers() + + if playPositionHint > 0 { + currentFramePosition = framePositionForTime(playPositionHint).framePosition + audioFile.framePosition = currentFramePosition + } + } + + var offset: TimeInterval = 0 + + func startup() { + readQueue.async { [weak self] in + guard let self = self else { return } + + guard let recognizer, let request else { return } + + offset = PlaybackManager.shared.currentTime() + print("$$ Starting task") + task = recognizer.recognitionTask(with: request, resultHandler: { [weak self] result, error in + self?.recognitionHandler(result: result, error: error) + }) + + // there are some Core Audio errors that aren't marked as throws in the Swift code, so they'll crash the app + // that's why we have an Objective-C try/catch block here to catch them (see https://github.com/shiftyjelly/pocketcasts-ios/issues/1493 for more details) + do { + try SJCommonUtils.catchException { [weak self] in + guard let self = self else { return } + + do { + while !self.cancelled.value { + // nil is returned when there are playback errors or us getting to the end of a file, sleep so we don't end up in a tight loop but these all set the cancelled flag + guard let audioBuffers = try self.readFromFile() else { + Thread.sleep(forTimeInterval: 0.1) + continue + } + + for buffer in audioBuffers { + self.scheduleForPlayback(buffer: buffer) + } + } + } catch { + self.bufferManager.readErrorOccurred.value = true + FileLog.shared.addMessage("Audio Read failed (Swift): \(error.localizedDescription)") + } + } + } catch { + self.bufferManager.readErrorOccurred.value = true + FileLog.shared.addMessage("Audio Read failed (obj-c): \(error.localizedDescription)") + } + } + } + + nonisolated private func recognitionHandler(result: SFSpeechRecognitionResult?, error: Error?) { + let receivedFinalResult = result?.isFinal ?? 
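The new AudioReadSpeechRecognitionTask pins the recognizer to en-US and sets requiresOnDeviceRecognition = true unconditionally. A small sketch of the availability check such a setup usually needs (assumed guard, not in the patch), since on-device recognition is not supported for every locale or device:

import Speech

func makeOnDeviceRequest(locale: Locale) -> SFSpeechAudioBufferRecognitionRequest? {
    guard let recognizer = SFSpeechRecognizer(locale: locale), recognizer.isAvailable else {
        return nil   // no recognizer for this locale, or it is currently unavailable
    }
    let request = SFSpeechAudioBufferRecognitionRequest()
    request.shouldReportPartialResults = false
    // Only force on-device processing when the recognizer actually supports it.
    request.requiresOnDeviceRecognition = recognizer.supportsOnDeviceRecognition
    return request
}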
false + let receivedError = error != nil + + if result?.isFinal == true { + print("$$ SFSpeechRecognition finished") + } + + if let error { + print("$$ SFSpeechRecognition error \(error)") + } + + if let result { + NotificationCenter.postOnMainThread(notification: Constants.Notifications.speechToTextAvailable, userInfo: ["text": result.bestTranscription, "offset": offset]) + } + } + + func shutdown() { + cancelled.value = true + bufferManager.bufferSemaphore.signal() + endOfFileSemaphore.signal() + } + + func setTrimSilence(_ trimSilence: TrimSilenceAmount) { + objc_sync_enter(lock) + defer { objc_sync_exit(lock) } + + self.trimSilence = trimSilence + updateRemoveSilenceNumbers() + } + + private func updateRemoveSilenceNumbers() { + guard trimSilence != .off else { return } + + minGapSizeInFrames = gapSizeForSilenceAmount() + amountOfSilentFramesToReInsert = framesToReInsertForSilenceAmount() + minRMS = minRMSForSilenceAmount() + } + + func seekTo(_ time: TimeInterval, completion: ((Bool) -> Void)?) { + DispatchQueue.global(qos: .default).async { () in + let seekResult = self.performSeek(time) + self.bufferManager.bufferSemaphore.signal() + completion?(seekResult) + } + } + + private func performSeek(_ time: TimeInterval) -> Bool { + objc_sync_enter(lock) + defer { objc_sync_exit(lock) } + + let positionRequired = framePositionForTime(time) + var seekedToEnd = false + + if positionRequired.passedEndOfFile { + bufferManager.removeAll() + bufferManager.readToEOFSuccessfully.value = true + + seekedToEnd = true + } else { + currentFramePosition = positionRequired.framePosition + audioFile.framePosition = currentFramePosition + bufferManager.aboutToSeek() + foundGap = false + buffersSavedDuringGap.removeAll() + fadeInNextFrame = true + + // if we've finished reading this file, wake the reading thread back up + if bufferManager.readToEOFSuccessfully.value { + endOfFileSemaphore.signal() + } + } + + return seekedToEnd + } + + private func handleReachedEndOfFile() { + bufferManager.readToEOFSuccessfully.value = true + + // we've read to the end but the player won't yet have played to the end, wait til it signals us that it has + endOfFileSemaphore.wait() + } + + private func readFromFile() throws -> [BufferedAudio]? { + objc_sync_enter(lock) + + // are we at the end of the file? + currentFramePosition = audioFile.framePosition + if currentFramePosition >= cachedFrameCount { + objc_sync_exit(lock) + handleReachedEndOfFile() + + return nil + } + + let audioPCMBuffer = AVAudioPCMBuffer(pcmFormat: outputFormat, frameCapacity: bufferLength) + do { + try audioFile.read(into: audioPCMBuffer!) + } catch { + objc_sync_exit(lock) + throw PlaybackError.errorDuringPlayback + } + + // check that we actually read something + if audioPCMBuffer?.frameLength == 0 { + objc_sync_exit(lock) + handleReachedEndOfFile() + + return nil + } + + currentFramePosition = audioFile.framePosition + fadeInNextFrame = false + if channelCount == 0 { channelCount = (audioPCMBuffer?.audioBufferList.pointee.mNumberBuffers)! 
} + + if channelCount == 0 { + bufferManager.readErrorOccurred.value = true + cancelled.value = true + objc_sync_exit(lock) + + return nil + } + + // iOS 16 has an issue in which if the conditions below are met, the playback will fail: + // Audio file has a single channel and spatial audio is enabled + // In order to prevent this issue, we convert a mono buffer to stereo buffer + // For more info, see: https://github.com/Automattic/pocket-casts-ios/issues/62 + var audioBuffer: BufferedAudio + if #available(iOS 16, *), + let audioPCMBuffer = audioPCMBuffer, + audioPCMBuffer.audioBufferList.pointee.mNumberBuffers == 1, + let twoChannelsFormat = AVAudioFormat(standardFormatWithSampleRate: audioFile.processingFormat.sampleRate, channels: 2), + let twoChannnelBuffer = AVAudioPCMBuffer(pcmFormat: twoChannelsFormat, frameCapacity: audioPCMBuffer.frameCapacity) { + let converter = AVAudioConverter(from: audioFile.processingFormat, to: twoChannelsFormat) + try? converter?.convert(to: twoChannnelBuffer, from: audioPCMBuffer) + audioBuffer = BufferedAudio(audioBuffer: twoChannnelBuffer, framePosition: currentFramePosition, shouldFadeOut: false, shouldFadeIn: fadeInNextFrame) + } else { + audioBuffer = BufferedAudio(audioBuffer: audioPCMBuffer!, framePosition: currentFramePosition, shouldFadeOut: false, shouldFadeIn: fadeInNextFrame) + } + + var buffers = [BufferedAudio]() + if trimSilence != .off { + guard let bufferListPointer = UnsafeMutableAudioBufferListPointer(audioPCMBuffer?.mutableAudioBufferList) else { + buffers.append(audioBuffer) + objc_sync_exit(lock) + + return buffers + } + + let currPosition = currentFramePosition / Int64(audioFile.fileFormat.sampleRate) + let totalDuration = cachedFrameCount / Int64(audioFile.fileFormat.sampleRate) + let timeLeft = totalDuration - currPosition + var rms: Float32 = 0 + if timeLeft <= 5 { + // don't trim silence from the last 5 seconds + rms = 1 + } else { + rms = (channelCount == 1) ? AudioUtils.calculateRms(bufferListPointer[0]) : AudioUtils.calculateStereoRms(bufferListPointer[0], rightBuffer: bufferListPointer[1]) + } + + if rms > minRMS, !foundGap { + // the RMS is higher than our minimum and we aren't currently in a gap, just play it + buffers.append(audioBuffer) + } else if foundGap, rms > minRMS || buffersSavedDuringGap.count() > maxSilenceAmountToSave { + foundGap = false + // we've come to the end of a gap (or we've had a suspiscious amount of gap), piece back together the audio + if buffersSavedDuringGap.count() < minGapSizeInFrames { + // we don't have enough gap to remove, just push + while buffersSavedDuringGap.canPop() { + buffers.append(buffersSavedDuringGap.pop()!) + } + + buffers.append(audioBuffer) + } else { + for index in 0 ... amountOfSilentFramesToReInsert { + if index < amountOfSilentFramesToReInsert { + buffers.append(buffersSavedDuringGap.pop()!) + } else { + // fade out the last frame to avoid a jarring re-attach + let buffer = buffersSavedDuringGap.pop()! + AudioUtils.fadeAudio(buffer, fadeOut: true, channelCount: channelCount) + buffers.append(buffer) + } + } + + // pop all the ones we don't need after that + while buffersSavedDuringGap.canPop(), buffersSavedDuringGap.count() > (amountOfSilentFramesToReInsert - 1) { + _ = buffersSavedDuringGap.pop() + let secondsSaved = Double((audioPCMBuffer?.frameLength)!) / audioFile.fileFormat.sampleRate + StatsManager.shared.addTimeSavedDynamicSpeed(secondsSaved) + } + + while buffersSavedDuringGap.canPop() { + buffers.append(buffersSavedDuringGap.pop()!) 
+ } + + // fade back in the new frame + AudioUtils.fadeAudio(audioBuffer, fadeOut: false, channelCount: channelCount) + buffers.append(audioBuffer) + } + } else if rms < minRMS, !foundGap { + // we are at the start of a gap, save this clip and keep going + foundGap = true + buffersSavedDuringGap.push(audioBuffer) + } else if rms < minRMS, foundGap { + // we are inside a gap we've already found + buffersSavedDuringGap.push(audioBuffer) + } + } else { + buffers.append(audioBuffer) + } + + objc_sync_exit(lock) + return buffers + } + + private func scheduleForPlayback(buffer: BufferedAudio) { + // the play task will signal us when it needs more buffer, but it will keep signalling as long as the buffer is low, so keep calling wait until we get below the high point + while !cancelled.value, bufferManager.bufferLength() >= bufferManager.highBufferPoint { + bufferManager.bufferSemaphore.wait() + } + + if !cancelled.value { + bufferManager.push(buffer) + } + + request?.append(buffer.audioBuffer) + } + + private func gapSizeForSilenceAmount() -> Int { + switch trimSilence { + case .low: + return 20 + case .medium: + return 16 + case .high: + return 4 + case .off: + return 0 + } + } + + private func framesToReInsertForSilenceAmount() -> Int { + switch trimSilence { + case .low: + return 14 + case .medium: + return 12 + case .high, .off: + return 0 + } + } + + private func minRMSForSilenceAmount() -> Float32 { + switch trimSilence { + case .low: + return 0.0055 + case .medium: + return 0.00511 + case .high: + return 0.005 + case .off: + return 0 + } + } + + private func framePositionForTime(_ time: TimeInterval) -> (framePosition: Int64, passedEndOfFile: Bool) { + let totalFrames = Double(cachedFrameCount) + let totalSeconds = totalFrames / audioFile.fileFormat.sampleRate + let percentSeek = time / totalSeconds + + return (Int64(totalFrames * percentSeek), percentSeek >= 1) + } +} diff --git a/podcasts/AudioReadTask.swift b/podcasts/AudioReadTask.swift index 1ddc21fe0c..cf1ccc3ed4 100644 --- a/podcasts/AudioReadTask.swift +++ b/podcasts/AudioReadTask.swift @@ -2,7 +2,6 @@ import AVFoundation import PocketCastsDataModel import PocketCastsServer import PocketCastsUtils -import Speech class AudioReadTask { private let maxSilenceAmountToSave = 1000 @@ -34,23 +33,8 @@ class AudioReadTask { private var currentFramePosition: AVAudioFramePosition = 0 private let endOfFileSemaphore = DispatchSemaphore(value: 0) - private lazy var request: SFSpeechAudioBufferRecognitionRequest? = { - let request = SFSpeechAudioBufferRecognitionRequest() - request.shouldReportPartialResults = false - request.requiresOnDeviceRecognition = true - request.taskHint = .dictation - return request - }() - - private var task: SFSpeechRecognitionTask? - private lazy var recognizer: SFSpeechRecognizer? 
= { - let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US")) - recognizer?.defaultTaskHint = .dictation - return recognizer - }() - init(trimSilence: TrimSilenceAmount, audioFile: AVAudioFile, outputFormat: AVAudioFormat, bufferManager: PlayBufferManager, playPositionHint: TimeInterval, frameCount: Int64) { - self.trimSilence = .off + self.trimSilence = trimSilence self.audioFile = audioFile self.outputFormat = outputFormat self.bufferManager = bufferManager @@ -66,20 +50,10 @@ class AudioReadTask { } } - var offset: TimeInterval = 0 - func startup() { readQueue.async { [weak self] in guard let self = self else { return } - guard let recognizer, let request else { return } - - offset = PlaybackManager.shared.currentTime() - print("$$ Starting task") - task = recognizer.recognitionTask(with: request, resultHandler: { [weak self] result, error in - self?.recognitionHandler(result: result, error: error) - }) - // there are some Core Audio errors that aren't marked as throws in the Swift code, so they'll crash the app // that's why we have an Objective-C try/catch block here to catch them (see https://github.com/shiftyjelly/pocketcasts-ios/issues/1493 for more details) do { @@ -110,23 +84,6 @@ class AudioReadTask { } } - nonisolated private func recognitionHandler(result: SFSpeechRecognitionResult?, error: Error?) { - let receivedFinalResult = result?.isFinal ?? false - let receivedError = error != nil - - if result?.isFinal == true { - print("$$ SFSpeechRecognition finished") - } - - if let error { - print("$$ SFSpeechRecognition error \(error)") - } - - if let result { - NotificationCenter.postOnMainThread(notification: Constants.Notifications.speechToTextAvailable, userInfo: ["text": result.bestTranscription, "offset": offset]) - } - } - func shutdown() { cancelled.value = true bufferManager.bufferSemaphore.signal() @@ -334,9 +291,7 @@ class AudioReadTask { if !cancelled.value { bufferManager.push(buffer) - } - - request?.append(buffer.audioBuffer) + } } private func gapSizeForSilenceAmount() -> Int { diff --git a/podcasts/TranscriptModel.swift b/podcasts/TranscriptModel.swift index e357599704..33d0a72c1e 100644 --- a/podcasts/TranscriptModel.swift +++ b/podcasts/TranscriptModel.swift @@ -79,15 +79,19 @@ class TranscriptModel: @unchecked Sendable { self.cues.first { $0.contains(timeInSeconds: secondsValue) } } + /// MARK - transcript sync extras: + var timestamps: [(TimeInterval, TimeInterval)] = [] + + var words: [Word] = [] + var allSpeechToText: [String] = [] { didSet { print("$$ \(allSpeechToText.joined(separator: " "))") print("$$") } } - var timestamps: [(TimeInterval, TimeInterval)] = [] - - var words: [Word] = [] +} +extension TranscriptModel { public func firstWord(containing secondsValue: TimeInterval) -> Word? { words diff --git a/podcasts/TranscriptSyncChanges.swift b/podcasts/TranscriptSyncChanges.swift new file mode 100644 index 0000000000..ef69398929 --- /dev/null +++ b/podcasts/TranscriptSyncChanges.swift @@ -0,0 +1,70 @@ +import Speech + +class TranscriptSyncChanges { + + init() { + + } + + var transcript: TranscriptModel? 
+    var transcriptView: UITextView = UITextView()
+    var playbackManager: PlaybackManager = PlaybackManager.shared
+
+    private func styleText(transcript: TranscriptModel, position: Double = 0) -> NSAttributedString {
+        if let range = transcript.firstWord(containing: position)?.characterRange {
+        }
+        return NSAttributedString()
+    }
+
+    private func addObservers() {
+        //addCustomObserver(Constants.Notifications.speechToTextAvailable, selector: #selector(receivedSpeechToTextContent))
+    }
+
+    var offset: TimeInterval = 0
+
+    @objc private func receivedSpeechToTextContent(notification: NSNotification) {
+        guard let text = notification.userInfo?["text"] as? SFTranscription,
+              let offset = notification.userInfo?["offset"] as? TimeInterval else { return }
+
+        self.offset = offset
+
+        DispatchQueue.global().async {
+            self.transcript?.wordByWord(speechToText: text)
+        }
+    }
+
+    @objc private func updateTranscriptPosition() {
+        let position = playbackManager.currentTime() - offset
+        guard let transcript else {
+            return
+        }
+
+        if let word = transcript.firstWord(containing: position) {
+//            print(transcript.rawText[word.characterRange.lowerBound..<word.characterRange.upperBound])
+        }
+    }
+}
+
+extension String {
+    subscript (bounds: CountableClosedRange<Int>) -> String {
+        let start = index(startIndex, offsetBy: bounds.lowerBound)
+        let end = index(startIndex, offsetBy: bounds.upperBound)
+        return String(self[start...end])
+    }
+
+    subscript (bounds: CountableRange<Int>) -> String {
+        let start = index(startIndex, offsetBy: bounds.lowerBound)
+        let end = index(startIndex, offsetBy: bounds.upperBound)
+        return String(self[start..<end])
+    }
+}
+
+extension Array {
+    subscript (bounds: Range<Int>) -> ArraySlice<Element> {
+        indices.contains(bounds.upperBound) && indices.contains(bounds.lowerBound) ? self[bounds] : []
+    }
+}
diff --git a/podcasts/TranscriptsViewController.swift b/podcasts/TranscriptsViewController.swift
deleted file mode 100644
index 11450f9730..0000000000
--- a/podcasts/TranscriptsViewController.swift
+++ /dev/null
@@ -1,226 +0,0 @@
-import Foundation
-import Speech
-import UIKit
-
-class TranscriptsViewController: PlayerItemViewController {
-
-    let playbackManager: PlaybackManager
-    var transcript: TranscriptModel?
-    var previousRange: NSRange?
-
-    init(playbackManager: PlaybackManager) {
-        self.playbackManager = playbackManager
-        super.init()
-    }
-
-    required override init(nibName nibNameOrNil: String?, bundle nibBundleOrNil: Bundle?)
{ - self.playbackManager = PlaybackManager.shared - super.init(nibName: nil, bundle: nil) - } - - required init?(coder: NSCoder) { - fatalError("init(coder:) has not been implemented") - } - - public override func viewDidLoad() { - super.viewDidLoad() - setupViews() - } - - private func setupViews() { - view.addSubview(transcriptView) - NSLayoutConstraint.activate( - [ - transcriptView.topAnchor.constraint(equalTo: view.topAnchor), - transcriptView.bottomAnchor.constraint(equalTo: view.bottomAnchor), - transcriptView.leadingAnchor.constraint(equalTo: view.leadingAnchor, constant: 32), - transcriptView.trailingAnchor.constraint(equalTo: view.trailingAnchor, constant: -32) - ] - ) - - view.addSubview(activityIndicatorView) - NSLayoutConstraint.activate( - [ - activityIndicatorView.centerXAnchor.constraint(equalTo: view.centerXAnchor), - activityIndicatorView.centerYAnchor.constraint(equalTo: view.centerYAnchor) - ] - ) - } - - private lazy var transcriptView: UITextView = { - let textView = UITextView() - textView.translatesAutoresizingMaskIntoConstraints = false - textView.font = .systemFont(ofSize: 16) - textView.isEditable = false - textView.showsVerticalScrollIndicator = true - return textView - }() - - private lazy var activityIndicatorView: UIActivityIndicatorView = { - let activityIndicatorView = UIActivityIndicatorView() - activityIndicatorView.style = .medium - activityIndicatorView.hidesWhenStopped = true - activityIndicatorView.translatesAutoresizingMaskIntoConstraints = false - return activityIndicatorView - }() - - override func willBeAddedToPlayer() { - updateColors() - loadTranscript() - addObservers() - } - - override func willBeRemovedFromPlayer() { - removeAllCustomObservers() - } - - override func themeDidChange() { - updateColors() - } - - private func updateColors() { - view.backgroundColor = PlayerColorHelper.playerBackgroundColor01() - transcriptView.backgroundColor = PlayerColorHelper.playerBackgroundColor01() - transcriptView.textColor = ThemeColor.playerContrast02() - transcriptView.indicatorStyle = .white - activityIndicatorView.color = ThemeColor.playerContrast01() - } - - @objc private func update() { - updateColors() - loadTranscript() - } - - private func loadTranscript() { - activityIndicatorView.startAnimating() - Task.detached { [weak self] in - guard let self else { - return - } - let transcriptManager = TranscriptManager(playbackManager: self.playbackManager) - do { - let transcript = try await transcriptManager.loadTranscript() - await show(transcript: transcript) - } catch { - await show(error: error) - } - } - } - - private func show(transcript: TranscriptModel) { - activityIndicatorView.stopAnimating() - self.previousRange = nil - self.transcript = transcript - transcriptView.attributedText = styleText(transcript: transcript) - } - - private func styleText(transcript: TranscriptModel, position: Double = 0) -> NSAttributedString { - let formattedText = NSMutableAttributedString(attributedString: transcript.attributedText) - - let paragraphStyle = NSMutableParagraphStyle() - paragraphStyle.lineHeightMultiple = 1.2 - paragraphStyle.paragraphSpacing = 10 - paragraphStyle.lineBreakMode = .byWordWrapping - - let standardFont = UIFont.systemFont(ofSize: 16) - let highlightFont = UIFont.systemFont(ofSize: 18) - - let normalStyle: [NSAttributedString.Key: Any] = [ - .paragraphStyle: paragraphStyle, - .font: standardFont, - .foregroundColor: ThemeColor.playerContrast02() - ] - - let highlightStyle: [NSAttributedString.Key: Any] = [ - .paragraphStyle: 
paragraphStyle, - .font: highlightFont, - .foregroundColor: ThemeColor.playerContrast01() - ] - - formattedText.addAttributes(normalStyle, range: NSRange(location: 0, length: formattedText.length)) - - if let range = transcript.firstWord(containing: position)?.characterRange { - formattedText.addAttributes(highlightStyle, range: range) - } - - return formattedText - } - - private func show(error: Error) { - activityIndicatorView.stopAnimating() - guard let transcriptError = error as? TranscriptError else { - transcriptView.text = "Transcript unknow error" - return - } - - transcriptView.text = transcriptError.localizedDescription - } - - private func addObservers() { - addCustomObserver(Constants.Notifications.playbackTrackChanged, selector: #selector(update)) - addCustomObserver(Constants.Notifications.playbackProgress, selector: #selector(updateTranscriptPosition)) - addCustomObserver(Constants.Notifications.speechToTextAvailable, selector: #selector(receivedSpeechToTextContent)) - } - - var offset: TimeInterval = 0 - - @objc private func receivedSpeechToTextContent(notification: NSNotification) { - guard let text = notification.userInfo?["text"] as? SFTranscription, - let offset = notification.userInfo?["offset"] as? TimeInterval else { return } - - self.offset = offset - - DispatchQueue.global().async { - self.transcript?.wordByWord(speechToText: text) - } - } - - @objc private func updateTranscriptPosition() { - let position = playbackManager.currentTime() - offset - guard let transcript else { - return - } - - if let word = transcript.firstWord(containing: position) { -// print(transcript.rawText[word.characterRange.lowerBound.. \(cue.endTime)]") -// previousRange = range -// transcriptView.attributedText = styleText(transcript: transcript, position: position) -// // adjusting the scroll to range so it shows more text -// let scrollRange = NSRange(location: range.location, length: range.length * 5) -// transcriptView.scrollRangeToVisible(scrollRange) -// } else if let startTime = transcript.cues.first?.startTime, position < startTime { -// previousRange = nil -// transcriptView.scrollRangeToVisible(NSRange(location: 0, length: 0)) -// } - } -} - -extension String { - subscript (bounds: CountableClosedRange) -> String { - let start = index(startIndex, offsetBy: bounds.lowerBound) - let end = index(startIndex, offsetBy: bounds.upperBound) - return String(self[start...end]) - } - - subscript (bounds: CountableRange) -> String { - let start = index(startIndex, offsetBy: bounds.lowerBound) - let end = index(startIndex, offsetBy: bounds.upperBound) - return String(self[start..) -> ArraySlice { - indices.contains(bounds.upperBound) && indices.contains(bounds.lowerBound) ? 
self[bounds] : [] - } -} From 98a4a4d6c44efaa7a354a0d163113598cff655ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9rgio=20Est=C3=AAv=C3=A3o?= Date: Fri, 23 May 2025 17:07:51 +0100 Subject: [PATCH 11/11] Allow use of SpeechRecognition Task using a FF --- .../Feature Flags/FeatureFlag.swift | 5 +++++ podcasts.xcodeproj/project.pbxproj | 15 +++++++++++---- podcasts/AudioReadSpeechRecognitionTask.swift | 3 ++- podcasts/AudioReadTask.swift | 2 +- podcasts/EffectsPlayer.swift | 15 +++++++++++++-- 5 files changed, 32 insertions(+), 8 deletions(-) diff --git a/Modules/Utils/Sources/PocketCastsUtils/Feature Flags/FeatureFlag.swift b/Modules/Utils/Sources/PocketCastsUtils/Feature Flags/FeatureFlag.swift index 48a9e6bc1b..fc580a19b3 100644 --- a/Modules/Utils/Sources/PocketCastsUtils/Feature Flags/FeatureFlag.swift +++ b/Modules/Utils/Sources/PocketCastsUtils/Feature Flags/FeatureFlag.swift @@ -188,6 +188,9 @@ public enum FeatureFlag: String, CaseIterable { /// When replacing an episode list with a new one, use the provided episode instead of Up Next Queue case replaceSpecificEpisode + /// If we should try to transcript sync with the audio + case transcriptSync + public var enabled: Bool { if let overriddenValue = FeatureFlagOverrideStore().overriddenValue(for: self) { return overriddenValue @@ -318,6 +321,8 @@ public enum FeatureFlag: String, CaseIterable { true case .replaceSpecificEpisode: true + case .transcriptSync: + true } } diff --git a/podcasts.xcodeproj/project.pbxproj b/podcasts.xcodeproj/project.pbxproj index b3b7119508..467129d029 100644 --- a/podcasts.xcodeproj/project.pbxproj +++ b/podcasts.xcodeproj/project.pbxproj @@ -2018,7 +2018,6 @@ FF54BCAA2C5A2F4D00A342E5 /* TranscriptFormat.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF54BCA82C5A2F3900A342E5 /* TranscriptFormat.swift */; }; FF5B2A442BB1859B009F3DC2 /* SourceInterfaceNavigationView.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF5B2A432BB1859B009F3DC2 /* SourceInterfaceNavigationView.swift */; }; FF5B2A462BB189C7009F3DC2 /* PocketCastsApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF5B2A452BB189C7009F3DC2 /* PocketCastsApp.swift */; }; - FF7F89EA2C2979D900FC0ED5 /* TranscriptSyncChanges.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF7F89E92C2979D900FC0ED5 /* TranscriptSyncChanges.swift */; }; FF6BBCEE2C53E9D000604A01 /* TranscriptManagerTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF6BBCED2C53E9D000604A01 /* TranscriptManagerTests.swift */; }; FF6BBCF02C53EA1F00604A01 /* sample.vtt in Resources */ = {isa = PBXBuildFile; fileRef = FF6BBCEF2C53EA1F00604A01 /* sample.vtt */; }; FF6BBCF22C578CE600604A01 /* TranscriptsDataRetriever.swift in Sources */ = {isa = PBXBuildFile; fileRef = FF6BBCF12C578CE600604A01 /* TranscriptsDataRetriever.swift */; }; @@ -2045,6 +2044,7 @@ FFD044192C4FE9F400CCB192 /* TranscriptModelFilterTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFD044182C4FE9F400CCB192 /* TranscriptModelFilterTests.swift */; }; FFD3AB8C2BD15E8F00C562CB /* CircleView.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFD3AB8B2BD15E8F00C562CB /* CircleView.swift */; }; FFDB38302DE0B88200911B51 /* AudioReadSpeechRecognitionTask.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFDB382F2DE0B88200911B51 /* AudioReadSpeechRecognitionTask.swift */; }; + FFDB38312DE0CFE800911B51 /* AudioReadSpeechRecognitionTask.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFDB382F2DE0B88200911B51 /* AudioReadSpeechRecognitionTask.swift */; }; FFDE41A32DD201AE0065ADDE /* 
AppClipAppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFDE41A22DD201A40065ADDE /* AppClipAppDelegate.swift */; }; FFE794DD2DA443FF005E4D40 /* NotificationsCoordinator.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFE794DC2DA443F8005E4D40 /* NotificationsCoordinator.swift */; }; FFF024CE2B62AC9400457373 /* IAPHelperTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFF024CD2B62AC9400457373 /* IAPHelperTests.swift */; }; @@ -4092,7 +4092,6 @@ FF57373C2B4EB5B100F511C7 /* README.md */ = {isa = PBXFileReference; lastKnownFileType = net.daringfireball.markdown; path = README.md; sourceTree = ""; }; FF5B2A432BB1859B009F3DC2 /* SourceInterfaceNavigationView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SourceInterfaceNavigationView.swift; sourceTree = ""; }; FF5B2A452BB189C7009F3DC2 /* PocketCastsApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PocketCastsApp.swift; sourceTree = ""; }; - FF7F89E92C2979D900FC0ED5 /* TranscriptSyncChanges.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TranscriptSyncChanges.swift; sourceTree = ""; }; FF6BBCED2C53E9D000604A01 /* TranscriptManagerTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = TranscriptManagerTests.swift; sourceTree = ""; }; FF6BBCEF2C53EA1F00604A01 /* sample.vtt */ = {isa = PBXFileReference; lastKnownFileType = text; path = sample.vtt; sourceTree = ""; }; FF6BBCF12C578CE600604A01 /* TranscriptsDataRetriever.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TranscriptsDataRetriever.swift; sourceTree = ""; }; @@ -8599,7 +8598,6 @@ FF7F89E82C2979C000FC0ED5 /* Transcripts */ = { isa = PBXGroup; children = ( - FF7F89E92C2979D900FC0ED5 /* TranscriptSyncChanges.swift */, FF7F89E92C2979D900FC0ED5 /* TranscriptViewController.swift */, 9A0E6C652D822637008CFDF7 /* GeneratedTranscriptsPremiumOverlay.swift */, FF7F89EC2C2AF6DE00FC0ED5 /* TranscriptManager.swift */, @@ -9828,10 +9826,14 @@ inputFileListPaths = ( "${PODS_ROOT}/Target Support Files/Pods-podcasts/Pods-podcasts-frameworks-${CONFIGURATION}-input-files.xcfilelist", ); + inputPaths = ( + ); name = "[CP] Embed Pods Frameworks"; outputFileListPaths = ( "${PODS_ROOT}/Target Support Files/Pods-podcasts/Pods-podcasts-frameworks-${CONFIGURATION}-output-files.xcfilelist", ); + outputPaths = ( + ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-podcasts/Pods-podcasts-frameworks.sh\"\n"; @@ -9863,10 +9865,14 @@ inputFileListPaths = ( "${PODS_ROOT}/Target Support Files/Pods-PocketCastsTests/Pods-PocketCastsTests-frameworks-${CONFIGURATION}-input-files.xcfilelist", ); + inputPaths = ( + ); name = "[CP] Embed Pods Frameworks"; outputFileListPaths = ( "${PODS_ROOT}/Target Support Files/Pods-PocketCastsTests/Pods-PocketCastsTests-frameworks-${CONFIGURATION}-output-files.xcfilelist", ); + outputPaths = ( + ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-PocketCastsTests/Pods-PocketCastsTests-frameworks.sh\"\n"; @@ -10170,7 +10176,7 @@ BD998AD327B3430700B38857 /* ColorPreviewFolderView.swift in Sources */, BD7C1FFD237D16C600B3353B /* PCAlwaysVisibleCastBtn.swift in Sources */, BD9324712398BB50004F19A1 /* TourView.swift in Sources */, - FF7F89EA2C2979D900FC0ED5 /* TranscriptSyncChanges.swift in Sources */, + FF7F89EA2C2979D900FC0ED5 /* 
TranscriptViewController.swift in Sources */, FF7F89EA2C2979D900FC0ED5 /* TranscriptViewController.swift in Sources */, BDC5360A27C88A5700EFCF31 /* FolderPreviewWrapper.swift in Sources */, 8B5AB48E2901A8BD0018C637 /* StoriesController.swift in Sources */, @@ -11330,6 +11336,7 @@ F5F509382CEBF1FE007D6E39 /* ThemeableLabel.swift in Sources */, FF0F407C2D5E37250036FFE9 /* ThreadSafeDictionary.swift in Sources */, F5F509562CEBF62D007D6E39 /* DownloadProgress.swift in Sources */, + FFDB38312DE0CFE800911B51 /* AudioReadSpeechRecognitionTask.swift in Sources */, F5F509702CEBF8B8007D6E39 /* SJCommonUtils.m in Sources */, F5ADE2F52CF52AFA00F2CEA7 /* TracksAdapter.swift in Sources */, F5D5F7BB2CEBE7CE001F492D /* NowPlayingView.swift in Sources */, diff --git a/podcasts/AudioReadSpeechRecognitionTask.swift b/podcasts/AudioReadSpeechRecognitionTask.swift index a4877e3c23..4451428d5e 100644 --- a/podcasts/AudioReadSpeechRecognitionTask.swift +++ b/podcasts/AudioReadSpeechRecognitionTask.swift @@ -4,7 +4,7 @@ import PocketCastsServer import PocketCastsUtils import Speech -class AudioReadSpeechRecognitionTask { +class AudioReadSpeechRecognitionTask: AudioReaderTask { private let maxSilenceAmountToSave = 1000 private var minRMS = 0.005 as Float32 @@ -123,6 +123,7 @@ class AudioReadSpeechRecognitionTask { } if let result { + print("$$ SFSpeechRecognition result: \(result.bestTranscription.formattedString)") NotificationCenter.postOnMainThread(notification: Constants.Notifications.speechToTextAvailable, userInfo: ["text": result.bestTranscription, "offset": offset]) } } diff --git a/podcasts/AudioReadTask.swift b/podcasts/AudioReadTask.swift index 44fd283e01..420ffa116f 100644 --- a/podcasts/AudioReadTask.swift +++ b/podcasts/AudioReadTask.swift @@ -3,7 +3,7 @@ import PocketCastsDataModel import PocketCastsServer import PocketCastsUtils -class AudioReadTask { +class AudioReadTask: AudioReaderTask { private let maxSilenceAmountToSave = 1000 private var minRMS = 0.005 as Float32 diff --git a/podcasts/EffectsPlayer.swift b/podcasts/EffectsPlayer.swift index 4d1deaf7eb..866b813ee8 100644 --- a/podcasts/EffectsPlayer.swift +++ b/podcasts/EffectsPlayer.swift @@ -4,6 +4,13 @@ import PocketCastsDataModel import PocketCastsUtils import UIKit +protocol AudioReaderTask { + func startup() + func shutdown() + func setTrimSilence(_ trimSilence: TrimSilenceAmount) + func seekTo(_ time: TimeInterval, completion: ((Bool) -> Void)?) +} + class EffectsPlayer: PlaybackProtocol, Hashable { private static let targetVolumeDbGain = 15.0 as Float @@ -21,7 +28,7 @@ class EffectsPlayer: PlaybackProtocol, Hashable { private var peakLimiter: AVAudioUnitEffect? private var playBufferManager: PlayBufferManager? - private var audioReadTask: AudioReadTask? + private var audioReadTask: AudioReaderTask? private var audioPlayTask: AudioPlayTask? private var audioFile: AVAudioFile? 
@@ -358,7 +365,11 @@ class EffectsPlayer: PlaybackProtocol, Hashable { guard let audioFile = audioFile, let player = player, let playBufferManager = playBufferManager else { return } let requiredStartTime = PlaybackManager.shared.requiredStartingPosition() - audioReadTask = AudioReadTask(trimSilence: effects.trimSilence, audioFile: audioFile, outputFormat: audioFile.processingFormat, bufferManager: playBufferManager, playPositionHint: requiredStartTime, frameCount: cachedFrameCount) + if FeatureFlag.transcriptSync.enabled { + audioReadTask = AudioReadSpeechRecognitionTask(trimSilence: effects.trimSilence, audioFile: audioFile, outputFormat: audioFile.processingFormat, bufferManager: playBufferManager, playPositionHint: requiredStartTime, frameCount: cachedFrameCount) + } else { + audioReadTask = AudioReadTask(trimSilence: effects.trimSilence, audioFile: audioFile, outputFormat: audioFile.processingFormat, bufferManager: playBufferManager, playPositionHint: requiredStartTime, frameCount: cachedFrameCount) + } audioPlayTask = AudioPlayTask(player: player, bufferManager: playBufferManager) audioReadTask?.startup()
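
The hunk above gates which reader EffectsPlayer builds behind FeatureFlag.transcriptSync: both AudioReadTask and AudioReadSpeechRecognitionTask now sit behind the small AudioReaderTask protocol, so the player only ever talks to that interface. Below is a minimal, self-contained Swift sketch of that selection pattern, using simplified stand-ins; Flags, PlainReader, SpeechRecognitionReader and makeReaderTask are illustrative names, not types from this patch.

// Stand-in for the app's feature flag store (illustrative only).
enum Flags {
    static var transcriptSyncEnabled = true
}

// Mirrors the shape of the AudioReaderTask protocol introduced above:
// one lifecycle surface that both reader implementations expose.
protocol AudioReaderTask {
    func startup()
    func shutdown()
}

// Plain reader: decodes audio buffers only (stubbed with prints here).
final class PlainReader: AudioReaderTask {
    func startup() { print("plain reader: decoding audio") }
    func shutdown() { print("plain reader: stopped") }
}

// Speech-recognition reader: would additionally feed decoded buffers to Speech,
// the way AudioReadSpeechRecognitionTask appends them to its recognition request.
final class SpeechRecognitionReader: AudioReaderTask {
    func startup() { print("speech reader: decoding audio and transcribing") }
    func shutdown() { print("speech reader: stopped") }
}

// The same if/else selection EffectsPlayer performs behind the flag.
func makeReaderTask() -> AudioReaderTask {
    if Flags.transcriptSyncEnabled {
        return SpeechRecognitionReader()
    } else {
        return PlainReader()
    }
}

let reader = makeReaderTask()
reader.startup()
reader.shutdown()

Because EffectsPlayer only holds an AudioReaderTask, flipping the flag (or deleting the experimental reader later) never touches the playback call sites.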