diff --git a/Modules/Utils/Sources/PocketCastsUtils/Feature Flags/FeatureFlag.swift b/Modules/Utils/Sources/PocketCastsUtils/Feature Flags/FeatureFlag.swift index 48a9e6bc1b..fc580a19b3 100644 --- a/Modules/Utils/Sources/PocketCastsUtils/Feature Flags/FeatureFlag.swift +++ b/Modules/Utils/Sources/PocketCastsUtils/Feature Flags/FeatureFlag.swift @@ -188,6 +188,9 @@ public enum FeatureFlag: String, CaseIterable { /// When replacing an episode list with a new one, use the provided episode instead of Up Next Queue case replaceSpecificEpisode + /// If we should try to transcript sync with the audio + case transcriptSync + public var enabled: Bool { if let overriddenValue = FeatureFlagOverrideStore().overriddenValue(for: self) { return overriddenValue @@ -318,6 +321,8 @@ public enum FeatureFlag: String, CaseIterable { true case .replaceSpecificEpisode: true + case .transcriptSync: + true } } diff --git a/podcasts.xcodeproj/project.pbxproj b/podcasts.xcodeproj/project.pbxproj index 7d92300572..467129d029 100644 --- a/podcasts.xcodeproj/project.pbxproj +++ b/podcasts.xcodeproj/project.pbxproj @@ -2043,6 +2043,8 @@ FFC293992B6173400059F3BB /* IAPTypes.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFC293982B6173400059F3BB /* IAPTypes.swift */; }; FFD044192C4FE9F400CCB192 /* TranscriptModelFilterTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFD044182C4FE9F400CCB192 /* TranscriptModelFilterTests.swift */; }; FFD3AB8C2BD15E8F00C562CB /* CircleView.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFD3AB8B2BD15E8F00C562CB /* CircleView.swift */; }; + FFDB38302DE0B88200911B51 /* AudioReadSpeechRecognitionTask.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFDB382F2DE0B88200911B51 /* AudioReadSpeechRecognitionTask.swift */; }; + FFDB38312DE0CFE800911B51 /* AudioReadSpeechRecognitionTask.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFDB382F2DE0B88200911B51 /* AudioReadSpeechRecognitionTask.swift */; }; FFDE41A32DD201AE0065ADDE /* AppClipAppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFDE41A22DD201A40065ADDE /* AppClipAppDelegate.swift */; }; FFE794DD2DA443FF005E4D40 /* NotificationsCoordinator.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFE794DC2DA443F8005E4D40 /* NotificationsCoordinator.swift */; }; FFF024CE2B62AC9400457373 /* IAPHelperTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = FFF024CD2B62AC9400457373 /* IAPHelperTests.swift */; }; @@ -4114,6 +4116,7 @@ FFC293982B6173400059F3BB /* IAPTypes.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = IAPTypes.swift; sourceTree = ""; }; FFD044182C4FE9F400CCB192 /* TranscriptModelFilterTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = TranscriptModelFilterTests.swift; sourceTree = ""; }; FFD3AB8B2BD15E8F00C562CB /* CircleView.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = CircleView.swift; sourceTree = ""; }; + FFDB382F2DE0B88200911B51 /* AudioReadSpeechRecognitionTask.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = AudioReadSpeechRecognitionTask.swift; sourceTree = ""; }; FFDE41A22DD201A40065ADDE /* AppClipAppDelegate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppClipAppDelegate.swift; sourceTree = ""; }; FFE794DC2DA443F8005E4D40 /* NotificationsCoordinator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = 
NotificationsCoordinator.swift; sourceTree = ""; }; FFF024CD2B62AC9400457373 /* IAPHelperTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = IAPHelperTests.swift; sourceTree = ""; }; @@ -6017,6 +6020,7 @@ children = ( BDF15A411B54E088000EC323 /* EffectsPlayer.swift */, BD4098791B9ECA5F007F36BD /* AudioReadTask.swift */, + FFDB382F2DE0B88200911B51 /* AudioReadSpeechRecognitionTask.swift */, BD40987F1B9EFE3C007F36BD /* AudioPlayTask.swift */, BD4098811B9EFF6E007F36BD /* PlayBufferManager.swift */, BD40987B1B9ED731007F36BD /* SynchronizedAudioStack.swift */, @@ -9047,6 +9051,7 @@ 8B1762752B6808F700F44450 /* XCRemoteSwiftPackageReference "JLRoutes" */, 8B1762782B684E7100F44450 /* XCRemoteSwiftPackageReference "Kingfisher" */, FF7F89EB2C2AF53C00FC0ED5 /* XCRemoteSwiftPackageReference "SwiftSubtitles" */, + FFF450222DDF6A8A00272AB8 /* XCRemoteSwiftPackageReference "Ifrit" */, 9A11FC912D91CBF300A1EF3E /* XCRemoteSwiftPackageReference "AppsFlyerFramework-Dynamic" */, F5FFC2312DBAF0340091429C /* XCRemoteSwiftPackageReference "facebook-ios-sdk" */, ); @@ -9821,10 +9826,14 @@ inputFileListPaths = ( "${PODS_ROOT}/Target Support Files/Pods-podcasts/Pods-podcasts-frameworks-${CONFIGURATION}-input-files.xcfilelist", ); + inputPaths = ( + ); name = "[CP] Embed Pods Frameworks"; outputFileListPaths = ( "${PODS_ROOT}/Target Support Files/Pods-podcasts/Pods-podcasts-frameworks-${CONFIGURATION}-output-files.xcfilelist", ); + outputPaths = ( + ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-podcasts/Pods-podcasts-frameworks.sh\"\n"; @@ -9856,10 +9865,14 @@ inputFileListPaths = ( "${PODS_ROOT}/Target Support Files/Pods-PocketCastsTests/Pods-PocketCastsTests-frameworks-${CONFIGURATION}-input-files.xcfilelist", ); + inputPaths = ( + ); name = "[CP] Embed Pods Frameworks"; outputFileListPaths = ( "${PODS_ROOT}/Target Support Files/Pods-PocketCastsTests/Pods-PocketCastsTests-frameworks-${CONFIGURATION}-output-files.xcfilelist", ); + outputPaths = ( + ); runOnlyForDeploymentPostprocessing = 0; shellPath = /bin/sh; shellScript = "\"${PODS_ROOT}/Target Support Files/Pods-PocketCastsTests/Pods-PocketCastsTests-frameworks.sh\"\n"; @@ -10164,6 +10177,7 @@ BD7C1FFD237D16C600B3353B /* PCAlwaysVisibleCastBtn.swift in Sources */, BD9324712398BB50004F19A1 /* TourView.swift in Sources */, FF7F89EA2C2979D900FC0ED5 /* TranscriptViewController.swift in Sources */, + FF7F89EA2C2979D900FC0ED5 /* TranscriptViewController.swift in Sources */, BDC5360A27C88A5700EFCF31 /* FolderPreviewWrapper.swift in Sources */, 8B5AB48E2901A8BD0018C637 /* StoriesController.swift in Sources */, BD6D4187200C802900CA8993 /* PodcastViewController+NetworkLoad.swift in Sources */, @@ -10283,6 +10297,7 @@ C7318EA42A61E40800EAFA9C /* BookmarkEditTitleView.swift in Sources */, BDF4D30B2175A9E90086463E /* StarredViewController.swift in Sources */, C75BB06829B954B100F2DF63 /* EpisodeLoadingController.swift in Sources */, + FFDB38302DE0B88200911B51 /* AudioReadSpeechRecognitionTask.swift in Sources */, 40B118EB2153B8CA000932C9 /* RadioButtonCell.swift in Sources */, BDEAA1861BB144AD001097D9 /* DisclosureCell.swift in Sources */, BDD40A1A1FA1AF7900A53AE1 /* TintableImageView.swift in Sources */, @@ -11321,6 +11336,7 @@ F5F509382CEBF1FE007D6E39 /* ThemeableLabel.swift in Sources */, FF0F407C2D5E37250036FFE9 /* ThreadSafeDictionary.swift in Sources */, F5F509562CEBF62D007D6E39 /* DownloadProgress.swift in Sources */, + FFDB38312DE0CFE800911B51 /* 
AudioReadSpeechRecognitionTask.swift in Sources */, F5F509702CEBF8B8007D6E39 /* SJCommonUtils.m in Sources */, F5ADE2F52CF52AFA00F2CEA7 /* TracksAdapter.swift in Sources */, F5D5F7BB2CEBE7CE001F492D /* NowPlayingView.swift in Sources */, @@ -11745,6 +11761,8 @@ ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; CLANG_ENABLE_MODULES = YES; CODE_SIGN_ENTITLEMENTS = podcasts/podcastsDebug.entitlements; + "CODE_SIGN_IDENTITY[sdk=iphoneos*]" = "iPhone Developer"; + "DEVELOPMENT_TEAM[sdk=iphoneos*]" = PZYM8XX95Q; ENABLE_BITCODE = NO; FRAMEWORK_SEARCH_PATHS = ( "$(inherited)", @@ -11759,6 +11777,7 @@ ); PRODUCT_NAME = "$(TARGET_NAME)"; PROVISIONING_PROFILE_SPECIFIER = "Pocket Casts Development"; + "PROVISIONING_PROFILE_SPECIFIER[sdk=iphoneos*]" = "Pocket Casts Development"; SWIFT_OBJC_BRIDGING_HEADER = "podcasts/podcasts-Bridging-Header.h"; WRAPPER_EXTENSION = app; }; @@ -13130,6 +13149,8 @@ ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; CLANG_ENABLE_MODULES = YES; CODE_SIGN_ENTITLEMENTS = podcasts/podcastsDebug.entitlements; + "CODE_SIGN_IDENTITY[sdk=iphoneos*]" = "iPhone Developer"; + "DEVELOPMENT_TEAM[sdk=iphoneos*]" = PZYM8XX95Q; ENABLE_BITCODE = NO; FRAMEWORK_SEARCH_PATHS = ( "$(inherited)", @@ -13144,6 +13165,7 @@ ); PRODUCT_NAME = "$(TARGET_NAME)"; PROVISIONING_PROFILE_SPECIFIER = "Pocket Casts Development"; + "PROVISIONING_PROFILE_SPECIFIER[sdk=iphoneos*]" = "Pocket Casts Development"; SWIFT_OBJC_BRIDGING_HEADER = "podcasts/podcasts-Bridging-Header.h"; WRAPPER_EXTENSION = app; }; @@ -13785,6 +13807,14 @@ minimumVersion = 1.8.2; }; }; + FFF450222DDF6A8A00272AB8 /* XCRemoteSwiftPackageReference "Ifrit" */ = { + isa = XCRemoteSwiftPackageReference; + repositoryURL = "https://github.com/ukushu/Ifrit.git"; + requirement = { + branch = main; + kind = branch; + }; + }; /* End XCRemoteSwiftPackageReference section */ /* Begin XCSwiftPackageProductDependency section */ diff --git a/podcasts.xcworkspace/xcshareddata/swiftpm/Package.resolved b/podcasts.xcworkspace/xcshareddata/swiftpm/Package.resolved index a1dbbb54a8..e7978aa3a2 100644 --- a/podcasts.xcworkspace/xcshareddata/swiftpm/Package.resolved +++ b/podcasts.xcworkspace/xcshareddata/swiftpm/Package.resolved @@ -181,6 +181,15 @@ "version": "4.1.1" } }, + { + "package": "Ifrit", + "repositoryURL": "https://github.com/ukushu/Ifrit.git", + "state": { + "branch": "main", + "revision": "5cb8badc28c7a4b7f59a05b8f0da37c363d25b6a", + "version": null + } + }, { "package": "InteropForGoogle", "repositoryURL": "https://github.com/google/interop-ios-for-google-sdks.git", diff --git a/podcasts/AudioReadSpeechRecognitionTask.swift b/podcasts/AudioReadSpeechRecognitionTask.swift new file mode 100644 index 0000000000..4451428d5e --- /dev/null +++ b/podcasts/AudioReadSpeechRecognitionTask.swift @@ -0,0 +1,387 @@ +import AVFoundation +import PocketCastsDataModel +import PocketCastsServer +import PocketCastsUtils +import Speech + +class AudioReadSpeechRecognitionTask: AudioReaderTask { + private let maxSilenceAmountToSave = 1000 + + private var minRMS = 0.005 as Float32 + private var minGapSizeInFrames = 3 + private var amountOfSilentFramesToReInsert = 1 + + private let cancelled = AtomicBool() + + private let readQueue: DispatchQueue + private let lock = NSObject() + + private var trimSilence: TrimSilenceAmount = .off + + private var audioFile: AVAudioFile + private var outputFormat: AVAudioFormat + private var bufferManager: PlayBufferManager + + private let bufferLength = UInt32(Constants.Audio.defaultFrameSize) + private let bufferByteSize = 
Float32(MemoryLayout.size) + + private var foundGap = false + private var channelCount = 0 as UInt32 + private var buffersSavedDuringGap = SynchronizedAudioStack() + private var fadeInNextFrame = true + private var cachedFrameCount = 0 as Int64 + + private var currentFramePosition: AVAudioFramePosition = 0 + private let endOfFileSemaphore = DispatchSemaphore(value: 0) + + private lazy var request: SFSpeechAudioBufferRecognitionRequest? = { + let request = SFSpeechAudioBufferRecognitionRequest() + request.shouldReportPartialResults = false + request.requiresOnDeviceRecognition = true + request.taskHint = .dictation + return request + }() + + private var task: SFSpeechRecognitionTask? + private lazy var recognizer: SFSpeechRecognizer? = { + let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US")) + recognizer?.defaultTaskHint = .dictation + return recognizer + }() + + init(trimSilence: TrimSilenceAmount, audioFile: AVAudioFile, outputFormat: AVAudioFormat, bufferManager: PlayBufferManager, playPositionHint: TimeInterval, frameCount: Int64) { + self.trimSilence = .off + self.audioFile = audioFile + self.outputFormat = outputFormat + self.bufferManager = bufferManager + cachedFrameCount = frameCount + + readQueue = DispatchQueue(label: "au.com.pocketcasts.ReadQueue", qos: .default, attributes: [], autoreleaseFrequency: .never, target: nil) + + updateRemoveSilenceNumbers() + + if playPositionHint > 0 { + currentFramePosition = framePositionForTime(playPositionHint).framePosition + audioFile.framePosition = currentFramePosition + } + } + + var offset: TimeInterval = 0 + + func startup() { + readQueue.async { [weak self] in + guard let self = self else { return } + + guard let recognizer, let request else { return } + + offset = PlaybackManager.shared.currentTime() + print("$$ Starting task") + task = recognizer.recognitionTask(with: request, resultHandler: { [weak self] result, error in + self?.recognitionHandler(result: result, error: error) + }) + + // there are some Core Audio errors that aren't marked as throws in the Swift code, so they'll crash the app + // that's why we have an Objective-C try/catch block here to catch them (see https://github.com/shiftyjelly/pocketcasts-ios/issues/1493 for more details) + do { + try SJCommonUtils.catchException { [weak self] in + guard let self = self else { return } + + do { + while !self.cancelled.value { + // nil is returned when there are playback errors or us getting to the end of a file, sleep so we don't end up in a tight loop but these all set the cancelled flag + guard let audioBuffers = try self.readFromFile() else { + Thread.sleep(forTimeInterval: 0.1) + continue + } + + for buffer in audioBuffers { + self.scheduleForPlayback(buffer: buffer) + } + } + } catch { + self.bufferManager.readErrorOccurred.value = true + FileLog.shared.addMessage("Audio Read failed (Swift): \(error.localizedDescription)") + } + } + } catch { + self.bufferManager.readErrorOccurred.value = true + FileLog.shared.addMessage("Audio Read failed (obj-c): \(error.localizedDescription)") + } + } + } + + nonisolated private func recognitionHandler(result: SFSpeechRecognitionResult?, error: Error?) { + let receivedFinalResult = result?.isFinal ?? 
false + let receivedError = error != nil + + if result?.isFinal == true { + print("$$ SFSpeechRecognition finished") + } + + if let error { + print("$$ SFSpeechRecognition error \(error)") + } + + if let result { + print("$$ SFSpeechRecognition result: \(result.bestTranscription.formattedString)") + NotificationCenter.postOnMainThread(notification: Constants.Notifications.speechToTextAvailable, userInfo: ["text": result.bestTranscription, "offset": offset]) + } + } + + func shutdown() { + cancelled.value = true + bufferManager.bufferSemaphore.signal() + endOfFileSemaphore.signal() + } + + func setTrimSilence(_ trimSilence: TrimSilenceAmount) { + objc_sync_enter(lock) + defer { objc_sync_exit(lock) } + + self.trimSilence = trimSilence + updateRemoveSilenceNumbers() + } + + private func updateRemoveSilenceNumbers() { + guard trimSilence != .off else { return } + + minGapSizeInFrames = gapSizeForSilenceAmount() + amountOfSilentFramesToReInsert = framesToReInsertForSilenceAmount() + minRMS = minRMSForSilenceAmount() + } + + func seekTo(_ time: TimeInterval, completion: ((Bool) -> Void)?) { + DispatchQueue.global(qos: .default).async { () in + let seekResult = self.performSeek(time) + self.bufferManager.bufferSemaphore.signal() + completion?(seekResult) + } + } + + private func performSeek(_ time: TimeInterval) -> Bool { + objc_sync_enter(lock) + defer { objc_sync_exit(lock) } + + let positionRequired = framePositionForTime(time) + var seekedToEnd = false + + if positionRequired.passedEndOfFile { + bufferManager.removeAll() + bufferManager.readToEOFSuccessfully.value = true + + seekedToEnd = true + } else { + currentFramePosition = positionRequired.framePosition + audioFile.framePosition = currentFramePosition + bufferManager.aboutToSeek() + foundGap = false + buffersSavedDuringGap.removeAll() + fadeInNextFrame = true + + // if we've finished reading this file, wake the reading thread back up + if bufferManager.readToEOFSuccessfully.value { + endOfFileSemaphore.signal() + } + } + + return seekedToEnd + } + + private func handleReachedEndOfFile() { + bufferManager.readToEOFSuccessfully.value = true + + // we've read to the end but the player won't yet have played to the end, wait til it signals us that it has + endOfFileSemaphore.wait() + } + + private func readFromFile() throws -> [BufferedAudio]? { + objc_sync_enter(lock) + + // are we at the end of the file? + currentFramePosition = audioFile.framePosition + if currentFramePosition >= cachedFrameCount { + objc_sync_exit(lock) + handleReachedEndOfFile() + + return nil + } + + let audioPCMBuffer = AVAudioPCMBuffer(pcmFormat: outputFormat, frameCapacity: bufferLength) + do { + try audioFile.read(into: audioPCMBuffer!) + } catch { + objc_sync_exit(lock) + throw PlaybackError.errorDuringPlayback + } + + // check that we actually read something + if audioPCMBuffer?.frameLength == 0 { + objc_sync_exit(lock) + handleReachedEndOfFile() + + return nil + } + + currentFramePosition = audioFile.framePosition + fadeInNextFrame = false + if channelCount == 0 { channelCount = (audioPCMBuffer?.audioBufferList.pointee.mNumberBuffers)! 
} + + if channelCount == 0 { + bufferManager.readErrorOccurred.value = true + cancelled.value = true + objc_sync_exit(lock) + + return nil + } + + // iOS 16 has an issue in which if the conditions below are met, the playback will fail: + // Audio file has a single channel and spatial audio is enabled + // In order to prevent this issue, we convert a mono buffer to stereo buffer + // For more info, see: https://github.com/Automattic/pocket-casts-ios/issues/62 + var audioBuffer: BufferedAudio + if #available(iOS 16, *), + let audioPCMBuffer = audioPCMBuffer, + audioPCMBuffer.audioBufferList.pointee.mNumberBuffers == 1, + let twoChannelsFormat = AVAudioFormat(standardFormatWithSampleRate: audioFile.processingFormat.sampleRate, channels: 2), + let twoChannnelBuffer = AVAudioPCMBuffer(pcmFormat: twoChannelsFormat, frameCapacity: audioPCMBuffer.frameCapacity) { + let converter = AVAudioConverter(from: audioFile.processingFormat, to: twoChannelsFormat) + try? converter?.convert(to: twoChannnelBuffer, from: audioPCMBuffer) + audioBuffer = BufferedAudio(audioBuffer: twoChannnelBuffer, framePosition: currentFramePosition, shouldFadeOut: false, shouldFadeIn: fadeInNextFrame) + } else { + audioBuffer = BufferedAudio(audioBuffer: audioPCMBuffer!, framePosition: currentFramePosition, shouldFadeOut: false, shouldFadeIn: fadeInNextFrame) + } + + var buffers = [BufferedAudio]() + if trimSilence != .off { + guard let bufferListPointer = UnsafeMutableAudioBufferListPointer(audioPCMBuffer?.mutableAudioBufferList) else { + buffers.append(audioBuffer) + objc_sync_exit(lock) + + return buffers + } + + let currPosition = currentFramePosition / Int64(audioFile.fileFormat.sampleRate) + let totalDuration = cachedFrameCount / Int64(audioFile.fileFormat.sampleRate) + let timeLeft = totalDuration - currPosition + var rms: Float32 = 0 + if timeLeft <= 5 { + // don't trim silence from the last 5 seconds + rms = 1 + } else { + rms = (channelCount == 1) ? AudioUtils.calculateRms(bufferListPointer[0]) : AudioUtils.calculateStereoRms(bufferListPointer[0], rightBuffer: bufferListPointer[1]) + } + + if rms > minRMS, !foundGap { + // the RMS is higher than our minimum and we aren't currently in a gap, just play it + buffers.append(audioBuffer) + } else if foundGap, rms > minRMS || buffersSavedDuringGap.count() > maxSilenceAmountToSave { + foundGap = false + // we've come to the end of a gap (or we've had a suspiscious amount of gap), piece back together the audio + if buffersSavedDuringGap.count() < minGapSizeInFrames { + // we don't have enough gap to remove, just push + while buffersSavedDuringGap.canPop() { + buffers.append(buffersSavedDuringGap.pop()!) + } + + buffers.append(audioBuffer) + } else { + for index in 0 ... amountOfSilentFramesToReInsert { + if index < amountOfSilentFramesToReInsert { + buffers.append(buffersSavedDuringGap.pop()!) + } else { + // fade out the last frame to avoid a jarring re-attach + let buffer = buffersSavedDuringGap.pop()! + AudioUtils.fadeAudio(buffer, fadeOut: true, channelCount: channelCount) + buffers.append(buffer) + } + } + + // pop all the ones we don't need after that + while buffersSavedDuringGap.canPop(), buffersSavedDuringGap.count() > (amountOfSilentFramesToReInsert - 1) { + _ = buffersSavedDuringGap.pop() + let secondsSaved = Double((audioPCMBuffer?.frameLength)!) / audioFile.fileFormat.sampleRate + StatsManager.shared.addTimeSavedDynamicSpeed(secondsSaved) + } + + while buffersSavedDuringGap.canPop() { + buffers.append(buffersSavedDuringGap.pop()!) 
+ } + + // fade back in the new frame + AudioUtils.fadeAudio(audioBuffer, fadeOut: false, channelCount: channelCount) + buffers.append(audioBuffer) + } + } else if rms < minRMS, !foundGap { + // we are at the start of a gap, save this clip and keep going + foundGap = true + buffersSavedDuringGap.push(audioBuffer) + } else if rms < minRMS, foundGap { + // we are inside a gap we've already found + buffersSavedDuringGap.push(audioBuffer) + } + } else { + buffers.append(audioBuffer) + } + + objc_sync_exit(lock) + return buffers + } + + private func scheduleForPlayback(buffer: BufferedAudio) { + // the play task will signal us when it needs more buffer, but it will keep signalling as long as the buffer is low, so keep calling wait until we get below the high point + while !cancelled.value, bufferManager.bufferLength() >= bufferManager.highBufferPoint { + bufferManager.bufferSemaphore.wait() + } + + if !cancelled.value { + bufferManager.push(buffer) + } + + request?.append(buffer.audioBuffer) + } + + private func gapSizeForSilenceAmount() -> Int { + switch trimSilence { + case .low: + return 20 + case .medium: + return 16 + case .high: + return 4 + case .off: + return 0 + } + } + + private func framesToReInsertForSilenceAmount() -> Int { + switch trimSilence { + case .low: + return 14 + case .medium: + return 12 + case .high, .off: + return 0 + } + } + + private func minRMSForSilenceAmount() -> Float32 { + switch trimSilence { + case .low: + return 0.0055 + case .medium: + return 0.00511 + case .high: + return 0.005 + case .off: + return 0 + } + } + + private func framePositionForTime(_ time: TimeInterval) -> (framePosition: Int64, passedEndOfFile: Bool) { + let totalFrames = Double(cachedFrameCount) + let totalSeconds = totalFrames / audioFile.fileFormat.sampleRate + let percentSeek = time / totalSeconds + + return (Int64(totalFrames * percentSeek), percentSeek >= 1) + } +} diff --git a/podcasts/AudioReadTask.swift b/podcasts/AudioReadTask.swift index a9a70541fa..420ffa116f 100644 --- a/podcasts/AudioReadTask.swift +++ b/podcasts/AudioReadTask.swift @@ -3,7 +3,7 @@ import PocketCastsDataModel import PocketCastsServer import PocketCastsUtils -class AudioReadTask { +class AudioReadTask: AudioReaderTask { private let maxSilenceAmountToSave = 1000 private var minRMS = 0.005 as Float32 @@ -297,7 +297,7 @@ class AudioReadTask { if !cancelled.value { bufferManager.push(buffer) - } + } } private func gapSizeForSilenceAmount() -> Int { diff --git a/podcasts/Constants.swift b/podcasts/Constants.swift index 8c4c5c4ae0..2d0de5157e 100644 --- a/podcasts/Constants.swift +++ b/podcasts/Constants.swift @@ -104,6 +104,9 @@ struct Constants { static let avatarNeedsRefreshing = NSNotification.Name(rawValue: "avatarNeedsRefreshing") static let discoverNavigateToCategory = Notification.Name(rawValue: "DiscoverNavigateToCategory") + + // Speech to Text + static let speechToTextAvailable = NSNotification.Name(rawValue: "speechToTextAvailable") } enum UserDefaults { diff --git a/podcasts/EffectsPlayer.swift b/podcasts/EffectsPlayer.swift index 4d1deaf7eb..866b813ee8 100644 --- a/podcasts/EffectsPlayer.swift +++ b/podcasts/EffectsPlayer.swift @@ -4,6 +4,13 @@ import PocketCastsDataModel import PocketCastsUtils import UIKit +protocol AudioReaderTask { + func startup() + func shutdown() + func setTrimSilence(_ trimSilence: TrimSilenceAmount) + func seekTo(_ time: TimeInterval, completion: ((Bool) -> Void)?) 
+} + class EffectsPlayer: PlaybackProtocol, Hashable { private static let targetVolumeDbGain = 15.0 as Float @@ -21,7 +28,7 @@ class EffectsPlayer: PlaybackProtocol, Hashable { private var peakLimiter: AVAudioUnitEffect? private var playBufferManager: PlayBufferManager? - private var audioReadTask: AudioReadTask? + private var audioReadTask: AudioReaderTask? private var audioPlayTask: AudioPlayTask? private var audioFile: AVAudioFile? @@ -358,7 +365,11 @@ class EffectsPlayer: PlaybackProtocol, Hashable { guard let audioFile = audioFile, let player = player, let playBufferManager = playBufferManager else { return } let requiredStartTime = PlaybackManager.shared.requiredStartingPosition() - audioReadTask = AudioReadTask(trimSilence: effects.trimSilence, audioFile: audioFile, outputFormat: audioFile.processingFormat, bufferManager: playBufferManager, playPositionHint: requiredStartTime, frameCount: cachedFrameCount) + if FeatureFlag.transcriptSync.enabled { + audioReadTask = AudioReadSpeechRecognitionTask(trimSilence: effects.trimSilence, audioFile: audioFile, outputFormat: audioFile.processingFormat, bufferManager: playBufferManager, playPositionHint: requiredStartTime, frameCount: cachedFrameCount) + } else { + audioReadTask = AudioReadTask(trimSilence: effects.trimSilence, audioFile: audioFile, outputFormat: audioFile.processingFormat, bufferManager: playBufferManager, playPositionHint: requiredStartTime, frameCount: cachedFrameCount) + } audioPlayTask = AudioPlayTask(player: player, bufferManager: playBufferManager) audioReadTask?.startup() diff --git a/podcasts/MainTabBarController.swift b/podcasts/MainTabBarController.swift index eff38e0630..7a8f21b4ba 100644 --- a/podcasts/MainTabBarController.swift +++ b/podcasts/MainTabBarController.swift @@ -1,4 +1,5 @@ import PocketCastsDataModel +import Speech import PocketCastsServer import SafariServices import UIKit @@ -104,6 +105,10 @@ class MainTabBarController: UITabBarController, NavigationProtocol { updateDatabaseIndexes() optimizeDatabaseIfNeeded() + + SFSpeechRecognizer.requestAuthorization { SFSpeechRecognizerAuthorizationStatus in + + } } /// Update database indexes and delete unused columns diff --git a/podcasts/PlaybackManager.swift b/podcasts/PlaybackManager.swift index 2a217d3177..520a9c5ae3 100644 --- a/podcasts/PlaybackManager.swift +++ b/podcasts/PlaybackManager.swift @@ -46,7 +46,7 @@ class PlaybackManager: ServerPlaybackDelegate { private let shouldDeactivateSession = AtomicBool() private var haveCalledPlayerLoad = false - private let updateTimerInterval = 1 as TimeInterval + private let updateTimerInterval = 0.01 as TimeInterval #if !os(watchOS) private var backgroundTask = UIBackgroundTaskIdentifier.invalid diff --git a/podcasts/TranscriptModel.swift b/podcasts/TranscriptModel.swift index 7f9dddf38b..813758b556 100644 --- a/podcasts/TranscriptModel.swift +++ b/podcasts/TranscriptModel.swift @@ -3,6 +3,9 @@ import SwiftSubtitles import PocketCastsDataModel import PocketCastsUtils +import Speech +import NaturalLanguage + struct TranscriptCue: Sendable { let startTime: Double let endTime: Double @@ -13,17 +16,28 @@ struct TranscriptCue: Sendable { } } -extension NSAttributedString: @unchecked Sendable { +extension NSAttributedString: @unchecked @retroactive Sendable { } -struct TranscriptModel: Sendable { +class TranscriptModel: @unchecked Sendable { let attributedText: NSAttributedString let cues: [TranscriptCue] let type: String let hasJavascript: Bool + lazy var rawText: String = { + attributedText.string + }() + + 
init(attributedText: NSAttributedString, cues: [TranscriptCue], type: String, hasJavascript: Bool) { + self.attributedText = attributedText + self.cues = cues + self.type = type + self.hasJavascript = hasJavascript + } + static func makeModel(from transcriptText: String, format: TranscriptFormat) -> TranscriptModel? { if format == .textHTML { let filteredText = ComposeFilter.htmlFilter.filter(transcriptText).trim() @@ -106,9 +120,260 @@ struct TranscriptModel: Sendable { } return nil } + + /// MARK - transcript sync extras: + var timestamps: [(TimeInterval, TimeInterval)] = [] + + var words: [Word] = [] + + var allSpeechToText: [String] = [] { + didSet { + print("$$ \(allSpeechToText.joined(separator: " "))") + print("$$") + } + } } extension NSAttributedString.Key { - static var transcriptSpeaker = NSAttributedString.Key("TranscriptSpeaker") } + +extension TranscriptModel { + + public func firstWord(containing secondsValue: TimeInterval) -> Word? { + words +// .filter { $0.timestamp != nil } +// .sorted(by: { $0.timestamp!.seconds < $1.timestamp!.seconds }) + .first { $0.contains(timeInSeconds: secondsValue) } + } + + func wordByWord(speechToText: SFTranscription) { + // Calculate Levenshtein distance + func levenshtein(aStr: String, bStr: String) -> Int { + let a = Array(aStr) + let b = Array(bStr) + let m = a.count + let n = b.count + + var dist = [[Int]](repeating: [Int](repeating: 0, count: n + 1), count: m + 1) + + for i in 0...m { + dist[i][0] = i + } + for j in 0...n { + dist[0][j] = j + } + + for i in 1...m { + for j in 1...n { + if a[i-1] == b[j-1] { + dist[i][j] = dist[i-1][j-1] + } else { + dist[i][j] = min( + dist[i-1][j] + 1, + dist[i][j-1] + 1, + dist[i-1][j-1] + 1 + ) + } + } + } + + return dist[m][n] + } + + // Define constants + let matchScore = 1 + let mismatchScore = -1 + let gapPenalty = -2 + + struct TimedWord { + let word: String + let timestamp: TimeInterval + let duration: TimeInterval + } + + // Tokenize the text while preserving punctuation + func tokenize(text: String) -> [String] { + var words = [String]() + let tokenizer = NLTokenizer(unit: .word) + tokenizer.string = text + tokenizer.enumerateTokens(in: text.startIndex.. [(String, NSRange)] { + var words = [(String, NSRange)]() + let tokenizer = NLTokenizer(unit: .word) + tokenizer.string = text + tokenizer.enumerateTokens(in: text.startIndex.. [(normalized: String, range: NSRange)] { + let words = tokenizeWithRange(text: text) + return words.map { ($0.lowercased(), $1) } + } + + // Preprocess timed words: tokenize and normalize, preserving timestamps + func preprocessTimedWords(text: [String], timestamps: [(timestamp: TimeInterval, duration: TimeInterval)]) -> [TimedWord] { + var timedWords = [TimedWord]() + for (index, word) in text.enumerated() { + timedWords.append(TimedWord(word: word.lowercased(), timestamp: timestamps[index].timestamp, duration: timestamps[index].duration)) + } + return timedWords + } + + // Define the scoring function + func score(word1: String, word2: String) -> Int { + let distance = levenshtein(aStr: word1, bStr: word2) + return distance == 0 ? 
1 : -distance + } + + // Perform sequence alignment + func alignSequences(subtitle: String, transcript: [String], transcriptTimestamps: [(timestamp: TimeInterval, duration: TimeInterval)]) -> ([NSRange?], [String], [(timestamp: TimeInterval, duration: TimeInterval)]) { + let subtitleWords = preprocessSubtitleWords(text: subtitle) + let transcriptTimedWords = preprocessTimedWords(text: transcript, timestamps: transcriptTimestamps) + + let lenSub = subtitleWords.count + let lenTrans = transcriptTimedWords.count + + // Initialize the scoring matrix + var S = Array(repeating: Array(repeating: 0, count: lenTrans + 1), count: lenSub + 1) + + // Initialize first row and column with gap penalties + for i in 1...lenSub { + S[i][0] = S[i-1][0] + gapPenalty + } + for j in 1...lenTrans { + S[0][j] = S[0][j-1] + gapPenalty + } + + // Populate the scoring matrix + for i in 1...lenSub { + for j in 1...lenTrans { + let match = S[i-1][j-1] + score(word1: subtitleWords[i-1].normalized, word2: transcriptTimedWords[j-1].word) + let delete = S[i-1][j] + gapPenalty + let insert = S[i][j-1] + gapPenalty + S[i][j] = max(match, delete, insert) + } + } + + // Traceback to get the aligned sequences + var alignedSubtitle = [NSRange?]() + var alignedTranscript = [String]() + var alignedTimestamps = [(timestamp: TimeInterval, duration: TimeInterval)]() + var i = lenSub + var j = lenTrans + + while i > 0 && j > 0 { + if S[i][j] == S[i-1][j-1] + score(word1: subtitleWords[i-1].normalized, word2: transcriptTimedWords[j-1].word) { + alignedSubtitle.append(subtitleWords[i-1].range) + alignedTranscript.append(transcriptTimedWords[j-1].word) + alignedTimestamps.append((transcriptTimedWords[j-1].timestamp, transcriptTimedWords[j-1].duration)) + i -= 1 + j -= 1 + } else if S[i][j] == S[i-1][j] + gapPenalty { + alignedSubtitle.append(subtitleWords[i-1].range) + alignedTranscript.append("-") + alignedTimestamps.append((-1, -1)) // Indicate a gap with a negative timestamp + i -= 1 + } else { + alignedSubtitle.append(nil) + alignedTranscript.append(transcriptTimedWords[j-1].word) + alignedTimestamps.append((transcriptTimedWords[j-1].timestamp, transcriptTimedWords[j-1].duration)) + j -= 1 + } + } + + while i > 0 { + alignedSubtitle.append(subtitleWords[i-1].range) + alignedTranscript.append("-") + alignedTimestamps.append((-1, -1)) + i -= 1 + } + + while j > 0 { + alignedSubtitle.append(nil) + alignedTranscript.append(transcriptTimedWords[j-1].word) + alignedTimestamps.append((transcriptTimedWords[j-1].timestamp, transcriptTimedWords[j-1].duration)) + j -= 1 + } + + return (alignedSubtitle.reversed(), alignedTranscript.reversed(), alignedTimestamps.reversed()) + } + + allSpeechToText.append(contentsOf: speechToText.segments.map { $0.substring }) + + // Example usage + let subtitle = rawText + let transcript = allSpeechToText + timestamps.append(contentsOf: speechToText.segments.map { ($0.timestamp, $0.duration) }) + + let (alignedSubtitle, alignedTranscript, alignedTimestamps) = alignSequences(subtitle: subtitle, transcript: transcript, transcriptTimestamps: timestamps) + + words = alignedSubtitle.enumerated().compactMap { index, range in + guard let range else { return nil } + + return Word(timestamp: alignedTimestamps[index].timestamp, duration: alignedTimestamps[index].duration, characterRange: range) + } + } +} + +class Word { + var timestamp: TimeInterval + var duration: TimeInterval + var characterRange: NSRange + + init(timestamp: TimeInterval, duration: TimeInterval, characterRange: NSRange) { + self.timestamp = timestamp + 
self.duration = duration + self.characterRange = characterRange + } + + func contains(timeInSeconds seconds: TimeInterval) -> Bool { + seconds >= timestamp && seconds <= (timestamp + duration) + } +} + +class WordToAnalyze { + let text: String + var timestamp: TimeInterval + var duration: TimeInterval + var matched: Bool = false + + init(text: String, timestamp: TimeInterval, duration: TimeInterval, matched: Bool) { + self.text = text + self.timestamp = timestamp + self.duration = duration + self.matched = matched + } +} + +// Levenshtein distance algorithm +func levDis(_ w1: String, _ w2: String) -> Int { + let empty = [Int](repeating:0, count: w2.count) + var last = [Int](0...w2.count) + + for (i, char1) in w1.enumerated() { + var cur = [i + 1] + empty + for (j, char2) in w2.enumerated() { + cur[j + 1] = char1 == char2 ? last[j] : min(last[j], last[j + 1], cur[j]) + 1 + } + last = cur + } + return last.last! +} + +func levDisNormalized(_ w1: String, _ w2: String) -> Double { + let levDis = levDis(w1, w2) + return 1 - Double(levDis) / Double(max(w1.count, w2.count)) +} diff --git a/podcasts/TranscriptSyncChanges.swift b/podcasts/TranscriptSyncChanges.swift new file mode 100644 index 0000000000..ef69398929 --- /dev/null +++ b/podcasts/TranscriptSyncChanges.swift @@ -0,0 +1,70 @@ +import Speech + +class TranscriptSyncChanges { + + init() { + + } + + var transcript: TranscriptModel? + var transcriptView: UITextView = UITextView() + var playbackManager: PlaybackManager = PlaybackManager.shared + + private func styleText(transcript: TranscriptModel, position: Double = 0) -> NSAttributedString { + if let range = transcript.firstWord(containing: position)?.characterRange { + } + return NSAttributedString() + } + + private func addObservers() { + //addCustomObserver(Constants.Notifications.speechToTextAvailable, selector: #selector(receivedSpeechToTextContent)) + } + + var offset: TimeInterval = 0 + + @objc private func receivedSpeechToTextContent(notification: NSNotification) { + guard let text = notification.userInfo?["text"] as? SFTranscription, + let offset = notification.userInfo?["offset"] as? TimeInterval else { return } + + self.offset = offset + + DispatchQueue.global().async { + self.transcript?.wordByWord(speechToText: text) + } + } + + @objc private func updateTranscriptPosition() { + let position = playbackManager.currentTime() - offset + guard let transcript else { + return + } + + if let word = transcript.firstWord(containing: position) { +// print(transcript.rawText[word.characterRange.lowerBound..) -> String { + let start = index(startIndex, offsetBy: bounds.lowerBound) + let end = index(startIndex, offsetBy: bounds.upperBound) + return String(self[start...end]) + } + + subscript (bounds: CountableRange) -> String { + let start = index(startIndex, offsetBy: bounds.lowerBound) + let end = index(startIndex, offsetBy: bounds.upperBound) + return String(self[start..) -> ArraySlice { + indices.contains(bounds.upperBound) && indices.contains(bounds.lowerBound) ? self[bounds] : [] + } +} diff --git a/podcasts/podcasts-Info.plist b/podcasts/podcasts-Info.plist index 39483d8280..631fef11a1 100644 --- a/podcasts/podcasts-Info.plist +++ b/podcasts/podcasts-Info.plist @@ -1032,6 +1032,8 @@ Pocket Casts needs permission to save this image to your photo library. NSUserTrackingUsageDescription We use analytics to make the app better and to measure if our ad campaigns are successful. 
+	<key>NSSpeechRecognitionUsageDescription</key>
+	<string>Pocket Casts needs permission to transcribe audio that is playing to text.</string>
 	<key>NSUserActivityTypes</key>
 	<array>
 		<string>ChapterIntent</string>
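
MainTabBarController now calls SFSpeechRecognizer.requestAuthorization with an empty completion handler. A minimal sketch of how the returned status could be checked before transcript sync relies on it; the helper name is an assumption, not something in the patch.

```swift
import Speech

// Sketch only: the patch requests authorization but ignores the result.
// `requestTranscriptSyncAuthorization` is a hypothetical helper name.
func requestTranscriptSyncAuthorization(_ completion: @escaping (Bool) -> Void) {
    SFSpeechRecognizer.requestAuthorization { status in
        // The handler may arrive on a background queue; hop to main before touching UI.
        DispatchQueue.main.async {
            completion(status == .authorized)
        }
    }
}
```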
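AudioReadSpeechRecognitionTask feeds the recognizer by appending each decoded AVAudioPCMBuffer to an SFSpeechAudioBufferRecognitionRequest inside scheduleForPlayback. The same pipeline in isolation, as a hedged standalone sketch (the file URL, locale, and chunk size are placeholders, not values from the patch):

```swift
import AVFoundation
import Speech

// Minimal sketch of the read-and-recognize loop the task wraps around playback.
func transcribe(fileAt url: URL) throws {
    let audioFile = try AVAudioFile(forReading: url)

    let request = SFSpeechAudioBufferRecognitionRequest()
    request.shouldReportPartialResults = false
    request.requiresOnDeviceRecognition = true
    request.taskHint = .dictation

    guard let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US")),
          recognizer.supportsOnDeviceRecognition else { return }

    _ = recognizer.recognitionTask(with: request) { result, error in
        if let result, result.isFinal {
            // result.bestTranscription.segments carry the per-word timestamp/duration
            // that wordByWord(speechToText:) aligns against the published transcript.
            print(result.bestTranscription.formattedString)
        }
        if let error { print("recognition error: \(error)") }
    }

    // Read the whole file in chunks, mirroring the readFromFile() loop.
    let frameCapacity: AVAudioFrameCount = 4096
    while audioFile.framePosition < audioFile.length,
          let buffer = AVAudioPCMBuffer(pcmFormat: audioFile.processingFormat, frameCapacity: frameCapacity) {
        try audioFile.read(into: buffer)
        if buffer.frameLength == 0 { break }
        request.append(buffer)
    }
    request.endAudio()
}
```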
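The recognition handler publishes results through Constants.Notifications.speechToTextAvailable with the SFTranscription under "text" and the capture-start offset under "offset", which TranscriptSyncChanges.receivedSpeechToTextContent unpacks. A sketch of the receiving side using the plain NotificationCenter API, since the postOnMainThread/addCustomObserver helpers are app-internal:

```swift
import Speech

// Mirrors receivedSpeechToTextContent(notification:); sketch only.
let observer = NotificationCenter.default.addObserver(
    forName: Constants.Notifications.speechToTextAvailable,
    object: nil,
    queue: .main
) { notification in
    guard let transcription = notification.userInfo?["text"] as? SFTranscription,
          let offset = notification.userInfo?["offset"] as? TimeInterval else { return }

    // Segment timings are relative to when recognition started, i.e. to `offset`.
    print(transcription.segments.count, offset)
}
```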
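TranscriptModel's tokenize/tokenizeWithRange helpers lean on NLTokenizer to produce word tokens plus their NSRange in the subtitle text before alignment. The underlying pattern, as a small self-contained sketch (the function name is illustrative):

```swift
import Foundation
import NaturalLanguage

// Word tokens plus their NSRange in the original string.
func wordTokens(in text: String) -> [(word: String, range: NSRange)] {
    var tokens = [(String, NSRange)]()
    let tokenizer = NLTokenizer(unit: .word)
    tokenizer.string = text
    tokenizer.enumerateTokens(in: text.startIndex..<text.endIndex) { tokenRange, _ in
        tokens.append((String(text[tokenRange]), NSRange(tokenRange, in: text)))
        return true // keep enumerating
    }
    return tokens
}

// wordTokens(in: "Hello, Pocket Casts")
// -> [("Hello", {0,5}), ("Pocket", {7,6}), ("Casts", {14,5})]  (punctuation is skipped)
```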
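alignSequences is a global (Needleman–Wunsch style) alignment: it fills a scoring matrix with gapPenalty = -2 and a word score derived from Levenshtein distance (0 → +1, otherwise -distance; the matchScore/mismatchScore constants are declared but unused), then traces back to pair subtitle ranges with timed transcript words. A toy version of the same recurrence with a simpler ±1 score, to make the matrix and traceback visible; names here are illustrative, not from the patch:

```swift
import Foundation

// Toy global alignment of two token sequences: +1 match, -1 mismatch, -2 gap.
func alignTokens(_ a: [String], _ b: [String]) -> [(String?, String?)] {
    let gap = -2
    func score(_ x: String, _ y: String) -> Int { x == y ? 1 : -1 }

    var s = Array(repeating: Array(repeating: 0, count: b.count + 1), count: a.count + 1)
    for i in 1..<(a.count + 1) { s[i][0] = s[i - 1][0] + gap }
    for j in 1..<(b.count + 1) { s[0][j] = s[0][j - 1] + gap }
    for i in 1..<(a.count + 1) {
        for j in 1..<(b.count + 1) {
            s[i][j] = max(s[i - 1][j - 1] + score(a[i - 1], b[j - 1]),
                          s[i - 1][j] + gap,
                          s[i][j - 1] + gap)
        }
    }

    // Traceback: nil on either side marks a gap.
    var aligned = [(String?, String?)]()
    var i = a.count, j = b.count
    while i > 0 || j > 0 {
        if i > 0, j > 0, s[i][j] == s[i - 1][j - 1] + score(a[i - 1], b[j - 1]) {
            aligned.append((a[i - 1], b[j - 1])); i -= 1; j -= 1
        } else if i > 0, s[i][j] == s[i - 1][j] + gap {
            aligned.append((a[i - 1], nil)); i -= 1
        } else {
            aligned.append((nil, b[j - 1])); j -= 1
        }
    }
    return aligned.reversed()
}

// ["the", "quick", "fox"] vs ["the", "quick", "brown", "fox"] pairs "brown" with a gap,
// just as a missed subtitle word is paired with a gap timestamp in alignSequences.
let pairs = alignTokens(["the", "quick", "fox"], ["the", "quick", "brown", "fox"])
```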
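TranscriptModel.swift now carries two Levenshtein implementations: the nested levenshtein(aStr:bStr:) used by the alignment scorer and the file-scope levDis/levDisNormalized pair. A quick worked check of the file-scope pair, assuming they land as written in the patch:

```swift
// Assuming the file-scope helpers added to TranscriptModel.swift above.
let distance = levDis("kitten", "sitting")              // 3 edits: k→s, e→i, insert g
let similarity = levDisNormalized("kitten", "sitting")  // 1 - 3/7 ≈ 0.571
```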
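TranscriptSyncChanges.updateTranscriptPosition subtracts the recognition offset from the player clock before asking the model for the current word. A short sketch of that lookup against the Word/TranscriptModel additions above; the surrounding highlight step is hypothetical:

```swift
import Foundation

// Sketch: resolve the character range to highlight for the current playback time.
func currentWordRange(in transcript: TranscriptModel,
                      playbackTime: TimeInterval,
                      recognitionOffset: TimeInterval) -> NSRange? {
    // Speech timestamps are relative to when recognition started, so shift the player clock.
    let position = playbackTime - recognitionOffset
    return transcript.firstWord(containing: position)?.characterRange
}

// e.g. with Word(timestamp: 12.0, duration: 0.4, characterRange: NSRange(location: 91, length: 5)),
// any position in 12.0...12.4 resolves to that range, which the transcript text view could highlight.
```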