From f0db7bc587344bda07646d794845a86781129500 Mon Sep 17 00:00:00 2001
From: rover0811
Date: Thu, 15 May 2025 16:40:56 +0900
Subject: [PATCH] [Add] Port the Python client to Swift

---
 .../ViewModels/Audio/NewAudioWebSocket.swift | 953 ++++++++++++++++++
 Sources/ViewModels/Audio/utils.swift         | 138 +++
 2 files changed, 1091 insertions(+)
 create mode 100644 Sources/ViewModels/Audio/NewAudioWebSocket.swift
 create mode 100644 Sources/ViewModels/Audio/utils.swift

diff --git a/Sources/ViewModels/Audio/NewAudioWebSocket.swift b/Sources/ViewModels/Audio/NewAudioWebSocket.swift
new file mode 100644
index 0000000..53ee822
--- /dev/null
+++ b/Sources/ViewModels/Audio/NewAudioWebSocket.swift
@@ -0,0 +1,953 @@
+import Foundation
+import AVFoundation
+
+class Client: NSObject, URLSessionWebSocketDelegate {
+    /**
+     * Client class that handles communication with the WebSocket server.
+     */
+
+    // MARK: - Constants
+    static let END_OF_AUDIO = "END_OF_AUDIO"
+    static var INSTANCES: [String: Client] = [:]
+
+    // MARK: - Properties
+    private var webSocketTask: URLSessionWebSocketTask?
+    private var urlSession: URLSession!
+    private var ws_thread: DispatchWorkItem?
+
+    private(set) var recording = false
+    private(set) var task = "transcribe"
+    private(set) var uid: String
+    private(set) var waiting = false
+    private(set) var last_response_received: Date?
+    private(set) var disconnect_if_no_response_for: TimeInterval = 15
+    private(set) var language: String?
+    private(set) var model: String
+    private(set) var server_error = false
+    private(set) var error_message: Error?
+    private(set) var srt_file_path: String
+    private(set) var use_vad: Bool
+    private(set) var use_wss: Bool
+    private(set) var last_segment: [String: Any]?
+    private(set) var last_received_segment: String?
+    private(set) var log_transcription: Bool
+    private(set) var max_clients: Int
+    private(set) var max_connection_time: Int
+    private(set) var send_last_n_segments: Int
+    private(set) var no_speech_thresh: Double
+    private(set) var clip_audio: Bool
+    private(set) var same_output_threshold: Int
+    private(set) var server_backend: String?
+
+    var transcript: [[String: Any]] = []
+    var transcription_callback: ((_ text: String, _ segments: [[String: Any]]) -> Void)?
+
+    // MARK: - Initialization
+    init(
+        host: String? = nil,
+        port: Int? = nil,
+        lang: String? = nil,
+        translate: Bool = false,
+        model: String = "small",
+        srt_file_path: String = "output.srt",
+        use_vad: Bool = true,
+        use_wss: Bool = false,
+        log_transcription: Bool = true,
+        max_clients: Int = 4,
+        max_connection_time: Int = 600,
+        send_last_n_segments: Int = 10,
+        no_speech_thresh: Double = 0.45,
+        clip_audio: Bool = false,
+        same_output_threshold: Int = 10,
+        transcription_callback: ((_ text: String, _ segments: [[String: Any]]) -> Void)? = nil
+    ) {
+        self.uid = UUID().uuidString
+        self.language = lang
+        self.model = model
+        self.srt_file_path = srt_file_path
+        self.use_vad = use_vad
+        self.use_wss = use_wss
+        self.log_transcription = log_transcription
+        self.max_clients = max_clients
+        self.max_connection_time = max_connection_time
+        self.send_last_n_segments = send_last_n_segments
+        self.no_speech_thresh = no_speech_thresh
+        self.clip_audio = clip_audio
+        self.same_output_threshold = same_output_threshold
+        self.transcription_callback = transcription_callback
+
+        if translate {
+            self.task = "translate"
+        }
+
+        super.init()
+
+        if let host = host, let port = port {
+            let socket_protocol = self.use_wss ? "wss" : "ws"
+            let socket_url = "\(socket_protocol)://\(host):\(port)"
+
+            if let url = URL(string: socket_url) {
+                self.urlSession = URLSession(configuration: .default, delegate: self, delegateQueue: .main)
+                self.webSocketTask = urlSession.webSocketTask(with: url)
+
+                Client.INSTANCES[self.uid] = self
+
+                // Start the WebSocket connection
+                self.webSocketTask?.resume()
+                self.setupReceiveMessage()
+                print("[INFO]: * recording")
+            } else {
+                print("[ERROR]: Invalid URL: \(socket_url)")
+            }
+        } else {
+            print("[ERROR]: No host or port specified.")
+        }
+    }
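+
+    // For example, host "localhost" with port 9090 (illustrative values, not
+    // defaults) yields the endpoint "ws://localhost:9090", or
+    // "wss://localhost:9090" when use_wss is true.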
"wss" : "ws" + let socket_url = "\(socket_protocol)://\(host):\(port)" + + if let url = URL(string: socket_url) { + self.urlSession = URLSession(configuration: .default, delegate: self, delegateQueue: .main) + self.webSocketTask = urlSession.webSocketTask(with: url) + + Client.INSTANCES[self.uid] = self + + // WebSocket 연결 시작 + self.webSocketTask?.resume() + self.setupReceiveMessage() + print("[INFO]: * recording") + } else { + print("[ERROR]: Invalid URL: \(socket_url)") + } + } else { + print("[ERROR]: No host or port specified.") + } + } + + // MARK: - WebSocket 메시지 수신 설정 + private func setupReceiveMessage() { + webSocketTask?.receive { [weak self] result in + guard let self = self else { return } + + switch result { + case .success(let message): + switch message { + case .string(let text): + do { + if let jsonData = text.data(using: .utf8), + let messageJson = try JSONSerialization.jsonObject(with: jsonData) as? [String: Any] { + self.handle_message(messageJson) + } + } catch { + print("[ERROR]: Failed to parse JSON message: \(error)") + } + case .data(let data): + print("[INFO]: Received binary data: \(data.count) bytes") + @unknown default: + print("[ERROR]: Unknown message type") + } + // 계속해서 메시지 수신 + self.setupReceiveMessage() + case .failure(let error): + print("[ERROR]: WebSocket receive error: \(error)") + if !self.server_error { + self.on_error(error: error) + } + } + } + } + + // MARK: - 서버 상태 메시지 처리 + func handle_status_messages(_ message_data: [String: Any]) { + guard let status = message_data["status"] as? String else { return } + + if status == "WAIT" { + self.waiting = true + if let waitTime = message_data["message"] as? Double { + print("[INFO]: Server is full. Estimated wait time \(Int(round(waitTime))) minutes.") + } + } else if status == "ERROR" { + if let message = message_data["message"] as? String { + print("Message from Server: \(message)") + } + self.server_error = true + } else if status == "WARNING" { + if let message = message_data["message"] as? String { + print("Message from Server: \(message)") + } + } + } + + // MARK: - 트랜스크립션 세그먼트 처리 + func process_segments(_ segments: [[String: Any]]) { + var text: [String] = [] + + for (i, seg) in segments.enumerated() { + if let segText = seg["text"] as? String { + if text.isEmpty || text.last != segText { + text.append(segText) + + if i == segments.count - 1, let completed = seg["completed"] as? Bool, !completed { + self.last_segment = seg + } else if let serverBackend = self.server_backend, + serverBackend == "faster_whisper", + let completed = seg["completed"] as? Bool, completed, + (self.transcript.isEmpty || + (let segStart = Double(seg["start"] as? String ?? "0"), + let lastEnd = Double(self.transcript.last?["end"] as? String ?? "0"), + segStart >= lastEnd)) { + self.transcript.append(seg) + } + } + } + } + + // 마지막 수신된 세그먼트와 응답 시간 업데이트 + if let lastSegmentText = segments.last?["text"] as? String, + self.last_received_segment != lastSegmentText { + self.last_response_received = Date() + self.last_received_segment = lastSegmentText + } + + // 트랜스크립션 콜백 호출 + if let callback = transcription_callback { + callback(text.joined(separator: " "), segments) + return + } + + // 로깅 + if self.log_transcription { + // 간결성을 위해 마지막 3개 항목으로 제한 + let displayText = Array(text.suffix(3)) + Utils.clearScreen() + Utils.printTranscript(displayText) + } + } + + // MARK: - 메시지 처리 + func handle_message(_ message: [String: Any]) { + guard let messageUid = message["uid"] as? 
+
+    // MARK: - WebSocket event handling
+    func on_open() {
+        print("[INFO]: Opened connection")
+        let initialMessage: [String: Any] = [
+            "uid": self.uid,
+            "language": self.language as Any,
+            "task": self.task,
+            "model": self.model,
+            "use_vad": self.use_vad,
+            "max_clients": self.max_clients,
+            "max_connection_time": self.max_connection_time,
+            "send_last_n_segments": self.send_last_n_segments,
+            "no_speech_thresh": self.no_speech_thresh,
+            "clip_audio": self.clip_audio,
+            "same_output_threshold": self.same_output_threshold
+        ]
+
+        do {
+            let jsonData = try JSONSerialization.data(withJSONObject: initialMessage)
+            if let jsonString = String(data: jsonData, encoding: .utf8) {
+                self.webSocketTask?.send(.string(jsonString)) { error in
+                    if let error = error {
+                        print("[ERROR]: Failed to send initial message: \(error)")
+                    }
+                }
+            }
+        } catch {
+            print("[ERROR]: Failed to serialize JSON: \(error)")
+        }
+    }
+
+    func on_error(error: Error) {
+        print("[ERROR] WebSocket Error: \(error)")
+        self.server_error = true
+        self.error_message = error
+    }
+
+    func on_close(code: URLSessionWebSocketTask.CloseCode, reason: Data?) {
+        var reasonStr = "No reason"
+        if let reason = reason, let reasonText = String(data: reason, encoding: .utf8) {
+            reasonStr = reasonText
+        }
+        print("[INFO]: Websocket connection closed: \(code.rawValue): \(reasonStr)")
+        self.recording = false
+        self.waiting = false
+    }
+
+    // MARK: - URLSessionWebSocketDelegate methods
+    func urlSession(_ session: URLSession, webSocketTask: URLSessionWebSocketTask, didOpenWithProtocol protocol: String?) {
+        on_open()
+    }
+
+    func urlSession(_ session: URLSession, webSocketTask: URLSessionWebSocketTask, didCloseWith closeCode: URLSessionWebSocketTask.CloseCode, reason: Data?) {
+        on_close(code: closeCode, reason: reason)
+    }
+
+    // MARK: - Send audio packets
+    func send_packet_to_server(_ message: Data) {
+        if !recording && !server_error && !waiting {
+            print("[WARN]: Not sending packet because recording=\(recording), server_error=\(server_error), waiting=\(waiting)")
+            return
+        }
+
+        webSocketTask?.send(.data(message)) { error in
+            if let error = error {
+                print("[ERROR]: Failed to send audio packet: \(error)")
+            }
+        }
+    }
+
+    // MARK: - Close the WebSocket connection
+    func close_websocket() {
+        webSocketTask?.cancel(with: .normalClosure, reason: nil)
+        print("[INFO]: Closed WebSocket connection")
+    }
+
+    // MARK: - Get the client socket
+    func get_client_socket() -> URLSessionWebSocketTask? {
+        return webSocketTask
+    }
+
+    // MARK: - Write the SRT file
+    func write_srt_file(output_path: String = "output.srt") {
+        guard let backend = self.server_backend, backend == "faster_whisper" else { return }
+
+        if transcript.isEmpty, let lastSegment = self.last_segment {
+            transcript.append(lastSegment)
+        } else if let lastSegment = self.last_segment,
+                  let lastText = transcript.last?["text"] as? String,
+                  let segmentText = lastSegment["text"] as? String,
+                  lastText != segmentText {
+            transcript.append(lastSegment)
+        }
+
+        // Delegate to the Utils class
+        Utils.createSrtFile(segments: transcript, outputPath: output_path)
+    }
+
+    // MARK: - Wait before disconnecting
+    func wait_before_disconnect() {
+        guard let lastResponse = last_response_received else { return }
+
+        // Instead of the Python client's busy waiting, sleep for the remaining interval
+        let waitTime = disconnect_if_no_response_for - Date().timeIntervalSince(lastResponse)
+        if waitTime > 0 {
+            Thread.sleep(forTimeInterval: waitTime)
+        }
+    }
+}
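+
+// Usage sketch (hypothetical host/port; assumes a compatible transcription server):
+//
+//     let client = Client(host: "localhost", port: 9090, lang: "en", model: "small") { text, segments in
+//         print("Transcript so far: \(text)")
+//     }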
+
+// MARK: - TranscriptionTeeClient class
+class TranscriptionTeeClient {
+    /**
+     * Client that handles audio recording, streaming, and transcription tasks over
+     * one or more WebSocket connections.
+     *
+     * Acts as a high-level client for audio transcription tasks over WebSocket
+     * connections. It can send audio data to one or more servers for transcription
+     * and receive the transcribed text segments.
+     */
+
+    // MARK: - Properties
+    let clients: [Client]
+    let chunk: Int = 4096
+    let channels: Int = 1
+    let sampleRate: Int = 16000
+    let recordSeconds: Int = 60000
+    let saveOutputRecording: Bool
+    let outputRecordingFilename: String
+    let muteAudioPlayback: Bool
+
+    private var audioEngine: AVAudioEngine?
+    private var inputNode: AVAudioInputNode?
+    private var frames: Data = Data()
+
+    // MARK: - Initialization
+    init(clients: [Client], saveOutputRecording: Bool = false, outputRecordingFilename: String = "./output_recording.wav", muteAudioPlayback: Bool = false) {
+        self.clients = clients
+        if clients.isEmpty {
+            fatalError("At least one client is required.")
+        }
+
+        self.saveOutputRecording = saveOutputRecording
+        self.outputRecordingFilename = outputRecordingFilename
+        self.muteAudioPlayback = muteAudioPlayback
+
+        setupAudioEngine()
+    }
+
+    // MARK: - Audio engine setup
+    private func setupAudioEngine() {
+        audioEngine = AVAudioEngine()
+        inputNode = audioEngine?.inputNode
+
+        let session = AVAudioSession.sharedInstance()
+        do {
+            try session.setCategory(.playAndRecord, mode: .default)
+            try session.setActive(true)
+        } catch {
+            print("[WARN]: Unable to access microphone. \(error)")
+        }
+    }
+
+    // MARK: - Start transcription
+    func start(audio: String? = nil, rtspUrl: String? = nil, hlsUrl: String? = nil, saveFile: String? = nil) {
+        // Verify that at most one source was provided
+        let providedSources = [audio, rtspUrl, hlsUrl].compactMap { $0 }
+        guard providedSources.count <= 1 else {
+            print("[ERROR]: You must provide only one selected source")
+            return
+        }
+
+        print("[INFO]: Waiting for server ready ...")
+
+        // Wait until every client is ready
+        var clientsReady = false
+        while !clientsReady {
+            clientsReady = true
+
+            for client in clients {
+                if !client.recording {
+                    clientsReady = false
+
+                    if client.waiting || client.server_error {
+                        closeAllClients()
+                        return
+                    }
+
+                    // Pause briefly, then check again
+                    Thread.sleep(forTimeInterval: 0.1)
+                }
+            }
+        }
+
+        print("[INFO]: Server Ready!")
+
+        if let hlsUrl = hlsUrl {
+            processHLSStream(hlsUrl: hlsUrl, saveFile: saveFile)
+        } else if let audio = audio {
+            playFile(filename: audio)
+        } else if let rtspUrl = rtspUrl {
+            processRTSPStream(rtspUrl: rtspUrl)
+        } else {
+            record()
+        }
+    }
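+
+    // Calling start() with no arguments records from the microphone;
+    // start(audio: "/path/to/file.wav") streams a local file instead (path is
+    // illustrative). At most one of audio, rtspUrl, and hlsUrl may be given.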
+
+    // MARK: - Close all clients
+    func closeAllClients() {
+        for client in clients {
+            client.close_websocket()
+        }
+    }
+
+    // MARK: - Write SRT files for all clients
+    func writeAllClientsSRT() {
+        for client in clients {
+            client.write_srt_file(output_path: client.srt_file_path)
+        }
+    }
+
+    // MARK: - Multicast packets
+    func multicastPacket(_ packet: Data, unconditional: Bool = false) {
+        for client in clients {
+            if unconditional || client.recording {
+                client.send_packet_to_server(packet)
+            }
+        }
+    }
+
+    // MARK: - Play a file
+    func playFile(filename: String) {
+        // Resample the source audio file first
+        let resampledFile = resampleAudio(filename: filename)
+
+        do {
+            let audioFileURL = URL(fileURLWithPath: resampledFile)
+
+            let audioFile = try AVAudioFile(forReading: audioFileURL)
+            let audioFormat = audioFile.processingFormat
+
+            let audioEngine = AVAudioEngine()
+            let playerNode = AVAudioPlayerNode()
+
+            audioEngine.attach(playerNode)
+            audioEngine.connect(playerNode, to: audioEngine.mainMixerNode, format: audioFormat)
+
+            try audioEngine.start()
+            playerNode.scheduleFile(audioFile, at: nil)
+
+            // Read the audio file into a buffer for processing
+            let buffer = AVAudioPCMBuffer(pcmFormat: audioFormat, frameCapacity: AVAudioFrameCount(chunk))!
+
+            while true {
+                do {
+                    try audioFile.read(into: buffer)
+
+                    // Stop when there are no more frames to read
+                    if buffer.frameLength == 0 {
+                        break
+                    }
+
+                    // Convert to 16-bit PCM
+                    if let pcmData = convertBufferTo16BitPCM(buffer) {
+                        // Convert to Float32 and send
+                        let floatData = convertPCMToFloat32(pcmData)
+                        multicastPacket(floatData)
+                    }
+
+                    // Play back (when muteAudioPlayback is false)
+                    if !muteAudioPlayback {
+                        playerNode.scheduleBuffer(buffer, at: nil, options: [], completionHandler: nil)
+                    }
+
+                    // Stop if no client is recording any more
+                    if !clients.contains(where: { $0.recording }) {
+                        break
+                    }
+
+                } catch {
+                    print("[ERROR]: Error reading audio file: \(error)")
+                    break
+                }
+            }
+
+            // Shutdown handling
+            for client in clients {
+                client.wait_before_disconnect()
+            }
+
+            // Send the end-of-audio signal
+            if let endMessage = Client.END_OF_AUDIO.data(using: .utf8) {
+                multicastPacket(endMessage, unconditional: true)
+            }
+
+            writeAllClientsSRT()
+            playerNode.stop()
+            audioEngine.stop()
+            closeAllClients()
+
+        } catch {
+            print("[ERROR]: Error processing audio file: \(error)")
+            closeAllClients()
+            writeAllClientsSRT()
+        }
+    }
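+
+    // Each packet above carries `chunk` (4096) frames: at 16 kHz mono that is
+    // 4096 / 16000 ≈ 256 ms of audio, sent as 4096 * 4 = 16,384 bytes of Float32.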
+
+    // MARK: - Resample audio
+    private func resampleAudio(filename: String) -> String {
+        return Utils.resample(file: filename, sampleRate: sampleRate)
+    }
+
+    // MARK: - Process an RTSP stream
+    func processRTSPStream(rtspUrl: String) {
+        print("[INFO]: RTSP stream processing is not implemented in the Swift version")
+        // RTSP stream handling still needs a Swift implementation
+
+        for client in clients {
+            client.wait_before_disconnect()
+        }
+
+        if let endMessage = Client.END_OF_AUDIO.data(using: .utf8) {
+            multicastPacket(endMessage, unconditional: true)
+        }
+
+        closeAllClients()
+        writeAllClientsSRT()
+        print("[INFO]: RTSP stream processing finished.")
+    }
+
+    // MARK: - Process an HLS stream
+    func processHLSStream(hlsUrl: String, saveFile: String? = nil) {
+        print("[INFO]: HLS stream processing is not implemented in the Swift version")
+        // HLS stream handling still needs a Swift implementation
+
+        for client in clients {
+            client.wait_before_disconnect()
+        }
+
+        if let endMessage = Client.END_OF_AUDIO.data(using: .utf8) {
+            multicastPacket(endMessage, unconditional: true)
+        }
+
+        closeAllClients()
+        writeAllClientsSRT()
+        print("[INFO]: HLS stream processing finished.")
+    }
+
+    // MARK: - Process an AV stream
+    private func processAVStream(container: Any, streamType: String, saveFile: String? = nil) {
+        print("[INFO]: AV stream processing is not implemented in the Swift version")
+        // AV stream handling still needs a Swift implementation
+    }
+
+    // MARK: - Save an audio chunk
+    private func saveChunk(nAudioFile: Int) {
+        // Write the accumulated frames to a chunk file off the audio thread
+        DispatchQueue.global().async {
+            self.writeAudioFramesToFile(frames: self.frames, fileName: "chunks/\(nAudioFile).wav")
+        }
+    }
+
+    // MARK: - Finalize recording
+    private func finalizeRecording(nAudioFile: Int) {
+        var nextAudioFileIndex = nAudioFile
+
+        if saveOutputRecording && !frames.isEmpty {
+            writeAudioFramesToFile(frames: frames, fileName: "chunks/\(nAudioFile).wav")
+            nextAudioFileIndex += 1
+        }
+
+        audioEngine?.stop()
+        audioEngine?.inputNode.removeTap(onBus: 0)
+
+        closeAllClients()
+
+        if saveOutputRecording {
+            writeOutputRecording(nAudioFile: nextAudioFileIndex)
+        }
+
+        writeAllClientsSRT()
+    }
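+
+    // record() rolls the capture buffer into chunks/<n>.wav once it holds more than
+    // one minute of audio: 60 s * 16000 samples/s * 2 bytes (16-bit PCM) = 1,920,000 bytes.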
+
+    // MARK: - Record
+    func record() {
+        var nAudioFile = 0
+
+        if saveOutputRecording {
+            let fileManager = FileManager.default
+            let chunksDir = "chunks"
+
+            // Remove any previous chunks directory
+            if fileManager.fileExists(atPath: chunksDir) {
+                try? fileManager.removeItem(atPath: chunksDir)
+            }
+
+            // Create a fresh chunks directory
+            try? fileManager.createDirectory(atPath: chunksDir, withIntermediateDirectories: true)
+        }
+
+        guard let audioEngine = audioEngine, let inputNode = inputNode else {
+            print("[ERROR]: Audio engine not initialized")
+            return
+        }
+
+        let format = inputNode.outputFormat(forBus: 0)
+
+        // Set up the interruption handler
+        var isInterrupted = false
+        NotificationCenter.default.addObserver(forName: AVAudioSession.interruptionNotification, object: nil, queue: nil) { [weak self] notification in
+            guard let self = self else { return }
+
+            if let userInfo = notification.userInfo,
+               let typeValue = userInfo[AVAudioSessionInterruptionTypeKey] as? UInt,
+               let type = AVAudioSession.InterruptionType(rawValue: typeValue) {
+
+                if type == .began {
+                    isInterrupted = true
+                    self.finalizeRecording(nAudioFile: nAudioFile)
+                }
+            }
+        }
+
+        // Install a tap on the input node
+        inputNode.installTap(onBus: 0, bufferSize: AVAudioFrameCount(chunk), format: format) { [weak self] buffer, time in
+            guard let self = self, !isInterrupted else { return }
+
+            // Stop processing if no client is recording
+            if !self.clients.contains(where: { $0.recording }) {
+                return
+            }
+
+            // Convert the buffer to 16-bit PCM Data
+            if let pcmData = self.convertBufferToRawPCM(buffer) {
+                // Append to the frame buffer
+                self.frames.append(pcmData)
+
+                // Convert to Float32 and send
+                let floatData = self.convertPCMToFloat32(pcmData)
+                self.multicastPacket(floatData)
+
+                // Save a chunk once more than a minute has been recorded
+                if self.frames.count > 60 * self.sampleRate * 2 { // 16-bit (2-byte) PCM
+                    if self.saveOutputRecording {
+                        self.saveChunk(nAudioFile: nAudioFile)
+                        nAudioFile += 1
+                    }
+                    self.frames = Data()
+                }
+            }
+        }
+
+        do {
+            try audioEngine.start()
+            print("[INFO]: AVAudioEngine started")
+
+            // Recording duration (used instead of a timer)
+            let recordDuration = TimeInterval(recordSeconds / 1000) // milliseconds to seconds
+
+            // Wait off the main thread, mirroring the Python client's keyboard-interrupt loop
+            DispatchQueue.global().async {
+                // Run until the maximum duration elapses or the clients stop recording
+                let startTime = Date()
+
+                while Date().timeIntervalSince(startTime) < recordDuration &&
+                      self.clients.contains(where: { $0.recording }) &&
+                      !isInterrupted {
+                    Thread.sleep(forTimeInterval: 0.1)
+                }
+
+                // Finish the recording
+                DispatchQueue.main.async {
+                    self.finalizeRecording(nAudioFile: nAudioFile)
+                }
+            }
+
+        } catch {
+            print("[ERROR]: Could not start audio engine: \(error)")
+        }
+    }
+
+    // MARK: - Write audio frames to a file
+    private func writeAudioFramesToFile(frames: Data, fileName: String) {
+        do {
+            // Note: this writes raw 16-bit PCM frames without a WAV header
+            let url = URL(fileURLWithPath: fileName)
+            try frames.write(to: url)
+            print("[INFO]: Wrote audio frames to \(fileName)")
+        } catch {
+            print("[ERROR]: Failed to write audio frames: \(error)")
+        }
+    }
+
+    // MARK: - Write the final output recording
+    private func writeOutputRecording(nAudioFile: Int) {
+        let fileManager = FileManager.default
+
+        // Collect the paths of the chunk files that exist
+        var inputFiles: [String] = []
+        for i in 0..<nAudioFile {
+            let chunkPath = "chunks/\(i).wav"
+            if fileManager.fileExists(atPath: chunkPath) {
+                inputFiles.append(chunkPath)
+            }
+        }
+
+        // Concatenate the chunk frames into the final recording
+        var combined = Data()
+        for file in inputFiles {
+            if let data = fileManager.contents(atPath: file) {
+                combined.append(data)
+            }
+        }
+        writeAudioFramesToFile(frames: combined, fileName: outputRecordingFilename)
+    }
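+
+    // The converters below use the usual 16-bit scaling: a Float32 sample of 1.0
+    // clamps to Int16 32767 (multiply by 32767), and Int16 -32768 maps back to
+    // Float32 -1.0 (divide by 32768).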
+
+    // MARK: - Convert a buffer to raw PCM
+    private func convertBufferToRawPCM(_ buffer: AVAudioPCMBuffer) -> Data? {
+        let format = buffer.format
+        let frameCount = Int(buffer.frameLength)
+
+        if format.commonFormat == .pcmFormatFloat32 {
+            // Convert from Float32 to Int16
+            guard let floatChannelData = buffer.floatChannelData else {
+                return nil
+            }
+
+            let channelPointer = floatChannelData.pointee
+            var pcmData = Data(capacity: frameCount * MemoryLayout<Int16>.size)
+
+            for i in 0..<frameCount {
+                var sample = Int16(max(-32768.0, min(32767.0, channelPointer[i] * 32767.0)))
+                pcmData.append(Data(bytes: &sample, count: MemoryLayout<Int16>.size))
+            }
+
+            return pcmData
+
+        } else if format.commonFormat == .pcmFormatInt16 {
+            // Already Int16; copy the bytes directly
+            guard let int16ChannelData = buffer.int16ChannelData else {
+                return nil
+            }
+
+            let channelPointer = int16ChannelData.pointee
+            let dataLength = frameCount * MemoryLayout<Int16>.size
+            return Data(bytes: channelPointer, count: dataLength)
+        }
+
+        print("[ERROR]: Unsupported audio format: \(format.commonFormat)")
+        return nil
+    }
+
+    // MARK: - Convert a buffer to 16-bit PCM
+    private func convertBufferTo16BitPCM(_ buffer: AVAudioPCMBuffer) -> Data? {
+        // Check the buffer format and convert accordingly
+        if buffer.format.commonFormat == .pcmFormatInt16 {
+            guard let channelData = buffer.int16ChannelData else {
+                print("int16ChannelData is nil")
+                return nil
+            }
+            let channelPointer = channelData.pointee
+            let dataLength = Int(buffer.frameLength) * MemoryLayout<Int16>.size
+            return Data(bytes: channelPointer, count: dataLength)
+        } else if buffer.format.commonFormat == .pcmFormatFloat32 {
+            guard let floatChannelData = buffer.floatChannelData else {
+                print("floatChannelData is nil")
+                return nil
+            }
+
+            let channelPointer = floatChannelData.pointee
+            let frameLength = Int(buffer.frameLength)
+            var pcmData = Data(capacity: frameLength * MemoryLayout<Int16>.size)
+
+            for i in 0..<frameLength {
+                var sample = Int16(max(-32768.0, min(32767.0, channelPointer[i] * 32767.0)))
+                pcmData.append(Data(bytes: &sample, count: MemoryLayout<Int16>.size))
+            }
+
+            return pcmData
+        } else {
+            print("Unexpected buffer format: \(buffer.format.commonFormat)")
+            return nil
+        }
+    }
+
+    // MARK: - Convert 16-bit PCM to Float32
+    func convertPCMToFloat32(_ pcmData: Data) -> Data {
+        // Convert 16-bit PCM samples to normalized Float32
+        var floatArray = [Float32](repeating: 0, count: pcmData.count / 2)
+
+        pcmData.withUnsafeBytes { (bytes: UnsafeRawBufferPointer) -> Void in
+            if let baseAddress = bytes.baseAddress {
+                let int16Buffer = baseAddress.bindMemory(to: Int16.self, capacity: pcmData.count / 2)
+                for i in 0..<floatArray.count {
+                    floatArray[i] = Float32(int16Buffer[i]) / 32768.0
+                }
+            }
+        }
+
+        return Data(bytes: floatArray, count: floatArray.count * MemoryLayout<Float32>.size)
+    }
+}
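+
+// Usage sketch (hypothetical ports; tees one microphone stream to two servers):
+//
+//     let english = Client(host: "localhost", port: 9090, lang: "en", model: "small")
+//     let translated = Client(host: "localhost", port: 9091, lang: "en", translate: true, model: "small")
+//     TranscriptionTeeClient(clients: [english, translated]).start()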
+
+// MARK: - TranscriptionClient class
+class TranscriptionClient: TranscriptionTeeClient {
+    /**
+     * Client that handles audio transcription over a single WebSocket connection.
+     *
+     * Acts as a high-level client for audio transcription tasks over a WebSocket
+     * connection. It can send audio data to a server for transcription and
+     * receive the transcribed text segments.
+     */
+
+    let client: Client
+
+    init(
+        host: String,
+        port: Int,
+        lang: String? = nil,
+        translate: Bool = false,
+        model: String = "small",
+        use_vad: Bool = true,
+        use_wss: Bool = false,
+        save_output_recording: Bool = false,
+        output_recording_filename: String = "./output_recording.wav",
+        output_transcription_path: String = "./output.srt",
+        log_transcription: Bool = true,
+        max_clients: Int = 4,
+        max_connection_time: Int = 600,
+        mute_audio_playback: Bool = false,
+        send_last_n_segments: Int = 10,
+        no_speech_thresh: Double = 0.45,
+        clip_audio: Bool = false,
+        same_output_threshold: Int = 10,
+        transcription_callback: ((_ text: String, _ segments: [[String: Any]]) -> Void)? = nil
+    ) {
+        // Create the underlying client
+        self.client = Client(
+            host: host,
+            port: port,
+            lang: lang,
+            translate: translate,
+            model: model,
+            srt_file_path: output_transcription_path,
+            use_vad: use_vad,
+            use_wss: use_wss,
+            log_transcription: log_transcription,
+            max_clients: max_clients,
+            max_connection_time: max_connection_time,
+            send_last_n_segments: send_last_n_segments,
+            no_speech_thresh: no_speech_thresh,
+            clip_audio: clip_audio,
+            same_output_threshold: same_output_threshold,
+            transcription_callback: transcription_callback
+        )
+
+        // Validate the file names
+        if save_output_recording && !output_recording_filename.hasSuffix(".wav") {
+            fatalError("Please provide a valid `output_recording_filename`: \(output_recording_filename)")
+        }
+        if !output_transcription_path.hasSuffix(".srt") {
+            fatalError("Please provide a valid `output_transcription_path`: \(output_transcription_path). The file extension should be `.srt`.")
+        }
+
+        // Initialize the parent class
+        super.init(
+            clients: [client],
+            saveOutputRecording: save_output_recording,
+            outputRecordingFilename: output_recording_filename,
+            muteAudioPlayback: mute_audio_playback
+        )
+    }
+}
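+
+// Usage sketch (hypothetical host/port; assumes a compatible transcription server):
+//
+//     let transcriber = TranscriptionClient(host: "localhost", port: 9090, lang: "en", model: "small")
+//     transcriber.start()                               // stream from the microphone
+//     // transcriber.start(audio: "/path/to/audio.wav") // or stream a local file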
\ No newline at end of file
diff --git a/Sources/ViewModels/Audio/utils.swift b/Sources/ViewModels/Audio/utils.swift
new file mode 100644
index 0000000..c78bd7c
--- /dev/null
+++ b/Sources/ViewModels/Audio/utils.swift
@@ -0,0 +1,138 @@
+import Foundation
+import AVFoundation
+
+class Utils {
+
+    /// Clears the screen (works in a console)
+    static func clearScreen() {
+        #if os(macOS)
+        print("\u{001B}[2J\u{001B}[H", terminator: "")
+        #else
+        // Clearing the console is limited on iOS, so print a batch of empty lines
+        for _ in 0..<50 {
+            print("")
+        }
+        #endif
+    }
+
+    /// Formats and prints the transcription text
+    static func printTranscript(_ text: [String]) {
+        let joinedText = text.joined(separator: " ")
+        let width = 60
+
+        var currentLine = ""
+        for word in joinedText.split(separator: " ") {
+            if currentLine.count + word.count + 1 <= width {
+                if !currentLine.isEmpty {
+                    currentLine += " "
+                }
+                currentLine += word
+            } else {
+                print(currentLine)
+                currentLine = String(word)
+            }
+        }
+
+        if !currentLine.isEmpty {
+            print(currentLine)
+        }
+    }
+
+    /// Converts seconds (as a Double) to the SRT time format,
+    /// e.g. formatTime(3661.5) == "01:01:01,500"
+    static func formatTime(_ seconds: Double) -> String {
+        let hours = Int(seconds / 3600)
+        let minutes = Int((seconds.truncatingRemainder(dividingBy: 3600)) / 60)
+        let secs = Int(seconds.truncatingRemainder(dividingBy: 60))
+        let milliseconds = Int((seconds - Double(Int(seconds))) * 1000)
+
+        return String(format: "%02d:%02d:%02d,%03d", hours, minutes, secs, milliseconds)
+    }
+
+    /// Creates an SRT file
+    static func createSrtFile(segments: [[String: Any]], outputPath: String) {
+        var srtContent = ""
+        var segmentNumber = 1
+
+        for segment in segments {
+            guard let start = Double(segment["start"] as? String ?? "0"),
+                  let end = Double(segment["end"] as? String ?? "0"),
+                  let text = segment["text"] as? String else {
+                continue
+            }
+
+            let startTime = formatTime(start)
+            let endTime = formatTime(end)
+
+            srtContent += "\(segmentNumber)\n"
+            srtContent += "\(startTime) --> \(endTime)\n"
+            srtContent += "\(text)\n\n"
+
+            segmentNumber += 1
+        }
+
+        do {
+            try srtContent.write(toFile: outputPath, atomically: true, encoding: .utf8)
+            print("[INFO]: SRT file written to \(outputPath)")
+        } catch {
+            print("[ERROR]: Failed to write SRT file: \(error)")
+        }
+    }
+
+    /// Resamples an audio file to 16 kHz
+    static func resample(file: String, sampleRate: Int = 16000) -> String {
+        let fileURL = URL(fileURLWithPath: file)
+        let fileName = fileURL.deletingPathExtension().lastPathComponent
+        let outputFileName = "\(fileName)_resampled.wav"
+        let outputURL = fileURL.deletingLastPathComponent().appendingPathComponent(outputFileName)
+
+        do {
+            // Read the source audio file
+            let audioFile = try AVAudioFile(forReading: fileURL)
+            let sourceFormat = audioFile.processingFormat
+
+            // Load the source audio data into a buffer
+            let frameCount = UInt32(audioFile.length)
+            let sourceBuffer = AVAudioPCMBuffer(pcmFormat: sourceFormat, frameCapacity: frameCount)!
+            try audioFile.read(into: sourceBuffer)
+
+            // Target audio format (16 kHz, mono, 16-bit PCM)
+            let targetFormat = AVAudioFormat(commonFormat: .pcmFormatInt16,
+                                             sampleRate: Double(sampleRate),
+                                             channels: 1,
+                                             interleaved: true)!
+
+            // Create the converter and convert
+            let converter = AVAudioConverter(from: sourceFormat, to: targetFormat)!
+            let targetBuffer = AVAudioPCMBuffer(pcmFormat: targetFormat,
+                                                frameCapacity: AVAudioFrameCount(Double(frameCount) * Double(sampleRate) / sourceFormat.sampleRate))!
+
+            var error: NSError? = nil
+            // Hand the source buffer to the converter once, then signal end of
+            // stream so the converter does not re-read the same samples
+            var haveProvidedData = false
+            let inputBlock: AVAudioConverterInputBlock = { inNumPackets, outStatus in
+                if haveProvidedData {
+                    outStatus.pointee = .endOfStream
+                    return nil
+                }
+                haveProvidedData = true
+                outStatus.pointee = .haveData
+                return sourceBuffer
+            }
+
+            converter.convert(to: targetBuffer, error: &error, withInputFrom: inputBlock)
+
+            if let error = error {
+                print("[ERROR]: Failed to convert audio: \(error)")
+                return file
+            }
+
+            // Write the converted buffer to a file
+            let outputFile = try AVAudioFile(forWriting: outputURL,
+                                             settings: targetFormat.settings,
+                                             commonFormat: targetFormat.commonFormat,
+                                             interleaved: targetFormat.isInterleaved)
+            try outputFile.write(from: targetBuffer)
+
+            print("[INFO]: Audio resampled to \(outputURL.path)")
+            return outputURL.path
+
+        } catch {
+            print("[ERROR]: Failed to resample audio: \(error)")
+            return file
+        }
+    }
+}
\ No newline at end of file