From 7b9eb786494c2b2e13440dabb2f41209fcfc75df Mon Sep 17 00:00:00 2001 From: Shaun Offenbacher Date: Thu, 15 Feb 2024 13:02:44 -0700 Subject: [PATCH 1/2] made processed-audio more robust to handle form data and raw binary --- scripts/python_recorder_client/main.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/python_recorder_client/main.py b/scripts/python_recorder_client/main.py index eaba25b..6b6269c 100644 --- a/scripts/python_recorder_client/main.py +++ b/scripts/python_recorder_client/main.py @@ -21,8 +21,8 @@ help="API token for authentication with the server.") parser.add_argument('-s', '--seconds', type=int, default=30, help="Duration of each recording segment in seconds. (default 30)") -parser.add_argument('-m', '--sensitivity', type=float, default=35.0, - help="Microphone sensitivity threshold (0.0 to 100.0, default: 35.0).") +parser.add_argument('-m', '--sensitivity', type=float, default=0.0, + help="Microphone sensitivity threshold (0.0 to 100.0, default: 0).") parser.add_argument('-l', '--save', action='store_true', help="Save recordings locally.") parser.add_argument('-v', '--verbose', action='store_true', help="Enable verbose output for debugging.") @@ -69,7 +69,6 @@ def get_base_url(): def store_sound(frames): logger.debug('Store and sending wav.') - # Save the recorded data as a WAV file filename = get_wav_filename() wf = wave.open(filename, 'wb') wf.setnchannels(CHANNELS) @@ -78,11 +77,12 @@ def store_sound(frames): wf.writeframes(b''.join(frames)) wf.close() - files = {'file': open(filename, 'rb')} - response = requests.post(f'{get_base_url()}/functions/v1/process-audio', files=files, headers={ - 'apikey': args.token, - 'Content-Type': 'audio/wav,', - }) + with open(filename, 'rb') as f: + files = {'file': (filename, f, 'audio/wav')} + response = requests.post(f'{get_base_url()}/functions/v1/process-audio', files=files, headers={ + 'Authorization': f'Bearer {args.token}', + 'apikey': args.token, + }, timeout=540) logger.info(response.text) From 0dc51f83ca8c8894b508f2a1857c1d393447c03a Mon Sep 17 00:00:00 2001 From: Shaun Offenbacher Date: Thu, 15 Feb 2024 13:06:48 -0700 Subject: [PATCH 2/2] made processed-audio more robust to handle form data and raw binary --- .gitignore | 4 +- docs/guides/use_python_recorder.md | 4 +- supabase/functions/process-audio/index.ts | 178 ++++++++++++---------- 3 files changed, 105 insertions(+), 81 deletions(-) diff --git a/.gitignore b/.gitignore index 84166e6..535b156 100644 --- a/.gitignore +++ b/.gitignore @@ -66,4 +66,6 @@ _site/ # Ignore folders generated by Bundler .bundle/ -vendor/ \ No newline at end of file +vendor/ + +venv/ \ No newline at end of file diff --git a/docs/guides/use_python_recorder.md b/docs/guides/use_python_recorder.md index 43080b7..b581c63 100644 --- a/docs/guides/use_python_recorder.md +++ b/docs/guides/use_python_recorder.md @@ -69,7 +69,7 @@ Here's a brief overview of the script's parameters: -u: --base-url(required): The URL to which the recordings are sent. -t: --token(required): API token for server authentication. -s: --seconds: Duration of recording segments in seconds (default: 30). - -m: --sensitivity: Microphone sensitivity threshold (0.0 to 100.0, default: 35.0). Set to 0 for continuous recording. + -m: --sensitivity: Microphone sensitivity threshold (0.0 to 100.0, default: 0). Set to 0 for continuous recording. -l: --save: Save recordings locally. -v: --verbose: Enable verbose output for debugging. ``` @@ -91,5 +91,5 @@ And that is it, you should now be able to record things locally, and test the fr #### **Important Notes** - Ensure your base_url and token are correct to successfully send recordings. -- Adjust the sensitivity to your microphone setup to avoid missing recordings or record silance. +- Adjust the sensitivity to your microphone setup to avoid missing recordings or record silance. Too high will make the audio unable to be transcribed. - Use the save option if you want to keep local copies of the recordings (file names "recording{timestamp}.wav"). diff --git a/supabase/functions/process-audio/index.ts b/supabase/functions/process-audio/index.ts index f87f05a..28c8a45 100644 --- a/supabase/functions/process-audio/index.ts +++ b/supabase/functions/process-audio/index.ts @@ -1,97 +1,119 @@ -import { serve } from "https://deno.land/std@0.170.0/http/server.ts"; -import OpenAI, { toFile } from "https://deno.land/x/openai@v4.26.0/mod.ts"; +import { serve } from 'https://deno.land/std/http/server.ts'; +import { multiParser } from 'https://deno.land/x/multiparser@0.114.0/mod.ts'; +import OpenAI, { toFile } from 'https://deno.land/x/openai@v4.26.0/mod.ts'; -import { corsHeaders } from "../common/cors.ts"; -import { supabaseClient } from "../common/supabaseClient.ts"; +import { corsHeaders } from '../common/cors.ts'; +import { supabaseClient } from '../common/supabaseClient.ts'; const processAudio = async (req: Request) => { - - if (req.method !== "POST") { - return new Response("Method Not Allowed", { status: 405 }); - } + if (req.method !== 'POST') { + return new Response('Method Not Allowed', { status: 405 }); + } - const supabase = supabaseClient(req); - const openaiClient = new OpenAI({ - apiKey: Deno.env.get("OPENAI_API_KEY"), - }); + const supabase = supabaseClient(req); + const openaiClient = new OpenAI({ + apiKey: Deno.env.get('OPENAI_API_KEY'), + }); - // Validate Content-Type - const contentType = req.headers.get("Content-Type") || ""; - if (!contentType.includes("audio/wav") && !contentType.includes("audio/x-wav")) { - return new Response("Unsupported Media Type", { status: 415 }); - } + const contentType = req.headers.get('Content-Type') || ''; + let arrayBuffer: ArrayBuffer; + let filenameTimestamp = `audio_${Date.now()}.wav`; - const arrayBuffer = await req.arrayBuffer(); + if (contentType.includes('multipart/form-data')) { + const form = await multiParser(req); + if (!form || !form.files || !form.files.file) { + return new Response('File not found in form', { + status: 400, + headers: corsHeaders, + }); + } + console.log('Form:', form); + const file = form.files.file; + arrayBuffer = file.content.buffer; + filenameTimestamp = file.filename || filenameTimestamp; + } else { + arrayBuffer = await req.arrayBuffer(); + } - let transcript: string; - let embeddings: any; - try { - const filenameTimestamp = `adeus_wav_${Date.now()}.wav`; - const wavFile = await toFile(arrayBuffer, filenameTimestamp); + let transcript: string; + let embeddings: any; + try { + const filenameTimestamp = `adeus_wav_${Date.now()}.wav`; + const wavFile = await toFile(arrayBuffer, filenameTimestamp); + console.log(typeof wavFile, wavFile); - // const { data, error } = await supabase.storage - // .from("test") - // .upload(filenameTimestamp, wavFile); + // const { data, error } = await supabase.storage + // .from("test") + // .upload(filenameTimestamp, wavFile); - // if (error) { - // console.error("Error uploading file:", error); - // } + // if (error) { + // console.error("Error uploading file:", error); + // } - const transcriptResponse = await openaiClient.audio.transcriptions.create({ - file: await toFile(wavFile, filenameTimestamp), - model: "whisper-1", - prompt: - 'If this audio file does not contain any speech, please return "None"', - }); - transcript = transcriptResponse.text; - let transcriptLowered = transcript.toLowerCase(); - // ("thank" in transcriptLowered && - // "watch" in transcriptLowered && - // "video" in transcriptLowered) - if ( - transcript == "None" || - transcript == "" || - transcript == null || - (transcriptLowered.includes("thank") && - transcriptLowered.includes("watch")) - ) { - return new Response(JSON.stringify({ message: "No transcript found." }), { - headers: { ...corsHeaders, "Content-Type": "application/json" }, - status: 200, - }); - } + const transcriptResponse = + await openaiClient.audio.transcriptions.create({ + file: wavFile, + model: 'whisper-1', + prompt: 'If this audio file does not contain any speech, please return "None"', + }); + transcript = transcriptResponse.text; + let transcriptLowered = transcript.toLowerCase(); + // ("thank" in transcriptLowered && + // "watch" in transcriptLowered && + // "video" in transcriptLowered) + if ( + transcript == 'None' || + transcript == '' || + transcript == null || + (transcriptLowered.includes('thank') && + transcriptLowered.includes('watch')) + ) { + return new Response( + JSON.stringify({ message: 'No transcript found.' }), + { + headers: { + ...corsHeaders, + 'Content-Type': 'application/json', + }, + status: 200, + } + ); + } - console.log("Transcript:", transcript); + console.log('Transcript:', transcript); - const embeddingsResponse = await openaiClient.embeddings.create({ - model: "text-embedding-ada-002", - input: transcript.replace(/\n/g, " ").replace(/\s{2,}/g, " "), - }); - embeddings = embeddingsResponse.data[0].embedding; - console.log("Embeddings:", embeddings); + const embeddingsResponse = await openaiClient.embeddings.create({ + model: 'text-embedding-ada-002', + input: transcript.replace(/\n/g, ' ').replace(/\s{2,}/g, ' '), + }); + embeddings = embeddingsResponse.data[0].embedding; + console.log('Embeddings:', embeddings); - const { data, error } = await supabase - .from("records") - .insert({ raw_text: transcript, embeddings: embeddings }); + const { data, error } = await supabase + .from('records') + .insert({ raw_text: transcript, embeddings: embeddings }); - if (error) { - console.error("Error inserting record:", error); + if (error) { + console.error('Error inserting record:', error); + } + } catch (error) { + console.error('Transcription error:', error); + return new Response(JSON.stringify({ error: error.message }), { + headers: { ...corsHeaders, 'Content-Type': 'application/json' }, + status: 500, + }); } - } catch (error) { - console.error("Transcription error:", error); - return new Response(JSON.stringify({ error: error.message }), { - headers: { ...corsHeaders, "Content-Type": "application/json" }, - status: 500, - }); - } - return new Response( - JSON.stringify({ message: "Audio transcribed successfully.", transcript }), - { - headers: { ...corsHeaders, "Content-Type": "application/json" }, - status: 200, - } - ); + return new Response( + JSON.stringify({ + message: 'Audio transcribed successfully.', + transcript, + }), + { + headers: { ...corsHeaders, 'Content-Type': 'application/json' }, + status: 200, + } + ); }; serve(processAudio);