ADDED: Audio Summarizer using Gemini 1.5 Pro

AquibPy · Apr 18, 2024 · 369f1aa · 369f1aa
1 parent 8a3c8bc
commit 369f1aa
Show file tree

Hide file tree

Showing 4 changed files with 47 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -78,6 +78,11 @@ Generative AI, powered by advanced machine learning models, enables the creation
 - **Route:** `/RAG_PDF_Groq`
 - **Description:** This endpoint uses the pdf and give the answer based on the prompt provided using Groq,with a default model input of llama2-70b-4096, but offering alternatives like mixtral-8x7b-32768 and gemma-7b-it.
 
+### 13. Audio Summarizer
+
+- **Route:** `/summarize_audio`
+- **Description:** Endpoint to summarize an uploaded audio file using gemini-1.5-pro-latest.
+
 ## Usage
 
 Each endpoint accepts specific parameters as described in the respective endpoint documentation. Users can make POST requests to these endpoints with the required parameters to perform the desired tasks.

diff --git a/api.py b/api.py
@@ -9,7 +9,8 @@
 from settings import invoice_prompt,youtube_transcribe_prompt,text2sql_prompt,EMPLOYEE_DB,GEMINI_PRO,GEMINI_PRO_1_5
 from mongo import MongoDB
 from helper_functions import get_qa_chain,get_gemini_response,get_url_doc_qa,extract_transcript_details,\
-    get_gemini_response_health,get_gemini_pdf,read_sql_query,remove_substrings,questions_generator,groq_pdf
+    get_gemini_response_health,get_gemini_pdf,read_sql_query,remove_substrings,questions_generator,groq_pdf,\
+    summarize_audio
 from langchain_groq import ChatGroq
 from langchain.chains import ConversationChain
 from langchain.chains.conversation.memory import ConversationBufferWindowMemory
@@ -308,4 +309,20 @@ async def talk_pd_groq(pdf: UploadFile = File(...),prompt: str = Form(...),
         print(result)
         return ResponseText(response=out)
     except Exception as e:
-        return ResponseText(response=f"Error: {str(e)}")
+        return ResponseText(response=f"Error: {str(e)}")
+
+@app.post("/summarize_audio",description="""Endpoint to summarize an uploaded audio file using gemini-1.5-pro-latest.""")
+async def summarize_audio_endpoint(audio_file: UploadFile = File(...)):
+    try:
+        summary_text = await summarize_audio(audio_file)
+        db = MongoDB()
+        payload = {
+            "endpoint" : "/summarize_audio",
+            "output" : summary_text
+        }
+        mongo_data = {"Document": payload}
+        result = db.insert_data(mongo_data)
+        print(result)
+        return ResponseText(response=summary_text)
+    except Exception as e:
+        return {"error": str(e)}
diff --git a/data/harvard.wav b/data/harvard.wav
diff --git a/helper_functions.py b/helper_functions.py
@@ -20,6 +20,7 @@
 from PyPDF2 import PdfReader
 import sqlite3
 from langchain_community.embeddings import GooglePalmEmbeddings
+import tempfile
 
 load_dotenv()
 genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
@@ -212,6 +213,28 @@ def groq_pdf(pdf,model):
     )
     return rag_chain
 
+async def summarize_audio(audio_file):
+    """Summarize the audio using Google's Generative API."""
+    model = genai.GenerativeModel("models/gemini-1.5-pro-latest")
+
+    # Save the audio file to a temporary file
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.'+audio_file.filename.split('.')[-1]) as tmp_file:
+            tmp_file.write(await audio_file.read())
+            audio_file_path = tmp_file.name
+    except Exception as e:
+        raise Exception(f"Error handling uploaded file: {e}")
+
+    audio_file = genai.upload_file(path=audio_file_path)
+    response = model.generate_content(
+        [
+            "Please summarize the following audio.",
+            audio_file
+        ]
+    )
+
+    return response.text
+
 if __name__ == "__main__":
     create_vector_db()
     chain = get_qa_chain()