From 51e974b3bf91ae129b073558a2974c8f56384b95 Mon Sep 17 00:00:00 2001
From: AquibPy
Date: Thu, 9 May 2024 15:23:19 +0530
Subject: [PATCH] ADDED: more test cases for new endpoints

---
 .github/workflows/ci-cd.yaml |  2 +-
 data/llm.txt                 | 35 ++++++++++++++++
 helper_functions.py          |  4 +-
 settings.py                  | 19 ++++++++-
 test_main.py                 | 79 +++++++++++++++++++++++++++++++++++-
 5 files changed, 134 insertions(+), 5 deletions(-)
 create mode 100644 data/llm.txt

diff --git a/.github/workflows/ci-cd.yaml b/.github/workflows/ci-cd.yaml
index c94daba..90a0fde 100644
--- a/.github/workflows/ci-cd.yaml
+++ b/.github/workflows/ci-cd.yaml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.9", "3.10", "3.11"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
       - name: Checkout Repository
         uses: actions/checkout@v2
diff --git a/data/llm.txt b/data/llm.txt
new file mode 100644
index 0000000..72c6f39
--- /dev/null
+++ b/data/llm.txt
@@ -0,0 +1,35 @@
+Title: Enhancing Large Language Models with Perturbative Fine-Tuning: A Comprehensive Guide
+
+Introduction:
+In the realm of natural language processing (NLP), large language models (LLMs) have revolutionized the way we interact with and analyze textual data.
+However, while pre-trained LLMs offer remarkable capabilities out-of-the-box, there's often a need to fine-tune them for specific domains or tasks to unlock their full potential.
+Enter Perturbative Fine-Tuning (PEFT), a cutting-edge method developed by Dr. Kosaraju that enables researchers and practitioners to enhance LLMs with domain-specific knowledge.
+In this blog post, we'll explore the ins and outs of PEFT and how it empowers us to tailor LLMs to our specific needs.
+
+Understanding PEFT:
+PEFT is a systematic approach to fine-tuning pre-trained LLMs by introducing domain-specific perturbations and iteratively refining the model based on task-specific data.
+At its core, PEFT leverages the wealth of information encoded in pre-trained LLMs and augments it with domain-specific knowledge, resulting in models that excel in specialized tasks.
+
+The PEFT Process:
+1. Data Preparation: The journey begins with gathering and preprocessing domain-specific datasets tailored to the task at hand.
+This step ensures that the fine-tuned model learns from relevant examples and nuances inherent to the target domain.
+
+2. Model Selection: Next, we select a suitable pre-trained LLM as the foundation for our fine-tuning process.
+The choice of base model depends on factors such as architecture, pre-training data, and computational resources.
+
+3. Perturbation and Fine-Tuning: Here comes the heart of PEFT.
+We introduce domain-specific perturbations to the pre-trained model, guiding it to adapt and specialize in the target domain through iterative fine-tuning steps.
+
+4. Evaluation and Refinement: Finally, we evaluate the performance of the fine-tuned model using appropriate metrics and benchmarks.
+Based on the results, we refine the model further, iterating until satisfactory performance is achieved.
+
+Case Studies and Experiments:
+To illustrate the effectiveness of PEFT, let's delve into some real-world case studies and experiments.
+From legal text classification to sentiment analysis, PEFT consistently demonstrates its prowess in enhancing LLMs for diverse applications.
+Through these examples, we witness firsthand how PEFT empowers researchers and practitioners to unlock new possibilities and push the boundaries of NLP.
+
+Conclusion:
+In a world where the demand for specialized NLP solutions continues to grow, Perturbative Fine-Tuning emerges as a game-changer.
+By seamlessly integrating domain-specific knowledge into pre-trained LLMs, PEFT equips us with powerful tools to tackle complex tasks and domains with ease.
+As we embark on this journey of fine-tuning and specialization, let's embrace PEFT as a guiding light,
+illuminating the path towards unparalleled performance and innovation in natural language processing.
\ No newline at end of file
diff --git a/helper_functions.py b/helper_functions.py
index 8302213..2ad30f7 100644
--- a/helper_functions.py
+++ b/helper_functions.py
@@ -2,7 +2,7 @@
 from settings import GOOGLE_EMBEDDING,FAQ_FILE,INSTRUCTOR_EMBEDDING,VECTORDB_PATH,qa_prompt,\
     prompt_pdf,question_prompt_template,question_refine_template, GEMINI_PRO
 from langchain_google_genai import GoogleGenerativeAI,GoogleGenerativeAIEmbeddings,ChatGoogleGenerativeAI
-from langchain.document_loaders.csv_loader import CSVLoader
+from langchain_community.document_loaders import CSVLoader
 from langchain_community.document_loaders import UnstructuredURLLoader,PyPDFLoader,WebBaseLoader
 from langchain.docstore.document import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter,TokenTextSplitter
@@ -191,7 +191,7 @@ def questions_generator(doc):
                                           question_prompt=PROMPT_QUESTIONS,
                                           refine_prompt=REFINE_PROMPT_QUESTIONS)
 
-    ques = ques_gen_chain.run(document_ques_gen)
+    ques = ques_gen_chain.invoke(document_ques_gen)
     return ques
 
 def groq_pdf(pdf,model):
diff --git a/settings.py b/settings.py
index 03a53ea..13cdc78 100644
--- a/settings.py
+++ b/settings.py
@@ -80,4 +80,21 @@
 "Animagine_xl" : "https://api-inference.huggingface.co/models/cagliostrolab/animagine-xl-3.0",
 "Stable_Diffusion_base" : "https://api-inference.huggingface.co/models/stabilityai/stable-diffusion-xl-base-1.0",
 "Stable_Diffusion_v2" : "https://api-inference.huggingface.co/models/stabilityai/stable-diffusion-2-1",
-}
\ No newline at end of file
+}
+
+summary_para = """
+In the vast landscape of human history, civilizations have risen and fallen, leaving behind legacies that shape our present.
+From the ancient civilizations of Mesopotamia and Egypt, where the foundations of writing, agriculture, and governance were laid,
+to the grand empires of Rome and China, which expanded their reach through conquest and trade, the story of humanity is one of ambition, innovation, and conflict.
+The Middle Ages saw the emergence of feudalism in Europe, characterized by the exchange of land for loyalty and protection,
+while the Islamic Golden Age ushered in a period of scientific, artistic, and philosophical advancement in the Muslim world.
+The Renaissance in Europe sparked a revival of classical learning and ushered in an era of exploration and discovery,
+leading to the age of Enlightenment, where reason and empiricism challenged traditional authority.
+The Industrial Revolution transformed societies with technological advancements, urbanization,
+and shifts in economic production, while the 20th century witnessed unprecedented global conflicts,
+technological leaps, and social revolutions. Today, in the 21st century,
+we stand at the intersection of unprecedented technological advancement and pressing global challenges,
+navigating issues of climate change, political polarization, and the ethical implications of artificial intelligence.
+As we reflect on the journey of humanity, from ancient civilizations to the digital age, we are reminded of our shared past and
+the collective responsibility to shape a more equitable and sustainable future.
+"""
\ No newline at end of file
diff --git a/test_main.py b/test_main.py
index 263c048..e8f1276 100644
--- a/test_main.py
+++ b/test_main.py
@@ -1,6 +1,7 @@
 from fastapi.testclient import TestClient
 from api import app
 from mongo import MongoDB
+from settings import summary_para
 
 client = TestClient(app)
 db = MongoDB()
@@ -58,4 +59,80 @@ def test_questions_generator():
     # Make a request to the questions_generator endpoint
     response = client.post("/questions_generator", files={"pdf": pdf_file})
     assert response.status_code == 200
-    assert "response" in response.json()
\ No newline at end of file
+    assert "response" in response.json()
+
+def test_chat_groq():
+    question = "What is the capital of France?"
+    model = "mixtral-8x7b-32768"
+    conversational_memory_length = 5
+    # Send the POST request
+    data = {
+        "question": question,
+        "model": model,
+        "conversational_memory_length": conversational_memory_length
+    }
+    response = client.post("/chat_groq", data=data)
+
+    assert response.status_code == 200
+
+    # Assert the response content
+    response_data = response.json()
+    assert "Chatbot" in response_data
+    assert isinstance(response_data["Chatbot"], str)
+
+def test_text_summarizer_groq():
+    data = {
+        "input_text": summary_para
+    }
+    response = client.post("/text_summarizer_groq", data=data)
+    assert response.status_code == 200
+
+    response_data = response.json()
+    assert "Summary" in response_data
+    assert isinstance(response_data["Summary"], str)
+    assert len(response_data["Summary"]) > 0
+
+def test_summarize_audio():
+    audio_file_path = "data/harvard.wav"
+
+    with open(audio_file_path, "rb") as audio_file:
+        files = {"audio_file": audio_file}
+        response = client.post("/summarize_audio", files=files)
+
+    assert response.status_code == 200
+
+    response_data = response.json()
+    assert "response" in response_data
+    assert isinstance(response_data["response"], str)
+    assert len(response_data["response"]) > 0
+
+def test_qa_url_doc_with_url():
+    url = ["https://huggingface.co/blog/merve/quantization"]
+    prompt = "What is the GPTQ Quantization?"
+
+    data = {
+        "url": url,
+        "prompt": prompt
+    }
+    response = client.post("/qa_url_doc", data=data)
+
+    assert response.status_code == 200
+
+    response_data = response.json()
+    assert "response" in response_data
+    assert isinstance(response_data["response"], str)
+
+def test_qa_url_doc_with_document():
+    document_file_path = "data/llm.txt"
+    prompt = "What is the main point discussed in the document?"
+
+    with open(document_file_path, "rb") as document_file:
+        files = {"documents": ("document.txt", document_file, "text/plain")}
+        data = {"prompt": prompt}
+        response = client.post("/qa_url_doc", files=files, data=data)
+
+    assert response.status_code == 200
+
+    response_data = response.json()
+    assert "response" in response_data
+    assert isinstance(response_data["response"], str)
\ No newline at end of file
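
Note: the tests added above exercise live Groq- and Gemini-backed endpoints, so they assume valid API keys and network access at test time. A minimal sketch of how such a test could be guarded to skip cleanly when credentials are absent (GROQ_API_KEY is the variable the groq SDK reads by default; this is an assumption, adjust if this project loads its key differently):

import os

import pytest
from fastapi.testclient import TestClient

from api import app

client = TestClient(app)

# Skip live-endpoint tests when no Groq credentials are configured, e.g. on
# forks whose CI cannot see the repository secrets.
requires_groq = pytest.mark.skipif(
    os.getenv("GROQ_API_KEY") is None,
    reason="requires a live Groq API key",
)

@requires_groq
def test_chat_groq_guarded():
    # Same request as test_chat_groq above, but only runs when a key exists.
    data = {
        "question": "What is the capital of France?",
        "model": "mixtral-8x7b-32768",
        "conversational_memory_length": 5,
    }
    response = client.post("/chat_groq", data=data)
    assert response.status_code == 200
    assert "Chatbot" in response.json()

The same marker could be applied to the other Groq-backed tests; matrix jobs without secrets then report the tests as skipped instead of failing.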