From 7ab0c1ae10eaab679908c88ccb6d15ba61ab1fe8 Mon Sep 17 00:00:00 2001 From: Ishankoradia <39583356+Ishankoradia@users.noreply.github.com> Date: Tue, 23 Apr 2024 22:22:45 +0530 Subject: [PATCH] Mvp for llm4dev (#34) * migrations for file, knowledge_category and mapping embeddings to a file * document upload now maps embeddings to its file model * resetting the correct migrations * added organization to knowledge category * api to add a knowledge category for an org * validation; knowledge category names to be unique * api to delete the knowledge category; will wipe out all files and embeddings * added knowledge category filter in create chat api and also added a get all categories api * api to get all files of an organization * api to delete a file; category compulsory for an org * use category_id in document upload and chat api --- llm/api.py | 242 +++++++++++++++++- ...3_knowledgecategory_file_embedding_file.py | 44 ++++ .../0014_alter_file_knowledge_category.py | 19 ++ llm/models.py | 33 +++ llm/urls.py | 30 +++ 5 files changed, 362 insertions(+), 6 deletions(-) create mode 100644 llm/migrations/0013_knowledgecategory_file_embedding_file.py create mode 100644 llm/migrations/0014_alter_file_knowledge_category.py diff --git a/llm/api.py b/llm/api.py index e64a750..ff7d326 100644 --- a/llm/api.py +++ b/llm/api.py @@ -1,6 +1,8 @@ +import uuid import os import django import json +import openai from logging import basicConfig, INFO, getLogger from pypdf import PdfReader @@ -12,7 +14,6 @@ from rest_framework.decorators import api_view from rest_framework.parsers import MultiPartParser from rest_framework.views import APIView -import openai from llm.utils.prompt import ( context_prompt_messages, @@ -20,7 +21,7 @@ count_tokens_for_text, ) from llm.utils.general import generate_session_id -from llm.models import Organization, Embedding, Message +from llm.models import Organization, Embedding, Message, File, KnowledgeCategory basicConfig(level=INFO) @@ -49,6 +50,12 @@ def 
create_chat(request): openai.api_key = organization.openai_key + knowledge_cat = None + if "category_id" in request.data: + knowledge_cat = KnowledgeCategory.objects.filter( + id=request.data["category_id"] + ).first() + question = request.data.get("question").strip() system_prompt = ( request.data.get("system_prompt", None) or organization.system_prompt @@ -120,11 +127,18 @@ def create_chat(request): model="text-embedding-ada-002", input=question )["data"][0]["embedding"] + embedding_results_query = Embedding.objects + + if knowledge_cat: + embedding_results_query = embedding_results_query.filter( + file__knowledge_category=knowledge_cat + ) + embedding_results = ( - Embedding.objects.alias( + embedding_results_query.alias( distance=L2Distance("text_vectors", prompt_embeddings), ) - .filter(distance__gt=0.7) + # .filter(distance__gt=0.7) .order_by("-distance") ) logger.info( @@ -236,9 +250,35 @@ def post(self, request, format=None): openai.api_key = org.openai_key - file = request.data["file"] + request_file = request.data["file"] + + if "category_id" not in request.data: + return JsonResponse( + {"error": f"Please provide a category"}, + status=status.HTTP_400_BAD_REQUEST, + ) + + knowledge_cat = KnowledgeCategory.objects.filter( + id=request.data["category_id"] + ).first() - pdf_reader = PdfReader(file) + if not knowledge_cat: + return JsonResponse( + {"error": f"Category does not exist, please create one first"}, + status=status.HTTP_404_NOT_FOUND, + ) + + logger.info("Using Knowledge Category : %s", knowledge_cat) + + logger.info("Uploading file %s", request_file.name) + + # Create the file object + file = File.objects.create( + knowledge_category=knowledge_cat, + name=request_file.name, + ) + + pdf_reader = PdfReader(request_file) for page in pdf_reader.pages: page_text = page.extract_text().replace("\n", " ") @@ -259,6 +299,7 @@ def post(self, request, format=None): text_vectors=embeddings, organization=org, num_tokens=count_tokens_for_text(page_text), + 
file=file, ) return JsonResponse({"msg": f"Uploaded file {file.name} successfully"}) @@ -333,6 +374,7 @@ def set_examples_text(request): ' """ try: + org: Organization = request.org logger.info(f"processing set examples text request for org {org.name}") @@ -382,3 +424,191 @@ def set_openai_key(request): {"error": f"Something went wrong"}, status=status.HTTP_500_INTERNAL_SERVER_ERROR, ) + + +@api_view(["POST"]) +def create_knowledge_category(request): + """ + Create a new category for an org + """ + try: + org: Organization = request.org + + name = request.data.get("name") + + if KnowledgeCategory.objects.filter(name=name, org=org).exists(): + return JsonResponse( + {"error": f"Knowledge Category with name {name} already exists"}, + status=status.HTTP_400_BAD_REQUEST, + ) + + knowledge_cat = KnowledgeCategory.objects.create(name=name.strip(), org=org) + + return JsonResponse( + { + "name": knowledge_cat.name, + "uuid": knowledge_cat.uuid, + "id": knowledge_cat.id, + }, + status=status.HTTP_200_OK, + ) + + except Exception as error: + logger.error(f"Error: {error}") + return JsonResponse( + {"error": f"Something went wrong"}, + status=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + + +@api_view(["GET"]) +def get_knowledge_categories(request): + """ + Fetches all categories for an org + """ + try: + org: Organization = request.org + + return JsonResponse( + { + "data": [ + { + "name": knowledge_cat.name, + "uuid": knowledge_cat.uuid, + "id": knowledge_cat.id, + } + for knowledge_cat in KnowledgeCategory.objects.filter(org=org).all() + ] + }, + status=status.HTTP_200_OK, + ) + + except Exception as error: + logger.error(f"Error: {error}") + return JsonResponse( + {"error": f"Something went wrong"}, + status=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + + +@api_view(["DELETE"]) +def delete_knowledge_category(request, category_uuid): + """ + Example request body: + + ' + Question: Peshab ki jagah se kharash ho rahi hai + Chatbot Answer in Hindi: aapakee samasya ke lie 
dhanyavaad. yah peshaab ke samay kharaash kee samasya ho sakatee hai. ise yoorinaree traikt inphekshan (uti) kaha jaata hai. yoorinaree traikt imphekshan utpann hone ka mukhy kaaran aantarik inphekshan ho sakata hai. + ' + """ + try: + org: Organization = request.org + + try: + uuid.UUID( + category_uuid + ) # This will raise a ValueError if uuid_str is not a valid UUID + except ValueError: + return JsonResponse( + {"error": "Invalid UUID"}, + status=status.HTTP_400_BAD_REQUEST, + ) + + knowledge_cat = KnowledgeCategory.objects.filter( + uuid=category_uuid, org=org + ).first() + + if not knowledge_cat: + return JsonResponse( + {"error": f"Knowledge Category does not exists"}, + status=status.HTTP_400_BAD_REQUEST, + ) + + knowledge_cat.delete() + + return JsonResponse( + {"msg": f"Category deleted successfully"}, + status=status.HTTP_200_OK, + ) + + except Exception as error: + logger.error(f"Error: {error}") + return JsonResponse( + {"error": f"Something went wrong"}, + status=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + + +@api_view(["GET"]) +def get_documents(request): + """ + Fetches all documents uploaded by the org + """ + try: + org: Organization = request.org + + return JsonResponse( + { + "data": [ + { + "name": file.name, + "uuid": file.uuid, + "category": { + "name": file.knowledge_category.name, + "uuid": file.knowledge_category.uuid, + "id": file.knowledge_category.id, + }, + } + for file in File.objects.filter(knowledge_category__org=org).all() + ] + }, + status=status.HTTP_200_OK, + ) + + except Exception as error: + logger.error(f"Error: {error}") + return JsonResponse( + {"error": f"Something went wrong"}, + status=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + + +@api_view(["DELETE"]) +def delete_document(request, file_uuid): + """ + Fetches all documents uploaded by the org + """ + try: + org: Organization = request.org + + try: + uuid.UUID( + file_uuid + ) # This will raise a ValueError if uuid_str is not a valid UUID + except ValueError: + return 
JsonResponse( + {"error": "Invalid UUID"}, + status=status.HTTP_400_BAD_REQUEST, + ) + + file = File.objects.filter(uuid=file_uuid, knowledge_category__org=org).first() + + if not file: + return JsonResponse( + {"error": f"Document does not exists"}, + status=status.HTTP_400_BAD_REQUEST, + ) + + file.delete() + + return JsonResponse( + {"msg": f"File and its embeddings deleted successfully"}, + status=status.HTTP_200_OK, + ) + + except Exception as error: + logger.error(f"Error: {error}") + return JsonResponse( + {"error": f"Something went wrong"}, + status=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) diff --git a/llm/migrations/0013_knowledgecategory_file_embedding_file.py b/llm/migrations/0013_knowledgecategory_file_embedding_file.py new file mode 100644 index 0000000..0562a89 --- /dev/null +++ b/llm/migrations/0013_knowledgecategory_file_embedding_file.py @@ -0,0 +1,44 @@ +# Generated by Django 4.2.6 on 2024-04-17 03:00 + +from django.db import migrations, models +import django.db.models.deletion +import uuid + + +class Migration(migrations.Migration): + + dependencies = [ + ('llm', '0012_embedding_num_tokens'), + ] + + operations = [ + migrations.CreateModel( + name='KnowledgeCategory', + fields=[ + ('id', models.AutoField(primary_key=True, serialize=False)), + ('uuid', models.UUIDField(default=uuid.uuid4, editable=False, unique=True)), + ('name', models.CharField(default='default', max_length=255, unique=True)), + ('org', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='llm.organization')), + ], + options={ + 'db_table': 'knowledge_category', + }, + ), + migrations.CreateModel( + name='File', + fields=[ + ('id', models.AutoField(primary_key=True, serialize=False)), + ('uuid', models.UUIDField(default=uuid.uuid4, editable=False, unique=True)), + ('name', models.CharField(max_length=255)), + ('knowledge_category', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, to='llm.knowledgecategory')), + ], + options={ + 
'db_table': 'files', + }, + ), + migrations.AddField( + model_name='embedding', + name='file', + field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, to='llm.file'), + ), + ] diff --git a/llm/migrations/0014_alter_file_knowledge_category.py b/llm/migrations/0014_alter_file_knowledge_category.py new file mode 100644 index 0000000..9f7fea2 --- /dev/null +++ b/llm/migrations/0014_alter_file_knowledge_category.py @@ -0,0 +1,19 @@ +# Generated by Django 4.2.6 on 2024-04-17 03:38 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('llm', '0013_knowledgecategory_file_embedding_file'), + ] + + operations = [ + migrations.AlterField( + model_name='file', + name='knowledge_category', + field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, to='llm.knowledgecategory'), + ), + ] diff --git a/llm/models.py b/llm/models.py index c64b0e8..f6a73ef 100644 --- a/llm/models.py +++ b/llm/models.py @@ -1,3 +1,4 @@ +import uuid from django.db import models from pgvector.django import VectorField @@ -37,6 +38,37 @@ class Meta: db_table = "organization" +class KnowledgeCategory(models.Model): + """ + Model to store the knowledge category + Documents and their embeddings will now be associated with a knowledge category + """ + + id = models.AutoField(primary_key=True) + uuid = models.UUIDField(unique=True, editable=False, default=uuid.uuid4) + name = models.CharField(max_length=255, unique=True, default="default") + org = models.ForeignKey(Organization, on_delete=models.CASCADE) + + class Meta: + db_table = "knowledge_category" + + +class File(models.Model): + """ + Store the details of all the file/document uploaded + """ + + id = models.AutoField(primary_key=True) + uuid = models.UUIDField(unique=True, editable=False, default=uuid.uuid4) + knowledge_category = models.ForeignKey( + KnowledgeCategory, on_delete=models.CASCADE, null=True + ) + 
name = models.CharField(max_length=255) + + class Meta: + db_table = "files" + + class Embedding(models.Model): id = models.AutoField(primary_key=True) source_name = models.TextField() @@ -44,6 +76,7 @@ class Embedding(models.Model): text_vectors = VectorField(dimensions=1536, null=True) organization = models.ForeignKey(Organization, on_delete=models.CASCADE) num_tokens = models.IntegerField(default=0) + file = models.ForeignKey(File, on_delete=models.CASCADE, null=True) class Meta: db_table = "embedding" diff --git a/llm/urls.py b/llm/urls.py index 05b4464..c2dc4a6 100644 --- a/llm/urls.py +++ b/llm/urls.py @@ -25,6 +25,11 @@ set_evaluator_prompt, set_examples_text, set_openai_key, + create_knowledge_category, + get_knowledge_categories, + delete_knowledge_category, + get_documents, + delete_document, ) urlpatterns = [ @@ -35,4 +40,29 @@ path("api/evaluator_prompt", set_evaluator_prompt, name="set_evaluator_prompt"), path("api/examples_text", set_examples_text, name="set_examples_text"), path("api/openai_key", set_openai_key, name="set_openai_key"), + path( + "api/knowledge/category", + create_knowledge_category, + name="create_knowledge_category", + ), + path( + "api/knowledge/category/get", + get_knowledge_categories, + name="get_knowledge_categories", + ), + path( + "api/knowledge/category/<str:category_uuid>", + delete_knowledge_category, + name="delete_knowledge_category", + ), + path( + "api/files", + get_documents, + name="get_documents", + ), + path( + "api/files/<str:file_uuid>", + delete_document, + name="delete_document", + ), ]