diff --git a/.env.template b/.env.template index ba04c704..b007d62b 100644 --- a/.env.template +++ b/.env.template @@ -5,7 +5,7 @@ SUPABASE_READ_ONLY= SUPABASE_JWT_SECRET= MATERIALS_SUPABASE_TABLE=uiuc_chatbot -NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE=documents +SUPABASE_DOCUMENTS_TABLE=documents # QDRANT QDRANT_COLLECTION_NAME=uiuc-chatbot diff --git a/.trunk/.gitignore b/.trunk/.gitignore index 1e246529..15966d08 100644 --- a/.trunk/.gitignore +++ b/.trunk/.gitignore @@ -6,3 +6,4 @@ plugins user_trunk.yaml user.yaml +tmp diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml index 86a70d55..4186a1e2 100644 --- a/.trunk/trunk.yaml +++ b/.trunk/trunk.yaml @@ -2,12 +2,12 @@ # To learn more about the format of this file, see https://docs.trunk.io/reference/trunk-yaml version: 0.1 cli: - version: 1.18.0 + version: 1.20.1 # Trunk provides extensibility via plugins. (https://docs.trunk.io/plugins) plugins: sources: - id: trunk - ref: v1.3.0 + ref: v1.4.3 uri: https://github.com/trunk-io/plugins # Many linters and tools depend on runtimes - configure them here. (https://docs.trunk.io/runtimes) runtimes: @@ -18,20 +18,26 @@ runtimes: # This is the section where you manage your linters. (https://docs.trunk.io/check/configuration) # - osv-scanner@1.5.0 # too sensitive, causing failures that make devs skip checks. 
lint: + disabled: + - black enabled: + # - black@24.2.0 + # - osv-scanner@1.6.2 + - trufflehog@3.67.7 - yapf@0.40.2 + - isort@5.13.2 - actionlint@1.6.26 - - bandit@1.7.5 - - checkov@3.1.9 + - bandit@1.7.7 + - checkov@3.2.22 - git-diff-check - - markdownlint@0.37.0 + - markdownlint@0.39.0 - oxipng@9.0.0 - - prettier@3.1.0 - - ruff@0.1.7 + - prettier@3.2.5 + - ruff@0.2.2 - shellcheck@0.9.0 - shfmt@3.6.0 - - trivy@0.48.0 - - yamllint@1.33.0 + - trivy@0.49.1 + - yamllint@1.35.1 ignore: - linters: [ALL] paths: diff --git a/ai_ta_backend/aws.py b/ai_ta_backend/aws.py deleted file mode 100644 index 0c0cfa33..00000000 --- a/ai_ta_backend/aws.py +++ /dev/null @@ -1,64 +0,0 @@ -import os -import uuid -from multiprocessing import Lock, cpu_count -from multiprocessing.pool import ThreadPool -from typing import List, Optional - -import boto3 - - -def upload_data_files_to_s3(course_name: str, localdir: str) -> Optional[List[str]]: - """Uploads all files in localdir to S3 bucket. - - Args: - course_name (str): Official course name on our website. - localdir (str): Local directory to upload from, coursera-dl downloads to this directory. - - Returns: - Optional[List[str]]: A list of S3 paths, the final resting place of uploads, or None if no files were uploaded. - """ - s3 = boto3.client( - 's3', - aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), - aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), - ) - - filenames = [] - for root, _subdirs, files in os.walk(localdir): - for filename in files: - filenames.append(os.path.join(root, filename)) - - if not filenames: - print(f"No files to upload. 
Not found in: {localdir}") - return None - - print(f"Files to upload: {filenames}") - print("About to upload...") - - s3_paths = [] - s3_paths_lock = Lock() - - def upload(myfile): - # get the last part of the path and append unique ID before it - directory, old_filename = os.path.split(myfile) - new_filename = str(uuid.uuid4()) + '-' + old_filename - new_filepath = os.path.join(directory, new_filename) - - s3_file = f"courses/{course_name}/{os.path.basename(new_filepath)}" - s3.upload_file(myfile, os.getenv('S3_BUCKET_NAME'), s3_file) - with s3_paths_lock: - s3_paths.append(s3_file) - - # only 2 parallel uploads because we're getting rate limited with min_p=6... 503 errors. - min_p = 2 - max_p = cpu_count() - num_procs = max(min(len(filenames), max_p), min_p) - pool = ThreadPool(processes=num_procs) - pool.map(upload, filenames) - - print("All data files uploaded to S3 successfully.") - return s3_paths - - -if __name__ == '__main__': - pass \ No newline at end of file diff --git a/ai_ta_backend/beam/.beamignore b/ai_ta_backend/beam/.beamignore new file mode 100644 index 00000000..f787b8ee --- /dev/null +++ b/ai_ta_backend/beam/.beamignore @@ -0,0 +1,7 @@ +.venv +venv +.idea +.vscode +.git +*.pyc +__pycache__ diff --git a/ai_ta_backend/extreme_context_stuffing.py b/ai_ta_backend/beam/OpenaiEmbeddings.py similarity index 100% rename from ai_ta_backend/extreme_context_stuffing.py rename to ai_ta_backend/beam/OpenaiEmbeddings.py diff --git a/ai_ta_backend/beam/ingest.py b/ai_ta_backend/beam/ingest.py new file mode 100644 index 00000000..a2f930a0 --- /dev/null +++ b/ai_ta_backend/beam/ingest.py @@ -0,0 +1,1241 @@ +""" +To deploy: beam deploy ingest.py --profile caii-ncsa +Use CAII gmail to auth. 
+""" +import asyncio +import inspect +import json +import logging +import mimetypes +import os +import re +import shutil +import traceback +import uuid +from pathlib import Path +from tempfile import NamedTemporaryFile +from typing import Any, Callable, Dict, List, Optional, Union + +import beam +import boto3 +import fitz +import openai +import pytesseract +import sentry_sdk +import supabase +from beam import App, QueueDepthAutoscaler, Runtime # RequestLatencyAutoscaler, +from bs4 import BeautifulSoup +from git.repo import Repo +from langchain.document_loaders import ( + Docx2txtLoader, + GitLoader, + PythonLoader, + SRTLoader, + TextLoader, + UnstructuredExcelLoader, + UnstructuredPowerPointLoader, +) +from langchain.document_loaders.csv_loader import CSVLoader +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.schema import Document +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain.vectorstores import Qdrant +from nomic_logging import delete_from_document_map, log_to_document_map +from OpenaiEmbeddings import OpenAIAPIProcessor +from PIL import Image +from posthog import Posthog +from pydub import AudioSegment +from qdrant_client import QdrantClient, models +from qdrant_client.models import PointStruct + +# from langchain.schema.output_parser import StrOutputParser +# from langchain.chat_models import AzureChatOpenAI + +requirements = [ + "openai<1.0", + "supabase==2.0.2", + "tiktoken==0.5.1", + "boto3==1.28.79", + "qdrant-client==1.7.3", + "langchain==0.0.331", + "posthog==3.1.0", + "pysrt==1.1.2", + "docx2txt==0.8", + "pydub==0.25.1", + "ffmpeg-python==0.2.0", + "ffprobe==0.5", + "ffmpeg==1.4", + "PyMuPDF==1.23.6", + "pytesseract==0.3.10", # image OCR" + "openpyxl==3.1.2", # excel" + "networkx==3.2.1", # unused part of excel partitioning :(" + "python-pptx==0.6.23", + "unstructured==0.10.29", + "GitPython==3.1.40", + "beautifulsoup4==4.12.2", + "sentry-sdk==1.39.1", + "nomic==2.0.14", +] + +# TODO: 
consider adding workers. They share CPU and memory https://docs.beam.cloud/deployment/autoscaling#worker-use-cases +app = App("ingest", + runtime=Runtime( + cpu=1, + memory="3Gi", + image=beam.Image( + python_version="python3.10", + python_packages=requirements, + commands=["apt-get update && apt-get install -y ffmpeg tesseract-ocr"], + ), + )) + +# MULTI_QUERY_PROMPT = hub.pull("langchain-ai/rag-fusion-query-generation") +OPENAI_API_TYPE = "azure" # "openai" or "azure" + + +def loader(): + """ + The loader function will run once for each worker that starts up. https://docs.beam.cloud/deployment/loaders + """ + openai.api_key = os.getenv("VLADS_OPENAI_KEY") + + # vector DB + qdrant_client = QdrantClient( + url=os.getenv('QDRANT_URL'), + api_key=os.getenv('QDRANT_API_KEY'), + ) + + vectorstore = Qdrant(client=qdrant_client, + collection_name=os.environ['QDRANT_COLLECTION_NAME'], + embeddings=OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE, + openai_api_key=os.getenv('VLADS_OPENAI_KEY'))) + + # S3 + s3_client = boto3.client( + 's3', + aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), + aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), + ) + + # Create a Supabase client + supabase_client = supabase.create_client( # type: ignore + supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY']) + + # llm = AzureChatOpenAI( + # temperature=0, + # deployment_name=os.getenv('AZURE_OPENAI_ENGINE'), #type:ignore + # openai_api_base=os.getenv('AZURE_OPENAI_ENDPOINT'), #type:ignore + # openai_api_key=os.getenv('AZURE_OPENAI_KEY'), #type:ignore + # openai_api_version=os.getenv('OPENAI_API_VERSION'), #type:ignore + # openai_api_type=OPENAI_API_TYPE) + + posthog = Posthog(sync_mode=True, project_api_key=os.environ['POSTHOG_API_KEY'], host='https://app.posthog.com') + sentry_sdk.init( + dsn="https://examplePublicKey@o0.ingest.sentry.io/0", + + # Enable performance monitoring + enable_tracing=True, + ) + + return qdrant_client, vectorstore, s3_client, 
supabase_client, posthog + + +# autoscaler = RequestLatencyAutoscaler(desired_latency=30, max_replicas=2) +autoscaler = QueueDepthAutoscaler(max_tasks_per_replica=300, max_replicas=3) + + +# Triggers determine how your app is deployed +@app.rest_api( + workers=4, + # callback_url='https://uiuc-chat-git-refactoringesttobeamserverless-kastanday.vercel.app/api/UIUC-api/ingestCallback', + max_pending_tasks=15_000, + max_retries=3, + timeout=-1, + loader=loader, + autoscaler=autoscaler) +def ingest(**inputs: Dict[str, Any]): + qdrant_client, vectorstore, s3_client, supabase_client, posthog = inputs["context"] + + course_name: List[str] | str = inputs.get('course_name', '') + s3_paths: List[str] | str = inputs.get('s3_paths', '') + url: List[str] | str | None = inputs.get('url', None) + base_url: List[str] | str | None = inputs.get('base_url', None) + readable_filename: List[str] | str = inputs.get('readable_filename', '') + content: str | None = inputs.get('content', None) # is webtext if content exists + + print( + f"In top of /ingest route. 
course: {course_name}, s3paths: {s3_paths}, readable_filename: {readable_filename}, base_url: {base_url}, url: {url}, content: {content}" + ) + + ingester = Ingest(qdrant_client, vectorstore, s3_client, supabase_client, posthog) + + if content: + success_fail_dict = ingester.ingest_single_web_text(course_name, base_url, url, content, readable_filename) + elif readable_filename == '': + success_fail_dict = ingester.bulk_ingest(course_name, s3_paths, base_url=base_url, url=url) + else: + success_fail_dict = ingester.bulk_ingest(course_name, + s3_paths, + readable_filename=readable_filename, + base_url=base_url, + url=url) + print("Final success_fail_dict: ", success_fail_dict) + return json.dumps(success_fail_dict) + + +class Ingest(): + + def __init__(self, qdrant_client, vectorstore, s3_client, supabase_client, posthog): + self.qdrant_client = qdrant_client + self.vectorstore = vectorstore + self.s3_client = s3_client + self.supabase_client = supabase_client + self.posthog = posthog + + def bulk_ingest(self, course_name: str, s3_paths: Union[str, List[str]], **kwargs) -> Dict: + """ + Bulk ingest a list of s3 paths into the vectorstore, and also into the supabase database. 
+ """ + + def _ingest_single(ingest_method: Callable, s3_path, *args, **kwargs): + """Handle running an arbitrary ingest function for an individual file.""" + # RUN INGEST METHOD + ret = ingest_method(s3_path, *args, **kwargs) + if ret == "Success": + success_status['success_ingest'].append(s3_path) + else: + success_status['failure_ingest'].append(s3_path) + + # πŸ‘‡πŸ‘‡πŸ‘‡πŸ‘‡ ADD NEW INGEST METHODS HERE πŸ‘‡πŸ‘‡πŸ‘‡πŸ‘‡πŸŽ‰ + file_ingest_methods = { + '.html': self._ingest_html, + '.py': self._ingest_single_py, + '.pdf': self._ingest_single_pdf, + '.txt': self._ingest_single_txt, + '.md': self._ingest_single_txt, + '.srt': self._ingest_single_srt, + '.vtt': self._ingest_single_vtt, + '.docx': self._ingest_single_docx, + '.ppt': self._ingest_single_ppt, + '.pptx': self._ingest_single_ppt, + '.xlsx': self._ingest_single_excel, + '.xls': self._ingest_single_excel, + '.csv': self._ingest_single_csv, + '.png': self._ingest_single_image, + '.jpg': self._ingest_single_image, + } + + # Ingest methods via MIME type (more general than filetype) + mimetype_ingest_methods = { + 'video': self._ingest_single_video, + 'audio': self._ingest_single_video, + 'text': self._ingest_single_txt, + 'image': self._ingest_single_image, + } + # πŸ‘†πŸ‘†πŸ‘†πŸ‘† ADD NEW INGEST METHODhe πŸ‘†πŸ‘†πŸ‘†πŸ‘†πŸŽ‰ + + print(f"Top of ingest, Course_name {course_name}. S3 paths {s3_paths}") + success_status = {"success_ingest": [], "failure_ingest": []} + try: + if isinstance(s3_paths, str): + s3_paths = [s3_paths] + + for s3_path in s3_paths: + file_extension = Path(s3_path).suffix + with NamedTemporaryFile(suffix=file_extension) as tmpfile: + self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile) + mime_type = str(mimetypes.guess_type(tmpfile.name, strict=False)[0]) + mime_category = mime_type.split('/')[0] if '/' in mime_type else mime_type + + if file_extension in file_ingest_methods: + # Use specialized functions when possible, fallback to mimetype. 
Else raise error. + ingest_method = file_ingest_methods[file_extension] + _ingest_single(ingest_method, s3_path, course_name, **kwargs) + elif mime_category in mimetype_ingest_methods: + # fallback to MimeType + print("mime category", mime_category) + ingest_method = mimetype_ingest_methods[mime_category] + _ingest_single(ingest_method, s3_path, course_name, **kwargs) + else: + # No supported ingest... Fallback to attempting utf-8 decoding, otherwise fail. + try: + self._ingest_single_txt(s3_path, course_name) + success_status['success_ingest'].append(s3_path) + print("βœ… FALLBACK TO UTF-8 INGEST WAS SUCCESSFUL :) ") + except Exception as e: + print( + f"We don't have a ingest method for this filetype: {file_extension}. As a last-ditch effort, we tried to ingest the file as utf-8 text, but that failed too. File is unsupported: {s3_path}. UTF-8 ingest error: {e}" + ) + success_status['failure_ingest'].append( + f"We don't have a ingest method for this filetype: {file_extension} (with generic type {mime_type}), for file: {s3_path}" + ) + self.posthog.capture( + 'distinct_id_of_the_user', + event='ingest_failure', + properties={ + 'course_name': + course_name, + 's3_path': + s3_paths, + 'kwargs': + kwargs, + 'error': + f"We don't have a ingest method for this filetype: {file_extension} (with generic type {mime_type}), for file: {s3_path}" + }) + + return success_status + except Exception as e: + err = f"❌❌ Error in /ingest: `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc() + + success_status['failure_ingest'].append(f"MAJOR ERROR IN /bulk_ingest: Error: {err}") + self.posthog.capture('distinct_id_of_the_user', + event='ingest_failure', + properties={ + 'course_name': course_name, + 's3_path': s3_paths, + 'kwargs': kwargs, + 'error': err + }) + + sentry_sdk.capture_exception(e) + print(f"MAJOR ERROR IN /bulk_ingest: Error: {str(e)}") + return success_status + + def ingest_single_web_text(self, course_name: str, base_url: str, url: 
str, content: str, readable_filename: str): + """Crawlee integration + """ + self.posthog.capture('distinct_id_of_the_user', + event='ingest_single_web_text_invoked', + properties={ + 'course_name': course_name, + 'base_url': base_url, + 'url': url, + 'content': content, + 'title': readable_filename + }) + try: + # if not, ingest the text + text = [content] + metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': '', + 'readable_filename': readable_filename, + 'pagenumber': '', + 'timestamp': '', + 'url': url, + 'base_url': base_url, + }] + self.split_and_upload(texts=text, metadatas=metadatas) + self.posthog.capture('distinct_id_of_the_user', + event='ingest_single_web_text_succeeded', + properties={ + 'course_name': course_name, + 'base_url': base_url, + 'url': url, + 'title': readable_filename + }) + + return f"βœ… Success for web text. title: {readable_filename}, url: {url}, " + except Exception as e: + + err = f"❌❌ Error in (web text ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( + ) + print(err) + sentry_sdk.capture_exception(e) + return str(err) + + def _ingest_single_py(self, s3_path: str, course_name: str, **kwargs): + try: + file_name = s3_path.split("/")[-1] + file_path = "media/" + file_name # download from s3 to local folder for ingest + + self.s3_client.download_file(os.getenv('S3_BUCKET_NAME'), s3_path, file_path) + + loader = PythonLoader(file_path) + documents = loader.load() + + texts = [doc.page_content for doc in documents] + + metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': kwargs.get('readable_filename', + Path(s3_path).name[37:]), + 'pagenumber': '', + 'timestamp': '', + 'url': '', + 'base_url': '', + } for doc in documents] + #print(texts) + os.remove(file_path) + + success_or_failure = self.split_and_upload(texts=texts, metadatas=metadatas) + print("Python ingest: ", success_or_failure) + return 
success_or_failure + + except Exception as e: + err = f"❌❌ Error in (Python ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( + ) + print(err) + sentry_sdk.capture_exception(e) + return err + + def _ingest_single_vtt(self, s3_path: str, course_name: str, **kwargs): + """ + Ingest a single .vtt file from S3. + """ + try: + with NamedTemporaryFile() as tmpfile: + # download from S3 into vtt_tmpfile + self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile) + loader = TextLoader(tmpfile.name) + documents = loader.load() + texts = [doc.page_content for doc in documents] + + metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': kwargs.get('readable_filename', + Path(s3_path).name[37:]), + 'pagenumber': '', + 'timestamp': '', + 'url': '', + 'base_url': '', + } for doc in documents] + + success_or_failure = self.split_and_upload(texts=texts, metadatas=metadatas) + return success_or_failure + except Exception as e: + err = f"❌❌ Error in (VTT ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( + ) + print(err) + sentry_sdk.capture_exception(e) + return err + + def _ingest_html(self, s3_path: str, course_name: str, **kwargs) -> str: + print(f"IN _ingest_html s3_path `{s3_path}` kwargs: {kwargs}") + try: + response = self.s3_client.get_object(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path) + raw_html = response['Body'].read().decode('utf-8') + + soup = BeautifulSoup(raw_html, 'html.parser') + title = s3_path.replace("courses/" + course_name, "") + title = title.replace(".html", "") + title = title.replace("_", " ") + title = title.replace("/", " ") + title = title.strip() + title = title[37:] # removing the uuid prefix + text = [soup.get_text()] + + metadata: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': str(title), # adding str to avoid 
error: unhashable type 'slice' + 'url': kwargs.get('url', ''), + 'base_url': kwargs.get('base_url', ''), + 'pagenumber': '', + 'timestamp': '', + }] + + success_or_failure = self.split_and_upload(text, metadata) + print(f"_ingest_html: {success_or_failure}") + return success_or_failure + except Exception as e: + err: str = f"ERROR IN _ingest_html: {e}\nTraceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore + print(err) + sentry_sdk.capture_exception(e) + return err + + def _ingest_single_video(self, s3_path: str, course_name: str, **kwargs) -> str: + """ + Ingest a single video file from S3. + """ + print("Starting ingest video or audio") + try: + # check for file extension + file_ext = Path(s3_path).suffix + openai.api_key = os.getenv('OPENAI_API_KEY') + transcript_list = [] + with NamedTemporaryFile(suffix=file_ext) as video_tmpfile: + # download from S3 into an video tmpfile + self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=video_tmpfile) + # extract audio from video tmpfile + mp4_version = AudioSegment.from_file(video_tmpfile.name, file_ext[1:]) + + # save the extracted audio as a temporary webm file + with NamedTemporaryFile(suffix=".webm", dir="media", delete=False) as webm_tmpfile: + mp4_version.export(webm_tmpfile, format="webm") + + # check file size + file_size = os.path.getsize(webm_tmpfile.name) + # split the audio into 25MB chunks + if file_size > 26214400: + # load the webm file into audio object + full_audio = AudioSegment.from_file(webm_tmpfile.name, "webm") + file_count = file_size // 26214400 + 1 + split_segment = 35 * 60 * 1000 + start = 0 + count = 0 + + while count < file_count: + with NamedTemporaryFile(suffix=".webm", dir="media", delete=False) as split_tmp: + if count == file_count - 1: + # last segment + audio_chunk = full_audio[start:] + else: + audio_chunk = full_audio[start:split_segment] + + audio_chunk.export(split_tmp.name, 
format="webm") + + # transcribe the split file and store the text in dictionary + with open(split_tmp.name, "rb") as f: + transcript = openai.Audio.transcribe("whisper-1", f) + transcript_list.append(transcript['text']) # type: ignore + start += split_segment + split_segment += split_segment + count += 1 + os.remove(split_tmp.name) + else: + # transcribe the full audio + with open(webm_tmpfile.name, "rb") as f: + transcript = openai.Audio.transcribe("whisper-1", f) + transcript_list.append(transcript['text']) # type: ignore + + os.remove(webm_tmpfile.name) + + text = [txt for txt in transcript_list] + metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': kwargs.get('readable_filename', + Path(s3_path).name[37:]), + 'pagenumber': '', + 'timestamp': text.index(txt), + 'url': '', + 'base_url': '', + } for txt in text] + + self.split_and_upload(texts=text, metadatas=metadatas) + return "Success" + except Exception as e: + err = f"❌❌ Error in (VIDEO ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( + ) + print(err) + sentry_sdk.capture_exception(e) + return str(err) + + def _ingest_single_docx(self, s3_path: str, course_name: str, **kwargs) -> str: + try: + with NamedTemporaryFile() as tmpfile: + self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile) + + loader = Docx2txtLoader(tmpfile.name) + documents = loader.load() + + texts = [doc.page_content for doc in documents] + metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': kwargs.get('readable_filename', + Path(s3_path).name[37:]), + 'pagenumber': '', + 'timestamp': '', + 'url': '', + 'base_url': '', + } for doc in documents] + + self.split_and_upload(texts=texts, metadatas=metadatas) + return "Success" + except Exception as e: + err = f"❌❌ Error in (DOCX ingest): `{inspect.currentframe().f_code.co_name}`: 
{e}\nTraceback:\n", traceback.format_exc( + ) + print(err) + sentry_sdk.capture_exception(e) + return str(err) + + def _ingest_single_srt(self, s3_path: str, course_name: str, **kwargs) -> str: + try: + with NamedTemporaryFile() as tmpfile: + # download from S3 into pdf_tmpfile + self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile) + + loader = SRTLoader(tmpfile.name) + documents = loader.load() + + texts = [doc.page_content for doc in documents] + metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': kwargs.get('readable_filename', + Path(s3_path).name[37:]), + 'pagenumber': '', + 'timestamp': '', + 'url': '', + 'base_url': '', + } for doc in documents] + + self.split_and_upload(texts=texts, metadatas=metadatas) + return "Success" + except Exception as e: + err = f"❌❌ Error in (SRT ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( + ) + print(err) + sentry_sdk.capture_exception(e) + return str(err) + + def _ingest_single_excel(self, s3_path: str, course_name: str, **kwargs) -> str: + try: + with NamedTemporaryFile() as tmpfile: + # download from S3 into pdf_tmpfile + self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile) + + loader = UnstructuredExcelLoader(tmpfile.name, mode="elements") + # loader = SRTLoader(tmpfile.name) + documents = loader.load() + + texts = [doc.page_content for doc in documents] + metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': kwargs.get('readable_filename', + Path(s3_path).name[37:]), + 'pagenumber': '', + 'timestamp': '', + 'url': '', + 'base_url': '', + } for doc in documents] + + self.split_and_upload(texts=texts, metadatas=metadatas) + return "Success" + except Exception as e: + err = f"❌❌ Error in (Excel/xlsx ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", 
traceback.format_exc( + ) + print(err) + sentry_sdk.capture_exception(e) + return str(err) + + def _ingest_single_image(self, s3_path: str, course_name: str, **kwargs) -> str: + try: + with NamedTemporaryFile() as tmpfile: + # download from S3 into pdf_tmpfile + self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile) + """ + # Unstructured image loader makes the install too large (700MB --> 6GB. 3min -> 12 min build times). AND nobody uses it. + # The "hi_res" strategy will identify the layout of the document using detectron2. "ocr_only" uses pdfminer.six. https://unstructured-io.github.io/unstructured/core/partition.html#partition-image + loader = UnstructuredImageLoader(tmpfile.name, unstructured_kwargs={'strategy': "ocr_only"}) + documents = loader.load() + """ + + res_str = pytesseract.image_to_string(Image.open(tmpfile.name)) + print("IMAGE PARSING RESULT:", res_str) + documents = [Document(page_content=res_str)] + + texts = [doc.page_content for doc in documents] + metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': kwargs.get('readable_filename', + Path(s3_path).name[37:]), + 'pagenumber': '', + 'timestamp': '', + 'url': '', + 'base_url': '', + } for doc in documents] + + self.split_and_upload(texts=texts, metadatas=metadatas) + return "Success" + except Exception as e: + err = f"❌❌ Error in (png/jpg ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( + ) + print(err) + sentry_sdk.capture_exception(e) + return str(err) + + def _ingest_single_csv(self, s3_path: str, course_name: str, **kwargs) -> str: + try: + with NamedTemporaryFile() as tmpfile: + # download from S3 into pdf_tmpfile + self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile) + + loader = CSVLoader(file_path=tmpfile.name) + documents = loader.load() + + texts = [doc.page_content for doc in documents] + 
metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': kwargs.get('readable_filename', + Path(s3_path).name[37:]), + 'pagenumber': '', + 'timestamp': '', + 'url': '', + 'base_url': '', + } for doc in documents] + + self.split_and_upload(texts=texts, metadatas=metadatas) + return "Success" + except Exception as e: + err = f"❌❌ Error in (CSV ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( + ) + print(err) + sentry_sdk.capture_exception(e) + return str(err) + + def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs): + """ + Both OCR the PDF. And grab the first image as a PNG. + LangChain `Documents` have .metadata and .page_content attributes. + Be sure to use TemporaryFile() to avoid memory leaks! + """ + print("IN PDF ingest: s3_path: ", s3_path, "and kwargs:", kwargs) + + try: + with NamedTemporaryFile() as pdf_tmpfile: + # download from S3 into pdf_tmpfile + self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=pdf_tmpfile) + ### READ OCR of PDF + doc = fitz.open(pdf_tmpfile.name) # type: ignore + + # improve quality of the image + zoom_x = 2.0 # horizontal zoom + zoom_y = 2.0 # vertical zoom + mat = fitz.Matrix(zoom_x, zoom_y) # zoom factor 2 in each dimension + + pdf_pages_OCRed: List[Dict] = [] + for i, page in enumerate(doc): # type: ignore + + # UPLOAD FIRST PAGE IMAGE to S3 + if i == 0: + with NamedTemporaryFile(suffix=".png") as first_page_png: + pix = page.get_pixmap(matrix=mat) + pix.save(first_page_png) # store image as a PNG + + s3_upload_path = str(Path(s3_path)).rsplit('.pdf')[0] + "-pg1-thumb.png" + first_page_png.seek(0) # Seek the file pointer back to the beginning + with open(first_page_png.name, 'rb') as f: + print("Uploading image png to S3") + self.s3_client.upload_fileobj(f, os.getenv('S3_BUCKET_NAME'), s3_upload_path) + + # Extract text + text = page.get_text().encode("utf8").decode("utf8", 
errors='ignore') # get plain text (is in UTF-8) + pdf_pages_OCRed.append(dict(text=text, page_number=i, readable_filename=Path(s3_path).name[37:])) + + metadatas: List[Dict[str, Any]] = [ + { + 'course_name': course_name, + 's3_path': s3_path, + 'pagenumber': page['page_number'] + 1, # +1 for human indexing + 'timestamp': '', + 'readable_filename': kwargs.get('readable_filename', page['readable_filename']), + 'url': kwargs.get('url', ''), + 'base_url': kwargs.get('base_url', ''), + } for page in pdf_pages_OCRed + ] + pdf_texts = [page['text'] for page in pdf_pages_OCRed] + + success_or_failure = self.split_and_upload(texts=pdf_texts, metadatas=metadatas) + return success_or_failure + except Exception as e: + err = f"❌❌ Error in (PDF ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( + ) # type: ignore + print(err) + sentry_sdk.capture_exception(e) + return err + return "Success" + + def _ingest_single_txt(self, s3_path: str, course_name: str, **kwargs) -> str: + """Ingest a single .txt or .md file from S3. + Args: + s3_path (str): A path to a .txt file in S3 + course_name (str): The name of the course + Returns: + str: "Success" or an error message + """ + print("In text ingest") + try: + # NOTE: slightly different method for .txt files, no need for download. 
It's part of the 'body' + response = self.s3_client.get_object(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path) + print("s3 Resonse:", response) + text = response['Body'].read().decode('utf-8') + print("Text from s3:", text) + text = [text] + + metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': kwargs.get('readable_filename', + Path(s3_path).name[37:]), + 'pagenumber': '', + 'timestamp': '', + 'url': '', + 'base_url': '', + }] + print("Prior to ingest", metadatas) + + success_or_failure = self.split_and_upload(texts=text, metadatas=metadatas) + return success_or_failure + except Exception as e: + err = f"❌❌ Error in (TXT ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( + ) + print(err) + sentry_sdk.capture_exception(e) + return str(err) + + def _ingest_single_ppt(self, s3_path: str, course_name: str, **kwargs) -> str: + """ + Ingest a single .ppt or .pptx file from S3. + """ + try: + with NamedTemporaryFile() as tmpfile: + # download from S3 into pdf_tmpfile + #print("in ingest PPTX") + self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile) + + loader = UnstructuredPowerPointLoader(tmpfile.name) + documents = loader.load() + + texts = [doc.page_content for doc in documents] + metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': kwargs.get('readable_filename', + Path(s3_path).name[37:]), + 'pagenumber': '', + 'timestamp': '', + 'url': '', + 'base_url': '', + } for doc in documents] + + self.split_and_upload(texts=texts, metadatas=metadatas) + return "Success" + except Exception as e: + err = f"❌❌ Error in (PPTX ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( + ) + print(err) + sentry_sdk.capture_exception(e) + return str(err) + + def ingest_github(self, github_url: str, course_name: str) -> str: + """ + Clones the 
given GitHub URL and uses Langchain to load data. + 1. Clone the repo + 2. Use Langchain to load the data + 3. Pass to split_and_upload() + Args: + github_url (str): The Github Repo URL to be ingested. + course_name (str): The name of the course in our system. + + Returns: + _type_: Success or error message. + """ + try: + repo_path = "media/cloned_repo" + repo = Repo.clone_from(github_url, to_path=repo_path, depth=1, clone_submodules=False) + branch = repo.head.reference + + loader = GitLoader(repo_path="media/cloned_repo", branch=str(branch)) + data = loader.load() + shutil.rmtree("media/cloned_repo") + # create metadata for each file in data + + for doc in data: + texts = doc.page_content + metadatas: Dict[str, Any] = { + 'course_name': course_name, + 's3_path': '', + 'readable_filename': doc.metadata['file_name'], + 'url': f"{github_url}/blob/main/{doc.metadata['file_path']}", + 'pagenumber': '', + 'timestamp': '', + } + self.split_and_upload(texts=[texts], metadatas=[metadatas]) + return "Success" + except Exception as e: + err = f"❌❌ Error in (GITHUB ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}" + print(err) + sentry_sdk.capture_exception(e) + return err + + def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]): + """ This is usually the last step of document ingest. Chunk & upload to Qdrant (and Supabase.. todo). + Takes in Text and Metadata (from Langchain doc loaders) and splits / uploads to Qdrant. 
+ + good examples here: https://langchain.readthedocs.io/en/latest/modules/utils/combine_docs_examples/textsplitter.html + + Args: + texts (List[str]): _description_ + metadatas (List[Dict[str, Any]]): _description_ + """ + # return "Success" + self.posthog.capture('distinct_id_of_the_user', + event='split_and_upload_invoked', + properties={ + 'course_name': metadatas[0].get('course_name', None), + 's3_path': metadatas[0].get('s3_path', None), + 'readable_filename': metadatas[0].get('readable_filename', None), + 'url': metadatas[0].get('url', None), + 'base_url': metadatas[0].get('base_url', None), + }) + + print("In split and upload") + print(f"metadatas: {metadatas}") + print(f"Texts: {texts}") + assert len(texts) == len( + metadatas + ), f'must have equal number of text strings and metadata dicts. len(texts) is {len(texts)}. len(metadatas) is {len(metadatas)}' + + try: + text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( + chunk_size=1000, + chunk_overlap=150, + separators=[ + "\n\n", "\n", ". ", " ", "" + ] # try to split on paragraphs... 
fallback to sentences, then chars, ensure we always fit in context window + ) + contexts: List[Document] = text_splitter.create_documents(texts=texts, metadatas=metadatas) + input_texts = [{'input': context.page_content, 'model': 'text-embedding-ada-002'} for context in contexts] + + # check for duplicates + is_duplicate = self.check_for_duplicates(input_texts, metadatas) + if is_duplicate: + self.posthog.capture('distinct_id_of_the_user', + event='split_and_upload_succeeded', + properties={ + 'course_name': metadatas[0].get('course_name', None), + 's3_path': metadatas[0].get('s3_path', None), + 'readable_filename': metadatas[0].get('readable_filename', None), + 'url': metadatas[0].get('url', None), + 'base_url': metadatas[0].get('base_url', None), + 'is_duplicate': True, + }) + return "Success" + + # adding chunk index to metadata for parent doc retrieval + for i, context in enumerate(contexts): + context.metadata['chunk_index'] = i + + oai = OpenAIAPIProcessor( + input_prompts_list=input_texts, + request_url='https://api.openai.com/v1/embeddings', + api_key=os.getenv('VLADS_OPENAI_KEY'), + # request_url='https://uiuc-chat-canada-east.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2023-05-15', + # api_key=os.getenv('AZURE_OPENAI_KEY'), + max_requests_per_minute=5_000, + max_tokens_per_minute=300_000, + max_attempts=20, + logging_level=logging.INFO, + token_encoding_name='cl100k_base') # nosec -- reasonable bandit error suppression + asyncio.run(oai.process_api_requests_from_file()) + # parse results into dict of shape page_content -> embedding + embeddings_dict: dict[str, List[float]] = { + item[0]['input']: item[1]['data'][0]['embedding'] for item in oai.results + } + + ### BULK upload to Qdrant ### + vectors: list[PointStruct] = [] + for context in contexts: + # !DONE: Updated the payload so each key is top level (no more payload.metadata.course_name. Instead, use payload.course_name), great for creating indexes. 
+ upload_metadata = {**context.metadata, "page_content": context.page_content} + vectors.append( + PointStruct(id=str(uuid.uuid4()), vector=embeddings_dict[context.page_content], payload=upload_metadata)) + + self.qdrant_client.upsert( + collection_name=os.environ['QDRANT_COLLECTION_NAME'], # type: ignore + points=vectors # type: ignore + ) + ### Supabase SQL ### + contexts_for_supa = [{ + "text": context.page_content, + "pagenumber": context.metadata.get('pagenumber'), + "timestamp": context.metadata.get('timestamp'), + "chunk_index": context.metadata.get('chunk_index'), + "embedding": embeddings_dict[context.page_content] + } for context in contexts] + + document = { + "course_name": contexts[0].metadata.get('course_name'), + "s3_path": contexts[0].metadata.get('s3_path'), + "readable_filename": contexts[0].metadata.get('readable_filename'), + "url": contexts[0].metadata.get('url'), + "base_url": contexts[0].metadata.get('base_url'), + "contexts": contexts_for_supa, + } + + response = self.supabase_client.table( + os.getenv('SUPABASE_DOCUMENTS_TABLE')).insert(document).execute() # type: ignore + + # add to Nomic document map + if len(response.data) > 0: + inserted_data = response.data[0] + log_to_document_map(inserted_data) + + self.posthog.capture('distinct_id_of_the_user', + event='split_and_upload_succeeded', + properties={ + 'course_name': metadatas[0].get('course_name', None), + 's3_path': metadatas[0].get('s3_path', None), + 'readable_filename': metadatas[0].get('readable_filename', None), + 'url': metadatas[0].get('url', None), + 'base_url': metadatas[0].get('base_url', None), + }) + print("successful END OF split_and_upload") + return "Success" + except Exception as e: + err: str = f"ERROR IN split_and_upload(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore + print(err) + sentry_sdk.capture_exception(e) + return err + + def check_for_duplicates(self, texts: List[Dict], metadatas: 
List[Dict[str, Any]]) -> bool: + """ + For given metadata, fetch docs from Supabase based on S3 path or URL. + If docs exists, concatenate the texts and compare with current texts, if same, return True. + """ + doc_table = os.getenv('SUPABASE_DOCUMENTS_TABLE', '') + course_name = metadatas[0]['course_name'] + incoming_s3_path = metadatas[0]['s3_path'] + url = metadatas[0]['url'] + original_filename = incoming_s3_path.split('/')[-1][37:] # remove the 37-char uuid prefix + + # check if uuid exists in s3_path -- not all s3_paths have uuids! + incoming_filename = incoming_s3_path.split('/')[-1] + pattern = re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}', + re.I) # uuid V4 pattern, and v4 only. + if bool(pattern.search(incoming_filename)): + # uuid pattern exists -- remove the uuid and proceed with duplicate checking + original_filename = incoming_filename[37:] + else: + # do not remove anything and proceed with duplicate checking + original_filename = incoming_filename + + if incoming_s3_path: + filename = incoming_s3_path + supabase_contents = self.supabase_client.table(doc_table).select('id', 'contexts', 's3_path').eq( + 'course_name', course_name).like('s3_path', '%' + original_filename + '%').order('id', desc=True).execute() + supabase_contents = supabase_contents.data + elif url: + filename = url + supabase_contents = self.supabase_client.table(doc_table).select('id', 'contexts', 's3_path').eq( + 'course_name', course_name).eq('url', url).order('id', desc=True).execute() + supabase_contents = supabase_contents.data + else: + filename = None + supabase_contents = [] + + supabase_whole_text = "" + if len(supabase_contents) > 0: # if a doc with same filename exists in Supabase + # concatenate texts + supabase_contexts = supabase_contents[0] + for text in supabase_contexts['contexts']: + supabase_whole_text += text['text'] + + current_whole_text = "" + for text in texts: + current_whole_text += text['input'] + + if supabase_whole_text 
== current_whole_text: # matches the previous file + print(f"Duplicate ingested! πŸ“„ s3_path: {filename}.") + return True + + else: # the file is updated + print(f"Updated file detected! Same filename, new contents. πŸ“„ s3_path: {filename}") + + # call the delete function on older docs + for content in supabase_contents: + print("older s3_path to be deleted: ", content['s3_path']) + delete_status = self.delete_data(course_name, content['s3_path'], '') + print("delete_status: ", delete_status) + return False + + else: # filename does not already exist in Supabase, so its a brand new file + print(f"NOT a duplicate! πŸ“„s3_path: {filename}") + return False + + def delete_data(self, course_name: str, s3_path: str, source_url: str): + """Delete file from S3, Qdrant, and Supabase.""" + print(f"Deleting {s3_path} from S3, Qdrant, and Supabase for course {course_name}") + # add delete from doc map logic here + try: + # Delete file from S3 + bucket_name = os.getenv('S3_BUCKET_NAME') + + # Delete files by S3 path + if s3_path: + try: + self.s3_client.delete_object(Bucket=bucket_name, Key=s3_path) + except Exception as e: + print("Error in deleting file from s3:", e) + sentry_sdk.capture_exception(e) + # Delete from Qdrant + # docs for nested keys: https://qdrant.tech/documentation/concepts/filtering/#nested-key + # Qdrant "points" look like this: Record(id='000295ca-bd28-ac4a-6f8d-c245f7377f90', payload={'metadata': {'course_name': 'zotero-extreme', 'pagenumber_or_timestamp': 15, 'readable_filename': 'Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf', 's3_path': 'courses/zotero-extreme/Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf'}, 'page_content': '18 \nDunlosky et al.\n3.3 Effects in representative educational contexts. Sev-\neral of the large summarization-training studies have been \nconducted in regular classrooms, indicating the feasibility of \ndoing so. For example, the study by A. 
King (1992) took place \nin the context of a remedial study-skills course for undergrad-\nuates, and the study by Rinehart et al. (1986) took place in \nsixth-grade classrooms, with the instruction led by students \nregular teachers. In these and other cases, students benefited \nfrom the classroom training. We suspect it may actually be \nmore feasible to conduct these kinds of training ... + try: + self.qdrant_client.delete( + collection_name=os.environ['QDRANT_COLLECTION_NAME'], + points_selector=models.Filter(must=[ + models.FieldCondition( + key="s3_path", + match=models.MatchValue(value=s3_path), + ), + ]), + ) + except Exception as e: + if "timed out" in str(e): + # Timed out is fine. Still deletes. + # https://github.com/qdrant/qdrant/issues/3654#issuecomment-1955074525 + pass + else: + print("Error in deleting file from Qdrant:", e) + sentry_sdk.capture_exception(e) + try: + # delete from Nomic + response = self.supabase_client.from_( + os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, s3_path, contexts").eq('s3_path', s3_path).eq( + 'course_name', course_name).execute() + data = response.data[0] #single record fetched + nomic_ids_to_delete = [] + context_count = len(data['contexts']) + for i in range(1, context_count + 1): + nomic_ids_to_delete.append(str(data['id']) + "_" + str(i)) + + # delete from Nomic + delete_from_document_map(course_name, nomic_ids_to_delete) + except Exception as e: + print("Error in deleting file from Nomic:", e) + sentry_sdk.capture_exception(e) + + try: + self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).delete().eq('s3_path', s3_path).eq( + 'course_name', course_name).execute() + except Exception as e: + print("Error in deleting file from supabase:", e) + sentry_sdk.capture_exception(e) + + # Delete files by their URL identifier + elif source_url: + try: + # Delete from Qdrant + self.qdrant_client.delete( + collection_name=os.environ['QDRANT_COLLECTION_NAME'], + points_selector=models.Filter(must=[ + 
models.FieldCondition( + key="url", + match=models.MatchValue(value=source_url), + ), + ]), + ) + except Exception as e: + if "timed out" in str(e): + # Timed out is fine. Still deletes. + # https://github.com/qdrant/qdrant/issues/3654#issuecomment-1955074525 + pass + else: + print("Error in deleting file from Qdrant:", e) + sentry_sdk.capture_exception(e) + try: + # delete from Nomic + response = self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, url, contexts").eq( + 'url', source_url).eq('course_name', course_name).execute() + data = response.data[0] #single record fetched + nomic_ids_to_delete = [] + context_count = len(data['contexts']) + for i in range(1, context_count + 1): + nomic_ids_to_delete.append(str(data['id']) + "_" + str(i)) + + # delete from Nomic + delete_from_document_map(course_name, nomic_ids_to_delete) + except Exception as e: + print("Error in deleting file from Nomic:", e) + sentry_sdk.capture_exception(e) + + try: + # delete from Supabase + self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).delete().eq('url', source_url).eq( + 'course_name', course_name).execute() + except Exception as e: + print("Error in deleting file from supabase:", e) + sentry_sdk.capture_exception(e) + + # Delete from Supabase + return "Success" + except Exception as e: + err: str = f"ERROR IN delete_data: Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore + print(err) + sentry_sdk.capture_exception(e) + return err + + # def ingest_coursera(self, coursera_course_name: str, course_name: str) -> str: + # """ Download all the files from a coursera course and ingest them. + + # 1. Download the coursera content. + # 2. Upload to S3 (so users can view it) + # 3. Run everything through the ingest_bulk method. + + # Args: + # coursera_course_name (str): The name of the coursera course. + # course_name (str): The name of the course in our system. 
+ + # Returns: + # _type_: Success or error message. + # """ + # certificate = "-ca 'FVhVoDp5cb-ZaoRr5nNJLYbyjCLz8cGvaXzizqNlQEBsG5wSq7AHScZGAGfC1nI0ehXFvWy1NG8dyuIBF7DLMA.X3cXsDvHcOmSdo3Fyvg27Q.qyGfoo0GOHosTVoSMFy-gc24B-_BIxJtqblTzN5xQWT3hSntTR1DMPgPQKQmfZh_40UaV8oZKKiF15HtZBaLHWLbpEpAgTg3KiTiU1WSdUWueo92tnhz-lcLeLmCQE2y3XpijaN6G4mmgznLGVsVLXb-P3Cibzz0aVeT_lWIJNrCsXrTFh2HzFEhC4FxfTVqS6cRsKVskPpSu8D9EuCQUwJoOJHP_GvcME9-RISBhi46p-Z1IQZAC4qHPDhthIJG4bJqpq8-ZClRL3DFGqOfaiu5y415LJcH--PRRKTBnP7fNWPKhcEK2xoYQLr9RxBVL3pzVPEFyTYtGg6hFIdJcjKOU11AXAnQ-Kw-Gb_wXiHmu63veM6T8N2dEkdqygMre_xMDT5NVaP3xrPbA4eAQjl9yov4tyX4AQWMaCS5OCbGTpMTq2Y4L0Mbz93MHrblM2JL_cBYa59bq7DFK1IgzmOjFhNG266mQlC9juNcEhc'" + # always_use_flags = "-u kastanvday@gmail.com -p hSBsLaF5YM469# --ignore-formats mp4 --subtitle-language en --path ./coursera-dl" + + # try: + # subprocess.run( + # f"coursera-dl {always_use_flags} {certificate} {coursera_course_name}", + # check=True, + # shell=True, # nosec -- reasonable bandit error suppression + # stdout=subprocess.PIPE, + # stderr=subprocess.PIPE) # capture_output=True, + # dl_results_path = os.path.join('coursera-dl', coursera_course_name) + # s3_paths: Union[List, None] = upload_data_files_to_s3(course_name, dl_results_path) + + # if s3_paths is None: + # return "Error: No files found in the coursera-dl directory" + + # print("starting bulk ingest") + # start_time = time.monotonic() + # self.bulk_ingest(s3_paths, course_name) + # print("completed bulk ingest") + # print(f"⏰ Runtime: {(time.monotonic() - start_time):.2f} seconds") + + # # Cleanup the coursera downloads + # shutil.rmtree(dl_results_path) + + # return "Success" + # except Exception as e: + # err: str = f"Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore + # print(err) + # return err + + # def list_files_recursively(self, bucket, prefix): + # all_files = [] + # continuation_token = None + + # while True: + # list_objects_kwargs = 
{ + # 'Bucket': bucket, + # 'Prefix': prefix, + # } + # if continuation_token: + # list_objects_kwargs['ContinuationToken'] = continuation_token + + # response = self.s3_client.list_objects_v2(**list_objects_kwargs) + + # if 'Contents' in response: + # for obj in response['Contents']: + # all_files.append(obj['Key']) + + # if response['IsTruncated']: + # continuation_token = response['NextContinuationToken'] + # else: + # break + + # return all_files + + +if __name__ == "__main__": + raise NotImplementedError("This file is not meant to be run directly") + text = "Testing 123" + # ingest(text=text) diff --git a/ai_ta_backend/nomic_logging.py b/ai_ta_backend/beam/nomic_logging.py similarity index 89% rename from ai_ta_backend/nomic_logging.py rename to ai_ta_backend/beam/nomic_logging.py index cf5bc699..18591a05 100644 --- a/ai_ta_backend/nomic_logging.py +++ b/ai_ta_backend/beam/nomic_logging.py @@ -1,738 +1,743 @@ -import datetime -import os -import time - -import nomic -import numpy as np -import pandas as pd -import supabase -from langchain.embeddings import OpenAIEmbeddings -from nomic import AtlasProject, atlas -import sentry_sdk -import backoff -import json - -OPENAI_API_TYPE = "azure" - - -SUPABASE_CLIENT = supabase.create_client( # type: ignore - supabase_url=os.getenv('SUPABASE_URL'), # type: ignore - supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore - -LOCK_EXCEPTIONS = ['Project is locked for state access! Please wait until the project is unlocked to access embeddings.', - 'Project is locked for state access! Please wait until the project is unlocked to access data.', - 'Project is currently indexing and cannot ingest new datums. 
Try again later.'] - -def giveup_hdlr(e): - """ - Function to handle giveup conditions in backoff decorator - Args: - e: Exception raised by the decorated function - Returns: - True if we want to stop retrying, False otherwise - """ - (e_args,) = e.args - e_str = e_args['exception'] - - print("giveup_hdlr() called with exception:", e_str) - if e_str in LOCK_EXCEPTIONS: - return False - else: - sentry_sdk.capture_exception(e) - return True - - -def backoff_hdlr(details): - """ - Function to handle backup conditions in backoff decorator. - Currently just prints the details of the backoff. - """ - print( - "\nBacking off {wait:0.1f} seconds after {tries} tries, calling function {target} with args {args} and kwargs {kwargs}" - .format(**details)) - - -def backoff_strategy(): - """ - Function to define retry strategy. Is usualy defined in the decorator, - but passing parameters to it is giving errors. - """ - return backoff.expo(base=10, factor=1.5) - - -@backoff.on_exception(backoff_strategy, - Exception, - max_tries=5, - raise_on_giveup=False, - giveup=giveup_hdlr, - on_backoff=backoff_hdlr) -def log_convo_to_nomic(course_name: str, conversation) -> str: - nomic.login(os.getenv('NOMIC_API_KEY')) # login during start of flask app - NOMIC_MAP_NAME_PREFIX = 'Conversation Map for ' - """ - Logs conversation to Nomic. - 1. Check if map exists for given course - 2. Check if conversation ID exists - - if yes, delete and add new data point - - if no, add new data point - 3. 
Keep current logic for map doesn't exist - update metadata - """ - - print(f"in log_convo_to_nomic() for course: {course_name}") - print("type of conversation:", type(conversation)) - #conversation = json.loads(conversation) - messages = conversation['conversation']['messages'] - if 'user_email' not in conversation['conversation']: - user_email = "NULL" - else: - user_email = conversation['conversation']['user_email'] - conversation_id = conversation['conversation']['id'] - - # we have to upload whole conversations - # check what the fetched data looks like - pandas df or pyarrow table - # check if conversation ID exists in Nomic, if yes fetch all data from it and delete it. - # will have current QA and historical QA from Nomic, append new data and add_embeddings() - - project_name = NOMIC_MAP_NAME_PREFIX + course_name - start_time = time.monotonic() - emoji = "" - - try: - # fetch project metadata and embbeddings - project = AtlasProject(name=project_name, add_datums_if_exists=True) - - map_metadata_df = project.maps[1].data.df # type: ignore - map_embeddings_df = project.maps[1].embeddings.latent - # create a function which returns project, data and embeddings df here - map_metadata_df['id'] = map_metadata_df['id'].astype(int) - last_id = map_metadata_df['id'].max() - - if conversation_id in map_metadata_df.values: - # store that convo metadata locally - prev_data = map_metadata_df[map_metadata_df['conversation_id'] == conversation_id] - prev_index = prev_data.index.values[0] - embeddings = map_embeddings_df[prev_index - 1].reshape(1, 1536) - prev_convo = prev_data['conversation'].values[0] - prev_id = prev_data['id'].values[0] - created_at = pd.to_datetime(prev_data['created_at'].values[0]).strftime('%Y-%m-%d %H:%M:%S') - - # delete that convo data point from Nomic, and print result - print("Deleting point from nomic:", project.delete_data([str(prev_id)])) - - # prep for new point - first_message = prev_convo.split("\n")[1].split(": ")[1] - - # select the last 2 
messages and append new convo to prev convo - messages_to_be_logged = messages[-2:] - for message in messages_to_be_logged: - if message['role'] == 'user': - emoji = "πŸ™‹ " - else: - emoji = "πŸ€– " - - if isinstance(message['content'], list): - text = message['content'][0]['text'] - else: - text = message['content'] - - prev_convo += "\n>>> " + emoji + message['role'] + ": " + text + "\n" - - # modified timestamp - current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - # update metadata - metadata = [{ - "course": course_name, - "conversation": prev_convo, - "conversation_id": conversation_id, - "id": last_id + 1, - "user_email": user_email, - "first_query": first_message, - "created_at": created_at, - "modified_at": current_time - }] - else: - print("conversation_id does not exist") - - # add new data point - user_queries = [] - conversation_string = "" - - first_message = messages[0]['content'] - if isinstance(first_message, list): - first_message = first_message[0]['text'] - user_queries.append(first_message) - - for message in messages: - if message['role'] == 'user': - emoji = "πŸ™‹ " - else: - emoji = "πŸ€– " - - if isinstance(message['content'], list): - text = message['content'][0]['text'] - else: - text = message['content'] - - conversation_string += "\n>>> " + emoji + message['role'] + ": " + text + "\n" - - # modified timestamp - current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - metadata = [{ - "course": course_name, - "conversation": conversation_string, - "conversation_id": conversation_id, - "id": last_id + 1, - "user_email": user_email, - "first_query": first_message, - "created_at": current_time, - "modified_at": current_time - }] - - # create embeddings - embeddings_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE) # type: ignore - embeddings = embeddings_model.embed_documents(user_queries) - - # add embeddings to the project - create a new function for this - project = 
atlas.AtlasProject(name=project_name, add_datums_if_exists=True) - with project.wait_for_project_lock(): - project.add_embeddings(embeddings=np.array(embeddings), data=pd.DataFrame(metadata)) - project.rebuild_maps() - - print(f"⏰ Nomic logging runtime: {(time.monotonic() - start_time):.2f} seconds") - return f"Successfully logged for {course_name}" - - except Exception as e: - if str(e) == 'You must specify a unique_id_field when creating a new project.': - print("Attempting to create Nomic map...") - result = create_nomic_map(course_name, conversation) - print("result of create_nomic_map():", result) - else: - # raising exception again to trigger backoff and passing parameters to use in create_nomic_map() - raise Exception({"exception": str(e)}) - - -def get_nomic_map(course_name: str, type: str): - - """ - Returns the variables necessary to construct an iframe of the Nomic map given a course name. - We just need the ID and URL. - Example values: - map link: https://atlas.nomic.ai/map/ed222613-97d9-46a9-8755-12bbc8a06e3a/f4967ad7-ff37-4098-ad06-7e1e1a93dd93 - map id: f4967ad7-ff37-4098-ad06-7e1e1a93dd93 - """ - nomic.login(os.getenv('NOMIC_API_KEY')) # login during start of flask app - if type.lower() == 'document': - NOMIC_MAP_NAME_PREFIX = 'Document Map for ' - else: - NOMIC_MAP_NAME_PREFIX = 'Conversation Map for ' - - project_name = NOMIC_MAP_NAME_PREFIX + course_name - start_time = time.monotonic() - - try: - project = atlas.AtlasProject(name=project_name, add_datums_if_exists=True) - map = project.get_map(project_name) - - print(f"⏰ Nomic Full Map Retrieval: {(time.monotonic() - start_time):.2f} seconds") - return {"map_id": f"iframe{map.id}", "map_link": map.map_link} - except Exception as e: - # Error: ValueError: You must specify a unique_id_field when creating a new project. 
- if str(e) == 'You must specify a unique_id_field when creating a new project.': # type: ignore - print("Nomic map does not exist yet, probably because you have less than 20 queries/documents on your project: ", e) - else: - print("ERROR in get_nomic_map():", e) - sentry_sdk.capture_exception(e) - return {"map_id": None, "map_link": None} - - -def create_nomic_map(course_name: str, log_data: list): - """ - Creates a Nomic map for new courses and those which previously had < 20 queries. - 1. fetches supabase conversations for course - 2. appends current embeddings and metadata to it - 2. creates map if there are at least 20 queries - """ - nomic.login(os.getenv('NOMIC_API_KEY')) # login during start of flask app - NOMIC_MAP_NAME_PREFIX = 'Conversation Map for ' - - print(f"in create_nomic_map() for {course_name}") - # initialize supabase - supabase_client = supabase.create_client( # type: ignore - supabase_url=os.getenv('SUPABASE_URL'), # type: ignore - supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore - - try: - # fetch all conversations with this new course (we expect <=20 conversations, because otherwise the map should be made already) - response = supabase_client.table("llm-convo-monitor").select("*").eq("course_name", course_name).execute() - data = response.data - df = pd.DataFrame(data) - - if len(data) < 19: - return None - else: - # get all queries for course and create metadata - user_queries = [] - metadata = [] - i = 1 - conversation_exists = False - - # current log details - log_messages = log_data['conversation']['messages'] # type: ignore - log_user_email = log_data['conversation']['user_email'] # type: ignore - log_conversation_id = log_data['conversation']['id'] # type: ignore - - for _index, row in df.iterrows(): - user_email = row['user_email'] - created_at = pd.to_datetime(row['created_at']).strftime('%Y-%m-%d %H:%M:%S') - convo = row['convo'] - messages = convo['messages'] - - first_message = messages[0]['content'] - if 
isinstance(first_message, list): - first_message = first_message[0]['text'] - - user_queries.append(first_message) - - # create metadata for multi-turn conversation - conversation = "" - for message in messages: - # string of role: content, role: content, ... - if message['role'] == 'user': # type: ignore - emoji = "πŸ™‹ " - else: - emoji = "πŸ€– " - - if isinstance(message['content'], list): - text = message['content'][0]['text'] - else: - text = message['content'] - - conversation += "\n>>> " + emoji + message['role'] + ": " + text + "\n" - - # append current chat to previous chat if convo already exists - if convo['id'] == log_conversation_id: - conversation_exists = True - - for m in log_messages: - if m['role'] == 'user': # type: ignore - emoji = "πŸ™‹ " - else: - emoji = "πŸ€– " - - if isinstance(m['content'], list): - text = m['content'][0]['text'] - else: - text = m['content'] - conversation += "\n>>> " + emoji + m['role'] + ": " + text + "\n" - - # adding modified timestamp - current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - # add to metadata - metadata_row = { - "course": row['course_name'], - "conversation": conversation, - "conversation_id": convo['id'], - "id": i, - "user_email": user_email, - "first_query": first_message, - "created_at": created_at, - "modified_at": current_time - } - metadata.append(metadata_row) - i += 1 - - # add current log as a new data point if convo doesn't exist - if not conversation_exists: - user_queries.append(log_messages[0]['content']) - conversation = "" - for message in log_messages: - if message['role'] == 'user': - emoji = "πŸ™‹ " - else: - emoji = "πŸ€– " - - if isinstance(message['content'], list): - text = message['content'][0]['text'] - else: - text = message['content'] - conversation += "\n>>> " + emoji + message['role'] + ": " + text + "\n" - - # adding timestamp - current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - metadata_row = { - "course": course_name, - 
"conversation": conversation, - "conversation_id": log_conversation_id, - "id": i, - "user_email": log_user_email, - "first_query": log_messages[0]['content'], - "created_at": current_time, - "modified_at": current_time - } - metadata.append(metadata_row) - - metadata = pd.DataFrame(metadata) - embeddings_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE) # type: ignore - embeddings = embeddings_model.embed_documents(user_queries) - - # create Atlas project - project_name = NOMIC_MAP_NAME_PREFIX + course_name - index_name = course_name + "_convo_index" - project = atlas.map_embeddings( - embeddings=np.array(embeddings), - data=metadata, # type: ignore - this is the correct type, the func signature from Nomic is incomplete - id_field='id', - build_topic_model=True, - topic_label_field='first_query', - name=project_name, - colorable_fields=['conversation_id', 'first_query']) - project.create_index(index_name, build_topic_model=True) - return f"Successfully created Nomic map for {course_name}" - except Exception as e: - # Error: ValueError: You must specify a unique_id_field when creating a new project. - if str(e) == 'You must specify a unique_id_field when creating a new project.': # type: ignore - print("Nomic map does not exist yet, probably because you have less than 20 queries on your project: ", e) - else: - print("ERROR in create_nomic_map():", e) - sentry_sdk.capture_exception(e) - - return "failed" - -## -------------------------------- DOCUMENT MAP FUNCTIONS --------------------------------- ## - -def create_document_map(course_name: str): - """ - This is a function which creates a document map for a given course from scratch - 1. Gets count of documents for the course - 2. If less than 20, returns a message that a map cannot be created - 3. If greater than 20, iteratively fetches documents in batches of 25 - 4. Prepares metadata and embeddings for nomic upload - 5. 
Creates a new map and uploads the data - - Args: - course_name: str - Returns: - str: success or failed - """ - print("in create_document_map()") - nomic.login(os.getenv('NOMIC_API_KEY')) - NOMIC_MAP_NAME_PREFIX = 'Document Map for ' - - # initialize supabase - supabase_client = supabase.create_client( # type: ignore - supabase_url=os.getenv('SUPABASE_URL'), # type: ignore - supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore - - try: - # check if map exists - response = supabase_client.table("projects").select("doc_map_id").eq("course_name", course_name).execute() - if response.data: - return "Map already exists for this course." - - # fetch relevant document data from Supabase - response = supabase_client.table("documents").select("id", count="exact").eq("course_name", course_name).order('id', desc=False).execute() - if not response.count: - return "No documents found for this course." - - total_doc_count = response.count - print("Total number of documents in Supabase: ", total_doc_count) - - # minimum 20 docs needed to create map - if total_doc_count > 19: - - first_id = response.data[0]['id'] - combined_dfs = [] - curr_total_doc_count = 0 - doc_count = 0 - first_batch = True - - # iteratively query in batches of 25 - while curr_total_doc_count < total_doc_count: - - response = supabase_client.table("documents").select("id, created_at, s3_path, url, readable_filename, contexts").eq("course_name", course_name).gte( - 'id', first_id).order('id', desc=False).limit(25).execute() - df = pd.DataFrame(response.data) - combined_dfs.append(df) # list of dfs - - curr_total_doc_count += len(response.data) - doc_count += len(response.data) - - if doc_count >= 1000: # upload to Nomic every 1000 docs - - # concat all dfs from the combined_dfs list - final_df = pd.concat(combined_dfs, ignore_index=True) - - # prep data for nomic upload - embeddings, metadata = data_prep_for_doc_map(final_df) - - if first_batch: - # create a new map - print("Creating new map...") - 
project_name = NOMIC_MAP_NAME_PREFIX + course_name - index_name = course_name + "_doc_index" - topic_label_field = "text" - colorable_fields = ["readable_filename", "text"] - result = create_map(embeddings, metadata, project_name, index_name, topic_label_field, colorable_fields) - # update flag - first_batch = False - - else: - # append to existing map - print("Appending data to existing map...") - project_name = NOMIC_MAP_NAME_PREFIX + course_name - # add project lock logic here - result = append_to_map(embeddings, metadata, project_name) - - # reset variables - combined_dfs = [] - doc_count = 0 - - # set first_id for next iteration - first_id = response.data[-1]['id'] + 1 - - # upload last set of docs - final_df = pd.concat(combined_dfs, ignore_index=True) - embeddings, metadata = data_prep_for_doc_map(final_df) - project_name = NOMIC_MAP_NAME_PREFIX + course_name - if first_batch: - index_name = course_name + "_doc_index" - topic_label_field = "text" - colorable_fields = ["readable_filename", "text"] - result = create_map(embeddings, metadata, project_name, index_name, topic_label_field, colorable_fields) - else: - result = append_to_map(embeddings, metadata, project_name) - print("Atlas upload status: ", result) - - # log info to supabase - project = AtlasProject(name=project_name, add_datums_if_exists=True) - project_id = project.id - project.rebuild_maps() - project_info = {'course_name': course_name, 'doc_map_id': project_id} - response = supabase_client.table("projects").insert(project_info).execute() - print("Response from supabase: ", response) - return "success" - else: - return "Cannot create a map because there are less than 20 documents in the course." - except Exception as e: - print(e) - sentry_sdk.capture_exception(e) - return "failed" - - -def delete_from_document_map(course_name: str, ids: list): - """ - This function is used to delete datapoints from a document map. 
- Currently used within the delete_data() function in vector_database.py - Args: - course_name: str - ids: list of str - """ - print("in delete_from_document_map()") - - try: - # check if project exists - response = SUPABASE_CLIENT.table("projects").select("doc_map_id").eq("course_name", course_name).execute() - if response.data: - project_id = response.data[0]['doc_map_id'] - else: - return "No document map found for this course" - - # fetch project from Nomic - project = AtlasProject(project_id=project_id, add_datums_if_exists=True) - - # delete the ids from Nomic - print("Deleting point from document map:", project.delete_data(ids)) - with project.wait_for_project_lock(): - project.rebuild_maps() - return "Successfully deleted from Nomic map" - except Exception as e: - print(e) - sentry_sdk.capture_exception(e) - return "Error in deleting from document map: {e}" - -def log_to_document_map(data: dict): - """ - This is a function which appends new documents to an existing document map. It's called - at the end of split_and_upload() after inserting data to Supabase. - Args: - data: dict - the response data from Supabase insertion - """ - print("in add_to_document_map()") - - try: - # check if map exists - course_name = data['course_name'] - response = SUPABASE_CLIENT.table("projects").select("doc_map_id").eq("course_name", course_name).execute() - if response.data: - project_id = response.data[0]['doc_map_id'] - else: - # create a map - map_creation_result = create_document_map(course_name) - if map_creation_result != "success": - return "The project has less than 20 documents and a map cannot be created." 
- else: - # fetch project id - response = SUPABASE_CLIENT.table("projects").select("doc_map_id").eq("course_name", course_name).execute() - project_id = response.data[0]['doc_map_id'] - - - project = AtlasProject(project_id=project_id, add_datums_if_exists=True) - #print("Inserted data: ", data) - - embeddings = [] - metadata = [] - context_count = 0 - # prep data for nomic upload - for row in data['contexts']: - context_count += 1 - embeddings.append(row['embedding']) - metadata.append({ - "id": str(data['id']) + "_" + str(context_count), - "doc_ingested_at": data['created_at'], - "s3_path": data['s3_path'], - "url": data['url'], - "readable_filename": data['readable_filename'], - "created_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), - "text": row['text'] - }) - embeddings = np.array(embeddings) - metadata = pd.DataFrame(metadata) - print("Shape of embeddings: ", embeddings.shape) - - # append to existing map - project_name = "Document Map for " + course_name - result = append_to_map(embeddings, metadata, project_name) - - # check if project is accepting new datums - if project.is_accepting_data: - with project.wait_for_project_lock(): - project.rebuild_maps() - - # with project.wait_for_project_lock(): - # project.rebuild_maps() - return result - - except Exception as e: - print(e) - sentry_sdk.capture_exception(e) - return "Error in appending to map: {e}" - - -def create_map(embeddings, metadata, map_name, index_name, topic_label_field, colorable_fields): - """ - Generic function to create a Nomic map from given parameters. 
- Args: - embeddings: np.array of embeddings - metadata: pd.DataFrame of metadata - map_name: str - index_name: str - topic_label_field: str - colorable_fields: list of str - """ - nomic.login(os.getenv('NOMIC_API_KEY')) - - try: - project = atlas.map_embeddings( - embeddings=embeddings, - data=metadata, - id_field="id", - build_topic_model=True, - name=map_name, - topic_label_field=topic_label_field, - colorable_fields=colorable_fields, - add_datums_if_exists=True - ) - project.create_index(index_name, build_topic_model=True) - return "success" - except Exception as e: - print(e) - return "Error in creating map: {e}" - - -def append_to_map(embeddings, metadata, map_name): - """ - Generic function to append new data to an existing Nomic map. - Args: - embeddings: np.array of embeddings - metadata: pd.DataFrame of Nomic upload metadata - map_name: str - """ - nomic.login(os.getenv('NOMIC_API_KEY')) - try: - project = atlas.AtlasProject(name=map_name, add_datums_if_exists=True) - with project.wait_for_project_lock(): - project.add_embeddings(embeddings=embeddings, data=metadata) - return "Successfully appended to Nomic map" - except Exception as e: - print(e) - return "Error in appending to map: {e}" - - -def data_prep_for_doc_map(df: pd.DataFrame): - """ - This function prepares embeddings and metadata for nomic upload in document map creation. 
- Args: - df: pd.DataFrame - the dataframe of documents from Supabase - Returns: - embeddings: np.array of embeddings - metadata: pd.DataFrame of metadata - """ - print("in data_prep_for_doc_map()") - - metadata = [] - embeddings = [] - texts = [] - - for index, row in df.iterrows(): - - current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - if row['url'] == None: - row['url'] = "" - # iterate through all contexts and create separate entries for each - context_count = 0 - for context in row['contexts']: - context_count += 1 - text_row = context['text'] - embeddings_row = context['embedding'] - - meta_row = { - "id": str(row['id']) + "_" + str(context_count), - "doc_ingested_at": row['created_at'], - "s3_path": row['s3_path'], - "url": row['url'], - "readable_filename": row['readable_filename'], - "created_at": current_time, - "text": text_row - } - - embeddings.append(embeddings_row) - metadata.append(meta_row) - texts.append(text_row) - - embeddings_np = np.array(embeddings, dtype=object) - print("Shape of embeddings: ", embeddings_np.shape) - - # check dimension if embeddings_np is (n, 1536) - if len(embeddings_np.shape) < 2: - print("Creating new embeddings...") - # embeddings_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE, - # openai_api_base=os.getenv('AZURE_OPENAI_BASE'), - # openai_api_key=os.getenv('AZURE_OPENAI_KEY')) # type: ignore - embeddings_model = OpenAIEmbeddings(openai_api_type="openai", - openai_api_base="https://api.openai.com/v1/", - openai_api_key=os.getenv('VLADS_OPENAI_KEY')) # type: ignore - embeddings = embeddings_model.embed_documents(texts) - - metadata = pd.DataFrame(metadata) - embeddings = np.array(embeddings) - - return embeddings, metadata - - - -if __name__ == '__main__': - pass +import datetime +import json +import os +import time + +import backoff +import nomic +import numpy as np +import pandas as pd +import sentry_sdk +import supabase +from langchain.embeddings import OpenAIEmbeddings +from nomic import 
AtlasProject, atlas + +OPENAI_API_TYPE = "azure" + +SUPABASE_CLIENT = supabase.create_client( # type: ignore + supabase_url=os.getenv('SUPABASE_URL'), # type: ignore + supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore + +LOCK_EXCEPTIONS = [ + 'Project is locked for state access! Please wait until the project is unlocked to access embeddings.', + 'Project is locked for state access! Please wait until the project is unlocked to access data.', + 'Project is currently indexing and cannot ingest new datums. Try again later.' +] + + +def giveup_hdlr(e): + """ + Function to handle giveup conditions in backoff decorator + Args: + e: Exception raised by the decorated function + Returns: + True if we want to stop retrying, False otherwise + """ + (e_args,) = e.args + e_str = e_args['exception'] + + print("giveup_hdlr() called with exception:", e_str) + if e_str in LOCK_EXCEPTIONS: + return False + else: + sentry_sdk.capture_exception(e) + return True + + +def backoff_hdlr(details): + """ + Function to handle backup conditions in backoff decorator. + Currently just prints the details of the backoff. + """ + print( + "\nBacking off {wait:0.1f} seconds after {tries} tries, calling function {target} with args {args} and kwargs {kwargs}" + .format(**details)) + + +def backoff_strategy(): + """ + Function to define retry strategy. Is usualy defined in the decorator, + but passing parameters to it is giving errors. + """ + return backoff.expo(base=10, factor=1.5) + + +@backoff.on_exception(backoff_strategy, + Exception, + max_tries=5, + raise_on_giveup=False, + giveup=giveup_hdlr, + on_backoff=backoff_hdlr) +def log_convo_to_nomic(course_name: str, conversation) -> str: + nomic.login(os.getenv('NOMIC_API_KEY')) # login during start of flask app + NOMIC_MAP_NAME_PREFIX = 'Conversation Map for ' + """ + Logs conversation to Nomic. + 1. Check if map exists for given course + 2. 
Check if conversation ID exists + - if yes, delete and add new data point + - if no, add new data point + 3. Keep current logic for map doesn't exist - update metadata + """ + + print(f"in log_convo_to_nomic() for course: {course_name}") + print("type of conversation:", type(conversation)) + #conversation = json.loads(conversation) + messages = conversation['conversation']['messages'] + if 'user_email' not in conversation['conversation']: + user_email = "NULL" + else: + user_email = conversation['conversation']['user_email'] + conversation_id = conversation['conversation']['id'] + + # we have to upload whole conversations + # check what the fetched data looks like - pandas df or pyarrow table + # check if conversation ID exists in Nomic, if yes fetch all data from it and delete it. + # will have current QA and historical QA from Nomic, append new data and add_embeddings() + + project_name = NOMIC_MAP_NAME_PREFIX + course_name + start_time = time.monotonic() + emoji = "" + + try: + # fetch project metadata and embbeddings + project = AtlasProject(name=project_name, add_datums_if_exists=True) + + map_metadata_df = project.maps[1].data.df # type: ignore + map_embeddings_df = project.maps[1].embeddings.latent + # create a function which returns project, data and embeddings df here + map_metadata_df['id'] = map_metadata_df['id'].astype(int) + last_id = map_metadata_df['id'].max() + + if conversation_id in map_metadata_df.values: + # store that convo metadata locally + prev_data = map_metadata_df[map_metadata_df['conversation_id'] == conversation_id] + prev_index = prev_data.index.values[0] + embeddings = map_embeddings_df[prev_index - 1].reshape(1, 1536) + prev_convo = prev_data['conversation'].values[0] + prev_id = prev_data['id'].values[0] + created_at = pd.to_datetime(prev_data['created_at'].values[0]).strftime('%Y-%m-%d %H:%M:%S') + + # delete that convo data point from Nomic, and print result + print("Deleting point from nomic:", 
project.delete_data([str(prev_id)])) + + # prep for new point + first_message = prev_convo.split("\n")[1].split(": ")[1] + + # select the last 2 messages and append new convo to prev convo + messages_to_be_logged = messages[-2:] + for message in messages_to_be_logged: + if message['role'] == 'user': + emoji = "πŸ™‹ " + else: + emoji = "πŸ€– " + + if isinstance(message['content'], list): + text = message['content'][0]['text'] + else: + text = message['content'] + + prev_convo += "\n>>> " + emoji + message['role'] + ": " + text + "\n" + + # modified timestamp + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + # update metadata + metadata = [{ + "course": course_name, + "conversation": prev_convo, + "conversation_id": conversation_id, + "id": last_id + 1, + "user_email": user_email, + "first_query": first_message, + "created_at": created_at, + "modified_at": current_time + }] + else: + print("conversation_id does not exist") + + # add new data point + user_queries = [] + conversation_string = "" + + first_message = messages[0]['content'] + if isinstance(first_message, list): + first_message = first_message[0]['text'] + user_queries.append(first_message) + + for message in messages: + if message['role'] == 'user': + emoji = "πŸ™‹ " + else: + emoji = "πŸ€– " + + if isinstance(message['content'], list): + text = message['content'][0]['text'] + else: + text = message['content'] + + conversation_string += "\n>>> " + emoji + message['role'] + ": " + text + "\n" + + # modified timestamp + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + metadata = [{ + "course": course_name, + "conversation": conversation_string, + "conversation_id": conversation_id, + "id": last_id + 1, + "user_email": user_email, + "first_query": first_message, + "created_at": current_time, + "modified_at": current_time + }] + + # create embeddings + embeddings_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE) # type: ignore + embeddings = 
embeddings_model.embed_documents(user_queries) + + # add embeddings to the project - create a new function for this + project = atlas.AtlasProject(name=project_name, add_datums_if_exists=True) + with project.wait_for_project_lock(): + project.add_embeddings(embeddings=np.array(embeddings), data=pd.DataFrame(metadata)) + project.rebuild_maps() + + print(f"⏰ Nomic logging runtime: {(time.monotonic() - start_time):.2f} seconds") + return f"Successfully logged for {course_name}" + + except Exception as e: + if str(e) == 'You must specify a unique_id_field when creating a new project.': + print("Attempting to create Nomic map...") + result = create_nomic_map(course_name, conversation) + print("result of create_nomic_map():", result) + else: + # raising exception again to trigger backoff and passing parameters to use in create_nomic_map() + raise Exception({"exception": str(e)}) + + +def get_nomic_map(course_name: str, type: str): + """ + Returns the variables necessary to construct an iframe of the Nomic map given a course name. + We just need the ID and URL. + Example values: + map link: https://atlas.nomic.ai/map/ed222613-97d9-46a9-8755-12bbc8a06e3a/f4967ad7-ff37-4098-ad06-7e1e1a93dd93 + map id: f4967ad7-ff37-4098-ad06-7e1e1a93dd93 + """ + nomic.login(os.getenv('NOMIC_API_KEY')) # login during start of flask app + if type.lower() == 'document': + NOMIC_MAP_NAME_PREFIX = 'Document Map for ' + else: + NOMIC_MAP_NAME_PREFIX = 'Conversation Map for ' + + project_name = NOMIC_MAP_NAME_PREFIX + course_name + start_time = time.monotonic() + + try: + project = atlas.AtlasProject(name=project_name, add_datums_if_exists=True) + map = project.get_map(project_name) + + print(f"⏰ Nomic Full Map Retrieval: {(time.monotonic() - start_time):.2f} seconds") + return {"map_id": f"iframe{map.id}", "map_link": map.map_link} + except Exception as e: + # Error: ValueError: You must specify a unique_id_field when creating a new project. 
+ if str(e) == 'You must specify a unique_id_field when creating a new project.': # type: ignore + print("Nomic map does not exist yet, probably because you have less than 20 queries/documents on your project: ", + e) + else: + print("ERROR in get_nomic_map():", e) + sentry_sdk.capture_exception(e) + return {"map_id": None, "map_link": None} + + +def create_nomic_map(course_name: str, log_data: list): + """ + Creates a Nomic map for new courses and those which previously had < 20 queries. + 1. fetches supabase conversations for course + 2. appends current embeddings and metadata to it + 2. creates map if there are at least 20 queries + """ + nomic.login(os.getenv('NOMIC_API_KEY')) # login during start of flask app + NOMIC_MAP_NAME_PREFIX = 'Conversation Map for ' + + print(f"in create_nomic_map() for {course_name}") + # initialize supabase + supabase_client = supabase.create_client( # type: ignore + supabase_url=os.getenv('SUPABASE_URL'), # type: ignore + supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore + + try: + # fetch all conversations with this new course (we expect <=20 conversations, because otherwise the map should be made already) + response = supabase_client.table("llm-convo-monitor").select("*").eq("course_name", course_name).execute() + data = response.data + df = pd.DataFrame(data) + + if len(data) < 19: + return None + else: + # get all queries for course and create metadata + user_queries = [] + metadata = [] + i = 1 + conversation_exists = False + + # current log details + log_messages = log_data['conversation']['messages'] # type: ignore + log_user_email = log_data['conversation']['user_email'] # type: ignore + log_conversation_id = log_data['conversation']['id'] # type: ignore + + for _index, row in df.iterrows(): + user_email = row['user_email'] + created_at = pd.to_datetime(row['created_at']).strftime('%Y-%m-%d %H:%M:%S') + convo = row['convo'] + messages = convo['messages'] + + first_message = messages[0]['content'] + if 
isinstance(first_message, list): + first_message = first_message[0]['text'] + + user_queries.append(first_message) + + # create metadata for multi-turn conversation + conversation = "" + for message in messages: + # string of role: content, role: content, ... + if message['role'] == 'user': # type: ignore + emoji = "πŸ™‹ " + else: + emoji = "πŸ€– " + + if isinstance(message['content'], list): + text = message['content'][0]['text'] + else: + text = message['content'] + + conversation += "\n>>> " + emoji + message['role'] + ": " + text + "\n" + + # append current chat to previous chat if convo already exists + if convo['id'] == log_conversation_id: + conversation_exists = True + + for m in log_messages: + if m['role'] == 'user': # type: ignore + emoji = "πŸ™‹ " + else: + emoji = "πŸ€– " + + if isinstance(m['content'], list): + text = m['content'][0]['text'] + else: + text = m['content'] + conversation += "\n>>> " + emoji + m['role'] + ": " + text + "\n" + + # adding modified timestamp + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + # add to metadata + metadata_row = { + "course": row['course_name'], + "conversation": conversation, + "conversation_id": convo['id'], + "id": i, + "user_email": user_email, + "first_query": first_message, + "created_at": created_at, + "modified_at": current_time + } + metadata.append(metadata_row) + i += 1 + + # add current log as a new data point if convo doesn't exist + if not conversation_exists: + user_queries.append(log_messages[0]['content']) + conversation = "" + for message in log_messages: + if message['role'] == 'user': + emoji = "πŸ™‹ " + else: + emoji = "πŸ€– " + + if isinstance(message['content'], list): + text = message['content'][0]['text'] + else: + text = message['content'] + conversation += "\n>>> " + emoji + message['role'] + ": " + text + "\n" + + # adding timestamp + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + metadata_row = { + "course": course_name, + 
"conversation": conversation, + "conversation_id": log_conversation_id, + "id": i, + "user_email": log_user_email, + "first_query": log_messages[0]['content'], + "created_at": current_time, + "modified_at": current_time + } + metadata.append(metadata_row) + + metadata = pd.DataFrame(metadata) + embeddings_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE) # type: ignore + embeddings = embeddings_model.embed_documents(user_queries) + + # create Atlas project + project_name = NOMIC_MAP_NAME_PREFIX + course_name + index_name = course_name + "_convo_index" + project = atlas.map_embeddings( + embeddings=np.array(embeddings), + data=metadata, # type: ignore - this is the correct type, the func signature from Nomic is incomplete + id_field='id', + build_topic_model=True, + topic_label_field='first_query', + name=project_name, + colorable_fields=['conversation_id', 'first_query']) + project.create_index(index_name, build_topic_model=True) + return f"Successfully created Nomic map for {course_name}" + except Exception as e: + # Error: ValueError: You must specify a unique_id_field when creating a new project. + if str(e) == 'You must specify a unique_id_field when creating a new project.': # type: ignore + print("Nomic map does not exist yet, probably because you have less than 20 queries on your project: ", e) + else: + print("ERROR in create_nomic_map():", e) + sentry_sdk.capture_exception(e) + + return "failed" + + +## -------------------------------- DOCUMENT MAP FUNCTIONS --------------------------------- ## + + +def create_document_map(course_name: str): + """ + This is a function which creates a document map for a given course from scratch + 1. Gets count of documents for the course + 2. If less than 20, returns a message that a map cannot be created + 3. If greater than 20, iteratively fetches documents in batches of 25 + 4. Prepares metadata and embeddings for nomic upload + 5. 
Creates a new map and uploads the data + + Args: + course_name: str + Returns: + str: success or failed + """ + print("in create_document_map()") + nomic.login(os.getenv('NOMIC_API_KEY')) + NOMIC_MAP_NAME_PREFIX = 'Document Map for ' + + # initialize supabase + supabase_client = supabase.create_client( # type: ignore + supabase_url=os.getenv('SUPABASE_URL'), # type: ignore + supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore + + try: + # check if map exists + response = supabase_client.table("projects").select("doc_map_id").eq("course_name", course_name).execute() + if response.data: + return "Map already exists for this course." + + # fetch relevant document data from Supabase + response = supabase_client.table("documents").select("id", + count="exact").eq("course_name", + course_name).order('id', + desc=False).execute() + if not response.count: + return "No documents found for this course." + + total_doc_count = response.count + print("Total number of documents in Supabase: ", total_doc_count) + + # minimum 20 docs needed to create map + if total_doc_count > 19: + + first_id = response.data[0]['id'] + combined_dfs = [] + curr_total_doc_count = 0 + doc_count = 0 + first_batch = True + + # iteratively query in batches of 25 + while curr_total_doc_count < total_doc_count: + + response = supabase_client.table("documents").select( + "id, created_at, s3_path, url, readable_filename, contexts").eq("course_name", course_name).gte( + 'id', first_id).order('id', desc=False).limit(25).execute() + df = pd.DataFrame(response.data) + combined_dfs.append(df) # list of dfs + + curr_total_doc_count += len(response.data) + doc_count += len(response.data) + + if doc_count >= 1000: # upload to Nomic every 1000 docs + + # concat all dfs from the combined_dfs list + final_df = pd.concat(combined_dfs, ignore_index=True) + + # prep data for nomic upload + embeddings, metadata = data_prep_for_doc_map(final_df) + + if first_batch: + # create a new map + print("Creating new 
map...") + project_name = NOMIC_MAP_NAME_PREFIX + course_name + index_name = course_name + "_doc_index" + topic_label_field = "text" + colorable_fields = ["readable_filename", "text"] + result = create_map(embeddings, metadata, project_name, index_name, topic_label_field, colorable_fields) + # update flag + first_batch = False + + else: + # append to existing map + print("Appending data to existing map...") + project_name = NOMIC_MAP_NAME_PREFIX + course_name + # add project lock logic here + result = append_to_map(embeddings, metadata, project_name) + + # reset variables + combined_dfs = [] + doc_count = 0 + + # set first_id for next iteration + first_id = response.data[-1]['id'] + 1 + + # upload last set of docs + final_df = pd.concat(combined_dfs, ignore_index=True) + embeddings, metadata = data_prep_for_doc_map(final_df) + project_name = NOMIC_MAP_NAME_PREFIX + course_name + if first_batch: + index_name = course_name + "_doc_index" + topic_label_field = "text" + colorable_fields = ["readable_filename", "text"] + result = create_map(embeddings, metadata, project_name, index_name, topic_label_field, colorable_fields) + else: + result = append_to_map(embeddings, metadata, project_name) + print("Atlas upload status: ", result) + + # log info to supabase + project = AtlasProject(name=project_name, add_datums_if_exists=True) + project_id = project.id + project.rebuild_maps() + project_info = {'course_name': course_name, 'doc_map_id': project_id} + response = supabase_client.table("projects").insert(project_info).execute() + print("Response from supabase: ", response) + return "success" + else: + return "Cannot create a map because there are less than 20 documents in the course." + except Exception as e: + print(e) + sentry_sdk.capture_exception(e) + return "failed" + + +def delete_from_document_map(course_name: str, ids: list): + """ + This function is used to delete datapoints from a document map. 
+ Currently used within the delete_data() function in vector_database.py + Args: + course_name: str + ids: list of str + """ + print("in delete_from_document_map()") + + try: + # check if project exists + response = SUPABASE_CLIENT.table("projects").select("doc_map_id").eq("course_name", course_name).execute() + if response.data: + project_id = response.data[0]['doc_map_id'] + else: + return "No document map found for this course" + + # fetch project from Nomic + project = AtlasProject(project_id=project_id, add_datums_if_exists=True) + + # delete the ids from Nomic + print("Deleting point from document map:", project.delete_data(ids)) + with project.wait_for_project_lock(): + project.rebuild_maps() + return "Successfully deleted from Nomic map" + except Exception as e: + print(e) + sentry_sdk.capture_exception(e) + return "Error in deleting from document map: {e}" + + +def log_to_document_map(data: dict): + """ + This is a function which appends new documents to an existing document map. It's called + at the end of split_and_upload() after inserting data to Supabase. + Args: + data: dict - the response data from Supabase insertion + """ + print("in add_to_document_map()") + + try: + # check if map exists + course_name = data['course_name'] + response = SUPABASE_CLIENT.table("projects").select("doc_map_id").eq("course_name", course_name).execute() + if response.data: + project_id = response.data[0]['doc_map_id'] + else: + # create a map + map_creation_result = create_document_map(course_name) + if map_creation_result != "success": + return "The project has less than 20 documents and a map cannot be created." 
+ else: + # fetch project id + response = SUPABASE_CLIENT.table("projects").select("doc_map_id").eq("course_name", course_name).execute() + project_id = response.data[0]['doc_map_id'] + + project = AtlasProject(project_id=project_id, add_datums_if_exists=True) + #print("Inserted data: ", data) + + embeddings = [] + metadata = [] + context_count = 0 + # prep data for nomic upload + for row in data['contexts']: + context_count += 1 + embeddings.append(row['embedding']) + metadata.append({ + "id": str(data['id']) + "_" + str(context_count), + "doc_ingested_at": data['created_at'], + "s3_path": data['s3_path'], + "url": data['url'], + "readable_filename": data['readable_filename'], + "created_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "text": row['text'] + }) + embeddings = np.array(embeddings) + metadata = pd.DataFrame(metadata) + print("Shape of embeddings: ", embeddings.shape) + + # append to existing map + project_name = "Document Map for " + course_name + result = append_to_map(embeddings, metadata, project_name) + + # check if project is accepting new datums + if project.is_accepting_data: + with project.wait_for_project_lock(): + project.rebuild_maps() + + # with project.wait_for_project_lock(): + # project.rebuild_maps() + return result + + except Exception as e: + print(e) + sentry_sdk.capture_exception(e) + return "Error in appending to map: {e}" + + +def create_map(embeddings, metadata, map_name, index_name, topic_label_field, colorable_fields): + """ + Generic function to create a Nomic map from given parameters. 
+ Args: + embeddings: np.array of embeddings + metadata: pd.DataFrame of metadata + map_name: str + index_name: str + topic_label_field: str + colorable_fields: list of str + """ + nomic.login(os.getenv('NOMIC_API_KEY')) + + try: + project = atlas.map_embeddings(embeddings=embeddings, + data=metadata, + id_field="id", + build_topic_model=True, + name=map_name, + topic_label_field=topic_label_field, + colorable_fields=colorable_fields, + add_datums_if_exists=True) + project.create_index(index_name, build_topic_model=True) + return "success" + except Exception as e: + print(e) + return "Error in creating map: {e}" + + +def append_to_map(embeddings, metadata, map_name): + """ + Generic function to append new data to an existing Nomic map. + Args: + embeddings: np.array of embeddings + metadata: pd.DataFrame of Nomic upload metadata + map_name: str + """ + nomic.login(os.getenv('NOMIC_API_KEY')) + try: + project = atlas.AtlasProject(name=map_name, add_datums_if_exists=True) + with project.wait_for_project_lock(): + project.add_embeddings(embeddings=embeddings, data=metadata) + return "Successfully appended to Nomic map" + except Exception as e: + print(e) + return "Error in appending to map: {e}" + + +def data_prep_for_doc_map(df: pd.DataFrame): + """ + This function prepares embeddings and metadata for nomic upload in document map creation. 
+ Args: + df: pd.DataFrame - the dataframe of documents from Supabase + Returns: + embeddings: np.array of embeddings + metadata: pd.DataFrame of metadata + """ + print("in data_prep_for_doc_map()") + + metadata = [] + embeddings = [] + texts = [] + + for index, row in df.iterrows(): + + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + if row['url'] == None: + row['url'] = "" + # iterate through all contexts and create separate entries for each + context_count = 0 + for context in row['contexts']: + context_count += 1 + text_row = context['text'] + embeddings_row = context['embedding'] + + meta_row = { + "id": str(row['id']) + "_" + str(context_count), + "doc_ingested_at": row['created_at'], + "s3_path": row['s3_path'], + "url": row['url'], + "readable_filename": row['readable_filename'], + "created_at": current_time, + "text": text_row + } + + embeddings.append(embeddings_row) + metadata.append(meta_row) + texts.append(text_row) + + embeddings_np = np.array(embeddings, dtype=object) + print("Shape of embeddings: ", embeddings_np.shape) + + # check dimension if embeddings_np is (n, 1536) + if len(embeddings_np.shape) < 2: + print("Creating new embeddings...") + # embeddings_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE, + # openai_api_base=os.getenv('AZURE_OPENAI_BASE'), + # openai_api_key=os.getenv('AZURE_OPENAI_KEY')) # type: ignore + embeddings_model = OpenAIEmbeddings(openai_api_type="openai", + openai_api_base="https://api.openai.com/v1/", + openai_api_key=os.getenv('VLADS_OPENAI_KEY')) # type: ignore + embeddings = embeddings_model.embed_documents(texts) + + metadata = pd.DataFrame(metadata) + embeddings = np.array(embeddings) + + return embeddings, metadata + + +if __name__ == '__main__': + pass diff --git a/ai_ta_backend/canvas.py b/ai_ta_backend/canvas.py deleted file mode 100644 index 324621b4..00000000 --- a/ai_ta_backend/canvas.py +++ /dev/null @@ -1,264 +0,0 @@ -import os -import shutil - -import requests -from 
canvasapi import Canvas -import sentry_sdk - -from ai_ta_backend.aws import upload_data_files_to_s3 -from ai_ta_backend.vector_database import Ingest - - -class CanvasAPI(): - - def __init__(self): - self.canvas_client = Canvas("https://canvas.illinois.edu", os.getenv('CANVAS_ACCESS_TOKEN')) - self.headers = {"Authorization": "Bearer " + os.getenv('CANVAS_ACCESS_TOKEN')} - - def add_users(self, canvas_course_id: str, course_name: str): - """ - Get all users in a course by course ID and add them to uiuc.chat course - - Student profile does not have access to emails. - - Currently collecting all names in a list. - """ - course = self.canvas_client.get_course(canvas_course_id) - users = course.get_users() - - user_names = [] - for user in users: - user_names.append(user.name) - - print("Collected names: ", user_names) - - if len(user_names) > 0: - return "Success" - else: - return "Failed" - - def download_course_content(self, canvas_course_id: int, dest_folder: str, content_ingest_dict: dict) -> str: - """ - Downloads all Canvas course materials through the course ID and stores in local directory. - 1. Iterate through content_ingest_dict and download all. - 2. Maintain a list of URLs and convert HTML strings to proper format. 
- """ - print("In download_course_content") - - try: - api_path = "https://canvas.illinois.edu/api/v1/courses/" + str(canvas_course_id) - - # Iterate over the content_ingest_dict - for key, value in content_ingest_dict.items(): - if value is True: - if key == 'files': - self.download_files(dest_folder, api_path) - elif key == 'pages': - self.download_pages(dest_folder, api_path) - elif key == 'modules': - self.download_modules(dest_folder, api_path) - elif key == 'syllabus': - self.download_syllabus(dest_folder, api_path) - elif key == 'assignments': - self.download_assignments(dest_folder, api_path) - elif key == 'discussions': - self.download_discussions(dest_folder, api_path) - - # at this point, we have all extracted files in the dest_folder. - - return "Success" - except Exception as e: - sentry_sdk.capture_exception(e) - return "Failed! Error: " + str(e) - - def ingest_course_content(self, canvas_course_id: int, course_name: str, content_ingest_dict: dict = None) -> str: - """ - Ingests all Canvas course materials through the course ID. - 1. Download zip file from Canvas and store in local directory - 2. Upload all files to S3 - 3. Call bulk_ingest() to ingest all files into QDRANT - 4. Delete extracted files from local directory - """ - - print("In ingest_course_content") - try: - # a dictionary of all contents we want to ingest - files, pages, modules, syllabus, assignments, discussions. - if content_ingest_dict is None: - content_ingest_dict = { - 'files': True, - 'pages': True, - 'modules': True, - 'syllabus': True, - 'assignments': True, - 'discussions': True - } - - # Create a canvas directory with a course folder inside it. 
- canvas_dir = "canvas_materials" - folder_name = "canvas_course_" + str(canvas_course_id) + "_ingest" - folder_path = canvas_dir + "/" + folder_name - - if os.path.exists(canvas_dir): - print("Canvas directory already exists") - else: - os.mkdir(canvas_dir) - print("Canvas directory created") - - if os.path.exists(canvas_dir + "/" + folder_name): - print("Course folder already exists") - else: - os.mkdir(canvas_dir + "/" + folder_name) - print("Course folder created") - - # Download course content - self.download_course_content(canvas_course_id, folder_path, content_ingest_dict) - - # Upload files to S3 - s3_paths = upload_data_files_to_s3(course_name, folder_path) - - # Delete files from local directory - shutil.rmtree(folder_path) - - # Ingest files into QDRANT - ingest = Ingest() - canvas_ingest = ingest.bulk_ingest(s3_paths, course_name=course_name) - return canvas_ingest - - except Exception as e: - print(e) - sentry_sdk.capture_exception(e) - return "Failed" - - def download_files(self, dest_folder: str, api_path: str) -> str: - """ - Downloads all files in a Canvas course into given folder. - """ - try: - # files_request = requests.get(api_path + "/files", headers=self.headers) - # files = files_request.json() - - course = self.canvas_client.get_course(api_path.split('/')[-1]) - files = course.get_files() - - for file in files: - # file_name = file['filename'] - file_name = file.filename - print("Downloading file: ", file_name) - - # file_download = requests.get(file['url'], headers=self.headers) - file_download = requests.get(file.url, headers=self.headers) - with open(os.path.join(dest_folder, file_name), 'wb') as f: - f.write(file_download.content) - - return "Success" - except Exception as e: - sentry_sdk.capture_exception(e) - return "Failed! Error: " + str(e) - - def download_pages(self, dest_folder: str, api_path: str) -> str: - """ - Downloads all pages as HTML and stores them in given folder. 
- """ - print("In download_pages") - try: - pages_request = requests.get(api_path + "/pages", headers=self.headers) - pages = pages_request.json() - - for page in pages: - if page['html_url'] != '': - page_name = page['url'] + ".html" - page_content_request = requests.get(api_path + "/pages/" + str(page['page_id']), headers=self.headers) - page_body = page_content_request.json()['body'] - - with open(dest_folder + "/" + page_name, 'w') as html_file: - html_file.write(page_body) - - return "Success" - except Exception as e: - sentry_sdk.capture_exception(e) - return "Failed! Error: " + str(e) - - def download_syllabus(self, dest_folder: str, api_path: str) -> str: - """ - Downloads syllabus as HTML and stores in given folder. - """ - print("In download_syllabus") - try: - course_settings_request = requests.get(api_path + "?include=syllabus_body", headers=self.headers) - syllabus_body = course_settings_request.json()['syllabus_body'] - syllabus_name = "syllabus.html" - - with open(dest_folder + "/" + syllabus_name, 'w') as html_file: - html_file.write(syllabus_body) - return "Success" - except Exception as e: - sentry_sdk.capture_exception(e) - return "Failed! Error: " + str(e) - - def download_modules(self, dest_folder: str, api_path: str) -> str: - """ - Downloads all content uploaded in modules. - Modules may contain: assignments, quizzes, files, pages, discussions, external tools and external urls. - Rest of the things are covered in other functions. 
- """ - print("In download_modules") - try: - module_request = requests.get(api_path + "/modules?include=items", headers=self.headers) - modules = module_request.json() - - for module in modules: - module_items = module['items'] - for item in module_items: - if item['type'] == 'ExternalUrl': - external_url = item['external_url'] - url_title = item['title'] - - # Download external url as HTML - response = requests.get(external_url) - if response.status_code == 200: - html_file_name = url_title + ".html" - with open(dest_folder + "/" + html_file_name, 'w') as html_file: - html_file.write(response.text) - return "Success" - except Exception as e: - sentry_sdk.capture_exception(e) - return "Failed! Error: " + str(e) - - def download_assignments(self, dest_folder: str, api_path: str) -> str: - """ - The description attribute has the assignment content in HTML format. Access that and store it as an HTML file. - """ - print("In download_assignments") - try: - assignment_request = requests.get(api_path + "/assignments", headers=self.headers) - assignments = assignment_request.json() - - for assignment in assignments: - if assignment['description'] is not None and assignment['description'] != "": - assignment_name = "assignment_" + str(assignment['id']) + ".html" - assignment_description = assignment['description'] - - with open(dest_folder + "/" + assignment_name, 'w') as html_file: - html_file.write(assignment_description) - return "Success" - except Exception as e: - sentry_sdk.capture_exception(e) - return "Failed! Error: " + str(e) - - def download_discussions(self, dest_folder: str, api_path: str) -> str: - """ - Download course discussions as HTML and store in given folder. 
import os

import boto3
from injector import inject


class AWSStorage:
    """
    Small wrapper around a boto3 S3 client.

    The client is created once in __init__ from the AWS_ACCESS_KEY_ID and
    AWS_SECRET_ACCESS_KEY environment variables (a missing variable raises
    KeyError).
    """

    @inject
    def __init__(self):
        # S3
        credentials = {
            'aws_access_key_id': os.environ['AWS_ACCESS_KEY_ID'],
            'aws_secret_access_key': os.environ['AWS_SECRET_ACCESS_KEY'],
        }
        self.s3_client = boto3.client('s3', **credentials)

    def upload_file(self, file_path: str, bucket_name: str, object_name: str):
        """Upload the local file `file_path` to `bucket_name` under the key `object_name`."""
        client = self.s3_client
        client.upload_file(file_path, bucket_name, object_name)

    def download_file(self, object_name: str, bucket_name: str, file_path: str):
        """Download the key `object_name` from `bucket_name` into the local path `file_path`."""
        client = self.s3_client
        client.download_file(bucket_name, object_name, file_path)

    def delete_file(self, bucket_name: str, s3_path: str):
        """Delete the key `s3_path` from `bucket_name` and return the client's response."""
        target = {'Bucket': bucket_name, 'Key': s3_path}
        return self.s3_client.delete_object(**target)

    def generatePresignedUrl(self, object: str, bucket_name: str, s3_path: str, expiration: int = 3600):
        """
        Return a presigned 'get_object' URL for `s3_path` in `bucket_name`.

        `expiration` is passed through as ExpiresIn. The `object` argument is
        accepted but not used by this method.
        """
        # generate presigned URL
        params = {'Bucket': bucket_name, 'Key': s3_path}
        return self.s3_client.generate_presigned_url('get_object', Params=params, ExpiresIn=expiration)
import os

import supabase
from injector import inject


class SQLDatabase:
    """
    Query helpers over a Supabase client.

    Every public method builds one query and returns whatever `.execute()`
    returns. The documents table name is read from the
    SUPABASE_DOCUMENTS_TABLE environment variable at call time by the
    material-related methods; other methods use the literal table names
    "documents", "projects" and "llm-convo-monitor".
    """

    @inject
    def __init__(self, db_url: str):
        # NOTE: `db_url` is accepted but not used; the connection settings come
        # from the SUPABASE_URL and SUPABASE_API_KEY environment variables.
        # Create a Supabase client
        self.supabase_client = supabase.create_client(  # type: ignore
            supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY'])

    def getAllMaterialsForCourse(self, course_name: str):
        """Select course_name, s3_path, readable_filename, url and base_url for every document of the course."""
        return self.supabase_client.table(
            os.environ['SUPABASE_DOCUMENTS_TABLE']).select('course_name, s3_path, readable_filename, url, base_url').eq(
                'course_name', course_name).execute()

    def getMaterialsForCourseAndS3Path(self, course_name: str, s3_path: str):
        """Select id, s3_path and contexts for the documents of the course stored at `s3_path`."""
        return self.getMaterialsForCourseAndKeyAndValue(course_name, 's3_path', s3_path)

    def getMaterialsForCourseAndKeyAndValue(self, course_name: str, key: str, value: str):
        """Select id, s3_path and contexts for the documents of the course where column `key` equals `value`."""
        return self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, s3_path, contexts").eq(
            key, value).eq('course_name', course_name).execute()

    def deleteMaterialsForCourseAndKeyAndValue(self, course_name: str, key: str, value: str):
        """Delete the documents of the course where column `key` equals `value`."""
        return self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).delete().eq(key, value).eq(
            'course_name', course_name).execute()

    def deleteMaterialsForCourseAndS3Path(self, course_name: str, s3_path: str):
        """Delete the documents of the course stored at `s3_path`."""
        return self.deleteMaterialsForCourseAndKeyAndValue(course_name, 's3_path', s3_path)

    def getProjectsMapForCourse(self, course_name: str):
        """Select doc_map_id from the "projects" table for the course."""
        return self.supabase_client.table("projects").select("doc_map_id").eq("course_name", course_name).execute()

    def getDocumentsBetweenDates(self, course_name: str, from_date: str, to_date: str, table_name: str):
        """
        Select ids (with an exact count) from `table_name` for the course,
        ordered by ascending id and optionally bounded on created_at.

        An empty string for `from_date` / `to_date` means "no bound on that side".
        """
        has_from = from_date != ''
        has_to = to_date != ''

        if has_from and has_to:
            # query between the dates
            print("from_date and to_date")
        elif has_from:
            # query from from_date to now
            print("only from_date")
        elif has_to:
            # query from beginning to to_date
            print("only to_date")
        else:
            # query all data
            print("No dates")

        # Build the query once and add only the bounds that were supplied.
        query = self.supabase_client.table(table_name).select("id", count='exact').eq("course_name", course_name)
        if has_from:
            query = query.gte('created_at', from_date)
        if has_to:
            query = query.lte('created_at', to_date)
        return query.order('id', desc=False).execute()

    def getAllFromTableForDownloadType(self, course_name: str, download_type: str, first_id: int):
        """
        Select up to 100 full rows with id >= `first_id` for the course.

        Reads the "documents" table when `download_type` is 'documents', and
        "llm-convo-monitor" for any other value.
        """
        table_name = "documents" if download_type == 'documents' else "llm-convo-monitor"
        return self.supabase_client.table(table_name).select("*").eq("course_name", course_name).gte(
            'id', first_id).order('id', desc=False).limit(100).execute()

    def getAllConversationsBetweenIds(self, course_name: str, first_id: int, last_id: int):
        """Select up to 25 conversations of the course with first_id <= id <= last_id, ascending by id."""
        return self.supabase_client.table("llm-convo-monitor").select("*").eq("course_name", course_name).gte(
            'id', first_id).lte('id', last_id).order('id', desc=False).limit(25).execute()

    def getDocsForIdsGte(self, course_name: str, first_id: int, fields: str = "*", limit: int = 100):
        """Select `fields` for up to `limit` documents of the course with id >= `first_id`, ascending by id."""
        return self.supabase_client.table("documents").select(fields).eq("course_name", course_name).gte(
            'id', first_id).order('id', desc=False).limit(limit).execute()

    def insertProjectInfo(self, project_info):
        """Insert `project_info` into the "projects" table."""
        return self.supabase_client.table("projects").insert(project_info).execute()

    def getAllFromLLMConvoMonitor(self, course_name: str):
        """Select every "llm-convo-monitor" row of the course."""
        return self.supabase_client.table("llm-convo-monitor").select("*").eq("course_name", course_name).execute()
+ """ + myfilter = models.Filter(must=[ + models.FieldCondition(key='course_name', match=models.MatchValue(value=course_name)), + ]) + search_results = self.qdrant_client.search( + collection_name=os.environ['QDRANT_COLLECTION_NAME'], + query_filter=myfilter, + with_vectors=False, + query_vector=user_query_embedding, + limit=top_n, # Return n closest points + + # In a system with high disk latency, the re-scoring step may become a bottleneck: https://qdrant.tech/documentation/guides/quantization/ + search_params=models.SearchParams(quantization=models.QuantizationSearchParams(rescore=False))) + return search_results + + def delete_data(self, collection_name: str, key: str, value: str): + """ + Delete data from the vector database. + """ + return self.qdrant_client.delete( + collection_name=collection_name, + points_selector=models.Filter(must=[ + models.FieldCondition( + key=key, + match=models.MatchValue(value=value), + ), + ]), + ) diff --git a/ai_ta_backend/emails.py b/ai_ta_backend/emails.py deleted file mode 100644 index 2f17dce0..00000000 --- a/ai_ta_backend/emails.py +++ /dev/null @@ -1,38 +0,0 @@ -import os -import smtplib -from email.mime.text import MIMEText -from email.mime.multipart import MIMEMultipart - - -def send_email(subject: str, body_text: str, sender: str, receipients: list, bcc_receipients: list): - """ - Send an email using the AWS SES service - :param subject: The subject of the email - :param body_text: The body of the email - :param sender: The email address of the sender - :param receipients: A list of email addresses to send the email to - :param bcc_receipients: A list of email addresses to send the email to as BCC - :return: A string indicating the result of the email send operation - - """ - # Create message content - message = MIMEMultipart("alternative") - message["Subject"] = subject - message["From"] = sender - message["To"] = ", ".join(receipients) - - if len(bcc_receipients) > 0: - message["Bcc"] = ", ".join(bcc_receipients) - - 
from flask_executor import Executor
from injector import inject


class ExecutorInterface:
    """Executor contract: implementations must provide submit()."""

    def submit(self, fn, *args, **kwargs):
        raise NotImplementedError


class FlaskExecutorAdapter(ExecutorInterface):
    """
    Adapter around a Flask Executor instance.

    Intended for I/O-bound work (network requests, file I/O) where the time
    spent waiting justifies the cost of running the task asynchronously.
    """

    @inject
    def __init__(self, executor: Executor):
        self.executor = executor

    def submit(self, fn, *args, **kwargs):
        """Forward the call to the wrapped executor and return its result unchanged."""
        future = self.executor.submit(fn, *args, **kwargs)
        return future
from concurrent.futures import ThreadPoolExecutor


class ThreadPoolExecutorInterface:
    """Contract for thread-pool style executors: submit() and map()."""

    def submit(self, fn, *args, **kwargs):
        raise NotImplementedError

    def map(self, fn, *iterables, timeout=None, chunksize=1):
        # Declared here so the interface matches what the adapter exposes.
        raise NotImplementedError


class ThreadPoolExecutorAdapter(ThreadPoolExecutorInterface):
    """
    Adapter for Python's ThreadPoolExecutor, suitable for I/O-bound tasks that can be performed concurrently.
    Use this executor for tasks that are largely waiting on I/O operations, such as database queries or file reads,
    where the GIL (Global Interpreter Lock) does not become a bottleneck.

    Not for CPU-bound tasks like heavy computation, as the GIL would prevent true parallel execution.

    This executor is particularly useful when you want more control over the number of concurrent threads
    than what Flask Executor provides, or when you're not working within a Flask application context.
    """

    def __init__(self, max_workers=None):
        self.executor = ThreadPoolExecutor(max_workers=max_workers)

    def submit(self, fn, *args, **kwargs):
        """Schedule fn(*args, **kwargs) on the pool and return its Future."""
        return self.executor.submit(fn, *args, **kwargs)

    def map(self, fn, *iterables, timeout=None, chunksize=1):
        """Apply fn across the iterables on the pool; returns the pool's result iterator."""
        return self.executor.map(fn, *iterables, timeout=timeout, chunksize=chunksize)

    def shutdown(self, wait=True):
        """Release the pool's worker threads; `wait` is forwarded to the underlying executor."""
        self.executor.shutdown(wait=wait)
- """ - - if from_date != '' and to_date != '': - # query between the dates - print("from_date and to_date") - response = SUPABASE_CLIENT.table("documents").select("id", count='exact').eq("course_name", course_name).gte( - 'created_at', from_date).lte('created_at', to_date).order('id', desc=False).execute() - - elif from_date != '' and to_date == '': - # query from from_date to now - print("only from_date") - response = SUPABASE_CLIENT.table("documents").select("id", count='exact').eq("course_name", course_name).gte( - 'created_at', from_date).order('id', desc=False).execute() - - elif from_date == '' and to_date != '': - # query from beginning to to_date - print("only to_date") - response = SUPABASE_CLIENT.table("documents").select("id", count='exact').eq("course_name", course_name).lte( - 'created_at', to_date).order('id', desc=False).execute() - - else: - # query all data - print("No dates") - response = SUPABASE_CLIENT.table("documents").select("id", - count='exact').eq("course_name", - course_name).order('id', - desc=False).execute() - - # add a condition to route to direct download or s3 download - if response.count > 1000: - # call background task to upload to s3 - - filename = course_name + '_' + str(uuid.uuid4()) + '_documents.zip' - s3_filepath = s3_file = f"courses/{course_name}/{filename}" - # background task of downloading data - map it with above ID - executor = ProcessPoolExecutor() - executor.submit(export_data_in_bg, response, "documents", course_name, s3_filepath) - return {"response": 'Download from S3', "s3_path": s3_filepath} - - else: - # Fetch data - if response.count > 0: - # batch download - total_doc_count = response.count - first_id = response.data[0]['id'] - last_id = response.data[-1]['id'] - - print("total_doc_count: ", total_doc_count) - print("first_id: ", first_id) - print("last_id: ", last_id) - - curr_doc_count = 0 - filename = course_name + '_' + str(uuid.uuid4()) + '_documents.json' - file_path = os.path.join(os.getcwd(), 
filename) - - while curr_doc_count < total_doc_count: - print("Fetching data from id: ", first_id) - response = SUPABASE_CLIENT.table("documents").select("*").eq("course_name", course_name).gte('id', first_id).order('id', desc=False).limit(100).execute() - df = pd.DataFrame(response.data) - curr_doc_count += len(response.data) - - # writing to file - if not os.path.isfile(file_path): - df.to_json(file_path, orient='records') - else: - df.to_json(file_path, orient='records', lines=True, mode='a') - - if len(response.data) > 0: - first_id = response.data[-1]['id'] + 1 - - # Download file - try: - # zip file - zip_filename = filename.split('.')[0] + '.zip' - zip_file_path = os.path.join(os.getcwd(), zip_filename) - - with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf: - zipf.write(file_path, filename) - - os.remove(file_path) - return {"response": (zip_file_path, zip_filename, os.getcwd())} - except Exception as e: - print(e) - sentry_sdk.capture_exception(e) - return {"response": "Error downloading file."} - else: - return {"response": "No data found between the given dates."} - - -def export_data_in_bg(response, download_type, course_name, s3_path): - """ - This function is called in export_documents_csv() to upload the documents to S3. - 1. download the documents in batches of 100 and upload them to S3. - 2. generate a pre-signed URL for the S3 file. - 3. send an email to the course admins with the pre-signed URL. - - Args: - response (dict): The response from the Supabase query. - download_type (str): The type of download - 'documents' or 'conversations'. - course_name (str): The name of the course. - s3_path (str): The S3 path where the file will be uploaded. 
- """ - total_doc_count = response.count - first_id = response.data[0]['id'] - print("total_doc_count: ", total_doc_count) - print("pre-defined s3_path: ", s3_path) - - curr_doc_count = 0 - filename = s3_path.split('/')[-1].split('.')[0] + '.json' - file_path = os.path.join(os.getcwd(), filename) - - # download data in batches of 100 - while curr_doc_count < total_doc_count: - print("Fetching data from id: ", first_id) - if download_type == 'documents': - response = SUPABASE_CLIENT.table("documents").select("*").eq("course_name", course_name).gte('id', first_id).order('id', desc=False).limit(100).execute() - else: - response = SUPABASE_CLIENT.table("llm-convo-monitor").select("*").eq("course_name", course_name).gte('id', first_id).order('id', desc=False).limit(100).execute() - df = pd.DataFrame(response.data) - curr_doc_count += len(response.data) - - # writing to file - if not os.path.isfile(file_path): - df.to_json(file_path, orient='records') - else: - df.to_json(file_path, orient='records', lines=True, mode='a') - - if len(response.data) > 0: - first_id = response.data[-1]['id'] + 1 - - # zip file - zip_filename = filename.split('.')[0] + '.zip' - zip_file_path = os.path.join(os.getcwd(), zip_filename) - - with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf: - zipf.write(file_path, filename) - - print("zip file created: ", zip_file_path) - - try: - # upload to S3 - s3 = boto3.client( - 's3', - aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), - aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), - ) - - #s3_file = f"courses/{course_name}/exports/{os.path.basename(zip_file_path)}" - s3_file = f"courses/{course_name}/{os.path.basename(zip_file_path)}" - s3.upload_file(zip_file_path, os.getenv('S3_BUCKET_NAME'), s3_file) - - # remove local files - os.remove(file_path) - os.remove(zip_file_path) - - print("file uploaded to s3: ", s3_file) - - # pre-signed URL - s3_object = s3.head_object(Bucket=os.getenv('S3_BUCKET_NAME'), 
Key=s3_path) - - # generate presigned URL - s3_url = s3.generate_presigned_url('get_object', Params={'Bucket': os.getenv('S3_BUCKET_NAME'), 'Key': s3_path}, ExpiresIn=3600) - - # get admin email IDs - headers = { - "Authorization": f"Bearer {os.getenv('VERCEL_READ_ONLY_API_KEY')}", - "Content-Type": "application/json" - } - - hget_url = str(os.getenv('VERCEL_BASE_URL')) + "course_metadatas/" + course_name - response = requests.get(hget_url, headers=headers) - course_metadata = response.json() - course_metadata = json.loads(course_metadata['result']) - admin_emails = course_metadata['course_admins'] - bcc_emails = [] - - # check for Kastan's email and move to bcc - if 'kvday2@illinois.edu' in admin_emails: - admin_emails.remove('kvday2@illinois.edu') - bcc_emails.append('kvday2@illinois.edu') - - # add course owner email to admin_emails - admin_emails.append(course_metadata['course_owner']) - admin_emails = list(set(admin_emails)) - print("admin_emails: ", admin_emails) - print("bcc_emails: ", bcc_emails) - - # add a check for emails, don't send email if no admin emails - if len(admin_emails) == 0: - return "No admin emails found. Email not sent." - - # send email to admins - subject = "UIUC.chat Data Export Complete for " + course_name - body_text = "The data export for " + course_name + " is complete.\n\nYou can download the file from the following link: \n\n" + s3_url + "\n\nThis link will expire in 48 hours." - email_status = send_email(subject, body_text, os.getenv('EMAIL_SENDER'), admin_emails, bcc_emails) - print("email_status: ", email_status) - - return "File uploaded to S3. Email sent to admins." - - except Exception as e: - print(e) - return "Error: " + str(e) - -def check_s3_path_and_download(s3_path): - """ - This function checks if the file exists in S3 and downloads it. - Args: - s3_path (str): The S3 path of the file. 
- """ - s3 = boto3.client( - 's3', - aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), - aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), - ) - - try: - print("Checking if file exists in S3...", s3_path) - s3_object = s3.head_object(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path) - - # generate presigned URL - s3_url = s3.generate_presigned_url('get_object', Params={'Bucket': os.getenv('S3_BUCKET_NAME'), 'Key': s3_path}, ExpiresIn=172800) - print("Presigned URL: ", s3_url) - return {"response": s3_url} - - except botocore.exceptions.ClientError as e: - if e.response['Error']['Code'] == "404": - # The object does not exist. - return {"response": "Export is not complete yet. Please try again later."} - else: - # Something else has gone wrong. - sentry_sdk.capture_exception(e) - return {"response": "Error downloading file."} - - -def export_convo_history_json(course_name: str, from_date='', to_date=''): - """ - This function exports the conversation history to a csv file. - Args: - course_name (str): The name of the course. - from_date (str, optional): The start date for the data export. Defaults to ''. - to_date (str, optional): The end date for the data export. Defaults to ''. 
- """ - print("Exporting conversation history to csv file...") - - if from_date == '' and to_date == '': - # Get all data - print("No dates") - response = SUPABASE_CLIENT.table("llm-convo-monitor").select("id", count='exact').eq( - "course_name", course_name).order('id', desc=False).execute() - elif from_date != '' and to_date == '': - print("only from_date") - # Get data from from_date to now - response = SUPABASE_CLIENT.table("llm-convo-monitor").select("id", count='exact').eq( - "course_name", course_name).gte('created_at', from_date).order('id', desc=False).execute() - elif from_date == '' and to_date != '': - print("only to_date") - # Get data from beginning to to_date - response = SUPABASE_CLIENT.table("llm-convo-monitor").select("id", count='exact').eq( - "course_name", course_name).lte('created_at', to_date).order('id', desc=False).execute() - else: - print("both from_date and to_date") - # Get data from from_date to to_date - response = SUPABASE_CLIENT.table("llm-convo-monitor").select("id", count='exact').eq( - "course_name", course_name).gte('created_at', from_date).lte('created_at', to_date).order('id', - desc=False).execute() - - if response.count > 1000: - # call background task to upload to s3 - filename = course_name + '_' + str(uuid.uuid4()) + '_convo_history.zip' - s3_filepath = s3_file = f"courses/{course_name}/{filename}" - # background task of downloading data - map it with above ID - executor = ProcessPoolExecutor() - executor.submit(export_data_in_bg, response, "conversations", course_name, s3_filepath) - return {"response": 'Download from S3', "s3_path": s3_filepath} - - # Fetch data - if response.count > 0: - print("id count greater than zero") - first_id = response.data[0]['id'] - last_id = response.data[-1]['id'] - total_count = response.count - - filename = course_name + '_' + str(uuid.uuid4()) + '_convo_history.csv' - file_path = os.path.join(os.getcwd(), filename) - curr_count = 0 - # Fetch data in batches of 25 from first_id to 
last_id - while curr_count < total_count: - print("Fetching data from id: ", first_id) - response = SUPABASE_CLIENT.table("llm-convo-monitor").select("*").eq("course_name", course_name).gte( - 'id', first_id).lte('id', last_id).order('id', desc=False).limit(25).execute() - # Convert to pandas dataframe - df = pd.DataFrame(response.data) - curr_count += len(response.data) - - # Append to csv file - if not os.path.isfile(file_path): - df.to_json(file_path, orient='records', lines=True) - else: - df.to_json(file_path, orient='records', lines=True, mode='a') - - # Update first_id - if len(response.data) > 0: - first_id = response.data[-1]['id'] + 1 - print("updated first_id: ", first_id) - - # Download file - try: - # zip file - zip_filename = filename.split('.')[0] + '.zip' - zip_file_path = os.path.join(os.getcwd(), zip_filename) - - with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf: - zipf.write(file_path, filename) - os.remove(file_path) - - return {"response": (zip_file_path, zip_filename, os.getcwd())} - except Exception as e: - print(e) - sentry_sdk.capture_exception(e) - return {"response": "Error downloading file!"} - else: - return {"response": "No data found between the given dates."} diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py index 3c87f04b..ef311b6c 100644 --- a/ai_ta_backend/main.py +++ b/ai_ta_backend/main.py @@ -1,10 +1,7 @@ -import gc import os import threading import time from typing import List -import requests -from threading import Thread from dotenv import load_dotenv from flask import ( @@ -15,30 +12,32 @@ make_response, request, send_from_directory, - stream_with_context, ) from flask_cors import CORS from flask_executor import Executor -from posthog import Posthog -# import ray -import sentry_sdk - -from ai_ta_backend.canvas import CanvasAPI - -from ai_ta_backend.export_data import export_convo_history_json, export_documents_json, check_s3_path_and_download -from ai_ta_backend.nomic_logging 
import get_nomic_map, log_convo_to_nomic, create_document_map -from ai_ta_backend.vector_database import Ingest -from ai_ta_backend.web_scrape import WebScrape, mit_course_download - -# Sentry.io error logging -sentry_sdk.init( - dsn=os.getenv("SENTRY_DSN"), - # Set traces_sample_rate to 1.0 to capture 100% of transactions for performance monitoring. - traces_sample_rate=1.0, - # Set profiles_sample_rate to 1.0 to profile 100% of sampled transactions. - # We recommend adjusting this value in production. - profiles_sample_rate=1.0, - enable_tracing=True) +from flask_injector import FlaskInjector, RequestScope +from injector import Binder, SingletonScope + +from ai_ta_backend.database.aws import AWSStorage +from ai_ta_backend.database.sql import SQLDatabase +from ai_ta_backend.database.vector import VectorDatabase +from ai_ta_backend.executors.flask_executor import ( + ExecutorInterface, + FlaskExecutorAdapter, +) +from ai_ta_backend.executors.process_pool_executor import ( + ProcessPoolExecutorAdapter, + ProcessPoolExecutorInterface, +) +from ai_ta_backend.executors.thread_pool_executor import ( + ThreadPoolExecutorAdapter, + ThreadPoolExecutorInterface, +) +from ai_ta_backend.service.export_service import ExportService +from ai_ta_backend.service.nomic_service import NomicService +from ai_ta_backend.service.posthog_service import PosthogService +from ai_ta_backend.service.retrieval_service import RetrievalService +from ai_ta_backend.service.sentry_service import SentryService app = Flask(__name__) CORS(app) @@ -49,10 +48,6 @@ # load API keys from globally-availabe .env file load_dotenv() -# ray.init() - -print("NUM ACTIVE THREADS (top of main):", threading.active_count()) - @app.route('/') def index() -> Response: @@ -64,68 +59,14 @@ def index() -> Response: Returns: JSON: _description_ """ - response = jsonify({"Choo Choo": "Welcome to your Flask app πŸš…"}) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/coursera', 
methods=['GET']) -def coursera() -> Response: - try: - course_name: str = request.args.get('course_name') # type: ignore - coursera_course_name: str = request.args.get('coursera_course_name') # type: ignore - except Exception as e: - print(f"No course name provided: {e}") - - ingester = Ingest() - results = ingester.ingest_coursera(coursera_course_name, course_name) # type: ignore - del ingester - - response = jsonify(results) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/github', methods=['GET']) -def github() -> Response: - course_name: str = request.args.get('course_name', default='', type=str) - github_url: str = request.args.get('github_url', default='', type=str) - - if course_name == '' or github_url == '': - # proper web error "400 Bad request" - abort( - 400, - description= - f"Missing one or more required parameters: 'course_name' and 's3_path' must be provided. Course name: `{course_name}`, S3 path: `{github_url}`" - ) - - ingester = Ingest() - results = ingester.ingest_github(github_url, course_name) - del ingester - response = jsonify(results) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/delete-entire-course', methods=['GET']) -def delete_entire_course() -> Response: - try: - course_name: str = request.args.get('course_name') # type: ignore - # coursera_course_name: str = request.args.get('coursera_course_name') # type: ignore - except Exception as e: - print(f"No course name provided: {e}") - - ingester = Ingest() - results = ingester.delete_entire_course(course_name) # type: ignore - del ingester - - response = jsonify(results) + response = jsonify( + {"hi there, this is a 404": "Welcome to UIUC.chat backend πŸš… Read the docs here: https://docs.uiuc.chat/ "}) response.headers.add('Access-Control-Allow-Origin', '*') return response @app.route('/getTopContexts', methods=['GET']) -def getTopContexts() -> Response: +def getTopContexts(service: 
RetrievalService) -> Response: """Get most relevant contexts for a given search query. Return value @@ -160,7 +101,6 @@ def getTopContexts() -> Response: Exception Testing how exceptions are handled. """ - print("In getRopContexts in Main()") search_query: str = request.args.get('search_query', default='', type=str) course_name: str = request.args.get('course_name', default='', type=str) token_limit: int = request.args.get('token_limit', default=3000, type=int) @@ -174,197 +114,16 @@ def getTopContexts() -> Response: print("NUM ACTIVE THREADS (top of getTopContexts):", threading.active_count()) - ingester = Ingest() - found_documents = ingester.getTopContexts(search_query, course_name, token_limit) + found_documents = service.getTopContexts(search_query, course_name, token_limit) print("NUM ACTIVE THREADS (after instantiating Ingest() class in getTopContexts):", threading.active_count()) - del ingester response = jsonify(found_documents) response.headers.add('Access-Control-Allow-Origin', '*') return response -@app.route('/get_stuffed_prompt', methods=['GET']) -def get_stuffed_prompt() -> Response: - """Get most relevant contexts for a given search query. - - ## GET arguments - course name (optional) str - A json response with TBD fields. - search_query - top_n - - Returns - ------- - String - - """ - course_name: str = request.args.get('course_name', default='', type=str) - search_query: str = request.args.get('search_query', default='', type=str) - token_limit: int = request.args.get('token_limit', default=-1, type=int) - if course_name == '' or search_query == '' or token_limit == -1: - # proper web error "400 Bad request" - abort( - 400, - description= - f"Missing one or more required parameters: 'course_name', 'search_query', and 'token_limit' must be provided. 
Course name: `{course_name}`, Search query: `{search_query}`, Token limit: `{token_limit}`" - ) - - print("In /getTopContexts: ", search_query) - if search_query is None: - return jsonify({"error": "No parameter `search_query` provided. It is undefined."}) - if token_limit is None: - token_limit = 3_000 - else: - token_limit = int(token_limit) - - ingester = Ingest() - prompt = ingester.get_stuffed_prompt(search_query, course_name, token_limit) - del ingester - - response = jsonify(prompt) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/ingest', methods=['GET']) -def ingest() -> Response: - """Recursively ingests anything from S3 filepath and below. - Pass a s3_paths filepath (not URL) into our S3 bucket. - - Ingests all files, not just PDFs. - - args: - s3_paths: str | List[str] - - Returns: - str: Success or Failure message. Failure message if any failures. TODO: email on failure. - """ - s3_paths: List[str] | str = request.args.get('s3_paths', default='') - readable_filename: List[str] | str = request.args.get('readable_filename', default='') - course_name: List[str] | str = request.args.get('course_name', default='') - base_url: List[str] | str | None = request.args.get('base_url', default=None) - url: List[str] | str | None = request.args.get('url', default=None) - - print( - f"In top of /ingest route. course: {course_name}, s3paths: {s3_paths}, readable_filename: {readable_filename}, base_url: {base_url}, url: {url}" - ) - - if course_name == '' or s3_paths == '': - # proper web error "400 Bad request" - abort( - 400, - description= - f"Missing one or more required parameters: 'course_name' and 's3_path' must be provided. 
Course name: `{course_name}`, S3 path: `{s3_paths}`" - ) - - print("NUM ACTIVE THREADS (top of /ingest):", threading.active_count()) - - ingester = Ingest() - if readable_filename == '': - success_fail_dict = ingester.bulk_ingest(s3_paths, course_name, base_url=base_url, url=url) - else: - success_fail_dict = ingester.bulk_ingest(s3_paths, - course_name, - readable_filename=readable_filename, - base_url=base_url, - url=url) - print(f"Bottom of /ingest route. success or fail dict: {success_fail_dict}") - del ingester - - response = jsonify(success_fail_dict) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/ingest-web-text', methods=['POST']) -def ingest_web_text() -> Response: - """Ingests web text data provided in the POST request body. - - Expects JSON data containing: - - url: The URL of the web text to ingest. - - base_url: The base URL of the web text to ingest. - - title: The title of the web text. - - content: The content of the web text. - - course_name: The name of the course associated with the web text. - - Returns: - str: Success or Failure message. Failure message if any failures. TODO: email on failure. - """ - data = request.get_json() - url: str = data.get('url', '') - base_url: str = data.get('base_url', '') - title: str = data.get('title', '') - content: str = data.get('content', '') - course_name: str = data.get('courseName', '') - - print(f"In top of /ingest-web-text. course: {course_name}, base_url: {base_url}, url: {url}") - - if course_name == '' or url == '' or title == '': - # proper web error "400 Bad request" - abort( - 400, - description= - f"Missing one or more required parameters: course_name, url or title. Course name: `{course_name}`, url: `{url}`, content: `{content}`, title: `{title}`" - ) - - if content == '': - print(f"Content is empty. 
Skipping ingestion of {url}") - response = jsonify({"outcome": "success"}) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - print("NUM ACTIVE THREADS (top of /ingest-web-text):", threading.active_count()) - - ingester = Ingest() - success_fail = ingester.ingest_single_web_text(course_name, base_url, url, content, title) - del ingester - - print(f"Bottom of /ingest route. success or fail dict: {success_fail}") - - response = jsonify(success_fail) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/getContextStuffedPrompt', methods=['GET']) -def getContextStuffedPrompt() -> Response: - """ - Get a stuffed prompt for a given user question and course name. - Args : - search_query (str) - course_name (str) : used for metadata filtering - Returns : str - a very long "stuffed prompt" with question + summaries of 20 most relevant documents. - """ - print("In /getContextStuffedPrompt") - - ingester = Ingest() - search_query: str = request.args.get('search_query', default='', type=str) - course_name: str = request.args.get('course_name', default='', type=str) - top_n: int = request.args.get('top_n', default=-1, type=int) - top_k_to_search: int = request.args.get('top_k_to_search', default=-1, type=int) - - if search_query == '' or course_name == '' or top_n == -1 or top_k_to_search == -1: - # proper web error "400 Bad request" - abort( - 400, - description= - f"Missing one or more required parameters: 'search_query', 'course_name', 'top_n', and 'top_k_to_search' must be provided. 
Search query: `{search_query}`, Course name: `{course_name}`, Top N: `{top_n}`, Top K to search: `{top_k_to_search}`" - ) - - start_time = time.monotonic() - stuffed_prompt = ingester.get_context_stuffed_prompt(search_query, course_name, top_n, top_k_to_search) - print(f"⏰ Runtime of EXTREME prompt stuffing: {(time.monotonic() - start_time):.2f} seconds") - del ingester - - response = jsonify({"prompt": stuffed_prompt}) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - @app.route('/getAll', methods=['GET']) -def getAll() -> Response: +def getAll(service: RetrievalService) -> Response: """Get all course materials based on the course_name """ course_name: List[str] | str = request.args.get('course_name', default='', type=str) @@ -375,9 +134,7 @@ def getAll() -> Response: 400, description=f"Missing the one required parameter: 'course_name' must be provided. Course name: `{course_name}`") - ingester = Ingest() - distinct_dicts = ingester.getAll(course_name) - del ingester + distinct_dicts = service.getAll(course_name) response = jsonify({"distinct_files": distinct_dicts}) response.headers.add('Access-Control-Allow-Origin', '*') @@ -385,7 +142,7 @@ def getAll() -> Response: @app.route('/delete', methods=['DELETE']) -def delete(): +def delete(service: RetrievalService, flaskExecutor: ExecutorInterface): """ Delete a single file from all our database: S3, Qdrant, and Supabase (for now). Note, of course, we still have parts of that file in our logs. @@ -403,149 +160,18 @@ def delete(): ) start_time = time.monotonic() - ingester = Ingest() # background execution of tasks!! - executor.submit(ingester.delete_data, course_name, s3_path, source_url) + flaskExecutor.submit(service.delete_data, course_name, s3_path, source_url) print(f"From {course_name}, deleted file: {s3_path}") print(f"⏰ Runtime of FULL delete func: {(time.monotonic() - start_time):.2f} seconds") - del ingester - # we need instant return. 
Delets are "best effort" assume always successful... sigh :( response = jsonify({"outcome": 'success'}) response.headers.add('Access-Control-Allow-Origin', '*') return response -@app.route('/web-scrape', methods=['GET']) -def scrape() -> Response: - url: str = request.args.get('url', default='', type=str) - course_name: str = request.args.get('course_name', default='', type=str) - max_urls: int = request.args.get('max_urls', default=100, type=int) - max_depth: int = request.args.get('max_depth', default=2, type=int) - timeout: int = request.args.get('timeout', default=3, type=int) - # stay_on_baseurl = request.args.get('stay_on_baseurl', default='', type=str) - stay_on_baseurl: bool = request.args.get('stay_on_baseurl', default=True, type=lambda x: x.lower() == 'true') - depth_or_breadth: str = request.args.get('depth_or_breadth', default='breadth', type=str) - - if url == '' or max_urls == -1 or max_depth == -1 or timeout == -1 or course_name == '' or stay_on_baseurl is None: - # proper web error "400 Bad request" - abort( - 400, - description= - f"Missing one or more required parameters: 'url', 'max_urls', 'max_depth', 'timeout', 'course_name', and 'stay_on_baseurl' must be provided. 
url: `{url}`, max_urls: `{max_urls}`, max_depth: `{max_depth}`, timeout: `{timeout}`, course_name: `{course_name}`, stay_on_baseurl: `{stay_on_baseurl}`" - ) - - # print all input params - print(f"Web scrape: {url}") - print(f"Max Urls: {max_urls}") - print(f"Max Depth: {max_depth}") - print(f"Stay on BaseURL: {stay_on_baseurl}") - print(f"Timeout in Seconds ⏰: {timeout}") - - posthog = Posthog(sync_mode=True, project_api_key=os.environ['POSTHOG_API_KEY'], host='https://app.posthog.com') - posthog.capture('distinct_id_of_the_user', - event='web_scrape_invoked', - properties={ - 'url': url, - 'max_urls': max_urls, - 'max_depth': max_depth, - 'stay_on_baseurl': stay_on_baseurl, - 'timeout': timeout, - 'course_name': course_name, - 'depth_or_breadth': depth_or_breadth - }) - - scraper = WebScrape() - success_fail_dict = scraper.main_crawler(url, course_name, max_urls, max_depth, timeout, stay_on_baseurl, - depth_or_breadth) - del scraper - posthog.shutdown() - gc.collect() # manually invoke garbage collection, try to reduce memory on Railway $$$ - - response = jsonify(success_fail_dict) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/mit-download', methods=['GET']) -def mit_download_course() -> Response: - """ Web scraper built for - """ - url: str = request.args.get('url', default='', type=str) - course_name: str = request.args.get('course_name', default='', type=str) - local_dir: str = request.args.get('local_dir', default='', type=str) - - if url == '' or course_name == '' or local_dir == '': - # proper web error "400 Bad request" - abort( - 400, - description= - f"Missing one or more required parameters: 'url', 'course_name', and 'local_dir' must be provided. 
url: `{url}`, course_name: `{course_name}`, local_dir: `{local_dir}`" - ) - - success_fail = mit_course_download(url, course_name, local_dir) - - response = jsonify(success_fail) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/addCanvasUsers', methods=['GET']) -def add_canvas_users(): - """ - Add users from canvas to the course - """ - print("In /addCanvasUsers") - - canvas = CanvasAPI() - canvas_course_id: str = request.args.get('course_id') - course_name: str = request.args.get('course_name') - - success_or_failure = canvas.add_users(canvas_course_id, course_name) - - response = jsonify({"outcome": success_or_failure}) - - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/ingestCanvas', methods=['GET']) -def ingest_canvas(): - """ - Ingest course content from Canvas - """ - print("made it to ingest") - canvas = CanvasAPI() - canvas_course_id: str = request.args.get('course_id') - course_name: str = request.args.get('course_name') - - # Retrieve the checkbox values from the request and create the content_ingest_dict - # Set default values to True if not provided in the request - content_ingest_dict = { - 'files': request.args.get('files', 'true').lower() == 'true', - 'pages': request.args.get('pages', 'true').lower() == 'true', - 'modules': request.args.get('modules', 'true').lower() == 'true', - 'syllabus': request.args.get('syllabus', 'true').lower() == 'true', - 'assignments': request.args.get('assignments', 'true').lower() == 'true', - 'discussions': request.args.get('discussions', 'true').lower() == 'true' - } - - if canvas_course_id == '' or course_name == '': - # proper web error "400 Bad request" - abort( - 400, - description= - f"Missing one or more required parameters: 'course_id' and 'course_name' must be provided. 
course_id: `{canvas_course_id}`, course_name: `{course_name}`" - ) - - success_or_failure = canvas.ingest_course_content(canvas_course_id, course_name, content_ingest_dict) - response = jsonify({"outcome": success_or_failure}) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - @app.route('/getNomicMap', methods=['GET']) -def nomic_map(): +def nomic_map(service: NomicService): course_name: str = request.args.get('course_name', default='', type=str) map_type: str = request.args.get('map_type', default='conversation', type=str) @@ -553,7 +179,7 @@ def nomic_map(): # proper web error "400 Bad request" abort(400, description=f"Missing required parameter: 'course_name' must be provided. Course name: `{course_name}`") - map_id = get_nomic_map(course_name, map_type) + map_id = service.get_nomic_map(course_name, map_type) print("nomic map\n", map_id) response = jsonify(map_id) @@ -562,14 +188,14 @@ def nomic_map(): @app.route('/createDocumentMap', methods=['GET']) -def createDocumentMap(): +def createDocumentMap(service: NomicService): course_name: str = request.args.get('course_name', default='', type=str) if course_name == '': # proper web error "400 Bad request" abort(400, description=f"Missing required parameter: 'course_name' must be provided. Course name: `{course_name}`") - map_id = create_document_map(course_name) + map_id = service.create_document_map(course_name) response = jsonify(map_id) response.headers.add('Access-Control-Allow-Origin', '*') @@ -577,7 +203,7 @@ def createDocumentMap(): @app.route('/onResponseCompletion', methods=['POST']) -def logToNomic(): +def logToNomic(service: NomicService, flaskExecutor: ExecutorInterface): data = request.get_json() course_name = data['course_name'] conversation = data['conversation'] @@ -592,14 +218,14 @@ def logToNomic(): print(f"In /onResponseCompletion for course: {course_name}") # background execution of tasks!! 
- response = executor.submit(log_convo_to_nomic, course_name, data) + response = flaskExecutor.submit(service.log_convo_to_nomic, course_name, data) response = jsonify({'outcome': 'success'}) response.headers.add('Access-Control-Allow-Origin', '*') return response @app.route('/export-convo-history-csv', methods=['GET']) -def export_convo_history(): +def export_convo_history(service: ExportService): course_name: str = request.args.get('course_name', default='', type=str) from_date: str = request.args.get('from_date', default='', type=str) to_date: str = request.args.get('to_date', default='', type=str) @@ -608,7 +234,7 @@ def export_convo_history(): # proper web error "400 Bad request" abort(400, description=f"Missing required parameter: 'course_name' must be provided. Course name: `{course_name}`") - export_status = export_convo_history_json(course_name, from_date, to_date) + export_status = service.export_convo_history_json(course_name, from_date, to_date) print("EXPORT FILE LINKS: ", export_status) if export_status['response'] == "No data found between the given dates.": @@ -630,7 +256,7 @@ def export_convo_history(): @app.route('/exportDocuments', methods=['GET']) -def exportDocuments(): +def exportDocuments(service: ExportService): course_name: str = request.args.get('course_name', default='', type=str) from_date: str = request.args.get('from_date', default='', type=str) to_date: str = request.args.get('to_date', default='', type=str) @@ -639,7 +265,7 @@ def exportDocuments(): # proper web error "400 Bad request" abort(400, description=f"Missing required parameter: 'course_name' must be provided. 
Course name: `{course_name}`") - export_status = export_documents_json(course_name, from_date, to_date) + export_status = service.export_documents_json(course_name, from_date, to_date) print("EXPORT FILE LINKS: ", export_status) if export_status['response'] == "No data found between the given dates.": @@ -661,7 +287,7 @@ def exportDocuments(): @app.route('/getTopContextsWithMQR', methods=['GET']) -def getTopContextsWithMQR() -> Response: +def getTopContextsWithMQR(service: RetrievalService, posthog_service: PosthogService) -> Response: """ Get relevant contexts for a given search query, using Multi-query retrieval + filtering method. """ @@ -676,102 +302,35 @@ def getTopContextsWithMQR() -> Response: f"Missing one or more required parameters: 'search_query' and 'course_name' must be provided. Search query: `{search_query}`, Course name: `{course_name}`" ) - posthog = Posthog(sync_mode=True, project_api_key=os.environ['POSTHOG_API_KEY'], host='https://app.posthog.com') - posthog.capture('distinct_id_of_the_user', - event='filter_top_contexts_invoked', - properties={ - 'user_query': search_query, - 'course_name': course_name, - 'token_limit': token_limit, - }) + posthog_service.capture(event_name='filter_top_contexts_invoked', + properties={ + 'user_query': search_query, + 'course_name': course_name, + 'token_limit': token_limit, + }) - ingester = Ingest() - found_documents = ingester.getTopContextsWithMQR(search_query, course_name, token_limit) - del ingester - posthog.shutdown() + found_documents = service.getTopContextsWithMQR(search_query, course_name, token_limit) response = jsonify(found_documents) response.headers.add('Access-Control-Allow-Origin', '*') return response -@app.route('/resource-report', methods=['GET']) -def resource_report() -> Response: - """ - Print server resources. 
- # https://manpages.debian.org/bookworm/manpages-dev/getrlimit.2.en.html - """ - import resource - from resource import getrusage, RUSAGE_SELF, RUSAGE_CHILDREN - import subprocess - - print("πŸ‘‡πŸ‘‡πŸ‘‡πŸ‘‡πŸ‘‡πŸ‘‡πŸ‘‡πŸ‘‡πŸ‘‡ πŸ‘‡πŸ‘‡πŸ‘‡πŸ‘‡πŸ‘‡πŸ‘‡πŸ‘‡πŸ‘‡πŸ‘‡") - - print("NUM ACTIVE THREADS (top of /resource-report):", threading.active_count()) - try: - # result = subprocess.run(['ps', '-u', '$(whoami)', '|', 'wc', '-l'], stdout=subprocess.PIPE) - result = subprocess.run('ps -u $(whoami) | wc -l', shell=True, stdout=subprocess.PIPE) - print("Current active threads: ", result.stdout.decode('utf-8')) - except Exception as e: - print("Error executing ulimit -a: ", e) - - try: - with open('/etc/security/limits.conf', 'r') as file: - print("/etc/security/limits.conf:\n", file.read()) - except Exception as e: - print("Error reading /etc/security/limits.conf: ", e) - - try: - with open('/proc/sys/kernel/threads-max', 'r') as file: - print("/proc/sys/kernel/threads-max: ", file.read()) - except Exception as e: - print("Error reading /proc/sys/kernel/threads-max: ", e) - - # Check container or virtualization platform limits if applicable - # This is highly dependent on the specific platform and setup - # Here is an example for Docker, adjust as needed for your environment - try: - result = subprocess.run('docker stats --no-stream', shell=True, stdout=subprocess.PIPE) - print("Docker stats:\n", result.stdout.decode('utf-8')) - except Exception as e: - print("Error getting Docker stats: ", e) - - print("RLIMIT_NPROC: ", resource.getrlimit(resource.RLIMIT_NPROC)) - print("RLIMIT_AS (GB): ", [limit / (1024 * 1024 * 1024) for limit in resource.getrlimit(resource.RLIMIT_AS)]) - print("RLIMIT_DATA (GB): ", [limit / (1024 * 1024 * 1024) for limit in resource.getrlimit(resource.RLIMIT_DATA)]) - print("RLIMIT_MEMLOCK (GB): ", - [limit / (1024 * 1024 * 1024) for limit in resource.getrlimit(resource.RLIMIT_MEMLOCK) - ]) # The maximum address space which may be locked in memory. 
- print("RLIMIT_STACK (MB): ", [limit / (1024 * 1024) for limit in resource.getrlimit(resource.RLIMIT_STACK)]) - print("getpagesize (MB): ", resource.getpagesize() / (1024 * 1024)) - - print("RUSAGE_SELF", getrusage(RUSAGE_SELF), end="\n") - print("RUSAGE_CHILDREN", getrusage(RUSAGE_CHILDREN), end="\n") - - try: - result = subprocess.run('ulimit -u', shell=True, stdout=subprocess.PIPE) - print("ulimit -u: ", result.stdout.decode('utf-8')) - except Exception as e: - print("Error executing ulimit -u: ", e) - - try: - result = subprocess.run('ulimit -a', shell=True, stdout=subprocess.PIPE) - print(f"ulimit -a:\n{result.stdout.decode('utf-8')}") - except Exception as e: - print("Error executing ulimit -a: ", e) - - try: - print("RUSAGE_THREAD: ", resource.getrlimit(resource.RUSAGE_THREAD)) - except Exception as e: - pass - # print("Error in RUSAGE_THREAD: ", e) - - print("πŸ‘†πŸ‘†πŸ‘†πŸ‘†πŸ‘†πŸ‘†πŸ‘†πŸ‘†πŸ‘† πŸ‘†πŸ‘†πŸ‘†πŸ‘†πŸ‘†πŸ‘†πŸ‘†πŸ‘†πŸ‘†") - - response = jsonify({"outcome": "success"}) - response.headers.add('Access-Control-Allow-Origin', '*') - return response +def configure(binder: Binder) -> None: + binder.bind(RetrievalService, to=RetrievalService, scope=RequestScope) + binder.bind(PosthogService, to=PosthogService, scope=SingletonScope) + binder.bind(SentryService, to=SentryService, scope=SingletonScope) + binder.bind(NomicService, to=NomicService, scope=SingletonScope) + binder.bind(ExportService, to=ExportService, scope=SingletonScope) + binder.bind(VectorDatabase, to=VectorDatabase, scope=SingletonScope) + binder.bind(SQLDatabase, to=SQLDatabase, scope=SingletonScope) + binder.bind(AWSStorage, to=AWSStorage, scope=SingletonScope) + binder.bind(ExecutorInterface, to=FlaskExecutorAdapter(executor), scope=SingletonScope) + binder.bind(ThreadPoolExecutorInterface, to=ThreadPoolExecutorAdapter, scope=SingletonScope) + binder.bind(ProcessPoolExecutorInterface, to=ProcessPoolExecutorAdapter, scope=SingletonScope) + +FlaskInjector(app=app, modules=[configure]) 
if __name__ == '__main__': app.run(debug=True, port=int(os.getenv("PORT", default=8000))) # nosec -- reasonable bandit error suppression diff --git a/ai_ta_backend/public_api/uiuc_chat_api.py b/ai_ta_backend/public_api/uiuc_chat_api.py index 33029990..ee21d666 100644 --- a/ai_ta_backend/public_api/uiuc_chat_api.py +++ b/ai_ta_backend/public_api/uiuc_chat_api.py @@ -1,5 +1,6 @@ -import requests import json + +import requests """ # Example usage diff --git a/ai_ta_backend/service/__init__.py b/ai_ta_backend/service/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ai_ta_backend/service/export_service.py b/ai_ta_backend/service/export_service.py new file mode 100644 index 00000000..6eb889c2 --- /dev/null +++ b/ai_ta_backend/service/export_service.py @@ -0,0 +1,267 @@ +import json +import os +import uuid +import zipfile +from concurrent.futures import ProcessPoolExecutor + +import pandas as pd +import requests +from injector import inject + +from ai_ta_backend.database.aws import AWSStorage +from ai_ta_backend.database.sql import SQLDatabase +from ai_ta_backend.service.sentry_service import SentryService +from ai_ta_backend.utils.emails import send_email + + +class ExportService: + + @inject + def __init__(self, sql: SQLDatabase, s3: AWSStorage, sentry: SentryService): + self.sql = sql + self.s3 = s3 + self.sentry = sentry + + def export_documents_json(self, course_name: str, from_date='', to_date=''): + """ + This function exports the documents to a json file. + 1. If the number of documents is greater than 1000, it calls a background task to upload the documents to S3. + 2. If the number of documents is less than 1000, it fetches the documents and zips them. + Args: + course_name (str): The name of the course. + from_date (str, optional): The start date for the data export. Defaults to ''. + to_date (str, optional): The end date for the data export. Defaults to ''. 
+ """ + + response = self.sql.getDocumentsBetweenDates(course_name, from_date, to_date, 'documents') + # add a condition to route to direct download or s3 download + if response.count > 1000: + # call background task to upload to s3 + + filename = course_name + '_' + str(uuid.uuid4()) + '_documents.zip' + s3_filepath = f"courses/{course_name}/{filename}" + # background task of downloading data - map it with above ID + executor = ProcessPoolExecutor() + executor.submit(self.export_data_in_bg, response, "documents", course_name, s3_filepath) + return {"response": 'Download from S3', "s3_path": s3_filepath} + + else: + # Fetch data + if response.count > 0: + # batch download + total_doc_count = response.count + first_id = response.data[0]['id'] + last_id = response.data[-1]['id'] + + print("total_doc_count: ", total_doc_count) + print("first_id: ", first_id) + print("last_id: ", last_id) + + curr_doc_count = 0 + filename = course_name + '_' + str(uuid.uuid4()) + '_documents.json' + file_path = os.path.join(os.getcwd(), filename) + + while curr_doc_count < total_doc_count: + print("Fetching data from id: ", first_id) + + response = self.sql.getDocsForIdsGte(course_name, first_id) + df = pd.DataFrame(response.data) + curr_doc_count += len(response.data) + + # writing to file + if not os.path.isfile(file_path): + df.to_json(file_path, orient='records') + else: + df.to_json(file_path, orient='records', lines=True, mode='a') + + if len(response.data) > 0: + first_id = response.data[-1]['id'] + 1 + + # Download file + try: + # zip file + zip_filename = filename.split('.')[0] + '.zip' + zip_file_path = os.path.join(os.getcwd(), zip_filename) + + with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf: + zipf.write(file_path, filename) + + os.remove(file_path) + return {"response": (zip_file_path, zip_filename, os.getcwd())} + except Exception as e: + print(e) + self.sentry.capture_exception(e) + return {"response": "Error downloading file."} + 
def export_data_in_bg(self, response, download_type, course_name, s3_path):
    """
    Export a large result set to S3 and email a download link to the course admins.

    Submitted to a background executor by export_documents_json() and
    export_convo_history_json() when the row count is too large to return inline.

    Steps:
      1. Page through the table with getAllFromTableForDownloadType(), starting at the
         id of the first row in `response`, and append every page to a local
         JSON Lines file.
      2. Zip the file and upload it to `s3_path` (the key already returned to the caller).
      3. Generate a pre-signed URL for that key and email it to the course admins.

    Args:
        response: The Supabase query response; `.count` and `.data[0]['id']` are read.
        download_type (str): The type of download - 'documents' or 'conversations'.
        course_name (str): The name of the course.
        s3_path (str): The S3 key where the zip file will be uploaded.

    Returns:
        str: A human-readable status message. Errors are reported to Sentry and
        returned as "Error: <message>" rather than raised.
    """
    total_doc_count = response.count
    first_id = response.data[0]['id']
    print("total_doc_count: ", total_doc_count)
    print("pre-defined s3_path: ", s3_path)

    # splitext() strips only the final extension, so a course name containing '.'
    # no longer truncates the file name (which made the uploaded key differ from s3_path).
    base_name = os.path.splitext(s3_path.split('/')[-1])[0]
    filename = base_name + '.json'
    file_path = os.path.join(os.getcwd(), filename)
    zip_filename = base_name + '.zip'
    zip_file_path = os.path.join(os.getcwd(), zip_filename)

    # Single source of truth for the link lifetime: used for the pre-signed URL
    # and for the text of the email, so the two cannot disagree.
    url_expiry_seconds = 3600

    try:
        curr_doc_count = 0
        while curr_doc_count < total_doc_count:
            print("Fetching data from id: ", first_id)
            batch = self.sql.getAllFromTableForDownloadType(course_name, download_type, first_id)
            rows = batch.data
            if not rows:
                # Fewer rows than the initial count (e.g. rows deleted meanwhile):
                # stop instead of re-querying the same id forever.
                break
            curr_doc_count += len(rows)

            # Always write JSON Lines so the first page and the appended pages
            # share one format and the file stays parseable.
            df = pd.DataFrame(rows)
            if not os.path.isfile(file_path):
                df.to_json(file_path, orient='records', lines=True)
            else:
                df.to_json(file_path, orient='records', lines=True, mode='a')

            first_id = rows[-1]['id'] + 1

        with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(file_path, filename)
        print("zip file created: ", zip_file_path)

        # Upload to exactly the key that was promised to the caller and that the
        # pre-signed URL below is generated for.
        bucket_name = os.environ['S3_BUCKET_NAME']
        self.s3.upload_file(zip_file_path, bucket_name, s3_path)
        print("file uploaded to s3: ", s3_path)

        s3_url = self.s3.generatePresignedUrl('get_object', bucket_name, s3_path, url_expiry_seconds)

        # get admin email IDs
        headers = {
            "Authorization": f"Bearer {os.environ['VERCEL_READ_ONLY_API_KEY']}",
            "Content-Type": "application/json"
        }
        hget_url = str(os.environ['VERCEL_BASE_URL']) + "course_metadatas/" + course_name
        # A timeout keeps a stalled metadata request from hanging the worker forever.
        metadata_response = requests.get(hget_url, headers=headers, timeout=30)
        course_metadata = json.loads(metadata_response.json()['result'])
        admin_emails = course_metadata['course_admins']
        bcc_emails = []

        # check for Kastan's email and move to bcc
        if 'kvday2@illinois.edu' in admin_emails:
            admin_emails.remove('kvday2@illinois.edu')
            bcc_emails.append('kvday2@illinois.edu')

        # add course owner email to admin_emails and de-duplicate
        admin_emails.append(course_metadata['course_owner'])
        admin_emails = list(set(admin_emails))
        print("admin_emails: ", admin_emails)
        print("bcc_emails: ", bcc_emails)

        # don't send email if no admin emails
        if not admin_emails:
            return "No admin emails found. Email not sent."

        subject = "UIUC.chat Data Export Complete for " + course_name
        body_text = ("The data export for " + course_name +
                     " is complete.\n\nYou can download the file from the following link: \n\n" + s3_url +
                     f"\n\nThis link will expire in {url_expiry_seconds // 60} minutes.")
        email_status = send_email(subject, body_text, os.environ['EMAIL_SENDER'], admin_emails, bcc_emails)
        print("email_status: ", email_status)

        return "File uploaded to S3. Email sent to admins."

    except Exception as e:
        print(e)
        self.sentry.capture_exception(e)
        return "Error: " + str(e)

    finally:
        # Remove the temporary files on success and on failure alike.
        for leftover in (file_path, zip_file_path):
            if os.path.isfile(leftover):
                os.remove(leftover)
def giveup_hdlr(e):
    """
    Decide whether the backoff decorator should stop retrying.

    The decorated function re-raises failures as ``Exception({"exception": "<message>"})``.
    Exceptions that do not follow that shape (for example one raised before the
    function's own try block) are handled too: their ``str()`` is used as the message
    instead of failing inside this handler.

    Args:
      e: Exception raised by the decorated function
    Returns:
      True if we want to stop retrying, False otherwise
    """
    payload = e.args[0] if e.args else None
    if isinstance(payload, dict) and 'exception' in payload:
        e_str = payload['exception']
    else:
        e_str = str(e)

    print("giveup_hdlr() called with exception:", e_str)
    # Only Nomic "project is locked / indexing" errors are worth retrying.
    return e_str not in LOCK_EXCEPTIONS
def backoff_strategy():
    """
    Build the wait-time generator handed to the backoff decorator.

    This is usually configured directly in the decorator, but passing the
    parameters there was giving errors, so the generator is created here instead.
    """
    wait_generator = backoff.expo(base=10, factor=1.5)
    return wait_generator
+ # will have current QA and historical QA from Nomic, append new data and add_embeddings() + + project_name = NOMIC_MAP_NAME_PREFIX + course_name + start_time = time.monotonic() + emoji = "" + + try: + # fetch project metadata and embbeddings + project = AtlasProject(name=project_name, add_datums_if_exists=True) + + map_metadata_df = project.maps[1].data.df # type: ignore + map_embeddings_df = project.maps[1].embeddings.latent + # create a function which returns project, data and embeddings df here + map_metadata_df['id'] = map_metadata_df['id'].astype(int) + last_id = map_metadata_df['id'].max() + + if conversation_id in map_metadata_df.values: + # store that convo metadata locally + prev_data = map_metadata_df[map_metadata_df['conversation_id'] == conversation_id] + prev_index = prev_data.index.values[0] + embeddings = map_embeddings_df[prev_index - 1].reshape(1, 1536) + prev_convo = prev_data['conversation'].values[0] + prev_id = prev_data['id'].values[0] + created_at = pd.to_datetime(prev_data['created_at'].values[0]).strftime('%Y-%m-%d %H:%M:%S') + + # delete that convo data point from Nomic, and print result + print("Deleting point from nomic:", project.delete_data([str(prev_id)])) + + # prep for new point + first_message = prev_convo.split("\n")[1].split(": ")[1] + + # select the last 2 messages and append new convo to prev convo + messages_to_be_logged = messages[-2:] + for message in messages_to_be_logged: + if message['role'] == 'user': + emoji = "πŸ™‹ " + else: + emoji = "πŸ€– " + + if isinstance(message['content'], list): + text = message['content'][0]['text'] + else: + text = message['content'] + + prev_convo += "\n>>> " + emoji + message['role'] + ": " + text + "\n" + + # modified timestamp + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + # update metadata + metadata = [{ + "course": course_name, + "conversation": prev_convo, + "conversation_id": conversation_id, + "id": last_id + 1, + "user_email": user_email, + 
"first_query": first_message, + "created_at": created_at, + "modified_at": current_time + }] + else: + print("conversation_id does not exist") + + # add new data point + user_queries = [] + conversation_string = "" + + first_message = messages[0]['content'] + if isinstance(first_message, list): + first_message = first_message[0]['text'] + user_queries.append(first_message) + + for message in messages: + if message['role'] == 'user': + emoji = "πŸ™‹ " + else: + emoji = "πŸ€– " + + if isinstance(message['content'], list): + text = message['content'][0]['text'] + else: + text = message['content'] + + conversation_string += "\n>>> " + emoji + message['role'] + ": " + text + "\n" + + # modified timestamp + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + metadata = [{ + "course": course_name, + "conversation": conversation_string, + "conversation_id": conversation_id, + "id": last_id + 1, + "user_email": user_email, + "first_query": first_message, + "created_at": current_time, + "modified_at": current_time + }] + + # create embeddings + embeddings_model = OpenAIEmbeddings(openai_api_type=os.environ['OPENAI_API_TYPE']) + embeddings = embeddings_model.embed_documents(user_queries) + + # add embeddings to the project - create a new function for this + project = atlas.AtlasProject(name=project_name, add_datums_if_exists=True) + with project.wait_for_project_lock(): + project.add_embeddings(embeddings=np.array(embeddings), data=pd.DataFrame(metadata)) + project.rebuild_maps() + + print(f"⏰ Nomic logging runtime: {(time.monotonic() - start_time):.2f} seconds") + return f"Successfully logged for {course_name}" + + except Exception as e: + if str(e) == 'You must specify a unique_id_field when creating a new project.': + print("Attempting to create Nomic map...") + result = self.create_nomic_map(course_name, conversation) + print("result of create_nomic_map():", result) + else: + # raising exception again to trigger backoff and passing parameters to use 
def get_nomic_map(self, course_name: str, type: str):
    """
    Return the values needed to build an iframe of the Nomic map for a course.
    We just need the ID and URL.

    Example values:
      map link: https://atlas.nomic.ai/map/ed222613-97d9-46a9-8755-12bbc8a06e3a/f4967ad7-ff37-4098-ad06-7e1e1a93dd93
      map id: f4967ad7-ff37-4098-ad06-7e1e1a93dd93

    Args:
      course_name: Name of the course; appended to the map-name prefix.
      type: 'document' (case-insensitive) selects the document map; any other
        value selects the conversation map.
    Returns:
      dict with keys "map_id" and "map_link"; both are None when the map cannot
      be retrieved.
    """
    # NOTE: `type` shadows the builtin but is part of the public signature, so it stays.
    if type.lower() == 'document':
        map_name_prefix = 'Document Map for '
    else:
        map_name_prefix = 'Conversation Map for '

    project_name = map_name_prefix + course_name
    start_time = time.monotonic()

    try:
        project = atlas.AtlasProject(name=project_name, add_datums_if_exists=True)
        # Named nomic_map so the builtin `map` is not shadowed.
        nomic_map = project.get_map(project_name)

        print(f"⏰ Nomic Full Map Retrieval: {(time.monotonic() - start_time):.2f} seconds")
        return {"map_id": f"iframe{nomic_map.id}", "map_link": nomic_map.map_link}
    except Exception as e:
        # Error: ValueError: You must specify a unique_id_field when creating a new project.
        if str(e) == 'You must specify a unique_id_field when creating a new project.':
            print(
                "Nomic map does not exist yet, probably because you have less than 20 queries/documents on your project: ",
                e)
        else:
            print("ERROR in get_nomic_map():", e)
            self.sentry.capture_exception(e)
        return {"map_id": None, "map_link": None}
creates map if there are at least 20 queries + """ + nomic.login(os.environ['NOMIC_API_KEY']) # login during start of flask app + NOMIC_MAP_NAME_PREFIX = 'Conversation Map for ' + + print(f"in create_nomic_map() for {course_name}") + + try: + # fetch all conversations with this new course (we expect <=20 conversations, because otherwise the map should be made already) + + response = self.sql.getAllFromLLMConvoMonitor(course_name) + data = response.data + df = pd.DataFrame(data) + + if len(data) < 19: + return None + else: + # get all queries for course and create metadata + user_queries = [] + metadata = [] + i = 1 + conversation_exists = False + + # current log details + log_messages = log_data['conversation']['messages'] # type: ignore + log_user_email = log_data['conversation']['user_email'] # type: ignore + log_conversation_id = log_data['conversation']['id'] # type: ignore + + for _index, row in df.iterrows(): + user_email = row['user_email'] + created_at = pd.to_datetime(row['created_at']).strftime('%Y-%m-%d %H:%M:%S') + convo = row['convo'] + messages = convo['messages'] + + first_message = messages[0]['content'] + if isinstance(first_message, list): + first_message = first_message[0]['text'] + + user_queries.append(first_message) + + # create metadata for multi-turn conversation + conversation = "" + for message in messages: + # string of role: content, role: content, ... 
+ if message['role'] == 'user': # type: ignore + emoji = "πŸ™‹ " + else: + emoji = "πŸ€– " + + if isinstance(message['content'], list): + text = message['content'][0]['text'] + else: + text = message['content'] + + conversation += "\n>>> " + emoji + message['role'] + ": " + text + "\n" + + # append current chat to previous chat if convo already exists + if convo['id'] == log_conversation_id: + conversation_exists = True + + for m in log_messages: + if m['role'] == 'user': # type: ignore + emoji = "πŸ™‹ " + else: + emoji = "πŸ€– " + + if isinstance(m['content'], list): + text = m['content'][0]['text'] + else: + text = m['content'] + conversation += "\n>>> " + emoji + m['role'] + ": " + text + "\n" + + # adding modified timestamp + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + # add to metadata + metadata_row = { + "course": row['course_name'], + "conversation": conversation, + "conversation_id": convo['id'], + "id": i, + "user_email": user_email, + "first_query": first_message, + "created_at": created_at, + "modified_at": current_time + } + metadata.append(metadata_row) + i += 1 + + # add current log as a new data point if convo doesn't exist + if not conversation_exists: + user_queries.append(log_messages[0]['content']) + conversation = "" + for message in log_messages: + if message['role'] == 'user': + emoji = "πŸ™‹ " + else: + emoji = "πŸ€– " + + if isinstance(message['content'], list): + text = message['content'][0]['text'] + else: + text = message['content'] + conversation += "\n>>> " + emoji + message['role'] + ": " + text + "\n" + + # adding timestamp + current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + metadata_row = { + "course": course_name, + "conversation": conversation, + "conversation_id": log_conversation_id, + "id": i, + "user_email": log_user_email, + "first_query": log_messages[0]['content'], + "created_at": current_time, + "modified_at": current_time + } + metadata.append(metadata_row) + + metadata 
def create_document_map(self, course_name: str):
    """
    Create a Nomic document map for a course from scratch.

      1. Returns early if a map is already recorded for the course.
      2. Gets the count of documents for the course; fewer than 20 means no map.
      3. Otherwise fetches documents iteratively in batches of 25.
      4. Every 1000 documents, prepares metadata/embeddings and uploads them
         (the first upload creates the map, later uploads append to it).
      5. Uploads whatever is left, rebuilds the map and records the project id.

    Args:
      course_name: str
    Returns:
      str: "success", "failed", or a message explaining why no map was created
    """
    print("in create_document_map()")
    NOMIC_MAP_NAME_PREFIX = 'Document Map for '
    project_name = NOMIC_MAP_NAME_PREFIX + course_name
    index_name = course_name + "_doc_index"
    topic_label_field = "text"
    colorable_fields = ["readable_filename", "text"]

    def upload(frames, create_new):
        # Concatenate the buffered batches and push them to Nomic.
        final_df = pd.concat(frames, ignore_index=True)
        embeddings, metadata = self.data_prep_for_doc_map(final_df)
        if create_new:
            print("Creating new map...")
            return self.create_map(embeddings, metadata, project_name, index_name, topic_label_field,
                                   colorable_fields)
        print("Appending data to existing map...")
        return self.append_to_map(embeddings, metadata, project_name)

    try:
        # check if map exists
        response = self.sql.getProjectsMapForCourse(course_name)
        if response.data:
            return "Map already exists for this course."

        # fetch relevant document data from Supabase
        response = self.sql.getDocumentsBetweenDates(course_name, '', '', "documents")
        if not response.count:
            return "No documents found for this course."

        total_doc_count = response.count
        print("Total number of documents in Supabase: ", total_doc_count)

        # minimum 20 docs needed to create map
        if total_doc_count <= 19:
            return "Cannot create a map because there are less than 20 documents in the course."

        first_id = response.data[0]['id']
        combined_dfs = []
        curr_total_doc_count = 0
        doc_count = 0
        first_batch = True
        result = None

        # iteratively query in batches of 25
        while curr_total_doc_count < total_doc_count:
            response = self.sql.getDocsForIdsGte(course_name, first_id,
                                                 "id, created_at, s3_path, url, readable_filename, contexts", 25)
            rows = response.data
            if not rows:
                # Fewer rows than counted: stop rather than index into an empty list.
                break

            combined_dfs.append(pd.DataFrame(rows))
            curr_total_doc_count += len(rows)
            doc_count += len(rows)

            if doc_count >= 1000:  # upload to Nomic every 1000 docs
                result = upload(combined_dfs, first_batch)
                first_batch = False
                combined_dfs = []
                doc_count = 0

            # set first_id for next iteration
            first_id = rows[-1]['id'] + 1

        # Upload the last set of docs, if any. When the in-loop upload consumed every
        # fetched row the buffer is empty, and pd.concat([]) would raise.
        if combined_dfs:
            result = upload(combined_dfs, first_batch)
            first_batch = False

        if first_batch:
            # Nothing was fetched, so no map was created and there is nothing to record.
            print("No documents could be fetched for the map.")
            return "failed"
        print("Atlas upload status: ", result)

        # log info to supabase
        project = AtlasProject(name=project_name, add_datums_if_exists=True)
        project_id = project.id
        project.rebuild_maps()
        project_info = {'course_name': course_name, 'doc_map_id': project_id}
        response = self.sql.insertProjectInfo(project_info)
        print("Response from supabase: ", response)
        return "success"
    except Exception as e:
        print(e)
        self.sentry.capture_exception(e)
        return "failed"
+ Currently used within the delete_data() function in vector_database.py + Args: + course_name: str + ids: list of str + """ + print("in delete_from_document_map()") + + try: + # fetch project from Nomic + project = AtlasProject(project_id=project_id, add_datums_if_exists=True) + + # delete the ids from Nomic + print("Deleting point from document map:", project.delete_data(ids)) + with project.wait_for_project_lock(): + project.rebuild_maps() + return "Successfully deleted from Nomic map" + except Exception as e: + print(e) + self.sentry.capture_exception(e) + return "Error in deleting from document map: {e}" + + # If this needs to be uncommented, make sure to move the supabase call to the respective service + # def log_to_document_map(self, data: dict): + # """ + # This is a function which appends new documents to an existing document map. It's called + # at the end of split_and_upload() after inserting data to Supabase. + # Args: + # data: dict - the response data from Supabase insertion + # """ + # print("in add_to_document_map()") + + # try: + # # check if map exists + # course_name = data['course_name'] + # response = SUPABASE_CLIENT.table("projects").select("doc_map_id").eq("course_name", course_name).execute() + # if response.data: + # project_id = response.data[0]['doc_map_id'] + # else: + # # create a map + # map_creation_result = self.create_document_map(course_name) + # if map_creation_result != "success": + # return "The project has less than 20 documents and a map cannot be created." 
+ # else: + # # fetch project id + # response = SUPABASE_CLIENT.table("projects").select("doc_map_id").eq("course_name", course_name).execute() + # project_id = response.data[0]['doc_map_id'] + + # project = AtlasProject(project_id=project_id, add_datums_if_exists=True) + # #print("Inserted data: ", data) + + # embeddings = [] + # metadata = [] + # context_count = 0 + # # prep data for nomic upload + # for row in data['contexts']: + # context_count += 1 + # embeddings.append(row['embedding']) + # metadata.append({ + # "id": str(data['id']) + "_" + str(context_count), + # "doc_ingested_at": data['created_at'], + # "s3_path": data['s3_path'], + # "url": data['url'], + # "readable_filename": data['readable_filename'], + # "created_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + # "text": row['text'] + # }) + # embeddings = np.array(embeddings) + # metadata = pd.DataFrame(metadata) + # print("Shape of embeddings: ", embeddings.shape) + + # # append to existing map + # project_name = "Document Map for " + course_name + # result = self.append_to_map(embeddings, metadata, project_name) + + # # check if project is accepting new datums + # if project.is_accepting_data: + # with project.wait_for_project_lock(): + # project.rebuild_maps() + + # # with project.wait_for_project_lock(): + # # project.rebuild_maps() + # return result + + # except Exception as e: + # print(e) + # self.sentry.capture_exception(e) + # return "Error in appending to map: {e}" + + def create_map(self, embeddings, metadata, map_name, index_name, topic_label_field, colorable_fields): + """ + Generic function to create a Nomic map from given parameters. 
def data_prep_for_doc_map(self, df: pd.DataFrame):
    """
    Prepare embeddings and metadata for Nomic upload during document map creation.

    Each entry of a document's `contexts` becomes its own data point whose id is
    "<document id>_<1-based context index>".

    Args:
      df: pd.DataFrame - documents from Supabase with the columns id, created_at,
        s3_path, url, readable_filename and contexts (each context is a mapping
        with 'text' and 'embedding').
    Returns:
      embeddings: np.array of embeddings (empty when there are no contexts)
      metadata: pd.DataFrame of metadata (empty when there are no contexts)
    """
    print("in data_prep_for_doc_map()")

    metadata = []
    embeddings = []
    texts = []

    for _index, row in df.iterrows():
        current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Missing URLs (None or NaN) are sent as an empty string; a local is used
        # instead of writing into the row copy yielded by iterrows().
        url = "" if pd.isna(row['url']) else row['url']

        # iterate through all contexts and create separate entries for each
        for context_count, context in enumerate(row['contexts'], start=1):
            text_row = context['text']
            embeddings.append(context['embedding'])
            texts.append(text_row)
            metadata.append({
                "id": str(row['id']) + "_" + str(context_count),
                "doc_ingested_at": row['created_at'],
                "s3_path": row['s3_path'],
                "url": url,
                "readable_filename": row['readable_filename'],
                "created_at": current_time,
                "text": text_row
            })

    if not embeddings:
        # Nothing to upload: skip the re-embedding branch below, which would
        # otherwise run because an empty array is 1-dimensional.
        print("Shape of embeddings: ", (0,))
        return np.array([]), pd.DataFrame(metadata)

    embeddings_np = np.array(embeddings, dtype=object)
    print("Shape of embeddings: ", embeddings_np.shape)

    # A well-formed set of embeddings is 2-D (n, dim). Anything else means the stored
    # embeddings are ragged or missing, so they are recomputed from the texts.
    if len(embeddings_np.shape) < 2:
        print("Creating new embeddings...")
        embeddings_model = OpenAIEmbeddings(openai_api_type="openai",
                                            openai_api_base="https://api.openai.com/v1/",
                                            openai_api_key=os.environ['VLADS_OPENAI_KEY'])
        embeddings = embeddings_model.embed_documents(texts)

    metadata = pd.DataFrame(metadata)
    embeddings = np.array(embeddings)

    return embeddings, metadata
class PosthogService:
    """Small wrapper around a synchronous PostHog client."""

    @inject
    def __init__(self):
        # Client settings gathered in one place; the API key comes from the environment.
        client_options = {
            "sync_mode": True,
            "project_api_key": os.environ["POSTHOG_API_KEY"],
            "host": "https://app.posthog.com",
        }
        self.posthog = Posthog(**client_options)

    def capture(self, event_name, properties):
        """Send `event_name` with `properties` to PostHog under a fixed distinct id."""
        distinct_id = "distinct_id_of_the_user"
        self.posthog.capture(distinct_id, event=event_name, properties=properties)
+ """ + + @inject + def __init__(self, vdb: VectorDatabase, sqlDb: SQLDatabase, aws: AWSStorage, posthog: PosthogService, + sentry: SentryService, nomicService: NomicService): + self.vdb = vdb + self.sqlDb = sqlDb + self.aws = aws + self.sentry = sentry + self.posthog = posthog + self.nomicService = nomicService + + openai.api_key = os.environ["OPENAI_API_KEY"] + + self.embeddings = OpenAIEmbeddings( + model='text-embedding-ada-002', + openai_api_base=os.environ["AZURE_OPENAI_ENDPOINT"], + openai_api_type=os.environ['OPENAI_API_TYPE'], + openai_api_key=os.environ["AZURE_OPENAI_KEY"], + openai_api_version=os.environ["OPENAI_API_VERSION"], + ) + + self.llm = AzureChatOpenAI( + temperature=0, + deployment_name=os.environ["AZURE_OPENAI_ENGINE"], + openai_api_base=os.environ["AZURE_OPENAI_ENDPOINT"], + openai_api_key=os.environ["AZURE_OPENAI_KEY"], + openai_api_version=os.environ["OPENAI_API_VERSION"], + openai_api_type=os.environ['OPENAI_API_TYPE'], + ) + + def getTopContexts(self, search_query: str, course_name: str, token_limit: int = 4_000) -> Union[List[Dict], str]: + """Here's a summary of the work. + + /GET arguments + course name (optional) str: A json response with TBD fields. + + Returns + JSON: A json response with TBD fields. See main.py:getTopContexts docs. + or + String: An error message with traceback. + """ + try: + start_time_overall = time.monotonic() + + found_docs: list[Document] = self.vector_search(search_query=search_query, course_name=course_name) + + pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. 
Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" + # count tokens at start and end, then also count each context. + token_counter, _ = count_tokens_and_cost(pre_prompt + "\n\nNow please respond to my query: " + # type: ignore + search_query) + + valid_docs = [] + num_tokens = 0 + for doc in found_docs: + doc_string = f"Document: {doc.metadata['readable_filename']}{', page: ' + str(doc.metadata['pagenumber']) if doc.metadata['pagenumber'] else ''}\n{str(doc.page_content)}\n" + num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore + + print( + f"tokens used/limit: {token_counter}/{token_limit}, tokens in chunk: {num_tokens}, total prompt cost (of these contexts): {prompt_cost}. πŸ“„ File: {doc.metadata['readable_filename']}" + ) + if token_counter + num_tokens <= token_limit: + token_counter += num_tokens + valid_docs.append(doc) + else: + # filled our token size, time to return + break + + print(f"Total tokens used: {token_counter}. Docs used: {len(valid_docs)} of {len(found_docs)} docs retrieved") + print(f"Course: {course_name} ||| search_query: {search_query}") + print(f"⏰ ^^ Runtime of getTopContexts: {(time.monotonic() - start_time_overall):.2f} seconds") + if len(valid_docs) == 0: + return [] + + self.posthog.capture( + event_name="getTopContexts_success_DI", + properties={ + "user_query": search_query, + "course_name": course_name, + "token_limit": token_limit, + "total_tokens_used": token_counter, + "total_contexts_used": len(valid_docs), + "total_unique_docs_retrieved": len(found_docs), + "getTopContext_total_latency_sec": time.monotonic() - start_time_overall, + }, + ) + + return self.format_for_json(valid_docs) + except Exception as e: + # return full traceback to front end + # err: str = f"ERROR: In /getTopContexts. 
Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore + err: str = f"ERROR: In /getTopContexts. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.print_exc} \n{e}" # type: ignore + traceback.print_exc() + print(err) + self.sentry.capture_exception(e) + return err + + def getAll( + self, + course_name: str, + ): + """Get all course materials based on course name. + Args: + course_name (as uploaded on supabase) + Returns: + list of dictionaries with distinct s3 path, readable_filename and course_name, url, base_url. + """ + + response = self.sqlDb.getAllMaterialsForCourse(course_name) + + data = response.data + unique_combinations = set() + distinct_dicts = [] + + for item in data: + combination = (item['s3_path'], item['readable_filename'], item['course_name'], item['url'], item['base_url']) + if combination not in unique_combinations: + unique_combinations.add(combination) + distinct_dicts.append(item) + + return distinct_dicts + + def delete_data(self, course_name: str, s3_path: str, source_url: str): + """Delete file from S3, Qdrant, and Supabase.""" + print(f"Deleting data for course {course_name}") + # add delete from doc map logic here + try: + # Delete file from S3 + bucket_name = os.environ['S3_BUCKET_NAME'] + if bucket_name is None: + raise ValueError("S3_BUCKET_NAME environment variable is not set") + + identifier_key, identifier_value = ("s3_path", s3_path) if s3_path else ("url", source_url) + print(f"Deleting {identifier_value} from S3, Qdrant, and Supabase using {identifier_key}") + + # Delete from S3 + if identifier_key == "s3_path": + self.delete_from_s3(bucket_name, s3_path) + + # Delete from Qdrant + self.delete_from_qdrant(identifier_key, identifier_value) + + # Delete from Nomic and Supabase + self.delete_from_nomic_and_supabase(course_name, identifier_key, identifier_value) + + return "Success" 
+ except Exception as e: + err: str = f"ERROR IN delete_data: Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore + print(err) + self.sentry.capture_exception(e) + return err + + def delete_from_s3(self, bucket_name: str, s3_path: str): + try: + print("Deleting from S3") + response = self.aws.delete_file(bucket_name, s3_path) + print(f"AWS response: {response}") + except Exception as e: + print("Error in deleting file from s3:", e) + self.sentry.capture_exception(e) + + def delete_from_qdrant(self, identifier_key: str, identifier_value: str): + try: + print("Deleting from Qdrant") + response = self.vdb.delete_data(os.environ['QDRANT_COLLECTION_NAME'], identifier_key, identifier_value) + print(f"Qdrant response: {response}") + except Exception as e: + if "timed out" in str(e): + # Timed out is fine. Still deletes. + pass + else: + print("Error in deleting file from Qdrant:", e) + self.sentry.capture_exception(e) + + def getTopContextsWithMQR(self, + search_query: str, + course_name: str, + token_limit: int = 4_000) -> Union[List[Dict], str]: + """ + New info-retrieval pipeline that uses multi-query retrieval + filtering + reciprocal rank fusion + context padding. + 1. Generate multiple queries based on the input search query. + 2. Retrieve relevant docs for each query. + 3. Filter the relevant docs based on the user query and pass them to the rank fusion step. + 4. [CANCELED BEC POINTLESS] Rank the docs based on the relevance score. + 5. Parent-doc-retrieval: Pad just the top 5 docs with expanded context from the original document. + """ + raise NotImplementedError("Method deprecated for performance reasons. Hope to bring back soon.") + + # try: + # top_n_per_query = 40 # HARD CODE TO ENSURE WE HIT THE MAX TOKENS + # start_time_overall = time.monotonic() + # mq_start_time = time.monotonic() + + # # 1. 
GENERATE MULTIPLE QUERIES + # generate_queries = ( + # MULTI_QUERY_PROMPT | self.llm | StrOutputParser() | (lambda x: x.split("\n")) | + # (lambda x: list(filter(None, x))) # filter out non-empty strings + # ) + + # generated_queries = generate_queries.invoke({"original_query": search_query}) + # print("generated_queries", generated_queries) + + # # 2. VECTOR SEARCH FOR EACH QUERY + # batch_found_docs_nested: list[list[Document]] = self.batch_vector_search(search_queries=generated_queries, + # course_name=course_name, + # top_n=top_n_per_query) + + # # 3. RANK REMAINING DOCUMENTS -- good for parent doc padding of top 5 at the end. + # found_docs = self.reciprocal_rank_fusion(batch_found_docs_nested) + # found_docs = [doc for doc, score in found_docs] + # print(f"Num docs after re-ranking: {len(found_docs)}") + # if len(found_docs) == 0: + # return [] + # print(f"⏰ Total multi-query processing runtime: {(time.monotonic() - mq_start_time):.2f} seconds") + + # # 4. FILTER DOCS + # filtered_docs = filter_top_contexts(contexts=found_docs, user_query=search_query, timeout=30, max_concurrency=180) + # if len(filtered_docs) == 0: + # return [] + + # # 5. TOP DOC CONTEXT PADDING // parent document retriever + # final_docs = context_parent_doc_padding(filtered_docs, search_query, course_name) + # print(f"Number of final docs after context padding: {len(final_docs)}") + + # pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. 
\nHere's a few passages of the high quality documents:\n" + # token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + + # search_query) # type: ignore + + # valid_docs = [] + # num_tokens = 0 + # for doc in final_docs: + # doc_string = f"Document: {doc['readable_filename']}{', page: ' + str(doc['pagenumber']) if doc['pagenumber'] else ''}\n{str(doc['text'])}\n" + # num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore + + # print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, max_tokens: {token_limit}") + # if token_counter + num_tokens <= token_limit: + # token_counter += num_tokens + # valid_docs.append(doc) + # else: + # # filled our token size, time to return + # break + + # print(f"Total tokens used: {token_counter} Used {len(valid_docs)} of total unique docs {len(found_docs)}.") + # print(f"Course: {course_name} ||| search_query: {search_query}") + # print(f"⏰ ^^ Runtime of getTopContextsWithMQR: {(time.monotonic() - start_time_overall):.2f} seconds") + + # if len(valid_docs) == 0: + # return [] + + # self.posthog.capture('distinct_id_of_the_user', + # event='filter_top_contexts_succeeded', + # properties={ + # 'user_query': search_query, + # 'course_name': course_name, + # 'token_limit': token_limit, + # 'total_tokens_used': token_counter, + # 'total_contexts_used': len(valid_docs), + # 'total_unique_docs_retrieved': len(found_docs), + # }) + + # return self.format_for_json_mqr(valid_docs) + # except Exception as e: + # # return full traceback to front end + # err: str = f"ERROR: In /getTopContextsWithMQR. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.format_exc()}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore + # print(err) + # sentry_sdk.capture_exception(e) + # return err + + def format_for_json_mqr(self, found_docs) -> List[Dict]: + """ + Same as format_for_json, but for the new MQR pipeline. 
+ """ + for found_doc in found_docs: + if "pagenumber" not in found_doc.keys(): + print("found no pagenumber") + found_doc['pagenumber'] = found_doc['pagenumber_or_timestamp'] + + contexts = [ + { + 'text': doc['text'], + 'readable_filename': doc['readable_filename'], + 'course_name ': doc['course_name'], + 's3_path': doc['s3_path'], + 'pagenumber': doc['pagenumber'], + 'url': doc['url'], # wouldn't this error out? + 'base_url': doc['base_url'], + } for doc in found_docs + ] + + return contexts + + def delete_from_nomic_and_supabase(self, course_name: str, identifier_key: str, identifier_value: str): + try: + print(f"Deleting from Nomic and Supabase for {course_name} using {identifier_key}: {identifier_value}") + response = self.sqlDb.getMaterialsForCourseAndKeyAndValue(course_name, identifier_key, identifier_value) + if not response.data: + raise Exception(f"No materials found for {course_name} using {identifier_key}: {identifier_value}") + data = response.data[0] # single record fetched + nomic_ids_to_delete = [str(data['id']) + "_" + str(i) for i in range(1, len(data['contexts']) + 1)] + + # delete from Nomic + # check if project exists + response = self.sqlDb.getProjectsMapForCourse(course_name) + if not response.data: + raise Exception(f"No document map found for this course: {course_name}") + project_id = response.data[0]['doc_map_id'] + self.nomicService.delete_from_document_map(project_id, nomic_ids_to_delete) + + # delete from Supabase + print(f"Deleting from Supabase for {course_name} using {identifier_key}: {identifier_value}") + response = self.sqlDb.deleteMaterialsForCourseAndKeyAndValue(course_name, identifier_key, identifier_value) + except Exception as e: + print(f"Error in deleting file from Nomic or Supabase using {identifier_key}: {identifier_value}", e) + self.sentry.capture_exception(e) + + def vector_search(self, search_query, course_name): + top_n = 80 + # EMBED + openai_start_time = time.monotonic() + user_query_embedding = 
self.embeddings.embed_query(search_query) + openai_embedding_latency = time.monotonic() - openai_start_time + + # SEARCH + self.posthog.capture( + event_name="vector_search_invoked", + properties={ + "user_query": search_query, + "course_name": course_name, + }, + ) + qdrant_start_time = time.monotonic() + search_results = self.vdb.vector_search(search_query, course_name, user_query_embedding, top_n) + + found_docs: list[Document] = [] + for d in search_results: + try: + metadata = d.payload + page_content = metadata["page_content"] # type: ignore + del metadata["page_content"] # type: ignore + if "pagenumber" not in metadata.keys() and "pagenumber_or_timestamp" in metadata.keys(): # type: ignore + # aiding in the database migration... + metadata["pagenumber"] = metadata["pagenumber_or_timestamp"] # type: ignore + + found_docs.append(Document(page_content=page_content, metadata=metadata)) # type: ignore + except Exception as e: + print(f"Error in vector_search(), for course: `{course_name}`. Error: {e}") + self.sentry.capture_exception(e) + + self.posthog.capture( + event_name="vector_search_succeded", + properties={ + "user_query": search_query, + "course_name": course_name, + "qdrant_latency_sec": time.monotonic() - qdrant_start_time, + "openai_embedding_latency_sec": openai_embedding_latency, + }, + ) + # print("found_docs", found_docs) + return found_docs + + def format_for_json(self, found_docs: List[Document]) -> List[Dict]: + """Formatting only. + {'course_name': course_name, 'contexts': [{'source_name': 'Lumetta_notes', 'source_location': 'pg. 19', 'text': 'In FSM, we do this...'}, {'source_name': 'Lumetta_notes', 'source_location': 'pg. 
20', 'text': 'In Assembly language, the code does that...'},]} + + Args: + found_docs (List[Document]): _description_ + + Raises: + Exception: _description_ + + Returns: + List[Dict]: _description_ + """ + for found_doc in found_docs: + if "pagenumber" not in found_doc.metadata.keys(): + print("found no pagenumber") + found_doc.metadata["pagenumber"] = found_doc.metadata["pagenumber_or_timestamp"] + + contexts = [ + { + "text": doc.page_content, + "readable_filename": doc.metadata["readable_filename"], + "course_name ": doc.metadata["course_name"], + "s3_path": doc.metadata["s3_path"], + "pagenumber": doc.metadata["pagenumber"], # this because vector db schema is older... + # OPTIONAL PARAMS... + "url": doc.metadata.get("url"), # wouldn't this error out? + "base_url": doc.metadata.get("base_url"), + } for doc in found_docs + ] + + return contexts diff --git a/ai_ta_backend/service/sentry_service.py b/ai_ta_backend/service/sentry_service.py new file mode 100644 index 00000000..53b780b0 --- /dev/null +++ b/ai_ta_backend/service/sentry_service.py @@ -0,0 +1,22 @@ +import os + +import sentry_sdk +from injector import inject + + +class SentryService: + + @inject + def __init__(self, dsn: str): + # Sentry.io error logging + sentry_sdk.init( + dsn=os.getenv("SENTRY_DSN"), + # Set traces_sample_rate to 1.0 to capture 100% of transactions for performance monitoring. + traces_sample_rate=1.0, + # Set profiles_sample_rate to 1.0 to profile 100% of sampled transactions. + # We recommend adjusting this value in production. 
+ profiles_sample_rate=1.0, + enable_tracing=True) + + def capture_exception(self, exception: Exception): + sentry_sdk.capture_exception(exception) diff --git a/ai_ta_backend/utils/__init__.py b/ai_ta_backend/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ai_ta_backend/context_parent_doc_padding.py b/ai_ta_backend/utils/context_parent_doc_padding.py similarity index 94% rename from ai_ta_backend/context_parent_doc_padding.py rename to ai_ta_backend/utils/context_parent_doc_padding.py index 5c095b0b..fc0ba19c 100644 --- a/ai_ta_backend/context_parent_doc_padding.py +++ b/ai_ta_backend/utils/context_parent_doc_padding.py @@ -4,11 +4,9 @@ from functools import partial from multiprocessing import Manager -import supabase - -DOCUMENTS_TABLE = os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE'] -SUPABASE_CLIENT = supabase.create_client(supabase_url=os.environ['SUPABASE_URL'], - supabase_key=os.environ['SUPABASE_API_KEY']) # type: ignore +DOCUMENTS_TABLE = os.environ['SUPABASE_DOCUMENTS_TABLE'] +# SUPABASE_CLIENT = supabase.create_client(supabase_url=os.environ['SUPABASE_URL'], +# supabase_key=os.environ['SUPABASE_API_KEY']) # type: ignore def context_parent_doc_padding(found_docs, search_query, course_name): diff --git a/ai_ta_backend/utils/emails.py b/ai_ta_backend/utils/emails.py new file mode 100644 index 00000000..4312a35d --- /dev/null +++ b/ai_ta_backend/utils/emails.py @@ -0,0 +1,38 @@ +import os +import smtplib +from email.mime.multipart import MIMEMultipart +from email.mime.text import MIMEText + + +def send_email(subject: str, body_text: str, sender: str, receipients: list, bcc_receipients: list): + """ + Send an email using the AWS SES service + :param subject: The subject of the email + :param body_text: The body of the email + :param sender: The email address of the sender + :param receipients: A list of email addresses to send the email to + :param bcc_receipients: A list of email addresses to send the email to as BCC + 
:return: A string indicating the result of the email send operation + + """ + # Create message content + message = MIMEMultipart("alternative") + message["Subject"] = subject + message["From"] = sender + message["To"] = ", ".join(receipients) + + if len(bcc_receipients) > 0: + message["Bcc"] = ", ".join(bcc_receipients) + + # Add plain text part + part1 = MIMEText(body_text, "plain") + message.attach(part1) + + # Add additional parts for HTML, attachments, etc. (optional) + + # Connect to SMTP server + with smtplib.SMTP_SSL(os.getenv('SES_HOST'), os.getenv('SES_PORT')) as server: # type: ignore + server.login(os.getenv('USERNAME_SMTP'), os.getenv('PASSWORD_SMTP')) # type: ignore + server.sendmail(sender, receipients + bcc_receipients, message.as_string()) + + return "Email sent successfully!" diff --git a/ai_ta_backend/filtering_contexts.py b/ai_ta_backend/utils/filtering_contexts.py similarity index 100% rename from ai_ta_backend/filtering_contexts.py rename to ai_ta_backend/utils/filtering_contexts.py diff --git a/ai_ta_backend/utils_tokenization.py b/ai_ta_backend/utils/utils_tokenization.py similarity index 96% rename from ai_ta_backend/utils_tokenization.py rename to ai_ta_backend/utils/utils_tokenization.py index 7070ea7f..956cc196 100644 --- a/ai_ta_backend/utils_tokenization.py +++ b/ai_ta_backend/utils/utils_tokenization.py @@ -9,6 +9,7 @@ def count_tokens_and_cost( completion: str = '', openai_model_name: str = "gpt-3.5-turbo"): # -> tuple[int, float] | tuple[int, float, int, float]: """ + # TODO: improve w/ extra tokens used by model: https://github.com/openai/openai-cookbook/blob/d00e9a48a63739f5b038797594c81c8bb494fc09/examples/How_to_count_tokens_with_tiktoken.ipynb Returns the number of tokens in a text string. Only the first parameter is required, a string of text to measure. The completion and model name are optional. 
diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py deleted file mode 100644 index 739d5b16..00000000 --- a/ai_ta_backend/vector_database.py +++ /dev/null @@ -1,1726 +0,0 @@ -import asyncio -import inspect -import logging -import mimetypes -import os -import re -import shutil -import subprocess -import time -import traceback -import uuid -from pathlib import Path -from tempfile import NamedTemporaryFile -from typing import Any, Callable, Dict, List, Union -import sentry_sdk -import boto3 -import fitz -import openai -from posthog import Posthog -import pytesseract -import supabase -from bs4 import BeautifulSoup -from langchain import hub -from langchain.chat_models import AzureChatOpenAI -from git.repo import Repo -from langchain.document_loaders import ( - Docx2txtLoader, - GitLoader, - PythonLoader, - SRTLoader, - TextLoader, - UnstructuredExcelLoader, - UnstructuredPowerPointLoader, -) -from langchain.document_loaders.csv_loader import CSVLoader -from langchain.embeddings.openai import OpenAIEmbeddings -from langchain.schema import Document -from langchain.load import loads, dumps -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain.vectorstores import Qdrant -from PIL import Image -from pydub import AudioSegment -from qdrant_client import QdrantClient, models -from qdrant_client.models import PointStruct - -from ai_ta_backend.aws import upload_data_files_to_s3 -from ai_ta_backend.extreme_context_stuffing import OpenAIAPIProcessor -from ai_ta_backend.utils_tokenization import count_tokens_and_cost -# from ai_ta_backend.context_parent_doc_padding import context_parent_doc_padding -# from ai_ta_backend.filtering_contexts import filter_top_contexts -from ai_ta_backend.nomic_logging import log_to_document_map, delete_from_document_map - -MULTI_QUERY_PROMPT = hub.pull("langchain-ai/rag-fusion-query-generation") -OPENAI_API_TYPE = "azure" # "openai" or "azure" - - -class Ingest(): - """ - Contains all methods 
for building and using vector databases. - """ - - def __init__(self): - """ - Initialize AWS S3, Qdrant, and Supabase. - """ - openai.api_key = os.getenv("OPENAI_API_KEY") - - # vector DB - self.qdrant_client = QdrantClient( - url=os.getenv('QDRANT_URL'), - api_key=os.getenv('QDRANT_API_KEY'), - ) - - self.vectorstore = Qdrant(client=self.qdrant_client, - collection_name=os.environ['QDRANT_COLLECTION_NAME'], - embeddings=OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE)) - - # S3 - self.s3_client = boto3.client( - 's3', - aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), - aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), - ) - - # Create a Supabase client - self.supabase_client = supabase.create_client( # type: ignore - supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY']) - - self.llm = AzureChatOpenAI( - temperature=0, - deployment_name=os.getenv('AZURE_OPENAI_ENGINE'), #type:ignore - openai_api_base=os.getenv('AZURE_OPENAI_ENDPOINT'), #type:ignore - openai_api_key=os.getenv('AZURE_OPENAI_KEY'), #type:ignore - openai_api_version=os.getenv('OPENAI_API_VERSION'), #type:ignore - openai_api_type=OPENAI_API_TYPE) - - self.posthog = Posthog(sync_mode=True, - project_api_key=os.environ['POSTHOG_API_KEY'], - host='https://app.posthog.com') - - return None - - def __del__(self): - # Gracefully shutdown the Posthog client -- this was a main cause of dangling threads. - # Since I changed Posthog to be sync, no need to shutdown. - # try: - # self.posthog.shutdown() - # except Exception as e: - # print("Failed to shutdown PostHog. Probably fine. Error: ", e) - try: - self.qdrant_client.close() - except Exception as e: - print("Failed to shutdown Qdrant. Probably fine. Error: ", e) - try: - del self.supabase_client - except Exception as e: - print("Failed delete supabase_client. Probably fine. Error: ", e) - try: - del self.s3_client - except Exception as e: - print("Failed to delete s3_client. Probably fine. 
Error: ", e) - - def bulk_ingest(self, s3_paths: Union[List[str], str], course_name: str, **kwargs) -> Dict[str, List[str]]: - - def _ingest_single(ingest_method: Callable, s3_path, *args, **kwargs): - """Handle running an arbitrary ingest function for an individual file.""" - # RUN INGEST METHOD - ret = ingest_method(s3_path, *args, **kwargs) - if ret == "Success": - success_status['success_ingest'].append(s3_path) - else: - success_status['failure_ingest'].append(s3_path) - - # πŸ‘‡πŸ‘‡πŸ‘‡πŸ‘‡ ADD NEW INGEST METHODS HERE πŸ‘‡πŸ‘‡πŸ‘‡πŸ‘‡πŸŽ‰ - file_ingest_methods = { - '.html': self._ingest_html, - '.py': self._ingest_single_py, - '.pdf': self._ingest_single_pdf, - '.txt': self._ingest_single_txt, - '.md': self._ingest_single_txt, - '.srt': self._ingest_single_srt, - '.vtt': self._ingest_single_vtt, - '.docx': self._ingest_single_docx, - '.ppt': self._ingest_single_ppt, - '.pptx': self._ingest_single_ppt, - '.xlsx': self._ingest_single_excel, - '.xls': self._ingest_single_excel, - '.csv': self._ingest_single_csv, - '.png': self._ingest_single_image, - '.jpg': self._ingest_single_image, - } - - # Ingest methods via MIME type (more general than filetype) - mimetype_ingest_methods = { - 'video': self._ingest_single_video, - 'audio': self._ingest_single_video, - 'text': self._ingest_single_txt, - 'image': self._ingest_single_image, - } - # πŸ‘†πŸ‘†πŸ‘†πŸ‘† ADD NEW INGEST METHODhe πŸ‘†πŸ‘†πŸ‘†πŸ‘†πŸŽ‰ - - print(f"Top of ingest, Course_name {course_name}. 
S3 paths {s3_paths}") - success_status = {"success_ingest": [], "failure_ingest": []} - try: - if isinstance(s3_paths, str): - s3_paths = [s3_paths] - - for s3_path in s3_paths: - file_extension = Path(s3_path).suffix - with NamedTemporaryFile(suffix=file_extension) as tmpfile: - self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile) - mime_type = str(mimetypes.guess_type(tmpfile.name, strict=False)[0]) - mime_category, mime_subcategory = mime_type.split('/') - - if file_extension in file_ingest_methods: - # Use specialized functions when possible, fallback to mimetype. Else raise error. - ingest_method = file_ingest_methods[file_extension] - _ingest_single(ingest_method, s3_path, course_name, **kwargs) - elif mime_category in mimetype_ingest_methods: - # fallback to MimeType - print("mime category", mime_category) - ingest_method = mimetype_ingest_methods[mime_category] - _ingest_single(ingest_method, s3_path, course_name, **kwargs) - else: - # No supported ingest... Fallback to attempting utf-8 decoding, otherwise fail. - try: - self._ingest_single_txt(s3_path, course_name) - success_status['success_ingest'].append(s3_path) - print("βœ… FALLBACK TO UTF-8 INGEST WAS SUCCESSFUL :) ") - except Exception as e: - print( - f"We don't have a ingest method for this filetype: {file_extension}. As a last-ditch effort, we tried to ingest the file as utf-8 text, but that failed too. File is unsupported: {s3_path}. 
UTF-8 ingest error: {e}" - ) - success_status['failure_ingest'].append( - f"We don't have a ingest method for this filetype: {file_extension} (with generic type {mime_type}), for file: {s3_path}" - ) - - return success_status - except Exception as e: - success_status['failure_ingest'].append(f"MAJOR ERROR IN /bulk_ingest: Error: {str(e)}") - sentry_sdk.capture_exception(e) - return success_status - - def ingest_single_web_text(self, course_name: str, base_url: str, url: str, content: str, title: str): - """Crawlee integration - """ - self.posthog.capture('distinct_id_of_the_user', - event='ingest_single_web_text_invoked', - properties={ - 'course_name': course_name, - 'base_url': base_url, - 'url': url, - 'content': content, - 'title': title - }) - try: - # if not, ingest the text - text = [content] - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': '', - 'readable_filename': title, - 'pagenumber': '', - 'timestamp': '', - 'url': url, - 'base_url': base_url, - }] - self.split_and_upload(texts=text, metadatas=metadatas) - self.posthog.capture('distinct_id_of_the_user', - event='ingest_single_web_text_succeeded', - properties={ - 'course_name': course_name, - 'base_url': base_url, - 'url': url, - 'title': title - }) - - return "Success" - except Exception as e: - err = f"❌❌ Error in (web text ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( - ) # type: ignore - print(err) - sentry_sdk.capture_exception(e) - return err - - def _ingest_single_py(self, s3_path: str, course_name: str, **kwargs): - try: - file_name = s3_path.split("/")[-1] - file_path = "media/" + file_name # download from s3 to local folder for ingest - - self.s3_client.download_file(os.getenv('S3_BUCKET_NAME'), s3_path, file_path) - - loader = PythonLoader(file_path) - documents = loader.load() - - texts = [doc.page_content for doc in documents] - - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': 
s3_path, - 'readable_filename': kwargs.get('readable_filename', - Path(s3_path).name[37:]), - 'pagenumber': '', - 'timestamp': '', - 'url': '', - 'base_url': '', - } for doc in documents] - #print(texts) - os.remove(file_path) - - success_or_failure = self.split_and_upload(texts=texts, metadatas=metadatas) - print("Python ingest: ", success_or_failure) - return success_or_failure - - except Exception as e: - err = f"❌❌ Error in (Python ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( - ) - print(err) - sentry_sdk.capture_exception(e) - return err - - def _ingest_single_vtt(self, s3_path: str, course_name: str, **kwargs): - """ - Ingest a single .vtt file from S3. - """ - try: - with NamedTemporaryFile() as tmpfile: - # download from S3 into vtt_tmpfile - self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile) - loader = TextLoader(tmpfile.name) - documents = loader.load() - texts = [doc.page_content for doc in documents] - - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': s3_path, - 'readable_filename': kwargs.get('readable_filename', - Path(s3_path).name[37:]), - 'pagenumber': '', - 'timestamp': '', - 'url': '', - 'base_url': '', - } for doc in documents] - - success_or_failure = self.split_and_upload(texts=texts, metadatas=metadatas) - return success_or_failure - except Exception as e: - err = f"❌❌ Error in (VTT ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( - ) - print(err) - sentry_sdk.capture_exception(e) - return err - - def _ingest_html(self, s3_path: str, course_name: str, **kwargs) -> str: - print(f"IN _ingest_html s3_path `{s3_path}` kwargs: {kwargs}") - try: - response = self.s3_client.get_object(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path) - raw_html = response['Body'].read().decode('utf-8') - - soup = BeautifulSoup(raw_html, 'html.parser') - title = s3_path.replace("courses/" + 
course_name, "") - title = title.replace(".html", "") - title = title.replace("_", " ") - title = title.replace("/", " ") - title = title.strip() - title = title[37:] # removing the uuid prefix - text = [soup.get_text()] - - metadata: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': s3_path, - 'readable_filename': str(title), # adding str to avoid error: unhashable type 'slice' - 'url': kwargs.get('url', ''), - 'base_url': kwargs.get('base_url', ''), - 'pagenumber': '', - 'timestamp': '', - }] - - success_or_failure = self.split_and_upload(text, metadata) - print(f"_ingest_html: {success_or_failure}") - return success_or_failure - except Exception as e: - err: str = f"ERROR IN _ingest_html: {e}\nTraceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - sentry_sdk.capture_exception(e) - return err - - def _ingest_single_video(self, s3_path: str, course_name: str, **kwargs) -> str: - """ - Ingest a single video file from S3. 
- """ - print("Starting ingest video or audio") - try: - # check for file extension - file_ext = Path(s3_path).suffix - openai.api_key = os.getenv('OPENAI_API_KEY') - transcript_list = [] - with NamedTemporaryFile(suffix=file_ext) as video_tmpfile: - # download from S3 into an video tmpfile - self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=video_tmpfile) - # extract audio from video tmpfile - mp4_version = AudioSegment.from_file(video_tmpfile.name, file_ext[1:]) - - # save the extracted audio as a temporary webm file - with NamedTemporaryFile(suffix=".webm", dir="media", delete=False) as webm_tmpfile: - mp4_version.export(webm_tmpfile, format="webm") - - # check file size - file_size = os.path.getsize(webm_tmpfile.name) - # split the audio into 25MB chunks - if file_size > 26214400: - # load the webm file into audio object - full_audio = AudioSegment.from_file(webm_tmpfile.name, "webm") - file_count = file_size // 26214400 + 1 - split_segment = 35 * 60 * 1000 - start = 0 - count = 0 - - while count < file_count: - with NamedTemporaryFile(suffix=".webm", dir="media", delete=False) as split_tmp: - if count == file_count - 1: - # last segment - audio_chunk = full_audio[start:] - else: - audio_chunk = full_audio[start:split_segment] - - audio_chunk.export(split_tmp.name, format="webm") - - # transcribe the split file and store the text in dictionary - with open(split_tmp.name, "rb") as f: - transcript = openai.Audio.transcribe("whisper-1", f) - transcript_list.append(transcript['text']) # type: ignore - start += split_segment - split_segment += split_segment - count += 1 - os.remove(split_tmp.name) - else: - # transcribe the full audio - with open(webm_tmpfile.name, "rb") as f: - transcript = openai.Audio.transcribe("whisper-1", f) - transcript_list.append(transcript['text']) # type: ignore - - os.remove(webm_tmpfile.name) - - text = [txt for txt in transcript_list] - metadatas: List[Dict[str, Any]] = [{ - 'course_name': 
course_name, - 's3_path': s3_path, - 'readable_filename': kwargs.get('readable_filename', - Path(s3_path).name[37:]), - 'pagenumber': '', - 'timestamp': text.index(txt), - 'url': '', - 'base_url': '', - } for txt in text] - - self.split_and_upload(texts=text, metadatas=metadatas) - return "Success" - except Exception as e: - err = f"❌❌ Error in (VIDEO ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( - ) - print(err) - sentry_sdk.capture_exception(e) - return err - - def _ingest_single_docx(self, s3_path: str, course_name: str, **kwargs) -> str: - try: - with NamedTemporaryFile() as tmpfile: - self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile) - - loader = Docx2txtLoader(tmpfile.name) - documents = loader.load() - - texts = [doc.page_content for doc in documents] - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': s3_path, - 'readable_filename': kwargs.get('readable_filename', - Path(s3_path).name[37:]), - 'pagenumber': '', - 'timestamp': '', - 'url': '', - 'base_url': '', - } for doc in documents] - - self.split_and_upload(texts=texts, metadatas=metadatas) - return "Success" - except Exception as e: - err = f"❌❌ Error in (DOCX ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( - ) - print(err) - sentry_sdk.capture_exception(e) - return err - - def _ingest_single_srt(self, s3_path: str, course_name: str, **kwargs) -> str: - try: - with NamedTemporaryFile() as tmpfile: - # download from S3 into pdf_tmpfile - self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile) - - loader = SRTLoader(tmpfile.name) - documents = loader.load() - - texts = [doc.page_content for doc in documents] - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': s3_path, - 'readable_filename': kwargs.get('readable_filename', - Path(s3_path).name[37:]), - 'pagenumber': 
'', - 'timestamp': '', - 'url': '', - 'base_url': '', - } for doc in documents] - - self.split_and_upload(texts=texts, metadatas=metadatas) - return "Success" - except Exception as e: - err = f"❌❌ Error in (SRT ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( - ) - print(err) - sentry_sdk.capture_exception(e) - return err - - def _ingest_single_excel(self, s3_path: str, course_name: str, **kwargs) -> str: - try: - with NamedTemporaryFile() as tmpfile: - # download from S3 into pdf_tmpfile - self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile) - - loader = UnstructuredExcelLoader(tmpfile.name, mode="elements") - # loader = SRTLoader(tmpfile.name) - documents = loader.load() - - texts = [doc.page_content for doc in documents] - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': s3_path, - 'readable_filename': kwargs.get('readable_filename', - Path(s3_path).name[37:]), - 'pagenumber': '', - 'timestamp': '', - 'url': '', - 'base_url': '', - } for doc in documents] - - self.split_and_upload(texts=texts, metadatas=metadatas) - return "Success" - except Exception as e: - err = f"❌❌ Error in (Excel/xlsx ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( - ) - print(err) - sentry_sdk.capture_exception(e) - return err - - def _ingest_single_image(self, s3_path: str, course_name: str, **kwargs) -> str: - try: - with NamedTemporaryFile() as tmpfile: - # download from S3 into pdf_tmpfile - self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile) - """ - # Unstructured image loader makes the install too large (700MB --> 6GB. 3min -> 12 min build times). AND nobody uses it. - # The "hi_res" strategy will identify the layout of the document using detectron2. "ocr_only" uses pdfminer.six. 
https://unstructured-io.github.io/unstructured/core/partition.html#partition-image - loader = UnstructuredImageLoader(tmpfile.name, unstructured_kwargs={'strategy': "ocr_only"}) - documents = loader.load() - """ - - res_str = pytesseract.image_to_string(Image.open(tmpfile.name)) - print("IMAGE PARSING RESULT:", res_str) - documents = [Document(page_content=res_str)] - - texts = [doc.page_content for doc in documents] - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': s3_path, - 'readable_filename': kwargs.get('readable_filename', - Path(s3_path).name[37:]), - 'pagenumber': '', - 'timestamp': '', - 'url': '', - 'base_url': '', - } for doc in documents] - - self.split_and_upload(texts=texts, metadatas=metadatas) - return "Success" - except Exception as e: - err = f"❌❌ Error in (png/jpg ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( - ) - print(err) - sentry_sdk.capture_exception(e) - return err - - def _ingest_single_csv(self, s3_path: str, course_name: str, **kwargs) -> str: - try: - with NamedTemporaryFile() as tmpfile: - # download from S3 into pdf_tmpfile - self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile) - - loader = CSVLoader(file_path=tmpfile.name) - documents = loader.load() - - texts = [doc.page_content for doc in documents] - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': s3_path, - 'readable_filename': kwargs.get('readable_filename', - Path(s3_path).name[37:]), - 'pagenumber': '', - 'timestamp': '', - 'url': '', - 'base_url': '', - } for doc in documents] - - self.split_and_upload(texts=texts, metadatas=metadatas) - return "Success" - except Exception as e: - err = f"❌❌ Error in (CSV ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( - ) - print(err) - sentry_sdk.capture_exception(e) - return err - - def _ingest_single_pdf(self, s3_path: str, course_name: 
str, **kwargs): - """ - Both OCR the PDF. And grab the first image as a PNG. - LangChain `Documents` have .metadata and .page_content attributes. - Be sure to use TemporaryFile() to avoid memory leaks! - """ - print("IN PDF ingest: s3_path: ", s3_path, "and kwargs:", kwargs) - - try: - with NamedTemporaryFile() as pdf_tmpfile: - # download from S3 into pdf_tmpfile - self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=pdf_tmpfile) - ### READ OCR of PDF - doc = fitz.open(pdf_tmpfile.name) # type: ignore - - # improve quality of the image - zoom_x = 2.0 # horizontal zoom - zoom_y = 2.0 # vertical zoom - mat = fitz.Matrix(zoom_x, zoom_y) # zoom factor 2 in each dimension - - pdf_pages_OCRed: List[Dict] = [] - for i, page in enumerate(doc): # type: ignore - - # UPLOAD FIRST PAGE IMAGE to S3 - if i == 0: - with NamedTemporaryFile(suffix=".png") as first_page_png: - pix = page.get_pixmap(matrix=mat) - pix.save(first_page_png) # store image as a PNG - - s3_upload_path = str(Path(s3_path)).rsplit('.pdf')[0] + "-pg1-thumb.png" - first_page_png.seek(0) # Seek the file pointer back to the beginning - with open(first_page_png.name, 'rb') as f: - print("Uploading image png to S3") - self.s3_client.upload_fileobj(f, os.getenv('S3_BUCKET_NAME'), s3_upload_path) - - # Extract text - text = page.get_text().encode("utf8").decode("utf8", errors='ignore') # get plain text (is in UTF-8) - pdf_pages_OCRed.append(dict(text=text, page_number=i, readable_filename=Path(s3_path).name[37:])) - - metadatas: List[Dict[str, Any]] = [ - { - 'course_name': course_name, - 's3_path': s3_path, - 'pagenumber': page['page_number'] + 1, # +1 for human indexing - 'timestamp': '', - 'readable_filename': kwargs.get('readable_filename', page['readable_filename']), - 'url': kwargs.get('url', ''), - 'base_url': kwargs.get('base_url', ''), - } for page in pdf_pages_OCRed - ] - pdf_texts = [page['text'] for page in pdf_pages_OCRed] - - success_or_failure = 
self.split_and_upload(texts=pdf_texts, metadatas=metadatas) - return success_or_failure - except Exception as e: - err = f"❌❌ Error in (PDF ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( - ) # type: ignore - print(err) - sentry_sdk.capture_exception(e) - return err - return "Success" - - def _ingest_single_txt(self, s3_path: str, course_name: str, **kwargs) -> str: - """Ingest a single .txt or .md file from S3. - Args: - s3_path (str): A path to a .txt file in S3 - course_name (str): The name of the course - Returns: - str: "Success" or an error message - """ - print("In text ingest") - try: - # NOTE: slightly different method for .txt files, no need for download. It's part of the 'body' - response = self.s3_client.get_object(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path) - print("s3 Resonse:", response) - text = response['Body'].read().decode('utf-8') - print("Text from s3:", text) - text = [text] - - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': s3_path, - 'readable_filename': kwargs.get('readable_filename', - Path(s3_path).name[37:]), - 'pagenumber': '', - 'timestamp': '', - 'url': '', - 'base_url': '', - }] - print("Prior to ingest", metadatas) - - success_or_failure = self.split_and_upload(texts=text, metadatas=metadatas) - return success_or_failure - except Exception as e: - err = f"❌❌ Error in (TXT ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( - ) - print(err) - sentry_sdk.capture_exception(e) - return err - - def _ingest_single_ppt(self, s3_path: str, course_name: str, **kwargs) -> str: - """ - Ingest a single .ppt or .pptx file from S3. 
- """ - try: - with NamedTemporaryFile() as tmpfile: - # download from S3 into pdf_tmpfile - #print("in ingest PPTX") - self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile) - - loader = UnstructuredPowerPointLoader(tmpfile.name) - documents = loader.load() - - texts = [doc.page_content for doc in documents] - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': s3_path, - 'readable_filename': kwargs.get('readable_filename', - Path(s3_path).name[37:]), - 'pagenumber': '', - 'timestamp': '', - 'url': '', - 'base_url': '', - } for doc in documents] - - self.split_and_upload(texts=texts, metadatas=metadatas) - return "Success" - except Exception as e: - err = f"❌❌ Error in (PPTX ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n", traceback.format_exc( - ) - print(err) - sentry_sdk.capture_exception(e) - return err - - def list_files_recursively(self, bucket, prefix): - all_files = [] - continuation_token = None - - while True: - list_objects_kwargs = { - 'Bucket': bucket, - 'Prefix': prefix, - } - if continuation_token: - list_objects_kwargs['ContinuationToken'] = continuation_token - - response = self.s3_client.list_objects_v2(**list_objects_kwargs) - - if 'Contents' in response: - for obj in response['Contents']: - all_files.append(obj['Key']) - - if response['IsTruncated']: - continuation_token = response['NextContinuationToken'] - else: - break - - return all_files - - def ingest_coursera(self, coursera_course_name: str, course_name: str) -> str: - """ Download all the files from a coursera course and ingest them. - - 1. Download the coursera content. - 2. Upload to S3 (so users can view it) - 3. Run everything through the ingest_bulk method. - - Args: - coursera_course_name (str): The name of the coursera course. - course_name (str): The name of the course in our system. - - Returns: - _type_: Success or error message. 
- """ - certificate = "-ca 'FVhVoDp5cb-ZaoRr5nNJLYbyjCLz8cGvaXzizqNlQEBsG5wSq7AHScZGAGfC1nI0ehXFvWy1NG8dyuIBF7DLMA.X3cXsDvHcOmSdo3Fyvg27Q.qyGfoo0GOHosTVoSMFy-gc24B-_BIxJtqblTzN5xQWT3hSntTR1DMPgPQKQmfZh_40UaV8oZKKiF15HtZBaLHWLbpEpAgTg3KiTiU1WSdUWueo92tnhz-lcLeLmCQE2y3XpijaN6G4mmgznLGVsVLXb-P3Cibzz0aVeT_lWIJNrCsXrTFh2HzFEhC4FxfTVqS6cRsKVskPpSu8D9EuCQUwJoOJHP_GvcME9-RISBhi46p-Z1IQZAC4qHPDhthIJG4bJqpq8-ZClRL3DFGqOfaiu5y415LJcH--PRRKTBnP7fNWPKhcEK2xoYQLr9RxBVL3pzVPEFyTYtGg6hFIdJcjKOU11AXAnQ-Kw-Gb_wXiHmu63veM6T8N2dEkdqygMre_xMDT5NVaP3xrPbA4eAQjl9yov4tyX4AQWMaCS5OCbGTpMTq2Y4L0Mbz93MHrblM2JL_cBYa59bq7DFK1IgzmOjFhNG266mQlC9juNcEhc'" - always_use_flags = "-u kastanvday@gmail.com -p hSBsLaF5YM469# --ignore-formats mp4 --subtitle-language en --path ./coursera-dl" - - try: - subprocess.run( - f"coursera-dl {always_use_flags} {certificate} {coursera_course_name}", - check=True, - shell=True, # nosec -- reasonable bandit error suppression - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) # capture_output=True, - dl_results_path = os.path.join('coursera-dl', coursera_course_name) - s3_paths: Union[List, None] = upload_data_files_to_s3(course_name, dl_results_path) - - if s3_paths is None: - return "Error: No files found in the coursera-dl directory" - - print("starting bulk ingest") - start_time = time.monotonic() - self.bulk_ingest(s3_paths, course_name) - print("completed bulk ingest") - print(f"⏰ Runtime: {(time.monotonic() - start_time):.2f} seconds") - - # Cleanup the coursera downloads - shutil.rmtree(dl_results_path) - - return "Success" - except Exception as e: - err: str = f"Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - return err - - def ingest_github(self, github_url: str, course_name: str) -> str: - """ - Clones the given GitHub URL and uses Langchain to load data. - 1. Clone the repo - 2. Use Langchain to load the data - 3. 
Pass to split_and_upload() - Args: - github_url (str): The Github Repo URL to be ingested. - course_name (str): The name of the course in our system. - - Returns: - _type_: Success or error message. - """ - try: - repo_path = "media/cloned_repo" - repo = Repo.clone_from(github_url, to_path=repo_path, depth=1, clone_submodules=False) - branch = repo.head.reference - - loader = GitLoader(repo_path="media/cloned_repo", branch=str(branch)) - data = loader.load() - shutil.rmtree("media/cloned_repo") - # create metadata for each file in data - - for doc in data: - texts = doc.page_content - metadatas: Dict[str, Any] = { - 'course_name': course_name, - 's3_path': '', - 'readable_filename': doc.metadata['file_name'], - 'url': f"{github_url}/blob/main/{doc.metadata['file_path']}", - 'pagenumber': '', - 'timestamp': '', - } - self.split_and_upload(texts=[texts], metadatas=[metadatas]) - return "Success" - except Exception as e: - err = f"❌❌ Error in (GITHUB ingest): `{inspect.currentframe().f_code.co_name}`: {e}\nTraceback:\n{traceback.format_exc()}" - print(err) - sentry_sdk.capture_exception(e) - return err - - def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]): - """ This is usually the last step of document ingest. Chunk & upload to Qdrant (and Supabase.. todo). - Takes in Text and Metadata (from Langchain doc loaders) and splits / uploads to Qdrant. 
- - good examples here: https://langchain.readthedocs.io/en/latest/modules/utils/combine_docs_examples/textsplitter.html - - Args: - texts (List[str]): _description_ - metadatas (List[Dict[str, Any]]): _description_ - """ - self.posthog.capture('distinct_id_of_the_user', - event='split_and_upload_invoked', - properties={ - 'course_name': metadatas[0].get('course_name', None), - 's3_path': metadatas[0].get('s3_path', None), - 'readable_filename': metadatas[0].get('readable_filename', None), - 'url': metadatas[0].get('url', None), - 'base_url': metadatas[0].get('base_url', None), - }) - - print("In split and upload") - print(f"metadatas: {metadatas}") - print(f"Texts: {texts}") - assert len(texts) == len( - metadatas - ), f'must have equal number of text strings and metadata dicts. len(texts) is {len(texts)}. len(metadatas) is {len(metadatas)}' - - try: - text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( - chunk_size=1000, - chunk_overlap=150, - separators=[ - "\n\n", "\n", ". ", " ", "" - ] # try to split on paragraphs... 
fallback to sentences, then chars, ensure we always fit in context window - ) - contexts: List[Document] = text_splitter.create_documents(texts=texts, metadatas=metadatas) - input_texts = [{'input': context.page_content, 'model': 'text-embedding-ada-002'} for context in contexts] - - # check for duplicates - is_duplicate = self.check_for_duplicates(input_texts, metadatas) - if is_duplicate: - self.posthog.capture('distinct_id_of_the_user', - event='split_and_upload_succeeded', - properties={ - 'course_name': metadatas[0].get('course_name', None), - 's3_path': metadatas[0].get('s3_path', None), - 'readable_filename': metadatas[0].get('readable_filename', None), - 'url': metadatas[0].get('url', None), - 'base_url': metadatas[0].get('base_url', None), - 'is_duplicate': True, - }) - return "Success" - - # adding chunk index to metadata for parent doc retrieval - for i, context in enumerate(contexts): - context.metadata['chunk_index'] = i - - oai = OpenAIAPIProcessor( - input_prompts_list=input_texts, - request_url='https://api.openai.com/v1/embeddings', - api_key=os.getenv('VLADS_OPENAI_KEY'), - # request_url='https://uiuc-chat-canada-east.openai.azure.com/openai/deployments/text-embedding-ada-002/embeddings?api-version=2023-05-15', - # api_key=os.getenv('AZURE_OPENAI_KEY'), - max_requests_per_minute=5_000, - max_tokens_per_minute=300_000, - max_attempts=20, - logging_level=logging.INFO, - token_encoding_name='cl100k_base') # nosec -- reasonable bandit error suppression - asyncio.run(oai.process_api_requests_from_file()) - # parse results into dict of shape page_content -> embedding - embeddings_dict: dict[str, List[float]] = { - item[0]['input']: item[1]['data'][0]['embedding'] for item in oai.results - } - - ### BULK upload to Qdrant ### - vectors: list[PointStruct] = [] - for context in contexts: - # !DONE: Updated the payload so each key is top level (no more payload.metadata.course_name. Instead, use payload.course_name), great for creating indexes. 
- upload_metadata = {**context.metadata, "page_content": context.page_content} - vectors.append( - PointStruct(id=str(uuid.uuid4()), vector=embeddings_dict[context.page_content], payload=upload_metadata)) - - self.qdrant_client.upsert( - collection_name=os.environ['QDRANT_COLLECTION_NAME'], # type: ignore - points=vectors # type: ignore - ) - ### Supabase SQL ### - contexts_for_supa = [{ - "text": context.page_content, - "pagenumber": context.metadata.get('pagenumber'), - "timestamp": context.metadata.get('timestamp'), - "chunk_index": context.metadata.get('chunk_index'), - "embedding": embeddings_dict[context.page_content] - } for context in contexts] - - document = { - "course_name": contexts[0].metadata.get('course_name'), - "s3_path": contexts[0].metadata.get('s3_path'), - "readable_filename": contexts[0].metadata.get('readable_filename'), - "url": contexts[0].metadata.get('url'), - "base_url": contexts[0].metadata.get('base_url'), - "contexts": contexts_for_supa, - } - - response = self.supabase_client.table( - os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).insert(document).execute() # type: ignore - - # add to Nomic document map - if len(response.data) > 0: - inserted_data = response.data[0] - res = log_to_document_map(inserted_data) - - self.posthog.capture('distinct_id_of_the_user', - event='split_and_upload_succeeded', - properties={ - 'course_name': metadatas[0].get('course_name', None), - 's3_path': metadatas[0].get('s3_path', None), - 'readable_filename': metadatas[0].get('readable_filename', None), - 'url': metadatas[0].get('url', None), - 'base_url': metadatas[0].get('base_url', None), - }) - print("successful END OF split_and_upload") - return "Success" - except Exception as e: - err: str = f"ERROR IN split_and_upload(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - sentry_sdk.capture_exception(e) - return err - - def delete_entire_course(self, 
course_name: str): - """Delete entire course. - - Delete materials from S3, Supabase SQL, Vercel KV, and QDrant vector DB - Args: - course_name (str): _description_ - """ - print(f"Deleting entire course: {course_name}") - try: - # Delete file from S3 - print("Deleting from S3") - objects_to_delete = self.s3_client.list_objects(Bucket=os.getenv('S3_BUCKET_NAME'), - Prefix=f'courses/{course_name}/') - for object in objects_to_delete['Contents']: - self.s3_client.delete_object(Bucket=os.getenv('S3_BUCKET_NAME'), Key=object['Key']) - except Exception as e: - err: str = f"ERROR IN delete_entire_course(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - sentry_sdk.capture_exception(e) - pass - - try: - # Delete from Qdrant - # docs for nested keys: https://qdrant.tech/documentation/concepts/filtering/#nested-key - # Qdrant "points" look like this: Record(id='000295ca-bd28-ac4a-6f8d-c245f7377f90', payload={'metadata': {'course_name': 'zotero-extreme', 'pagenumber_or_timestamp': 15, 'readable_filename': 'Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf', 's3_path': 'courses/zotero-extreme/Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf'}, 'page_content': '18 \nDunlosky et al.\n3.3 Effects in representative educational contexts. Sev-\neral of the large summarization-training studies have been \nconducted in regular classrooms, indicating the feasibility of \ndoing so. For example, the study by A. King (1992) took place \nin the context of a remedial study-skills course for undergrad-\nuates, and the study by Rinehart et al. (1986) took place in \nsixth-grade classrooms, with the instruction led by students \nregular teachers. In these and other cases, students benefited \nfrom the classroom training. 
We suspect it may actually be \nmore feasible to conduct these kinds of training studies in \nclassrooms than in the laboratory, given the nature of the time \ncommitment for students. Even some of the studies that did \nnot involve training were conducted outside the laboratory; for \nexample, in the Bednall and Kehoe (2011) study on learning \nabout logical fallacies from Web modules (see data in Table 3), \nthe modules were actually completed as a homework assign-\nment. Overall, benefits can be observed in classroom settings; \nthe real constraint is whether students have the skill to suc-\ncessfully summarize, not whether summarization occurs in the \nlab or the classroom.\n3.4 Issues for implementation. Summarization would be \nfeasible for undergraduates or other learners who already \nknow how to summarize. For these students, summarization \nwould constitute an easy-to-implement technique that would \nnot take a lot of time to complete or understand. The only \nconcern would be whether these students might be better \nserved by some other strategy, but certainly summarization \nwould be better than the study strategies students typically \nfavor, such as highlighting and rereading (as we discuss in the \nsections on those strategies below). A trickier issue would \nconcern implementing the strategy with students who are not \nskilled summarizers. Relatively intensive training programs \nare required for middle school students or learners with learn-\ning disabilities to benefit from summarization. 
Such efforts \nare not misplaced; training has been shown to benefit perfor-\nmance on a range of measures, although the training proce-\ndures do raise practical issues (e.g., Gajria & Salvia, 1992: \n6.511 hours of training used for sixth through ninth graders \nwith learning disabilities; Malone & Mastropieri, 1991: 2 \ndays of training used for middle school students with learning \ndisabilities; Rinehart et al., 1986: 4550 minutes of instruc-\ntion per day for 5 days used for sixth graders). Of course, \ninstructors may want students to summarize material because \nsummarization itself is a goal, not because they plan to use \nsummarization as a study technique, and that goal may merit \nthe efforts of training.\nHowever, if the goal is to use summarization as a study \ntechnique, our question is whether training students would be \nworth the amount of time it would take, both in terms of the \ntime required on the part of the instructor and in terms of the \ntime taken away from students other activities. For instance, \nin terms of efficacy, summarization tends to fall in the middle \nof the pack when compared to other techniques. In direct \ncomparisons, it was sometimes more useful than rereading \n(Rewey, Dansereau, & Peel, 1991) and was as useful as note-\ntaking (e.g., Bretzing & Kulhavy, 1979) but was less powerful \nthan generating explanations (e.g., Bednall & Kehoe, 2011) or \nself-questioning (A. King, 1992).\n3.5 Summarization: Overall assessment. On the basis of the \navailable evidence, we rate summarization as low utility. It can \nbe an effective learning strategy for learners who are already \nskilled at summarizing; however, many learners (including \nchildren, high school students, and even some undergraduates) \nwill require extensive training, which makes this strategy less \nfeasible. Our enthusiasm is further dampened by mixed find-\nings regarding which tasks summarization actually helps. 
\nAlthough summarization has been examined with a wide \nrange of text materials, many researchers have pointed to fac-\ntors of these texts that seem likely to moderate the effects of \nsummarization (e.g'}, vector=None), - print("deleting from qdrant") - self.qdrant_client.delete( - collection_name=os.environ['QDRANT_COLLECTION_NAME'], - points_selector=models.Filter(must=[ - models.FieldCondition( - key="course_name", - match=models.MatchValue(value=course_name), - ), - ]), - ) - except Exception as e: - err: str = f"ERROR IN delete_entire_course(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - sentry_sdk.capture_exception(e) - pass - - try: - # Delete from Supabase - print("deleting from supabase") - response = self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq( - 'course_name', course_name).execute() - print("supabase response: ", response) - return "Success" - except Exception as e: - err: str = f"ERROR IN delete_entire_course(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - sentry_sdk.capture_exception(e) - # todo: delete from Vercel KV to fully make the coure not exist. 
Last db to delete from (as of now, Aug 15) - - def delete_data(self, course_name: str, s3_path: str, source_url: str): - """Delete file from S3, Qdrant, and Supabase.""" - print(f"Deleting {s3_path} from S3, Qdrant, and Supabase for course {course_name}") - # add delete from doc map logic here - try: - # Delete file from S3 - bucket_name = os.getenv('S3_BUCKET_NAME') - - # Delete files by S3 path - if s3_path: - try: - self.s3_client.delete_object(Bucket=bucket_name, Key=s3_path) - except Exception as e: - print("Error in deleting file from s3:", e) - sentry_sdk.capture_exception(e) - # Delete from Qdrant - # docs for nested keys: https://qdrant.tech/documentation/concepts/filtering/#nested-key - # Qdrant "points" look like this: Record(id='000295ca-bd28-ac4a-6f8d-c245f7377f90', payload={'metadata': {'course_name': 'zotero-extreme', 'pagenumber_or_timestamp': 15, 'readable_filename': 'Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf', 's3_path': 'courses/zotero-extreme/Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf'}, 'page_content': '18 \nDunlosky et al.\n3.3 Effects in representative educational contexts. Sev-\neral of the large summarization-training studies have been \nconducted in regular classrooms, indicating the feasibility of \ndoing so. For example, the study by A. King (1992) took place \nin the context of a remedial study-skills course for undergrad-\nuates, and the study by Rinehart et al. (1986) took place in \nsixth-grade classrooms, with the instruction led by students \nregular teachers. In these and other cases, students benefited \nfrom the classroom training. We suspect it may actually be \nmore feasible to conduct these kinds of training ... 
- try: - self.qdrant_client.delete( - collection_name=os.environ['QDRANT_COLLECTION_NAME'], - points_selector=models.Filter(must=[ - models.FieldCondition( - key="s3_path", - match=models.MatchValue(value=s3_path), - ), - ]), - ) - except Exception as e: - if "timed out" in str(e): - # Timed out is fine. Still deletes. - # https://github.com/qdrant/qdrant/issues/3654#issuecomment-1955074525 - pass - else: - print("Error in deleting file from Qdrant:", e) - sentry_sdk.capture_exception(e) - try: - # delete from Nomic - response = self.supabase_client.from_( - os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select("id, s3_path, contexts").eq( - 's3_path', s3_path).eq('course_name', course_name).execute() - data = response.data[0] #single record fetched - nomic_ids_to_delete = [] - context_count = len(data['contexts']) - for i in range(1, context_count + 1): - nomic_ids_to_delete.append(str(data['id']) + "_" + str(i)) - - # delete from Nomic - res = delete_from_document_map(course_name, nomic_ids_to_delete) - except Exception as e: - print("Error in deleting file from Nomic:", e) - sentry_sdk.capture_exception(e) - - try: - self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq( - 's3_path', s3_path).eq('course_name', course_name).execute() - except Exception as e: - print("Error in deleting file from supabase:", e) - sentry_sdk.capture_exception(e) - - # Delete files by their URL identifier - elif source_url: - try: - # Delete from Qdrant - self.qdrant_client.delete( - collection_name=os.environ['QDRANT_COLLECTION_NAME'], - points_selector=models.Filter(must=[ - models.FieldCondition( - key="url", - match=models.MatchValue(value=source_url), - ), - ]), - ) - except Exception as e: - if "timed out" in str(e): - # Timed out is fine. Still deletes. 
- # https://github.com/qdrant/qdrant/issues/3654#issuecomment-1955074525 - pass - else: - print("Error in deleting file from Qdrant:", e) - sentry_sdk.capture_exception(e) - try: - # delete from Nomic - response = self.supabase_client.from_( - os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select("id, url, contexts").eq( - 'url', source_url).eq('course_name', course_name).execute() - data = response.data[0] #single record fetched - nomic_ids_to_delete = [] - context_count = len(data['contexts']) - for i in range(1, context_count + 1): - nomic_ids_to_delete.append(str(data['id']) + "_" + str(i)) - - # delete from Nomic - res = delete_from_document_map(course_name, nomic_ids_to_delete) - except Exception as e: - print("Error in deleting file from Nomic:", e) - sentry_sdk.capture_exception(e) - - try: - # delete from Supabase - self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq( - 'url', source_url).eq('course_name', course_name).execute() - except Exception as e: - print("Error in deleting file from supabase:", e) - sentry_sdk.capture_exception(e) - - # Delete from Supabase - return "Success" - except Exception as e: - err: str = f"ERROR IN delete_data: Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - sentry_sdk.capture_exception(e) - return err - - def getAll( - self, - course_name: str, - ): - """Get all course materials based on course name. - Args: - course_name (as uploaded on supabase) - Returns: - list of dictionaries with distinct s3 path, readable_filename and course_name, url, base_url. 
- """ - - response = self.supabase_client.table(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select( - 'course_name, s3_path, readable_filename, url, base_url').eq('course_name', course_name).execute() - - data = response.data - unique_combinations = set() - distinct_dicts = [] - - for item in data: - combination = (item['s3_path'], item['readable_filename'], item['course_name'], item['url'], item['base_url']) - if combination not in unique_combinations: - unique_combinations.add(combination) - distinct_dicts.append(item) - - return distinct_dicts - - def vector_search(self, search_query, course_name): - top_n = 80 - # EMBED - openai_start_time = time.monotonic() - o = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE) - user_query_embedding = o.embed_query(search_query) - openai_embedding_latency = time.monotonic() - openai_start_time - - # SEARCH - myfilter = models.Filter(must=[ - models.FieldCondition(key='course_name', match=models.MatchValue(value=course_name)), - ]) - self.posthog.capture('distinct_id_of_the_user', - event='vector_search_invoked', - properties={ - 'user_query': search_query, - 'course_name': course_name, - }) - qdrant_start_time = time.monotonic() - search_results = self.qdrant_client.search( - collection_name=os.environ['QDRANT_COLLECTION_NAME'], - query_filter=myfilter, - with_vectors=False, - query_vector=user_query_embedding, - limit=top_n, # Return n closest points - - # In a system with high disk latency, the re-scoring step may become a bottleneck: https://qdrant.tech/documentation/guides/quantization/ - search_params=models.SearchParams(quantization=models.QuantizationSearchParams(rescore=False))) - - found_docs: list[Document] = [] - for d in search_results: - try: - metadata = d.payload - page_content = metadata['page_content'] - del metadata['page_content'] - if "pagenumber" not in metadata.keys() and "pagenumber_or_timestamp" in metadata.keys(): # type: ignore - # aiding in the database migration... 
- metadata["pagenumber"] = metadata["pagenumber_or_timestamp"] # type: ignore - - found_docs.append(Document(page_content=page_content, metadata=metadata)) # type: ignore - except Exception as e: - print(f"Error in vector_search(), for course: `{course_name}`. Error: {e}") - sentry_sdk.capture_exception(e) - - self.posthog.capture('distinct_id_of_the_user', - event='vector_search_succeded', - properties={ - 'user_query': search_query, - 'course_name': course_name, - 'qdrant_latency_sec': time.monotonic() - qdrant_start_time, - 'openai_embedding_latency_sec': openai_embedding_latency - }) - # print("found_docs", found_docs) - return found_docs - - def getTopContexts(self, search_query: str, course_name: str, token_limit: int = 4_000) -> Union[List[Dict], str]: - """Here's a summary of the work. - - /GET arguments - course name (optional) str: A json response with TBD fields. - - Returns - JSON: A json response with TBD fields. See main.py:getTopContexts docs. - or - String: An error message with traceback. - """ - try: - start_time_overall = time.monotonic() - - found_docs: list[Document] = self.vector_search(search_query=search_query, course_name=course_name) - - pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" - # count tokens at start and end, then also count each context. 
- token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + - search_query) # type: ignore - - valid_docs = [] - num_tokens = 0 - for doc in found_docs: - doc_string = f"Document: {doc.metadata['readable_filename']}{', page: ' + str(doc.metadata['pagenumber']) if doc.metadata['pagenumber'] else ''}\n{str(doc.page_content)}\n" - num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore - - print( - f"tokens used/limit: {token_counter}/{token_limit}, tokens in chunk: {num_tokens}, total prompt cost (of these contexts): {prompt_cost}. πŸ“„ File: {doc.metadata['readable_filename']}" - ) - if token_counter + num_tokens <= token_limit: - token_counter += num_tokens - valid_docs.append(doc) - else: - # filled our token size, time to return - break - - print(f"Total tokens used: {token_counter}. Docs used: {len(valid_docs)} of {len(found_docs)} docs retrieved") - print(f"Course: {course_name} ||| search_query: {search_query}") - print(f"⏰ ^^ Runtime of getTopContexts: {(time.monotonic() - start_time_overall):.2f} seconds") - if len(valid_docs) == 0: - return [] - - self.posthog.capture('distinct_id_of_the_user', - event='success_get_top_contexts_OG', - properties={ - 'user_query': search_query, - 'course_name': course_name, - 'token_limit': token_limit, - 'total_tokens_used': token_counter, - 'total_contexts_used': len(valid_docs), - 'total_unique_docs_retrieved': len(found_docs), - 'getTopContext_total_latency_sec': time.monotonic() - start_time_overall, - }) - - return self.format_for_json(valid_docs) - except Exception as e: - # return full traceback to front end - err: str = f"ERROR: In /getTopContexts. 
Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore - print(err) - sentry_sdk.capture_exception(e) - return err - - def batch_vector_search(self, search_queries: List[str], course_name: str, top_n: int = 50): - """ - Perform a similarity search for all the generated queries at once. - """ - start_time = time.monotonic() - - from qdrant_client.http import models as rest - o = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE) - # Prepare the filter for the course name - myfilter = rest.Filter(must=[ - rest.FieldCondition(key='course_name', match=rest.MatchValue(value=course_name)), - ]) - - # Prepare the search requests - search_requests = [] - for query in search_queries: - user_query_embedding = o.embed_query(query) - search_requests.append( - rest.SearchRequest(vector=user_query_embedding, - filter=myfilter, - limit=top_n, - with_payload=True, - params=models.SearchParams(quantization=models.QuantizationSearchParams(rescore=False)))) - - # Perform the batch search - search_results = self.qdrant_client.search_batch( - collection_name=os.environ['QDRANT_COLLECTION_NAME'], - requests=search_requests, - ) - # process search results - found_docs: list[list[Document]] = [] - for result in search_results: - docs = [] - for doc in result: - try: - metadata = doc.payload - page_content = metadata['page_content'] - del metadata['page_content'] - - if "pagenumber" not in metadata.keys() and "pagenumber_or_timestamp" in metadata.keys(): - metadata["pagenumber"] = metadata["pagenumber_or_timestamp"] - - docs.append(Document(page_content=page_content, metadata=metadata)) - except Exception: - print(traceback.print_exc()) - found_docs.append(docs) - - print(f"⏰ Qdrant Batch Search runtime: {(time.monotonic() - start_time):.2f} seconds") - return found_docs - - def reciprocal_rank_fusion(self, results: list[list], k=60): - """ - Since we have multiple 
queries, and n documents returned per query, we need to go through all the results - and collect the documents with the highest overall score, as scored by qdrant similarity matching. - """ - fused_scores = {} - count = 0 - unique_count = 0 - for docs in results: - # Assumes the docs are returned in sorted order of relevance - count += len(docs) - for rank, doc in enumerate(docs): - doc_str = dumps(doc) - if doc_str not in fused_scores: - fused_scores[doc_str] = 0 - unique_count += 1 - fused_scores[doc_str] += 1 / (rank + k) - # Uncomment for debugging - # previous_score = fused_scores[doc_str] - #print(f"Change score for doc: {doc_str}, previous score: {previous_score}, updated score: {fused_scores[doc_str]} ") - print(f"Total number of documents in rank fusion: {count}") - print(f"Total number of unique documents in rank fusion: {unique_count}") - reranked_results = [ - (loads(doc), score) for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True) - ] - return reranked_results - - def getTopContextsWithMQR(self, - search_query: str, - course_name: str, - token_limit: int = 4_000) -> Union[List[Dict], str]: - """ - New info-retrieval pipeline that uses multi-query retrieval + filtering + reciprocal rank fusion + context padding. - 1. Generate multiple queries based on the input search query. - 2. Retrieve relevant docs for each query. - 3. Filter the relevant docs based on the user query and pass them to the rank fusion step. - 4. [CANCELED BEC POINTLESS] Rank the docs based on the relevance score. - 5. Parent-doc-retrieval: Pad just the top 5 docs with expanded context from the original document. - """ - return 'fail' - - # try: - # top_n_per_query = 40 # HARD CODE TO ENSURE WE HIT THE MAX TOKENS - # start_time_overall = time.monotonic() - # mq_start_time = time.monotonic() - - # # 1. 
GENERATE MULTIPLE QUERIES - # generate_queries = ( - # MULTI_QUERY_PROMPT | self.llm | StrOutputParser() | (lambda x: x.split("\n")) | - # (lambda x: list(filter(None, x))) # filter out non-empty strings - # ) - - # generated_queries = generate_queries.invoke({"original_query": search_query}) - # print("generated_queries", generated_queries) - - # # 2. VECTOR SEARCH FOR EACH QUERY - # batch_found_docs_nested: list[list[Document]] = self.batch_vector_search(search_queries=generated_queries, - # course_name=course_name, - # top_n=top_n_per_query) - - # # 3. RANK REMAINING DOCUMENTS -- good for parent doc padding of top 5 at the end. - # found_docs = self.reciprocal_rank_fusion(batch_found_docs_nested) - # found_docs = [doc for doc, score in found_docs] - # print(f"Num docs after re-ranking: {len(found_docs)}") - # if len(found_docs) == 0: - # return [] - # print(f"⏰ Total multi-query processing runtime: {(time.monotonic() - mq_start_time):.2f} seconds") - - # # 4. FILTER DOCS - # filtered_docs = filter_top_contexts(contexts=found_docs, user_query=search_query, timeout=30, max_concurrency=180) - # if len(filtered_docs) == 0: - # return [] - - # # 5. TOP DOC CONTEXT PADDING // parent document retriever - # final_docs = context_parent_doc_padding(filtered_docs, search_query, course_name) - # print(f"Number of final docs after context padding: {len(final_docs)}") - - # pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. 
\nHere's a few passages of the high quality documents:\n" - # token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + - # search_query) # type: ignore - - # valid_docs = [] - # num_tokens = 0 - # for doc in final_docs: - # doc_string = f"Document: {doc['readable_filename']}{', page: ' + str(doc['pagenumber']) if doc['pagenumber'] else ''}\n{str(doc['text'])}\n" - # num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore - - # print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, max_tokens: {token_limit}") - # if token_counter + num_tokens <= token_limit: - # token_counter += num_tokens - # valid_docs.append(doc) - # else: - # # filled our token size, time to return - # break - - # print(f"Total tokens used: {token_counter} Used {len(valid_docs)} of total unique docs {len(found_docs)}.") - # print(f"Course: {course_name} ||| search_query: {search_query}") - # print(f"⏰ ^^ Runtime of getTopContextsWithMQR: {(time.monotonic() - start_time_overall):.2f} seconds") - - # if len(valid_docs) == 0: - # return [] - - # self.posthog.capture('distinct_id_of_the_user', - # event='filter_top_contexts_succeeded', - # properties={ - # 'user_query': search_query, - # 'course_name': course_name, - # 'token_limit': token_limit, - # 'total_tokens_used': token_counter, - # 'total_contexts_used': len(valid_docs), - # 'total_unique_docs_retrieved': len(found_docs), - # }) - - # return self.format_for_json_mqr(valid_docs) - # except Exception as e: - # # return full traceback to front end - # err: str = f"ERROR: In /getTopContextsWithMQR. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.format_exc()}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore - # print(err) - # sentry_sdk.capture_exception(e) - # return err - - def format_for_json_mqr(self, found_docs) -> List[Dict]: - """ - Same as format_for_json, but for the new MQR pipeline. 
- """ - for found_doc in found_docs: - if "pagenumber" not in found_doc.keys(): - print("found no pagenumber") - found_doc['pagenumber'] = found_doc['pagenumber_or_timestamp'] - - contexts = [ - { - 'text': doc['text'], - 'readable_filename': doc['readable_filename'], - 'course_name ': doc['course_name'], - 's3_path': doc['s3_path'], - 'pagenumber': doc['pagenumber'], - 'url': doc['url'], # wouldn't this error out? - 'base_url': doc['base_url'], - } for doc in found_docs - ] - - return contexts - - def get_context_stuffed_prompt(self, user_question: str, course_name: str, top_n: int, top_k_to_search: int) -> str: - """ - Get a stuffed prompt for a given user question and course name. - Args: - user_question (str) - course_name (str) : used for metadata filtering - Returns : str - a very long "stuffed prompt" with question + summaries of top_n most relevant documents. - """ - # MMR with metadata filtering based on course_name - vec_start_time = time.monotonic() - found_docs = self.vectorstore.max_marginal_relevance_search(user_question, k=top_n, fetch_k=top_k_to_search) - print( - f"⏰ MMR Search runtime (top_n_to_keep: {top_n}, top_k_to_search: {top_k_to_search}): {(time.monotonic() - vec_start_time):.2f} seconds" - ) - - requests = [] - for doc in found_docs: - print("doc", doc) - dictionary = { - "model": "gpt-3.5-turbo", - "messages": [{ - "role": - "system", - "content": - "You are a factual summarizer of partial documents. Stick to the facts (including partial info when necessary to avoid making up potentially incorrect details), and say I don't know when necessary." - }, { - "role": - "user", - "content": - f"Provide a comprehensive summary of the given text, based on this question:\n{doc.page_content}\nQuestion: {user_question}\nThe summary should cover all the key points that are relevant to the question, while also condensing the information into a concise format. 
The length of the summary should be as short as possible, without losing relevant information.\nMake use of direct quotes from the text.\nFeel free to include references, sentence fragments, keywords or anything that could help someone learn about it, only as it relates to the given question.\nIf the text does not provide information to answer the question, please write 'None' and nothing else.", - }], - "n": 1, - "max_tokens": 600, - "metadata": doc.metadata - } - requests.append(dictionary) - - oai = OpenAIAPIProcessor( - input_prompts_list=requests, - request_url='https://api.openai.com/v1/chat/completions', - api_key=os.getenv("OPENAI_API_KEY"), - max_requests_per_minute=1500, - max_tokens_per_minute=90000, - token_encoding_name='cl100k_base', # nosec -- reasonable bandit error suppression - max_attempts=5, - logging_level=20) - - chain_start_time = time.monotonic() - asyncio.run(oai.process_api_requests_from_file()) - results: list[str] = oai.results - print(f"⏰ EXTREME context stuffing runtime: {(time.monotonic() - chain_start_time):.2f} seconds") - - print(f"Cleaned results: {oai.cleaned_results}") - - all_texts = "" - separator = '---' # between each context - token_counter = 0 #keeps track of tokens in each summarization - max_tokens = 7_500 #limit, will keep adding text to string until 8000 tokens reached. 
- for i, text in enumerate(oai.cleaned_results): - if text.lower().startswith('none') or text.lower().endswith('none.') or text.lower().endswith('none'): - # no useful text, it replied with a summary of "None" - continue - if text is not None: - if "pagenumber" not in results[i][-1].keys(): # type: ignore - results[i][-1]['pagenumber'] = results[i][-1].get('pagenumber_or_timestamp') # type: ignore - num_tokens, prompt_cost = count_tokens_and_cost(text) # type: ignore - if token_counter + num_tokens > max_tokens: - print(f"Total tokens yet in loop {i} is {num_tokens}") - break # Stop building the string if it exceeds the maximum number of tokens - token_counter += num_tokens - filename = str(results[i][-1].get('readable_filename', '')) # type: ignore - pagenumber_or_timestamp = str(results[i][-1].get('pagenumber', '')) # type: ignore - pagenumber = f", page: {pagenumber_or_timestamp}" if pagenumber_or_timestamp else '' - doc = f"Document : filename: {filename}" + pagenumber - summary = f"\nSummary: {text}" - all_texts += doc + summary + '\n' + separator + '\n' - - stuffed_prompt = """Please answer the following question. -Use the context below, called 'your documents', only if it's helpful and don't use parts that are very irrelevant. -It's good to quote 'your documents' directly using informal citations, like "in document X it says Y". Try to avoid giving false or misleading information. Feel free to say you don't know. -Try to be helpful, polite, honest, sophisticated, emotionally aware, and humble-but-knowledgeable. -That said, be practical and really do your best, and don't let caution get too much in the way of being useful. -To help answer the question, here's a few passages of high quality documents:\n{all_texts} -Now please respond to my question: {user_question}""" - - # "Please answer the following question. It's good to quote 'your documents' directly, something like 'from ABS source it says XYZ' Feel free to say you don't know. 
\nHere's a few passages of the high quality 'your documents':\n" - - return stuffed_prompt - - def get_stuffed_prompt(self, search_query: str, course_name: str, token_limit: int = 7_000) -> str: - """ - Returns - String: A fully formatted prompt string. - """ - try: - top_n = 90 - start_time_overall = time.monotonic() - o = OpenAIEmbeddings(openai_api_type=OPENAI_API_TYPE) - user_query_embedding = o.embed_documents(search_query)[0] # type: ignore - myfilter = models.Filter(must=[ - models.FieldCondition(key='course_name', match=models.MatchValue(value=course_name)), - ]) - - found_docs = self.qdrant_client.search( - collection_name=os.environ['QDRANT_COLLECTION_NAME'], - query_filter=myfilter, - with_vectors=False, - query_vector=user_query_embedding, - limit=top_n # Return 5 closest points - ) - print("Search results: ", found_docs) - if len(found_docs) == 0: - return search_query - - pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" - - # count tokens at start and end, then also count each context. 
- token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + - search_query) # type: ignore - valid_docs = [] - for d in found_docs: - if d.payload is not None: - if "pagenumber" not in d.payload.keys(): - d.payload["pagenumber"] = d.payload["pagenumber_or_timestamp"] - - doc_string = f"---\nDocument: {d.payload['readable_filename']}{', page: ' + str(d.payload['pagenumber']) if d.payload['pagenumber'] else ''}\n{d.payload.get('page_content')}\n" - num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore - - # print(f"Page: {d.payload.get('page_content', ' '*100)[:100]}...") - print( - f"tokens used/limit: {token_counter}/{token_limit}, tokens in chunk: {num_tokens}, prompt cost of chunk: {prompt_cost}. πŸ“„ File: {d.payload.get('readable_filename', '')}" - ) - if token_counter + num_tokens <= token_limit: - token_counter += num_tokens - valid_docs.append( - Document(page_content=d.payload.get('page_content', ''), metadata=d.payload)) - else: - continue - - # Convert the valid_docs to full prompt - separator = '---\n' # between each context - context_text = separator.join( - f"Document: {d.metadata['readable_filename']}{', page: ' + str(d.metadata['pagenumber']) if d.metadata['pagenumber'] else ''}\n{d.page_content}\n" - for d in valid_docs) - - # Create the stuffedPrompt - stuffedPrompt = (pre_prompt + context_text + '\n\nNow please respond to my query: ' + search_query) - - TOTAL_num_tokens, prompt_cost = count_tokens_and_cost(stuffedPrompt, openai_model_name='gpt-4') # type: ignore - print(f"Total tokens: {TOTAL_num_tokens}, prompt_cost: {prompt_cost}") - print("total docs: ", len(found_docs)) - print("num docs used: ", len(valid_docs)) - - print(f"⏰ ^^ Runtime of getTopContexts: {(time.monotonic() - start_time_overall):.2f} seconds") - return stuffedPrompt - except Exception as e: - # return full traceback to front end - err: str = f"Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in 
{inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - sentry_sdk.capture_exception(e) - return err - - def format_for_json(self, found_docs: List[Document]) -> List[Dict]: - """Formatting only. - {'course_name': course_name, 'contexts': [{'source_name': 'Lumetta_notes', 'source_location': 'pg. 19', 'text': 'In FSM, we do this...'}, {'source_name': 'Lumetta_notes', 'source_location': 'pg. 20', 'text': 'In Assembly language, the code does that...'},]} - - Args: - found_docs (List[Document]): _description_ - - Raises: - Exception: _description_ - - Returns: - List[Dict]: _description_ - """ - for found_doc in found_docs: - if "pagenumber" not in found_doc.metadata.keys(): - print("found no pagenumber") - found_doc.metadata['pagenumber'] = found_doc.metadata['pagenumber_or_timestamp'] - - contexts = [ - { - 'text': doc.page_content, - 'readable_filename': doc.metadata['readable_filename'], - 'course_name ': doc.metadata['course_name'], - 's3_path': doc.metadata['s3_path'], - 'pagenumber': doc.metadata['pagenumber'], # this because vector db schema is older... - # OPTIONAL PARAMS... - 'url': doc.metadata.get('url'), # wouldn't this error out? - 'base_url': doc.metadata.get('base_url'), - } for doc in found_docs - ] - - return contexts - - def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any]]) -> bool: - """ - For given metadata, fetch docs from Supabase based on S3 path or URL. - If docs exists, concatenate the texts and compare with current texts, if same, return True. - """ - doc_table = os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE', '') - course_name = metadatas[0]['course_name'] - incoming_s3_path = metadatas[0]['s3_path'] - url = metadatas[0]['url'] - original_filename = incoming_s3_path.split('/')[-1][37:] # remove the 37-char uuid prefix - - # check if uuid exists in s3_path -- not all s3_paths have uuids! 
- incoming_filename = incoming_s3_path.split('/')[-1] - pattern = re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}', - re.I) # uuid V4 pattern, and v4 only. - if bool(pattern.search(incoming_filename)): - # uuid pattern exists -- remove the uuid and proceed with duplicate checking - original_filename = incoming_filename[37:] - else: - # do not remove anything and proceed with duplicate checking - original_filename = incoming_filename - - if incoming_s3_path: - filename = incoming_s3_path - supabase_contents = self.supabase_client.table(doc_table).select('id', 'contexts', 's3_path').eq( - 'course_name', course_name).like('s3_path', '%' + original_filename + '%').order('id', desc=True).execute() - supabase_contents = supabase_contents.data - elif url: - filename = url - supabase_contents = self.supabase_client.table(doc_table).select('id', 'contexts', 's3_path').eq( - 'course_name', course_name).eq('url', url).order('id', desc=True).execute() - supabase_contents = supabase_contents.data - else: - filename = None - supabase_contents = [] - - supabase_whole_text = "" - if len(supabase_contents) > 0: # if a doc with same filename exists in Supabase - # concatenate texts - supabase_contexts = supabase_contents[0] - for text in supabase_contexts['contexts']: - supabase_whole_text += text['text'] - - current_whole_text = "" - for text in texts: - current_whole_text += text['input'] - - if supabase_whole_text == current_whole_text: # matches the previous file - print(f"Duplicate ingested! πŸ“„ s3_path: {filename}.") - return True - - else: # the file is updated - print(f"Updated file detected! Same filename, new contents. 
πŸ“„ s3_path: {filename}") - - # call the delete function on older docs - for content in supabase_contents: - print("older s3_path to be deleted: ", content['s3_path']) - delete_status = self.delete_data(course_name, content['s3_path'], '') - print("delete_status: ", delete_status) - return False - - else: # filename does not already exist in Supabase, so its a brand new file - print(f"NOT a duplicate! πŸ“„s3_path: {filename}") - return False - - -if __name__ == '__main__': - pass diff --git a/ai_ta_backend/web_scrape.py b/ai_ta_backend/web_scrape.py deleted file mode 100644 index e341657c..00000000 --- a/ai_ta_backend/web_scrape.py +++ /dev/null @@ -1,664 +0,0 @@ -import mimetypes -import os -import re -import shutil -import time -import uuid -from tempfile import NamedTemporaryFile -from typing import List, Optional -from zipfile import ZipFile - -import boto3 # type: ignore -import requests -import supabase -from bs4 import BeautifulSoup - -from ai_ta_backend.aws import upload_data_files_to_s3 -from ai_ta_backend.vector_database import Ingest - - -class WebScrape(): - - def __init__(self) -> None: - - # S3 - self.s3_client = boto3.client( - 's3', - aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), - aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), - ) - - # Create a Supabase client - self.supabase_client = supabase.create_client( # type: ignore - supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY']) - - self.ingester = Ingest() - - self.url_contents = [] - self.invalid_urls = [] - self.existing_urls = [] - self.max_urls = 0 - self.original_amount = 0 - self.supa_urls = 0 - self.queue = {} - - return None - - def get_file_extension(self, filename): - match = re.search(r'\.([a-zA-Z0-9]+)$', filename) - valid_filetypes = list(mimetypes.types_map.keys()) - valid_filetypes = valid_filetypes + ['.html', '.py', '.vtt', '.pdf', '.txt', '.srt', '.docx', '.ppt', '.pptx'] - if match: - filetype = "." 
+ match.group(1) - if filetype in valid_filetypes: - return filetype - else: - return '.html' - else: - return '.html' - - def valid_url(self, url): - """ - Returns the URL and it's content if it's good, otherwise returns false. Prints the status code. - """ - try: - response = requests.get(url, allow_redirects=True, timeout=20) - - redirect_loop_counter = 0 - while response.status_code == 301: - # Check for permanent redirect - if redirect_loop_counter > 3: - print("❌ Redirect loop (on 301 error) exceeded redirect limit of:", redirect_loop_counter, "❌") - return (False, False, False) - redirect_url = response.headers['Location'] - response = requests.head(redirect_url) - redirect_loop_counter += 1 - if response.status_code == 200: - filetype = self.get_file_extension(response.url) - print("file extension:", filetype) - if filetype == '.html': - content = BeautifulSoup(response.content, "html.parser") - if " average: - # print("Too many repeated urls, exiting web scraper") - # return True - # else: - # return False - - def count_hard_stop_len(self): - count = len(self.url_contents) - if self.url_contents != []: - print("πŸ“ˆπŸ“ˆ Counted URLs", count, "out of", self.original_amount, "πŸ“ˆπŸ“ˆ") - if count > self.original_amount: - print("Too many repeated urls, exiting web scraper") - return True - else: - return False - - def check_and_ingest(self, url: str, course_name: str, timeout: int, base_url_on: str): - if url not in self.invalid_urls and url not in self.existing_urls: - second_url, content, filetype = self.valid_url(url) - else: - print("This URL is invalid or already existing in the database") - self.existing_urls.append((url)) - return '', '', '' - - if second_url: - time.sleep(timeout) - url_content = (second_url, content, filetype) - if self.check_file_not_exists(url_content): - path_name = self.title_path_name(url_content) - self.url_contents.append(url_content) - self.existing_urls.append(url_content) - # url_contents = remove_duplicates(url_contents, 
_existing_urls) - self.ingest_file(url_content, course_name, path_name, base_url_on) - print("βœ…βœ… Scraped:", second_url, "βœ…βœ…") - self.max_urls -= 1 - else: - print("This URL is already existing in the database") - self.existing_urls.append((second_url, content, filetype)) - else: - self.invalid_urls.append(url) - print("This URL is invalid") - - return url, content, filetype - - def scrape_user_provided_page(self, url: str, course_name: str, timeout: int, base: str): - urls = [] - url, content, filetype = self.check_and_ingest(url, course_name, timeout, base) - - if url: - if filetype == '.html': - try: - body = content.find("body") - header = content.find("head") - footer = content.find("footer") - nav = content.find("nav") - except Exception as e: - print("Error:", e) - body = "" - header = "" - # Check for 403 Forbidden urls - try: - if content.title.string.lower() == "403 forbidden" or content.title.string.lower( - ) == 'page not found': # type: ignore - print("403 Forbidden") - self.invalid_urls.append(url) - else: - pass - except Exception as e: - print("Error:", e) - pass - if body != "" and header != "": - urls = self.find_urls(body, base, urls) # type: ignore - urls = self.find_urls(header, base, urls) # type: ignore - self.invalid_urls.append(self.find_urls(footer, base)) # type: ignore - self.invalid_urls.append(self.find_urls(nav, base)) # type: ignore - else: - urls = self.find_urls(content, base, urls) # type: ignore - - return urls - - def non_user_provided_page_urls(self, url: str, base: str, soup, filetype: str): - urls = [] - if filetype == '.html': - try: - body = soup.find("body") - header = soup.find("head") - footer = soup.find("footer") - nav = soup.find("nav") - except Exception as e: - print("Error:", e) - body = "" - header = "" - - # Check for 403 Forbidden urls - try: - if soup.title.string.lower() == "403 forbidden" or soup.title.string.lower( - ) == 'page not found': # type: ignore - print("403 Forbidden") - 
self.invalid_urls.append(url) - else: - pass - except Exception as e: - print("Error:", e) - pass - if body != "" and header != "": - urls = self.find_urls(body, base, urls) - urls = self.find_urls(header, base, urls) - self.invalid_urls.append(self.find_urls(footer, base)) # type: ignore - self.invalid_urls.append(self.find_urls(nav, base)) # type: ignore - else: - urls = self.find_urls(soup, base, urls) - - return urls - - # def depth_crawler(self, url:str, course_name:str, max_depth:int=3, timeout:int=1, base_url_on:str=None, _depth:int=0, _soup=None, _filetype:str=None): # type: ignore - # '''Function gets titles of urls and the urls themselves''' - # # Prints the depth of the current search - # print("depth: ", _depth) - # if base_url_on: - # base_url_on = str(base_url_on) - - # # Create a base site for incomplete hrefs - # base = self.base_url(url) - # if base == "": - # raise ValueError("This URL is invalid") - - # if self.count_hard_stop_len(): - # raise ValueError("Too many repeated urls, exiting web scraper") - - # try: - # if _soup: - # urls = self.non_user_provided_page_urls(url, base, _soup, _filetype) - # else: - # urls = self.scrape_user_provided_page(url, course_name, timeout, base) - # except ValueError as e: - # raise e - - # temp_urls = [] - # # We grab content out of these urls - # try: - # for url in urls: - # if self.max_urls > 0: - # if base_url_on: - # if url.startswith(base): - # new_url, content, filetype = self.check_and_ingest(url, course_name, timeout, base_url_on) - # if new_url: - # temp_urls.append((new_url, content, filetype)) - # if self.count_hard_stop_len(): - # raise ValueError("Too many repeated urls, exiting web scraper") - # else: - # new_url, content, filetype = self.check_and_ingest(url, course_name, timeout, base_url_on) - # if new_url: - # temp_urls.append((new_url, content, filetype)) - # if self.count_hard_stop_len(): - # raise ValueError("Too many repeated urls, exiting web scraper") - # else: - # print("Max URLs 
reached") - # raise ValueError("Max URLs reached") - # except ValueError as e: - # print("Error:", e) - - # # recursively go through crawler until we reach the max amount of urls. - # for url in temp_urls: - # if self.max_urls > 0: - # if _depth < max_depth: - # self.depth_crawler(url[0], course_name, max_depth, timeout, base_url_on, _depth+1, url[1], url[2]) - # print(self.max_urls, "urls left") - # if self.count_hard_stop_len(): - # raise ValueError("Too many repeated urls, exiting web scraper") - # else: - # print("Depth exceeded:", _depth+1, "out of", max_depth) - # break - # else: - # print("Max urls reached") - # break - - # return None - - def breadth_crawler(self, - url: str, - course_name: str, - timeout: int = 1, - base_url_on: str = None, - max_depth: int = 3, - base_option: bool = False): # type: ignore - depth = 0 - if base_url_on: - base_url_on = str(base_url_on) - - # Create a base site for incomplete hrefs - base = self.base_url(url) - if base == "": - raise ValueError("This URL is invalid") - - self.queue[depth] = self.scrape_user_provided_page(url, course_name, timeout, base) - self.queue[depth + 1] = [] - print("queue", self.queue) - print("len", len(self.queue[depth]), len(self.queue[depth + 1])) - - while self.count_hard_stop_len() is False: - print("queue", len(self.queue[depth]), len(self.queue[depth + 1])) - - if self.queue[depth] == []: - depth += 1 - print("depth:", depth) - self.queue[depth + 1] = [] - if depth > max_depth: - print("Depth exceeded:", depth, "out of", max_depth) - raise ValueError("Depth exceeded") - - if self.queue[depth] == []: - print("queue is empty") - raise ValueError("Queue is empty") - - url = self.queue[depth].pop(0) - if self.max_urls > 0: - if depth <= max_depth: - if base_url_on: - if self.base_requirements(url, base_url_on): - print("url", url) - print("requirements", self.base_requirements(url, base_url_on)) - new_url, content, filetype = self.check_and_ingest(url, course_name, timeout, base_url_on) - 
self.queue[depth + 1] += self.non_user_provided_page_urls(new_url, base, content, filetype) - if self.count_hard_stop_len(): - raise ValueError("Too many repeated urls, exiting web scraper") - else: - if base_option: - new_url, content, filetype = self.check_and_ingest(url, course_name, timeout, base_url_on) - if self.count_hard_stop_len(): - raise ValueError("Too many repeated urls, exiting web scraper") - else: - new_url, content, filetype = self.check_and_ingest(url, course_name, timeout, base_url_on) - self.queue[depth + 1] += self.non_user_provided_page_urls(new_url, base, content, filetype) - if self.count_hard_stop_len(): - raise ValueError("Too many repeated urls, exiting web scraper") - else: - print("Depth exceeded:", depth + 1, "out of", max_depth) - break - else: - print("Max URLs reached") - raise ValueError("Max URLs reached") - - return None - - def main_crawler(self, - url: str, - course_name: str, - max_urls: int = 100, - max_depth: int = 3, - timeout: int = 1, - stay_on_baseurl: bool = True, - depth_or_breadth: str = 'breadth'): - """ - Crawl a site and scrape its content and PDFs, then upload the data to S3 and ingest it. - - Args: - url (str): The URL of the site to crawl. - course_name (str): The name of the course to associate with the crawled data. - max_urls (int, optional): The maximum number of URLs to crawl. Defaults to 100. - max_depth (int, optional): The maximum depth of URLs to crawl. Defaults to 3. - timeout (int, optional): The number of seconds to wait between requests. Defaults to 1. 
- - Returns: - None - """ - print("\n") - max_urls = int(max_urls) - max_depth = int(max_depth) - timeout = int(timeout) - stay_on_baseurl = bool(stay_on_baseurl) - self.max_urls = max_urls - self.original_amount = max_urls - if stay_on_baseurl: - base_url_str = self.base_url(url) - print(base_url_str) - else: - base_url_str = '' - - # Check for GitHub repository coming soon - if is_github_repo(url): - print("Begin Ingesting GitHub page") - results = self.ingester.ingest_github(url, course_name) - print("Finished ingesting GitHub page") - return results - else: - try: - print("Gathering existing urls from Supabase") - urls = self.supabase_client.table( - os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).select('course_name, url').eq( - 'course_name', course_name).execute() # type: ignore - - if urls.data == []: - self.existing_urls = [] - else: - self.existing_urls = [] - for row in urls.data: - # whole = '' - # for text in row['contexts']: - # whole += text['text'] - self.existing_urls.append((row['url'], 'whole', 'supa')) - print("Finished gathering existing urls from Supabase") - except Exception as e: - print("Error:", e) - print("Could not gather existing urls from Supabase") - self.existing_urls = [] - try: - print("Begin Ingesting Web page") - self.supa_urls = len(self.existing_urls) - if depth_or_breadth.lower() == 'depth': - self.depth_crawler(url=url, - course_name=course_name, - max_depth=max_depth, - timeout=timeout, - base_url_on=base_url_str) - elif depth_or_breadth.lower() == 'breadth': - self.breadth_crawler(url=url, - course_name=course_name, - timeout=timeout, - base_url_on=base_url_str, - max_depth=max_depth) - else: - raise ValueError("Invalid depth_or_breadth argument") - except ValueError as e: - print("Error:", e) - - if len(self.url_contents) < self.original_amount: - print("Max URLS not reached, returning all urls found:", len(self.url_contents), "out of", self.original_amount) - elif len(self.url_contents) == self.original_amount: - 
print("Max URLS reached:", len(self.url_contents), "out of", self.original_amount) - else: - print("Exceeded Max URLS, found:", len(self.url_contents), "out of", self.original_amount) - print(len(self.url_contents), "urls found") - print(f"Successfully uploaded files to s3: {len(self.url_contents)}") - print("Finished /web-scrape") - - -def is_github_repo(url): - # Split the URL by '?' to ignore any parameters - base_url = url.split('?')[0] - - # The regular expression now allows for optional 'http', 'https', and 'www' prefixes. - # It also accounts for optional trailing slashes. - # The pattern is also case-insensitive. - pattern = re.compile(r'^(https?://)?(www\.)?github\.com/[^/?]+/[^/?]+/?$', re.IGNORECASE) - - # The function returns True or False based on whether the pattern matches the base_url - return base_url if pattern.match(base_url) else None - - -def mit_course_download(url: str, course_name: str, local_dir: str): - """ - Download an MIT course using its url - """ - ingester = Ingest() - if url.endswith("download"): - pass - else: - url = url + "download" - - r = requests.get(url) - soup = BeautifulSoup(r.text, "html.parser") - - zip = '' - for ref in soup.find_all("a"): - if ref.attrs['href'].endswith("zip"): - zip = ref.attrs['href'] - - site = zip - print('site', site) - r = requests.get(url=site, stream=True) - - zip_file = local_dir + ".zip" - - try: - with open(zip_file, 'wb') as fd: - for chunk in r.iter_content(chunk_size=128): - fd.write(chunk) - print("course downloaded!") - except Exception as e: - print("Error:", e, site) - - with ZipFile(zip_file, 'r') as zObject: - zObject.extractall(path=local_dir) - - shutil.move(local_dir + "/" + "robots.txt", local_dir + "/static_resources") - s3_paths = upload_data_files_to_s3(course_name, local_dir + "/static_resources") - success_fail = ingester.bulk_ingest(s3_paths, course_name) # type: ignore - - shutil.move(zip_file, local_dir) - shutil.rmtree(local_dir) - del ingester - print("Finished Ingest") 
- return success_fail - - -if __name__ == '__main__': - pass diff --git a/railway.json b/railway.json index 56b39dac..d6d92535 100644 --- a/railway.json +++ b/railway.json @@ -10,11 +10,9 @@ "python -m venv --copies /opt/venv && . /opt/venv/bin/activate", "pip install pip==23.3.1", "pip install -r requirements.txt" - ], - "aptPkgs": ["ffmpeg", "tesseract-ocr"] + ] }, "setup": { - "aptPkgs": ["libcap-dev", "libgl1"], "nixPkgs": ["python310", "gcc"] } } diff --git a/requirements.txt b/requirements.txt index f4503824..848c10d0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,59 +1,68 @@ -# On Apple Silicon: pip uninstall grpcio -y; conda install grpcio -y -nomic==2.0.14 +Flask==3.0.0 +flask-cors==4.0.0 +Flask-Injector==0.15.0 +gunicorn==21.2.0 protobuf==4.25.0 -langchain==0.0.331 -langchainhub==0.1.14 -click==8.1.7 aiohttp==3.8.6 +wheel==0.41.3 +click==8.1.7 MarkupSafe==2.1.3 Werkzeug==3.0.1 mkdocstrings[python]==0.23.0 mkdocs-material==9.4.7 itsdangerous==2.1.2 -wheel==0.41.3 -Flask==3.0.0 -gunicorn==21.2.0 -tiktoken==0.5.1 Jinja2==3.1.2 -python-dotenv==1.0.0 -flask-cors==4.0.0 -qdrant-client==1.7.3 mkdocs==1.5.3 -openai==0.28.1 -supabase==2.0.2 SQLAlchemy==2.0.22 -boto3==1.28.79 -PyMuPDF==1.23.6 tabulate==0.9.0 typing-inspect==0.9.0 typing_extensions==4.8.0 -pysrt==1.1.2 -docx2txt==0.8 -pydub==0.25.1 -ffmpeg-python==0.2.0 -ffprobe==0.5 -ffmpeg==1.4 -beautifulsoup4==4.12.2 -canvasapi==3.2.0 -GitPython==3.1.40 + +# Utils +tiktoken==0.5.1 +python-dotenv==1.0.0 +pydantic==1.10.13 # pydantic v1 works better for ray flask-executor==1.0.0 + +# AI & core services +nomic==2.0.14 +openai==0.28.1 +langchain==0.0.331 +langchainhub==0.1.14 + +# Data +boto3==1.28.79 +qdrant-client==1.7.3 +supabase==2.0.2 + +# Logging +posthog==3.1.0 +sentry-sdk==1.39.1 + +# Not currently supporting coursera ingest +# cs-dlp @ git+https://github.com/raffaem/cs-dlp.git@0.12.0b0 # previously called coursera-dl + +# removed due to /ingest in Beam +# canvasapi==3.2.0 +# GitPython==3.1.40 +# 
pysrt==1.1.2 +# docx2txt==0.8 +# pydub==0.25.1 +# ffmpeg-python==0.2.0 +# ffprobe==0.5 +# ffmpeg==1.4 +# beautifulsoup4==4.12.2 +# PyMuPDF==1.23.6 +# pytesseract==0.3.10 # image OCR +# openpyxl==3.1.2 # excel +# networkx==3.2.1 # unused part of excel partitioning :( +# python-pptx==0.6.23 +# unstructured==0.10.29 # causes huge ~5.3 GB of installs. Probbably from onnx: https://github.com/Unstructured-IO/unstructured/blob/ad14321016533dc03c1782f6ebea00bc9c804846/requirements/extra-pdf-image.in#L4 + # pdf packages for unstructured # pdf2image==1.16.3 # pdfminer.six==20221105 # opencv-python-headless==4.8.1.78 # unstructured.pytesseract==0.3.12 # unstructured-inference==0.7.11 # this is the real large one :( -pytesseract==0.3.10 # image OCR -openpyxl==3.1.2 # excel -networkx==3.2.1 # unused part of excel partitioning :( -python-pptx==0.6.23 -unstructured==0.10.29 # causes huge ~5.3 GB of installs. Probbably from onnx: https://github.com/Unstructured-IO/unstructured/blob/ad14321016533dc03c1782f6ebea00bc9c804846/requirements/extra-pdf-image.in#L4 # unstructured[xlsx,image,pptx]==0.10.29 # causes huge ~5.3 GB of installs. Probbably from onnx: https://github.com/Unstructured-IO/unstructured/blob/ad14321016533dc03c1782f6ebea00bc9c804846/requirements/extra-pdf-image.in#L4 - -# Not currently supporting coursera ingest -# cs-dlp @ git+https://github.com/raffaem/cs-dlp.git@0.12.0b0 # previously called coursera-dl -pydantic==1.10.13 # pydantic v1 works better for ray -posthog==3.1.0 -sentry-sdk==1.39.1 -# ray==2.8.1 -# newrelic==9.3.0 \ No newline at end of file diff --git a/run.sh b/run.sh index 0d77691a..49d79372 100755 --- a/run.sh +++ b/run.sh @@ -5,4 +5,4 @@ # 200 MB object store memory.. necessary to statically allocate or will crash in Railway env restrictions. 
# ray start --head --num-cpus 6 --object-store-memory 300000000 export PYTHONPATH=${PYTHONPATH}:$(pwd)/ai_ta_backend -exec gunicorn --workers=6 --threads=20000 --worker-class=gthread ai_ta_backend.main:app --timeout 1800 +exec gunicorn --workers=3 --threads=100 --worker-class=gthread ai_ta_backend.main:app --timeout 1800