Skip to content

Commit

Permalink
added .md ingest and fixed .py ingest
Browse files Browse the repository at this point in the history
  • Loading branch information
star-nox committed Aug 15, 2023
1 parent 54e3fb0 commit 07238a2
Show file tree
Hide file tree
Showing 4 changed files with 214 additions and 12 deletions.
56 changes: 56 additions & 0 deletions ai_ta_backend/data_logging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import os
import nomic
from nomic import atlas
from langchain.embeddings import OpenAIEmbeddings
import numpy as np

class DataLog():
  """Logs user search queries and their retrieved contexts to a Nomic Atlas
  project ("Search Query Viz") so query/context pairs can be visualized."""

  def __init__(self):
    # Authenticate with Nomic once per instance; NOMIC_API_KEY must be set
    # in the environment.
    self.login = nomic.login(os.getenv('NOMIC_API_KEY'))

  def nomic_log(self, course_name: str, search_query: str, retrieved_contexts) -> str:
    """
    Logs user query and retrieved contexts to Nomic.

    Args:
      course_name: Course the query was issued against.
      search_query: The user's raw search query.
      retrieved_contexts: Iterable of dicts, each with a 'text' key holding
        one retrieved context chunk (shape assumed from usage — confirm with caller).

    Returns:
      "WIP" — placeholder status string while this logger is a work in progress.
    """
    print("course_name: ", course_name)
    print("search_query: ", search_query)
    print("retrieved_contexts: ", len(retrieved_contexts))

    # Concatenate all retrieved context chunks into one string so the query
    # and its combined context are embedded as a pair of documents.
    context_string = ""
    for context in retrieved_contexts:
      context_string += context['text'] + " "

    print("context_string: ", context_string)

    # Embed the query and its combined context with OpenAI.
    # FIX: this previously uploaded np.random.rand(2, 1536) placeholder
    # vectors, so the Atlas map showed normally-distributed noise instead
    # of real embeddings.
    embeddings_model = OpenAIEmbeddings()
    embeddings = np.array(embeddings_model.embed_documents([search_query, context_string]))

    data = [{'course': course_name, 'id': i} for i in range(len(embeddings))]
    print("len of data: ", len(data))
    print("len of embeddings: ", embeddings.shape)

    # Append to the existing Atlas project instead of creating a new map per call.
    project = atlas.AtlasProject(name="Search Query Viz", add_datums_if_exists=True)
    print(project.name)

    with project.wait_for_project_lock() as project:
      project.add_embeddings(embeddings=embeddings, data=data)
      project.rebuild_maps()

    print("done")
    return "WIP"
5 changes: 5 additions & 0 deletions ai_ta_backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from ai_ta_backend.vector_database import Ingest
from ai_ta_backend.web_scrape import main_crawler, mit_course_download
from ai_ta_backend.canvas import CanvasAPI
from ai_ta_backend.data_logging import DataLog

app = Flask(__name__)
CORS(app)
Expand Down Expand Up @@ -132,6 +133,10 @@ def getTopContexts():
ingester = Ingest()
found_documents = ingester.getTopContexts(search_query, course_name, token_limit)

# add nomic log function here
logger = DataLog()
result = logger.nomic_log(course_name, search_query, found_documents)

response = jsonify(found_documents)
response.headers.add('Access-Control-Allow-Origin', '*')
return response
Expand Down
111 changes: 111 additions & 0 deletions ai_ta_backend/nomic.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import nomic\n",
"from nomic import atlas\n",
"import supabase\n",
"import os\n",
"\n",
"nomic.login(os.getenv(\"NOMIC_API_KEY\"))  # secret redacted: load the API key from the environment, never commit it"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"https://twzwfuydgnnjcaopyfdv.supabase.co\n"
]
}
],
"source": [
"# create supabase client\n",
"url = \"https://twzwfuydgnnjcaopyfdv.supabase.co\"\n",
"key = os.environ[\"SUPABASE_API_KEY\"]  # service-role key redacted: this JWT grants full DB access and must never be committed\n",
"\n",
"client = supabase.create_client(supabase_url=url, supabase_key=key)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"response = client.table(\"documents\").select(\"*\").eq(\"course_name\", \"canvas\").limit(2).execute()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data=[{'id': 46461, 'created_at': '2023-08-10T20:32:20.802466+00:00', 's3_path': 'courses/canvas/How_businesses_and_websites_can_use_third-party_data_to_target_advertising_through_LinkedIn_|_LinkedIn_Help.html', 'readable_filename': '8/10/23 How businesses and websites can use third-party data to target advertising through LinkedIn | LinkedIn Help', 'course_name': 'canvas', 'url': 'https://www.linkedin.com/help/linkedin/answer/a426264?trk=microsites-frontend_legal_privacy-policy&lang=en', 'contexts': [{'text': 'How businesses and websites can use third-party data to target advertising through LinkedIn | LinkedIn Help\\nDue to high support volume, it may take longer than usual to hear back from our Support Agents. \\xa0dismiss this messageAttention screen reader users, you are in a mobile optimized view and content may not appear where you expect it to be. To return the screen to its desktop view, please maximize your browser.Skip to contentSkip to searchClose jump menuClose menuGet help with:LinkedInSales NavigatorTalent HubRecruiterSales InsightsMarketing SolutionsTalent InsightsCorporate BillingLearningGo to LinkedInSign inHelpSign inContact usEnglish (English)Čeština (Czech)Deutsch (German)Bahasa Indonesia (Indonesian)Español (Spanish)Türkçe (Turkish)Français (French)हिंदी (Hindi)Italiano (Italian)日本語 (Japanese)Nederlands (Dutch)Português (Portuguese)Svenska (Swedish)Polski (Polish)Bahasa Malaysia (Malay)한국어 (Korean)Dansk (Danish)Norsk (Norwegian)Română (Romanian)Русский (Russian)Українська (Ukrainian)简体中文 (Chinese (Simplified))正體中文 (Chinese (Traditional))ภาษาไทย (Thai)العربية (Arabic)LinkedIn Corporation © 2023AboutTransparency CenterPrivacy and TermsCookiesCopyrightTermsPrivacyGuest controlsYour California Privacy ChoicesDismiss privacy menuLinkedIn Corporation © 2023', 'embedding': None, 'timestamp': None, 'pagenumber': ''}, {'text': 'How businesses and websites can use third-party data to target advertising through LinkedIn | LinkedIn Help\\nDue to high 
support volume, it may take longer than usual to hear back from our Support Agents. \\xa0dismiss this messageAttention screen reader users, you are in a mobile optimized view and content may not appear where you expect it to be. To return the screen to its desktop view, please maximize your browser.Skip to contentSkip to searchClose jump menuClose menuGet help with:LinkedInLearningCorporate BillingTalent InsightsMarketing SolutionsSales InsightsTalent HubSales NavigatorRecruiterGo to LinkedInSign inHelpSign inContact usEnglish (English)Čeština (Czech)Deutsch (German)Bahasa Indonesia (Indonesian)Español (Spanish)Türkçe (Turkish)Français (French)हिंदी (Hindi)Italiano (Italian)日本語 (Japanese)Nederlands (Dutch)Português (Portuguese)Svenska (Swedish)Polski (Polish)Bahasa Malaysia (Malay)한국어 (Korean)Dansk (Danish)Norsk (Norwegian)Română (Romanian)Русский (Russian)Українська (Ukrainian)简体中文 (Chinese (Simplified))正體中文 (Chinese (Traditional))ภาษาไทย (Thai)العربية (Arabic)LinkedIn Corporation © 2023AboutTransparency CenterPrivacy and TermsCookiesCopyrightTermsPrivacyGuest controlsYour California Privacy ChoicesDismiss privacy menuLinkedIn Corporation © 2023', 'embedding': None, 'timestamp': None, 'pagenumber': ''}], 'base_url': None}, {'id': 46519, 'created_at': '2023-08-10T20:32:30.368445+00:00', 's3_path': 'courses/canvas/Off-LinkedIn_Visibility_|_LinkedIn_Help.html', 'readable_filename': '8/10/23 Off-LinkedIn Visibility | LinkedIn Help', 'course_name': 'canvas', 'url': 'https://www.linkedin.com/help/linkedin/answer/a1340507?trk=microsites-frontend_legal_privacy-policy&lang=en', 'contexts': [{'text': 'Off-LinkedIn Visibility | LinkedIn Help\\nDue to high support volume, it may take longer than usual to hear back from our Support Agents. \\xa0dismiss this messageAttention screen reader users, you are in a mobile optimized view and content may not appear where you expect it to be. 
To return the screen to its desktop view, please maximize your browser.Skip to contentSkip to searchClose jump menuClose menuGet help with:LinkedInLearningRecruiterSales NavigatorTalent HubSales InsightsMarketing SolutionsCorporate BillingTalent InsightsGo to LinkedInSign inHelpSign inContact usEnglish (English)Čeština (Czech)Deutsch (German)Bahasa Indonesia (Indonesian)Español (Spanish)Türkçe (Turkish)Français (French)हिंदी (Hindi)Italiano (Italian)日本語 (Japanese)Nederlands (Dutch)Português (Portuguese)Svenska (Swedish)Polski (Polish)Bahasa Malaysia (Malay)한국어 (Korean)Dansk (Danish)Norsk (Norwegian)Română (Romanian)Русский (Russian)Українська (Ukrainian)简体中文 (Chinese (Simplified))正體中文 (Chinese (Traditional))ภาษาไทย (Thai)العربية (Arabic)LinkedIn Corporation © 2023AboutTransparency CenterPrivacy and TermsCookiesCopyrightTermsPrivacyGuest controlsYour California Privacy ChoicesDismiss privacy menuLinkedIn Corporation © 2023', 'embedding': None, 'timestamp': None, 'pagenumber': ''}, {'text': 'Off-LinkedIn Visibility | LinkedIn Help\\nDue to high support volume, it may take longer than usual to hear back from our Support Agents. \\xa0dismiss this messageAttention screen reader users, you are in a mobile optimized view and content may not appear where you expect it to be. 
To return the screen to its desktop view, please maximize your browser.Skip to contentSkip to searchClose jump menuClose menuGet help with:LinkedInSales NavigatorCorporate BillingTalent HubRecruiterMarketing SolutionsTalent InsightsLearningSales InsightsGo to LinkedInSign inHelpSign inContact usEnglish (English)Čeština (Czech)Deutsch (German)Bahasa Indonesia (Indonesian)Español (Spanish)Türkçe (Turkish)Français (French)हिंदी (Hindi)Italiano (Italian)日本語 (Japanese)Nederlands (Dutch)Português (Portuguese)Svenska (Swedish)Polski (Polish)Bahasa Malaysia (Malay)한국어 (Korean)Dansk (Danish)Norsk (Norwegian)Română (Romanian)Русский (Russian)Українська (Ukrainian)简体中文 (Chinese (Simplified))正體中文 (Chinese (Traditional))ภาษาไทย (Thai)العربية (Arabic)LinkedIn Corporation © 2023AboutTransparency CenterPrivacy and TermsCookiesCopyrightTermsPrivacyGuest controlsYour California Privacy ChoicesDismiss privacy menuLinkedIn Corporation © 2023', 'embedding': None, 'timestamp': None, 'pagenumber': ''}], 'base_url': None}] count=None\n"
]
}
],
"source": [
"print(response)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[Document(page_content='import os\\nfrom canvasapi import Canvas\\nimport requests\\nfrom zipfile import ZipFile\\nfrom ai_ta_backend.aws import upload_data_files_to_s3\\nfrom ai_ta_backend.vector_database import Ingest\\n\\n\\nclass CanvasAPI():\\n def __init__(self):\\n self.canvas_client = Canvas(\"https://canvas.illinois.edu\", \\n os.getenv(\\'CANVAS_ACCESS_TOKEN\\'))\\n \\n def add_users(self, canvas_course_id: str, course_name: str):\\n \"\"\"\\n Get all users in a course\\n \"\"\"\\n course = self.canvas_client.get_course(canvas_course_id)\\n users = course.get_users()\\n user_emails = []\\n for user in users:\\n net_id = user.sis_user_id\\n email_id = net_id + \"@illinois.edu\"\\n user_emails.append(email_id)\\n \\n print(user_emails)\\n \\n if len(user_emails) > 0:\\n return \"Success\"\\n else:\\n return \"Failed\"\\n \\n def ingest_course_content(self, canvas_course_id: str, course_name: str):\\n \"\"\"\\n Ingests all Canvas course materials through the course ID.\\n \"\"\"\\n print(\"In ingest_course_content\")\\n\\n api_path = \"https://canvas.illinois.edu/api/v1/courses/\" + str(canvas_course_id)\\n headers = {\"Authorization\": \"Bearer \" + os.getenv(\\'CANVAS_ACCESS_TOKEN\\')}\\n\\n try:\\n # Start the content export\\n content_export_api_path = api_path + \"/content_exports?export_type=zip\"\\n start_content_export = requests.post(content_export_api_path, headers=headers)\\n content_export_id = start_content_export.json()[\\'id\\']\\n progress_url = start_content_export.json()[\\'progress_url\\']\\n\\n # Wait for the content export to finish\\n export_progress = requests.get(progress_url, headers=headers)\\n while export_progress.json()[\\'workflow_state\\'] != \\'completed\\':\\n export_progress = requests.get(progress_url, headers=headers)\\n \\n # View content export and get download URL\\n show_content_export_api_path = api_path + \"/content_exports/\" + str(content_export_id)\\n print(\"Show export path: \", 
show_content_export_api_path)\\n\\n show_content_export = requests.get(show_content_export_api_path, headers=headers)\\n download_url = show_content_export.json()[\\'attachment\\'][\\'url\\']\\n file_name = show_content_export.json()[\\'attachment\\'][\\'filename\\']\\n\\n # Create a directory for the content\\n directory = os.path.join(os.getcwd(), \"course_content\")\\n if not os.path.exists(directory):\\n os.mkdir(directory)\\n\\n # Download zip and save to directory\\n download = requests.get(download_url, headers=headers)\\n with open(os.path.join(directory, file_name), \\'wb\\') as f:\\n f.write(download.content)\\n print(\"Downloaded!\")\\n\\n # Extract and read from zip file\\n filepath = \"course_content/\" + file_name\\n with ZipFile(filepath, \\'r\\') as zip:\\n zip.printdir()\\n zip.extractall(\"course_content\")\\n print(\\'Done!\\')\\n os.remove(filepath)\\n\\n # Upload files to S3 and call bulk_ingest\\n s3_paths = upload_data_files_to_s3(course_name, \"course_content\")\\n ingest = Ingest()\\n canvas_ingest = ingest.bulk_ingest(s3_paths, course_name=course_name)\\n \\n return canvas_ingest\\n \\n except Exception as e:\\n print(e)\\n return \"Failed\"\\n\\n\\n ', metadata={'source': 'canvas.py'})]\n"
]
}
],
"source": [
"from langchain.document_loaders import PythonLoader\n",
"\n",
"file = \"canvas.py\"\n",
"\n",
"loader = PythonLoader(file)\n",
"data = loader.load()\n",
"\n",
"print(data)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
54 changes: 42 additions & 12 deletions ai_ta_backend/vector_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
# # from arize.utils.types import (Embedding, EmbeddingColumnNames, Environments,
# # Metrics, ModelTypes, Schema)
from langchain.document_loaders import (Docx2txtLoader, PythonLoader,
SRTLoader,
SRTLoader, UnstructuredFileLoader,
UnstructuredPowerPointLoader, TextLoader, GitLoader)
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import Document
Expand Down Expand Up @@ -243,7 +243,7 @@ def bulk_ingest(self, s3_paths: Union[List[str], str], course_name: str, **kwarg
success_status['failure_ingest'].append(s3_path)
else:
success_status['success_ingest'].append(s3_path)
elif s3_path.endswith('.txt'):
elif s3_path.endswith('.txt') or s3_path.endswith('.md'):
ret = self._ingest_single_txt(s3_path, course_name)
if ret != "Success":
success_status['failure_ingest'].append(s3_path)
Expand Down Expand Up @@ -280,21 +280,51 @@ def bulk_ingest(self, s3_paths: Union[List[str], str], course_name: str, **kwarg

def _ingest_single_py(self, s3_path: str, course_name: str):
  """Ingest a single .py file from S3.

  Downloads the file to a local 'media/' working directory (PythonLoader
  needs a real path on disk), loads it with LangChain's PythonLoader, and
  uploads the resulting texts + metadata via split_and_upload.

  Args:
    s3_path (str): Key of a .py file in the S3 bucket.
    course_name (str): The name of the course.

  Returns:
    str: whatever split_and_upload returns on success (expected "Success"),
    or "Failed" on any exception.
  """
  file_path = None
  try:
    print("in ingest_py")

    file_name = s3_path.split("/")[-1]
    # FIX: ensure the working directory exists instead of assuming "media/" is present.
    os.makedirs("media", exist_ok=True)
    file_path = os.path.join("media", file_name)

    self.s3_client.download_file(os.getenv('S3_BUCKET_NAME'), s3_path, file_path)
    loader = PythonLoader(file_path)
    documents = loader.load()

    texts = [doc.page_content for doc in documents]

    metadatas: List[Dict[str, Any]] = [{
        'course_name': course_name,
        's3_path': s3_path,
        'readable_filename': Path(s3_path).name,
        'pagenumber_or_timestamp': '',
    } for doc in documents]

    return self.split_and_upload(texts=texts, metadatas=metadatas)
  except Exception as e:
    # FIX: return an explicit failure marker instead of implicitly returning
    # None (callers compare the result against "Success").
    print(f"ERROR IN py READING {e}")
    return "Failed"
  finally:
    # FIX: always remove the downloaded temp file, even when ingestion fails.
    if file_path and os.path.exists(file_path):
      os.remove(file_path)

Expand Down Expand Up @@ -566,7 +596,7 @@ def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs):
return "Success"

def _ingest_single_txt(self, s3_path: str, course_name: str) -> str:
"""Ingest a single .txt file from S3.
"""Ingest a single .txt or .md file from S3.
Args:
s3_path (str): A path to a .txt file in S3
course_name (str): The name of the course
Expand Down

0 comments on commit 07238a2

Please sign in to comment.