Enhance GitHub ingest: add full file URLs and enable file deletion by source URL

UIUC-Chatbot · Sep 15, 2023 · 26ee9f4 · 26ee9f4
1 parent 4f6a863
commit 26ee9f4
Show file tree

Hide file tree

Showing 3 changed files with 58 additions and 25 deletions.
diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py
@@ -292,22 +292,22 @@ def delete():
   Delete a single file from all our database: S3, Qdrant, and Supabase (for now).
   Note, of course, we still have parts of that file in our logs.
   """
-
   course_name: str = request.args.get('course_name', default='', type=str)
   s3_path: str = request.args.get('s3_path', default='', type=str)
+  source_url: str = request.args.get('source_url', default='', type=str)
 
-  if course_name == '' or s3_path == '':
+  if course_name == '' and (s3_path == '' or source_url == ''):
     # proper web error "400 Bad request"
     abort(
         400,
         description=
-        f"Missing one or more required parameters: 'course_name' and 's3_path' must be provided. Course name: `{course_name}`, S3 path: `{s3_path}`"
+        f"Missing one or more required parameters: 'course_name' and ('s3_path' or 'source_url') must be provided. Course name: `{course_name}`, S3 path: `{s3_path}`, source_url: `{source_url}`"
     )
 
   start_time = time.monotonic()
   ingester = Ingest()
   # background execution of tasks!! 
-  executor.submit(ingester.delete_data, s3_path, course_name)
+  executor.submit(ingester.delete_data, course_name, s3_path, source_url)
   print(f"From {course_name}, deleted file: {s3_path}")
   print(f"⏰ Runtime of FULL delete func: {(time.monotonic() - start_time):.2f} seconds")
   del ingester

diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py
@@ -681,13 +681,14 @@ def ingest_github(self, github_url: str, course_name: str) -> str:
       data = loader.load()
       shutil.rmtree("media/cloned_repo")
       # create metadata for each file in data 
+
       for doc in data:
         texts = doc.page_content
         metadatas: Dict[str, Any] = {
                 'course_name': course_name,
                 's3_path': '',
                 'readable_filename': doc.metadata['file_name'],
-                'url': github_url,
+                'url': f"{github_url}/blob/main/{doc.metadata['file_path']}",
                 'pagenumber': '', 
                 'timestamp': '',
             }
@@ -834,31 +835,62 @@ def delete_entire_course(self, course_name: str):
     # todo: delete from Vercel KV so that the course fully ceases to exist. Last db to delete from (as of now, Aug 15)
 
 
-  # Create a method to delete file from s3, delete vector from qdrant, and delete row from supabase
-  def delete_data(self, s3_path: str, course_name: str):
+  def delete_data(self, course_name: str, s3_path: str, source_url: str):
     """Delete file from S3, Qdrant, and Supabase."""
     print(f"Deleting {s3_path} from S3, Qdrant, and Supabase for course {course_name}")
     try:
       # Delete file from S3
       bucket_name = os.getenv('S3_BUCKET_NAME')
-      self.s3_client.delete_object(Bucket=bucket_name, Key=s3_path)
 
-      # Delete from Qdrant
-      # docs for nested keys: https://qdrant.tech/documentation/concepts/filtering/#nested-key
-      # Qdrant "points" look like this: Record(id='000295ca-bd28-ac4a-6f8d-c245f7377f90', payload={'metadata': {'course_name': 'zotero-extreme', 'pagenumber_or_timestamp': 15, 'readable_filename': 'Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf', 's3_path': 'courses/zotero-extreme/Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf'}, 'page_content': '18  \nDunlosky et al.\n3.3 Effects in representative educational contexts. Sev-\neral of the large summarization-training studies have been \nconducted in regular classrooms, indicating the feasibility of \ndoing so. For example, the study by A. King (1992) took place \nin the context of a remedial study-skills course for undergrad-\nuates, and the study by Rinehart et al. (1986) took place in \nsixth-grade classrooms, with the instruction led by students \nregular teachers. In these and other cases, students benefited \nfrom the classroom training. We suspect it may actually be \nmore feasible to conduct these kinds of training studies in \nclassrooms than in the laboratory, given the nature of the time \ncommitment for students. Even some of the studies that did \nnot involve training were conducted outside the laboratory; for \nexample, in the Bednall and Kehoe (2011) study on learning \nabout logical fallacies from Web modules (see data in Table 3), \nthe modules were actually completed as a homework assign-\nment. Overall, benefits can be observed in classroom settings; \nthe real constraint is whether students have the skill to suc-\ncessfully summarize, not whether summarization occurs in the \nlab or the classroom.\n3.4 Issues for implementation. Summarization would be \nfeasible for undergraduates or other learners who already \nknow how to summarize. For these students, summarization \nwould constitute an easy-to-implement technique that would \nnot take a lot of time to complete or understand. 
The only \nconcern would be whether these students might be better \nserved by some other strategy, but certainly summarization \nwould be better than the study strategies students typically \nfavor, such as highlighting and rereading (as we discuss in the \nsections on those strategies below). A trickier issue would \nconcern implementing the strategy with students who are not \nskilled summarizers. Relatively intensive training programs \nare required for middle school students or learners with learn-\ning disabilities to benefit from summarization. Such efforts \nare not misplaced; training has been shown to benefit perfor-\nmance on a range of measures, although the training proce-\ndures do raise practical issues (e.g., Gajria & Salvia, 1992: \n6.511 hours of training used for sixth through ninth graders \nwith learning disabilities; Malone & Mastropieri, 1991: 2 \ndays of training used for middle school students with learning \ndisabilities; Rinehart et al., 1986: 4550 minutes of instruc-\ntion per day for 5 days used for sixth graders). Of course, \ninstructors may want students to summarize material because \nsummarization itself is a goal, not because they plan to use \nsummarization as a study technique, and that goal may merit \nthe efforts of training.\nHowever, if the goal is to use summarization as a study \ntechnique, our question is whether training students would be \nworth the amount of time it would take, both in terms of the \ntime required on the part of the instructor and in terms of the \ntime taken away from students other activities. For instance, \nin terms of efficacy, summarization tends to fall in the middle \nof the pack when compared to other techniques. In direct \ncomparisons, it was sometimes more useful than rereading \n(Rewey, Dansereau, & Peel, 1991) and was as useful as note-\ntaking (e.g., Bretzing & Kulhavy, 1979) but was less powerful \nthan generating explanations (e.g., Bednall & Kehoe, 2011) or \nself-questioning (A. 
King, 1992).\n3.5 Summarization: Overall assessment. On the basis of the \navailable evidence, we rate summarization as low utility. It can \nbe an effective learning strategy for learners who are already \nskilled at summarizing; however, many learners (including \nchildren, high school students, and even some undergraduates) \nwill require extensive training, which makes this strategy less \nfeasible. Our enthusiasm is further dampened by mixed find-\nings regarding which tasks summarization actually helps. \nAlthough summarization has been examined with a wide \nrange of text materials, many researchers have pointed to fac-\ntors of these texts that seem likely to moderate the effects of \nsummarization (e.g'}, vector=None),
-      self.qdrant_client.delete(
-          collection_name=os.environ['QDRANT_COLLECTION_NAME'],
-          points_selector=models.Filter(must=[
-              models.FieldCondition(
-                  key="metadata.s3_path",
-                  match=models.MatchValue(value=s3_path),
-              ),
-          ]),
-      )
+      # Delete files by S3 path
+      if s3_path: 
+        try:
+          self.s3_client.delete_object(Bucket=bucket_name, Key=s3_path)
+        except Exception as e:
+          print("Error in deleting file from s3:", e)
+        # Delete from Qdrant
+        # docs for nested keys: https://qdrant.tech/documentation/concepts/filtering/#nested-key
+        # Qdrant "points" look like this: Record(id='000295ca-bd28-ac4a-6f8d-c245f7377f90', payload={'metadata': {'course_name': 'zotero-extreme', 'pagenumber_or_timestamp': 15, 'readable_filename': 'Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf', 's3_path': 'courses/zotero-extreme/Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf'}, 'page_content': '18  \nDunlosky et al.\n3.3 Effects in representative educational contexts. Sev-\neral of the large summarization-training studies have been \nconducted in regular classrooms, indicating the feasibility of \ndoing so. For example, the study by A. King (1992) took place \nin the context of a remedial study-skills course for undergrad-\nuates, and the study by Rinehart et al. (1986) took place in \nsixth-grade classrooms, with the instruction led by students \nregular teachers. In these and other cases, students benefited \nfrom the classroom training. We suspect it may actually be \nmore feasible to conduct these kinds of training  ...
+        try: 
+          self.qdrant_client.delete(
+              collection_name=os.environ['QDRANT_COLLECTION_NAME'],
+              points_selector=models.Filter(must=[
+                  models.FieldCondition(
+                      key="metadata.s3_path",
+                      match=models.MatchValue(value=s3_path),
+                  ),
+              ]),
+          )
+        except Exception as e:
+          print("Error in deleting file from Qdrant:", e)
+        try: 
+          response = self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq('s3_path', s3_path).eq(
+              'course_name', course_name).execute()
+        except Exception as e:
+          print("Error in deleting file from supabase:", e)
+
+      # Delete files by their URL identifier
+      elif source_url:
+        try:
+          # Delete from Qdrant
+          self.qdrant_client.delete(
+              collection_name=os.environ['QDRANT_COLLECTION_NAME'],
+              points_selector=models.Filter(must=[
+                  models.FieldCondition(
+                      key="metadata.url",
+                      match=models.MatchValue(value=source_url),
+                  ),
+              ]),
+          )
+        except Exception as e:
+          print("Error in deleting file from Qdrant:", e)
+        try: 
+          response = self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq('url', source_url).eq(
+              'course_name', course_name).execute()
+        except Exception as e:
+          print("Error in deleting file from supabase:", e)
 
       # Supabase rows are deleted inside the s3_path / source_url branches above
-      response = self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq('s3_path', s3_path).eq(
-          'course_name', course_name).execute()
       return "Success"
     except Exception as e:
       err: str = f"ERROR IN delete_data: Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}"  # type: ignore

diff --git a/ai_ta_backend/web_scrape.py b/ai_ta_backend/web_scrape.py
@@ -295,7 +295,7 @@ def is_github_repo(url):
   pattern = re.compile(r'^(https?://)?(www\.)?github\.com/[^/?]+/[^/?]+/?$', re.IGNORECASE)
 
   # Returns base_url when the pattern matches it (i.e. it is a GitHub repository URL), otherwise None
-  return bool(pattern.match(base_url))
+  return base_url if pattern.match(base_url) else None
 
 def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, timeout:int=1, stay_on_baseurl:bool=False):
   """
@@ -328,9 +328,10 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti
     )
 
   # Check whether the URL is a GitHub repository; if so, ingest it with the GitHub ingester and return early
-  if is_github_repo(url):
+  cleaned_repo_url = is_github_repo(url)
+  if cleaned_repo_url:
     print("Begin Ingesting GitHub page")
-    results = ingester.ingest_github(url, course_name)
+    results = ingester.ingest_github(cleaned_repo_url, course_name)
     print("Finished ingesting GitHub page")
     del ingester
     return results