From b41478e86ab04e318926eda5ce4915ed652a066b Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Wed, 30 Aug 2023 18:38:21 -0700 Subject: [PATCH 1/9] enhance split_and_upload to handle code files that don't have periods --- ai_ta_backend/vector_database.py | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py index 16659ee9..8e70b3de 100644 --- a/ai_ta_backend/vector_database.py +++ b/ai_ta_backend/vector_database.py @@ -780,31 +780,17 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]): metadatas (List[Dict[str, Any]]): _description_ """ print("In split and upload") - print(f"Texts: {texts}") print(f"metadatas: {metadatas}") - print(type(texts)) - assert len(texts) == len(metadatas), 'must have equal number of text strings and metadata dicts' - + print(f"Texts: {texts}") + assert len(texts) == len(metadatas), f'must have equal number of text strings and metadata dicts. len(texts) is {len(texts)}. len(metadatas) is {len(metadatas)}' try: - # generate AI summary - # summary = self.ai_summary(texts, metadatas) - # for i in range(len(summary)): - # metadatas[i]['summary'] = summary[i] - text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( chunk_size=1000, chunk_overlap=150, - separators=". ", # try to split on sentences... + separators=[". ", "\n\n", "\n", " ", ""] # try to split on sentences... fallback to others to ensure we always fit in context window ) contexts: List[Document] = text_splitter.create_documents(texts=texts, metadatas=metadatas) - - def remove_small_contexts(contexts: List[Document]) -> List[Document]: - # Remove TextSplit contexts with fewer than 50 chars. - return [doc for doc in contexts if len(doc.page_content) > 50] - - contexts = remove_small_contexts(contexts=contexts) - input_texts = [{'input': context.page_content, 'model': 'text-embedding-ada-002'} for context in contexts] oai = OpenAIAPIProcessor(input_prompts_list=input_texts, @@ -835,9 +821,6 @@ def remove_small_contexts(contexts: List[Document]) -> List[Document]: collection_name=os.getenv('QDRANT_COLLECTION_NAME'), # type: ignore points=vectors # type: ignore ) - # # replace with Qdrant - # self.vectorstore.add_texts([doc.page_content for doc in documents], [doc.metadata for doc in documents]) - ### Supabase SQL ### contexts_for_supa = [{ "text": context.page_content, From 61a296b6a4eae92ad10bca4a9c4169b7197f0150 Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Tue, 5 Sep 2023 10:53:26 -0700 Subject: [PATCH 2/9] reducing workers, increasing threads. will greatly reduce memory footprint at idle and under load --- run.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/run.sh b/run.sh index 02359fd1..3edfdcf1 100755 --- a/run.sh +++ b/run.sh @@ -1,4 +1,6 @@ #!/bin/bash +# Docs https://docs.gunicorn.org/en/stable/settings.html#workers + export PYTHONPATH=$PYTHONPATH:$(pwd)/ai_ta_backend -exec gunicorn --workers=6 --threads=6 --worker-class=gthread ai_ta_backend.main:app --timeout 108000 \ No newline at end of file +exec gunicorn --workers=2 --threads=16 --worker-class=gthread ai_ta_backend.main:app --timeout 60 --max-requests 2 \ No newline at end of file From 9161d8c0eb361a7f81a159586c16d03772be9a58 Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Tue, 5 Sep 2023 11:12:28 -0700 Subject: [PATCH 3/9] Reduce memory consumption to save cash on Railway (#82) * major push; delete the ingest class everytime in main(). add if __name__ == main to all files * lower timeout value to encourage workers to restart (and reset memory usage) * reducing workers, increasing threads. should reduce memory footprint * match main's workers and threads --- ai_ta_backend/aws.py | 5 +- ai_ta_backend/extreme_context_stuffing.py | 16 +-- ai_ta_backend/main.py | 162 +++++++++++++++------- ai_ta_backend/nomic_logging.py | 3 + ai_ta_backend/utils_tokenization.py | 5 + ai_ta_backend/vector_database.py | 102 ++++++-------- ai_ta_backend/web_scrape.py | 49 +++---- 7 files changed, 188 insertions(+), 154 deletions(-) diff --git a/ai_ta_backend/aws.py b/ai_ta_backend/aws.py index 66fb0bbe..53e4c8c2 100644 --- a/ai_ta_backend/aws.py +++ b/ai_ta_backend/aws.py @@ -51,4 +51,7 @@ def upload(myfile): pool.map(upload, filenames) print("All data files uploaded to S3 successfully.") - return s3_paths \ No newline at end of file + return s3_paths + +if __name__ == '__main__': + pass diff --git a/ai_ta_backend/extreme_context_stuffing.py b/ai_ta_backend/extreme_context_stuffing.py index b92f2fd0..ed133a6a 100644 --- a/ai_ta_backend/extreme_context_stuffing.py +++ b/ai_ta_backend/extreme_context_stuffing.py @@ -274,19 +274,9 @@ def extract_context_from_results(results: List[Any]) -> List[str]: assistant_contents.append(choice['message']['content']) total_prompt_tokens += item['usage']['prompt_tokens'] total_completion_tokens += item['usage']['completion_tokens'] + # Note: I don't think the prompt_tokens or completion_tokens is working quite right... - # print("Assistant Contents:", assistant_contents) - print("Total Prompt Tokens:", total_prompt_tokens) - print("Total Completion Tokens:", total_completion_tokens) - turbo_total_cost = (total_prompt_tokens * 0.0015) + (total_completion_tokens * 0.002) - print("Total cost (3.5-turbo):", (total_prompt_tokens * 0.0015), " + Completions: ", (total_completion_tokens * 0.002), " = ", - turbo_total_cost) - - gpt4_total_cost = (total_prompt_tokens * 0.03) + (total_completion_tokens * 0.06) - print("Hypothetical cost for GPT-4:", (total_prompt_tokens * 0.03), " + Completions: ", (total_completion_tokens * 0.06), " = ", - gpt4_total_cost) - print("GPT-4 cost premium: ", (gpt4_total_cost / max(turbo_total_cost, 1)), "x") - return assistant_contents #, total_prompt_tokens, total_completion_tokens + return assistant_contents # dataclasses @@ -442,6 +432,8 @@ def task_id_generator_function(): yield task_id task_id += 1 +if __name__ == '__main__': + pass # run script # if __name__ == "__main__": diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py index 01d6949b..f20aff23 100644 --- a/ai_ta_backend/main.py +++ b/ai_ta_backend/main.py @@ -1,16 +1,17 @@ +import gc import os import time -from typing import Any, List, Union +from typing import List from dotenv import load_dotenv -from flask import Flask, abort, jsonify, request +from flask import Flask, Response, abort, jsonify, request from flask_cors import CORS +from flask_executor import Executor from sqlalchemy import JSON +from ai_ta_backend.nomic_logging import get_nomic_map, log_query_to_nomic from ai_ta_backend.vector_database import Ingest from ai_ta_backend.web_scrape import main_crawler, mit_course_download -from ai_ta_backend.nomic_logging import log_query_to_nomic, get_nomic_map, create_nomic_map -from flask_executor import Executor app = Flask(__name__) CORS(app) @@ -21,7 +22,7 @@ load_dotenv() @app.route('/') -def index() -> JSON: +def index() -> Response: """_summary_ Args: @@ -30,11 +31,13 @@ def index() -> JSON: Returns: JSON: _description_ """ - return jsonify({"Choo Choo": "Welcome to your Flask app 🚅"}) + response = jsonify({"Choo Choo": "Welcome to your Flask app 🚅"}) + response.headers.add('Access-Control-Allow-Origin', '*') + return response @app.route('/coursera', methods=['GET']) -def coursera() -> JSON: +def coursera() -> Response: try: course_name: str = request.args.get('course_name') # type: ignore coursera_course_name: str = request.args.get('coursera_course_name') # type: ignore @@ -43,29 +46,37 @@ def coursera() -> JSON: ingester = Ingest() results = ingester.ingest_coursera(coursera_course_name, course_name) # type: ignore + del ingester + response = jsonify(results) response.headers.add('Access-Control-Allow-Origin', '*') return response @app.route('/github', methods=['GET']) -def github() -> JSON: - try: - course_name: str = request.args.get('course_name') # type: ignore - github_url: str = request.args.get('github_url') # type: ignore - except Exception as e: - print(f"No course name provided: {e}") +def github() -> Response: + course_name: str = request.args.get('course_name', default='', type=str) + github_url: str = request.args.get('github_url', default='', type=str) + + if course_name == '' or github_url == '': + # proper web error "400 Bad request" + abort( + 400, + description= + f"Missing one or more required parameters: 'course_name' and 's3_path' must be provided. Course name: `{course_name}`, S3 path: `{github_url}`" + ) + - print("In /github") ingester = Ingest() results = ingester.ingest_github(github_url, course_name) + del ingester response = jsonify(results) response.headers.add('Access-Control-Allow-Origin', '*') return response @app.route('/delete-entire-course', methods=['GET']) -def delete_entire_course(): +def delete_entire_course() -> Response: try: course_name: str = request.args.get('course_name') # type: ignore # coursera_course_name: str = request.args.get('coursera_course_name') # type: ignore @@ -74,13 +85,15 @@ def delete_entire_course(): ingester = Ingest() results = ingester.delete_entire_course(course_name) # type: ignore + del ingester + response = jsonify(results) response.headers.add('Access-Control-Allow-Origin', '*') return response @app.route('/getTopContexts', methods=['GET']) -def getTopContexts(): +def getTopContexts() -> Response: """Get most relevant contexts for a given search query. Return value @@ -129,6 +142,7 @@ def getTopContexts(): ingester = Ingest() found_documents = ingester.getTopContexts(search_query, course_name, token_limit) + del ingester # background execution of tasks!! executor.submit(log_query_to_nomic, course_name, search_query) @@ -140,7 +154,7 @@ def getTopContexts(): @app.route('/get_stuffed_prompt', methods=['GET']) -def get_stuffed_prompt(): +def get_stuffed_prompt() -> Response: """Get most relevant contexts for a given search query. ## GET arguments @@ -154,13 +168,16 @@ def get_stuffed_prompt(): String """ - # todo: best way to handle optional arguments? - try: - course_name: str = request.args.get('course_name') - search_query: str = request.args.get('search_query') - token_limit: int = request.args.get('token_limit') - except Exception as e: - print("No course name provided.") + course_name: str = request.args.get('course_name', default='', type=str) + search_query: str = request.args.get('search_query', default='', type=str) + token_limit: int = request.args.get('token_limit', default=-1, type=int) + if course_name == '' or search_query == '' or token_limit == -1: + # proper web error "400 Bad request" + abort( + 400, + description= + f"Missing one or more required parameters: 'course_name', 'search_query', and 'token_limit' must be provided. Course name: `{course_name}`, Search query: `{search_query}`, Token limit: `{token_limit}`" + ) print("In /getTopContexts: ", search_query) if search_query is None: @@ -172,6 +189,7 @@ def get_stuffed_prompt(): ingester = Ingest() prompt = ingester.get_stuffed_prompt(search_query, course_name, token_limit) + del ingester response = jsonify(prompt) response.headers.add('Access-Control-Allow-Origin', '*') @@ -179,7 +197,7 @@ def get_stuffed_prompt(): @app.route('/ingest', methods=['GET']) -def ingest(): +def ingest() -> Response: """Recursively ingests anything from S3 filepath and below. Pass a s3_paths filepath (not URL) into our S3 bucket. @@ -191,13 +209,20 @@ def ingest(): Returns: str: Success or Failure message. Failure message if any failures. TODO: email on failure. """ + s3_paths: List[str] | str = request.args.get('s3_paths', default='') + course_name: List[str] | str = request.args.get('course_name', default='') - print("In /ingest") + if course_name == '' or s3_paths == '': + # proper web error "400 Bad request" + abort( + 400, + description= + f"Missing one or more required parameters: 'course_name' and 's3_path' must be provided. Course name: `{course_name}`, S3 path: `{s3_paths}`" + ) ingester = Ingest() - s3_paths: List[str] | str = request.args.get('s3_paths') - course_name: List[str] | str = request.args.get('course_name') success_fail_dict = ingester.bulk_ingest(s3_paths, course_name) + del ingester response = jsonify(success_fail_dict) response.headers.add('Access-Control-Allow-Origin', '*') @@ -205,7 +230,7 @@ def ingest(): @app.route('/getContextStuffedPrompt', methods=['GET']) -def getContextStuffedPrompt(): +def getContextStuffedPrompt() -> Response: """ Get a stuffed prompt for a given user question and course name. Args : @@ -217,32 +242,48 @@ def getContextStuffedPrompt(): print("In /getContextStuffedPrompt") ingester = Ingest() - search_query: str = str(request.args.get('search_query')) # type: ignore - course_name: str = str(request.args.get('course_name')) # type: ignore - top_n: int = int(request.args.get('top_n')) # type: ignore - top_k_to_search: int = int(request.args.get('top_k_to_search')) # type: ignore + search_query: str = request.args.get('search_query', default='', type=str) + course_name: str = request.args.get('course_name', default='', type=str) + top_n: int = request.args.get('top_n', default=-1, type=int) + top_k_to_search: int = request.args.get('top_k_to_search', default=-1, type=int) + + if search_query == '' or course_name == '' or top_n == -1 or top_k_to_search == -1: + # proper web error "400 Bad request" + abort( + 400, + description= + f"Missing one or more required parameters: 'search_query', 'course_name', 'top_n', and 'top_k_to_search' must be provided. Search query: `{search_query}`, Course name: `{course_name}`, Top N: `{top_n}`, Top K to search: `{top_k_to_search}`" + ) start_time = time.monotonic() stuffed_prompt = ingester.get_context_stuffed_prompt(search_query, course_name, top_n, top_k_to_search) print(f"⏰ Runtime of EXTREME prompt stuffing: {(time.monotonic() - start_time):.2f} seconds") - response = jsonify({"prompt": stuffed_prompt}) + del ingester + response = jsonify({"prompt": stuffed_prompt}) response.headers.add('Access-Control-Allow-Origin', '*') return response @app.route('/getAll', methods=['GET']) -def getAll(): +def getAll() -> Response: """Get all course materials based on the course_name """ + course_name: List[str] | str = request.args.get('course_name', default='', type=str) - print("In /getAll") + if course_name == '': + # proper web error "400 Bad request" + abort( + 400, + description= + f"Missing the one required parameter: 'course_name' must be provided. Course name: `{course_name}`" + ) ingester = Ingest() - course_name: List[str] | str = request.args.get('course_name') distinct_dicts = ingester.getAll(course_name) - response = jsonify({"distinct_files": distinct_dicts}) + del ingester + response = jsonify({"distinct_files": distinct_dicts}) response.headers.add('Access-Control-Allow-Origin', '*') return response @@ -267,11 +308,11 @@ def delete(): start_time = time.monotonic() ingester = Ingest() - # background execution of tasks!! executor.submit(ingester.delete_data, s3_path, course_name) print(f"From {course_name}, deleted file: {s3_path}") print(f"⏰ Runtime of FULL delete func: {(time.monotonic() - start_time):.2f} seconds") + del ingester # we need instant return. Delets are "best effort" assume always successful... sigh :( response = jsonify({"outcome": 'success'}) @@ -279,13 +320,21 @@ def delete(): return response @app.route('/web-scrape', methods=['GET']) -def scrape(): - url: str = request.args.get('url') - max_urls: int = request.args.get('max_urls') - max_depth: int = request.args.get('max_depth') - timeout: int = request.args.get('timeout') - course_name: str = request.args.get('course_name') - stay_on_baseurl: bool = request.args.get('stay_on_baseurl') +def scrape() -> Response: + url: str = request.args.get('url', default='', type=str) + max_urls: int = request.args.get('max_urls', default=-1, type=int) + max_depth: int = request.args.get('max_depth', default=-1, type=int) + timeout: int = request.args.get('timeout', default=-1, type=int) + course_name: str = request.args.get('course_name', default='', type=str) + stay_on_baseurl: bool | None = request.args.get('stay_on_baseurl', type=bool) + + if url == '' or max_urls == -1 or max_depth == -1 or timeout == -1 or course_name == '' or stay_on_baseurl is None: + # proper web error "400 Bad request" + abort( + 400, + description= + f"Missing one or more required parameters: 'url', 'max_urls', 'max_depth', 'timeout', 'course_name', and 'stay_on_baseurl' must be provided. url: `{url}`, max_urls: `{max_urls}`, max_depth: `{max_depth}`, timeout: `{timeout}`, course_name: `{course_name}`, stay_on_baseurl: `{stay_on_baseurl}`" + ) # print all input params print(f"Web scrape!") @@ -298,14 +347,25 @@ def scrape(): response = jsonify(success_fail_dict) response.headers.add('Access-Control-Allow-Origin', '*') + gc.collect() # manually invoke garbage collection, try to reduce memory on Railway $$$ return response @app.route('/mit-download', methods=['GET']) -def mit_download_course(): - url: str = request.args.get('url') - course_name: str = request.args.get('course_name') - local_dir: str = request.args.get('local_dir') +def mit_download_course() -> Response: + """ Web scraper built for + """ + url: str = request.args.get('url', default='', type=str) + course_name: str = request.args.get('course_name', default='', type=str) + local_dir: str = request.args.get('local_dir', default='', type=str) + + if url == '' or course_name == '' or local_dir == '': + # proper web error "400 Bad request" + abort( + 400, + description= + f"Missing one or more required parameters: 'url', 'course_name', and 'local_dir' must be provided. url: `{url}`, course_name: `{course_name}`, local_dir: `{local_dir}`" + ) success_fail = mit_course_download(url, course_name, local_dir) @@ -334,4 +394,4 @@ def nomic_map(): if __name__ == '__main__': - app.run(debug=True, port=os.getenv("PORT", default=8000)) + app.run(debug=True, port=int(os.getenv("PORT", default=8000))) diff --git a/ai_ta_backend/nomic_logging.py b/ai_ta_backend/nomic_logging.py index bcef5fe9..12681801 100644 --- a/ai_ta_backend/nomic_logging.py +++ b/ai_ta_backend/nomic_logging.py @@ -124,3 +124,6 @@ def create_nomic_map(course_name: str, log_embeddings: np.ndarray, log_data: lis name=project_name, colorable_fields=['query']) project.create_index(index_name, build_topic_model=True) return f"Successfully created Nomic map for {course_name}" + +if __name__ == '__main__': + pass diff --git a/ai_ta_backend/utils_tokenization.py b/ai_ta_backend/utils_tokenization.py index 596dcb92..096e2bb6 100644 --- a/ai_ta_backend/utils_tokenization.py +++ b/ai_ta_backend/utils_tokenization.py @@ -112,6 +112,8 @@ def analyze_conversations(supabase_client: Any = None): content = message['content'] # If the message is from the user, it's a prompt + # TODO: Fix these + # WARNING: Fix these error messages... they are the sign of a logic bug. if role == 'user': num_tokens, cost = count_tokens_and_cost(prompt=content, openai_model_name=model_name) total_prompt_cost += cost @@ -124,6 +126,9 @@ def analyze_conversations(supabase_client: Any = None): print(f'Assistant Completion: {content}\nTokens: {num_tokens_completion}, cost: {cost_completion}') return total_convos, total_messages, total_prompt_cost, total_completion_cost +if __name__ == '__main__': + pass + # if __name__ == '__main__': # print('starting main') # total_convos, total_messages, total_prompt_cost, total_completion_cost = analyze_conversations() diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py index 8e70b3de..e47f777c 100644 --- a/ai_ta_backend/vector_database.py +++ b/ai_ta_backend/vector_database.py @@ -2,32 +2,25 @@ import inspect import logging import mimetypes -# import json import os import shutil import subprocess import time import traceback -import uuid # Literal +import uuid from pathlib import Path -from tempfile import NamedTemporaryFile # TemporaryFile +from tempfile import NamedTemporaryFile from typing import Any, Dict, List, Optional, Tuple, Union import boto3 -# import requests import fitz -import numpy as np import openai -import requests import supabase from bs4 import BeautifulSoup - -from langchain.document_loaders import (Docx2txtLoader, PythonLoader, - SRTLoader, - UnstructuredPowerPointLoader, TextLoader, GitLoader) - -from git import Repo - +from git.repo import Repo +from langchain.document_loaders import (Docx2txtLoader, GitLoader, + PythonLoader, SRTLoader, TextLoader, + UnstructuredPowerPointLoader) from langchain.embeddings.openai import OpenAIEmbeddings from langchain.schema import Document from langchain.text_splitter import RecursiveCharacterTextSplitter @@ -40,16 +33,6 @@ from ai_ta_backend.extreme_context_stuffing import OpenAIAPIProcessor from ai_ta_backend.utils_tokenization import count_tokens_and_cost -# from arize.api import Client -# from arize.pandas.embeddings import EmbeddingGenerator, UseCases -# from arize.utils import ModelTypes -# from arize.utils.ModelTypes import GENERATIVE_LLM -# # from arize.utils.types import (Embedding, EmbeddingColumnNames, Environments, -# # Metrics, ModelTypes, Schema) - - - - class Ingest(): """ @@ -70,7 +53,7 @@ def __init__(self): self.vectorstore = Qdrant( client=self.qdrant_client, - collection_name=os.getenv('QDRANT_COLLECTION_NAME'), # type: ignore + collection_name=os.environ['QDRANT_COLLECTION_NAME'], embeddings=OpenAIEmbeddings()) # type: ignore # S3 @@ -82,8 +65,8 @@ def __init__(self): # Create a Supabase client self.supabase_client = supabase.create_client( # type: ignore - supabase_url=os.getenv('SUPABASE_URL'), # type: ignore - supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore + supabase_url=os.environ['SUPABASE_URL'], + supabase_key=os.environ['SUPABASE_API_KEY']) return None def get_context_stuffed_prompt(self, user_question: str, course_name: str, top_n: int, top_k_to_search: int) -> str: @@ -149,9 +132,9 @@ def get_context_stuffed_prompt(self, user_question: str, course_name: str, top_n # no useful text, it replied with a summary of "None" continue if text is not None: - if "pagenumber" not in results[i][-1].keys(): - results[i][-1]['pagenumber'] = results[i][-1].get('pagenumber_or_timestamp') - num_tokens, prompt_cost = count_tokens_and_cost(text) + if "pagenumber" not in results[i][-1].keys(): # type: ignore + results[i][-1]['pagenumber'] = results[i][-1].get('pagenumber_or_timestamp') # type: ignore + num_tokens, prompt_cost = count_tokens_and_cost(text) # type: ignore if token_counter + num_tokens > max_tokens: print(f"Total tokens yet in loop {i} is {num_tokens}") break # Stop building the string if it exceeds the maximum number of tokens @@ -230,8 +213,10 @@ def bulk_ingest(self, s3_paths: Union[List[str], str], course_name: str, **kwarg # TODO: no need to download, just guess_type against the s3_path... with NamedTemporaryFile(suffix=ext) as tmpfile: self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile) - mime_type = mimetypes.guess_type(tmpfile.name)[0] - category, subcategory = mime_type.split('/') + mime_type = str(mimetypes.guess_type(tmpfile.name)[0]) + category, subcategory = mime_type.split('/') + + # TODO: if mime-type is text, we should handle that via .txt ingest if s3_path.endswith('.html'): ret = self._ingest_html(s3_path, course_name, kwargs=kwargs) @@ -294,8 +279,6 @@ def bulk_ingest(self, s3_paths: Union[List[str], str], course_name: str, **kwarg def _ingest_single_py(self, s3_path: str, course_name: str): try: - print("in ingest_py") - file_name = s3_path.split("/")[-1] file_path = "media/" + file_name # download from s3 to local folder for ingest @@ -743,14 +726,12 @@ def ingest_github(self, github_url: str, course_name: str) -> str: Returns: _type_: Success or error message. """ - print("in ingest_github") - try: repo_path = "media/cloned_repo" repo = Repo.clone_from(github_url, to_path=repo_path, depth=1, clone_submodules=False) branch = repo.head.reference - loader = GitLoader(repo_path="media/cloned_repo", branch=branch) + loader = GitLoader(repo_path="media/cloned_repo", branch=str(branch)) data = loader.load() shutil.rmtree("media/cloned_repo") # create metadata for each file in data @@ -818,7 +799,7 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]): )) self.qdrant_client.upsert( - collection_name=os.getenv('QDRANT_COLLECTION_NAME'), # type: ignore + collection_name=os.environ['QDRANT_COLLECTION_NAME'], # type: ignore points=vectors # type: ignore ) ### Supabase SQL ### @@ -872,7 +853,7 @@ def delete_entire_course(self, course_name: str): # Qdrant "points" look like this: Record(id='000295ca-bd28-ac4a-6f8d-c245f7377f90', payload={'metadata': {'course_name': 'zotero-extreme', 'pagenumber_or_timestamp': 15, 'readable_filename': 'Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf', 's3_path': 'courses/zotero-extreme/Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf'}, 'page_content': '18 \nDunlosky et al.\n3.3 Effects in representative educational contexts. Sev-\neral of the large summarization-training studies have been \nconducted in regular classrooms, indicating the feasibility of \ndoing so. For example, the study by A. King (1992) took place \nin the context of a remedial study-skills course for undergrad-\nuates, and the study by Rinehart et al. (1986) took place in \nsixth-grade classrooms, with the instruction led by students \nregular teachers. In these and other cases, students benefited \nfrom the classroom training. We suspect it may actually be \nmore feasible to conduct these kinds of training studies in \nclassrooms than in the laboratory, given the nature of the time \ncommitment for students. Even some of the studies that did \nnot involve training were conducted outside the laboratory; for \nexample, in the Bednall and Kehoe (2011) study on learning \nabout logical fallacies from Web modules (see data in Table 3), \nthe modules were actually completed as a homework assign-\nment. Overall, benefits can be observed in classroom settings; \nthe real constraint is whether students have the skill to suc-\ncessfully summarize, not whether summarization occurs in the \nlab or the classroom.\n3.4 Issues for implementation. Summarization would be \nfeasible for undergraduates or other learners who already \nknow how to summarize. For these students, summarization \nwould constitute an easy-to-implement technique that would \nnot take a lot of time to complete or understand. The only \nconcern would be whether these students might be better \nserved by some other strategy, but certainly summarization \nwould be better than the study strategies students typically \nfavor, such as highlighting and rereading (as we discuss in the \nsections on those strategies below). A trickier issue would \nconcern implementing the strategy with students who are not \nskilled summarizers. Relatively intensive training programs \nare required for middle school students or learners with learn-\ning disabilities to benefit from summarization. Such efforts \nare not misplaced; training has been shown to benefit perfor-\nmance on a range of measures, although the training proce-\ndures do raise practical issues (e.g., Gajria & Salvia, 1992: \n6.511 hours of training used for sixth through ninth graders \nwith learning disabilities; Malone & Mastropieri, 1991: 2 \ndays of training used for middle school students with learning \ndisabilities; Rinehart et al., 1986: 4550 minutes of instruc-\ntion per day for 5 days used for sixth graders). Of course, \ninstructors may want students to summarize material because \nsummarization itself is a goal, not because they plan to use \nsummarization as a study technique, and that goal may merit \nthe efforts of training.\nHowever, if the goal is to use summarization as a study \ntechnique, our question is whether training students would be \nworth the amount of time it would take, both in terms of the \ntime required on the part of the instructor and in terms of the \ntime taken away from students other activities. For instance, \nin terms of efficacy, summarization tends to fall in the middle \nof the pack when compared to other techniques. In direct \ncomparisons, it was sometimes more useful than rereading \n(Rewey, Dansereau, & Peel, 1991) and was as useful as note-\ntaking (e.g., Bretzing & Kulhavy, 1979) but was less powerful \nthan generating explanations (e.g., Bednall & Kehoe, 2011) or \nself-questioning (A. King, 1992).\n3.5 Summarization: Overall assessment. On the basis of the \navailable evidence, we rate summarization as low utility. It can \nbe an effective learning strategy for learners who are already \nskilled at summarizing; however, many learners (including \nchildren, high school students, and even some undergraduates) \nwill require extensive training, which makes this strategy less \nfeasible. Our enthusiasm is further dampened by mixed find-\nings regarding which tasks summarization actually helps. \nAlthough summarization has been examined with a wide \nrange of text materials, many researchers have pointed to fac-\ntors of these texts that seem likely to moderate the effects of \nsummarization (e.g'}, vector=None), print("deleting from qdrant") self.qdrant_client.delete( - collection_name=os.getenv('QDRANT_COLLECTION_NAME'), + collection_name=os.environ['QDRANT_COLLECTION_NAME'], points_selector=models.Filter(must=[ models.FieldCondition( key="metadata.course_name", @@ -888,7 +869,7 @@ def delete_entire_course(self, course_name: str): try: # Delete from Supabase print("deleting from supabase") - response = self.supabase_client.from_(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).delete().eq('course_name', course_name).execute() + response = self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq('course_name', course_name).execute() print("supabase response: ", response) return "Success" except Exception as e: @@ -910,7 +891,7 @@ def delete_data(self, s3_path: str, course_name: str): # docs for nested keys: https://qdrant.tech/documentation/concepts/filtering/#nested-key # Qdrant "points" look like this: Record(id='000295ca-bd28-ac4a-6f8d-c245f7377f90', payload={'metadata': {'course_name': 'zotero-extreme', 'pagenumber_or_timestamp': 15, 'readable_filename': 'Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf', 's3_path': 'courses/zotero-extreme/Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf'}, 'page_content': '18 \nDunlosky et al.\n3.3 Effects in representative educational contexts. Sev-\neral of the large summarization-training studies have been \nconducted in regular classrooms, indicating the feasibility of \ndoing so. For example, the study by A. King (1992) took place \nin the context of a remedial study-skills course for undergrad-\nuates, and the study by Rinehart et al. (1986) took place in \nsixth-grade classrooms, with the instruction led by students \nregular teachers. In these and other cases, students benefited \nfrom the classroom training. We suspect it may actually be \nmore feasible to conduct these kinds of training studies in \nclassrooms than in the laboratory, given the nature of the time \ncommitment for students. Even some of the studies that did \nnot involve training were conducted outside the laboratory; for \nexample, in the Bednall and Kehoe (2011) study on learning \nabout logical fallacies from Web modules (see data in Table 3), \nthe modules were actually completed as a homework assign-\nment. Overall, benefits can be observed in classroom settings; \nthe real constraint is whether students have the skill to suc-\ncessfully summarize, not whether summarization occurs in the \nlab or the classroom.\n3.4 Issues for implementation. Summarization would be \nfeasible for undergraduates or other learners who already \nknow how to summarize. For these students, summarization \nwould constitute an easy-to-implement technique that would \nnot take a lot of time to complete or understand. The only \nconcern would be whether these students might be better \nserved by some other strategy, but certainly summarization \nwould be better than the study strategies students typically \nfavor, such as highlighting and rereading (as we discuss in the \nsections on those strategies below). A trickier issue would \nconcern implementing the strategy with students who are not \nskilled summarizers. Relatively intensive training programs \nare required for middle school students or learners with learn-\ning disabilities to benefit from summarization. Such efforts \nare not misplaced; training has been shown to benefit perfor-\nmance on a range of measures, although the training proce-\ndures do raise practical issues (e.g., Gajria & Salvia, 1992: \n6.511 hours of training used for sixth through ninth graders \nwith learning disabilities; Malone & Mastropieri, 1991: 2 \ndays of training used for middle school students with learning \ndisabilities; Rinehart et al., 1986: 4550 minutes of instruc-\ntion per day for 5 days used for sixth graders). Of course, \ninstructors may want students to summarize material because \nsummarization itself is a goal, not because they plan to use \nsummarization as a study technique, and that goal may merit \nthe efforts of training.\nHowever, if the goal is to use summarization as a study \ntechnique, our question is whether training students would be \nworth the amount of time it would take, both in terms of the \ntime required on the part of the instructor and in terms of the \ntime taken away from students other activities. For instance, \nin terms of efficacy, summarization tends to fall in the middle \nof the pack when compared to other techniques. In direct \ncomparisons, it was sometimes more useful than rereading \n(Rewey, Dansereau, & Peel, 1991) and was as useful as note-\ntaking (e.g., Bretzing & Kulhavy, 1979) but was less powerful \nthan generating explanations (e.g., Bednall & Kehoe, 2011) or \nself-questioning (A. King, 1992).\n3.5 Summarization: Overall assessment. On the basis of the \navailable evidence, we rate summarization as low utility. It can \nbe an effective learning strategy for learners who are already \nskilled at summarizing; however, many learners (including \nchildren, high school students, and even some undergraduates) \nwill require extensive training, which makes this strategy less \nfeasible. Our enthusiasm is further dampened by mixed find-\nings regarding which tasks summarization actually helps. \nAlthough summarization has been examined with a wide \nrange of text materials, many researchers have pointed to fac-\ntors of these texts that seem likely to moderate the effects of \nsummarization (e.g'}, vector=None), self.qdrant_client.delete( - collection_name=os.getenv('QDRANT_COLLECTION_NAME'), + collection_name=os.environ['QDRANT_COLLECTION_NAME'], points_selector=models.Filter(must=[ models.FieldCondition( key="metadata.s3_path", @@ -920,7 +901,7 @@ def delete_data(self, s3_path: str, course_name: str): ) # Delete from Supabase - response = self.supabase_client.from_(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).delete().eq('s3_path', s3_path).eq( + response = self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq('s3_path', s3_path).eq( 'course_name', course_name).execute() return "Success" except Exception as e: @@ -940,7 +921,7 @@ def getAll( """ response = self.supabase_client.table( - os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).select('course_name, s3_path, readable_filename, url, base_url').eq( # type: ignore + os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select('course_name, s3_path, readable_filename, url, base_url').eq( 'course_name', course_name).execute() data = response.data @@ -957,7 +938,7 @@ def getAll( def vector_search(self, search_query, course_name): top_n = 80 - o = OpenAIEmbeddings() + o = OpenAIEmbeddings() # type: ignore user_query_embedding = o.embed_query(search_query) myfilter = models.Filter( must=[ @@ -978,12 +959,12 @@ def vector_search(self, search_query, course_name): print("search_results", search_results) found_docs: list[Document] = [] for d in search_results: - metadata = d.payload.get('metadata') - if "pagenumber" not in metadata.keys() and "pagenumber_or_timestamp" in metadata.keys(): + metadata = d.payload.get('metadata') # type: ignore + if "pagenumber" not in metadata.keys() and "pagenumber_or_timestamp" in metadata.keys(): # type: ignore # aiding in the database migration... - metadata["pagenumber"] = metadata["pagenumber_or_timestamp"] + metadata["pagenumber"] = metadata["pagenumber_or_timestamp"] # type: ignore - found_docs.append(Document(page_content=d.payload.get('page_content'), metadata=metadata)) + found_docs.append(Document(page_content=d.payload.get('page_content'), metadata=metadata)) # type: ignore # found_docs: list[Document] = [Document(page_content=str(d.payload.get('page_content')), metadata=d.payload.get('metadata')) for d in search_results] print("found_docs", found_docs) @@ -1008,13 +989,13 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int = pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" # count tokens at start and end, then also count each context. - token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + search_query) + token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + search_query) # type: ignore valid_docs = [] num_tokens = 0 for doc in found_docs: doc_string = f"Document: {doc.metadata['readable_filename']}{', page: ' + str(doc.metadata['pagenumber']) if doc.metadata['pagenumber'] else ''}\n{str(doc.page_content)}\n" - num_tokens, prompt_cost = count_tokens_and_cost(doc_string) + num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, max_tokens: {token_limit}") if token_counter + num_tokens <= token_limit: @@ -1044,8 +1025,8 @@ def get_stuffed_prompt(self, search_query: str, course_name: str, token_limit: i try: top_n = 150 start_time_overall = time.monotonic() - o = OpenAIEmbeddings() - user_query_embedding = o.embed_documents(search_query)[0] + o = OpenAIEmbeddings() # type: ignore + user_query_embedding = o.embed_documents(search_query)[0] # type: ignore myfilter = models.Filter( must=[ models.FieldCondition( @@ -1068,19 +1049,19 @@ def get_stuffed_prompt(self, search_query: str, course_name: str, token_limit: i pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" # count tokens at start and end, then also count each context. - token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + search_query) + token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + search_query) # type: ignore valid_docs = [] for d in found_docs: - if "pagenumber" not in d.payload["metadata"].keys(): - d.payload["metadata"]["pagenumber"] = d.payload["metadata"]["pagenumber_or_timestamp"] - doc_string = f"---\nDocument: {d.payload['metadata']['readable_filename']}{', page: ' + str(d.payload['metadata']['pagenumber']) if d.payload['metadata']['pagenumber'] else ''}\n{d.payload.get('page_content')}\n" - num_tokens, prompt_cost = count_tokens_and_cost(doc_string) + if "pagenumber" not in d.payload["metadata"].keys(): # type: ignore + d.payload["metadata"]["pagenumber"] = d.payload["metadata"]["pagenumber_or_timestamp"] # type: ignore + doc_string = f"---\nDocument: {d.payload['metadata']['readable_filename']}{', page: ' + str(d.payload['metadata']['pagenumber']) if d.payload['metadata']['pagenumber'] else ''}\n{d.payload.get('page_content')}\n" # type: ignore + num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore - print(f"Page: {d.payload.get('page_content')[:100]}...") + print(f"Page: {d.payload.get('page_content')[:100]}...") # type: ignore print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, token_limit: {token_limit}") if token_counter + num_tokens <= token_limit: token_counter += num_tokens - valid_docs.append(Document(page_content=d.payload.get('page_content'), metadata=d.payload.get('metadata'))) + valid_docs.append(Document(page_content=d.payload.get('page_content'), metadata=d.payload.get('metadata'))) # type: ignore else: continue print("running continue") @@ -1094,7 +1075,7 @@ def get_stuffed_prompt(self, search_query: str, course_name: str, token_limit: i # Create the stuffedPrompt stuffedPrompt = (pre_prompt + context_text + '\n\nNow please respond to my query: ' + search_query) - TOTAL_num_tokens, prompt_cost = count_tokens_and_cost(stuffedPrompt, openai_model_name='gpt-4') + TOTAL_num_tokens, prompt_cost = count_tokens_and_cost(stuffedPrompt, openai_model_name='gpt-4') # type: ignore print(f"Total tokens: {TOTAL_num_tokens}, prompt_cost: {prompt_cost}") print("total docs: ", len(found_docs)) print("num docs used: ", len(valid_docs)) @@ -1137,3 +1118,6 @@ def format_for_json(self, found_docs: List[Document]) -> List[Dict]: } for doc in found_docs] return contexts + +if __name__ == '__main__': + pass diff --git a/ai_ta_backend/web_scrape.py b/ai_ta_backend/web_scrape.py index 501b35aa..f8422148 100644 --- a/ai_ta_backend/web_scrape.py +++ b/ai_ta_backend/web_scrape.py @@ -272,6 +272,16 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url else: print("Exceeded Max URLS, found:", len(url_contents), "out of", amount) print(len(url_contents), "urls found") + + # Free up memory + del url_contents[:] + del urls[:] + if _invalid_urls is not None: + del _invalid_urls[:] + if _existing_urls is not None: + del _existing_urls[:] + # gc.collect() + return url_contents def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, timeout:int=1, stay_on_baseurl:bool=False): @@ -309,6 +319,7 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti print("Begin Ingesting GitHub page") results = ingester.ingest_github(url, course_name) print("Finished ingesting GitHub page") + del ingester return results else: print("Gathering existing urls from Supabase") @@ -316,6 +327,7 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti supabase_url=os.getenv('SUPABASE_URL'), # type: ignore supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore urls = supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).select('course_name, url, contexts').eq('course_name', course_name).execute() + del supabase_client if urls.data == []: existing_urls = None else: @@ -364,10 +376,8 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti else: path_name.append(value) counter += 1 - print("Cleaned title names", path_name) - # Upload each html to S3 print("Uploading files to S3") paths = [] @@ -392,38 +402,12 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti counter += 1 else: print("No", key[2] ,"to upload", key[1]) - # if ".pdf" in key[0]: - # with NamedTemporaryFile(suffix=".pdf") as temp_pdf: - # if key[1] != "" or key[1] != None: - # temp_pdf.write(key[1]) - # temp_pdf.seek(0) - # s3_upload_path = "courses/"+ course_name + "/" + path_name[i] + ".pdf" - # paths.append(s3_upload_path) - # with open(temp_pdf.name, 'rb') as f: - # print("Uploading PDF to S3") - # s3_client.upload_fileobj(f, os.getenv('S3_BUCKET_NAME'), s3_upload_path) - # ingester.bulk_ingest(s3_upload_path, course_name=course_name, url=key[0], base_url=url) - # counter += 1 - # else: - # print("No PDF to upload", key[1]) - # else: - # with NamedTemporaryFile(suffix=".html") as temp_html: - # if key[1] != "" or key[1] != None: - # temp_html.write(key[1].encode('utf-8')) - # temp_html.seek(0) - # s3_upload_path = "courses/"+ course_name + "/" + path_name[i] + ".html" - # paths.append(s3_upload_path) - # with open(temp_html.name, 'rb') as f: - # print("Uploading html to S3") - # s3_client.upload_fileobj(f, os.getenv('S3_BUCKET_NAME'), s3_upload_path) - # ingester.bulk_ingest(s3_upload_path, course_name=course_name, url=key[0], base_url=url) - # counter += 1 - # else: - # print("No html to upload", key[1]) except Exception as e: print("Error in upload:", e) + finally: + del ingester - print("Successfully uploaded", counter, "files to S3") + print(f"Successfully uploaded files to s3: {counter}") print("Finished /web-scrape") # Download an MIT course using its url @@ -467,6 +451,9 @@ def mit_course_download(url:str, course_name:str, local_dir:str): shutil.move(zip_file, local_dir) shutil.rmtree(local_dir) + del ingester print("Finished Ingest") return success_fail +if __name__ == '__main__': + pass From f8c3bc21b0d37d592cf3bdf0501f979e9d0b9da0 Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Tue, 5 Sep 2023 13:03:21 -0700 Subject: [PATCH 4/9] adding defaults for web scraping --- ai_ta_backend/main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py index f20aff23..e32b8b7d 100644 --- a/ai_ta_backend/main.py +++ b/ai_ta_backend/main.py @@ -322,11 +322,11 @@ def delete(): @app.route('/web-scrape', methods=['GET']) def scrape() -> Response: url: str = request.args.get('url', default='', type=str) - max_urls: int = request.args.get('max_urls', default=-1, type=int) - max_depth: int = request.args.get('max_depth', default=-1, type=int) - timeout: int = request.args.get('timeout', default=-1, type=int) course_name: str = request.args.get('course_name', default='', type=str) - stay_on_baseurl: bool | None = request.args.get('stay_on_baseurl', type=bool) + max_urls: int = request.args.get('max_urls', default=100, type=int) + max_depth: int = request.args.get('max_depth', default=2, type=int) + timeout: int = request.args.get('timeout', default=3, type=int) + stay_on_baseurl: bool | None = request.args.get('`stay_on_baseurl`', default=True, type=bool) if url == '' or max_urls == -1 or max_depth == -1 or timeout == -1 or course_name == '' or stay_on_baseurl is None: # proper web error "400 Bad request" From 805936bcf1b7cd985dbd2713d4385c55730bacdf Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Tue, 5 Sep 2023 14:01:15 -0700 Subject: [PATCH 5/9] increasing timeout from 30 sec to 30 min, web scrape takes a while sometimes --- run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run.sh b/run.sh index 3edfdcf1..a7c30f35 100755 --- a/run.sh +++ b/run.sh @@ -3,4 +3,4 @@ # Docs https://docs.gunicorn.org/en/stable/settings.html#workers export PYTHONPATH=$PYTHONPATH:$(pwd)/ai_ta_backend -exec gunicorn --workers=2 --threads=16 --worker-class=gthread ai_ta_backend.main:app --timeout 60 --max-requests 2 \ No newline at end of file +exec gunicorn --workers=2 --threads=16 --worker-class=gthread ai_ta_backend.main:app --timeout 1800 --max-requests 2 \ No newline at end of file From 0af41e64eea5c91ba53e0a72bad7905148ee3714 Mon Sep 17 00:00:00 2001 From: jkmin3 Date: Tue, 5 Sep 2023 19:49:42 -0500 Subject: [PATCH 6/9] fixed webscraper --- ai_ta_backend/web_scrape.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/ai_ta_backend/web_scrape.py b/ai_ta_backend/web_scrape.py index f8422148..da48bc90 100644 --- a/ai_ta_backend/web_scrape.py +++ b/ai_ta_backend/web_scrape.py @@ -240,7 +240,7 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url url_contents.append((url, s, filetype)) else: _invalid_urls.append(url) - + print("existing urls", _existing_urls) url_contents = remove_duplicates(url_contents, _existing_urls) max_urls = max_urls - len(url_contents) print(max_urls, "urls left") @@ -251,6 +251,7 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url if max_urls > 0: if _depth < max_depth: temp_data = crawler(url[0], max_urls, max_depth, timeout, _invalid_urls, _depth, url[1], url[2]) + print("existing urls", _existing_urls) temp_data = remove_duplicates(temp_data, _existing_urls) max_urls = max_urls - len(temp_data) print(max_urls, "urls left") @@ -274,12 +275,12 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url print(len(url_contents), "urls found") # Free up memory - del url_contents[:] - del urls[:] - if _invalid_urls is not None: - del _invalid_urls[:] - if _existing_urls is not None: - del _existing_urls[:] + # del url_contents[:] + # del urls[:] + # if _invalid_urls is not None: + # del _invalid_urls[:] + # if _existing_urls is not None: + # del _existing_urls[:] # gc.collect() return url_contents @@ -373,6 +374,7 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti if value == "403_Forbidden": print("Found Forbidden Key, deleting data") del data[counter] + counter -= 1 else: path_name.append(value) counter += 1 From 6ce20e7e5bd6f3da6479f04cdd414e7d69d0225c Mon Sep 17 00:00:00 2001 From: Kastan Day Date: Wed, 6 Sep 2023 09:54:04 -0700 Subject: [PATCH 7/9] HOTFIX for cannot schedule new futures after interpreter shutdown, which prevented uploads to s3 on web-scrape --- run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run.sh b/run.sh index a7c30f35..9a09c44d 100755 --- a/run.sh +++ b/run.sh @@ -3,4 +3,4 @@ # Docs https://docs.gunicorn.org/en/stable/settings.html#workers export PYTHONPATH=$PYTHONPATH:$(pwd)/ai_ta_backend -exec gunicorn --workers=2 --threads=16 --worker-class=gthread ai_ta_backend.main:app --timeout 1800 --max-requests 2 \ No newline at end of file +exec gunicorn --workers=2 --threads=16 --worker-class=gthread ai_ta_backend.main:app --timeout 1800 --max-requests 20 \ No newline at end of file From 32abf395d8d176ef3ff81946594403280f9c4561 Mon Sep 17 00:00:00 2001 From: jkmin3 Date: Wed, 6 Sep 2023 14:17:45 -0500 Subject: [PATCH 8/9] quick fix for supabase using try except --- ai_ta_backend/web_scrape.py | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/ai_ta_backend/web_scrape.py b/ai_ta_backend/web_scrape.py index da48bc90..36158db9 100644 --- a/ai_ta_backend/web_scrape.py +++ b/ai_ta_backend/web_scrape.py @@ -323,22 +323,28 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti del ingester return results else: - print("Gathering existing urls from Supabase") - supabase_client = supabase.create_client( # type: ignore - supabase_url=os.getenv('SUPABASE_URL'), # type: ignore - supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore - urls = supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).select('course_name, url, contexts').eq('course_name', course_name).execute() - del supabase_client - if urls.data == []: + try: + print("Gathering existing urls from Supabase") + supabase_client = supabase.create_client( # type: ignore + supabase_url=os.getenv('SUPABASE_URL'), # type: ignore + supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore + urls = supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).select('course_name, url, contexts').eq('course_name', course_name).execute() + del supabase_client + if urls.data == []: + existing_urls = None + else: + existing_urls = [] + for thing in urls.data: + whole = '' + for t in thing['contexts']: + whole += t['text'] + existing_urls.append((thing['url'], whole)) + print("Finished gathering existing urls from Supabase") + except Exception as e: + print("Error:", e) + print("Could not gather existing urls from Supabase") existing_urls = None - else: - existing_urls = [] - for thing in urls.data: - whole = '' - for t in thing['contexts']: - whole += t['text'] - existing_urls.append((thing['url'], whole)) - print("Finished gathering existing urls from Supabase") + print("Begin Ingesting Web page") data = crawler(url=url, max_urls=max_urls, max_depth=max_depth, timeout=timeout, base_url_on=stay_on_baseurl, _existing_urls=existing_urls) From 6a1a38f47eb81f8a8ba3688149034eaa127d6a43 Mon Sep 17 00:00:00 2001 From: Asmita Dabholkar Date: Wed, 6 Sep 2023 17:55:12 -0500 Subject: [PATCH 9/9] Fix GitHub ingest: separate files created properly (#76) * modified 'document' which is uploaded to supabase * delete comments --------- Co-authored-by: Kastan Day --- ai_ta_backend/vector_database.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py index e47f777c..39773834 100644 --- a/ai_ta_backend/vector_database.py +++ b/ai_ta_backend/vector_database.py @@ -810,14 +810,14 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]): "embedding": embeddings_dict[context.page_content] } for context in contexts] - document = { - "course_name": contexts[0].metadata.get('course_name'), - "s3_path": contexts[0].metadata.get('s3_path'), - "readable_filename": contexts[0].metadata.get('readable_filename'), - "url": contexts[0].metadata.get('url'), - "base_url": contexts[0].metadata.get('base_url'), - "contexts": contexts_for_supa, - } + document = [{ + "course_name": context.metadata.get('course_name'), + "s3_path": context.metadata.get('s3_path'), + "readable_filename": context.metadata.get('readable_filename'), + "url": context.metadata.get('url'), + "base_url": context.metadata.get('base_url'), + "contexts": contexts_for_supa, # should ideally be just one context but getting JSON serialization error when I do that + } for context in contexts] count = self.supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).insert(document).execute() # type: ignore print("successful END OF split_and_upload")