diff --git a/.env.template b/.env.template
new file mode 100644
index 00000000..ba04c704
--- /dev/null
+++ b/.env.template
@@ -0,0 +1,34 @@
+# Supabase SQL
+SUPABASE_URL=
+SUPABASE_API_KEY=
+SUPABASE_READ_ONLY=
+SUPABASE_JWT_SECRET=
+
+MATERIALS_SUPABASE_TABLE=uiuc_chatbot
+NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE=documents
+
+# QDRANT
+QDRANT_COLLECTION_NAME=uiuc-chatbot
+DEV_QDRANT_COLLECTION_NAME=dev
+QDRANT_URL=
+QDRANT_API_KEY=
+
+REFACTORED_MATERIALS_SUPABASE_TABLE=
+
+# AWS
+S3_BUCKET_NAME=uiuc-chatbot
+AWS_ACCESS_KEY_ID=
+AWS_SECRET_ACCESS_KEY=
+
+OPENAI_API_KEY=
+
+NOMIC_API_KEY=
+LINTRULE_SECRET=
+
+# Github Agent
+GITHUB_APP_ID=
+GITHUB_APP_PRIVATE_KEY="-----BEGIN RSA PRIVATE KEY-----
+
+-----END RSA PRIVATE KEY-----"
+
+NUMEXPR_MAX_THREADS=2
diff --git a/README.md b/README.md
index 501db0e2..c0c2fe12 100644
--- a/README.md
+++ b/README.md
@@ -13,8 +13,9 @@ Automatic [API Reference](https://uiuc-chatbot.github.io/ai-ta-backend/reference

 ## 📣 Development

-- Install Python requirements `pip install -r requirements.txt`
-- Start the server for development (with live reloads) `cd ai_ta_backend` then `flask --app ai_ta_backend.main:app --debug run --port 8000`
+1. Rename `.env.template` to `.env` and fill in the required variables
+2. Install Python requirements `pip install -r requirements.txt`
+3. Start the server for development (with live reloads) `cd ai_ta_backend` then `flask --app ai_ta_backend.main:app --debug run --port 8000`

 The docs are auto-built and deployed to [our docs website](https://uiuc-chatbot.github.io/ai-ta-backend/) on every push. Or you can build the docs locally when writing:
 - `mkdocs serve`
diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py
index e47f777c..39773834 100644
--- a/ai_ta_backend/vector_database.py
+++ b/ai_ta_backend/vector_database.py
@@ -810,14 +810,14 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):
        "embedding": embeddings_dict[context.page_content]
    } for context in contexts]

-    document = {
-        "course_name": contexts[0].metadata.get('course_name'),
-        "s3_path": contexts[0].metadata.get('s3_path'),
-        "readable_filename": contexts[0].metadata.get('readable_filename'),
-        "url": contexts[0].metadata.get('url'),
-        "base_url": contexts[0].metadata.get('base_url'),
-        "contexts": contexts_for_supa,
-    }
+    document = [{
+        "course_name": context.metadata.get('course_name'),
+        "s3_path": context.metadata.get('s3_path'),
+        "readable_filename": context.metadata.get('readable_filename'),
+        "url": context.metadata.get('url'),
+        "base_url": context.metadata.get('base_url'),
+        "contexts": contexts_for_supa,  # should ideally be just one context but getting JSON serialization error when I do that
+    } for context in contexts]

    count = self.supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).insert(document).execute()  # type: ignore
    print("successful END OF split_and_upload")
diff --git a/ai_ta_backend/web_scrape.py b/ai_ta_backend/web_scrape.py
index f8422148..36158db9 100644
--- a/ai_ta_backend/web_scrape.py
+++ b/ai_ta_backend/web_scrape.py
@@ -240,7 +240,7 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url
        url_contents.append((url, s, filetype))
      else:
        _invalid_urls.append(url)
-
+  print("existing urls", _existing_urls)
  url_contents = remove_duplicates(url_contents, _existing_urls)
  max_urls = max_urls - len(url_contents)
  print(max_urls, "urls left")
@@ -251,6 +251,7 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url
    if max_urls > 0:
      if _depth < max_depth:
        temp_data = crawler(url[0], max_urls, max_depth, timeout, _invalid_urls, _depth, url[1], url[2])
+        print("existing urls", _existing_urls)
        temp_data = remove_duplicates(temp_data, _existing_urls)
        max_urls = max_urls - len(temp_data)
        print(max_urls, "urls left")
@@ -274,12 +275,12 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url
  print(len(url_contents), "urls found")

  # Free up memory
-  del url_contents[:]
-  del urls[:]
-  if _invalid_urls is not None:
-    del _invalid_urls[:]
-  if _existing_urls is not None:
-    del _existing_urls[:]
+  # del url_contents[:]
+  # del urls[:]
+  # if _invalid_urls is not None:
+  #   del _invalid_urls[:]
+  # if _existing_urls is not None:
+  #   del _existing_urls[:]
  # gc.collect()

  return url_contents
@@ -322,22 +323,28 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti
    del ingester
    return results
  else:
-    print("Gathering existing urls from Supabase")
-    supabase_client = supabase.create_client(  # type: ignore
-      supabase_url=os.getenv('SUPABASE_URL'),  # type: ignore
-      supabase_key=os.getenv('SUPABASE_API_KEY'))  # type: ignore
-    urls = supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).select('course_name, url, contexts').eq('course_name', course_name).execute()
-    del supabase_client
-    if urls.data == []:
+    try:
+      print("Gathering existing urls from Supabase")
+      supabase_client = supabase.create_client(  # type: ignore
+        supabase_url=os.getenv('SUPABASE_URL'),  # type: ignore
+        supabase_key=os.getenv('SUPABASE_API_KEY'))  # type: ignore
+      urls = supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).select('course_name, url, contexts').eq('course_name', course_name).execute()
+      del supabase_client
+      if urls.data == []:
+        existing_urls = None
+      else:
+        existing_urls = []
+        for thing in urls.data:
+          whole = ''
+          for t in thing['contexts']:
+            whole += t['text']
+          existing_urls.append((thing['url'], whole))
+      print("Finished gathering existing urls from Supabase")
+    except Exception as e:
+      print("Error:", e)
+      print("Could not gather existing urls from Supabase")
      existing_urls = None
-    else:
-      existing_urls = []
-      for thing in urls.data:
-        whole = ''
-        for t in thing['contexts']:
-          whole += t['text']
-        existing_urls.append((thing['url'], whole))
-      print("Finished gathering existing urls from Supabase")
+
    print("Begin Ingesting Web page")
    data = crawler(url=url, max_urls=max_urls, max_depth=max_depth, timeout=timeout, base_url_on=stay_on_baseurl, _existing_urls=existing_urls)

@@ -373,6 +380,7 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti
      if value == "403_Forbidden":
        print("Found Forbidden Key, deleting data")
        del data[counter]
+        counter -= 1
      else:
        path_name.append(value)
      counter += 1
diff --git a/run.sh b/run.sh
index 3edfdcf1..9a09c44d 100755
--- a/run.sh
+++ b/run.sh
@@ -3,4 +3,4 @@

 # Docs https://docs.gunicorn.org/en/stable/settings.html#workers
 export PYTHONPATH=$PYTHONPATH:$(pwd)/ai_ta_backend
-exec gunicorn --workers=2 --threads=16 --worker-class=gthread ai_ta_backend.main:app --timeout 60 --max-requests 2
\ No newline at end of file
+exec gunicorn --workers=2 --threads=16 --worker-class=gthread ai_ta_backend.main:app --timeout 1800 --max-requests 20
\ No newline at end of file
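
Note on the new `.env.template` and README step 1: the backend reads these variables with `os.getenv` (see the `os.getenv('SUPABASE_URL')` and `os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')` calls in the diff). A minimal sketch of consuming the renamed `.env`, assuming `python-dotenv` populates the environment at startup; the project's actual entry point may load it differently:

```python
# Sketch only: assumes python-dotenv; swap in however the app actually
# loads its environment at startup.
import os

from dotenv import load_dotenv

load_dotenv()  # reads key=value pairs from ./.env into os.environ

supabase_url = os.getenv("SUPABASE_URL")
materials_table = os.getenv("NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE", "documents")
qdrant_collection = os.getenv("QDRANT_COLLECTION_NAME", "uiuc-chatbot")
```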
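
Note on the `counter -= 1` change in `main_crawler`: deleting `data[counter]` shifts the next element into the current index, so without stepping the counter back, the shared `counter += 1` at the bottom of the loop would skip that element. A minimal, self-contained sketch of the index bookkeeping, with illustrative string values standing in for the real ingest payloads:

```python
# Illustration of the index math behind the 403_Forbidden fix; the real loop
# operates on richer objects, plain strings are used here only to show why
# the decrement is needed.
data = ["page_a", "403_Forbidden", "403_Forbidden", "page_b"]
path_name = []

counter = 0
for value in list(data):      # iterate over a snapshot so deletion is safe
    if value == "403_Forbidden":
        del data[counter]     # next element slides down into index `counter`
        counter -= 1          # ...so cancel the shared increment below
    else:
        path_name.append(value)
    counter += 1

assert data == ["page_a", "page_b"]
assert path_name == ["page_a", "page_b"]
```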