From 805936bcf1b7cd985dbd2713d4385c55730bacdf Mon Sep 17 00:00:00 2001
From: Kastan Day
Date: Tue, 5 Sep 2023 14:01:15 -0700
Subject: [PATCH 1/7] increasing timeout from 30 sec to 30 min, web scrape takes a while sometimes

---
 run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run.sh b/run.sh
index 3edfdcf1..a7c30f35 100755
--- a/run.sh
+++ b/run.sh
@@ -3,4 +3,4 @@
 # Docs https://docs.gunicorn.org/en/stable/settings.html#workers
 export PYTHONPATH=$PYTHONPATH:$(pwd)/ai_ta_backend
 
-exec gunicorn --workers=2 --threads=16 --worker-class=gthread ai_ta_backend.main:app --timeout 60 --max-requests 2
\ No newline at end of file
+exec gunicorn --workers=2 --threads=16 --worker-class=gthread ai_ta_backend.main:app --timeout 1800 --max-requests 2
\ No newline at end of file

From 0af41e64eea5c91ba53e0a72bad7905148ee3714 Mon Sep 17 00:00:00 2001
From: jkmin3
Date: Tue, 5 Sep 2023 19:49:42 -0500
Subject: [PATCH 2/7] fixed webscraper

---
 ai_ta_backend/web_scrape.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/ai_ta_backend/web_scrape.py b/ai_ta_backend/web_scrape.py
index f8422148..da48bc90 100644
--- a/ai_ta_backend/web_scrape.py
+++ b/ai_ta_backend/web_scrape.py
@@ -240,7 +240,7 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url
         url_contents.append((url, s, filetype))
       else:
         _invalid_urls.append(url)
-
+    print("existing urls", _existing_urls)
     url_contents = remove_duplicates(url_contents, _existing_urls)
     max_urls = max_urls - len(url_contents)
     print(max_urls, "urls left")
@@ -251,6 +251,7 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url
       if max_urls > 0:
         if _depth < max_depth:
           temp_data = crawler(url[0], max_urls, max_depth, timeout, _invalid_urls, _depth, url[1], url[2])
+          print("existing urls", _existing_urls)
           temp_data = remove_duplicates(temp_data, _existing_urls)
           max_urls = max_urls - len(temp_data)
           print(max_urls, "urls left")
@@ -274,12 +275,12 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url
   print(len(url_contents), "urls found")
 
   # Free up memory
-  del url_contents[:]
-  del urls[:]
-  if _invalid_urls is not None:
-    del _invalid_urls[:]
-  if _existing_urls is not None:
-    del _existing_urls[:]
+  # del url_contents[:]
+  # del urls[:]
+  # if _invalid_urls is not None:
+  #   del _invalid_urls[:]
+  # if _existing_urls is not None:
+  #   del _existing_urls[:]
   # gc.collect()
 
   return url_contents
@@ -373,6 +374,7 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti
       if value == "403_Forbidden":
         print("Found Forbidden Key, deleting data")
        del data[counter]
+        counter -= 1
       else:
         path_name.append(value)
       counter += 1

From 6ce20e7e5bd6f3da6479f04cdd414e7d69d0225c Mon Sep 17 00:00:00 2001
From: Kastan Day
Date: Wed, 6 Sep 2023 09:54:04 -0700
Subject: [PATCH 3/7] HOTFIX for cannot schedule new futures after interpreter shutdown, which prevented uploads to s3 on web-scrape

---
 run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run.sh b/run.sh
index a7c30f35..9a09c44d 100755
--- a/run.sh
+++ b/run.sh
@@ -3,4 +3,4 @@
 # Docs https://docs.gunicorn.org/en/stable/settings.html#workers
 export PYTHONPATH=$PYTHONPATH:$(pwd)/ai_ta_backend
 
-exec gunicorn --workers=2 --threads=16 --worker-class=gthread ai_ta_backend.main:app --timeout 1800 --max-requests 2
\ No newline at end of file
+exec gunicorn --workers=2 --threads=16 --worker-class=gthread ai_ta_backend.main:app --timeout 1800 --max-requests 20
\ No newline at end of file
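Note on the Gunicorn flags touched in patches 1 and 3: `--timeout 1800` gives long-running web scrapes up to 30 minutes before a silent worker is killed, and raising `--max-requests` from 2 to 20 restarts workers far less often, which is presumably how the hotfix avoids shutting down the interpreter while background S3 uploads are still queued. The same settings can also be expressed as a Gunicorn config file; the sketch below is a hypothetical `gunicorn.conf.py` that merely restates the flags in run.sh — it is not a file in this patch series.

```python
# gunicorn.conf.py -- hypothetical sketch, NOT part of this patch series.
# It only restates the CLI flags used in run.sh so their meaning is explicit.

workers = 2              # worker processes
threads = 16             # threads per worker (used by the "gthread" worker class)
worker_class = "gthread"
timeout = 1800           # seconds a silent worker may run (30 min for slow scrapes)
max_requests = 20        # recycle a worker after this many requests
```

If a project chose this route, the exec line would shrink to `exec gunicorn -c gunicorn.conf.py ai_ta_backend.main:app`.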
From 32abf395d8d176ef3ff81946594403280f9c4561 Mon Sep 17 00:00:00 2001
From: jkmin3
Date: Wed, 6 Sep 2023 14:17:45 -0500
Subject: [PATCH 4/7] quick fix for supabase using try except

---
 ai_ta_backend/web_scrape.py | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/ai_ta_backend/web_scrape.py b/ai_ta_backend/web_scrape.py
index da48bc90..36158db9 100644
--- a/ai_ta_backend/web_scrape.py
+++ b/ai_ta_backend/web_scrape.py
@@ -323,22 +323,28 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti
     del ingester
     return results
   else:
-    print("Gathering existing urls from Supabase")
-    supabase_client = supabase.create_client( # type: ignore
-      supabase_url=os.getenv('SUPABASE_URL'), # type: ignore
-      supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore
-    urls = supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).select('course_name, url, contexts').eq('course_name', course_name).execute()
-    del supabase_client
-    if urls.data == []:
+    try:
+      print("Gathering existing urls from Supabase")
+      supabase_client = supabase.create_client( # type: ignore
+        supabase_url=os.getenv('SUPABASE_URL'), # type: ignore
+        supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore
+      urls = supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).select('course_name, url, contexts').eq('course_name', course_name).execute()
+      del supabase_client
+      if urls.data == []:
+        existing_urls = None
+      else:
+        existing_urls = []
+        for thing in urls.data:
+          whole = ''
+          for t in thing['contexts']:
+            whole += t['text']
+          existing_urls.append((thing['url'], whole))
+        print("Finished gathering existing urls from Supabase")
+    except Exception as e:
+      print("Error:", e)
+      print("Could not gather existing urls from Supabase")
       existing_urls = None
-    else:
-      existing_urls = []
-      for thing in urls.data:
-        whole = ''
-        for t in thing['contexts']:
-          whole += t['text']
-        existing_urls.append((thing['url'], whole))
-      print("Finished gathering existing urls from Supabase")
+
     print("Begin Ingesting Web page")
     data = crawler(url=url, max_urls=max_urls, max_depth=max_depth, timeout=timeout, base_url_on=stay_on_baseurl, _existing_urls=existing_urls)
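The effect of patch 4 is that any failure while querying Supabase for previously ingested URLs (missing env vars, network errors, schema changes) now degrades to `existing_urls = None` instead of aborting the whole scrape. Below is a minimal sketch of that "fail soft" pattern pulled out into a standalone helper; the function name `fetch_existing_urls` is hypothetical and the snippet is illustrative, not the committed code.

```python
# Hypothetical helper (not in the patch) showing the same fallback:
# return None when the lookup fails so the crawler simply skips de-duplication.
import os
import supabase

def fetch_existing_urls(course_name: str):
  try:
    client = supabase.create_client(
        supabase_url=os.getenv('SUPABASE_URL'),
        supabase_key=os.getenv('SUPABASE_API_KEY'))
    rows = client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')) \
                 .select('course_name, url, contexts') \
                 .eq('course_name', course_name).execute()
    if not rows.data:
      return None
    # One (url, full_text) pair per stored document, mirroring the patch.
    return [(row['url'], ''.join(c['text'] for c in row['contexts'])) for row in rows.data]
  except Exception as e:
    print("Could not gather existing urls from Supabase:", e)
    return None
```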
From 6a1a38f47eb81f8a8ba3688149034eaa127d6a43 Mon Sep 17 00:00:00 2001
From: Asmita Dabholkar
Date: Wed, 6 Sep 2023 17:55:12 -0500
Subject: [PATCH 5/7] Fix GitHub ingest: separate files created properly (#76)

* modified 'document' which is uploaded to supabase

* delete comments

---------

Co-authored-by: Kastan Day
---
 ai_ta_backend/vector_database.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py
index e47f777c..39773834 100644
--- a/ai_ta_backend/vector_database.py
+++ b/ai_ta_backend/vector_database.py
@@ -810,14 +810,14 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):
       "embedding": embeddings_dict[context.page_content]
     } for context in contexts]
 
-    document = {
-        "course_name": contexts[0].metadata.get('course_name'),
-        "s3_path": contexts[0].metadata.get('s3_path'),
-        "readable_filename": contexts[0].metadata.get('readable_filename'),
-        "url": contexts[0].metadata.get('url'),
-        "base_url": contexts[0].metadata.get('base_url'),
-        "contexts": contexts_for_supa,
-    }
+    document = [{
+        "course_name": context.metadata.get('course_name'),
+        "s3_path": context.metadata.get('s3_path'),
+        "readable_filename": context.metadata.get('readable_filename'),
+        "url": context.metadata.get('url'),
+        "base_url": context.metadata.get('base_url'),
+        "contexts": contexts_for_supa, # should ideally be just one context but getting JSON serialization error when I do that
+    } for context in contexts]
 
     count = self.supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).insert(document).execute() # type: ignore
     print("successful END OF split_and_upload")

From c6c21e64c679d23e305a55d75323b059cc5c5599 Mon Sep 17 00:00:00 2001
From: Kastan Day
Date: Mon, 11 Sep 2023 13:30:55 -0700
Subject: [PATCH 6/7] Create .env.template

---
 .env.template | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)
 create mode 100644 .env.template

diff --git a/.env.template b/.env.template
new file mode 100644
index 00000000..ba04c704
--- /dev/null
+++ b/.env.template
@@ -0,0 +1,34 @@
+# Supabase SQL
+SUPABASE_URL=
+SUPABASE_API_KEY=
+SUPABASE_READ_ONLY=
+SUPABASE_JWT_SECRET=
+
+MATERIALS_SUPABASE_TABLE=uiuc_chatbot
+NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE=documents
+
+# QDRANT
+QDRANT_COLLECTION_NAME=uiuc-chatbot
+DEV_QDRANT_COLLECTION_NAME=dev
+QDRANT_URL=
+QDRANT_API_KEY=
+
+REFACTORED_MATERIALS_SUPABASE_TABLE=
+
+# AWS
+S3_BUCKET_NAME=uiuc-chatbot
+AWS_ACCESS_KEY_ID=
+AWS_SECRET_ACCESS_KEY=
+
+OPENAI_API_KEY=
+
+NOMIC_API_KEY=
+LINTRULE_SECRET=
+
+# Github Agent
+GITHUB_APP_ID=
+GITHUB_APP_PRIVATE_KEY="-----BEGIN RSA PRIVATE KEY-----
+
+-----END RSA PRIVATE KEY-----"
+
+NUMEXPR_MAX_THREADS=2

From 09010b5eb605450b7861a655e0c01e01215adb47 Mon Sep 17 00:00:00 2001
From: Kastan Day
Date: Mon, 11 Sep 2023 13:32:33 -0700
Subject: [PATCH 7/7] =?UTF-8?q?=F0=9F=93=9C=20Add=20.env.template=20to=20i?=
 =?UTF-8?q?nstall=20instructions?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 501db0e2..c0c2fe12 100644
--- a/README.md
+++ b/README.md
@@ -13,8 +13,9 @@ Automatic [API Reference](https://uiuc-chatbot.github.io/ai-ta-backend/reference
 
 ## 📣 Development
 
-- Install Python requirements `pip install -r requirements.txt`
-- Start the server for development (with live reloads) `cd ai_ta_backend` then `flask --app ai_ta_backend.main:app --debug run --port 8000`
+1. Rename `.env.template` to `.env` and fill in the required variables
+2. Install Python requirements `pip install -r requirements.txt`
+3. Start the server for development (with live reloads) `cd ai_ta_backend` then `flask --app ai_ta_backend.main:app --debug run --port 8000`
 
 The docs are auto-built and deployed to [our docs website](https://uiuc-chatbot.github.io/ai-ta-backend/) on every push. Or you can build the docs locally when writing:
 - `mkdocs serve`
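A practical note on step 1 of the updated README: the backend reads its configuration through `os.getenv()`, so after renaming `.env.template` to `.env` the variables still have to reach the process environment (for example via `python-dotenv`, a shell `source`, or the deployment platform). The snippet below is a hypothetical pre-flight check, not part of the repository; it assumes `python-dotenv` is installed and that the listed names, taken from `.env.template`, are the ones a local deployment needs.

```python
# check_env.py -- hypothetical sanity check, not part of this patch series.
import os
from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv()  # loads .env from the current working directory into os.environ

REQUIRED = [
  "SUPABASE_URL", "SUPABASE_API_KEY",
  "QDRANT_COLLECTION_NAME", "QDRANT_URL", "QDRANT_API_KEY",
  "S3_BUCKET_NAME", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY",
  "OPENAI_API_KEY",
]
missing = [name for name in REQUIRED if not os.getenv(name)]
if missing:
  raise SystemExit(f"Fill these in .env before starting the server: {', '.join(missing)}")
print("Required .env variables are set.")
```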