
Commit

Merge branch 'main' into nomic-viz
KastanDay committed Sep 11, 2023
2 parents 2ca6d79 + 09010b5 commit eea8bd3
Showing 5 changed files with 76 additions and 33 deletions.
34 changes: 34 additions & 0 deletions .env.template
@@ -0,0 +1,34 @@
# Supabase SQL
SUPABASE_URL=
SUPABASE_API_KEY=
SUPABASE_READ_ONLY=
SUPABASE_JWT_SECRET=

MATERIALS_SUPABASE_TABLE=uiuc_chatbot
NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE=documents

# QDRANT
QDRANT_COLLECTION_NAME=uiuc-chatbot
DEV_QDRANT_COLLECTION_NAME=dev
QDRANT_URL=
QDRANT_API_KEY=

REFACTORED_MATERIALS_SUPABASE_TABLE=

# AWS
S3_BUCKET_NAME=uiuc-chatbot
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=

OPENAI_API_KEY=

NOMIC_API_KEY=
LINTRULE_SECRET=

# Github Agent
GITHUB_APP_ID=<OptionalForGithubApps>
GITHUB_APP_PRIVATE_KEY="-----BEGIN RSA PRIVATE KEY-----

-----END RSA PRIVATE KEY-----"

NUMEXPR_MAX_THREADS=2
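These variables are read at runtime via `os.getenv()` (see the diffs below). A minimal, hypothetical sketch of loading the renamed `.env` during local development — python-dotenv is an assumption here, not necessarily how ai-ta-backend wires it up:

```python
# Sketch only: load .env into the environment and read a few of the
# variables declared in .env.template.
import os
from dotenv import load_dotenv  # pip install python-dotenv (assumed helper)

load_dotenv()  # picks up key=value pairs from a .env file in the working directory

SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_API_KEY = os.getenv("SUPABASE_API_KEY")
MATERIALS_TABLE = os.getenv("NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE", "documents")
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION_NAME", "uiuc-chatbot")
```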
5 changes: 3 additions & 2 deletions README.md
@@ -13,8 +13,9 @@ Automatic [API Reference](https://uiuc-chatbot.github.io/ai-ta-backend/reference

## 📣 Development

-- Install Python requirements `pip install -r requirements.txt`
-- Start the server for development (with live reloads) `cd ai_ta_backend` then `flask --app ai_ta_backend.main:app --debug run --port 8000`
+1. Rename `.env.template` to `.env` and fill in the required variables
+2. Install Python requirements `pip install -r requirements.txt`
+3. Start the server for development (with live reloads) `cd ai_ta_backend` then `flask --app ai_ta_backend.main:app --debug run --port 8000`

The docs are auto-built and deployed to [our docs website](https://uiuc-chatbot.github.io/ai-ta-backend/) on every push. Or you can build the docs locally when writing:
- `mkdocs serve`
16 changes: 8 additions & 8 deletions ai_ta_backend/vector_database.py
@@ -810,14 +810,14 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]):
"embedding": embeddings_dict[context.page_content]
} for context in contexts]

-      document = {
-          "course_name": contexts[0].metadata.get('course_name'),
-          "s3_path": contexts[0].metadata.get('s3_path'),
-          "readable_filename": contexts[0].metadata.get('readable_filename'),
-          "url": contexts[0].metadata.get('url'),
-          "base_url": contexts[0].metadata.get('base_url'),
-          "contexts": contexts_for_supa,
-      }
+      document = [{
+          "course_name": context.metadata.get('course_name'),
+          "s3_path": context.metadata.get('s3_path'),
+          "readable_filename": context.metadata.get('readable_filename'),
+          "url": context.metadata.get('url'),
+          "base_url": context.metadata.get('base_url'),
+          "contexts": contexts_for_supa,  # should ideally be just one context but getting JSON serialization error when I do that
+      } for context in contexts]

count = self.supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).insert(document).execute() # type: ignore
print("successful END OF split_and_upload")
52 changes: 30 additions & 22 deletions ai_ta_backend/web_scrape.py
@@ -240,7 +240,7 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url
url_contents.append((url, s, filetype))
else:
_invalid_urls.append(url)

print("existing urls", _existing_urls)
url_contents = remove_duplicates(url_contents, _existing_urls)
max_urls = max_urls - len(url_contents)
print(max_urls, "urls left")
@@ -251,6 +251,7 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url
if max_urls > 0:
if _depth < max_depth:
temp_data = crawler(url[0], max_urls, max_depth, timeout, _invalid_urls, _depth, url[1], url[2])
print("existing urls", _existing_urls)
temp_data = remove_duplicates(temp_data, _existing_urls)
max_urls = max_urls - len(temp_data)
print(max_urls, "urls left")
@@ -274,12 +275,12 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url
print(len(url_contents), "urls found")

# Free up memory
-  del url_contents[:]
-  del urls[:]
-  if _invalid_urls is not None:
-    del _invalid_urls[:]
-  if _existing_urls is not None:
-    del _existing_urls[:]
+  # del url_contents[:]
+  # del urls[:]
+  # if _invalid_urls is not None:
+  #   del _invalid_urls[:]
+  # if _existing_urls is not None:
+  #   del _existing_urls[:]
# gc.collect()

return url_contents
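The commented-out block is the interesting part of this hunk: `del some_list[:]` clears the list object in place, and since `url_contents` is returned on the next line, the old cleanup would have handed an empty list back to the caller. A tiny illustration, independent of the scraper:

```python
# Why `del url_contents[:]` right before `return url_contents` loses the data:
def cleanup_then_return():
    results = [("https://example.com", "<html>...</html>", ".html")]
    del results[:]        # clears the same list object in place
    return results        # the caller receives []

def just_return():
    results = [("https://example.com", "<html>...</html>", ".html")]
    return results        # caller gets the data; the list is reclaimed once unreferenced

assert cleanup_then_return() == []
assert len(just_return()) == 1
```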
@@ -322,22 +323,28 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti
del ingester
return results
else:
print("Gathering existing urls from Supabase")
supabase_client = supabase.create_client( # type: ignore
supabase_url=os.getenv('SUPABASE_URL'), # type: ignore
supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore
urls = supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).select('course_name, url, contexts').eq('course_name', course_name).execute()
del supabase_client
if urls.data == []:
try:
print("Gathering existing urls from Supabase")
supabase_client = supabase.create_client( # type: ignore
supabase_url=os.getenv('SUPABASE_URL'), # type: ignore
supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore
urls = supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).select('course_name, url, contexts').eq('course_name', course_name).execute()
del supabase_client
if urls.data == []:
existing_urls = None
else:
existing_urls = []
for thing in urls.data:
whole = ''
for t in thing['contexts']:
whole += t['text']
existing_urls.append((thing['url'], whole))
print("Finished gathering existing urls from Supabase")
except Exception as e:
print("Error:", e)
print("Could not gather existing urls from Supabase")
existing_urls = None
else:
existing_urls = []
for thing in urls.data:
whole = ''
for t in thing['contexts']:
whole += t['text']
existing_urls.append((thing['url'], whole))
print("Finished gathering existing urls from Supabase")

print("Begin Ingesting Web page")
data = crawler(url=url, max_urls=max_urls, max_depth=max_depth, timeout=timeout, base_url_on=stay_on_baseurl, _existing_urls=existing_urls)

@@ -373,6 +380,7 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti
if value == "403_Forbidden":
print("Found Forbidden Key, deleting data")
del data[counter]
+            counter -= 1
else:
path_name.append(value)
counter += 1
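The added `counter -= 1` compensates for `del data[counter]`: deleting shifts every later element one slot left, so without pulling the index back, the item right after a deleted one is skipped. A small sketch of the same fix on dummy data (the surrounding loop here is assumed, not copied from the file):

```python
# After `del data[counter]`, the next element moves into index `counter`,
# so the index must be decremented before the loop's trailing `counter += 1`.
data = ["good.html", "403_Forbidden", "403_Forbidden", "fine.html"]

kept = []
counter = 0
for value in list(data):        # iterate over a copy while mutating `data`
    if value == "403_Forbidden":
        del data[counter]
        counter -= 1            # stay on this index for the shifted element
    else:
        kept.append(value)
    counter += 1

assert data == ["good.html", "fine.html"]
assert kept == ["good.html", "fine.html"]
```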
2 changes: 1 addition & 1 deletion run.sh
@@ -3,4 +3,4 @@
# Docs https://docs.gunicorn.org/en/stable/settings.html#workers

export PYTHONPATH=$PYTHONPATH:$(pwd)/ai_ta_backend
-exec gunicorn --workers=2 --threads=16 --worker-class=gthread ai_ta_backend.main:app --timeout 60 --max-requests 2
+exec gunicorn --workers=2 --threads=16 --worker-class=gthread ai_ta_backend.main:app --timeout 1800 --max-requests 20
