Skip to content

Commit

Permalink
Merge branch 'main' into refactor_file_handling
Browse files Browse the repository at this point in the history
  • Loading branch information
jkmin3 authored Sep 11, 2023
2 parents 557b77a + 6a1a38f commit f70d800
Show file tree
Hide file tree
Showing 8 changed files with 222 additions and 169 deletions.
5 changes: 4 additions & 1 deletion ai_ta_backend/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,7 @@ def upload(myfile):
pool.map(upload, filenames)

print("All data files uploaded to S3 successfully.")
return s3_paths
return s3_paths

if __name__ == '__main__':
pass
16 changes: 4 additions & 12 deletions ai_ta_backend/extreme_context_stuffing.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,19 +274,9 @@ def extract_context_from_results(results: List[Any]) -> List[str]:
assistant_contents.append(choice['message']['content'])
total_prompt_tokens += item['usage']['prompt_tokens']
total_completion_tokens += item['usage']['completion_tokens']
# Note: I don't think the prompt_tokens or completion_tokens is working quite right...

# print("Assistant Contents:", assistant_contents)
print("Total Prompt Tokens:", total_prompt_tokens)
print("Total Completion Tokens:", total_completion_tokens)
turbo_total_cost = (total_prompt_tokens * 0.0015) + (total_completion_tokens * 0.002)
print("Total cost (3.5-turbo):", (total_prompt_tokens * 0.0015), " + Completions: ", (total_completion_tokens * 0.002), " = ",
turbo_total_cost)

gpt4_total_cost = (total_prompt_tokens * 0.03) + (total_completion_tokens * 0.06)
print("Hypothetical cost for GPT-4:", (total_prompt_tokens * 0.03), " + Completions: ", (total_completion_tokens * 0.06), " = ",
gpt4_total_cost)
print("GPT-4 cost premium: ", (gpt4_total_cost / max(turbo_total_cost, 1)), "x")
return assistant_contents #, total_prompt_tokens, total_completion_tokens
return assistant_contents


# dataclasses
Expand Down Expand Up @@ -442,6 +432,8 @@ def task_id_generator_function():
yield task_id
task_id += 1

if __name__ == '__main__':
pass

# run script
# if __name__ == "__main__":
Expand Down
162 changes: 111 additions & 51 deletions ai_ta_backend/main.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
import gc
import os
import time
from typing import Any, List, Union
from typing import List

from dotenv import load_dotenv
from flask import Flask, abort, jsonify, request
from flask import Flask, Response, abort, jsonify, request
from flask_cors import CORS
from flask_executor import Executor
from sqlalchemy import JSON

from ai_ta_backend.nomic_logging import get_nomic_map, log_query_to_nomic
from ai_ta_backend.vector_database import Ingest
from ai_ta_backend.web_scrape import main_crawler, mit_course_download
from ai_ta_backend.nomic_logging import log_query_to_nomic, get_nomic_map, create_nomic_map
from flask_executor import Executor

app = Flask(__name__)
CORS(app)
Expand All @@ -21,7 +22,7 @@
load_dotenv()

@app.route('/')
def index() -> JSON:
def index() -> Response:
"""_summary_
Args:
Expand All @@ -30,11 +31,13 @@ def index() -> JSON:
Returns:
JSON: _description_
"""
return jsonify({"Choo Choo": "Welcome to your Flask app 🚅"})
response = jsonify({"Choo Choo": "Welcome to your Flask app 🚅"})
response.headers.add('Access-Control-Allow-Origin', '*')
return response


@app.route('/coursera', methods=['GET'])
def coursera() -> JSON:
def coursera() -> Response:
try:
course_name: str = request.args.get('course_name') # type: ignore
coursera_course_name: str = request.args.get('coursera_course_name') # type: ignore
Expand All @@ -43,29 +46,37 @@ def coursera() -> JSON:

ingester = Ingest()
results = ingester.ingest_coursera(coursera_course_name, course_name) # type: ignore
del ingester

response = jsonify(results)
response.headers.add('Access-Control-Allow-Origin', '*')
return response


@app.route('/github', methods=['GET'])
def github() -> JSON:
try:
course_name: str = request.args.get('course_name') # type: ignore
github_url: str = request.args.get('github_url') # type: ignore
except Exception as e:
print(f"No course name provided: {e}")
def github() -> Response:
course_name: str = request.args.get('course_name', default='', type=str)
github_url: str = request.args.get('github_url', default='', type=str)

if course_name == '' or github_url == '':
# proper web error "400 Bad request"
abort(
400,
description=
f"Missing one or more required parameters: 'course_name' and 's3_path' must be provided. Course name: `{course_name}`, S3 path: `{github_url}`"
)


print("In /github")
ingester = Ingest()
results = ingester.ingest_github(github_url, course_name)
del ingester
response = jsonify(results)
response.headers.add('Access-Control-Allow-Origin', '*')
return response


@app.route('/delete-entire-course', methods=['GET'])
def delete_entire_course():
def delete_entire_course() -> Response:
try:
course_name: str = request.args.get('course_name') # type: ignore
# coursera_course_name: str = request.args.get('coursera_course_name') # type: ignore
Expand All @@ -74,13 +85,15 @@ def delete_entire_course():

ingester = Ingest()
results = ingester.delete_entire_course(course_name) # type: ignore
del ingester

response = jsonify(results)
response.headers.add('Access-Control-Allow-Origin', '*')
return response


@app.route('/getTopContexts', methods=['GET'])
def getTopContexts():
def getTopContexts() -> Response:
"""Get most relevant contexts for a given search query.
Return value
Expand Down Expand Up @@ -129,6 +142,7 @@ def getTopContexts():

ingester = Ingest()
found_documents = ingester.getTopContexts(search_query, course_name, token_limit)
del ingester

# background execution of tasks!!
executor.submit(log_query_to_nomic, course_name, search_query)
Expand All @@ -140,7 +154,7 @@ def getTopContexts():


@app.route('/get_stuffed_prompt', methods=['GET'])
def get_stuffed_prompt():
def get_stuffed_prompt() -> Response:
"""Get most relevant contexts for a given search query.
## GET arguments
Expand All @@ -154,13 +168,16 @@ def get_stuffed_prompt():
String
"""
# todo: best way to handle optional arguments?
try:
course_name: str = request.args.get('course_name')
search_query: str = request.args.get('search_query')
token_limit: int = request.args.get('token_limit')
except Exception as e:
print("No course name provided.")
course_name: str = request.args.get('course_name', default='', type=str)
search_query: str = request.args.get('search_query', default='', type=str)
token_limit: int = request.args.get('token_limit', default=-1, type=int)
if course_name == '' or search_query == '' or token_limit == -1:
# proper web error "400 Bad request"
abort(
400,
description=
f"Missing one or more required parameters: 'course_name', 'search_query', and 'token_limit' must be provided. Course name: `{course_name}`, Search query: `{search_query}`, Token limit: `{token_limit}`"
)

print("In /getTopContexts: ", search_query)
if search_query is None:
Expand All @@ -172,14 +189,15 @@ def get_stuffed_prompt():

ingester = Ingest()
prompt = ingester.get_stuffed_prompt(search_query, course_name, token_limit)
del ingester

response = jsonify(prompt)
response.headers.add('Access-Control-Allow-Origin', '*')
return response


@app.route('/ingest', methods=['GET'])
def ingest():
def ingest() -> Response:
"""Recursively ingests anything from S3 filepath and below.
Pass a s3_paths filepath (not URL) into our S3 bucket.
Expand All @@ -191,21 +209,28 @@ def ingest():
Returns:
str: Success or Failure message. Failure message if any failures. TODO: email on failure.
"""
s3_paths: List[str] | str = request.args.get('s3_paths', default='')
course_name: List[str] | str = request.args.get('course_name', default='')

print("In /ingest")
if course_name == '' or s3_paths == '':
# proper web error "400 Bad request"
abort(
400,
description=
f"Missing one or more required parameters: 'course_name' and 's3_path' must be provided. Course name: `{course_name}`, S3 path: `{s3_paths}`"
)

ingester = Ingest()
s3_paths: List[str] | str = request.args.get('s3_paths')
course_name: List[str] | str = request.args.get('course_name')
success_fail_dict = ingester.bulk_ingest(s3_paths, course_name)
del ingester

response = jsonify(success_fail_dict)
response.headers.add('Access-Control-Allow-Origin', '*')
return response


@app.route('/getContextStuffedPrompt', methods=['GET'])
def getContextStuffedPrompt():
def getContextStuffedPrompt() -> Response:
"""
Get a stuffed prompt for a given user question and course name.
Args :
Expand All @@ -217,32 +242,48 @@ def getContextStuffedPrompt():
print("In /getContextStuffedPrompt")

ingester = Ingest()
search_query: str = str(request.args.get('search_query')) # type: ignore
course_name: str = str(request.args.get('course_name')) # type: ignore
top_n: int = int(request.args.get('top_n')) # type: ignore
top_k_to_search: int = int(request.args.get('top_k_to_search')) # type: ignore
search_query: str = request.args.get('search_query', default='', type=str)
course_name: str = request.args.get('course_name', default='', type=str)
top_n: int = request.args.get('top_n', default=-1, type=int)
top_k_to_search: int = request.args.get('top_k_to_search', default=-1, type=int)

if search_query == '' or course_name == '' or top_n == -1 or top_k_to_search == -1:
# proper web error "400 Bad request"
abort(
400,
description=
f"Missing one or more required parameters: 'search_query', 'course_name', 'top_n', and 'top_k_to_search' must be provided. Search query: `{search_query}`, Course name: `{course_name}`, Top N: `{top_n}`, Top K to search: `{top_k_to_search}`"
)

start_time = time.monotonic()
stuffed_prompt = ingester.get_context_stuffed_prompt(search_query, course_name, top_n, top_k_to_search)
print(f"⏰ Runtime of EXTREME prompt stuffing: {(time.monotonic() - start_time):.2f} seconds")
response = jsonify({"prompt": stuffed_prompt})
del ingester

response = jsonify({"prompt": stuffed_prompt})
response.headers.add('Access-Control-Allow-Origin', '*')
return response


@app.route('/getAll', methods=['GET'])
def getAll():
def getAll() -> Response:
"""Get all course materials based on the course_name
"""
course_name: List[str] | str = request.args.get('course_name', default='', type=str)

print("In /getAll")
if course_name == '':
# proper web error "400 Bad request"
abort(
400,
description=
f"Missing the one required parameter: 'course_name' must be provided. Course name: `{course_name}`"
)

ingester = Ingest()
course_name: List[str] | str = request.args.get('course_name')
distinct_dicts = ingester.getAll(course_name)
response = jsonify({"distinct_files": distinct_dicts})
del ingester

response = jsonify({"distinct_files": distinct_dicts})
response.headers.add('Access-Control-Allow-Origin', '*')
return response

Expand All @@ -267,25 +308,33 @@ def delete():

start_time = time.monotonic()
ingester = Ingest()

# background execution of tasks!!
executor.submit(ingester.delete_data, s3_path, course_name)
print(f"From {course_name}, deleted file: {s3_path}")
print(f"⏰ Runtime of FULL delete func: {(time.monotonic() - start_time):.2f} seconds")
del ingester

# We need an instant return. Deletes are "best effort"; assume always successful... sigh :(
response = jsonify({"outcome": 'success'})
response.headers.add('Access-Control-Allow-Origin', '*')
return response

@app.route('/web-scrape', methods=['GET'])
def scrape():
url: str = request.args.get('url')
max_urls: int = request.args.get('max_urls')
max_depth: int = request.args.get('max_depth')
timeout: int = request.args.get('timeout')
course_name: str = request.args.get('course_name')
stay_on_baseurl: bool = request.args.get('stay_on_baseurl')
def scrape() -> Response:
url: str = request.args.get('url', default='', type=str)
course_name: str = request.args.get('course_name', default='', type=str)
max_urls: int = request.args.get('max_urls', default=100, type=int)
max_depth: int = request.args.get('max_depth', default=2, type=int)
timeout: int = request.args.get('timeout', default=3, type=int)
stay_on_baseurl: bool | None = request.args.get('`stay_on_baseurl`', default=True, type=bool)

if url == '' or max_urls == -1 or max_depth == -1 or timeout == -1 or course_name == '' or stay_on_baseurl is None:
# proper web error "400 Bad request"
abort(
400,
description=
f"Missing one or more required parameters: 'url', 'max_urls', 'max_depth', 'timeout', 'course_name', and 'stay_on_baseurl' must be provided. url: `{url}`, max_urls: `{max_urls}`, max_depth: `{max_depth}`, timeout: `{timeout}`, course_name: `{course_name}`, stay_on_baseurl: `{stay_on_baseurl}`"
)

# print all input params
print(f"Web scrape!")
Expand All @@ -298,14 +347,25 @@ def scrape():

response = jsonify(success_fail_dict)
response.headers.add('Access-Control-Allow-Origin', '*')
gc.collect() # manually invoke garbage collection, try to reduce memory on Railway $$$
return response


@app.route('/mit-download', methods=['GET'])
def mit_download_course():
url: str = request.args.get('url')
course_name: str = request.args.get('course_name')
local_dir: str = request.args.get('local_dir')
def mit_download_course() -> Response:
""" Web scraper built for MIT course downloads; delegates to `mit_course_download`.
"""
url: str = request.args.get('url', default='', type=str)
course_name: str = request.args.get('course_name', default='', type=str)
local_dir: str = request.args.get('local_dir', default='', type=str)

if url == '' or course_name == '' or local_dir == '':
# proper web error "400 Bad request"
abort(
400,
description=
f"Missing one or more required parameters: 'url', 'course_name', and 'local_dir' must be provided. url: `{url}`, course_name: `{course_name}`, local_dir: `{local_dir}`"
)

success_fail = mit_course_download(url, course_name, local_dir)

Expand Down Expand Up @@ -334,4 +394,4 @@ def nomic_map():


if __name__ == '__main__':
app.run(debug=True, port=os.getenv("PORT", default=8000))
app.run(debug=True, port=int(os.getenv("PORT", default=8000)))
3 changes: 3 additions & 0 deletions ai_ta_backend/nomic_logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,3 +124,6 @@ def create_nomic_map(course_name: str, log_embeddings: np.ndarray, log_data: lis
name=project_name, colorable_fields=['query'])
project.create_index(index_name, build_topic_model=True)
return f"Successfully created Nomic map for {course_name}"

if __name__ == '__main__':
pass
Loading

0 comments on commit f70d800

Please sign in to comment.