diff --git a/ai_ta_backend/aws.py b/ai_ta_backend/aws.py index 66fb0bbe..53e4c8c2 100644 --- a/ai_ta_backend/aws.py +++ b/ai_ta_backend/aws.py @@ -51,4 +51,7 @@ def upload(myfile): pool.map(upload, filenames) print("All data files uploaded to S3 successfully.") - return s3_paths \ No newline at end of file + return s3_paths + +if __name__ == '__main__': + pass diff --git a/ai_ta_backend/extreme_context_stuffing.py b/ai_ta_backend/extreme_context_stuffing.py index b92f2fd0..ed133a6a 100644 --- a/ai_ta_backend/extreme_context_stuffing.py +++ b/ai_ta_backend/extreme_context_stuffing.py @@ -274,19 +274,9 @@ def extract_context_from_results(results: List[Any]) -> List[str]: assistant_contents.append(choice['message']['content']) total_prompt_tokens += item['usage']['prompt_tokens'] total_completion_tokens += item['usage']['completion_tokens'] + # Note: I don't think the prompt_tokens or completion_tokens is working quite right... - # print("Assistant Contents:", assistant_contents) - print("Total Prompt Tokens:", total_prompt_tokens) - print("Total Completion Tokens:", total_completion_tokens) - turbo_total_cost = (total_prompt_tokens * 0.0015) + (total_completion_tokens * 0.002) - print("Total cost (3.5-turbo):", (total_prompt_tokens * 0.0015), " + Completions: ", (total_completion_tokens * 0.002), " = ", - turbo_total_cost) - - gpt4_total_cost = (total_prompt_tokens * 0.03) + (total_completion_tokens * 0.06) - print("Hypothetical cost for GPT-4:", (total_prompt_tokens * 0.03), " + Completions: ", (total_completion_tokens * 0.06), " = ", - gpt4_total_cost) - print("GPT-4 cost premium: ", (gpt4_total_cost / max(turbo_total_cost, 1)), "x") - return assistant_contents #, total_prompt_tokens, total_completion_tokens + return assistant_contents # dataclasses @@ -442,6 +432,8 @@ def task_id_generator_function(): yield task_id task_id += 1 +if __name__ == '__main__': + pass # run script # if __name__ == "__main__": diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py index 01d6949b..e32b8b7d 100644 --- a/ai_ta_backend/main.py +++ b/ai_ta_backend/main.py @@ -1,16 +1,17 @@ +import gc import os import time -from typing import Any, List, Union +from typing import List from dotenv import load_dotenv -from flask import Flask, abort, jsonify, request +from flask import Flask, Response, abort, jsonify, request from flask_cors import CORS +from flask_executor import Executor from sqlalchemy import JSON +from ai_ta_backend.nomic_logging import get_nomic_map, log_query_to_nomic from ai_ta_backend.vector_database import Ingest from ai_ta_backend.web_scrape import main_crawler, mit_course_download -from ai_ta_backend.nomic_logging import log_query_to_nomic, get_nomic_map, create_nomic_map -from flask_executor import Executor app = Flask(__name__) CORS(app) @@ -21,7 +22,7 @@ load_dotenv() @app.route('/') -def index() -> JSON: +def index() -> Response: """_summary_ Args: @@ -30,11 +31,13 @@ def index() -> JSON: Returns: JSON: _description_ """ - return jsonify({"Choo Choo": "Welcome to your Flask app 🚅"}) + response = jsonify({"Choo Choo": "Welcome to your Flask app 🚅"}) + response.headers.add('Access-Control-Allow-Origin', '*') + return response @app.route('/coursera', methods=['GET']) -def coursera() -> JSON: +def coursera() -> Response: try: course_name: str = request.args.get('course_name') # type: ignore coursera_course_name: str = request.args.get('coursera_course_name') # type: ignore @@ -43,29 +46,37 @@ def coursera() -> JSON: ingester = Ingest() results = ingester.ingest_coursera(coursera_course_name, course_name) # type: ignore + del ingester + response = jsonify(results) response.headers.add('Access-Control-Allow-Origin', '*') return response @app.route('/github', methods=['GET']) -def github() -> JSON: - try: - course_name: str = request.args.get('course_name') # type: ignore - github_url: str = request.args.get('github_url') # type: ignore - except Exception as e: - print(f"No course name provided: {e}") +def github() -> Response: + course_name: str = request.args.get('course_name', default='', type=str) + github_url: str = request.args.get('github_url', default='', type=str) + + if course_name == '' or github_url == '': + # proper web error "400 Bad request" + abort( + 400, + description= + f"Missing one or more required parameters: 'course_name' and 's3_path' must be provided. Course name: `{course_name}`, S3 path: `{github_url}`" + ) + - print("In /github") ingester = Ingest() results = ingester.ingest_github(github_url, course_name) + del ingester response = jsonify(results) response.headers.add('Access-Control-Allow-Origin', '*') return response @app.route('/delete-entire-course', methods=['GET']) -def delete_entire_course(): +def delete_entire_course() -> Response: try: course_name: str = request.args.get('course_name') # type: ignore # coursera_course_name: str = request.args.get('coursera_course_name') # type: ignore @@ -74,13 +85,15 @@ def delete_entire_course(): ingester = Ingest() results = ingester.delete_entire_course(course_name) # type: ignore + del ingester + response = jsonify(results) response.headers.add('Access-Control-Allow-Origin', '*') return response @app.route('/getTopContexts', methods=['GET']) -def getTopContexts(): +def getTopContexts() -> Response: """Get most relevant contexts for a given search query. Return value @@ -129,6 +142,7 @@ def getTopContexts(): ingester = Ingest() found_documents = ingester.getTopContexts(search_query, course_name, token_limit) + del ingester # background execution of tasks!! executor.submit(log_query_to_nomic, course_name, search_query) @@ -140,7 +154,7 @@ def getTopContexts(): @app.route('/get_stuffed_prompt', methods=['GET']) -def get_stuffed_prompt(): +def get_stuffed_prompt() -> Response: """Get most relevant contexts for a given search query. ## GET arguments @@ -154,13 +168,16 @@ def get_stuffed_prompt(): String """ - # todo: best way to handle optional arguments? - try: - course_name: str = request.args.get('course_name') - search_query: str = request.args.get('search_query') - token_limit: int = request.args.get('token_limit') - except Exception as e: - print("No course name provided.") + course_name: str = request.args.get('course_name', default='', type=str) + search_query: str = request.args.get('search_query', default='', type=str) + token_limit: int = request.args.get('token_limit', default=-1, type=int) + if course_name == '' or search_query == '' or token_limit == -1: + # proper web error "400 Bad request" + abort( + 400, + description= + f"Missing one or more required parameters: 'course_name', 'search_query', and 'token_limit' must be provided. Course name: `{course_name}`, Search query: `{search_query}`, Token limit: `{token_limit}`" + ) print("In /getTopContexts: ", search_query) if search_query is None: @@ -172,6 +189,7 @@ def get_stuffed_prompt(): ingester = Ingest() prompt = ingester.get_stuffed_prompt(search_query, course_name, token_limit) + del ingester response = jsonify(prompt) response.headers.add('Access-Control-Allow-Origin', '*') @@ -179,7 +197,7 @@ def get_stuffed_prompt(): @app.route('/ingest', methods=['GET']) -def ingest(): +def ingest() -> Response: """Recursively ingests anything from S3 filepath and below. Pass a s3_paths filepath (not URL) into our S3 bucket. @@ -191,13 +209,20 @@ def ingest(): Returns: str: Success or Failure message. Failure message if any failures. TODO: email on failure. """ + s3_paths: List[str] | str = request.args.get('s3_paths', default='') + course_name: List[str] | str = request.args.get('course_name', default='') - print("In /ingest") + if course_name == '' or s3_paths == '': + # proper web error "400 Bad request" + abort( + 400, + description= + f"Missing one or more required parameters: 'course_name' and 's3_path' must be provided. Course name: `{course_name}`, S3 path: `{s3_paths}`" + ) ingester = Ingest() - s3_paths: List[str] | str = request.args.get('s3_paths') - course_name: List[str] | str = request.args.get('course_name') success_fail_dict = ingester.bulk_ingest(s3_paths, course_name) + del ingester response = jsonify(success_fail_dict) response.headers.add('Access-Control-Allow-Origin', '*') @@ -205,7 +230,7 @@ def ingest(): @app.route('/getContextStuffedPrompt', methods=['GET']) -def getContextStuffedPrompt(): +def getContextStuffedPrompt() -> Response: """ Get a stuffed prompt for a given user question and course name. Args : @@ -217,32 +242,48 @@ def getContextStuffedPrompt(): print("In /getContextStuffedPrompt") ingester = Ingest() - search_query: str = str(request.args.get('search_query')) # type: ignore - course_name: str = str(request.args.get('course_name')) # type: ignore - top_n: int = int(request.args.get('top_n')) # type: ignore - top_k_to_search: int = int(request.args.get('top_k_to_search')) # type: ignore + search_query: str = request.args.get('search_query', default='', type=str) + course_name: str = request.args.get('course_name', default='', type=str) + top_n: int = request.args.get('top_n', default=-1, type=int) + top_k_to_search: int = request.args.get('top_k_to_search', default=-1, type=int) + + if search_query == '' or course_name == '' or top_n == -1 or top_k_to_search == -1: + # proper web error "400 Bad request" + abort( + 400, + description= + f"Missing one or more required parameters: 'search_query', 'course_name', 'top_n', and 'top_k_to_search' must be provided. Search query: `{search_query}`, Course name: `{course_name}`, Top N: `{top_n}`, Top K to search: `{top_k_to_search}`" + ) start_time = time.monotonic() stuffed_prompt = ingester.get_context_stuffed_prompt(search_query, course_name, top_n, top_k_to_search) print(f"⏰ Runtime of EXTREME prompt stuffing: {(time.monotonic() - start_time):.2f} seconds") - response = jsonify({"prompt": stuffed_prompt}) + del ingester + response = jsonify({"prompt": stuffed_prompt}) response.headers.add('Access-Control-Allow-Origin', '*') return response @app.route('/getAll', methods=['GET']) -def getAll(): +def getAll() -> Response: """Get all course materials based on the course_name """ + course_name: List[str] | str = request.args.get('course_name', default='', type=str) - print("In /getAll") + if course_name == '': + # proper web error "400 Bad request" + abort( + 400, + description= + f"Missing the one required parameter: 'course_name' must be provided. Course name: `{course_name}`" + ) ingester = Ingest() - course_name: List[str] | str = request.args.get('course_name') distinct_dicts = ingester.getAll(course_name) - response = jsonify({"distinct_files": distinct_dicts}) + del ingester + response = jsonify({"distinct_files": distinct_dicts}) response.headers.add('Access-Control-Allow-Origin', '*') return response @@ -267,11 +308,11 @@ def delete(): start_time = time.monotonic() ingester = Ingest() - # background execution of tasks!! executor.submit(ingester.delete_data, s3_path, course_name) print(f"From {course_name}, deleted file: {s3_path}") print(f"⏰ Runtime of FULL delete func: {(time.monotonic() - start_time):.2f} seconds") + del ingester # we need instant return. Delets are "best effort" assume always successful... sigh :( response = jsonify({"outcome": 'success'}) @@ -279,13 +320,21 @@ def delete(): return response @app.route('/web-scrape', methods=['GET']) -def scrape(): - url: str = request.args.get('url') - max_urls: int = request.args.get('max_urls') - max_depth: int = request.args.get('max_depth') - timeout: int = request.args.get('timeout') - course_name: str = request.args.get('course_name') - stay_on_baseurl: bool = request.args.get('stay_on_baseurl') +def scrape() -> Response: + url: str = request.args.get('url', default='', type=str) + course_name: str = request.args.get('course_name', default='', type=str) + max_urls: int = request.args.get('max_urls', default=100, type=int) + max_depth: int = request.args.get('max_depth', default=2, type=int) + timeout: int = request.args.get('timeout', default=3, type=int) + stay_on_baseurl: bool | None = request.args.get('`stay_on_baseurl`', default=True, type=bool) + + if url == '' or max_urls == -1 or max_depth == -1 or timeout == -1 or course_name == '' or stay_on_baseurl is None: + # proper web error "400 Bad request" + abort( + 400, + description= + f"Missing one or more required parameters: 'url', 'max_urls', 'max_depth', 'timeout', 'course_name', and 'stay_on_baseurl' must be provided. url: `{url}`, max_urls: `{max_urls}`, max_depth: `{max_depth}`, timeout: `{timeout}`, course_name: `{course_name}`, stay_on_baseurl: `{stay_on_baseurl}`" + ) # print all input params print(f"Web scrape!") @@ -298,14 +347,25 @@ def scrape(): response = jsonify(success_fail_dict) response.headers.add('Access-Control-Allow-Origin', '*') + gc.collect() # manually invoke garbage collection, try to reduce memory on Railway $$$ return response @app.route('/mit-download', methods=['GET']) -def mit_download_course(): - url: str = request.args.get('url') - course_name: str = request.args.get('course_name') - local_dir: str = request.args.get('local_dir') +def mit_download_course() -> Response: + """ Web scraper built for + """ + url: str = request.args.get('url', default='', type=str) + course_name: str = request.args.get('course_name', default='', type=str) + local_dir: str = request.args.get('local_dir', default='', type=str) + + if url == '' or course_name == '' or local_dir == '': + # proper web error "400 Bad request" + abort( + 400, + description= + f"Missing one or more required parameters: 'url', 'course_name', and 'local_dir' must be provided. url: `{url}`, course_name: `{course_name}`, local_dir: `{local_dir}`" + ) success_fail = mit_course_download(url, course_name, local_dir) @@ -334,4 +394,4 @@ def nomic_map(): if __name__ == '__main__': - app.run(debug=True, port=os.getenv("PORT", default=8000)) + app.run(debug=True, port=int(os.getenv("PORT", default=8000))) diff --git a/ai_ta_backend/nomic_logging.py b/ai_ta_backend/nomic_logging.py index bcef5fe9..12681801 100644 --- a/ai_ta_backend/nomic_logging.py +++ b/ai_ta_backend/nomic_logging.py @@ -124,3 +124,6 @@ def create_nomic_map(course_name: str, log_embeddings: np.ndarray, log_data: lis name=project_name, colorable_fields=['query']) project.create_index(index_name, build_topic_model=True) return f"Successfully created Nomic map for {course_name}" + +if __name__ == '__main__': + pass diff --git a/ai_ta_backend/utils_tokenization.py b/ai_ta_backend/utils_tokenization.py index 596dcb92..096e2bb6 100644 --- a/ai_ta_backend/utils_tokenization.py +++ b/ai_ta_backend/utils_tokenization.py @@ -112,6 +112,8 @@ def analyze_conversations(supabase_client: Any = None): content = message['content'] # If the message is from the user, it's a prompt + # TODO: Fix these + # WARNING: Fix these error messages... they are the sign of a logic bug. if role == 'user': num_tokens, cost = count_tokens_and_cost(prompt=content, openai_model_name=model_name) total_prompt_cost += cost @@ -124,6 +126,9 @@ def analyze_conversations(supabase_client: Any = None): print(f'Assistant Completion: {content}\nTokens: {num_tokens_completion}, cost: {cost_completion}') return total_convos, total_messages, total_prompt_cost, total_completion_cost +if __name__ == '__main__': + pass + # if __name__ == '__main__': # print('starting main') # total_convos, total_messages, total_prompt_cost, total_completion_cost = analyze_conversations() diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py index e0e9ae6e..c43e712d 100644 --- a/ai_ta_backend/vector_database.py +++ b/ai_ta_backend/vector_database.py @@ -2,32 +2,25 @@ import inspect import logging import mimetypes -# import json import os import shutil import subprocess import time import traceback -import uuid # Literal +import uuid from pathlib import Path -from tempfile import NamedTemporaryFile # TemporaryFile +from tempfile import NamedTemporaryFile from typing import Any, Dict, List, Optional, Tuple, Union import boto3 -# import requests import fitz -import numpy as np import openai -import requests import supabase from bs4 import BeautifulSoup - -from langchain.document_loaders import (Docx2txtLoader, PythonLoader, - SRTLoader, - UnstructuredPowerPointLoader, TextLoader, GitLoader) - -from git import Repo - +from git.repo import Repo +from langchain.document_loaders import (Docx2txtLoader, GitLoader, + PythonLoader, SRTLoader, TextLoader, + UnstructuredPowerPointLoader) from langchain.embeddings.openai import OpenAIEmbeddings from langchain.schema import Document from langchain.text_splitter import RecursiveCharacterTextSplitter @@ -40,16 +33,6 @@ from ai_ta_backend.extreme_context_stuffing import OpenAIAPIProcessor from ai_ta_backend.utils_tokenization import count_tokens_and_cost -# from arize.api import Client -# from arize.pandas.embeddings import EmbeddingGenerator, UseCases -# from arize.utils import ModelTypes -# from arize.utils.ModelTypes import GENERATIVE_LLM -# # from arize.utils.types import (Embedding, EmbeddingColumnNames, Environments, -# # Metrics, ModelTypes, Schema) - - - - class Ingest(): """ @@ -70,7 +53,7 @@ def __init__(self): self.vectorstore = Qdrant( client=self.qdrant_client, - collection_name=os.getenv('QDRANT_COLLECTION_NAME'), # type: ignore + collection_name=os.environ['QDRANT_COLLECTION_NAME'], embeddings=OpenAIEmbeddings()) # type: ignore # S3 @@ -82,8 +65,8 @@ def __init__(self): # Create a Supabase client self.supabase_client = supabase.create_client( # type: ignore - supabase_url=os.getenv('SUPABASE_URL'), # type: ignore - supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore + supabase_url=os.environ['SUPABASE_URL'], + supabase_key=os.environ['SUPABASE_API_KEY']) return None def get_context_stuffed_prompt(self, user_question: str, course_name: str, top_n: int, top_k_to_search: int) -> str: @@ -149,9 +132,9 @@ def get_context_stuffed_prompt(self, user_question: str, course_name: str, top_n # no useful text, it replied with a summary of "None" continue if text is not None: - if "pagenumber" not in results[i][-1].keys(): - results[i][-1]['pagenumber'] = results[i][-1].get('pagenumber_or_timestamp') - num_tokens, prompt_cost = count_tokens_and_cost(text) + if "pagenumber" not in results[i][-1].keys(): # type: ignore + results[i][-1]['pagenumber'] = results[i][-1].get('pagenumber_or_timestamp') # type: ignore + num_tokens, prompt_cost = count_tokens_and_cost(text) # type: ignore if token_counter + num_tokens > max_tokens: print(f"Total tokens yet in loop {i} is {num_tokens}") break # Stop building the string if it exceeds the maximum number of tokens @@ -262,6 +245,7 @@ def ingest(file_ext_mapping, s3_path, *args, **kwargs): ingest(file_ext_mapping, s3_path, course_name, kwargs=kwargs) return success_status + except Exception as e: success_status['failure_ingest'].append(f"MAJOR ERROR IN /bulk_ingest: Error: {str(e)}") return success_status @@ -269,8 +253,6 @@ def ingest(file_ext_mapping, s3_path, *args, **kwargs): def _ingest_single_py(self, s3_path: str, course_name: str): try: - print("in ingest_py") - file_name = s3_path.split("/")[-1] file_path = "media/" + file_name # download from s3 to local folder for ingest @@ -718,14 +700,12 @@ def ingest_github(self, github_url: str, course_name: str) -> str: Returns: _type_: Success or error message. """ - print("in ingest_github") - try: repo_path = "media/cloned_repo" repo = Repo.clone_from(github_url, to_path=repo_path, depth=1, clone_submodules=False) branch = repo.head.reference - loader = GitLoader(repo_path="media/cloned_repo", branch=branch) + loader = GitLoader(repo_path="media/cloned_repo", branch=str(branch)) data = loader.load() shutil.rmtree("media/cloned_repo") # create metadata for each file in data @@ -755,31 +735,17 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]): metadatas (List[Dict[str, Any]]): _description_ """ print("In split and upload") - print(f"Texts: {texts}") print(f"metadatas: {metadatas}") - print(type(texts)) - assert len(texts) == len(metadatas), 'must have equal number of text strings and metadata dicts' - + print(f"Texts: {texts}") + assert len(texts) == len(metadatas), f'must have equal number of text strings and metadata dicts. len(texts) is {len(texts)}. len(metadatas) is {len(metadatas)}' try: - # generate AI summary - # summary = self.ai_summary(texts, metadatas) - # for i in range(len(summary)): - # metadatas[i]['summary'] = summary[i] - text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( chunk_size=1000, chunk_overlap=150, - separators=". ", # try to split on sentences... + separators=[". ", "\n\n", "\n", " ", ""] # try to split on sentences... fallback to others to ensure we always fit in context window ) contexts: List[Document] = text_splitter.create_documents(texts=texts, metadatas=metadatas) - - def remove_small_contexts(contexts: List[Document]) -> List[Document]: - # Remove TextSplit contexts with fewer than 50 chars. - return [doc for doc in contexts if len(doc.page_content) > 50] - - contexts = remove_small_contexts(contexts=contexts) - input_texts = [{'input': context.page_content, 'model': 'text-embedding-ada-002'} for context in contexts] oai = OpenAIAPIProcessor(input_prompts_list=input_texts, @@ -807,12 +773,9 @@ def remove_small_contexts(contexts: List[Document]) -> List[Document]: )) self.qdrant_client.upsert( - collection_name=os.getenv('QDRANT_COLLECTION_NAME'), # type: ignore + collection_name=os.environ['QDRANT_COLLECTION_NAME'], # type: ignore points=vectors # type: ignore ) - # # replace with Qdrant - # self.vectorstore.add_texts([doc.page_content for doc in documents], [doc.metadata for doc in documents]) - ### Supabase SQL ### contexts_for_supa = [{ "text": context.page_content, @@ -821,14 +784,14 @@ def remove_small_contexts(contexts: List[Document]) -> List[Document]: "embedding": embeddings_dict[context.page_content] } for context in contexts] - document = { - "course_name": contexts[0].metadata.get('course_name'), - "s3_path": contexts[0].metadata.get('s3_path'), - "readable_filename": contexts[0].metadata.get('readable_filename'), - "url": contexts[0].metadata.get('url'), - "base_url": contexts[0].metadata.get('base_url'), - "contexts": contexts_for_supa, - } + document = [{ + "course_name": context.metadata.get('course_name'), + "s3_path": context.metadata.get('s3_path'), + "readable_filename": context.metadata.get('readable_filename'), + "url": context.metadata.get('url'), + "base_url": context.metadata.get('base_url'), + "contexts": contexts_for_supa, # should ideally be just one context but getting JSON serialization error when I do that + } for context in contexts] count = self.supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).insert(document).execute() # type: ignore print("successful END OF split_and_upload") @@ -864,7 +827,7 @@ def delete_entire_course(self, course_name: str): # Qdrant "points" look like this: Record(id='000295ca-bd28-ac4a-6f8d-c245f7377f90', payload={'metadata': {'course_name': 'zotero-extreme', 'pagenumber_or_timestamp': 15, 'readable_filename': 'Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf', 's3_path': 'courses/zotero-extreme/Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf'}, 'page_content': '18 \nDunlosky et al.\n3.3 Effects in representative educational contexts. Sev-\neral of the large summarization-training studies have been \nconducted in regular classrooms, indicating the feasibility of \ndoing so. For example, the study by A. King (1992) took place \nin the context of a remedial study-skills course for undergrad-\nuates, and the study by Rinehart et al. (1986) took place in \nsixth-grade classrooms, with the instruction led by students \nregular teachers. In these and other cases, students benefited \nfrom the classroom training. We suspect it may actually be \nmore feasible to conduct these kinds of training studies in \nclassrooms than in the laboratory, given the nature of the time \ncommitment for students. Even some of the studies that did \nnot involve training were conducted outside the laboratory; for \nexample, in the Bednall and Kehoe (2011) study on learning \nabout logical fallacies from Web modules (see data in Table 3), \nthe modules were actually completed as a homework assign-\nment. Overall, benefits can be observed in classroom settings; \nthe real constraint is whether students have the skill to suc-\ncessfully summarize, not whether summarization occurs in the \nlab or the classroom.\n3.4 Issues for implementation. Summarization would be \nfeasible for undergraduates or other learners who already \nknow how to summarize. For these students, summarization \nwould constitute an easy-to-implement technique that would \nnot take a lot of time to complete or understand. The only \nconcern would be whether these students might be better \nserved by some other strategy, but certainly summarization \nwould be better than the study strategies students typically \nfavor, such as highlighting and rereading (as we discuss in the \nsections on those strategies below). A trickier issue would \nconcern implementing the strategy with students who are not \nskilled summarizers. Relatively intensive training programs \nare required for middle school students or learners with learn-\ning disabilities to benefit from summarization. Such efforts \nare not misplaced; training has been shown to benefit perfor-\nmance on a range of measures, although the training proce-\ndures do raise practical issues (e.g., Gajria & Salvia, 1992: \n6.511 hours of training used for sixth through ninth graders \nwith learning disabilities; Malone & Mastropieri, 1991: 2 \ndays of training used for middle school students with learning \ndisabilities; Rinehart et al., 1986: 4550 minutes of instruc-\ntion per day for 5 days used for sixth graders). Of course, \ninstructors may want students to summarize material because \nsummarization itself is a goal, not because they plan to use \nsummarization as a study technique, and that goal may merit \nthe efforts of training.\nHowever, if the goal is to use summarization as a study \ntechnique, our question is whether training students would be \nworth the amount of time it would take, both in terms of the \ntime required on the part of the instructor and in terms of the \ntime taken away from students other activities. For instance, \nin terms of efficacy, summarization tends to fall in the middle \nof the pack when compared to other techniques. In direct \ncomparisons, it was sometimes more useful than rereading \n(Rewey, Dansereau, & Peel, 1991) and was as useful as note-\ntaking (e.g., Bretzing & Kulhavy, 1979) but was less powerful \nthan generating explanations (e.g., Bednall & Kehoe, 2011) or \nself-questioning (A. King, 1992).\n3.5 Summarization: Overall assessment. On the basis of the \navailable evidence, we rate summarization as low utility. It can \nbe an effective learning strategy for learners who are already \nskilled at summarizing; however, many learners (including \nchildren, high school students, and even some undergraduates) \nwill require extensive training, which makes this strategy less \nfeasible. Our enthusiasm is further dampened by mixed find-\nings regarding which tasks summarization actually helps. \nAlthough summarization has been examined with a wide \nrange of text materials, many researchers have pointed to fac-\ntors of these texts that seem likely to moderate the effects of \nsummarization (e.g'}, vector=None), print("deleting from qdrant") self.qdrant_client.delete( - collection_name=os.getenv('QDRANT_COLLECTION_NAME'), + collection_name=os.environ['QDRANT_COLLECTION_NAME'], points_selector=models.Filter(must=[ models.FieldCondition( key="metadata.course_name", @@ -880,7 +843,7 @@ def delete_entire_course(self, course_name: str): try: # Delete from Supabase print("deleting from supabase") - response = self.supabase_client.from_(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).delete().eq('course_name', course_name).execute() + response = self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq('course_name', course_name).execute() print("supabase response: ", response) return "Success" except Exception as e: @@ -902,7 +865,7 @@ def delete_data(self, s3_path: str, course_name: str): # docs for nested keys: https://qdrant.tech/documentation/concepts/filtering/#nested-key # Qdrant "points" look like this: Record(id='000295ca-bd28-ac4a-6f8d-c245f7377f90', payload={'metadata': {'course_name': 'zotero-extreme', 'pagenumber_or_timestamp': 15, 'readable_filename': 'Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf', 's3_path': 'courses/zotero-extreme/Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf'}, 'page_content': '18 \nDunlosky et al.\n3.3 Effects in representative educational contexts. Sev-\neral of the large summarization-training studies have been \nconducted in regular classrooms, indicating the feasibility of \ndoing so. For example, the study by A. King (1992) took place \nin the context of a remedial study-skills course for undergrad-\nuates, and the study by Rinehart et al. (1986) took place in \nsixth-grade classrooms, with the instruction led by students \nregular teachers. In these and other cases, students benefited \nfrom the classroom training. We suspect it may actually be \nmore feasible to conduct these kinds of training studies in \nclassrooms than in the laboratory, given the nature of the time \ncommitment for students. Even some of the studies that did \nnot involve training were conducted outside the laboratory; for \nexample, in the Bednall and Kehoe (2011) study on learning \nabout logical fallacies from Web modules (see data in Table 3), \nthe modules were actually completed as a homework assign-\nment. Overall, benefits can be observed in classroom settings; \nthe real constraint is whether students have the skill to suc-\ncessfully summarize, not whether summarization occurs in the \nlab or the classroom.\n3.4 Issues for implementation. Summarization would be \nfeasible for undergraduates or other learners who already \nknow how to summarize. For these students, summarization \nwould constitute an easy-to-implement technique that would \nnot take a lot of time to complete or understand. The only \nconcern would be whether these students might be better \nserved by some other strategy, but certainly summarization \nwould be better than the study strategies students typically \nfavor, such as highlighting and rereading (as we discuss in the \nsections on those strategies below). A trickier issue would \nconcern implementing the strategy with students who are not \nskilled summarizers. Relatively intensive training programs \nare required for middle school students or learners with learn-\ning disabilities to benefit from summarization. Such efforts \nare not misplaced; training has been shown to benefit perfor-\nmance on a range of measures, although the training proce-\ndures do raise practical issues (e.g., Gajria & Salvia, 1992: \n6.511 hours of training used for sixth through ninth graders \nwith learning disabilities; Malone & Mastropieri, 1991: 2 \ndays of training used for middle school students with learning \ndisabilities; Rinehart et al., 1986: 4550 minutes of instruc-\ntion per day for 5 days used for sixth graders). Of course, \ninstructors may want students to summarize material because \nsummarization itself is a goal, not because they plan to use \nsummarization as a study technique, and that goal may merit \nthe efforts of training.\nHowever, if the goal is to use summarization as a study \ntechnique, our question is whether training students would be \nworth the amount of time it would take, both in terms of the \ntime required on the part of the instructor and in terms of the \ntime taken away from students other activities. For instance, \nin terms of efficacy, summarization tends to fall in the middle \nof the pack when compared to other techniques. In direct \ncomparisons, it was sometimes more useful than rereading \n(Rewey, Dansereau, & Peel, 1991) and was as useful as note-\ntaking (e.g., Bretzing & Kulhavy, 1979) but was less powerful \nthan generating explanations (e.g., Bednall & Kehoe, 2011) or \nself-questioning (A. King, 1992).\n3.5 Summarization: Overall assessment. On the basis of the \navailable evidence, we rate summarization as low utility. It can \nbe an effective learning strategy for learners who are already \nskilled at summarizing; however, many learners (including \nchildren, high school students, and even some undergraduates) \nwill require extensive training, which makes this strategy less \nfeasible. Our enthusiasm is further dampened by mixed find-\nings regarding which tasks summarization actually helps. \nAlthough summarization has been examined with a wide \nrange of text materials, many researchers have pointed to fac-\ntors of these texts that seem likely to moderate the effects of \nsummarization (e.g'}, vector=None), self.qdrant_client.delete( - collection_name=os.getenv('QDRANT_COLLECTION_NAME'), + collection_name=os.environ['QDRANT_COLLECTION_NAME'], points_selector=models.Filter(must=[ models.FieldCondition( key="metadata.s3_path", @@ -912,7 +875,7 @@ def delete_data(self, s3_path: str, course_name: str): ) # Delete from Supabase - response = self.supabase_client.from_(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).delete().eq('s3_path', s3_path).eq( + response = self.supabase_client.from_(os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).delete().eq('s3_path', s3_path).eq( 'course_name', course_name).execute() return "Success" except Exception as e: @@ -932,7 +895,7 @@ def getAll( """ response = self.supabase_client.table( - os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).select('course_name, s3_path, readable_filename, url, base_url').eq( # type: ignore + os.environ['NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE']).select('course_name, s3_path, readable_filename, url, base_url').eq( 'course_name', course_name).execute() data = response.data @@ -949,7 +912,7 @@ def getAll( def vector_search(self, search_query, course_name): top_n = 80 - o = OpenAIEmbeddings() + o = OpenAIEmbeddings() # type: ignore user_query_embedding = o.embed_query(search_query) myfilter = models.Filter( must=[ @@ -970,12 +933,12 @@ def vector_search(self, search_query, course_name): print("search_results", search_results) found_docs: list[Document] = [] for d in search_results: - metadata = d.payload.get('metadata') - if "pagenumber" not in metadata.keys() and "pagenumber_or_timestamp" in metadata.keys(): + metadata = d.payload.get('metadata') # type: ignore + if "pagenumber" not in metadata.keys() and "pagenumber_or_timestamp" in metadata.keys(): # type: ignore # aiding in the database migration... - metadata["pagenumber"] = metadata["pagenumber_or_timestamp"] + metadata["pagenumber"] = metadata["pagenumber_or_timestamp"] # type: ignore - found_docs.append(Document(page_content=d.payload.get('page_content'), metadata=metadata)) + found_docs.append(Document(page_content=d.payload.get('page_content'), metadata=metadata)) # type: ignore # found_docs: list[Document] = [Document(page_content=str(d.payload.get('page_content')), metadata=d.payload.get('metadata')) for d in search_results] print("found_docs", found_docs) @@ -1000,13 +963,13 @@ def getTopContexts(self, search_query: str, course_name: str, token_limit: int = pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" # count tokens at start and end, then also count each context. - token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + search_query) + token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + search_query) # type: ignore valid_docs = [] num_tokens = 0 for doc in found_docs: doc_string = f"Document: {doc.metadata['readable_filename']}{', page: ' + str(doc.metadata['pagenumber']) if doc.metadata['pagenumber'] else ''}\n{str(doc.page_content)}\n" - num_tokens, prompt_cost = count_tokens_and_cost(doc_string) + num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, max_tokens: {token_limit}") if token_counter + num_tokens <= token_limit: @@ -1036,8 +999,8 @@ def get_stuffed_prompt(self, search_query: str, course_name: str, token_limit: i try: top_n = 150 start_time_overall = time.monotonic() - o = OpenAIEmbeddings() - user_query_embedding = o.embed_documents(search_query)[0] + o = OpenAIEmbeddings() # type: ignore + user_query_embedding = o.embed_documents(search_query)[0] # type: ignore myfilter = models.Filter( must=[ models.FieldCondition( @@ -1060,19 +1023,19 @@ def get_stuffed_prompt(self, search_query: str, course_name: str, token_limit: i pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" # count tokens at start and end, then also count each context. - token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + search_query) + token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + search_query) # type: ignore valid_docs = [] for d in found_docs: - if "pagenumber" not in d.payload["metadata"].keys(): - d.payload["metadata"]["pagenumber"] = d.payload["metadata"]["pagenumber_or_timestamp"] - doc_string = f"---\nDocument: {d.payload['metadata']['readable_filename']}{', page: ' + str(d.payload['metadata']['pagenumber']) if d.payload['metadata']['pagenumber'] else ''}\n{d.payload.get('page_content')}\n" - num_tokens, prompt_cost = count_tokens_and_cost(doc_string) + if "pagenumber" not in d.payload["metadata"].keys(): # type: ignore + d.payload["metadata"]["pagenumber"] = d.payload["metadata"]["pagenumber_or_timestamp"] # type: ignore + doc_string = f"---\nDocument: {d.payload['metadata']['readable_filename']}{', page: ' + str(d.payload['metadata']['pagenumber']) if d.payload['metadata']['pagenumber'] else ''}\n{d.payload.get('page_content')}\n" # type: ignore + num_tokens, prompt_cost = count_tokens_and_cost(doc_string) # type: ignore - print(f"Page: {d.payload.get('page_content')[:100]}...") + print(f"Page: {d.payload.get('page_content')[:100]}...") # type: ignore print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, token_limit: {token_limit}") if token_counter + num_tokens <= token_limit: token_counter += num_tokens - valid_docs.append(Document(page_content=d.payload.get('page_content'), metadata=d.payload.get('metadata'))) + valid_docs.append(Document(page_content=d.payload.get('page_content'), metadata=d.payload.get('metadata'))) # type: ignore else: continue print("running continue") @@ -1086,7 +1049,7 @@ def get_stuffed_prompt(self, search_query: str, course_name: str, token_limit: i # Create the stuffedPrompt stuffedPrompt = (pre_prompt + context_text + '\n\nNow please respond to my query: ' + search_query) - TOTAL_num_tokens, prompt_cost = count_tokens_and_cost(stuffedPrompt, openai_model_name='gpt-4') + TOTAL_num_tokens, prompt_cost = count_tokens_and_cost(stuffedPrompt, openai_model_name='gpt-4') # type: ignore print(f"Total tokens: {TOTAL_num_tokens}, prompt_cost: {prompt_cost}") print("total docs: ", len(found_docs)) print("num docs used: ", len(valid_docs)) @@ -1129,3 +1092,6 @@ def format_for_json(self, found_docs: List[Document]) -> List[Dict]: } for doc in found_docs] return contexts + +if __name__ == '__main__': + pass diff --git a/ai_ta_backend/web_scrape.py b/ai_ta_backend/web_scrape.py index c8fc74d9..2d5327bb 100644 --- a/ai_ta_backend/web_scrape.py +++ b/ai_ta_backend/web_scrape.py @@ -240,7 +240,7 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url url_contents.append((url, s, filetype)) else: _invalid_urls.append(url) - + print("existing urls", _existing_urls) url_contents = remove_duplicates(url_contents, _existing_urls) max_urls = max_urls - len(url_contents) print(max_urls, "urls left") @@ -251,6 +251,7 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url if max_urls > 0: if _depth < max_depth: temp_data = crawler(url[0], max_urls, max_depth, timeout, _invalid_urls, _depth, url[1], url[2]) + print("existing urls", _existing_urls) temp_data = remove_duplicates(temp_data, _existing_urls) max_urls = max_urls - len(temp_data) print(max_urls, "urls left") @@ -272,6 +273,16 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url else: print("Exceeded Max URLS, found:", len(url_contents), "out of", amount) print(len(url_contents), "urls found") + + # Free up memory + # del url_contents[:] + # del urls[:] + # if _invalid_urls is not None: + # del _invalid_urls[:] + # if _existing_urls is not None: + # del _existing_urls[:] + # gc.collect() + return url_contents def is_github_repo(url): @@ -320,24 +331,31 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti print("Begin Ingesting GitHub page") results = ingester.ingest_github(url, course_name) print("Finished ingesting GitHub page") + del ingester return results else: - print("Gathering existing urls from Supabase") - supabase_client = supabase.create_client( # type: ignore - supabase_url=os.getenv('SUPABASE_URL'), # type: ignore - supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore - urls = supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).select('course_name, url, contexts').eq('course_name', course_name).execute() - if urls.data == []: - existing_urls = [] - else: + try: + print("Gathering existing urls from Supabase") + supabase_client = supabase.create_client( # type: ignore + supabase_url=os.getenv('SUPABASE_URL'), # type: ignore + supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore + urls = supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).select('course_name, url, contexts').eq('course_name', course_name).execute() + del supabase_client + if urls.data == []: + existing_urls = [] + else: + existing_urls = [] + for thing in urls.data: + whole = '' + for t in thing['contexts']: + whole += t['text'] + existing_urls.append((thing['url'], whole)) + print("Finished gathering existing urls from Supabase") + print("Length of existing urls:", len(existing_urls)) + except Exception as e: + print("Error:", e) + print("Could not gather existing urls from Supabase") existing_urls = [] - for thing in urls.data: - whole = '' - for t in thing['contexts']: - whole += t['text'] - existing_urls.append((thing['url'], whole)) - print("Finished gathering existing urls from Supabase") - print("Length of existing urls:", len(existing_urls)) print("Begin Ingesting Web page") data = crawler(url=url, max_urls=max_urls, max_depth=max_depth, timeout=timeout, base_url_on=baseurl, _existing_urls=existing_urls) @@ -373,13 +391,12 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti if value == "403_Forbidden": print("Found Forbidden Key, deleting data") del data[counter] + counter -= 1 else: path_name.append(value) counter += 1 - print("Cleaned title names", path_name) - # Upload each html to S3 print("Uploading files to S3") paths = [] @@ -406,8 +423,10 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti print("No", key[2] ,"to upload", key[1]) except Exception as e: print("Error in upload:", e) + finally: + del ingester - print("Successfully uploaded", counter, "files to S3") + print(f"Successfully uploaded files to s3: {counter}") print("Finished /web-scrape") # Download an MIT course using its url @@ -451,6 +470,9 @@ def mit_course_download(url:str, course_name:str, local_dir:str): shutil.move(zip_file, local_dir) shutil.rmtree(local_dir) + del ingester print("Finished Ingest") return success_fail +if __name__ == '__main__': + pass diff --git a/run.sh b/run.sh index 02359fd1..9a09c44d 100755 --- a/run.sh +++ b/run.sh @@ -1,4 +1,6 @@ #!/bin/bash +# Docs https://docs.gunicorn.org/en/stable/settings.html#workers + export PYTHONPATH=$PYTHONPATH:$(pwd)/ai_ta_backend -exec gunicorn --workers=6 --threads=6 --worker-class=gthread ai_ta_backend.main:app --timeout 108000 \ No newline at end of file +exec gunicorn --workers=2 --threads=16 --worker-class=gthread ai_ta_backend.main:app --timeout 1800 --max-requests 20 \ No newline at end of file