From 4f38670a7509ac04423a9dde90aef2d4c45a22b8 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 12 Sep 2023 15:19:40 -0500 Subject: [PATCH] added missing emoji --- .env.template | 68 +- .gitignore | 334 +++---- ai_ta_backend/extreme_context_stuffing.py | 1082 ++++++++++----------- ai_ta_backend/nomic_logging.py | 4 + ai_ta_backend/nomic_map_creation.ipynb | 1031 +++++++++++++------- ai_ta_backend/utils_tokenization.py | 270 ++--- ai_ta_backend/web_scrape.py | 934 +++++++++--------- 7 files changed, 2021 insertions(+), 1702 deletions(-) diff --git a/.env.template b/.env.template index ba04c704..5c5520de 100644 --- a/.env.template +++ b/.env.template @@ -1,34 +1,34 @@ -# Supabase SQL -SUPABASE_URL= -SUPABASE_API_KEY= -SUPABASE_READ_ONLY= -SUPABASE_JWT_SECRET= - -MATERIALS_SUPABASE_TABLE=uiuc_chatbot -NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE=documents - -# QDRANT -QDRANT_COLLECTION_NAME=uiuc-chatbot -DEV_QDRANT_COLLECTION_NAME=dev -QDRANT_URL= -QDRANT_API_KEY= - -REFACTORED_MATERIALS_SUPABASE_TABLE= - -# AWS -S3_BUCKET_NAME=uiuc-chatbot -AWS_ACCESS_KEY_ID= -AWS_SECRET_ACCESS_KEY= - -OPENAI_API_KEY= - -NOMIC_API_KEY= -LINTRULE_SECRET= - -# Github Agent -GITHUB_APP_ID= -GITHUB_APP_PRIVATE_KEY="-----BEGIN RSA PRIVATE KEY----- - ------END RSA PRIVATE KEY-----" - -NUMEXPR_MAX_THREADS=2 +# Supabase SQL +SUPABASE_URL= +SUPABASE_API_KEY= +SUPABASE_READ_ONLY= +SUPABASE_JWT_SECRET= + +MATERIALS_SUPABASE_TABLE=uiuc_chatbot +NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE=documents + +# QDRANT +QDRANT_COLLECTION_NAME=uiuc-chatbot +DEV_QDRANT_COLLECTION_NAME=dev +QDRANT_URL= +QDRANT_API_KEY= + +REFACTORED_MATERIALS_SUPABASE_TABLE= + +# AWS +S3_BUCKET_NAME=uiuc-chatbot +AWS_ACCESS_KEY_ID= +AWS_SECRET_ACCESS_KEY= + +OPENAI_API_KEY= + +NOMIC_API_KEY= +LINTRULE_SECRET= + +# Github Agent +GITHUB_APP_ID= +GITHUB_APP_PRIVATE_KEY="-----BEGIN RSA PRIVATE KEY----- + +-----END RSA PRIVATE KEY-----" + +NUMEXPR_MAX_THREADS=2 diff --git a/.gitignore b/.gitignore index 70babf88..3db8ad0c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,167 +1,167 @@ -# don't sync coursera docs -coursera-dl/ -*parsed.json -wandb - -# don't expose env files -dummy.ipynb -.env -# Created by https://www.toptal.com/developers/gitignore/api/python -# Edit at https://www.toptal.com/developers/gitignore?templates=python - -### Python ### -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. 
-*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coveage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -pytestdebug.log - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ -doc/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ -pythonenv* - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# profiling data -.prof - -# Virtualenv -# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ -.Python -[Bb]in -[Ii]nclude -[Ll]ib -[Ll]ib64 -[Ll]ocal -[Ss]cripts -pyvenv.cfg -.venv -pip-selfcheck.json - - -# End of https://www.toptal.com/developers/gitignore/api/python -.aider* +# don't sync coursera docs +coursera-dl/ +*parsed.json +wandb + +# don't expose env files +dummy.ipynb +.env +# Created by https://www.toptal.com/developers/gitignore/api/python +# Edit at https://www.toptal.com/developers/gitignore?templates=python + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coveage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +pytestdebug.log + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ +doc/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. 
+#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pythonenv* + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# profiling data +.prof + +# Virtualenv +# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ +.Python +[Bb]in +[Ii]nclude +[Ll]ib +[Ll]ib64 +[Ll]ocal +[Ss]cripts +pyvenv.cfg +.venv +pip-selfcheck.json + + +# End of https://www.toptal.com/developers/gitignore/api/python +.aider* diff --git a/ai_ta_backend/extreme_context_stuffing.py b/ai_ta_backend/extreme_context_stuffing.py index ed133a6a..03b56e86 100644 --- a/ai_ta_backend/extreme_context_stuffing.py +++ b/ai_ta_backend/extreme_context_stuffing.py @@ -1,541 +1,541 @@ -""" -API REQUEST PARALLEL PROCESSOR - -Using the OpenAI API to process lots of text quickly takes some care. -If you trickle in a million API requests one by one, they'll take days to complete. -If you flood a million API requests in parallel, they'll exceed the rate limits and fail with errors. -To maximize throughput, parallel requests need to be throttled to stay under rate limits. - -This script parallelizes requests to the OpenAI API while throttling to stay under rate limits. - -Features: -- Streams requests from file, to avoid running out of memory for giant jobs -- Makes requests concurrently, to maximize throughput -- Throttles request and token usage, to stay under rate limits -- Retries failed requests up to {max_attempts} times, to avoid missing data -- Logs errors, to diagnose problems with requests - -Example command to call script: -``` -python examples/api_request_parallel_processor.py \ - --requests_filepath examples/data/example_requests_to_parallel_process.jsonl \ - --save_filepath examples/data/example_requests_to_parallel_process_results.jsonl \ - --request_url https://api.openai.com/v1/embeddings \ - --max_requests_per_minute 1500 \ - --max_tokens_per_minute 6250000 \ - --token_encoding_name cl100k_base \ - --max_attempts 5 \ - --logging_level 20 -``` - -Inputs: -- requests_filepath : str - - path to the file containing the requests to be processed - - file should be a jsonl file, where each line is a json object with API parameters and an optional metadata field - - e.g., {"model": "text-embedding-ada-002", "input": "embed me", "metadata": {"row_id": 1}} - - as with all jsonl files, take care that newlines in the content are properly escaped (json.dumps does this automatically) - - an example file is provided at examples/data/example_requests_to_parallel_process.jsonl - - the code to generate the example file is appended to the bottom of this script -- save_filepath : str, optional - - path to the file where the results will be saved - - file will be a jsonl file, where each line is an array with the original request plus the API response - - e.g., [{"model": "text-embedding-ada-002", "input": "embed me"}, {...}] - - if omitted, results will be saved to {requests_filename}_results.jsonl -- request_url : str, optional - - URL of the API endpoint to call - - if omitted, will default to "https://api.openai.com/v1/embeddings" -- api_key : str, optional - - API key to use - - if omitted, the script will attempt to read it from an 
environment variable {os.getenv("OPENAI_API_KEY")} -- max_requests_per_minute : float, optional - - target number of requests to make per minute (will make less if limited by tokens) - - leave headroom by setting this to 50% or 75% of your limit - - if requests are limiting you, try batching multiple embeddings or completions into one request - - if omitted, will default to 1,500 -- max_tokens_per_minute : float, optional - - target number of tokens to use per minute (will use less if limited by requests) - - leave headroom by setting this to 50% or 75% of your limit - - if omitted, will default to 125,000 -- token_encoding_name : str, optional - - name of the token encoding used, as defined in the `tiktoken` package - - if omitted, will default to "cl100k_base" (used by `text-embedding-ada-002`) -- max_attempts : int, optional - - number of times to retry a failed request before giving up - - if omitted, will default to 5 -- logging_level : int, optional - - level of logging to use; higher numbers will log fewer messages - - 40 = ERROR; will log only when requests fail after all retries - - 30 = WARNING; will log when requests his rate limits or other errors - - 20 = INFO; will log when requests start and the status at finish - - 10 = DEBUG; will log various things as the loop runs to see when they occur - - if omitted, will default to 20 (INFO). - -The script is structured as follows: - - Imports - - Define main() - - Initialize things - - In main loop: - - Get next request if one is not already waiting for capacity - - Update available token & request capacity - - If enough capacity available, call API - - The loop pauses if a rate limit error is hit - - The loop breaks when no tasks remain - - Define dataclasses - - StatusTracker (stores script metadata counters; only one instance is created) - - APIRequest (stores API inputs, outputs, metadata; one method to call API) - - Define functions - - api_endpoint_from_url (extracts API endpoint from request URL) - - append_to_jsonl (writes to results file) - - num_tokens_consumed_from_request (bigger function to infer token usage from request) - - task_id_generator_function (yields 1, 2, 3, ...) 
- - Run main() -""" - -# import argparse -# import subprocess -# import tempfile -# from langchain.llms import OpenAI -import asyncio -import json -import logging -import os -import re -import time -from dataclasses import ( # for storing API inputs, outputs, and metadata - dataclass, field) -from typing import Any, List - -import aiohttp # for making API calls concurrently -import tiktoken # for counting tokens -from langchain.embeddings.openai import OpenAIEmbeddings -from langchain.vectorstores import Qdrant -from qdrant_client import QdrantClient, models - - -class OpenAIAPIProcessor: - - def __init__(self, input_prompts_list, request_url, api_key, max_requests_per_minute, max_tokens_per_minute, token_encoding_name, - max_attempts, logging_level): - self.request_url = request_url - self.api_key = api_key - self.max_requests_per_minute = max_requests_per_minute - self.max_tokens_per_minute = max_tokens_per_minute - self.token_encoding_name = token_encoding_name - self.max_attempts = max_attempts - self.logging_level = logging_level - self.input_prompts_list: List[dict] = input_prompts_list - self.results = [] - self.cleaned_results: List[str] = [] - - async def process_api_requests_from_file(self): - """Processes API requests in parallel, throttling to stay under rate limits.""" - # constants - seconds_to_pause_after_rate_limit_error = 15 - seconds_to_sleep_each_loop = 0.001 # 1 ms limits max throughput to 1,000 requests per second - - # initialize logging - logging.basicConfig(level=self.logging_level) - logging.debug(f"Logging initialized at level {self.logging_level}") - - # infer API endpoint and construct request header - api_endpoint = api_endpoint_from_url(self.request_url) - request_header = {"Authorization": f"Bearer {self.api_key}"} - - # initialize trackers - queue_of_requests_to_retry = asyncio.Queue() - task_id_generator = task_id_generator_function() # generates integer IDs of 1, 2, 3, ... - status_tracker = StatusTracker() # single instance to track a collection of variables - next_request = None # variable to hold the next request to call - - # initialize available capacity counts - available_request_capacity = self.max_requests_per_minute - available_token_capacity = self.max_tokens_per_minute - last_update_time = time.time() - - # initialize flags - file_not_finished = True # after file is empty, we'll skip reading it - logging.debug(f"Initialization complete.") - - requests = self.input_prompts_list.__iter__() - - logging.debug(f"File opened. 
Entering main loop") - - task_list = [] - - while True: - # get next request (if one is not already waiting for capacity) - if next_request is None: - if not queue_of_requests_to_retry.empty(): - next_request = queue_of_requests_to_retry.get_nowait() - logging.debug(f"Retrying request {next_request.task_id}: {next_request}") - elif file_not_finished: - try: - # get new request - # request_json = json.loads(next(requests)) - request_json = next(requests) - - next_request = APIRequest(task_id=next(task_id_generator), - request_json=request_json, - token_consumption=num_tokens_consumed_from_request(request_json, api_endpoint, - self.token_encoding_name), - attempts_left=self.max_attempts, - metadata=request_json.pop("metadata", None)) - status_tracker.num_tasks_started += 1 - status_tracker.num_tasks_in_progress += 1 - logging.debug(f"Reading request {next_request.task_id}: {next_request}") - except StopIteration: - # if file runs out, set flag to stop reading it - logging.debug("Read file exhausted") - file_not_finished = False - - # update available capacity - current_time = time.time() - seconds_since_update = current_time - last_update_time - available_request_capacity = min( - available_request_capacity + self.max_requests_per_minute * seconds_since_update / 60.0, - self.max_requests_per_minute, - ) - available_token_capacity = min( - available_token_capacity + self.max_tokens_per_minute * seconds_since_update / 60.0, - self.max_tokens_per_minute, - ) - last_update_time = current_time - - # if enough capacity available, call API - if next_request: - next_request_tokens = next_request.token_consumption - if (available_request_capacity >= 1 and available_token_capacity >= next_request_tokens): - # update counters - available_request_capacity -= 1 - available_token_capacity -= next_request_tokens - next_request.attempts_left -= 1 - - # call API - # TODO: NOT SURE RESPONSE WILL WORK HERE - task = asyncio.create_task( - next_request.call_api( - request_url=self.request_url, - request_header=request_header, - retry_queue=queue_of_requests_to_retry, - status_tracker=status_tracker, - )) - task_list.append(task) - next_request = None # reset next_request to empty - - # print("status_tracker.num_tasks_in_progress", status_tracker.num_tasks_in_progress) - # one_task_result = task.result() - # print("one_task_result", one_task_result) - - # if all tasks are finished, break - if status_tracker.num_tasks_in_progress == 0: - break - - # main loop sleeps briefly so concurrent tasks can run - await asyncio.sleep(seconds_to_sleep_each_loop) - - # if a rate limit error was hit recently, pause to cool down - seconds_since_rate_limit_error = (time.time() - status_tracker.time_of_last_rate_limit_error) - if seconds_since_rate_limit_error < seconds_to_pause_after_rate_limit_error: - remaining_seconds_to_pause = (seconds_to_pause_after_rate_limit_error - seconds_since_rate_limit_error) - await asyncio.sleep(remaining_seconds_to_pause) - # ^e.g., if pause is 15 seconds and final limit was hit 5 seconds ago - logging.warn( - f"Pausing to cool down until {time.ctime(status_tracker.time_of_last_rate_limit_error + seconds_to_pause_after_rate_limit_error)}" - ) - - # after finishing, log final status - logging.info(f"""Parallel processing complete. 
About to return.""") - if status_tracker.num_tasks_failed > 0: - logging.warning(f"{status_tracker.num_tasks_failed} / {status_tracker.num_tasks_started} requests failed.") - if status_tracker.num_rate_limit_errors > 0: - logging.warning(f"{status_tracker.num_rate_limit_errors} rate limit errors received. Consider running at a lower rate.") - - # asyncio wait for task_list - await asyncio.wait(task_list) - - for task in task_list: - openai_completion = task.result() - self.results.append(openai_completion) - - self.cleaned_results: List[str] = extract_context_from_results(self.results) - - -def extract_context_from_results(results: List[Any]) -> List[str]: - assistant_contents = [] - total_prompt_tokens = 0 - total_completion_tokens = 0 - - for element in results: - if element is not None: - for item in element: - if 'choices' in item: - for choice in item['choices']: - if choice['message']['role'] == 'assistant': - assistant_contents.append(choice['message']['content']) - total_prompt_tokens += item['usage']['prompt_tokens'] - total_completion_tokens += item['usage']['completion_tokens'] - # Note: I don't think the prompt_tokens or completion_tokens is working quite right... - - return assistant_contents - - -# dataclasses - - -@dataclass -class StatusTracker: - """Stores metadata about the script's progress. Only one instance is created.""" - - num_tasks_started: int = 0 - num_tasks_in_progress: int = 0 # script ends when this reaches 0 - num_tasks_succeeded: int = 0 - num_tasks_failed: int = 0 - num_rate_limit_errors: int = 0 - num_api_errors: int = 0 # excluding rate limit errors, counted above - num_other_errors: int = 0 - time_of_last_rate_limit_error: float = 0 # used to cool off after hitting rate limits - - -@dataclass -class APIRequest: - """Stores an API request's inputs, outputs, and other metadata. Contains a method to make an API call.""" - - task_id: int - request_json: dict - token_consumption: int - attempts_left: int - metadata: dict - result: list = field(default_factory=list) - - async def call_api( - self, - request_url: str, - request_header: dict, - retry_queue: asyncio.Queue, - status_tracker: StatusTracker, - ): - """Calls the OpenAI API and saves results.""" - # logging.info(f"Starting request #{self.task_id}") - error = None - try: - async with aiohttp.ClientSession() as session: - async with session.post(url=request_url, headers=request_header, json=self.request_json) as response: - response = await response.json() - if "error" in response: - logging.warning(f"Request {self.task_id} failed with error {response['error']}") - status_tracker.num_api_errors += 1 - error = response - if "Rate limit" in response["error"].get("message", ""): - status_tracker.time_of_last_rate_limit_error = time.time() - status_tracker.num_rate_limit_errors += 1 - status_tracker.num_api_errors -= 1 # rate limit errors are counted separately - - except Exception as e: # catching naked exceptions is bad practice, but in this case we'll log & save them - logging.warning(f"Request {self.task_id} failed with Exception {e}") - status_tracker.num_other_errors += 1 - error = e - if error: - self.result.append(error) - if self.attempts_left: - retry_queue.put_nowait(self) - else: - logging.error(f"Request {self.request_json} failed after all attempts. 
Saving errors: {self.result}") - data = ([self.request_json, [str(e) for e in self.result], self.metadata] - if self.metadata else [self.request_json, [str(e) for e in self.result]]) - #append_to_jsonl(data, save_filepath) - status_tracker.num_tasks_in_progress -= 1 - status_tracker.num_tasks_failed += 1 - return data - else: - data = ([self.request_json, response, self.metadata] if self.metadata else [self.request_json, response]) # type: ignore - #append_to_jsonl(data, save_filepath) - status_tracker.num_tasks_in_progress -= 1 - status_tracker.num_tasks_succeeded += 1 - # logging.debug(f"Request {self.task_id} saved to {save_filepath}") - - return data - - -# functions - - -def api_endpoint_from_url(request_url: str): - """Extract the API endpoint from the request URL.""" - match = re.search('^https://[^/]+/v\\d+/(.+)$', request_url) - return match[1] # type: ignore - - -def append_to_jsonl(data, filename: str) -> None: - """Append a json payload to the end of a jsonl file.""" - json_string = json.dumps(data) - with open(filename, "a") as f: - f.write(json_string + "\n") - - -def num_tokens_consumed_from_request( - request_json: dict, - api_endpoint: str, - token_encoding_name: str, -): - """Count the number of tokens in the request. Only supports completion and embedding requests.""" - encoding = tiktoken.get_encoding(token_encoding_name) - # if completions request, tokens = prompt + n * max_tokens - if api_endpoint.endswith("completions"): - max_tokens = request_json.get("max_tokens", 15) - n = request_json.get("n", 1) - completion_tokens = n * max_tokens - - # chat completions - if api_endpoint.startswith("chat/"): - num_tokens = 0 - for message in request_json["messages"]: - num_tokens += 4 # every message follows {role/name}\n{content}\n - for key, value in message.items(): - num_tokens += len(encoding.encode(value)) - if key == "name": # if there's a name, the role is omitted - num_tokens -= 1 # role is always required and always 1 token - num_tokens += 2 # every reply is primed with assistant - return num_tokens + completion_tokens - # normal completions - else: - prompt = request_json["prompt"] - if isinstance(prompt, str): # single prompt - prompt_tokens = len(encoding.encode(prompt)) - num_tokens = prompt_tokens + completion_tokens - return num_tokens - elif isinstance(prompt, list): # multiple prompts - prompt_tokens = sum([len(encoding.encode(p)) for p in prompt]) - num_tokens = prompt_tokens + completion_tokens * len(prompt) - return num_tokens - else: - raise TypeError('Expecting either string or list of strings for "prompt" field in completion request') - # if embeddings request, tokens = input tokens - elif api_endpoint == "embeddings": - input = request_json["input"] - if isinstance(input, str): # single input - num_tokens = len(encoding.encode(input)) - return num_tokens - elif isinstance(input, list): # multiple inputs - num_tokens = sum([len(encoding.encode(i)) for i in input]) - return num_tokens - else: - raise TypeError('Expecting either string or list of strings for "inputs" field in embedding request') - # more logic needed to support other API calls (e.g., edits, inserts, DALL-E) - else: - raise NotImplementedError(f'API endpoint "{api_endpoint}" not implemented in this script') - - -def task_id_generator_function(): - """Generate integers 0, 1, 2, and so on.""" - task_id = 0 - while True: - yield task_id - task_id += 1 - -if __name__ == '__main__': - pass - -# run script -# if __name__ == "__main__": -# qdrant_client = QdrantClient( -# 
url=os.getenv('QDRANT_URL'), -# api_key=os.getenv('QDRANT_API_KEY'), -# ) -# vectorstore = Qdrant( -# client=qdrant_client, -# collection_name=os.getenv('QDRANT_COLLECTION_NAME'), # type: ignore -# embeddings=OpenAIEmbeddings()) # type: ignore - -# user_question = "What is the significance of Six Sigma?" -# k = 4 -# fetch_k = 200 -# found_docs = vectorstore.max_marginal_relevance_search(user_question, k=k, fetch_k=200) - -# requests = [] -# for i, doc in enumerate(found_docs): -# dictionary = { -# "model": "gpt-3.5-turbo-0613", # 4k context -# "messages": [{ -# "role": "system", -# "content": "You are a factual summarizer of partial documents. Stick to the facts (including partial info when necessary to avoid making up potentially incorrect details), and say I don't know when necessary." -# }, { -# "role": -# "user", -# "content": -# f"What is a comprehensive summary of the given text, based on the question:\n{doc.page_content}\nQuestion: {user_question}\nThe summary should cover all the key points only relevant to the question, while also condensing the information into a concise and easy-to-understand format. Please ensure that the summary includes relevant details and examples that support the main ideas, while avoiding any unnecessary information or repetition. Feel free to include references, sentence fragments, keywords, or anything that could help someone learn about it, only as it relates to the given question. The length of the summary should be as short as possible, without losing relevant information.\n" -# }], -# "n": 1, -# "max_tokens": 500, -# "metadata": doc.metadata -# } -# requests.append(dictionary) - -# oai = OpenAIAPIProcessor( -# input_prompts_list=requests, -# request_url='https://api.openai.com/v1/chat/completions', -# api_key=os.getenv("OPENAI_API_KEY"), -# max_requests_per_minute=1500, -# max_tokens_per_minute=90000, -# token_encoding_name='cl100k_base', -# max_attempts=5, -# logging_level=20, -# ) -# # run script -# asyncio.run(oai.process_api_requests_from_file()) - -# assistant_contents = [] -# total_prompt_tokens = 0 -# total_completion_tokens = 0 - -# print("Results, end of main: ", oai.results) -# print("-"*50) - -# # jsonObject = json.loads(oai.results) -# for element in oai.results: -# for item in element: -# if 'choices' in item: -# for choice in item['choices']: -# if choice['message']['role'] == 'assistant': -# assistant_contents.append(choice['message']['content']) -# total_prompt_tokens += item['usage']['prompt_tokens'] -# total_completion_tokens += item['usage']['completion_tokens'] - -# print("Assistant Contents:", assistant_contents) -# print("Total Prompt Tokens:", total_prompt_tokens) -# print("Total Completion Tokens:", total_completion_tokens) -# turbo_total_cost = (total_prompt_tokens * 0.0015) + (total_completion_tokens * 0.002) -# print("Total cost (3.5-turbo):", (total_prompt_tokens * 0.0015), " + Completions: ", (total_completion_tokens * 0.002), " = ", turbo_total_cost) - -# gpt4_total_cost = (total_prompt_tokens * 0.03) + (total_completion_tokens * 0.06) -# print("Hypothetical cost for GPT-4:", (total_prompt_tokens * 0.03), " + Completions: ", (total_completion_tokens * 0.06), " = ", gpt4_total_cost) -# print("GPT-4 cost premium: ", (gpt4_total_cost / turbo_total_cost), "x") - ''' - Pricing: - GPT4: - * $0.03 prompt - * $0.06 completions - 3.5-turbo: - * $0.0015 prompt - * $0.002 completions - ''' -""" -APPENDIX - -The example requests file at openai-cookbook/examples/data/example_requests_to_parallel_process.jsonl contains 10,000 
requests to text-embedding-ada-002. - -It was generated with the following code: - -```python -import json - -filename = "data/example_requests_to_parallel_process.jsonl" -n_requests = 10_000 -jobs = [{"model": "text-embedding-ada-002", "input": str(x) + "\n"} for x in range(n_requests)] -with open(filename, "w") as f: - for job in jobs: - json_string = json.dumps(job) - f.write(json_string + "\n") -``` - -As with all jsonl files, take care that newlines in the content are properly escaped (json.dumps does this automatically). -""" +""" +API REQUEST PARALLEL PROCESSOR + +Using the OpenAI API to process lots of text quickly takes some care. +If you trickle in a million API requests one by one, they'll take days to complete. +If you flood a million API requests in parallel, they'll exceed the rate limits and fail with errors. +To maximize throughput, parallel requests need to be throttled to stay under rate limits. + +This script parallelizes requests to the OpenAI API while throttling to stay under rate limits. + +Features: +- Streams requests from file, to avoid running out of memory for giant jobs +- Makes requests concurrently, to maximize throughput +- Throttles request and token usage, to stay under rate limits +- Retries failed requests up to {max_attempts} times, to avoid missing data +- Logs errors, to diagnose problems with requests + +Example command to call script: +``` +python examples/api_request_parallel_processor.py \ + --requests_filepath examples/data/example_requests_to_parallel_process.jsonl \ + --save_filepath examples/data/example_requests_to_parallel_process_results.jsonl \ + --request_url https://api.openai.com/v1/embeddings \ + --max_requests_per_minute 1500 \ + --max_tokens_per_minute 6250000 \ + --token_encoding_name cl100k_base \ + --max_attempts 5 \ + --logging_level 20 +``` + +Inputs: +- requests_filepath : str + - path to the file containing the requests to be processed + - file should be a jsonl file, where each line is a json object with API parameters and an optional metadata field + - e.g., {"model": "text-embedding-ada-002", "input": "embed me", "metadata": {"row_id": 1}} + - as with all jsonl files, take care that newlines in the content are properly escaped (json.dumps does this automatically) + - an example file is provided at examples/data/example_requests_to_parallel_process.jsonl + - the code to generate the example file is appended to the bottom of this script +- save_filepath : str, optional + - path to the file where the results will be saved + - file will be a jsonl file, where each line is an array with the original request plus the API response + - e.g., [{"model": "text-embedding-ada-002", "input": "embed me"}, {...}] + - if omitted, results will be saved to {requests_filename}_results.jsonl +- request_url : str, optional + - URL of the API endpoint to call + - if omitted, will default to "https://api.openai.com/v1/embeddings" +- api_key : str, optional + - API key to use + - if omitted, the script will attempt to read it from an environment variable {os.getenv("OPENAI_API_KEY")} +- max_requests_per_minute : float, optional + - target number of requests to make per minute (will make less if limited by tokens) + - leave headroom by setting this to 50% or 75% of your limit + - if requests are limiting you, try batching multiple embeddings or completions into one request + - if omitted, will default to 1,500 +- max_tokens_per_minute : float, optional + - target number of tokens to use per minute (will use less if limited by requests) + - leave 
headroom by setting this to 50% or 75% of your limit + - if omitted, will default to 125,000 +- token_encoding_name : str, optional + - name of the token encoding used, as defined in the `tiktoken` package + - if omitted, will default to "cl100k_base" (used by `text-embedding-ada-002`) +- max_attempts : int, optional + - number of times to retry a failed request before giving up + - if omitted, will default to 5 +- logging_level : int, optional + - level of logging to use; higher numbers will log fewer messages + - 40 = ERROR; will log only when requests fail after all retries + - 30 = WARNING; will log when requests his rate limits or other errors + - 20 = INFO; will log when requests start and the status at finish + - 10 = DEBUG; will log various things as the loop runs to see when they occur + - if omitted, will default to 20 (INFO). + +The script is structured as follows: + - Imports + - Define main() + - Initialize things + - In main loop: + - Get next request if one is not already waiting for capacity + - Update available token & request capacity + - If enough capacity available, call API + - The loop pauses if a rate limit error is hit + - The loop breaks when no tasks remain + - Define dataclasses + - StatusTracker (stores script metadata counters; only one instance is created) + - APIRequest (stores API inputs, outputs, metadata; one method to call API) + - Define functions + - api_endpoint_from_url (extracts API endpoint from request URL) + - append_to_jsonl (writes to results file) + - num_tokens_consumed_from_request (bigger function to infer token usage from request) + - task_id_generator_function (yields 1, 2, 3, ...) + - Run main() +""" + +# import argparse +# import subprocess +# import tempfile +# from langchain.llms import OpenAI +import asyncio +import json +import logging +import os +import re +import time +from dataclasses import ( # for storing API inputs, outputs, and metadata + dataclass, field) +from typing import Any, List + +import aiohttp # for making API calls concurrently +import tiktoken # for counting tokens +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.vectorstores import Qdrant +from qdrant_client import QdrantClient, models + + +class OpenAIAPIProcessor: + + def __init__(self, input_prompts_list, request_url, api_key, max_requests_per_minute, max_tokens_per_minute, token_encoding_name, + max_attempts, logging_level): + self.request_url = request_url + self.api_key = api_key + self.max_requests_per_minute = max_requests_per_minute + self.max_tokens_per_minute = max_tokens_per_minute + self.token_encoding_name = token_encoding_name + self.max_attempts = max_attempts + self.logging_level = logging_level + self.input_prompts_list: List[dict] = input_prompts_list + self.results = [] + self.cleaned_results: List[str] = [] + + async def process_api_requests_from_file(self): + """Processes API requests in parallel, throttling to stay under rate limits.""" + # constants + seconds_to_pause_after_rate_limit_error = 15 + seconds_to_sleep_each_loop = 0.001 # 1 ms limits max throughput to 1,000 requests per second + + # initialize logging + logging.basicConfig(level=self.logging_level) + logging.debug(f"Logging initialized at level {self.logging_level}") + + # infer API endpoint and construct request header + api_endpoint = api_endpoint_from_url(self.request_url) + request_header = {"Authorization": f"Bearer {self.api_key}"} + + # initialize trackers + queue_of_requests_to_retry = asyncio.Queue() + task_id_generator = 
task_id_generator_function() # generates integer IDs of 1, 2, 3, ... + status_tracker = StatusTracker() # single instance to track a collection of variables + next_request = None # variable to hold the next request to call + + # initialize available capacity counts + available_request_capacity = self.max_requests_per_minute + available_token_capacity = self.max_tokens_per_minute + last_update_time = time.time() + + # initialize flags + file_not_finished = True # after file is empty, we'll skip reading it + logging.debug(f"Initialization complete.") + + requests = self.input_prompts_list.__iter__() + + logging.debug(f"File opened. Entering main loop") + + task_list = [] + + while True: + # get next request (if one is not already waiting for capacity) + if next_request is None: + if not queue_of_requests_to_retry.empty(): + next_request = queue_of_requests_to_retry.get_nowait() + logging.debug(f"Retrying request {next_request.task_id}: {next_request}") + elif file_not_finished: + try: + # get new request + # request_json = json.loads(next(requests)) + request_json = next(requests) + + next_request = APIRequest(task_id=next(task_id_generator), + request_json=request_json, + token_consumption=num_tokens_consumed_from_request(request_json, api_endpoint, + self.token_encoding_name), + attempts_left=self.max_attempts, + metadata=request_json.pop("metadata", None)) + status_tracker.num_tasks_started += 1 + status_tracker.num_tasks_in_progress += 1 + logging.debug(f"Reading request {next_request.task_id}: {next_request}") + except StopIteration: + # if file runs out, set flag to stop reading it + logging.debug("Read file exhausted") + file_not_finished = False + + # update available capacity + current_time = time.time() + seconds_since_update = current_time - last_update_time + available_request_capacity = min( + available_request_capacity + self.max_requests_per_minute * seconds_since_update / 60.0, + self.max_requests_per_minute, + ) + available_token_capacity = min( + available_token_capacity + self.max_tokens_per_minute * seconds_since_update / 60.0, + self.max_tokens_per_minute, + ) + last_update_time = current_time + + # if enough capacity available, call API + if next_request: + next_request_tokens = next_request.token_consumption + if (available_request_capacity >= 1 and available_token_capacity >= next_request_tokens): + # update counters + available_request_capacity -= 1 + available_token_capacity -= next_request_tokens + next_request.attempts_left -= 1 + + # call API + # TODO: NOT SURE RESPONSE WILL WORK HERE + task = asyncio.create_task( + next_request.call_api( + request_url=self.request_url, + request_header=request_header, + retry_queue=queue_of_requests_to_retry, + status_tracker=status_tracker, + )) + task_list.append(task) + next_request = None # reset next_request to empty + + # print("status_tracker.num_tasks_in_progress", status_tracker.num_tasks_in_progress) + # one_task_result = task.result() + # print("one_task_result", one_task_result) + + # if all tasks are finished, break + if status_tracker.num_tasks_in_progress == 0: + break + + # main loop sleeps briefly so concurrent tasks can run + await asyncio.sleep(seconds_to_sleep_each_loop) + + # if a rate limit error was hit recently, pause to cool down + seconds_since_rate_limit_error = (time.time() - status_tracker.time_of_last_rate_limit_error) + if seconds_since_rate_limit_error < seconds_to_pause_after_rate_limit_error: + remaining_seconds_to_pause = (seconds_to_pause_after_rate_limit_error - 
seconds_since_rate_limit_error) + await asyncio.sleep(remaining_seconds_to_pause) + # ^e.g., if pause is 15 seconds and final limit was hit 5 seconds ago + logging.warn( + f"Pausing to cool down until {time.ctime(status_tracker.time_of_last_rate_limit_error + seconds_to_pause_after_rate_limit_error)}" + ) + + # after finishing, log final status + logging.info(f"""Parallel processing complete. About to return.""") + if status_tracker.num_tasks_failed > 0: + logging.warning(f"{status_tracker.num_tasks_failed} / {status_tracker.num_tasks_started} requests failed.") + if status_tracker.num_rate_limit_errors > 0: + logging.warning(f"{status_tracker.num_rate_limit_errors} rate limit errors received. Consider running at a lower rate.") + + # asyncio wait for task_list + await asyncio.wait(task_list) + + for task in task_list: + openai_completion = task.result() + self.results.append(openai_completion) + + self.cleaned_results: List[str] = extract_context_from_results(self.results) + + +def extract_context_from_results(results: List[Any]) -> List[str]: + assistant_contents = [] + total_prompt_tokens = 0 + total_completion_tokens = 0 + + for element in results: + if element is not None: + for item in element: + if 'choices' in item: + for choice in item['choices']: + if choice['message']['role'] == 'assistant': + assistant_contents.append(choice['message']['content']) + total_prompt_tokens += item['usage']['prompt_tokens'] + total_completion_tokens += item['usage']['completion_tokens'] + # Note: I don't think the prompt_tokens or completion_tokens is working quite right... + + return assistant_contents + + +# dataclasses + + +@dataclass +class StatusTracker: + """Stores metadata about the script's progress. Only one instance is created.""" + + num_tasks_started: int = 0 + num_tasks_in_progress: int = 0 # script ends when this reaches 0 + num_tasks_succeeded: int = 0 + num_tasks_failed: int = 0 + num_rate_limit_errors: int = 0 + num_api_errors: int = 0 # excluding rate limit errors, counted above + num_other_errors: int = 0 + time_of_last_rate_limit_error: float = 0 # used to cool off after hitting rate limits + + +@dataclass +class APIRequest: + """Stores an API request's inputs, outputs, and other metadata. 
Contains a method to make an API call.""" + + task_id: int + request_json: dict + token_consumption: int + attempts_left: int + metadata: dict + result: list = field(default_factory=list) + + async def call_api( + self, + request_url: str, + request_header: dict, + retry_queue: asyncio.Queue, + status_tracker: StatusTracker, + ): + """Calls the OpenAI API and saves results.""" + # logging.info(f"Starting request #{self.task_id}") + error = None + try: + async with aiohttp.ClientSession() as session: + async with session.post(url=request_url, headers=request_header, json=self.request_json) as response: + response = await response.json() + if "error" in response: + logging.warning(f"Request {self.task_id} failed with error {response['error']}") + status_tracker.num_api_errors += 1 + error = response + if "Rate limit" in response["error"].get("message", ""): + status_tracker.time_of_last_rate_limit_error = time.time() + status_tracker.num_rate_limit_errors += 1 + status_tracker.num_api_errors -= 1 # rate limit errors are counted separately + + except Exception as e: # catching naked exceptions is bad practice, but in this case we'll log & save them + logging.warning(f"Request {self.task_id} failed with Exception {e}") + status_tracker.num_other_errors += 1 + error = e + if error: + self.result.append(error) + if self.attempts_left: + retry_queue.put_nowait(self) + else: + logging.error(f"Request {self.request_json} failed after all attempts. Saving errors: {self.result}") + data = ([self.request_json, [str(e) for e in self.result], self.metadata] + if self.metadata else [self.request_json, [str(e) for e in self.result]]) + #append_to_jsonl(data, save_filepath) + status_tracker.num_tasks_in_progress -= 1 + status_tracker.num_tasks_failed += 1 + return data + else: + data = ([self.request_json, response, self.metadata] if self.metadata else [self.request_json, response]) # type: ignore + #append_to_jsonl(data, save_filepath) + status_tracker.num_tasks_in_progress -= 1 + status_tracker.num_tasks_succeeded += 1 + # logging.debug(f"Request {self.task_id} saved to {save_filepath}") + + return data + + +# functions + + +def api_endpoint_from_url(request_url: str): + """Extract the API endpoint from the request URL.""" + match = re.search('^https://[^/]+/v\\d+/(.+)$', request_url) + return match[1] # type: ignore + + +def append_to_jsonl(data, filename: str) -> None: + """Append a json payload to the end of a jsonl file.""" + json_string = json.dumps(data) + with open(filename, "a") as f: + f.write(json_string + "\n") + + +def num_tokens_consumed_from_request( + request_json: dict, + api_endpoint: str, + token_encoding_name: str, +): + """Count the number of tokens in the request. 
Only supports completion and embedding requests.""" + encoding = tiktoken.get_encoding(token_encoding_name) + # if completions request, tokens = prompt + n * max_tokens + if api_endpoint.endswith("completions"): + max_tokens = request_json.get("max_tokens", 15) + n = request_json.get("n", 1) + completion_tokens = n * max_tokens + + # chat completions + if api_endpoint.startswith("chat/"): + num_tokens = 0 + for message in request_json["messages"]: + num_tokens += 4 # every message follows {role/name}\n{content}\n + for key, value in message.items(): + num_tokens += len(encoding.encode(value)) + if key == "name": # if there's a name, the role is omitted + num_tokens -= 1 # role is always required and always 1 token + num_tokens += 2 # every reply is primed with assistant + return num_tokens + completion_tokens + # normal completions + else: + prompt = request_json["prompt"] + if isinstance(prompt, str): # single prompt + prompt_tokens = len(encoding.encode(prompt)) + num_tokens = prompt_tokens + completion_tokens + return num_tokens + elif isinstance(prompt, list): # multiple prompts + prompt_tokens = sum([len(encoding.encode(p)) for p in prompt]) + num_tokens = prompt_tokens + completion_tokens * len(prompt) + return num_tokens + else: + raise TypeError('Expecting either string or list of strings for "prompt" field in completion request') + # if embeddings request, tokens = input tokens + elif api_endpoint == "embeddings": + input = request_json["input"] + if isinstance(input, str): # single input + num_tokens = len(encoding.encode(input)) + return num_tokens + elif isinstance(input, list): # multiple inputs + num_tokens = sum([len(encoding.encode(i)) for i in input]) + return num_tokens + else: + raise TypeError('Expecting either string or list of strings for "inputs" field in embedding request') + # more logic needed to support other API calls (e.g., edits, inserts, DALL-E) + else: + raise NotImplementedError(f'API endpoint "{api_endpoint}" not implemented in this script') + + +def task_id_generator_function(): + """Generate integers 0, 1, 2, and so on.""" + task_id = 0 + while True: + yield task_id + task_id += 1 + +if __name__ == '__main__': + pass + +# run script +# if __name__ == "__main__": +# qdrant_client = QdrantClient( +# url=os.getenv('QDRANT_URL'), +# api_key=os.getenv('QDRANT_API_KEY'), +# ) +# vectorstore = Qdrant( +# client=qdrant_client, +# collection_name=os.getenv('QDRANT_COLLECTION_NAME'), # type: ignore +# embeddings=OpenAIEmbeddings()) # type: ignore + +# user_question = "What is the significance of Six Sigma?" +# k = 4 +# fetch_k = 200 +# found_docs = vectorstore.max_marginal_relevance_search(user_question, k=k, fetch_k=200) + +# requests = [] +# for i, doc in enumerate(found_docs): +# dictionary = { +# "model": "gpt-3.5-turbo-0613", # 4k context +# "messages": [{ +# "role": "system", +# "content": "You are a factual summarizer of partial documents. Stick to the facts (including partial info when necessary to avoid making up potentially incorrect details), and say I don't know when necessary." +# }, { +# "role": +# "user", +# "content": +# f"What is a comprehensive summary of the given text, based on the question:\n{doc.page_content}\nQuestion: {user_question}\nThe summary should cover all the key points only relevant to the question, while also condensing the information into a concise and easy-to-understand format. 
Please ensure that the summary includes relevant details and examples that support the main ideas, while avoiding any unnecessary information or repetition. Feel free to include references, sentence fragments, keywords, or anything that could help someone learn about it, only as it relates to the given question. The length of the summary should be as short as possible, without losing relevant information.\n" +# }], +# "n": 1, +# "max_tokens": 500, +# "metadata": doc.metadata +# } +# requests.append(dictionary) + +# oai = OpenAIAPIProcessor( +# input_prompts_list=requests, +# request_url='https://api.openai.com/v1/chat/completions', +# api_key=os.getenv("OPENAI_API_KEY"), +# max_requests_per_minute=1500, +# max_tokens_per_minute=90000, +# token_encoding_name='cl100k_base', +# max_attempts=5, +# logging_level=20, +# ) +# # run script +# asyncio.run(oai.process_api_requests_from_file()) + +# assistant_contents = [] +# total_prompt_tokens = 0 +# total_completion_tokens = 0 + +# print("Results, end of main: ", oai.results) +# print("-"*50) + +# # jsonObject = json.loads(oai.results) +# for element in oai.results: +# for item in element: +# if 'choices' in item: +# for choice in item['choices']: +# if choice['message']['role'] == 'assistant': +# assistant_contents.append(choice['message']['content']) +# total_prompt_tokens += item['usage']['prompt_tokens'] +# total_completion_tokens += item['usage']['completion_tokens'] + +# print("Assistant Contents:", assistant_contents) +# print("Total Prompt Tokens:", total_prompt_tokens) +# print("Total Completion Tokens:", total_completion_tokens) +# turbo_total_cost = (total_prompt_tokens * 0.0015) + (total_completion_tokens * 0.002) +# print("Total cost (3.5-turbo):", (total_prompt_tokens * 0.0015), " + Completions: ", (total_completion_tokens * 0.002), " = ", turbo_total_cost) + +# gpt4_total_cost = (total_prompt_tokens * 0.03) + (total_completion_tokens * 0.06) +# print("Hypothetical cost for GPT-4:", (total_prompt_tokens * 0.03), " + Completions: ", (total_completion_tokens * 0.06), " = ", gpt4_total_cost) +# print("GPT-4 cost premium: ", (gpt4_total_cost / turbo_total_cost), "x") + ''' + Pricing: + GPT4: + * $0.03 prompt + * $0.06 completions + 3.5-turbo: + * $0.0015 prompt + * $0.002 completions + ''' +""" +APPENDIX + +The example requests file at openai-cookbook/examples/data/example_requests_to_parallel_process.jsonl contains 10,000 requests to text-embedding-ada-002. + +It was generated with the following code: + +```python +import json + +filename = "data/example_requests_to_parallel_process.jsonl" +n_requests = 10_000 +jobs = [{"model": "text-embedding-ada-002", "input": str(x) + "\n"} for x in range(n_requests)] +with open(filename, "w") as f: + for job in jobs: + json_string = json.dumps(job) + f.write(json_string + "\n") +``` + +As with all jsonl files, take care that newlines in the content are properly escaped (json.dumps does this automatically). 
+""" diff --git a/ai_ta_backend/nomic_logging.py b/ai_ta_backend/nomic_logging.py index 4c9f3677..8e5d179f 100644 --- a/ai_ta_backend/nomic_logging.py +++ b/ai_ta_backend/nomic_logging.py @@ -87,6 +87,10 @@ def log_convo_to_nomic(course_name: str, conversation) -> str: user_queries.append(first_message) for message in messages: + if message['role'] == 'user': + emoji = "🙋" + else: + emoji = "🤖" conversation_string += "\n>>> " + emoji + message['role'] + ": " + message['content'] + "\n" metadata = [{"course": course_name, "conversation": conversation_string, "conversation_id": conversation_id, diff --git a/ai_ta_backend/nomic_map_creation.ipynb b/ai_ta_backend/nomic_map_creation.ipynb index 23924157..aeecd4bd 100644 --- a/ai_ta_backend/nomic_map_creation.ipynb +++ b/ai_ta_backend/nomic_map_creation.ipynb @@ -2,18 +2,9 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-09-07 11:57:29,274:INFO - Note: NumExpr detected 16 cores but \"NUMEXPR_MAX_THREADS\" not set, so enforcing safe limit of 8.\n", - "2023-09-07 11:57:29,274:INFO - NumExpr defaulting to 8 threads.\n" - ] - } - ], + "outputs": [], "source": [ "# import required libraries\n", "\n", @@ -29,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -38,7 +29,7 @@ "True" ] }, - "execution_count": 2, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -52,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -66,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -90,324 +81,85 @@ " \n", " \n", " \n", - " id\n", - " created_at\n", - " convo\n", - " convo_id\n", " course_name\n", - " user_email\n", " \n", " \n", " \n", " \n", " 0\n", - " 5200\n", - " 2023-09-07T17:03:47.705812+00:00\n", - " {'id': 'abd2e044-fbff-455e-8c60-755cc7635182',...\n", - " abd2e044-fbff-455e-8c60-755cc7635182\n", - " cropwizard\n", - " avd6@illinois.edu\n", + " gpt4\n", " \n", " \n", " 1\n", - " 5201\n", - " 2023-09-07T17:05:25.863567+00:00\n", - " {'id': '3e5d4861-b128-4c64-96ac-87c74f3217e5',...\n", - " 3e5d4861-b128-4c64-96ac-87c74f3217e5\n", - " cropwizard\n", - " avd6@illinois.edu\n", + " gpt4\n", " \n", " \n", " 2\n", - " 5216\n", - " 2023-09-07T17:18:32.197379+00:00\n", - " {'id': '43ee631a-cb58-43f5-b2af-a5b91b7585cd',...\n", - " 43ee631a-cb58-43f5-b2af-a5b91b7585cd\n", - " cropwizard\n", - " avd6@illinois.edu\n", + " gpt4\n", " \n", " \n", " 3\n", - " 5212\n", - " 2023-09-07T17:16:34.267931+00:00\n", - " {'id': '0129ea46-207f-47e3-be90-da143857000f',...\n", - " 0129ea46-207f-47e3-be90-da143857000f\n", - " cropwizard\n", - " avd6@illinois.edu\n", + " gpt4\n", " \n", " \n", " 4\n", - " 5217\n", - " 2023-09-07T17:19:00.681823+00:00\n", - " {'id': 'c6b4e4d8-4de7-4387-b4e9-411084dffea6',...\n", - " c6b4e4d8-4de7-4387-b4e9-411084dffea6\n", - " cropwizard\n", - " avd6@illinois.edu\n", - " \n", - " \n", - " 5\n", - " 5223\n", - " 2023-09-07T17:22:38.970643+00:00\n", - " {'id': 'b5500763-7e7b-4b23-9031-cc320a51ccbf',...\n", - " b5500763-7e7b-4b23-9031-cc320a51ccbf\n", - " cropwizard\n", - " avd6@illinois.edu\n", - " \n", - " \n", - " 6\n", - " 5227\n", - " 2023-09-07T17:24:10.362647+00:00\n", - " {'id': 'd410955f-4398-4869-b395-e6b659cc2d06',...\n", - " d410955f-4398-4869-b395-e6b659cc2d06\n", - " 
cropwizard\n", - " avd6@illinois.edu\n", - " \n", - " \n", - " 7\n", - " 5209\n", - " 2023-09-07T17:14:43.518617+00:00\n", - " {'id': '0ecd2c05-772a-42aa-b29a-0a892bd0e9ab',...\n", - " 0ecd2c05-772a-42aa-b29a-0a892bd0e9ab\n", - " cropwizard\n", - " avd6@illinois.edu\n", - " \n", - " \n", - " 8\n", - " 5222\n", - " 2023-09-07T17:21:29.223343+00:00\n", - " {'id': 'c82056a0-2d67-4ce8-82e3-86a30f1f6dc0',...\n", - " c82056a0-2d67-4ce8-82e3-86a30f1f6dc0\n", - " cropwizard\n", - " avd6@illinois.edu\n", - " \n", - " \n", - " 9\n", - " 5224\n", - " 2023-09-07T17:22:54.856839+00:00\n", - " {'id': '2316bbd7-61f3-44aa-a79e-bb42bd688c47',...\n", - " 2316bbd7-61f3-44aa-a79e-bb42bd688c47\n", - " cropwizard\n", - " avd6@illinois.edu\n", - " \n", - " \n", - " 10\n", - " 5226\n", - " 2023-09-07T17:23:27.644745+00:00\n", - " {'id': '66abfe85-bb04-456e-8709-89f9aafe5508',...\n", - " 66abfe85-bb04-456e-8709-89f9aafe5508\n", - " cropwizard\n", - " avd6@illinois.edu\n", - " \n", - " \n", - " 11\n", - " 5228\n", - " 2023-09-07T17:24:41.32465+00:00\n", - " {'id': '175ad6b2-3bf2-4889-b2de-a18961ee8ecb',...\n", - " 175ad6b2-3bf2-4889-b2de-a18961ee8ecb\n", - " cropwizard\n", - " avd6@illinois.edu\n", - " \n", - " \n", - " 12\n", - " 5232\n", - " 2023-09-07T17:30:05.770146+00:00\n", - " {'id': 'f9859e36-bf76-40ab-9413-91ef6663dbd6',...\n", - " f9859e36-bf76-40ab-9413-91ef6663dbd6\n", - " cropwizard\n", - " avd6@illinois.edu\n", - " \n", - " \n", - " 13\n", - " 5233\n", - " 2023-09-07T17:30:52.749867+00:00\n", - " {'id': 'bab32d0b-8e2b-4eaa-a46e-069be0d0c3a2',...\n", - " bab32d0b-8e2b-4eaa-a46e-069be0d0c3a2\n", - " cropwizard\n", - " avd6@illinois.edu\n", - " \n", - " \n", - " 14\n", - " 5234\n", - " 2023-09-07T17:31:19.801611+00:00\n", - " {'id': 'ecaf3228-78f3-49f7-b46d-3a5c3d5b62fd',...\n", - " ecaf3228-78f3-49f7-b46d-3a5c3d5b62fd\n", - " cropwizard\n", - " avd6@illinois.edu\n", - " \n", - " \n", - " 15\n", - " 5237\n", - " 2023-09-07T17:36:14.68431+00:00\n", - " {'id': 'edead825-12df-417c-af40-059e83067c69',...\n", - " edead825-12df-417c-af40-059e83067c69\n", - " cropwizard\n", - " avd6@illinois.edu\n", + " gpt4\n", " \n", " \n", - " 16\n", - " 5238\n", - " 2023-09-07T17:36:42.984907+00:00\n", - " {'id': 'bc44d229-327a-452d-a386-8868216a1bd2',...\n", - " bc44d229-327a-452d-a386-8868216a1bd2\n", - " cropwizard\n", - " avd6@illinois.edu\n", + " ...\n", + " ...\n", " \n", " \n", - " 17\n", - " 5241\n", - " 2023-09-07T17:37:22.134543+00:00\n", - " {'id': 'ff7a1c27-e126-49db-be79-6deaefcffec3',...\n", - " ff7a1c27-e126-49db-be79-6deaefcffec3\n", - " cropwizard\n", - " avd6@illinois.edu\n", + " 1789\n", + " FIN574-GT\n", " \n", " \n", - " 18\n", - " 5304\n", - " 2023-09-07T19:45:21.73541+00:00\n", - " {'id': '6226b153-356a-408c-9483-49ef5808538c',...\n", - " 6226b153-356a-408c-9483-49ef5808538c\n", - " cropwizard\n", - " avd6@illinois.edu\n", + " 1790\n", + " NCSA\n", " \n", " \n", - " 19\n", - " 5305\n", - " 2023-09-07T19:46:03.626639+00:00\n", - " {'id': 'e9edae6b-b7e1-46a8-b5e8-6215890a2a01',...\n", - " e9edae6b-b7e1-46a8-b5e8-6215890a2a01\n", - " cropwizard\n", - " avd6@illinois.edu\n", + " 1791\n", + " gpt4\n", " \n", " \n", - " 20\n", - " 5306\n", - " 2023-09-07T19:46:36.076704+00:00\n", - " {'id': 'b2116035-da7b-4136-878d-66a10098a756',...\n", - " b2116035-da7b-4136-878d-66a10098a756\n", - " cropwizard\n", - " avd6@illinois.edu\n", + " 1792\n", + " NCSA\n", " \n", " \n", - " 21\n", - " 5195\n", - " 2023-09-06T23:43:38.201481+00:00\n", - " {'id': '543ee10e-faf0-47a8-bb1c-c040aec44ed1',...\n", - " 
543ee10e-faf0-47a8-bb1c-c040aec44ed1\n", - " cropwizard\n", - " dabholkar.asmita@gmail.com\n", + " 1793\n", + " NCSA\n", " \n", " \n", "\n", + "
1794 rows × 1 columns
\n", "" ], "text/plain": [ - " id created_at \\\n", - "0 5200 2023-09-07T17:03:47.705812+00:00 \n", - "1 5201 2023-09-07T17:05:25.863567+00:00 \n", - "2 5216 2023-09-07T17:18:32.197379+00:00 \n", - "3 5212 2023-09-07T17:16:34.267931+00:00 \n", - "4 5217 2023-09-07T17:19:00.681823+00:00 \n", - "5 5223 2023-09-07T17:22:38.970643+00:00 \n", - "6 5227 2023-09-07T17:24:10.362647+00:00 \n", - "7 5209 2023-09-07T17:14:43.518617+00:00 \n", - "8 5222 2023-09-07T17:21:29.223343+00:00 \n", - "9 5224 2023-09-07T17:22:54.856839+00:00 \n", - "10 5226 2023-09-07T17:23:27.644745+00:00 \n", - "11 5228 2023-09-07T17:24:41.32465+00:00 \n", - "12 5232 2023-09-07T17:30:05.770146+00:00 \n", - "13 5233 2023-09-07T17:30:52.749867+00:00 \n", - "14 5234 2023-09-07T17:31:19.801611+00:00 \n", - "15 5237 2023-09-07T17:36:14.68431+00:00 \n", - "16 5238 2023-09-07T17:36:42.984907+00:00 \n", - "17 5241 2023-09-07T17:37:22.134543+00:00 \n", - "18 5304 2023-09-07T19:45:21.73541+00:00 \n", - "19 5305 2023-09-07T19:46:03.626639+00:00 \n", - "20 5306 2023-09-07T19:46:36.076704+00:00 \n", - "21 5195 2023-09-06T23:43:38.201481+00:00 \n", - "\n", - " convo \\\n", - "0 {'id': 'abd2e044-fbff-455e-8c60-755cc7635182',... \n", - "1 {'id': '3e5d4861-b128-4c64-96ac-87c74f3217e5',... \n", - "2 {'id': '43ee631a-cb58-43f5-b2af-a5b91b7585cd',... \n", - "3 {'id': '0129ea46-207f-47e3-be90-da143857000f',... \n", - "4 {'id': 'c6b4e4d8-4de7-4387-b4e9-411084dffea6',... \n", - "5 {'id': 'b5500763-7e7b-4b23-9031-cc320a51ccbf',... \n", - "6 {'id': 'd410955f-4398-4869-b395-e6b659cc2d06',... \n", - "7 {'id': '0ecd2c05-772a-42aa-b29a-0a892bd0e9ab',... \n", - "8 {'id': 'c82056a0-2d67-4ce8-82e3-86a30f1f6dc0',... \n", - "9 {'id': '2316bbd7-61f3-44aa-a79e-bb42bd688c47',... \n", - "10 {'id': '66abfe85-bb04-456e-8709-89f9aafe5508',... \n", - "11 {'id': '175ad6b2-3bf2-4889-b2de-a18961ee8ecb',... \n", - "12 {'id': 'f9859e36-bf76-40ab-9413-91ef6663dbd6',... \n", - "13 {'id': 'bab32d0b-8e2b-4eaa-a46e-069be0d0c3a2',... \n", - "14 {'id': 'ecaf3228-78f3-49f7-b46d-3a5c3d5b62fd',... \n", - "15 {'id': 'edead825-12df-417c-af40-059e83067c69',... \n", - "16 {'id': 'bc44d229-327a-452d-a386-8868216a1bd2',... \n", - "17 {'id': 'ff7a1c27-e126-49db-be79-6deaefcffec3',... \n", - "18 {'id': '6226b153-356a-408c-9483-49ef5808538c',... \n", - "19 {'id': 'e9edae6b-b7e1-46a8-b5e8-6215890a2a01',... \n", - "20 {'id': 'b2116035-da7b-4136-878d-66a10098a756',... \n", - "21 {'id': '543ee10e-faf0-47a8-bb1c-c040aec44ed1',... \n", + " course_name\n", + "0 gpt4\n", + "1 gpt4\n", + "2 gpt4\n", + "3 gpt4\n", + "4 gpt4\n", + "... 
...\n", + "1789 FIN574-GT\n", + "1790 NCSA\n", + "1791 gpt4\n", + "1792 NCSA\n", + "1793 NCSA\n", "\n", - " convo_id course_name \\\n", - "0 abd2e044-fbff-455e-8c60-755cc7635182 cropwizard \n", - "1 3e5d4861-b128-4c64-96ac-87c74f3217e5 cropwizard \n", - "2 43ee631a-cb58-43f5-b2af-a5b91b7585cd cropwizard \n", - "3 0129ea46-207f-47e3-be90-da143857000f cropwizard \n", - "4 c6b4e4d8-4de7-4387-b4e9-411084dffea6 cropwizard \n", - "5 b5500763-7e7b-4b23-9031-cc320a51ccbf cropwizard \n", - "6 d410955f-4398-4869-b395-e6b659cc2d06 cropwizard \n", - "7 0ecd2c05-772a-42aa-b29a-0a892bd0e9ab cropwizard \n", - "8 c82056a0-2d67-4ce8-82e3-86a30f1f6dc0 cropwizard \n", - "9 2316bbd7-61f3-44aa-a79e-bb42bd688c47 cropwizard \n", - "10 66abfe85-bb04-456e-8709-89f9aafe5508 cropwizard \n", - "11 175ad6b2-3bf2-4889-b2de-a18961ee8ecb cropwizard \n", - "12 f9859e36-bf76-40ab-9413-91ef6663dbd6 cropwizard \n", - "13 bab32d0b-8e2b-4eaa-a46e-069be0d0c3a2 cropwizard \n", - "14 ecaf3228-78f3-49f7-b46d-3a5c3d5b62fd cropwizard \n", - "15 edead825-12df-417c-af40-059e83067c69 cropwizard \n", - "16 bc44d229-327a-452d-a386-8868216a1bd2 cropwizard \n", - "17 ff7a1c27-e126-49db-be79-6deaefcffec3 cropwizard \n", - "18 6226b153-356a-408c-9483-49ef5808538c cropwizard \n", - "19 e9edae6b-b7e1-46a8-b5e8-6215890a2a01 cropwizard \n", - "20 b2116035-da7b-4136-878d-66a10098a756 cropwizard \n", - "21 543ee10e-faf0-47a8-bb1c-c040aec44ed1 cropwizard \n", - "\n", - " user_email \n", - "0 avd6@illinois.edu \n", - "1 avd6@illinois.edu \n", - "2 avd6@illinois.edu \n", - "3 avd6@illinois.edu \n", - "4 avd6@illinois.edu \n", - "5 avd6@illinois.edu \n", - "6 avd6@illinois.edu \n", - "7 avd6@illinois.edu \n", - "8 avd6@illinois.edu \n", - "9 avd6@illinois.edu \n", - "10 avd6@illinois.edu \n", - "11 avd6@illinois.edu \n", - "12 avd6@illinois.edu \n", - "13 avd6@illinois.edu \n", - "14 avd6@illinois.edu \n", - "15 avd6@illinois.edu \n", - "16 avd6@illinois.edu \n", - "17 avd6@illinois.edu \n", - "18 avd6@illinois.edu \n", - "19 avd6@illinois.edu \n", - "20 avd6@illinois.edu \n", - "21 dabholkar.asmita@gmail.com " + "[1794 rows x 1 columns]" ] }, - "execution_count": 22, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# query data for one course for testing\n", - "course = 'cropwizard'\n", - "response = supabase_client.table(\"llm-convo-monitor\").select(\"*\").eq(\"course_name\", course).execute()\n", + "course = 'ece120'\n", + "response = supabase_client.table(\"llm-convo-monitor\").select(\"course_name\", count='exact').execute()\n", "data = response.data\n", "df = pd.DataFrame(data)\n", "df" @@ -415,45 +167,11 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 25, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 {'id': 'abd2e044-fbff-455e-8c60-755cc7635182',...\n", - "1 {'id': '3e5d4861-b128-4c64-96ac-87c74f3217e5',...\n", - "2 {'id': '43ee631a-cb58-43f5-b2af-a5b91b7585cd',...\n", - "3 {'id': '0129ea46-207f-47e3-be90-da143857000f',...\n", - "4 {'id': 'c6b4e4d8-4de7-4387-b4e9-411084dffea6',...\n", - "Name: convo, dtype: object" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "course_df = df[df['course_name'] == course]['convo']\n", - "course_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{'role': 'assistant', 'content': \"The U.S. 
Environmental Protection Agency (EPA)'s mission is to protect human health and the environment. As part of this mission, the EPA is involved in efforts such as developing strategies to protect endangered and threatened species from potential impacts of agricultural practices, including the use of herbicides. For instance, the EPA has released a draft Herbicide Strategy for public comment, aimed at proposing early mitigations for more than 900 listed species and designated critical habitats to reduce potential impacts from the agricultural use of herbicides(1^,2^,3^,4^).\\n\\n1. University of Illinois Extension\\n2. EPA releases draft herbicide strategy\\n3. EPA releases draft herbicide strategy\\n4. extension.pdf, page: 3\", 'contexts': [{'base_url': 'https://extension.illinois.edu/newsletters/illinois-pesticide-review-newsletter/julyaugust-2023', 'course_name ': 'cropwizard', 'pagenumber': '', 'readable_filename': 'University of Illinois Extension', 's3_path': 'courses/cropwizard/University_of_Illinois_Extension.html', 'text': \". — The U.S. Environmental Protection Agency, in collaboration with the U.S. Department of Energy, is funding the creation of Thriving Communities Technical Assistance Centers to help rural and underserved communities access state and federal funding for environmental infrastructure…\\n\\n\\n \\nAddress health challenges with holistic solutions\\nSeptember 1, 2023\\n\\nURBANA, Ill. — The University of Illinois, along with the Interdisciplinary Health Sciences Institute, and in collaboration with Illinois Extension, has developed the Autumn Health Picks 2023 webinar series. This series is part of the Community Seminar Series, and it provides an opportunity for…\\n\\n\\n \\nDo artificial roosts help bats? Illinois experts say more research needed\\nSeptember 1, 2023\\n\\nURBANA, Ill.\\xa0— Artificial roosts for bats come in many forms — bat boxes, condos, bark mimics, clay roosts, and cinder block structures, to name a few — but a new conservation practice and policy article from researchers at the\\xa0University of Illinois Urbana-Champaign\\xa0suggests the structures…\\n\\n\\nMore news\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\nUpcoming Events\\n\\n\\n\\n\\n \\n\\nRead Before You Sign: Renting & Leasing \\n\\n\\nSeptember 6, 2023\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\nTechnology & Soil Health Field Day\\n\\n\\nSeptember 6, 2023\\n\\n\\nCounty\\n\\nHenry\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\nPollinator Plants to Span the Season\\n\\n\\nSeptember 6, 2023\\n\\n\\nCounty\\n\\nMacoupin\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\nOrr Beef Research Center Field Day\\n\\n\\nSeptember 6, 2023\\n\\n\\n\\n\\n\\n\\nMore Events\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nSubtitle\\nOpens the Door to New Opportunities\\n\\n\\n\\n\\nTitle\\nLearn @ Illinois Extension\\n\\n\\n\\n\\nBody\\nThere is so much you want to do, but the challenges of time, distance, and cost create barriers to achieving those goals. You need a program that's flexible to your schedule.\\xa0Learn @ Illinois Extension\\xa0helps remove those challenge by offering\\xa0flexible online learning programs that meet your personal interests and continuing education requirements. 
We provide learning on your terms so you can be who you were meant to be.\\xa0\\n\\n\\n\\nOnline Courses\\n\\n\\n\\n\\n\\n \\n\\n\\nLatest Podcast\\n\\n\\n\\n\\nGood Growing\\n\\n\\nGardenbite: Three tips for a healthier lawn | #GoodGrowingThis week on the Good Growing podcast Chris shares a Gardenbite of when retired horticulture educator Richard Hentschel visited the show in 2021 to talk about fall lawn care. During the show, Richard spoke about three things we could all do to reduce our lawn inputs.\\xa0Want to see or...\\n\\n\\n Your browser does not support iframes, but you can visit \\n\\n\\n\\n\\n\\nMore Podcasts\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\nDig Deeper\\n\\n\\nTopics we address\\n\\n4-H Youth Development\\nBeef Cattle\\nClimate\\nCommunity Gardens\\nCommunity Planning\\nCottage Food\\nCrops\\nDisasters\\nEconomic Development\\nEnergy\\nEnvironmental Quality\\nFamily\\nFinances\\nFirewood\\nFlowers\\nFood\\nForestry\\nHealth\\nHemp\\nHerbs\\nHome Vegetable Gardening\\nIllinois Grasses\\nInsects\\nInvasives\\nLivestock\\nLocal Food Systems and Small Farms\\nLocal Government Education\\nMental Health\\nMushrooms\\nNatural Resources\\nPlant Problems\\nPlants\\nRainfall Management\\nSoil\\nSpecialty Crops\\nVaccines\\nWeather\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nShare with us\\n\\n\\n \\n\\nBody\\n\\n\\n\\n\\xa0\\n\\n\\xa0\\n\\xa0\\n\\n\\n\\xa0\\n\\n\\nView this profile on Instagram\\n\\n\\xa0\\n\\n\\n\\xa0\\n\\xa0\\n\\xa0\\n\\n\\n\\xa0\\n\\xa0\\n\\n\\n\\xa0\\n\\xa0\\n\\xa0\\n\\n\\n\\n\\xa0\\n\\xa0\\n\\nIllinois Extension (@ilextension) • Instagram photos and videos\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\nBody\\n\\xa0\\n\\nUniversity of Illinois Extension\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\nVolunteer with Extension\\nLearn Something New\\nRead a Blog\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nIllinois Extension\\n\\n 101 Mumford Hall (MC-710)\\n \\n1301 W\", 'url': 'https://extension.illinois.edu/'}, {'base_url': 'https://extension.illinois.edu/newsletters/illinois-pesticide-review-newsletter/julyaugust-2023', 'course_name ': 'cropwizard', 'pagenumber': '', 'readable_filename': 'EPA releases draft herbicide strategy', 's3_path': 'courses/cropwizard/EPA_releases_draft_herbicide_strategy.html', 'text': '. The draft framework describes how EPA would apply the mitigations in the Strategy compared to mitigations in the other initiatives.\\nWhat can you do? Submit comments! Learn more!\\nThe draft herbicide framework and accompanying documents are available in docket EPA-HQ-OPP-2023-0365 for public comment for 60 days.\\xa0Comments are due September 22, 2023. Agricultural pesticide users are encouraged to learn about EPA’s plan and to start thinking about how these mitigation measures could apply to herbicide use in their operation. While extensive recordkeeping is not currently required for the mitigation factors described in the strategy, it is highly recommended that users begin thinking about how to incorporate these new elements into their current record systems. If you are applying according to label directions, proper records can only assist your defense should you need it. To help guide you, watch for shared comments from professional organizations such as the Weed Science Society of America (WSSA). In April, a WSSA press release linked their comments to EPA and encouraged growers to act now to understand the impact of ESA’s new compliance initiatives. One good suggestion they offered to growers is to learn how to use EPA’s Bulletins Live! 
Two which is where important application instructions will be found.\\nEPA’s Office of Pesticide Programs will present a webinar on this draft herbicide Strategy on August 10th at Noon Central Time. EPA plans to walk through the framework and take questions from grower groups and other stakeholders. Register today. Questions may be submitted in advance of the webinar by emailing sm.opmp.pesticides@usda.gov.\\nTo learn more about EPA’s comprehensive ESA workplan Check out our article, “Change Coming to How EPA Protects Endangered Species from Pesticides – Feedback Needed” in the November/December 2022 issue of this newsletter. Proposed mitigation measures are discussed in more general terms in this comprehensive workplan. Please note that the comment period discussed there has ended.\\nVisit EPA’s website to learn more about how EPA’s pesticide program is protecting endangered species.\\nAdapted slightly from an EPA press release, “EPA Releases Draft Strategy to Better Protect Endangered Species from Herbicide Use” and related EPA documents. \\nABOUT THE AUTHOR: Michelle Wiesbrook\\xa0provides subject matter expertise and training in pesticide safety with an emphasis on horticultural weed science. She serves as the Illinois Pesticide Review newsletter editor, collecting and organizing material; and co-coordinates social media information for the PSEP program and ensures its timely publication.\\n\\nPesticide News\\n\\n\\n\\n\\nKeywords\\n\\nPesticide\\nHerbicide\\nInsecticide\\nFungicide\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nRelated Content\\n\\nUnintended herbicide injury on trees: A growing concernAugust 28, 2023\\n\\nTips to help employees succeedAugust 2, 2023\\n\\nParaquat certification valid 3 years: Are you due for training?August 2, 2023\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nIllinois Extension\\n\\n 101 Mumford Hall (MC-710)\\n \\n1301 W. Gregory Dr.\\n Urbana,\\n IL\\n 61801\\n \\nEmail: extension@illinois.edu\\n\\n\\n\\nInstagram\\nFacebook\\nTwitter\\nYouTube\\nLinkedIn\\n\\n\\n\\nCollege of Agricultural, Consumer & Environmental Sciences\\n\\n\\n\\n\\n\\nDig Deeper\\n\\n\\nTake an Online Course\\n\\n\\nRead a Blog\\n\\n\\nRead a Newsletter\\n\\n\\nListen to a Podcast\\n\\n\\nWatch a Video\\n\\n\\nBuy a Publication\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nExtension Network\\n\\n\\nEat.Move.Save.\\n\\n\\nIllinois 4-H\\n\\n\\nIllini Science Policy Program\\n\\n\\nIllinois Indiana Sea Grant\\n\\n\\nIllinois Master Gardeners\\n\\n\\nIllinois Master Naturalists\\n\\n\\nIllinois Nutrition Education Programs\\n\\n\\nPesticide Safety Education Program\\n\\n\\nResearch Centers\\n\\n\\nSafe Electricity\\n\\n\\nU of I Plant Clinic\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nAdditional links\\n\\nAbout Cookies\\nPrivacy Policy\\n© 2023 University of Illinois Board of Trustees\\nEEO\\nAccessibility\\nmyExtension\\nLogin', 'url': 'https://extension.illinois.edu/blogs/pesticide-news/2023-08-02-epa-releases-draft-herbicide-strategy-public-comment-period-open'}, {'base_url': 'https://extension.illinois.edu/newsletters/illinois-pesticide-review-newsletter/julyaugust-2023', 'course_name ': 'cropwizard', 'pagenumber': '', 'readable_filename': 'EPA releases draft herbicide strategy', 's3_path': 'courses/cropwizard/EPA_releases_draft_herbicide_strategy.html', 'text': \". 
The Strategy — which is primarily designed to provide early mitigations that minimize impacts to over 900 listed species — is one of EPA’s most significant proposals to help overcome these challenges.\\nEPA focused the Strategy on agricultural crop uses in the lower 48 states because hundreds of millions of pounds of herbicides (and plant growth regulators) are applied each year, which is substantially more than for non-agricultural uses of herbicides and for other pesticide classes (e.g., insecticides, fungicides). Additionally, hundreds of listed species in the lower 48 states live in habitats adjacent to agricultural areas. The proposed mitigations in the Strategy would address the most common ways that conventional agricultural herbicides might impact these listed species. More specifically, EPA developed potential mitigation options for conventional agricultural herbicides to reduce pesticide transport via spray drift and runoff/erosion that could result in exposure to listed plants and listed animals that depend on plants.\\nEPA expects that the Strategy will increase the efficiency of future ESA consultations on herbicides with the U.S. Fish and Wildlife Service (FWS), which has authority over most listed species that could benefit from the proposed mitigations. Under the Strategy, EPA proposes to identify and begin mitigating for potential impacts even before EPA completes ESA consultations. These early mitigations should expedite EPA’s ability to fully comply with the ESA by reducing impacts to listed species before EPA conducts most of its ESA analysis. Adopting mitigations earlier will also allow EPA and FWS to use their resources more efficiently in ESA consultations.\\nThe Strategy’s proposed mitigations to reduce spray drift, runoff, and erosion and thereby reduce the potential exposure reflect practices that can be readily implemented by growers and identified by pesticide applicators and that provide flexibility for growers to select the mitigations that work best for them. The Strategy also gives credit to landowners who are already implementing certain measures to reduce pesticide runoff. For example, existing vegetated ditches and water retention ponds will qualify for credits that reduce the need for additional mitigation. Similarly, the Strategy would require less mitigation on flat lands, which are less prone to runoff, and in many western states, which typically experience less rain to carry pesticides off fields. The Strategy also describes how the Agency could add other mitigation practices to the menu of mitigation options in the future, particularly to incorporate emerging technology or new information on the effectiveness of specific practices.\\nDraft Herbicide Framework Document\\nThe draft framework document titled, “Draft Herbicide Strategy Framework to Reduce Exposure of Federally Listed Endangered and Threatened Species and Designated Critical Habitats from the Use of Conventional Agricultural Herbicides” is 97 pages long and includes a discussion of both the proposed scope of the Herbicide Strategy and the proposed decision framework to determine the level of mitigation needed for a particular conventional agricultural herbicide. The draft framework document also includes examples of how the proposed herbicide mitigation would apply to some of the herbicides for which EPA has conducted case studies as well as EPA's proposed implementation plan.\\nSome of the accompanying documents are quite lengthy. 
The “Herbicide Strategy Case Study Summary and Process” is 666 pages!\\xa0 Coincidence on the number? I’m not sure. I haven’t made it through it all yet. The primary thing I gathered from perusing through the spreadsheet files was that managing these complexities must be a nightmare. The document, “Application of EPA’s Draft Herbicide Strategy Framework Through Scenarios that Represent Crop Production Systems” is only 17 pages long and includes possible scenarios. Examples 1 and 2 would be particularly fitting for Illinois corn and soybean producers. These are shared to help producers better understand how these mitigation practices may be used.\\nIn its ESA Workplan and ESA Workplan Update, EPA outlined this and other ESA initiatives to develop early mitigations that provide listed species with practical protections from pesticides. The Strategy complements those other initiatives, such as targeted mitigations for listed species particularly vulnerable to pesticides and Interim Ecological Mitigations that EPA has begun incorporating under the Federal Insecticide, Fungicide, and Rodenticide Act. The draft framework describes how EPA would apply the mitigations in the Strategy compared to mitigations in the other initiatives.\\nWhat can you do? Submit comments! Learn more!\\nThe draft herbicide framework and accompanying documents are available in docket EPA-HQ-OPP-2023-0365 for public comment for 60 days.\\xa0Comments are due September 22, 2023. Agricultural pesticide users are encouraged to learn about EPA’s plan and to start thinking about how these mitigation measures could apply to herbicide use in their operation. While extensive recordkeeping is not currently required for the mitigation factors described in the strategy, it is highly recommended that users begin thinking about how to incorporate these new elements into their current record systems\", 'url': 'https://extension.illinois.edu/blogs/pesticide-news/2023-08-02-epa-releases-draft-herbicide-strategy-public-comment-period-open'}, {'base_url': 'https://extension.illinois.edu/newsletters/illinois-pesticide-review-newsletter/julyaugust-2023', 'course_name ': 'cropwizard', 'pagenumber': '', 'readable_filename': 'EPA releases draft herbicide strategy', 's3_path': 'courses/cropwizard/EPA_releases_draft_herbicide_strategy.html', 'text': 'EPA releases draft herbicide strategy; public comment period open | Illinois Extension | UIUC\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n Skip to main content\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\nCollege of Agricultural, Consumer & Environmental Sciences\\n\\nIllinois Extension\\n\\n\\n\\n\\n\\nGive\\nVolunteer\\nCareers\\n\\n\\n\\n\\nTopics\\n\\n\\nAll Topics\\n\\n\\nCottage Food\\n\\n\\nFood\\n\\n\\nForestry\\n\\n\\nLocal Government Education\\n\\n\\nPlants\\n\\n\\nRainfall Management\\n\\n\\nSoil\\n\\n\\nVaccines\\n\\n\\nVegetable Gardening\\n\\n\\n\\n\\nLearn\\n\\n\\nOnline Courses\\n\\n\\nBlogs\\n\\n\\nNewsletters\\n\\n\\nPodcasts\\n\\n\\nVideos\\n\\n\\nPublications\\n\\n\\nSummer Resources\\n\\n\\n\\n\\nEvents\\n\\n\\nStatewide Webinars\\n\\n\\n\\n\\nNews\\n\\n\\nConnect\\n\\n\\nContact Staff\\n\\n\\nFind an Office\\n\\n\\nSocial Media\\n\\n\\nAdministration and Educator Teams\\n\\n\\nCommunications and Information Technology\\n\\n\\nIllini Science Policy Program\\n\\n\\nIllinois Indiana Sea Grant\\n\\n\\nMaster Gardeners\\n\\n\\nMaster Naturalists\\n\\n\\nPlant Clinic\\n\\n\\nResearch and Education Centers\\n\\n\\nSea Grant\\n\\n\\nEnergy Education Council\\n\\n\\nHome and Community Education\\n\\n\\nPlanning, 
Reporting, and Evaluation\\n\\n\\n\\n\\nImpact\\n\\n\\n2024 Extension Collaboration Grants\\n\\n\\nEconomic and Functional Impact\\n\\n\\nOur Impact in Agriculture and AgriBusiness\\n\\n\\nSNAP-Education Impact\\n\\n\\nExtension Funded Research Projects\\n\\n\\nOur Impact in Agriculture and Natural Resources\\n\\n\\nOur Impact in Community & Economic Development\\n\\n\\nOur Impact in Family and Consumer Sciences\\n\\n\\nOur Impact in Integrated Health Disparities\\n\\n\\n\\n\\nAbout\\n\\n\\nStrategic Planning\\n\\n\\nExtension Councils\\n\\n\\nCareers\\n\\n\\nProfessional Associations\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nBreadcrumb\\n\\n\\nHome\\n\\n\\nBlogs\\n\\n\\nPesticide News\\n\\n\\n EPA releases draft herbicide strategy; public comment period open \\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\nPesticide News\\n\\n\\nEPA releases draft herbicide strategy; public comment period open \\n\\n\\n\\n\\n\\n\\nAugust 2, 2023\\n\\n\\n\\nMichelle Wiesbrook\\n\\n\\n \\n\\nStrategy aims to increase efficiencies while supporting farmers, herbicide users with continued use of important pesticide tools. \\xa0\\nThe U.S. Environmental Protection Agency (EPA) released the draft Herbicide Strategy for public comment, a major milestone in the Agency’s work to protect federally endangered and threatened (listed) species from conventional agricultural herbicides. The Strategy describes proposed early mitigations for more than 900 listed species and designated critical habitats to reduce potential impacts from the agricultural use of these herbicides while helping to ensure the continued availability of these important pesticide tools.\\n“Ensuring safe use of herbicides is an important part of EPA’s mission to protect the environment,” said Deputy Assistant Administrator for Pesticide Programs for the Office of Chemical Safety and Pollution Prevention Jake Li. “This strategy reflects one of our biggest steps to support farmers and other herbicide users with tools for managing weeds, while accelerating EPA’s ability to protect many endangered species that live near agricultural areas.”\\nThe Strategy is part of EPA’s ongoing efforts to develop a multichemical, multispecies approach toward meeting its obligations under the Endangered Species Act (ESA). EPA’s traditional chemical-by-chemical, species-by-species approach to meeting these obligations is slow and costly.\\xa0 As a result, EPA has completed its ESA obligations for less than 5% of its actions, creating legal vulnerabilities for the Agency, increased litigation, and uncertainty for farmers and other pesticide users about their continued ability to use many pesticides. The Strategy — which is primarily designed to provide early mitigations that minimize impacts to over 900 listed species — is one of EPA’s most significant proposals to help overcome these challenges.\\nEPA focused the Strategy on agricultural crop uses in the lower 48 states because hundreds of millions of pounds of herbicides (and plant growth regulators) are applied each year, which is substantially more than for non-agricultural uses of herbicides and for other pesticide classes (e.g., insecticides, fungicides). Additionally, hundreds of listed species in the lower 48 states live in habitats adjacent to agricultural areas. 
The proposed mitigations in the Strategy would address the most common ways that conventional agricultural herbicides might impact these listed species', 'url': 'https://extension.illinois.edu/blogs/pesticide-news/2023-08-02-epa-releases-draft-herbicide-strategy-public-comment-period-open'}, {'base_url': 'https://extension.illinois.edu/newsletters/illinois-pesticide-review-newsletter/julyaugust-2023', 'course_name ': 'cropwizard', 'pagenumber': '', 'readable_filename': 'News', 's3_path': 'courses/cropwizard/News.html', 'text': \". — The U.S. Environmental Protection Agency, in collaboration with the U.S. Department of Energy, is funding the creation of Thriving Communities Technical Assistance Centers to help rural and underserved communities access state and federal funding for environmental infrastructure…\\n\\n\\n \\nAddress health challenges with holistic solutions\\nSeptember 1, 2023\\n\\nURBANA, Ill. — The University of Illinois, along with the Interdisciplinary Health Sciences Institute, and in collaboration with Illinois Extension, has developed the Autumn Health Picks 2023 webinar series. This series is part of the Community Seminar Series, and it provides an opportunity for…\\n\\n\\n \\nDo artificial roosts help bats? Illinois experts say more research needed\\nSeptember 1, 2023\\n\\nURBANA, Ill.\\xa0— Artificial roosts for bats come in many forms — bat boxes, condos, bark mimics, clay roosts, and cinder block structures, to name a few — but a new conservation practice and policy article from researchers at the\\xa0University of Illinois Urbana-Champaign\\xa0suggests the structures…\\n\\n\\nMore news\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\nBlock Reference\\n\\nNews\\n\\n\\n\\nBy topic\\n- Any -4-H4-H alumni4-H FoundationAccessibilityAdministrationAgingAgricultural safetyAgricultureAnimal scienceAnnie's ProjectAnnualsAutismBeefBeef CattleBeekeepingBeesBeneficial InsectsBirdsBrain healthBulbsBullyingBusiness developmentBusiness retention and expansionButterflyCampingCaregivingCensusCertified Crop AdvisorCertified Livestock ManagerChild nutritionChronic diseasesCitizen ScienceCivic engagementClimateCollege ReadinessCommercial agricultureCommercial Vegetable ProductionCommunicationCommunity developmentCommunity gardenCommunity healthCommunity planningCommunity resiliencyCompostingConservationConsumer economicsCornCover cropsCreditCrop diseaseCropsDairy CattleDebt managementDementia Alzheimer’s diseaseDiabetesDicambaDisaster preparednessDiversity Equity InclusionDowntown developmentDrainageDronesEarly childhoodEconomic developmentEDEN Ready BusinessEFNEPElder careEmergency foodEnergyEnergy conservationEnergy efficiencyEntomologyEntrepreneurshipEnvironmentEstate planningExpensesFacultyFamily lifeFarm business managementFarm safetyFarmers marketsFinancial ExploitationFinancial planningFinancial wellnessFlowersFood accessFood PreservationFood safetyFood sanitationForestryFruitsFungicideGardeningGrassesHayHealthHealth CareHealthy cookingHealthy eatingHempHerbicideHerbsHolidaysHome OwnershipHorticultureHouseplantsIdentity TheftInclusionINEPInformation TechnologyInsect PestsInsecticideInsects and pestsInsuranceIntegrated Health DisparitiesIntegrated pest managementInvasive speciesInvestingLandscape architectureLandscape designLawn careLeadershipLeadership developmentLife skillsLivestockLocal foods and small farmsLocal governmentManaging stressManure managementMarketingMaster GardenersMaster NaturalistMeeting ManagementMental healthMindfulnessMoney MentorsMyPINative plantsNavigating 
differenceNutritionNutrition educationObesity preventionOrnamentalOutdoor SkillsParentingPasturePerennialsPesticidePesticide LabelPhysical ActivityPlant ClinicPlant diseasePlant health carePollinator HabitatPondsPoultryPoverty simulationPrivate/Commercial ApplicatorProfessional Development CEU CPDUPSEP trainingReal ColorsRecyclingRelationshipsResilienceRoboticsRosesSafetyShooting sportsShrubsSmall farmsSmart MeterSNAP-EdSocial-emotional healthSoilSoybeansSpecialty CropsSpendingState 4-H OfficeSTEMSubstance UseSustainable agricultureSwineTaxesTeam buildingTeenagersTime managementTrauma informed Adverse Childhood ExperiencesTree fruitsTreesTurfUrban AgricultureUrban gardeningVegetable gardeningVegetablesVolunteersWaterWeatherWeedsWellnessWheatWhole grainsWildlifeWorkforce developmentWorkplace wellnessYouth and MoneyYouth development\\n\\n\\nSearch\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\nSeptember 6, 2023\\n\\nIllinois Extension selected to establish environmental assistance center to help Illinois communities\\n\\n \\n URBANA, Ill. — The U.S. Environmental Protection Agency, in collaboration with the U.S. Department of Energy, is funding the creation of Thriving Communities Technical Assistance Centers to help rural and underserved communities access state and federal funding for environmental...\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\nSeptember 1, 2023\\n\\nAddress health challenges with holistic solutions\\n\\n \\n URBANA, Ill\", 'url': 'https://extension.illinois.edu/global/news-releases'}, {'base_url': 'https://extension.illinois.edu/newsletters/illinois-pesticide-review-newsletter/julyaugust-2023', 'course_name ': 'cropwizard', 'pagenumber': 4, 'readable_filename': 'extension.pdf', 's3_path': 'courses/cropwizard/extension.pdf', 'text': \"Illinois Pesticide Review \\nVolume 36, No. 4, July/August \\n4\\nproposed mitigations. Under the Strategy, EPA \\nproposes to identify and begin mitigating for \\npotential impacts even before EPA completes \\nESA consultations. These early mitigations \\nshould expedite EPAs ability to fully comply \\nwith the ESA by reducing impacts to listed \\nspecies before EPA conducts most of its ESA \\nanalysis. Adopting mitigations earlier will \\nalso allow EPA and FWS to use their resources \\nmore efficiently in ESA consultations.\\nThe Strategys proposed mitigations to reduce \\nspray drift, runoff, and erosion and thereby \\nreduce the potential exposure reflect practices \\nthat can be readily implemented by growers \\nand identified by pesticide applicators and \\nthat provide flexibility for growers to select \\nthe mitigations that work best for them. The \\nStrategy also gives credit to landowners who \\nare already implementing certain measures to \\nreduce pesticide runoff. For example, existing \\nvegetated ditches and water retention ponds \\nwill qualify for credits that reduce the need for \\nadditional mitigation. Similarly, the Strategy \\nwould require less mitigation on flat lands, \\nwhich are less prone to runoff, and in many \\nwestern states, which typically experience \\nless rain to carry pesticides off fields. 
The \\nStrategy also describes how the Agency could \\nadd other mitigation practices to the menu of \\nmitigation options in the future, particularly \\nto incorporate emerging technology or new \\ninformation on the effectiveness of specific \\npractices.\\nDraft Herbicide Framework \\nDocument\\nThe draft framework document titled, Draft \\nHerbicide Strategy Framework to Reduce \\nExposure of Federally Listed Endangered \\nand Threatened Species and Designated \\nCritical Habitats from the Use of Conventional \\nAgricultural Herbicides is 97 pages long and \\nincludes a discussion of both the proposed \\nscope of the Herbicide Strategy and the \\nproposed decision framework to determine \\nthe level of mitigation needed for a particular \\nconventional agricultural herbicide. The draft \\nframework document also includes examples \\nof how the proposed herbicide mitigation \\nwould apply to some of the herbicides for \\nwhich EPA has conducted case studies as well \\nas EPA's proposed implementation plan.\\nSome of the accompanying documents are \\nquite lengthy. The Herbicide Strategy Case \\nStudy Summary and Process is 666 pages! \\nCoincidence on the number? Im not sure. I \\nhavent made it through it all yet. The primary \\nthing I gathered from perusing through \\nthe spreadsheet files was that managing \\nthese complexities must be a nightmare. \\nThe document, Application of EPAs Draft \\nHerbicide Strategy Framework Through \\nScenarios that Represent Crop Production \\nSystems is only 17 pages long and includes \\npossible scenarios. Examples 1 and 2 would \\nbe particularly fitting for Illinois corn and \\nsoybean producers. These are shared to \\nhelp producers better understand how these \\nmitigation practices may be used. \\nIn its ESA Workplan and ESA Workplan \\nUpdate, EPA outlined this and other ESA \\ninitiatives to develop early mitigations \\nthat provide listed species with practical \\nprotections from pesticides. The Strategy \\ncomplements those other initiatives, such \\nas targeted mitigations for listed species \\nparticularly vulnerable to pesticides and \\nInterim Ecological Mitigations that EPA \\nhas begun incorporating under the Federal \\nInsecticide, Fungicide, and Rodenticide Act. \\nThe draft framework describes how EPA would \\napply the mitigations in the Strategy compared \\nto mitigations in the other initiatives. \\nWhat can you do? Submit \\ncomments! Learn more!\\nThe draft herbicide framework and \\naccompanying documents are available in \\ndocket EPA-HQ-OPP-2023-0365 for public \\ncomment for 60 days. Comments are due \\nSeptember 22, 2023. Agricultural pesticide \\nusers are encouraged to learn about EPAs\", 'url': 'https://extension.illinois.edu/sites/default/files/2023-08/IPR%20Volume%2036%20Issue%204%20July%20August%20SECURE.pdf'}, {'base_url': 'https://extension.illinois.edu/newsletters/illinois-pesticide-review-newsletter/julyaugust-2023', 'course_name ': 'cropwizard', 'pagenumber': 3, 'readable_filename': 'extension.pdf', 's3_path': 'courses/cropwizard/extension.pdf', 'text': 'Illinois Pesticide Review \\nVolume 36, No. 4, July/August \\n3\\nIts important to consider that one should \\ntake the test in the language he or she is \\nmost comfortable with. If someone has \\nbeen studying the material in English, dont \\nbe surprised if they opt to take the exam in \\nEnglish too. \\nIn the end, it all comes down to good \\ncommunication between you and your \\nemployees. 
It could be that they dont know \\nyet which learning method would work best \\nfor them and theyll need to try a few things. \\nTheyll appreciate you taking the time to ask \\nthem and work with them to help ensure their \\nsuccess.\\nMichelle Wiesbrook \\nEPA Releases Draft \\nHerbicide Strategy, Public \\nComment Period Open \\nStrategy aims to increase \\nefficiencies while supporting \\nfarmers, herbicide users with \\ncontinued use of important \\npesticide tools \\nThe U.S. Environmental Protection Agency \\n(EPA) released the draft Herbicide Strategy \\nfor public comment, a major milestone in the \\nAgencys work to protect federally endangered \\nand threatened (listed) species from conven-\\ntional agricultural herbicides. The Strategy \\ndescribes proposed early mitigations for more \\nthan 900 listed species and designated criti-\\ncal habitats to reduce potential impacts from \\nthe agricultural use of these herbicides while \\nhelping to ensure the continued availability of \\nthese important pesticide tools.\\nEnsuring safe use of herbicides is an \\nimportant part of EPAs mission to protect \\nthe environment, said Deputy Assistant \\nAdministrator for Pesticide Programs for \\nthe Office of Chemical Safety and Pollution \\nPrevention Jake Li. This strategy reflects one \\nof our biggest steps to support farmers and \\nother herbicide users with tools for managing \\nweeds, while accelerating EPAs ability to \\nprotect many endangered species that live near \\nagricultural areas.\\nThe Strategy is part of EPAs ongoing efforts \\nto develop a multichemical, multispecies \\napproach toward meeting its obligations \\nunder the Endangered Species Act (ESA). \\nEPAs traditional chemical-by-chemical, \\nspecies-by-species approach to meeting these \\nobligations is slow and costly. As a result, EPA \\nhas completed its ESA obligations for less than \\n5% of its actions, creating legal vulnerabilities \\nfor the Agency, increased litigation, and \\nuncertainty for farmers and other pesticide \\nusers about their continued ability to use many \\npesticides. The Strategy which is primarily \\ndesigned to provide early mitigations that \\nminimize impacts to over 900 listed species \\nis one of EPAs most significant proposals to \\nhelp overcome these challenges.\\nEPA focused the Strategy on agricultural crop \\nuses in the lower 48 states because hundreds \\nof millions of pounds of herbicides (and plant \\ngrowth regulators) are applied each year, \\nwhich is substantially more than for non-\\nagricultural uses of herbicides and for other \\npesticide classes (e.g., insecticides, fungicides). \\nAdditionally, hundreds of listed species in \\nthe lower 48 states live in habitats adjacent to \\nagricultural areas. The proposed mitigations \\nin the Strategy would address the most \\ncommon ways that conventional agricultural \\nherbicides might impact these listed \\nspecies. More specifically, EPA developed \\npotential mitigation options for conventional \\nagricultural herbicides to reduce pesticide \\ntransport via spray drift and runoff/erosion \\nthat could result in exposure to listed plants \\nand listed animals that depend on plants.\\nEPA expects that the Strategy will increase \\nthe efficiency of future ESA consultations \\non herbicides with the U.S. 
Fish and Wildlife \\nService (FWS), which has authority over most \\nlisted species that could benefit from the', 'url': 'https://extension.illinois.edu/sites/default/files/2023-08/IPR%20Volume%2036%20Issue%204%20July%20August%20SECURE.pdf'}, {'base_url': 'https://extension.illinois.edu/newsletters/illinois-pesticide-review-newsletter/julyaugust-2023', 'course_name ': 'cropwizard', 'pagenumber': '', 'readable_filename': 'News', 's3_path': 'courses/cropwizard/News.html', 'text': \". — The U.S. Environmental Protection Agency, in collaboration with the U.S. Department of Energy, is funding the creation of Thriving Communities Technical Assistance Centers to help rural and underserved communities access state and federal funding for environmental...\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\nSeptember 1, 2023\\n\\nAddress health challenges with holistic solutions\\n\\n \\n URBANA, Ill. — The University of Illinois, along with the Interdisciplinary Health Sciences Institute, and in collaboration with Illinois Extension, has developed...\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\nSeptember 1, 2023\\n\\nDo artificial roosts help bats? Illinois experts say more research needed\\n\\n \\n URBANA, Ill.\\xa0— Artificial roosts for bats come in many forms — bat boxes, condos, bark mimics, clay roosts, and cinder block structures, to name a few — but a new conservation practice and policy article from researchers at the\\xa0University of...\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\nAugust 29, 2023\\n\\nButterflies can eat to live, live to eat in a balanced garden\\n\\n \\n URBANA, Ill. — A favorite thing about visiting gardens in the summer is catching sight of a butterfly enjoying nectar from a brightly colored zinnia or a monarch caterpillar munching on a milkweed leaf. When designing a butterfly garden, expand and balance plant selection to provide more than...\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\nAugust 24, 2023\\n\\nField Day event plans to keep beef cattle producers up on trends for their herds\\n\\n \\n URBANA, Ill. — Beef cattle producers will gain insights and stay up to date on current research from cow/calf patterns to alternative forages and more at the Orr Beef Research Center's Field Day on September 6.\\xa0The meeting will be held at the John Wood Community College Ag Center located west of...\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\nAugust 23, 2023\\n\\nBuild drought-tolerant gardens this fall for next summer’s blooms\\n\\n \\n URBANA, Ill. — Many Illinois gardens are entering the fall stressed from the lack of summer rains combined with scorching hot temperatures. These conditions easily stress some plants; however, many plants quickly adapt to hot, dry conditions. Drought-tolerant plants are not only tough and...\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\nAugust 23, 2023\\n\\nIllinois Extension exhibits research, programs, innovation at 2023 Farm Progress Show\\n\\n \\n DECATUR, Ill. — The Farm Progress Show returns to Decatur, Aug. 29-31, and\\xa0University of Illinois Extension will be on-site in the College of...\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\nAugust 21, 2023\\n\\nBuild privacy with plants for secret gardens\\n\\n \\n URBANA, Ill.\\xa0— Plants serve a lot of purposes in the landscape. One of which is to add some privacy. Screening plants can help define and give purpose to a space. 
Homeowners may wish to screen a particular area or transparency in the landscape, creating interest in what lies beyond.\\xa0\\n\\n...\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\nAugust 21, 2023\\n\\nIllinois Extension investing in the future of farming across the state\\n\\n \\n URBANA, Ill. — Helping Illinois farmers grow thriving crops and livestock has always been at the heart of University of Illinois Extension’s mission. Using feedback received from farmers and other agricultural stakeholders through a 2022 survey,...\\n \\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n \\n\\n\\n\\nAugust 21, 2023\\n\\nExtraordinary 4-H volunteers honored\\n\\n \\n SPRINGFIELD, Ill\", 'url': 'https://extension.illinois.edu/global/news-releases'}, {'base_url': 'https://extension.illinois.edu/newsletters/illinois-pesticide-review-newsletter/julyaugust-2023', 'course_name ': 'cropwizard', 'pagenumber': '', 'readable_filename': 'Extension Funded Research Projects', 's3_path': 'courses/cropwizard/Extension_Funded_Research_Projects.html', 'text': '. Today’s most pressing issues are related to climate change. An understanding of how human activity impacts the environment helps us make sense of how a zoonotic transfer of a virus to humans causes a global pandemic, and how rising temperatures increase the frequency and intensity of wildfires and storms. Environmental Education (EE) connects people to their environments, instilling values and knowledge that leads to better environmental stewardship of local environments and natural resources. There are several well-established EE programs offered throughout Cook County by Illinois\\xa0Extension and the Forest Preserve of Cook County (FPCC). Missing from these opportunities are programs available to middle school-aged children, the age best suited for EE experiences because their environmental sensitivities are still developing and early experiences in nature have been shown to have impacts throughout adulthood (Boom, 2017). This proposal seeks to develop a program, Illinois Inquiry Adventures in Nature (IIAN), for middle school children and their families, suitable for small groups during the pandemic\\xa0and expanding in scope to include classrooms when safe. A series of four seasonal activities\\xa0and teacher workshops\\xa0will be created to bring groups to their local green spaces, including FPCC sites. Groups will engage in open-ended investigations based on their own observations and questions, complete activities at home\\xa0and enact local community conservation projects. Research will be conducted to examine how individuals’ connections to nature and environmental stewardship change over the course of their participation. This program fills a local need in Cook County, creating a continuum of opportunities across ages, and will be made available to all residents in Illinois, and nationwide, encouraging the next generation of environmental leaders.\\n\\n\\nAssessing the Needs and Connecting Young & Beginning Farmers with Extension Resources in Northern Illinois\\nAwarded to: Illinois Extension in the College of ACES\\nPrincipal Investigator: Joseph Malual\\nCo-Investigators:\\nNikki Keltner, Extension program coordinator, Illinois Extension\\nGrant McCarty, Extension educator, Illinois Extension\\nHope Michelson, assistant professor,\\xa0Department of Agricultural & Consumer Economics\\nPROJECT SUMMARY\\nMore and more young people are engaging in small-scale farming, with many focusing on specialty crops and sustainable agricultural production. 
Despite this trend, entry into farming, which is a complex business, is challenging. Beginning farmers face serious obstacles in accessing critical assets, including startup capital to acquire land, farm equipment\\xa0and agricultural technical knowledge needed to develop a\\xa0successful agricultural practice and profitable business. The situation is complicated by lack of adequate research to understand the unique challenges facing this generation of farmers. In Illinois, there is limited research to understand how people new to farming navigate access to critical resources. This research project aims to provide a comprehensive assessment of the needs and opportunities facing young and beginning\\xa0farmers in northern Illinois. We will identify and map farms owned by young and beginning farmers, examine their experiences and strategies used to leverage critical startup assets, including farmland and equipment, financial capital\\xa0and agricultural technical assistance, as well as strategies for marketing agricultural products. This project will build relations and connect this new audience with Extension resources, which can help\\xa0beginning farmers develop the knowledge and skills necessary for solving critical problems. Through interdisciplinary collaboration between Extension educators and specialists with faculty at the University of Illinois at Urbana-Champaign, this research will generate useful knowledge that can help beginning farmers, businesses\\xa0and communities make informed decisions and plan for future support of those new to farming. The\\xa0knowledge and practices discovered and identified through this project will be shared with Extension across the state. Extension educators can build on this knowledge to plan and deliver educational programming that empowers farmers to develop financially viable and sustainable farms. Those successful endeavors will, in turn, help to revitalize their rural communities.\\n\\n\\nNew Immigrant Foodways\\nAwarded to: Department of History in the College of Liberal Arts and Sciences\\nPrincipal Investigator: Teresa Barnes\\nCo-Investigators:\\nGisela Sin, director, Center for Latin American and Caribbean Studies\\nMargarita Teran-Garcia, Extension specialist, Illinois Extension\\nPROJECT SUMMARY\\nThis project will leverage new and existing research with immigrant communities about challenges and strategies in adapting home foodways to American food systems to create short instructional videos related to nutrition and cooking. The project addresses a complex issue at the intersection of three critical areas of Extension’s mission: food, health\\xa0and environment. 
It addresses the public need of new immigrant families to access information and expertise and develop sustainable strategies when faced with the bewildering array of often unhealthy food options in the USA', 'url': 'https://extension.illinois.edu/global/extension-funded-research-projects'}]}\n" - ] - } - ], + "outputs": [], "source": [ - "print(course_df[0]['messages'][1])" + "course_list = df['course_name'].unique()\n" ] }, { @@ -461,9 +179,7 @@ "execution_count": 32, "metadata": {}, "outputs": [], - "source": [ - "# user email is in DF, outside of convo" - ] + "source": [] }, { "cell_type": "markdown", @@ -481,36 +197,538 @@ "name": "stdout", "output_type": "stream", "text": [ - "22\n", - "22\n" + "gpt4\n", + "623\n", + "badm_550_ashley\n", + "17\n", + "None\n", + "0\n", + "ece120\n", + "154\n", + "test-video-ingest\n", + "13\n", + "badm-567-v3\n", + "15\n", + "badm-567\n", + "3\n", + "new-weather\n", + "65\n", + "gies-online-mba-v2\n", + "course_name: gies-online-mba-v2\n", + "error: The read operation timed out\n", + "frontend\n", + "8\n", + "test-video-ingest-28\n", + "2\n", + "ECE220FA23\n", + "74\n", + "ECE408FA23\n", + "259\n", + "pdeploy999\n", + "2\n", + "badm-350-summer\n", + "5\n", + "previewtesting1\n", + "1\n", + "localtest2\n", + "2\n", + "your-favorite-url\n", + "1\n", + "mantine\n", + "6\n", + "ece408\n", + "27\n", + "27\n", + "27\n", + "(27, 1536)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-09-12 15:42:49.002 | INFO | nomic.project:_create_project:790 - Creating project `Conversation Map for ece408` in organization `dabholkar.asmita`\n", + "2023-09-12 15:42:50.721 | INFO | nomic.atlas:map_embeddings:108 - Uploading embeddings to Atlas.\n", + "2023-09-12 15:42:50.721 | WARNING | nomic.project:_validate_and_correct_arrow_upload:238 - Replacing 20 null values for field user_email with string 'null'. This behavior will change in a future version.\n", + "2023-09-12 15:42:50.721 | WARNING | nomic.project:_validate_and_correct_arrow_upload:261 - id_field is not a string. 
Converting to string from int32\n", + "1it [00:02, 2.16s/it]\n", + "2023-09-12 15:42:52.900 | INFO | nomic.project:_add_data:1422 - Upload succeeded.\n", + "2023-09-12 15:42:52.908 | INFO | nomic.atlas:map_embeddings:127 - Embedding upload succeeded.\n", + "2023-09-12 15:42:56.033 | INFO | nomic.project:create_index:1132 - Created map `Conversation Map for ece408` in project `Conversation Map for ece408`: https://atlas.nomic.ai/map/df8e3337-396a-443d-a6f5-8240c66024ac/bc754afd-83fb-43cb-99db-e2bd26f1f40b\n", + "2023-09-12 15:42:56.046 | INFO | nomic.atlas:map_embeddings:140 - Conversation Map for ece408: https://atlas.nomic.ai/map/df8e3337-396a-443d-a6f5-8240c66024ac/bc754afd-83fb-43cb-99db-e2bd26f1f40b\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Conversation Map for ece408: https://atlas.nomic.ai/map/df8e3337-396a-443d-a6f5-8240c66024ac/bc754afd-83fb-43cb-99db-e2bd26f1f40b]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-09-12 15:43:00.951 | INFO | nomic.project:create_index:1132 - Created map `ece408_convo_index` in project `Conversation Map for ece408`: https://atlas.nomic.ai/map/df8e3337-396a-443d-a6f5-8240c66024ac/49bd2ab9-db8a-45ab-b399-5039c7b7e736\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "test-video-ingest-2\n", + "2\n", + "Snowmass\n", + "19\n", + "badm_567_v2\n", + "11\n", + "erpnext\n", + "1\n", + "mip\n", + "1\n", + "farmdoc_test_kastan-v1\n", + "9\n", + "personalstatement\n", + "2\n", + "test-canvas\n", + "4\n", + "hrc\n", + "3\n", + "csv\n", + "4\n", + "star_nox\n", + "3\n", + "badm_567\n", + "3\n", + "SPaRCEd\n", + "2\n", + "NPRE247\n", + "13\n", + "localdemo8\n", + "2\n", + "badm_567_thumbnails\n", + "2\n", + "your-awesome-course\n", + "course_name: your-awesome-course\n", + "error: The read operation timed out\n", + "chatbot\n", + "3\n", + "erp\n", + "2\n", + "extreme\n", + "3\n", + "rohan_atree\n", + "4\n", + "zotero-extreme\n", + "9\n", + "pract\n", + "18\n", + "test-video-ingest-20\n", + "3\n", + "gies-online-mba2\n", + "2\n", + "gies-online-mba\n", + "3\n", + "ece120FL22\n", + "15\n", + "careerassistant\n", + "7\n", + "weather\n", + "4\n", + "lillian-wang-blog\n", + "2\n", + "local-test5\n", + "4\n", + "demo-for-vyriad\n", + "6\n", + "ag-gpt-beta\n", + "5\n", + "rohan_atree_demo\n", + "2\n", + "cropwizard\n", + "25\n", + "25\n", + "25\n", + "(25, 1536)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-09-12 15:43:49.933 | INFO | nomic.project:_create_project:790 - Creating project `Conversation Map for cropwizard` in organization `dabholkar.asmita`\n", + "2023-09-12 15:43:50.980 | INFO | nomic.atlas:map_embeddings:108 - Uploading embeddings to Atlas.\n", + "2023-09-12 15:43:50.980 | WARNING | nomic.project:_validate_and_correct_arrow_upload:261 - id_field is not a string. 
Converting to string from int32\n", + "1it [00:03, 3.94s/it]\n", + "2023-09-12 15:43:54.938 | INFO | nomic.project:_add_data:1422 - Upload succeeded.\n", + "2023-09-12 15:43:54.953 | INFO | nomic.atlas:map_embeddings:127 - Embedding upload succeeded.\n", + "2023-09-12 15:43:58.534 | INFO | nomic.project:create_index:1132 - Created map `Conversation Map for cropwizard` in project `Conversation Map for cropwizard`: https://atlas.nomic.ai/map/c9b13bcc-d0cb-40a6-80c6-3e98b1bf0bda/53b8076a-7f80-455f-abea-2cf84bc1912c\n", + "2023-09-12 15:43:58.534 | INFO | nomic.atlas:map_embeddings:140 - Conversation Map for cropwizard: https://atlas.nomic.ai/map/c9b13bcc-d0cb-40a6-80c6-3e98b1bf0bda/53b8076a-7f80-455f-abea-2cf84bc1912c\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Conversation Map for cropwizard: https://atlas.nomic.ai/map/c9b13bcc-d0cb-40a6-80c6-3e98b1bf0bda/53b8076a-7f80-455f-abea-2cf84bc1912c]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-09-12 15:44:00.949 | INFO | nomic.project:create_index:1132 - Created map `cropwizard_convo_index` in project `Conversation Map for cropwizard`: https://atlas.nomic.ai/map/c9b13bcc-d0cb-40a6-80c6-3e98b1bf0bda/5eb008c1-5a10-4f20-ab7d-c42a238e1595\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "rtest\n", + "1\n", + "previewdeploy\n", + "3\n", + "r2test\n", + "1\n", + "Law794-TransactionalDraftingAlam\n", + "4\n", + "personal-statement\n", + "2\n", + "rohan_excel\n", + "1\n", + "langchain-python\n", + "1\n", + "langchain\n", + "4\n", + "ncsa-live-demo\n", + "1\n", + "rohan_atree_individual\n", + "2\n", + "meta11-test\n", + "14\n", + "ceesd-mirgecom\n", + "2\n", + "NCSADelta\n", + "10\n", + "HealthyLivingGuide\n", + "3\n", + "rohan\n", + "2\n", + "NCSA\n", + "40\n", + "40\n", + "40\n", + "(40, 1536)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-09-12 15:44:35.409 | INFO | nomic.project:_create_project:790 - Creating project `Conversation Map for NCSA` in organization `dabholkar.asmita`\n", + "2023-09-12 15:44:36.768 | INFO | nomic.atlas:map_embeddings:108 - Uploading embeddings to Atlas.\n", + "2023-09-12 15:44:36.778 | WARNING | nomic.project:_validate_and_correct_arrow_upload:238 - Replacing 10 null values for field user_email with string 'null'. This behavior will change in a future version.\n", + "2023-09-12 15:44:36.783 | WARNING | nomic.project:_validate_and_correct_arrow_upload:261 - id_field is not a string. 
Converting to string from int32\n", + "1it [00:01, 1.99s/it]\n", + "2023-09-12 15:44:38.783 | INFO | nomic.project:_add_data:1422 - Upload succeeded.\n", + "2023-09-12 15:44:38.784 | INFO | nomic.atlas:map_embeddings:127 - Embedding upload succeeded.\n", + "2023-09-12 15:44:40.137 | INFO | nomic.project:create_index:1132 - Created map `Conversation Map for NCSA` in project `Conversation Map for NCSA`: https://atlas.nomic.ai/map/d2aef24e-2ea4-4712-87c0-804da0ab96b0/7b2238ae-7eb9-407a-ac60-6d5a8fd1f447\n", + "2023-09-12 15:44:40.146 | INFO | nomic.atlas:map_embeddings:140 - Conversation Map for NCSA: https://atlas.nomic.ai/map/d2aef24e-2ea4-4712-87c0-804da0ab96b0/7b2238ae-7eb9-407a-ac60-6d5a8fd1f447\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Conversation Map for NCSA: https://atlas.nomic.ai/map/d2aef24e-2ea4-4712-87c0-804da0ab96b0/7b2238ae-7eb9-407a-ac60-6d5a8fd1f447]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-09-12 15:44:45.686 | INFO | nomic.project:create_index:1132 - Created map `NCSA_convo_index` in project `Conversation Map for NCSA`: https://atlas.nomic.ai/map/d2aef24e-2ea4-4712-87c0-804da0ab96b0/331ba551-f6b4-4c79-a31c-4cc5390bfac7\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FIN574-GT\n", + "24\n", + "24\n", + "24\n", + "(24, 1536)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-09-12 15:45:00.655 | INFO | nomic.project:_create_project:790 - Creating project `Conversation Map for FIN574-GT` in organization `dabholkar.asmita`\n", + "2023-09-12 15:45:04.369 | INFO | nomic.atlas:map_embeddings:108 - Uploading embeddings to Atlas.\n", + "2023-09-12 15:45:04.385 | WARNING | nomic.project:_validate_and_correct_arrow_upload:261 - id_field is not a string. 
Converting to string from int32\n", + "1it [00:06, 6.08s/it]\n", + "2023-09-12 15:45:10.475 | INFO | nomic.project:_add_data:1422 - Upload succeeded.\n", + "2023-09-12 15:45:10.475 | INFO | nomic.atlas:map_embeddings:127 - Embedding upload succeeded.\n", + "2023-09-12 15:45:13.721 | INFO | nomic.project:create_index:1132 - Created map `Conversation Map for FIN574-GT` in project `Conversation Map for FIN574-GT`: https://atlas.nomic.ai/map/d83f5440-9ef1-45ed-a2e5-c3229398b0e8/149f6eab-f636-4754-8117-2da7f030c5b3\n", + "2023-09-12 15:45:13.723 | INFO | nomic.atlas:map_embeddings:140 - Conversation Map for FIN574-GT: https://atlas.nomic.ai/map/d83f5440-9ef1-45ed-a2e5-c3229398b0e8/149f6eab-f636-4754-8117-2da7f030c5b3\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Conversation Map for FIN574-GT: https://atlas.nomic.ai/map/d83f5440-9ef1-45ed-a2e5-c3229398b0e8/149f6eab-f636-4754-8117-2da7f030c5b3]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-09-12 15:45:19.164 | INFO | nomic.project:create_index:1132 - Created map `FIN574-GT_convo_index` in project `Conversation Map for FIN574-GT`: https://atlas.nomic.ai/map/d83f5440-9ef1-45ed-a2e5-c3229398b0e8/092d7d2c-b792-4304-ae04-d8f09ffbba5d\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "babreu\n", + "1\n", + "test-video-ingest-31\n", + "1\n", + "p\n", + "1\n", + "new_test_course\n", + "1\n", + "cropwizard-beta\n", + "21\n", + "21\n", + "21\n", + "(21, 1536)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-09-12 15:45:30.549 | INFO | nomic.project:_create_project:790 - Creating project `Conversation Map for cropwizard-beta` in organization `dabholkar.asmita`\n", + "2023-09-12 15:45:32.134 | INFO | nomic.atlas:map_embeddings:108 - Uploading embeddings to Atlas.\n", + "2023-09-12 15:45:32.150 | WARNING | nomic.project:_validate_and_correct_arrow_upload:238 - Replacing 3 null values for field user_email with string 'null'. This behavior will change in a future version.\n", + "2023-09-12 15:45:32.150 | WARNING | nomic.project:_validate_and_correct_arrow_upload:261 - id_field is not a string. 
Converting to string from int32\n", + "1it [00:00, 1.08it/s]\n", + "2023-09-12 15:45:33.088 | INFO | nomic.project:_add_data:1422 - Upload succeeded.\n", + "2023-09-12 15:45:33.092 | INFO | nomic.atlas:map_embeddings:127 - Embedding upload succeeded.\n", + "2023-09-12 15:45:34.335 | INFO | nomic.project:create_index:1132 - Created map `Conversation Map for cropwizard-beta` in project `Conversation Map for cropwizard-beta`: https://atlas.nomic.ai/map/44b31bc3-726e-4930-9584-616bbcb2d5d3/d3a66bb5-0ab9-4e9c-9fe5-10aa840ce9bd\n", + "2023-09-12 15:45:34.335 | INFO | nomic.atlas:map_embeddings:140 - Conversation Map for cropwizard-beta: https://atlas.nomic.ai/map/44b31bc3-726e-4930-9584-616bbcb2d5d3/d3a66bb5-0ab9-4e9c-9fe5-10aa840ce9bd\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[Conversation Map for cropwizard-beta: https://atlas.nomic.ai/map/44b31bc3-726e-4930-9584-616bbcb2d5d3/d3a66bb5-0ab9-4e9c-9fe5-10aa840ce9bd]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-09-12 15:45:37.865 | INFO | nomic.project:create_index:1132 - Created map `cropwizard-beta_convo_index` in project `Conversation Map for cropwizard-beta`: https://atlas.nomic.ai/map/44b31bc3-726e-4930-9584-616bbcb2d5d3/20a567c6-056b-49b3-a421-f0e49f348cda\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "maxlindsey\n", + "1\n", + "Gies-graduate-hub\n", + "4\n", + "test-video-ingest-17\n", + "2\n", + "summary\n", + "10\n", + "test-video-ingest-3\n", + "8\n", + "test-video-ingest-27\n", + "2\n", + "lillian-wang-blog-2\n", + "1\n", + "python-magic\n", + "1\n", + "ansible2\n", + "1\n", + "ece408fa23\n", + "7\n", + "farmdoc_test_josh_v2\n", + "1\n", + "local-test3\n", + "1\n", + "automata\n", + "1\n", + "SpaceFlorida-GT\n", + "5\n", + "GBSI-GT\n", + "4\n", + "test-video-ingest-21\n", + "8\n", + "newnew_ncsa\n", + "1\n", + "canvas\n", + "1\n", + "gbsi-gt\n", + "3\n", + "meditation-tutorial\n", + "1\n", + "profit\n", + "1\n", + "ansible\n", + "8\n", + "langchain-docs\n", + "9\n", + "testing_url_metadata_josh\n", + "1\n", + "test-india-biodiversity\n", + "1\n", + "vyriad\n", + "10\n", + "irc-workplans\n", + "1\n", + "kastanasdfasdfasdf\n", + "1\n", + "testing_refactor\n", + "2\n", + "BADM-567-GT\n", + "3\n", + "mdt\n", + "1\n", + "vercel\n", + "1\n", + "gies-graduate-hub\n", + "course_name: gies-graduate-hub\n", + "error: The read operation timed out\n", + "test-video-ingest-12\n", + "3\n", + "NuclGPT-v1\n", + "2\n", + "test-video-ingest-13\n", + "1\n", + "test_new_supa_scrape\n", + "1\n", + "doe-ascr-2023\n", + "1\n", + "arize\n", + "2\n", + "final-meta-test\n", + "1\n", + "preview-meta-test\n", + "1\n", + "gies-online-mba-V3\n", + "2\n", + "FoF-Drawdown-from-INVPEIV-5-24-23\n", + "1\n", + "test-video-ingest-30\n", + "1\n", + "test\n", + "1\n", + "NCSA-v2\n", + "3\n", + "conversational\n", + "1\n", + "clowder-docs\n", + "5\n", + "DA\n", + "1\n", + "test-video-ingest-25\n", + "1\n", + "test-ingest-10\n", + "1\n", + "eric-test-course\n", + "1\n", + "farmdoc-test\n", + "1\n", + "test-video-ingest-22\n", + "2\n", + "Academic-paper\n", + "1\n", + "starwars\n", + "1\n", + "AdamDemo\n", + "1\n", + "OpenCI-ACCESS\n", + "1\n", + "clockkit-github\n", + "1\n" ] } ], "source": [ - "user_queries = []\n", - "metadata = []\n", - "i = 1\n", + "embeddings_model = OpenAIEmbeddings()\n", "\n", - "# log conversation instead of individual messages\n", - "for index, row in df.iterrows():\n", - " user_email = row['user_email']\n", - " convo = row['convo']\n", - " 
messages = convo['messages']\n", - " first_message = messages[0]['content']\n", - " user_queries.append(first_message)\n", - " # create metadata for multi-turn conversation\n", - " conversation = \"\"\n", - " for message in messages:\n", - " # string of role: content, role: content, ...\n", - " conversation += \"\\n>>> \" + message['role'] + \": \" + message['content'] + \"\\n\"\n", - " # add to metadata\n", - " metadata_row = {\"course\": row['course_name'], \"conversation\": conversation, \"conversation_id\": convo['id'], \n", - " \"id\": i, \"user_email\": user_email, \"first_query\": first_message}\n", - " metadata.append(metadata_row)\n", - " i += 1\n", + "for course in course_list:\n", + " print(course)\n", + " try:\n", + " response = supabase_client.table(\"llm-convo-monitor\").select(\"*\").eq('course_name', course).execute()\n", + " data = response.data\n", + " course_df = pd.DataFrame(data)\n", + " print(len(course_df))\n", + "\n", + " if len(course_df) < 20 or course in ['gpt4', 'badm_550_ashley', 'ece120', 'new-weather', 'ECE220FA23', 'ECE408FA23']:\n", + " continue\n", + " else:\n", + " \n", + " user_queries = []\n", + " metadata = []\n", + " i = 1\n", "\n", - "print(len(user_queries))\n", - "print(len(metadata))" + " # log conversation instead of individual messages\n", + " for index, row in course_df.iterrows():\n", + " user_email = row['user_email']\n", + " convo = row['convo']\n", + " messages = convo['messages']\n", + " first_message = messages[0]['content']\n", + " user_queries.append(first_message)\n", + " # create metadata for multi-turn conversation\n", + " conversation = \"\"\n", + " for message in messages:\n", + " # string of role: content, role: content, ...\n", + " if message['role'] == 'user':\n", + " emoji = \"🙋\"\n", + " else:\n", + " emoji = \"🤖\"\n", + " conversation += \"\\n>>> \" + emoji + message['role'] + \": \" + message['content'] + \"\\n\"\n", + " # add to metadata\n", + " metadata_row = {\"course\": row['course_name'], \"conversation\": conversation, \"conversation_id\": convo['id'], \n", + " \"id\": i, \"user_email\": user_email, \"first_query\": first_message}\n", + " metadata.append(metadata_row)\n", + " i += 1\n", + "\n", + " print(len(user_queries))\n", + " print(len(metadata))\n", + "\n", + " metadata = pd.DataFrame(metadata)\n", + " embeddings = embeddings_model.embed_documents(user_queries)\n", + " embeddings = np.array(embeddings)\n", + " print(embeddings.shape)\n", + "\n", + " # create an Atlas project\n", + " project_name = \"Conversation Map for \" + course\n", + " index_name = course + \"_convo_index\"\n", + " project = atlas.map_embeddings(embeddings=np.array(embeddings),\n", + " data=metadata,\n", + " id_field='id',\n", + " build_topic_model=True,\n", + " topic_label_field='first_query',\n", + " name=project_name,\n", + " colorable_fields=['conversation_id', 'first_query'])\n", + " print(project.maps)\n", + "\n", + " project.create_index(index_name, build_topic_model=True)\n", + " except Exception as e:\n", + " print(\"course_name:\", course)\n", + " print(\"error: \", e)\n", + " " ] }, { @@ -602,7 +820,7 @@ "source": [ "# create an Atlas project\n", "project_name = \"Conversation Map for \" + course\n", - "index_name = course + \"_convo_index_2\"\n", + "index_name = course + \"_convo_index\"\n", "project = atlas.map_embeddings(embeddings=np.array(embeddings),\n", " data=metadata,\n", " id_field='id',\n", @@ -615,6 +833,68 @@ "project.create_index(index_name, build_topic_model=True)" ] }, + { + "cell_type": "code", + "execution_count": 
15, + "metadata": {}, + "outputs": [ + { + "ename": "ReadTimeout", + "evalue": "The read operation timed out", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTimeoutError\u001b[0m Traceback (most recent call last)", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\_exceptions.py:10\u001b[0m, in \u001b[0;36mmap_exceptions\u001b[1;34m(map)\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m---> 10\u001b[0m \u001b[39myield\u001b[39;00m\n\u001b[0;32m 11\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m \u001b[39mas\u001b[39;00m exc: \u001b[39m# noqa: PIE786\u001b[39;00m\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\backends\\sync.py:28\u001b[0m, in \u001b[0;36mSyncStream.read\u001b[1;34m(self, max_bytes, timeout)\u001b[0m\n\u001b[0;32m 27\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_sock\u001b[39m.\u001b[39msettimeout(timeout)\n\u001b[1;32m---> 28\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sock\u001b[39m.\u001b[39;49mrecv(max_bytes)\n", + "File \u001b[1;32mC:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\\lib\\ssl.py:1259\u001b[0m, in \u001b[0;36mSSLSocket.recv\u001b[1;34m(self, buflen, flags)\u001b[0m\n\u001b[0;32m 1256\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 1257\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mnon-zero flags not allowed in calls to recv() on \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m\"\u001b[39m \u001b[39m%\u001b[39m\n\u001b[0;32m 1258\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m)\n\u001b[1;32m-> 1259\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mread(buflen)\n\u001b[0;32m 1260\u001b[0m \u001b[39melse\u001b[39;00m:\n", + "File \u001b[1;32mC:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\\lib\\ssl.py:1132\u001b[0m, in \u001b[0;36mSSLSocket.read\u001b[1;34m(self, len, buffer)\u001b[0m\n\u001b[0;32m 1131\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m-> 1132\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sslobj\u001b[39m.\u001b[39;49mread(\u001b[39mlen\u001b[39;49m)\n\u001b[0;32m 1133\u001b[0m \u001b[39mexcept\u001b[39;00m SSLError \u001b[39mas\u001b[39;00m x:\n", + "\u001b[1;31mTimeoutError\u001b[0m: The read operation timed out", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mReadTimeout\u001b[0m Traceback (most recent call last)", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpx\\_transports\\default.py:60\u001b[0m, in \u001b[0;36mmap_httpcore_exceptions\u001b[1;34m()\u001b[0m\n\u001b[0;32m 59\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m---> 60\u001b[0m \u001b[39myield\u001b[39;00m\n\u001b[0;32m 61\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m \u001b[39mas\u001b[39;00m exc: \u001b[39m# noqa: PIE-786\u001b[39;00m\n", + "File 
\u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpx\\_transports\\default.py:218\u001b[0m, in \u001b[0;36mHTTPTransport.handle_request\u001b[1;34m(self, request)\u001b[0m\n\u001b[0;32m 217\u001b[0m \u001b[39mwith\u001b[39;00m map_httpcore_exceptions():\n\u001b[1;32m--> 218\u001b[0m resp \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_pool\u001b[39m.\u001b[39;49mhandle_request(req)\n\u001b[0;32m 220\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39misinstance\u001b[39m(resp\u001b[39m.\u001b[39mstream, typing\u001b[39m.\u001b[39mIterable)\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\_sync\\connection_pool.py:253\u001b[0m, in \u001b[0;36mConnectionPool.handle_request\u001b[1;34m(self, request)\u001b[0m\n\u001b[0;32m 252\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mresponse_closed(status)\n\u001b[1;32m--> 253\u001b[0m \u001b[39mraise\u001b[39;00m exc\n\u001b[0;32m 254\u001b[0m \u001b[39melse\u001b[39;00m:\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\_sync\\connection_pool.py:237\u001b[0m, in \u001b[0;36mConnectionPool.handle_request\u001b[1;34m(self, request)\u001b[0m\n\u001b[0;32m 236\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 237\u001b[0m response \u001b[39m=\u001b[39m connection\u001b[39m.\u001b[39;49mhandle_request(request)\n\u001b[0;32m 238\u001b[0m \u001b[39mexcept\u001b[39;00m ConnectionNotAvailable:\n\u001b[0;32m 239\u001b[0m \u001b[39m# The ConnectionNotAvailable exception is a special case, that\u001b[39;00m\n\u001b[0;32m 240\u001b[0m \u001b[39m# indicates we need to retry the request on a new connection.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 244\u001b[0m \u001b[39m# might end up as an HTTP/2 connection, but which actually ends\u001b[39;00m\n\u001b[0;32m 245\u001b[0m \u001b[39m# up as HTTP/1.1.\u001b[39;00m\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\_sync\\connection.py:90\u001b[0m, in \u001b[0;36mHTTPConnection.handle_request\u001b[1;34m(self, request)\u001b[0m\n\u001b[0;32m 88\u001b[0m \u001b[39mraise\u001b[39;00m ConnectionNotAvailable()\n\u001b[1;32m---> 90\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_connection\u001b[39m.\u001b[39;49mhandle_request(request)\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\_sync\\http11.py:112\u001b[0m, in \u001b[0;36mHTTP11Connection.handle_request\u001b[1;34m(self, request)\u001b[0m\n\u001b[0;32m 111\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_response_closed()\n\u001b[1;32m--> 112\u001b[0m \u001b[39mraise\u001b[39;00m exc\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\_sync\\http11.py:91\u001b[0m, in \u001b[0;36mHTTP11Connection.handle_request\u001b[1;34m(self, request)\u001b[0m\n\u001b[0;32m 83\u001b[0m \u001b[39mwith\u001b[39;00m Trace(\n\u001b[0;32m 84\u001b[0m 
\u001b[39m\"\u001b[39m\u001b[39mhttp11.receive_response_headers\u001b[39m\u001b[39m\"\u001b[39m, request, kwargs\n\u001b[0;32m 85\u001b[0m ) \u001b[39mas\u001b[39;00m trace:\n\u001b[0;32m 86\u001b[0m (\n\u001b[0;32m 87\u001b[0m http_version,\n\u001b[0;32m 88\u001b[0m status,\n\u001b[0;32m 89\u001b[0m reason_phrase,\n\u001b[0;32m 90\u001b[0m headers,\n\u001b[1;32m---> 91\u001b[0m ) \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_receive_response_headers(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m 92\u001b[0m trace\u001b[39m.\u001b[39mreturn_value \u001b[39m=\u001b[39m (\n\u001b[0;32m 93\u001b[0m http_version,\n\u001b[0;32m 94\u001b[0m status,\n\u001b[0;32m 95\u001b[0m reason_phrase,\n\u001b[0;32m 96\u001b[0m headers,\n\u001b[0;32m 97\u001b[0m )\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\_sync\\http11.py:155\u001b[0m, in \u001b[0;36mHTTP11Connection._receive_response_headers\u001b[1;34m(self, request)\u001b[0m\n\u001b[0;32m 154\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m--> 155\u001b[0m event \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_receive_event(timeout\u001b[39m=\u001b[39;49mtimeout)\n\u001b[0;32m 156\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(event, h11\u001b[39m.\u001b[39mResponse):\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\_sync\\http11.py:191\u001b[0m, in \u001b[0;36mHTTP11Connection._receive_event\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 190\u001b[0m \u001b[39mif\u001b[39;00m event \u001b[39mis\u001b[39;00m h11\u001b[39m.\u001b[39mNEED_DATA:\n\u001b[1;32m--> 191\u001b[0m data \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_network_stream\u001b[39m.\u001b[39;49mread(\n\u001b[0;32m 192\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mREAD_NUM_BYTES, timeout\u001b[39m=\u001b[39;49mtimeout\n\u001b[0;32m 193\u001b[0m )\n\u001b[0;32m 195\u001b[0m \u001b[39m# If we feed this case through h11 we'll raise an exception like:\u001b[39;00m\n\u001b[0;32m 196\u001b[0m \u001b[39m#\u001b[39;00m\n\u001b[0;32m 197\u001b[0m \u001b[39m# httpcore.RemoteProtocolError: can't handle event type\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 201\u001b[0m \u001b[39m# perspective. 
Instead we handle this case distinctly and treat\u001b[39;00m\n\u001b[0;32m 202\u001b[0m \u001b[39m# it as a ConnectError.\u001b[39;00m\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\backends\\sync.py:26\u001b[0m, in \u001b[0;36mSyncStream.read\u001b[1;34m(self, max_bytes, timeout)\u001b[0m\n\u001b[0;32m 25\u001b[0m exc_map: ExceptionMapping \u001b[39m=\u001b[39m {socket\u001b[39m.\u001b[39mtimeout: ReadTimeout, \u001b[39mOSError\u001b[39;00m: ReadError}\n\u001b[1;32m---> 26\u001b[0m \u001b[39mwith\u001b[39;00m map_exceptions(exc_map):\n\u001b[0;32m 27\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_sock\u001b[39m.\u001b[39msettimeout(timeout)\n", + "File \u001b[1;32mC:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\\lib\\contextlib.py:153\u001b[0m, in \u001b[0;36m_GeneratorContextManager.__exit__\u001b[1;34m(self, typ, value, traceback)\u001b[0m\n\u001b[0;32m 152\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 153\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mgen\u001b[39m.\u001b[39;49mthrow(typ, value, traceback)\n\u001b[0;32m 154\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mStopIteration\u001b[39;00m \u001b[39mas\u001b[39;00m exc:\n\u001b[0;32m 155\u001b[0m \u001b[39m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[0;32m 156\u001b[0m \u001b[39m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[0;32m 157\u001b[0m \u001b[39m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\_exceptions.py:14\u001b[0m, in \u001b[0;36mmap_exceptions\u001b[1;34m(map)\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(exc, from_exc):\n\u001b[1;32m---> 14\u001b[0m \u001b[39mraise\u001b[39;00m to_exc(exc)\n\u001b[0;32m 15\u001b[0m \u001b[39mraise\u001b[39;00m\n", + "\u001b[1;31mReadTimeout\u001b[0m: The read operation timed out", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[1;31mReadTimeout\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mf:\\MSIM\\ML_Projects\\ai-ta-backend\\ai_ta_backend\\nomic_map_creation.ipynb Cell 13\u001b[0m line \u001b[0;36m1\n\u001b[1;32m----> 1\u001b[0m response \u001b[39m=\u001b[39m supabase_client\u001b[39m.\u001b[39;49mtable(\u001b[39m\"\u001b[39;49m\u001b[39mllm-convo-monitor\u001b[39;49m\u001b[39m\"\u001b[39;49m)\u001b[39m.\u001b[39;49mselect(\u001b[39m\"\u001b[39;49m\u001b[39m*\u001b[39;49m\u001b[39m\"\u001b[39;49m)\u001b[39m.\u001b[39;49mexecute()\n\u001b[0;32m 2\u001b[0m data \u001b[39m=\u001b[39m response\u001b[39m.\u001b[39mdata\n\u001b[0;32m 3\u001b[0m df \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39mDataFrame(data)\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\postgrest\\_sync\\request_builder.py:55\u001b[0m, in \u001b[0;36mSyncQueryRequestBuilder.execute\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 43\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mexecute\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m APIResponse:\n\u001b[0;32m 44\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Execute the 
query.\u001b[39;00m\n\u001b[0;32m 45\u001b[0m \n\u001b[0;32m 46\u001b[0m \u001b[39m .. tip::\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 53\u001b[0m \u001b[39m :class:`APIError` If the API raised an error.\u001b[39;00m\n\u001b[0;32m 54\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m---> 55\u001b[0m r \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msession\u001b[39m.\u001b[39;49mrequest(\n\u001b[0;32m 56\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mhttp_method,\n\u001b[0;32m 57\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mpath,\n\u001b[0;32m 58\u001b[0m json\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mjson,\n\u001b[0;32m 59\u001b[0m params\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mparams,\n\u001b[0;32m 60\u001b[0m headers\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mheaders,\n\u001b[0;32m 61\u001b[0m )\n\u001b[0;32m 62\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 63\u001b[0m \u001b[39mif\u001b[39;00m (\n\u001b[0;32m 64\u001b[0m \u001b[39m200\u001b[39m \u001b[39m<\u001b[39m\u001b[39m=\u001b[39m r\u001b[39m.\u001b[39mstatus_code \u001b[39m<\u001b[39m\u001b[39m=\u001b[39m \u001b[39m299\u001b[39m\n\u001b[0;32m 65\u001b[0m ): \u001b[39m# Response.ok from JS (https://developer.mozilla.org/en-US/docs/Web/API/Response/ok)\u001b[39;00m\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpx\\_client.py:821\u001b[0m, in \u001b[0;36mClient.request\u001b[1;34m(self, method, url, content, data, files, json, params, headers, cookies, auth, follow_redirects, timeout, extensions)\u001b[0m\n\u001b[0;32m 806\u001b[0m warnings\u001b[39m.\u001b[39mwarn(message, \u001b[39mDeprecationWarning\u001b[39;00m)\n\u001b[0;32m 808\u001b[0m request \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mbuild_request(\n\u001b[0;32m 809\u001b[0m method\u001b[39m=\u001b[39mmethod,\n\u001b[0;32m 810\u001b[0m url\u001b[39m=\u001b[39murl,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 819\u001b[0m extensions\u001b[39m=\u001b[39mextensions,\n\u001b[0;32m 820\u001b[0m )\n\u001b[1;32m--> 821\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msend(request, auth\u001b[39m=\u001b[39;49mauth, follow_redirects\u001b[39m=\u001b[39;49mfollow_redirects)\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpx\\_client.py:908\u001b[0m, in \u001b[0;36mClient.send\u001b[1;34m(self, request, stream, auth, follow_redirects)\u001b[0m\n\u001b[0;32m 900\u001b[0m follow_redirects \u001b[39m=\u001b[39m (\n\u001b[0;32m 901\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfollow_redirects\n\u001b[0;32m 902\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(follow_redirects, UseClientDefault)\n\u001b[0;32m 903\u001b[0m \u001b[39melse\u001b[39;00m follow_redirects\n\u001b[0;32m 904\u001b[0m )\n\u001b[0;32m 906\u001b[0m auth \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_build_request_auth(request, auth)\n\u001b[1;32m--> 908\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_send_handling_auth(\n\u001b[0;32m 909\u001b[0m request,\n\u001b[0;32m 910\u001b[0m auth\u001b[39m=\u001b[39;49mauth,\n\u001b[0;32m 911\u001b[0m 
follow_redirects\u001b[39m=\u001b[39;49mfollow_redirects,\n\u001b[0;32m 912\u001b[0m history\u001b[39m=\u001b[39;49m[],\n\u001b[0;32m 913\u001b[0m )\n\u001b[0;32m 914\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 915\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m stream:\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpx\\_client.py:936\u001b[0m, in \u001b[0;36mClient._send_handling_auth\u001b[1;34m(self, request, auth, follow_redirects, history)\u001b[0m\n\u001b[0;32m 933\u001b[0m request \u001b[39m=\u001b[39m \u001b[39mnext\u001b[39m(auth_flow)\n\u001b[0;32m 935\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m--> 936\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_send_handling_redirects(\n\u001b[0;32m 937\u001b[0m request,\n\u001b[0;32m 938\u001b[0m follow_redirects\u001b[39m=\u001b[39;49mfollow_redirects,\n\u001b[0;32m 939\u001b[0m history\u001b[39m=\u001b[39;49mhistory,\n\u001b[0;32m 940\u001b[0m )\n\u001b[0;32m 941\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 942\u001b[0m \u001b[39mtry\u001b[39;00m:\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpx\\_client.py:973\u001b[0m, in \u001b[0;36mClient._send_handling_redirects\u001b[1;34m(self, request, follow_redirects, history)\u001b[0m\n\u001b[0;32m 970\u001b[0m \u001b[39mfor\u001b[39;00m hook \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_event_hooks[\u001b[39m\"\u001b[39m\u001b[39mrequest\u001b[39m\u001b[39m\"\u001b[39m]:\n\u001b[0;32m 971\u001b[0m hook(request)\n\u001b[1;32m--> 973\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_send_single_request(request)\n\u001b[0;32m 974\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 975\u001b[0m \u001b[39mfor\u001b[39;00m hook \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_event_hooks[\u001b[39m\"\u001b[39m\u001b[39mresponse\u001b[39m\u001b[39m\"\u001b[39m]:\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpx\\_client.py:1009\u001b[0m, in \u001b[0;36mClient._send_single_request\u001b[1;34m(self, request)\u001b[0m\n\u001b[0;32m 1004\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\n\u001b[0;32m 1005\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mAttempted to send an async request with a sync Client instance.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 1006\u001b[0m )\n\u001b[0;32m 1008\u001b[0m \u001b[39mwith\u001b[39;00m request_context(request\u001b[39m=\u001b[39mrequest):\n\u001b[1;32m-> 1009\u001b[0m response \u001b[39m=\u001b[39m transport\u001b[39m.\u001b[39;49mhandle_request(request)\n\u001b[0;32m 1011\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39misinstance\u001b[39m(response\u001b[39m.\u001b[39mstream, SyncByteStream)\n\u001b[0;32m 1013\u001b[0m response\u001b[39m.\u001b[39mrequest \u001b[39m=\u001b[39m request\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpx\\_transports\\default.py:217\u001b[0m, in \u001b[0;36mHTTPTransport.handle_request\u001b[1;34m(self, request)\u001b[0m\n\u001b[0;32m 203\u001b[0m 
\u001b[39massert\u001b[39;00m \u001b[39misinstance\u001b[39m(request\u001b[39m.\u001b[39mstream, SyncByteStream)\n\u001b[0;32m 205\u001b[0m req \u001b[39m=\u001b[39m httpcore\u001b[39m.\u001b[39mRequest(\n\u001b[0;32m 206\u001b[0m method\u001b[39m=\u001b[39mrequest\u001b[39m.\u001b[39mmethod,\n\u001b[0;32m 207\u001b[0m url\u001b[39m=\u001b[39mhttpcore\u001b[39m.\u001b[39mURL(\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 215\u001b[0m extensions\u001b[39m=\u001b[39mrequest\u001b[39m.\u001b[39mextensions,\n\u001b[0;32m 216\u001b[0m )\n\u001b[1;32m--> 217\u001b[0m \u001b[39mwith\u001b[39;00m map_httpcore_exceptions():\n\u001b[0;32m 218\u001b[0m resp \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_pool\u001b[39m.\u001b[39mhandle_request(req)\n\u001b[0;32m 220\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39misinstance\u001b[39m(resp\u001b[39m.\u001b[39mstream, typing\u001b[39m.\u001b[39mIterable)\n", + "File \u001b[1;32mC:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\\lib\\contextlib.py:153\u001b[0m, in \u001b[0;36m_GeneratorContextManager.__exit__\u001b[1;34m(self, typ, value, traceback)\u001b[0m\n\u001b[0;32m 151\u001b[0m value \u001b[39m=\u001b[39m typ()\n\u001b[0;32m 152\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 153\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mgen\u001b[39m.\u001b[39;49mthrow(typ, value, traceback)\n\u001b[0;32m 154\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mStopIteration\u001b[39;00m \u001b[39mas\u001b[39;00m exc:\n\u001b[0;32m 155\u001b[0m \u001b[39m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[0;32m 156\u001b[0m \u001b[39m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[0;32m 157\u001b[0m \u001b[39m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n\u001b[0;32m 158\u001b[0m \u001b[39mreturn\u001b[39;00m exc \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m value\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpx\\_transports\\default.py:77\u001b[0m, in \u001b[0;36mmap_httpcore_exceptions\u001b[1;34m()\u001b[0m\n\u001b[0;32m 74\u001b[0m \u001b[39mraise\u001b[39;00m\n\u001b[0;32m 76\u001b[0m message \u001b[39m=\u001b[39m \u001b[39mstr\u001b[39m(exc)\n\u001b[1;32m---> 77\u001b[0m \u001b[39mraise\u001b[39;00m mapped_exc(message) \u001b[39mfrom\u001b[39;00m \u001b[39mexc\u001b[39;00m\n", + "\u001b[1;31mReadTimeout\u001b[0m: The read operation timed out" + ] + } + ], + "source": [ + "response = supabase_client.table(\"llm-convo-monitor\").select(\"*\").execute()\n", + "data = response.data\n", + "df = pd.DataFrame(data)\n", + "len(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "markdown", "metadata": {}, @@ -765,14 +1045,49 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 6, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "1571\n" + "ename": "ReadTimeout", + "evalue": "The read operation timed out", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTimeoutError\u001b[0m Traceback (most recent call last)", + "File 
\u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\_exceptions.py:10\u001b[0m, in \u001b[0;36mmap_exceptions\u001b[1;34m(map)\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m---> 10\u001b[0m \u001b[39myield\u001b[39;00m\n\u001b[0;32m 11\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m \u001b[39mas\u001b[39;00m exc: \u001b[39m# noqa: PIE786\u001b[39;00m\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\backends\\sync.py:28\u001b[0m, in \u001b[0;36mSyncStream.read\u001b[1;34m(self, max_bytes, timeout)\u001b[0m\n\u001b[0;32m 27\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_sock\u001b[39m.\u001b[39msettimeout(timeout)\n\u001b[1;32m---> 28\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sock\u001b[39m.\u001b[39;49mrecv(max_bytes)\n", + "File \u001b[1;32mC:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\\lib\\ssl.py:1259\u001b[0m, in \u001b[0;36mSSLSocket.recv\u001b[1;34m(self, buflen, flags)\u001b[0m\n\u001b[0;32m 1256\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 1257\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mnon-zero flags not allowed in calls to recv() on \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m\"\u001b[39m \u001b[39m%\u001b[39m\n\u001b[0;32m 1258\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m)\n\u001b[1;32m-> 1259\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mread(buflen)\n\u001b[0;32m 1260\u001b[0m \u001b[39melse\u001b[39;00m:\n", + "File \u001b[1;32mC:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\\lib\\ssl.py:1132\u001b[0m, in \u001b[0;36mSSLSocket.read\u001b[1;34m(self, len, buffer)\u001b[0m\n\u001b[0;32m 1131\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m-> 1132\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sslobj\u001b[39m.\u001b[39;49mread(\u001b[39mlen\u001b[39;49m)\n\u001b[0;32m 1133\u001b[0m \u001b[39mexcept\u001b[39;00m SSLError \u001b[39mas\u001b[39;00m x:\n", + "\u001b[1;31mTimeoutError\u001b[0m: The read operation timed out", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[1;31mReadTimeout\u001b[0m Traceback (most recent call last)", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpx\\_transports\\default.py:60\u001b[0m, in \u001b[0;36mmap_httpcore_exceptions\u001b[1;34m()\u001b[0m\n\u001b[0;32m 59\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m---> 60\u001b[0m \u001b[39myield\u001b[39;00m\n\u001b[0;32m 61\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m \u001b[39mas\u001b[39;00m exc: \u001b[39m# noqa: PIE-786\u001b[39;00m\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpx\\_transports\\default.py:218\u001b[0m, in \u001b[0;36mHTTPTransport.handle_request\u001b[1;34m(self, request)\u001b[0m\n\u001b[0;32m 217\u001b[0m \u001b[39mwith\u001b[39;00m map_httpcore_exceptions():\n\u001b[1;32m--> 218\u001b[0m 
resp \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_pool\u001b[39m.\u001b[39;49mhandle_request(req)\n\u001b[0;32m 220\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39misinstance\u001b[39m(resp\u001b[39m.\u001b[39mstream, typing\u001b[39m.\u001b[39mIterable)\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\_sync\\connection_pool.py:253\u001b[0m, in \u001b[0;36mConnectionPool.handle_request\u001b[1;34m(self, request)\u001b[0m\n\u001b[0;32m 252\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mresponse_closed(status)\n\u001b[1;32m--> 253\u001b[0m \u001b[39mraise\u001b[39;00m exc\n\u001b[0;32m 254\u001b[0m \u001b[39melse\u001b[39;00m:\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\_sync\\connection_pool.py:237\u001b[0m, in \u001b[0;36mConnectionPool.handle_request\u001b[1;34m(self, request)\u001b[0m\n\u001b[0;32m 236\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 237\u001b[0m response \u001b[39m=\u001b[39m connection\u001b[39m.\u001b[39;49mhandle_request(request)\n\u001b[0;32m 238\u001b[0m \u001b[39mexcept\u001b[39;00m ConnectionNotAvailable:\n\u001b[0;32m 239\u001b[0m \u001b[39m# The ConnectionNotAvailable exception is a special case, that\u001b[39;00m\n\u001b[0;32m 240\u001b[0m \u001b[39m# indicates we need to retry the request on a new connection.\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 244\u001b[0m \u001b[39m# might end up as an HTTP/2 connection, but which actually ends\u001b[39;00m\n\u001b[0;32m 245\u001b[0m \u001b[39m# up as HTTP/1.1.\u001b[39;00m\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\_sync\\connection.py:90\u001b[0m, in \u001b[0;36mHTTPConnection.handle_request\u001b[1;34m(self, request)\u001b[0m\n\u001b[0;32m 88\u001b[0m \u001b[39mraise\u001b[39;00m ConnectionNotAvailable()\n\u001b[1;32m---> 90\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_connection\u001b[39m.\u001b[39;49mhandle_request(request)\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\_sync\\http11.py:112\u001b[0m, in \u001b[0;36mHTTP11Connection.handle_request\u001b[1;34m(self, request)\u001b[0m\n\u001b[0;32m 111\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_response_closed()\n\u001b[1;32m--> 112\u001b[0m \u001b[39mraise\u001b[39;00m exc\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\_sync\\http11.py:91\u001b[0m, in \u001b[0;36mHTTP11Connection.handle_request\u001b[1;34m(self, request)\u001b[0m\n\u001b[0;32m 83\u001b[0m \u001b[39mwith\u001b[39;00m Trace(\n\u001b[0;32m 84\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mhttp11.receive_response_headers\u001b[39m\u001b[39m\"\u001b[39m, request, kwargs\n\u001b[0;32m 85\u001b[0m ) \u001b[39mas\u001b[39;00m trace:\n\u001b[0;32m 86\u001b[0m (\n\u001b[0;32m 87\u001b[0m http_version,\n\u001b[0;32m 88\u001b[0m status,\n\u001b[0;32m 89\u001b[0m reason_phrase,\n\u001b[0;32m 90\u001b[0m headers,\n\u001b[1;32m---> 91\u001b[0m ) \u001b[39m=\u001b[39m 
\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_receive_response_headers(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[0;32m 92\u001b[0m trace\u001b[39m.\u001b[39mreturn_value \u001b[39m=\u001b[39m (\n\u001b[0;32m 93\u001b[0m http_version,\n\u001b[0;32m 94\u001b[0m status,\n\u001b[0;32m 95\u001b[0m reason_phrase,\n\u001b[0;32m 96\u001b[0m headers,\n\u001b[0;32m 97\u001b[0m )\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\_sync\\http11.py:155\u001b[0m, in \u001b[0;36mHTTP11Connection._receive_response_headers\u001b[1;34m(self, request)\u001b[0m\n\u001b[0;32m 154\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m--> 155\u001b[0m event \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_receive_event(timeout\u001b[39m=\u001b[39;49mtimeout)\n\u001b[0;32m 156\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(event, h11\u001b[39m.\u001b[39mResponse):\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\_sync\\http11.py:191\u001b[0m, in \u001b[0;36mHTTP11Connection._receive_event\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m 190\u001b[0m \u001b[39mif\u001b[39;00m event \u001b[39mis\u001b[39;00m h11\u001b[39m.\u001b[39mNEED_DATA:\n\u001b[1;32m--> 191\u001b[0m data \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_network_stream\u001b[39m.\u001b[39;49mread(\n\u001b[0;32m 192\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mREAD_NUM_BYTES, timeout\u001b[39m=\u001b[39;49mtimeout\n\u001b[0;32m 193\u001b[0m )\n\u001b[0;32m 195\u001b[0m \u001b[39m# If we feed this case through h11 we'll raise an exception like:\u001b[39;00m\n\u001b[0;32m 196\u001b[0m \u001b[39m#\u001b[39;00m\n\u001b[0;32m 197\u001b[0m \u001b[39m# httpcore.RemoteProtocolError: can't handle event type\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 201\u001b[0m \u001b[39m# perspective. 
Instead we handle this case distinctly and treat\u001b[39;00m\n\u001b[0;32m 202\u001b[0m \u001b[39m# it as a ConnectError.\u001b[39;00m\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\backends\\sync.py:26\u001b[0m, in \u001b[0;36mSyncStream.read\u001b[1;34m(self, max_bytes, timeout)\u001b[0m\n\u001b[0;32m 25\u001b[0m exc_map: ExceptionMapping \u001b[39m=\u001b[39m {socket\u001b[39m.\u001b[39mtimeout: ReadTimeout, \u001b[39mOSError\u001b[39;00m: ReadError}\n\u001b[1;32m---> 26\u001b[0m \u001b[39mwith\u001b[39;00m map_exceptions(exc_map):\n\u001b[0;32m 27\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_sock\u001b[39m.\u001b[39msettimeout(timeout)\n", + "File \u001b[1;32mC:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\\lib\\contextlib.py:153\u001b[0m, in \u001b[0;36m_GeneratorContextManager.__exit__\u001b[1;34m(self, typ, value, traceback)\u001b[0m\n\u001b[0;32m 152\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 153\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mgen\u001b[39m.\u001b[39;49mthrow(typ, value, traceback)\n\u001b[0;32m 154\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mStopIteration\u001b[39;00m \u001b[39mas\u001b[39;00m exc:\n\u001b[0;32m 155\u001b[0m \u001b[39m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[0;32m 156\u001b[0m \u001b[39m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[0;32m 157\u001b[0m \u001b[39m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpcore\\_exceptions.py:14\u001b[0m, in \u001b[0;36mmap_exceptions\u001b[1;34m(map)\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(exc, from_exc):\n\u001b[1;32m---> 14\u001b[0m \u001b[39mraise\u001b[39;00m to_exc(exc)\n\u001b[0;32m 15\u001b[0m \u001b[39mraise\u001b[39;00m\n", + "\u001b[1;31mReadTimeout\u001b[0m: The read operation timed out", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[1;31mReadTimeout\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mf:\\MSIM\\ML_Projects\\ai-ta-backend\\ai_ta_backend\\nomic_map_creation.ipynb Cell 19\u001b[0m line \u001b[0;36m3\n\u001b[0;32m 1\u001b[0m \u001b[39m# cell for all course map creation\u001b[39;00m\n\u001b[1;32m----> 3\u001b[0m response \u001b[39m=\u001b[39m supabase_client\u001b[39m.\u001b[39;49mtable(\u001b[39m\"\u001b[39;49m\u001b[39mllm-convo-monitor\u001b[39;49m\u001b[39m\"\u001b[39;49m)\u001b[39m.\u001b[39;49mselect(\u001b[39m\"\u001b[39;49m\u001b[39m*\u001b[39;49m\u001b[39m\"\u001b[39;49m)\u001b[39m.\u001b[39;49mexecute()\n\u001b[0;32m 4\u001b[0m data \u001b[39m=\u001b[39m response\u001b[39m.\u001b[39mdata\n\u001b[0;32m 5\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mlen\u001b[39m(data))\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\postgrest\\_sync\\request_builder.py:55\u001b[0m, in \u001b[0;36mSyncQueryRequestBuilder.execute\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 43\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mexecute\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m 
APIResponse:\n\u001b[0;32m 44\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Execute the query.\u001b[39;00m\n\u001b[0;32m 45\u001b[0m \n\u001b[0;32m 46\u001b[0m \u001b[39m .. tip::\u001b[39;00m\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 53\u001b[0m \u001b[39m :class:`APIError` If the API raised an error.\u001b[39;00m\n\u001b[0;32m 54\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[1;32m---> 55\u001b[0m r \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msession\u001b[39m.\u001b[39;49mrequest(\n\u001b[0;32m 56\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mhttp_method,\n\u001b[0;32m 57\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mpath,\n\u001b[0;32m 58\u001b[0m json\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mjson,\n\u001b[0;32m 59\u001b[0m params\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mparams,\n\u001b[0;32m 60\u001b[0m headers\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mheaders,\n\u001b[0;32m 61\u001b[0m )\n\u001b[0;32m 62\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 63\u001b[0m \u001b[39mif\u001b[39;00m (\n\u001b[0;32m 64\u001b[0m \u001b[39m200\u001b[39m \u001b[39m<\u001b[39m\u001b[39m=\u001b[39m r\u001b[39m.\u001b[39mstatus_code \u001b[39m<\u001b[39m\u001b[39m=\u001b[39m \u001b[39m299\u001b[39m\n\u001b[0;32m 65\u001b[0m ): \u001b[39m# Response.ok from JS (https://developer.mozilla.org/en-US/docs/Web/API/Response/ok)\u001b[39;00m\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpx\\_client.py:821\u001b[0m, in \u001b[0;36mClient.request\u001b[1;34m(self, method, url, content, data, files, json, params, headers, cookies, auth, follow_redirects, timeout, extensions)\u001b[0m\n\u001b[0;32m 806\u001b[0m warnings\u001b[39m.\u001b[39mwarn(message, \u001b[39mDeprecationWarning\u001b[39;00m)\n\u001b[0;32m 808\u001b[0m request \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mbuild_request(\n\u001b[0;32m 809\u001b[0m method\u001b[39m=\u001b[39mmethod,\n\u001b[0;32m 810\u001b[0m url\u001b[39m=\u001b[39murl,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 819\u001b[0m extensions\u001b[39m=\u001b[39mextensions,\n\u001b[0;32m 820\u001b[0m )\n\u001b[1;32m--> 821\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49msend(request, auth\u001b[39m=\u001b[39;49mauth, follow_redirects\u001b[39m=\u001b[39;49mfollow_redirects)\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpx\\_client.py:908\u001b[0m, in \u001b[0;36mClient.send\u001b[1;34m(self, request, stream, auth, follow_redirects)\u001b[0m\n\u001b[0;32m 900\u001b[0m follow_redirects \u001b[39m=\u001b[39m (\n\u001b[0;32m 901\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfollow_redirects\n\u001b[0;32m 902\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(follow_redirects, UseClientDefault)\n\u001b[0;32m 903\u001b[0m \u001b[39melse\u001b[39;00m follow_redirects\n\u001b[0;32m 904\u001b[0m )\n\u001b[0;32m 906\u001b[0m auth \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_build_request_auth(request, auth)\n\u001b[1;32m--> 908\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_send_handling_auth(\n\u001b[0;32m 909\u001b[0m request,\n\u001b[0;32m 910\u001b[0m 
auth\u001b[39m=\u001b[39;49mauth,\n\u001b[0;32m 911\u001b[0m follow_redirects\u001b[39m=\u001b[39;49mfollow_redirects,\n\u001b[0;32m 912\u001b[0m history\u001b[39m=\u001b[39;49m[],\n\u001b[0;32m 913\u001b[0m )\n\u001b[0;32m 914\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 915\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m stream:\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpx\\_client.py:936\u001b[0m, in \u001b[0;36mClient._send_handling_auth\u001b[1;34m(self, request, auth, follow_redirects, history)\u001b[0m\n\u001b[0;32m 933\u001b[0m request \u001b[39m=\u001b[39m \u001b[39mnext\u001b[39m(auth_flow)\n\u001b[0;32m 935\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m--> 936\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_send_handling_redirects(\n\u001b[0;32m 937\u001b[0m request,\n\u001b[0;32m 938\u001b[0m follow_redirects\u001b[39m=\u001b[39;49mfollow_redirects,\n\u001b[0;32m 939\u001b[0m history\u001b[39m=\u001b[39;49mhistory,\n\u001b[0;32m 940\u001b[0m )\n\u001b[0;32m 941\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 942\u001b[0m \u001b[39mtry\u001b[39;00m:\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpx\\_client.py:973\u001b[0m, in \u001b[0;36mClient._send_handling_redirects\u001b[1;34m(self, request, follow_redirects, history)\u001b[0m\n\u001b[0;32m 970\u001b[0m \u001b[39mfor\u001b[39;00m hook \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_event_hooks[\u001b[39m\"\u001b[39m\u001b[39mrequest\u001b[39m\u001b[39m\"\u001b[39m]:\n\u001b[0;32m 971\u001b[0m hook(request)\n\u001b[1;32m--> 973\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_send_single_request(request)\n\u001b[0;32m 974\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 975\u001b[0m \u001b[39mfor\u001b[39;00m hook \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_event_hooks[\u001b[39m\"\u001b[39m\u001b[39mresponse\u001b[39m\u001b[39m\"\u001b[39m]:\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpx\\_client.py:1009\u001b[0m, in \u001b[0;36mClient._send_single_request\u001b[1;34m(self, request)\u001b[0m\n\u001b[0;32m 1004\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mRuntimeError\u001b[39;00m(\n\u001b[0;32m 1005\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mAttempted to send an async request with a sync Client instance.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 1006\u001b[0m )\n\u001b[0;32m 1008\u001b[0m \u001b[39mwith\u001b[39;00m request_context(request\u001b[39m=\u001b[39mrequest):\n\u001b[1;32m-> 1009\u001b[0m response \u001b[39m=\u001b[39m transport\u001b[39m.\u001b[39;49mhandle_request(request)\n\u001b[0;32m 1011\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39misinstance\u001b[39m(response\u001b[39m.\u001b[39mstream, SyncByteStream)\n\u001b[0;32m 1013\u001b[0m response\u001b[39m.\u001b[39mrequest \u001b[39m=\u001b[39m request\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpx\\_transports\\default.py:217\u001b[0m, in \u001b[0;36mHTTPTransport.handle_request\u001b[1;34m(self, 
request)\u001b[0m\n\u001b[0;32m 203\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39misinstance\u001b[39m(request\u001b[39m.\u001b[39mstream, SyncByteStream)\n\u001b[0;32m 205\u001b[0m req \u001b[39m=\u001b[39m httpcore\u001b[39m.\u001b[39mRequest(\n\u001b[0;32m 206\u001b[0m method\u001b[39m=\u001b[39mrequest\u001b[39m.\u001b[39mmethod,\n\u001b[0;32m 207\u001b[0m url\u001b[39m=\u001b[39mhttpcore\u001b[39m.\u001b[39mURL(\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 215\u001b[0m extensions\u001b[39m=\u001b[39mrequest\u001b[39m.\u001b[39mextensions,\n\u001b[0;32m 216\u001b[0m )\n\u001b[1;32m--> 217\u001b[0m \u001b[39mwith\u001b[39;00m map_httpcore_exceptions():\n\u001b[0;32m 218\u001b[0m resp \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_pool\u001b[39m.\u001b[39mhandle_request(req)\n\u001b[0;32m 220\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39misinstance\u001b[39m(resp\u001b[39m.\u001b[39mstream, typing\u001b[39m.\u001b[39mIterable)\n", + "File \u001b[1;32mC:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.3056.0_x64__qbz5n2kfra8p0\\lib\\contextlib.py:153\u001b[0m, in \u001b[0;36m_GeneratorContextManager.__exit__\u001b[1;34m(self, typ, value, traceback)\u001b[0m\n\u001b[0;32m 151\u001b[0m value \u001b[39m=\u001b[39m typ()\n\u001b[0;32m 152\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 153\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mgen\u001b[39m.\u001b[39;49mthrow(typ, value, traceback)\n\u001b[0;32m 154\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mStopIteration\u001b[39;00m \u001b[39mas\u001b[39;00m exc:\n\u001b[0;32m 155\u001b[0m \u001b[39m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[0;32m 156\u001b[0m \u001b[39m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[0;32m 157\u001b[0m \u001b[39m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n\u001b[0;32m 158\u001b[0m \u001b[39mreturn\u001b[39;00m exc \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m value\n", + "File \u001b[1;32m~\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\httpx\\_transports\\default.py:77\u001b[0m, in \u001b[0;36mmap_httpcore_exceptions\u001b[1;34m()\u001b[0m\n\u001b[0;32m 74\u001b[0m \u001b[39mraise\u001b[39;00m\n\u001b[0;32m 76\u001b[0m message \u001b[39m=\u001b[39m \u001b[39mstr\u001b[39m(exc)\n\u001b[1;32m---> 77\u001b[0m \u001b[39mraise\u001b[39;00m mapped_exc(message) \u001b[39mfrom\u001b[39;00m \u001b[39mexc\u001b[39;00m\n", + "\u001b[1;31mReadTimeout\u001b[0m: The read operation timed out" ] } ], diff --git a/ai_ta_backend/utils_tokenization.py b/ai_ta_backend/utils_tokenization.py index 096e2bb6..5b000e5f 100644 --- a/ai_ta_backend/utils_tokenization.py +++ b/ai_ta_backend/utils_tokenization.py @@ -1,136 +1,136 @@ -import json -import os -from typing import Any, List - -import supabase -import tiktoken - - -def count_tokens_and_cost(prompt: str, completion: str = '', openai_model_name: str = "gpt-3.5-turbo"): # -> tuple[int, float] | tuple[int, float, int, float]: - """ - Returns the number of tokens in a text string. - - Only the first parameter is required, a string of text to measure. The completion and model name are optional. 
- - num_tokens, prompt_cost = count_tokens_and_cost(prompt="hello there") - num_tokens_prompt, prompt_cost, num_tokens_completion, completion_cost = count_tokens_and_cost(prompt="hello there", completion="how are you?") - - Args: - prompt (str): _description_ - completion (str, optional): _description_. Defaults to ''. - openai_model_name (str, optional): _description_. Defaults to "gpt-3.5-turbo". - - Returns: - tuple[int, float] | tuple[int, float, int, float]: Returns the number of tokens consumed and the cost. The total cost you'll be billed is the sum of each individual cost (prompt_cost + completion_cost) - """ - # encoding = tiktoken.encoding_for_model(openai_model_name) - openai_model_name = openai_model_name.lower() - encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") # I think they all use the same encoding - prompt_cost = 0 - completion_cost = 0 - - prompt_token_cost = 0 - completion_token_cost = 0 - - if openai_model_name.startswith("gpt-3.5-turbo"): - if "16k" in openai_model_name: - prompt_token_cost: float = 0.003 / 1_000 - completion_token_cost: float = 0.004 / 1_000 - else: - # 3.5-turbo regular (4k context) - prompt_token_cost: float = 0.0015 / 1_000 - completion_token_cost: float = 0.002 / 1_000 - - elif openai_model_name.startswith("gpt-4"): - if "32k" in openai_model_name: - prompt_token_cost = 0.06 / 1_000 - completion_token_cost = 0.12 / 1_000 - else: - # gpt-4 regular (8k context) - prompt_token_cost = 0.03 / 1_000 - completion_token_cost = 0.06 / 1_000 - elif openai_model_name.startswith("text-embedding-ada-002"): - prompt_token_cost = 0.0001 / 1_000 - completion_token_cost = 0.0001 / 1_000 - else: - # no idea of cost - print(f"NO IDEA OF COST, pricing not supported for model model: `{openai_model_name}`") - prompt_token_cost = 0 - completion_token_cost = 0 - - if completion == '': - num_tokens_prompt: int = len(encoding.encode(prompt)) - prompt_cost = float(prompt_token_cost * num_tokens_prompt) - return num_tokens_prompt, prompt_cost - elif prompt == '': - num_tokens_completion: int = len(encoding.encode(completion)) - completion_cost = float(completion_token_cost * num_tokens_completion) - return num_tokens_completion, completion_cost - else: - num_tokens_prompt: int = len(encoding.encode(prompt)) - num_tokens_completion: int = len(encoding.encode(completion)) - prompt_cost = float(prompt_token_cost * num_tokens_prompt) - completion_cost = float(completion_token_cost * num_tokens_completion) - return num_tokens_prompt, prompt_cost, num_tokens_completion, completion_cost - -# from dotenv import load_dotenv - -# load_dotenv() - -def analyze_conversations(supabase_client: Any = None): - - if supabase_client is None: - supabase_client = supabase.create_client( # type: ignore - supabase_url=os.getenv('SUPABASE_URL'), # type: ignore - supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore - # Get all conversations - response = supabase_client.table('llm-convo-monitor').select('convo').execute() - # print("total entries", response.data.count) - - total_convos = 0 - total_messages = 0 - total_prompt_cost = 0 - total_completion_cost = 0 - - # Iterate through all conversations - # for convo in response['data']: - for convo in response.data: - total_convos += 1 - # print(convo) - # prase json from convo - # parse json into dict - # print(type(convo)) - # convo = json.loads(convo) - convo = convo['convo'] - messages = convo['messages'] - model_name = convo['model']['name'] - - # Iterate through all messages in each conversation - for message in messages: - 
total_messages += 1 - role = message['role'] - content = message['content'] - - # If the message is from the user, it's a prompt - # TODO: Fix these - # WARNING: Fix these error messages... they are the sign of a logic bug. - if role == 'user': - num_tokens, cost = count_tokens_and_cost(prompt=content, openai_model_name=model_name) - total_prompt_cost += cost - print(f'User Prompt: {content}, Tokens: {num_tokens}, cost: {cost}') - - # If the message is from the assistant, it's a completion - elif role == 'assistant': - num_tokens_completion, cost_completion = count_tokens_and_cost(prompt='', completion=content, openai_model_name=model_name) - total_completion_cost += cost_completion - print(f'Assistant Completion: {content}\nTokens: {num_tokens_completion}, cost: {cost_completion}') - return total_convos, total_messages, total_prompt_cost, total_completion_cost - -if __name__ == '__main__': - pass - -# if __name__ == '__main__': -# print('starting main') -# total_convos, total_messages, total_prompt_cost, total_completion_cost = analyze_conversations() -# print(f'total_convos: {total_convos}, total_messages: {total_messages}') +import json +import os +from typing import Any, List + +import supabase +import tiktoken + + +def count_tokens_and_cost(prompt: str, completion: str = '', openai_model_name: str = "gpt-3.5-turbo"): # -> tuple[int, float] | tuple[int, float, int, float]: + """ + Returns the number of tokens in a text string. + + Only the first parameter is required, a string of text to measure. The completion and model name are optional. + + num_tokens, prompt_cost = count_tokens_and_cost(prompt="hello there") + num_tokens_prompt, prompt_cost, num_tokens_completion, completion_cost = count_tokens_and_cost(prompt="hello there", completion="how are you?") + + Args: + prompt (str): _description_ + completion (str, optional): _description_. Defaults to ''. + openai_model_name (str, optional): _description_. Defaults to "gpt-3.5-turbo". + + Returns: + tuple[int, float] | tuple[int, float, int, float]: Returns the number of tokens consumed and the cost. 
The total cost you'll be billed is the sum of each individual cost (prompt_cost + completion_cost) + """ + # encoding = tiktoken.encoding_for_model(openai_model_name) + openai_model_name = openai_model_name.lower() + encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") # I think they all use the same encoding + prompt_cost = 0 + completion_cost = 0 + + prompt_token_cost = 0 + completion_token_cost = 0 + + if openai_model_name.startswith("gpt-3.5-turbo"): + if "16k" in openai_model_name: + prompt_token_cost: float = 0.003 / 1_000 + completion_token_cost: float = 0.004 / 1_000 + else: + # 3.5-turbo regular (4k context) + prompt_token_cost: float = 0.0015 / 1_000 + completion_token_cost: float = 0.002 / 1_000 + + elif openai_model_name.startswith("gpt-4"): + if "32k" in openai_model_name: + prompt_token_cost = 0.06 / 1_000 + completion_token_cost = 0.12 / 1_000 + else: + # gpt-4 regular (8k context) + prompt_token_cost = 0.03 / 1_000 + completion_token_cost = 0.06 / 1_000 + elif openai_model_name.startswith("text-embedding-ada-002"): + prompt_token_cost = 0.0001 / 1_000 + completion_token_cost = 0.0001 / 1_000 + else: + # no idea of cost + print(f"NO IDEA OF COST, pricing not supported for model model: `{openai_model_name}`") + prompt_token_cost = 0 + completion_token_cost = 0 + + if completion == '': + num_tokens_prompt: int = len(encoding.encode(prompt)) + prompt_cost = float(prompt_token_cost * num_tokens_prompt) + return num_tokens_prompt, prompt_cost + elif prompt == '': + num_tokens_completion: int = len(encoding.encode(completion)) + completion_cost = float(completion_token_cost * num_tokens_completion) + return num_tokens_completion, completion_cost + else: + num_tokens_prompt: int = len(encoding.encode(prompt)) + num_tokens_completion: int = len(encoding.encode(completion)) + prompt_cost = float(prompt_token_cost * num_tokens_prompt) + completion_cost = float(completion_token_cost * num_tokens_completion) + return num_tokens_prompt, prompt_cost, num_tokens_completion, completion_cost + +# from dotenv import load_dotenv + +# load_dotenv() + +def analyze_conversations(supabase_client: Any = None): + + if supabase_client is None: + supabase_client = supabase.create_client( # type: ignore + supabase_url=os.getenv('SUPABASE_URL'), # type: ignore + supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore + # Get all conversations + response = supabase_client.table('llm-convo-monitor').select('convo').execute() + # print("total entries", response.data.count) + + total_convos = 0 + total_messages = 0 + total_prompt_cost = 0 + total_completion_cost = 0 + + # Iterate through all conversations + # for convo in response['data']: + for convo in response.data: + total_convos += 1 + # print(convo) + # prase json from convo + # parse json into dict + # print(type(convo)) + # convo = json.loads(convo) + convo = convo['convo'] + messages = convo['messages'] + model_name = convo['model']['name'] + + # Iterate through all messages in each conversation + for message in messages: + total_messages += 1 + role = message['role'] + content = message['content'] + + # If the message is from the user, it's a prompt + # TODO: Fix these + # WARNING: Fix these error messages... they are the sign of a logic bug. 
+ if role == 'user': + num_tokens, cost = count_tokens_and_cost(prompt=content, openai_model_name=model_name) + total_prompt_cost += cost + print(f'User Prompt: {content}, Tokens: {num_tokens}, cost: {cost}') + + # If the message is from the assistant, it's a completion + elif role == 'assistant': + num_tokens_completion, cost_completion = count_tokens_and_cost(prompt='', completion=content, openai_model_name=model_name) + total_completion_cost += cost_completion + print(f'Assistant Completion: {content}\nTokens: {num_tokens_completion}, cost: {cost_completion}') + return total_convos, total_messages, total_prompt_cost, total_completion_cost + +if __name__ == '__main__': + pass + +# if __name__ == '__main__': +# print('starting main') +# total_convos, total_messages, total_prompt_cost, total_completion_cost = analyze_conversations() +# print(f'total_convos: {total_convos}, total_messages: {total_messages}') # print(f'total_prompt_cost: {total_prompt_cost}, total_completion_cost: {total_completion_cost}') \ No newline at end of file diff --git a/ai_ta_backend/web_scrape.py b/ai_ta_backend/web_scrape.py index 36158db9..f77d695a 100644 --- a/ai_ta_backend/web_scrape.py +++ b/ai_ta_backend/web_scrape.py @@ -1,467 +1,467 @@ -import os -import re -import shutil -import time -from tempfile import NamedTemporaryFile -from zipfile import ZipFile - -import boto3 # type: ignore -import requests -from bs4 import BeautifulSoup - -import supabase - -from ai_ta_backend.aws import upload_data_files_to_s3 -from ai_ta_backend.vector_database import Ingest -import mimetypes - -def get_file_extension(filename): - match = re.search(r'\.([a-zA-Z0-9]+)$', filename) - valid_filetypes = list(mimetypes.types_map.keys()) - valid_filetypes = valid_filetypes + ['.html', '.py', '.vtt', '.pdf', '.txt', '.srt', '.docx', '.ppt', '.pptx'] - if match: - filetype = "." + match.group(1) - if filetype in valid_filetypes: - return filetype - else: - return '.html' - else: - return '.html' - -def valid_url(url): - '''Returns the URL and it's content if it's good, otherwise returns false. 
Prints the status code.''' - try: - response = requests.get(url, allow_redirects=True, timeout=20) - - redirect_loop_counter = 0 - while response.status_code == 301: - # Check for permanent redirect - if redirect_loop_counter > 3: - print("❌ Redirect loop (on 301 error) exceeded redirect limit of:", redirect_loop_counter, "❌") - return False - redirect_url = response.headers['Location'] - response = requests.head(redirect_url) - redirect_loop_counter += 1 - if response.status_code == 200: - filetype = get_file_extension(response.url) - print("file extension:", filetype) - if filetype == '.html': - content = BeautifulSoup(response.content, "html.parser") - if " len(urls): - max_urls = max_urls - len(urls) - elif max_urls < len(urls): - urls = urls[:max_urls] - max_urls = 0 - else: - max_urls = 0 - # We grab content out of these urls - - for url in urls: - if base_url_on: - if url.startswith(site): - url, s, filetype = valid_url(url) - if url: - print("Scraped:", url) - url_contents.append((url, s, filetype)) - else: - _invalid_urls.append(url) - else: - pass - else: - url, s, filetype = valid_url(url) - if url: - print("Scraped:", url) - url_contents.append((url, s, filetype)) - else: - _invalid_urls.append(url) - print("existing urls", _existing_urls) - url_contents = remove_duplicates(url_contents, _existing_urls) - max_urls = max_urls - len(url_contents) - print(max_urls, "urls left") - - # recursively go through crawler until we reach the max amount of urls. - for url in url_contents: - if url[0] not in _invalid_urls: - if max_urls > 0: - if _depth < max_depth: - temp_data = crawler(url[0], max_urls, max_depth, timeout, _invalid_urls, _depth, url[1], url[2]) - print("existing urls", _existing_urls) - temp_data = remove_duplicates(temp_data, _existing_urls) - max_urls = max_urls - len(temp_data) - print(max_urls, "urls left") - url_contents.extend(temp_data) - url_contents = remove_duplicates(url_contents, _existing_urls) - else: - print("Depth exceeded:", _depth+1, "out of", max_depth) - break - else: - break - else: - pass - - if _depth == 0: - if len(url_contents) < amount: - print("Max URLS not reached, returning all urls found:", len(url_contents), "out of", amount) - elif len(url_contents) == amount: - print("Max URLS reached:", len(url_contents), "out of", amount) - else: - print("Exceeded Max URLS, found:", len(url_contents), "out of", amount) - print(len(url_contents), "urls found") - - # Free up memory - # del url_contents[:] - # del urls[:] - # if _invalid_urls is not None: - # del _invalid_urls[:] - # if _existing_urls is not None: - # del _existing_urls[:] - # gc.collect() - - return url_contents - -def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, timeout:int=1, stay_on_baseurl:bool=False): - """ - Crawl a site and scrape its content and PDFs, then upload the data to S3 and ingest it. - - Args: - url (str): The URL of the site to crawl. - course_name (str): The name of the course to associate with the crawled data. - max_urls (int, optional): The maximum number of URLs to crawl. Defaults to 100. - max_depth (int, optional): The maximum depth of URLs to crawl. Defaults to 3. - timeout (int, optional): The number of seconds to wait between requests. Defaults to 1. 
- - Returns: - None - """ - print("\n") - max_urls = int(max_urls) - max_depth = int(max_depth) - timeout = int(timeout) - stay_on_baseurl = bool(stay_on_baseurl) - if stay_on_baseurl: - stay_on_baseurl = base_url(url) - print(stay_on_baseurl) - - ingester = Ingest() - s3_client = boto3.client( - 's3', - aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), - aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), - ) - - # Check for GitHub repository coming soon - if url.startswith("https://github.com/"): - print("Begin Ingesting GitHub page") - results = ingester.ingest_github(url, course_name) - print("Finished ingesting GitHub page") - del ingester - return results - else: - try: - print("Gathering existing urls from Supabase") - supabase_client = supabase.create_client( # type: ignore - supabase_url=os.getenv('SUPABASE_URL'), # type: ignore - supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore - urls = supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).select('course_name, url, contexts').eq('course_name', course_name).execute() - del supabase_client - if urls.data == []: - existing_urls = None - else: - existing_urls = [] - for thing in urls.data: - whole = '' - for t in thing['contexts']: - whole += t['text'] - existing_urls.append((thing['url'], whole)) - print("Finished gathering existing urls from Supabase") - except Exception as e: - print("Error:", e) - print("Could not gather existing urls from Supabase") - existing_urls = None - - print("Begin Ingesting Web page") - data = crawler(url=url, max_urls=max_urls, max_depth=max_depth, timeout=timeout, base_url_on=stay_on_baseurl, _existing_urls=existing_urls) - - # Clean some keys for a proper file name - # todo: have a default title - # titles = [value[1][1].title.string for value in data] - - titles = [] - for value in data: - try: - titles.append(value[1].title.string) - except AttributeError as e: - # if no title - try: - placeholder_title = re.findall(pattern=r'[a-zA-Z0-9.]*[a-z]', string=value[0])[1] - except Exception as e: - placeholder_title = "Title Not Found" - titles.append(placeholder_title) - print(f"URL is missing a title, using this title instead: {placeholder_title}") - - try: - clean = [re.match(r"[a-zA-Z0-9\s]*", title).group(0) for title in titles] # type: ignore - except Exception as e: - print("Error:", e) - clean = titles - print("title names after regex before cleaning", clean) - path_name = [] - counter = 0 - for value in clean: - value = value.strip() if value else "" - # value = value.strip() - value = value.replace(" ", "_") - if value == "403_Forbidden": - print("Found Forbidden Key, deleting data") - del data[counter] - counter -= 1 - else: - path_name.append(value) - counter += 1 - print("Cleaned title names", path_name) - - # Upload each html to S3 - print("Uploading files to S3") - paths = [] - counter = 0 - try: - for i, key in enumerate(data): - with NamedTemporaryFile(suffix=key[2]) as temp_file: - if key[1] != "" or key[1] != None: - if key[2] == ".html": - print("Writing", key[2] ,"to temp file") - temp_file.write(key[1].encode('utf-8')) - else: - print("Writing", key[2] ,"to temp file") - temp_file.write(key[1]) - temp_file.seek(0) - s3_upload_path = "courses/"+ course_name + "/" + path_name[i] + key[2] - paths.append(s3_upload_path) - with open(temp_file.name, 'rb') as f: - print("Uploading", key[2] ,"to S3") - s3_client.upload_fileobj(f, os.getenv('S3_BUCKET_NAME'), s3_upload_path) - ingester.bulk_ingest(s3_upload_path, course_name=course_name, url=key[0], 
base_url=url) - counter += 1 - else: - print("No", key[2] ,"to upload", key[1]) - except Exception as e: - print("Error in upload:", e) - finally: - del ingester - - print(f"Successfully uploaded files to s3: {counter}") - print("Finished /web-scrape") - -# Download an MIT course using its url -def mit_course_download(url:str, course_name:str, local_dir:str): - ingester = Ingest() - base = "https://ocw.mit.edu" - if url.endswith("download"): - pass - else: - url = url + "download" - - r = requests.get(url) - soup = BeautifulSoup(r.text,"html.parser") - - zip = '' - for ref in soup.find_all("a"): - if ref.attrs['href'].endswith("zip"): - zip = ref.attrs['href'] - - site = zip - print('site', site) - r = requests.get(url=site, stream=True) - - zip_file = local_dir + ".zip" - - try: - with open(zip_file, 'wb') as fd: - for chunk in r.iter_content(chunk_size=128): - fd.write(chunk) - print("course downloaded!") - except Exception as e: - print("Error:", e, site) - - with ZipFile(zip_file, 'r') as zObject: - zObject.extractall( - path=local_dir) - - shutil.move(local_dir+"/"+"robots.txt", local_dir+"/static_resources") - s3_paths = upload_data_files_to_s3(course_name, local_dir+"/static_resources") - success_fail = ingester.bulk_ingest(s3_paths, course_name) # type: ignore - - shutil.move(zip_file, local_dir) - shutil.rmtree(local_dir) - del ingester - print("Finished Ingest") - return success_fail - -if __name__ == '__main__': - pass +import os +import re +import shutil +import time +from tempfile import NamedTemporaryFile +from zipfile import ZipFile + +import boto3 # type: ignore +import requests +from bs4 import BeautifulSoup + +import supabase + +from ai_ta_backend.aws import upload_data_files_to_s3 +from ai_ta_backend.vector_database import Ingest +import mimetypes + +def get_file_extension(filename): + match = re.search(r'\.([a-zA-Z0-9]+)$', filename) + valid_filetypes = list(mimetypes.types_map.keys()) + valid_filetypes = valid_filetypes + ['.html', '.py', '.vtt', '.pdf', '.txt', '.srt', '.docx', '.ppt', '.pptx'] + if match: + filetype = "." + match.group(1) + if filetype in valid_filetypes: + return filetype + else: + return '.html' + else: + return '.html' + +def valid_url(url): + '''Returns the URL and it's content if it's good, otherwise returns false. 
Prints the status code.''' + try: + response = requests.get(url, allow_redirects=True, timeout=20) + + redirect_loop_counter = 0 + while response.status_code == 301: + # Check for permanent redirect + if redirect_loop_counter > 3: + print("❌ Redirect loop (on 301 error) exceeded redirect limit of:", redirect_loop_counter, "❌") + return False + redirect_url = response.headers['Location'] + response = requests.head(redirect_url) + redirect_loop_counter += 1 + if response.status_code == 200: + filetype = get_file_extension(response.url) + print("file extension:", filetype) + if filetype == '.html': + content = BeautifulSoup(response.content, "html.parser") + if " len(urls): + max_urls = max_urls - len(urls) + elif max_urls < len(urls): + urls = urls[:max_urls] + max_urls = 0 + else: + max_urls = 0 + # We grab content out of these urls + + for url in urls: + if base_url_on: + if url.startswith(site): + url, s, filetype = valid_url(url) + if url: + print("Scraped:", url) + url_contents.append((url, s, filetype)) + else: + _invalid_urls.append(url) + else: + pass + else: + url, s, filetype = valid_url(url) + if url: + print("Scraped:", url) + url_contents.append((url, s, filetype)) + else: + _invalid_urls.append(url) + print("existing urls", _existing_urls) + url_contents = remove_duplicates(url_contents, _existing_urls) + max_urls = max_urls - len(url_contents) + print(max_urls, "urls left") + + # recursively go through crawler until we reach the max amount of urls. + for url in url_contents: + if url[0] not in _invalid_urls: + if max_urls > 0: + if _depth < max_depth: + temp_data = crawler(url[0], max_urls, max_depth, timeout, _invalid_urls, _depth, url[1], url[2]) + print("existing urls", _existing_urls) + temp_data = remove_duplicates(temp_data, _existing_urls) + max_urls = max_urls - len(temp_data) + print(max_urls, "urls left") + url_contents.extend(temp_data) + url_contents = remove_duplicates(url_contents, _existing_urls) + else: + print("Depth exceeded:", _depth+1, "out of", max_depth) + break + else: + break + else: + pass + + if _depth == 0: + if len(url_contents) < amount: + print("Max URLS not reached, returning all urls found:", len(url_contents), "out of", amount) + elif len(url_contents) == amount: + print("Max URLS reached:", len(url_contents), "out of", amount) + else: + print("Exceeded Max URLS, found:", len(url_contents), "out of", amount) + print(len(url_contents), "urls found") + + # Free up memory + # del url_contents[:] + # del urls[:] + # if _invalid_urls is not None: + # del _invalid_urls[:] + # if _existing_urls is not None: + # del _existing_urls[:] + # gc.collect() + + return url_contents + +def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, timeout:int=1, stay_on_baseurl:bool=False): + """ + Crawl a site and scrape its content and PDFs, then upload the data to S3 and ingest it. + + Args: + url (str): The URL of the site to crawl. + course_name (str): The name of the course to associate with the crawled data. + max_urls (int, optional): The maximum number of URLs to crawl. Defaults to 100. + max_depth (int, optional): The maximum depth of URLs to crawl. Defaults to 3. + timeout (int, optional): The number of seconds to wait between requests. Defaults to 1. 
+ + Returns: + None + """ + print("\n") + max_urls = int(max_urls) + max_depth = int(max_depth) + timeout = int(timeout) + stay_on_baseurl = bool(stay_on_baseurl) + if stay_on_baseurl: + stay_on_baseurl = base_url(url) + print(stay_on_baseurl) + + ingester = Ingest() + s3_client = boto3.client( + 's3', + aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), + aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), + ) + + # Check for GitHub repository coming soon + if url.startswith("https://github.com/"): + print("Begin Ingesting GitHub page") + results = ingester.ingest_github(url, course_name) + print("Finished ingesting GitHub page") + del ingester + return results + else: + try: + print("Gathering existing urls from Supabase") + supabase_client = supabase.create_client( # type: ignore + supabase_url=os.getenv('SUPABASE_URL'), # type: ignore + supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore + urls = supabase_client.table(os.getenv('NEW_NEW_NEWNEW_MATERIALS_SUPABASE_TABLE')).select('course_name, url, contexts').eq('course_name', course_name).execute() + del supabase_client + if urls.data == []: + existing_urls = None + else: + existing_urls = [] + for thing in urls.data: + whole = '' + for t in thing['contexts']: + whole += t['text'] + existing_urls.append((thing['url'], whole)) + print("Finished gathering existing urls from Supabase") + except Exception as e: + print("Error:", e) + print("Could not gather existing urls from Supabase") + existing_urls = None + + print("Begin Ingesting Web page") + data = crawler(url=url, max_urls=max_urls, max_depth=max_depth, timeout=timeout, base_url_on=stay_on_baseurl, _existing_urls=existing_urls) + + # Clean some keys for a proper file name + # todo: have a default title + # titles = [value[1][1].title.string for value in data] + + titles = [] + for value in data: + try: + titles.append(value[1].title.string) + except AttributeError as e: + # if no title + try: + placeholder_title = re.findall(pattern=r'[a-zA-Z0-9.]*[a-z]', string=value[0])[1] + except Exception as e: + placeholder_title = "Title Not Found" + titles.append(placeholder_title) + print(f"URL is missing a title, using this title instead: {placeholder_title}") + + try: + clean = [re.match(r"[a-zA-Z0-9\s]*", title).group(0) for title in titles] # type: ignore + except Exception as e: + print("Error:", e) + clean = titles + print("title names after regex before cleaning", clean) + path_name = [] + counter = 0 + for value in clean: + value = value.strip() if value else "" + # value = value.strip() + value = value.replace(" ", "_") + if value == "403_Forbidden": + print("Found Forbidden Key, deleting data") + del data[counter] + counter -= 1 + else: + path_name.append(value) + counter += 1 + print("Cleaned title names", path_name) + + # Upload each html to S3 + print("Uploading files to S3") + paths = [] + counter = 0 + try: + for i, key in enumerate(data): + with NamedTemporaryFile(suffix=key[2]) as temp_file: + if key[1] != "" or key[1] != None: + if key[2] == ".html": + print("Writing", key[2] ,"to temp file") + temp_file.write(key[1].encode('utf-8')) + else: + print("Writing", key[2] ,"to temp file") + temp_file.write(key[1]) + temp_file.seek(0) + s3_upload_path = "courses/"+ course_name + "/" + path_name[i] + key[2] + paths.append(s3_upload_path) + with open(temp_file.name, 'rb') as f: + print("Uploading", key[2] ,"to S3") + s3_client.upload_fileobj(f, os.getenv('S3_BUCKET_NAME'), s3_upload_path) + ingester.bulk_ingest(s3_upload_path, course_name=course_name, url=key[0], 
base_url=url) + counter += 1 + else: + print("No", key[2] ,"to upload", key[1]) + except Exception as e: + print("Error in upload:", e) + finally: + del ingester + + print(f"Successfully uploaded files to s3: {counter}") + print("Finished /web-scrape") + +# Download an MIT course using its url +def mit_course_download(url:str, course_name:str, local_dir:str): + ingester = Ingest() + base = "https://ocw.mit.edu" + if url.endswith("download"): + pass + else: + url = url + "download" + + r = requests.get(url) + soup = BeautifulSoup(r.text,"html.parser") + + zip = '' + for ref in soup.find_all("a"): + if ref.attrs['href'].endswith("zip"): + zip = ref.attrs['href'] + + site = zip + print('site', site) + r = requests.get(url=site, stream=True) + + zip_file = local_dir + ".zip" + + try: + with open(zip_file, 'wb') as fd: + for chunk in r.iter_content(chunk_size=128): + fd.write(chunk) + print("course downloaded!") + except Exception as e: + print("Error:", e, site) + + with ZipFile(zip_file, 'r') as zObject: + zObject.extractall( + path=local_dir) + + shutil.move(local_dir+"/"+"robots.txt", local_dir+"/static_resources") + s3_paths = upload_data_files_to_s3(course_name, local_dir+"/static_resources") + success_fail = ingester.bulk_ingest(s3_paths, course_name) # type: ignore + + shutil.move(zip_file, local_dir) + shutil.rmtree(local_dir) + del ingester + print("Finished Ingest") + return success_fail + +if __name__ == '__main__': + pass
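
For reference, a minimal usage sketch of count_tokens_and_cost as described in its docstring above. The example strings are illustrative only; with gpt-3.5-turbo (4k context) the rates in the function work out to $0.0015 per 1,000 prompt tokens and $0.002 per 1,000 completion tokens, and the billed total is prompt_cost + completion_cost.

# Minimal sketch, assuming this module is importable as
# ai_ta_backend.utils_tokenization (example strings and model name are illustrative).
from ai_ta_backend.utils_tokenization import count_tokens_and_cost

# Prompt only: returns (num_tokens_prompt, prompt_cost)
num_tokens, prompt_cost = count_tokens_and_cost(prompt="hello there")

# Prompt + completion: returns
# (num_tokens_prompt, prompt_cost, num_tokens_completion, completion_cost)
p_tok, p_cost, c_tok, c_cost = count_tokens_and_cost(
    prompt="hello there",
    completion="how are you?",
    openai_model_name="gpt-3.5-turbo",
)

# The total billed cost is the sum of the two per-direction costs.
# For example, 10 prompt tokens and 8 completion tokens on gpt-3.5-turbo (4k):
# 10 * 0.0015/1000 + 8 * 0.002/1000 = 0.000031 dollars.
total_cost = p_cost + c_cost
print(p_tok, c_tok, total_cost)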
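
A hedged usage sketch of main_crawler with the defaults documented in its docstring. The URL and course name below are made up, and the call assumes the AWS, Supabase, and S3 environment variables referenced elsewhere in this patch are already set.

# Illustrative only: hypothetical course URL and name; requires the AWS,
# Supabase, and S3_BUCKET_NAME environment variables used by main_crawler.
from ai_ta_backend.web_scrape import main_crawler

main_crawler(
    url="https://example.edu/course-site",  # hypothetical site to crawl
    course_name="example-course",           # hypothetical course name
    max_urls=100,          # documented default: crawl at most 100 URLs
    max_depth=3,           # documented default: recurse at most 3 levels deep
    timeout=1,             # documented default: wait 1 second between requests
    stay_on_baseurl=True,  # restrict the crawl to the starting site's base URL
)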
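
The upload loop in main_crawler writes each scraped page to a NamedTemporaryFile with the matching suffix, seeks back to the start, and streams it to S3. Below is a standalone sketch of that pattern with placeholder bucket/key names; note that the original guard `key[1] != "" or key[1] != None` is always true, so the sketch uses a single truthiness check instead.

# Sketch of the temp-file-to-S3 upload pattern used in main_crawler
# (placeholder path and bucket; a plain truthiness check replaces the
# always-true `!= "" or != None` guard in the original loop).
import os
from tempfile import NamedTemporaryFile

import boto3

s3_client = boto3.client(
    's3',
    aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
)

def upload_page(content, suffix, s3_upload_path):
    """Write scraped content to a temp file, then stream it to S3."""
    if not content:  # skip empty pages instead of uploading nothing
        return False
    with NamedTemporaryFile(suffix=suffix) as temp_file:
        if suffix == ".html":
            temp_file.write(content.encode('utf-8'))  # HTML arrives as text
        else:
            temp_file.write(content)                  # PDFs etc. are already bytes
        temp_file.seek(0)
        s3_client.upload_fileobj(temp_file, os.getenv('S3_BUCKET_NAME'), s3_upload_path)
    return True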
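
Finally, a usage sketch for mit_course_download; the OCW URL and local directory are hypothetical. The function appends "download" to the course URL if needed, follows the page's zip link, extracts the archive into local_dir, and bulk-ingests the extracted static resources.

# Hypothetical OCW course URL and scratch directory; requires the same
# AWS/Supabase configuration as the rest of this module.
from ai_ta_backend.web_scrape import mit_course_download

success_fail = mit_course_download(
    url="https://ocw.mit.edu/courses/example-course/",  # "download" is appended automatically
    course_name="example-course",
    local_dir="/tmp/mit_example_course",  # zip saved as /tmp/mit_example_course.zip, then extracted here
)
print(success_fail)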