diff --git a/.gitignore b/.gitignore index 905e086a..55712534 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,11 @@ # don't sync coursera docs coursera-dl/ *parsed.json +*wandb* +*ipynb* +*.pem # don't expose env files -dummy.ipynb .env # Created by https://www.toptal.com/developers/gitignore/api/python # Edit at https://www.toptal.com/developers/gitignore?templates=python @@ -37,6 +39,8 @@ share/python-wheels/ .installed.cfg *.egg MANIFEST +.idea/ +node_modules/ # PyInstaller # Usually these files are written by a python script from a template @@ -116,7 +120,7 @@ celerybeat.pid # Environments .env -.venv +.venv* env/ venv/ ENV/ @@ -164,3 +168,10 @@ pip-selfcheck.json # End of https://www.toptal.com/developers/gitignore/api/python .aider* + +# Input/Output files +*payload*.json +*sample*.json + +#DS Store +.DS_Store diff --git a/.rules/no-bugs.md b/.rules/no-bugs.md index d67cb147..cf0e9ebf 100644 --- a/.rules/no-bugs.md +++ b/.rules/no-bugs.md @@ -1,8 +1,8 @@ -Make sure our code follows these best practices, UNLESS there's a comment explaining why it's okay to break the rule. - -1. Avoid typos. -2. Don't have like, really obvious bugs. -3. Don't store secrets in code. -4. Comment on code smells and bugs, suggest better 'best practices' ways to achieve the same result. No need to be too strict, but please help me learn. -5. Avoid dangerous stuff, like things that could lead to template injection, SQL injection, broken access control, or really anything that would show up as a CVE somewhere. -6. If something looks like not best practice, please show how to do it better. We want to learn and improve our code quality and use modern, elegant patterns and practices. \ No newline at end of file +Make sure our code follows these best practices, UNLESS there's a comment explaining why it's okay to break the rule. + +1. Avoid typos. +2. Don't have like, really obvious bugs. +3. Don't store secrets in code. +4. Comment on code smells and bugs, suggest better 'best practices' ways to achieve the same result. No need to be too strict, but please help me learn. +5. Avoid dangerous stuff, like things that could lead to template injection, SQL injection, broken access control, or really anything that would show up as a CVE somewhere. +6. If something looks like not best practice, please show how to do it better. We want to learn and improve our code quality and use modern, elegant patterns and practices. 
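As a quick, concrete illustration of rule 5 (and of rule 6's "show how to do it better"): a minimal sketch of the query pattern these rules are asking for, using parameterized SQL instead of string interpolation. The sqlite connection and the `users` table here are assumptions for the example only, not code from this repo:

```python
import sqlite3

# Hypothetical database and table, purely for illustration.
conn = sqlite3.connect("example.db")
conn.execute("CREATE TABLE IF NOT EXISTS users (name TEXT)")


def get_user_unsafe(username: str):
    # Vulnerable: attacker-controlled input is interpolated directly into the SQL string.
    return conn.execute(f"SELECT * FROM users WHERE name = '{username}'").fetchone()


def get_user_safe(username: str):
    # Preferred: a '?' placeholder lets the driver bind the value, preventing SQL injection.
    return conn.execute("SELECT * FROM users WHERE name = ?", (username,)).fetchone()
```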
diff --git a/.trunk/.gitignore b/.trunk/.gitignore new file mode 100644 index 00000000..15966d08 --- /dev/null +++ b/.trunk/.gitignore @@ -0,0 +1,9 @@ +*out +*logs +*actions +*notifications +*tools +plugins +user_trunk.yaml +user.yaml +tmp diff --git a/.trunk/configs/.isort.cfg b/.trunk/configs/.isort.cfg new file mode 100644 index 00000000..5225d7a2 --- /dev/null +++ b/.trunk/configs/.isort.cfg @@ -0,0 +1,2 @@ +[settings] +profile=google diff --git a/.trunk/configs/.markdownlint.yaml b/.trunk/configs/.markdownlint.yaml new file mode 100644 index 00000000..fb940393 --- /dev/null +++ b/.trunk/configs/.markdownlint.yaml @@ -0,0 +1,10 @@ +# Autoformatter friendly markdownlint config (all formatting rules disabled) +default: true +blank_lines: false +bullet: false +html: false +indentation: false +line_length: false +spaces: false +url: false +whitespace: false diff --git a/.trunk/configs/.shellcheckrc b/.trunk/configs/.shellcheckrc new file mode 100644 index 00000000..8c7b1ada --- /dev/null +++ b/.trunk/configs/.shellcheckrc @@ -0,0 +1,7 @@ +enable=all +source-path=SCRIPTDIR +disable=SC2154 + +# If you're having issues with shellcheck following source, disable the errors via: +# disable=SC1090 +# disable=SC1091 diff --git a/.trunk/configs/.style.yapf b/.trunk/configs/.style.yapf new file mode 100644 index 00000000..d4280b1a --- /dev/null +++ b/.trunk/configs/.style.yapf @@ -0,0 +1,4 @@ +[style] +based_on_style = google +column_limit = 140 +indent_width = 2 \ No newline at end of file diff --git a/.trunk/configs/.yamllint.yaml b/.trunk/configs/.yamllint.yaml new file mode 100644 index 00000000..4d444662 --- /dev/null +++ b/.trunk/configs/.yamllint.yaml @@ -0,0 +1,10 @@ +rules: + quoted-strings: + required: only-when-needed + extra-allowed: ["{|}"] + empty-values: + forbid-in-block-mappings: true + forbid-in-flow-mappings: true + key-duplicates: {} + octal-values: + forbid-implicit-octal: true diff --git a/.trunk/configs/ruff.toml b/.trunk/configs/ruff.toml new file mode 100644 index 00000000..f5a235cf --- /dev/null +++ b/.trunk/configs/ruff.toml @@ -0,0 +1,5 @@ +# Generic, formatter-friendly config. +select = ["B", "D3", "E", "F"] + +# Never enforce `E501` (line length violations). This should be handled by formatters. +ignore = ["E501"] diff --git a/.trunk/trunk.yaml b/.trunk/trunk.yaml new file mode 100644 index 00000000..9c247920 --- /dev/null +++ b/.trunk/trunk.yaml @@ -0,0 +1,55 @@ +# This file controls the behavior of Trunk: https://docs.trunk.io/cli +# To learn more about the format of this file, see https://docs.trunk.io/reference/trunk-yaml +version: 0.1 +cli: + version: 1.22.2 +# Trunk provides extensibility via plugins. (https://docs.trunk.io/plugins) +plugins: + sources: + - id: trunk + ref: v1.6.0 + uri: https://github.com/trunk-io/plugins +# Many linters and tools depend on runtimes - configure them here. (https://docs.trunk.io/runtimes) +runtimes: + enabled: + - go@1.21.0 + - node@18.12.1 + - python@3.10.8 +# This is the section where you manage your linters. (https://docs.trunk.io/check/configuration) +# - osv-scanner@1.5.0 # too sensitive, causing failures that make devs skip checks. 
+lint: + enabled: + # - black@24.4.2 + - isort@5.13.2 + - osv-scanner@1.7.4 + - trufflehog@3.78.1 + - yapf@0.40.2 + # - isort@5.13.2 + - actionlint@1.7.1 + - bandit@1.7.9 + - checkov@3.2.136 + - git-diff-check + - markdownlint@0.41.0 + - oxipng@9.1.1 + - prettier@3.3.2 + - ruff@0.4.8 + - shellcheck@0.10.0 + - shfmt@3.6.0 + - trivy@0.52.1 + - yamllint@1.35.1 + ignore: + - linters: [ALL] + paths: + - .github/**/* + - .trunk/**/* + - .rules/**/* + - mkdocs.yml + - .DS_Store + - .vscode/**/* + - README.md +actions: + enabled: + - trunk-announce + - trunk-check-pre-push + - trunk-fmt-pre-commit + - trunk-upgrade-available diff --git a/ai_ta_backend/agents/MemoryCallbacks.py b/ai_ta_backend/agents/MemoryCallbacks.py new file mode 100644 index 00000000..1398e067 --- /dev/null +++ b/ai_ta_backend/agents/MemoryCallbacks.py @@ -0,0 +1,72 @@ +import os +from typing import Any, Dict, Union + +from langchain.agents import AgentType +from langchain.agents import initialize_agent +from langchain.agents import load_tools +from langchain.callbacks.base import BaseCallbackHandler +from langchain.llms import OpenAI +from langchain.schema import AgentAction +from langchain.schema import AgentFinish +import supabase + + +class MemoryCallbackHandler(BaseCallbackHandler): + + def __init__(self): + self.tool_in_progress = False # usage TBD + + self.supabase_client = supabase.create_client( # type: ignore + supabase_url=os.getenv('SUPABASE_URL'), # type: ignore + supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore + + def on_tool_start(self, serialized: Dict[str, Any], input_str: str, **kwargs: Any) -> Any: + print(f"on_tool_start {serialized}") + self.tool_in_progress = True + + def on_tool_end(self, output: str, **kwargs: Any) -> Any: + """Run when the tool ends.""" + print(f"On tool end: {output}") + if self.tool_in_progress: + self.tool_in_progress = False + print(f"Tool output: {output}") + + def on_tool_error(self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any) -> Any: + """Run when the tool errors.""" + pass + + def on_llm_error(self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any) -> Any: + """Run when LLM errors.""" + pass + + def on_agent_action(self, action: AgentAction, **kwargs: Any) -> Any: + print(f"on_agent_action {action}") + + def on_agent_finish(self, finish: AgentFinish, **kwargs: Any) -> Any: + print(f"on_agent_finish {finish}") + + # def on_llm_start( + # self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any + # ) -> Any: + # print(f"on_llm_start {serialized}") + + # def on_llm_new_token(self, token: str, **kwargs: Any) -> Any: + # print(f"on_new_token {token}") + + # def on_chain_start( + # self, serialized: Dict[str, Any], inputs: Dict[str, Any], **kwargs: Any + # ) -> Any: + # print(f"on_chain_start {serialized['name']}") + + +if __name__ == "__main__": + # just for testing + handler1 = MemoryCallbackHandler() + + llm = OpenAI(temperature=0, streaming=True) + tools = load_tools(["llm-math"], llm=llm) + agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION) + + # Callbacks for handler1 will be issued by every object involved in the + # Agent execution (llm, llmchain, tool, agent executor) + agent.run("What is 2 raised to the 0.235 power?", callbacks=[handler1]) diff --git a/ai_ta_backend/agents/README.md b/ai_ta_backend/agents/README.md new file mode 100644 index 00000000..d6d6cb48 --- /dev/null +++ b/ai_ta_backend/agents/README.md @@ -0,0 +1,201 @@ +## Usage + +To start, run this from the top level of the git repo: + +```bash 
+conda create -n ai-ta-backend python=3.10 -y +conda activate ai-ta-backend +pip install -r requirements.txt + +playwright install # for the AI to browse the web + +# Run command +flask --app ai_ta_backend.main:app --debug run --port 8000 +``` + +**For debugging**, to run an individual file, use this syntax to maintain proper imports and .env vars: `python -m ai_ta_backend.agents.vectordb` + +## Project Management // TODOs + +**All dev tracked here: https://github.com/orgs/UIUC-Chatbot/projects/5** + +## Accounts // services + +As a dev, request to be added here: + +- [LangSmith org](https://smith.langchain.com/o/f7abb6a0-31f6-400c-8bc1-62ade4b67dc1) +- Sentry.io and PostHog for python errors and logs, respectively. +- [Github Org](https://github.com/UIUC-Chatbot) + +## Overview (in progress) + +1. An Issue or PR is opened on Github. All these entry points are defined in `github_webhook_handlers.py`. + +2. Our agents respond to the issue/PR and try to implement it. + +# Top level agents + +1. Plan & Execute agent in `workflow_agent.py` +1. ReAct agent in `github_agent.py` +1. TOT agent tbd: https://api.python.langchain.com/en/latest/experimental_api_reference.html#module-langchain_experimental.tot + +## Prompts + +Store all prompts in LangSmith Hub: https://smith.langchain.com/hub/kastanday?organizationId=f7abb6a0-31f6-400c-8bc1-62ade4b67dc1 + +# Cool features to implement + +### Tools for search / retrieval + +- Make an embedding for every FUNCTION in the repo. + +### Tools for Github agent + +- Get a list of all docstrings in the repo (w/ and w/out function signatures) + +### Planner + +- Plan re-prioritization. Especially when listening to new comments coming in from the user. + +# Memory System + +Workflow: + +1. Ingest content via [Langchain Callbacks](https://python.langchain.com/docs/modules/callbacks/custom_callbacks) to populate our Memory object. +2. Store memory object in a persistent DB (probably Supabase Postgres SQL) +3. On every LLM call, do a DB fetch and construct a prompt in [`create_prompt()` method](langchain/agents/structured_chat/base.py). + +Maybe use this library to extract structured data from the callback functions: https://www.askmarvin.ai/components/ai_model/. See the `Auto-output-parsing` section below. + +### Prompt template for memory system + +- `{some_variable}` represents variables we'll inject +- `` represents comments for us to use, not to be part of the final prompt. + +## Actual prompt template [(on langsmith hub)](https://smith.langchain.com/hub/my-prompts?organizationId=f7abb6a0-31f6-400c-8bc1-62ade4b67dc1) + +```text +## Core memory +### Latest assignment +{Github issue string} + +Plan: +{plan} + +## Agent action steps, AKA tool use history. (In chronological order, 1 is oldest) +{tool_use_array} + +## Messages from human +{human_qa_pairs_array} +``` + +## Example prompts + +```text +## Core memory +### Latest assignment +{Github issue string} + +Plan: + +1. [Done] Get overview of files +2. [Done] Create new branch +3. [In progress now] Update files... +4. Open a pull request with results. + +## Agent action steps, AKA tool use history. (In chronological order, 1 is oldest) +1. tool='Calculator' tool_input='2^0.235' tool_output: '1.1769067372187674' +2. + +## Messages from human + +``` + +### Example full memory prompt + +``` +## Core memory +### Latest assignment +Title: Create a full command line executable workflow for RNA-Seq on PBMC Samples. Open a new pull request (on a separate branch) and comment the PR number here when you're done. 
+Opened by user: rohan-uiuc + +Body: Experiment Type: +RNA-Seq +Sequencing of total cellular RNA + +Workflow Management: +Bash/SLURM +Scripting and job scheduling + +Software Stack: +FastQC +MultiQC +STAR +RSEM +samtools +DESeq2 + +What else to know about the pipeline? +I am working PBMC samples collected from patients that are undergoing immunotherapy. + +Use the data files existing in [Report_WholeBrain](https://github.com/KastanDay/ML4Bio/tree/main/Report_WholeBrain) as input for this workflow. + +You should write a series of bash scripts and R scripts that can accomplish this task. Open a PR with those scripts when you're done. + +## Plan: +1. Read the relevant files in the repository using the `read_file` function. +2. Create a new branch for the changes. +3. Write a series of bash scripts and R scripts to create a command line executable workflow for RNA-Seq on PBMC Samples. The software stack will include FastQC, MultiQC, STAR, RSEM, samtools, and DESeq2. +4. Commit the changes and push them to the new branch. +5. Open a pull request with a clear and concise title and description. +6. Comment on the original issue with the pull request number. +7. Request a review from the user who opened the issue. + +## Tool use history (chronological order, 1 is oldest) +1. tool='Calculator' tool_input='2^0.235' log=' I need to use a calculator to solve this. Action Input: 2^0.235' +2. Another tool use... + +## Conversation history + + +## Messages history with your boss +AI: Could you please specify the file you want to read? +Boss: The files in directory data/report +``` + +### Auto-output-parsing + +Seems to work great. Can add more detail to the parsing of each variable via the "instruction()" param. + +```python +## AUTO PARSING !! +# https://www.askmarvin.ai/components/ai_model/ + +from typing import List, Optional +from pydantic import BaseModel +from marvin import ai_model +from dotenv import load_dotenv +load_dotenv(override=True) +import os +import marvin +marvin.settings.openai.api_key = os.environ['OPENAI_API_KEY'] +# @ai_model(model="gpt-35-turbo", temperature=0) +# @ai_model + +@ai_model(model="openai/gpt-3.5-turbo", temperature=0) +class PackagesToInstall(BaseModel): + apt_packages_to_install: Optional[List[str]] + pip_packages_to_install: Optional[List[str]] + r_packages_to_install: Optional[List[str]] + +PackagesToInstall(input) # Github issue string as input + +''' +Result: +PackagesToInstall( + apt_packages_to_install=['samtools'], + pip_packages_to_install=['multiqc', 'star'], + r_packages_to_install=['DESeq2', 'rsem'] +) +''' +``` diff --git a/ai_ta_backend/agents/__init__.py b/ai_ta_backend/agents/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/ai_ta_backend/agents/code_intrepreter_sanbox.py b/ai_ta_backend/agents/code_intrepreter_sanbox.py new file mode 100644 index 00000000..7950005e --- /dev/null +++ b/ai_ta_backend/agents/code_intrepreter_sanbox.py @@ -0,0 +1,179 @@ +import io +import time +from typing import Optional +import uuid + +from e2b import EnvVars +from e2b import ProcessMessage +from e2b import Sandbox +from e2b.api.v1.client.exceptions import ForbiddenException +import termcolor + + +class E2B_class: + """ + Main entrypoints: + 1. run_python_code(code) + 2. run_r_code(code) + 3. run_shell(shell_command) + """ + + def __init__(self, langsmith_run_id: str, env_vars: Optional[EnvVars] = None): + """ + # TODOs: + 1. Maybe `git clone` the repo to a temp folder and run the code there + 2. 
On agent finish, delete sandbox + """ + + self.langsmith_run_id = langsmith_run_id + try: + self.sandbox = Sandbox(env_vars=env_vars) + except ForbiddenException as e: + print( + termcolor.colored( + "You have reached the maximum number of concurrent E2B sandboxes. Please close some sandboxes before creating new ones.", + "red", + attrs=["bold"], + )) + print(termcolor.colored(f"Error: {e.body}", "red")) + exit() + + self.sandboxID = self.sandbox.id + # self.sandbox.keep_alive(2 * 60) # 2 minutes for now. + # self.sandbox.keep_alive(60 * 60 * 1) # 1 hour max + self.command_timeout = 3 * 60 # 3 minutes + self.existing_files = [] + self.working_dir = "/home/user/" + self.curr_terminal_output = "" + self.install_base_packages() + + def __del__(self): + try: + self.sandbox.close() + except Exception: + print("Failed to close e2b sandbox, probably fine.") + + def install_base_packages(self): + self.install_r_packages() + + def install_python_packages(self): + self.run_shell("pip install -U numpy pandas matplotlib seaborn scikit-learn scipy") + + def install_r_packages(self): + self.run_shell("sudo apt-get install r-base r-base-dev -y") + + def run_python_code(self, code: str): + print(termcolor.colored("RUNNING PYTHON CODE:", "blue", attrs=["bold", "underline"])) + print(termcolor.colored(code, "blue")) + + code_file = io.StringIO(code) + fileid = str(uuid.uuid4()) + code_file.name = fileid + ".py" + filepath = self.sandbox.upload_file(code_file) + shell_output = self.run_shell(f"python {filepath}") + return shell_output + + def run_r_code(self, code: str): + print(termcolor.colored("RUNNING R CODE:", "green", attrs=["bold", "underline"])) + print(termcolor.colored(code, "green")) + + code_file = io.StringIO(code) + fileid = str(uuid.uuid4()) + code_file.name = fileid + ".r" + filepath = self.sandbox.upload_file(code_file) + shell_output = self.run_shell(f"Rscript {filepath}") + return shell_output + + def run_shell(self, shell_command: str): + print(termcolor.colored( + f"SHELL EXECUTION with command: {shell_command}", + "yellow", + attrs=["bold"], + )) + self.curr_terminal_output = "" + + start_time = time.monotonic() + # self.exit_event = threading.Event() + proc = self.sandbox.process.start( + cmd=shell_command, + on_stdout=self.handle_terminal_on_data, + on_stderr=self.handle_terminal_on_error, + # on_exit=on_exit, + cwd=self.working_dir, + ) + + proc.wait() + + print( + termcolor.colored( + f"$ Shell execution complete, Runtime: {(time.monotonic() - start_time):.2f} seconds", + "yellow", + attrs=["bold"], + )) + return self.curr_terminal_output + + def handle_terminal_on_data(self, message: ProcessMessage): + data = str(message) + self.curr_terminal_output += str(data) + print(termcolor.colored(data, "yellow")) + + def handle_terminal_on_error(self, message: ProcessMessage): + data = str(message) + self.curr_terminal_output += str(data) + print(termcolor.colored("Error in E2B Sandbox:", "red", attrs=["bold"])) + print(termcolor.colored(data, "red", attrs=["bold"])) + + +# def EXPERIMENTAL_run_simple_notebook(code, +# cwd: str = "", +# timeout: Optional[int] = None, +# env_vars: Optional[EnvVars] = None) -> Tuple[str, str, list[Any]]: +# """ + +# TBD if this is helpful; the one thing it uniquely does is grab matplotlib outputs. Simply, plt.show() becomes an "artifact" that can be downloaded. + +# Args: +# code (_type_): _description_ +# timeout (Optional[int], optional): _description_. Defaults to None. +# cwd (Optional[str], optional): _description_. Defaults to "". 
+# env_vars (Optional[EnvVars], optional): _description_. Defaults to None. + +# Returns: +# Tuple[str, str, list[Any]]: _description_ +# """ + +# # Don't use code intrepreter -- super limited, no shell access. +# # sandbox = Sandbox(env_vars={"FOO": "Hello"}) +# sandbox = CodeInterpreter(env_vars={"FOO": "Hello"}) + +# # sandbox.install_python_packages('ffmpeg') +# # sandbox.install_system_packages('ffmpeg') +# # with open("path/to/local/file", "rb") as f: +# # remote_path = sandbox.upload_file(f) + +# stdout, stderr, artifacts = sandbox.run_python(code, timeout=timeout, cwd=cwd, env_vars=env_vars) + +# artifact_files = [] +# for artifact in artifacts: +# # Now you can save this file, send it to frontend, or anything else +# artifact_files.append(artifact.download()) + +# sandbox.close() +# return stdout, stderr, artifact_files + +if __name__ == "__main__": + # Example usage + langsmith_run_id = str(uuid.uuid4()) + e2b = E2B_class(langsmith_run_id=langsmith_run_id) + code = """sayHello <- function(){ +print('hello') +} + +sayHello() + + """ + print(e2b.run_python_code("print('Hello World')")) + print(e2b.run_r_code(code)) + print(e2b.run_shell("ls -la")) + del e2b + print("Sandbox closed.") diff --git a/ai_ta_backend/agents/github_agent.py b/ai_ta_backend/agents/github_agent.py new file mode 100644 index 00000000..d552017b --- /dev/null +++ b/ai_ta_backend/agents/github_agent.py @@ -0,0 +1,141 @@ +""" +Env for Kastan: openai_3 or flask10_py10 +""" + +import inspect +import os +import traceback + +from dotenv import load_dotenv +import langchain +from langchain.agents import AgentExecutor +from langchain.agents import AgentType +from langchain.agents import initialize_agent +from langchain.callbacks.manager import tracing_v2_enabled +from langchain.chat_models import AzureChatOpenAI +from langchain.chat_models import ChatOpenAI +from langchain.memory import ConversationSummaryBufferMemory +from langchain.prompts.chat import MessagesPlaceholder +from langchain.utilities.github import GitHubAPIWrapper +# from langchain_experimental.autonomous_agents.autogpt.agent import AutoGPT +# from langchain_experimental.autonomous_agents.baby_agi import BabyAGI +from langsmith import Client +import ray +from tools import get_tools +from utils import fancier_trim_intermediate_steps + +load_dotenv(override=True, dotenv_path='.env') + +os.environ["LANGCHAIN_TRACING_V2"] = "true" # If you want to trace the execution of the program, set to "true" +langchain.debug = False # True for more detailed logs +VERBOSE = True + +GH_Agent_SYSTEM_PROMPT = """You are a senior developer who helps others finish the work faster and to a higher quality than anyone else on the team. People often tag you on pull requests (PRs), and you will finish the PR to the best of your ability and commit your changes. If you're blocked or stuck, feel free to leave a comment on the PR and the rest of the team will help you out. Remember to keep trying, and reflecting on how you solved previous problems will usually help you fix the current issue. 
Please work hard, stay organized, and follow best practices.\nYou have access to the following tools:""" + + +@ray.remote +class GH_Agent(): + + def __init__(self, branch_name: str = ''): + self.branch_name = branch_name + self.github_api_wrapper = GitHubAPIWrapper(active_branch=branch_name, github_base_branch='main') # type: ignore + self.pr_agent: AgentExecutor = self.make_bot() + + def make_bot(self): + # LLMs + if os.environ['OPENAI_API_TYPE'] == 'azure': + llm = AzureChatOpenAI(temperature=0, + model="gpt-4-0613", + max_retries=3, + request_timeout=60 * 3, + deployment_name=os.environ['AZURE_OPENAI_ENGINE']) # type: ignore + summarizer_llm = AzureChatOpenAI(temperature=0, + model="gpt-3.5-turbo-0613", + max_retries=3, + request_timeout=60 * 3, + deployment_name=os.environ['AZURE_OPENAI_ENGINE']) # type: ignore + else: + llm = ChatOpenAI(temperature=0, model="gpt-4-0613", max_retries=3, request_timeout=60 * 3) # type: ignore + summarizer_llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613", max_retries=3, request_timeout=60 * 3) # type: ignore + + # MEMORY + chat_history = MessagesPlaceholder(variable_name="chat_history") + memory = ConversationSummaryBufferMemory(memory_key="chat_history", return_messages=True, llm=summarizer_llm, max_token_limit=2_000) + + # TOOLS + # toolkit: GitHubToolkit = GitHubToolkit.from_github_api_wrapper(self.github_api_wrapper) + # github_tools: list[BaseTool] = toolkit.get_tools() + # human_tools: List[BaseTool] = load_tools(["human"], llm=human_llm, input_func=get_human_input) + tools = get_tools() + + return initialize_agent( + tools=tools, + llm=llm, + agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION, + # agent=AgentType.OPENAI_MULTI_FUNCTIONS, + verbose=VERBOSE, + handle_parsing_errors=True, # or pass a function that accepts the error and returns a string + max_iterations=30, + max_execution_time=None, + early_stopping_method='generate', + memory=memory, + trim_intermediate_steps=fancier_trim_intermediate_steps, + agent_kwargs={ + "memory_prompts": [chat_history], + "input_variables": ["input", "agent_scratchpad", "chat_history"], + "prefix": GH_Agent_SYSTEM_PROMPT, + # pretty sure this is wack: # "extra_prompt_messages": [MessagesPlaceholder(variable_name="GH_Agent_SYSTEM_PROMPT")] + }) + + def launch_gh_agent(self, instruction: str, run_id_in_metadata, active_branch='bot-branch'): + # self.github_api_wrapper.set_active_branch(active_branch) + return self.bot_runner_with_retries(self.pr_agent, instruction, run_id_in_metadata=run_id_in_metadata) + + def bot_runner_with_retries(self, bot: AgentExecutor, run_instruction, run_id_in_metadata, total_retries=1): + """Runs the given bot with attempted retries. First prototype. + """ + Client() + print("LIMITING TOTAL RETRIES TO 0, wasting too much money....") + runtime_exceptions = [] + result = '' + for num_retries in range(1, total_retries + 1): + + with tracing_v2_enabled(project_name="ML4Bio", tags=['lil-jr-dev']): + try: + #! MAIN RUN FUNCTION + if len(runtime_exceptions) >= 1: + warning_to_bot = f"Keep in mind {num_retries} previous bots have tried to solve this problem and faced a runtime error. Please learn from their mistakes, focus on making sure you format your requests for tool use correctly. 
Here's a list of their previous runtime errors: {str(runtime_exceptions)}" + result = bot.with_config({ + "run_name": "ReAct ML4Bio Agent" + }).invoke({"input": f"{run_instruction}\n{warning_to_bot}"}, {"metadata": { + "run_id_in_metadata": str(run_id_in_metadata) + }}) + else: + result = bot.with_config({ + "run_name": "ReAct ML4Bio Agent" + }).invoke({"input": run_instruction}, {"metadata": { + "run_id_in_metadata": str(run_id_in_metadata) + }}) + break # no error, so break retry loop + + except Exception as e: + print("-----------❌❌❌❌------------START OF ERROR-----------❌❌❌❌------------") + print(f"❌❌❌ num_retries: {num_retries}. Bot hit runtime exception: {e}") + print(f"Error in {inspect.currentframe().f_code.co_name}: {e}\n Traceback: ", traceback.print_exc()) + runtime_exceptions.append(traceback.format_exc()) + + if result == '': + formatted_exceptions = '\n'.join([f'```\n{exc}\n```' for exc in runtime_exceptions]) + result = f"{total_retries} agents ALL FAILED with runtime exceptions: \n{formatted_exceptions}" + print(f"👇FINAL ANSWER 👇\n{result}") + return result + + +if __name__ == "__main__": + print("No code.") diff --git a/ai_ta_backend/agents/github_webhook_handlers.py b/ai_ta_backend/agents/github_webhook_handlers.py new file mode 100644 index 00000000..25317edc --- /dev/null +++ b/ai_ta_backend/agents/github_webhook_handlers.py @@ -0,0 +1,454 @@ +######## GITHUB WEBHOOK HANDLERS ######## +# from github import Github +import inspect +import logging +import os +import socket +import time +import traceback +from typing import Any, Dict, Union +import uuid + +from dotenv import load_dotenv +import github +from github import Auth +from github import GithubIntegration +from github.Issue import Issue +from github.PaginatedList import PaginatedList +from github.PullRequest import PullRequest +from github.Repository import Repository +from github.TimelineEvent import TimelineEvent +import langchain +import ray + +# from github_agent import GH_Agent +from ai_ta_backend.agents.langgraph_agent_v2 import WorkflowAgent +from ai_ta_backend.agents.utils import get_langsmith_trace_sharable_url + +# from langchain.tools.github.utils import generate_branch_name + +hostname = socket.gethostname() +RUNNING_ON_LOCAL = False +if 'railway' not in hostname: + RUNNING_ON_LOCAL = True + +# load API keys from globally-availabe .env file +load_dotenv(override=True) + +langchain.debug = False # True for more detailed logs + +MESSAGE_HANDLE_ISSUE_OPENED = f"""Thanks for opening a new issue! I'll now try to finish this implementation and open a PR for you to review. + +{'You can monitor the [LangSmith trace here](https://smith.langchain.com/o/f7abb6a0-31f6-400c-8bc1-62ade4b67dc1/projects/p/c2ec9de2-71b4-4042-bea0-c706b38737e2).' if 'ML4Bio' in os.environ['LANGCHAIN_PROJECT'] else ''} + +Feel free to comment in this thread to give me additional instructions, or I'll tag you in a comment if I get stuck. +If I think I'm successful I'll 'request your review' on the resulting PR. Just watch for emails while I work. +""" + + +async def handle_github_event(payload: Dict[str, Any]): + """Main entry point for all actions that take place on Github website. + + , langsmith_run_id: uuid.UUID + + Args: + payload (str): Union[Issue, PullRequest, IssueComment] + langsmith_run_id (uuid.UUID): UUID to use for the Langsmith trace. 
+ + Raises: + ValueError: _description_ + """ + # payload: Dict[str, Any] = json.loads(gh_webhook_payload) + langsmith_run_id = str(uuid.uuid4()) # for Langsmith + + if not payload: + raise ValueError(f"Missing the body of the webhook response. Response is {payload}") + + # API reference for webhook endpoints https://docs.github.com/en/webhooks-and-events/webhooks/webhook-events-and-payloads#issue_comment + if payload.get('action') == 'opened' and payload.get('pull_request'): + await handle_pull_request_opened(payload, langsmith_run_id) + elif payload.get('action') in ['opened', 'edited'] and payload.get('issue'): + await handle_issue_opened(payload, langsmith_run_id) + elif payload.get('action') in ['created', 'edited'] and payload.get('comment'): + await handle_comment_opened(payload, langsmith_run_id) + + +async def handle_issue_opened(payload, langsmith_run_id): + """ This is the primary entry point to the app; Just open an issue! + + Args: + payload (_type_): From github, see their webhook docs. + """ + logging.warning(f'fAuth {os.environ["GITHUB_APP_ID"]}') + # print("Auth ", os.environ["GITHUB_APP_ID"]) + # print("Auth ", os.environ["GITHUB_APP_PRIVATE_KEY"]) + auth = Auth.AppAuth( + os.environ["GITHUB_APP_ID"], + os.environ["GITHUB_APP_PRIVATE_KEY"], + ) + gi = GithubIntegration(auth=auth) + installation = gi.get_installations()[0] + g = installation.get_github_for_installation() + + logging.info("After get instillation") + + issue = payload['issue'] + repo_name = payload["repository"]["full_name"] + repo: Repository = g.get_repo(repo_name) + repo.get_branch(payload["repository"]["default_branch"]) + number = payload.get('issue').get('number') + issue: Issue = repo.get_issue(number=number) + print("JUST BEFORE TRY CATCH") + issue_description = format_issue(issue) + + # {"issue": str(issue), 'number': number, "repo_name": repo_name, "langsmith_run_id": langsmith_run_id} + # logging.info(f"New issue created: #{number}", metadata) + # logging.info(f"New issue created: #{number}. Metadata: {metadata}") + + # log = Log(message=f"New issue created: #{number}", metadata=metadata) + # log_client = LogClient(os.environ['NEW_RELIC_LICENSE_KEY']) + # response = log_client.send(log) + # response.raise_for_status() + + try: + result_futures = [] + + # 1. INTRO COMMENT + # issue.create_comment(messageForNewIssues) + # result_futures.append(post_comment.remote(issue_or_pr=issue, text=MESSAGE_HANDLE_ISSUE_OPENED, time_delay_s=0)) + + # 2. SHARABLE URL (in background) + result_futures.append(post_sharable_url.remote(issue=issue, langsmith_run_id=langsmith_run_id, time_delay_s=20)) + + # 3. RUN BOT + # bot = github_agent.GH_Agent.remote() + #prompt = hub.pull("kastanday/new-github-issue").format(issue_description=format_issue(issue)) + + new_github_issue_prompt_template = f""" + Solve the following github issue by following these steps: +Step 1: Access and Preparation +Task: First use read_file to read any files in the repo {repo_name} https://github.com/KastanDay/ML4Bio-v2 that seem relevant. Read all data files. +Action: Use the GitHub API to read the files. +Step 2: Environment Setup +Task: Set up the R environment for DESeq2 analysis. +Action: Install and load the DESeq2 package along with other necessary libraries like tidyverse. Use the command BiocManager::install("DESeq2") in R. +Step 3: Data Loading and Preprocessing +Task: Load the count data and the sample information (dseq_data.csv). +Action: Ensure that the column names in the count data correspond to the sample names in dseq_data.csv. 
+Step 4: DESeq2 Dataset Creation +Task: Create a DESeq2 dataset. +Action: Use DESeqDataSetFromMatrix, inputting the count data, sample information, and an appropriate design formula (e.g., ~ gender + infection + Time). +Step 5: Pre-filtering +Task: Filter out low-count genes. +Action: Apply a threshold to keep genes with a minimum count across a minimum number of samples. +Step 6: Differential Expression Analysis +Task: Run the DESeq2 analysis. +Action: Install DESeq2 package from Bioconductor. Perform the analysis using DESeq, which includes size factor estimation, dispersion estimation, model fitting, and the Wald test. +Step 7: Results Extraction and Visualization +Task: Extract and visualize the results. +Action: +Extract the results using results. +Generate an MA plot, heatmaps, and a PCA plot for visual representation. Feel free to generate any others that might be relevant. +Step 8: Interpretation and Reporting +Task: Interpret the results and prepare a report. +Action: Analyze the statistically significant genes and their potential biological relevance. Prepare a comprehensive report detailing the findings. +Step 9: Export and Further Analysis +Task: Export the results and suggest further analyses. +Action: Save the result table and suggest additional analyses like gene ontology or pathway analysis. +Goal and Execution +Goal: To perform a complete differential gene expression analysis using DESeq2, from data retrieval to result interpretation, and provide a detailed report of the findings. +Execution: Execute each step sequentially in the provided sandbox environment, ensuring accuracy and thoroughness in analysis and reporting. + +Feel free to ask for help or leave a comment on the Issue or PR if you're stuck. + +Here's your latest assignment: {issue_description}""" + prompt = new_github_issue_prompt_template.format(issue_description=format_issue(issue), repo_name=repo_name, repo=repo) + # result_futures.append(bot.launch_gh_agent.remote(prompt, active_branch=base_branch, langsmith_run_id=langsmith_run_id)) + print("ABOUT TO CALL WORKFLOWAGENT on COMMENT OPENED") + bot = WorkflowAgent(langsmith_run_id=langsmith_run_id) + result = await bot.run(prompt) + + # COLLECT PARALLEL RESULTS + for _i in range(0, len(result_futures)): + ready, not_ready = ray.wait(result_futures) + result = ray.get(ready[0]) + result_futures = not_ready + if not result_futures: + break + + # FIN: Conclusion & results comment + ray.get(post_comment.remote(issue_or_pr=issue, text=str(result['output']), time_delay_s=0)) + except Exception as e: + logging.error(f"❌❌ Error in {inspect.currentframe().f_code.co_name}: {e}\nTraceback:\n", traceback.print_exc()) + err_str = f"Error in {inspect.currentframe().f_code.co_name}: {e}" + "\nTraceback\n```\n" + str(traceback.format_exc()) + "\n```" + + if RUNNING_ON_LOCAL: + print(err_str) + else: + issue.create_comment(err_str) + + +async def handle_pull_request_opened(payload: Dict[str, Any], langsmith_run_id: str): + auth = Auth.AppAuth( + os.environ["GITHUB_APP_ID"], + os.environ["GITHUB_APP_PRIVATE_KEY"], + ) + + # TODO: + # File "/Users/kastanday/code/ncsa/ai-ta/ai-ta-backend/ai_ta_backend/agents/github_webhook_handlers.py", line 120, in handle_pull_request_opened + # number = payload.get('issue').get('number') # AttributeError: 'NoneType' object has no attribute 'get' + # AttributeError: 'NoneType' object has no attribute 'get' + gi = GithubIntegration(auth=auth) + installation = gi.get_installations()[0] + g = installation.get_github_for_installation() + + repo_name 
= payload["repository"]["full_name"] + repo = g.get_repo(repo_name) + + number = payload.get('issue').get('number') # TODO: AttributeError: 'NoneType' object has no attribute 'get' + comment = payload.get('comment') + comment['user']['login'] + issue: Issue = repo.get_issue(number=number) + True if comment.get('performed_via_github_app') else False + pr: PullRequest = repo.get_pull(number=number) + + print(f"Received a pull request event for #{number}") + try: + messageForNewPRs = "Thanks for opening a new PR! I'll now try to finish this implementation and I'll comment if I get blocked or (WIP) 'request your review' if I think I'm successful. So just watch for emails while I work. Please comment to give me additional instructions." + # issue.create_comment(messageForNewPRs) + + result_futures = [] + + # 1. INTRO COMMENT + # issue.create_comment(messageForNewIssues) + result_futures.append(post_comment.remote(issue_or_pr=pr, text=messageForNewPRs, time_delay_s=0)) + + # 2. SHARABLE URL (in background) + result_futures.append(post_sharable_url.remote(issue=pr, langsmith_run_id=langsmith_run_id, time_delay_s=30)) + + # 3. RUN BOT + + print("LAUNCHING BOT") + bot = WorkflowAgent(langsmith_run_id=langsmith_run_id) + # pr_description = bot.github_api_wrapper.get_pull_request(number) + # instruction = f"Please implement these changes by creating or editing the necessary files. First read all existing comments to better understand your task. Then read the existing files to see the progress. Finally implement any and all remaining code to make the project work as the commenter intended (but no need to open a new PR, your edits are automatically committed every time you use a tool to edit files). Feel free to ask for help, or leave a comment on the PR if you're stuck. Here's the latest PR: {str(pr_description)}" + # result = bot.launch_gh_agent(instruction, active_branch=branch_name) + result = await bot.run(comment) + # COLLECT PARALLEL RESULTS + for _i in range(0, len(result_futures)): + ready, not_ready = ray.wait(result_futures) + result = ray.get(ready[0]) + result_futures = not_ready + if not result_futures: + break + + # FIN: Conclusion & results comment + ray.get(post_comment.remote(issue_or_pr=pr, text=str(result['output']), time_delay_s=0)) + except Exception as e: + print(f"Error: {e}") + logging.error(f"❌❌ Error in {inspect.currentframe().f_code.co_name}: {e}\nTraceback:\n", traceback.print_exc()) + err_str = f"Error in {inspect.currentframe().f_code.co_name}: {e}" + "\nTraceback\n```\n" + str(traceback.format_exc()) + "\n```" + if RUNNING_ON_LOCAL: + print(err_str) + else: + issue.create_comment(f"Bot hit a runtime exception during execution. TODO: have more bots debug this.\nError:{err_str}") + + +async def handle_comment_opened(payload, langsmith_run_id): + """Note: In Github API, PRs are just issues with an extra PR object. Issue numbers and PR numbers live in the same space. + Args: + payload (_type_): _description_ + """ + auth = Auth.AppAuth( + os.environ["GITHUB_APP_ID"], + os.environ["GITHUB_APP_PRIVATE_KEY"], + ) + # ensure the author is not lil-jr-dev bot. 
+ gi = GithubIntegration(auth=auth) + installation = gi.get_installations()[0] + g = installation.get_github_for_installation() + + repo_name = payload["repository"]["full_name"] + repo = g.get_repo(repo_name) + number = payload.get('issue').get('number') + comment = payload.get('comment') + comment_author = comment['user']['login'] + # issue_response = payload.get('issue') + issue: Issue = repo.get_issue(number=number) + is_pr = True if payload.get('issue').get('pull_request') else False + True if comment.get('performed_via_github_app') else False + + # DON'T REPLY TO SELF (inf loop) + if comment_author == 'lil-jr-dev[bot]': + print(f"Comment author is {comment_author}, no reply...") + return + + print("Comment author: ", comment['user']['login']) + try: + result_futures = [] + if is_pr: + print("🥵🥵🥵🥵🥵🥵🥵🥵🥵🥵 COMMENT ON A PR") + pr: PullRequest = repo.get_pull(number=number) + branch_name = pr.head.ref + print(f"Head branch_name: {branch_name}") + + # LAUNCH NEW PR COMMENT BOT + messageForNewPRs = "Thanks for commenting on this PR!! I'll now try to finish this implementation and I'll comment if I get blocked or (WIP) 'request your review' if I think I'm successful. So just watch for emails while I work. Please comment to give me additional instructions." + # 1. INTRO COMMENT + # issue.create_comment(messageForNewIssues) + result_futures.append(post_comment.remote(issue_or_pr=pr, text=messageForNewPRs, time_delay_s=0)) + + # 2. SHARABLE URL (in background) + result_futures.append(post_sharable_url.remote(issue=pr, langsmith_run_id=langsmith_run_id, time_delay_s=30)) + + # 3. RUN BOT + bot = WorkflowAgent(langsmith_run_id=langsmith_run_id) + instruction = f"Please complete this work-in-progress pull request (PR number {number}) by implementing the changes discussed in the comments. You can update and create files to make all necessary changes. First use read_file to read any files in the repo that seem relevant. Then, when you're ready, start implementing changes by creating and updating files. Implement any and all remaining code to make the project work as the commenter intended. You don't have to commit your changes, they are saved automatically on every file change. The last step is to complete the PR and leave a comment tagging the relevant humans for review, or list any concerns or final changes necessary in your comment. Feel free to ask for help, or leave a comment on the PR if you're stuck. Here's your latest PR assignment: {format_issue(issue)}" + result = await bot.run(instruction) + + # COLLECT PARALLEL RESULTS + for _i in range(0, len(result_futures)): + ready, not_ready = ray.wait(result_futures) + result = ray.get(ready[0]) + result_futures = not_ready + if not result_futures: + break + + # FIN: Conclusion & results comment + ray.get(post_comment.remote(issue_or_pr=pr, text=str(result['output']), time_delay_s=0)) + else: + # IS COMMENT ON ISSUE + print("🤗🤗🤗🤗🤗🤗🤗🤗🤗🤗 THIS IS A COMMENT ON AN ISSUE") + messageForIssues = "Thanks for opening a new or edited comment on an issue! We'll try to implement changes per your updated request, and will attempt to contribute to any existing PRs related to this or open a new PR if necessary." + # 1. INTRO COMMENT + # issue.create_comment(messageForNewIssues) + result_futures.append(post_comment.remote(issue_or_pr=issue, text=messageForIssues, time_delay_s=0)) + + # 2. SHARABLE URL (in background) + result_futures.append(post_sharable_url.remote(issue=issue, langsmith_run_id=langsmith_run_id, time_delay_s=30)) + + # 3. 
RUN BOT + + # todo: refactor with new branch name creation + ensure_unique_branch_name(repo, "bot-branch") + # bot = github_agent.GH_Agent() + # issue_description = bot.github_api_wrapper.get_issue(number) + # instruction = f"Your boss has just commented on the Github issue that was assigned to you, please review their latest comments and complete the work assigned. There may or may not be an open PR related to this already. Open or complete that PR by implementing the changes discussed in the comments. You can update and create files to make all necessary changes. First use read_file to read any files in the repo that seem relevant. Then, when you're ready, start implementing changes by creating and updating files. Implement any and all remaining code to make the project work as the commenter intended. You don't have to commit your changes, they are saved automatically on every file change. The last step is to complete the PR and leave a comment tagging the relevant humans for review, or list any concerns or final changes necessary in your comment. Feel free to ask for help, or leave a comment on the PR if you're stuck. Here's your latest PR assignment: {str(issue_description)}" + # result = bot.launch_gh_agent(instruction, active_branch=unique_branch_name) + bot = WorkflowAgent(langsmith_run_id=langsmith_run_id) + result = await bot.run(comment) + # COLLECT PARALLEL RESULTS + for _i in range(0, len(result_futures)): + ready, not_ready = ray.wait(result_futures) + result = ray.get(ready[0]) + result_futures = not_ready + if not result_futures: + break + + # FIN: Conclusion & results comment + ray.get(post_comment.remote(issue_or_pr=issue, text=str(result['output']), time_delay_s=0)) + except Exception as e: + logging.error(f"❌❌ Error in {inspect.currentframe().f_code.co_name}: {e}\nTraceback:\n", traceback.print_exc()) + err_str = f"Error in {inspect.currentframe().f_code.co_name}: {e}" + "\nTraceback\n```\n" + str(traceback.format_exc()) + "\n```" + if RUNNING_ON_LOCAL: + print(err_str) + else: + issue.create_comment(f"Bot hit a runtime exception during execution. TODO: have more bots debug this.\nError: {err_str}") + + +@ray.remote +def post_comment(issue_or_pr: Union[Issue, PullRequest], text: str, time_delay_s: int): + """A helper method to post a comment after a delay. + + Args: + issue_or_pr (Union[Issue, PullRequest]): The full object. + text (str): Text to be posted as a comment by Lil-Jr-Dev[bot] + time_delay_s (int): Time delay before running + """ + time.sleep(time_delay_s) + issue_or_pr.create_comment(str(text)) + + +def extract_key_info_from_issue_or_pr(issue_or_pr: Union[Issue, PullRequest]): + """Filter out useless info, format nicely. Especially filter out comment if comment 'performed_via_github_app'. + comment_made_by_bot = True if comment.get('performed_via_github_app') else False + + Maybe grab other issues if they're referenced. + + Args: + issue_or_pr (Union[Issue, PullRequest]): Full object of the issue or PR. + Returns: + full_description: str + """ + + pass + + +@ray.remote +def post_sharable_url(issue, langsmith_run_id, time_delay_s): + sharable_url = get_langsmith_trace_sharable_url(langsmith_run_id, time_delay_s=time_delay_s) + text = f"👉 [Follow the bot's progress in real time on LangSmith]({sharable_url})." + ray.get(post_comment.remote(issue_or_pr=issue, text=text, time_delay_s=0)) + + +def format_issue(issue): + linked_pr = get_linked_pr_from_issue(issue) + title = f"Title: {issue.title}." 
+ existing_pr = f"Existing PR addressing issue: {linked_pr}" if linked_pr else "" + opened_by = f"Opened by user: {issue.user.login}" if type(issue.user) == github.NamedUser.NamedUser else '' + body = f"Body: {issue.body}" + return "\n".join([title, opened_by, existing_pr, body]) + + +def get_linked_pr_from_issue(issue: Issue) -> PullRequest | None: + """Check if the given issue has a linked pull request. + + This function iterates over the timeline of the issue and checks if there is a 'cross-referenced' event. + If such an event is found, it checks if the source of the event is an issue and if so, it returns the issue as a pull request. + + Usage: + issue: Issue = repo.get_issue(number=8) + pr_or_none = check_if_issue_has_linked_pr(issue) + + Args: + issue (Issue): The issue to check for a linked pull request. + + Returns: + PullRequest: The linked pull request if it exists, None otherwise. + """ + events_pages: PaginatedList[TimelineEvent] = issue.get_timeline() + pg_num = 0 + while events_pages.get_page(pg_num): + page = events_pages.get_page(pg_num) + pg_num += 1 + for e in page: + if str(e.event) == 'cross-referenced': + if e.source and e.source.issue: + return e.source.issue.as_pull_request() + + +def get_linked_issue_from_pr(pr: PullRequest) -> Issue | None: + """Check if the given pull request has a linked issue. + + This function iterates over the timeline of the pull request and checks if there is a 'cross-referenced' event. + If such an event is found, it checks if the source of the event is a pull request and if so, it returns the pull request as an issue. + + Usage: + pr: PullRequest = repo.get_pull(number=8) + issue_or_none = check_if_pr_has_linked_issue(pr) + + Args: + pr (PullRequest): The pull request to check for a linked issue. + + Returns: + Issue: The linked issue if it exists, None otherwise. + """ + events_pages: PaginatedList[TimelineEvent] = pr.as_issue().get_timeline() + pg_num = 0 + while events_pages.get_page(pg_num): + page = events_pages.get_page(pg_num) + pg_num += 1 + for e in page: + if str(e.event) == 'cross-referenced': + if e.source and e.source.issue: + return e.source.issue diff --git a/ai_ta_backend/agents/langgraph-plan-and-execute-demo.ipynb b/ai_ta_backend/agents/langgraph-plan-and-execute-demo.ipynb new file mode 100644 index 00000000..229bf2cc --- /dev/null +++ b/ai_ta_backend/agents/langgraph-plan-and-execute-demo.ipynb @@ -0,0 +1,637 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "79b5811c-1074-495f-9722-8325b5e717d3", + "metadata": {}, + "source": [ + "# Plan-and-Execute\n", + "\n", + "This notebook shows how to create a \"plan-and-execute\" style agent. This is heavily inspired by the [Plan-and-Solve](https://arxiv.org/abs/2305.04091) paper as well as the [Baby-AGI](https://github.com/yoheinakajima/babyagi) project.\n", + "\n", + "The core idea is to first come up with a multi-step plan, and then go through that plan one item at a time.\n", + "After accomplishing a particular task, you can then revisit the plan and modify as appropriate.\n", + "\n", + "\n", + "The general computational graph looks like the following:\n", + "\n", + "\n", + "![plan-and-execute diagram](./img/plan-and-execute.png)\n", + "\n", + "\n", + "This compares to a typical [ReAct](https://arxiv.org/abs/2210.03629) style agent where you think one step at a time.\n", + "The advantages of this \"plan-and-execute\" style agent are:\n", + "\n", + "1. Explicit long term planning (which even really strong LLMs can struggle with)\n", + "2. 
Ability to use smaller/weaker models for the execution step, only using larger/better models for the planning step\n", + "\n", + "\n", + "The following walkthrough demonstrates how to do so in LangGraph. The resulting agent will leave a trace like the following example: ([link](https://smith.langchain.com/public/d46e24d3-dda6-44d5-9550-b618fca4e0d4/r))." + ] + }, + { + "cell_type": "markdown", + "id": "a44a72d6-7e0c-4478-9d20-4c09000420a8", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "First, we need to install the packages required." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b451b58a-89bd-424f-8c06-0d9fe325e01b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.3.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.0\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.11 -m pip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install --quiet -U langchain langchain_openai tavily-python" + ] + }, + { + "cell_type": "markdown", + "id": "35f267b0-98db-4a59-8b2c-a23f795576ff", + "metadata": {}, + "source": [ + "Next, we need to set API keys for OpenAI (the LLM we will use) and Tavily (the search tool we will use)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "ce438281-08d5-4804-afe7-e4089f7b016b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "# import getpass\n", + "\n", + "# os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")\n", + "# os.environ[\"TAVILY_API_KEY\"] = getpass.getpass(\"Tavily API Key:\")\n", + "\n", + "from dotenv import load_dotenv\n", + "load_dotenv(override=True)" + ] + }, + { + "cell_type": "markdown", + "id": "be2d7981-3737-4134-8bef-d00d18d4e91d", + "metadata": {}, + "source": [ + "Optionally, we can set API key for LangSmith tracing, which will give us best-in-class observability." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "01f460d1-f26f-47d1-ae76-de74d5d851de", + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n", + "os.environ[\"LANGCHAIN_PROJECT\"] = \"Plan-and-execute\"" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "e94db77d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Plan-and-execute'" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "af684aee", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'ls__1f449613fd404c5b9a31990a420d1624'" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9cb7861", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "6c5fb09a-0311-44c2-b243-d0e80de78902", + "metadata": {}, + "source": [ + "## Define Tools\n", + "\n", + "We will first define the tools we want to use. For this simple example, we will use a built-in search tool via Tavily. 
However, it is really easy to create your own tools - see documentation [here](https://python.langchain.com/docs/modules/agents/tools/custom_tools) on how to do that." + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "25b9ec62-0675-4715-811c-9b32c635b22f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Tool(name='Search', description='A search engine. Useful for when you need to answer questions about current events. Input should be a search query.', func=, params={'engine': 'google', 'google_domain': 'google.com', 'gl': 'us', 'hl': 'en'}, serpapi_api_key='edf00a75c49d95767f0f7b99cddb763bea1145f2f94cf9f88879bbcab19c9a8f', aiosession=None)>, coroutine=, params={'engine': 'google', 'google_domain': 'google.com', 'gl': 'us', 'hl': 'en'}, serpapi_api_key='edf00a75c49d95767f0f7b99cddb763bea1145f2f94cf9f88879bbcab19c9a8f', aiosession=None)>)]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# from langchain_community.tools.tavily_search import TavilySearchResults\n", + "from langchain.agents import load_tools\n", + "\n", + "# tools = [TavilySearchResults(max_results=3)]\n", + "tools = load_tools([\"serpapi\"])\n", + "tools" + ] + }, + { + "cell_type": "markdown", + "id": "3dcda478-fa80-4e3e-bb35-0f622fe73a31", + "metadata": {}, + "source": [ + "## Define our Execution Agent\n", + "\n", + "Now we will create the execution agent we want to use to execute tasks. \n", + "Note that for this example, we will be using the same execution agent for each task, but this doesn't HAVE to be the case." + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "72d233ca-1dbf-4b43-b680-b3bf39e3691f", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain import hub\n", + "from langchain.agents import create_openai_functions_agent\n", + "from langchain_openai import AzureChatOpenAI, ChatOpenAI\n", + "\n", + "# Get the prompt to use - you can modify this!\n", + "prompt = hub.pull(\"hwchase17/openai-functions-agent\")\n", + "# Choose the LLM that will drive the agent\n", + "# llm = ChatOpenAI(model=\"gpt-4-turbo-preview\")\n", + "llm = AzureChatOpenAI(\n", + " azure_deployment=\"gpt-4-128k\",\n", + " openai_api_version=os.getenv(\"AZURE_0125_MODEL_VERSION\"),\n", + " temperature=0,\n", + " azure_endpoint=os.getenv(\"AZURE_0125_MODEL_ENDPOINT\"),\n", + " openai_api_key=os.getenv(\"AZURE_0125_MODEL_API_KEY\"),\n", + " )\n", + "# Construct the OpenAI Functions agent\n", + "agent_runnable = create_openai_functions_agent(llm, tools, prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "a3ea9bd3-87d9-4a78-aec6-8ab4bf34479b", + "metadata": {}, + "outputs": [], + "source": [ + "from langgraph.prebuilt import create_agent_executor" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "998aebde-c204-494f-930c-14747ed34861", + "metadata": {}, + "outputs": [], + "source": [ + "agent_executor = create_agent_executor(agent_runnable, tools)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "746e697a-dec4-4342-a814-9b3456828169", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ------- MIGHT TRUNCATE MESSAGES ------- \n", + "/\\/\\/\\/\\/\\ num_tokens_in_messages 26\n", + "/\\/\\/\\/\\/\\ Hard coded context window size of: 120000\n", + " ------- MIGHT TRUNCATE MESSAGES ------- \n", + "/\\/\\/\\/\\/\\ num_tokens_in_messages 1956\n", + "/\\/\\/\\/\\/\\ Hard coded context window size of: 
120000\n" + ] + }, + { + "data": { + "text/plain": [ + "{'input': 'who is the winnner of the us open',\n", + " 'chat_history': [],\n", + " 'agent_outcome': AgentFinish(return_values={'output': 'The US Open 2023 is currently ongoing, and the information provided includes results from the qualifiers. The winner of the tournament has not been determined yet, as the event is scheduled from August 22 to September 10, 2023. For the latest updates and results, you can visit the official US Open website.'}, log='The US Open 2023 is currently ongoing, and the information provided includes results from the qualifiers. The winner of the tournament has not been determined yet, as the event is scheduled from August 22 to September 10, 2023. For the latest updates and results, you can visit the official US Open website.'),\n", + " 'intermediate_steps': [(AgentActionMessageLog(tool='Search', tool_input='US Open winner 2023', log='\\nInvoking: `Search` with `US Open winner 2023`\\n\\n\\n', message_log=[AIMessage(content='', additional_kwargs={'function_call': {'arguments': '{\"__arg1\":\"US Open winner 2023\"}', 'name': 'Search'}})]),\n", + " \"{'title': 'US Open', 'date': 'Aug 22–Sep 10, 2023', 'tables': {'games': [{'stage': 'Qualifiers', 'location': 'Court 11', 'video_highlights': {'link': 'https://www.usopen.org/en_US/video/index.html?ac_vh_s=link&lwrt=%23%2Fchannel%2F4476%2Flfbumnsin5xuewlkobeue3cyjvdfuqzn?cid=2023oneboxINTL_00000000_phto_chothr_ctwbtr_endcid', 'thumbnail': 'https:https://ssl.gstatic.com/onebox/media/sports/videos/vita/W6gL8niH0X03coEj_96x54.jpg', 'duration': '1:49'}, 'players': [{'name': 'F. Delbonis', 'ranking': '', 'thumbnail': 'https:https://ssl.gstatic.com/onebox/media/sports/logos/1xBWyjjkA6vEWopPK3lIPA_48x48.png', 'sets': {'set-1': '4', 'set-2': '6', 'set-3': '5'}}, {'name': 'M. Damm Jr.', 'ranking': '', 'thumbnail': 'https:https://ssl.gstatic.com/onebox/media/sports/logos/wj9uZvn_vZrelLFGH8fnPA_48x48.png', 'sets': {'set-1': '6', 'set-2': '4', 'set-3': '7'}}]}, {'stage': 'Qualifiers', 'location': 'Court 4', 'video_highlights': {'link': 'https://www.usopen.org/en_US/video/index.html?ac_vh_s=link#/channel/4476/g5me2wsin5xueqzzk5jgyukfgj3tsqsl?cid=2023oneboxUSA_00000000_phto_chothr_ctwbtr_endcid', 'thumbnail': 'https:https://ssl.gstatic.com/onebox/media/sports/videos/vita/d57wXYtqqJzB3gDt_96x54.jpg', 'duration': '1:16'}, 'players': [{'name': 'F. Coria', 'ranking': '3', 'thumbnail': 'https:https://ssl.gstatic.com/onebox/media/sports/logos/1xBWyjjkA6vEWopPK3lIPA_48x48.png', 'sets': {'set-1': '6', 'set-2': '6'}}, {'name': 'C. Carabelli', 'ranking': '', 'thumbnail': 'https:https://ssl.gstatic.com/onebox/media/sports/logos/1xBWyjjkA6vEWopPK3lIPA_48x48.png', 'sets': {'set-1': '3', 'set-2': '1'}}]}, {'stage': 'Qualifiers', 'location': 'Grandstand', 'video_highlights': {'link': 'https://www.usopen.org/en_US/video/index.html#/channel/4558/mzjuylkin5xuewlkobeue3cyjv5dezzz?cid=2023oneboxINTL_00000000_phto_chothr_ctwbtr_endcid', 'thumbnail': 'https:https://ssl.gstatic.com/onebox/media/sports/videos/vita/B73RzZyZm5phigtv_96x54.jpg', 'duration': '1:20'}, 'players': [{'name': 'Q. Vandecasteele', 'ranking': '', 'thumbnail': 'https:https://ssl.gstatic.com/onebox/media/sports/logos/wj9uZvn_vZrelLFGH8fnPA_48x48.png', 'sets': {'set-1': '1', 'set-2': '2'}}, {'name': 'F. 
Bagnis', 'ranking': '29', 'thumbnail': 'https:https://ssl.gstatic.com/onebox/media/sports/logos/1xBWyjjkA6vEWopPK3lIPA_48x48.png', 'sets': {'set-1': '6', 'set-2': '6'}}]}, {'stage': 'Qualifiers', 'location': 'Court 15', 'video_highlights': {'link': 'https://www.usopen.org/en_US/video/index.html?ac_vh_s=link&lwrt=%23%2Fchannel%2F4476%2Fg5efclkin5xueqzzk5jgyukfgjmvcnbs?ccid=2023oneboxINTL_00000000_phto_chothr_ctwbtr_endcid', 'thumbnail': 'https:https://ssl.gstatic.com/onebox/media/sports/videos/vita/_fNjM6P103o7jljB_96x54.jpg', 'duration': '1:52'}, 'players': [{'name': 'F. Alves', 'ranking': '', 'thumbnail': 'https:https://ssl.gstatic.com/onebox/media/sports/logos/zKLzoJVYz0bb6oAnPUdwWQ_48x48.png', 'sets': {'set-1': '2', 'set-2': '6', 'set-3': '6'}}, {'name': 'D. Svrcina', 'ranking': '', 'thumbnail': 'https:https://ssl.gstatic.com/onebox/media/sports/logos/8AluO-WxpcHtC0KKHmFgvg_48x48.png', 'sets': {'set-1': '6', 'set-2': '2', 'set-3': '3'}}]}, {'stage': 'Qualifiers', 'location': 'Court 6', 'video_highlights': {'link': 'https://www.usopen.org/en_US/video/index.html?ac_vh_s=link&lwrt=%23%2Fchannel%2F4476%2Fmnuumq2in5xuewlkobeue3cyju2viqkl?cid=2023oneboxINTL_00000000_phto_chothr_ctwbtr_endcid', 'thumbnail': 'https:https://ssl.gstatic.com/onebox/media/sports/videos/vita/CwIny6WpBCXA2lAP_96x54.jpg', 'duration': '2:00'}, 'players': [{'name': 'F. Gaio', 'ranking': '', 'thumbnail': 'https:https://ssl.gstatic.com/onebox/media/sports/logos/joYpsiaYi4GDCqhSRAq5Zg_48x48.png', 'sets': {'set-1': '3', 'set-2': '6', 'set-3': '6'}}, {'name': 'H. Grenier', 'ranking': '32', 'thumbnail': 'https:https://ssl.gstatic.com/onebox/media/sports/logos/z3JEQB3coEAGLCJBEUzQ2A_48x48.png', 'sets': {'set-1': '6', 'set-2': '3', 'set-3': '2'}}]}, {'stage': 'Qualifiers', 'location': 'Court 14', 'video_highlights': {'link': 'https://www.usopen.org/en_US/video/index.html?ac_vh_s=link&lwrt=%23%2Fchannel%2F4476%2Fjjbumtkin5xuewlkobeue3cyjvndarsb%3Fcid%3D2023oneboxUSA_00000000_phto_chothr_ctwbtr_endcid&cid=2023oneboxINTL_00000000_phto_chothr_ctwbtr_endcid', 'thumbnail': 'https:https://ssl.gstatic.com/onebox/media/sports/videos/vita/T7EQXGy4ywFKKvuQ_96x54.jpg', 'duration': '1:55'}, 'players': [{'name': 'F. Passaro', 'ranking': '', 'thumbnail': 'https:https://ssl.gstatic.com/onebox/media/sports/logos/joYpsiaYi4GDCqhSRAq5Zg_48x48.png', 'sets': {'set-1': '6 + 3', 'set-2': '4'}}, {'name': 'S.C. Hong', 'ranking': '', 'thumbnail': 'https:https://ssl.gstatic.com/onebox/media/sports/logos/Uu5pwNmMHGd5bCooKrS3Lw_48x48.png', 'sets': {'set-1': '7 + 7', 'set-2': '6'}}]}]}}\")]}" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agent_executor.invoke(\n", + " {\"input\": \"who is the winnner of the us open\", \"chat_history\": []}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "5cf66804-44b2-4904-b1a7-17ad70b551f5", + "metadata": {}, + "source": [ + "## Define the State\n", + "\n", + "Let's now start by defining the state the track for this agent.\n", + "\n", + "First, we will need to track the current plan. Let's represent that as a list of strings.\n", + "\n", + "Next, we should track previously executed steps. Let's represent that as a list of tuples (these tuples will contain the step and then the result)\n", + "\n", + "Finally, we need to have some state to represent the final response as well as the original input." 
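+    "\n",
+    "For illustration, a snapshot of this state partway through a run might look like the following (hypothetical values, invented for this example rather than produced by the notebook):\n",
+    "\n",
+    "```python\n",
+    "{\n",
+    "    \"input\": \"what is the hometown of the 2024 Australia open winner?\",\n",
+    "    \"plan\": [\"Search for the hometown of the identified winner.\"],\n",
+    "    \"past_steps\": [(\"Find the name of the 2024 Australian Open winner.\", \"...\")],\n",
+    "    \"response\": \"\",\n",
+    "}\n",
+    "```\n"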
+ ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "8eeeaeea-8f10-4fbe-8e24-4e1a2381a009", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.pydantic_v1 import BaseModel, Field\n", + "from typing import List, Tuple, Annotated, TypedDict\n", + "import operator\n", + "\n", + "\n", + "class PlanExecute(TypedDict):\n", + " input: str\n", + " plan: List[str]\n", + " past_steps: Annotated[List[Tuple], operator.add]\n", + " response: str" + ] + }, + { + "cell_type": "markdown", + "id": "1dbd770a-9941-40a9-977e-4d55359eee21", + "metadata": {}, + "source": [ + "## Planning Step\n", + "\n", + "Let's now think about creating the planning step. This will use function calling to create a plan." + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "4a88626d-6dfd-4488-87f0-a9a0dd6da44c", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.pydantic_v1 import BaseModel\n", + "\n", + "\n", + "class Plan(BaseModel):\n", + " \"\"\"Plan to follow in future\"\"\"\n", + "\n", + " steps: List[str] = Field(\n", + " description=\"different steps to follow, should be in sorted order\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "ec7b1867-1ea3-4df3-9a98-992a1c32ec49", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains.openai_functions import create_structured_output_runnable\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "\n", + "plan_llm = AzureChatOpenAI(\n", + " azure_deployment=\"gpt-4-128k\",\n", + " openai_api_version=os.getenv(\"AZURE_0125_MODEL_VERSION\"),\n", + " temperature=0,\n", + " azure_endpoint=os.getenv(\"AZURE_0125_MODEL_ENDPOINT\"),\n", + " openai_api_key=os.getenv(\"AZURE_0125_MODEL_API_KEY\"),\n", + " )\n", + "\n", + "planner_prompt = ChatPromptTemplate.from_template(\n", + " \"\"\"For the given objective, come up with a simple step by step plan. \\\n", + "This plan should involve individual tasks, that if executed correctly will yield the correct answer. Do not add any superfluous steps. \\\n", + "The result of the final step should be the final answer. 
Make sure that each step has all the information needed - do not skip steps.\n", + "\n", + "{objective}\"\"\"\n", + ")\n", + "planner = create_structured_output_runnable(\n", + " Plan, plan_llm, planner_prompt\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "67ce37b7-e089-479b-bcb8-c3f5d9874613", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ------- MIGHT TRUNCATE MESSAGES ------- \n", + "/\\/\\/\\/\\/\\ num_tokens_in_messages 87\n", + "/\\/\\/\\/\\/\\ Hard coded context window size of: 120000\n" + ] + }, + { + "data": { + "text/plain": [ + "Plan(steps=['Identify the current year to determine the most recent Australia Open tournament.', 'Search for the winner of the most recent Australia Open in the identified year.', 'Find the hometown of the identified winner by searching their personal or professional profiles online or through sports news outlets.'])" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "planner.invoke(\n", + " {\"objective\": \"what is the hometown of the current Australia open winner?\"}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6e09ad9d-6f90-4bdc-bb43-b1ce94517c29", + "metadata": {}, + "source": [ + "## Re-Plan Step\n", + "\n", + "Now, let's create a step that re-does the plan based on the result of the previous step." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "ec2d12cc-016a-44d1-aa08-4c5ce1e8fe2a", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains.openai_functions import create_openai_fn_runnable\n", + "\n", + "\n", + "class Response(BaseModel):\n", + " \"\"\"Response to user.\"\"\"\n", + "\n", + " response: str\n", + "\n", + "\n", + "replanner_prompt = ChatPromptTemplate.from_template(\n", + " \"\"\"For the given objective, come up with a simple step by step plan. \\\n", + "This plan should involve individual tasks, that if executed correctly will yield the correct answer. Do not add any superfluous steps. \\\n", + "The result of the final step should be the final answer. Make sure that each step has all the information needed - do not skip steps.\n", + "\n", + "Your objective was this:\n", + "{input}\n", + "\n", + "Your original plan was this:\n", + "{plan}\n", + "\n", + "You have currently done the follow steps:\n", + "{past_steps}\n", + "\n", + "Update your plan accordingly. If no more steps are needed and you can return to the user, then respond with that. Otherwise, fill out the plan. Only add steps to the plan that still NEED to be done. Do not return previously done steps as part of the plan.\"\"\"\n", + ")\n", + "\n", + "replanner_llm = AzureChatOpenAI(\n", + " azure_deployment=\"gpt-4-128k\",\n", + " openai_api_version=os.getenv(\"AZURE_0125_MODEL_VERSION\"),\n", + " temperature=0,\n", + " azure_endpoint=os.getenv(\"AZURE_0125_MODEL_ENDPOINT\"),\n", + " openai_api_key=os.getenv(\"AZURE_0125_MODEL_API_KEY\"),\n", + " )\n", + "\n", + "replanner = create_openai_fn_runnable(\n", + " [Plan, Response],\n", + " replanner_llm,\n", + " replanner_prompt,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "859abd13-6ba0-45ad-b341-e652dd5f755b", + "metadata": {}, + "source": [ + "## Create the Graph\n", + "\n", + "We can now create the graph!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "6c8e0dad-bcea-4c9a-8922-0d820892e2d0", + "metadata": {}, + "outputs": [], + "source": [ + "async def execute_step(state: PlanExecute):\n", + " task = state[\"plan\"][0]\n", + " agent_response = await agent_executor.ainvoke({\"input\": task, \"chat_history\": []})\n", + " return {\n", + " \"past_steps\": (task, agent_response[\"agent_outcome\"].return_values[\"output\"])\n", + " }\n", + "\n", + "\n", + "async def plan_step(state: PlanExecute):\n", + " plan = await planner.ainvoke({\"objective\": state[\"input\"]})\n", + " return {\"plan\": plan.steps}\n", + "\n", + "\n", + "async def replan_step(state: PlanExecute):\n", + " output = await replanner.ainvoke(state)\n", + " print(f\"{output=}\")\n", + " if isinstance(output, Response):\n", + " return {\"response\": output.response}\n", + " else:\n", + " return {\"plan\": output.steps}\n", + "\n", + "\n", + "def should_end(state: PlanExecute):\n", + " if state[\"response\"]:\n", + " return True\n", + " else:\n", + " return False" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "e954cea0-5ccc-46c2-a27b-f5b7185b597d", + "metadata": {}, + "outputs": [], + "source": [ + "from langgraph.graph import StateGraph, END\n", + "\n", + "workflow = StateGraph(PlanExecute)\n", + "\n", + "# Add the plan node\n", + "workflow.add_node(\"planner\", plan_step)\n", + "\n", + "# Add the execution step\n", + "workflow.add_node(\"agent\", execute_step)\n", + "\n", + "# Add a replan node\n", + "workflow.add_node(\"replan\", replan_step)\n", + "\n", + "workflow.set_entry_point(\"planner\")\n", + "\n", + "# From plan we go to agent\n", + "workflow.add_edge(\"planner\", \"agent\")\n", + "\n", + "# From agent, we replan\n", + "workflow.add_edge(\"agent\", \"replan\")\n", + "\n", + "workflow.add_conditional_edges(\n", + " \"replan\",\n", + " # Next, we pass in the function that will determine which node is called next.\n", + " should_end,\n", + " {\n", + " # If `tools`, then we call the tool node.\n", + " True: END,\n", + " False: \"agent\",\n", + " },\n", + ")\n", + "\n", + "# Finally, we compile it!\n", + "# This compiles it into a LangChain Runnable,\n", + "# meaning you can use it as you would any other runnable\n", + "app = workflow.compile()" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "b8ac1f67-e87a-427c-b4f7-44351295b788", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'plan': ['Find the name of the 2024 Australian Open winner.', 'Search for the hometown of the identified winner.']}\n", + "{'past_steps': ('Find the name of the 2024 Australian Open winner.', 'The 2024 Australian Open has not concluded yet, as it is scheduled to take place from January 7 to January 28, 2024. 
Therefore, the winner has not been determined yet.')}\n", + "output=Response(response='Since the 2024 Australian Open has not concluded yet and the winner has not been determined, it is not possible to identify the hometown of the 2024 Australian Open winner at this time.')\n", + "{'response': 'Since the 2024 Australian Open has not concluded yet and the winner has not been determined, it is not possible to identify the hometown of the 2024 Australian Open winner at this time.'}\n" + ] + } + ], + "source": [ + "from langchain_core.messages import HumanMessage\n", + "\n", + "config = {\"recursion_limit\": 50}\n", + "inputs = {\"input\": \"what is the hometown of the 2024 Australia open winner?\"}\n", + "async for event in app.astream(inputs, config=config):\n", + " for k, v in event.items():\n", + " if k != \"__end__\":\n", + " print(v)" + ] + }, + { + "cell_type": "markdown", + "id": "8bf585a9-0f1e-4910-bd00-65e7bb05b6e6", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "Congrats on making a plan-and-execute agent! One known limitation of the above design is that each task is still executed in sequence, meaning embarrassingly parallel operations all add to the total execution time. You could improve on this by having each task represented as a DAG (similar to LLMCompiler), rather than a regular list." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad8f7955-2cc9-4ebb-8c41-13abb3351a24", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/ai_ta_backend/agents/langgraph_agent.py b/ai_ta_backend/agents/langgraph_agent.py new file mode 100644 index 00000000..6f42a223 --- /dev/null +++ b/ai_ta_backend/agents/langgraph_agent.py @@ -0,0 +1,162 @@ +import getpass +import operator +import os +import platform +from typing import Annotated, TypedDict, Union + +from dotenv import load_dotenv +from langchain import hub +from langchain_core.agents import AgentAction +from langchain_core.agents import AgentFinish +from langchain_core.messages import BaseMessage +from langchain_experimental.plan_and_execute import load_agent_executor +from langchain_experimental.plan_and_execute import load_chat_planner +from langchain_experimental.plan_and_execute import PlanAndExecute +# from langchain.chat_models import AzureChatOpenAI, ChatOpenAI +from langchain_openai import AzureChatOpenAI +from langchain_openai import ChatOpenAI +from langgraph.graph import END +from langgraph.graph import StateGraph +from langgraph.prebuilt import ToolExecutor + +from ai_ta_backend.agents.tools import get_tools + +# from ai_ta_backend.agents.utils import fancier_trim_intermediate_steps + +load_dotenv(override=True) + + +def get_user_info_string(): + username = getpass.getuser() + current_working_directory = os.getcwd() + operating_system = platform.system() + default_shell = os.environ.get("SHELL") + + return f"[User Info]\nName: {username}\nCWD: {current_working_directory}\nSHELL: {default_shell}\nOS: {operating_system}" + + +class AgentState(TypedDict): + # The input string + input: str + # The list of previous messages in the conversation +
chat_history: list[BaseMessage] + # The outcome of a given call to the agent + # Needs `None` as a valid type, since this is what this will start as + agent_outcome: Union[AgentAction, AgentFinish, None] + # intermediate steps are present in agent input arg as well + # intermediate_steps: Annotated[list[tuple[AgentAction, str]], operator.add] + plan: Union[list[str], None] + kas_scratchpad: Annotated[list[str], operator.add] + # agent_scratchpad: Annotated[list[tuple[AgentAction, str]], operator.add] + + +class WorkflowAgent: + + def __init__(self, langsmith_run_id): + self.langsmith_run_id = langsmith_run_id + if os.environ['OPENAI_API_TYPE'] == 'azure': + self.llm = AzureChatOpenAI( + azure_deployment="gpt-4-128k", + openai_api_version=os.getenv("AZURE_0125_MODEL_VERSION"), # type: ignore + temperature=0, + azure_endpoint=os.getenv("AZURE_0125_MODEL_ENDPOINT"), + openai_api_key=os.getenv("AZURE_0125_MODEL_API_KEY"), # type: ignore + ) + else: + self.llm: ChatOpenAI = ChatOpenAI( + temperature=0, + model="gpt-4-0613", + max_retries=500, + # request_timeout=60 * 3, + streaming=False) + self.tools = get_tools(langsmith_run_id=self.langsmith_run_id) + self.agent = self.make_agent() + + def make_agent(self): + # PLANNER + planner = load_chat_planner(self.llm, system_prompt=hub.pull("kastanday/ml4bio-rnaseq-planner").format(user_info=get_user_info_string)) + + # EXECUTOR + executor = load_agent_executor(self.llm, self.tools, trim_intermediate_steps=1, handle_parsing_errors=True) + + # Create PlanAndExecute Agent + workflow_agent = PlanAndExecute(planner=planner, executor=executor, verbose=True) + + return workflow_agent + + # Invoke the agent + def execute_agent(self, data): + agent_outcome = self.agent.invoke(data, {"metadata": {"langsmith_run_id": str(self.langsmith_run_id)}}) + print(f"{agent_outcome = }") + return {"agent_outcome": agent_outcome, "kas_scratchpad": ['hi from execute_agent']} + + # Define the function to execute tools + def execute_tools(self, data): + # Get the most recent agent_outcome - this is the key added in the `agent` above + print("In execute tools", data) + agent_action = data.pop('agent_outcome') + print(f"{agent_action = }") + tool_executor = ToolExecutor(self.tools) + output = tool_executor.invoke(agent_action.tool_input) + print(f"{output = }") + return { + "intermediate_steps": [(agent_action, str(output))], + "kas_scratchpad": ['hi from execute_tools', 'we have to entries from tools'] + } + + # Define logic that will be used to determine which conditional edge to go down + def should_continue(self, data): + # The return string will be used when setting up the graph to define the flow + # If the agent outcome is an AgentFinish, then we return `exit` string + if isinstance(data['agent_outcome'], AgentFinish): + return "end" + # Otherwise, an AgentAction is returned. Return `continue` string + else: + return "continue" + + def run(self, input_prompt): + # Define a new graph + workflow = StateGraph(AgentState) + + # Define the two nodes we will cycle between + workflow.add_node("agent", self.execute_agent) + workflow.add_node("action", self.execute_tools) + + # Set the entrypoint as `agent` + workflow.set_entry_point("agent") + workflow.add_conditional_edges( + # First, we define the start node. We use `agent`. + # This means these are the edges taken after the `agent` node is called. + "agent", + # Next, we pass in the function that will determine which node is called next. + self.should_continue, + # Pass in a mapping. 
The keys are strings, and the values are other nodes. + # END is a special node marking that the graph should finish. + # The output of `should_continue`, will be matched against this mapping and the respective node is called + { + # If `tools`, then we call the tool node. + "continue": "action", + # Otherwise we finish. + "end": END + }) + + # Add a normal edge from `tools` to `agent`. This means that after `tools` is called, `agent` node is called next. + workflow.add_edge('action', 'agent') + app = workflow.compile() + + key, value = '', '' + inputs = {"input": input_prompt} + # result = app.invoke(inputs) + # print("RESULT", result) + # output = result['agent_outcome'].return_values["output"] + # print(output) + for output in app.stream(inputs): + # stream() yields dictionaries with output keyed by node name + for key, value in output.items(): + print(f"Output from node '{key}':") + print("---") + print(value) + print("\n---\n") + + result = key + value + return result diff --git a/ai_ta_backend/agents/langgraph_agent_v2.py b/ai_ta_backend/agents/langgraph_agent_v2.py new file mode 100644 index 00000000..6935987f --- /dev/null +++ b/ai_ta_backend/agents/langgraph_agent_v2.py @@ -0,0 +1,162 @@ +import getpass +import operator +import os +import platform +from typing import Annotated, List, Tuple, TypedDict + +from dotenv import load_dotenv +from langchain import hub +from langchain.agents import create_openai_functions_agent +from langchain.chains.openai_functions import create_openai_fn_runnable +from langchain.chains.openai_functions import create_structured_output_runnable +from langchain_core.messages import BaseMessage +from langchain_core.prompts import ChatPromptTemplate +from langchain_core.pydantic_v1 import BaseModel +from langchain_core.pydantic_v1 import Field +from langchain_openai import AzureChatOpenAI +from langchain_openai import ChatOpenAI +from langgraph.graph import END +from langgraph.graph import StateGraph +from langgraph.prebuilt import create_agent_executor + +from ai_ta_backend.agents.tools import get_tools + +load_dotenv(override=True) + + +class Plan(BaseModel): + """Plan to follow in future to complete the objective""" + steps: List[str] = Field(description="Steps to follow in sorted order of execution.") + + +class Response(BaseModel): + """Objective complete (or impossible), final response to user.""" + response: str + + +class State(TypedDict): + input: str + chat_history: list[BaseMessage] + plan: List[str] + past_steps: Annotated[List[Tuple], operator.add] + response: str + + +def get_user_info_string(): + username = getpass.getuser() + current_working_directory = os.getcwd() + operating_system = platform.system() + default_shell = os.environ.get("SHELL") + + return f"[User Info]\nName: {username}\nCWD: {current_working_directory}\nSHELL: {default_shell}\nOS: {operating_system}" + + +def get_llm(): + if os.getenv('OPENAI_API_TYPE') == 'azure': + return AzureChatOpenAI( + azure_deployment="gpt-4-128k", + openai_api_version=os.getenv("AZURE_0125_MODEL_VERSION"), + temperature=0, + azure_endpoint=os.getenv("AZURE_0125_MODEL_ENDPOINT"), + openai_api_key=os.getenv("AZURE_0125_MODEL_API_KEY"), + ) + else: + return ChatOpenAI( + model="gpt-4-turbo-preview", + temperature=0, + ) + + +class WorkflowAgent: + + def __init__(self, langsmith_run_id): + print("Planner Replanner agent initialized") + self.langsmith_run_id = langsmith_run_id + self.llm = get_llm() + self.tools = get_tools(langsmith_run_id) + self.planner_prompt = ChatPromptTemplate.from_template("""For 
the given objective, come up with a simple step by step plan. \ +This plan should involve individual tasks, that if executed correctly will yield the correct answer. Do not add any superfluous steps. \ +The result of the final step should be the final answer. Make sure that each step has all the information needed - do not skip steps. + +{objective}""") + self.replanner_prompt = ChatPromptTemplate.from_template("""For the given objective, come up with a simple step by step plan. \ +This plan should involve individual tasks, that if executed correctly will yield the correct answer. Do not add any superfluous steps. \ +The result of the final step should be the final answer. Make sure that each step has all the information needed - do not skip steps. + +Your objective was this: +{input} + +Your original plan was this: +{plan} + +You have currently done the follow steps: +{past_steps} + +Update your plan accordingly. If no more steps are needed and you can return to the user, then respond with that. Otherwise, fill out the plan. Only add steps to the plan that still NEED to be done. Do not return previously done steps as part of the plan.""" + ) + + self.executor_prompt = ChatPromptTemplate.from_template( + """You are a world-class programmer and AI assistant capable of executing any goal related to software development, genAI, LLMs, and full-stack technologies.\ + For the given task, execute the task and return the result.\ + When you send a message containing code, it will be executed in a Docker container. You have been granted full permission to execute any code necessary to complete the task within this Docker environment using PythonRepl and shell tools as required.\ + You have access to a variety of tools, including browser, github_tools for interacting with GitHub, and multiple vectorstore instances. Utilize the browser for internet searches and github_tools for all interactions with GitHub repositories. For code execution, rely on PythonRepl and shell tools available in the Docker environment.\ + Before any execution task, prepare the development environment, whether that be a notebook, .sh, .py, .ipynb, .R, or other file types. 
Incrementally develop, execute, and debug the code, committing changes to GitHub regularly.\ + [User Info]: {user_info}\ + [Chat history]: {chat_history}\ + [Input]: {input}\ + [Agent scratchpad]: {agent_scratchpad}\ + """) + # hub.pull("hwchase17/openai-functions-agent") + + self.agent_runnable = create_openai_functions_agent(self.llm, self.tools, hub.pull("hwchase17/openai-functions-agent")) + self.agent_executor = create_agent_executor(self.agent_runnable, self.tools) + self.workflow = self.create_workflow() + + def create_workflow(self): + workflow = StateGraph(State) + + async def execute_step(state: State): + task = state["plan"][0] + agent_response = await self.agent_executor.ainvoke({"input": task, "chat_history": []}) + return {"past_steps": (task, agent_response["agent_outcome"].return_values["output"])} + + async def plan_step(state: State): + planner = create_structured_output_runnable(Plan, self.llm, self.planner_prompt) + plan = await planner.ainvoke({"objective": state["input"]}) + return {"plan": plan.steps} + + async def replan_step(state: State): + replanner = create_openai_fn_runnable([Plan, Response], self.llm, self.replanner_prompt) + output = await replanner.ainvoke(state) + if isinstance(output, Response): + return {"response": output.response} + else: + return {"plan": output.steps} + + def should_end(state: State): + if state["response"]: + return True + else: + return False + + workflow.add_node("planner", plan_step) + workflow.add_node("agent", execute_step) + workflow.add_node("replan", replan_step) + workflow.set_entry_point("planner") + workflow.add_edge("planner", "agent") + workflow.add_edge("agent", "replan") + workflow.add_conditional_edges("replan", should_end, {True: END, False: "agent"}) #type: ignore + + return workflow.compile().with_config({"recursion_limit": 100}) + + async def run(self, input_prompt): + inputs = {"input": input_prompt} + async for event in self.workflow.astream(inputs, config={"recursion_limit": 50}): + for k, v in event.items(): + if k != "__end__": + print(v) + + +# Example usage +# agent = WorkflowAgent() +# await agent.run("what is diff --git a/ai_ta_backend/agents/langgraph_lats.py b/ai_ta_backend/agents/langgraph_lats.py new file mode 100644 index 00000000..0c2b4b2c --- /dev/null +++ b/ai_ta_backend/agents/langgraph_lats.py @@ -0,0 +1,141 @@ +from __future__ import annotations + +from collections import deque +import math +from typing import List, Optional + +from langchain_core.messages import BaseMessage +from langchain_core.messages import HumanMessage +from langchain_core.pydantic_v1 import BaseModel +from langchain_core.pydantic_v1 import Field + + +class Reflection(BaseModel): + reflections: str = Field(description="The critique and reflections on the sufficiency, superfluency," + " and general quality of the response") + score: int = Field( + description="Score from 0-10 on the quality of the candidate response.", + gte=0, + lte=10, + ) + found_solution: bool = Field(description="Whether the response has fully solved the question or task.") + + def as_message(self): + return HumanMessage(content=f"Reasoning: {self.reflections}\nScore: {self.score}") + + @property + def normalized_score(self) -> float: + return self.score / 10.0 + + +class Node: + + def __init__( + self, + messages: List[BaseMessage], + reflection: Reflection, + parent: Optional[Node] = None, + ): + self.messages = messages + self.parent = parent + self.children = [] + self.value = 0 + self.visits = 0 + self.reflection = reflection + self.depth = 
parent.depth + 1 if parent is not None else 1 + self._is_solved = reflection.found_solution if reflection else False + if self._is_solved: + self._mark_tree_as_solved() + self.backpropagate(reflection.normalized_score) + + def __repr__(self) -> str: + return (f"") + + @property + def is_solved(self): + """If any solutions exist, we can end the search.""" + return self._is_solved + + @property + def is_terminal(self): + return not self.children + + @property + def best_child(self): + """Select the child with the highest UCT to search next.""" + if not self.children: + return None + return max(self.children, key=lambda child: child.upper_confidence_bound()) + + @property + def best_child_score(self): + """Return the child with the highest value.""" + if not self.children: + return None + return max(self.children, key=lambda child: int(child.is_solved) * child.value) + + @property + def height(self) -> int: + """Check for how far we've rolled out the tree.""" + if self.children: + return 1 + max([child.height for child in self.children]) + return 1 + + def upper_confidence_bound(self, exploration_weight=1.0): + """Return the UCT score. This helps balance exploration vs. exploitation of a branch.""" + if self.parent is None: + raise ValueError("Cannot obtain UCT from root node") + if self.visits == 0: + return self.value + # Encourages exploitation of high-value trajectories + average_reward = self.value / self.visits + # Encourages exploration of less-visited trajectories + exploration_term = math.sqrt(math.log(self.parent.visits) / self.visits) + return average_reward + exploration_weight * exploration_term + + def backpropagate(self, reward: float): + """Update the score of this node and its parents.""" + node = self + while node: + node.visits += 1 + node.value = (node.value * (node.visits - 1) + reward) / node.visits + node = node.parent + + def get_messages(self, include_reflections: bool = True): + if include_reflections: + return self.messages + [self.reflection.as_message()] + return self.messages + + def get_trajectory(self, include_reflections: bool = True) -> List[BaseMessage]: + """Get messages representing this search branch.""" + messages = [] + node = self + while node: + messages.extend(node.get_messages(include_reflections=include_reflections)[::-1]) + node = node.parent + # Reverse the final back-tracked trajectory to return in the correct order + return messages[::-1] # root solution, reflection, child 1, ... 
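+
+  # Note: get_best_solution() below does a breadth-first walk over this subtree and
+  # ranks solved terminal (leaf) nodes by their value; unsolved or non-leaf nodes
+  # score 0 in the max() key, so a solved leaf is preferred whenever one exists.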
+ + def get_best_solution(self): + """Return the best solution from within the current sub-tree.""" + all_nodes = [self] + nodes = deque() + nodes.append(self) + while nodes: + node = nodes.popleft() + all_nodes.extend(node.children) + for n in node.children: + nodes.append(n) + best_node = max( + all_nodes, + # We filter out all non-terminal, non-solution trajectories + key=lambda node: int(node.is_terminal and node.is_solved) * node.value, + ) + return best_node + + def _mark_tree_as_solved(self): + parent = self.parent + while parent: + parent._is_solved = True + parent = parent.parent diff --git a/ai_ta_backend/agents/ml4bio_agent.py b/ai_ta_backend/agents/ml4bio_agent.py new file mode 100644 index 00000000..b759d3e1 --- /dev/null +++ b/ai_ta_backend/agents/ml4bio_agent.py @@ -0,0 +1,68 @@ +import getpass +import os +import platform + +from langchain import hub +from langchain_community.chat_models import AzureChatOpenAI +from langchain_community.chat_models import ChatOpenAI +from langchain_experimental.plan_and_execute import load_agent_executor +from langchain_experimental.plan_and_execute import load_chat_planner +from langchain_experimental.plan_and_execute import PlanAndExecute + +from ai_ta_backend.agents.tools import get_tools +from ai_ta_backend.agents.utils import fancier_trim_intermediate_steps + + +def get_user_info_string(): + username = getpass.getuser() + current_working_directory = os.getcwd() + operating_system = platform.system() + default_shell = os.environ.get("SHELL") + + return f"[User Info]\nName: {username}\nCWD: {current_working_directory}\nSHELL: {default_shell}\nOS: {operating_system}" + + +class WorkflowAgent: + + def __init__(self, langsmith_run_id): + print("PlannerAndExecute agent initialized") + self.langsmith_run_id = langsmith_run_id + if os.environ['OPENAI_API_TYPE'] == 'azure': + self.llm = AzureChatOpenAI(temperature=0, + model="gpt-4-0613", + max_retries=3, + request_timeout=60 * 3, + deployment_name=os.environ['AZURE_OPENAI_ENGINE']) # type: ignore + else: + self.llm: ChatOpenAI = ChatOpenAI(temperature=0, model="gpt-4-0613", max_retries=500, request_timeout=60 * 3) # type: ignore + self.agent = self.make_agent() + + def run(self, input): + result = self.agent.with_config({ + "run_name": "ML4BIO Plan & Execute Agent" + }).invoke({"input": f"{input}"}, {"metadata": { + "langsmith_run_id": str(self.langsmith_run_id) + }}) + + print(f"Result: {result}") + return result + + def make_agent(self): + # TOOLS + tools = get_tools(langsmith_run_id=self.langsmith_run_id) + + # PLANNER + planner = load_chat_planner(self.llm, system_prompt=hub.pull("kastanday/ml4bio-rnaseq-planner").format(user_info=get_user_info_string)) + + # EXECUTOR + executor = load_agent_executor(self.llm, + tools, + verbose=True, + trim_intermediate_steps=fancier_trim_intermediate_steps, + handle_parsing_errors=True) + # executor = load_agent_executor(self.llm, tools, verbose=True, handle_parsing_errors=True) + + # Create PlanAndExecute Agent + workflow_agent = PlanAndExecute(planner=planner, executor=executor, verbose=True) + + return workflow_agent diff --git a/ai_ta_backend/agents/prompt_function.py b/ai_ta_backend/agents/prompt_function.py new file mode 100644 index 00000000..605347bb --- /dev/null +++ b/ai_ta_backend/agents/prompt_function.py @@ -0,0 +1,35 @@ +from langchain import hub + +from ai_ta_backend.agents.langgraph_agent import AgentState + + +def stateToPrompt(state: AgentState, token_limit: int = 8_000): + """ + Memory prompt: 
https://smith.langchain.com/hub/kastanday/memory_manager_agent + Inputs = ['github_issue', 'messages_with_human', 'plan', 'tool_use_history'] + """ + prompt_template = hub.pull("kastanday/memory_manager_agent") + print(prompt_template) + + # if + + return prompt_template.format( + # user_info=get_user_info_string(), + input=state['input'], + chat_history='\n'.join([f"User: {message.content}" for message in state['chat_history']]), + agent_outcome=state['agent_outcome'], + intermediate_steps='\n'.join([f"{action}: {observation}" for action, observation in state['intermediate_steps']]), + ) + + +if __name__ == '__main__': + a = AgentState({ + 'input': 'hello', + 'chat_history': [], + 'agent_outcome': None, + 'intermediate_steps': [], + 'plan': [], + }) + print(a) + ret = stateToPrompt(a) + print(ret) diff --git a/ai_ta_backend/agents/testing_langgraph.py b/ai_ta_backend/agents/testing_langgraph.py new file mode 100644 index 00000000..06024d98 --- /dev/null +++ b/ai_ta_backend/agents/testing_langgraph.py @@ -0,0 +1,51 @@ +""" +USAGE: +python -m ai_ta_backend.agents.testing_langgraph +""" + +import uuid + +from dotenv import load_dotenv +from langchain import hub + +from ai_ta_backend.agents.langrgraph_agent_v2 import PlanExecute +from ai_ta_backend.agents.langrgraph_agent_v2 import WorkflowAgent + +load_dotenv(override=True) + +# langchain.debug = True # True for more detailed logs +# VERBOSE = True + +import asyncio + +if __name__ == '__main__': + + async def main(): + id = uuid.uuid4() + a = WorkflowAgent(id) + await a.run("Write a function to calculate the mean of a list of numbers.") + + asyncio.run(main()) + +# print("-------- OPENAI_API_BASE", os.environ['OPENAI_API_BASE']) +# print("-------- OPENAI_API_TYPE", os.environ['OPENAI_API_TYPE']) +# print("-------- AZURE_ENDPOINT", os.environ['AZURE_ENDPOINT']) + + +def stateToPrompt(state: PlanExecute, token_limit: int = 8_000): + """ + Memory prompt: https://smith.langchain.com/hub/kastanday/memory_manager_agent + Inputs = ['github_issue', 'messages_with_human', 'plan', 'tool_use_history'] + """ + prompt_template = hub.pull("kastanday/memory_manager_agent") + print(prompt_template) + + # if + + return prompt_template.format( + # user_info=get_user_info_string(), + input=state['input'], + chat_history='\n'.join([f"User: {message.content}" for message in state['chat_history']]), + agent_outcome=state['response'], + intermediate_steps='\n'.join([f"{action}: {observation}" for action, observation in state['past_steps']]), # type: ignore + ) diff --git a/ai_ta_backend/agents/tool_executor.py b/ai_ta_backend/agents/tool_executor.py new file mode 100644 index 00000000..9e2aee39 --- /dev/null +++ b/ai_ta_backend/agents/tool_executor.py @@ -0,0 +1,69 @@ +from typing import Any, Sequence, Union + +from langchain_core.load.serializable import Serializable +from langchain_core.runnables import RunnableBinding +from langchain_core.runnables import RunnableLambda +from langchain_core.tools import BaseTool + +INVALID_TOOL_MSG_TEMPLATE = ("{requested_tool_name} is not a valid tool, " + "try one of [{available_tool_names_str}].") + + +class ToolInvocationInterface: + """Interface for invoking a tool""" + + tool: str + tool_input: Union[str, dict] + + +class ToolInvocation(Serializable): + """Information about how to invoke a tool.""" + + tool: str + """The name of the Tool to execute.""" + tool_input: Union[str, dict] + """The input to pass in to the Tool.""" + + +class ToolExecutor(RunnableBinding): + tools: Sequence[BaseTool] + tool_map: dict + 
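+  # Maps tool name -> tool instance; populated from `tools` in __init__ below.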
invalid_tool_msg_template: str + + def __init__( + self, + tools: Sequence[BaseTool], + *, + invalid_tool_msg_template: str = INVALID_TOOL_MSG_TEMPLATE, + **kwargs: Any, + ) -> None: + bound = RunnableLambda(self._execute, afunc=self._aexecute) + super().__init__( + bound=bound, + tools=tools, + tool_map={t.name: t for t in tools}, + invalid_tool_msg_template=invalid_tool_msg_template, + **kwargs, + ) + + def _execute(self, tool_invocation: ToolInvocationInterface) -> Any: + if tool_invocation.tool not in self.tool_map: + return self.invalid_tool_msg_template.format( + requested_tool_name=tool_invocation.tool, + available_tool_names_str=", ".join([t.name for t in self.tools]), + ) + else: + tool = self.tool_map[tool_invocation.tool] + output = tool.invoke(tool_invocation.tool_input) + return output + + async def _aexecute(self, tool_invocation: ToolInvocationInterface) -> Any: + if tool_invocation.tool not in self.tool_map: + return self.invalid_tool_msg_template.format( + requested_tool_name=tool_invocation.tool, + available_tool_names_str=", ".join([t.name for t in self.tools]), + ) + else: + tool = self.tool_map[tool_invocation.tool] + output = await tool.ainvoke(tool_invocation.tool_input) + return output diff --git a/ai_ta_backend/agents/tools.py b/ai_ta_backend/agents/tools.py new file mode 100644 index 00000000..e62b8680 --- /dev/null +++ b/ai_ta_backend/agents/tools.py @@ -0,0 +1,169 @@ +import os +from typing import List + +from dotenv import load_dotenv +import langchain +from langchain.agents import load_tools +from langchain.agents.agent_toolkits.github.toolkit import GitHubToolkit +from langchain.tools import BaseTool +from langchain.tools import StructuredTool +from langchain_community.tools import VectorStoreQATool +# from langchain.tools.playwright.utils import ( +# create_async_playwright_browser, +# create_sync_playwright_browser, +# ) +from langchain_community.utilities.github import GitHubAPIWrapper +from langchain_openai import AzureChatOpenAI +from langchain_openai import ChatOpenAI + +from ai_ta_backend.agents.code_intrepreter_sanbox import E2B_class +from ai_ta_backend.agents.vector_db import get_vectorstore_retriever_tool + +load_dotenv(override=True, dotenv_path='../../.env') + +os.environ["LANGCHAIN_TRACING"] = "true" # If you want to trace the execution of the program, set to "true" +langchain.debug = False # type: ignore +VERBOSE = True + + +def get_tools(langsmith_run_id: str, sync=True): + """Main function to assemble tools for ML for Bio project.""" + + # CODE EXECUTION - langsmith_run_id as unique identifier for the sandbox + code_execution_class = E2B_class(langsmith_run_id=langsmith_run_id) + e2b_python_execution_tool = StructuredTool.from_function( + func=code_execution_class.run_python_code, + name="Python-Code-Execution", + description="Executes Python3 code in an safe Docker container.", + ) + e2b_r_execution_tool = StructuredTool.from_function( + func=code_execution_class.run_r_code, + name="R-Code-Execution", + description="Executes R code in an safe Docker container.", + ) + e2b_shell_tool = StructuredTool.from_function( + func=code_execution_class.run_shell, + name="Shell-commands-except-for-git", + description= + "Run shell commands to, for example, execute shell scripts or R scripts. 
It is in the same environment as the Code Execution tool.", + ) + # AutoGen's Code Execution Tool + # def execute_code_tool(code: str, timeout: int = 60, filename: str = "execution_file.py", work_dir: str = "work_dir", use_docker: bool = True, lang: str = "python"): + # return execute_code(code, timeout, filename, work_dir, use_docker, lang) + + # SHELL & FILES + # shell = ShellTool() + # file_management = FileManagementToolkit( + # # If you don't provide a root_dir, operations will default to the current working directory + # # root_dir=str("/app") + # ).get_tools() + + # WEB BROWSER + # browser_toolkit = None + # if sync: + # sync_browser = create_sync_playwright_browser() + # browser_toolkit = PlayWrightBrowserToolkit.from_browser(sync_browser=sync_browser) + # else: + # # TODO async is work in progress... not functional yet. + # async_browser = create_async_playwright_browser() + # browser_toolkit = PlayWrightBrowserToolkit.from_browser(async_browser=async_browser) + # browser_tools = browser_toolkit.get_tools() + + # HUMAN + if os.environ['OPENAI_API_TYPE'] == 'azure': + AzureChatOpenAI( + temperature=0.1, + model="gpt-4-1106-Preview", + ) + # max_retries=3, + # request_timeout=60 * 3, + # deployment_name=os.environ['AZURE_OPENAI_ENGINE']) # type: ignore + else: + ChatOpenAI(temperature=0.1, model="gpt-4-0613", max_retries=3, request_timeout=60 * 3) # type: ignore + # human_tools = load_tools(["human"], llm=llm, input_func=get_human_input) + # GOOGLE SEARCH + search = load_tools(["serpapi"]) + + # GITHUB + github = GitHubAPIWrapper() # type: ignore + toolkit = GitHubToolkit.from_github_api_wrapper(github) + github_tools: list[BaseTool] = toolkit.get_tools() + + # TODO: more vector stores per Bio package: trimmomatic, gffread, samtools, salmon, DESeq2 and ggpubr + docs_tools: List[VectorStoreQATool] = [ + get_vectorstore_retriever_tool( + course_name='langchain-docs', + name='Langchain-docs', + description="Build context-aware, reasoning applications with LangChain's flexible abstractions and AI-first toolkit."), + get_vectorstore_retriever_tool( + course_name='ml4bio-star', + name='STAR-docs', + description='Basic STAR workflow consists of 2 steps: (1) Generating genome indexes files and (2) Mapping reads to the genome'), + get_vectorstore_retriever_tool( + course_name='ml4bio-fastqc', + name='FastQC-docs', + description= + 'FastQC aims to provide a simple way to do some quality control checks on raw sequence data coming from high throughput sequencing pipelines. It provides a modular set of analyses which you can use to give a quick impression of whether your data has any problems of which you should be aware before doing any further analysis. It works with data from BAM, SAM or FastQ files' + ), + get_vectorstore_retriever_tool( + course_name='ml4bio-multiqc', + name='MultiQC-docs', + description= + "MultiQC is a reporting tool that parses results and statistics from bioinformatics tool outputs, such as log files and console outputs. It helps to summarize experiments containing multiple samples and multiple analysis steps. It's designed to be placed at the end of pipelines or to be run manually when you've finished running your tools." + ), + get_vectorstore_retriever_tool( + course_name='ml4bio-bioconductor', + name='Bioconductor-docs', + description= + "Bioconductor is a project that contains hundreds of individual R packages. 
They're all high quality libraries that provide widespread access to a broad range of powerful statistical and graphical methods for the analysis of genomic data. Some of them also facilitate the inclusion of biological metadata in the analysis of genomic data, e.g. literature data from PubMed, annotation data from Entrez genes." + ), + ] + + # ARXIV SEARCH + # Probably unnecessary: WikipediaQueryRun, WolframAlphaQueryRun, PubmedQueryRun, ArxivQueryRun + # arxiv_tool = ArxivQueryRun() + + tools: list[BaseTool] = github_tools + search + docs_tools + [e2b_python_execution_tool, e2b_r_execution_tool, e2b_shell_tool + ] # browser_tools + + return tools + + +############# HELPERS ################ +# def _should_check(serialized_obj: dict) -> bool: +# # Only require approval on ShellTool. +# return serialized_obj.get("name") == "terminal" + +# def _approve(_input: str) -> bool: +# if _input == "echo 'Hello World'": +# return True +# msg = ("Do you approve of the following input? " +# "Anything except 'Y'/'Yes' (case-insensitive) will be treated as a no.") +# msg += "\n\n" + _input + "\n" +# resp = input(msg) +# return resp.lower() in ("yes", "y") + + +def get_human_input() -> str: + """Placeholder for Slack/GH-Comment input from user.""" + print("Insert your text. Enter 'q' or press Ctrl-D (or Ctrl-Z on Windows) to end.") + contents = [] + while True: + try: + line = input() + except EOFError: + break + if line == "q": + break + contents.append(line) + return "\n".join(contents) + + +if __name__ == "__main__": + tools = get_tools(sync=True, langsmith_run_id="MY RUN ID FROM OUTSIDE") + # print(tools) + # print("SCHEMA: ", tools.args_schema.schema_json(indent=2)) + if type(tools) == List: + # raise Exception("No tools found.") + pass + else: + tools[0].run("print('Hello World from inside the tools.run() function!')") diff --git a/ai_ta_backend/agents/utils.py b/ai_ta_backend/agents/utils.py new file mode 100644 index 00000000..5d021e0c --- /dev/null +++ b/ai_ta_backend/agents/utils.py @@ -0,0 +1,218 @@ +import inspect +import logging +import os +import time +import traceback +from typing import List, Tuple + +from langchain.schema import AgentAction +import langsmith +from langsmith import Client +import tiktoken + + +def fancier_trim_intermediate_steps(steps: List[Tuple[AgentAction, str]]) -> List[Tuple[AgentAction, str]]: + """ + Trim the history of Agent steps to fit within the token limit. + If we're over the limit, start removing the logs from the oldest actions first. then remove the tool_input from the oldest actions. then remove the tool from the oldest actions. then remove the oldest actions entirely. To remove any of these, just set it as an empty string. + + Args: + steps (List[Tuple[AgentAction, str]]): A list of agent actions and associated strings. + + Returns: + List[Tuple[AgentAction, str]]: A list of the most recent actions that fit within the token limit. 
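+
+  Example (illustrative; `steps` is the agent's accumulated (action, observation) history):
+    trimmed_steps = fancier_trim_intermediate_steps(steps)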
+ """ + try: + + def count_tokens(action: AgentAction) -> int: + return sum(count_tokens_and_cost(str(getattr(action, attr)))[0] for attr in ['tool', 'tool_input', 'log']) + + token_limit = 4_000 + total_tokens = sum(count_tokens(action) for action, _ in steps) + + # for logging + sum(count_tokens(action) for action, _ in steps) + steps.copy() + + # Remove the logs if over the limit + if total_tokens > token_limit: + for action, _ in steps: + action.log = '' + total_tokens = sum(count_tokens(action) for action, _ in steps) + if total_tokens <= token_limit: + break + + # Remove the tool_input if over the limit + if total_tokens > token_limit: + for action, _ in steps: + action.tool_input = '' + total_tokens = sum(count_tokens(action) for action, _ in steps) + if total_tokens <= token_limit: + break + + # Remove the tool if over the limit + if total_tokens > token_limit: + for action, _ in steps: + action.tool = '' + total_tokens = sum(count_tokens(action) for action, _ in steps) + if total_tokens <= token_limit: + break + + # Remove the oldest actions if over the limit + while total_tokens > token_limit: + steps.pop(0) + total_tokens = sum(count_tokens(action) for action, _ in steps) + + # log = Log(message=f"trim_intermediate_steps", + # original_steps=str(original_steps), + # final_steps=str(steps), + # original_tokens=original_total_tokens, + # final_tokens=total_tokens, + # ) + # response = log_client.send(log) + # response.raise_for_status() + + return steps + except Exception as e: + print("-----------❌❌❌❌------------START OF ERROR-----------❌❌❌❌------------") + print(f"Error in {inspect.currentframe().f_code.co_name}: {e}") # type: ignore # print function name in error. + print("Traceback:") + traceback.print_exc() + return [steps[-1]] + + +def get_langsmit_run_from_metadata(metadata_value, metadata_key="run_id_in_metadata") -> langsmith.schemas.Run: + """This will only return the FIRST match on single metadta field + + Args: + metadata_key (str, optional): _description_. Defaults to "run_id_in_metadata". + metadata_value (str, optional): _description_. Defaults to "b187061b-afd7-40ab-a918-705cf16219c3". + + Returns: + Run: _description_ + """ + langsmith_client = Client() + runs = langsmith_client.list_runs(project_name=os.environ['LANGCHAIN_PROJECT']) + + count = 0 + for _r in runs: + count += 1 + print(f"Found num runs: {count}") + + for run in langsmith_client.list_runs(project_name=os.environ['LANGCHAIN_PROJECT']): + if run.extra and run.extra.get('metadata') and run.extra.get('metadata').get(metadata_key) == metadata_value: + # return the 'top-level' of the trace (keep getting runs' parents until at top) + if run.parent_run_id: + curr_run = run + while curr_run.parent_run_id: + curr_run = langsmith_client.read_run(str(curr_run.parent_run_id)) + return curr_run + else: + return run + + +def get_langsmith_trace_sharable_url(run_id_in_metadata, project_name='', time_delay_s=0): + """ + + Adding metadata to runs: https://docs.smith.langchain.com/tracing/tracing-faq#how-do-i-add-metadata-to-runs + + Background: + A 'Trace' is a collection of runs organized in a tree or graph. The 'Root Run' is the top level run in a trace. + https://docs.smith.langchain.com/tracing/tracing-faq + + Args: + project (_type_): _description_ + """ + time.sleep(time_delay_s) + if project_name == '': + project_name = os.environ['LANGCHAIN_PROJECT'] + + langsmith_client = Client() + + # re-attempt to find the run, maybe it hasn't started yet. 
+ run = None + for _i in range(8): + run = get_langsmit_run_from_metadata(str(run_id_in_metadata), metadata_key="run_id_in_metadata") + if run is not None: + break + time.sleep(5) + + if run is None: + return f"Failed to generate sharable URL, cannot find this run on LangSmith. RunID: {run_id_in_metadata}" + + if not langsmith_client.run_is_shared(run.id): + sharable_url = langsmith_client.share_run(run_id=run.id) + else: + sharable_url = langsmith_client.read_run_shared_link(run_id=run.id) + logging.info(f'⭐️ sharable_url: {sharable_url}') + return sharable_url + + +def count_tokens_and_cost(prompt: str, + completion: str = '', + openai_model_name: str = "gpt-3.5-turbo"): # -> tuple[int, float] | tuple[int, float, int, float]: + """ + Returns the number of tokens in a text string. + + Only the first parameter is required, a string of text to measure. The completion and model name are optional. + + num_tokens, prompt_cost = count_tokens_and_cost(prompt="hello there") + num_tokens_prompt, prompt_cost, num_tokens_completion, completion_cost = count_tokens_and_cost(prompt="hello there", completion="how are you?") + + Args: + prompt (str): _description_ + completion (str, optional): _description_. Defaults to ''. + openai_model_name (str, optional): _description_. Defaults to "gpt-3.5-turbo". + + Returns: + tuple[int, float] | tuple[int, float, int, float]: Returns the number of tokens consumed and the cost. The total cost you'll be billed is the sum of each individual cost (prompt_cost + completion_cost) + """ + # encoding = tiktoken.encoding_for_model(openai_model_name) + openai_model_name = openai_model_name.lower() + encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") # I think they all use the same encoding + prompt_cost = 0 + completion_cost = 0 + + prompt_token_cost = 0 + completion_token_cost = 0 + + if openai_model_name.startswith("gpt-3.5-turbo"): + if "16k" in openai_model_name: + prompt_token_cost: float = 0.003 / 1_000 + completion_token_cost: float = 0.004 / 1_000 + else: + # 3.5-turbo regular (4k context) + prompt_token_cost: float = 0.0015 / 1_000 + completion_token_cost: float = 0.002 / 1_000 + + elif openai_model_name.startswith("gpt-4"): + if "32k" in openai_model_name: + prompt_token_cost = 0.06 / 1_000 + completion_token_cost = 0.12 / 1_000 + else: + # gpt-4 regular (8k context) + prompt_token_cost = 0.03 / 1_000 + completion_token_cost = 0.06 / 1_000 + elif openai_model_name.startswith("text-embedding-ada-002"): + prompt_token_cost = 0.0001 / 1_000 + completion_token_cost = 0.0001 / 1_000 + else: + # no idea of cost + print(f"NO IDEA OF COST, pricing not supported for model model: `{openai_model_name}`. 
(Defaulting to GPT-4 pricing...)") + prompt_token_cost = 0.03 / 1_000 + completion_token_cost = 0.06 / 1_000 + + if completion == '': + num_tokens_prompt: int = len(encoding.encode(prompt)) + prompt_cost = float(prompt_token_cost * num_tokens_prompt) + return num_tokens_prompt, prompt_cost + elif prompt == '': + num_tokens_completion: int = len(encoding.encode(completion)) + completion_cost = float(completion_token_cost * num_tokens_completion) + return num_tokens_completion, completion_cost + else: + num_tokens_prompt: int = len(encoding.encode(prompt)) + num_tokens_completion: int = len(encoding.encode(completion)) + prompt_cost = float(prompt_token_cost * num_tokens_prompt) + completion_cost = float(completion_token_cost * num_tokens_completion) + return num_tokens_prompt, prompt_cost, num_tokens_completion, completion_cost diff --git a/ai_ta_backend/agents/vector_db.py b/ai_ta_backend/agents/vector_db.py new file mode 100644 index 00000000..65320d3a --- /dev/null +++ b/ai_ta_backend/agents/vector_db.py @@ -0,0 +1,121 @@ +import inspect +import os +import traceback + +from dotenv import load_dotenv +import langchain +from langchain.agents import AgentType +from langchain.agents import initialize_agent +from langchain.agents import Tool +from langchain.agents.react.base import DocstoreExplorer +from langchain.docstore.base import Docstore +from langchain_community.tools import VectorStoreQATool +from langchain_community.vectorstores import Qdrant +from langchain_openai import AzureChatOpenAI +from langchain_openai import AzureOpenAIEmbeddings +from langchain_openai import ChatOpenAI +from langchain_openai import OpenAIEmbeddings +from qdrant_client import QdrantClient + +load_dotenv(override=True) + +langchain.debug = False +VERBOSE = True + + +def get_vectorstore_retriever_tool(course_name: str, + name: str, + description: str, + openai_model_name='gpt-3.5-turbo-16k', + temperature=0.1, + top_k=8) -> VectorStoreQATool: + r""" + course name str: Name of course on uiuc-chat as appears in URL-bar; yes it's case sensitive. + + Usage: + ``` + QAtool = get_vectorstore_retriever_tool(course_name='langchain-docs') + print(QAtool._run("query")) + print("FINAL RESULT\n", get_vectorstore_retriever_tool(search_query="How do Plan and Execute agents work in Langchain?", course_name='langchain-docs')) + ``` + + langchain_docs_tool._run(search_query) + """ + try: + qdrant_client = QdrantClient( + url=os.getenv('QDRANT_URL'), + api_key=os.getenv('QDRANT_API_KEY'), + ) + + langchain_docs_vectorstore = Qdrant( + client=qdrant_client, + collection_name=os.getenv('QDRANT_COLLECTION_NAME'), # type: ignore + embeddings=AzureOpenAIEmbeddings() if os.environ['OPENAI_API_TYPE'] == 'azure' else OpenAIEmbeddings()) + + if os.environ['OPENAI_API_TYPE'] == 'azure': + llm = AzureChatOpenAI(temperature=0, + model="gpt-4-0613", + max_retries=3, + request_timeout=60 * 3, + deployment_name=os.environ['AZURE_OPENAI_ENGINE']) + else: + llm: ChatOpenAI = ChatOpenAI( + temperature=0, + model="gpt-4-0613", + max_retries=500, + # request_timeout=60 * 3, + streaming=True) + + return VectorStoreQATool( + vectorstore=langchain_docs_vectorstore, + llm=llm, # type: ignore + name=name, + description=description, + retriever_kwargs={'filter': { + 'course_name': course_name, + 'k': top_k + }}) + except Exception as e: + # return full traceback to front end + print( + f"In /getTopContexts. 
Course: {course_name} \nTraceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" + ) # type: ignore + raise e + + +def get_docstore_agent(docstore: Docstore): + """This returns an agent. Usage of this agent: react.run(question) + e.g. + ``` + question = "Author David Chanoff has collaborated with a U.S. Navy admiral who served as the ambassador to the United Kingdom under which President?" + react.run(question) + ``` + """ + if docstore is None: + doc_explorer = DocstoreExplorer(langchain.Wikipedia()) + else: + doc_explorer = DocstoreExplorer(docstore) + + tools = [ + Tool( + name="Search", + func=doc_explorer.search, + description="useful for when you need to ask with search", + ), + Tool( + name="Lookup", + func=doc_explorer.lookup, + description="useful for when you need to ask with lookup", + ), + ] + + if os.environ['OPENAI_API_TYPE'] == 'azure': + llm = AzureChatOpenAI(temperature=0, + model="gpt-4-0613", + max_retries=3, + request_timeout=60 * 3, + deployment_name=os.environ['AZURE_OPENAI_ENGINE']) # type: ignore + else: + llm = ChatOpenAI(temperature=0, model="gpt-4-0613", max_retries=3, request_timeout=60 * 3) # type: ignore + react = initialize_agent(tools, llm, agent=AgentType.REACT_DOCSTORE, verbose=VERBOSE) + return react diff --git a/ai_ta_backend/arize_logging.py b/ai_ta_backend/arize_logging.py deleted file mode 100644 index 19a7c4d9..00000000 --- a/ai_ta_backend/arize_logging.py +++ /dev/null @@ -1,78 +0,0 @@ -from arize.api import Client -from arize.pandas.embeddings import EmbeddingGenerator, UseCases -# from arize.utils import ModelTypes -# from arize.utils.ModelTypes import GENERATIVE_LLM -from arize.utils.types import (Embedding, EmbeddingColumnNames, Environments, - Metrics, ModelTypes, Schema) - -# self.arize_client = Client(space_key=os.getenv('ARIZE_SPACE_KEY'), api_key=os.getenv('ARIZE_API_KEY')) # type: ignore - -def log_to_arize(self, course_name: str, user_question: str, llm_completion: str) -> str: - """ - Use LangChain map_reduce_QA to implement this in parallel. - Write a function that takes in a question, and returns a very long "stuffed" prompt for GPT-4 to answer on the front-end. (You only construct the prompt for GPT-4, you don't actually return the answer). - - References: - Example & Docs: https://python.langchain.com/en/latest/modules/chains/index_examples/question_answering.html#the-map-reduce-chain - Code: https://github.com/hwchase17/langchain/blob/4092fd21dcabd1de273ad902fae2186ae5347e03/langchain/chains/question_answering/map_reduce_prompt.py#L11 - """ - return f"TODO: Implement me! 
You asked for: {course_name}" - import pandas as pd - - features = { - 'state': 'wa', - 'city': 'seattle', - 'merchant_name': 'Starbucks Coffee', - 'pos_approved': True, - 'item_count': 2, - 'merchant_type': 'coffee shop', - 'charge_amount': 22.11, - } - - #example tags - tags = { - 'age': 21, - 'zip_code': '94610', - 'device_os': 'MacOS', - 'server_node_id': 120, - } - - #example embeddings - embedding_features = { - # 'image_embedding': Embedding( - # vector=np.array([1.0, 2, 3]), # type: ignore - # link_to_data='https://my-bucket.s3.us-west-2.amazonaws.com/puppy.png', - # ), - 'prompt': Embedding( - vector=pd.Series([6.0, 1.0, 2.0, 6.0]), # type: ignore - data='slightly different This is a test sentence', - ), - 'completion': Embedding( - vector=pd.Series([15.0, 10.0, 1.0, 9.0]), # type: ignore - data=['slightly', 'different', 'This', 'is', 'a', 'sample', 'token', 'array'], - ), - } - - #log the prediction - response = self.arize_client.log( - prediction_id=str(uuid.uuid4()), - prediction_label=llm_completion, - model_id='kas-model-1', - # model_type=ModelTypes.GENERATIVE_LLM, # I think this is a bug. - model_type=ModelTypes.SCORE_CATEGORICAL, - environment=Environments.PRODUCTION, - model_version='v1', - prediction_timestamp=int(datetime.datetime.now().timestamp()), - features=features, - embedding_features=embedding_features, - tags=tags, - ) - - ## Listen to response code to ensure successful delivery - res = response.result() - if res.status_code == 200: - print('Success sending Prediction!') - return "Success logging to Arize!" - else: - print(f'Log failed with response code {res.status_code}, {res.text}') - return f'Log failed with response code {res.status_code}, {res.text}' diff --git a/ai_ta_backend/aws.py b/ai_ta_backend/aws.py index 66fb0bbe..64909cd9 100644 --- a/ai_ta_backend/aws.py +++ b/ai_ta_backend/aws.py @@ -1,54 +1,55 @@ -import os -from multiprocessing import Lock, cpu_count -from multiprocessing.pool import ThreadPool -from typing import List, Optional - -import boto3 - - -def upload_data_files_to_s3(course_name: str, localdir: str) -> Optional[List[str]]: - """Uploads all files in localdir to S3 bucket. - - Args: - course_name (str): Official course name on our website. - localdir (str): Local directory to upload from, coursera-dl downloads to this directory. - - Returns: - Optional[List[str]]: A list of S3 paths, the final resting place of uploads, or None if no files were uploaded. - """ - s3 = boto3.client( - 's3', - aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), - aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), - ) - - filenames = [] - for root, subdirs, files in os.walk(localdir): - for filename in files: - filenames.append(os.path.join(root, filename)) - - if not filenames: - print(f"No files to upload. Not found in: {localdir}") - return None - - print(f"Files to upload: {filenames}") - print("About to upload...") - - s3_paths = [] - s3_paths_lock = Lock() - - def upload(myfile): - s3_file = f"courses/{course_name}/{os.path.basename(myfile)}" - s3.upload_file(myfile, os.getenv('S3_BUCKET_NAME'), s3_file) - with s3_paths_lock: - s3_paths.append(s3_file) - - # only 2 parallel uploads because we're getting rate limited with min_p=6... 503 errors. 
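In `upload_data_files_to_s3`, `num_procs = max(min(len(filenames), max_p), min_p)` puts a floor of 2 on the thread pool but still lets it grow toward `cpu_count()` when many files are queued, so the "only 2 parallel uploads" intent stated in the comment above is not actually enforced. A minimal sketch of a clamp that treats 2 as a ceiling instead (the `MAX_PARALLEL_UPLOADS` constant is hypothetical):

```
from multiprocessing import cpu_count

MAX_PARALLEL_UPLOADS = 2  # hypothetical cap; tune to the observed S3 rate limit


def pool_size(num_files: int) -> int:
  # Never more workers than files, and never more than the cap or the core count.
  return max(1, min(num_files, MAX_PARALLEL_UPLOADS, cpu_count()))
```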
- min_p = 2 - max_p = cpu_count() - num_procs = max(min(len(filenames), max_p), min_p) - pool = ThreadPool(processes=num_procs) - pool.map(upload, filenames) - - print("All data files uploaded to S3 successfully.") - return s3_paths \ No newline at end of file +from multiprocessing import cpu_count +from multiprocessing import Lock +from multiprocessing.pool import ThreadPool +import os +from typing import List, Optional + +import boto3 + + +def upload_data_files_to_s3(course_name: str, localdir: str) -> Optional[List[str]]: + """Uploads all files in localdir to S3 bucket. + + Args: + course_name (str): Official course name on our website. + localdir (str): Local directory to upload from, coursera-dl downloads to this directory. + + Returns: + Optional[List[str]]: A list of S3 paths, the final resting place of uploads, or None if no files were uploaded. + """ + s3 = boto3.client( + 's3', + aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), + aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), + ) + + filenames = [] + for root, _subdirs, files in os.walk(localdir): + for filename in files: + filenames.append(os.path.join(root, filename)) + + if not filenames: + print(f"No files to upload. Not found in: {localdir}") + return None + + print(f"Files to upload: {filenames}") + print("About to upload...") + + s3_paths = [] + s3_paths_lock = Lock() + + def upload(myfile): + s3_file = f"courses/{course_name}/{os.path.basename(myfile)}" + s3.upload_file(myfile, os.getenv('S3_BUCKET_NAME'), s3_file) + with s3_paths_lock: + s3_paths.append(s3_file) + + # only 2 parallel uploads because we're getting rate limited with min_p=6... 503 errors. + min_p = 2 + max_p = cpu_count() + num_procs = max(min(len(filenames), max_p), min_p) + pool = ThreadPool(processes=num_procs) + pool.map(upload, filenames) + + print("All data files uploaded to S3 successfully.") + return s3_paths diff --git a/ai_ta_backend/extreme_context_stuffing.py b/ai_ta_backend/extreme_context_stuffing.py index 887928b0..238f130c 100644 --- a/ai_ta_backend/extreme_context_stuffing.py +++ b/ai_ta_backend/extreme_context_stuffing.py @@ -96,20 +96,17 @@ # import tempfile # from langchain.llms import OpenAI import asyncio +from dataclasses import \ + dataclass # for storing API inputs, outputs, and metadata +from dataclasses import field import json import logging -import os import re import time -from dataclasses import ( # for storing API inputs, outputs, and metadata - dataclass, field) from typing import Any, List import aiohttp # for making API calls concurrently import tiktoken # for counting tokens -from langchain.embeddings.openai import OpenAIEmbeddings -from langchain.vectorstores import Qdrant -from qdrant_client import QdrantClient, models class OpenAIAPIProcessor: @@ -154,11 +151,11 @@ async def process_api_requests_from_file(self): # initialize flags file_not_finished = True # after file is empty, we'll skip reading it - logging.debug(f"Initialization complete.") + logging.debug("Initialization complete.") requests = self.input_prompts_list.__iter__() - logging.debug(f"File opened. Entering main loop") + logging.debug("File opened. Entering main loop") task_list = [] @@ -244,7 +241,7 @@ async def process_api_requests_from_file(self): ) # after finishing, log final status - logging.info(f"""Parallel processing complete. About to return.""") + logging.info("""Parallel processing complete. 
About to return.""") if status_tracker.num_tasks_failed > 0: logging.warning(f"{status_tracker.num_tasks_failed} / {status_tracker.num_tasks_started} requests failed.") if status_tracker.num_rate_limit_errors > 0: diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py index 6bc7862a..8f3b490b 100644 --- a/ai_ta_backend/main.py +++ b/ai_ta_backend/main.py @@ -1,318 +1,368 @@ -import os -import re -import time -from typing import Any, List, Union - -from dotenv import load_dotenv -from flask import Flask, jsonify, request -from flask_cors import CORS -from h11 import Response -# from qdrant_client import QdrantClient -from sqlalchemy import JSON - -from ai_ta_backend.vector_database import Ingest -from ai_ta_backend.web_scrape import main_crawler, mit_course_download - -app = Flask(__name__) -CORS(app) - -# load API keys from globally-availabe .env file -# load_dotenv(dotenv_path='.env', override=True) -load_dotenv() - -@app.route('/') -def index() -> JSON: - """_summary_ - - Args: - test (int, optional): _description_. Defaults to 1. - - Returns: - JSON: _description_ - """ - return jsonify({"Choo Choo": "Welcome to your Flask app 🚅"}) - - -@app.route('/coursera', methods=['GET']) -def coursera() -> JSON: - try: - course_name: str = request.args.get('course_name') # type: ignore - coursera_course_name: str = request.args.get('coursera_course_name') # type: ignore - except Exception as e: - print(f"No course name provided: {e}") - - ingester = Ingest() - results = ingester.ingest_coursera(coursera_course_name, course_name) # type: ignore - response = jsonify(results) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - -@app.route('/github', methods=['GET']) -def github() -> JSON: - try: - course_name: str = request.args.get('course_name') # type: ignore - github_url: str = request.args.get('github_url') # type: ignore - except Exception as e: - print(f"No course name provided: {e}") - - print("In /github") - ingester = Ingest() - results = ingester.ingest_github(github_url, course_name) - response = jsonify(results) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - -@app.route('/delete-entire-course', methods=['GET']) -def delete_entire_course(): - try: - course_name: str = request.args.get('course_name') # type: ignore - # coursera_course_name: str = request.args.get('coursera_course_name') # type: ignore - except Exception as e: - print(f"No course name provided: {e}") - - ingester = Ingest() - results = ingester.delete_entire_course(course_name) # type: ignore - response = jsonify(results) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/getTopContexts', methods=['GET']) -def getTopContexts(): - """Get most relevant contexts for a given search query. - - Return value - - ## GET arguments - course name (optional) str - A json response with TBD fields. - search_query - top_n - - Returns - ------- - JSON - A json response with TBD fields. - Metadata fileds - * pagenumber_or_timestamp - * readable_filename - * s3_pdf_path - - Example: - [ - { - 'readable_filename': 'Lumetta_notes', - 'pagenumber_or_timestamp': 'pg. 19', - 's3_pdf_path': '/courses//Lumetta_notes.pdf', - 'text': 'In FSM, we do this...' - }, - ] - - Raises - ------ - Exception - Testing how exceptions are handled. - """ - # todo: best way to handle optional arguments? 
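On the `todo` comment directly above: `request.args.get` does not raise for a missing key, it returns `None`, so the surrounding `try`/`except` never fires for absent parameters. Werkzeug's `args.get` also accepts `default` and `type`, which handles optional GET arguments more directly. A minimal sketch under those assumptions, reusing the same parameter names (the route name is hypothetical):

```
from flask import Flask, jsonify, request

app = Flask(__name__)


@app.route('/getTopContextsSketch', methods=['GET'])
def get_top_contexts_sketch():
  # Optional GET args with defaults and type coercion; no try/except needed.
  course_name = request.args.get('course_name', default='', type=str)
  search_query = request.args.get('search_query', default=None, type=str)
  token_limit = request.args.get('token_limit', default=3_000, type=int)

  if not search_query:
    return jsonify({"error": "No parameter `search_query` provided. It is undefined."})
  return jsonify({"course_name": course_name, "token_limit": token_limit})
```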
- try: - course_name: str = request.args.get('course_name') - search_query: str = request.args.get('search_query') - token_limit: int = request.args.get('token_limit') - except Exception as e: - print("No course name provided.") - - if search_query is None: - return jsonify({"error": "No parameter `search_query` provided. It is undefined."}) - if token_limit is None: - token_limit = 3_000 - else: - token_limit = int(token_limit) - - ingester = Ingest() - found_documents = ingester.getTopContexts(search_query, course_name, token_limit) - - response = jsonify(found_documents) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - -@app.route('/get_stuffed_prompt', methods=['GET']) -def get_stuffed_prompt(): - """Get most relevant contexts for a given search query. - - ## GET arguments - course name (optional) str - A json response with TBD fields. - search_query - top_n - - Returns - ------- - String - - """ - # todo: best way to handle optional arguments? - try: - course_name: str = request.args.get('course_name') - search_query: str = request.args.get('search_query') - token_limit: int = request.args.get('token_limit') - except Exception as e: - print("No course name provided.") - - print("In /getTopContexts: ", search_query) - if search_query is None: - return jsonify({"error": "No parameter `search_query` provided. It is undefined."}) - if token_limit is None: - token_limit = 3_000 - else: - token_limit = int(token_limit) - - ingester = Ingest() - prompt = ingester.get_stuffed_prompt(search_query, course_name, token_limit) - - response = jsonify(prompt) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/ingest', methods=['GET']) -def ingest(): - """Recursively ingests anything from S3 filepath and below. - Pass a s3_paths filepath (not URL) into our S3 bucket. - - Ingests all files, not just PDFs. - - args: - s3_paths: str | List[str] - - Returns: - str: Success or Failure message. Failure message if any failures. TODO: email on failure. - """ - - print("In /ingest") - - ingester = Ingest() - s3_paths: List[str] | str = request.args.get('s3_paths') - course_name: List[str] | str = request.args.get('course_name') - success_fail_dict = ingester.bulk_ingest(s3_paths, course_name) - - response = jsonify(success_fail_dict) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/getContextStuffedPrompt', methods=['GET']) -def getContextStuffedPrompt(): - """ - Get a stuffed prompt for a given user question and course name. - Args : - search_query (str) - course_name (str) : used for metadata filtering - Returns : str - a very long "stuffed prompt" with question + summaries of 20 most relevant documents. 
- """ - print("In /getContextStuffedPrompt") - - - ingester = Ingest() - search_query: str = str(request.args.get('search_query')) # type: ignore - course_name: str = str(request.args.get('course_name')) # type: ignore - top_n: int = int(request.args.get('top_n')) # type: ignore - top_k_to_search: int = int(request.args.get('top_k_to_search')) # type: ignore - - start_time = time.monotonic() - stuffed_prompt = ingester.get_context_stuffed_prompt(search_query, course_name, top_n, top_k_to_search) - print(f"⏰ Runtime of EXTREME prompt stuffing: {(time.monotonic() - start_time):.2f} seconds") - response = jsonify({"prompt": stuffed_prompt}) - - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/getAll', methods=['GET']) -def getAll(): - """Get all course materials based on the course_name - """ - - print("In /getAll") - - ingester = Ingest() - course_name: List[str] | str = request.args.get('course_name') - distinct_dicts = ingester.getAll(course_name) - response = jsonify({"all_s3_paths": distinct_dicts}) - - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -#Write api to delete s3 files for a course -@app.route('/delete', methods=['DELETE']) -def delete(): - """Delete all course materials based on the course_name - """ - - print("In /delete") - - ingester = Ingest() - course_name: List[str] | str = request.args.get('course_name') - s3_path: str = request.args.get('s3_path') - success_or_failure = ingester.delete_data(s3_path, course_name) - response = jsonify({"outcome": success_or_failure}) - - response.headers.add('Access-Control-Allow-Origin', '*') - return response - - -@app.route('/log', methods=['GET']) -def log(): - """ - todo - """ - - print("In /log") - - ingester = Ingest() - # course_name: List[str] | str = request.args.get('course_name') - success_or_failure = ingester.log_to_arize('course_name', 'test', 'completion') - response = jsonify({"outcome": success_or_failure}) - - response.headers.add('Access-Control-Allow-Origin', '*') - return response - -@app.route('/web-scrape', methods=['GET']) -def scrape(): - url: str = request.args.get('url') - max_urls:int = request.args.get('max_urls') - max_depth:int = request.args.get('max_depth') - timeout:int = request.args.get('timeout') - course_name: str = request.args.get('course_name') - base_url_bool: str = request.args.get('base_url_on') - - # print all input params - print(f"Web scrape!") - print(f"Url: {url}") - print(f"Max Urls: {max_urls}") - print(f"Max Depth: {max_depth}") - print(f"Timeout in Seconds ⏰: {timeout}") - - success_fail_dict = main_crawler(url, course_name, max_urls, max_depth, timeout, base_url_bool) - - response = jsonify(success_fail_dict) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - -@app.route('/mit-download', methods=['GET']) -def mit_download_course(): - url:str = request.args.get('url') - course_name:str = request.args.get('course_name') - local_dir:str = request.args.get('local_dir') - - success_fail = mit_course_download(url, course_name,local_dir) - - response = jsonify(success_fail) - response.headers.add('Access-Control-Allow-Origin', '*') - return response - -# TODO: add a way to delete items from course based on base_url - -if __name__ == '__main__': - app.run(debug=True, port=os.getenv("PORT", default=8000)) +import asyncio +import json +import time +from typing import List + +from dotenv import load_dotenv +from flask import Flask +from flask import jsonify +from flask import request +from 
flask_cors import CORS +import ray +# from qdrant_client import QdrantClient +from sqlalchemy import JSON + +from ai_ta_backend.agents.github_webhook_handlers import handle_github_event +from ai_ta_backend.vector_database import Ingest +from ai_ta_backend.web_scrape import main_crawler +from ai_ta_backend.web_scrape import mit_course_download + +app = Flask(__name__) +CORS(app) + +# load API keys from globally-availabe .env file +load_dotenv(dotenv_path='.env', override=True) + +ray.init() + +# @app.route('/') +# def index() -> JSON: +# """_summary_ + +# Args: +# test (int, optional): _description_. Defaults to 1. + +# Returns: +# JSON: _description_ +# """ +# return jsonify({"Choo Choo": "Welcome to your Flask app 🚅"}) + + +@app.route('/coursera', methods=['GET']) +def coursera() -> JSON: + try: + course_name: str = request.args.get('course_name') # type: ignore + coursera_course_name: str = request.args.get('coursera_course_name') # type: ignore + except Exception as e: + print(f"No course name provided: {e}") + + ingester = Ingest() + results = ingester.ingest_coursera(coursera_course_name, course_name) # type: ignore + response = jsonify(results) + response.headers.add('Access-Control-Allow-Origin', '*') + return response + + +@app.route('/github', methods=['GET']) +def github() -> JSON: + try: + course_name: str = request.args.get('course_name') # type: ignore + github_url: str = request.args.get('github_url') # type: ignore + except Exception as e: + print(f"No course name provided: {e}") + + print("In /github") + ingester = Ingest() + results = ingester.ingest_github(github_url, course_name) + response = jsonify(results) + response.headers.add('Access-Control-Allow-Origin', '*') + return response + + +@app.route('/delete-entire-course', methods=['GET']) +def delete_entire_course(): + try: + course_name: str = request.args.get('course_name') # type: ignore + # coursera_course_name: str = request.args.get('coursera_course_name') # type: ignore + except Exception as e: + print(f"No course name provided: {e}") + + ingester = Ingest() + results = ingester.delete_entire_course(course_name) # type: ignore + response = jsonify(results) + response.headers.add('Access-Control-Allow-Origin', '*') + return response + + +@app.route('/getTopContexts', methods=['GET']) +def getTopContexts(): + """Get most relevant contexts for a given search query. + + Return value + + ## GET arguments + course name (optional) str + A json response with TBD fields. + search_query + top_n + + Returns + ------- + JSON + A json response with TBD fields. + Metadata fileds + * pagenumber_or_timestamp + * readable_filename + * s3_pdf_path + + Example: + [ + { + 'readable_filename': 'Lumetta_notes', + 'pagenumber_or_timestamp': 'pg. 19', + 's3_pdf_path': '/courses//Lumetta_notes.pdf', + 'text': 'In FSM, we do this...' + }, + ] + + Raises + ------ + Exception + Testing how exceptions are handled. + """ + # todo: best way to handle optional arguments? + try: + course_name: str = request.args.get('course_name') + search_query: str = request.args.get('search_query') + token_limit: int = request.args.get('token_limit') + except Exception: + print("No course name provided.") + + if search_query is None: + return jsonify({"error": "No parameter `search_query` provided. 
It is undefined."}) + if token_limit is None: + token_limit = 3_000 + else: + token_limit = int(token_limit) + + ingester = Ingest() + found_documents = ingester.getTopContexts(search_query, course_name, token_limit) + + response = jsonify(found_documents) + response.headers.add('Access-Control-Allow-Origin', '*') + return response + + +@app.route('/get_stuffed_prompt', methods=['GET']) +def get_stuffed_prompt(): + """Get most relevant contexts for a given search query. + + ## GET arguments + course name (optional) str + A json response with TBD fields. + search_query + top_n + + Returns + ------- + String + + """ + # todo: best way to handle optional arguments? + try: + course_name: str = request.args.get('course_name') + search_query: str = request.args.get('search_query') + token_limit: int = request.args.get('token_limit') + except Exception: + print("No course name provided.") + + print("In /getTopContexts: ", search_query) + if search_query is None: + return jsonify({"error": "No parameter `search_query` provided. It is undefined."}) + if token_limit is None: + token_limit = 3_000 + else: + token_limit = int(token_limit) + + ingester = Ingest() + prompt = ingester.get_stuffed_prompt(search_query, course_name, token_limit) + + response = jsonify(prompt) + response.headers.add('Access-Control-Allow-Origin', '*') + return response + + +@app.route('/ingest', methods=['GET']) +def ingest(): + """Recursively ingests anything from S3 filepath and below. + Pass a s3_paths filepath (not URL) into our S3 bucket. + + Ingests all files, not just PDFs. + + args: + s3_paths: str | List[str] + + Returns: + str: Success or Failure message. Failure message if any failures. TODO: email on failure. + """ + + print("In /ingest") + + ingester = Ingest() + s3_paths: List[str] | str = request.args.get('s3_paths') + course_name: List[str] | str = request.args.get('course_name') + success_fail_dict = ingester.bulk_ingest(s3_paths, course_name) + + response = jsonify(success_fail_dict) + response.headers.add('Access-Control-Allow-Origin', '*') + return response + + +@app.route('/getContextStuffedPrompt', methods=['GET']) +def getContextStuffedPrompt(): + """ + Get a stuffed prompt for a given user question and course name. + Args : + search_query (str) + course_name (str) : used for metadata filtering + Returns : str + a very long "stuffed prompt" with question + summaries of 20 most relevant documents. 
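An example call against this route once the Flask app is running (the host, course name, and query values below are hypothetical):

```
import requests

resp = requests.get(
    "http://localhost:8000/getContextStuffedPrompt",  # hypothetical local deployment
    params={
        "search_query": "How does a finite state machine work?",
        "course_name": "ECE120",
        "top_n": 5,
        "top_k_to_search": 25,
    })
print(resp.json()["prompt"])
```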
+ """ + print("In /getContextStuffedPrompt") + + ingester = Ingest() + search_query: str = str(request.args.get('search_query')) # type: ignore + course_name: str = str(request.args.get('course_name')) # type: ignore + top_n: int = int(request.args.get('top_n')) # type: ignore + top_k_to_search: int = int(request.args.get('top_k_to_search')) # type: ignore + + start_time = time.monotonic() + stuffed_prompt = ingester.get_context_stuffed_prompt(search_query, course_name, top_n, top_k_to_search) + print(f"⏰ Runtime of EXTREME prompt stuffing: {(time.monotonic() - start_time):.2f} seconds") + response = jsonify({"prompt": stuffed_prompt}) + + response.headers.add('Access-Control-Allow-Origin', '*') + return response + + +@app.route('/getAll', methods=['GET']) +def getAll(): + """Get all course materials based on the course_name + """ + + print("In /getAll") + + ingester = Ingest() + course_name: List[str] | str = request.args.get('course_name') + distinct_dicts = ingester.getAll(course_name) + response = jsonify({"all_s3_paths": distinct_dicts}) + + response.headers.add('Access-Control-Allow-Origin', '*') + return response + + +#Write api to delete s3 files for a course +@app.route('/delete', methods=['DELETE']) +def delete(): + """Delete all course materials based on the course_name + """ + + print("In /delete") + + ingester = Ingest() + course_name: List[str] | str = request.args.get('course_name') + s3_path: str = request.args.get('s3_path') + success_or_failure = ingester.delete_data(s3_path, course_name) + response = jsonify({"outcome": success_or_failure}) + + response.headers.add('Access-Control-Allow-Origin', '*') + return response + + +@app.route('/log', methods=['GET']) +def log(): + """ + todo + """ + + print("In /log") + + ingester = Ingest() + # course_name: List[str] | str = request.args.get('course_name') + success_or_failure = ingester.log_to_arize('course_name', 'test', 'completion') + response = jsonify({"outcome": success_or_failure}) + + response.headers.add('Access-Control-Allow-Origin', '*') + return response + + +@app.route('/web-scrape', methods=['GET']) +def scrape(): + url: str = request.args.get('url') + max_urls: int = request.args.get('max_urls') + max_depth: int = request.args.get('max_depth') + timeout: int = request.args.get('timeout') + course_name: str = request.args.get('course_name') + base_url_bool: str = request.args.get('base_url_on') + + # print all input params + print("Web scrape!") + print(f"Url: {url}") + print(f"Max Urls: {max_urls}") + print(f"Max Depth: {max_depth}") + print(f"Timeout in Seconds ⏰: {timeout}") + + success_fail_dict = main_crawler(url, course_name, max_urls, max_depth, timeout, base_url_bool) + + response = jsonify(success_fail_dict) + response.headers.add('Access-Control-Allow-Origin', '*') + return response + + +@app.route('/mit-download', methods=['GET']) +def mit_download_course(): + url: str = request.args.get('url') + course_name: str = request.args.get('course_name') + local_dir: str = request.args.get('local_dir') + + success_fail = mit_course_download(url, course_name, local_dir) + + response = jsonify(success_fail) + response.headers.add('Access-Control-Allow-Origin', '*') + return response + + +# TODO: add a way to delete items from course based on base_url + + +@app.route('/', methods=['POST']) # RUN: $ smee -u https://smee.io/nRnJDGnCbWYUaSGg --port 8000 +# @app.route('/api/webhook', methods=['POST']) # https://flask-ai-ta-backend-pr-34.up.railway.app/api/webhook +async def webhook(): + """ + IN PROGRESS: Github App 
Webhooks (for lil-jr-dev) + Wehbook URL to use on my github app (if this route is `/api/webhook`): https://flask-ai-ta-backend-pr-34.up.railway.app/api/webhook + + DOCS: + API reference for Webhook objects: https://docs.github.com/en/webhooks-and-events/webhooks/webhook-events-and-payloads#issue_comment + WEBHOOK explainer: https://docs.github.com/en/apps/creating-github-apps/registering-a-github-app/using-webhooks-with-github-apps + """ + + payload = request.json + print("Payload received...") + print(payload) + + with open('payload.json', 'w') as file: + json.dump(payload, file) + print("Saved payload to file.") + + # FOR LOCAL TESTING, USE THIS PAYLOAD: + # payload = '' + # with open('payload.json', 'r') as file: + # payload = json.load(file) + + await handle_github_event(payload) + + return '', 200 + + +async def main(): + # await handle_github_event() + f = open('UIUC-Chatbot/ai-ta-backend/sample.json') + payload = json.load(f) + await handle_github_event(payload) + pass + + +if __name__ == '__main__': + #app.run(debug=True, port=os.getenv("PORT", default=8000)) + asyncio.run(main()) diff --git a/ai_ta_backend/utils_tokenization.py b/ai_ta_backend/utils_tokenization.py index 7c36c2f9..6ff60538 100644 --- a/ai_ta_backend/utils_tokenization.py +++ b/ai_ta_backend/utils_tokenization.py @@ -1,12 +1,13 @@ -import json import os -from typing import Any, List +from typing import Any import supabase import tiktoken -def count_tokens_and_cost(prompt: str, completion: str = '', openai_model_name: str = "gpt-3.5-turbo"): # -> tuple[int, float] | tuple[int, float, int, float]: +def count_tokens_and_cost(prompt: str, + completion: str = '', + openai_model_name: str = "gpt-3.5-turbo"): # -> tuple[int, float] | tuple[int, float, int, float]: """ Returns the number of tokens in a text string. @@ -25,13 +26,13 @@ def count_tokens_and_cost(prompt: str, completion: str = '', openai_model_name: """ # encoding = tiktoken.encoding_for_model(openai_model_name) openai_model_name = openai_model_name.lower() - encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") # I think they all use the same encoding + encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") # I think they all use the same encoding prompt_cost = 0 completion_cost = 0 - - prompt_token_cost = 0 + + prompt_token_cost = 0 completion_token_cost = 0 - + if openai_model_name.startswith("gpt-3.5-turbo"): if "16k" in openai_model_name: prompt_token_cost: float = 0.003 / 1_000 @@ -40,7 +41,7 @@ def count_tokens_and_cost(prompt: str, completion: str = '', openai_model_name: # 3.5-turbo regular (4k context) prompt_token_cost: float = 0.0015 / 1_000 completion_token_cost: float = 0.002 / 1_000 - + elif openai_model_name.startswith("gpt-4"): if "32k" in openai_model_name: prompt_token_cost = 0.06 / 1_000 @@ -52,12 +53,12 @@ def count_tokens_and_cost(prompt: str, completion: str = '', openai_model_name: elif openai_model_name.startswith("text-embedding-ada-002"): prompt_token_cost = 0.0001 / 1_000 completion_token_cost = 0.0001 / 1_000 - else: + else: # no idea of cost - print(f"NO IDEA OF COST, pricing not supported for model model: `{openai_model_name}`") - prompt_token_cost = 0 - completion_token_cost = 0 - + print(f"NO IDEA OF COST, pricing not supported for model model: `{openai_model_name}`. 
(Defaulting to GPT-4 pricing...)") + prompt_token_cost = 0.03 / 1_000 + completion_token_cost = 0.06 / 1_000 + if completion == '': num_tokens_prompt: int = len(encoding.encode(prompt)) prompt_cost = float(prompt_token_cost * num_tokens_prompt) @@ -73,59 +74,62 @@ def count_tokens_and_cost(prompt: str, completion: str = '', openai_model_name: completion_cost = float(completion_token_cost * num_tokens_completion) return num_tokens_prompt, prompt_cost, num_tokens_completion, completion_cost + # from dotenv import load_dotenv # load_dotenv() + def analyze_conversations(supabase_client: Any = None): - if supabase_client is None: - supabase_client = supabase.create_client( # type: ignore - supabase_url=os.getenv('SUPABASE_URL'), # type: ignore - supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore - # Get all conversations - response = supabase_client.table('llm-convo-monitor').select('convo').execute() - # print("total entries", response.data.count) - - total_convos = 0 - total_messages = 0 - total_prompt_cost = 0 - total_completion_cost = 0 - - # Iterate through all conversations - # for convo in response['data']: - for convo in response.data: - total_convos += 1 - # print(convo) - # prase json from convo - # parse json into dict - # print(type(convo)) - # convo = json.loads(convo) - convo = convo['convo'] - messages = convo['messages'] - model_name = convo['model']['name'] - - # Iterate through all messages in each conversation - for message in messages: - total_messages += 1 - role = message['role'] - content = message['content'] - - # If the message is from the user, it's a prompt - if role == 'user': - num_tokens, cost = count_tokens_and_cost(prompt=content, openai_model_name=model_name) - total_prompt_cost += cost - print(f'User Prompt: {content}, Tokens: {num_tokens}, cost: {cost}') - - # If the message is from the assistant, it's a completion - elif role == 'assistant': - num_tokens_completion, cost_completion = count_tokens_and_cost(prompt='', completion=content, openai_model_name=model_name) - total_completion_cost += cost_completion - print(f'Assistant Completion: {content}\nTokens: {num_tokens_completion}, cost: {cost_completion}') - return total_convos, total_messages, total_prompt_cost, total_completion_cost - + if supabase_client is None: + supabase_client = supabase.create_client( # type: ignore + supabase_url=os.getenv('SUPABASE_URL'), # type: ignore + supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore + # Get all conversations + response = supabase_client.table('llm-convo-monitor').select('convo').execute() + # print("total entries", response.data.count) + + total_convos = 0 + total_messages = 0 + total_prompt_cost = 0 + total_completion_cost = 0 + + # Iterate through all conversations + # for convo in response['data']: + for convo in response.data: + total_convos += 1 + # print(convo) + # prase json from convo + # parse json into dict + # print(type(convo)) + # convo = json.loads(convo) + convo = convo['convo'] + messages = convo['messages'] + model_name = convo['model']['name'] + + # Iterate through all messages in each conversation + for message in messages: + total_messages += 1 + role = message['role'] + content = message['content'] + + # If the message is from the user, it's a prompt + if role == 'user': + num_tokens, cost = count_tokens_and_cost(prompt=content, openai_model_name=model_name) + total_prompt_cost += cost + print(f'User Prompt: {content}, Tokens: {num_tokens}, cost: {cost}') + + # If the message is from the assistant, it's a completion + elif 
role == 'assistant': + num_tokens_completion, cost_completion = count_tokens_and_cost(prompt='', completion=content, openai_model_name=model_name) + total_completion_cost += cost_completion + print(f'Assistant Completion: {content}\nTokens: {num_tokens_completion}, cost: {cost_completion}') + return total_convos, total_messages, total_prompt_cost, total_completion_cost + + if __name__ == '__main__': print('starting main') total_convos, total_messages, total_prompt_cost, total_completion_cost = analyze_conversations() print(f'total_convos: {total_convos}, total_messages: {total_messages}') - print(f'total_prompt_cost: {total_prompt_cost}, total_completion_cost: {total_completion_cost}') \ No newline at end of file + print(f'total_prompt_cost: {total_prompt_cost}, total_completion_cost: {total_completion_cost}') diff --git a/ai_ta_backend/vector_database.py b/ai_ta_backend/vector_database.py index 4c998f19..cd3f0c96 100644 --- a/ai_ta_backend/vector_database.py +++ b/ai_ta_backend/vector_database.py @@ -1,992 +1,987 @@ -import asyncio -import inspect -import mimetypes -# import json -import os -import shutil -import subprocess -import time -import traceback -from pathlib import Path -from tempfile import NamedTemporaryFile # TemporaryFile -from typing import Any, Dict, List, Optional, Tuple, Union # Literal - -import boto3 -# import requests -import fitz -import openai -import requests -import supabase -from bs4 import BeautifulSoup -# from arize.api import Client -# from arize.pandas.embeddings import EmbeddingGenerator, UseCases -# from arize.utils import ModelTypes -# from arize.utils.ModelTypes import GENERATIVE_LLM -# # from arize.utils.types import (Embedding, EmbeddingColumnNames, Environments, -# # Metrics, ModelTypes, Schema) -from langchain.document_loaders import (Docx2txtLoader, PythonLoader, - SRTLoader, - UnstructuredPowerPointLoader, TextLoader, GitLoader) -from langchain.embeddings.openai import OpenAIEmbeddings -from langchain.schema import Document -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain.vectorstores import Qdrant -from pydub import AudioSegment -from qdrant_client import QdrantClient, models - -from git import Repo - -from ai_ta_backend.aws import upload_data_files_to_s3 -from ai_ta_backend.extreme_context_stuffing import OpenAIAPIProcessor -from ai_ta_backend.utils_tokenization import count_tokens_and_cost - - -class Ingest(): - """ - Contains all methods for building and using vector databases. - """ - - def __init__(self): - """ - Initialize AWS S3, Qdrant, and Supabase. - """ - - # vector DB - self.qdrant_client = QdrantClient( - url=os.getenv('QDRANT_URL'), - api_key=os.getenv('QDRANT_API_KEY'), - ) - - self.vectorstore = Qdrant( - client=self.qdrant_client, - collection_name=os.getenv('QDRANT_COLLECTION_NAME'), # type: ignore - embeddings=OpenAIEmbeddings()) - - # S3 - self.s3_client = boto3.client( - 's3', - aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), - aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), - ) - - # Create a Supabase client - self.supabase_client = supabase.create_client( # type: ignore - supabase_url=os.getenv('SUPABASE_URL'), # type: ignore - supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore - return None - - def get_context_stuffed_prompt(self, user_question: str, course_name: str, top_n: int, top_k_to_search: int) -> str: - """ - Get a stuffed prompt for a given user question and course name. 
- Args : - user_question (str) - course_name (str) : used for metadata filtering - Returns : str - a very long "stuffed prompt" with question + summaries of top_n most relevant documents. - """ - # MMR with metadata filtering based on course_name - vec_start_time = time.monotonic() - found_docs = self.vectorstore.max_marginal_relevance_search(user_question, k=top_n, fetch_k=top_k_to_search) - print( - f"⏰ MMR Search runtime (top_n_to_keep: {top_n}, top_k_to_search: {top_k_to_search}): {(time.monotonic() - vec_start_time):.2f} seconds" - ) - - requests = [] - for i, doc in enumerate(found_docs): - dictionary = { - "model": "gpt-3.5-turbo", - "messages": [{ - "role": - "system", - "content": - "You are a factual summarizer of partial documents. Stick to the facts (including partial info when necessary to avoid making up potentially incorrect details), and say I don't know when necessary." - }, { - "role": - "user", - "content": - f"Provide a comprehensive summary of the given text, based on this question:\n{doc.page_content}\nQuestion: {user_question}\nThe summary should cover all the key points that are relevant to the question, while also condensing the information into a concise format. The length of the summary should be as short as possible, without losing relevant information.\nMake use of direct quotes from the text.\nFeel free to include references, sentence fragments, keywords or anything that could help someone learn about it, only as it relates to the given question.\nIf the text does not provide information to answer the question, please write 'None' and nothing else.", - }], - "n": 1, - "max_tokens": 600, - "metadata": doc.metadata - } - requests.append(dictionary) - - oai = OpenAIAPIProcessor(input_prompts_list=requests, - request_url='https://api.openai.com/v1/chat/completions', - api_key=os.getenv("OPENAI_API_KEY"), - max_requests_per_minute=1500, - max_tokens_per_minute=90000, - token_encoding_name='cl100k_base', - max_attempts=5, - logging_level=20) - - chain_start_time = time.monotonic() - asyncio.run(oai.process_api_requests_from_file()) - results: list[str] = oai.results - print(f"⏰ EXTREME context stuffing runtime: {(time.monotonic() - chain_start_time):.2f} seconds") - - print(f"Cleaned results: {oai.cleaned_results}") - - all_texts = "" - separator = '---' # between each context - token_counter = 0 #keeps track of tokens in each summarization - max_tokens = 7_500 #limit, will keep adding text to string until 8000 tokens reached. - for i, text in enumerate(oai.cleaned_results): - if text.lower().startswith('none') or text.lower().endswith('none.') or text.lower().endswith('none'): - # no useful text, it replied with a summary of "None" - continue - if text is not None: - num_tokens, prompt_cost = count_tokens_and_cost(text) - if token_counter + num_tokens > max_tokens: - print(f"Total tokens yet in loop {i} is {num_tokens}") - break # Stop building the string if it exceeds the maximum number of tokens - token_counter += num_tokens - filename = str(results[i][-1].get('readable_filename', '')) # type: ignore - pagenumber_or_timestamp = str(results[i][-1].get('pagenumber_or_timestamp', '')) # type: ignore - pagenumber = f", page: {pagenumber_or_timestamp}" if pagenumber_or_timestamp else '' - doc = f"Document : filename: {filename}" + pagenumber - summary = f"\nSummary: {text}" - all_texts += doc + summary + '\n' + separator + '\n' - - stuffed_prompt = f"""Please answer the following question. 
-Use the context below, called 'your documents', only if it's helpful and don't use parts that are very irrelevant. -It's good to quote 'your documents' directly using informal citations, like "in document X it says Y". Try to avoid giving false or misleading information. Feel free to say you don't know. -Try to be helpful, polite, honest, sophisticated, emotionally aware, and humble-but-knowledgeable. -That said, be practical and really do your best, and don't let caution get too much in the way of being useful. -To help answer the question, here's a few passages of high quality documents:\n{all_texts} -Now please respond to my question: {user_question}""" - -# "Please answer the following question. It's good to quote 'your documents' directly, something like 'from ABS source it says XYZ' Feel free to say you don't know. \nHere's a few passages of the high quality 'your documents':\n" - - return stuffed_prompt - - # def ai_summary(self, text: List[str], metadata: List[Dict[str, Any]]) -> List[str]: - # """ - # Given a textual input, return a summary of the text. - # """ - # #print("in AI SUMMARY") - # requests = [] - # for i in range(len(text)): - # dictionary = { - # "model": "gpt-3.5-turbo", - # "messages": [{ - # "role": - # "system", - # "content": - # "You are a factual summarizer of partial documents. Stick to the facts (including partial info when necessary to avoid making up potentially incorrect details), and say I don't know when necessary." - # }, { - # "role": - # "user", - # "content": - # f"Provide a descriptive summary of the given text:\n{text[i]}\nThe summary should cover all the key points, while also condensing the information into a concise format. The length of the summary should not exceed 3 sentences.", - # }], - # "n": 1, - # "max_tokens": 600, - # "metadata": metadata[i] - # } - # requests.append(dictionary) - - # oai = OpenAIAPIProcessor(input_prompts_list=requests, - # request_url='https://api.openai.com/v1/chat/completions', - # api_key=os.getenv("OPENAI_API_KEY"), - # max_requests_per_minute=1500, - # max_tokens_per_minute=90000, - # token_encoding_name='cl100k_base', - # max_attempts=5, - # logging_level=20) - - # asyncio.run(oai.process_api_requests_from_file()) - # #results: list[str] = oai.results - # #print(f"Cleaned results: {oai.cleaned_results}") - # summary = oai.cleaned_results - # return summary - - - def bulk_ingest(self, s3_paths: Union[List[str], str], course_name: str, **kwargs) -> Dict[str, List[str]]: - # https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/microsoft_word.html - success_status = {"success_ingest": [], "failure_ingest": []} - - try: - if isinstance(s3_paths, str): - s3_paths = [s3_paths] - - for s3_path in s3_paths: - ext = Path(s3_path).suffix # check mimetype of file - # TODO: no need to download, just guess_type against the s3_path... 
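On the TODO above: `mimetypes.guess_type` accepts any path-like string, so the MIME category can be derived from the S3 key itself, without first downloading the object. A minimal sketch of that idea, assuming the same category/subcategory routing:

```
import mimetypes
from pathlib import Path


def classify_s3_key(s3_path: str) -> tuple[str, str]:
  """Return (category, subcategory), e.g. ('video', 'mp4'), from the key alone."""
  mime_type, _encoding = mimetypes.guess_type(s3_path)
  if mime_type is None:
    # Unknown type: fall back to the bare file extension.
    return 'unknown', Path(s3_path).suffix.lstrip('.')
  category, subcategory = mime_type.split('/')
  return category, subcategory
```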
- with NamedTemporaryFile(suffix=ext) as tmpfile: - self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile) - mime_type = mimetypes.guess_type(tmpfile.name)[0] - category, subcategory = mime_type.split('/') - - if s3_path.endswith('.html'): - ret = self._ingest_html(s3_path, course_name, kwargs=kwargs) - if ret != "Success": - success_status['failure_ingest'].append(s3_path) - else: - success_status['success_ingest'].append(s3_path) - elif s3_path.endswith('.py'): - ret = self._ingest_single_py(s3_path, course_name) - if ret != "Success": - success_status['failure_ingest'].append(s3_path) - else: - success_status['success_ingest'].append(s3_path) - elif s3_path.endswith('.vtt'): - ret = self._ingest_single_vtt(s3_path, course_name) - if ret != "Success": - success_status['failure_ingest'].append(s3_path) - else: - success_status['success_ingest'].append(s3_path) - elif s3_path.endswith('.pdf'): - ret = self._ingest_single_pdf(s3_path, course_name, kwargs=kwargs) - if ret != "Success": - success_status['failure_ingest'].append(s3_path) - else: - success_status['success_ingest'].append(s3_path) - elif s3_path.endswith('.txt'): - ret = self._ingest_single_txt(s3_path, course_name) - if ret != "Success": - success_status['failure_ingest'].append(s3_path) - else: - success_status['success_ingest'].append(s3_path) - elif s3_path.endswith('.srt'): - ret = self._ingest_single_srt(s3_path, course_name) - if ret != "Success": - success_status['failure_ingest'].append(s3_path) - else: - success_status['success_ingest'].append(s3_path) - elif s3_path.endswith('.docx'): - ret = self._ingest_single_docx(s3_path, course_name) - if ret != "Success": - success_status['failure_ingest'].append(s3_path) - else: - success_status['success_ingest'].append(s3_path) - elif s3_path.endswith('.ppt') or s3_path.endswith('.pptx'): - ret = self._ingest_single_ppt(s3_path, course_name) - if ret != "Success": - success_status['failure_ingest'].append(s3_path) - else: - success_status['success_ingest'].append(s3_path) - elif category == 'video' or category == 'audio': - ret = self._ingest_single_video(s3_path, course_name) - if ret != "Success": - success_status['failure_ingest'].append(s3_path) - else: - success_status['success_ingest'].append(s3_path) - return success_status - except Exception as e: - success_status['failure_ingest'].append("MAJOR ERROR IN /bulk_ingest: Error: " + str(e)) - return success_status - - def _ingest_single_py(self, s3_path: str, course_name: str): - try: - with NamedTemporaryFile() as tmpfile: - # download from S3 into vtt_tmpfile - self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile) - loader = PythonLoader(tmpfile.name) - documents = loader.load() - texts = [doc.page_content for doc in documents] - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': s3_path, - 'readable_filename': Path(s3_path).name, - 'pagenumber_or_timestamp': '', - } for doc in documents] - - success_or_failure = self.split_and_upload(texts=texts, metadatas=metadatas) - return success_or_failure - except Exception as e: - print(f"ERROR IN py READING {e}") - - def _ingest_single_vtt(self, s3_path: str, course_name: str): - """ - Ingest a single .vtt file from S3. 
- """ - try: - with NamedTemporaryFile() as tmpfile: - # download from S3 into vtt_tmpfile - self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile) - loader = TextLoader(tmpfile.name) - documents = loader.load() - texts = [doc.page_content for doc in documents] - - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': s3_path, - 'readable_filename': Path(s3_path).name, - 'pagenumber_or_timestamp': '', - } for doc in documents] - - success_or_failure = self.split_and_upload(texts=texts, metadatas=metadatas) - return success_or_failure - except Exception as e: - print(f"ERROR IN VTT READING {e}") - - def _ingest_html(self, s3_path: str, course_name: str, **kwargs) -> str: - try: - response = self.s3_client.get_object(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path) - raw_html = response['Body'].read().decode('utf-8') - - soup = BeautifulSoup(raw_html, 'html.parser') - title = s3_path.replace("courses/"+course_name, "") - title = title.replace(".html", "") - title = title.replace("_", " ") - title = title.replace("/", " ") - title = title.strip() - - if kwargs['kwargs'] == {}: - url = '' - base_url = '' - else: - if 'url' in kwargs['kwargs'].keys(): - url = kwargs['kwargs']['url'] - else: - url = '' - if 'base_url' in kwargs['kwargs'].keys(): - base_url = kwargs['kwargs']['base_url'] - else: - base_url = '' - title = str(object=time.localtime()[1])+ "/" + str(time.localtime()[2]) + "/" + str(time.localtime()[0])[2:] + ' ' + str(title) - - text = [soup.get_text()] - metadata: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': s3_path, - 'readable_filename': str(title), # adding str to avoid error: unhashable type 'slice' - 'url': url, - 'base_url': base_url, - 'pagenumber_or_timestamp': '' - }] - - success_or_failure = self.split_and_upload(text, metadata) - print(f"_ingest_html: {success_or_failure}") - return success_or_failure - except Exception as e: - err: str = f"ERROR IN _ingest_html: {e}\nTraceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - return f"_ingest_html Error: {e}" - - def _ingest_single_video(self, s3_path: str, course_name: str) -> str: - """ - Ingest a single video file from S3. 
- """ - try: - # check for file extension - file_ext = Path(s3_path).suffix - print(file_ext[1:]) - - openai.api_key = os.getenv('OPENAI_API_KEY') - transcript_list = [] - #print(os.getcwd()) - with NamedTemporaryFile(suffix=file_ext) as video_tmpfile: - # download from S3 into an video tmpfile - self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=video_tmpfile) - # extract audio from video tmpfile - mp4_version = AudioSegment.from_file(video_tmpfile.name, file_ext[1:]) - #print("Video file: ", video_tmpfile.name) - - # save the extracted audio as a temporary webm file - with NamedTemporaryFile(suffix=".webm", dir="media", delete=False) as webm_tmpfile: - mp4_version.export(webm_tmpfile, format="webm") - #print("WEBM file: ", webm_tmpfile.name) - - # check file size - file_size = os.path.getsize(webm_tmpfile.name) - # split the audio into 25MB chunks - if file_size > 26214400: - # load the webm file into audio object - full_audio = AudioSegment.from_file(webm_tmpfile.name, "webm") - file_count = file_size // 26214400 + 1 - split_segment = 35 * 60 * 1000 - start = 0 - count = 0 - - while count < file_count: - with NamedTemporaryFile(suffix=".webm", dir="media", delete=False) as split_tmp: - #print("Splitting file: ", split_tmp.name) - if count == file_count - 1: - # last segment - audio_chunk = full_audio[start:] - else: - audio_chunk = full_audio[start:split_segment] - - audio_chunk.export(split_tmp.name, format="webm") - - # transcribe the split file and store the text in dictionary - with open(split_tmp.name, "rb") as f: - transcript = openai.Audio.transcribe("whisper-1", f) - transcript_list.append(transcript['text']) # type: ignore - start += split_segment - split_segment += split_segment - count += 1 - os.remove(split_tmp.name) - else: - # transcribe the full audio - with open(webm_tmpfile.name, "rb") as f: - transcript = openai.Audio.transcribe("whisper-1", f) - transcript_list.append(transcript['text']) # type: ignore - - os.remove(webm_tmpfile.name) - - text = [txt for txt in transcript_list] - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': s3_path, - 'readable_filename': Path(s3_path).name, - 'pagenumber_or_timestamp': text.index(txt), - } for txt in text] - - self.split_and_upload(texts=text, metadatas=metadatas) - return "Success" - except Exception as e: - print("ERROR IN VIDEO READING ") - print(e) - return f"Error {e}" - - def _ingest_single_docx(self, s3_path: str, course_name: str) -> str: - try: - with NamedTemporaryFile() as tmpfile: - # download from S3 into pdf_tmpfile - print("Bucket: ", os.getenv('S3_BUCKET_NAME')) - print("Key: ", s3_path) - self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile) - print("GOT THE FILE") - print(tmpfile.name) - - loader = Docx2txtLoader(tmpfile.name) - documents = loader.load() - - texts = [doc.page_content for doc in documents] - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': s3_path, - 'readable_filename': Path(s3_path).name, - 'pagenumber_or_timestamp': '', - } for doc in documents] - - self.split_and_upload(texts=texts, metadatas=metadatas) - return "Success" - except Exception as e: - print(f"ERROR IN DOCX {e}") - return f"Error: {e}" - - def _ingest_single_srt(self, s3_path: str, course_name: str) -> str: - try: - with NamedTemporaryFile() as tmpfile: - # download from S3 into pdf_tmpfile - self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, 
Fileobj=tmpfile) - - loader = SRTLoader(tmpfile.name) - documents = loader.load() - - texts = [doc.page_content for doc in documents] - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': s3_path, - 'readable_filename': Path(s3_path).name, - 'pagenumber_or_timestamp': '', - } for doc in documents] - - self.split_and_upload(texts=texts, metadatas=metadatas) - return "Success" - except Exception as e: - print(f"SRT ERROR {e}") - return f"Error: {e}" - - def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs): - """ - Both OCR the PDF. And grab the first image as a PNG. - LangChain `Documents` have .metadata and .page_content attributes. - Be sure to use TemporaryFile() to avoid memory leaks! - """ - try: - with NamedTemporaryFile() as pdf_tmpfile: - # download from S3 into pdf_tmpfile - self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=pdf_tmpfile) - ### READ OCR of PDF - doc = fitz.open(pdf_tmpfile.name) # type: ignore - - # improve quality of the image - zoom_x = 2.0 # horizontal zoom - zoom_y = 2.0 # vertical zoom - mat = fitz.Matrix(zoom_x, zoom_y) # zoom factor 2 in each dimension - - pdf_pages_OCRed: List[Dict] = [] - for i, page in enumerate(doc): # type: ignore - - # UPLOAD FIRST PAGE IMAGE to S3 - if i == 0: - with NamedTemporaryFile(suffix=".png") as first_page_png: - pix = page.get_pixmap(matrix=mat) - pix.save(first_page_png) # store image as a PNG - - s3_upload_path = str(Path(s3_path)).rsplit('.pdf')[0] + "-pg1-thumb.png" - first_page_png.seek(0) # Seek the file pointer back to the beginning - with open(first_page_png.name, 'rb') as f: - print("Uploading image png to S3") - self.s3_client.upload_fileobj(f, os.getenv('S3_BUCKET_NAME'), s3_upload_path) - - # Extract text - text = page.get_text().encode("utf8").decode('ascii', errors='ignore') # get plain text (is in UTF-8) - pdf_pages_OCRed.append(dict(text=text, page_number=i, readable_filename=Path(s3_path).name)) - - if kwargs['kwargs'] == {}: - url = '' - base_url = '' - else: - if 'url' in kwargs['kwargs'].keys(): - url = kwargs['kwargs']['url'] - else: - url = '' - if 'base_url' in kwargs['kwargs'].keys(): - base_url = kwargs['kwargs']['base_url'] - else: - base_url = '' - page['readable_filename'] = str(object=time.localtime()[1])+ "/" + str(time.localtime()[2]) + "/" + str(time.localtime()[0])[2:] + ' ' + page['readable_filename'] - - - metadatas: List[Dict[str, Any]] = [ - { - 'course_name': course_name, - 's3_path': s3_path, - 'pagenumber_or_timestamp': page['page_number'] + 1, # +1 for human indexing - 'readable_filename': page['readable_filename'], - 'url': url, - 'base_url': base_url, - } for page in pdf_pages_OCRed - ] - pdf_texts = [page['text'] for page in pdf_pages_OCRed] - - self.split_and_upload(texts=pdf_texts, metadatas=metadatas) - print("Success pdf ingest") - except Exception as e: - print("ERROR IN PDF READING ") - print(e) - return f"Error {e}" - return "Success" - - def _ingest_single_txt(self, s3_path: str, course_name: str) -> str: - """Ingest a single .txt file from S3. - Args: - s3_path (str): A path to a .txt file in S3 - course_name (str): The name of the course - Returns: - str: "Success" or an error message - """ - try: - # NOTE: slightly different method for .txt files, no need for download. 
It's part of the 'body' - response = self.s3_client.get_object(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path) - text = response['Body'].read().decode('utf-8') - text = [text] - - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': s3_path, - 'readable_filename': Path(s3_path).name, - 'pagenumber_or_timestamp': '', - }] - - success_or_failure = self.split_and_upload(texts=text, metadatas=metadatas) - return success_or_failure - except Exception as e: - print(f"ERROR IN TXT READING {e}") - return f"Error: {e}" - - def _ingest_single_ppt(self, s3_path: str, course_name: str) -> str: - """ - Ingest a single .ppt or .pptx file from S3. - """ - try: - with NamedTemporaryFile() as tmpfile: - # download from S3 into pdf_tmpfile - #print("in ingest PPTX") - self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile) - - loader = UnstructuredPowerPointLoader(tmpfile.name) - documents = loader.load() - - texts = [doc.page_content for doc in documents] - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': s3_path, - 'readable_filename': Path(s3_path).name, - 'pagenumber_or_timestamp': '', - } for doc in documents] - - self.split_and_upload(texts=texts, metadatas=metadatas) - return "Success" - except Exception as e: - print("ERROR IN PDF READING ") - print(e) - return f"Error {e}" - - def list_files_recursively(self, bucket, prefix): - all_files = [] - continuation_token = None - - while True: - list_objects_kwargs = { - 'Bucket': bucket, - 'Prefix': prefix, - } - if continuation_token: - list_objects_kwargs['ContinuationToken'] = continuation_token - - response = self.s3_client.list_objects_v2(**list_objects_kwargs) - - if 'Contents' in response: - for obj in response['Contents']: - all_files.append(obj['Key']) - - if response['IsTruncated']: - continuation_token = response['NextContinuationToken'] - else: - break - - return all_files - - def ingest_coursera(self, coursera_course_name: str, course_name: str) -> str: - """ Download all the files from a coursera course and ingest them. - - 1. Download the coursera content. - 2. Upload to S3 (so users can view it) - 3. Run everything through the ingest_bulk method. - - Args: - coursera_course_name (str): The name of the coursera course. - course_name (str): The name of the course in our system. - - Returns: - _type_: Success or error message. 
- """ - certificate = "-ca 'FVhVoDp5cb-ZaoRr5nNJLYbyjCLz8cGvaXzizqNlQEBsG5wSq7AHScZGAGfC1nI0ehXFvWy1NG8dyuIBF7DLMA.X3cXsDvHcOmSdo3Fyvg27Q.qyGfoo0GOHosTVoSMFy-gc24B-_BIxJtqblTzN5xQWT3hSntTR1DMPgPQKQmfZh_40UaV8oZKKiF15HtZBaLHWLbpEpAgTg3KiTiU1WSdUWueo92tnhz-lcLeLmCQE2y3XpijaN6G4mmgznLGVsVLXb-P3Cibzz0aVeT_lWIJNrCsXrTFh2HzFEhC4FxfTVqS6cRsKVskPpSu8D9EuCQUwJoOJHP_GvcME9-RISBhi46p-Z1IQZAC4qHPDhthIJG4bJqpq8-ZClRL3DFGqOfaiu5y415LJcH--PRRKTBnP7fNWPKhcEK2xoYQLr9RxBVL3pzVPEFyTYtGg6hFIdJcjKOU11AXAnQ-Kw-Gb_wXiHmu63veM6T8N2dEkdqygMre_xMDT5NVaP3xrPbA4eAQjl9yov4tyX4AQWMaCS5OCbGTpMTq2Y4L0Mbz93MHrblM2JL_cBYa59bq7DFK1IgzmOjFhNG266mQlC9juNcEhc'" - always_use_flags = "-u kastanvday@gmail.com -p hSBsLaF5YM469# --ignore-formats mp4 --subtitle-language en --path ./coursera-dl" - - try: - results = subprocess.run(f"coursera-dl {always_use_flags} {certificate} {coursera_course_name}", - check=True, - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) # capture_output=True, - dl_results_path = os.path.join('coursera-dl', coursera_course_name) - s3_paths: Union[List, None] = upload_data_files_to_s3(course_name, dl_results_path) - - if s3_paths is None: - return "Error: No files found in the coursera-dl directory" - - print("starting bulk ingest") - start_time = time.monotonic() - self.bulk_ingest(s3_paths, course_name) - print("completed bulk ingest") - print(f"⏰ Runtime: {(time.monotonic() - start_time):.2f} seconds") - - # Cleanup the coursera downloads - shutil.rmtree(dl_results_path) - - return "Success" - except Exception as e: - err: str = f"Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - return err - - def ingest_github(self, github_url: str, course_name: str) -> str: - """ - Clones the given GitHub URL and uses Langchain to load data. - 1. Clone the repo - 2. Use Langchain to load the data - 3. Pass to split_and_upload() - Args: - github_url (str): The Github Repo URL to be ingested. - course_name (str): The name of the course in our system. - - Returns: - _type_: Success or error message. - """ - print("in ingest_github") - - try: - repo_path = "media/cloned_repo" - repo = Repo.clone_from(github_url, to_path=repo_path, depth=1, clone_submodules=False) - branch = repo.head.reference - - loader = GitLoader(repo_path="media/cloned_repo", branch=branch) - data = loader.load() - shutil.rmtree("media/cloned_repo") - # create metadata for each file in data - texts = [doc.page_content for doc in data] - metadatas: List[Dict[str, Any]] = [{ - 'course_name': course_name, - 's3_path': '', - 'readable_filename': doc.metadata['file_name'], - 'url': github_url, - 'pagenumber_or_timestamp': '', - } for doc in data] - self.split_and_upload(texts=texts, metadatas=metadatas) - return "Success" - except Exception as e: - print(f"ERROR IN GITHUB INGEST {e}") - return f"Error: {e}" - - def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]): - """ This is usually the last step of document ingest. Chunk & upload to Qdrant (and Supabase.. todo). - Takes in Text and Metadata (from Langchain doc loaders) and splits / uploads to Qdrant. 
- - good examples here: https://langchain.readthedocs.io/en/latest/modules/utils/combine_docs_examples/textsplitter.html - - Args: - texts (List[str]): _description_ - metadatas (List[Dict[str, Any]]): _description_ - """ - assert len(texts) == len(metadatas), 'must have equal number of text strings and metadata dicts' - - try: - # generate AI summary - # summary = self.ai_summary(texts, metadatas) - # for i in range(len(summary)): - # metadatas[i]['summary'] = summary[i] - - text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( - chunk_size=1000, - chunk_overlap=150, - separators=". ", # try to split on sentences... - ) - documents: List[Document] = text_splitter.create_documents(texts=texts, metadatas=metadatas) - - def remove_small_contexts(documents: List[Document]) -> List[Document]: - # Remove TextSplit contexts with fewer than 50 chars. - return [doc for doc in documents if len(doc.page_content) > 50] - - documents = remove_small_contexts(documents=documents) - - # upload to Qdrant - self.vectorstore.add_texts([doc.page_content for doc in documents], [doc.metadata for doc in documents]) - data = [{"content": doc.page_content, "metadata": doc.metadata} for doc in documents] - count = self.supabase_client.table(os.getenv('MATERIALS_SUPABASE_TABLE')).insert(data).execute() # type: ignore - - return "Success" - except Exception as e: - err: str = f"ERROR IN split_and_upload(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - return err - - def delete_entire_course(self, course_name: str): - """Delete entire course. - - Delete materials from S3, Supabase SQL, Vercel KV, and QDrant vector DB - - Args: - course_name (str): _description_ - """ - print(f"Deleting entire course: {course_name}") - try: - # Delete file from S3 - objects_to_delete = self.s3_client.list_objects(Bucket=os.getenv('S3_BUCKET_NAME'), Prefix=f'courses/{course_name}/') - for object in objects_to_delete['Contents']: - self.s3_client.delete_object(Bucket=os.getenv('S3_BUCKET_NAME'), Key=object['Key']) - - # Delete from Qdrant - # docs for nested keys: https://qdrant.tech/documentation/concepts/filtering/#nested-key - # Qdrant "points" look like this: Record(id='000295ca-bd28-ac4a-6f8d-c245f7377f90', payload={'metadata': {'course_name': 'zotero-extreme', 'pagenumber_or_timestamp': 15, 'readable_filename': 'Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf', 's3_path': 'courses/zotero-extreme/Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf'}, 'page_content': '18 \nDunlosky et al.\n3.3 Effects in representative educational contexts. Sev-\neral of the large summarization-training studies have been \nconducted in regular classrooms, indicating the feasibility of \ndoing so. For example, the study by A. King (1992) took place \nin the context of a remedial study-skills course for undergrad-\nuates, and the study by Rinehart et al. (1986) took place in \nsixth-grade classrooms, with the instruction led by students \nregular teachers. In these and other cases, students benefited \nfrom the classroom training. We suspect it may actually be \nmore feasible to conduct these kinds of training studies in \nclassrooms than in the laboratory, given the nature of the time \ncommitment for students. 
Even some of the studies that did \nnot involve training were conducted outside the laboratory; for \nexample, in the Bednall and Kehoe (2011) study on learning \nabout logical fallacies from Web modules (see data in Table 3), \nthe modules were actually completed as a homework assign-\nment. Overall, benefits can be observed in classroom settings; \nthe real constraint is whether students have the skill to suc-\ncessfully summarize, not whether summarization occurs in the \nlab or the classroom.\n3.4 Issues for implementation. Summarization would be \nfeasible for undergraduates or other learners who already \nknow how to summarize. For these students, summarization \nwould constitute an easy-to-implement technique that would \nnot take a lot of time to complete or understand. The only \nconcern would be whether these students might be better \nserved by some other strategy, but certainly summarization \nwould be better than the study strategies students typically \nfavor, such as highlighting and rereading (as we discuss in the \nsections on those strategies below). A trickier issue would \nconcern implementing the strategy with students who are not \nskilled summarizers. Relatively intensive training programs \nare required for middle school students or learners with learn-\ning disabilities to benefit from summarization. Such efforts \nare not misplaced; training has been shown to benefit perfor-\nmance on a range of measures, although the training proce-\ndures do raise practical issues (e.g., Gajria & Salvia, 1992: \n6.511 hours of training used for sixth through ninth graders \nwith learning disabilities; Malone & Mastropieri, 1991: 2 \ndays of training used for middle school students with learning \ndisabilities; Rinehart et al., 1986: 4550 minutes of instruc-\ntion per day for 5 days used for sixth graders). Of course, \ninstructors may want students to summarize material because \nsummarization itself is a goal, not because they plan to use \nsummarization as a study technique, and that goal may merit \nthe efforts of training.\nHowever, if the goal is to use summarization as a study \ntechnique, our question is whether training students would be \nworth the amount of time it would take, both in terms of the \ntime required on the part of the instructor and in terms of the \ntime taken away from students other activities. For instance, \nin terms of efficacy, summarization tends to fall in the middle \nof the pack when compared to other techniques. In direct \ncomparisons, it was sometimes more useful than rereading \n(Rewey, Dansereau, & Peel, 1991) and was as useful as note-\ntaking (e.g., Bretzing & Kulhavy, 1979) but was less powerful \nthan generating explanations (e.g., Bednall & Kehoe, 2011) or \nself-questioning (A. King, 1992).\n3.5 Summarization: Overall assessment. On the basis of the \navailable evidence, we rate summarization as low utility. It can \nbe an effective learning strategy for learners who are already \nskilled at summarizing; however, many learners (including \nchildren, high school students, and even some undergraduates) \nwill require extensive training, which makes this strategy less \nfeasible. Our enthusiasm is further dampened by mixed find-\nings regarding which tasks summarization actually helps. 
\nAlthough summarization has been examined with a wide \nrange of text materials, many researchers have pointed to fac-\ntors of these texts that seem likely to moderate the effects of \nsummarization (e.g'}, vector=None), - self.qdrant_client.delete( - collection_name=os.getenv('QDRANT_COLLECTION_NAME'), - points_selector=models.Filter( - must=[ - models.FieldCondition( - key="metadata.course_name", - match=models.MatchValue(value=course_name), - ), - ] - ), - ) - - # Delete from Supabase - response = self.supabase_client.from_(os.getenv('MATERIALS_SUPABASE_TABLE')).delete().eq('metadata->>course_name', course_name).execute() - print("supabase response: ", response) - return "Success" - except Exception as e: - err: str = f"ERROR IN delete_entire_course(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - return err - - # Create a method to delete file from s3, delete vector from qdrant, and delete row from supabase - def delete_data(self, s3_path: str, course_name: str): - """Delete file from S3, Qdrant, and Supabase.""" - print(f"Deleting {s3_path} from S3, Qdrant, and Supabase for course {course_name}") - try: - # Delete file from S3 - bucket_name = os.getenv('S3_BUCKET_NAME') - self.s3_client.delete_object(Bucket=bucket_name, Key=s3_path) - - # Delete from Qdrant - # docs for nested keys: https://qdrant.tech/documentation/concepts/filtering/#nested-key - # Qdrant "points" look like this: Record(id='000295ca-bd28-ac4a-6f8d-c245f7377f90', payload={'metadata': {'course_name': 'zotero-extreme', 'pagenumber_or_timestamp': 15, 'readable_filename': 'Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf', 's3_path': 'courses/zotero-extreme/Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf'}, 'page_content': '18 \nDunlosky et al.\n3.3 Effects in representative educational contexts. Sev-\neral of the large summarization-training studies have been \nconducted in regular classrooms, indicating the feasibility of \ndoing so. For example, the study by A. King (1992) took place \nin the context of a remedial study-skills course for undergrad-\nuates, and the study by Rinehart et al. (1986) took place in \nsixth-grade classrooms, with the instruction led by students \nregular teachers. In these and other cases, students benefited \nfrom the classroom training. We suspect it may actually be \nmore feasible to conduct these kinds of training studies in \nclassrooms than in the laboratory, given the nature of the time \ncommitment for students. Even some of the studies that did \nnot involve training were conducted outside the laboratory; for \nexample, in the Bednall and Kehoe (2011) study on learning \nabout logical fallacies from Web modules (see data in Table 3), \nthe modules were actually completed as a homework assign-\nment. Overall, benefits can be observed in classroom settings; \nthe real constraint is whether students have the skill to suc-\ncessfully summarize, not whether summarization occurs in the \nlab or the classroom.\n3.4 Issues for implementation. Summarization would be \nfeasible for undergraduates or other learners who already \nknow how to summarize. For these students, summarization \nwould constitute an easy-to-implement technique that would \nnot take a lot of time to complete or understand. 
The only \nconcern would be whether these students might be better \nserved by some other strategy, but certainly summarization \nwould be better than the study strategies students typically \nfavor, such as highlighting and rereading (as we discuss in the \nsections on those strategies below). A trickier issue would \nconcern implementing the strategy with students who are not \nskilled summarizers. Relatively intensive training programs \nare required for middle school students or learners with learn-\ning disabilities to benefit from summarization. Such efforts \nare not misplaced; training has been shown to benefit perfor-\nmance on a range of measures, although the training proce-\ndures do raise practical issues (e.g., Gajria & Salvia, 1992: \n6.511 hours of training used for sixth through ninth graders \nwith learning disabilities; Malone & Mastropieri, 1991: 2 \ndays of training used for middle school students with learning \ndisabilities; Rinehart et al., 1986: 4550 minutes of instruc-\ntion per day for 5 days used for sixth graders). Of course, \ninstructors may want students to summarize material because \nsummarization itself is a goal, not because they plan to use \nsummarization as a study technique, and that goal may merit \nthe efforts of training.\nHowever, if the goal is to use summarization as a study \ntechnique, our question is whether training students would be \nworth the amount of time it would take, both in terms of the \ntime required on the part of the instructor and in terms of the \ntime taken away from students other activities. For instance, \nin terms of efficacy, summarization tends to fall in the middle \nof the pack when compared to other techniques. In direct \ncomparisons, it was sometimes more useful than rereading \n(Rewey, Dansereau, & Peel, 1991) and was as useful as note-\ntaking (e.g., Bretzing & Kulhavy, 1979) but was less powerful \nthan generating explanations (e.g., Bednall & Kehoe, 2011) or \nself-questioning (A. King, 1992).\n3.5 Summarization: Overall assessment. On the basis of the \navailable evidence, we rate summarization as low utility. It can \nbe an effective learning strategy for learners who are already \nskilled at summarizing; however, many learners (including \nchildren, high school students, and even some undergraduates) \nwill require extensive training, which makes this strategy less \nfeasible. Our enthusiasm is further dampened by mixed find-\nings regarding which tasks summarization actually helps. \nAlthough summarization has been examined with a wide \nrange of text materials, many researchers have pointed to fac-\ntors of these texts that seem likely to moderate the effects of \nsummarization (e.g'}, vector=None), - self.qdrant_client.delete( - collection_name=os.getenv('QDRANT_COLLECTION_NAME'), - points_selector=models.Filter(must=[ - models.FieldCondition( - key="metadata.s3_path", - match=models.MatchValue(value=s3_path), - ), - ]), - ) - - # Delete from Supabase - response = self.supabase_client.from_(os.getenv('MATERIALS_SUPABASE_TABLE')).delete().eq('metadata->>s3_path', s3_path).eq( - 'metadata->>course_name', course_name).execute() - return "Success" - except Exception as e: - err: str = f"ERROR IN delete_data: Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - return err - - def getAll( - self, - course_name: str, - ): - """Get all course materials based on course name. 
- Args : - course_name (as uploaded on supabase) - Returns : - list of dictionaries with distinct s3 path, readable_filename and course_name. - """ - response = self.supabase_client.table(os.getenv('MATERIALS_SUPABASE_TABLE')).select( - 'metadata->>course_name, metadata->>s3_path, metadata->>readable_filename').eq( # type: ignore - 'metadata->>course_name', course_name).execute() - - data = response.data - unique_combinations = set() - distinct_dicts = [] - - for item in data: - combination = (item['s3_path'], item['readable_filename'], item['course_name']) - if combination not in unique_combinations: - unique_combinations.add(combination) - distinct_dicts.append(item) - - return distinct_dicts - - def getTopContexts(self, search_query: str, course_name: str, token_limit: int = 4_000) -> Union[List[Dict], str]: - """Here's a summary of the work. - - /GET arguments - course name (optional) str: A json response with TBD fields. - - Returns - JSON: A json response with TBD fields. See main.py:getTopContexts docs. - or - String: An error message with traceback. - """ - try: - # TODO: change back to 50+ once we have bigger qdrant DB. - top_n = 80 # HARD CODE TO ENSURE WE HIT THE MAX TOKENS - start_time_overall = time.monotonic() - found_docs = self.vectorstore.similarity_search(search_query, k=top_n, filter={'course_name': course_name}) - if len(found_docs) == 0: - return [] - - pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" - - # count tokens at start and end, then also count each context. - token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + search_query) - valid_docs = [] - for d in found_docs: - doc_string = f"Document: {d.metadata['readable_filename']}{', page: ' + str(d.metadata['pagenumber_or_timestamp']) if d.metadata['pagenumber_or_timestamp'] else ''}\n{d.page_content}\n" - num_tokens, prompt_cost = count_tokens_and_cost(doc_string) - # print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, max_tokens: {token_limit}") - if token_counter + num_tokens <= token_limit: - token_counter += num_tokens - valid_docs.append(d) - else: - break - - print(f"Total tokens: {token_counter} total docs: {len(found_docs)} num docs used: {len(valid_docs)}") - print(f"Course: {course_name} ||| search_query: {search_query}") - print(f"⏰ ^^ Runtime of getTopContexts: {(time.monotonic() - start_time_overall):.2f} seconds") - - return self.format_for_json(valid_docs) - except Exception as e: - # return full traceback to front end - err: str = f"In /getTopContexts. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore - print(err) - return err - - def get_stuffed_prompt(self, search_query: str, course_name: str, token_limit: int = 7_000) -> str: - """ - Returns - String: A fully formatted prompt string. 
- """ - try: - top_n = 150 - start_time_overall = time.monotonic() - found_docs = self.vectorstore.similarity_search(search_query, k=top_n, filter={'course_name': course_name}) - if len(found_docs) == 0: - return search_query - - pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" - - # count tokens at start and end, then also count each context. - token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + search_query) - valid_docs = [] - for d in found_docs: - doc_string = f"---\nDocument: {d.metadata['readable_filename']}{', page: ' + str(d.metadata['pagenumber_or_timestamp']) if d.metadata['pagenumber_or_timestamp'] else ''}\n{d.page_content}\n" - num_tokens, prompt_cost = count_tokens_and_cost(doc_string) - print(f"Page: {d.page_content[:100]}...") - print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, token_limit: {token_limit}") - if token_counter + num_tokens <= token_limit: - token_counter += num_tokens - valid_docs.append(d) - else: - continue - print("running continue") - - # Convert the valid_docs to full prompt - separator = '---\n' # between each context - context_text = separator.join( - f"Document: {d.metadata['readable_filename']}{', page: ' + str(d.metadata['pagenumber_or_timestamp']) if d.metadata['pagenumber_or_timestamp'] else ''}\n{d.page_content}\n" - for d in valid_docs - ) - - # Create the stuffedPrompt - stuffedPrompt = ( - pre_prompt + - context_text + - '\n\nNow please respond to my query: ' + - search_query - ) - - TOTAL_num_tokens, prompt_cost = count_tokens_and_cost(stuffedPrompt, openai_model_name='gpt-4') - print(f"Total tokens: {TOTAL_num_tokens}, prompt_cost: {prompt_cost}") - print("total docs: ", len(found_docs)) - print("num docs used: ", len(valid_docs)) - - print(f"⏰ ^^ Runtime of getTopContexts: {(time.monotonic() - start_time_overall):.2f} seconds") - return stuffedPrompt - except Exception as e: - # return full traceback to front end - err: str = f"Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore - print(err) - return err - - def format_for_json(self, found_docs: List[Document]) -> List[Dict]: - """Formatting only. - {'course_name': course_name, 'contexts': [{'source_name': 'Lumetta_notes', 'source_location': 'pg. 19', 'text': 'In FSM, we do this...'}, {'source_name': 'Lumetta_notes', 'source_location': 'pg. 
20', 'text': 'In Assembly language, the code does that...'},]} - - Args: - found_docs (List[Document]): _description_ - - Raises: - Exception: _description_ - - Returns: - List[Dict]: _description_ - """ - - contexts = [{ - 'text': doc.page_content, - 'readable_filename': doc.metadata['readable_filename'], - 'course_name ': doc.metadata['course_name'], - 's3_path': doc.metadata['s3_path'], - 'pagenumber_or_timestamp': doc.metadata['pagenumber_or_timestamp'], - } for doc in found_docs] - - return contexts \ No newline at end of file +import asyncio +import inspect +import mimetypes +# import json +import os +from pathlib import Path +import shutil +import subprocess +from tempfile import NamedTemporaryFile # TemporaryFile +import time +import traceback +from typing import Any, Dict, List, Union # Literal + +import boto3 +from bs4 import BeautifulSoup +# import requests +import fitz +from git import Repo +# from arize.api import Client +# from arize.pandas.embeddings import EmbeddingGenerator, UseCases +# from arize.utils import ModelTypes +# from arize.utils.ModelTypes import GENERATIVE_LLM +# # from arize.utils.types import (Embedding, EmbeddingColumnNames, Environments, +# # Metrics, ModelTypes, Schema) +from langchain.document_loaders import Docx2txtLoader +from langchain.document_loaders import GitLoader +from langchain.document_loaders import PythonLoader +from langchain.document_loaders import SRTLoader +from langchain.document_loaders import TextLoader +from langchain.document_loaders import UnstructuredPowerPointLoader +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.schema import Document +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain.vectorstores import Qdrant +import openai +from pydub import AudioSegment +from qdrant_client import models +from qdrant_client import QdrantClient +import supabase + +from ai_ta_backend.aws import upload_data_files_to_s3 +from ai_ta_backend.extreme_context_stuffing import OpenAIAPIProcessor +from ai_ta_backend.utils_tokenization import count_tokens_and_cost + + +class Ingest(): + """ + Contains all methods for building and using vector databases. + """ + + def __init__(self): + """ + Initialize AWS S3, Qdrant, and Supabase. + """ + + # vector DB + self.qdrant_client = QdrantClient( + url=os.getenv('QDRANT_URL'), + api_key=os.getenv('QDRANT_API_KEY'), + ) + + self.vectorstore = Qdrant( + client=self.qdrant_client, + collection_name=os.getenv('QDRANT_COLLECTION_NAME'), # type: ignore + embeddings=OpenAIEmbeddings()) + + # S3 + self.s3_client = boto3.client( + 's3', + aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), + aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), + ) + + # Create a Supabase client + self.supabase_client = supabase.create_client( # type: ignore + supabase_url=os.getenv('SUPABASE_URL'), # type: ignore + supabase_key=os.getenv('SUPABASE_API_KEY')) # type: ignore + return None + + def get_context_stuffed_prompt(self, user_question: str, course_name: str, top_n: int, top_k_to_search: int) -> str: + """ + Get a stuffed prompt for a given user question and course name. + Args : + user_question (str) + course_name (str) : used for metadata filtering + Returns : str + a very long "stuffed prompt" with question + summaries of top_n most relevant documents. 
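+
+    Example (illustrative sketch; assumes the OpenAI/Qdrant/Supabase environment variables are
+    set and that "example-course" is a placeholder for a course that already has ingested documents):
+        ingest = Ingest()
+        prompt = ingest.get_context_stuffed_prompt(user_question="How does gradient descent work?",
+                                                   course_name="example-course",
+                                                   top_n=5,
+                                                   top_k_to_search=25)
+        # the returned string is then sent to a chat completion model by the caller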
+ """ + # MMR with metadata filtering based on course_name + vec_start_time = time.monotonic() + found_docs = self.vectorstore.max_marginal_relevance_search(user_question, k=top_n, fetch_k=top_k_to_search) + print( + f"⏰ MMR Search runtime (top_n_to_keep: {top_n}, top_k_to_search: {top_k_to_search}): {(time.monotonic() - vec_start_time):.2f} seconds" + ) + + requests = [] + for i, doc in enumerate(found_docs): + dictionary = { + "model": "gpt-3.5-turbo", + "messages": [{ + "role": + "system", + "content": + "You are a factual summarizer of partial documents. Stick to the facts (including partial info when necessary to avoid making up potentially incorrect details), and say I don't know when necessary." + }, { + "role": + "user", + "content": + f"Provide a comprehensive summary of the given text, based on this question:\n{doc.page_content}\nQuestion: {user_question}\nThe summary should cover all the key points that are relevant to the question, while also condensing the information into a concise format. The length of the summary should be as short as possible, without losing relevant information.\nMake use of direct quotes from the text.\nFeel free to include references, sentence fragments, keywords or anything that could help someone learn about it, only as it relates to the given question.\nIf the text does not provide information to answer the question, please write 'None' and nothing else.", + }], + "n": 1, + "max_tokens": 600, + "metadata": doc.metadata + } + requests.append(dictionary) + + oai = OpenAIAPIProcessor(input_prompts_list=requests, + request_url='https://api.openai.com/v1/chat/completions', + api_key=os.getenv("OPENAI_API_KEY"), + max_requests_per_minute=1500, + max_tokens_per_minute=90000, + token_encoding_name='cl100k_base', + max_attempts=5, + logging_level=20) + + chain_start_time = time.monotonic() + asyncio.run(oai.process_api_requests_from_file()) + results: list[str] = oai.results + print(f"⏰ EXTREME context stuffing runtime: {(time.monotonic() - chain_start_time):.2f} seconds") + + print(f"Cleaned results: {oai.cleaned_results}") + + all_texts = "" + separator = '---' # between each context + token_counter = 0 #keeps track of tokens in each summarization + max_tokens = 7_500 #limit, will keep adding text to string until 8000 tokens reached. + for i, text in enumerate(oai.cleaned_results): + if text.lower().startswith('none') or text.lower().endswith('none.') or text.lower().endswith('none'): + # no useful text, it replied with a summary of "None" + continue + if text is not None: + num_tokens, prompt_cost = count_tokens_and_cost(text) + if token_counter + num_tokens > max_tokens: + print(f"Total tokens yet in loop {i} is {num_tokens}") + break # Stop building the string if it exceeds the maximum number of tokens + token_counter += num_tokens + filename = str(results[i][-1].get('readable_filename', '')) # type: ignore + pagenumber_or_timestamp = str(results[i][-1].get('pagenumber_or_timestamp', '')) # type: ignore + pagenumber = f", page: {pagenumber_or_timestamp}" if pagenumber_or_timestamp else '' + doc = f"Document : filename: {filename}" + pagenumber + summary = f"\nSummary: {text}" + all_texts += doc + summary + '\n' + separator + '\n' + + stuffed_prompt = f"""Please answer the following question. +Use the context below, called 'your documents', only if it's helpful and don't use parts that are very irrelevant. +It's good to quote 'your documents' directly using informal citations, like "in document X it says Y". 
Try to avoid giving false or misleading information. Feel free to say you don't know. +Try to be helpful, polite, honest, sophisticated, emotionally aware, and humble-but-knowledgeable. +That said, be practical and really do your best, and don't let caution get too much in the way of being useful. +To help answer the question, here's a few passages of high quality documents:\n{all_texts} +Now please respond to my question: {user_question}""" + + # "Please answer the following question. It's good to quote 'your documents' directly, something like 'from ABS source it says XYZ' Feel free to say you don't know. \nHere's a few passages of the high quality 'your documents':\n" + + return stuffed_prompt + + # def ai_summary(self, text: List[str], metadata: List[Dict[str, Any]]) -> List[str]: + # """ + # Given a textual input, return a summary of the text. + # """ + # #print("in AI SUMMARY") + # requests = [] + # for i in range(len(text)): + # dictionary = { + # "model": "gpt-3.5-turbo", + # "messages": [{ + # "role": + # "system", + # "content": + # "You are a factual summarizer of partial documents. Stick to the facts (including partial info when necessary to avoid making up potentially incorrect details), and say I don't know when necessary." + # }, { + # "role": + # "user", + # "content": + # f"Provide a descriptive summary of the given text:\n{text[i]}\nThe summary should cover all the key points, while also condensing the information into a concise format. The length of the summary should not exceed 3 sentences.", + # }], + # "n": 1, + # "max_tokens": 600, + # "metadata": metadata[i] + # } + # requests.append(dictionary) + + # oai = OpenAIAPIProcessor(input_prompts_list=requests, + # request_url='https://api.openai.com/v1/chat/completions', + # api_key=os.getenv("OPENAI_API_KEY"), + # max_requests_per_minute=1500, + # max_tokens_per_minute=90000, + # token_encoding_name='cl100k_base', + # max_attempts=5, + # logging_level=20) + + # asyncio.run(oai.process_api_requests_from_file()) + # #results: list[str] = oai.results + # #print(f"Cleaned results: {oai.cleaned_results}") + # summary = oai.cleaned_results + # return summary + + def bulk_ingest(self, s3_paths: Union[List[str], str], course_name: str, **kwargs) -> Dict[str, List[str]]: + # https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/microsoft_word.html + success_status = {"success_ingest": [], "failure_ingest": []} + + try: + if isinstance(s3_paths, str): + s3_paths = [s3_paths] + + for s3_path in s3_paths: + ext = Path(s3_path).suffix # check mimetype of file + # TODO: no need to download, just guess_type against the s3_path... 
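+        # One way to act on the TODO above (a sketch): guess the type from the S3 key itself and
+        # guard against an unknown type before splitting, e.g.
+        #   mime_type = mimetypes.guess_type(s3_path)[0] or 'application/octet-stream'
+        #   category, subcategory = mime_type.split('/')
+        # which would avoid downloading the file just to inspect its extension.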
+ with NamedTemporaryFile(suffix=ext) as tmpfile: + self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile) + mime_type = mimetypes.guess_type(tmpfile.name)[0] + category, subcategory = mime_type.split('/') + + if s3_path.endswith('.html'): + ret = self._ingest_html(s3_path, course_name, kwargs=kwargs) + if ret != "Success": + success_status['failure_ingest'].append(s3_path) + else: + success_status['success_ingest'].append(s3_path) + elif s3_path.endswith('.py'): + ret = self._ingest_single_py(s3_path, course_name) + if ret != "Success": + success_status['failure_ingest'].append(s3_path) + else: + success_status['success_ingest'].append(s3_path) + elif s3_path.endswith('.vtt'): + ret = self._ingest_single_vtt(s3_path, course_name) + if ret != "Success": + success_status['failure_ingest'].append(s3_path) + else: + success_status['success_ingest'].append(s3_path) + elif s3_path.endswith('.pdf'): + ret = self._ingest_single_pdf(s3_path, course_name, kwargs=kwargs) + if ret != "Success": + success_status['failure_ingest'].append(s3_path) + else: + success_status['success_ingest'].append(s3_path) + elif s3_path.endswith('.txt'): + ret = self._ingest_single_txt(s3_path, course_name) + if ret != "Success": + success_status['failure_ingest'].append(s3_path) + else: + success_status['success_ingest'].append(s3_path) + elif s3_path.endswith('.srt'): + ret = self._ingest_single_srt(s3_path, course_name) + if ret != "Success": + success_status['failure_ingest'].append(s3_path) + else: + success_status['success_ingest'].append(s3_path) + elif s3_path.endswith('.docx'): + ret = self._ingest_single_docx(s3_path, course_name) + if ret != "Success": + success_status['failure_ingest'].append(s3_path) + else: + success_status['success_ingest'].append(s3_path) + elif s3_path.endswith('.ppt') or s3_path.endswith('.pptx'): + ret = self._ingest_single_ppt(s3_path, course_name) + if ret != "Success": + success_status['failure_ingest'].append(s3_path) + else: + success_status['success_ingest'].append(s3_path) + elif category == 'video' or category == 'audio': + ret = self._ingest_single_video(s3_path, course_name) + if ret != "Success": + success_status['failure_ingest'].append(s3_path) + else: + success_status['success_ingest'].append(s3_path) + return success_status + except Exception as e: + success_status['failure_ingest'].append("MAJOR ERROR IN /bulk_ingest: Error: " + str(e)) + return success_status + + def _ingest_single_py(self, s3_path: str, course_name: str): + try: + with NamedTemporaryFile() as tmpfile: + # download from S3 into vtt_tmpfile + self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile) + loader = PythonLoader(tmpfile.name) + documents = loader.load() + texts = [doc.page_content for doc in documents] + metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': Path(s3_path).name, + 'pagenumber_or_timestamp': '', + } for doc in documents] + + success_or_failure = self.split_and_upload(texts=texts, metadatas=metadatas) + return success_or_failure + except Exception as e: + print(f"ERROR IN py READING {e}") + + def _ingest_single_vtt(self, s3_path: str, course_name: str): + """ + Ingest a single .vtt file from S3. 
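+
+    The caption file is read as plain text via LangChain's TextLoader, so cue timestamps stay
+    inline in the chunk text; 'pagenumber_or_timestamp' is left empty for this file type.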
+ """ + try: + with NamedTemporaryFile() as tmpfile: + # download from S3 into vtt_tmpfile + self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile) + loader = TextLoader(tmpfile.name) + documents = loader.load() + texts = [doc.page_content for doc in documents] + + metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': Path(s3_path).name, + 'pagenumber_or_timestamp': '', + } for doc in documents] + + success_or_failure = self.split_and_upload(texts=texts, metadatas=metadatas) + return success_or_failure + except Exception as e: + print(f"ERROR IN VTT READING {e}") + + def _ingest_html(self, s3_path: str, course_name: str, **kwargs) -> str: + try: + response = self.s3_client.get_object(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path) + raw_html = response['Body'].read().decode('utf-8') + + soup = BeautifulSoup(raw_html, 'html.parser') + title = s3_path.replace("courses/" + course_name, "") + title = title.replace(".html", "") + title = title.replace("_", " ") + title = title.replace("/", " ") + title = title.strip() + + if kwargs['kwargs'] == {}: + url = '' + base_url = '' + else: + if 'url' in kwargs['kwargs'].keys(): + url = kwargs['kwargs']['url'] + else: + url = '' + if 'base_url' in kwargs['kwargs'].keys(): + base_url = kwargs['kwargs']['base_url'] + else: + base_url = '' + title = str(object=time.localtime()[1]) + "/" + str(time.localtime()[2]) + "/" + str(time.localtime()[0])[2:] + ' ' + str(title) + + text = [soup.get_text()] + metadata: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': str(title), # adding str to avoid error: unhashable type 'slice' + 'url': url, + 'base_url': base_url, + 'pagenumber_or_timestamp': '' + }] + + success_or_failure = self.split_and_upload(text, metadata) + print(f"_ingest_html: {success_or_failure}") + return success_or_failure + except Exception as e: + err: str = f"ERROR IN _ingest_html: {e}\nTraceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore + print(err) + return f"_ingest_html Error: {e}" + + def _ingest_single_video(self, s3_path: str, course_name: str) -> str: + """ + Ingest a single video file from S3. 
+ """ + try: + # check for file extension + file_ext = Path(s3_path).suffix + print(file_ext[1:]) + + openai.api_key = os.getenv('OPENAI_API_KEY') + transcript_list = [] + #print(os.getcwd()) + with NamedTemporaryFile(suffix=file_ext) as video_tmpfile: + # download from S3 into an video tmpfile + self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=video_tmpfile) + # extract audio from video tmpfile + mp4_version = AudioSegment.from_file(video_tmpfile.name, file_ext[1:]) + #print("Video file: ", video_tmpfile.name) + + # save the extracted audio as a temporary webm file + with NamedTemporaryFile(suffix=".webm", dir="media", delete=False) as webm_tmpfile: + mp4_version.export(webm_tmpfile, format="webm") + #print("WEBM file: ", webm_tmpfile.name) + + # check file size + file_size = os.path.getsize(webm_tmpfile.name) + # split the audio into 25MB chunks + if file_size > 26214400: + # load the webm file into audio object + full_audio = AudioSegment.from_file(webm_tmpfile.name, "webm") + file_count = file_size // 26214400 + 1 + split_segment = 35 * 60 * 1000 + start = 0 + count = 0 + + while count < file_count: + with NamedTemporaryFile(suffix=".webm", dir="media", delete=False) as split_tmp: + #print("Splitting file: ", split_tmp.name) + if count == file_count - 1: + # last segment + audio_chunk = full_audio[start:] + else: + audio_chunk = full_audio[start:split_segment] + + audio_chunk.export(split_tmp.name, format="webm") + + # transcribe the split file and store the text in dictionary + with open(split_tmp.name, "rb") as f: + transcript = openai.Audio.transcribe("whisper-1", f) + transcript_list.append(transcript['text']) # type: ignore + start += split_segment + split_segment += split_segment + count += 1 + os.remove(split_tmp.name) + else: + # transcribe the full audio + with open(webm_tmpfile.name, "rb") as f: + transcript = openai.Audio.transcribe("whisper-1", f) + transcript_list.append(transcript['text']) # type: ignore + + os.remove(webm_tmpfile.name) + + text = [txt for txt in transcript_list] + metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': Path(s3_path).name, + 'pagenumber_or_timestamp': text.index(txt), + } for txt in text] + + self.split_and_upload(texts=text, metadatas=metadatas) + return "Success" + except Exception as e: + print("ERROR IN VIDEO READING ") + print(e) + return f"Error {e}" + + def _ingest_single_docx(self, s3_path: str, course_name: str) -> str: + try: + with NamedTemporaryFile() as tmpfile: + # download from S3 into pdf_tmpfile + print("Bucket: ", os.getenv('S3_BUCKET_NAME')) + print("Key: ", s3_path) + self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=tmpfile) + print("GOT THE FILE") + print(tmpfile.name) + + loader = Docx2txtLoader(tmpfile.name) + documents = loader.load() + + texts = [doc.page_content for doc in documents] + metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': Path(s3_path).name, + 'pagenumber_or_timestamp': '', + } for doc in documents] + + self.split_and_upload(texts=texts, metadatas=metadatas) + return "Success" + except Exception as e: + print(f"ERROR IN DOCX {e}") + return f"Error: {e}" + + def _ingest_single_srt(self, s3_path: str, course_name: str) -> str: + try: + with NamedTemporaryFile() as tmpfile: + # download from S3 into pdf_tmpfile + self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, 
Fileobj=tmpfile) + + loader = SRTLoader(tmpfile.name) + documents = loader.load() + + texts = [doc.page_content for doc in documents] + metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': Path(s3_path).name, + 'pagenumber_or_timestamp': '', + } for doc in documents] + + self.split_and_upload(texts=texts, metadatas=metadatas) + return "Success" + except Exception as e: + print(f"SRT ERROR {e}") + return f"Error: {e}" + + def _ingest_single_pdf(self, s3_path: str, course_name: str, **kwargs): + """ + Both OCR the PDF. And grab the first image as a PNG. + LangChain `Documents` have .metadata and .page_content attributes. + Be sure to use TemporaryFile() to avoid memory leaks! + """ + try: + with NamedTemporaryFile() as pdf_tmpfile: + # download from S3 into pdf_tmpfile + self.s3_client.download_fileobj(Bucket=os.getenv('S3_BUCKET_NAME'), Key=s3_path, Fileobj=pdf_tmpfile) + ### READ OCR of PDF + doc = fitz.open(pdf_tmpfile.name) # type: ignore + + # improve quality of the image + zoom_x = 2.0 # horizontal zoom + zoom_y = 2.0 # vertical zoom + mat = fitz.Matrix(zoom_x, zoom_y) # zoom factor 2 in each dimension + + pdf_pages_OCRed: List[Dict] = [] + for i, page in enumerate(doc): # type: ignore + + # UPLOAD FIRST PAGE IMAGE to S3 + if i == 0: + with NamedTemporaryFile(suffix=".png") as first_page_png: + pix = page.get_pixmap(matrix=mat) + pix.save(first_page_png) # store image as a PNG + + s3_upload_path = str(Path(s3_path)).rsplit('.pdf')[0] + "-pg1-thumb.png" + first_page_png.seek(0) # Seek the file pointer back to the beginning + with open(first_page_png.name, 'rb') as f: + print("Uploading image png to S3") + self.s3_client.upload_fileobj(f, os.getenv('S3_BUCKET_NAME'), s3_upload_path) + + # Extract text + text = page.get_text().encode("utf8").decode('ascii', errors='ignore') # get plain text (is in UTF-8) + pdf_pages_OCRed.append(dict(text=text, page_number=i, readable_filename=Path(s3_path).name)) + + if kwargs['kwargs'] == {}: + url = '' + base_url = '' + else: + if 'url' in kwargs['kwargs'].keys(): + url = kwargs['kwargs']['url'] + else: + url = '' + if 'base_url' in kwargs['kwargs'].keys(): + base_url = kwargs['kwargs']['base_url'] + else: + base_url = '' + page['readable_filename'] = str(object=time.localtime()[1]) + "/" + str(time.localtime()[2]) + "/" + str( + time.localtime()[0])[2:] + ' ' + page['readable_filename'] + + metadatas: List[Dict[str, Any]] = [ + { + 'course_name': course_name, + 's3_path': s3_path, + 'pagenumber_or_timestamp': page['page_number'] + 1, # +1 for human indexing + 'readable_filename': page['readable_filename'], + 'url': url, + 'base_url': base_url, + } for page in pdf_pages_OCRed + ] + pdf_texts = [page['text'] for page in pdf_pages_OCRed] + + self.split_and_upload(texts=pdf_texts, metadatas=metadatas) + print("Success pdf ingest") + except Exception as e: + print("ERROR IN PDF READING ") + print(e) + return f"Error {e}" + return "Success" + + def _ingest_single_txt(self, s3_path: str, course_name: str) -> str: + """Ingest a single .txt file from S3. + Args: + s3_path (str): A path to a .txt file in S3 + course_name (str): The name of the course + Returns: + str: "Success" or an error message + """ + try: + # NOTE: slightly different method for .txt files, no need for download. 
It's part of the 'body' + response = self.s3_client.get_object(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path) + text = response['Body'].read().decode('utf-8') + text = [text] + + metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': Path(s3_path).name, + 'pagenumber_or_timestamp': '', + }] + + success_or_failure = self.split_and_upload(texts=text, metadatas=metadatas) + return success_or_failure + except Exception as e: + print(f"ERROR IN TXT READING {e}") + return f"Error: {e}" + + def _ingest_single_ppt(self, s3_path: str, course_name: str) -> str: + """ + Ingest a single .ppt or .pptx file from S3. + """ + try: + with NamedTemporaryFile() as tmpfile: + # download from S3 into pdf_tmpfile + #print("in ingest PPTX") + self.s3_client.download_fileobj(Bucket=os.environ['S3_BUCKET_NAME'], Key=s3_path, Fileobj=tmpfile) + + loader = UnstructuredPowerPointLoader(tmpfile.name) + documents = loader.load() + + texts = [doc.page_content for doc in documents] + metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': s3_path, + 'readable_filename': Path(s3_path).name, + 'pagenumber_or_timestamp': '', + } for doc in documents] + + self.split_and_upload(texts=texts, metadatas=metadatas) + return "Success" + except Exception as e: + print("ERROR IN PDF READING ") + print(e) + return f"Error {e}" + + def list_files_recursively(self, bucket, prefix): + all_files = [] + continuation_token = None + + while True: + list_objects_kwargs = { + 'Bucket': bucket, + 'Prefix': prefix, + } + if continuation_token: + list_objects_kwargs['ContinuationToken'] = continuation_token + + response = self.s3_client.list_objects_v2(**list_objects_kwargs) + + if 'Contents' in response: + for obj in response['Contents']: + all_files.append(obj['Key']) + + if response['IsTruncated']: + continuation_token = response['NextContinuationToken'] + else: + break + + return all_files + + def ingest_coursera(self, coursera_course_name: str, course_name: str) -> str: + """ Download all the files from a coursera course and ingest them. + + 1. Download the coursera content. + 2. Upload to S3 (so users can view it) + 3. Run everything through the ingest_bulk method. + + Args: + coursera_course_name (str): The name of the coursera course. + course_name (str): The name of the course in our system. + + Returns: + _type_: Success or error message. 
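+
+    Note: the CAUTH cookie and the account credentials used below are hardcoded; reading them
+    from environment variables would be safer, e.g. (a sketch, assuming you export
+    COURSERA_CAUTH, COURSERA_USER and COURSERA_PASSWORD yourself):
+        certificate = f"-ca '{os.environ['COURSERA_CAUTH']}'"
+        always_use_flags = f"-u {os.environ['COURSERA_USER']} -p {os.environ['COURSERA_PASSWORD']} --ignore-formats mp4 --subtitle-language en --path ./coursera-dl"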
+ """ + certificate = "-ca 'FVhVoDp5cb-ZaoRr5nNJLYbyjCLz8cGvaXzizqNlQEBsG5wSq7AHScZGAGfC1nI0ehXFvWy1NG8dyuIBF7DLMA.X3cXsDvHcOmSdo3Fyvg27Q.qyGfoo0GOHosTVoSMFy-gc24B-_BIxJtqblTzN5xQWT3hSntTR1DMPgPQKQmfZh_40UaV8oZKKiF15HtZBaLHWLbpEpAgTg3KiTiU1WSdUWueo92tnhz-lcLeLmCQE2y3XpijaN6G4mmgznLGVsVLXb-P3Cibzz0aVeT_lWIJNrCsXrTFh2HzFEhC4FxfTVqS6cRsKVskPpSu8D9EuCQUwJoOJHP_GvcME9-RISBhi46p-Z1IQZAC4qHPDhthIJG4bJqpq8-ZClRL3DFGqOfaiu5y415LJcH--PRRKTBnP7fNWPKhcEK2xoYQLr9RxBVL3pzVPEFyTYtGg6hFIdJcjKOU11AXAnQ-Kw-Gb_wXiHmu63veM6T8N2dEkdqygMre_xMDT5NVaP3xrPbA4eAQjl9yov4tyX4AQWMaCS5OCbGTpMTq2Y4L0Mbz93MHrblM2JL_cBYa59bq7DFK1IgzmOjFhNG266mQlC9juNcEhc'" + always_use_flags = "-u kastanvday@gmail.com -p hSBsLaF5YM469# --ignore-formats mp4 --subtitle-language en --path ./coursera-dl" + + try: + subprocess.run(f"coursera-dl {always_use_flags} {certificate} {coursera_course_name}", + check=True, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) # capture_output=True, + dl_results_path = os.path.join('coursera-dl', coursera_course_name) + s3_paths: Union[List, None] = upload_data_files_to_s3(course_name, dl_results_path) + + if s3_paths is None: + return "Error: No files found in the coursera-dl directory" + + print("starting bulk ingest") + start_time = time.monotonic() + self.bulk_ingest(s3_paths, course_name) + print("completed bulk ingest") + print(f"⏰ Runtime: {(time.monotonic() - start_time):.2f} seconds") + + # Cleanup the coursera downloads + shutil.rmtree(dl_results_path) + + return "Success" + except Exception as e: + err: str = f"Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore + print(err) + return err + + def ingest_github(self, github_url: str, course_name: str) -> str: + """ + Clones the given GitHub URL and uses Langchain to load data. + 1. Clone the repo + 2. Use Langchain to load the data + 3. Pass to split_and_upload() + Args: + github_url (str): The Github Repo URL to be ingested. + course_name (str): The name of the course in our system. + + Returns: + _type_: Success or error message. + """ + print("in ingest_github") + + try: + repo_path = "media/cloned_repo" + repo = Repo.clone_from(github_url, to_path=repo_path, depth=1, clone_submodules=False) + branch = repo.head.reference + + loader = GitLoader(repo_path="media/cloned_repo", branch=branch) + data = loader.load() + shutil.rmtree("media/cloned_repo") + # create metadata for each file in data + texts = [doc.page_content for doc in data] + metadatas: List[Dict[str, Any]] = [{ + 'course_name': course_name, + 's3_path': '', + 'readable_filename': doc.metadata['file_name'], + 'url': github_url, + 'pagenumber_or_timestamp': '', + } for doc in data] + self.split_and_upload(texts=texts, metadatas=metadatas) + return "Success" + except Exception as e: + print(f"ERROR IN GITHUB INGEST {e}") + return f"Error: {e}" + + def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]]): + """ This is usually the last step of document ingest. Chunk & upload to Qdrant (and Supabase.. todo). + Takes in Text and Metadata (from Langchain doc loaders) and splits / uploads to Qdrant. 
+ + good examples here: https://langchain.readthedocs.io/en/latest/modules/utils/combine_docs_examples/textsplitter.html + + Args: + texts (List[str]): _description_ + metadatas (List[Dict[str, Any]]): _description_ + """ + assert len(texts) == len(metadatas), 'must have equal number of text strings and metadata dicts' + + try: + # generate AI summary + # summary = self.ai_summary(texts, metadatas) + # for i in range(len(summary)): + # metadatas[i]['summary'] = summary[i] + + text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( + chunk_size=1000, + chunk_overlap=150, + separators=". ", # try to split on sentences... + ) + documents: List[Document] = text_splitter.create_documents(texts=texts, metadatas=metadatas) + + def remove_small_contexts(documents: List[Document]) -> List[Document]: + # Remove TextSplit contexts with fewer than 50 chars. + return [doc for doc in documents if len(doc.page_content) > 50] + + documents = remove_small_contexts(documents=documents) + + # upload to Qdrant + self.vectorstore.add_texts([doc.page_content for doc in documents], [doc.metadata for doc in documents]) + data = [{"content": doc.page_content, "metadata": doc.metadata} for doc in documents] + self.supabase_client.table(os.getenv('MATERIALS_SUPABASE_TABLE')).insert(data).execute() # type: ignore + + return "Success" + except Exception as e: + err: str = f"ERROR IN split_and_upload(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore + print(err) + return err + + def delete_entire_course(self, course_name: str): + """Delete entire course. + + Delete materials from S3, Supabase SQL, Vercel KV, and QDrant vector DB + + Args: + course_name (str): _description_ + """ + print(f"Deleting entire course: {course_name}") + try: + # Delete file from S3 + objects_to_delete = self.s3_client.list_objects(Bucket=os.getenv('S3_BUCKET_NAME'), Prefix=f'courses/{course_name}/') + for object in objects_to_delete['Contents']: + self.s3_client.delete_object(Bucket=os.getenv('S3_BUCKET_NAME'), Key=object['Key']) + + # Delete from Qdrant + # docs for nested keys: https://qdrant.tech/documentation/concepts/filtering/#nested-key + # Qdrant "points" look like this: Record(id='000295ca-bd28-ac4a-6f8d-c245f7377f90', payload={'metadata': {'course_name': 'zotero-extreme', 'pagenumber_or_timestamp': 15, 'readable_filename': 'Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf', 's3_path': 'courses/zotero-extreme/Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf'}, 'page_content': '18 \nDunlosky et al.\n3.3 Effects in representative educational contexts. Sev-\neral of the large summarization-training studies have been \nconducted in regular classrooms, indicating the feasibility of \ndoing so. For example, the study by A. King (1992) took place \nin the context of a remedial study-skills course for undergrad-\nuates, and the study by Rinehart et al. (1986) took place in \nsixth-grade classrooms, with the instruction led by students \nregular teachers. In these and other cases, students benefited \nfrom the classroom training. We suspect it may actually be \nmore feasible to conduct these kinds of training studies in \nclassrooms than in the laboratory, given the nature of the time \ncommitment for students. 
Even some of the studies that did \nnot involve training were conducted outside the laboratory; for \nexample, in the Bednall and Kehoe (2011) study on learning \nabout logical fallacies from Web modules (see data in Table 3), \nthe modules were actually completed as a homework assign-\nment. Overall, benefits can be observed in classroom settings; \nthe real constraint is whether students have the skill to suc-\ncessfully summarize, not whether summarization occurs in the \nlab or the classroom.\n3.4 Issues for implementation. Summarization would be \nfeasible for undergraduates or other learners who already \nknow how to summarize. For these students, summarization \nwould constitute an easy-to-implement technique that would \nnot take a lot of time to complete or understand. The only \nconcern would be whether these students might be better \nserved by some other strategy, but certainly summarization \nwould be better than the study strategies students typically \nfavor, such as highlighting and rereading (as we discuss in the \nsections on those strategies below). A trickier issue would \nconcern implementing the strategy with students who are not \nskilled summarizers. Relatively intensive training programs \nare required for middle school students or learners with learn-\ning disabilities to benefit from summarization. Such efforts \nare not misplaced; training has been shown to benefit perfor-\nmance on a range of measures, although the training proce-\ndures do raise practical issues (e.g., Gajria & Salvia, 1992: \n6.511 hours of training used for sixth through ninth graders \nwith learning disabilities; Malone & Mastropieri, 1991: 2 \ndays of training used for middle school students with learning \ndisabilities; Rinehart et al., 1986: 4550 minutes of instruc-\ntion per day for 5 days used for sixth graders). Of course, \ninstructors may want students to summarize material because \nsummarization itself is a goal, not because they plan to use \nsummarization as a study technique, and that goal may merit \nthe efforts of training.\nHowever, if the goal is to use summarization as a study \ntechnique, our question is whether training students would be \nworth the amount of time it would take, both in terms of the \ntime required on the part of the instructor and in terms of the \ntime taken away from students other activities. For instance, \nin terms of efficacy, summarization tends to fall in the middle \nof the pack when compared to other techniques. In direct \ncomparisons, it was sometimes more useful than rereading \n(Rewey, Dansereau, & Peel, 1991) and was as useful as note-\ntaking (e.g., Bretzing & Kulhavy, 1979) but was less powerful \nthan generating explanations (e.g., Bednall & Kehoe, 2011) or \nself-questioning (A. King, 1992).\n3.5 Summarization: Overall assessment. On the basis of the \navailable evidence, we rate summarization as low utility. It can \nbe an effective learning strategy for learners who are already \nskilled at summarizing; however, many learners (including \nchildren, high school students, and even some undergraduates) \nwill require extensive training, which makes this strategy less \nfeasible. Our enthusiasm is further dampened by mixed find-\nings regarding which tasks summarization actually helps. 
\nAlthough summarization has been examined with a wide \nrange of text materials, many researchers have pointed to fac-\ntors of these texts that seem likely to moderate the effects of \nsummarization (e.g'}, vector=None), + self.qdrant_client.delete( + collection_name=os.getenv('QDRANT_COLLECTION_NAME'), + points_selector=models.Filter(must=[ + models.FieldCondition( + key="metadata.course_name", + match=models.MatchValue(value=course_name), + ), + ]), + ) + + # Delete from Supabase + response = self.supabase_client.from_(os.getenv('MATERIALS_SUPABASE_TABLE')).delete().eq('metadata->>course_name', + course_name).execute() + print("supabase response: ", response) + return "Success" + except Exception as e: + err: str = f"ERROR IN delete_entire_course(): Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore + print(err) + return err + + # Create a method to delete file from s3, delete vector from qdrant, and delete row from supabase + def delete_data(self, s3_path: str, course_name: str): + """Delete file from S3, Qdrant, and Supabase.""" + print(f"Deleting {s3_path} from S3, Qdrant, and Supabase for course {course_name}") + try: + # Delete file from S3 + bucket_name = os.getenv('S3_BUCKET_NAME') + self.s3_client.delete_object(Bucket=bucket_name, Key=s3_path) + + # Delete from Qdrant + # docs for nested keys: https://qdrant.tech/documentation/concepts/filtering/#nested-key + # Qdrant "points" look like this: Record(id='000295ca-bd28-ac4a-6f8d-c245f7377f90', payload={'metadata': {'course_name': 'zotero-extreme', 'pagenumber_or_timestamp': 15, 'readable_filename': 'Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf', 's3_path': 'courses/zotero-extreme/Dunlosky et al. - 2013 - Improving Students’ Learning With Effective Learni.pdf'}, 'page_content': '18 \nDunlosky et al.\n3.3 Effects in representative educational contexts. Sev-\neral of the large summarization-training studies have been \nconducted in regular classrooms, indicating the feasibility of \ndoing so. For example, the study by A. King (1992) took place \nin the context of a remedial study-skills course for undergrad-\nuates, and the study by Rinehart et al. (1986) took place in \nsixth-grade classrooms, with the instruction led by students \nregular teachers. In these and other cases, students benefited \nfrom the classroom training. We suspect it may actually be \nmore feasible to conduct these kinds of training studies in \nclassrooms than in the laboratory, given the nature of the time \ncommitment for students. Even some of the studies that did \nnot involve training were conducted outside the laboratory; for \nexample, in the Bednall and Kehoe (2011) study on learning \nabout logical fallacies from Web modules (see data in Table 3), \nthe modules were actually completed as a homework assign-\nment. Overall, benefits can be observed in classroom settings; \nthe real constraint is whether students have the skill to suc-\ncessfully summarize, not whether summarization occurs in the \nlab or the classroom.\n3.4 Issues for implementation. Summarization would be \nfeasible for undergraduates or other learners who already \nknow how to summarize. For these students, summarization \nwould constitute an easy-to-implement technique that would \nnot take a lot of time to complete or understand. 
The only \nconcern would be whether these students might be better \nserved by some other strategy, but certainly summarization \nwould be better than the study strategies students typically \nfavor, such as highlighting and rereading (as we discuss in the \nsections on those strategies below). A trickier issue would \nconcern implementing the strategy with students who are not \nskilled summarizers. Relatively intensive training programs \nare required for middle school students or learners with learn-\ning disabilities to benefit from summarization. Such efforts \nare not misplaced; training has been shown to benefit perfor-\nmance on a range of measures, although the training proce-\ndures do raise practical issues (e.g., Gajria & Salvia, 1992: \n6.511 hours of training used for sixth through ninth graders \nwith learning disabilities; Malone & Mastropieri, 1991: 2 \ndays of training used for middle school students with learning \ndisabilities; Rinehart et al., 1986: 4550 minutes of instruc-\ntion per day for 5 days used for sixth graders). Of course, \ninstructors may want students to summarize material because \nsummarization itself is a goal, not because they plan to use \nsummarization as a study technique, and that goal may merit \nthe efforts of training.\nHowever, if the goal is to use summarization as a study \ntechnique, our question is whether training students would be \nworth the amount of time it would take, both in terms of the \ntime required on the part of the instructor and in terms of the \ntime taken away from students other activities. For instance, \nin terms of efficacy, summarization tends to fall in the middle \nof the pack when compared to other techniques. In direct \ncomparisons, it was sometimes more useful than rereading \n(Rewey, Dansereau, & Peel, 1991) and was as useful as note-\ntaking (e.g., Bretzing & Kulhavy, 1979) but was less powerful \nthan generating explanations (e.g., Bednall & Kehoe, 2011) or \nself-questioning (A. King, 1992).\n3.5 Summarization: Overall assessment. On the basis of the \navailable evidence, we rate summarization as low utility. It can \nbe an effective learning strategy for learners who are already \nskilled at summarizing; however, many learners (including \nchildren, high school students, and even some undergraduates) \nwill require extensive training, which makes this strategy less \nfeasible. Our enthusiasm is further dampened by mixed find-\nings regarding which tasks summarization actually helps. \nAlthough summarization has been examined with a wide \nrange of text materials, many researchers have pointed to fac-\ntors of these texts that seem likely to moderate the effects of \nsummarization (e.g'}, vector=None), + self.qdrant_client.delete( + collection_name=os.getenv('QDRANT_COLLECTION_NAME'), + points_selector=models.Filter(must=[ + models.FieldCondition( + key="metadata.s3_path", + match=models.MatchValue(value=s3_path), + ), + ]), + ) + + # Delete from Supabase + self.supabase_client.from_(os.getenv('MATERIALS_SUPABASE_TABLE')).delete().eq('metadata->>s3_path', + s3_path).eq('metadata->>course_name', + course_name).execute() + return "Success" + except Exception as e: + err: str = f"ERROR IN delete_data: Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore + print(err) + return err + + def getAll( + self, + course_name: str, + ): + """Get all course materials based on course name. 
+ Args : + course_name (as uploaded on supabase) + Returns : + list of dictionaries with distinct s3 path, readable_filename and course_name. + """ + response = self.supabase_client.table(os.getenv('MATERIALS_SUPABASE_TABLE')).select( + 'metadata->>course_name, metadata->>s3_path, metadata->>readable_filename').eq( # type: ignore + 'metadata->>course_name', course_name).execute() + + data = response.data + unique_combinations = set() + distinct_dicts = [] + + for item in data: + combination = (item['s3_path'], item['readable_filename'], item['course_name']) + if combination not in unique_combinations: + unique_combinations.add(combination) + distinct_dicts.append(item) + + return distinct_dicts + + def getTopContexts(self, search_query: str, course_name: str, token_limit: int = 4_000) -> Union[List[Dict], str]: + """Here's a summary of the work. + + /GET arguments + course name (optional) str: A json response with TBD fields. + + Returns + JSON: A json response with TBD fields. See main.py:getTopContexts docs. + or + String: An error message with traceback. + """ + try: + # TODO: change back to 50+ once we have bigger qdrant DB. + top_n = 80 # HARD CODE TO ENSURE WE HIT THE MAX TOKENS + start_time_overall = time.monotonic() + found_docs = self.vectorstore.similarity_search(search_query, k=top_n, filter={'course_name': course_name}) + if len(found_docs) == 0: + return [] + + pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" + + # count tokens at start and end, then also count each context. + token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + search_query) + valid_docs = [] + for d in found_docs: + doc_string = f"Document: {d.metadata['readable_filename']}{', page: ' + str(d.metadata['pagenumber_or_timestamp']) if d.metadata['pagenumber_or_timestamp'] else ''}\n{d.page_content}\n" + num_tokens, prompt_cost = count_tokens_and_cost(doc_string) + # print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, max_tokens: {token_limit}") + if token_counter + num_tokens <= token_limit: + token_counter += num_tokens + valid_docs.append(d) + else: + break + + print(f"Total tokens: {token_counter} total docs: {len(found_docs)} num docs used: {len(valid_docs)}") + print(f"Course: {course_name} ||| search_query: {search_query}") + print(f"⏰ ^^ Runtime of getTopContexts: {(time.monotonic() - start_time_overall):.2f} seconds") + + return self.format_for_json(valid_docs) + except Exception as e: + # return full traceback to front end + err: str = f"In /getTopContexts. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:\n{e}" # type: ignore + print(err) + return err + + def get_stuffed_prompt(self, search_query: str, course_name: str, token_limit: int = 7_000) -> str: + """ + Returns + String: A fully formatted prompt string. 
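The heart of `getTopContexts` is the token-budget loop: keep appending similarity-ranked documents until the next one would push the prompt past `token_limit`. A standalone sketch of that loop, using `tiktoken` directly as a stand-in for the project's `count_tokens_and_cost` helper (the model name is an assumption):

```python
from typing import List

import tiktoken
from langchain.schema import Document


def pack_contexts(docs: List[Document], prompt: str, token_limit: int = 4_000) -> List[Document]:
  """Greedily keep similarity-ranked docs until the token budget is exhausted."""
  enc = tiktoken.encoding_for_model("gpt-3.5-turbo")  # stand-in for count_tokens_and_cost
  used = len(enc.encode(prompt))
  selected: List[Document] = []
  for d in docs:
    page = d.metadata.get("pagenumber_or_timestamp")
    doc_string = (f"Document: {d.metadata['readable_filename']}"
                  f"{', page: ' + str(page) if page else ''}\n{d.page_content}\n")
    n = len(enc.encode(doc_string))
    if used + n > token_limit:
      break  # docs are sorted by similarity, so stop at the first overflow
    used += n
    selected.append(d)
  return selected
```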
+ """ + try: + top_n = 150 + start_time_overall = time.monotonic() + found_docs = self.vectorstore.similarity_search(search_query, k=top_n, filter={'course_name': course_name}) + if len(found_docs) == 0: + return search_query + + pre_prompt = "Please answer the following question. Use the context below, called your documents, only if it's helpful and don't use parts that are very irrelevant. It's good to quote from your documents directly, when you do always use Markdown footnotes for citations. Use react-markdown superscript to number the sources at the end of sentences (1, 2, 3...) and use react-markdown Footnotes to list the full document names for each number. Use ReactMarkdown aka 'react-markdown' formatting for super script citations, use semi-formal style. Feel free to say you don't know. \nHere's a few passages of the high quality documents:\n" + + # count tokens at start and end, then also count each context. + token_counter, _ = count_tokens_and_cost(pre_prompt + '\n\nNow please respond to my query: ' + search_query) + valid_docs = [] + for d in found_docs: + doc_string = f"---\nDocument: {d.metadata['readable_filename']}{', page: ' + str(d.metadata['pagenumber_or_timestamp']) if d.metadata['pagenumber_or_timestamp'] else ''}\n{d.page_content}\n" + num_tokens, prompt_cost = count_tokens_and_cost(doc_string) + print(f"Page: {d.page_content[:100]}...") + print(f"token_counter: {token_counter}, num_tokens: {num_tokens}, token_limit: {token_limit}") + if token_counter + num_tokens <= token_limit: + token_counter += num_tokens + valid_docs.append(d) + else: + continue + print("running continue") + + # Convert the valid_docs to full prompt + separator = '---\n' # between each context + context_text = separator.join( + f"Document: {d.metadata['readable_filename']}{', page: ' + str(d.metadata['pagenumber_or_timestamp']) if d.metadata['pagenumber_or_timestamp'] else ''}\n{d.page_content}\n" + for d in valid_docs) + + # Create the stuffedPrompt + stuffedPrompt = (pre_prompt + context_text + '\n\nNow please respond to my query: ' + search_query) + + TOTAL_num_tokens, prompt_cost = count_tokens_and_cost(stuffedPrompt, openai_model_name='gpt-4') + print(f"Total tokens: {TOTAL_num_tokens}, prompt_cost: {prompt_cost}") + print("total docs: ", len(found_docs)) + print("num docs used: ", len(valid_docs)) + + print(f"⏰ ^^ Runtime of getTopContexts: {(time.monotonic() - start_time_overall):.2f} seconds") + return stuffedPrompt + except Exception as e: + # return full traceback to front end + err: str = f"Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore + print(err) + return err + + def format_for_json(self, found_docs: List[Document]) -> List[Dict]: + """Formatting only. + {'course_name': course_name, 'contexts': [{'source_name': 'Lumetta_notes', 'source_location': 'pg. 19', 'text': 'In FSM, we do this...'}, {'source_name': 'Lumetta_notes', 'source_location': 'pg. 
20', 'text': 'In Assembly language, the code does that...'},]} + + Args: + found_docs (List[Document]): _description_ + + Raises: + Exception: _description_ + + Returns: + List[Dict]: _description_ + """ + + contexts = [{ + 'text': doc.page_content, + 'readable_filename': doc.metadata['readable_filename'], + 'course_name ': doc.metadata['course_name'], + 's3_path': doc.metadata['s3_path'], + 'pagenumber_or_timestamp': doc.metadata['pagenumber_or_timestamp'], + } for doc in found_docs] + + return contexts diff --git a/ai_ta_backend/web_scrape.py b/ai_ta_backend/web_scrape.py index 54b01402..b5a85928 100644 --- a/ai_ta_backend/web_scrape.py +++ b/ai_ta_backend/web_scrape.py @@ -1,23 +1,23 @@ import os import re import shutil -import time from tempfile import NamedTemporaryFile +import time from zipfile import ZipFile import boto3 # type: ignore -import requests from bs4 import BeautifulSoup +import requests from ai_ta_backend.aws import upload_data_files_to_s3 from ai_ta_backend.vector_database import Ingest def valid_url(url): - '''Returns the URL and it's content if it's good, otherwise returns false. Prints the status code.''' + """Returns the URL and it's content if it's good, otherwise returns false. Prints the status code.""" try: response = requests.get(url, allow_redirects=True, timeout=20) - + redirect_loop_counter = 0 while response.status_code == 301: # Check for permanent redirect @@ -27,10 +27,10 @@ def valid_url(url): redirect_url = response.headers['Location'] response = requests.head(redirect_url) redirect_loop_counter += 1 - + if response.status_code == 200: if ".pdf" in response.url: - if f"" not in str(response.content): + if "" not in str(response.content): content = response.content elif str(response.content).startswith("%PDF"): content = response.content @@ -44,22 +44,23 @@ def valid_url(url): print("URL is invalid:", url, "Error:", e) return (False, False) + # Ensures url is in the correct format -def base_url(url:str): +def base_url(url: str): try: # Get rid of double slashes in url # Create a base site for incomplete hrefs if url.startswith("https:"): - site= re.match(pattern=r'https:\/\/[a-zA-Z0-9.]*[a-z]', string=url).group(0) # type: ignore + site = re.match(pattern=r'https:\/\/[a-zA-Z0-9.]*[a-z]', string=url).group(0) # type: ignore url = re.sub(pattern=r"https:\/\/", repl="", string=url) url = re.sub(pattern=r"[\/\/]{2,}", repl="", string=url) - url = "https://"+url + url = "https://" + url return site elif url.startswith("http:"): - site = re.match(pattern=r'http:\/\/[a-zA-Z0-9.]*[a-z]', string=url).group(0) # type: ignore + site = re.match(pattern=r'http:\/\/[a-zA-Z0-9.]*[a-z]', string=url).group(0) # type: ignore url = re.sub(pattern=r"http:\/\/", repl="", string=url) url = re.sub(pattern=r"[\/\/]{2,}", repl="", string=url) - url = "http://"+url + url = "http://" + url return site else: return [] @@ -67,11 +68,12 @@ def base_url(url:str): print("Error:", e) return [] -def find_urls(soup:BeautifulSoup, urls:set, site:str): + +def find_urls(soup: BeautifulSoup, urls: set, site: str): try: - for i in soup.find_all("a"): # type: ignore + for i in soup.find_all("a"): # type: ignore try: - # getting the href tag + # getting the href tag href = i.attrs['href'] except KeyError as e: print("KeyError:", e, "for", i) @@ -81,9 +83,9 @@ def find_urls(soup:BeautifulSoup, urls:set, site:str): if href.startswith("http"): pass elif href.startswith("/"): - href = site+href + href = site + href else: - href = site+'/'+href + href = site + '/' + href urls.add(href) except 
Exception as e: @@ -92,8 +94,9 @@ def find_urls(soup:BeautifulSoup, urls:set, site:str): return urls -def remove_duplicates(urls:list): -# Delete repeated sites, with different URLs and keeping one + +def remove_duplicates(urls: list): + # Delete repeated sites, with different URLs and keeping one not_repeated_files = [] og_len = len(urls) print("deleting duplicate files") @@ -103,11 +106,19 @@ def remove_duplicates(urls:list): else: urls.remove(row) continue - print("deleted", og_len-len(not_repeated_files), "duplicate files") + print("deleted", og_len - len(not_repeated_files), "duplicate files") return urls -def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url_on:str=None, _depth:int=0, _soup:BeautifulSoup=None, _invalid_urls:list=[]): - '''Function gets titles of urls and the urls themselves''' + +def crawler(url: str, + max_urls: int = 1000, + max_depth: int = 3, + timeout: int = 1, + base_url_on: str = None, + _depth: int = 0, + _soup: BeautifulSoup = None, + _invalid_urls: list = []): + """Function gets titles of urls and the urls themselves""" # Prints the depth of the current search print("depth: ", _depth) url_contents = [] @@ -118,43 +129,41 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url base_url_on = str(base_url_on) amount = max_urls - + # Get rid of double slashes in url # Create a base site for incomplete hrefs base = base_url(url) - if base ==[]: + if base == []: return [] else: site = base - urls= set() + urls = set() if _soup: s = _soup else: url, s = valid_url(url) time.sleep(timeout) - url_contents.append((url,s)) + url_contents.append((url, s)) if url: try: body = s.find("body") - header = s.find("head") + header = s.find("head") except Exception as e: print("Error:", e) body = "" header = "" - - # Check for 403 Forbidden urls try: - if s.title.string.lower() == "403 forbidden" or s.title.string.lower() == 'page not found': # type: ignore + if s.title.string.lower() == "403 forbidden" or s.title.string.lower() == 'page not found': # type: ignore print("403 Forbidden") else: pass except Exception as e: print("Error:", e) - pass + pass if body != "" and header != "": urls = find_urls(body, urls, site) urls = find_urls(header, urls, site) @@ -192,12 +201,12 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url url_contents.append((url, s)) else: _invalid_urls.append(url) - + url_contents = remove_duplicates(url_contents) max_urls = max_urls - len(url_contents) print(max_urls, "urls left") - # recursively go through crawler until we reach the max amount of urls. + # recursively go through crawler until we reach the max amount of urls. 
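The recursive bookkeeping in `crawler` (depth counter, shared `_invalid_urls` list, repeated de-duplication) can be easier to follow as an iterative breadth-first pass. A rough sketch reusing this module's `valid_url`, `base_url`, and `find_urls` helpers; the queueing details are illustrative rather than an exact restatement of `crawler`:

```python
import time
from collections import deque

from bs4 import BeautifulSoup


def crawl_iterative(start_url: str, max_urls: int = 100, max_depth: int = 3, timeout: int = 1):
  """Breadth-first crawl bounded by max_urls and max_depth."""
  site = base_url(start_url)  # helper defined earlier in web_scrape.py
  if site == []:
    return []
  queue = deque([(start_url, 0)])
  seen, results = set(), []
  while queue and len(results) < max_urls:
    url, depth = queue.popleft()
    if url in seen or depth > max_depth:
      continue
    seen.add(url)
    url, content = valid_url(url)  # returns (False, False) for bad URLs
    time.sleep(timeout)            # be polite between requests
    if not url:
      continue
    results.append((url, content))
    if isinstance(content, BeautifulSoup):
      for href in find_urls(content, set(), site):
        queue.append((href, depth + 1))
  return results
```

As an aside, the mutable default `_invalid_urls: list = []` in `crawler` is shared across calls; the usual idiom is a `None` default that is replaced with a fresh list inside the function.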
for url in url_contents: if url[0] not in _invalid_urls: if max_urls > 0: @@ -209,13 +218,13 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url url_contents.extend(temp_data) url_contents = remove_duplicates(url_contents) else: - print("Depth exceeded:", _depth+1, "out of", max_depth) + print("Depth exceeded:", _depth + 1, "out of", max_depth) break else: break else: pass - + if _depth == 0: if len(url_contents) < amount: print("Max URLS not reached, returning all urls found:", len(url_contents), "out of", amount) @@ -226,7 +235,8 @@ def crawler(url:str, max_urls:int=1000, max_depth:int=3, timeout:int=1, base_url print(len(url_contents), "urls found") return url_contents -def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, timeout:int=1, base_url_on:str=None): + +def main_crawler(url: str, course_name: str, max_urls: int = 100, max_depth: int = 3, timeout: int = 1, base_url_on: str = None): """ Crawl a site and scrape its content and PDFs, then upload the data to S3 and ingest it. @@ -247,10 +257,10 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti base_url_on = str(base_url_on) ingester = Ingest() s3_client = boto3.client( - 's3', - aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), - aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), - ) + 's3', + aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), + aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), + ) if url.startswith("https://github.com/"): print("Begin Ingesting GitHub page") @@ -261,8 +271,6 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti print("Begin Ingesting Web page") data = crawler(url, max_urls, max_depth, timeout, base_url_on) - - # Clean some keys for a proper file name # todo: have a default title # titles = [value[1][1].title.string for value in data] @@ -270,18 +278,18 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti titles = [] for value in data: try: - titles.append(value[1].title.string) - except AttributeError as e: + titles.append(value[1].title.string) + except AttributeError: # if no title try: placeholder_title = re.findall(pattern=r'[a-zA-Z0-9.]*[a-z]', string=value[0])[1] - except Exception as e: + except Exception: placeholder_title = "Title Not Found" titles.append(placeholder_title) print(f"URL is missing a title, using this title instead: {placeholder_title}") try: - clean = [re.match(r"[a-zA-Z0-9\s]*", title).group(0) for title in titles] # type: ignore + clean = [re.match(r"[a-zA-Z0-9\s]*", title).group(0) for title in titles] # type: ignore except Exception as e: print("Error:", e) clean = titles @@ -298,9 +306,8 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti else: path_name.append(value) counter += 1 - - print("Cleaned title names", path_name) + print("Cleaned title names", path_name) # Upload each html to S3 print("Uploading files to S3") @@ -310,10 +317,10 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti for i, key in enumerate(data): if ".pdf" in key[0]: with NamedTemporaryFile(suffix=".pdf") as temp_pdf: - if key[1] != "" or key[1] != None: + if key[1] != "" or key[1] is not None: temp_pdf.write(key[1]) temp_pdf.seek(0) - s3_upload_path = "courses/"+ course_name + "/" + path_name[i] + ".pdf" + s3_upload_path = "courses/" + course_name + "/" + path_name[i] + ".pdf" paths.append(s3_upload_path) with open(temp_pdf.name, 'rb') as f: print("Uploading PDF to 
S3") @@ -324,10 +331,10 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti print("No PDF to upload", key[1]) else: with NamedTemporaryFile(suffix=".html") as temp_html: - if key[1] != "" or key[1] != None: + if key[1] != "" or key[1] is not None: temp_html.write(key[1].encode('utf-8')) temp_html.seek(0) - s3_upload_path = "courses/"+ course_name + "/" + path_name[i] + ".html" + s3_upload_path = "courses/" + course_name + "/" + path_name[i] + ".html" paths.append(s3_upload_path) with open(temp_html.name, 'rb') as f: print("Uploading html to S3") @@ -342,46 +349,45 @@ def main_crawler(url:str, course_name:str, max_urls:int=100, max_depth:int=3, ti print("Successfully uploaded", counter, "files to S3") print("Finished /web-scrape") + # Download an MIT course using its url -def mit_course_download(url:str, course_name:str, local_dir:str): - ingester = Ingest() - base = "https://ocw.mit.edu" - if url.endswith("download"): - pass - else: - url = url + "download" +def mit_course_download(url: str, course_name: str, local_dir: str): + ingester = Ingest() + if url.endswith("download"): + pass + else: + url = url + "download" - r = requests.get(url) - soup = BeautifulSoup(r.text,"html.parser") + r = requests.get(url) + soup = BeautifulSoup(r.text, "html.parser") - zip = '' - for ref in soup.find_all("a"): - if ref.attrs['href'].endswith("zip"): - zip = ref.attrs['href'] - - site = zip - print('site', site) - r = requests.get(url=site, stream=True) + zip = '' + for ref in soup.find_all("a"): + if ref.attrs['href'].endswith("zip"): + zip = ref.attrs['href'] - zip_file = local_dir + ".zip" + site = zip + print('site', site) + r = requests.get(url=site, stream=True) - try: - with open(zip_file, 'wb') as fd: - for chunk in r.iter_content(chunk_size=128): - fd.write(chunk) - print("course downloaded!") - except Exception as e: - print("Error:", e, site) - - with ZipFile(zip_file, 'r') as zObject: - zObject.extractall( - path=local_dir) - - shutil.move(local_dir+"/"+"robots.txt", local_dir+"/static_resources") - s3_paths = upload_data_files_to_s3(course_name, local_dir+"/static_resources") - success_fail = ingester.bulk_ingest(s3_paths, course_name) # type: ignore - - shutil.move(zip_file, local_dir) - shutil.rmtree(local_dir) - print("Finished Ingest") - return success_fail \ No newline at end of file + zip_file = local_dir + ".zip" + + try: + with open(zip_file, 'wb') as fd: + for chunk in r.iter_content(chunk_size=128): + fd.write(chunk) + print("course downloaded!") + except Exception as e: + print("Error:", e, site) + + with ZipFile(zip_file, 'r') as zObject: + zObject.extractall(path=local_dir) + + shutil.move(local_dir + "/" + "robots.txt", local_dir + "/static_resources") + s3_paths = upload_data_files_to_s3(course_name, local_dir + "/static_resources") + success_fail = ingester.bulk_ingest(s3_paths, course_name) # type: ignore + + shutil.move(zip_file, local_dir) + shutil.rmtree(local_dir) + print("Finished Ingest") + return success_fail diff --git a/docs/api_reference.md b/docs/api_reference.md index 50e6e735..81ac0c4f 100644 --- a/docs/api_reference.md +++ b/docs/api_reference.md @@ -1,15 +1,15 @@ -# API Reference - -## Top Level API Reference - -::: ai_ta_backend.main - -## Backend endpoints - -### Database endpoints (Supabase, QDrant) - -::: ai_ta_backend.vector_database - -### AWS endpoints - -::: ai_ta_backend.aws +# API Reference + +## Top Level API Reference + +::: ai_ta_backend.main + +## Backend endpoints + +### Database endpoints (Supabase, QDrant) + 
+::: ai_ta_backend.vector_database + +### AWS endpoints + +::: ai_ta_backend.aws diff --git a/docs/index.md b/docs/index.md index 233d8330..b672f32d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,47 +1,47 @@ ---- -icon: material/emoticon-happy ---- - -# Welcome to UIUC AI Chatbot - -## Learning to write documentation - -It's just markdown, with _truly OPTIONAL_ extra features. - -``` python -import numpy as np -for i in range(10): - print(i) -``` - -## Callout boxes - -!!! tip "Tip: Use callout boxes to highlight important information." - - It's critical to avoid this. - -``` text title="how to use Admonitions, or 'callout' boxes" -!!! danger "Danger: don't do this!" - - It's critical to avoid this. - -``` - -``` text title="Types of Admonitions, or 'callout' boxes" -note -abstract -info -tip -success -question -warning -failure -danger -bug -example -quote -``` - -## Contribute docs - -Here's the reference for creating new docs: +--- +icon: material/emoticon-happy +--- + +# Welcome to UIUC AI Chatbot + +## Learning to write documentation + +It's just markdown, with _truly OPTIONAL_ extra features. + +```python +import numpy as np +for i in range(10): + print(i) +``` + +## Callout boxes + +!!! tip "Tip: Use callout boxes to highlight important information." + +It's critical to avoid this. + +```text title="how to use Admonitions, or 'callout' boxes" +!!! danger "Danger: don't do this!" + + It's critical to avoid this. + +``` + +```text title="Types of Admonitions, or 'callout' boxes" +note +abstract +info +tip +success +question +warning +failure +danger +bug +example +quote +``` + +## Contribute docs + +Here's the reference for creating new docs: diff --git a/docs/vector_search.md b/docs/vector_search.md index d1e18063..7af1e860 100644 --- a/docs/vector_search.md +++ b/docs/vector_search.md @@ -1,3 +1,3 @@ -# Here's how to do vector search - -TBD. See [References](/reference) for more info. +# Here's how to do vector search + +TBD. See [References](/reference) for more info. diff --git a/lats_testing.ipynb b/lats_testing.ipynb new file mode 100644 index 00000000..13840cce --- /dev/null +++ b/lats_testing.ipynb @@ -0,0 +1,1129 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "73080044-b2f7-49e2-a315-57753d871f8f", + "metadata": {}, + "source": [ + "Langgraph LATS testing\n", + "\n", + "Implement LATS for each of the step.\n", + "Have a generic LATS class.\n", + "First, have LATS for planning step - the system prompt should be rna_seq planner prompt.\n", + "Once the plan is solved, have LATS for each of the step.\n", + "The plan generated in the first LATS will be given to a parser. 
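As a rough illustration of the parsing step described in this cell (turning one generated plan into per-step inputs for the follow-up LATS runs), assuming the plan arrives as a numbered Markdown list:

```python
import re
from typing import List


def parse_plan(plan_markdown: str) -> List[str]:
  """Split a numbered plan ('1. ...', '2. ...') into individual step strings."""
  steps = re.split(r"\n\s*\d+\.\s+", "\n" + plan_markdown)
  return [s.strip() for s in steps if s.strip()]


# parse_plan("1. Run FastQC\n2. Align with STAR\n3. Count with featureCounts")
# -> ['Run FastQC', 'Align with STAR', 'Count with featureCounts']
```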
Parse each of the planning steps\n", + "Then the lats get steps one by one and the plan and each steps response will be given to the next lats class" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "cc5a747b-c728-4fc8-af3d-f6f557bcfa1e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "\n", + "from dotenv import load_dotenv\n", + "load_dotenv(override=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "946388e9-f23b-4de3-8a2c-4211f42a19e4", + "metadata": {}, + "outputs": [], + "source": [ + "# define language model\n", + "from langchain_openai import ChatOpenAI, AzureChatOpenAI\n", + "\n", + "llm = AzureChatOpenAI(\n", + " azure_deployment=\"gpt-4-128k\",\n", + " openai_api_version=os.getenv(\"AZURE_0125_MODEL_VERSION\"),\n", + " temperature=0,\n", + " azure_endpoint=os.getenv(\"AZURE_0125_MODEL_ENDPOINT\"),\n", + " openai_api_key=os.getenv(\"AZURE_0125_MODEL_API_KEY\"),\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "abe3d21a-4dea-48e2-9d09-f2b1093fc3ff", + "metadata": {}, + "outputs": [], + "source": [ + "# define tools \n", + "from langchain.agents import load_tools\n", + "from langchain_community.tools import VectorStoreQATool\n", + "from langchain.tools import (BaseTool, StructuredTool)\n", + "\n", + "from code_intrepreter_sanbox import E2B_class\n", + "from vector_db import get_vectorstore_retriever_tool\n", + "\n", + "\n", + "def get_tools(langsmith_run_id: str, sync=True):\n", + " search = load_tools([\"serpapi\"])\n", + " code_execution_class = E2B_class(langsmith_run_id=langsmith_run_id)\n", + " e2b_code_execution_tool = StructuredTool.from_function(\n", + " func=code_execution_class.run_python_code,\n", + " name=\"Python-Code-Execution\",\n", + " description=\"Executes Python3 code in an safe Docker container.\",\n", + " )\n", + " e2b_shell_tool = StructuredTool.from_function(\n", + " func=code_execution_class.run_shell,\n", + " name=\"Shell-commands-except-for-git\",\n", + " description=\n", + " \"Run shell commands to, for example, execute shell scripts or R scripts. It is in the same environment as the Code Execution tool.\",\n", + " )\n", + " docs_tools: List[VectorStoreQATool] = [\n", + " get_vectorstore_retriever_tool(\n", + " course_name='langchain-docs',\n", + " name='Langchain-docs',\n", + " description=\n", + " \"Build context-aware, reasoning applications with LangChain's flexible abstractions and AI-first toolkit.\"),\n", + " get_vectorstore_retriever_tool(\n", + " course_name='ml4bio-star',\n", + " name='STAR-docs',\n", + " description=\n", + " 'Basic STAR workflow consists of 2 steps: (1) Generating genome indexes files and (2) Mapping reads to the genome'\n", + " ),\n", + " get_vectorstore_retriever_tool(\n", + " course_name='ml4bio-fastqc',\n", + " name='FastQC-docs',\n", + " description=\n", + " 'FastQC aims to provide a simple way to do some quality control checks on raw sequence data coming from high throughput sequencing pipelines. It provides a modular set of analyses which you can use to give a quick impression of whether your data has any problems of which you should be aware before doing any further analysis. 
It works with data from BAM, SAM or FastQ files'\n", + " ),\n", + " get_vectorstore_retriever_tool(\n", + " course_name='ml4bio-multiqc',\n", + " name='MultiQC-docs',\n", + " description=\n", + " \"MultiQC is a reporting tool that parses results and statistics from bioinformatics tool outputs, such as log files and console outputs. It helps to summarize experiments containing multiple samples and multiple analysis steps. It's designed to be placed at the end of pipelines or to be run manually when you've finished running your tools.\"\n", + " ),\n", + " get_vectorstore_retriever_tool(\n", + " course_name='ml4bio-bioconductor',\n", + " name='Bioconductor-docs',\n", + " description=\n", + " \"Bioconductor is a project that contains hundreds of individual R packages. They're all high quality libraries that provide widespread access to a broad range of powerful statistical and graphical methods for the analysis of genomic data. Some of them also facilitate the inclusion of biological metadata in the analysis of genomic data, e.g. literature data from PubMed, annotation data from Entrez genes.\"\n", + " ),\n", + " ]\n", + "\n", + " tools: list[BaseTool] = search + docs_tools + [e2b_code_execution_tool, e2b_shell_tool] \n", + " return tools\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "fb6c5732-ae20-4b86-8e55-42a877fbcc90", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/minum/Documents/NCSA/UIUC-Chatbot/ai-ta-backend/.venv310/lib/python3.10/site-packages/qdrant_client/qdrant_remote.py:116: UserWarning: Api key is used with unsecure connection.\n", + " warnings.warn(\"Api key is used with unsecure connection.\")\n", + "/Users/minum/Documents/NCSA/UIUC-Chatbot/ai-ta-backend/.venv310/lib/python3.10/site-packages/qdrant_client/qdrant_remote.py:116: UserWarning: Api key is used with unsecure connection.\n", + " warnings.warn(\"Api key is used with unsecure connection.\")\n", + "/Users/minum/Documents/NCSA/UIUC-Chatbot/ai-ta-backend/.venv310/lib/python3.10/site-packages/qdrant_client/qdrant_remote.py:116: UserWarning: Api key is used with unsecure connection.\n", + " warnings.warn(\"Api key is used with unsecure connection.\")\n", + "/Users/minum/Documents/NCSA/UIUC-Chatbot/ai-ta-backend/.venv310/lib/python3.10/site-packages/qdrant_client/qdrant_remote.py:116: UserWarning: Api key is used with unsecure connection.\n", + " warnings.warn(\"Api key is used with unsecure connection.\")\n", + "/Users/minum/Documents/NCSA/UIUC-Chatbot/ai-ta-backend/.venv310/lib/python3.10/site-packages/qdrant_client/qdrant_remote.py:116: UserWarning: Api key is used with unsecure connection.\n", + " warnings.warn(\"Api key is used with unsecure connection.\")\n" + ] + }, + { + "data": { + "text/plain": [ + "[Tool(name='Search', description='A search engine. Useful for when you need to answer questions about current events. 
Input should be a search query.', func=, params={'engine': 'google', 'google_domain': 'google.com', 'gl': 'us', 'hl': 'en'}, serpapi_api_key='edf00a75c49d95767f0f7b99cddb763bea1145f2f94cf9f88879bbcab19c9a8f', aiosession=None)>, coroutine=, params={'engine': 'google', 'google_domain': 'google.com', 'gl': 'us', 'hl': 'en'}, serpapi_api_key='edf00a75c49d95767f0f7b99cddb763bea1145f2f94cf9f88879bbcab19c9a8f', aiosession=None)>),\n", + " VectorStoreQATool(name='Langchain-docs', description=\"Build context-aware, reasoning applications with LangChain's flexible abstractions and AI-first toolkit.\", vectorstore=, llm=AzureChatOpenAI(client=, async_client=, model_name='gpt-4-0613', temperature=0.0, openai_api_key=SecretStr('**********'), openai_proxy='', request_timeout=180.0, max_retries=3, azure_endpoint='https://uiuc-chat-canada-east.openai.azure.com/', deployment_name='gpt-4-from-canada-east', openai_api_version='2023-05-15', openai_api_type='azure')),\n", + " VectorStoreQATool(name='STAR-docs', description='Basic STAR workflow consists of 2 steps: (1) Generating genome indexes files and (2) Mapping reads to the genome', vectorstore=, llm=AzureChatOpenAI(client=, async_client=, model_name='gpt-4-0613', temperature=0.0, openai_api_key=SecretStr('**********'), openai_proxy='', request_timeout=180.0, max_retries=3, azure_endpoint='https://uiuc-chat-canada-east.openai.azure.com/', deployment_name='gpt-4-from-canada-east', openai_api_version='2023-05-15', openai_api_type='azure')),\n", + " VectorStoreQATool(name='FastQC-docs', description='FastQC aims to provide a simple way to do some quality control checks on raw sequence data coming from high throughput sequencing pipelines. It provides a modular set of analyses which you can use to give a quick impression of whether your data has any problems of which you should be aware before doing any further analysis. It works with data from BAM, SAM or FastQ files', vectorstore=, llm=AzureChatOpenAI(client=, async_client=, model_name='gpt-4-0613', temperature=0.0, openai_api_key=SecretStr('**********'), openai_proxy='', request_timeout=180.0, max_retries=3, azure_endpoint='https://uiuc-chat-canada-east.openai.azure.com/', deployment_name='gpt-4-from-canada-east', openai_api_version='2023-05-15', openai_api_type='azure')),\n", + " VectorStoreQATool(name='MultiQC-docs', description=\"MultiQC is a reporting tool that parses results and statistics from bioinformatics tool outputs, such as log files and console outputs. It helps to summarize experiments containing multiple samples and multiple analysis steps. It's designed to be placed at the end of pipelines or to be run manually when you've finished running your tools.\", vectorstore=, llm=AzureChatOpenAI(client=, async_client=, model_name='gpt-4-0613', temperature=0.0, openai_api_key=SecretStr('**********'), openai_proxy='', request_timeout=180.0, max_retries=3, azure_endpoint='https://uiuc-chat-canada-east.openai.azure.com/', deployment_name='gpt-4-from-canada-east', openai_api_version='2023-05-15', openai_api_type='azure')),\n", + " VectorStoreQATool(name='Bioconductor-docs', description=\"Bioconductor is a project that contains hundreds of individual R packages. They're all high quality libraries that provide widespread access to a broad range of powerful statistical and graphical methods for the analysis of genomic data. Some of them also facilitate the inclusion of biological metadata in the analysis of genomic data, e.g. 
literature data from PubMed, annotation data from Entrez genes.\", vectorstore=, llm=AzureChatOpenAI(client=, async_client=, model_name='gpt-4-0613', temperature=0.0, openai_api_key=SecretStr('**********'), openai_proxy='', request_timeout=180.0, max_retries=3, azure_endpoint='https://uiuc-chat-canada-east.openai.azure.com/', deployment_name='gpt-4-from-canada-east', openai_api_version='2023-05-15', openai_api_type='azure')),\n", + " StructuredTool(name='Python-Code-Execution', description='Python-Code-Execution(code: str) - Executes Python3 code in an safe Docker container.', args_schema=, func=>),\n", + " StructuredTool(name='Shell-commands-except-for-git', description='Shell-commands-except-for-git(shell_command: str) - Run shell commands to, for example, execute shell scripts or R scripts. It is in the same environment as the Code Execution tool.', args_schema=, func=>)]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Exception in thread e2b-sandbox-refresh:\n", + "Traceback (most recent call last):\n", + " File \"/Users/minum/Documents/NCSA/UIUC-Chatbot/ai-ta-backend/.venv310/lib/python3.10/site-packages/e2b/sandbox/sandbox_connection.py\", line 431, in _refresh\n", + " api.sandboxes_sandbox_id_refreshes_post(\n", + " File \"pydantic/decorator.py\", line 40, in pydantic.decorator.validate_arguments.validate.wrapper_function\n", + " from contextlib import _GeneratorContextManager\n", + " File \"pydantic/decorator.py\", line 134, in pydantic.decorator.ValidatedFunction.call\n", + " \n", + " File \"pydantic/decorator.py\", line 206, in pydantic.decorator.ValidatedFunction.execute\n", + " \n", + " File \"/Users/minum/Documents/NCSA/UIUC-Chatbot/ai-ta-backend/.venv310/lib/python3.10/site-packages/e2b/api/v1/client/api/sandboxes_api.py\", line 379, in sandboxes_sandbox_id_refreshes_post\n", + " return self.sandboxes_sandbox_id_refreshes_post_with_http_info(\n", + " File \"pydantic/decorator.py\", line 40, in pydantic.decorator.validate_arguments.validate.wrapper_function\n", + " from contextlib import _GeneratorContextManager\n", + " File \"pydantic/decorator.py\", line 134, in pydantic.decorator.ValidatedFunction.call\n", + " \n", + " File \"pydantic/decorator.py\", line 206, in pydantic.decorator.ValidatedFunction.execute\n", + " \n", + " File \"/Users/minum/Documents/NCSA/UIUC-Chatbot/ai-ta-backend/.venv310/lib/python3.10/site-packages/e2b/api/v1/client/api/sandboxes_api.py\", line 492, in sandboxes_sandbox_id_refreshes_post_with_http_info\n", + " return self.api_client.call_api(\n", + " File \"/Users/minum/Documents/NCSA/UIUC-Chatbot/ai-ta-backend/.venv310/lib/python3.10/site-packages/e2b/api/v1/client/api_client.py\", line 466, in call_api\n", + " return self.__call_api(*args)\n", + " File \"/Users/minum/Documents/NCSA/UIUC-Chatbot/ai-ta-backend/.venv310/lib/python3.10/site-packages/e2b/api/v1/client/api_client.py\", line 242, in __call_api\n", + " raise e\n", + " File \"/Users/minum/Documents/NCSA/UIUC-Chatbot/ai-ta-backend/.venv310/lib/python3.10/site-packages/e2b/api/v1/client/api_client.py\", line 229, in __call_api\n", + " response_data = self.request(\n", + " File \"/Users/minum/Documents/NCSA/UIUC-Chatbot/ai-ta-backend/.venv310/lib/python3.10/site-packages/e2b/api/v1/client/api_client.py\", line 507, in request\n", + " return self.rest_client.post_request(\n", + " File 
\"/Users/minum/Documents/NCSA/UIUC-Chatbot/ai-ta-backend/.venv310/lib/python3.10/site-packages/e2b/api/v1/client/rest.py\", line 352, in post_request\n", + " return self.request(\n", + " File \"/Users/minum/Documents/NCSA/UIUC-Chatbot/ai-ta-backend/.venv310/lib/python3.10/site-packages/e2b/api/v1/client/rest.py\", line 259, in request\n", + " raise NotFoundException(http_resp=r)\n", + "e2b.api.v1.client.exceptions.NotFoundException: (404)\n", + "Reason: Not Found\n", + "HTTP response headers: HTTPHeaderDict({'content-type': 'application/json; charset=utf-8', 'date': 'Mon, 11 Mar 2024 19:38:54 GMT', 'Content-Length': '97', 'via': '1.1 google', 'Alt-Svc': 'h3=\":443\"; ma=2592000,h3-29=\":443\"; ma=2592000'})\n", + "HTTP response body: {\"code\":404,\"message\":\"Error refreshing sandbox - sandbox 'idelq6lkojtjj5btki6qq' was not found\"}\n", + "\n", + "\n", + "The above exception was the direct cause of the following exception:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/opt/homebrew/Cellar/python@3.10/3.10.13/Frameworks/Python.framework/Versions/3.10/lib/python3.10/threading.py\", line 1016, in _bootstrap_inner\n", + " self.run()\n", + " File \"/Users/minum/Documents/NCSA/UIUC-Chatbot/ai-ta-backend/.venv310/lib/python3.10/site-packages/ipykernel/ipkernel.py\", line 761, in run_closure\n", + " _threading_Thread_run(self)\n", + " File \"/opt/homebrew/Cellar/python@3.10/3.10.13/Frameworks/Python.framework/Versions/3.10/lib/python3.10/threading.py\", line 953, in run\n", + " self._target(*self._args, **self._kwargs)\n", + " File \"/Users/minum/Documents/NCSA/UIUC-Chatbot/ai-ta-backend/.venv310/lib/python3.10/site-packages/e2b/sandbox/sandbox_connection.py\", line 438, in _refresh\n", + " raise SandboxException(\n", + "e2b.sandbox.exception.SandboxException: Sandbox idelq6lkojtjj5btki6qq-22b47deb failed because it cannot be found\n" + ] + } + ], + "source": [ + "# define tools for the executor agent\n", + "import uuid\n", + "from langgraph.prebuilt.tool_executor import ToolExecutor, ToolInvocation\n", + "\n", + "langsmith_run_id = str(uuid.uuid4()) # for Langsmith\n", + "tools = get_tools(langsmith_run_id)\n", + "tool_executor = ToolExecutor(tools=tools)\n", + "tools" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "5372d8fb-4580-40d8-b84c-fbf13420219f", + "metadata": {}, + "outputs": [], + "source": [ + "# generate planner - first candidate - first lats\n", + "planner_system_prompt = \"\"\"\n", + "You are a world-class programmer and AI assistant capable of executing any goal related to software development, genAI, LLMs, and full-stack technologies.\n", + "\n", + "First, write a step-by-step plan for the task. The plan should be descriptive and well-explained. \n", + "\n", + "The main objective is to plan and execute the workflow efficiently. Break down the execution into small, informed steps rather than attempting everything in one go.\n", + "\n", + "You have access to a variety of tools, including browser, github_tools for interacting with GitHub, and multiple vectorstore instances. Utilize the browser for internet searches and github_tools for all interactions with GitHub repositories. For code execution, rely on PythonRepl and shell tools available in the Docker environment.\n", + "\n", + "When you send a message containing code, it will be executed in a Docker container. 
You have been granted full permission to execute any code necessary to complete the task within this Docker environment using PythonRepl and shell tools as required.\n", + "\n", + "When referencing files, assume they exist in the GitHub repository. Use github_tools for all interactions with GitHub and operate within the current working directory.\n", + "\n", + "Communicate with the user in Markdown.\n", + "\n", + "Conclude your plan with ''.\n", + "\n", + "Verify the solution and provide evidence where possible.\n", + "\n", + "\"\"\"\n", + "\n", + "planner_prompt_template = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", planner_system_prompt),\n", + " (\"user\", \"{input}\"),\n", + " MessagesPlaceholder(variable_name=\"messages\", optional=True),\n", + " ]\n", + ")\n", + "\n", + "\n", + "initial_planner_chain = planner_prompt_template | llm.bind_tools(tools=tools).with_config(\n", + " run_name=\"GenerateInitialPlannerCandidate\"\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "4dc88470-cb00-4453-9cc5-2facbac4f554", + "metadata": {}, + "outputs": [], + "source": [ + "executor_system_prompt = \"\"\"\n", + "You are a world-class programmer and AI assistant capable of executing any goal related to software development, genAI, LLMs, and full-stack technologies.\n", + "\n", + "Write code to achieve the given task. You have access to a variety of tools, including browser, github_tools for interacting with GitHub, and multiple vectorstore instances. Utilize the browser for internet searches and github_tools for all interactions with GitHub repositories. For code execution, rely on PythonRepl and shell tools available in the Docker environment.\n", + "\n", + "When you send a message containing code, it will be executed in a Docker container. You have been granted full permission to execute any code necessary to complete the task within this Docker environment using PythonRepl and shell tools as required.\n", + "\n", + "Before any execution task, prepare the development environment, whether that be a notebook, .sh, .py, .ipynb, .R, or other file types. Incrementally develop, execute, and debug the code, committing changes to GitHub regularly.\n", + "\n", + "For package installations, use pip, and strive to install all necessary packages in a single command. In case of failures, debug and install them correctly, adhering to the Pydantic structure to avoid validation errors.\n", + "\n", + "When referencing files, assume they exist in the GitHub repository. Use github_tools for all interactions with GitHub and operate within the current working directory.\n", + "\n", + "Prefer universally applicable packages that are likely to be pre-installed and compatible across different applications. For example, ffmpeg and pandoc are recommended for their broad support and functionality.\n", + "\n", + "Include steps and EXACT CODE SNIPPETS if they are applicable to the task. Do not suggest code that requires user modifications, and ensure all code is complete and executable as is.\n", + "\n", + "For code that needs to be saved to a file, indicate this with # filename: at the start of the code block. Only include one code block per response and avoid asking users to copy and paste results. Use the print function for outputs.\n", + "\n", + "Execute your code and provide results. If an error arises, correct it and provide the updated code. 
\n", + "\n", + "If a solution isn't reached after successful code execution, reassess your approach, gather more information, and propose an alternative method.\n", + "\n", + "Verify the solution and provide evidence where possible.\n", + "\n", + "End the interaction with \"TERMINATE\" once the task is completed.\n", + "\n", + "\"\"\"\n", + "\n", + "executor_prompt_template = ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"system\", executor_system_prompt),\n", + " (\"user\", \"{input}\"),\n", + " MessagesPlaceholder(variable_name=\"messages\", optional=True),\n", + " ]\n", + ")\n", + "\n", + "\n", + "initial_executor_chain = executor_prompt_template | llm.bind_tools(tools=tools).with_config(\n", + " run_name=\"GenerateInitialExecutorCandidate\"\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "ca27d0bb-2387-466f-a8cc-44cef35467b7", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "parser = JsonOutputToolsParser(return_id=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "a348cc90-6e98-4063-82a9-e23440f28013", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ------- MIGHT TRUNCATE MESSAGES ------- \n", + "message content='\\nYou are a world-class programmer and AI assistant capable of executing any goal related to software development, genAI, LLMs, and full-stack technologies.\\n\\nFirst, write a plan. Recap the plan between each code block. This recapping is necessary to maintain the context due to the short-term memory constraints.\\n\\nWhen you send a message containing code, it will be executed in a Docker container. You have been granted full permission to execute any code necessary to complete the task within this Docker environment using PythonRepl and shell tools as required.\\n\\nYou have access to a variety of tools, including browser, github_tools for interacting with GitHub, and multiple vectorstore instances. Utilize the browser for internet searches and github_tools for all interactions with GitHub repositories. For code execution, rely on PythonRepl and shell tools available in the Docker environment.\\n\\nBefore any execution task, prepare the development environment, whether that be a notebook, .sh, .py, .ipynb, .R, or other file types. Incrementally develop, execute, and debug the code, committing changes to GitHub regularly.\\n\\nFor package installations, use pip, and strive to install all necessary packages in a single command. In case of failures, debug and install them correctly, adhering to the Pydantic structure to avoid validation errors.\\n\\nWhen referencing files, assume they exist in the GitHub repository. Use github_tools for all interactions with GitHub and operate within the current working directory.\\n\\nPrefer universally applicable packages that are likely to be pre-installed and compatible across different applications. For example, ffmpeg and pandoc are recommended for their broad support and functionality.\\n\\nCommunicate with the user in Markdown.\\n\\nThe main objective is to plan and execute the workflow efficiently. Break down the execution into small, informed steps rather than attempting everything in one go. Regularly commit to GitHub, aiming to push a comprehensive verbose notebook upon completion.\\n\\nConclude your plan with \\'\\'.\\n\\nInclude steps and EXACT CODE SNIPPETS if they are applicable to the task. 
Do not suggest code that requires user modifications, and ensure all code is complete and executable as is.\\n\\nFor code that needs to be saved to a file, indicate this with # filename: at the start of the code block. Only include one code block per response and avoid asking users to copy and paste results. Use the print function for outputs.\\n\\nIf an error arises, correct it and provide the updated code. If a solution isn\\'t reached after successful code execution, reassess your approach, gather more information, and propose an alternative method.\\n\\nVerify the solution and provide evidence where possible.\\n\\nEnd the interaction with \"TERMINATE\" once the task is completed.\\n\\n'\n", + "message content='Implement an RNA-Sequence Analysis Workflow using DESEQ2.'\n", + "HumanMessage\n", + "/\\/\\/\\/\\/\\ num_tokens_in_messages 566\n", + "/\\/\\/\\/\\/\\ Hard coded context window size of: 120000\n", + "message content='\\nYou are a world-class programmer and AI assistant capable of executing any goal related to software development, genAI, LLMs, and full-stack technologies.\\n\\nFirst, write a plan. Recap the plan between each code block. This recapping is necessary to maintain the context due to the short-term memory constraints.\\n\\nWhen you send a message containing code, it will be executed in a Docker container. You have been granted full permission to execute any code necessary to complete the task within this Docker environment using PythonRepl and shell tools as required.\\n\\nYou have access to a variety of tools, including browser, github_tools for interacting with GitHub, and multiple vectorstore instances. Utilize the browser for internet searches and github_tools for all interactions with GitHub repositories. For code execution, rely on PythonRepl and shell tools available in the Docker environment.\\n\\nBefore any execution task, prepare the development environment, whether that be a notebook, .sh, .py, .ipynb, .R, or other file types. Incrementally develop, execute, and debug the code, committing changes to GitHub regularly.\\n\\nFor package installations, use pip, and strive to install all necessary packages in a single command. In case of failures, debug and install them correctly, adhering to the Pydantic structure to avoid validation errors.\\n\\nWhen referencing files, assume they exist in the GitHub repository. Use github_tools for all interactions with GitHub and operate within the current working directory.\\n\\nPrefer universally applicable packages that are likely to be pre-installed and compatible across different applications. For example, ffmpeg and pandoc are recommended for their broad support and functionality.\\n\\nCommunicate with the user in Markdown.\\n\\nThe main objective is to plan and execute the workflow efficiently. Break down the execution into small, informed steps rather than attempting everything in one go. Regularly commit to GitHub, aiming to push a comprehensive verbose notebook upon completion.\\n\\nConclude your plan with \\'\\'.\\n\\nInclude steps and EXACT CODE SNIPPETS if they are applicable to the task. Do not suggest code that requires user modifications, and ensure all code is complete and executable as is.\\n\\nFor code that needs to be saved to a file, indicate this with # filename: at the start of the code block. Only include one code block per response and avoid asking users to copy and paste results. Use the print function for outputs.\\n\\nIf an error arises, correct it and provide the updated code. 
If a solution isn\\'t reached after successful code execution, reassess your approach, gather more information, and propose an alternative method.\\n\\nVerify the solution and provide evidence where possible.\\n\\nEnd the interaction with \"TERMINATE\" once the task is completed.\\n\\n'\n", + "message content='Implement an RNA-Sequence Analysis Workflow using DESEQ2.'\n", + "HumanMessage\n", + "message content='\\nYou are a world-class programmer and AI assistant capable of executing any goal related to software development, genAI, LLMs, and full-stack technologies.\\n\\nFirst, write a plan. Recap the plan between each code block. This recapping is necessary to maintain the context due to the short-term memory constraints.\\n\\nWhen you send a message containing code, it will be executed in a Docker container. You have been granted full permission to execute any code necessary to complete the task within this Docker environment using PythonRepl and shell tools as required.\\n\\nYou have access to a variety of tools, including browser, github_tools for interacting with GitHub, and multiple vectorstore instances. Utilize the browser for internet searches and github_tools for all interactions with GitHub repositories. For code execution, rely on PythonRepl and shell tools available in the Docker environment.\\n\\nBefore any execution task, prepare the development environment, whether that be a notebook, .sh, .py, .ipynb, .R, or other file types. Incrementally develop, execute, and debug the code, committing changes to GitHub regularly.\\n\\nFor package installations, use pip, and strive to install all necessary packages in a single command. In case of failures, debug and install them correctly, adhering to the Pydantic structure to avoid validation errors.\\n\\nWhen referencing files, assume they exist in the GitHub repository. Use github_tools for all interactions with GitHub and operate within the current working directory.\\n\\nPrefer universally applicable packages that are likely to be pre-installed and compatible across different applications. For example, ffmpeg and pandoc are recommended for their broad support and functionality.\\n\\nCommunicate with the user in Markdown.\\n\\nThe main objective is to plan and execute the workflow efficiently. Break down the execution into small, informed steps rather than attempting everything in one go. Regularly commit to GitHub, aiming to push a comprehensive verbose notebook upon completion.\\n\\nConclude your plan with \\'\\'.\\n\\nInclude steps and EXACT CODE SNIPPETS if they are applicable to the task. Do not suggest code that requires user modifications, and ensure all code is complete and executable as is.\\n\\nFor code that needs to be saved to a file, indicate this with # filename: at the start of the code block. Only include one code block per response and avoid asking users to copy and paste results. Use the print function for outputs.\\n\\nIf an error arises, correct it and provide the updated code. 
If a solution isn\\'t reached after successful code execution, reassess your approach, gather more information, and propose an alternative method.\\n\\nVerify the solution and provide evidence where possible.\\n\\nEnd the interaction with \"TERMINATE\" once the task is completed.\\n\\n'\n", + "message content='Implement an RNA-Sequence Analysis Workflow using DESEQ2.'\n", + "HumanMessage\n", + "message_dicts [{'role': 'system', 'content': '\\nYou are a world-class programmer and AI assistant capable of executing any goal related to software development, genAI, LLMs, and full-stack technologies.\\n\\nFirst, write a plan. Recap the plan between each code block. This recapping is necessary to maintain the context due to the short-term memory constraints.\\n\\nWhen you send a message containing code, it will be executed in a Docker container. You have been granted full permission to execute any code necessary to complete the task within this Docker environment using PythonRepl and shell tools as required.\\n\\nYou have access to a variety of tools, including browser, github_tools for interacting with GitHub, and multiple vectorstore instances. Utilize the browser for internet searches and github_tools for all interactions with GitHub repositories. For code execution, rely on PythonRepl and shell tools available in the Docker environment.\\n\\nBefore any execution task, prepare the development environment, whether that be a notebook, .sh, .py, .ipynb, .R, or other file types. Incrementally develop, execute, and debug the code, committing changes to GitHub regularly.\\n\\nFor package installations, use pip, and strive to install all necessary packages in a single command. In case of failures, debug and install them correctly, adhering to the Pydantic structure to avoid validation errors.\\n\\nWhen referencing files, assume they exist in the GitHub repository. Use github_tools for all interactions with GitHub and operate within the current working directory.\\n\\nPrefer universally applicable packages that are likely to be pre-installed and compatible across different applications. For example, ffmpeg and pandoc are recommended for their broad support and functionality.\\n\\nCommunicate with the user in Markdown.\\n\\nThe main objective is to plan and execute the workflow efficiently. Break down the execution into small, informed steps rather than attempting everything in one go. Regularly commit to GitHub, aiming to push a comprehensive verbose notebook upon completion.\\n\\nConclude your plan with \\'\\'.\\n\\nInclude steps and EXACT CODE SNIPPETS if they are applicable to the task. Do not suggest code that requires user modifications, and ensure all code is complete and executable as is.\\n\\nFor code that needs to be saved to a file, indicate this with # filename: at the start of the code block. Only include one code block per response and avoid asking users to copy and paste results. Use the print function for outputs.\\n\\nIf an error arises, correct it and provide the updated code. 
If a solution isn\\'t reached after successful code execution, reassess your approach, gather more information, and propose an alternative method.\\n\\nVerify the solution and provide evidence where possible.\\n\\nEnd the interaction with \"TERMINATE\" once the task is completed.\\n\\n'}, {'role': 'user', 'content': 'Implement an RNA-Sequence Analysis Workflow using DESEQ2.'}]\n" + ] + }, + { + "data": { + "text/plain": [ + "AIMessage(content='To implement an RNA-Sequence Analysis Workflow using DESeq2, we will follow a structured approach. DESeq2 is a popular R package used for analyzing count data from RNA sequencing experiments to find differentially expressed genes. The workflow will involve several steps, including quality control, alignment, counting, and differential expression analysis. Here\\'s a detailed plan:\\n\\n1. **Environment Setup**: Ensure R and necessary packages (DESeq2, Bioconductor) are installed.\\n2. **Quality Control**: Use FastQC to assess the quality of the raw sequencing data.\\n3. **Alignment**: Align the reads to a reference genome using a tool like STAR or HISAT2.\\n4. **Counting**: Count the number of reads that map to each gene using featureCounts or a similar tool.\\n5. **Differential Expression Analysis**: Use DESeq2 to perform differential expression analysis on the count data.\\n6. **Results Interpretation**: Interpret the results, focusing on significantly differentially expressed genes.\\n\\n### Step 1: Environment Setup\\n\\nBefore we start, we need to ensure that R is installed and set up correctly, along with the DESeq2 package from Bioconductor. This step will be executed in the Docker environment.\\n\\n```R\\n# filename: setup_environment.R\\nif (!requireNamespace(\"BiocManager\", quietly = TRUE))\\n install.packages(\"BiocManager\")\\nBiocManager::install(\"DESeq2\")\\n```\\n\\n### Step 2: Quality Control with FastQC\\n\\nWe will use FastQC to perform quality control checks on the raw sequencing data. This step ensures that the data is of high quality before proceeding with further analysis.\\n\\n```shell\\n# This is a shell command to be executed in the Docker environment.\\nfastqc data/*.fastq.gz -o fastqc_results/\\n```\\n\\n### Step 3: Alignment with STAR\\n\\nNext, we align the reads to a reference genome using STAR. 
This step requires a pre-built genome index, which can be generated using STAR if not already available.\\n\\n```shell\\n# This is a shell command to be executed in the Docker environment.\\nSTAR --genomeDir /path/to/genomeIndex --readFilesIn sample1.fastq.gz sample2.fastq.gz --readFilesCommand zcat --outFileNamePrefix aligned/\\n```\\n\\n### Step 4: Counting with featureCounts\\n\\nAfter alignment, we count the number of reads mapping to each gene using featureCounts.\\n\\n```shell\\n# This is a shell command to be executed in the Docker environment.\\nfeatureCounts -a /path/to/annotation.gtf -o counts.txt aligned/Aligned.out.sam\\n```\\n\\n### Step 5: Differential Expression Analysis with DESeq2\\n\\nWe will then use DESeq2 for differential expression analysis on the count data.\\n\\n```R\\n# filename: deseq2_analysis.R\\nlibrary(DESeq2)\\ncountData <- read.csv(\"counts.txt\", sep=\"\\\\t\", header=TRUE, row.names=1)\\ncolData <- read.csv(\"colData.csv\", header=TRUE, row.names=1)\\ndds <- DESeqDataSetFromMatrix(countData = countData, colData = colData, design = ~ condition)\\ndds <- DESeq(dds)\\nres <- results(dds)\\nwrite.csv(as.data.frame(res), file=\"DESeq2_results.csv\")\\n```\\n\\n### Step 6: Results Interpretation\\n\\nFinally, we interpret the results from DESeq2, focusing on genes that are significantly differentially expressed.\\n\\nThis plan outlines the steps and tools required to implement an RNA-Sequence Analysis Workflow using DESeq2. Each step will be executed sequentially, ensuring the environment is correctly set up and all necessary packages and tools are installed.\\n\\n', additional_kwargs={'tool_calls': [{'id': 'call_sbIlTEqjnrZSWyyjEExyhm6z', 'function': {'arguments': '{\"shell_command\":\"Rscript setup_environment.R\"}', 'name': 'Shell-commands-except-for-git'}, 'type': 'function'}]})" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "initial_response = initial_answer_chain.invoke(\n", + " {\"input\": \"Implement an RNA-Sequence Analysis Workflow using DESEQ2.\"}\n", + ")\n", + "initial_response" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "5f06e2aa-f2b3-4946-bbbd-2d330840be12", + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import annotations\n", + "\n", + "import math\n", + "from collections import deque\n", + "from typing import List, Optional\n", + "\n", + "from langchain_core.messages import AIMessage, BaseMessage, HumanMessage\n", + "\n", + "\n", + "class Node:\n", + " def __init__(\n", + " self,\n", + " messages: List[BaseMessage],\n", + " reflection: Reflection,\n", + " parent: Optional[Node] = None,\n", + " ):\n", + " self.messages = messages\n", + " self.parent = parent\n", + " self.children = []\n", + " self.value = 0\n", + " self.visits = 0\n", + " self.reflection = reflection\n", + " self.depth = parent.depth + 1 if parent is not None else 1\n", + " self._is_solved = reflection.found_solution if reflection else False\n", + " if self._is_solved:\n", + " self._mark_tree_as_solved()\n", + " self.backpropagate(reflection.normalized_score)\n", + "\n", + " def __repr__(self) -> str:\n", + " return (\n", + " f\"\"\n", + " )\n", + "\n", + " @property\n", + " def is_solved(self):\n", + " \"\"\"If any solutions exist, we can end the search.\"\"\"\n", + " return self._is_solved\n", + "\n", + " @property\n", + " def is_terminal(self):\n", + " return not self.children\n", + "\n", + " @property\n", + " def best_child(self):\n", + " \"\"\"Select the child with 
the highest UCT to search next.\"\"\"\n", + " if not self.children:\n", + " return None\n", + " return max(self.children, key=lambda child: child.upper_confidence_bound())\n", + "\n", + " @property\n", + " def best_child_score(self):\n", + " \"\"\"Return the child with the highest value.\"\"\"\n", + " if not self.children:\n", + " return None\n", + " return max(self.children, key=lambda child: int(child.is_solved) * child.value)\n", + "\n", + " @property\n", + " def height(self) -> int:\n", + " \"\"\"Check for how far we've rolled out the tree.\"\"\"\n", + " if self.children:\n", + " return 1 + max([child.height for child in self.children])\n", + " return 1\n", + "\n", + " def upper_confidence_bound(self, exploration_weight=1.0):\n", + " \"\"\"Return the UCT score. This helps balance exploration vs. exploitation of a branch.\"\"\"\n", + " if self.parent is None:\n", + " raise ValueError(\"Cannot obtain UCT from root node\")\n", + " if self.visits == 0:\n", + " return self.value\n", + " # Encourages exploitation of high-value trajectories\n", + " average_reward = self.value / self.visits\n", + " # Encourages exploration of less-visited trajectories\n", + " exploration_term = math.sqrt(math.log(self.parent.visits) / self.visits)\n", + " return average_reward + exploration_weight * exploration_term\n", + "\n", + " def backpropagate(self, reward: float):\n", + " \"\"\"Update the score of this node and its parents.\"\"\"\n", + " node = self\n", + " while node:\n", + " node.visits += 1\n", + " node.value = (node.value * (node.visits - 1) + reward) / node.visits\n", + " node = node.parent\n", + "\n", + " def get_messages(self, include_reflections: bool = True):\n", + " if include_reflections:\n", + " return self.messages + [self.reflection.as_message()]\n", + " return self.messages\n", + "\n", + " def get_trajectory(self, include_reflections: bool = True) -> List[BaseMessage]:\n", + " \"\"\"Get messages representing this search branch.\"\"\"\n", + " messages = []\n", + " node = self\n", + " while node:\n", + " messages.extend(\n", + " node.get_messages(include_reflections=include_reflections)[::-1]\n", + " )\n", + " node = node.parent\n", + " # Reverse the final back-tracked trajectory to return in the correct order\n", + " return messages[::-1] # root solution, reflection, child 1, ...\n", + "\n", + " def get_best_solution(self):\n", + " \"\"\"Return the best solution from within the current sub-tree.\"\"\"\n", + " all_nodes = [self]\n", + " nodes = deque()\n", + " nodes.append(self)\n", + " while nodes:\n", + " node = nodes.popleft()\n", + " all_nodes.extend(node.children)\n", + " for n in node.children:\n", + " nodes.append(n)\n", + " best_node = max(\n", + " all_nodes,\n", + " # We filter out all non-terminal, non-solution trajectories\n", + " key=lambda node: int(node.is_terminal and node.is_solved) * node.value,\n", + " )\n", + " return best_node\n", + " \n", + " def _get_all_children(self):\n", + " all_nodes = []\n", + " nodes = deque()\n", + " nodes.append(self)\n", + " while nodes:\n", + " node = nodes.popleft()\n", + " all_nodes.extend(node.children)\n", + " for n in node.children:\n", + " nodes.append(n)\n", + " return all_nodes\n", + "\n", + " def _mark_tree_as_solved(self):\n", + " parent = self.parent\n", + " while parent:\n", + " parent._is_solved = True\n", + " parent = parent.parent" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "f7210358-b13e-458f-b8b1-6ccfa49ffc41", + "metadata": {}, + "outputs": [], + "source": [ + "# graph state\n", + "\n", + "from 
typing_extensions import TypedDict\n", + "import operator\n", + "from typing import Annotated, List\n", + "\n", + "\n", + "class PlannerTreeState(TypedDict):\n", + " # The full tree\n", + " root: Node\n", + " # The original input\n", + " input: str\n", + "\n", + "\n", + "class ExecutorTreeState(TypedDict):\n", + " # The full tree\n", + " root: Node\n", + " # The plan input - one step at a time\n", + " input: str\n", + " # No defaults here: TypedDict fields cannot carry default values.\n", + " # The operator.add reducer tells LangGraph to accumulate these lists.\n", + " previous_steps: Annotated[List[str], operator.add]\n", + " previous_results: Annotated[List[str], operator.add]\n", + "\n", + "\n", + "# after each plan execution, clean the tree\n", + "# executortree.clear()\n", + "\n", + " " ] }, { "cell_type": "code", "execution_count": 50, "id": "8d61cef7-a56d-427a-95dc-3e28ef388d84", "metadata": {}, "outputs": [], "source": [ "# Define the planner node we will add to the graph\n", + "def generate_initial_planner_response(state: PlannerTreeState) -> dict:\n", + " \"\"\"Generate the initial candidate response.\"\"\"\n", + " res = initial_planner_chain.invoke({\"input\": state[\"input\"]})\n", + " parsed = parser.invoke(res)\n", + " tool_responses = tool_executor.batch(\n", + " [ToolInvocation(tool=r[\"type\"], tool_input=r[\"args\"]) for r in parsed]\n", + " )\n", + " output_messages = [res] + [\n", + " ToolMessage(content=json.dumps(resp), tool_call_id=tool_call[\"id\"])\n", + " for resp, tool_call in zip(tool_responses, parsed)\n", + " ]\n", + " reflection = reflection_chain.invoke(\n", + " {\"input\": state[\"input\"], \"candidate\": output_messages}\n", + " )\n", + " root = Node(output_messages, reflection=reflection)\n", + " return {\n", + " **state,\n", + " \"root\": root,\n", + " }" ] }, { "cell_type": "code", "execution_count": 51, "id": "11564697-80bd-433d-94c3-6876b259c20b", "metadata": {}, "outputs": [], "source": [ "# Define the executor node we will add to the graph\n", + "def generate_initial_executor_response(state: ExecutorTreeState) -> dict:\n", + " \"\"\"Generate the initial candidate response.\"\"\"\n", + " res = initial_executor_chain.invoke({\"input\": state[\"input\"]})\n", + " # state is a plain dict (TypedDict), so use key access rather than attribute access\n", + " state[\"previous_steps\"].append(state[\"input\"])\n", + " parsed = parser.invoke(res)\n", + " tool_responses = tool_executor.batch(\n", + " [ToolInvocation(tool=r[\"type\"], tool_input=r[\"args\"]) for r in parsed]\n", + " )\n", + " output_messages = [res] + [\n", + " ToolMessage(content=json.dumps(resp), tool_call_id=tool_call[\"id\"])\n", + " for resp, tool_call in zip(tool_responses, parsed)\n", + " ]\n", + " reflection = reflection_chain.invoke(\n", + " {\"input\": state[\"input\"], \"candidate\": output_messages}\n", + " )\n", + " root = Node(output_messages, reflection=reflection)\n", + " return {\n", + " **state,\n", + " \"root\": root,\n", + " }" ] }, { "cell_type": "code", "execution_count": 52, "id": "85f9d9a3-2cf4-452b-bd34-bea35cb4f419", "metadata": {}, "outputs": [], "source": [ "# This generates N candidate values\n", + "# for a single input to sample actions from the environment\n", + "from langchain_core.prompt_values import ChatPromptValue\n", + "from langchain_core.runnables import RunnableConfig\n", + "\n", + "\n", + "def generate_candidates(messages: ChatPromptValue, config: RunnableConfig):\n", + " n = config[\"configurable\"].get(\"N\", 5)\n", + " bound_kwargs = llm.bind_tools(tools=tools).kwargs\n", + " print(\"bound kwargs\")\n", + " print(bound_kwargs)\n", + " chat_result = llm.generate(\n", + " [messages.to_messages()],\n", + " n=n,\n", + " callbacks=config[\"callbacks\"],\n", + " 
run_name=\"GenerateCandidates\",\n", + " **bound_kwargs\n", + " )\n", + " print(\"chat result : \", chat_result)\n", + " return [gen.message for gen in chat_result.generations[0]]\n", + "\n", + "\n", + "expansion_chain = prompt_template | generate_candidates" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "4fd12716-adbd-4e92-8cd7-c276ee6daea5", + "metadata": {}, + "outputs": [], + "source": [ + "# Candidate generation node\n", + "# We will package the candidate generation and reflection steps in the following \"expand\" node. \n", + "\n", + "from collections import defaultdict, deque\n", + "\n", + "\n", + "def plannerexpand(state: PlannerTreeState, config: RunnableConfig) -> dict:\n", + " \"\"\"Starting from the \"best\" node in the tree, generate N candidates for the next step.\"\"\"\n", + " print(\"RunnableConfig :\", config)\n", + " print(\"RunnableConfig configurable:\", config[\"configurable\"])\n", + " root = state[\"root\"]\n", + " best_candidate: Node = root.best_child if root.children else root\n", + " messages = best_candidate.get_trajectory()\n", + " print(\"messages :\")\n", + " print(messages)\n", + " # Generate N candidates from the single child candidate\n", + " new_candidates = expansion_chain.invoke(\n", + " {\"input\": state[\"input\"], \"messages\": messages}, config\n", + " )\n", + " parsed = parser.batch(new_candidates)\n", + " flattened = [\n", + " (i, tool_call)\n", + " for i, tool_calls in enumerate(parsed)\n", + " for tool_call in tool_calls\n", + " ]\n", + " tool_responses = tool_executor.batch(\n", + " [\n", + " ToolInvocation(tool=tool_call[\"type\"], tool_input=tool_call[\"args\"])\n", + " for _, tool_call in flattened\n", + " ]\n", + " )\n", + " collected_responses = defaultdict(list)\n", + " for (i, tool_call), resp in zip(flattened, tool_responses):\n", + " collected_responses[i].append(\n", + " ToolMessage(content=json.dumps(resp), tool_call_id=tool_call[\"id\"])\n", + " )\n", + " output_messages = []\n", + " for i, candidate in enumerate(new_candidates):\n", + " output_messages.append([candidate] + collected_responses[i])\n", + " print(\"candidate output messages\")\n", + " print(output_messages)\n", + " # Reflect on each candidate\n", + " # For tasks with external validation, you'd add that here.\n", + " reflections = reflection_chain.batch(\n", + " [{\"input\": state[\"input\"], \"candidate\": msges} for msges in output_messages],\n", + " config,\n", + " )\n", + " # Grow tree\n", + " child_nodes = [\n", + " Node(cand, parent=best_candidate, reflection=reflection)\n", + " for cand, reflection in zip(output_messages, reflections)\n", + " ]\n", + " print(\"child nodes\")\n", + " print(child_nodes)\n", + " best_candidate.children.extend(child_nodes)\n", + " # We have already extended the tree directly, so we just return the state\n", + " return state" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "5132b75c-98ef-4adf-bc47-b76afc8126e7", + "metadata": {}, + "outputs": [], + "source": [ + "# Candidate generation node\n", + "# We will package the candidate generation and reflection steps in the following \"expand\" node. 
\n", + "\n", + "from collections import defaultdict, deque\n", + "\n", + "\n", + "def executorexpand(state: ExecutorTreeState, config: RunnableConfig) -> dict:\n", + " \"\"\"Starting from the \"best\" node in the tree, generate N candidates for the next step.\"\"\"\n", + " print(\"RunnableConfig :\", config)\n", + " print(\"RunnableConfig configurable:\", config[\"configurable\"])\n", + " root = state[\"root\"]\n", + " best_candidate: Node = root.best_child if root.children else root\n", + " messages = best_candidate.get_trajectory()\n", + " print(\"messages :\")\n", + " print(messages)\n", + " # Generate N candidates from the single child candidate\n", + " new_candidates = expansion_chain.invoke(\n", + " {\"input\": state[\"current_step\"], \"messages\": messages}, config\n", + " )\n", + " parsed = parser.batch(new_candidates)\n", + " flattened = [\n", + " (i, tool_call)\n", + " for i, tool_calls in enumerate(parsed)\n", + " for tool_call in tool_calls\n", + " ]\n", + " tool_responses = tool_executor.batch(\n", + " [\n", + " ToolInvocation(tool=tool_call[\"type\"], tool_input=tool_call[\"args\"])\n", + " for _, tool_call in flattened\n", + " ]\n", + " )\n", + " collected_responses = defaultdict(list)\n", + " for (i, tool_call), resp in zip(flattened, tool_responses):\n", + " collected_responses[i].append(\n", + " ToolMessage(content=json.dumps(resp), tool_call_id=tool_call[\"id\"])\n", + " )\n", + " output_messages = []\n", + " for i, candidate in enumerate(new_candidates):\n", + " output_messages.append([candidate] + collected_responses[i])\n", + " print(\"candidate output messages\")\n", + " print(output_messages)\n", + " # Reflect on each candidate\n", + " # For tasks with external validation, you'd add that here.\n", + " reflections = reflection_chain.batch(\n", + " [{\"input\": state[\"input\"], \"candidate\": msges} for msges in output_messages],\n", + " config,\n", + " )\n", + " # Grow tree\n", + " child_nodes = [\n", + " Node(cand, parent=best_candidate, reflection=reflection)\n", + " for cand, reflection in zip(output_messages, reflections)\n", + " ]\n", + " print(\"child nodes\")\n", + " print(child_nodes)\n", + " best_candidate.children.extend(child_nodes)\n", + " print(\"best_candidate\")\n", + " print(best_candidate)\n", + " state[\"previous_steps\"].append(state[\"current_step\"])\n", + " state[\"previous_results\"].append(best_candidate.messages[-1])\n", + " # We have already extended the tree directly, so we just return the state\n", + " return state" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "9f34882e-2720-4880-95cc-191d4e55a276", + "metadata": {}, + "outputs": [], + "source": [ + "# create graph\n", + "# With those two nodes defined, we are ready to define the graph. 
After each agent step, we have the option of finishing.\n", + "\n", + "from langgraph.graph import END, StateGraph\n", + "\n", + "\n", + "def should_loop(state: TreeState):\n", + " \"\"\"Determine whether to continue the tree search.\"\"\"\n", + " root = state[\"root\"]\n", + " if root.is_solved:\n", + " return END\n", + " if root.height > 5:\n", + " return END\n", + " return \"expand\"\n", + "\n", + "\n", + "planner_builder = StateGraph(PlannerTreeState)\n", + "planner_builder.add_node(\"start\", generate_initial_planner_response)\n", + "planner_builder.add_node(\"expand\", plannerexpand)\n", + "planner_builder.set_entry_point(\"start\")\n", + "\n", + "\n", + "planner_builder.add_conditional_edges(\n", + " \"start\",\n", + " # Either expand/rollout or finish\n", + " should_loop,\n", + ")\n", + "planner_builder.add_conditional_edges(\n", + " \"expand\",\n", + " # Either continue to rollout or finish\n", + " should_loop,\n", + ")\n", + "\n", + "planner_graph = planner_builder.compile()" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "e5265edb-3b4d-43aa-bffb-61b5f5885718", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ------- MIGHT TRUNCATE MESSAGES ------- \n", + "message content=\"\\nYou are a world-class programmer and AI assistant capable of executing any goal related to software development, genAI, LLMs, and full-stack technologies.\\n\\nFirst, write a step-by-step plan for the task. The plan should be descriptive and well-explained. \\n\\nThe main objective is to plan and execute the workflow efficiently. Break down the execution into small, informed steps rather than attempting everything in one go.\\n\\nYou have access to a variety of tools, including browser, github_tools for interacting with GitHub, and multiple vectorstore instances. Utilize the browser for internet searches and github_tools for all interactions with GitHub repositories. For code execution, rely on PythonRepl and shell tools available in the Docker environment.\\n\\nWhen you send a message containing code, it will be executed in a Docker container. You have been granted full permission to execute any code necessary to complete the task within this Docker environment using PythonRepl and shell tools as required.\\n\\nWhen referencing files, assume they exist in the GitHub repository. Use github_tools for all interactions with GitHub and operate within the current working directory.\\n\\nCommunicate with the user in Markdown.\\n\\nConclude your plan with ''.\\n\\nVerify the solution and provide evidence where possible.\\n\\n\"\n", + "message content='Implement an RNA-Sequence Analysis Workflow using DESeq2.'\n", + "HumanMessage\n", + "/\\/\\/\\/\\/\\ num_tokens_in_messages 270\n", + "/\\/\\/\\/\\/\\ Hard coded context window size of: 120000\n", + "message content=\"\\nYou are a world-class programmer and AI assistant capable of executing any goal related to software development, genAI, LLMs, and full-stack technologies.\\n\\nFirst, write a step-by-step plan for the task. The plan should be descriptive and well-explained. \\n\\nThe main objective is to plan and execute the workflow efficiently. Break down the execution into small, informed steps rather than attempting everything in one go.\\n\\nYou have access to a variety of tools, including browser, github_tools for interacting with GitHub, and multiple vectorstore instances. Utilize the browser for internet searches and github_tools for all interactions with GitHub repositories. 
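Note that `generate_candidates` reads the branching factor from `config["configurable"].get("N", 5)`, while the `planner_graph.stream(...)` call later in the notebook passes no config, so every expansion uses the default of five candidates. A different branching factor (or a recursion limit) could be supplied through the runnable config; the snippet below is a usage sketch, not code from this diff:

```python
# Usage sketch: override the branching factor read by generate_candidates.
question = "Implement an RNA-Sequence Analysis Workflow using DESeq2."
config = {"configurable": {"N": 3}, "recursion_limit": 50}

for step in planner_graph.stream({"input": question}, config):
    step_name, step_state = next(iter(step.items()))
    print(step_name, "tree height:", step_state["root"].height)
```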
For code execution, rely on PythonRepl and shell tools available in the Docker environment.\\n\\nWhen you send a message containing code, it will be executed in a Docker container. You have been granted full permission to execute any code necessary to complete the task within this Docker environment using PythonRepl and shell tools as required.\\n\\nWhen referencing files, assume they exist in the GitHub repository. Use github_tools for all interactions with GitHub and operate within the current working directory.\\n\\nCommunicate with the user in Markdown.\\n\\nConclude your plan with ''.\\n\\nVerify the solution and provide evidence where possible.\\n\\n\"\n", + "message content='Implement an RNA-Sequence Analysis Workflow using DESeq2.'\n", + "HumanMessage\n", + "message content=\"\\nYou are a world-class programmer and AI assistant capable of executing any goal related to software development, genAI, LLMs, and full-stack technologies.\\n\\nFirst, write a step-by-step plan for the task. The plan should be descriptive and well-explained. \\n\\nThe main objective is to plan and execute the workflow efficiently. Break down the execution into small, informed steps rather than attempting everything in one go.\\n\\nYou have access to a variety of tools, including browser, github_tools for interacting with GitHub, and multiple vectorstore instances. Utilize the browser for internet searches and github_tools for all interactions with GitHub repositories. For code execution, rely on PythonRepl and shell tools available in the Docker environment.\\n\\nWhen you send a message containing code, it will be executed in a Docker container. You have been granted full permission to execute any code necessary to complete the task within this Docker environment using PythonRepl and shell tools as required.\\n\\nWhen referencing files, assume they exist in the GitHub repository. Use github_tools for all interactions with GitHub and operate within the current working directory.\\n\\nCommunicate with the user in Markdown.\\n\\nConclude your plan with ''.\\n\\nVerify the solution and provide evidence where possible.\\n\\n\"\n", + "message content='Implement an RNA-Sequence Analysis Workflow using DESeq2.'\n", + "HumanMessage\n", + "message_dicts [{'role': 'system', 'content': \"\\nYou are a world-class programmer and AI assistant capable of executing any goal related to software development, genAI, LLMs, and full-stack technologies.\\n\\nFirst, write a step-by-step plan for the task. The plan should be descriptive and well-explained. \\n\\nThe main objective is to plan and execute the workflow efficiently. Break down the execution into small, informed steps rather than attempting everything in one go.\\n\\nYou have access to a variety of tools, including browser, github_tools for interacting with GitHub, and multiple vectorstore instances. Utilize the browser for internet searches and github_tools for all interactions with GitHub repositories. For code execution, rely on PythonRepl and shell tools available in the Docker environment.\\n\\nWhen you send a message containing code, it will be executed in a Docker container. You have been granted full permission to execute any code necessary to complete the task within this Docker environment using PythonRepl and shell tools as required.\\n\\nWhen referencing files, assume they exist in the GitHub repository. 
Use github_tools for all interactions with GitHub and operate within the current working directory.\\n\\nCommunicate with the user in Markdown.\\n\\nConclude your plan with ''.\\n\\nVerify the solution and provide evidence where possible.\\n\\n\"}, {'role': 'user', 'content': 'Implement an RNA-Sequence Analysis Workflow using DESeq2.'}]\n", + " ------- MIGHT TRUNCATE MESSAGES ------- \n", + "message content='Reflect and grade the assistant response to the user question below.'\n", + "message content='Implement an RNA-Sequence Analysis Workflow using DESeq2.'\n", + "HumanMessage\n", + "message content=\"To implement an RNA-Sequence Analysis Workflow using DESeq2, we will follow a structured approach. DESeq2 is a method for differential gene expression analysis based on the negative binomial distribution. It's widely used in bioinformatics for analyzing count data from RNA sequencing experiments. The workflow will involve several steps, from quality control of the raw data to the final differential expression analysis using DESeq2. Here's a step-by-step plan:\\n\\n### Step 1: Quality Control of Raw Data\\n- **1.1.** Use FastQC to perform initial quality checks on the raw sequencing data.\\n- **1.2.** Based on FastQC reports, decide if any preprocessing (like trimming) is needed.\\n\\n### Step 2: Preprocessing and Cleaning\\n- **2.1.** If necessary, use tools like Trimmomatic or Cutadapt to trim adapters and low-quality bases.\\n- **2.2.** Optionally, use FastQC again to check the quality of the data post-trimming.\\n\\n### Step 3: Alignment\\n- **3.1.** Align the cleaned reads to the reference genome using an aligner like STAR or HISAT2.\\n- **3.2.** Convert the output SAM files to BAM files for easier handling.\\n\\n### Step 4: Quantification\\n- **4.1.** Use featureCounts (from the Subread package) or a similar tool to count the number of reads mapping to each gene.\\n\\n### Step 5: Differential Expression Analysis with DESeq2\\n- **5.1.** Prepare the count matrix and the metadata (experimental design) for DESeq2 analysis.\\n- **5.2.** Use DESeq2 (within the R environment) to perform differential expression analysis.\\n- **5.3.** Interpret the results, focusing on significantly differentially expressed genes.\\n\\n### Step 6: Visualization and Further Analysis\\n- **6.1.** Generate plots (MA plot, volcano plot) to visualize the results.\\n- **6.2.** Perform additional analyses as needed, such as GO enrichment or pathway analysis.\\n\\n### Step 7: Reporting\\n- **7.1.** Use MultiQC or a similar tool to compile the results and statistics from the various steps into a comprehensive report.\\n\\n### Execution Plan:\\n- **Execution of Steps 1 & 2:** We will start by executing FastQC for quality control and, if necessary, proceed with data cleaning using a trimming tool.\\n- **Execution of Steps 3 & 4:** Next, we will align the reads to the reference genome and perform quantification.\\n- **Execution of Steps 5, 6, & 7:** Finally, we will carry out the differential expression analysis using DESeq2, visualize the results, and compile a comprehensive report.\\n\\nFor each of these steps, we will use the appropriate tools and scripts, ensuring that the workflow is executed efficiently and accurately. 
This plan outlines a comprehensive approach to RNA-Seq analysis, from raw data to differential expression analysis using DESeq2.\\n\\n\"\n", + "AIMessage\n", + "/\\/\\/\\/\\/\\ num_tokens_in_messages 663\n", + "/\\/\\/\\/\\/\\ Hard coded context window size of: 120000\n", + "message content='Reflect and grade the assistant response to the user question below.'\n", + "message content='Implement an RNA-Sequence Analysis Workflow using DESeq2.'\n", + "HumanMessage\n", + "message content=\"To implement an RNA-Sequence Analysis Workflow using DESeq2, we will follow a structured approach. DESeq2 is a method for differential gene expression analysis based on the negative binomial distribution. It's widely used in bioinformatics for analyzing count data from RNA sequencing experiments. The workflow will involve several steps, from quality control of the raw data to the final differential expression analysis using DESeq2. Here's a step-by-step plan:\\n\\n### Step 1: Quality Control of Raw Data\\n- **1.1.** Use FastQC to perform initial quality checks on the raw sequencing data.\\n- **1.2.** Based on FastQC reports, decide if any preprocessing (like trimming) is needed.\\n\\n### Step 2: Preprocessing and Cleaning\\n- **2.1.** If necessary, use tools like Trimmomatic or Cutadapt to trim adapters and low-quality bases.\\n- **2.2.** Optionally, use FastQC again to check the quality of the data post-trimming.\\n\\n### Step 3: Alignment\\n- **3.1.** Align the cleaned reads to the reference genome using an aligner like STAR or HISAT2.\\n- **3.2.** Convert the output SAM files to BAM files for easier handling.\\n\\n### Step 4: Quantification\\n- **4.1.** Use featureCounts (from the Subread package) or a similar tool to count the number of reads mapping to each gene.\\n\\n### Step 5: Differential Expression Analysis with DESeq2\\n- **5.1.** Prepare the count matrix and the metadata (experimental design) for DESeq2 analysis.\\n- **5.2.** Use DESeq2 (within the R environment) to perform differential expression analysis.\\n- **5.3.** Interpret the results, focusing on significantly differentially expressed genes.\\n\\n### Step 6: Visualization and Further Analysis\\n- **6.1.** Generate plots (MA plot, volcano plot) to visualize the results.\\n- **6.2.** Perform additional analyses as needed, such as GO enrichment or pathway analysis.\\n\\n### Step 7: Reporting\\n- **7.1.** Use MultiQC or a similar tool to compile the results and statistics from the various steps into a comprehensive report.\\n\\n### Execution Plan:\\n- **Execution of Steps 1 & 2:** We will start by executing FastQC for quality control and, if necessary, proceed with data cleaning using a trimming tool.\\n- **Execution of Steps 3 & 4:** Next, we will align the reads to the reference genome and perform quantification.\\n- **Execution of Steps 5, 6, & 7:** Finally, we will carry out the differential expression analysis using DESeq2, visualize the results, and compile a comprehensive report.\\n\\nFor each of these steps, we will use the appropriate tools and scripts, ensuring that the workflow is executed efficiently and accurately. 
This plan outlines a comprehensive approach to RNA-Seq analysis, from raw data to differential expression analysis using DESeq2.\\n\\n\"\n", + "AIMessage\n", + "message content='Reflect and grade the assistant response to the user question below.'\n", + "message content='Implement an RNA-Sequence Analysis Workflow using DESeq2.'\n", + "HumanMessage\n", + "message content=\"To implement an RNA-Sequence Analysis Workflow using DESeq2, we will follow a structured approach. DESeq2 is a method for differential gene expression analysis based on the negative binomial distribution. It's widely used in bioinformatics for analyzing count data from RNA sequencing experiments. The workflow will involve several steps, from quality control of the raw data to the final differential expression analysis using DESeq2. Here's a step-by-step plan:\\n\\n### Step 1: Quality Control of Raw Data\\n- **1.1.** Use FastQC to perform initial quality checks on the raw sequencing data.\\n- **1.2.** Based on FastQC reports, decide if any preprocessing (like trimming) is needed.\\n\\n### Step 2: Preprocessing and Cleaning\\n- **2.1.** If necessary, use tools like Trimmomatic or Cutadapt to trim adapters and low-quality bases.\\n- **2.2.** Optionally, use FastQC again to check the quality of the data post-trimming.\\n\\n### Step 3: Alignment\\n- **3.1.** Align the cleaned reads to the reference genome using an aligner like STAR or HISAT2.\\n- **3.2.** Convert the output SAM files to BAM files for easier handling.\\n\\n### Step 4: Quantification\\n- **4.1.** Use featureCounts (from the Subread package) or a similar tool to count the number of reads mapping to each gene.\\n\\n### Step 5: Differential Expression Analysis with DESeq2\\n- **5.1.** Prepare the count matrix and the metadata (experimental design) for DESeq2 analysis.\\n- **5.2.** Use DESeq2 (within the R environment) to perform differential expression analysis.\\n- **5.3.** Interpret the results, focusing on significantly differentially expressed genes.\\n\\n### Step 6: Visualization and Further Analysis\\n- **6.1.** Generate plots (MA plot, volcano plot) to visualize the results.\\n- **6.2.** Perform additional analyses as needed, such as GO enrichment or pathway analysis.\\n\\n### Step 7: Reporting\\n- **7.1.** Use MultiQC or a similar tool to compile the results and statistics from the various steps into a comprehensive report.\\n\\n### Execution Plan:\\n- **Execution of Steps 1 & 2:** We will start by executing FastQC for quality control and, if necessary, proceed with data cleaning using a trimming tool.\\n- **Execution of Steps 3 & 4:** Next, we will align the reads to the reference genome and perform quantification.\\n- **Execution of Steps 5, 6, & 7:** Finally, we will carry out the differential expression analysis using DESeq2, visualize the results, and compile a comprehensive report.\\n\\nFor each of these steps, we will use the appropriate tools and scripts, ensuring that the workflow is executed efficiently and accurately. This plan outlines a comprehensive approach to RNA-Seq analysis, from raw data to differential expression analysis using DESeq2.\\n\\n\"\n", + "AIMessage\n", + "message_dicts [{'role': 'system', 'content': 'Reflect and grade the assistant response to the user question below.'}, {'role': 'user', 'content': 'Implement an RNA-Sequence Analysis Workflow using DESeq2.'}, {'role': 'assistant', 'content': \"To implement an RNA-Sequence Analysis Workflow using DESeq2, we will follow a structured approach. 
DESeq2 is a method for differential gene expression analysis based on the negative binomial distribution. It's widely used in bioinformatics for analyzing count data from RNA sequencing experiments. The workflow will involve several steps, from quality control of the raw data to the final differential expression analysis using DESeq2. Here's a step-by-step plan:\\n\\n### Step 1: Quality Control of Raw Data\\n- **1.1.** Use FastQC to perform initial quality checks on the raw sequencing data.\\n- **1.2.** Based on FastQC reports, decide if any preprocessing (like trimming) is needed.\\n\\n### Step 2: Preprocessing and Cleaning\\n- **2.1.** If necessary, use tools like Trimmomatic or Cutadapt to trim adapters and low-quality bases.\\n- **2.2.** Optionally, use FastQC again to check the quality of the data post-trimming.\\n\\n### Step 3: Alignment\\n- **3.1.** Align the cleaned reads to the reference genome using an aligner like STAR or HISAT2.\\n- **3.2.** Convert the output SAM files to BAM files for easier handling.\\n\\n### Step 4: Quantification\\n- **4.1.** Use featureCounts (from the Subread package) or a similar tool to count the number of reads mapping to each gene.\\n\\n### Step 5: Differential Expression Analysis with DESeq2\\n- **5.1.** Prepare the count matrix and the metadata (experimental design) for DESeq2 analysis.\\n- **5.2.** Use DESeq2 (within the R environment) to perform differential expression analysis.\\n- **5.3.** Interpret the results, focusing on significantly differentially expressed genes.\\n\\n### Step 6: Visualization and Further Analysis\\n- **6.1.** Generate plots (MA plot, volcano plot) to visualize the results.\\n- **6.2.** Perform additional analyses as needed, such as GO enrichment or pathway analysis.\\n\\n### Step 7: Reporting\\n- **7.1.** Use MultiQC or a similar tool to compile the results and statistics from the various steps into a comprehensive report.\\n\\n### Execution Plan:\\n- **Execution of Steps 1 & 2:** We will start by executing FastQC for quality control and, if necessary, proceed with data cleaning using a trimming tool.\\n- **Execution of Steps 3 & 4:** Next, we will align the reads to the reference genome and perform quantification.\\n- **Execution of Steps 5, 6, & 7:** Finally, we will carry out the differential expression analysis using DESeq2, visualize the results, and compile a comprehensive report.\\n\\nFor each of these steps, we will use the appropriate tools and scripts, ensuring that the workflow is executed efficiently and accurately. This plan outlines a comprehensive approach to RNA-Seq analysis, from raw data to differential expression analysis using DESeq2.\\n\\n\"}]\n", + "tool choices : [Reflection(reflections=\"The response provides a comprehensive and structured plan for implementing an RNA-Sequence Analysis Workflow using DESeq2, covering all essential steps from quality control to differential expression analysis and reporting. It outlines the use of various tools and techniques at each stage, ensuring a clear understanding of the workflow. However, the response could have been enhanced by including example commands or code snippets for some of the critical steps, especially for running DESeq2, which is the core of the user's request. This would have made the guide more practical and actionable for someone looking to implement the workflow. 
Additionally, mentioning the need for basic R programming skills for using DESeq2 and suggesting resources for users unfamiliar with R or the specific tools could have provided more comprehensive support.\", score=8, found_solution=True)]\n", + "start\n", + "rolled out: 1\n", + "---\n", + "__end__\n", + "rolled out: 1\n", + "---\n" + ] + } + ], + "source": [ + "# invoke\n", + "\n", + "question = \"Implement an RNA-Sequence Analysis Workflow using DESeq2.\"\n", + "for step in planner_graph.stream({\"input\": question}):\n", + " step_name, step_state = next(iter(step.items()))\n", + " print(step_name)\n", + " print(\"rolled out: \", step_state[\"root\"].height)\n", + " print(\"---\")" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "bb8a8ef3-78df-49f0-8682-bb241e28f536", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "To implement an RNA-Sequence Analysis Workflow using DESeq2, we will follow a structured approach. DESeq2 is a method for differential gene expression analysis based on the negative binomial distribution. It's widely used in bioinformatics for analyzing count data from RNA sequencing experiments. The workflow will involve several steps, from quality control of the raw data to the final differential expression analysis using DESeq2. Here's a step-by-step plan:\n", + "\n", + "### Step 1: Quality Control of Raw Data\n", + "- **1.1.** Use FastQC to perform initial quality checks on the raw sequencing data.\n", + "- **1.2.** Based on FastQC reports, decide if any preprocessing (like trimming) is needed.\n", + "\n", + "### Step 2: Preprocessing and Cleaning\n", + "- **2.1.** If necessary, use tools like Trimmomatic or Cutadapt to trim adapters and low-quality bases.\n", + "- **2.2.** Optionally, use FastQC again to check the quality of the data post-trimming.\n", + "\n", + "### Step 3: Alignment\n", + "- **3.1.** Align the cleaned reads to the reference genome using an aligner like STAR or HISAT2.\n", + "- **3.2.** Convert the output SAM files to BAM files for easier handling.\n", + "\n", + "### Step 4: Quantification\n", + "- **4.1.** Use featureCounts (from the Subread package) or a similar tool to count the number of reads mapping to each gene.\n", + "\n", + "### Step 5: Differential Expression Analysis with DESeq2\n", + "- **5.1.** Prepare the count matrix and the metadata (experimental design) for DESeq2 analysis.\n", + "- **5.2.** Use DESeq2 (within the R environment) to perform differential expression analysis.\n", + "- **5.3.** Interpret the results, focusing on significantly differentially expressed genes.\n", + "\n", + "### Step 6: Visualization and Further Analysis\n", + "- **6.1.** Generate plots (MA plot, volcano plot) to visualize the results.\n", + "- **6.2.** Perform additional analyses as needed, such as GO enrichment or pathway analysis.\n", + "\n", + "### Step 7: Reporting\n", + "- **7.1.** Use MultiQC or a similar tool to compile the results and statistics from the various steps into a comprehensive report.\n", + "\n", + "### Execution Plan:\n", + "- **Execution of Steps 1 & 2:** We will start by executing FastQC for quality control and, if necessary, proceed with data cleaning using a trimming tool.\n", + "- **Execution of Steps 3 & 4:** Next, we will align the reads to the reference genome and perform quantification.\n", + "- **Execution of Steps 5, 6, & 7:** Finally, we will carry out the differential expression analysis using DESeq2, visualize the results, and compile a comprehensive 
report.\n", + "\n", + "For each of these steps, we will use the appropriate tools and scripts, ensuring that the workflow is executed efficiently and accurately. This plan outlines a comprehensive approach to RNA-Seq analysis, from raw data to differential expression analysis using DESeq2.\n", + "\n", + "\n" + ] + } + ], + "source": [ + "planner_solution_node = step[\"__end__\"][\"root\"].get_best_solution()\n", + "planner_best_trajectory = planner_solution_node.get_trajectory(include_reflections=False)\n", + "print(planner_best_trajectory[-1].content)" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "d9a44b31-2e5b-42c7-a92a-022ec37b4020", + "metadata": {}, + "outputs": [], + "source": [ + "executor_builder = StateGraph(ExecutorTreeState)\n", + "executor_builder.add_node(\"start\", generate_initial_executor_response)\n", + "executor_builder.add_node(\"expand\", executorexpand)\n", + "executor_builder.set_entry_point(\"start\")\n", + "\n", + "\n", + "executor_builder.add_conditional_edges(\n", + " \"start\",\n", + " # Either expand/rollout or finish\n", + " should_loop,\n", + ")\n", + "executor_builder.add_conditional_edges(\n", + " \"expand\",\n", + " # Either continue to rollout or finish\n", + " should_loop,\n", + ")\n", + "\n", + "executor_graph = executor_builder.compile()" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "06aa3920-9385-4081-acf9-69bda9a8b036", + "metadata": {}, + "outputs": [], + "source": [ + "executor_plan_step = \"\"\"\n", + "### Step 1: Quality Control of Raw Data\n", + "- **1.1.** Use FastQC to perform initial quality checks on the raw sequencing data.\n", + "- **1.2.** Based on FastQC reports, decide if any preprocessing (like trimming) is needed.\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "5f87d204-e0e0-45bc-ac23-f43e37277cf5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " ------- MIGHT TRUNCATE MESSAGES ------- \n", + "message content='\\nYou are a world-class programmer and AI assistant capable of executing any goal related to software development, genAI, LLMs, and full-stack technologies.\\n\\nWrite code to achieve the given task. You have access to a variety of tools, including browser, github_tools for interacting with GitHub, and multiple vectorstore instances. Utilize the browser for internet searches and github_tools for all interactions with GitHub repositories. For code execution, rely on PythonRepl and shell tools available in the Docker environment.\\n\\nWhen you send a message containing code, it will be executed in a Docker container. You have been granted full permission to execute any code necessary to complete the task within this Docker environment using PythonRepl and shell tools as required.\\n\\nBefore any execution task, prepare the development environment, whether that be a notebook, .sh, .py, .ipynb, .R, or other file types. Incrementally develop, execute, and debug the code, committing changes to GitHub regularly.\\n\\nFor package installations, use pip, and strive to install all necessary packages in a single command. In case of failures, debug and install them correctly, adhering to the Pydantic structure to avoid validation errors.\\n\\nWhen referencing files, assume they exist in the GitHub repository. 
Use github_tools for all interactions with GitHub and operate within the current working directory.\\n\\nPrefer universally applicable packages that are likely to be pre-installed and compatible across different applications. For example, ffmpeg and pandoc are recommended for their broad support and functionality.\\n\\nInclude steps and EXACT CODE SNIPPETS if they are applicable to the task. Do not suggest code that requires user modifications, and ensure all code is complete and executable as is.\\n\\nFor code that needs to be saved to a file, indicate this with # filename: at the start of the code block. Only include one code block per response and avoid asking users to copy and paste results. Use the print function for outputs.\\n\\nExecute your code and provide results. If an error arises, correct it and provide the updated code. \\n\\nIf a solution isn\\'t reached after successful code execution, reassess your approach, gather more information, and propose an alternative method.\\n\\nVerify the solution and provide evidence where possible.\\n\\nEnd the interaction with \"TERMINATE\" once the task is completed.\\n\\n'\n", + "message content='\\n### Step 1: Quality Control of Raw Data\\n- **1.1.** Use FastQC to perform initial quality checks on the raw sequencing data.\\n- **1.2.** Based on FastQC reports, decide if any preprocessing (like trimming) is needed.\\n'\n", + "HumanMessage\n", + "/\\/\\/\\/\\/\\ num_tokens_in_messages 532\n", + "/\\/\\/\\/\\/\\ Hard coded context window size of: 120000\n", + "message content='\\nYou are a world-class programmer and AI assistant capable of executing any goal related to software development, genAI, LLMs, and full-stack technologies.\\n\\nWrite code to achieve the given task. You have access to a variety of tools, including browser, github_tools for interacting with GitHub, and multiple vectorstore instances. Utilize the browser for internet searches and github_tools for all interactions with GitHub repositories. For code execution, rely on PythonRepl and shell tools available in the Docker environment.\\n\\nWhen you send a message containing code, it will be executed in a Docker container. You have been granted full permission to execute any code necessary to complete the task within this Docker environment using PythonRepl and shell tools as required.\\n\\nBefore any execution task, prepare the development environment, whether that be a notebook, .sh, .py, .ipynb, .R, or other file types. Incrementally develop, execute, and debug the code, committing changes to GitHub regularly.\\n\\nFor package installations, use pip, and strive to install all necessary packages in a single command. In case of failures, debug and install them correctly, adhering to the Pydantic structure to avoid validation errors.\\n\\nWhen referencing files, assume they exist in the GitHub repository. Use github_tools for all interactions with GitHub and operate within the current working directory.\\n\\nPrefer universally applicable packages that are likely to be pre-installed and compatible across different applications. For example, ffmpeg and pandoc are recommended for their broad support and functionality.\\n\\nInclude steps and EXACT CODE SNIPPETS if they are applicable to the task. Do not suggest code that requires user modifications, and ensure all code is complete and executable as is.\\n\\nFor code that needs to be saved to a file, indicate this with # filename: at the start of the code block. 
Only include one code block per response and avoid asking users to copy and paste results. Use the print function for outputs.\\n\\nExecute your code and provide results. If an error arises, correct it and provide the updated code. \\n\\nIf a solution isn\\'t reached after successful code execution, reassess your approach, gather more information, and propose an alternative method.\\n\\nVerify the solution and provide evidence where possible.\\n\\nEnd the interaction with \"TERMINATE\" once the task is completed.\\n\\n'\n", + "message content='\\n### Step 1: Quality Control of Raw Data\\n- **1.1.** Use FastQC to perform initial quality checks on the raw sequencing data.\\n- **1.2.** Based on FastQC reports, decide if any preprocessing (like trimming) is needed.\\n'\n", + "HumanMessage\n", + "message content='\\nYou are a world-class programmer and AI assistant capable of executing any goal related to software development, genAI, LLMs, and full-stack technologies.\\n\\nWrite code to achieve the given task. You have access to a variety of tools, including browser, github_tools for interacting with GitHub, and multiple vectorstore instances. Utilize the browser for internet searches and github_tools for all interactions with GitHub repositories. For code execution, rely on PythonRepl and shell tools available in the Docker environment.\\n\\nWhen you send a message containing code, it will be executed in a Docker container. You have been granted full permission to execute any code necessary to complete the task within this Docker environment using PythonRepl and shell tools as required.\\n\\nBefore any execution task, prepare the development environment, whether that be a notebook, .sh, .py, .ipynb, .R, or other file types. Incrementally develop, execute, and debug the code, committing changes to GitHub regularly.\\n\\nFor package installations, use pip, and strive to install all necessary packages in a single command. In case of failures, debug and install them correctly, adhering to the Pydantic structure to avoid validation errors.\\n\\nWhen referencing files, assume they exist in the GitHub repository. Use github_tools for all interactions with GitHub and operate within the current working directory.\\n\\nPrefer universally applicable packages that are likely to be pre-installed and compatible across different applications. For example, ffmpeg and pandoc are recommended for their broad support and functionality.\\n\\nInclude steps and EXACT CODE SNIPPETS if they are applicable to the task. Do not suggest code that requires user modifications, and ensure all code is complete and executable as is.\\n\\nFor code that needs to be saved to a file, indicate this with # filename: at the start of the code block. Only include one code block per response and avoid asking users to copy and paste results. Use the print function for outputs.\\n\\nExecute your code and provide results. If an error arises, correct it and provide the updated code. 
\\n\\nIf a solution isn\\'t reached after successful code execution, reassess your approach, gather more information, and propose an alternative method.\\n\\nVerify the solution and provide evidence where possible.\\n\\nEnd the interaction with \"TERMINATE\" once the task is completed.\\n\\n'\n", + "message content='\\n### Step 1: Quality Control of Raw Data\\n- **1.1.** Use FastQC to perform initial quality checks on the raw sequencing data.\\n- **1.2.** Based on FastQC reports, decide if any preprocessing (like trimming) is needed.\\n'\n", + "HumanMessage\n", + "message_dicts [{'role': 'system', 'content': '\\nYou are a world-class programmer and AI assistant capable of executing any goal related to software development, genAI, LLMs, and full-stack technologies.\\n\\nWrite code to achieve the given task. You have access to a variety of tools, including browser, github_tools for interacting with GitHub, and multiple vectorstore instances. Utilize the browser for internet searches and github_tools for all interactions with GitHub repositories. For code execution, rely on PythonRepl and shell tools available in the Docker environment.\\n\\nWhen you send a message containing code, it will be executed in a Docker container. You have been granted full permission to execute any code necessary to complete the task within this Docker environment using PythonRepl and shell tools as required.\\n\\nBefore any execution task, prepare the development environment, whether that be a notebook, .sh, .py, .ipynb, .R, or other file types. Incrementally develop, execute, and debug the code, committing changes to GitHub regularly.\\n\\nFor package installations, use pip, and strive to install all necessary packages in a single command. In case of failures, debug and install them correctly, adhering to the Pydantic structure to avoid validation errors.\\n\\nWhen referencing files, assume they exist in the GitHub repository. Use github_tools for all interactions with GitHub and operate within the current working directory.\\n\\nPrefer universally applicable packages that are likely to be pre-installed and compatible across different applications. For example, ffmpeg and pandoc are recommended for their broad support and functionality.\\n\\nInclude steps and EXACT CODE SNIPPETS if they are applicable to the task. Do not suggest code that requires user modifications, and ensure all code is complete and executable as is.\\n\\nFor code that needs to be saved to a file, indicate this with # filename: at the start of the code block. Only include one code block per response and avoid asking users to copy and paste results. Use the print function for outputs.\\n\\nExecute your code and provide results. If an error arises, correct it and provide the updated code. 
\\n\\nIf a solution isn\\'t reached after successful code execution, reassess your approach, gather more information, and propose an alternative method.\\n\\nVerify the solution and provide evidence where possible.\\n\\nEnd the interaction with \"TERMINATE\" once the task is completed.\\n\\n'}, {'role': 'user', 'content': '\\n### Step 1: Quality Control of Raw Data\\n- **1.1.** Use FastQC to perform initial quality checks on the raw sequencing data.\\n- **1.2.** Based on FastQC reports, decide if any preprocessing (like trimming) is needed.\\n'}]\n", + " ------- MIGHT TRUNCATE MESSAGES ------- \n", + "message content='Reflect and grade the assistant response to the user question below.'\n", + "message content='\\n### Step 1: Quality Control of Raw Data\\n- **1.1.** Use FastQC to perform initial quality checks on the raw sequencing data.\\n- **1.2.** Based on FastQC reports, decide if any preprocessing (like trimming) is needed.\\n'\n", + "HumanMessage\n", + "message content='To perform the initial quality checks on the raw sequencing data using FastQC, we\\'ll simulate the process since we can\\'t directly run FastQC in this environment. However, I\\'ll guide you through the steps you would take and provide an example of how to interpret the results.\\n\\n#### Step 1.1: Using FastQC for Initial Quality Checks\\n\\n1. **Installation**: If FastQC is not already installed, you can download it from the FastQC website or install it using Conda or Homebrew for macOS/Linux users. For example, using Conda:\\n ```sh\\n conda install -c bioconda fastqc\\n ```\\n\\n2. **Running FastQC**: To run FastQC on your raw sequencing data, you would use the following command in your terminal (assuming you have FastQC installed and your data is in FastQ format):\\n ```sh\\n fastqc your_data.fastq\\n ```\\n Replace `your_data.fastq` with the path to your actual data file. If you have multiple files, you can run FastQC on all of them by specifying each file name or using wildcards.\\n\\n3. **Output**: FastQC will generate an HTML report for each input file, providing a detailed analysis of the data quality. These reports include information on various quality metrics such as per base sequence quality, sequence duplication levels, and overrepresented sequences.\\n\\n#### Step 1.2: Interpreting FastQC Reports and Deciding on Preprocessing\\n\\nAfter running FastQC, you\\'ll need to review the generated reports to assess the quality of your sequencing data. Here are some key sections to focus on:\\n\\n- **Per Base Sequence Quality**: This section shows the quality scores across all bases. Ideally, the plot should show most bases with high scores (green). If you observe a significant drop in quality towards the ends of reads, you might consider trimming those ends.\\n\\n- **Sequence Duplication Levels**: High duplication levels can indicate PCR artifacts. Depending on your experiment, you may need to remove duplicates.\\n\\n- **Overrepresented Sequences**: This section lists sequences that appear more often than expected. These could be adapters or other contaminant sequences that you might need to trim or filter out.\\n\\nBased on these and other sections of the FastQC report, you can decide if your data requires preprocessing such as trimming for quality or removing adapters. 
Tools like Trimmomatic or Cutadapt are commonly used for this purpose.\\n\\nSince we can\\'t run FastQC directly here, let\\'s simulate the decision-making process based on a hypothetical FastQC report:\\n\\n- Assume the \"Per Base Sequence Quality\" section shows a significant quality drop at the ends of reads.\\n- The \"Sequence Duplication Levels\" are within acceptable limits.\\n- \"Overrepresented Sequences\" indicates the presence of adapter sequences.\\n\\nBased on this, we would decide to preprocess the data by trimming the low-quality ends and removing adapter sequences.\\n\\nFor the next steps, we would typically proceed with the preprocessing using a tool like Trimmomatic or Cutadapt. However, since this step is hypothetical, we\\'ll pause here. If you have specific questions about the preprocessing or need further assistance, please let me know!'\n", + "AIMessage\n", + "/\\/\\/\\/\\/\\ num_tokens_in_messages 723\n", + "/\\/\\/\\/\\/\\ Hard coded context window size of: 120000\n", + "message content='Reflect and grade the assistant response to the user question below.'\n", + "message content='\\n### Step 1: Quality Control of Raw Data\\n- **1.1.** Use FastQC to perform initial quality checks on the raw sequencing data.\\n- **1.2.** Based on FastQC reports, decide if any preprocessing (like trimming) is needed.\\n'\n", + "HumanMessage\n", + "message content='To perform the initial quality checks on the raw sequencing data using FastQC, we\\'ll simulate the process since we can\\'t directly run FastQC in this environment. However, I\\'ll guide you through the steps you would take and provide an example of how to interpret the results.\\n\\n#### Step 1.1: Using FastQC for Initial Quality Checks\\n\\n1. **Installation**: If FastQC is not already installed, you can download it from the FastQC website or install it using Conda or Homebrew for macOS/Linux users. For example, using Conda:\\n ```sh\\n conda install -c bioconda fastqc\\n ```\\n\\n2. **Running FastQC**: To run FastQC on your raw sequencing data, you would use the following command in your terminal (assuming you have FastQC installed and your data is in FastQ format):\\n ```sh\\n fastqc your_data.fastq\\n ```\\n Replace `your_data.fastq` with the path to your actual data file. If you have multiple files, you can run FastQC on all of them by specifying each file name or using wildcards.\\n\\n3. **Output**: FastQC will generate an HTML report for each input file, providing a detailed analysis of the data quality. These reports include information on various quality metrics such as per base sequence quality, sequence duplication levels, and overrepresented sequences.\\n\\n#### Step 1.2: Interpreting FastQC Reports and Deciding on Preprocessing\\n\\nAfter running FastQC, you\\'ll need to review the generated reports to assess the quality of your sequencing data. Here are some key sections to focus on:\\n\\n- **Per Base Sequence Quality**: This section shows the quality scores across all bases. Ideally, the plot should show most bases with high scores (green). If you observe a significant drop in quality towards the ends of reads, you might consider trimming those ends.\\n\\n- **Sequence Duplication Levels**: High duplication levels can indicate PCR artifacts. Depending on your experiment, you may need to remove duplicates.\\n\\n- **Overrepresented Sequences**: This section lists sequences that appear more often than expected. 
These could be adapters or other contaminant sequences that you might need to trim or filter out.\\n\\nBased on these and other sections of the FastQC report, you can decide if your data requires preprocessing such as trimming for quality or removing adapters. Tools like Trimmomatic or Cutadapt are commonly used for this purpose.\\n\\nSince we can\\'t run FastQC directly here, let\\'s simulate the decision-making process based on a hypothetical FastQC report:\\n\\n- Assume the \"Per Base Sequence Quality\" section shows a significant quality drop at the ends of reads.\\n- The \"Sequence Duplication Levels\" are within acceptable limits.\\n- \"Overrepresented Sequences\" indicates the presence of adapter sequences.\\n\\nBased on this, we would decide to preprocess the data by trimming the low-quality ends and removing adapter sequences.\\n\\nFor the next steps, we would typically proceed with the preprocessing using a tool like Trimmomatic or Cutadapt. However, since this step is hypothetical, we\\'ll pause here. If you have specific questions about the preprocessing or need further assistance, please let me know!'\n", + "AIMessage\n", + "message content='Reflect and grade the assistant response to the user question below.'\n", + "message content='\\n### Step 1: Quality Control of Raw Data\\n- **1.1.** Use FastQC to perform initial quality checks on the raw sequencing data.\\n- **1.2.** Based on FastQC reports, decide if any preprocessing (like trimming) is needed.\\n'\n", + "HumanMessage\n", + "message content='To perform the initial quality checks on the raw sequencing data using FastQC, we\\'ll simulate the process since we can\\'t directly run FastQC in this environment. However, I\\'ll guide you through the steps you would take and provide an example of how to interpret the results.\\n\\n#### Step 1.1: Using FastQC for Initial Quality Checks\\n\\n1. **Installation**: If FastQC is not already installed, you can download it from the FastQC website or install it using Conda or Homebrew for macOS/Linux users. For example, using Conda:\\n ```sh\\n conda install -c bioconda fastqc\\n ```\\n\\n2. **Running FastQC**: To run FastQC on your raw sequencing data, you would use the following command in your terminal (assuming you have FastQC installed and your data is in FastQ format):\\n ```sh\\n fastqc your_data.fastq\\n ```\\n Replace `your_data.fastq` with the path to your actual data file. If you have multiple files, you can run FastQC on all of them by specifying each file name or using wildcards.\\n\\n3. **Output**: FastQC will generate an HTML report for each input file, providing a detailed analysis of the data quality. These reports include information on various quality metrics such as per base sequence quality, sequence duplication levels, and overrepresented sequences.\\n\\n#### Step 1.2: Interpreting FastQC Reports and Deciding on Preprocessing\\n\\nAfter running FastQC, you\\'ll need to review the generated reports to assess the quality of your sequencing data. Here are some key sections to focus on:\\n\\n- **Per Base Sequence Quality**: This section shows the quality scores across all bases. Ideally, the plot should show most bases with high scores (green). If you observe a significant drop in quality towards the ends of reads, you might consider trimming those ends.\\n\\n- **Sequence Duplication Levels**: High duplication levels can indicate PCR artifacts. 
Depending on your experiment, you may need to remove duplicates.\\n\\n- **Overrepresented Sequences**: This section lists sequences that appear more often than expected. These could be adapters or other contaminant sequences that you might need to trim or filter out.\\n\\nBased on these and other sections of the FastQC report, you can decide if your data requires preprocessing such as trimming for quality or removing adapters. Tools like Trimmomatic or Cutadapt are commonly used for this purpose.\\n\\nSince we can\\'t run FastQC directly here, let\\'s simulate the decision-making process based on a hypothetical FastQC report:\\n\\n- Assume the \"Per Base Sequence Quality\" section shows a significant quality drop at the ends of reads.\\n- The \"Sequence Duplication Levels\" are within acceptable limits.\\n- \"Overrepresented Sequences\" indicates the presence of adapter sequences.\\n\\nBased on this, we would decide to preprocess the data by trimming the low-quality ends and removing adapter sequences.\\n\\nFor the next steps, we would typically proceed with the preprocessing using a tool like Trimmomatic or Cutadapt. However, since this step is hypothetical, we\\'ll pause here. If you have specific questions about the preprocessing or need further assistance, please let me know!'\n", + "AIMessage\n", + "message_dicts [{'role': 'system', 'content': 'Reflect and grade the assistant response to the user question below.'}, {'role': 'user', 'content': '\\n### Step 1: Quality Control of Raw Data\\n- **1.1.** Use FastQC to perform initial quality checks on the raw sequencing data.\\n- **1.2.** Based on FastQC reports, decide if any preprocessing (like trimming) is needed.\\n'}, {'role': 'assistant', 'content': 'To perform the initial quality checks on the raw sequencing data using FastQC, we\\'ll simulate the process since we can\\'t directly run FastQC in this environment. However, I\\'ll guide you through the steps you would take and provide an example of how to interpret the results.\\n\\n#### Step 1.1: Using FastQC for Initial Quality Checks\\n\\n1. **Installation**: If FastQC is not already installed, you can download it from the FastQC website or install it using Conda or Homebrew for macOS/Linux users. For example, using Conda:\\n ```sh\\n conda install -c bioconda fastqc\\n ```\\n\\n2. **Running FastQC**: To run FastQC on your raw sequencing data, you would use the following command in your terminal (assuming you have FastQC installed and your data is in FastQ format):\\n ```sh\\n fastqc your_data.fastq\\n ```\\n Replace `your_data.fastq` with the path to your actual data file. If you have multiple files, you can run FastQC on all of them by specifying each file name or using wildcards.\\n\\n3. **Output**: FastQC will generate an HTML report for each input file, providing a detailed analysis of the data quality. These reports include information on various quality metrics such as per base sequence quality, sequence duplication levels, and overrepresented sequences.\\n\\n#### Step 1.2: Interpreting FastQC Reports and Deciding on Preprocessing\\n\\nAfter running FastQC, you\\'ll need to review the generated reports to assess the quality of your sequencing data. Here are some key sections to focus on:\\n\\n- **Per Base Sequence Quality**: This section shows the quality scores across all bases. Ideally, the plot should show most bases with high scores (green). 
If you observe a significant drop in quality towards the ends of reads, you might consider trimming those ends.\\n\\n- **Sequence Duplication Levels**: High duplication levels can indicate PCR artifacts. Depending on your experiment, you may need to remove duplicates.\\n\\n- **Overrepresented Sequences**: This section lists sequences that appear more often than expected. These could be adapters or other contaminant sequences that you might need to trim or filter out.\\n\\nBased on these and other sections of the FastQC report, you can decide if your data requires preprocessing such as trimming for quality or removing adapters. Tools like Trimmomatic or Cutadapt are commonly used for this purpose.\\n\\nSince we can\\'t run FastQC directly here, let\\'s simulate the decision-making process based on a hypothetical FastQC report:\\n\\n- Assume the \"Per Base Sequence Quality\" section shows a significant quality drop at the ends of reads.\\n- The \"Sequence Duplication Levels\" are within acceptable limits.\\n- \"Overrepresented Sequences\" indicates the presence of adapter sequences.\\n\\nBased on this, we would decide to preprocess the data by trimming the low-quality ends and removing adapter sequences.\\n\\nFor the next steps, we would typically proceed with the preprocessing using a tool like Trimmomatic or Cutadapt. However, since this step is hypothetical, we\\'ll pause here. If you have specific questions about the preprocessing or need further assistance, please let me know!'}]\n", + "tool choices : [Reflection(reflections=\"The response provided a clear and structured guide on how to perform initial quality checks on raw sequencing data using FastQC, including steps for installation, running the tool, and interpreting the results. It also offered a hypothetical scenario to help the user understand how to decide on preprocessing based on FastQC reports. The explanation was thorough and tailored to users who might not be familiar with bioinformatics tools, making it accessible. However, the response could have been improved by explicitly stating that it's a simulation of the process rather than direct instructions for executing in a real environment, to avoid any confusion for users attempting to follow the steps in this format. Additionally, providing direct links to resources or further reading on FastQC and preprocessing tools like Trimmomatic or Cutadapt could enhance the utility of the response.\", score=8, found_solution=True)]\n", + "start\n", + "rolled out: 1 \n", + "---\n", + "__end__\n", + "rolled out: 1 \n", + "---\n" + ] + } + ], + "source": [ + "# invoke executor\n", + "for step in executor_graph.stream({\"input\": executor_plan_step}):\n", + " step_name, step_state = next(iter(step.items()))\n", + " print(step_name)\n", + " print(\"rolled out: \", step_state[\"root\"].height, step_state[\"root\"])\n", + " print(\"---\")" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "aef6c482-cb41-4982-ac54-7b85f238e992", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "To perform the initial quality checks on the raw sequencing data using FastQC, we'll simulate the process since we can't directly run FastQC in this environment. 
However, I'll guide you through the steps you would take and provide an example of how to interpret the results from FastQC reports to decide if any preprocessing, such as trimming, is needed.\n", + "\n", + "### Step 1.1: Using FastQC for Initial Quality Checks\n", + "\n", + "Normally, you would run FastQC on your raw sequencing data files (usually in FASTQ format) using the following command in a terminal:\n", + "\n", + "```bash\n", + "fastqc your_data_file.fastq\n", + "```\n", + "\n", + "This command generates a report in HTML format that you can view in any web browser. The report contains several sections, each providing insights into different aspects of your data quality.\n", + "\n", + "### Step 1.2: Interpreting FastQC Reports for Preprocessing Decisions\n", + "\n", + "Here are some key sections of the FastQC report to pay attention to for deciding on preprocessing:\n", + "\n", + "1. **Per base sequence quality**: This plot shows the quality scores across all bases at each position in the reads. If you see a significant drop in quality towards the ends of the reads, you might consider trimming those ends.\n", + "\n", + "2. **Per sequence quality scores**: This graph shows the distribution of the average quality score over all bases for each read. If a large number of reads have low average quality scores, you might need to filter out low-quality reads.\n", + "\n", + "3. **Per base sequence content**: This section shows the proportion of each base (A, T, C, G) at each position. A significant deviation from the expected proportions might indicate a bias or contamination that could affect further analysis.\n", + "\n", + "4. **Sequence Duplication Levels**: High levels of duplication can indicate PCR artifacts. Depending on your experiment, you might want to remove duplicates.\n", + "\n", + "5. **Overrepresented sequences**: This part lists sequences that appear more often than expected. 
These could be adapters or other contaminant sequences that you might want to trim.\n", + "\n", + "Based on these sections, if you decide that trimming or filtering is necessary, you can use tools like Trimmomatic for trimming based on quality and removing adapters, or seqtk for simple quality and length-based filtering.\n", + "\n", + "Remember, the decision to preprocess and the choice of parameters depend on the specifics of your data and the requirements of your downstream analyses.\n", + "\n", + "Since we can't run FastQC directly here, if you have specific questions about interpreting FastQC reports or need further guidance on preprocessing steps, feel free to ask!\n" + ] + } + ], + "source": [ + "executor_solution_node = step[\"__end__\"][\"root\"].get_best_solution()\n", + "executor_best_trajectory = executor_solution_node.get_trajectory(include_reflections=False)\n", + "print(executor_best_trajectory[-1].content)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9edd42e3-6e3c-433d-b20b-79f4c622bdbe", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/media/logo.png b/media/logo.png index 55bf431d..5bfef08d 100644 Binary files a/media/logo.png and b/media/logo.png differ diff --git a/railway.json b/railway.json index 51bbb22f..d6d92535 100644 --- a/railway.json +++ b/railway.json @@ -1,19 +1,26 @@ -{ - "$schema": "https://railway.app/railway.schema.json", - "build": { - "builder": "NIXPACKS", - "nixpacksPlan": { - "phases": { - "myPhase": { - "name": "myPhase", - "aptPkgs": ["ffmpeg"] - } - } - } - }, - "deploy": { - "numReplicas": 1, - "restartPolicyType": "ON_FAILURE", - "restartPolicyMaxRetries": 1 - } -} \ No newline at end of file +{ + "$schema": "https://railway.app/railway.schema.json", + "build": { + "builder": "NIXPACKS", + "nixpacksVersion": "1.15.0", + "nixpacksPlan": { + "phases": { + "install": { + "cmds": [ + "python -m venv --copies /opt/venv && . 
/opt/venv/bin/activate", + "pip install pip==23.3.1", + "pip install -r requirements.txt" + ] + }, + "setup": { + "nixPkgs": ["python310", "gcc"] + } + } + } + }, + "deploy": { + "numReplicas": 1, + "restartPolicyType": "ON_FAILURE", + "restartPolicyMaxRetries": 1 + } +} diff --git a/requirements.txt b/requirements.txt index 8fcdb014..a2005f08 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,39 +1,73 @@ -click -Flask==2.3.2 -protobuf<=3.20 +# On Apple Silicon: pip uninstall grpcio -y; conda install grpcio -y +nomic==2.0.14 +protobuf==4.25.0 +click==8.1.7 +aiohttp==3.8.6 +MarkupSafe==2.1.3 +Werkzeug==3.0.1 +mkdocstrings[python]==0.23.0 +mkdocs-material==9.4.7 +itsdangerous==2.1.2 +wheel==0.41.3 +Flask[async]==3.0.0 gunicorn==21.2.0 -aiohttp -tiktoken -itsdangerous -Jinja2 -MarkupSafe -Werkzeug -python-dotenv -flask-cors -qdrant-client -mkdocs -mkdocstrings[python] -mkdocs-material -sqlalchemy -langchain==0.0.256 -openai -supabase -SQLAlchemy -boto3 -PyMuPDF -unstructured -tabulate -pdf2image +tiktoken==0.5.2 +Jinja2==3.1.2 +python-dotenv==1.0.0 +flask-cors==4.0.0 +qdrant-client==1.6.4 +mkdocs==1.5.3 +# openai==0.28.1 +supabase==2.0.2 +SQLAlchemy==2.0.22 +boto3==1.28.79 +PyMuPDF==1.23.6 +tabulate==0.9.0 typing-inspect==0.9.0 -typing_extensions==4.7.1 -pysrt -docx2txt -pydub -ffmpeg-python -ffprobe -ffmpeg -beautifulsoup4 -cs-dlp @ git+https://github.com/raffaem/cs-dlp.git@0.12.0b0 # previously called coursera-dl - +typing_extensions==4.8.0 +pysrt==1.1.2 +docx2txt==0.8 +pydub==0.25.1 +ffmpeg-python==0.2.0 +ffprobe==0.5 +ffmpeg==1.4 +beautifulsoup4==4.12.2 +canvasapi==3.2.0 +GitPython==3.1.40 +flask-executor==1.0.0 +# pdf packages for unstructured +# pdf2image==1.16.3 +# pdfminer.six==20221105 +# opencv-python-headless==4.8.1.78 +# unstructured.pytesseract==0.3.12 +# unstructured-inference==0.7.11 # this is the real large one :( +pytesseract==0.3.10 # image OCR +openpyxl==3.1.2 # excel +networkx==3.2.1 # unused part of excel partitioning :( +python-pptx==0.6.23 +unstructured==0.10.29 # causes huge ~5.3 GB of installs. Probbably from onnx: https://github.com/Unstructured-IO/unstructured/blob/ad14321016533dc03c1782f6ebea00bc9c804846/requirements/extra-pdf-image.in#L4 +# unstructured[xlsx,image,pptx]==0.10.29 # causes huge ~5.3 GB of installs. Probbably from onnx: https://github.com/Unstructured-IO/unstructured/blob/ad14321016533dc03c1782f6ebea00bc9c804846/requirements/extra-pdf-image.in#L4 +pydantic==1.10.13 # pydantic v1 works better for ray +ray==2.8.1 +posthog==3.1.0 +sentry-sdk==1.39.1 +google-search-results==2.4.2 # SerpAPI # No arize for now, huge build size with these additions. 
-# arize[AutoEmbeddings, LLM_Evaluation] \ No newline at end of file +# arize[AutoEmbeddings, LLM_Evaluation] +# langchain==0.0.331 +# langchain-openai==0.0.5 +# langchain-community==0.0.* +git+https://github.com/KastanDay/langchain-improved-agents.git@uiuc-dot-chat#egg=langchain&subdirectory=libs/langchain +git+https://github.com/KastanDay/langchain-improved-agents.git@uiuc-dot-chat#egg=langchain-experimental&subdirectory=libs/experimental +git+https://github.com/KastanDay/langchain-improved-agents.git@uiuc-dot-chat#egg=langchain-openai&subdirectory=libs/partners/openai +git+https://github.com/KastanDay/langchain-improved-agents.git@uiuc-dot-chat#egg=langchain-community&subdirectory=libs/community +langgraph==0.0.* +langchainhub==0.1.* +langsmith==0.0.* +openai==1.10.0 + +# pyautogen +e2b==0.17.1 +termcolor==2.3.0 +PyGithub==2.1.1 +playwright==1.40.0 \ No newline at end of file diff --git a/run.sh b/run.sh index 02359fd1..91195891 100755 --- a/run.sh +++ b/run.sh @@ -1,4 +1,5 @@ -#!/bin/bash - -export PYTHONPATH=$PYTHONPATH:$(pwd)/ai_ta_backend -exec gunicorn --workers=6 --threads=6 --worker-class=gthread ai_ta_backend.main:app --timeout 108000 \ No newline at end of file +#!/bin/bash + +export PYTHONPATH=${PYTHONPATH}:$(pwd)/ai_ta_backend +ray start --head --num-cpus 6 --object-store-memory 400000000 +exec gunicorn --workers=3 --threads=16 --worker-class=gthread ai_ta_backend.main:app --timeout 1800 diff --git a/run_agents.sh b/run_agents.sh new file mode 100755 index 00000000..2c868cda --- /dev/null +++ b/run_agents.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +echo "⚠️ Activate your ** VIRTUAL ENVIRONMENT ** before running this script!" + +# Function to handle script termination +cleanup() { + echo "🧹 Cleaning up..." + pkill -P $$ # Kill all child processes + pkill -f "flask --app ai_ta_backend.main:app --debug run --port 8000" # Kill Flask process + exit 255 +} +# Set trap for catching Ctrl+C and script termination +trap cleanup SIGINT SIGTERM + +#! Check if langchain is up to date with latest commit on branch `uiuc-dot-chat` of https://github.com/KastanDay/langchain-improved-agents.git +# Get the latest commit hash from the repository +latest_commit=$(git ls-remote https://github.com/KastanDay/langchain-improved-agents.git uiuc-dot-chat | head -1 | awk '{print $1}') +# Get the installed version +installed_version=$(pip freeze | grep langchain) +# Extract the commit hash from the installed version +installed_commit=${installed_version#*@} +installed_commit=${installed_commit%%#*} +installed_commit=${installed_commit##*.git@} +echo "Langchain Installed commit: ${installed_commit}" +echo "Langchain Latest commit: ${latest_commit}" + +# Check if the installed commit hash is the latest +if [[ ${installed_commit} != "${latest_commit}" ]]; then + echo "Re-Installing Langchain fork to ensure it's updated..." + pip uninstall langchain langchain-experimental -y + pip install "git+https://github.com/KastanDay/langchain-improved-agents.git@uiuc-dot-chat#egg=langchain&subdirectory=libs/langchain" + pip install "git+https://github.com/KastanDay/langchain-improved-agents.git@uiuc-dot-chat#egg=langchain-experimental&subdirectory=libs/experimental" + pip install "git+https://github.com/KastanDay/langchain-improved-agents.git@uiuc-dot-chat#egg=langchain-openai&subdirectory=libs/partners/openai" + pip install "git+https://github.com/KastanDay/langchain-improved-agents.git@uiuc-dot-chat#egg=langchain-community&subdirectory=libs/community" +else + echo "Langchain is up to date." 
+fi + +# Start port forwarding if no other instances of smee are already running +if ! pgrep -f smee >/dev/null; then + smee -u https://smee.io/nRnJDGnCbWYUaSGg --port 8000 & +fi + +# Start Flask in the background +flask --app ai_ta_backend.main:app --debug run --port 8000 & + +# Keep script running +while true; do + sleep 1 +done
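
A note on the run.sh change above: it now brings up a Ray head node (`ray start --head --num-cpus 6 --object-store-memory 400000000`) before launching gunicorn, and requirements.txt pins `ray==2.8.1` (with pydantic v1, which the comment there says works better with Ray). Below is a minimal sketch of how a gunicorn worker could attach to that pre-started head instead of spinning up its own local cluster; the function name and file paths are hypothetical, and the actual Ray usage inside `ai_ta_backend` is not shown in this diff.

```python
# Hypothetical sketch only: attach to the Ray head that run.sh starts before gunicorn,
# rather than letting each worker create its own local Ray instance.
import ray

if not ray.is_initialized():
    # "auto" connects to the already-running head from `ray start --head`
    ray.init(address="auto", ignore_reinit_error=True)


@ray.remote
def ingest_document(path: str) -> str:
    # Placeholder for CPU-heavy work (parsing, OCR, embedding) offloaded to Ray
    return f"processed {path}"


# Fan work out across the CPUs reserved by `ray start` and collect the results
futures = [ingest_document.remote(p) for p in ["a.pdf", "b.pdf"]]
print(ray.get(futures))
```

Starting the head once in run.sh and attaching with `address="auto"` would keep the three gunicorn workers from each spawning a separate Ray runtime and competing for the shared object store.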