Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Auto scrape grants.gov, very cool way to find grants for yourself. #324

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
39 changes: 33 additions & 6 deletions ai_ta_backend/database/sql.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import os

from injector import inject

import supabase
from injector import inject


class SQLDatabase:
Expand All @@ -14,10 +13,37 @@ def __init__(self):
supabase_url=os.environ['SUPABASE_URL'], supabase_key=os.environ['SUPABASE_API_KEY'])

def getAllMaterialsForCourse(self, course_name: str):
"""
WARNING: This will hit 1,500 row limit!
Without pagination, you'll be limited by Supabase's default maximum rows limit (1000-1500 depending on the configuration).
"""
return self.supabase_client.table(
os.environ['SUPABASE_DOCUMENTS_TABLE']).select('course_name, s3_path, readable_filename, url, base_url').eq(
'course_name', course_name).execute()

def getAllMaterialsForCourse_fullUsingPagination(self, course_name: str):
"""
This uses pagination to retrieve all documents for a course.
"""
all_materials = []
page = 1
page_size = 200

while True:
response = self.supabase_client.table(
os.environ['SUPABASE_DOCUMENTS_TABLE']).select('course_name, s3_path, readable_filename, url, base_url').eq(
'course_name', course_name).range((page - 1) * page_size, page * page_size - 1).execute()

materials = response.data
all_materials.extend(materials)

if len(materials) < page_size:
break

page += 1

return all_materials

def getMaterialsForCourseAndS3Path(self, course_name: str, s3_path: str):
return self.supabase_client.from_(os.environ['SUPABASE_DOCUMENTS_TABLE']).select("id, s3_path, contexts").eq(
's3_path', s3_path).eq('course_name', course_name).execute()
Expand Down Expand Up @@ -153,9 +179,10 @@ def getAllConversationsForUserAndProject(self, user_email: str, project_name: st

def insertProject(self, project_info):
return self.supabase_client.table("projects").insert(project_info).execute()

def getPreAssignedAPIKeys(self, email: str):
return self.supabase_client.table("pre_authorized_api_keys").select("*").contains("emails", '["' + email + '"]').execute()

return self.supabase_client.table("pre_authorized_api_keys").select("*").contains("emails",
'["' + email + '"]').execute()

def getConversationsCreatedAtByCourse(self, course_name: str):
return self.supabase_client.table("llm-convo-monitor").select("created_at").eq("course_name", course_name).execute()
return self.supabase_client.table("llm-convo-monitor").select("created_at").eq("course_name", course_name).execute()
120 changes: 69 additions & 51 deletions ai_ta_backend/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from flask_injector import FlaskInjector, RequestScope
from injector import Binder, SingletonScope

from ai_ta_backend.beam.nomic_logging import create_document_map
from ai_ta_backend.database.aws import AWSStorage
from ai_ta_backend.database.sql import SQLDatabase
from ai_ta_backend.database.vector import VectorDatabase
Expand All @@ -36,7 +35,8 @@
ThreadPoolExecutorInterface,
)
from ai_ta_backend.service.export_service import ExportService
from ai_ta_backend.service.nomic_service import NomicService

# from ai_ta_backend.service.nomic_service import NomicService
from ai_ta_backend.service.posthog_service import PosthogService
from ai_ta_backend.service.project_service import ProjectService
from ai_ta_backend.service.retrieval_service import RetrievalService
Expand Down Expand Up @@ -176,22 +176,21 @@ def delete(service: RetrievalService, flaskExecutor: ExecutorInterface):
return response


# NOTE(review): route disabled together with the NomicService binding; the diff
# paste interleaved the removed (active) lines with the added (commented) ones,
# which is not valid Python. Reconstructed here as the fully commented-out
# version — the new state this change intends.
# @app.route('/getNomicMap', methods=['GET'])
# def nomic_map(service: NomicService):
#   course_name: str = request.args.get('course_name', default='', type=str)
#   map_type: str = request.args.get('map_type', default='conversation', type=str)

#   if course_name == '':
#     # proper web error "400 Bad request"
#     abort(400, description=f"Missing required parameter: 'course_name' must be provided. Course name: `{course_name}`")

#   map_id = service.get_nomic_map(course_name, map_type)
#   print("nomic map\n", map_id)

#   response = jsonify(map_id)
#   response.headers.add('Access-Control-Allow-Origin', '*')
#   return response

# @app.route('/createDocumentMap', methods=['GET'])
# def createDocumentMap(service: NomicService):
Expand Down Expand Up @@ -236,28 +235,27 @@ def nomic_map(service: NomicService):
# response.headers.add('Access-Control-Allow-Origin', '*')
# return response

# NOTE(review): route disabled together with the NomicService binding; the diff
# paste interleaved the removed (active) lines with the added (commented) ones,
# which is not valid Python. Reconstructed here as the fully commented-out
# version — the new state this change intends.
# @app.route('/onResponseCompletion', methods=['POST'])
# def logToNomic(service: NomicService, flaskExecutor: ExecutorInterface):
#   data = request.get_json()
#   course_name = data['course_name']
#   conversation = data['conversation']

#   if course_name == '' or conversation == '':
#     # proper web error "400 Bad request"
#     abort(
#         400,
#         description=
#         f"Missing one or more required parameters: 'course_name' and 'conversation' must be provided. Course name: `{course_name}`, Conversation: `{conversation}`"
#     )
#   print(f"In /onResponseCompletion for course: {course_name}")

#   # background execution of tasks!!
#   #response = flaskExecutor.submit(service.log_convo_to_nomic, course_name, data)
#   #result = flaskExecutor.submit(service.log_to_conversation_map, course_name, conversation).result()
#   response = jsonify({'outcome': 'success'})
#   response.headers.add('Access-Control-Allow-Origin', '*')
#   return response


@app.route('/export-convo-history-csv', methods=['GET'])
Expand Down Expand Up @@ -519,31 +517,34 @@ def switch_workflow(service: WorkflowService) -> Response:
else:
abort(400, description=f"Bad request: {e}")


@app.route('/getConversationStats', methods=['GET'])
def get_conversation_stats(service: RetrievalService) -> Response:
  """Return aggregate conversation statistics for a course as JSON.

  Query params:
    course_name (required): course to aggregate stats for.
  Raises 400 when course_name is missing.
  """
  # The diff paste duplicated every statement of this body (old + new indent);
  # reconstructed as a single clean body.
  course_name = request.args.get('course_name', default='', type=str)

  if course_name == '':
    abort(400, description="Missing required parameter: 'course_name' must be provided.")

  conversation_stats = service.getConversationStats(course_name)

  response = jsonify(conversation_stats)
  response.headers.add('Access-Control-Allow-Origin', '*')
  return response

@app.route('/getConversationHeatmapByHour', methods=['GET'])
def get_questions_heatmap_by_hour(service: RetrievalService) -> Response:
  """Return per-hour conversation heatmap data for a course as JSON.

  Query params:
    course_name (required): course to build the heatmap for.
  Raises 400 when course_name is missing.
  """
  # The diff paste duplicated every statement of this body (old + new indent);
  # reconstructed as a single clean body.
  course_name = request.args.get('course_name', default='', type=str)

  if not course_name:
    abort(400, description="Missing required parameter: 'course_name' must be provided.")

  heatmap_data = service.getConversationHeatmapByHour(course_name)

  response = jsonify(heatmap_data)
  response.headers.add('Access-Control-Allow-Origin', '*')
  return response

@app.route('/run_flow', methods=['POST'])
def run_flow(service: WorkflowService) -> Response:
Expand Down Expand Up @@ -603,13 +604,30 @@ def createProject(service: ProjectService, flaskExecutor: ExecutorInterface) ->
return response


@app.route('/scrapeGrantsDotGov', methods=['POST'])
def scrapeGrantsDotGov(retrievalService: RetrievalService, sentryService: SentryService) -> Response:
  """
  Scrape grants.gov and ingest. We're careful to delete expired grants.
  """
  # Imported lazily so the scraper module only loads when this route is hit.
  from ai_ta_backend.service.scrape_grants_dot_gov import ScrapeGrantsDotGov

  grants_scraper = ScrapeGrantsDotGov(retrievalService=retrievalService, sentryService=sentryService)
  grants_scraper.main_scrape()

  response = jsonify("success")
  response.headers.add('Access-Control-Allow-Origin', '*')
  return response


def configure(binder: Binder) -> None:
binder.bind(ThreadPoolExecutorInterface, to=ThreadPoolExecutorAdapter(max_workers=10), scope=SingletonScope)
binder.bind(ProcessPoolExecutorInterface, to=ProcessPoolExecutorAdapter(max_workers=10), scope=SingletonScope)
binder.bind(RetrievalService, to=RetrievalService, scope=RequestScope)
binder.bind(PosthogService, to=PosthogService, scope=SingletonScope)
binder.bind(SentryService, to=SentryService, scope=SingletonScope)
binder.bind(NomicService, to=NomicService, scope=SingletonScope)
# binder.bind(NomicService, to=NomicService, scope=SingletonScope)
binder.bind(ExportService, to=ExportService, scope=SingletonScope)
binder.bind(WorkflowService, to=WorkflowService, scope=SingletonScope)
binder.bind(VectorDatabase, to=VectorDatabase, scope=SingletonScope)
Expand Down
30 changes: 14 additions & 16 deletions ai_ta_backend/service/nomic_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@
import pandas as pd
from injector import inject
from langchain.embeddings.openai import OpenAIEmbeddings
from nomic import AtlasProject, atlas

from ai_ta_backend.database.sql import SQLDatabase
from ai_ta_backend.service.sentry_service import SentryService

# from nomic import AtlasProject, atlas

LOCK_EXCEPTIONS = [
'Project is locked for state access! Please wait until the project is unlocked to access embeddings.',
'Project is locked for state access! Please wait until the project is unlocked to access data.',
Expand Down Expand Up @@ -63,7 +64,6 @@ def get_nomic_map(self, course_name: str, type: str):
self.sentry.capture_exception(e)
return {"map_id": None, "map_link": None}


def log_to_conversation_map(self, course_name: str, conversation):
"""
This function logs new conversations to existing nomic maps.
Expand All @@ -82,12 +82,12 @@ def log_to_conversation_map(self, course_name: str, conversation):
if not response.data:
print("Map does not exist for this course. Redirecting to map creation...")
return self.create_conversation_map(course_name)

# entry present for doc map, but not convo map
elif not response.data[0]['convo_map_id']:
print("Map does not exist for this course. Redirecting to map creation...")
return self.create_conversation_map(course_name)

project_id = response.data[0]['convo_map_id']
last_uploaded_convo_id = response.data[0]['last_uploaded_convo_id']

Expand Down Expand Up @@ -154,17 +154,16 @@ def log_to_conversation_map(self, course_name: str, conversation):
project_info = {'course_name': course_name, 'convo_map_id': project_id, 'last_uploaded_convo_id': last_id}
project_response = self.sql.updateProjects(course_name, project_info)
print("Update response from supabase: ", project_response)

# rebuild the map
self.rebuild_map(course_name, "conversation")
return "success"

except Exception as e:
print(e)
self.sentry.capture_exception(e)
return "Error in logging to conversation map: {e}"



def log_to_existing_conversation(self, course_name: str, conversation):
"""
This function logs follow-up questions to existing conversations in the map.
Expand All @@ -176,18 +175,18 @@ def log_to_existing_conversation(self, course_name: str, conversation):

# fetch id from supabase
incoming_id_response = self.sql.getConversation(course_name, key="convo_id", value=conversation_id)

project_name = 'Conversation Map for ' + course_name
project = AtlasProject(name=project_name, add_datums_if_exists=True)

prev_id = incoming_id_response.data[0]['id']
uploaded_data = project.get_data(ids=[prev_id]) # fetch data point from nomic
uploaded_data = project.get_data(ids=[prev_id]) # fetch data point from nomic
prev_convo = uploaded_data[0]['conversation']

# update conversation
messages = conversation['messages']
messages_to_be_logged = messages[-2:]

for message in messages_to_be_logged:
if message['role'] == 'user':
emoji = "🙋 "
Expand All @@ -200,7 +199,7 @@ def log_to_existing_conversation(self, course_name: str, conversation):
text = message['content']

prev_convo += "\n>>> " + emoji + message['role'] + ": " + text + "\n"

# create embeddings of first query
embeddings_model = OpenAIEmbeddings(openai_api_type="openai",
openai_api_base="https://api.openai.com/v1/",
Expand Down Expand Up @@ -228,15 +227,14 @@ def log_to_existing_conversation(self, course_name: str, conversation):
# re-insert updated conversation
result = self.append_to_map(embeddings, metadata, project_name)
print("Result of appending to existing map:", result)

return "success"

except Exception as e:
print("Error in log_to_existing_conversation():", e)
self.sentry.capture_exception(e)
return "Error in logging to existing conversation: {e}"


def create_conversation_map(self, course_name: str):
"""
This function creates a conversation map for a given course from scratch.
Expand Down Expand Up @@ -370,7 +368,6 @@ def create_conversation_map(self, course_name: str):
project_response = self.sql.insertProjectInfo(project_info)
print("Response from supabase: ", project_response)


# rebuild the map
self.rebuild_map(course_name, "conversation")
return "success"
Expand Down Expand Up @@ -470,7 +467,8 @@ def data_prep_for_convo_map(self, df: pd.DataFrame):

for _index, row in df.iterrows():
current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
created_at = datetime.datetime.strptime(row['created_at'], "%Y-%m-%dT%H:%M:%S.%f%z").strftime("%Y-%m-%d %H:%M:%S")
created_at = datetime.datetime.strptime(row['created_at'],
"%Y-%m-%dT%H:%M:%S.%f%z").strftime("%Y-%m-%d %H:%M:%S")
conversation_exists = False
conversation = ""
emoji = ""
Expand Down
Loading