diff --git a/.env.template b/.env.template
index a67a9b61..ff5cb614 100644
--- a/.env.template
+++ b/.env.template
@@ -1,10 +1,10 @@
-# ALL SERVICES ARE CONFIGURED HERE.
-# You can use these defaults, or BYO services to fit your needs.
-# The defaults should work 'out of the box' without any changes.
-# ⚠️ For Security, we recommend changing all variables marked with CHANGE ME.
+# OpenAI API key (required). Please add one.
+OPENAI_API_KEY=YOUR_OPENAI_API_KEY_HERE
+# Main backend Flask app
+FLASK_PORT=3012
 
 # OpenAI key is REQUIRED for Embeddings during ingest & RAG retrieval
-OPENAI_API_KEY= # ⚠️ CHANGE ME
+OPENAI_API_KEY= # ⚠️ REQUIRED: CHANGE ME
 
 # Qdrant Vector DB
 QDRANT_URL=http://qdrant:6333 # container name
@@ -42,15 +42,34 @@ PUBLIC_MINIO_DASHBOARD_PORT=9001
 
 # Ingest queue state is managed by Redis
 INGEST_REDIS_HOST=redis # container name
-INGEST_REDIS_PORT=6379
 INGEST_REDIS_PASSWORD=your-strong-password-here # ⚠️ CHANGE ME
+INGEST_REDIS_PORT=6379
 
-# Main backend Flask app
-FLASK_PORT=8000
-
-# Optional services. Adds functionality if you want it, but not necessary.
-# NOMIC_API_KEY=
-# POSTHOG_API_KEY=
-# SENTRY_DSN=
-# EMAIL_SENDER=
-# N8N_URL=
+# Object Storage: You can use either Minio or S3. Choose one, not both. Minio is used by default.
+AWS_ACCESS_KEY_ID=minioadmin
+AWS_SECRET_ACCESS_KEY=minioadmin
+DOCKER_INTERNAL_MINIO_API_PORT=10000
+DOCKER_INTERNAL_MINIO_DASHBOARD_PORT=10001
+PUBLIC_MINIO_API_PORT=3013
+PUBLIC_MINIO_DASHBOARD_PORT=3014
+MINIO_URL=http://minio:${DOCKER_INTERNAL_MINIO_API_PORT}
+
+# Qdrant Vector DB
+# QDRANT_API_KEY is set in the qdrant_config.yaml
+QDRANT_COLLECTION_NAME=uiuc-chat
+QDRANT_URL=http://qdrant:6333
+S3_BUCKET_NAME=uiuc-chat
+
+# Supabase related, should match the values in the Supabase .env file
+POSTGRES_PASSWORD=your-super-secret-and-long-postgres-password # Must match Supabase's password
+POSTGRES_HOST=supabase-db
+POSTGRES_PORT=5432
+POSTGRES_DB=postgres
+POSTGRES_USER=postgres
+
+# Application variables
+S3_BUCKET_NAME=uiuc-chat
+
+# Optional
+# POSTHOG_API_KEY=OPTIONAL
+# NOMIC_API_KEY=OPTIONAL
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..9e71c663
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "supabase"]
+	path = supabase
+	url = https://github.com/supabase/supabase
+[submodule "ic_crawlee"]
+	path = ic_crawlee
+	url = https://github.com/UIUC-Chatbot/crawlee.git
diff --git a/Dockerfile b/Dockerfile
index 3c10285b..ead125df 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -13,12 +13,13 @@ RUN apt-get update && apt-get install -y \
 ENV PATH="/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin"
 
+RUN pip install uv
 # Copy the requirements file first to leverage Docker cache
 COPY ai_ta_backend/requirements.txt .
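The `.env.template` above wires the backend, Qdrant, Minio, and the Supabase Postgres instance together, and several values (notably `POSTGRES_PASSWORD`) must match the Supabase `.env` file. Below is a minimal pre-flight sketch, assuming the variable names shown in the template and the `python-dotenv` package the backend already uses; the script and its helper are hypothetical, not part of this patch.

```python
# Sanity-check the .env file before bringing the stack up (assumed helper, not in the repo).
import os
import sys

from dotenv import load_dotenv

REQUIRED_VARS = [
    "OPENAI_API_KEY",
    "FLASK_PORT",
    "QDRANT_URL",
    "QDRANT_COLLECTION_NAME",
    "S3_BUCKET_NAME",
    "MINIO_URL",
    "POSTGRES_HOST",
    "POSTGRES_PORT",
    "POSTGRES_DB",
    "POSTGRES_USER",
    "POSTGRES_PASSWORD",
]


def check_env(path: str = ".env") -> list[str]:
    """Return the names of required variables that are missing or empty."""
    load_dotenv(path)
    return [name for name in REQUIRED_VARS if not os.getenv(name)]


if __name__ == "__main__":
    missing = check_env()
    if missing:
        sys.exit(f"Missing required settings: {', '.join(missing)}")
    print("All required settings are present.")
```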
 # Install any needed packages specified in requirements.txt
-RUN pip install -r requirements.txt
+RUN uv pip install -r requirements.txt --system
 
 # Mkdir for sqlite db
 RUN mkdir -p /usr/src/app/db
@@ -33,4 +34,4 @@ ENV PYTHONPATH="${PYTHONPATH}:/usr/src/app/ai_ta_backend"
 EXPOSE 8000
 
 # Run the application using Gunicorn with specified configuration
-CMD ["gunicorn", "--workers=1", "--threads=100", "--worker-class=gthread", "ai_ta_backend.main:app", "--timeout=1800", "--bind=0.0.0.0:8000"]
+CMD ["gunicorn", "--workers=1", "--threads=3", "--worker-class=gthread", "ai_ta_backend.main:app", "--timeout=1800", "--bind=0.0.0.0:8000"]
diff --git a/README.md b/README.md
index 241e5a4c..701351d9 100644
--- a/README.md
+++ b/README.md
@@ -19,12 +19,26 @@ Failure to obtain a commercial license for commercial use is a violation of the
 ## Docker Deployment
 
+### Supabase
+
+1. Duplicate `.env.example` from `supabase/docker/.env.example` and rename it to `.env`. E.g. `cp ./supabase/docker/.env.example ./supabase/docker/.env`
+2. Customize your env variables as needed in the Supabase Docker `.env` file.
+
+### Self-hosted Docker
+
 1. Duplicate `.env.template` and rename it to `.env`. E.g. `cp .env.template .env`
 2. Customize your env variables. Your vector database can be either Qdrant and Pinecone. The SQL database can be any of SQLite, Postgres, and Supabase. The object storage can be Minio or AWS S3.
-3. Run Docker Compose `docker compose up --build`
-4. Navitage to `localhost:8000` (or whatever your `$FLASK_PORT` is)
 
-To customize HTTP port used as the main entrypoint, set the `FLASK_PORT` variabel in your `.env`. It defaults to 8000.
+### Running simultaneously
+
+We've created an `init.sh` script that runs both docker-compose files with a single command. To do this, first give `init.sh` execute permission, then run it:
+
+```bash
+chmod +x init.sh
+./init.sh # runs the script
+```
+
+To customize the HTTP port used as the main entrypoint, set the `FLASK_PORT` variable in your `.env`. It defaults to 8188.
 
 Works on version: `Docker Compose version v2.27.1-desktop.1`
 
@@ -47,7 +61,7 @@ For local dev:
 
 1. Rename `.env.template` to `.env` and fill in the required variables
 2. Install Python requirements `pip install -r requirements.txt`
-3. Start the server for development (with live reloads) `cd ai_ta_backend` then `flask --app ai_ta_backend.main:app --debug run --port 8000`
+3. Start the server for development (with live reloads) `cd ai_ta_backend` then `flask --app ai_ta_backend.main:app --debug run --port 8188`
 
 ### Course metadata structure
 
@@ -62,3 +76,8 @@ For local dev:
     'url': doc.metadata.get('url'),  # wouldn't this error out?
     'base_url': doc.metadata.get('base_url'),
 ```
+
+
+### Note
+
+For Supabase, the version we are currently using is v1.24.09 ([link](https://github.com/supabase/supabase/tree/v1.24.09))
\ No newline at end of file
diff --git a/ai_ta_backend/database/qdrant.py b/ai_ta_backend/database/qdrant.py
index 18a792ff..0373363b 100644
--- a/ai_ta_backend/database/qdrant.py
+++ b/ai_ta_backend/database/qdrant.py
@@ -23,10 +23,10 @@ def __init__(self):
     """
     # vector DB
     self.qdrant_client = QdrantClient(
-        url='http://qdrant:6333',
+        url=os.getenv('QDRANT_URL', 'http://qdrant:6333'),  # Defaults to the qdrant container URL if not set
         https=False,
-        api_key=os.environ['QDRANT_API_KEY'],
-        timeout=20,  # default is 5 seconds. Getting timeout errors w/ document groups.
+ api_key=os.getenv('QDRANT_API_KEY'), + timeout=20, ) self.vectorstore = Qdrant( diff --git a/ai_ta_backend/main.py b/ai_ta_backend/main.py index 898c26cf..8da3607f 100644 --- a/ai_ta_backend/main.py +++ b/ai_ta_backend/main.py @@ -20,8 +20,10 @@ from injector import SingletonScope from langchain_core.messages import HumanMessage from langchain_core.messages import SystemMessage -from sqlalchemy import inspect +from sqlalchemy import inspect, text import urllib3 +from qdrant_client import QdrantClient, models +import boto3 from ai_ta_backend.database.aws import AWSStorage from ai_ta_backend.database.qdrant import VectorDatabase @@ -38,7 +40,7 @@ ThreadPoolExecutorInterface from ai_ta_backend.extensions import db from ai_ta_backend.service.export_service import ExportService -#from ai_ta_backend.service.nomic_service import NomicService +from ai_ta_backend.service.nomic_service import NomicService from ai_ta_backend.service.posthog_service import PosthogService from ai_ta_backend.service.retrieval_service import RetrievalService from ai_ta_backend.service.sentry_service import SentryService @@ -361,34 +363,34 @@ def exportDocuments(service: ExportService): return response -@app.route('/getTopContextsWithMQR', methods=['GET']) -def getTopContextsWithMQR(service: RetrievalService, posthog_service: PosthogService) -> Response: - """ - Get relevant contexts for a given search query, using Multi-query retrieval + filtering method. - """ - search_query: str = request.args.get('search_query', default='', type=str) - course_name: str = request.args.get('course_name', default='', type=str) - token_limit: int = request.args.get('token_limit', default=3000, type=int) - if search_query == '' or course_name == '': - # proper web error "400 Bad request" - abort( - 400, - description= - f"Missing one or more required parameters: 'search_query' and 'course_name' must be provided. Search query: `{search_query}`, Course name: `{course_name}`" - ) +# @app.route('/getTopContextsWithMQR', methods=['GET']) +# def getTopContextsWithMQR(service: RetrievalService, posthog_service: PosthogService) -> Response: +# """ +# Get relevant contexts for a given search query, using Multi-query retrieval + filtering method. +# """ +# search_query: str = request.args.get('search_query', default='', type=str) +# course_name: str = request.args.get('course_name', default='', type=str) +# token_limit: int = request.args.get('token_limit', default=3000, type=int) +# if search_query == '' or course_name == '': +# # proper web error "400 Bad request" +# abort( +# 400, +# description= +# f"Missing one or more required parameters: 'search_query' and 'course_name' must be provided. 
Search query: `{search_query}`, Course name: `{course_name}`" +# ) - posthog_service.capture(event_name='filter_top_contexts_invoked', - properties={ - 'user_query': search_query, - 'course_name': course_name, - 'token_limit': token_limit, - }) +# posthog_service.capture(event_name='filter_top_contexts_invoked', +# properties={ +# 'user_query': search_query, +# 'course_name': course_name, +# 'token_limit': token_limit, +# }) - found_documents = service.getTopContextsWithMQR(search_query, course_name, token_limit) +# found_documents = service.getTopContextsWithMQR(search_query, course_name, token_limit) - response = jsonify(found_documents) - response.headers.add('Access-Control-Allow-Origin', '*') - return response +# response = jsonify(found_documents) +# response.headers.add('Access-Control-Allow-Origin', '*') +# return response @app.route('/getworkflows', methods=['GET']) @@ -506,48 +508,96 @@ def configure(binder: Binder) -> None: sql_bound = False storage_bound = False - # Define database URLs with conditional checks for environment variables - encoded_password = quote_plus(os.getenv('SUPABASE_PASSWORD')) + # Encode the PostgreSQL password + #encoded_password = quote_plus(os.getenv('POSTGRES_PASSWORD')) + #print("ENCODED PASSWORD (i.e., POSTGRES_PASSWORD):", encoded_password) + + # Define database URLs with corrected environment variables + # DB_URLS = { + # 'supabase': + # f"postgresql://{os.getenv('POSTGRES_USER')}:{os.getenv('POSTGRES_PASSWORD')}@{os.getenv('POSTGRES_HOST')}:{os.getenv('POSTGRES_PORT')}/{os.getenv('POSTGRES_DB')}", + # 'sqlite': + # f"sqlite:///{os.getenv('SQLITE_DB_NAME')}" if os.getenv('SQLITE_DB_NAME') else None, + # 'postgres': + # f"postgresql://{os.getenv('POSTGRES_USER')}:{os.getenv('POSTGRES_PASSWORD')}@{os.getenv('POSTGRES_HOST')}:{os.getenv('POSTGRES_PORT')}/{os.getenv('POSTGRES_DB')}" + # if all([ + # os.getenv('POSTGRES_USER'), + # os.getenv('POSTGRES_PASSWORD'), + # os.getenv('POSTGRES_HOST'), + # os.getenv('POSTGRES_PORT'), + # os.getenv('POSTGRES_DB') + # ]) else None + # } + # print("DB_URLS:", DB_URLS) + + # # Bind to the first available SQL database configuration + # for db_type, url in DB_URLS.items(): + # if url: + # logging.info(f"Binding to {db_type} database with URL: {url}") + # with app.app_context(): + # app.config['SQLALCHEMY_DATABASE_URI'] = url + # db.init_app(app) + + # # Check if tables exist before creating them + # inspector = inspect(db.engine) + # existing_tables = inspector.get_table_names() + # print("Existing tables:", existing_tables) + # if not existing_tables: + # logging.info("Creating tables as the database is empty") + # db.create_all() + # else: + # logging.info("Tables already exist, skipping creation") + + # binder.bind(SQLAlchemyDatabase, to=SQLAlchemyDatabase(db), scope=SingletonScope) + # sql_bound = True + # break DB_URLS = { 'supabase': - f"postgresql://{os.getenv('SUPABASE_USER')}:{encoded_password}@{os.getenv('SUPABASE_URL')}", - 'sqlite': - f"sqlite:///{os.getenv('SQLITE_DB_NAME')}" if os.getenv('SQLITE_DB_NAME') else None, - 'postgres': - f"postgresql://{os.getenv('POSTGRES_USER')}:{os.getenv('POSTGRES_PASSWORD')}@{os.getenv('POSTGRES_URL')}" - if os.getenv('POSTGRES_USER') and os.getenv('POSTGRES_PASSWORD') and os.getenv('POSTGRES_URL') else None + f"postgresql://{os.getenv('POSTGRES_USER')}:{os.getenv('POSTGRES_PASSWORD')}@{os.getenv('POSTGRES_HOST')}:{os.getenv('POSTGRES_PORT')}/{os.getenv('POSTGRES_DB')}", } - - # Bind to the first available SQL database configuration + # Try to connect to Supabase and 
verify connection for db_type, url in DB_URLS.items(): if url: - logging.info(f"Binding to {db_type} database with URL: {url}") - with app.app_context(): - app.config['SQLALCHEMY_DATABASE_URI'] = url - db.init_app(app) - - # Check if tables exist before creating them - inspector = inspect(db.engine) - existing_tables = inspector.get_table_names() - - if not existing_tables: - logging.info("Creating tables as the database is empty") - db.create_all() - else: - logging.info("Tables already exist, skipping creation") - - binder.bind(SQLAlchemyDatabase, to=SQLAlchemyDatabase(db), scope=SingletonScope) - sql_bound = True - break + logging.info(f"Attempting to connect to {db_type} database with URL: {url}") + try: + with app.app_context(): + app.config['SQLALCHEMY_DATABASE_URI'] = url + db.init_app(app) + + # Test connection by executing a simple query with text() + with db.engine.connect() as connection: + connection.execute(text("SELECT 1")) + connection.commit() # Add commit to ensure transaction completion + logging.info(f"✅ Successfully connected to {db_type} database") + + # Check if tables exist + inspector = inspect(db.engine) + existing_tables = inspector.get_table_names() + logging.info(f"Found existing tables: {existing_tables}") + + if not existing_tables: + logging.info("Creating tables as database is empty") + db.create_all() + + binder.bind(SQLAlchemyDatabase, to=SQLAlchemyDatabase(db), scope=SingletonScope) + sql_bound = True + break + + except Exception as e: + logging.error(f"❌ Failed to connect to {db_type} database: {str(e)}") + continue # Conditionally bind databases based on the availability of their respective secrets - if all(os.getenv(key) for key in ["QDRANT_URL", "QDRANT_API_KEY", "QDRANT_COLLECTION_NAME"]) or any( + if all(os.getenv(key) for key in ["QDRANT_URL", "QDRANT_COLLECTION_NAME"]) or any( os.getenv(key) for key in ["PINECONE_API_KEY", "PINECONE_PROJECT_NAME"]): logging.info("Binding to Qdrant database") logging.info(f"Qdrant Collection Name: {os.environ['QDRANT_COLLECTION_NAME']}") logging.info(f"Qdrant URL: {os.environ['QDRANT_URL']}") - logging.info(f"Qdrant API Key: {os.environ['QDRANT_API_KEY']}") + if os.getenv("QDRANT_API_KEY"): + logging.info(f"Qdrant API Key: {os.environ['QDRANT_API_KEY']}") + else: + logging.warning("Qdrant API Key is not set") binder.bind(VectorDatabase, to=VectorDatabase, scope=SingletonScope) vector_bound = True @@ -590,7 +640,66 @@ def configure(binder: Binder) -> None: binder.bind(ProcessPoolExecutorInterface, to=ProcessPoolExecutorAdapter, scope=SingletonScope) logging.info("Configured all services and adapters", binder._bindings) + # TODO: Initialize the databases + + # Qdrant + # Initialize Qdrant collection if it doesn't exist + try: + qdrant_client = QdrantClient( + url=os.getenv('QDRANT_URL', 'http://qdrant:6333'), + https=False, + api_key=os.getenv('QDRANT_API_KEY'), + timeout=20, + ) + + # Create collection with OpenAI embedding dimensions + qdrant_client.recreate_collection( + collection_name=os.environ['QDRANT_COLLECTION_NAME'], + vectors_config=models.VectorParams( + size=1536, # OpenAI embedding dimensions + distance=models.Distance.COSINE + ) + ) + logging.info(f"Initialized Qdrant collection: {os.environ['QDRANT_COLLECTION_NAME']}") + except Exception as e: + logging.error(f"Failed to initialize Qdrant collection: {str(e)}") + + # Initialize Minio + try: + s3_client = boto3.client( + 's3', + endpoint_url=os.getenv('MINIO_URL'), + aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'), + 
aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'), + ) + + # Create bucket if it doesn't exist + bucket_name = os.environ['S3_BUCKET_NAME'] + try: + s3_client.head_bucket(Bucket=bucket_name) + logging.info(f"S3 bucket already exists: {bucket_name}") + + # Create courses/ path by putting an empty object + s3_client.put_object( + Bucket=bucket_name, + Key='courses/' + ) + logging.info(f"Created courses/ path in bucket: {bucket_name}") + except: + s3_client.create_bucket(Bucket=bucket_name) + logging.info(f"Created S3 bucket: {bucket_name}") + + # Create courses/ path in new bucket + s3_client.put_object( + Bucket=bucket_name, + Key='courses/' + ) + logging.info(f"Created courses/ path in bucket: {bucket_name}") + except Exception as e: + logging.error(f"Failed to initialize S3 bucket: {str(e)}") + + FlaskInjector(app=app, modules=[configure]) if __name__ == '__main__': - app.run(debug=True, port=int(os.getenv("PORT", default=8000))) # nosec -- reasonable bandit error suppression + app.run(debug=True, port=int(os.getenv("PORT", default=8000))) # nosec -- reasonable bandit error suppression \ No newline at end of file diff --git a/ai_ta_backend/model/models.py b/ai_ta_backend/model/models.py index 375a975c..068f1880 100644 --- a/ai_ta_backend/model/models.py +++ b/ai_ta_backend/model/models.py @@ -108,7 +108,7 @@ class DocumentsInProgress(Base): error = Column(Text) beam_task_id = Column(Text) - __table_args__ = (Index('documents_in_progress_pkey', 'id', postgresql_using='btree'),) + # __table_args__ = (Index('documents_in_progress_pkey', 'id', postgresql_using='btree'),) def to_dict(self): return { @@ -138,7 +138,7 @@ class DocumentsFailed(Base): doc_groups = Column(Text) error = Column(Text) - __table_args__ = (Index('documents_failed_pkey', 'id', postgresql_using='btree'),) + # __table_args__ = (Index('documents_failed_pkey', 'id', postgresql_using='btree'),) def to_dict(self): return { @@ -170,7 +170,7 @@ class Project(Base): __table_args__ = ( Index('projects_course_name_key', 'course_name', postgresql_using='btree'), - Index('projects_pkey', 'id', postgresql_using='btree'), + # Index('projects_pkey', 'id', postgresql_using='btree'), ) def to_dict(self): @@ -213,8 +213,8 @@ class LlmConvoMonitor(Base): __table_args__ = ( Index('llm_convo_monitor_course_name_idx', 'course_name', postgresql_using='hash'), - Index('llm-convo-monitor_convo_id_key', 'convo_id', postgresql_using='btree'), - Index('llm-convo-monitor_pkey', 'id', postgresql_using='btree'), + # Index('llm-convo-monitor_convo_id_key', 'convo_id', postgresql_using='btree'), + # Index('llm-convo-monitor_pkey', 'id', postgresql_using='btree'), ) def to_dict(self): @@ -242,7 +242,7 @@ class Conversations(Base): folder_id = Column(UUID(as_uuid=True)) __table_args__ = ( - Index('conversations_pkey', 'id', postgresql_using='btree'), + # Index('conversations_pkey', 'id', postgresql_using='btree'), Index('idx_user_email_updated_at', 'user_email', 'updated_at', postgresql_using='btree'), ) @@ -277,9 +277,9 @@ class Messages(Base): content_image_url = Column(Text) image_description = Column(Text) - __table_args__ = ( - Index('messages_pkey', 'id', postgresql_using='btree'), - ) + # __table_args__ = ( + # Index('messages_pkey', 'id', postgresql_using='btree'), + # ) def to_dict(self): return { @@ -307,9 +307,9 @@ class PreAuthAPIKeys(Base): providerName = Column(Text) notes = Column(Text) - __table_args__ = ( - Index('pre-authorized-api-keys_pkey', 'id', postgresql_using='btree'), - ) + # __table_args__ = ( + # 
Index('pre-authorized-api-keys_pkey', 'id', postgresql_using='btree'), + # ) def to_dict(self): return { diff --git a/ai_ta_backend/redis_queue/ingest.py b/ai_ta_backend/redis_queue/ingest.py index d786fef4..2f2f1521 100644 --- a/ai_ta_backend/redis_queue/ingest.py +++ b/ai_ta_backend/redis_queue/ingest.py @@ -52,7 +52,6 @@ from ai_ta_backend.redis_queue.ingestSQL import SQLAlchemyIngestDB from dotenv import load_dotenv -import logging load_dotenv() logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s') @@ -111,8 +110,9 @@ def initialize_resources(self): self.sql_session = SQLAlchemyIngestDB() if self.posthog_api_key: - self.posthog = Posthog(sync_mode=True, project_api_key=self.posthog_api_key, host='https://app.posthog.com') + self.posthog = Posthog(sync_mode=False, project_api_key=self.posthog_api_key, host='https://app.posthog.com') else: + self.posthog = None print("POSTHOG API KEY NOT FOUND!") def main_ingest(self, **inputs: Dict[str | List[str], Any]): @@ -999,6 +999,7 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]], ** ), f'must have equal number of text strings and metadata dicts. len(texts) is {len(texts)}. len(metadatas) is {len(metadatas)}' try: + logging.info("Before Text Splitter") text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( chunk_size=1000, chunk_overlap=150, @@ -1010,6 +1011,7 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]], ** input_texts = [{'input': context.page_content, 'model': 'text-embedding-ada-002'} for context in contexts] # check for duplicates + logging.info(f"Before checking for duplicates") is_duplicate = self.check_for_duplicates(input_texts, metadatas) if is_duplicate: if self.posthog: @@ -1030,8 +1032,7 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]], ** context.metadata['chunk_index'] = i context.metadata['doc_groups'] = kwargs.get('groups', []) - logging.info("Starting to call embeddings API") - + logging.info("Before call to embeddings API") embeddings_start_time = time.monotonic() oai = OpenAIAPIProcessor( input_prompts_list=input_texts, @@ -1059,6 +1060,7 @@ def split_and_upload(self, texts: List[str], metadatas: List[Dict[str, Any]], ** PointStruct(id=str(uuid.uuid4()), vector=embeddings_dict[context.page_content], payload=upload_metadata)) try: + logging.info(f"Before Upsert to Qdrant") self.qdrant_client.upsert( collection_name=os.environ['QDRANT_COLLECTION_NAME'], # type: ignore points=vectors, # type: ignore @@ -1137,10 +1139,12 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any] incoming_s3_path = metadatas[0]['s3_path'] url = metadatas[0]['url'] + logging.info(f"In check_for_duplicates") + if incoming_s3_path: # check if uuid exists in s3_path -- not all s3_paths have uuids! incoming_filename = incoming_s3_path.split('/')[-1] - # print("Full filename: ", incoming_filename) + logging.debug(f"Full filename: {incoming_filename}") pattern = re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}', re.I) # uuid V4 pattern, and v4 only. 
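`check_for_duplicates` normalizes the incoming filename by stripping a v4 UUID before looking up existing records; the stripping line itself is elided from this hunk, so the following is only a sketch of that step, reusing the same v4-only pattern (the helper name and example filename are made up):

```python
import re

# Same v4-only UUID pattern used in check_for_duplicates above.
UUID_V4 = re.compile(r'[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}', re.I)


def strip_uuid_v4(filename: str) -> str:
  """Remove an embedded v4 UUID (and any stray separator before the extension) from a filename."""
  stripped = UUID_V4.sub('', filename)
  # Clean up a leftover '_' or '-' right before the extension, e.g. 'report_.pdf' -> 'report.pdf'
  return re.sub(r'[_-]+(\.[^.]+)$', r'\1', stripped)


# 'report_1b9d6bcd-bbfd-4b2d-9b5d-ab8dfbbd4bed.pdf' -> 'report.pdf'
print(strip_uuid_v4('report_1b9d6bcd-bbfd-4b2d-9b5d-ab8dfbbd4bed.pdf'))
```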
if bool(pattern.search(incoming_filename)): @@ -1149,17 +1153,17 @@ def check_for_duplicates(self, texts: List[Dict], metadatas: List[Dict[str, Any] else: # do not remove anything and proceed with duplicate checking original_filename = incoming_filename - print(f"Filename after removing uuid: {original_filename}") + logging.info(f"Filename after removing uuid: {original_filename}") supabase_contents = self.sql_session.get_like_docs_by_s3_path(course_name, original_filename) supabase_contents = supabase_contents['data'] - print(f"No. of S3 path based records retrieved: {len(supabase_contents)}") # multiple records can be retrieved: 3.pdf and 453.pdf + logging.info(f"No. of S3 path based records retrieved: {len(supabase_contents)}") # multiple records can be retrieved: 3.pdf and 453.pdf elif url: original_filename = url supabase_contents = self.sql_session.get_like_docs_by_url(course_name, url) supabase_contents = supabase_contents['data'] - print(f"No. of URL-based records retrieved: {len(supabase_contents)}") + logging.info(f"No. of URL-based records retrieved: {len(supabase_contents)}") else: original_filename = None supabase_contents = [] diff --git a/ai_ta_backend/redis_queue/ingestSQL.py b/ai_ta_backend/redis_queue/ingestSQL.py index 7a019623..4dccc4f2 100644 --- a/ai_ta_backend/redis_queue/ingestSQL.py +++ b/ai_ta_backend/redis_queue/ingestSQL.py @@ -1,4 +1,5 @@ import os +import logging from urllib.parse import quote_plus from ai_ta_backend.model import models @@ -59,13 +60,16 @@ def __init__(self) -> None: db_uri = f"postgresql://{os.getenv('SUPABASE_USER')}:{encoded_password}@{os.getenv('SUPABASE_URL')}" elif db_type == 'sqlite': db_uri = f"sqlite:///{os.getenv('SQLITE_DB_NAME')}" - else: # postgres - db_uri = f"postgresql://{os.getenv('POSTGRES_USER')}:{os.getenv('POSTGRES_PASSWORD')}@{os.getenv('POSTGRES_HOST')}" + else: + # postgres + db_uri = f"postgresql://{os.getenv('POSTGRES_USER')}:{os.getenv('POSTGRES_PASSWORD')}@{os.getenv('POSTGRES_HOST')}:{os.getenv('POSTGRES_PORT')}/{os.getenv('POSTGRES_DB')}" # Create engine and session + print("About to connect to DB from IngestSQL.py, with URI:", db_uri) engine = create_engine(db_uri) Session = sessionmaker(bind=engine) self.session = Session() + print("Successfully connected to DB from IngestSQL.py") def insert_document_in_progress(self, doc_progress_payload: dict): insert_stmt = insert(models.DocumentsInProgress).values(doc_progress_payload) @@ -146,6 +150,7 @@ def add_document_to_group(self, contexts, groups): return None, 0 def get_like_docs_by_s3_path(self, course_name, original_filename): + logging.info(f"In get_like_docs_by_s3_path") query = ( select(models.Document.id, models.Document.contexts, models.Document.s3_path) .where(models.Document.course_name == course_name) @@ -153,6 +158,7 @@ def get_like_docs_by_s3_path(self, course_name, original_filename): .order_by(desc(models.Document.id)) ) result = self.session.execute(query).mappings().all() + logging.info(f"In get_like_docs_by_s3_path, result: {result}") response = DatabaseResponse(data=result, count=len(result)).to_dict() return response diff --git a/ai_ta_backend/requirements.txt b/ai_ta_backend/requirements.txt index d29c6e01..4c7a56c2 100644 --- a/ai_ta_backend/requirements.txt +++ b/ai_ta_backend/requirements.txt @@ -3,7 +3,7 @@ flask-cors==4.0.0 Flask-Injector==0.15.0 gunicorn==21.2.0 protobuf==4.25.0 -aiohttp==3.8.6 +aiohttp==3.11.11 wheel==0.41.3 click==8.1.7 MarkupSafe==2.1.3 @@ -16,7 +16,8 @@ mkdocs==1.5.3 Flask-SQLAlchemy==3.1.1 tabulate==0.9.0 
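The `ingestSQL.py` change above assembles the Postgres URI from `POSTGRES_USER`, `POSTGRES_PASSWORD`, `POSTGRES_HOST`, `POSTGRES_PORT`, and `POSTGRES_DB` (served by the `psycopg2-binary` driver pinned in this requirements file). A minimal sketch of that assembly, adding `quote_plus` so passwords with special characters survive URI parsing, as the Supabase branch already does (the helper name is made up):

```python
import os
from urllib.parse import quote_plus


def build_postgres_uri() -> str:
  """Assemble a SQLAlchemy/psycopg2 URI from the POSTGRES_* variables in .env."""
  user = os.getenv('POSTGRES_USER', 'postgres')
  password = quote_plus(os.getenv('POSTGRES_PASSWORD', ''))  # escape ':', '@', '/', etc.
  host = os.getenv('POSTGRES_HOST', 'supabase-db')
  port = os.getenv('POSTGRES_PORT', '5432')
  db = os.getenv('POSTGRES_DB', 'postgres')
  return f"postgresql://{user}:{password}@{host}:{port}/{db}"


# e.g. postgresql://postgres:p%40ss@supabase-db:5432/postgres for password 'p@ss'
```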
typing-inspect==0.9.0 -typing_extensions==4.8.0 +typing_extensions==4.12.2 +psycopg2-binary==2.9.10 # Utils tiktoken==0.7.0 @@ -36,8 +37,8 @@ langchain-openai==0.1.8 # Data boto3==1.28.79 -qdrant-client==1.7.3 -supabase==2.0.2 +qdrant-client==1.12.2 +supabase==2.11.0 # Logging posthog==3.1.0 @@ -76,6 +77,3 @@ pdfplumber==0.11.4 # unstructured.pytesseract==0.3.12 # unstructured-inference==0.7.11 # this is the real large one :( # unstructured[xlsx,image,pptx]==0.10.29 # causes huge ~5.3 GB of installs. Probbably from onnx: https://github.com/Unstructured-IO/unstructured/blob/ad14321016533dc03c1782f6ebea00bc9c804846/requirements/extra-pdf-image.in#L4 - - -psycopg2-binary \ No newline at end of file diff --git a/ai_ta_backend/service/nomic_service.py b/ai_ta_backend/service/nomic_service.py index 92cc1377..f3c70d76 100644 --- a/ai_ta_backend/service/nomic_service.py +++ b/ai_ta_backend/service/nomic_service.py @@ -25,10 +25,13 @@ class NomicService(): @inject def __init__(self, sentry: SentryService, sql: SQLAlchemyDatabase): - if os.getenv("NOMIC_API_KEY"): - nomic.login(os.getenv("NOMIC_API_KEY")) - self.sentry = sentry - self.sql = sql + self.sentry = sentry + self.sql = sql + + if os.getenv('NOMIC_API_KEY'): + nomic.login(os.getenv('NOMIC_API_KEY')) + else: + logging.info("NOMIC_API_KEY not found. Nomic functionality will be disabled.") def get_nomic_map(self, course_name: str, type: str): """ @@ -38,6 +41,9 @@ def get_nomic_map(self, course_name: str, type: str): map link: https://atlas.nomic.ai/map/ed222613-97d9-46a9-8755-12bbc8a06e3a/f4967ad7-ff37-4098-ad06-7e1e1a93dd93 map id: f4967ad7-ff37-4098-ad06-7e1e1a93dd93 """ + if not os.getenv('NOMIC_API_KEY'): + logging.warning("Nomic functionality is disabled. Cannot get Nomic map.") + return {"map_id": None, "map_link": None} # nomic.login(os.getenv('NOMIC_API_KEY')) # login during start of flask app if type.lower() == 'document': NOMIC_MAP_NAME_PREFIX = 'Document Map for ' diff --git a/ai_ta_backend/service/posthog_service.py b/ai_ta_backend/service/posthog_service.py index bff1f89b..ab57f601 100644 --- a/ai_ta_backend/service/posthog_service.py +++ b/ai_ta_backend/service/posthog_service.py @@ -8,18 +8,15 @@ class PosthogService: @inject def __init__(self): - if not os.getenv("POSTHOG_API_KEY"): + if os.getenv("POSTHOG_API_KEY"): + self.posthog = Posthog( + sync_mode=False, + project_api_key=os.getenv("POSTHOG_API_KEY", None), + host="https://app.posthog.com", + ) + else: self.posthog = None - return - - self.posthog = Posthog( - sync_mode=True, - project_api_key=os.environ["POSTHOG_API_KEY"], - host="https://app.posthog.com", - ) def capture(self, event_name, properties): - if not self.posthog: - return - - self.posthog.capture("distinct_id_of_the_user", event=event_name, properties=properties) + if self.posthog: + self.posthog.capture("distinct_id_of_the_user", event=event_name, properties=properties) diff --git a/ai_ta_backend/service/retrieval_service.py b/ai_ta_backend/service/retrieval_service.py index 2a4386d8..c95aa7a1 100644 --- a/ai_ta_backend/service/retrieval_service.py +++ b/ai_ta_backend/service/retrieval_service.py @@ -36,6 +36,7 @@ def __init__(self, vdb: VectorDatabase, sqlDb: SQLAlchemyDatabase, aws: AWSStora self.nomicService = nomicService logging.info(f"Vector DB: {self.vdb}") + logging.info(f"Posthog service: {self.posthog}") openai.api_key = os.environ["OPENAI_API_KEY"] @@ -106,7 +107,7 @@ def getTopContexts(self, if len(valid_docs) == 0: return [] - if self.posthog is not None: + if self.posthog: 
self.posthog.capture( event_name="getTopContexts_success_DI", properties={ @@ -127,7 +128,7 @@ def getTopContexts(self, err: str = f"ERROR: In /getTopContexts. Course: {course_name} ||| search_query: {search_query}\nTraceback: {traceback.print_exc()} \n{e}" # type: ignore traceback.print_exc() logging.info(err) - if self.sentry is not None: + if self.sentry: self.sentry.capture_exception(e) return err @@ -175,14 +176,14 @@ def delete_data(self, course_name: str, s3_path: str, source_url: str): # Delete from Qdrant self.delete_from_qdrant(identifier_key, identifier_value) - # Delete from Nomic and Supabase - self.delete_from_nomic_and_supabase(course_name, identifier_key, identifier_value) + # Delete from Nomic and Supabase (commented out for now because Nomic is having issues) + # self.delete_from_nomic_and_supabase(course_name, identifier_key, identifier_value) return "Success" except Exception as e: err: str = f"ERROR IN delete_data: Traceback: {traceback.extract_tb(e.__traceback__)}❌❌ Error in {inspect.currentframe().f_code.co_name}:{e}" # type: ignore logging.info(err) - if self.sentry is not None: + if self.sentry: self.sentry.capture_exception(e) return err @@ -193,7 +194,7 @@ def delete_from_s3(self, bucket_name: str, s3_path: str): logging.info(f"AWS response: {response}") except Exception as e: logging.info("Error in deleting file from s3:", e) - if self.sentry is not None: + if self.sentry: self.sentry.capture_exception(e) def delete_from_qdrant(self, identifier_key: str, identifier_value: str): @@ -207,7 +208,7 @@ def delete_from_qdrant(self, identifier_key: str, identifier_value: str): pass else: logging.info("Error in deleting file from Qdrant:", e) - if self.sentry is not None: + if self.sentry: self.sentry.capture_exception(e) def getTopContextsWithMQR(self, search_query: str, course_name: str, token_limit: int = 4_000) -> Union[List[Dict], str]: @@ -341,11 +342,11 @@ def delete_from_nomic_and_supabase(self, course_name: str, identifier_key: str, if not data: raise Exception(f"No document map found for this course: {course_name}") project_id = str(data[0].doc_map_id) - if self.nomicService is not None: + if self.nomicService: self.nomicService.delete_from_document_map(project_id, nomic_ids_to_delete) except Exception as e: logging.info(f"Nomic Error in deleting. {identifier_key}: {identifier_value}", e) - if self.sentry is not None: + if self.sentry: self.sentry.capture_exception(e) try: @@ -353,7 +354,7 @@ def delete_from_nomic_and_supabase(self, course_name: str, identifier_key: str, response = self.sqlDb.deleteMaterialsForCourseAndKeyAndValue(course_name, identifier_key, identifier_value) except Exception as e: logging.info(f"Supabase Error in delete. {identifier_key}: {identifier_value}", e) - if self.sentry is not None: + if self.sentry: self.sentry.capture_exception(e) def vector_search(self, search_query, course_name, doc_groups: List[str] | None = None): @@ -384,7 +385,7 @@ def _embed_query_and_measure_latency(self, search_query): return user_query_embedding def _capture_search_invoked_event(self, search_query, course_name, doc_groups): - if self.posthog is not None: + if self.posthog: self.posthog.capture( event_name="vector_search_invoked", properties={ @@ -393,6 +394,8 @@ def _capture_search_invoked_event(self, search_query, course_name, doc_groups): "doc_groups": doc_groups, }, ) + else: + logging.info("Posthog service not available. 
Skipping event capture.") def _perform_vector_search(self, search_query, course_name, doc_groups, user_query_embedding, top_n): qdrant_start_time = time.monotonic() @@ -413,14 +416,14 @@ def _process_search_results(self, search_results, course_name): found_docs.append(Document(page_content=page_content, metadata=metadata)) except Exception as e: logging.info(f"Error in vector_search(), for course: `{course_name}`. Error: {e}") - if self.sentry is not None: + if self.sentry: self.sentry.capture_exception(e) return found_docs def _capture_search_succeeded_event(self, search_query, course_name, search_results): vector_score_calc_latency_sec = time.monotonic() max_vector_score, min_vector_score, avg_vector_score = self._calculate_vector_scores(search_results) - if self.posthog is not None: + if self.posthog: self.posthog.capture( event_name="vector_search_succeeded", properties={ @@ -434,6 +437,8 @@ def _capture_search_succeeded_event(self, search_query, course_name, search_resu "vector_score_calculation_latency_sec": time.monotonic() - vector_score_calc_latency_sec, }, ) + else: + logging.info("Posthog service not available. Skipping event capture.") def _calculate_vector_scores(self, search_results): max_vector_score = 0 diff --git a/db/migrations/20240814013542_remote_schema.sql b/db/migrations/20240814013542_remote_schema.sql new file mode 100644 index 00000000..4bead0dc --- /dev/null +++ b/db/migrations/20240814013542_remote_schema.sql @@ -0,0 +1,1175 @@ + +SET statement_timeout = 0; +SET lock_timeout = 0; +SET idle_in_transaction_session_timeout = 0; +SET client_encoding = 'UTF8'; +SET standard_conforming_strings = on; +SELECT pg_catalog.set_config('search_path', '', false); +SET check_function_bodies = false; +SET xmloption = content; +SET client_min_messages = warning; +SET row_security = off; + +CREATE EXTENSION IF NOT EXISTS "pg_tle"; + +-- CREATE EXTENSION IF NOT EXISTS "supabase-dbdev" WITH SCHEMA "public"; + +CREATE EXTENSION IF NOT EXISTS "pgsodium" WITH SCHEMA "pgsodium"; + +CREATE EXTENSION IF NOT EXISTS "http" WITH SCHEMA "extensions"; + +CREATE EXTENSION IF NOT EXISTS "hypopg" WITH SCHEMA "public"; + +-- CREATE EXTENSION IF NOT EXISTS "olirice-index_advisor" WITH SCHEMA "public"; + +CREATE EXTENSION IF NOT EXISTS "pg_graphql" WITH SCHEMA "graphql"; + +CREATE EXTENSION IF NOT EXISTS "pg_stat_statements" WITH SCHEMA "extensions"; + +CREATE EXTENSION IF NOT EXISTS "pgcrypto" WITH SCHEMA "extensions"; + +CREATE EXTENSION IF NOT EXISTS "pgjwt" WITH SCHEMA "extensions"; + +CREATE EXTENSION IF NOT EXISTS "supabase_vault" WITH SCHEMA "vault"; + +CREATE EXTENSION IF NOT EXISTS "uuid-ossp" WITH SCHEMA "extensions"; + +CREATE OR REPLACE FUNCTION "public"."add_document_to_group"("p_course_name" "text", "p_s3_path" "text", "p_url" "text", "p_readable_filename" "text", "p_doc_groups" "text"[]) RETURNS boolean + LANGUAGE "plpgsql" + AS $$DECLARE + v_document_id bigint; + v_doc_group_id bigint; + v_success boolean := true; + p_doc_group text; +BEGIN + -- Ensure the document exists + SELECT id INTO v_document_id FROM public.documents WHERE course_name = p_course_name AND ( + (s3_path <> '' AND s3_path IS NOT NULL AND s3_path = p_s3_path) OR + (url = p_url) +); + + raise log 'id of document: %', v_document_id; + + IF NOT FOUND THEN + RAISE EXCEPTION 'Document does not exist'; + END IF; + + -- Loop through document groups + FOREACH p_doc_group IN ARRAY p_doc_groups + LOOP + -- Upsert document group, assuming 'name' and 'course_name' can uniquely identify a row + INSERT INTO 
public.doc_groups(name, course_name) + VALUES (p_doc_group, p_course_name) + ON CONFLICT (name, course_name) DO UPDATE + SET name = EXCLUDED.name + RETURNING id INTO v_doc_group_id; + + raise log 'id of document group: %', v_doc_group_id; + + -- Upsert the association in documents_doc_groups + INSERT INTO public.documents_doc_groups(document_id, doc_group_id) + VALUES (v_document_id, v_doc_group_id) + ON CONFLICT (document_id, doc_group_id) DO NOTHING; + + raise log 'completed for %',v_doc_group_id; + END LOOP; + + raise log 'completed for %',v_document_id; + RETURN v_success; +EXCEPTION + WHEN OTHERS THEN + v_success := false; + RAISE; + RETURN v_success; +END;$$; + +ALTER FUNCTION "public"."add_document_to_group"("p_course_name" "text", "p_s3_path" "text", "p_url" "text", "p_readable_filename" "text", "p_doc_groups" "text"[]) OWNER TO "postgres"; + +CREATE OR REPLACE FUNCTION "public"."add_document_to_group_url"("p_course_name" "text", "p_s3_path" "text", "p_url" "text", "p_readable_filename" "text", "p_doc_groups" "text"[]) RETURNS boolean + LANGUAGE "plpgsql" + AS $$DECLARE + v_document_id bigint; + v_doc_group_id bigint; + v_success boolean := true; + p_doc_group text; +BEGIN + -- Ensure the document exists + SELECT id INTO v_document_id FROM public.documents WHERE course_name = p_course_name AND ( + (s3_path <> '' AND s3_path IS NOT NULL AND s3_path = p_s3_path) +); + + raise log 'id of document: %', v_document_id; + + IF NOT FOUND THEN + RAISE EXCEPTION 'Document does not exist'; + END IF; + + -- Loop through document groups + FOREACH p_doc_group IN ARRAY p_doc_groups + LOOP + -- Upsert document group, assuming 'name' and 'course_name' can uniquely identify a row + INSERT INTO public.doc_groups(name, course_name) + VALUES (p_doc_group, p_course_name) + ON CONFLICT (name, course_name) DO UPDATE + SET name = EXCLUDED.name + RETURNING id INTO v_doc_group_id; + + raise log 'id of document group: %', v_doc_group_id; + + -- Upsert the association in documents_doc_groups + INSERT INTO public.documents_doc_groups(document_id, doc_group_id) + VALUES (v_document_id, v_doc_group_id) + ON CONFLICT (document_id, doc_group_id) DO NOTHING; + + raise log 'completed for %',v_doc_group_id; + END LOOP; + + raise log 'completed for %',v_document_id; + RETURN v_success; +EXCEPTION + WHEN OTHERS THEN + v_success := false; + RAISE; + RETURN v_success; +END;$$; + +ALTER FUNCTION "public"."add_document_to_group_url"("p_course_name" "text", "p_s3_path" "text", "p_url" "text", "p_readable_filename" "text", "p_doc_groups" "text"[]) OWNER TO "postgres"; + +CREATE OR REPLACE FUNCTION "public"."c"() RETURNS "record" + LANGUAGE "plpgsql" + AS $$DECLARE + course_names record; +BEGIN + SELECT distinct course_name INTO course_names + FROM public.documents + LIMIT 1; + + RAISE LOG 'distinct_course_names: %', course_names; + RETURN course_names; +END;$$; + +ALTER FUNCTION "public"."c"() OWNER TO "postgres"; + +CREATE OR REPLACE FUNCTION "public"."check_and_lock_flows_v2"("id" integer) RETURNS "text" + LANGUAGE "plpgsql" + AS $$DECLARE + workflow_id bigint; + workflow_locked boolean; +BEGIN + -- Get the latest workflow id and its lock status + select latest_workflow_id, is_locked + into workflow_id, workflow_locked + from public.n8n_workflows + order by latest_workflow_id desc + limit 1; + + -- Check if the latest workflow is locked + if id = workflow_id then + return 'id already exists'; + elseif workflow_locked then + return 'Workflow is locked'; + else + -- Update the latest_workflow_id + update public.n8n_workflows 
+ set latest_workflow_id = id, + is_locked = True + where latest_workflow_id = workflow_id; + return 'Workflow updated'; + + + end if; +end;$$; + +ALTER FUNCTION "public"."check_and_lock_flows_v2"("id" integer) OWNER TO "postgres"; + +CREATE OR REPLACE FUNCTION "public"."cn"() RETURNS "text" + LANGUAGE "plpgsql" + AS $$DECLARE + course_names text; +BEGIN + SELECT distinct course_name INTO course_names + FROM public.documents; + + RAISE LOG 'distinct_course_names: %', course_names; + RETURN course_names; +END;$$; + +ALTER FUNCTION "public"."cn"() OWNER TO "postgres"; + +CREATE OR REPLACE FUNCTION "public"."get_course_names"() RETURNS "json" + LANGUAGE "plpgsql" + AS $$DECLARE + course_names text; +BEGIN + -- Get the latest workflow id and its lock status + SELECT distinct course_name INTO course_names + FROM public.documents; + + RAISE LOG 'distinct_course_names: %', course_names; + RETURN course_names; +END;$$; + +ALTER FUNCTION "public"."get_course_names"() OWNER TO "postgres"; + +CREATE OR REPLACE FUNCTION "public"."get_distinct_course_names"() RETURNS TABLE("course_name" "text") + LANGUAGE "plpgsql" + AS $$ +BEGIN + RETURN QUERY + SELECT DISTINCT d.course_name + FROM public.documents d + WHERE d.course_name IS NOT NULL; + + RAISE LOG 'Distinct course names retrieved'; +END; +$$; + +ALTER FUNCTION "public"."get_distinct_course_names"() OWNER TO "postgres"; + +CREATE OR REPLACE FUNCTION "public"."get_latest_workflow_id"() RETURNS bigint + LANGUAGE "plpgsql" + AS $$DECLARE + v_workflow_id bigint; +BEGIN + -- Get the latest workflow id and its lock status + SELECT latest_workflow_id INTO v_workflow_id + FROM public.n8n_workflows + ORDER BY latest_workflow_id DESC + LIMIT 1; + + RAISE LOG 'latest_workflow_id: %', v_workflow_id; + RETURN v_workflow_id; +END;$$; + +ALTER FUNCTION "public"."get_latest_workflow_id"() OWNER TO "postgres"; + +CREATE OR REPLACE FUNCTION "public"."hello"() RETURNS "text" + LANGUAGE "sql" + AS $$select 'hello world';$$; + +ALTER FUNCTION "public"."hello"() OWNER TO "postgres"; + +CREATE OR REPLACE FUNCTION "public"."increment"("usage" integer, "apikey" "text") RETURNS "void" + LANGUAGE "sql" + AS $$ + update api_keys + set usage_count = usage_count + usage + where key = apikey + $$; + +ALTER FUNCTION "public"."increment"("usage" integer, "apikey" "text") OWNER TO "postgres"; + +CREATE OR REPLACE FUNCTION "public"."increment_api_usage"("usage" integer, "apikey" "text") RETURNS "void" + LANGUAGE "sql" SECURITY DEFINER + AS $_$create function increment (usage int, apikey string) + returns void as + $$ + update api_keys + set usage_count = usage_count + usage + where api_key = apiKey + $$ + language sql volatile;$_$; + +ALTER FUNCTION "public"."increment_api_usage"("usage" integer, "apikey" "text") OWNER TO "postgres"; + +CREATE OR REPLACE FUNCTION "public"."increment_api_usage_count"("usage" integer, "apikey" "text") RETURNS "void" + LANGUAGE "sql" SECURITY DEFINER + AS $_$create function increment (usage int, apikey text) + returns void as + $$ + update api_keys + set usage_count = usage_count + usage + where key = apikey + $$ + language sql volatile;$_$; + +ALTER FUNCTION "public"."increment_api_usage_count"("usage" integer, "apikey" "text") OWNER TO "postgres"; + +CREATE OR REPLACE FUNCTION "public"."increment_workflows"() RETURNS "trigger" + LANGUAGE "plpgsql" + AS $$BEGIN + -- Increase doc_count on insert + IF TG_OP = 'INSERT' THEN + UPDATE n8n_workflows + SET latest_workflow_id = NEW.latest_workflow_id, + is_locked = True + WHERE latest_workflow_id = 
NEW.latest_workflow_id; + RETURN NEW; + -- Decrease doc_count on delete + ELSIF TG_OP = 'DELETE' THEN + UPDATE n8n_workflows + SET latest_workflow_id = OLD.latest_workflow_id, + is_locked = False + WHERE latest_workflow_id = OLD.latest_workflow_id; + RETURN OLD; + END IF; + RETURN NULL; -- Should never reach here +END;$$; + +ALTER FUNCTION "public"."increment_workflows"() OWNER TO "postgres"; + +CREATE OR REPLACE FUNCTION "public"."remove_document_from_group"("p_course_name" "text", "p_s3_path" "text", "p_url" "text", "p_doc_group" "text") RETURNS "void" + LANGUAGE "plpgsql" + AS $$DECLARE + v_document_id bigint; + v_doc_group_id bigint; + v_doc_count bigint; +BEGIN + -- Check if the document exists + SELECT id INTO v_document_id FROM public.documents WHERE course_name = p_course_name AND ( + (s3_path <> '' AND s3_path IS NOT NULL AND s3_path = p_s3_path) OR + (url = p_url) +); + IF NOT FOUND THEN + RAISE EXCEPTION 'Document does not exist'; + END IF; + + -- Check if the document group exists + SELECT id, doc_count INTO v_doc_group_id, v_doc_count + FROM public.doc_groups + WHERE name = p_doc_group AND course_name = p_course_name; + + IF NOT FOUND THEN + RAISE EXCEPTION 'Document group does not exist'; + END IF; + + -- Delete the association + DELETE FROM public.documents_doc_groups + WHERE document_id = v_document_id AND doc_group_id = v_doc_group_id; + + -- If the doc_count becomes 0, delete the doc_group + IF v_doc_count = 1 THEN + DELETE FROM public.doc_groups + WHERE id = v_doc_group_id; + END IF; +END;$$; + +ALTER FUNCTION "public"."remove_document_from_group"("p_course_name" "text", "p_s3_path" "text", "p_url" "text", "p_doc_group" "text") OWNER TO "postgres"; + +CREATE OR REPLACE FUNCTION "public"."update_doc_count"() RETURNS "trigger" + LANGUAGE "plpgsql" + AS $$BEGIN + -- Increase doc_count on insert + IF TG_OP = 'INSERT' THEN + UPDATE doc_groups + SET doc_count = doc_count + 1 + WHERE id = NEW.doc_group_id; + RETURN NEW; + -- Decrease doc_count on delete + ELSIF TG_OP = 'DELETE' THEN + UPDATE doc_groups + SET doc_count = doc_count - 1 + WHERE id = OLD.doc_group_id; + RETURN OLD; + END IF; + RETURN NULL; -- Should never reach here +END;$$; + +ALTER FUNCTION "public"."update_doc_count"() OWNER TO "postgres"; + +SET default_tablespace = ''; + +SET default_table_access_method = "heap"; + +CREATE TABLE IF NOT EXISTS "public"."api_keys" ( + "user_id" "text" NOT NULL, + "key" "text" NOT NULL, + "created_at" timestamp with time zone DEFAULT "now"() NOT NULL, + "modified_at" timestamp with time zone DEFAULT "now"() NOT NULL, + "usage_count" bigint DEFAULT '0'::bigint NOT NULL, + "is_active" boolean DEFAULT true NOT NULL +); + +ALTER TABLE "public"."api_keys" OWNER TO "postgres"; + +COMMENT ON COLUMN "public"."api_keys"."user_id" IS 'User ID from Clerk auth'; + +CREATE TABLE IF NOT EXISTS "public"."course_names" ( + "course_name" "text" +); + +ALTER TABLE "public"."course_names" OWNER TO "postgres"; + +CREATE TABLE IF NOT EXISTS "public"."depricated_uiuc_chatbot" ( + "id" bigint NOT NULL, + "created_at" timestamp with time zone DEFAULT "now"(), + "metadata" "json", + "content" "text" +); + +ALTER TABLE "public"."depricated_uiuc_chatbot" OWNER TO "postgres"; + +COMMENT ON TABLE "public"."depricated_uiuc_chatbot" IS 'Depricated course materials'; + +CREATE TABLE IF NOT EXISTS "public"."documents" ( + "id" bigint NOT NULL, + "created_at" timestamp with time zone DEFAULT "now"(), + "s3_path" "text", + "readable_filename" "text", + "course_name" "text", + "url" "text", + "contexts" "jsonb", + 
"base_url" "text" +); + +ALTER TABLE "public"."documents" OWNER TO "postgres"; + +COMMENT ON TABLE "public"."documents" IS 'Course materials, full info for each document'; + +COMMENT ON COLUMN "public"."documents"."base_url" IS 'Input url for web scraping function'; + +CREATE OR REPLACE VIEW "public"."distinct_course_names" AS + SELECT DISTINCT "documents"."course_name" + FROM "public"."documents"; + +ALTER TABLE "public"."distinct_course_names" OWNER TO "postgres"; + +CREATE TABLE IF NOT EXISTS "public"."doc_groups" ( + "id" bigint NOT NULL, + "name" "text" NOT NULL, + "course_name" "text" NOT NULL, + "created_at" timestamp with time zone DEFAULT "now"() NOT NULL, + "enabled" boolean DEFAULT true NOT NULL, + "private" boolean DEFAULT true NOT NULL, + "doc_count" bigint DEFAULT '0'::bigint +); + +ALTER TABLE "public"."doc_groups" OWNER TO "postgres"; + +COMMENT ON TABLE "public"."doc_groups" IS 'This table is to store metadata for the document groups'; + +ALTER TABLE "public"."doc_groups" ALTER COLUMN "id" ADD GENERATED BY DEFAULT AS IDENTITY ( + SEQUENCE NAME "public"."doc_groups_id_seq" + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1 +); + +CREATE TABLE IF NOT EXISTS "public"."documents_doc_groups" ( + "document_id" bigint NOT NULL, + "doc_group_id" bigint NOT NULL, + "created_at" timestamp with time zone DEFAULT "now"() NOT NULL +); + +ALTER TABLE "public"."documents_doc_groups" OWNER TO "postgres"; + +COMMENT ON TABLE "public"."documents_doc_groups" IS 'This is a junction table between documents and doc_groups'; + +ALTER TABLE "public"."documents_doc_groups" ALTER COLUMN "document_id" ADD GENERATED BY DEFAULT AS IDENTITY ( + SEQUENCE NAME "public"."documents_doc_groups_document_id_seq" + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1 +); + +CREATE TABLE IF NOT EXISTS "public"."documents_failed" ( + "id" bigint NOT NULL, + "created_at" timestamp with time zone DEFAULT "now"() NOT NULL, + "s3_path" "text", + "readable_filename" "text", + "course_name" "text", + "url" "text", + "contexts" "jsonb", + "base_url" "text", + "doc_groups" "text", + "error" "text" +); + +ALTER TABLE "public"."documents_failed" OWNER TO "postgres"; + +COMMENT ON TABLE "public"."documents_failed" IS 'Documents that failed to ingest. If we retry and they succeed, it should be removed from this table.'; + +ALTER TABLE "public"."documents_failed" ALTER COLUMN "id" ADD GENERATED BY DEFAULT AS IDENTITY ( + SEQUENCE NAME "public"."documents_failed_id_seq" + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1 +); + +ALTER TABLE "public"."documents" ALTER COLUMN "id" ADD GENERATED BY DEFAULT AS IDENTITY ( + SEQUENCE NAME "public"."documents_id_seq" + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1 +); + +CREATE TABLE IF NOT EXISTS "public"."documents_in_progress" ( + "id" bigint NOT NULL, + "created_at" timestamp with time zone DEFAULT "now"() NOT NULL, + "s3_path" "text", + "readable_filename" "text", + "course_name" "text", + "url" "text", + "contexts" "jsonb", + "base_url" "text", + "doc_groups" "text", + "error" "text", + "beam_task_id" "text" +); + +ALTER TABLE "public"."documents_in_progress" OWNER TO "postgres"; + +COMMENT ON TABLE "public"."documents_in_progress" IS 'Document ingest in progress. 
In Beam.cloud ingest queue.'; + +ALTER TABLE "public"."documents_in_progress" ALTER COLUMN "id" ADD GENERATED BY DEFAULT AS IDENTITY ( + SEQUENCE NAME "public"."documents_in_progress_id_seq" + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1 +); + +CREATE TABLE IF NOT EXISTS "public"."documents_v2" ( + "id" bigint NOT NULL, + "created_at" timestamp with time zone DEFAULT "now"() NOT NULL, + "course_name" "text", + "readable_filename" "text", + "s3_path" "text", + "url" "text", + "base_url" "text", + "contexts" "jsonb" +); + +ALTER TABLE "public"."documents_v2" OWNER TO "postgres"; + +ALTER TABLE "public"."documents_v2" ALTER COLUMN "id" ADD GENERATED BY DEFAULT AS IDENTITY ( + SEQUENCE NAME "public"."documents_v2_id_seq" + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1 +); + +CREATE TABLE IF NOT EXISTS "public"."email-newsletter" ( + "id" "uuid" DEFAULT "gen_random_uuid"() NOT NULL, + "created_at" timestamp with time zone DEFAULT "now"() NOT NULL, + "email" "text", + "unsubscribed-from-newsletter" boolean +); + +ALTER TABLE "public"."email-newsletter" OWNER TO "postgres"; + +CREATE TABLE IF NOT EXISTS "public"."insights" ( + "insight_id" bigint NOT NULL, + "document_id" bigint, + "name" "text" NOT NULL, + "insight" "jsonb", + "description" "text", + "created_at" timestamp with time zone DEFAULT "now"(), + "updated_at" timestamp with time zone DEFAULT "now"() +); + +ALTER TABLE "public"."insights" OWNER TO "postgres"; + +ALTER TABLE "public"."insights" ALTER COLUMN "insight_id" ADD GENERATED BY DEFAULT AS IDENTITY ( + SEQUENCE NAME "public"."insights_insight_id_seq" + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1 +); + +CREATE TABLE IF NOT EXISTS "public"."llm-convo-monitor" ( + "id" bigint NOT NULL, + "created_at" timestamp with time zone DEFAULT "now"(), + "convo" "json", + "convo_id" "text", + "course_name" "text", + "user_email" "text" +); + +ALTER TABLE "public"."llm-convo-monitor" OWNER TO "postgres"; + +COMMENT ON COLUMN "public"."llm-convo-monitor"."convo_id" IS 'id from Conversation object in Typescript.'; + +COMMENT ON COLUMN "public"."llm-convo-monitor"."user_email" IS 'The users'' email address (first email only, if they have multiple)'; + +ALTER TABLE "public"."llm-convo-monitor" ALTER COLUMN "id" ADD GENERATED BY DEFAULT AS IDENTITY ( + SEQUENCE NAME "public"."llm-convo-monitor_id_seq" + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1 +); + +CREATE TABLE IF NOT EXISTS "public"."n8n_workflows" ( + "latest_workflow_id" bigint NOT NULL, + "is_locked" boolean NOT NULL +); + +ALTER TABLE "public"."n8n_workflows" OWNER TO "postgres"; + +COMMENT ON TABLE "public"."n8n_workflows" IS 'Just the highest number of the latest workflow being run...'; + +COMMENT ON COLUMN "public"."n8n_workflows"."latest_workflow_id" IS 'The highest possible workflow number as it corresponds to N8n workflow IDs.'; + +COMMENT ON COLUMN "public"."n8n_workflows"."is_locked" IS 'During the time between when we getExpectedWorkflowID and the time that we actually start the workflow, another workflow could be started.'; + +ALTER TABLE "public"."n8n_workflows" ALTER COLUMN "latest_workflow_id" ADD GENERATED BY DEFAULT AS IDENTITY ( + SEQUENCE NAME "public"."n8n_api_keys_in_progress_workflow_id_seq" + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1 +); + +CREATE TABLE IF NOT EXISTS "public"."nal_publications" ( + "id" bigint NOT NULL, + "created_at" timestamp with time zone DEFAULT "now"() NOT NULL, + "doi" "text", + 
"title" "text", + "publisher" "text", + "license" "text", + "doi_number" "text", + "metadata" "jsonb", + "link" "text" +); + +ALTER TABLE "public"."nal_publications" OWNER TO "postgres"; + +ALTER TABLE "public"."nal_publications" ALTER COLUMN "id" ADD GENERATED BY DEFAULT AS IDENTITY ( + SEQUENCE NAME "public"."nal_publications_id_seq" + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1 +); + +CREATE TABLE IF NOT EXISTS "public"."projects" ( + "id" bigint NOT NULL, + "created_at" timestamp with time zone DEFAULT "now"() NOT NULL, + "course_name" character varying, + "doc_map_id" character varying, + "convo_map_id" character varying, + "n8n_api_key" "text", + "last_uploaded_doc_id" bigint, + "last_uploaded_convo_id" bigint, + "subscribed" bigint, + "description" "text", + "insight_schema" "json" +); + +ALTER TABLE "public"."projects" OWNER TO "postgres"; + +COMMENT ON COLUMN "public"."projects"."n8n_api_key" IS 'N8N API key(s) for each course. If multiple users create tools, they EACH need to store their API key.'; + +ALTER TABLE "public"."projects" ALTER COLUMN "id" ADD GENERATED BY DEFAULT AS IDENTITY ( + SEQUENCE NAME "public"."projects_id_seq" + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1 +); + +CREATE TABLE IF NOT EXISTS "public"."publications" ( + "id" bigint NOT NULL, + "created_at" timestamp with time zone DEFAULT "now"() NOT NULL, + "pmid" character varying NOT NULL, + "pmcid" character varying, + "doi" character varying, + "journal_title" character varying, + "article_title" character varying, + "issn" character varying, + "published" "date", + "last_revised" "date", + "license" character varying, + "modified_at" timestamp with time zone DEFAULT "now"(), + "full_text" boolean, + "live" boolean, + "release_date" "date", + "pubmed_ftp_link" "text", + "filepath" "text", + "xml_filename" "text" +); + +ALTER TABLE "public"."publications" OWNER TO "postgres"; + +COMMENT ON COLUMN "public"."publications"."filepath" IS 'A comma-separated list of filepaths. Either to the .txt for abstracts, or to PDFs for full text. 
There can be multiple PDFs (supplementary materials, etc) per article.'; + +ALTER TABLE "public"."publications" ALTER COLUMN "id" ADD GENERATED BY DEFAULT AS IDENTITY ( + SEQUENCE NAME "public"."publications_id_seq" + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1 +); + +ALTER TABLE "public"."depricated_uiuc_chatbot" ALTER COLUMN "id" ADD GENERATED BY DEFAULT AS IDENTITY ( + SEQUENCE NAME "public"."uiuc-chatbot_id_seq" + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1 +); + +CREATE TABLE IF NOT EXISTS "public"."uiuc-course-table" ( + "id" bigint NOT NULL, + "created_at" timestamp with time zone DEFAULT "now"() NOT NULL, + "total_tokens" real, + "total_prompt_price" real, + "total_completions_price" real, + "total_embeddings_price" real, + "total_queries" real, + "course_name" "text" +); + +ALTER TABLE "public"."uiuc-course-table" OWNER TO "postgres"; + +COMMENT ON TABLE "public"."uiuc-course-table" IS 'Details about each course'; + +ALTER TABLE "public"."uiuc-course-table" ALTER COLUMN "id" ADD GENERATED BY DEFAULT AS IDENTITY ( + SEQUENCE NAME "public"."uiuc-course-table_id_seq" + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1 +); + +CREATE TABLE IF NOT EXISTS "public"."usage_metrics" ( + "id" bigint NOT NULL, + "created_at" timestamp with time zone DEFAULT "now"() NOT NULL, + "course_name" "text", + "total_docs" bigint, + "total_convos" bigint, + "most_recent_convo" timestamp without time zone, + "owner_name" "text", + "admin_name" "text" +); + +ALTER TABLE "public"."usage_metrics" OWNER TO "postgres"; + +ALTER TABLE "public"."usage_metrics" ALTER COLUMN "id" ADD GENERATED BY DEFAULT AS IDENTITY ( + SEQUENCE NAME "public"."usage_metrics_id_seq" + START WITH 1 + INCREMENT BY 1 + NO MINVALUE + NO MAXVALUE + CACHE 1 +); + +ALTER TABLE ONLY "public"."api_keys" + ADD CONSTRAINT "api_keys_pkey" PRIMARY KEY ("user_id"); + +ALTER TABLE ONLY "public"."doc_groups" + ADD CONSTRAINT "doc_groups_pkey" PRIMARY KEY ("id"); + +ALTER TABLE ONLY "public"."documents_doc_groups" + ADD CONSTRAINT "documents_doc_groups_pkey" PRIMARY KEY ("document_id", "doc_group_id"); + +ALTER TABLE ONLY "public"."documents_failed" + ADD CONSTRAINT "documents_failed_pkey" PRIMARY KEY ("id"); + +ALTER TABLE ONLY "public"."documents_in_progress" + ADD CONSTRAINT "documents_in_progress_pkey" PRIMARY KEY ("id"); + +ALTER TABLE ONLY "public"."documents" + ADD CONSTRAINT "documents_pkey" PRIMARY KEY ("id"); + +ALTER TABLE ONLY "public"."documents_v2" + ADD CONSTRAINT "documents_v2_pkey" PRIMARY KEY ("id"); + +ALTER TABLE ONLY "public"."email-newsletter" + ADD CONSTRAINT "email-newsletter_email_key" UNIQUE ("email"); + +ALTER TABLE ONLY "public"."email-newsletter" + ADD CONSTRAINT "email-newsletter_pkey" PRIMARY KEY ("id"); + +ALTER TABLE ONLY "public"."insights" + ADD CONSTRAINT "insights_pkey" PRIMARY KEY ("insight_id"); + +ALTER TABLE ONLY "public"."llm-convo-monitor" + ADD CONSTRAINT "llm-convo-monitor_convo_id_key" UNIQUE ("convo_id"); + +ALTER TABLE ONLY "public"."llm-convo-monitor" + ADD CONSTRAINT "llm-convo-monitor_pkey" PRIMARY KEY ("id"); + +ALTER TABLE ONLY "public"."n8n_workflows" + ADD CONSTRAINT "n8n_api_keys_pkey" PRIMARY KEY ("latest_workflow_id"); + +ALTER TABLE ONLY "public"."nal_publications" + ADD CONSTRAINT "nal_publications_doi_key" UNIQUE ("doi"); + +ALTER TABLE ONLY "public"."nal_publications" + ADD CONSTRAINT "nal_publications_pkey" PRIMARY KEY ("id"); + +ALTER TABLE ONLY "public"."projects" + ADD CONSTRAINT "projects_course_name_key" UNIQUE 
("course_name"); + +ALTER TABLE ONLY "public"."projects" + ADD CONSTRAINT "projects_pkey" PRIMARY KEY ("id"); + +ALTER TABLE ONLY "public"."publications" + ADD CONSTRAINT "publications_id_key" UNIQUE ("id"); + +ALTER TABLE ONLY "public"."publications" + ADD CONSTRAINT "publications_pkey" PRIMARY KEY ("pmid"); + +ALTER TABLE ONLY "public"."publications" + ADD CONSTRAINT "publications_pmid_key" UNIQUE ("pmid"); + +ALTER TABLE ONLY "public"."depricated_uiuc_chatbot" + ADD CONSTRAINT "uiuc-chatbot_pkey" PRIMARY KEY ("id"); + +ALTER TABLE ONLY "public"."uiuc-course-table" + ADD CONSTRAINT "uiuc-course-table_pkey" PRIMARY KEY ("id"); + +ALTER TABLE ONLY "public"."doc_groups" + ADD CONSTRAINT "unique_name_course_name" UNIQUE ("name", "course_name"); + +ALTER TABLE ONLY "public"."usage_metrics" + ADD CONSTRAINT "usage_metrics_pkey" PRIMARY KEY ("id"); + +CREATE INDEX "doc_groups_enabled_course_name_idx" ON "public"."doc_groups" USING "btree" ("enabled", "course_name"); + +CREATE INDEX "documents_course_name_idx" ON "public"."documents" USING "hash" ("course_name"); + +CREATE INDEX "documents_created_at_idx" ON "public"."documents" USING "btree" ("created_at"); + +CREATE INDEX "documents_doc_groups_doc_group_id_idx" ON "public"."documents_doc_groups" USING "btree" ("doc_group_id"); + +CREATE INDEX "documents_doc_groups_document_id_idx" ON "public"."documents_doc_groups" USING "btree" ("document_id"); + +CREATE INDEX "idx_doc_s3_path" ON "public"."documents" USING "btree" ("s3_path"); + +CREATE INDEX "insights_document_id_idx" ON "public"."insights" USING "btree" ("document_id"); + +CREATE INDEX "insights_insight_gin_idx" ON "public"."insights" USING "gin" ("insight"); + +CREATE INDEX "insights_name_idx" ON "public"."insights" USING "btree" ("name"); + +CREATE INDEX "llm-convo-monitor_convo_id_idx" ON "public"."llm-convo-monitor" USING "hash" ("convo_id"); + +CREATE INDEX "llm-convo-monitor_course_name_idx" ON "public"."llm-convo-monitor" USING "hash" ("course_name"); + +CREATE OR REPLACE TRIGGER "trg_update_doc_count_after_insert" AFTER INSERT OR DELETE ON "public"."documents_doc_groups" FOR EACH ROW EXECUTE FUNCTION "public"."update_doc_count"(); + +ALTER TABLE ONLY "public"."insights" + ADD CONSTRAINT "insights_document_id_fkey" FOREIGN KEY ("document_id") REFERENCES "public"."documents"("id") ON DELETE CASCADE; + +ALTER TABLE ONLY "public"."projects" + ADD CONSTRAINT "projects_subscribed_fkey" FOREIGN KEY ("subscribed") REFERENCES "public"."doc_groups"("id") ON UPDATE CASCADE ON DELETE SET NULL; + +ALTER TABLE ONLY "public"."documents_doc_groups" + ADD CONSTRAINT "public_documents_doc_groups_doc_group_id_fkey" FOREIGN KEY ("doc_group_id") REFERENCES "public"."doc_groups"("id") ON DELETE CASCADE; + +ALTER TABLE ONLY "public"."documents_doc_groups" + ADD CONSTRAINT "public_documents_doc_groups_document_id_fkey" FOREIGN KEY ("document_id") REFERENCES "public"."documents"("id") ON DELETE CASCADE; + +CREATE POLICY "Enable execute for anon/service_role users only" ON "public"."api_keys" TO "anon", "service_role"; + +ALTER TABLE "public"."api_keys" ENABLE ROW LEVEL SECURITY; + +ALTER TABLE "public"."depricated_uiuc_chatbot" ENABLE ROW LEVEL SECURITY; + +ALTER TABLE "public"."doc_groups" ENABLE ROW LEVEL SECURITY; + +ALTER TABLE "public"."documents" ENABLE ROW LEVEL SECURITY; + +ALTER TABLE "public"."documents_doc_groups" ENABLE ROW LEVEL SECURITY; + +ALTER TABLE "public"."documents_failed" ENABLE ROW LEVEL SECURITY; + +ALTER TABLE "public"."documents_in_progress" ENABLE ROW LEVEL SECURITY; + +ALTER 
TABLE "public"."documents_v2" ENABLE ROW LEVEL SECURITY; + +ALTER TABLE "public"."email-newsletter" ENABLE ROW LEVEL SECURITY; + +ALTER TABLE "public"."insights" ENABLE ROW LEVEL SECURITY; + +ALTER TABLE "public"."llm-convo-monitor" ENABLE ROW LEVEL SECURITY; + +ALTER TABLE "public"."n8n_workflows" ENABLE ROW LEVEL SECURITY; + +ALTER TABLE "public"."nal_publications" ENABLE ROW LEVEL SECURITY; + +ALTER TABLE "public"."projects" ENABLE ROW LEVEL SECURITY; + +ALTER TABLE "public"."publications" ENABLE ROW LEVEL SECURITY; + +ALTER TABLE "public"."uiuc-course-table" ENABLE ROW LEVEL SECURITY; + +ALTER TABLE "public"."usage_metrics" ENABLE ROW LEVEL SECURITY; + +ALTER PUBLICATION "supabase_realtime" OWNER TO "postgres"; + +REVOKE USAGE ON SCHEMA "public" FROM PUBLIC; +GRANT USAGE ON SCHEMA "public" TO "postgres"; +GRANT USAGE ON SCHEMA "public" TO "anon"; +GRANT USAGE ON SCHEMA "public" TO "authenticated"; +GRANT USAGE ON SCHEMA "public" TO "service_role"; + +GRANT ALL ON FUNCTION "public"."add_document_to_group"("p_course_name" "text", "p_s3_path" "text", "p_url" "text", "p_readable_filename" "text", "p_doc_groups" "text"[]) TO "anon"; +GRANT ALL ON FUNCTION "public"."add_document_to_group"("p_course_name" "text", "p_s3_path" "text", "p_url" "text", "p_readable_filename" "text", "p_doc_groups" "text"[]) TO "authenticated"; +GRANT ALL ON FUNCTION "public"."add_document_to_group"("p_course_name" "text", "p_s3_path" "text", "p_url" "text", "p_readable_filename" "text", "p_doc_groups" "text"[]) TO "service_role"; + +GRANT ALL ON FUNCTION "public"."add_document_to_group_url"("p_course_name" "text", "p_s3_path" "text", "p_url" "text", "p_readable_filename" "text", "p_doc_groups" "text"[]) TO "anon"; +GRANT ALL ON FUNCTION "public"."add_document_to_group_url"("p_course_name" "text", "p_s3_path" "text", "p_url" "text", "p_readable_filename" "text", "p_doc_groups" "text"[]) TO "authenticated"; +GRANT ALL ON FUNCTION "public"."add_document_to_group_url"("p_course_name" "text", "p_s3_path" "text", "p_url" "text", "p_readable_filename" "text", "p_doc_groups" "text"[]) TO "service_role"; + +GRANT ALL ON FUNCTION "public"."c"() TO "anon"; +GRANT ALL ON FUNCTION "public"."c"() TO "authenticated"; +GRANT ALL ON FUNCTION "public"."c"() TO "service_role"; + +GRANT ALL ON FUNCTION "public"."check_and_lock_flows_v2"("id" integer) TO "anon"; +GRANT ALL ON FUNCTION "public"."check_and_lock_flows_v2"("id" integer) TO "authenticated"; +GRANT ALL ON FUNCTION "public"."check_and_lock_flows_v2"("id" integer) TO "service_role"; + +GRANT ALL ON FUNCTION "public"."cn"() TO "anon"; +GRANT ALL ON FUNCTION "public"."cn"() TO "authenticated"; +GRANT ALL ON FUNCTION "public"."cn"() TO "service_role"; + +GRANT ALL ON FUNCTION "public"."get_course_names"() TO "anon"; +GRANT ALL ON FUNCTION "public"."get_course_names"() TO "authenticated"; +GRANT ALL ON FUNCTION "public"."get_course_names"() TO "service_role"; + +GRANT ALL ON FUNCTION "public"."get_distinct_course_names"() TO "anon"; +GRANT ALL ON FUNCTION "public"."get_distinct_course_names"() TO "authenticated"; +GRANT ALL ON FUNCTION "public"."get_distinct_course_names"() TO "service_role"; + +GRANT ALL ON FUNCTION "public"."get_latest_workflow_id"() TO "anon"; +GRANT ALL ON FUNCTION "public"."get_latest_workflow_id"() TO "authenticated"; +GRANT ALL ON FUNCTION "public"."get_latest_workflow_id"() TO "service_role"; + +GRANT ALL ON FUNCTION "public"."hello"() TO "anon"; +GRANT ALL ON FUNCTION "public"."hello"() TO "authenticated"; +GRANT ALL ON FUNCTION "public"."hello"() TO 
"service_role"; + +GRANT ALL ON FUNCTION "public"."hypopg"(OUT "indexname" "text", OUT "indexrelid" "oid", OUT "indrelid" "oid", OUT "innatts" integer, OUT "indisunique" boolean, OUT "indkey" "int2vector", OUT "indcollation" "oidvector", OUT "indclass" "oidvector", OUT "indoption" "oidvector", OUT "indexprs" "pg_node_tree", OUT "indpred" "pg_node_tree", OUT "amid" "oid") TO "postgres"; +GRANT ALL ON FUNCTION "public"."hypopg"(OUT "indexname" "text", OUT "indexrelid" "oid", OUT "indrelid" "oid", OUT "innatts" integer, OUT "indisunique" boolean, OUT "indkey" "int2vector", OUT "indcollation" "oidvector", OUT "indclass" "oidvector", OUT "indoption" "oidvector", OUT "indexprs" "pg_node_tree", OUT "indpred" "pg_node_tree", OUT "amid" "oid") TO "anon"; +GRANT ALL ON FUNCTION "public"."hypopg"(OUT "indexname" "text", OUT "indexrelid" "oid", OUT "indrelid" "oid", OUT "innatts" integer, OUT "indisunique" boolean, OUT "indkey" "int2vector", OUT "indcollation" "oidvector", OUT "indclass" "oidvector", OUT "indoption" "oidvector", OUT "indexprs" "pg_node_tree", OUT "indpred" "pg_node_tree", OUT "amid" "oid") TO "authenticated"; +GRANT ALL ON FUNCTION "public"."hypopg"(OUT "indexname" "text", OUT "indexrelid" "oid", OUT "indrelid" "oid", OUT "innatts" integer, OUT "indisunique" boolean, OUT "indkey" "int2vector", OUT "indcollation" "oidvector", OUT "indclass" "oidvector", OUT "indoption" "oidvector", OUT "indexprs" "pg_node_tree", OUT "indpred" "pg_node_tree", OUT "amid" "oid") TO "service_role"; + +GRANT ALL ON FUNCTION "public"."hypopg_create_index"("sql_order" "text", OUT "indexrelid" "oid", OUT "indexname" "text") TO "postgres"; +GRANT ALL ON FUNCTION "public"."hypopg_create_index"("sql_order" "text", OUT "indexrelid" "oid", OUT "indexname" "text") TO "anon"; +GRANT ALL ON FUNCTION "public"."hypopg_create_index"("sql_order" "text", OUT "indexrelid" "oid", OUT "indexname" "text") TO "authenticated"; +GRANT ALL ON FUNCTION "public"."hypopg_create_index"("sql_order" "text", OUT "indexrelid" "oid", OUT "indexname" "text") TO "service_role"; + +GRANT ALL ON FUNCTION "public"."hypopg_drop_index"("indexid" "oid") TO "postgres"; +GRANT ALL ON FUNCTION "public"."hypopg_drop_index"("indexid" "oid") TO "anon"; +GRANT ALL ON FUNCTION "public"."hypopg_drop_index"("indexid" "oid") TO "authenticated"; +GRANT ALL ON FUNCTION "public"."hypopg_drop_index"("indexid" "oid") TO "service_role"; + +GRANT ALL ON FUNCTION "public"."hypopg_get_indexdef"("indexid" "oid") TO "postgres"; +GRANT ALL ON FUNCTION "public"."hypopg_get_indexdef"("indexid" "oid") TO "anon"; +GRANT ALL ON FUNCTION "public"."hypopg_get_indexdef"("indexid" "oid") TO "authenticated"; +GRANT ALL ON FUNCTION "public"."hypopg_get_indexdef"("indexid" "oid") TO "service_role"; + +GRANT ALL ON FUNCTION "public"."hypopg_relation_size"("indexid" "oid") TO "postgres"; +GRANT ALL ON FUNCTION "public"."hypopg_relation_size"("indexid" "oid") TO "anon"; +GRANT ALL ON FUNCTION "public"."hypopg_relation_size"("indexid" "oid") TO "authenticated"; +GRANT ALL ON FUNCTION "public"."hypopg_relation_size"("indexid" "oid") TO "service_role"; + +GRANT ALL ON FUNCTION "public"."hypopg_reset"() TO "postgres"; +GRANT ALL ON FUNCTION "public"."hypopg_reset"() TO "anon"; +GRANT ALL ON FUNCTION "public"."hypopg_reset"() TO "authenticated"; +GRANT ALL ON FUNCTION "public"."hypopg_reset"() TO "service_role"; + +GRANT ALL ON FUNCTION "public"."hypopg_reset_index"() TO "postgres"; +GRANT ALL ON FUNCTION "public"."hypopg_reset_index"() TO "anon"; +GRANT ALL ON FUNCTION 
"public"."hypopg_reset_index"() TO "authenticated"; +GRANT ALL ON FUNCTION "public"."hypopg_reset_index"() TO "service_role"; + +GRANT ALL ON FUNCTION "public"."increment"("usage" integer, "apikey" "text") TO "anon"; +GRANT ALL ON FUNCTION "public"."increment"("usage" integer, "apikey" "text") TO "authenticated"; +GRANT ALL ON FUNCTION "public"."increment"("usage" integer, "apikey" "text") TO "service_role"; + +GRANT ALL ON FUNCTION "public"."increment_api_usage"("usage" integer, "apikey" "text") TO "anon"; +GRANT ALL ON FUNCTION "public"."increment_api_usage"("usage" integer, "apikey" "text") TO "authenticated"; +GRANT ALL ON FUNCTION "public"."increment_api_usage"("usage" integer, "apikey" "text") TO "service_role"; + +GRANT ALL ON FUNCTION "public"."increment_api_usage_count"("usage" integer, "apikey" "text") TO "anon"; +GRANT ALL ON FUNCTION "public"."increment_api_usage_count"("usage" integer, "apikey" "text") TO "authenticated"; +GRANT ALL ON FUNCTION "public"."increment_api_usage_count"("usage" integer, "apikey" "text") TO "service_role"; + +GRANT ALL ON FUNCTION "public"."increment_workflows"() TO "anon"; +GRANT ALL ON FUNCTION "public"."increment_workflows"() TO "authenticated"; +GRANT ALL ON FUNCTION "public"."increment_workflows"() TO "service_role"; + +-- GRANT ALL ON FUNCTION "public"."index_advisor"("query" "text") TO "anon"; +-- GRANT ALL ON FUNCTION "public"."index_advisor"("query" "text") TO "authenticated"; +-- GRANT ALL ON FUNCTION "public"."index_advisor"("query" "text") TO "service_role"; + +GRANT ALL ON FUNCTION "public"."remove_document_from_group"("p_course_name" "text", "p_s3_path" "text", "p_url" "text", "p_doc_group" "text") TO "anon"; +GRANT ALL ON FUNCTION "public"."remove_document_from_group"("p_course_name" "text", "p_s3_path" "text", "p_url" "text", "p_doc_group" "text") TO "authenticated"; +GRANT ALL ON FUNCTION "public"."remove_document_from_group"("p_course_name" "text", "p_s3_path" "text", "p_url" "text", "p_doc_group" "text") TO "service_role"; + +GRANT ALL ON FUNCTION "public"."update_doc_count"() TO "anon"; +GRANT ALL ON FUNCTION "public"."update_doc_count"() TO "authenticated"; +GRANT ALL ON FUNCTION "public"."update_doc_count"() TO "service_role"; + +GRANT ALL ON TABLE "public"."api_keys" TO "anon"; +GRANT ALL ON TABLE "public"."api_keys" TO "authenticated"; +GRANT ALL ON TABLE "public"."api_keys" TO "service_role"; + +GRANT ALL ON TABLE "public"."course_names" TO "anon"; +GRANT ALL ON TABLE "public"."course_names" TO "authenticated"; +GRANT ALL ON TABLE "public"."course_names" TO "service_role"; + +GRANT ALL ON TABLE "public"."depricated_uiuc_chatbot" TO "anon"; +GRANT ALL ON TABLE "public"."depricated_uiuc_chatbot" TO "authenticated"; +GRANT ALL ON TABLE "public"."depricated_uiuc_chatbot" TO "service_role"; + +GRANT ALL ON TABLE "public"."documents" TO "anon"; +GRANT ALL ON TABLE "public"."documents" TO "authenticated"; +GRANT ALL ON TABLE "public"."documents" TO "service_role"; + +GRANT ALL ON TABLE "public"."distinct_course_names" TO "anon"; +GRANT ALL ON TABLE "public"."distinct_course_names" TO "authenticated"; +GRANT ALL ON TABLE "public"."distinct_course_names" TO "service_role"; + +GRANT ALL ON TABLE "public"."doc_groups" TO "anon"; +GRANT ALL ON TABLE "public"."doc_groups" TO "authenticated"; +GRANT ALL ON TABLE "public"."doc_groups" TO "service_role"; + +GRANT ALL ON SEQUENCE "public"."doc_groups_id_seq" TO "anon"; +GRANT ALL ON SEQUENCE "public"."doc_groups_id_seq" TO "authenticated"; +GRANT ALL ON SEQUENCE "public"."doc_groups_id_seq" 
TO "service_role"; + +GRANT ALL ON TABLE "public"."documents_doc_groups" TO "anon"; +GRANT ALL ON TABLE "public"."documents_doc_groups" TO "authenticated"; +GRANT ALL ON TABLE "public"."documents_doc_groups" TO "service_role"; + +GRANT ALL ON SEQUENCE "public"."documents_doc_groups_document_id_seq" TO "anon"; +GRANT ALL ON SEQUENCE "public"."documents_doc_groups_document_id_seq" TO "authenticated"; +GRANT ALL ON SEQUENCE "public"."documents_doc_groups_document_id_seq" TO "service_role"; + +GRANT ALL ON TABLE "public"."documents_failed" TO "anon"; +GRANT ALL ON TABLE "public"."documents_failed" TO "authenticated"; +GRANT ALL ON TABLE "public"."documents_failed" TO "service_role"; + +GRANT ALL ON SEQUENCE "public"."documents_failed_id_seq" TO "anon"; +GRANT ALL ON SEQUENCE "public"."documents_failed_id_seq" TO "authenticated"; +GRANT ALL ON SEQUENCE "public"."documents_failed_id_seq" TO "service_role"; + +GRANT ALL ON SEQUENCE "public"."documents_id_seq" TO "anon"; +GRANT ALL ON SEQUENCE "public"."documents_id_seq" TO "authenticated"; +GRANT ALL ON SEQUENCE "public"."documents_id_seq" TO "service_role"; + +GRANT ALL ON TABLE "public"."documents_in_progress" TO "anon"; +GRANT ALL ON TABLE "public"."documents_in_progress" TO "authenticated"; +GRANT ALL ON TABLE "public"."documents_in_progress" TO "service_role"; + +GRANT ALL ON SEQUENCE "public"."documents_in_progress_id_seq" TO "anon"; +GRANT ALL ON SEQUENCE "public"."documents_in_progress_id_seq" TO "authenticated"; +GRANT ALL ON SEQUENCE "public"."documents_in_progress_id_seq" TO "service_role"; + +GRANT ALL ON TABLE "public"."documents_v2" TO "anon"; +GRANT ALL ON TABLE "public"."documents_v2" TO "authenticated"; +GRANT ALL ON TABLE "public"."documents_v2" TO "service_role"; + +GRANT ALL ON SEQUENCE "public"."documents_v2_id_seq" TO "anon"; +GRANT ALL ON SEQUENCE "public"."documents_v2_id_seq" TO "authenticated"; +GRANT ALL ON SEQUENCE "public"."documents_v2_id_seq" TO "service_role"; + +GRANT ALL ON TABLE "public"."email-newsletter" TO "anon"; +GRANT ALL ON TABLE "public"."email-newsletter" TO "authenticated"; +GRANT ALL ON TABLE "public"."email-newsletter" TO "service_role"; + +GRANT ALL ON TABLE "public"."hypopg_list_indexes" TO "postgres"; +GRANT ALL ON TABLE "public"."hypopg_list_indexes" TO "anon"; +GRANT ALL ON TABLE "public"."hypopg_list_indexes" TO "authenticated"; +GRANT ALL ON TABLE "public"."hypopg_list_indexes" TO "service_role"; + +GRANT ALL ON TABLE "public"."insights" TO "anon"; +GRANT ALL ON TABLE "public"."insights" TO "authenticated"; +GRANT ALL ON TABLE "public"."insights" TO "service_role"; + +GRANT ALL ON SEQUENCE "public"."insights_insight_id_seq" TO "anon"; +GRANT ALL ON SEQUENCE "public"."insights_insight_id_seq" TO "authenticated"; +GRANT ALL ON SEQUENCE "public"."insights_insight_id_seq" TO "service_role"; + +GRANT ALL ON TABLE "public"."llm-convo-monitor" TO "anon"; +GRANT ALL ON TABLE "public"."llm-convo-monitor" TO "authenticated"; +GRANT ALL ON TABLE "public"."llm-convo-monitor" TO "service_role"; + +GRANT ALL ON SEQUENCE "public"."llm-convo-monitor_id_seq" TO "anon"; +GRANT ALL ON SEQUENCE "public"."llm-convo-monitor_id_seq" TO "authenticated"; +GRANT ALL ON SEQUENCE "public"."llm-convo-monitor_id_seq" TO "service_role"; + +GRANT ALL ON TABLE "public"."n8n_workflows" TO "anon"; +GRANT ALL ON TABLE "public"."n8n_workflows" TO "authenticated"; +GRANT ALL ON TABLE "public"."n8n_workflows" TO "service_role"; + +GRANT ALL ON SEQUENCE "public"."n8n_api_keys_in_progress_workflow_id_seq" TO "anon"; +GRANT ALL ON 
SEQUENCE "public"."n8n_api_keys_in_progress_workflow_id_seq" TO "authenticated"; +GRANT ALL ON SEQUENCE "public"."n8n_api_keys_in_progress_workflow_id_seq" TO "service_role"; + +GRANT ALL ON TABLE "public"."nal_publications" TO "anon"; +GRANT ALL ON TABLE "public"."nal_publications" TO "authenticated"; +GRANT ALL ON TABLE "public"."nal_publications" TO "service_role"; + +GRANT ALL ON SEQUENCE "public"."nal_publications_id_seq" TO "anon"; +GRANT ALL ON SEQUENCE "public"."nal_publications_id_seq" TO "authenticated"; +GRANT ALL ON SEQUENCE "public"."nal_publications_id_seq" TO "service_role"; + +GRANT ALL ON TABLE "public"."projects" TO "anon"; +GRANT ALL ON TABLE "public"."projects" TO "authenticated"; +GRANT ALL ON TABLE "public"."projects" TO "service_role"; + +GRANT ALL ON SEQUENCE "public"."projects_id_seq" TO "anon"; +GRANT ALL ON SEQUENCE "public"."projects_id_seq" TO "authenticated"; +GRANT ALL ON SEQUENCE "public"."projects_id_seq" TO "service_role"; + +GRANT ALL ON TABLE "public"."publications" TO "anon"; +GRANT ALL ON TABLE "public"."publications" TO "authenticated"; +GRANT ALL ON TABLE "public"."publications" TO "service_role"; + +GRANT ALL ON SEQUENCE "public"."publications_id_seq" TO "anon"; +GRANT ALL ON SEQUENCE "public"."publications_id_seq" TO "authenticated"; +GRANT ALL ON SEQUENCE "public"."publications_id_seq" TO "service_role"; + +GRANT ALL ON SEQUENCE "public"."uiuc-chatbot_id_seq" TO "anon"; +GRANT ALL ON SEQUENCE "public"."uiuc-chatbot_id_seq" TO "authenticated"; +GRANT ALL ON SEQUENCE "public"."uiuc-chatbot_id_seq" TO "service_role"; + +GRANT ALL ON TABLE "public"."uiuc-course-table" TO "anon"; +GRANT ALL ON TABLE "public"."uiuc-course-table" TO "authenticated"; +GRANT ALL ON TABLE "public"."uiuc-course-table" TO "service_role"; + +GRANT ALL ON SEQUENCE "public"."uiuc-course-table_id_seq" TO "anon"; +GRANT ALL ON SEQUENCE "public"."uiuc-course-table_id_seq" TO "authenticated"; +GRANT ALL ON SEQUENCE "public"."uiuc-course-table_id_seq" TO "service_role"; + +GRANT ALL ON TABLE "public"."usage_metrics" TO "anon"; +GRANT ALL ON TABLE "public"."usage_metrics" TO "authenticated"; +GRANT ALL ON TABLE "public"."usage_metrics" TO "service_role"; + +GRANT ALL ON SEQUENCE "public"."usage_metrics_id_seq" TO "anon"; +GRANT ALL ON SEQUENCE "public"."usage_metrics_id_seq" TO "authenticated"; +GRANT ALL ON SEQUENCE "public"."usage_metrics_id_seq" TO "service_role"; + +ALTER DEFAULT PRIVILEGES FOR ROLE "postgres" IN SCHEMA "public" GRANT ALL ON SEQUENCES TO "postgres"; +ALTER DEFAULT PRIVILEGES FOR ROLE "postgres" IN SCHEMA "public" GRANT ALL ON SEQUENCES TO "anon"; +ALTER DEFAULT PRIVILEGES FOR ROLE "postgres" IN SCHEMA "public" GRANT ALL ON SEQUENCES TO "authenticated"; +ALTER DEFAULT PRIVILEGES FOR ROLE "postgres" IN SCHEMA "public" GRANT ALL ON SEQUENCES TO "service_role"; + +ALTER DEFAULT PRIVILEGES FOR ROLE "postgres" IN SCHEMA "public" GRANT ALL ON FUNCTIONS TO "postgres"; +ALTER DEFAULT PRIVILEGES FOR ROLE "postgres" IN SCHEMA "public" GRANT ALL ON FUNCTIONS TO "anon"; +ALTER DEFAULT PRIVILEGES FOR ROLE "postgres" IN SCHEMA "public" GRANT ALL ON FUNCTIONS TO "authenticated"; +ALTER DEFAULT PRIVILEGES FOR ROLE "postgres" IN SCHEMA "public" GRANT ALL ON FUNCTIONS TO "service_role"; + +ALTER DEFAULT PRIVILEGES FOR ROLE "postgres" IN SCHEMA "public" GRANT ALL ON TABLES TO "postgres"; +ALTER DEFAULT PRIVILEGES FOR ROLE "postgres" IN SCHEMA "public" GRANT ALL ON TABLES TO "anon"; +ALTER DEFAULT PRIVILEGES FOR ROLE "postgres" IN SCHEMA "public" GRANT ALL ON TABLES TO 
"authenticated"; +ALTER DEFAULT PRIVILEGES FOR ROLE "postgres" IN SCHEMA "public" GRANT ALL ON TABLES TO "service_role"; + +RESET ALL; \ No newline at end of file diff --git a/docker-compose.override.yml b/docker-compose.override.yml new file mode 100644 index 00000000..42328092 --- /dev/null +++ b/docker-compose.override.yml @@ -0,0 +1,4 @@ +services: + db: + volumes: + - ./db/migrations:/docker-entrypoint-initdb.d/migrations \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml index 792e1d4f..ee889838 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -11,35 +11,31 @@ services: volumes: - redis-data:/data healthcheck: - test: ["CMD", "redis-cli", "--raw", "ping"] + test: [ "CMD", "redis-cli", "--raw", "ping" ] interval: 30s timeout: 10s retries: 3 qdrant: - image: qdrant/qdrant:v1.9.5 + image: qdrant/qdrant:v1.12.6 restart: unless-stopped container_name: qdrant - # ports: - # - 6333:6333 - # - 6334:6334 - environment: - - QDRANT_API_KEY=${QDRANT_API_KEY} + # environment: + # - QDRANT_API_KEY=${QDRANT_API_KEY} + ports: + - 6333:6333 + - 6334:6334 + expose: + - 6333 + - 6334 + - 6335 volumes: - ./qdrant_data:/qdrant/storage - ./qdrant_config.yaml:/qdrant/config/production.yaml # Mount the config file directly as a volume networks: - uiuc-chat-network healthcheck: - test: - [ - "CMD", - "curl", - "-f", - "-H", - "Authorization: Bearer ${QDRANT_API_KEY}", - "http://qdrant:6333/health", - ] + test: [ "CMD", "curl", "-f", "-H", "Authorization: Bearer ${QDRANT_API_KEY}", "http://qdrant:6333/health" ] interval: 30s timeout: 10s retries: 3 @@ -52,22 +48,21 @@ services: environment: MINIO_ROOT_USER: ${AWS_ACCESS_KEY_ID} MINIO_ROOT_PASSWORD: ${AWS_SECRET_ACCESS_KEY} - command: server /data --address ":${DOCKER_INTERNAL_MINIO_API_PORT}" --console-address ":${DOCKER_INTERNAL_MINIO_DASHBOARD_PORT}" + MINIO_API_PORT: ${DOCKER_INTERNAL_MINIO_API_PORT} + MINIO_CONSOLE_PORT: ${DOCKER_INTERNAL_MINIO_DASHBOARD_PORT} + command: server /data --console-address ":${DOCKER_INTERNAL_MINIO_DASHBOARD_PORT}" --address ":${DOCKER_INTERNAL_MINIO_API_PORT}" ports: - - ${PUBLIC_MINIO_API_PORT}:${DOCKER_INTERNAL_MINIO_API_PORT} - - ${PUBLIC_MINIO_DASHBOARD_PORT}:${DOCKER_INTERNAL_MINIO_DASHBOARD_PORT} + - ${PUBLIC_MINIO_API_PORT}:${DOCKER_INTERNAL_MINIO_API_PORT} # API access + - ${PUBLIC_MINIO_DASHBOARD_PORT}:${DOCKER_INTERNAL_MINIO_DASHBOARD_PORT} # Dashboard access + expose: + - ${PUBLIC_MINIO_API_PORT} + - ${PUBLIC_MINIO_DASHBOARD_PORT} networks: - uiuc-chat-network volumes: - minio-data:/data healthcheck: - test: - [ - "CMD", - "curl", - "-f", - "http://minio:${DOCKER_INTERNAL_MINIO_API_PORT}/minio/health/live", - ] + test: [ "CMD", "curl", "-f", "http://minio:${DOCKER_INTERNAL_MINIO_API_PORT}/minio/health/live" ] interval: 30s timeout: 10s retries: 3 @@ -83,12 +78,15 @@ services: - ./db:/usr/src/app/db # Mount local directory to store SQLite database networks: - uiuc-chat-network + - supabase_default # Add connection to Supabase network depends_on: - qdrant - redis - minio + env_file: + - .env healthcheck: - test: ["CMD", "curl", "-f", "http://flask-app:8000"] + test: [ "CMD", "curl", "-f", "http://flask-app:8000" ] interval: 30s timeout: 10s retries: 3 @@ -100,24 +98,39 @@ services: container_name: ingest-worker networks: - uiuc-chat-network + - supabase_default depends_on: - redis + env_file: + - .env healthcheck: - test: - [ - "CMD", - "python", - "-c", - "from redis import Redis; from rq import Worker; r = Redis(host='redis', port=6379, 
password='${INGEST_REDIS_PASSWORD}'); exit(0 if Worker.count(r) > 0 else 1)", - ] + test: [ "CMD", "python", "-c", "from redis import Redis; from rq import Worker; r = Redis(host='redis', port=6379, password='${INGEST_REDIS_PASSWORD}'); exit(0 if Worker.count(r) > 0 else 1)" ] interval: 30s timeout: 10s retries: 3 + crawlee: + build: ./ic_crawlee + container_name: crawlee + networks: + - uiuc-chat-network + volumes: + - ./data:/data + ports: + - "3345:3000" + environment: + - INGEST_URL=http://flask-app:8000/ingest # The port has to be the internal port of the Flask app, not the external one in the env file + - uiuc-chat-network + depends_on: + - redis + # declare the network resource # this will allow you to use service discovery and address a container by its name from within the network networks: - uiuc-chat-network: {} + uiuc-chat-network: + driver: bridge + supabase_default: + external: true # Mark as external since it's managed by Supabase volumes: redis-data: {} diff --git a/ic_crawlee b/ic_crawlee new file mode 160000 index 00000000..6011231b --- /dev/null +++ b/ic_crawlee @@ -0,0 +1 @@ +Subproject commit 6011231b02d572b9a64c0eb9b029667dadacd4f2 diff --git a/init.sh b/init.sh new file mode 100755 index 00000000..fae83560 --- /dev/null +++ b/init.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# USAGE: sudo sh init.sh +# If you want to delete all your data for a fresh start, use: sudo sh init.sh --wipe_data + +# Parse command line arguments +wipe_data=false +for arg in "$@"; do + case $arg in + --wipe_data) wipe_data=true ;; + *) echo "Usage: $0 [--wipe_data] (use --wipe_data to delete volumes)" && exit 1 ;; + esac +done + +# Sparse checkout for supabase/docker +git submodule update --init --depth 1 --recursive && \ +cd supabase && \ +git sparse-checkout init --cone && \ +git sparse-checkout set docker && \ +cd .. + +if [ ! -f ./supabase/docker/.env ]; then + cp ./supabase/docker/.env.example ./supabase/docker/.env +fi + +if [ ! -f .env ]; then + cp .env.template .env +fi + +set -e +# Start the Supabase Docker Compose +echo "Starting Supabase services..." +if [ "$wipe_data" = true ]; then + docker compose -f ./supabase/docker/docker-compose.yml down -v +else + docker compose -f ./supabase/docker/docker-compose.yml down +fi +sudo docker compose -f ./supabase/docker/docker-compose.yml -f ./docker-compose.override.yml up -d --build + +# Wait for the database to be ready +# echo "Waiting for the database to be ready..." +# until docker exec supabase-db pg_isready -U postgres; do +# sleep 1 +# done + +# Start the parent Docker Compose +chmod -R 777 ./supabase +echo "Starting application services..." +if [ "$wipe_data" = true ]; then + docker compose -f ./docker-compose.yaml down -v +else + docker compose -f ./docker-compose.yaml down +fi +# Note: you may need to give docker with sufficient permissions to run this command (eg: sudo chmod -r 777 .) +sudo docker compose -f ./docker-compose.yaml up -d --build + +echo "All services are up!" \ No newline at end of file diff --git a/qdrant_config.yaml b/qdrant_config.yaml index 47f74c3f..d970d014 100644 --- a/qdrant_config.yaml +++ b/qdrant_config.yaml @@ -151,8 +151,8 @@ service: # (Either above or via an external service like nginx.) # Sending an api-key over an unencrypted channel is insecure. # - # Uncomment to enable. 
- # api_key: ${QDRANT_API_KEY} + # ⚠️ MANUALLY KEEP THIS IN SYNC WITH .env FILE + api_key: your-strong-key-here cluster: # Use `enabled: true` to run Qdrant in distributed deployment mode diff --git a/supabase b/supabase new file mode 160000 index 00000000..e36be4e1 --- /dev/null +++ b/supabase @@ -0,0 +1 @@ +Subproject commit e36be4e10258052c1c55937ecb3c34c6af149720
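
Once this diff is applied and `./init.sh` has been run, a quick smoke test can confirm that both compose stacks are up and that the services wired together above are reachable from the host. This is only a sketch and not part of the change itself: it assumes the variable names from `.env.template` (`FLASK_PORT`, `PUBLIC_MINIO_DASHBOARD_PORT`) and the Qdrant port `6333` published in `docker-compose.yaml`.

```bash
#!/usr/bin/env bash
# Hypothetical post-init.sh smoke test (not part of the diff above).

# Pull the host ports out of .env so the checks match whatever you configured.
FLASK_PORT=$(grep -E '^FLASK_PORT=' .env | cut -d= -f2)
MINIO_CONSOLE_PORT=$(grep -E '^PUBLIC_MINIO_DASHBOARD_PORT=' .env | cut -d= -f2)

# Both stacks should list their containers as running/healthy.
docker compose -f ./docker-compose.yaml ps
docker compose -f ./supabase/docker/docker-compose.yml ps

# Basic reachability checks from the host.
curl -sf "http://localhost:${FLASK_PORT}/" > /dev/null         && echo "flask-app reachable"
curl -sf "http://localhost:6333/" > /dev/null                  && echo "qdrant reachable"
curl -sf "http://localhost:${MINIO_CONSOLE_PORT}/" > /dev/null && echo "minio console reachable"
```

A failed check usually points at the matching service (or its healthcheck) in `docker-compose.yaml`, or at a port that differs from the `.env.template` defaults.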