From 86aae734a431f5572e9ae09d2ac4f66a7c34c58b Mon Sep 17 00:00:00 2001 From: chojuninengu Date: Tue, 29 Apr 2025 13:19:55 +0100 Subject: [PATCH 1/2] Enhance GitHub RAG application: Introduced retry mechanism for error handling, added new custom exceptions for validation and processing errors, improved session cleanup logic, and refined repository name extraction. Updated logging format for better traceability. --- github-rag/app.py | 94 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 81 insertions(+), 13 deletions(-) diff --git a/github-rag/app.py b/github-rag/app.py index ab55b3d2a..4eb3ee7e2 100644 --- a/github-rag/app.py +++ b/github-rag/app.py @@ -5,6 +5,8 @@ import pandas as pd from typing import Optional, Dict, Any import logging +from functools import wraps +import time from gitingest import ingest from llama_index.core import Settings, PromptTemplate, VectorStoreIndex, SimpleDirectoryReader @@ -13,7 +15,10 @@ from dotenv import load_dotenv # Configure logging -logging.basicConfig(level=logging.INFO) +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) logger = logging.getLogger(__name__) load_dotenv() @@ -21,43 +26,106 @@ # Constants MAX_REPO_SIZE = 100 * 1024 * 1024 # 100MB SUPPORTED_REPO_TYPES = ['.py', '.md', '.ipynb', '.js', '.ts', '.json'] +MAX_RETRIES = 3 +RETRY_DELAY = 1 # seconds class GitHubRAGError(Exception): - """Custom exception for GitHub RAG application errors""" + """Base exception for GitHub RAG application errors""" pass +class ValidationError(GitHubRAGError): + """Raised when input validation fails""" + pass + +class ProcessingError(GitHubRAGError): + """Raised when repository processing fails""" + pass + +class QueryEngineError(GitHubRAGError): + """Raised when query engine creation or operation fails""" + pass + +class SessionError(GitHubRAGError): + """Raised when session management fails""" + pass + +def retry_on_error(max_retries=MAX_RETRIES, delay=RETRY_DELAY): + """Decorator for retrying operations on failure""" + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + last_exception = None + for attempt in range(max_retries): + try: + return func(*args, **kwargs) + except Exception as e: + last_exception = e + if attempt < max_retries - 1: + logger.warning(f"Attempt {attempt + 1} failed: {str(e)}. Retrying...") + time.sleep(delay) + raise last_exception + return wrapper + return decorator + def validate_github_url(url: str) -> bool: """Validate GitHub repository URL""" - return url.startswith(('https://github.com/', 'http://github.com/')) + if not url: + raise ValidationError("Repository URL cannot be empty") + if not url.startswith(('https://github.com/', 'http://github.com/')): + raise ValidationError("Invalid GitHub URL format. URL must start with 'https://github.com/' or 'http://github.com/'") + return True def get_repo_name(url: str) -> str: """Extract repository name from URL""" try: - return url.split('/')[-1].replace('.git', '') + parts = url.split('/') + if len(parts) < 5: + raise ValidationError("Invalid repository URL format") + repo_name = parts[-1].replace('.git', '') + if not repo_name: + raise ValidationError("Could not extract repository name from URL") + return repo_name except Exception as e: - raise GitHubRAGError(f"Invalid repository URL: {str(e)}") + raise ValidationError(f"Failed to extract repository name: {str(e)}") + +def cleanup_session(): + """Clean up session resources""" + try: + if hasattr(st.session_state, 'file_cache'): + for key, value in st.session_state.file_cache.items(): + try: + del value + except Exception as e: + logger.warning(f"Failed to cleanup cache entry {key}: {str(e)}") + st.session_state.file_cache.clear() + gc.collect() + logger.info("Session cleanup completed successfully") + except Exception as e: + logger.error(f"Error during session cleanup: {str(e)}") + raise SessionError(f"Failed to cleanup session: {str(e)}") def reset_chat(): """Reset chat session and clean up resources""" try: st.session_state.messages = [] st.session_state.context = None - gc.collect() + cleanup_session() logger.info("Chat session reset successfully") except Exception as e: logger.error(f"Error resetting chat: {str(e)}") - raise GitHubRAGError("Failed to reset chat session") + raise SessionError("Failed to reset chat session") +@retry_on_error() def process_with_gitingets(github_url: str) -> tuple: """Process GitHub repository using gitingest""" try: summary, tree, content = ingest(github_url) if not all([summary, tree, content]): - raise GitHubRAGError("Failed to process repository: Missing data") + raise ProcessingError("Failed to process repository: Missing data") return summary, tree, content except Exception as e: logger.error(f"Error processing repository: {str(e)}") - raise GitHubRAGError(f"Failed to process repository: {str(e)}") + raise ProcessingError(f"Failed to process repository: {str(e)}") def create_query_engine(content_path: str, repo_name: str) -> Any: """Create and configure query engine""" @@ -97,7 +165,7 @@ def create_query_engine(content_path: str, repo_name: str) -> Any: return query_engine except Exception as e: logger.error(f"Error creating query engine: {str(e)}") - raise GitHubRAGError(f"Failed to create query engine: {str(e)}") + raise QueryEngineError(f"Failed to create query engine: {str(e)}") # Initialize session state if "id" not in st.session_state: @@ -147,7 +215,7 @@ def create_query_engine(content_path: str, repo_name: str) -> Any: st.success("Repository loaded successfully! Ready to chat.") logger.info(f"Successfully processed repository: {repo_name}") - except GitHubRAGError as e: + except ProcessingError as e: st.error(str(e)) logger.error(f"Error processing repository {repo_name}: {str(e)}") st.stop() @@ -198,7 +266,7 @@ def create_query_engine(content_path: str, repo_name: str) -> Any: query_engine = st.session_state.file_cache.get(file_key) if query_engine is None: - raise GitHubRAGError("Please load a repository first!") + raise QueryEngineError("Please load a repository first!") response = query_engine.query(prompt) @@ -214,7 +282,7 @@ def create_query_engine(content_path: str, repo_name: str) -> Any: message_placeholder.markdown(full_response) st.session_state.messages.append({"role": "assistant", "content": full_response}) - except GitHubRAGError as e: + except QueryEngineError as e: st.error(str(e)) logger.error(f"Error in chat processing: {str(e)}") except Exception as e: From bec6a1612bbf3db6c3cd4f68be977fb33d1356cb Mon Sep 17 00:00:00 2001 From: chojuninengu Date: Tue, 29 Apr 2025 13:20:18 +0100 Subject: [PATCH 2/2] Refactor session state management and repository loading in GitHub RAG application: Introduced dedicated functions for initializing session state and handling repository loading with improved error handling. Enhanced chat message processing and updated logging for better traceability. --- github-rag/app.py | 242 ++++++++++++++++++++++++---------------------- 1 file changed, 128 insertions(+), 114 deletions(-) diff --git a/github-rag/app.py b/github-rag/app.py index 4eb3ee7e2..3ff4b92a5 100644 --- a/github-rag/app.py +++ b/github-rag/app.py @@ -167,128 +167,142 @@ def create_query_engine(content_path: str, repo_name: str) -> Any: logger.error(f"Error creating query engine: {str(e)}") raise QueryEngineError(f"Failed to create query engine: {str(e)}") -# Initialize session state -if "id" not in st.session_state: - st.session_state.id = uuid.uuid4() - st.session_state.file_cache = {} - st.session_state.messages = [] - -session_id = st.session_state.id - -# Sidebar -with st.sidebar: - st.header("Add your GitHub repository!") - - github_url = st.text_input( - "Enter GitHub repository URL", - placeholder="https://github.com/username/repo", - help="Enter a valid GitHub repository URL" - ) - - load_repo = st.button("Load Repository", type="primary") - - if github_url and load_repo: - try: - # Validate URL - if not validate_github_url(github_url): - st.error("Please enter a valid GitHub repository URL") - st.stop() +def initialize_session_state(): + """Initialize or reset session state variables""" + try: + if 'messages' not in st.session_state: + st.session_state.messages = [] + if 'context' not in st.session_state: + st.session_state.context = None + if 'file_cache' not in st.session_state: + st.session_state.file_cache = {} + if 'current_repo' not in st.session_state: + st.session_state.current_repo = None + if 'query_engine' not in st.session_state: + st.session_state.query_engine = None + logger.info("Session state initialized successfully") + except Exception as e: + logger.error(f"Error initializing session state: {str(e)}") + raise SessionError("Failed to initialize session state") - repo_name = get_repo_name(github_url) - file_key = f"{session_id}-{repo_name}" +def handle_repository_loading(github_url: str) -> None: + """Handle repository loading process with proper error handling""" + try: + validate_github_url(github_url) + repo_name = get_repo_name(github_url) + + with st.spinner(f"Processing repository {repo_name}..."): + summary, tree, content = process_with_gitingets(github_url) - if file_key not in st.session_state.file_cache: - with st.spinner("Processing your repository..."): - with tempfile.TemporaryDirectory() as temp_dir: - try: - summary, tree, content = process_with_gitingets(github_url) - - # Write content to temporary file - content_path = os.path.join(temp_dir, f"{repo_name}_content.md") - with open(content_path, "w", encoding="utf-8") as f: - f.write(content) - - # Create and cache query engine - query_engine = create_query_engine(temp_dir, repo_name) - st.session_state.file_cache[file_key] = query_engine - - st.success("Repository loaded successfully! Ready to chat.") - logger.info(f"Successfully processed repository: {repo_name}") - - except ProcessingError as e: - st.error(str(e)) - logger.error(f"Error processing repository {repo_name}: {str(e)}") - st.stop() - except Exception as e: - st.error("An unexpected error occurred while processing the repository") - logger.error(f"Unexpected error: {str(e)}") - st.stop() - else: - st.info("Repository already loaded. Ready to chat!") + # Create temporary directory for repository content + with tempfile.TemporaryDirectory() as temp_dir: + content_path = os.path.join(temp_dir, repo_name) + os.makedirs(content_path, exist_ok=True) - except Exception as e: - st.error(f"An error occurred: {str(e)}") - logger.error(f"Error in repository loading process: {str(e)}") - st.stop() - -# Main content -col1, col2 = st.columns([6, 1]) - -with col1: - st.header("Chat with GitHub using RAG ") - -with col2: - st.button("Clear Chat ↺", on_click=reset_chat, help="Clear chat history and reset session") - -# Display chat history -for message in st.session_state.messages: - with st.chat_message(message["role"]): - st.markdown(message["content"]) + # Save repository content + for file_path, file_content in content.items(): + file_dir = os.path.dirname(os.path.join(content_path, file_path)) + os.makedirs(file_dir, exist_ok=True) + with open(os.path.join(content_path, file_path), 'w', encoding='utf-8') as f: + f.write(file_content) + + # Create query engine + query_engine = create_query_engine(content_path, repo_name) + + # Update session state + st.session_state.query_engine = query_engine + st.session_state.current_repo = repo_name + st.session_state.context = { + 'summary': summary, + 'tree': tree, + 'content': content + } + + st.success(f"Successfully loaded repository: {repo_name}") + logger.info(f"Repository {repo_name} loaded successfully") + + except ValidationError as e: + st.error(f"Validation error: {str(e)}") + logger.warning(f"Validation error for URL {github_url}: {str(e)}") + except ProcessingError as e: + st.error(f"Processing error: {str(e)}") + logger.error(f"Error processing repository {github_url}: {str(e)}") + except QueryEngineError as e: + st.error(f"Query engine error: {str(e)}") + logger.error(f"Error creating query engine for {github_url}: {str(e)}") + except Exception as e: + st.error(f"Unexpected error: {str(e)}") + logger.error(f"Unexpected error processing {github_url}: {str(e)}") + finally: + cleanup_session() -# Chat input -if prompt := st.chat_input("What's up?"): +def handle_chat_message(prompt: str) -> None: + """Handle chat message processing with proper error handling""" try: - # Add user message to chat history + if not st.session_state.query_engine: + raise QueryEngineError("Please load a repository first!") + + if not prompt.strip(): + raise ValidationError("Please enter a non-empty message") + + # Add user message to chat st.session_state.messages.append({"role": "user", "content": prompt}) - # Display user message - with st.chat_message("user"): - st.markdown(prompt) + # Get response from query engine + response = st.session_state.query_engine.query(prompt) + + # Format and display response + full_response = f"Repository: {st.session_state.current_repo}\n\n{response}" + st.session_state.messages.append({"role": "assistant", "content": full_response}) + + logger.info(f"Successfully processed chat message for repository {st.session_state.current_repo}") + + except ValidationError as e: + st.error(f"Validation error: {str(e)}") + logger.warning(f"Chat validation error: {str(e)}") + except QueryEngineError as e: + st.error(f"Query engine error: {str(e)}") + logger.error(f"Error in chat processing: {str(e)}") + except Exception as e: + st.error(f"Unexpected error: {str(e)}") + logger.error(f"Unexpected error in chat: {str(e)}") - # Process and display assistant response - with st.chat_message("assistant"): - message_placeholder = st.empty() - full_response = "" +def main(): + """Main application function""" + st.title("GitHub Repository RAG") + + try: + # Initialize session state + initialize_session_state() + + # Sidebar for repository input + with st.sidebar: + st.header("Repository Settings") + github_url = st.text_input("Enter GitHub Repository URL") - try: - repo_name = get_repo_name(github_url) - file_key = f"{session_id}-{repo_name}" - query_engine = st.session_state.file_cache.get(file_key) - - if query_engine is None: - raise QueryEngineError("Please load a repository first!") - - response = query_engine.query(prompt) - - if hasattr(response, 'response_gen'): - for chunk in response.response_gen: - if isinstance(chunk, str): - full_response += chunk - message_placeholder.markdown(full_response + "▌") + if st.button("Load Repository"): + if github_url: + handle_repository_loading(github_url) else: - full_response = str(response) - message_placeholder.markdown(full_response) - - message_placeholder.markdown(full_response) - st.session_state.messages.append({"role": "assistant", "content": full_response}) - - except QueryEngineError as e: - st.error(str(e)) - logger.error(f"Error in chat processing: {str(e)}") - except Exception as e: - st.error("An unexpected error occurred while processing your query") - logger.error(f"Unexpected error in chat: {str(e)}") - + st.warning("Please enter a GitHub repository URL") + + if st.button("Reset Chat"): + reset_chat() + + # Main chat interface + for message in st.session_state.messages: + with st.chat_message(message["role"]): + st.write(message["content"]) + + if prompt := st.chat_input("Ask a question about the repository"): + handle_chat_message(prompt) + + except SessionError as e: + st.error(f"Session error: {str(e)}") + logger.error(f"Session error in main: {str(e)}") except Exception as e: - st.error("An error occurred in the chat system") - logger.error(f"Chat system error: {str(e)}") \ No newline at end of file + st.error(f"Unexpected error: {str(e)}") + logger.error(f"Unexpected error in main: {str(e)}") + +if __name__ == "__main__": + main() \ No newline at end of file